package de.ugoe.cs.eventbench.web;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import de.ugoe.cs.eventbench.web.data.WebEvent;
import de.ugoe.cs.util.FileTools;
import de.ugoe.cs.util.console.Console;

/**
 * Provides functionality to parse log files with web requests.
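 *
 * <p>
 * A minimal usage sketch (the log file name and server URL are illustrative,
 * not part of this class):
 * </p>
 *
 * <pre>
 * WeblogParser parser = new WeblogParser(3600000); // 1 hour session timeout
 * parser.setMinLength(2);
 * parser.setMaxLength(100);
 * parser.setUrl(&quot;http://www.example.com&quot;);
 * parser.parseFile(&quot;access.log&quot;);
 * Collection&lt;List&lt;WebEvent&gt;&gt; sequences = parser.getSequences();
 * </pre>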

 * @author Steffen Herbold
 * @version 1.0
 */
public class WeblogParser {

    /**
     * Timeout between two sessions in milliseconds.
     */
    private long timeout;

    /**
     * Minimal length of a session. All shorter sessions will be pruned.
     * Default: 2
     */
    private int minLength = 2;

    /**
     * Maximal length of a session. All longer sessions will be pruned.
     * Default: 100
     */
    private int maxLength = 100;

    /**
     * URL of the server that generated the log that is currently parsed; null
     * if the URL is not available. Default: null
     */
    private String url = null;

    /**
     * Collection of generated sequences.
     */
    private List<List<WebEvent>> sequences;

    /**
     * List that stores the user (identified through their cookie ID) for each
     * sequence.
     */
    private List<String> users;

    /**
     * List that stores the frequent users (identified through their cookie
     * ID).
     */
    private List<String> frequentUsers;

    /**
     * Sequences of all frequent users.
     */
    private List<Collection<List<WebEvent>>> sequencesFrequentUsers;

    /**
     * Threshold that defines how many sessions of a user are required to deem
     * the user frequent. Note that only sessions whose length is in the range
     * of {@link #minLength} and {@link #maxLength} are counted.
     */
    private int frequentUsersThreshold = -1;

    /**
     * Name and path of the robot filter.
     */
    private static final String ROBOTFILTERFILE = "misc/robotfilter.txt";

    /**
     * Field that contains a regular expression that matches all robots
     * contained in {@link #ROBOTFILTERFILE}.
     */
    private String robotRegex = null;

    /**
     * Constructor. Creates a new WeblogParser with a default timeout of
     * 3,600,000 milliseconds (1 hour).
     */
    public WeblogParser() {
        this(3600000);
    }

    /**
     * Constructor. Creates a new WeblogParser.
     *
     * @param timeout
     *            session timeout
     */
    public WeblogParser(long timeout) {
        this.timeout = timeout;
    }

    /**
     * Returns the generated event sequences.
     *
     * @return generated event sequences
     */
    public Collection<List<WebEvent>> getSequences() {
        return sequences;
    }

    /**
     * Sets the session timeout.
     *
     * @param timeout
     *            new session timeout
     */
    public void setTimeout(long timeout) {
        this.timeout = timeout;
    }

    /**
     * Sets the minimal length of a session. All sessions that contain fewer
     * events will be pruned.
     *
     * @param minLength
     *            new minimal length
     */
    public void setMinLength(int minLength) {
        this.minLength = minLength;
    }

    /**
     * Sets the maximal length of a session. All sessions that contain more
     * events will be pruned.
     *
     * @param maxLength
     *            new maximal length
     */
    public void setMaxLength(int maxLength) {
        this.maxLength = maxLength;
    }

    /**
     * Sets the URL of the server from which this log was generated. Often
     * required for replay generation.
     *
     * @param url
     *            URL of the server
     */
    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * Sets the threshold for frequent users.
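     *
     * <p>
     * A minimal usage sketch (the parser instance and log file name are
     * illustrative). The threshold must be set before
     * {@link #parseFile(String)} is called, because the frequent-user
     * sequences are determined at the end of parsing:
     * </p>
     *
     * <pre>
     * parser.setFrequentUserThreshold(5);
     * parser.parseFile(&quot;access.log&quot;);
     * List&lt;String&gt; frequentUsers = parser.getFrequentUsers();
     * </pre>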

     * @param threshold
     *            threshold value; if the value is &lt;1, the sessions of the
     *            frequent users will not be determined
     */
    public void setFrequentUserThreshold(int threshold) {
        this.frequentUsersThreshold = threshold;
    }

    /**
     * Returns the IDs of all frequent users.
     *
     * @return IDs of the frequent users
     */
    public List<String> getFrequentUsers() {
        return frequentUsers;
    }

    /**
     * Returns the sequences of all frequent users.
     *
     * @return list of the sequences of all frequent users
     */
    public List<Collection<List<WebEvent>>> getFrequentUserSequences() {
        return sequencesFrequentUsers;
    }

    /**
     * Parses a web log file.
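     *
     * <p>
     * Each line is expected to contain quoted, space-separated fields: a
     * cookie-based user identifier, the date (yyyy-MM-dd HH:mm:ss), the
     * requested URI, the referrer, the user agent, and optionally the names
     * of POSTed variables. An illustrative line (not taken from a real log):
     * </p>
     *
     * <pre>
     * "host.12345" "2011-01-01 12:00:00" "/index.php?page=1" "-" "Mozilla/5.0"
     * </pre>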

     * @param filename
     *            name and path of the log file
     * @throws IOException
     *             thrown if there is a problem with reading the log file
     * @throws FileNotFoundException
     *             thrown if the log file is not found
     * @throws ParseException
     *             thrown if the date format is invalid
     */
    public void parseFile(String filename) throws IOException,
            FileNotFoundException, ParseException {
        String[] lines = FileTools.getLinesFromFile(filename);

        Map<String, List<Integer>> cookieSessionMap = new HashMap<String, List<Integer>>();
        int lastId = -1;

        SimpleDateFormat dateFormat = new SimpleDateFormat(
                "yyyy-MM-dd HH:mm:ss");
        loadRobotRegex();

        sequences = new ArrayList<List<WebEvent>>();
        users = new ArrayList<String>();

        int lineCounter = 0;
        for (String line : lines) {
            lineCounter++;
            // strip the enclosing quotes and split the quoted fields
            String[] values = line.substring(1, line.length() - 1).split(
                    "\" \"");

            // use cookie as session identifier
            int cookieStart = values[0].lastIndexOf('.');
            String cookie = values[0].substring(cookieStart + 1);
            String dateString = values[1];
            long timestamp = dateFormat.parse(dateString).getTime();
            String uriString = values[2];
            // String ref = values[3]; // referer is not yet used!
            String agent;
            if (values.length > 4) {
                agent = values[4];
            } else {
                agent = "noagent";
            }

            List<String> postedVars = new ArrayList<String>();
            if (values.length == 6) { // post vars found
                for (String postVar : values[5].trim().split(" ")) {
                    // TODO manual filtering of bad variables, should be
                    // automated
                    if (!postVar.contains("and")) {
                        postedVars.add(postVar);
                    }
                }
            }
            if (!isRobot(agent)) {
                try {
                    URI uri = new URI(uriString);
                    String path = uri.getPath();
                    List<String> getVars = extractGetVarsFromUri(uri);

                    WebEvent event = new WebEvent(url, path, timestamp,
                            postedVars, getVars);

                    // find session and add event
                    List<Integer> sessionIds = cookieSessionMap.get(cookie);
                    if (sessionIds == null) {
                        sessionIds = new ArrayList<Integer>();
                        // start new session
                        sessionIds.add(++lastId);
                        cookieSessionMap.put(cookie, sessionIds);
                        sequences.add(new LinkedList<WebEvent>());
                        users.add(cookie);
                    }
                    Integer lastSessionIndex = sessionIds
                            .get(sessionIds.size() - 1);
                    List<WebEvent> lastSession = sequences
                            .get(lastSessionIndex);
                    long lastEventTime = timestamp;
                    if (!lastSession.isEmpty()) {
                        lastEventTime = lastSession.get(lastSession.size() - 1)
                                .getTimestamp();
                    }
                    if (timestamp - lastEventTime > timeout) {
                        // session timeout exceeded: start a new session
                        sessionIds.add(++lastId);
                        List<WebEvent> newSession = new LinkedList<WebEvent>();
                        newSession.add(event);
                        sequences.add(newSession);
                        users.add(cookie);
                    } else {
                        lastSession.add(event);
                    }
                } catch (URISyntaxException e) {
                    Console.traceln("Ignored line " + lineCounter + ": "
                            + e.getMessage());
                }
            }
        }
        Console.traceln("" + sequences.size() + " user sequences found");
        pruneSequences();
        Console.traceln("" + sequences.size()
                + " remaining after pruning of sequences shorter than "
                + minLength + " or longer than " + maxLength);
        Set<String> uniqueUsers = new HashSet<String>(users);
        Console.traceln("" + uniqueUsers.size() + " unique users");
        if (frequentUsersThreshold > 0) {
            generateFrequentUserSequences(uniqueUsers);
        }
    }

    /**
     * Generates the frequent user sequences, according to the threshold
     * {@link #frequentUsersThreshold}.
     *
     * @param uniqueUsers
     *            set with all user IDs
     */
    private void generateFrequentUserSequences(Set<String> uniqueUsers) {
        frequentUsers = new ArrayList<String>();
        sequencesFrequentUsers = new ArrayList<Collection<List<WebEvent>>>();
        for (String user : uniqueUsers) {
            // count the user's sessions by retaining only this user's
            // entries in a copy of the users list
            List<String> tmp = new ArrayList<String>();
            tmp.add(user);
            List<String> usersCopy = new LinkedList<String>(users);
            usersCopy.retainAll(tmp);
            int size = usersCopy.size();
            if (size >= frequentUsersThreshold) {
                frequentUsers.add(user);
                Collection<List<WebEvent>> sequencesUser = new ArrayList<List<WebEvent>>();
                for (int i = 0; i < sequences.size(); i++) {
                    if (users.get(i).equals(user)) {
                        sequencesUser.add(sequences.get(i));
                    }
                }
                sequencesFrequentUsers.add(sequencesUser);
            }
        }
        Console.traceln("" + frequentUsers.size() + " users with at least "
                + frequentUsersThreshold + " sequences");
    }

    /**
     * Prunes sequences shorter than {@link #minLength} and longer than
     * {@link #maxLength}.
     */
    private void pruneSequences() {
        int i = 0;
        while (i < sequences.size()) {
            if ((sequences.get(i).size() < minLength)
                    || (sequences.get(i).size() > maxLength)) {
                // keep users and sequences in sync while removing
                sequences.remove(i);
                users.remove(i);
            } else {
                i++;
            }
        }
    }

    /**
     * Reads {@link #ROBOTFILTERFILE} and creates a regular expression that
     * matches all the robots defined in the file. The regular expression is
     * stored in the field {@link #robotRegex}.
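     *
     * <p>
     * For example, if the filter file contained the (hypothetical) entries
     * {@code googlebot} and {@code crawler}, the resulting expression would
     * be {@code (.*googlebot.*)|(.*crawler.*)}.
     * </p>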

     * @throws IOException
     *             thrown if there is a problem reading the robot filter
     * @throws FileNotFoundException
     *             thrown if the robot filter is not found
     */
    private void loadRobotRegex() throws IOException, FileNotFoundException {
        String[] lines = FileTools.getLinesFromFile(ROBOTFILTERFILE);
        StringBuilder regex = new StringBuilder();
        for (int i = 0; i < lines.length; i++) {
            regex.append("(.*" + lines[i] + ".*)");
            if (i != lines.length - 1) {
                regex.append('|');
            }
        }
        robotRegex = regex.toString();
    }

    /**
     * Checks whether an agent is a robot.
     *
     * @param agent
     *            agent that is checked
     * @return true, if the agent is a robot; false otherwise
     */
    private boolean isRobot(String agent) {
        return agent.matches(robotRegex);
    }

    /**
     * Parses the URI and extracts the names of the GET variables that have
     * been passed.
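     *
     * <p>
     * For example, for the (illustrative) URI
     * {@code /index.php?page=1&lang=en} the returned list contains
     * {@code page} and {@code lang}.
     * </p>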

     * @param uri
     *            URI that is parsed
     * @return a list with the names of all GET variables
     */
    private List<String> extractGetVarsFromUri(URI uri) {
        List<String> getVars = new ArrayList<String>();
        String query = uri.getQuery();
        if (query != null) {
            String[] paramPairs = query.split("&");
            for (String paramPair : paramPairs) {
                String[] paramSplit = paramPair.split("=");
                // TODO manual filtering of bad variables, should be automated
                if (!paramSplit[0].contains("and")) {
                    getVars.add(paramSplit[0]);
                }
            }
        }
        return getVars;
    }
}