package de.ugoe.cs.eventbench.web;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import de.ugoe.cs.eventbench.web.data.WebEvent;
import de.ugoe.cs.util.FileTools;
import de.ugoe.cs.util.console.Console;

/**
 * <p>
 * Provides functionality to parse log files with web requests.
 * </p>
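 * <p>
 * A minimal usage sketch (file name, URL, and threshold are illustrative):
 * </p>
 * 
 * <pre>
 * WeblogParser parser = new WeblogParser(); // default: 1 hour session timeout
 * parser.setUrl(&quot;http://www.example.com&quot;);
 * parser.setFrequentUserThreshold(2);
 * parser.parseFile(&quot;access.log&quot;);
 * Collection&lt;List&lt;WebEvent&gt;&gt; sequences = parser.getSequences();
 * </pre>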
 * 
 * @author Steffen Herbold
 * @version 1.0
 */
public class WeblogParser {

    /**
     * <p>
     * Timeout between two sessions in milliseconds.
     * </p>
     */
    private long timeout;
    /**
     * <p>
     * Minimal length of a session. All shorter sessions will be pruned.
     * Default: 2
     * </p>
     */
    private int minLength = 2;

    /**
     * <p>
     * Maximal length of a session. All longer sessions will be pruned.
     * Default: 100
     * </p>
     */
    private int maxLength = 100;
    /**
     * <p>
     * URL of the server that generated the log that is currently parsed; null
     * if the URL is not available.
     * Default: null
     * </p>
     */
    private String url = null;

    /**
     * <p>
     * Collection of generated sequences.
     * </p>
     */
    private List<List<WebEvent>> sequences;

    /**
     * <p>
     * List that stores the users (identified through their cookie id) to each
     * sequence.
     * </p>
     */
    private List<String> users;

    /**
     * <p>
     * List that stores the frequent users (identified through their cookie
     * id).
     * </p>
     */
    private List<String> frequentUsers;

    /**
     * <p>
     * Sequences for all frequent users.
     * </p>
     */
    private List<Collection<List<WebEvent>>> frequentUserSequences;

    /**
     * <p>
     * Threshold that defines how many sessions of a user are required to deem
     * the user frequent. Note that only sessions whose length is in the range
     * of {@link #minLength} and {@link #maxLength} are counted.
     * </p>
     */
    private int frequentUsersThreshold = -1;

    /**
     * <p>
     * Name and path of the robot filter.
     * </p>
     */
    private static final String ROBOTFILTERFILE = "misc/robotfilter.txt";

    /**
     * <p>
     * Field that contains a regular expression that matches all robots
     * contained in {@link #ROBOTFILTERFILE}.
     * </p>
     */
    private String robotRegex = null;

    /**
     * <p>
     * Constructor. Creates a new WeblogParser with a default timeout of
     * 3,600,000 milliseconds (1 hour).
     * </p>
     */
    public WeblogParser() {
        this(3600000);
    }

    /**
     * <p>
     * Constructor. Creates a new WeblogParser.
     * </p>
     * 
     * @param timeout
     *            session timeout
     */
    public WeblogParser(long timeout) {
        this.timeout = timeout;
    }

    /**
     * <p>
     * Returns the generated event sequences.
     * </p>
     * 
     * @return generated event sequences
     */
    public Collection<List<WebEvent>> getSequences() {
        return sequences;
    }

    /**
     * <p>
     * Sets the session timeout.
     * </p>
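     * <p>
     * For example (the value is illustrative):
     * </p>
     * 
     * <pre>
     * parser.setTimeout(30 * 60 * 1000); // split sessions after 30 minutes
     * </pre>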
     * 
     * @param timeout
     *            new session timeout
     */
    public void setTimeout(long timeout) {
        this.timeout = timeout;
    }

    /**
     * <p>
     * Sets the minimal length of a session. All sessions that contain fewer
     * events will be pruned.
     * </p>
     * 
     * @param minLength
     *            new minimal length
     */
    public void setMinLength(int minLength) {
        this.minLength = minLength;
    }

    /**
     * <p>
     * Sets the maximal length of a session. All sessions that contain more
     * events will be pruned.
     * </p>
     * 
     * @param maxLength
     *            new maximal length
     */
    public void setMaxLength(int maxLength) {
        this.maxLength = maxLength;
    }

    /**
     * <p>
     * Sets the URL of the server from which this log was generated. Often
     * required for replay generation.
     * </p>
     * 
     * @param url
     *            URL of the server
     */
    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * <p>
     * Sets the threshold for frequent users.
     * </p>
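     * <p>
     * For example, with a threshold of 2 (illustrative value), only users
     * with at least two sessions of valid length are deemed frequent:
     * </p>
     * 
     * <pre>
     * parser.setFrequentUserThreshold(2);
     * </pre>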
     * 
     * @param threshold
     *            threshold value; if the value is &lt; 1, the sessions of the
     *            frequent users will not be determined
     */
    public void setFrequentUserThreshold(int threshold) {
        this.frequentUsersThreshold = threshold;
    }

    /**
     * <p>
     * Returns the IDs of all frequent users.
     * </p>
     * 
     * @return IDs of the frequent users
     */
    public List<String> getFrequentUsers() {
        return frequentUsers;
    }

    /**
     * <p>
     * Returns the sequences of all frequent users.
     * </p>
     * 
     * @return list of the sequences of all frequent users
     */
    public List<Collection<List<WebEvent>>> getFrequentUserSequences() {
        return frequentUserSequences;
    }

    /**
     * <p>
     * Parses a web log file.
     * </p>
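     * <p>
     * The parser assumes lines in which each field is enclosed in double
     * quotes and fields are separated by single spaces, e.g. (illustrative):
     * </p>
     * 
     * <pre>
     * "cookieId" "2011-03-03 14:23:11" "/app/index.php?foo=1" "Mozilla/5.0"
     * </pre>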
     * 
     * @param filename
     *            name and path of the log file
     * @throws IOException
     *             thrown if there is a problem with reading the log file
     * @throws FileNotFoundException
     *             thrown if the log file is not found
     * @throws ParseException
     *             thrown if the date format is invalid
     */
    public void parseFile(String filename) throws IOException,
            FileNotFoundException, ParseException {
        String[] lines = FileTools.getLinesFromFile(filename);
        // maps each cookie id to the ids of all sessions of that user
        Map<String, List<Integer>> cookieSessionMap = new HashMap<String, List<Integer>>();
        int lastId = -1;
        // the timestamp pattern is an assumption; it must match the format
        // used in the parsed log files
        SimpleDateFormat dateFormat = new SimpleDateFormat(
                "yyyy-MM-dd HH:mm:ss");
        loadRobotRegex();
        sequences = new ArrayList<List<WebEvent>>();
        users = new ArrayList<String>();
        for (String line : lines) {
            // strip the outer quotes and split the quote-delimited fields
            String[] values = line.substring(1, line.length() - 1).split(
                    "\" \"");
            String cookie = values[0]; // the cookie id identifies the user
            long timestamp = dateFormat.parse(values[1]).getTime();
            String uriString = values[2];
            String agent = values.length > 3 ? values[3] : "noagent";
            if (isRobot(agent)) {
                continue; // discard requests issued by robots
            }
            try {
                URI uri = new URI(uriString);
                // assumed WebEvent API: constructor taking server url, path,
                // timestamp, and GET variables, plus a getTimestamp() getter
                WebEvent event = new WebEvent(url, uri.getPath(), timestamp,
                        extractGetVarsFromUri(uri));
                List<Integer> sessionIds = cookieSessionMap.get(cookie);
                if (sessionIds == null) {
                    // first request of this user: start a new session
                    sessionIds = new ArrayList<Integer>();
                    sessionIds.add(++lastId);
                    cookieSessionMap.put(cookie, sessionIds);
                    sequences.add(new LinkedList<WebEvent>());
                    users.add(cookie);
                }
                List<WebEvent> lastSession = sequences.get(sessionIds
                        .get(sessionIds.size() - 1));
                long lastEventTime = timestamp;
                if (!lastSession.isEmpty()) {
                    lastEventTime = lastSession.get(lastSession.size() - 1)
                            .getTimestamp();
                }
                if (timestamp - lastEventTime > timeout) {
                    // session timeout exceeded: start a new session
                    sessionIds.add(++lastId);
                    sequences.add(new LinkedList<WebEvent>());
                    users.add(cookie);
                    lastSession = sequences.get(lastId);
                }
                lastSession.add(event);
            } catch (URISyntaxException e) {
                Console.traceln("Ignored line " + line + ": " + e.getMessage());
            }
        }
        Console.traceln("" + sequences.size() + " sequences parsed");
        pruneSequences();
        Set<String> uniqueUsers = new HashSet<String>(users);
        generateFrequentUserSequences(uniqueUsers);
    }

    /**
     * <p>
     * Generates the frequent user sequences, according to the threshold
     * {@link #frequentUsersThreshold}.
     * </p>
     * 
     * @param uniqueUsers
     *            set with all user IDs
     */
    private void generateFrequentUserSequences(Set<String> uniqueUsers) {
        if (frequentUsersThreshold < 1) {
            return; // threshold not set: frequent users are not determined
        }
        frequentUsers = new ArrayList<String>();
        frequentUserSequences = new ArrayList<Collection<List<WebEvent>>>();
        for (String user : uniqueUsers) {
            // collect all sessions that belong to the current user
            List<List<WebEvent>> userSequences = new ArrayList<List<WebEvent>>();
            for (int i = 0; i < users.size(); i++) {
                if (user.equals(users.get(i))) {
                    userSequences.add(sequences.get(i));
                }
            }
            if (userSequences.size() >= frequentUsersThreshold) {
                frequentUsers.add(user);
                frequentUserSequences.add(userSequences);
            }
        }
    }

    /**
     * <p>
     * Prunes sequences shorter than {@link #minLength} and longer than
     * {@link #maxLength}.
     * </p>
     */
    private void pruneSequences() {
        int i = 0;
        while (i < sequences.size()) {
            if ((sequences.get(i).size() < minLength)
                    || (sequences.get(i).size() > maxLength)) {
                sequences.remove(i);
                users.remove(i);
            } else {
                i++;
            }
        }
    }

    /**
     * <p>
     * Reads {@link #ROBOTFILTERFILE} and creates a regular expression that
     * matches all the robots defined in the file. The regular expression is
     * stored in the field {@link #robotRegex}.
     * </p>
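     * <p>
     * For example, a filter file with the two lines {@code googlebot} and
     * {@code crawler} yields the regular expression
     * {@code (.*googlebot.*)|(.*crawler.*)}.
     * </p>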
     * 
     * @throws IOException
     *             thrown if there is a problem reading the robot filter
     * @throws FileNotFoundException
     *             thrown if the robot filter is not found
     */
    private void loadRobotRegex() throws IOException, FileNotFoundException {
        String[] lines = FileTools.getLinesFromFile(ROBOTFILTERFILE);
        StringBuilder regex = new StringBuilder();
        for (int i = 0; i < lines.length; i++) {
            regex.append("(.*" + lines[i] + ".*)");
            if (i != lines.length - 1) {
                regex.append('|');
            }
        }
        robotRegex = regex.toString();
    }

    /**
     * <p>
     * Checks whether an agent is a robot.
     * </p>
     * 
     * @param agent
     *            agent that is checked
     * @return true, if the agent is a robot; false otherwise
     */
    private boolean isRobot(String agent) {
        return agent.matches(robotRegex);
    }

    /**
     * <p>
     * Parses the URI and extracts the GET variables that have been passed.
     * </p>
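     * <p>
     * For example, the URI {@code /index.php?foo=1&bar=2} yields the variable
     * names {@code [foo, bar]}; only the names are extracted, not the values.
     * </p>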
     * 
     * @param uri
     *            URI that is parsed
     * @return a list with all GET variables
     */
    private List<String> extractGetVarsFromUri(URI uri) {
        List<String> getVars = new ArrayList<String>();
        String query = uri.getQuery();
        if (query != null) {
            // keep only the variable names; the values are discarded
            for (String paramPair : query.split("&")) {
                getVars.add(paramPair.split("=")[0]);
            }
        }
        return getVars;
    }
}