package de.ugoe.cs.eventbench.web; import java.io.FileNotFoundException; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import de.ugoe.cs.eventbench.web.data.WebEvent; import de.ugoe.cs.util.FileTools; import de.ugoe.cs.util.console.Console; /** *
* Provides functionality to parse log files that contain web requests. *
* * @author Steffen Herbold * @version 1.0 */ public class WeblogParser { /** ** Timeout between two sessions in milliseconds. *
*/ private long timeout; /** ** Minimal length of a session. All shorter sessions will be pruned. * Default: 2 *
*/ private int minLength = 2; /** ** Collection of generated sequences. *
*/ private List* Name and path of the robot filter. *
*/ private static final String ROBOTFILTERFILE = "misc/robotfilter.txt"; /** ** Field that contains a regular expression that matches all robots * contained in {@link #ROBOTFILTERFILE}. *
*/ private String robotRegex = null; /** ** Constructor. Creates a new WeblogParser with a default timeout of * 3,600,000 milliseconds (1 hour). *
*/ public WeblogParser() { this(3600000); } /** ** Constructor. Creates a new WeblogParser. *
* * @param timeout * session timeout */ public WeblogParser(long timeout) { this.timeout = timeout; } /** ** Returns the generated event sequences. *
* * @return generated event sequences */ public List* Sets the session timeout. *
*
     * @param timeout
     *            new session timeout in milliseconds; replaces the value
     *            given to the constructor
     */
    public void setTimeout(long timeout) { this.timeout = timeout; }

    /**
     * Sets the minimal length of a session. All sessions that contain fewer
     * events will be pruned.
     *
*
     * @param minLength
     *            new minimal length; sequences with fewer events than this
     *            value are removed by {@link #pruneShortSequences()}
     */
    public void setMinLength(int minLength) { this.minLength = minLength; }

    /**
     * Parses a web log file.
     *
* * @param filename * name and path of the log file * @throws IOException * thrown if there is a problem with reading the log file * @throws FileNotFoundException * thrown if the log file is not found * @throws ParseException * thrown the date format is invalid * @throws URISyntaxException * thrown if the URI is invalid */ public void parseFile(String filename) throws IOException, FileNotFoundException, ParseException, URISyntaxException { String[] lines = FileTools.getLinesFromFile(filename); Map* Prunes sequences shorter than {@link #minLength}. *
*/ private void pruneShortSequences() { Console.traceln("" + sequences.size() + " user sequences found"); // prune sequences shorter than min-length int i = 0; while (i < sequences.size()) { if (sequences.get(i).size() < minLength) { sequences.remove(i); } else { i++; } } Console.traceln("" + sequences.size() + " remaining after pruning of sequences shorter than " + minLength); } /** ** Reads {@link #ROBOTFILTERFILE} and creates a regular expression that * matches all the robots defined in the file. The regular expression is * stored in the field {@link #robotRegex}. *
* * @throws IOException * thrown if there is a problem reading the robot filter * @throws FileNotFoundException * thrown if the robot filter is not found */ private void loadRobotRegex() throws IOException, FileNotFoundException { String[] lines = FileTools.getLinesFromFile(ROBOTFILTERFILE); StringBuilder regex = new StringBuilder(); for (int i = 0; i < lines.length; i++) { regex.append("(.*" + lines[i] + ".*)"); if (i != lines.length - 1) { regex.append('|'); } } robotRegex = regex.toString(); } /** ** Checks whether an agent is a robot. *
* * @param agent * agent that is checked * @return true, if the agent is a robot; false otherwise */ private boolean isRobot(String agent) { return agent.matches(robotRegex); } /** ** Parses the URI and extracts the GET variables that have been passed. *
* * @param uri * URI that is parsed * @return a list with all GET variables */ private List