package de.ugoe.cs.eventbench.web; import java.io.FileNotFoundException; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import de.ugoe.cs.eventbench.web.data.WebEvent; import de.ugoe.cs.util.FileTools; import de.ugoe.cs.util.console.Console; /** *

* Provides functionality to parse log files with web request. *

* * @author Steffen Herbold * @version 1.0 */ public class WeblogParser { /** *

* Timeout between two sessions in milliseconds. *

*/ private long timeout; /** *

* Minimal length of a session. All shorter sessions will be pruned. * Default: 2 *

*/ private int minLength = 2; /** *

* Collection of generated sequences. *

*/ private List> sequences; /** *

* Name and path of the robot filter. *

*/ private static final String ROBOTFILTERFILE = "misc/robotfilter.txt"; /** *

* Field that contains a regular expression that matches all robots * contained in {@link #ROBOTFILTERFILE}. *

*/ private String robotRegex = null; /** *

* Constructor. Creates a new WeblogParser with a default timeout of * 3,600,000 milliseconds (1 hour). *

*/ public WeblogParser() { this(3600000); } /** *

* Constructor. Creates a new WeblogParser. *

* * @param timeout * session timeout */ public WeblogParser(long timeout) { this.timeout = timeout; } /** *

* Returns the generated event sequences. *

* * @return generated event sequences */ public List> getSequences() { return sequences; } /** *

* Sets the session timeout. *

* * @param timeout * new session timeout */ public void setTimeout(long timeout) { this.timeout = timeout; } /** *

* Sets the minimal length of a session. All sessions that contain less * events will be pruned. *

* * @param minLength * new minimal length */ public void setMinLength(int minLength) { this.minLength = minLength; } /** *

* Parses a web log file. *

* * @param filename * name and path of the log file * @throws IOException * thrown if there is a problem with reading the log file * @throws FileNotFoundException * thrown if the log file is not found * @throws ParseException * thrown the date format is invalid * @throws URISyntaxException * thrown if the URI is invalid */ public void parseFile(String filename) throws IOException, FileNotFoundException, ParseException, URISyntaxException { String[] lines = FileTools.getLinesFromFile(filename); Map> cookieSessionMap = new HashMap>(); int lastId = -1; SimpleDateFormat dateFormat = new SimpleDateFormat( "yyyy-MM-dd HH:mm:ss"); loadRobotRegex(); sequences = new ArrayList>(); for (String line : lines) { String[] values = line.substring(1, line.length() - 1).split( "\" \""); // use cookie as session identifier int cookieStart = values[0].lastIndexOf('.'); String cookie = values[0].substring(cookieStart + 1); String dateString = values[1]; long timestamp = dateFormat.parse(dateString).getTime(); String uriString = values[2]; // String ref = values[3]; // referer is not yet used! String agent; if (values.length > 4) { agent = values[4]; } else { agent = "noagent"; } List postedVars = new ArrayList(); if (values.length == 6) { // post vars found for (String postVar : values[5].trim().split(" ")) { postedVars.add(postVar); } } if (!isRobot(agent)) { URI uri = new URI(uriString); String path = uri.getPath(); List getVars = extractGetVarsFromUri(uri); WebEvent event = new WebEvent(path, timestamp, postedVars, getVars); // find session and add event List sessionIds = cookieSessionMap.get(cookie); if (sessionIds == null) { sessionIds = new ArrayList(); // start new session sessionIds.add(++lastId); cookieSessionMap.put(cookie, sessionIds); sequences.add(new LinkedList()); } Integer lastSessionIndex = sessionIds .get(sessionIds.size() - 1); List lastSession = sequences.get(lastSessionIndex); long lastEventTime = timestamp; if (!lastSession.isEmpty()) { lastEventTime = lastSession.get(lastSession.size() - 1) .getTimestamp(); } if (timestamp - lastEventTime > timeout) { sessionIds.add(++lastId); List newSession = new LinkedList(); newSession.add(event); sequences.add(newSession); } else { lastSession.add(event); } } } pruneShortSequences(); } /** *

* Prunes sequences shorter than {@link #minLength}. *

*/ private void pruneShortSequences() { Console.traceln("" + sequences.size() + " user sequences found"); // prune sequences shorter than min-length int i = 0; while (i < sequences.size()) { if (sequences.get(i).size() < minLength) { sequences.remove(i); } else { i++; } } Console.traceln("" + sequences.size() + " remaining after pruning of sequences shorter than " + minLength); } /** *

* Reads {@link #ROBOTFILTERFILE} and creates a regular expression that * matches all the robots defined in the file. The regular expression is * stored in the field {@link #robotRegex}. *

* * @throws IOException * thrown if there is a problem reading the robot filter * @throws FileNotFoundException * thrown if the robot filter is not found */ private void loadRobotRegex() throws IOException, FileNotFoundException { String[] lines = FileTools.getLinesFromFile(ROBOTFILTERFILE); StringBuilder regex = new StringBuilder(); for (int i = 0; i < lines.length; i++) { regex.append("(.*" + lines[i] + ".*)"); if (i != lines.length - 1) { regex.append('|'); } } robotRegex = regex.toString(); } /** *

* Checks whether an agent is a robot. *

* * @param agent * agent that is checked * @return true, if the agent is a robot; false otherwise */ private boolean isRobot(String agent) { return agent.matches(robotRegex); } /** *

* Parses the URI and extracts the GET variables that have been passed. *

* * @param uri * URI that is parsed * @return a list with all GET variables */ private List extractGetVarsFromUri(URI uri) { List getVars = new ArrayList(); String query = uri.getQuery(); if (query != null) { String[] paramPairs = query.split("&"); for (String paramPair : paramPairs) { String[] paramSplit = paramPair.split("="); getVars.add(paramSplit[0]); } } return getVars; } }