package de.ugoe.cs.eventbench.web; import java.io.FileNotFoundException; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import de.ugoe.cs.eventbench.web.data.WebEvent; import de.ugoe.cs.util.FileTools; import de.ugoe.cs.util.console.Console; public class WeblogParser { private long timeout; private int minLength = 2; private List> sequences; private static final String ROBOTFILTERFILE = "misc/robotfilter.txt"; private String robotRegex = ".*"; public WeblogParser() { timeout = 3600000; // 1 hour session-timeout as default } public WeblogParser(long timeout) { this.timeout = timeout; } public List> getSequences() { return sequences; } public void setTimeout(long timeout) { this.timeout = timeout; } public void setMinLength(int minLength) { this.minLength = minLength; } public void parseFile(String filename) throws IOException, FileNotFoundException, ParseException, URISyntaxException { String[] lines = FileTools.getLinesFromFile(filename); Map> cookieSessionMap = new HashMap>(); int lastId = -1; SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); loadRobotRegex(); sequences = new ArrayList>(); for( String line : lines ) { String[] values = line.substring(1, line.length()-1).split("\" \""); // use cookie as session identifier int cookieStart = values[0].lastIndexOf('.'); String cookie = values[0].substring(cookieStart+1); String dateString = values[1]; long timestamp = dateFormat.parse(dateString).getTime(); String uriString = values[2]; // String ref = values[3]; // referer is not yet used! String agent; if( values.length>4 ) { agent = values[4]; } else { agent = "noagent"; } List postedVars = new ArrayList(); if( values.length==6 ) { // post vars found for( String postVar : values[5].trim().split(" ") ) { postedVars.add(postVar); } } if( !isRobot(agent) ) { URI uri = new URI(uriString); String path = uri.getPath(); List getVars = extractGetVarsFromUri(uri); WebEvent event = new WebEvent(path, timestamp, postedVars, getVars); // find session and add event List sessionIds = cookieSessionMap.get(cookie); if( sessionIds==null ) { sessionIds = new ArrayList(); // start new session sessionIds.add(++lastId); cookieSessionMap.put(cookie, sessionIds); sequences.add(new LinkedList()); } Integer lastSessionIndex = sessionIds.get(sessionIds.size()-1); List lastSession = sequences.get(lastSessionIndex); long lastEventTime = timestamp; if( !lastSession.isEmpty() ) { lastEventTime = lastSession.get(lastSession.size()-1).getTimestamp(); } if( timestamp-lastEventTime>timeout ) { sessionIds.add(++lastId); List newSession = new LinkedList(); newSession.add(event); sequences.add(newSession); } else { lastSession.add(event); } } } pruneShortSequences(); } private void pruneShortSequences() { Console.traceln(""+sequences.size()+ " user sequences found"); // prune sequences shorter than min-length int i=0; while( i extractGetVarsFromUri(URI uri) { List getVars = new ArrayList(); String query = uri.getQuery(); if( query!=null ) { String[] paramPairs = query.split("&"); for( String paramPair : paramPairs ) { String[] paramSplit = paramPair.split("="); getVars.add(paramSplit[0]); } } return getVars; } }