Index: /trunk/EventBenchConsole/misc/robotfilter.txt
===================================================================
--- /trunk/EventBenchConsole/misc/robotfilter.txt	(revision 72)
+++ /trunk/EventBenchConsole/misc/robotfilter.txt	(revision 72)
@@ -0,0 +1,6 @@
+findlinks
+discobot
+Googlebot
+Slurp
+YandexBot
+Spider
Index: /trunk/EventBenchConsole/src/de/ugoe/cs/eventbench/web/WeblogParser.java
===================================================================
--- /trunk/EventBenchConsole/src/de/ugoe/cs/eventbench/web/WeblogParser.java	(revision 71)
+++ /trunk/EventBenchConsole/src/de/ugoe/cs/eventbench/web/WeblogParser.java	(revision 72)
@@ -23,4 +23,8 @@
 	
 	private List<List<WebEvent>> sequences;
+	
+	private static final String ROBOTFILTERFILE = "misc/robotfilter.txt"; // robot filter file, one entry per line (read by loadRobotRegex())
+	
+	private String robotRegex = ".*"; // initial value matches any single-line agent; overwritten by loadRobotRegex()
 	
 	public WeblogParser() {
@@ -50,5 +54,5 @@
 		reader.read(buffer);
 		reader.close();
-		String[] lines = (new String(buffer)).split("\n");
+		String[] lines = (new String(buffer)).split("\r?\n"); // accept both LF and CRLF line endings
 		
 		Map<String, List<Integer>> cookieSessionMap = new HashMap<String, List<Integer>>();
@@ -56,56 +60,126 @@
 		
 		SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+		loadRobotRegex();
 		
 		sequences = new ArrayList<List<WebEvent>>();
 		
 		for( String line : lines ) {
-			String[] values = line.trim().split(" ");
+			String[] values = line.substring(1, line.length()-1).split("\" \"");
 			
 			// use cookie as session identifier
 			int cookieStart = values[0].lastIndexOf('.');
 			String cookie = values[0].substring(cookieStart+1);
-			String dateString = values[1].substring(1)+" "+values[2].substring(0, values[2].length()-1);
+			String dateString = values[1];
 			long timestamp = dateFormat.parse(dateString).getTime();
-			String uri = values[3];
-			// String ref = values[4]; // referer is not yet used!
+			String uri = values[2];
+			// String ref = values[3]; // referer is not yet used!
+			String agent = values[4]; // user agent, used below to filter out robots
 			List<String> postedVars = new ArrayList<String>();
-			for( int i=5 ; i<values.length ; i++ ) {
-				postedVars.add(values[i]);
+			if( values.length==6 ) { // post vars found
+				for( String postVar : values[5].split(" ") ) {
+					postedVars.add(postVar);
+				}
 			}
+			if( !isRobot(agent) ) {
+				WebEvent event = new WebEvent(uri, timestamp, postedVars);
 				
-			WebEvent event = new WebEvent(uri, timestamp, postedVars);
-			
-			// find session and add event
-			List<Integer> sessionIds = cookieSessionMap.get(cookie);
-			if( sessionIds==null ) {
-				sessionIds = new ArrayList<Integer>();
-				// start new session
-				sessionIds.add(++lastId);
-				cookieSessionMap.put(cookie, sessionIds);
-				sequences.add(new LinkedList<WebEvent>());
-			} 
-			Integer lastSessionIndex = sessionIds.get(sessionIds.size()-1);
-			List<WebEvent> lastSession = sequences.get(lastSessionIndex);
-			long lastEventTime = timestamp;
-			if( !lastSession.isEmpty() ) {
-				lastEventTime = lastSession.get(lastSession.size()-1).getTimestamp();
-			}
-			if( timestamp-lastEventTime>timeout ) {
-				sessionIds.add(++lastId);
-				List<WebEvent> newSession = new LinkedList<WebEvent>();
-				newSession.add(event);
-				sequences.add(newSession);
-			} else {
-				lastSession.add(event);
+				// find session and add event
+				List<Integer> sessionIds = cookieSessionMap.get(cookie);
+				if( sessionIds==null ) {
+					sessionIds = new ArrayList<Integer>();
+					// start new session
+					sessionIds.add(++lastId);
+					cookieSessionMap.put(cookie, sessionIds);
+					sequences.add(new LinkedList<WebEvent>());
+				} 
+				Integer lastSessionIndex = sessionIds.get(sessionIds.size()-1);
+				List<WebEvent> lastSession = sequences.get(lastSessionIndex);
+				long lastEventTime = timestamp;
+				if( !lastSession.isEmpty() ) {
+					lastEventTime = lastSession.get(lastSession.size()-1).getTimestamp();
+				}
+				if( timestamp-lastEventTime>timeout ) {
+					sessionIds.add(++lastId);
+					List<WebEvent> newSession = new LinkedList<WebEvent>();
+					newSession.add(event);
+					sequences.add(newSession);
+				} else {
+					lastSession.add(event);
+				}
 			}
 		}
 		Console.traceln(""+sequences.size()+ " user sequences found");
 		// prune sequences shorter than min-length
-		for( int i=0; i<sequences.size(); i++ ) {
+		int i=0;
+		while( i<sequences.size() ) {
 			if( sequences.get(i).size()<minLength ) {
 				sequences.remove(i);
+			} else {
+				Console.traceln(""+sequences.get(i).size());
+				if( sequences.get(i).size() > 152 ) { // NOTE(review): 152 is an unexplained threshold; this dump looks like leftover debug output - confirm
+					Console.traceln(sequences.get(i).toString().replaceAll(", ", "\n"));
+				}
+				i++;
 			}
 		}
 		Console.traceln(""+sequences.size()+ " remaining after pruning of sequences shorter than " + minLength);
 	}
+	
+	/**
+	 * Loads the robot filter from {@link #ROBOTFILTERFILE} and stores the
+	 * resulting regular expression in {@link #robotRegex}.
+	 * 
+	 * Every non-blank line of the file becomes one alternative of the form
+	 * {@code (.*entry.*)}; the alternatives are joined with {@code |}. Entries
+	 * are inserted verbatim, i.e., they are interpreted as regular expressions.
+	 * Both LF and CRLF line endings are accepted. If the file contains no
+	 * entries, the resulting expression matches no agent at all.
+	 * 
+	 * @throws FileNotFoundException
+	 *             if the filter file cannot be opened
+	 * @throws IOException
+	 *             if reading the filter file fails
+	 */
+	private void loadRobotRegex() throws IOException, FileNotFoundException {
+		File f = new File(ROBOTFILTERFILE);
+		char[] buffer = new char[(int) f.length()];
+		int length = 0;
+		FileReader reader = new FileReader(f);
+		try {
+			// a single read() call may deliver fewer chars than requested
+			int count;
+			while( length<buffer.length && (count=reader.read(buffer, length, buffer.length-length))>0 ) {
+				length += count;
+			}
+		} finally {
+			// release the file handle even if reading fails
+			reader.close();
+		}
+		// use only the chars actually read, the buffer may be larger
+		String[] lines = (new String(buffer, 0, length)).split("\r?\n");
+		StringBuilder regex = new StringBuilder();
+		for( String line : lines ) {
+			// a blank line would yield "(.*.*)", which matches every agent
+			if( line.trim().length()==0 ) {
+				continue;
+			}
+			if( regex.length()>0 ) {
+				regex.append("|");
+			}
+			regex.append("(.*"+line+".*)");
+		}
+		// "(?!)" never matches: without entries no agent is treated as robot
+		robotRegex = regex.length()>0 ? regex.toString() : "(?!)";
+	}
+	
+	/**
+	 * Checks whether a user agent is matched by the robot filter.
+	 * 
+	 * @param agent
+	 *            user agent string of a logged request
+	 * @return true if the complete agent string matches {@link #robotRegex}
+	 */
+	private boolean isRobot(String agent) {
+		return agent.matches(robotRegex);
+	}
 }
