- Timestamp:
- 06/16/11 13:53:05 (13 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/EventBenchConsole/src/de/ugoe/cs/eventbench/web/WeblogParser.java
r68 r72 23 23 24 24 private List<List<WebEvent>> sequences; 25 26 private static final String ROBOTFILTERFILE = "misc/robotfilter.txt"; 27 28 private String robotRegex = ".*"; 25 29 26 30 public WeblogParser() { … … 50 54 reader.read(buffer); 51 55 reader.close(); 52 String[] lines = (new String(buffer)).split("\ n");56 String[] lines = (new String(buffer)).split("\r\n"); 53 57 54 58 Map<String, List<Integer>> cookieSessionMap = new HashMap<String, List<Integer>>(); … … 56 60 57 61 SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); 62 loadRobotRegex(); 58 63 59 64 sequences = new ArrayList<List<WebEvent>>(); 60 65 61 66 for( String line : lines ) { 62 String[] values = line. trim().split("");67 String[] values = line.substring(1, line.length()-1).split("\" \""); 63 68 64 69 // use cookie as session identifier 65 70 int cookieStart = values[0].lastIndexOf('.'); 66 71 String cookie = values[0].substring(cookieStart+1); 67 String dateString = values[1] .substring(1)+" "+values[2].substring(0, values[2].length()-1);72 String dateString = values[1]; 68 73 long timestamp = dateFormat.parse(dateString).getTime(); 69 String uri = values[3]; 70 // String ref = values[4]; // referer is not yet used! 74 String uri = values[2]; 75 // String ref = values[3]; // referer is not yet used! 76 String agent = values[4]; // agent is not yet used! 71 77 List<String> postedVars = new ArrayList<String>(); 72 for( int i=5 ; i<values.length ; i++ ) { 73 postedVars.add(values[i]); 78 if( values.length==6 ) { // post vars found 79 for( String postVar : values[5].split(" ") ) { 80 postedVars.add(postVar); 81 } 74 82 } 83 if( !isRobot(agent) ) { 84 WebEvent event = new WebEvent(uri, timestamp, postedVars); 75 85 76 WebEvent event = new WebEvent(uri, timestamp, postedVars); 77 78 // find session and add event 79 List<Integer> sessionIds = cookieSessionMap.get(cookie); 80 if( sessionIds==null ) { 81 sessionIds = new ArrayList<Integer>(); 82 // start new session 83 sessionIds.add(++lastId); 84 cookieSessionMap.put(cookie, sessionIds); 85 sequences.add(new LinkedList<WebEvent>()); 86 } 87 Integer lastSessionIndex = sessionIds.get(sessionIds.size()-1); 88 List<WebEvent> lastSession = sequences.get(lastSessionIndex); 89 long lastEventTime = timestamp; 90 if( !lastSession.isEmpty() ) { 91 lastEventTime = lastSession.get(lastSession.size()-1).getTimestamp(); 92 } 93 if( timestamp-lastEventTime>timeout ) { 94 sessionIds.add(++lastId); 95 List<WebEvent> newSession = new LinkedList<WebEvent>(); 96 newSession.add(event); 97 sequences.add(newSession); 98 } else { 99 lastSession.add(event); 86 // find session and add event 87 List<Integer> sessionIds = cookieSessionMap.get(cookie); 88 if( sessionIds==null ) { 89 sessionIds = new ArrayList<Integer>(); 90 // start new session 91 sessionIds.add(++lastId); 92 cookieSessionMap.put(cookie, sessionIds); 93 sequences.add(new LinkedList<WebEvent>()); 94 } 95 Integer lastSessionIndex = sessionIds.get(sessionIds.size()-1); 96 List<WebEvent> lastSession = sequences.get(lastSessionIndex); 97 long lastEventTime = timestamp; 98 if( !lastSession.isEmpty() ) { 99 lastEventTime = lastSession.get(lastSession.size()-1).getTimestamp(); 100 } 101 if( timestamp-lastEventTime>timeout ) { 102 sessionIds.add(++lastId); 103 List<WebEvent> newSession = new LinkedList<WebEvent>(); 104 newSession.add(event); 105 sequences.add(newSession); 106 } else { 107 lastSession.add(event); 108 } 100 109 } 101 110 } 102 111 Console.traceln(""+sequences.size()+ " user sequences found"); 103 112 // prune sequences shorter than min-length 104 for( int i=0; i<sequences.size(); i++ ) { 113 int i=0; 114 while( i<sequences.size() ) { 105 115 if( sequences.get(i).size()<minLength ) { 106 116 sequences.remove(i); 117 } else { 118 Console.traceln(""+sequences.get(i).size()); 119 if( sequences.get(i).size() > 152 ) { 120 Console.traceln(sequences.get(i).toString().replaceAll(", ", "\n")); 121 } 122 i++; 107 123 } 108 124 } 109 125 Console.traceln(""+sequences.size()+ " remaining after pruning of sequences shorter than " + minLength); 110 126 } 127 128 private void loadRobotRegex() throws IOException, FileNotFoundException { 129 File f = new File(ROBOTFILTERFILE); 130 FileReader reader = new FileReader(f); 131 char[] buffer = new char[(int) f.length()]; 132 reader.read(buffer); 133 reader.close(); 134 String[] lines = (new String(buffer)).split("\r\n"); 135 StringBuilder regex = new StringBuilder(); 136 for( int i=0; i<lines.length; i++ ) { 137 regex.append("(.*"+lines[i]+".*)"); 138 if( i!=lines.length-1 ) { 139 regex.append("|"); 140 } 141 } 142 robotRegex = regex.toString(); 143 } 144 145 private boolean isRobot(String agent) { 146 return agent.matches(robotRegex); 147 } 111 148 }
Note: See TracChangeset
for help on using the changeset viewer.