package de.ugoe.cs.eventbench.web;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import de.ugoe.cs.eventbench.web.data.WebEvent;
import de.ugoe.cs.util.FileTools;
import de.ugoe.cs.util.console.Console;

/**
 * <p>
 * Provides functionality to parse log files with web requests. The requests
 * are grouped into sessions based on the cookie found in each log line and a
 * configurable session timeout; requests issued by known robots are dropped.
 * </p>
 * 
 * @author Steffen Herbold
 * @version 1.0
 */
public class WeblogParser {

	/**
	 * <p>
	 * Timeout between two sessions in milliseconds.
	 * </p>
	 */
	private long timeout;

	/**
	 * <p>
	 * Minimal length of a session. All shorter sessions will be pruned.<br>
	 * Default: 2
	 * </p>
	 */
	private int minLength = 2;

	/**
	 * <p>
	 * Maximal length of a session. All longer sessions will be pruned.<br>
	 * Default: 100
	 * </p>
	 */
	private int maxLength = 100;

	/**
	 * <p>
	 * URL of the server that generated the log that is currently parsed; null
	 * if the URL is not available.<br>
	 * Default: null
	 * </p>
	 */
	private String url = null;

	/**
	 * <p>
	 * Collection of generated sequences; null until {@link #parseFile(String)}
	 * has been called.
	 * </p>
	 */
	private List<List<WebEvent>> sequences;

	/**
	 * <p>
	 * Name and path of the robot filter.
	 * </p>
	 */
	private static final String ROBOTFILTERFILE = "misc/robotfilter.txt";

	/**
	 * <p>
	 * Compiled regular expression that matches all robots contained in
	 * {@link #ROBOTFILTERFILE}; null if the filter has not been loaded yet or
	 * contains no entries.
	 * </p>
	 */
	private Pattern robotPattern = null;

	/**
	 * <p>
	 * Constructor. Creates a new WeblogParser with a default timeout of
	 * 3,600,000 milliseconds (1 hour).
	 * </p>
	 */
	public WeblogParser() {
		this(3600000);
	}

	/**
	 * <p>
	 * Constructor. Creates a new WeblogParser.
	 * </p>
	 * 
	 * @param timeout
	 *            session timeout in milliseconds
	 */
	public WeblogParser(long timeout) {
		this.timeout = timeout;
	}

	/**
	 * <p>
	 * Returns the generated event sequences.
	 * </p>
	 * 
	 * @return generated event sequences; null if {@link #parseFile(String)}
	 *         has not been called yet
	 */
	public Collection<List<WebEvent>> getSequences() {
		return sequences;
	}

	/**
	 * <p>
	 * Sets the session timeout.
	 * </p>
	 * 
	 * @param timeout
	 *            new session timeout in milliseconds
	 */
	public void setTimeout(long timeout) {
		this.timeout = timeout;
	}

	/**
	 * <p>
	 * Sets the minimal length of a session. All sessions that contain less
	 * events will be pruned.
	 * </p>
	 * 
	 * @param minLength
	 *            new minimal length
	 */
	public void setMinLength(int minLength) {
		this.minLength = minLength;
	}

	/**
	 * <p>
	 * Sets the maximal length of a session. All sessions that contain more
	 * events will be pruned.
	 * </p>
	 * 
	 * @param maxLength
	 *            new maximal length
	 */
	public void setMaxLength(int maxLength) {
		this.maxLength = maxLength;
	}

	/**
	 * <p>
	 * Sets the URL of the server from which this log was generated. Often
	 * required for replay generation.
	 * </p>
	 * 
	 * @param url
	 *            URL of the server
	 */
	public void setUrl(String url) {
		this.url = url;
	}

	/**
	 * <p>
	 * Parses a web log file. Each line is expected to consist of
	 * double-quoted fields separated by a single blank: cookie, date
	 * (<code>yyyy-MM-dd HH:mm:ss</code>), URI, referer, and optionally the
	 * user agent and the posted variables. Lines that are too short to be
	 * split, that contain fewer than three fields, or whose URI is invalid
	 * are skipped with a trace message. The resulting sessions replace any
	 * previously parsed ones and are pruned according to the configured
	 * minimal and maximal length.
	 * </p>
	 * 
	 * @param filename
	 *            name and path of the log file
	 * @throws IOException
	 *             thrown if there is a problem with reading the log file
	 * @throws FileNotFoundException
	 *             thrown if the log file is not found
	 * @throws ParseException
	 *             thrown if the date format is invalid
	 */
	public void parseFile(String filename) throws IOException,
			FileNotFoundException, ParseException {
		String[] lines = FileTools.getLinesFromFile(filename);

		// maps a cookie to the indices (into sequences) of all its sessions
		Map<String, List<Integer>> cookieSessionMap = new HashMap<String, List<Integer>>();

		SimpleDateFormat dateFormat = new SimpleDateFormat(
				"yyyy-MM-dd HH:mm:ss");
		loadRobotRegex();

		sequences = new ArrayList<List<WebEvent>>();

		int lineCounter = 0;
		for (String line : lines) {
			lineCounter++;

			// the leading and trailing quote are stripped below, which
			// requires at least two characters
			if (line.length() < 2) {
				Console.traceln("Ignored line " + lineCounter
						+ ": line is empty or too short");
				continue;
			}
			String[] values = line.substring(1, line.length() - 1).split(
					"\" \"");
			// cookie, date, and URI are mandatory
			if (values.length < 3) {
				Console.traceln("Ignored line " + lineCounter
						+ ": expected at least 3 fields but found "
						+ values.length);
				continue;
			}

			// use cookie as session identifier
			int cookieStart = values[0].lastIndexOf('.');
			String cookie = values[0].substring(cookieStart + 1);
			String dateString = values[1];
			long timestamp = dateFormat.parse(dateString).getTime();
			String uriString = values[2];
			// String ref = values[3]; // referer is not yet used!
			String agent;
			if (values.length > 4) {
				agent = values[4];
			} else {
				agent = "noagent";
			}

			List<String> postedVars = new ArrayList<String>();
			if (values.length == 6) { // post vars found
				for (String postVar : values[5].trim().split(" ")) {
					postedVars.add(postVar);
				}
			}
			if (isRobot(agent)) {
				continue;
			}

			try {
				URI uri = new URI(uriString);
				String path = uri.getPath();
				List<String> getVars = extractGetVarsFromUri(uri);

				WebEvent event = new WebEvent(url, path, timestamp,
						postedVars, getVars);

				// find session and add event
				List<Integer> sessionIds = cookieSessionMap.get(cookie);
				if (sessionIds == null) {
					// first request of this cookie: start new session; the
					// index of the new session is the current list size
					sessionIds = new ArrayList<Integer>();
					sessionIds.add(sequences.size());
					cookieSessionMap.put(cookie, sessionIds);
					sequences.add(new LinkedList<WebEvent>());
				}
				Integer lastSessionIndex = sessionIds
						.get(sessionIds.size() - 1);
				List<WebEvent> lastSession = sequences.get(lastSessionIndex);
				long lastEventTime = timestamp;
				if (!lastSession.isEmpty()) {
					lastEventTime = lastSession.get(lastSession.size() - 1)
							.getTimestamp();
				}
				if (timestamp - lastEventTime > timeout) {
					// timeout exceeded: the event opens a new session for
					// the same cookie
					sessionIds.add(sequences.size());
					List<WebEvent> newSession = new LinkedList<WebEvent>();
					newSession.add(event);
					sequences.add(newSession);
				} else {
					lastSession.add(event);
				}
			} catch (URISyntaxException e) {
				Console.traceln("Ignored line " + lineCounter + ": "
						+ e.getMessage());
			}
		}
		pruneSequences();
	}

	/**
	 * <p>
	 * Prunes sequences shorter than {@link #minLength} or longer than
	 * {@link #maxLength}. The order of the remaining sequences is preserved.
	 * </p>
	 */
	private void pruneSequences() {
		Console.traceln("" + sequences.size() + " user sequences found");
		// collect the sequences to keep in a single pass instead of
		// repeatedly removing elements by index
		List<List<WebEvent>> kept = new ArrayList<List<WebEvent>>(
				sequences.size());
		for (List<WebEvent> sequence : sequences) {
			int length = sequence.size();
			if (length >= minLength && length <= maxLength) {
				kept.add(sequence);
			}
		}
		sequences = kept;
		Console.traceln("" + sequences.size()
				+ " remaining after pruning of sequences shorter than "
				+ minLength + " or longer than " + maxLength);
	}

	/**
	 * <p>
	 * Reads {@link #ROBOTFILTERFILE} and creates a regular expression that
	 * matches all the robots defined in the file. Each non-blank line of the
	 * file is used as one alternative of the expression. The compiled
	 * expression is stored in the field {@link #robotPattern}; if the file
	 * contains no entries, the field is set to null.
	 * </p>
	 * 
	 * @throws IOException
	 *             thrown if there is a problem reading the robot filter
	 * @throws FileNotFoundException
	 *             thrown if the robot filter is not found
	 */
	private void loadRobotRegex() throws IOException, FileNotFoundException {
		String[] lines = FileTools.getLinesFromFile(ROBOTFILTERFILE);
		StringBuilder regex = new StringBuilder();
		for (String line : lines) {
			// a blank entry would yield "(.*.*)", which matches every agent
			if (line.trim().length() == 0) {
				continue;
			}
			if (regex.length() > 0) {
				regex.append('|');
			}
			regex.append("(.*").append(line).append(".*)");
		}
		// compile once here instead of once per log line
		if (regex.length() > 0) {
			robotPattern = Pattern.compile(regex.toString());
		} else {
			robotPattern = null;
		}
	}

	/**
	 * <p>
	 * Checks whether an agent is a robot.
	 * </p>
	 * 
	 * @param agent
	 *            agent that is checked
	 * @return true, if the agent is a robot; false otherwise, including the
	 *         case that no robot filter entries are available
	 */
	private boolean isRobot(String agent) {
		return robotPattern != null && robotPattern.matcher(agent).matches();
	}

	/**
	 * <p>
	 * Parses the URI and extracts the names of the GET variables that have
	 * been passed. For each <code>&amp;</code>-separated part of the query,
	 * the text before the first <code>=</code> (or the whole part, if it
	 * contains no <code>=</code>) is used as the variable name.
	 * </p>
	 * 
	 * @param uri
	 *            URI that is parsed
	 * @return a list with the names of all GET variables; empty if the URI
	 *         has no query
	 */
	private List<String> extractGetVarsFromUri(URI uri) {
		List<String> getVars = new ArrayList<String>();
		String query = uri.getQuery();
		if (query != null) {
			for (String paramPair : query.split("&")) {
				// indexOf instead of split("="): splitting a lone "=" yields
				// an empty array and would fail on element access
				int separator = paramPair.indexOf('=');
				if (separator < 0) {
					getVars.add(paramPair);
				} else {
					getVars.add(paramPair.substring(0, separator));
				}
			}
		}
		return getVars;
	}
}