package de.ugoe.cs.eventbench.models; import java.io.Serializable; import java.security.InvalidParameterException; import java.util.Collection; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.LinkedList; import java.util.List; import java.util.Set; import de.ugoe.cs.util.StringTools; import edu.uci.ics.jung.graph.DelegateTree; import edu.uci.ics.jung.graph.Graph; import edu.uci.ics.jung.graph.Tree; /** *

* This class implements a trie, i.e., a tree of sequences that the * occurence of subsequences up to a predefined length. This length is the trie * order. *

* * @author Steffen Herbold * * @param * Type of the symbols that are stored in the trie. * * @see TrieNode */ public class Trie implements IDotCompatible, Serializable { /** *

* Id for object serialization. *

*/ private static final long serialVersionUID = 1L; /** *

* Collection of all symbols occuring in the trie. *

*/ private Collection knownSymbols; /** *

* Reference to the root of the trie. *

*/ private final TrieNode rootNode; /** *

* Contructor. Creates a new Trie. *

*/ public Trie() { rootNode = new TrieNode(); knownSymbols = new LinkedHashSet(); } /** *

* Copy-Constructor. Creates a new Trie as the copy of other. The other trie * must noch be null. *

* * @param other * Trie that is copied */ public Trie(Trie other) { if (other == null) { throw new InvalidParameterException("other trie must not be null"); } rootNode = new TrieNode(other.rootNode); knownSymbols = new LinkedHashSet(other.knownSymbols); } /** *

* Returns a collection of all symbols occuring in the trie. *

* * @return symbols occuring in the trie */ public Collection getKnownSymbols() { return new LinkedHashSet(knownSymbols); } /** *

* Trains the current trie using the given sequence and adds all subsequence * of length {@code maxOrder}. *

* * @param sequence * sequence whose subsequences are added to the trie * @param maxOrder * maximum length of the subsequences added to the trie */ public void train(List sequence, int maxOrder) { if (maxOrder < 1) { return; } IncompleteMemory latestActions = new IncompleteMemory(maxOrder); int i = 0; for (T currentEvent : sequence) { latestActions.add(currentEvent); knownSymbols.add(currentEvent); i++; if (i >= maxOrder) { add(latestActions.getLast(maxOrder)); } } int sequenceLength = sequence.size(); for (int j = maxOrder - 1; j > 0; j--) { add(sequence.subList(sequenceLength - j, sequenceLength)); } } /** *

* Adds a given subsequence to the trie and increases the counters * accordingly. *

* * @param subsequence * subsequence whose counters are increased * @see TrieNode#add(List) */ protected void add(List subsequence) { if (subsequence != null && !subsequence.isEmpty()) { knownSymbols.addAll(subsequence); subsequence = new LinkedList(subsequence); // defensive copy! T firstSymbol = subsequence.get(0); TrieNode node = getChildCreate(firstSymbol); node.add(subsequence); } } /** *

* Returns the child of the root node associated with the given symbol or * creates it if it does not exist yet. *

* * @param symbol * symbol whose node is required * @return node associated with the symbol * @see TrieNode#getChildCreate(Object) */ protected TrieNode getChildCreate(T symbol) { return rootNode.getChildCreate(symbol); } /** *

* Returns the child of the root node associated with the given symbol or * null if it does not exist. *

* * @param symbol * symbol whose node is required * @return node associated with the symbol; null if no such node exists * @see TrieNode#getChild(Object) */ protected TrieNode getChild(T symbol) { return rootNode.getChild(symbol); } /** *

* Returns the number of occurences of the given sequence. *

* * @param sequence * sequence whose number of occurences is required * @return number of occurences of the sequence */ public int getCount(List sequence) { int count = 0; TrieNode node = find(sequence); if (node != null) { count = node.getCount(); } return count; } /** *

* Returns the number of occurences of the given prefix and a symbol that * follows it.
* Convenience function to simplify usage of {@link #getCount(List)}. *

* * @param sequence * prefix of the sequence * @param follower * suffix of the sequence * @return number of occurences of the sequence * @see #getCount(List) */ public int getCount(List sequence, T follower) { List tmpSequence = new LinkedList(sequence); tmpSequence.add(follower); return getCount(tmpSequence); } /** *

* Searches the trie for a given sequence and returns the node associated * with the sequence or null if no such node is found. *

* * @param sequence * sequence that is searched for * @return node associated with the sequence * @see TrieNode#find(List) */ public TrieNode find(List sequence) { if (sequence == null || sequence.isEmpty()) { return rootNode; } List sequenceCopy = new LinkedList(sequence); TrieNode result = null; TrieNode node = getChild(sequenceCopy.get(0)); if (node != null) { sequenceCopy.remove(0); result = node.find(sequenceCopy); } return result; } /** *

* Returns a collection of all symbols that follow a given sequence in the * trie. In case the sequence is not found or no symbols follow the sequence * the result will be empty. *

* * @param sequence * sequence whose followers are returned * @return symbols following the given sequence * @see TrieNode#getFollowingSymbols() */ public Collection getFollowingSymbols(List sequence) { Collection result = new LinkedList(); TrieNode node = find(sequence); if (node != null) { result = node.getFollowingSymbols(); } return result; } /** *

* Returns the longest suffix of the given context that is contained in the * tree and whose children are leaves. *

* * @param context * context whose suffix is searched for * @return longest suffix of the context */ public List getContextSuffix(List context) { List contextSuffix; if (context != null) { contextSuffix = new LinkedList(context); // defensive copy } else { contextSuffix = new LinkedList(); } boolean suffixFound = false; while (!suffixFound) { if (contextSuffix.isEmpty()) { suffixFound = true; // suffix is the empty word } else { TrieNode node = find(contextSuffix); if (node != null) { if (!node.getFollowingSymbols().isEmpty()) { suffixFound = true; } } if (!suffixFound) { contextSuffix.remove(0); } } } return contextSuffix; } /** *

* Helper class for graph visualization of a trie. *

* * @author Steffen Herbold * @version 1.0 */ static public class Edge { } /** *

* Helper class for graph visualization of a trie. *

* * @author Steffen Herbold * @version 1.0 */ static public class TrieVertex { /** *

* Id of the vertex. *

*/ private String id; /** *

* Contructor. Creates a new TrieVertex. *

* * @param id * id of the vertex */ protected TrieVertex(String id) { this.id = id; } /** *

* Returns the id of the vertex. *

* * @see java.lang.Object#toString() */ @Override public String toString() { return id; } } /** *

* Returns a {@link Graph} representation of the trie. *

* * @return {@link Graph} representation of the trie */ protected Tree getGraph() { DelegateTree graph = new DelegateTree(); rootNode.getGraph(null, graph); return graph; } /* * (non-Javadoc) * * @see de.ugoe.cs.eventbench.models.IDotCompatible#getDotRepresentation() */ public String getDotRepresentation() { StringBuilder stringBuilder = new StringBuilder(); stringBuilder.append("digraph model {" + StringTools.ENDLINE); rootNode.appendDotRepresentation(stringBuilder); stringBuilder.append('}' + StringTools.ENDLINE); return stringBuilder.toString(); } /** *

* Returns the string representation of the root node. *

* * @see TrieNode#toString() * @see java.lang.Object#toString() */ @Override public String toString() { return rootNode.toString(); } /** *

* Returns the number of symbols contained in the trie. *

* * @return number of symbols contained in the trie */ public int getNumSymbols() { return knownSymbols.size(); } /** *

* Returns the number of trie nodes that are ancestors of a leaf. This is * the equivalent to the number of states a first-order markov model would * have. *

* * @return number of trie nodes that are ancestors of leafs. */ public int getNumLeafAncestors() { Set> ancestors = new HashSet>(); rootNode.getLeafAncestors(ancestors); return ancestors.size(); } /** *

* Returns the number of trie nodes that are leafs. *

* * @return number of leafs in the trie */ public int getNumLeafs() { return rootNode.getNumLeafs(); } /** *

* Updates the list of known symbols by replacing it with all symbols that * are found in the child nodes of the root node. This should be the same as * all symbols that are contained in the trie. *

*/ public void updateKnownSymbols() { knownSymbols = new HashSet(); for (TrieNode node : rootNode.getChildren()) { knownSymbols.add(node.getSymbol()); } } /** *

* Two Tries are defined as equal, if their {@link #rootNode} are equal. *

* * @see java.lang.Object#equals(java.lang.Object) */ @SuppressWarnings("rawtypes") @Override public boolean equals(Object other) { if (other == this) { return true; } if (other instanceof Trie) { return rootNode.equals(((Trie) other).rootNode); } return false; } /* * (non-Javadoc) * * @see java.lang.Object#hashCode() */ @Override public int hashCode() { int multiplier = 17; int hash = 42; if (rootNode != null) { hash = multiplier * hash + rootNode.hashCode(); } return hash; } }