Skip to content

Instantly share code, notes, and snippets.

@usamec
Last active September 17, 2017 22:18
Show Gist options
  • Save usamec/9fcf4e60f65d130802fd24b9ec7ecb70 to your computer and use it in GitHub Desktop.
Save usamec/9fcf4e60f65d130802fd24b9ec7ecb70 to your computer and use it in GitHub Desktop.
package com.example
import scala.annotation.tailrec
import scala.collection.immutable.Queue
case class TrieNode[Char, Value](values: List[Value], next: Map[Char, TrieNode[Char, Value]]) {
def add(str: Seq[Char], value: Value): TrieNode[Char, Value] = {
@tailrec
def addWalkDown(node: TrieNode[Char, Value], stack: List[(Char, TrieNode[Char, Value])], remaining: List[Char], value: Value): TrieNode[Char, Value] = {
remaining match {
case Nil =>
addWalkUp(node.copy(values = value :: node.values), stack)
case head :: tail =>
val son = node.next.getOrElse(head, TrieNode.empty)
addWalkDown(son, (head, node) :: stack, tail, value)
}
}
@tailrec
def addWalkUp(node: TrieNode[Char, Value], stack: List[(Char, TrieNode[Char, Value])]): TrieNode[Char, Value] = {
stack match {
case Nil => node
case (char, parentNode) :: tail =>
val newNode = parentNode.copy(next = parentNode.next + (char -> node))
addWalkUp(newNode, tail)
}
}
addWalkDown(this, Nil, str.toList, value)
}
}
object TrieNode {
def empty[C, V] = TrieNode[C, V](Nil, Map.empty)
}
/**
* Looks for multiple patterns (sequences of tokens) in sequence
* Construct matching algorithm in linear time of length of patterns
* Matches in linear time of length of matched sequence
*
* @param patterns - Seq of (pattern, value), where pattern is Seq of tokens
* @tparam Char - type of token
* @tparam Value - type of value associated with pattern
*/
class AhoCorasick[Char, Value](patterns: Seq[(Seq[Char], Value)]) {
case class FoundMatch(pos: Int, value: Value)
abstract class MatcherNode(node: TrieNode[Char, Value]) {
def failureLink: MatcherNode
def outputLink: Option[MatcherNode]
lazy val next: Map[Char, InnerMatcherNode] = node.next.map({case (key, nextNode) =>
key -> new InnerMatcherNode(nextNode, (this, key))
})
val values: List[Value] = node.values
}
case class BaseMatcherNode(node: TrieNode[Char, Value]) extends MatcherNode(node) {
override def failureLink: MatcherNode = this
override def outputLink: Option[MatcherNode] = None
}
class InnerMatcherNode(node: TrieNode[Char, Value], parent0: => (MatcherNode, Char)) extends MatcherNode(node) {
lazy val parentNode = parent0._1
lazy val parentChar = parent0._2
override lazy val failureLink: MatcherNode = parentNode match {
case base: BaseMatcherNode => base
case inner: InnerMatcherNode =>
walkUp(parentNode.failureLink, parentChar)
}
override lazy val outputLink: Option[MatcherNode] = failureLink.values match {
case Nil => failureLink.outputLink
case _ => Some(failureLink)
}
@tailrec
private def walkUp(node: MatcherNode, lookup: Char): MatcherNode = {
node.next.get(lookup) match {
case Some(node) => node
case None =>
node match {
case base: BaseMatcherNode => base
case inner: InnerMatcherNode =>
walkUp(inner.failureLink, lookup)
}
}
}
}
private val base : MatcherNode = buildMatcher(patterns)
def buildMatcher(patterns: Seq[(Seq[Char], Value)]): MatcherNode = {
val baseTrie = patterns.foldLeft(TrieNode.empty[Char, Value])({case (root, (pattern, value)) =>
root.add(pattern, value)
})
val baseMatcherNode = BaseMatcherNode(baseTrie)
// We need this, otherwise we get stack overflow in some cases
prepareLazyValues(baseMatcherNode)
baseMatcherNode
}
private def prepareLazyValues(node: MatcherNode) = {
@tailrec
def bfs(queue: Queue[MatcherNode]): Unit = {
queue match {
case head +: tail =>
// This also hits failureLink
head.outputLink
bfs(head.next.values.foldLeft(tail)({ case (q, node) => q.enqueue(node)}))
case _ =>
}
}
bfs(Queue(node))
}
def findMatches(str: Seq[Char]): Seq[FoundMatch] = {
@tailrec
def move(node: MatcherNode, chr: Char): MatcherNode = {
node.next.get(chr) match {
case Some(node) => node
case None =>
node match {
case b: BaseMatcherNode => b
case i: InnerMatcherNode => move(i.failureLink, chr)
}
}
}
@tailrec
def getOutputs(node: MatcherNode, aggr: List[Value]): List[Value] = {
node.outputLink match {
case None => node.values ::: aggr
case Some(nextNode) => getOutputs(nextNode, node.values ::: aggr)
}
}
val (_, matches) = str.zipWithIndex.foldLeft(base, List.empty[FoundMatch])({ case ((curState, foundMatches), (curChar, curIndex)) =>
// just ask for calculation
val newState = move(curState, curChar)
val newMatches = getOutputs(newState, Nil).map(FoundMatch(curIndex, _))
(newState, newMatches ::: foundMatches)
})
matches
}
}
object AhoCorasick {
def main(args: Array[String]): Unit = {
{
val a1 = "ab" * 100000
val a2 = "ab" * 99999 + "cd" * 1
val aho = new AhoCorasick[Char, Int](Seq((a1: Seq[Char], 1), (a2: Seq[Char], 2)))
println(aho.findMatches("ab" * 100002))
println(aho.findMatches("b" + "ab" * 100002 + "cd"))
}
{
val a1 = "a" * 100000
val a2 = "c" + "a" * 100005
val aho = new AhoCorasick[Char, Int](Seq((a1: Seq[Char], 1), (a2: Seq[Char], 2)))
println(aho.findMatches("c" + "a" * 100002))
println(aho.findMatches("c"+ "a" * 100007 + "c"))
}
{
// This fails on SO without prepare lazy values
val q = 5000
val rr = Seq(Range(0, q+1)) ++ Range(0, q).map(x => Range(x, q))
println("data rdy")
val aho = new AhoCorasick[Int, Int](rr.zipWithIndex)
println("built")
println(aho.findMatches(Range(0, q+47)))
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment