Last active
September 17, 2017 22:18
-
-
Save usamec/9fcf4e60f65d130802fd24b9ec7ecb70 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.example | |
import scala.annotation.tailrec | |
import scala.collection.immutable.Queue | |
case class TrieNode[Char, Value](values: List[Value], next: Map[Char, TrieNode[Char, Value]]) { | |
def add(str: Seq[Char], value: Value): TrieNode[Char, Value] = { | |
@tailrec | |
def addWalkDown(node: TrieNode[Char, Value], stack: List[(Char, TrieNode[Char, Value])], remaining: List[Char], value: Value): TrieNode[Char, Value] = { | |
remaining match { | |
case Nil => | |
addWalkUp(node.copy(values = value :: node.values), stack) | |
case head :: tail => | |
val son = node.next.getOrElse(head, TrieNode.empty) | |
addWalkDown(son, (head, node) :: stack, tail, value) | |
} | |
} | |
@tailrec | |
def addWalkUp(node: TrieNode[Char, Value], stack: List[(Char, TrieNode[Char, Value])]): TrieNode[Char, Value] = { | |
stack match { | |
case Nil => node | |
case (char, parentNode) :: tail => | |
val newNode = parentNode.copy(next = parentNode.next + (char -> node)) | |
addWalkUp(newNode, tail) | |
} | |
} | |
addWalkDown(this, Nil, str.toList, value) | |
} | |
} | |
object TrieNode { | |
def empty[C, V] = TrieNode[C, V](Nil, Map.empty) | |
} | |
/** | |
* Looks for multiple patterns (sequences of tokens) in sequence | |
* Construct matching algorithm in linear time of length of patterns | |
* Matches in linear time of length of matched sequence | |
* | |
* @param patterns - Seq of (pattern, value), where pattern is Seq of tokens | |
* @tparam Char - type of token | |
* @tparam Value - type of value associated with pattern | |
*/ | |
class AhoCorasick[Char, Value](patterns: Seq[(Seq[Char], Value)]) { | |
case class FoundMatch(pos: Int, value: Value) | |
abstract class MatcherNode(node: TrieNode[Char, Value]) { | |
def failureLink: MatcherNode | |
def outputLink: Option[MatcherNode] | |
lazy val next: Map[Char, InnerMatcherNode] = node.next.map({case (key, nextNode) => | |
key -> new InnerMatcherNode(nextNode, (this, key)) | |
}) | |
val values: List[Value] = node.values | |
} | |
case class BaseMatcherNode(node: TrieNode[Char, Value]) extends MatcherNode(node) { | |
override def failureLink: MatcherNode = this | |
override def outputLink: Option[MatcherNode] = None | |
} | |
class InnerMatcherNode(node: TrieNode[Char, Value], parent0: => (MatcherNode, Char)) extends MatcherNode(node) { | |
lazy val parentNode = parent0._1 | |
lazy val parentChar = parent0._2 | |
override lazy val failureLink: MatcherNode = parentNode match { | |
case base: BaseMatcherNode => base | |
case inner: InnerMatcherNode => | |
walkUp(parentNode.failureLink, parentChar) | |
} | |
override lazy val outputLink: Option[MatcherNode] = failureLink.values match { | |
case Nil => failureLink.outputLink | |
case _ => Some(failureLink) | |
} | |
@tailrec | |
private def walkUp(node: MatcherNode, lookup: Char): MatcherNode = { | |
node.next.get(lookup) match { | |
case Some(node) => node | |
case None => | |
node match { | |
case base: BaseMatcherNode => base | |
case inner: InnerMatcherNode => | |
walkUp(inner.failureLink, lookup) | |
} | |
} | |
} | |
} | |
private val base : MatcherNode = buildMatcher(patterns) | |
def buildMatcher(patterns: Seq[(Seq[Char], Value)]): MatcherNode = { | |
val baseTrie = patterns.foldLeft(TrieNode.empty[Char, Value])({case (root, (pattern, value)) => | |
root.add(pattern, value) | |
}) | |
val baseMatcherNode = BaseMatcherNode(baseTrie) | |
// We need this, otherwise we get stack overflow in some cases | |
prepareLazyValues(baseMatcherNode) | |
baseMatcherNode | |
} | |
private def prepareLazyValues(node: MatcherNode) = { | |
@tailrec | |
def bfs(queue: Queue[MatcherNode]): Unit = { | |
queue match { | |
case head +: tail => | |
// This also hits failureLink | |
head.outputLink | |
bfs(head.next.values.foldLeft(tail)({ case (q, node) => q.enqueue(node)})) | |
case _ => | |
} | |
} | |
bfs(Queue(node)) | |
} | |
def findMatches(str: Seq[Char]): Seq[FoundMatch] = { | |
@tailrec | |
def move(node: MatcherNode, chr: Char): MatcherNode = { | |
node.next.get(chr) match { | |
case Some(node) => node | |
case None => | |
node match { | |
case b: BaseMatcherNode => b | |
case i: InnerMatcherNode => move(i.failureLink, chr) | |
} | |
} | |
} | |
@tailrec | |
def getOutputs(node: MatcherNode, aggr: List[Value]): List[Value] = { | |
node.outputLink match { | |
case None => node.values ::: aggr | |
case Some(nextNode) => getOutputs(nextNode, node.values ::: aggr) | |
} | |
} | |
val (_, matches) = str.zipWithIndex.foldLeft(base, List.empty[FoundMatch])({ case ((curState, foundMatches), (curChar, curIndex)) => | |
// just ask for calculation | |
val newState = move(curState, curChar) | |
val newMatches = getOutputs(newState, Nil).map(FoundMatch(curIndex, _)) | |
(newState, newMatches ::: foundMatches) | |
}) | |
matches | |
} | |
} | |
object AhoCorasick { | |
def main(args: Array[String]): Unit = { | |
{ | |
val a1 = "ab" * 100000 | |
val a2 = "ab" * 99999 + "cd" * 1 | |
val aho = new AhoCorasick[Char, Int](Seq((a1: Seq[Char], 1), (a2: Seq[Char], 2))) | |
println(aho.findMatches("ab" * 100002)) | |
println(aho.findMatches("b" + "ab" * 100002 + "cd")) | |
} | |
{ | |
val a1 = "a" * 100000 | |
val a2 = "c" + "a" * 100005 | |
val aho = new AhoCorasick[Char, Int](Seq((a1: Seq[Char], 1), (a2: Seq[Char], 2))) | |
println(aho.findMatches("c" + "a" * 100002)) | |
println(aho.findMatches("c"+ "a" * 100007 + "c")) | |
} | |
{ | |
// This fails on SO without prepare lazy values | |
val q = 5000 | |
val rr = Seq(Range(0, q+1)) ++ Range(0, q).map(x => Range(x, q)) | |
println("data rdy") | |
val aho = new AhoCorasick[Int, Int](rr.zipWithIndex) | |
println("built") | |
println(aho.findMatches(Range(0, q+47))) | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment