Aho-Corasick Algorithm
The Aho-Corasick algorithm is a highly efficient string-matching algorithm that searches for multiple patterns simultaneously in a given text. Developed by Alfred Aho and Margaret Corasick in 1975, it builds a finite-state machine from the patterns in order to minimize the number of comparisons needed to find all of their occurrences. It is widely used in various applications, including search engines, intrusion detection systems, bioinformatics, and natural language processing.
The key idea behind the Aho-Corasick algorithm is to construct a trie (also known as a prefix tree) that stores all the given patterns as paths from the root to a terminal node. Each node in the trie represents an intermediate or final state, and the edges are labeled with characters from the input alphabet. To enable efficient searching, the algorithm also creates failure links between nodes, which determine the next state to transition to when a character mismatch occurs. As a result, the algorithm can process the input text in a single pass, reading each character only once and never backing up in the text, and it achieves a running time that is linear in the length of the text plus the number of matches reported.
from collections import deque
class Automaton:
    """Aho-Corasick automaton that finds every occurrence of a set of keywords.

    The automaton is stored in ``self.adlist``, a list of state records indexed
    by state number (state 0 is the root).  Each record is a dict with:

    * ``"value"``       -- the character on the edge leading into the state,
    * ``"next_states"`` -- list of indices of the child states,
    * ``"fail_state"``  -- index of the state to fall back to on a mismatch,
    * ``"output"``      -- keywords reported when this state is reached.

    Duplicate keywords are stored once and empty keywords are ignored.

    >>> Automaton(["he", "she", "he"]).search_in("she")
    {'she': [0], 'he': [1]}
    """

    def __init__(self, keywords):
        """Build the trie for ``keywords`` and compute the failure links.

        ``keywords`` is any iterable of strings.
        """
        self.adlist = [
            {"value": "", "next_states": [], "fail_state": 0, "output": []}
        ]
        for keyword in keywords:
            self.add_keyword(keyword)
        self.set_fail_transitions()

    def find_next_state(self, current_state, char):
        """Return the child of ``current_state`` reached via ``char``.

        Returns the child's state index, or ``None`` when ``current_state``
        has no outgoing edge labelled ``char``.
        """
        for state in self.adlist[current_state]["next_states"]:
            if self.adlist[state]["value"] == char:
                return state
        return None

    def add_keyword(self, keyword):
        """Insert ``keyword`` into the trie, creating states as needed.

        Empty keywords are ignored, and a keyword that is already registered
        on its terminal state is not registered again.  Failure links are NOT
        updated here; call ``set_fail_transitions`` after adding keywords to
        an automaton that has already been built.
        """
        if not keyword:
            # An empty pattern has no terminal state of its own; storing it on
            # the root would make it leak into other states' outputs.
            return
        current_state = 0
        for character in keyword:
            next_state = self.find_next_state(current_state, character)
            if next_state is None:
                self.adlist.append(
                    {
                        "value": character,
                        "next_states": [],
                        "fail_state": 0,
                        "output": [],
                    }
                )
                next_state = len(self.adlist) - 1
                self.adlist[current_state]["next_states"].append(next_state)
            current_state = next_state
        output = self.adlist[current_state]["output"]
        if keyword not in output:
            # Guard against duplicates so each match is reported only once.
            output.append(keyword)

    def set_fail_transitions(self):
        """Compute the failure link and merged output of every state.

        States are visited breadth-first, so the failure link of a state's
        parent is always final before the state itself is processed.  While
        merging outputs, keywords already present on a state are skipped, so
        calling this method again (e.g. after adding more keywords) does not
        duplicate outputs.
        """
        queue = deque()
        # Depth-1 states always fall back to the root.
        for node in self.adlist[0]["next_states"]:
            queue.append(node)
            self.adlist[node]["fail_state"] = 0
        while queue:
            parent = queue.popleft()
            for child in self.adlist[parent]["next_states"]:
                queue.append(child)
                char = self.adlist[child]["value"]
                # Follow the parent's failure chain until a state with an
                # outgoing edge on ``char`` is found, or the root is reached.
                state = self.adlist[parent]["fail_state"]
                target = self.find_next_state(state, char)
                while target is None and state != 0:
                    state = self.adlist[state]["fail_state"]
                    target = self.find_next_state(state, char)
                fail_state = 0 if target is None else target
                self.adlist[child]["fail_state"] = fail_state
                # Every keyword ending at the fallback state also ends here.
                own = self.adlist[child]["output"]
                inherited = self.adlist[fail_state]["output"]
                self.adlist[child]["output"] = own + [
                    key for key in inherited if key not in own
                ]

    def search_in(self, string):
        """Return a dict mapping each found keyword to its start indices.

        Keywords that do not occur in ``string`` are absent from the result.

        >>> A = Automaton(["what", "hat", "ver", "er"])
        >>> A.search_in("whatever, err ... , wherever")
        {'what': [0], 'hat': [1], 'ver': [5, 25], 'er': [6, 10, 22, 26]}
        """
        result = {}
        current_state = 0
        for index, char in enumerate(string):
            next_state = self.find_next_state(current_state, char)
            # On a mismatch, fall back along failure links until a transition
            # on ``char`` exists or the root is reached.
            while next_state is None and current_state != 0:
                current_state = self.adlist[current_state]["fail_state"]
                next_state = self.find_next_state(current_state, char)
            if next_state is None:
                current_state = 0
                continue
            current_state = next_state
            for key in self.adlist[current_state]["output"]:
                # ``index`` is the last character of the match.
                result.setdefault(key, []).append(index - len(key) + 1)
        return result
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod

    testmod()