Aho-Corasick Algorithm

The Aho-Corasick algorithm is a highly efficient string-matching algorithm that searches for multiple patterns simultaneously in a given text. Developed by Alfred Aho and Margaret Corasick in 1975, it builds a finite-state machine from the patterns in order to minimize the number of comparisons needed to find all of their occurrences. It is widely used in various applications, including search engines, intrusion detection systems, bioinformatics, and natural language processing. The key idea behind the Aho-Corasick algorithm is to construct a trie (also known as a prefix tree) that stores all the given patterns as paths from the root to a terminal node. Each node in the trie represents an intermediate or final state, and the edges are labeled with characters from the input alphabet. To enable efficient searching, the algorithm also creates failure links between nodes, which determine the next state to transition to when a character mismatch occurs. As a result, the algorithm can process the input text in a single pass, consuming each character only once, and achieves a running time linear in the length of the text plus the number of matches reported.
from collections import deque


class Automaton:
    """Aho-Corasick automaton that finds many keywords in one pass over a text.

    The automaton is stored in ``adlist``, a list of state dictionaries indexed
    by state id (state 0 is the root).  Every state has the keys:

    * ``"value"``: the character on the edge leading into the state,
    * ``"next_states"``: ids of the child states,
    * ``"fail_state"``: id of the state to fall back to on a mismatch,
    * ``"output"``: keywords recognised when this state is reached.
    """

    def __init__(self, keywords):
        """Build the trie for ``keywords`` and compute its failure links.

        Args:
            keywords: iterable of keywords to search for.
        """
        self.adlist = [
            {"value": "", "next_states": [], "fail_state": 0, "output": []}
        ]

        for keyword in keywords:
            self.add_keyword(keyword)
        self.set_fail_transitions()

    def find_next_state(self, current_state, char):
        """Return the child of ``current_state`` entered via ``char``.

        Args:
            current_state: id of the state whose children are inspected.
            char: character to follow.

        Returns:
            The id of the matching child state, or ``None`` if there is none.
        """
        for state in self.adlist[current_state]["next_states"]:
            if char == self.adlist[state]["value"]:
                return state
        return None

    def add_keyword(self, keyword):
        """Insert ``keyword`` into the trie, creating states as needed.

        Empty keywords are ignored and a keyword that is already present is
        not registered a second time, so every occurrence is reported once.
        Failure links are not updated here; call ``set_fail_transitions``
        after adding keywords to an already built automaton.

        Args:
            keyword: the keyword to add.
        """
        if not keyword:
            # An empty keyword would live on the root and leak into deeper
            # states through the failure links with meaningless positions.
            return

        current_state = 0
        for character in keyword:
            next_state = self.find_next_state(current_state, character)
            # Compare against None explicitly instead of relying on
            # truthiness of the state id.
            if next_state is None:
                self.adlist.append(
                    {
                        "value": character,
                        "next_states": [],
                        "fail_state": 0,
                        "output": [],
                    }
                )
                next_state = len(self.adlist) - 1
                self.adlist[current_state]["next_states"].append(next_state)
            current_state = next_state

        output = self.adlist[current_state]["output"]
        if keyword not in output:
            output.append(keyword)

    def set_fail_transitions(self):
        """Compute the failure link and merged output of every state.

        States are visited breadth-first so that the failure state of a node,
        which is always shallower than the node itself, has been completed
        before its output is merged into the node.  Merging skips keywords
        that are already present, so calling this method again does not
        duplicate outputs.
        """
        queue = deque()
        # Children of the root always fall back to the root.
        for node in self.adlist[0]["next_states"]:
            self.adlist[node]["fail_state"] = 0
            queue.append(node)

        while queue:
            parent = queue.popleft()
            for child in self.adlist[parent]["next_states"]:
                queue.append(child)
                char = self.adlist[child]["value"]

                # Walk up the parent's failure chain until some state has a
                # transition on ``char`` or the root is reached.
                state = self.adlist[parent]["fail_state"]
                target = self.find_next_state(state, char)
                while target is None and state != 0:
                    state = self.adlist[state]["fail_state"]
                    target = self.find_next_state(state, char)

                fail_state = 0 if target is None else target
                self.adlist[child]["fail_state"] = fail_state

                # Every keyword recognised at the failure state is a suffix
                # of the path to ``child`` and therefore also ends here.
                output = self.adlist[child]["output"]
                for keyword in self.adlist[fail_state]["output"]:
                    if keyword not in output:
                        output.append(keyword)

    def search_in(self, string):
        """Find all occurrences of the keywords in ``string``.

        Args:
            string: the text to scan.

        Returns:
            A dict mapping each keyword that occurs to the list of start
            indices of its occurrences, in the order they are found.

        >>> A = Automaton(["what", "hat", "ver", "er"])
        >>> A.search_in("whatever, err ... , wherever")
        {'what': [0], 'hat': [1], 'ver': [5, 25], 'er': [6, 10, 22, 26]}
        >>> Automaton(["a", "a"]).search_in("aa")
        {'a': [0, 1]}
        """
        result = {}
        current_state = 0
        for i, char in enumerate(string):
            # Follow failure links until a transition on ``char`` exists or
            # the root is reached.
            next_state = self.find_next_state(current_state, char)
            while next_state is None and current_state != 0:
                current_state = self.adlist[current_state]["fail_state"]
                next_state = self.find_next_state(current_state, char)

            if next_state is None:
                # No state accepts ``char``; restart from the root.
                current_state = 0
                continue

            current_state = next_state
            for key in self.adlist[current_state]["output"]:
                # ``i`` is the index of the last character of the match.
                result.setdefault(key, []).append(i - len(key) + 1)
        return result


# When executed as a script, run the doctests embedded in this module.
if __name__ == "__main__":
    from doctest import testmod

    testmod()

LANGUAGE:

DARK MODE: