AC自动机作为天朝发扬光大的算法,常用于非法字符、恶意文本匹配,比如把字符串中的“小学生”变成“***”之类,或是识别是不是违规的帖子之类的。AC自动机是基于前缀树做的飞速匹配字符的算法。
修改后的代码如下:
class TrieNode(object):
def __init__(self, value):
self.value = value
self.next = dict()
self.fail = None
self.emit = None
class AhoCorasic(object):
def __init__(self, attention_list: list = []):
self.root = TrieNode('root')
if attention_list:
self.set_attention_list(attention_list)
def set_attention_list(self, words: list):
assert isinstance(words, list) and words
root = self.root
for word in words:
node = root
for c in word:
if c not in node.next:
node.next[c] = TrieNode(c)
node = node.next[c]
if not node.emit:
node.emit = {word}
else:
node.emit.add(word)
queue = [] # 构造fail指针
queue.insert(0, (root, None))
while len(queue) > 0:
node_parent = queue.pop()
curr, parent = node_parent[0], node_parent[1]
for sub in curr.next.values():
queue.insert(0, (sub, curr))
if parent is None:
continue
elif parent is root:
curr.fail = root
else:
fail = parent.fail
while fail and curr.value not in fail.next:
fail = fail.fail
if fail:
curr.fail = fail.next[curr.value]
else:
curr.fail = root
return root
def search(self, search_str: str):
seq_list = []
node = self.root
for i, search_char in enumerate(search_str):
matched = True
while search_char not in node.next:
if not node.fail:
matched = False
node = self.root # 如果 next与 fail都查询失败,重头开始
break
node = node.fail
if not matched:
continue
node = node.next[search_char] # 不断向下查询
if node.emit: # 找到结束字符
for _ in node.emit:
from_index = i + 1 - len(_)
match_info = (from_index, _)
seq_list.append(match_info)
node = self.root
return seq_list
if __name__ == '__main__':
aho = AhoCorasic(['foo', 'obar'])
aho.set_attention_list(['man']) # 这里可以动态添加需要匹配的子字符串
print(aho.search('barfootheofobarman'))