def build_tree(str1, min_len=4):
    """Index every substring of ``str1`` that has at least ``min_len`` characters.

    Note that the number of stored keys grows quadratically with
    ``len(str1)``, since every qualifying ``(begin, end)`` pair is inserted.

    Args:
        str1: Text whose substrings are inserted into the trie.
        min_len: Minimum length of a substring for it to be indexed.

    Returns:
        A ``pygtrie.CharTrie`` mapping each indexed substring to the
        ``(begin, end)`` slice bounds of its last occurrence in ``str1``
        (later occurrences overwrite earlier ones).
    """
    tree = pygtrie.CharTrie()
    total = len(str1)
    for begin in range(total):
        # ``end`` is an exclusive slice bound, so it has to be able to reach
        # ``total``; stopping one short would silently drop every substring
        # that ends on the final character of ``str1``.
        for end in range(begin + min_len, total + 1):
            tree[str1[begin:end]] = (begin, end)
    return tree
def find_prefixes(tree, str2, min_len=4):
    """Collect the longest trie keys that occur in ``str2``.

    At every offset of ``str2`` the trie is asked for the longest stored key
    that is a prefix of the remaining text. A key is reported only when it is
    at least ``min_len`` characters long and is not merely the tail of the
    match reported just before it.

    Args:
        tree: Object exposing ``longest_prefix(text)`` whose result has a
            ``key`` attribute (``None`` when nothing matches).
        str2: Text to scan.
        min_len: Minimum length of a key for it to be reported.

    Returns:
        A set of the matching keys.
    """
    matches = set()
    # Number of characters of the most recent match that still lie ahead of
    # the current offset. A candidate no longer than this is just a suffix of
    # that earlier match, so it is skipped.
    covered = 0
    for offset in range(len(str2)):
        key = tree.longest_prefix(str2[offset:]).key
        if key is not None:
            size = len(key)
            if size >= min_len and size > covered:
                matches.add(key)
                covered = size
        # Moving one character forward shortens the remaining tail by one.
        covered -= 1
    return matches
# Demo input: two sample sentences, each repeated ten times.
str1 = 10 * "我今天特别开心啊,因为今天是个好日子,我中了 500 万彩票。"
str2 = 10 * "今天不是个好日子,因为邻居中了 500 万彩票,我今天不开心。"

# Index the substrings of the first text, then print the longest indexed
# substrings that also occur in the second text.
tree = build_tree(str1)
result = find_prefixes(tree, str2)
print(result)