# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-04 23:46
from typing import Dict, Any, List, Tuple, Sequence, Union, Iterable, Optional
[docs]class Node(object):
def __init__(self, value=None) -> None:
"""A node in a trie tree.
Args:
value: The value associated with this node.
"""
self._children = {}
self._value = value
def _get_or_add_child(self, char):
child = self._children.get(char)
if child is None:
child = Node(None)
self._children[char] = child
return child
[docs] def transit(self, key):
"""Transit the state of a Deterministic Finite Automata (DFA) with key.
Args:
key: A sequence of criterion (tokens or characters) used to transit to a new state.
Returns:
A new state if the transition succeeded, otherwise ``None``.
"""
state = self
for char in key:
state = state._children.get(char)
if state is None:
break
return state
def _walk(self, prefix: Union[str, tuple], ordered=False):
for char, child in sorted(self._children.items()) if ordered else self._children.items():
prefix_new = prefix + (char if isinstance(prefix, str) else (char,))
if child._value:
yield prefix_new, child._value
yield from child._walk(prefix_new)
[docs]class Trie(Node):
def __init__(self, tokens: Optional[Union[Dict[str, Any], Iterable[str]]] = None) -> None:
"""A referential implementation of the trie (:cite:`10.1145/1457838.1457895`) structure. It stores a dict by
assigning each key/value pair a :class:`~hanlp_trie.trie.Node` in a trie tree. It provides get/set/del/items
methods just like a :class:`dict` does. Additionally, it also provides longest-prefix-matching and keywords
lookup against a piece of text, which are very helpful in rule-based Natural Language Processing.
Args:
tokens: A set of keys or a dict mapping.
"""
super().__init__()
self._size = 0
if tokens:
if isinstance(tokens, dict):
for k, v in tokens.items():
self[k] = v
else:
for k in tokens:
self[k] = True
def __contains__(self, key):
return self[key] is not None
def __getitem__(self, key):
state = self.transit(key)
if state is None:
return None
return state._value
def __setitem__(self, key, value):
state = self
for char in key[:-1]:
state = state._get_or_add_child(char)
leaf = state._get_or_add_child(key[-1])
if leaf._value is None:
self._size += 1
leaf._value = value
def __delitem__(self, key):
state = self.transit(key)
if state is not None:
state._value = None
self._size -= 1
def update(self, dic: Dict[str, Any]):
for k, v in dic.items():
self[k] = v
return self
[docs] def parse(self, text: Sequence[str]) -> List[Tuple[int, int, Any]]:
"""Keywords lookup which takes a piece of text as input, and lookup all occurrences of keywords in it. These
occurrences can overlap with each other.
Args:
text: A piece of text. In HanLP's design, it doesn't really matter whether this is a str or a list of str.
The trie will transit on either types properly, which means a list of str simply defines a list of
transition criteria while a str defines each criterion as a character.
Returns:
A tuple of ``(begin, end, value)``.
"""
found = []
for i in range(len(text)):
state = self
for j in range(i, len(text)):
state = state.transit(text[j])
if state:
if state._value is not None:
found.append((i, j + 1, state._value))
else:
break
return found
[docs] def parse_longest(self, text: Sequence[str]) -> List[Tuple[int, int, Any]]:
"""Longest-prefix-matching which tries to match the longest keyword sequentially from the head of the text till
its tail. By definition, the matches won't overlap with each other.
Args:
text: A piece of text. In HanLP's design, it doesn't really matter whether this is a str or a list of str.
The trie will transit on either types properly, which means a list of str simply defines a list of
transition criteria while a str defines each criterion as a character.
Returns:
A tuple of ``(begin, end, value)``.
"""
found = []
i = 0
while i < len(text):
state = self.transit(text[i])
if state:
to = i + 1
end = to
value = state._value
for to in range(i + 1, len(text)):
state = state.transit(text[to])
if not state:
break
if state._value is not None:
value = state._value
end = to + 1
if value is not None:
found.append((i, end, value))
i = end - 1
i += 1
return found
def items(self, ordered=False, prefix=''):
yield from self._walk(prefix, ordered)
def __len__(self):
return self._size
def __bool__(self):
return bool(len(self))