Source code for hanlp_trie.trie

# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-04 23:46
from typing import Dict, Any, List, Tuple, Sequence, Union, Iterable, Optional


[docs]class Node(object): def __init__(self, value=None) -> None: """A node in a trie tree. Args: value: The value associated with this node. """ self._children = {} self._value = value def _get_or_add_child(self, char): child = self._children.get(char) if child is None: child = Node(None) self._children[char] = child return child
[docs] def transit(self, key): """Transit the state of a Deterministic Finite Automata (DFA) with key. Args: key: A sequence of criterion (tokens or characters) used to transit to a new state. Returns: A new state if the transition succeeded, otherwise ``None``. """ state = self for char in key: state = state._children.get(char) if state is None: break return state
def _walk(self, prefix: Union[str, tuple], ordered=False): for char, child in sorted(self._children.items()) if ordered else self._children.items(): prefix_new = prefix + (char if isinstance(prefix, str) else (char,)) if child._value: yield prefix_new, child._value yield from child._walk(prefix_new)
[docs]class Trie(Node): def __init__(self, tokens: Optional[Union[Dict[str, Any], Iterable[str]]] = None) -> None: """A referential implementation of the trie (:cite:`10.1145/1457838.1457895`) structure. It stores a dict by assigning each key/value pair a :class:`~hanlp_trie.trie.Node` in a trie tree. It provides get/set/del/items methods just like a :class:`dict` does. Additionally, it also provides longest-prefix-matching and keywords lookup against a piece of text, which are very helpful in rule-based Natural Language Processing. Args: tokens: A set of keys or a dict mapping. """ super().__init__() self._size = 0 if tokens: if isinstance(tokens, dict): for k, v in tokens.items(): self[k] = v else: for k in tokens: self[k] = True def __contains__(self, key): return self[key] is not None def __getitem__(self, key): state = self.transit(key) if state is None: return None return state._value def __setitem__(self, key, value): state = self for char in key[:-1]: state = state._get_or_add_child(char) leaf = state._get_or_add_child(key[-1]) if leaf._value is None: self._size += 1 leaf._value = value def __delitem__(self, key): state = self.transit(key) if state is not None: state._value = None self._size -= 1 def update(self, dic: Dict[str, Any]): for k, v in dic.items(): self[k] = v return self
[docs] def parse(self, text: Sequence[str]) -> List[Tuple[int, int, Any]]: """Keywords lookup which takes a piece of text as input, and lookup all occurrences of keywords in it. These occurrences can overlap with each other. Args: text: A piece of text. In HanLP's design, it doesn't really matter whether this is a str or a list of str. The trie will transit on either types properly, which means a list of str simply defines a list of transition criteria while a str defines each criterion as a character. Returns: A tuple of ``(begin, end, value)``. """ found = [] for i in range(len(text)): state = self for j in range(i, len(text)): state = state.transit(text[j]) if state: if state._value is not None: found.append((i, j + 1, state._value)) else: break return found
[docs] def parse_longest(self, text: Sequence[str]) -> List[Tuple[int, int, Any]]: """Longest-prefix-matching which tries to match the longest keyword sequentially from the head of the text till its tail. By definition, the matches won't overlap with each other. Args: text: A piece of text. In HanLP's design, it doesn't really matter whether this is a str or a list of str. The trie will transit on either types properly, which means a list of str simply defines a list of transition criteria while a str defines each criterion as a character. Returns: A tuple of ``(begin, end, value)``. """ found = [] i = 0 while i < len(text): state = self.transit(text[i]) if state: to = i + 1 end = to value = state._value for to in range(i + 1, len(text)): state = state.transit(text[to]) if not state: break if state._value is not None: value = state._value end = to + 1 if value is not None: found.append((i, end, value)) i = end - 1 i += 1 return found
def items(self, ordered=False, prefix=''): yield from self._walk(prefix, ordered) def __len__(self): return self._size def __bool__(self): return bool(len(self))