Source code for hanlp_trie.dictionary

# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-29 17:53
from abc import ABC, abstractmethod
from typing import List, Tuple, Any, Dict, Union, Sequence, Iterable, Optional

from hanlp_common.configurable import Configurable
from hanlp_common.reflection import classpath_of
from hanlp_trie.trie import Trie


class DictInterface(ABC):
    @abstractmethod
    def tokenize(self, text: Union[str, Sequence[str]]) -> List[Tuple[int, int, Any]]:
        """Implement this method to tokenize a piece of text into a list of non-intersecting spans. Each span
        is a tuple of ``(begin_offset, end_offset, label)``, where label holds some properties of the span,
        and downstream tasks have the freedom to define what kind of labels they want.

        Args:
            text: The text to be tokenized.

        Returns:
            A list of tokens.
        """
        pass

    def split(self, text: Union[str, Sequence[str]]) -> List[Tuple[int, int, Any]]:
        """Like :meth:`str.split`, this method splits a piece of text into chunks by taking the keys in this
        dictionary as delimiters. It performs longest-prefix-matching on the text and splits it whenever a
        longest key is matched. Unlike :meth:`str.split`, it inserts matched keys into the results list right
        where they are found, so that the text can be restored by joining the chunks in the results list.

        Args:
            text: A piece of text.

        Returns:
            A list of chunks. Each matched chunk is a span of ``(begin_offset, end_offset, label)``, where
            label holds some properties of the span used by downstream tasks; unmatched chunks are kept as
            plain slices of the text.
        """
        offset = 0
        spans = []
        for begin, end, label in self.tokenize(text):
            if begin > offset:
                spans.append(text[offset:begin])
            spans.append((begin, end, label))
            offset = end
        if offset < len(text):
            spans.append(text[offset:])
        return spans
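
# A minimal usage sketch (illustration only, not part of the module): with the concrete
# ``TrieDict`` defined below, ``split`` interleaves matched spans with the unmatched text
# in between, so the original text can be restored by joining the surface forms:
#
#     >>> d = TrieDict({'商品': 'goods', '服务': 'service'})
#     >>> d.split('商品和服务')
#     [(0, 2, 'goods'), '和', (3, 5, 'service')]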


class TrieDict(Trie, DictInterface, Configurable):
    def __init__(self, dictionary: Optional[Union[Dict[Iterable[str], Any], Iterable[str]]] = None) -> None:
        r"""A dict-like structure for fast custom-dictionary strategies in tokenization and tagging. It is
        built from a dict of key-value pairs or a set of strings. When a set is passed in, it is turned into
        a dict where each key is assigned the boolean value ``True``.

        Args:
            dictionary: A custom dictionary of string-value pairs.
        """
        super().__init__(dictionary)
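
    # A minimal sketch (illustration only): a set of keys behaves like a dict mapping
    # each key to ``True``:
    #
    #     >>> TrieDict({'商品'}).tokenize('商品和服务')
    #     [(0, 2, True)]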

    def tokenize(self, text: Union[str, Sequence[str]]) -> List[Tuple[int, int, Any]]:
        return self.parse_longest(text)
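
    # A minimal sketch of the longest-matching behavior (illustration only): when one key
    # is a prefix of another, the longest match wins and shorter keys never split it:
    #
    #     >>> d = TrieDict({'高': 'high', '高血压': 'hypertension'})
    #     >>> d.tokenize('高血压患者')
    #     [(0, 3, 'hypertension')]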

    def split_batch(self, data: List[str]) -> Tuple[List[str], List[int], List[List[Tuple[int, int, Any]]]]:
        """A handy method to perform longest-prefix-matching on a batch of sentences. It tokenizes each
        sentence, recording each chunk as either a key in the dict or a span outside of the dict. The
        out-of-dict spans are then packed into a new batch and returned along with the following information:

        - which sentence each span belongs to;
        - the matched keys along with their spans and values.

        This method bridges the gap between statistical models and rule-based gazetteers. It's used in
        conjunction with :meth:`~hanlp_trie.dictionary.TrieDict.merge_batch`.

        Args:
            data: A batch of sentences.

        Returns:
            A tuple of the new batch, the belonging information and the keys.
        """
        new_data, new_data_belongs, parts = [], [], []
        for idx, sent in enumerate(data):
            parts.append([])
            found = self.tokenize(sent)
            if found:
                pre_start = 0
                for start, end, info in found:
                    if start > pre_start:
                        new_data.append(sent[pre_start:start])
                        new_data_belongs.append(idx)
                    pre_start = end
                    parts[idx].append((start, end, info))
                if pre_start != len(sent):
                    new_data.append(sent[pre_start:])
                    new_data_belongs.append(idx)
            else:
                new_data.append(sent)
                new_data_belongs.append(idx)
        return new_data, new_data_belongs, parts
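
    # A minimal sketch (illustration only): spans covered by the dict are held out in
    # ``parts`` while only the remaining text goes downstream as the new batch:
    #
    #     >>> d = TrieDict({'自然语言'})
    #     >>> d.split_batch(['自然语言处理', '深度学习'])
    #     (['处理', '深度学习'], [0, 1], [[(0, 4, True)], []])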

    @staticmethod
    def merge_batch(data, new_outputs, new_data_belongs, parts):
        """A helper method to merge the outputs of :meth:`~hanlp_trie.dictionary.TrieDict.split_batch` back,
        by concatenating the output for each span with the key used to split it. It's used in conjunction
        with :meth:`~hanlp_trie.dictionary.TrieDict.split_batch`.

        Args:
            data: Split batch.
            new_outputs: Outputs of the split batch.
            new_data_belongs: Belonging information.
            parts: The keys.

        Returns:
            Merged outputs.
        """
        outputs = []
        segments = []
        for idx in range(len(data)):
            segments.append([])
        # Group the outputs by the sentence they belong to.
        for o, b in zip(new_outputs, new_data_belongs):
            dst = segments[b]
            dst.append(o)
        for s, p, sent in zip(segments, parts, data):
            s: list = s
            if p:
                dst = []
                offset = 0
                for start, end, info in p:
                    # Flush the outputs of out-of-dict spans preceding this key.
                    while offset < start:
                        head = s.pop(0)
                        offset += sum(len(token) for token in head)
                        dst += head
                    # Splice the key back in, preferring its value when it is a list or str.
                    if isinstance(info, list):
                        dst += info
                    elif isinstance(info, str):
                        dst.append(info)
                    else:
                        dst.append(sent[start:end])
                    offset = end
                if s:
                    assert len(s) == 1
                    dst += s[0]
                outputs.append(dst)
            else:
                outputs.append(s[0])
        return outputs
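
    # A minimal round-trip sketch (illustration only), where ``new_outputs`` stands in for
    # the outputs of a hypothetical tokenizer run over the new batch from ``split_batch``:
    #
    #     >>> d = TrieDict({'自然语言'})
    #     >>> data = ['自然语言处理', '深度学习']
    #     >>> new_data, belongs, parts = d.split_batch(data)
    #     >>> new_outputs = [['处理'], ['深度', '学习']]  # hypothetical tokenizer outputs
    #     >>> TrieDict.merge_batch(data, new_outputs, belongs, parts)
    #     [['自然语言', '处理'], ['深度', '学习']]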

    @property
    def config(self):
        return {
            'classpath': classpath_of(self),
            'dictionary': dict(self.items())
        }


class TupleTrieDict(TrieDict):
    def __init__(self, dictionary: Optional[Union[Dict[Iterable[str], Any], Iterable[str]]] = None) -> None:
        r"""A dict-like structure for fast custom-dictionary strategies in tokenization and tagging. It is
        built from a dict of key-value pairs or a set of strings. When a set is passed in, it is turned into
        a dict where each key is assigned the boolean value ``True``. In comparison to ``TrieDict``,
        ``TupleTrieDict`` additionally supports serializing/deserializing dicts whose keys are tuples.

        Args:
            dictionary: A custom dictionary of string-value pairs.
        """
        if isinstance(dictionary, list) and dictionary and isinstance(dictionary[0], (list, tuple)):
            _d = dict()
            for k, v in dictionary:
                _d[tuple(k)] = v
            dictionary = _d
        super().__init__(dictionary)

    @property
    def config(self):
        return {
            'classpath': classpath_of(self),
            'dictionary': list(self.items(prefix=()))
        }

    def parse_longest(self, text: Sequence[str]) -> List[Tuple[int, int, Any]]:
        """Longest-prefix-matching which tries to match the longest keyword sequentially from the head of the
        text till its tail. By definition, the matches won't overlap with each other.

        Args:
            text: A piece of text. In HanLP's design, it doesn't really matter whether this is a str or a
                list of str. The trie will transit on either type properly, which means a list of str simply
                defines a list of transition criteria while a str defines each criterion as a character.

        Returns:
            A list of tuples of ``(begin, end, value)``.
        """
        found = []
        i = 0
        while i < len(text):
            state = self.transit(text[i:i + 1])
            if state:
                to = i + 1
                end = to
                value = state._value
                for to in range(i + 1, len(text)):
                    state = state.transit(text[to:to + 1])
                    if not state:
                        break
                    if state._value is not None:
                        value = state._value
                        end = to + 1
                if value is not None:
                    found.append((i, end, value))
                    i = end - 1
            i += 1
        return found
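
# A minimal sketch (illustration only): with tuple keys, the trie transits on whole tokens
# rather than characters, so keys can span pre-tokenized text:
#
#     >>> d = TupleTrieDict({('New', 'York'): 'CITY'})
#     >>> d.parse_longest(['I', 'love', 'New', 'York'])
#     [(2, 4, 'CITY')]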