# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-29 17:53
from abc import ABC, abstractmethod
from typing import List, Tuple, Any, Dict, Union, Sequence, Iterable, Optional
from hanlp_common.configurable import Configurable
from hanlp_common.reflection import classpath_of
from hanlp_trie.trie import Trie


class DictInterface(ABC):
    @abstractmethod
    def tokenize(self, text: Union[str, Sequence[str]]) -> List[Tuple[int, int, Any]]:
        """Implement this method to tokenize a piece of text into a list of non-intersect spans, each span is a tuple
        of ``(begin_offset, end_offset, label)``, where label is some properties related to this span and downstream
        tasks have the freedom to define what kind of labels they want.
        Args:
            text: The text to be tokenized.
        Returns:
              A list of tokens.
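
        Example (illustrative; uses the concrete :class:`~hanlp_trie.dictionary.TrieDict`, whose matching is
        longest-prefix)::

            >>> TrieDict({'自然', '自然语言'}).tokenize('自然语言处理')
            [(0, 4, True)]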
        """
        pass

    def split(self, text: Union[str, Sequence[str]]) -> List[Tuple[int, int, Any]]:
        """Like the :meth:`str.split`, this method splits a piece of text into chunks by taking the keys in this
        dictionary as delimiters. It performs longest-prefix-matching on text and split it whenever a longest key is
        matched. Unlike the :meth:`str.split`, it inserts matched keys into the results list right after where they are
        found. So that the text can be restored by joining chunks in the results list.
        Args:
            text: A piece of text.
        Returns:
            A list of chunks, each chunk is a span of ``(begin_offset, end_offset, label)``, where label is some
            properties related to this span and downstream tasks.
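
        Example (illustrative; output assumes the longest-prefix matching of
        :class:`~hanlp_trie.dictionary.TrieDict`)::

            >>> TrieDict({'自然语言'}).split('我爱自然语言处理')
            ['我爱', (2, 6, True), '处理']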
        """
        offset = 0
        spans = []
        for begin, end, label in self.tokenize(text):
            if begin > offset:
                # Keep the unmatched chunk before this key as-is
                spans.append(text[offset:begin])
            spans.append((begin, end, label))
            offset = end
        if offset < len(text):
            # Keep the trailing unmatched chunk as-is
            spans.append(text[offset:])
        return spans


class TrieDict(Trie, DictInterface, Configurable):
    def __init__(self, dictionary: Optional[Union[Dict[Iterable[str], Any], Iterable[str]]] = None) -> None:
        r"""
        A dict-like structure for fast custom dictionary strategies in tokenization and tagging. It is built with
        a dict of key-value pairs or a set of strings. When a set is passed in, it will be turned into a dict where each
        key is assigned with a boolean value ``True``.
        Args:
            dictionary: A custom dictionary of string-value pairs.
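
        Example (illustrative)::

            >>> TrieDict({'自然语言': 'NLP'}).tokenize('自然语言处理')
            [(0, 4, 'NLP')]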
        """
        super().__init__(dictionary)

    def tokenize(self, text: Union[str, Sequence[str]]) -> List[Tuple[int, int, Any]]:
        return self.parse_longest(text)

    def split_batch(self, data: List[str]) -> Tuple[List[str], List[int], List[List[Tuple[int, int, Any]]]]:
        """ A handy method to perform longest-prefix-matching on a batch of sentences. It tokenize each sentence, record
        the chunks being either a key in the dict or a span outside of the dict. The spans are then packed into a new
        batch and returned along with the following information:
            - which sentence a span belongs to
            - the matched keys along with their spans and values.
        This method bridges the gap between statistical models and rule-based gazetteers.
        It's used in conjunction with :meth:`~hanlp_trie.dictionary.TrieDict.merge_batch`.
        Args:
            data: A batch of sentences.
        Returns:
            A tuple of the new batch, the belonging information and the keys.
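
        Example (illustrative)::

            >>> TrieDict({'自然语言'}).split_batch(['我爱自然语言处理'])
            (['我爱', '处理'], [0, 0], [[(2, 6, True)]])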
        """
        new_data, new_data_belongs, parts = [], [], []
        for idx, sent in enumerate(data):
            parts.append([])
            found = self.tokenize(sent)
            if found:
                pre_start = 0
                for start, end, info in found:
                    if start > pre_start:
                        # Pack the out-of-dict span into the new batch
                        new_data.append(sent[pre_start:start])
                        new_data_belongs.append(idx)
                    pre_start = end
                    parts[idx].append((start, end, info))
                if pre_start != len(sent):
                    # Pack the trailing out-of-dict span, if any
                    new_data.append(sent[pre_start:])
                    new_data_belongs.append(idx)
            else:
                # No key matched, so the whole sentence passes through
                new_data.append(sent)
                new_data_belongs.append(idx)
        return new_data, new_data_belongs, parts

    @staticmethod
    def merge_batch(data, new_outputs, new_data_belongs, parts):
        """ A helper method to merge the outputs of split batch back by concatenating the output per span with the key
        used to split it. It's used in conjunction with :meth:`~hanlp_trie.dictionary.TrieDict.split_batch`.
        Args:
            data: Split batch.
            new_outputs: Outputs of the split batch.
            new_data_belongs: Belonging information.
            parts: The keys.
        Returns:
            Merged outputs.
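
        Example (illustrative; ``new_outputs`` mimics a tokenizer run on the split batch)::

            >>> d = TrieDict({'自然语言'})
            >>> data = ['我爱自然语言处理']
            >>> new_data, belongs, parts = d.split_batch(data)
            >>> TrieDict.merge_batch(data, [['我', '爱'], ['处理']], belongs, parts)
            [['我', '爱', '自然语言', '处理']]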
        """
        outputs = []
        # Group the outputs by the sentence each of them belongs to
        segments = [[] for _ in range(len(data))]
        for o, b in zip(new_outputs, new_data_belongs):
            segments[b].append(o)
        for s, p, sent in zip(segments, parts, data):
            if p:
                dst = []
                offset = 0
                for start, end, info in p:
                    # Consume the outputs of out-of-dict spans preceding this key
                    while offset < start:
                        head = s.pop(0)
                        offset += sum(len(token) for token in head)
                        dst += head
                    # Splice in the matched key: a list value is inlined, a str value
                    # substitutes the span, and any other value keeps the surface form
                    if isinstance(info, list):
                        dst += info
                    elif isinstance(info, str):
                        dst.append(info)
                    else:
                        dst.append(sent[start:end])
                    offset = end
                # Whatever remains belongs to the trailing out-of-dict span
                if s:
                    assert len(s) == 1
                    dst += s[0]
                outputs.append(dst)
            else:
                # No key matched this sentence, so its output maps back unchanged
                outputs.append(s[0])
        return outputs

    @property
    def config(self):
        return {
            'classpath': classpath_of(self),
            'dictionary': dict(self.items())
        }


class TupleTrieDict(TrieDict):
    def __init__(self, dictionary: Optional[Union[Dict[Iterable[str], Any], Iterable[str]]] = None) -> None:
        r"""
        A dict-like structure for fast custom dictionary strategies in tokenization and tagging. It is built with
        a dict of key-value pairs or a set of strings. When a set is passed in, it will be turned into a dict where each
        key is assigned with a boolean value ``True``. In comparison to ``TrieDict``, ``TupleTrieDict`` additionally
        supports serializing/deserializing tuple-as-keys dict.
        Args:
            dictionary: A custom dictionary of string-value pairs.
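
        Example (illustrative; a serialized list of ``(key, value)`` pairs is restored into a tuple-keyed
        dict)::

            >>> d = TupleTrieDict([(('New', 'York'), 'CITY')])
            >>> d.parse_longest(['I', 'love', 'New', 'York'])
            [(2, 4, 'CITY')]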
        """
        if isinstance(dictionary, list) and dictionary and isinstance(dictionary[0], (list, tuple)):
            # Restore a serialized list of (key, value) pairs into a tuple-keyed dict
            _d = dict()
            for k, v in dictionary:
                _d[tuple(k)] = v
            dictionary = _d
        super().__init__(dictionary)

    @property
    def config(self):
        return {
            'classpath': classpath_of(self),
            'dictionary': list(self.items(prefix=()))
        }

    def parse_longest(self, text: Sequence[str]) -> List[Tuple[int, int, Any]]:
        """Longest-prefix-matching which tries to match the longest keyword sequentially from the head of the text till
        its tail. By definition, the matches won't overlap with each other.
        Args:
            text: A piece of text. In HanLP's design, it doesn't really matter whether this is a str or a list of str.
                The trie will transit on either types properly, which means a list of str simply defines a list of
                transition criteria while a str defines each criterion as a character.
        Returns:
            A tuple of ``(begin, end, value)``.
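
        Example (illustrative; the longer key wins)::

            >>> TupleTrieDict({('商品',): 1, ('商品', '和'): 2}).parse_longest(['商品', '和', '服务'])
            [(0, 2, 2)]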
        """
        found = []
        i = 0
        while i < len(text):
            # Try to start a match at offset i
            state = self.transit(text[i:i + 1])
            if state:
                to = i + 1
                end = to
                value = state._value
                # Greedily extend the match as far as the trie allows, remembering
                # the longest prefix that carries a value
                for to in range(i + 1, len(text)):
                    state = state.transit(text[to:to + 1])
                    if not state:
                        break
                    if state._value is not None:
                        value = state._value
                        end = to + 1
                if value is not None:
                    found.append((i, end, value))
                    # Resume matching right after the matched span
                    i = end - 1
            i += 1
        return found