# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-19 20:50
from typing import Union, List
from hanlp_common.structure import SerializableDict
from hanlp_common.visualization import pretty_tree_horizontal, make_table, markdown_table
class CoNLLWord(SerializableDict):
def __init__(self, id, form, lemma=None, cpos=None, pos=None, feats=None, head=None, deprel=None, phead=None,
pdeprel=None):
"""CoNLL (:cite:`buchholz-marsi-2006-conll`) format template, see http://anthology.aclweb.org/W/W06/W06-2920.pdf
Args:
id (int):
Token counter, starting at 1 for each new sentence.
form (str):
Word form or punctuation symbol.
lemma (str):
Lemma or stem (depending on the particular treebank) of word form, or an underscore if not available.
cpos (str):
Coarse-grained part-of-speech tag, where the tagset depends on the treebank.
pos (str):
Fine-grained part-of-speech tag, where the tagset depends on the treebank.
feats (str):
Unordered set of syntactic and/or morphological features (depending on the particular treebank),
or an underscore if not available.
head (Union[int, List[int]]):
Head of the current token, which is either a value of ID,
or zero (’0’) if the token links to the virtual root node of the sentence.
deprel (Union[str, List[str]]):
Dependency relation to the HEAD.
phead (int):
Projective head of current token, which is either a value of ID or zero (’0’),
or an underscore if not available.
pdeprel (str):
Dependency relation to the PHEAD, or an underscore if not available.
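        Examples:
            A minimal usage sketch (illustrative values only; per ``__str__``, unset fields
            serialize as underscores)::

                word = CoNLLWord(1, 'Hello', cpos='NN', head=0, deprel='root')
                print(word)  # tab-separated: 1  Hello  _  NN  _  _  0  root  _  _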
"""
self.id = sanitize_conll_int_value(id)
self.form = form
self.cpos = cpos
self.pos = pos
self.head = sanitize_conll_int_value(head)
self.deprel = deprel
self.lemma = lemma
self.feats = feats
self.phead = phead
self.pdeprel = pdeprel
def __str__(self):
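        # When multiple heads are stored (see from_str, which merges duplicated token IDs),
        # emit one CoNLL-X line per (head, deprel) pair.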
if isinstance(self.head, list):
return '\n'.join('\t'.join(['_' if v is None else v for v in values]) for values in [
[str(self.id), self.form, self.lemma, self.cpos, self.pos, self.feats,
None if head is None else str(head), deprel, self.phead, self.pdeprel] for head, deprel in
zip(self.head, self.deprel)
])
values = [str(self.id), self.form, self.lemma, self.cpos, self.pos, self.feats,
None if self.head is None else str(self.head), self.deprel, self.phead, self.pdeprel]
return '\t'.join(['_' if v is None else v for v in values])
@property
def nonempty_fields(self):
"""
Get the values of nonempty fields as a list.
"""
return list(f for f in
[self.form, self.lemma, self.cpos, self.pos, self.feats, self.head, self.deprel, self.phead,
self.pdeprel] if f)
    def get_pos(self):
"""
        Get the most precise POS tag available for this word.
Returns: ``self.pos`` or ``self.cpos``.
"""
return self.pos or self.cpos
class CoNLLUWord(SerializableDict):
def __init__(self, id: Union[int, str], form, lemma=None, upos=None, xpos=None, feats=None, head=None, deprel=None,
deps=None,
misc=None):
"""CoNLL-U format template, see https://universaldependencies.org/format.html
Args:
id (Union[int, str]):
Token counter, starting at 1 for each new sentence.
form (Union[str, None]):
Word form or punctuation symbol.
lemma (str):
Lemma or stem (depending on the particular treebank) of word form, or an underscore if not available.
upos (str):
Universal part-of-speech tag.
xpos (str):
Language-specific part-of-speech tag; underscore if not available.
feats (str):
List of morphological features from the universal feature inventory or from a defined language-specific extension; underscore if not available.
head (int):
Head of the current token, which is either a value of ID,
or zero (’0’) if the token links to the virtual root node of the sentence.
deprel (str):
Dependency relation to the HEAD.
            deps (Union[List[Tuple[int, str]], str]):
                Enhanced dependency graph in the form of a list of head-deprel pairs,
                or an underscore if not available.
            misc (str):
                Any other annotation, or an underscore if not available.
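        Examples:
            A minimal usage sketch (illustrative values only). Passing a list of heads
            together with a list of relations stores them as ``deps`` pairs instead of a
            single ``head``::

                word = CoNLLUWord(1, 'Hello', upos='INTJ', head=0, deprel='root')
                print(word)  # tab-separated: 1  Hello  _  INTJ  _  _  0  root  _  _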
"""
self.id = sanitize_conll_int_value(id)
self.form = form
self.upos = upos
self.xpos = xpos
if isinstance(head, list):
assert deps is None, 'When head is a list, deps has to be None'
assert isinstance(deprel, list), 'When head is a list, deprel has to be a list'
assert len(deprel) == len(head), 'When head is a list, deprel has to match its length'
deps = list(zip(head, deprel))
head = None
deprel = None
self.head = sanitize_conll_int_value(head)
self.deprel = deprel
self.lemma = lemma
self.feats = feats
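        # deps may arrive as a raw CoNLL-U DEPS string such as '2:nsubj|4:conj';
        # parse it into a list of (head, relation) tuples.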
if deps == '_':
deps = None
if isinstance(deps, str):
self.deps = []
for pair in deps.split('|'):
h, r = pair.split(':')
h = int(h)
self.deps.append((h, r))
else:
self.deps = deps
self.misc = misc
def __str__(self):
deps = self.deps
if not deps:
deps = None
else:
deps = '|'.join(f'{h}:{r}' for h, r in deps)
values = [str(self.id), self.form, self.lemma, self.upos, self.xpos, self.feats,
str(self.head) if self.head is not None else None, self.deprel, deps, self.misc]
return '\t'.join(['_' if v is None else v for v in values])
@property
def nonempty_fields(self):
"""
Get the values of nonempty fields as a list.
"""
return list(f for f in
[self.form, self.lemma, self.upos, self.xpos, self.feats, self.head, self.deprel, self.deps,
self.misc] if f)
    def get_pos(self):
"""
        Get the most precise POS tag available for this word.
Returns: ``self.xpos`` or ``self.upos``
"""
return self.xpos or self.upos
class CoNLLSentence(list):
def __init__(self, words=None):
"""
        A list of :class:`~hanlp_common.conll.CoNLLWord` or :class:`~hanlp_common.conll.CoNLLUWord`. It is a subclass
        of :class:`list`, so its words can be accessed the same way as list elements.
Args:
words (list[Union[CoNLLWord, CoNLLUWord]]): A list of words.
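        Examples:
            A minimal sketch building a sentence directly from words (illustrative values
            only); ``str(sent)`` yields one CoNLL-X line per word::

                sent = CoNLLSentence([CoNLLWord(1, 'Hello', head=2, deprel='dep'),
                                      CoNLLWord(2, 'world', head=0, deprel='root')])
                print(sent)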
"""
super().__init__()
if words:
self.extend(words)
def __str__(self):
return '\n'.join([word.__str__() for word in self])
    @staticmethod
def from_str(conll: str, conllu=False):
        """Build a CoNLLSentence from a CoNLL-X or CoNLL-U format string.
Args:
conll (str): CoNLL-X or CoNLL-U format string
conllu: ``True`` to build :class:`~hanlp_common.conll.CoNLLUWord` for each token.
Returns:
A :class:`~hanlp_common.conll.CoNLLSentence`.
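        Examples:
            A round-trip sketch (illustrative values only): a sentence serialized by
            ``CoNLLSentence.__str__`` parses back with the same fields::

                words = [CoNLLWord(1, 'Hello', cpos='NN', head=2, deprel='dep'),
                         CoNLLWord(2, 'world', cpos='NN', head=0, deprel='root')]
                sent = CoNLLSentence.from_str(str(CoNLLSentence(words)))
                assert sent[1].head == 0 and sent[1].deprel == 'root'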
"""
        words: List[Union[CoNLLWord, CoNLLUWord]] = []
prev_id = None
for line in conll.strip().split('\n'):
if line.startswith('#'):
continue
cells = line.split('\t')
cells = [None if c == '_' else c for c in cells]
if '-' in cells[0]:
continue
cells[0] = int(cells[0])
cells[6] = int(cells[6])
if cells[0] != prev_id:
words.append(CoNLLUWord(*cells) if conllu else CoNLLWord(*cells))
else:
if isinstance(words[-1].head, list):
words[-1].head.append(cells[6])
words[-1].deprel.append(cells[7])
else:
words[-1].head = [words[-1].head] + [cells[6]]
words[-1].deprel = [words[-1].deprel] + [cells[7]]
prev_id = cells[0]
if conllu:
for word in words: # type: CoNLLUWord
if isinstance(word.head, list):
assert not word.deps
word.deps = list(zip(word.head, word.deprel))
word.head = None
word.deprel = None
return CoNLLSentence(words)
    @staticmethod
def from_file(path: str, conllu=False):
"""Build a CoNLLSentence from ``.conllx`` or ``.conllu`` file
Args:
path: Path to the file.
conllu: ``True`` to build :class:`~hanlp_common.conll.CoNLLUWord` for each token.
Returns:
A :class:`~hanlp_common.conll.CoNLLSentence`.
"""
with open(path) as src:
return [CoNLLSentence.from_str(x, conllu) for x in src.read().split('\n\n') if x.strip()]
    @staticmethod
def from_dict(d: dict, conllu=False):
"""Build a CoNLLSentence from a dict.
Args:
d: A dict storing a list for each field, where each index corresponds to a token.
conllu: ``True`` to build :class:`~hanlp_common.conll.CoNLLUWord` for each token.
Returns:
A :class:`~hanlp_common.conll.CoNLLSentence`.
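        Examples:
            A minimal sketch with two tokens (illustrative values only; every heading
            listed above must be present as a key of ``d``)::

                d = {'ID': [1, 2], 'FORM': ['Hello', 'world'], 'LEMMA': [None, None],
                     'CPOS': [None, None], 'POS': ['NN', 'NN'], 'FEATS': [None, None],
                     'HEAD': [2, 0], 'DEPREL': ['dep', 'root'],
                     'PHEAD': [None, None], 'PDEPREL': [None, None]}
                sent = CoNLLSentence.from_dict(d)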
"""
if conllu:
headings = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
else:
headings = ['ID', 'FORM', 'LEMMA', 'CPOS', 'POS', 'FEATS', 'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL']
words: List[Union[CoNLLWord, CoNLLUWord]] = []
for cells in zip(*list(d[f] for f in headings)):
words.append(CoNLLUWord(*cells) if conllu else CoNLLWord(*cells))
return CoNLLSentence(words)
    def to_markdown(self, headings: Union[str, List[str]] = 'auto') -> str:
r"""Convert into markdown string.
Args:
            headings: ``auto`` to automatically detect the word type. When passed a list of strings, they are
                treated as headings for each field.
Returns:
A markdown representation of this sentence.
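        Examples:
            A usage sketch, assuming ``sent`` is an existing :class:`CoNLLSentence`; the
            exact table layout depends on :func:`markdown_table`::

                print(sent.to_markdown())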
"""
cells = [str(word).split('\t') for word in self]
if headings == 'auto':
if isinstance(self[0], CoNLLWord):
headings = ['ID', 'FORM', 'LEMMA', 'CPOS', 'POS', 'FEATS', 'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL']
else: # conllu
headings = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
for each in cells:
# if '|' in each[8]:
# each[8] = f'`{each[8]}`'
            each[8] = each[8].replace('|', '⎮')  # a lookalike character keeps '|' in DEPS from breaking the markdown table
alignment = [('^', '>'), ('^', '<'), ('^', '<'), ('^', '<'), ('^', '<'), ('^', '<'), ('^', '>'), ('^', '<'),
('^', '<'), ('^', '<')]
text = markdown_table(headings, cells, alignment=alignment)
return text
    def to_tree(self, extras: List[List[str]] = None) -> str:
"""Convert into a pretty tree string which can be printed to show the tree structure.
Args:
            extras: Extra columns to be aligned with the tree: a list of rows whose first row holds the extra
                column headings and whose following rows correspond to tokens.
Returns:
A pretty tree string along with extra table if passed any.
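        Examples:
            A usage sketch, assuming ``sent`` is an existing :class:`CoNLLSentence` of three
            tokens; the rendered layout comes from :func:`pretty_tree_horizontal`. The
            ``extras`` rows below are one heading row followed by one row per token::

                print(sent.to_tree(extras=[['NER'], ['O'], ['B-LOC'], ['O']]))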
"""
arrows = []
for word in self: # type: Union[CoNLLWord, CoNLLUWord]
if word.head:
arrows.append({'from': word.head - 1, 'to': word.id - 1})
tree = pretty_tree_horizontal(arrows)
rows = [['Dep Tree', 'Token', 'Relation']]
has_lem = all(x.lemma for x in self)
has_pos = all(x.get_pos() for x in self)
if has_lem:
rows[0].append('Lemma')
if has_pos:
rows[0].append('PoS')
if extras:
rows[0].extend(extras[0])
for i, (word, arc) in enumerate(zip(self, tree)):
cell_per_word = [arc]
cell_per_word.append(word.form)
cell_per_word.append(word.deprel)
if has_lem:
cell_per_word.append(word.lemma)
if has_pos:
cell_per_word.append(word.get_pos())
if extras:
cell_per_word.extend(extras[i + 1])
rows.append(cell_per_word)
return make_table(rows, insert_header=True)
@property
def projective(self):
"""
``True`` if this tree is projective.
"""
return isprojective([x.head for x in self])
class CoNLLSentenceList(list):
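    """A list of :class:`~hanlp_common.conll.CoNLLSentence`, serialized as CoNLL blocks separated by blank lines."""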
def __str__(self) -> str:
return '\n\n'.join(str(x) for x in self)
def sanitize_conll_int_value(value: Union[str, int]):
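    """Convert a CoNLL integer field to ``int``, mapping the underscore placeholder ``'_'`` to ``None``."""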
if value is None or isinstance(value, int):
return value
if value == '_':
return None
if isinstance(value, str):
return int(value)
return value
def isprojective(sequence):
r"""
Checks if a dependency tree is projective.
This also works for partial annotation.
Besides the obvious crossing arcs, the examples below illustrate two non-projective cases
which are hard to detect in the scenario of partial annotation.
Args:
sequence (list[int]):
A list of head indices.
Returns:
``True`` if the tree is projective, ``False`` otherwise.
Examples:
>>> isprojective([2, -1, 1]) # -1 denotes un-annotated cases
False
>>> isprojective([3, -1, 2])
False
"""
pairs = [(h, d) for d, h in enumerate(sequence, 1) if h >= 0]
for i, (hi, di) in enumerate(pairs):
for hj, dj in pairs[i + 1:]:
(li, ri), (lj, rj) = sorted([hi, di]), sorted([hj, dj])
if li <= hj <= ri and hi == dj:
return False
if lj <= hi <= rj and hj == di:
return False
if (li < lj < ri or li < rj < ri) and (li - lj) * (ri - rj) > 0:
return False
return True