Source code for hanlp_common.conll

# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-19 20:50
from typing import Union, List

from hanlp_common.structure import SerializableDict
from hanlp_common.visualization import pretty_tree_horizontal, make_table, markdown_table


class CoNLLWord(SerializableDict):
    def __init__(self, id, form, lemma=None, cpos=None, pos=None, feats=None, head=None, deprel=None, phead=None,
                 pdeprel=None):
        """CoNLL (:cite:`buchholz-marsi-2006-conll`) format template, see
        http://anthology.aclweb.org/W/W06/W06-2920.pdf

        Args:
            id (int): Token counter, starting at 1 for each new sentence.
            form (str): Word form or punctuation symbol.
            lemma (str): Lemma or stem (depending on the particular treebank) of word form, or an underscore if not
                available.
            cpos (str): Coarse-grained part-of-speech tag, where the tagset depends on the treebank.
            pos (str): Fine-grained part-of-speech tag, where the tagset depends on the treebank.
            feats (str): Unordered set of syntactic and/or morphological features (depending on the particular
                treebank), or an underscore if not available.
            head (Union[int, List[int]]): Head of the current token, which is either a value of ID, or zero (``0``)
                if the token links to the virtual root node of the sentence. A list stores several heads.
            deprel (Union[str, List[str]]): Dependency relation to the HEAD. Must be a list aligned with ``head``
                when ``head`` is a list.
            phead (int): Projective head of current token, which is either a value of ID or zero (``0``), or an
                underscore if not available.
            pdeprel (str): Dependency relation to the PHEAD, or an underscore if not available.
        """
        # The assignment order below is kept as-is because it determines the attribute order of the instance.
        self.id = sanitize_conll_int_value(id)
        self.form = form
        self.cpos = cpos
        self.pos = pos
        self.head = sanitize_conll_int_value(head)
        self.deprel = deprel
        self.lemma = lemma
        self.feats = feats
        self.phead = phead
        self.pdeprel = pdeprel

    def __str__(self):
        """Format this word as tab separated CoNLL columns.

        Returns:
            One line of ten columns where missing values are rendered as ``_``. When ``head`` is a list, one line is
            produced per ``(head, deprel)`` pair and the lines are joined by ``\\n``.
        """

        def row(head, deprel):
            values = [str(self.id), self.form, self.lemma, self.cpos, self.pos, self.feats, head, deprel,
                      self.phead, self.pdeprel]
            # ``str(v)`` keeps the join from failing on non-string fields such as an integer ``phead``.
            return '\t'.join('_' if v is None else str(v) for v in values)

        if isinstance(self.head, list):
            return '\n'.join(row(head, deprel) for head, deprel in zip(self.head, self.deprel))
        return row(self.head, self.deprel)

    @property
    def nonempty_fields(self):
        """
        Get the values of nonempty fields as a list. ``id`` is not included and every falsy value is dropped, which
        includes a ``head`` of ``0``.
        """
        fields = [self.form, self.lemma, self.cpos, self.pos, self.feats, self.head, self.deprel, self.phead,
                  self.pdeprel]
        return [f for f in fields if f]

    def get_pos(self, main_pos=False):
        """
        Get the precisest pos for this word.

        Args:
            main_pos: Use the main pos (cpos or upos) or the minor pos (pos or xpos).

        Returns:
            ``self.pos`` or ``self.cpos``, falling back to the other one when the preferred tag is empty.
        """
        if main_pos:
            return self.cpos or self.pos
        return self.pos or self.cpos
class CoNLLUWord(SerializableDict):
    def __init__(self, id: Union[int, str], form, lemma=None, upos=None, xpos=None, feats=None, head=None,
                 deprel=None, deps=None, misc=None):
        """CoNLL-U format template, see https://universaldependencies.org/format.html

        Args:
            id (Union[int, str]): Token counter, starting at 1 for each new sentence.
            form (Union[str, None]): Word form or punctuation symbol.
            lemma (str): Lemma or stem (depending on the particular treebank) of word form, or an underscore if not
                available.
            upos (str): Universal part-of-speech tag.
            xpos (str): Language-specific part-of-speech tag; underscore if not available.
            feats (str): List of morphological features from the universal feature inventory or from a defined
                language-specific extension; underscore if not available.
            head (Union[int, List[int]]): Head of the current token, which is either a value of ID, or zero (``0``)
                if the token links to the virtual root node of the sentence. When a list is passed, it is zipped
                with ``deprel`` into ``deps`` and both ``head`` and ``deprel`` are stored as ``None``.
            deprel (Union[str, List[str]]): Dependency relation to the HEAD.
            deps (Union[List[Tuple[int, str]], str]): Enhanced dependency graph, either as a list of
                ``(head, deprel)`` pairs or as a ``head:deprel|head:deprel`` string; underscore if not available.
            misc (str): Any other annotation, or an underscore if not available.
        """
        # The assignment order below is kept as-is because it determines the attribute order of the instance.
        self.id = sanitize_conll_int_value(id)
        self.form = form
        self.upos = upos
        self.xpos = xpos
        if isinstance(head, list):
            assert deps is None, 'When head is a list, deps has to be None'
            assert isinstance(deprel, list), 'When head is a list, deprel has to be a list'
            assert len(deprel) == len(head), 'When head is a list, deprel has to match its length'
            deps = list(zip(head, deprel))
            head = None
            deprel = None
        self.head = sanitize_conll_int_value(head)
        self.deprel = deprel
        self.lemma = lemma
        self.feats = feats
        if deps == '_':
            deps = None
        if isinstance(deps, str):
            self.deps = []
            for pair in deps.split('|'):
                # Split on the first colon only: relations may carry subtypes such as ``nsubj:pass``.
                h, r = pair.split(':', 1)
                # ``int`` raises ValueError on non-integer heads (e.g. decimal ids).
                self.deps.append((int(h), r))
        else:
            self.deps = deps
        self.misc = misc

    def __str__(self):
        """Format this word as one line of ten tab separated CoNLL-U columns.

        Returns:
            A line where missing values are rendered as ``_`` and ``deps`` is rendered as ``head:deprel`` pairs
            joined by ``|``.
        """
        deps = '|'.join(f'{h}:{r}' for h, r in self.deps) if self.deps else None
        values = [str(self.id), self.form, self.lemma, self.upos, self.xpos, self.feats, self.head, self.deprel,
                  deps, self.misc]
        # ``str(v)`` keeps the join from failing on non-string fields.
        return '\t'.join('_' if v is None else str(v) for v in values)

    @property
    def nonempty_fields(self):
        """
        Get the values of nonempty fields as a list. ``id`` is not included and every falsy value is dropped, which
        includes a ``head`` of ``0``.
        """
        fields = [self.form, self.lemma, self.upos, self.xpos, self.feats, self.head, self.deprel, self.deps,
                  self.misc]
        return [f for f in fields if f]

    def get_pos(self, main_pos=False):
        """
        Get the precisest pos for this word.

        Args:
            main_pos: Use the main pos (cpos or upos) or the minor pos (pos or xpos).

        Returns:
            ``self.xpos`` or ``self.upos``, falling back to the other one when the preferred tag is empty.
        """
        if main_pos:
            return self.upos or self.xpos
        return self.xpos or self.upos
class CoNLLSentence(list):
    def __init__(self, words=None):
        """
        A list of :class:`~hanlp_common.conll.CoNLLWord` or :class:`~hanlp_common.conll.CoNLLUWord`. It is a
        sub-class of :class:`list` and its words can be accessed in the same way as accessing list elements.

        Args:
            words (list[Union[CoNLLWord, CoNLLUWord]]): A list of words.
        """
        super().__init__()
        if words:
            self.extend(words)

    def __str__(self):
        """Join the string form of every word with a newline."""
        return '\n'.join([str(word) for word in self])

    @staticmethod
    def from_str(conll: str, conllu=False):
        """Build a CoNLLSentence from CoNLL-X format str

        Args:
            conll (str): CoNLL-X or CoNLL-U format string
            conllu: ``True`` to build :class:`~hanlp_common.conll.CoNLLUWord` for each token.

        Returns:
            A :class:`~hanlp_common.conll.CoNLLSentence`.
        """
        words: List[Union[CoNLLWord, CoNLLUWord]] = []
        prev_id = None
        for line in conll.strip().split('\n'):
            if line.startswith('#'):
                # Comment line
                continue
            cells = [None if c == '_' else c for c in line.split('\t')]
            if '-' in cells[0]:
                # Range ids such as ``1-2`` are skipped
                continue
            cells[0] = int(cells[0])
            # An unannotated head (``_``) has already been mapped to None and must stay None.
            if cells[6] is not None:
                cells[6] = int(cells[6])
            if cells[0] != prev_id:
                words.append(CoNLLUWord(*cells) if conllu else CoNLLWord(*cells))
            else:
                # A repeated id contributes one more (head, deprel) pair to the previous word.
                last = words[-1]
                if isinstance(last.head, list):
                    last.head.append(cells[6])
                    last.deprel.append(cells[7])
                else:
                    last.head = [last.head] + [cells[6]]
                    last.deprel = [last.deprel] + [cells[7]]
            prev_id = cells[0]
        if conllu:
            for word in words:  # type: CoNLLUWord
                if isinstance(word.head, list):
                    # CoNLL-U words keep multiple heads in ``deps`` instead of ``head``/``deprel``.
                    assert not word.deps
                    word.deps = list(zip(word.head, word.deprel))
                    word.head = None
                    word.deprel = None
        return CoNLLSentence(words)

    @staticmethod
    def from_file(path: str, conllu=None):
        """Build a list of CoNLLSentence from ``.conllx`` or ``.conllu`` file

        Args:
            path: Path to the file.
            conllu: ``True`` to build :class:`~hanlp_common.conll.CoNLLUWord` for each token. ``None`` to decide by
                whether ``path`` ends with ``.conllu``.

        Returns:
            A list of :class:`~hanlp_common.conll.CoNLLSentence`, one per chunk separated by a blank line.
        """
        if conllu is None:
            conllu = path.endswith('.conllu')
        # Read as UTF-8 explicitly instead of relying on the platform default encoding.
        with open(path, encoding='utf-8') as src:
            return [CoNLLSentence.from_str(x, conllu) for x in src.read().split('\n\n') if x.strip()]

    @staticmethod
    def from_dict(d: dict, conllu=False):
        """Build a CoNLLSentence from a dict.

        Args:
            d: A dict storing a list for each field, where each index corresponds to a token.
            conllu: ``True`` to build :class:`~hanlp_common.conll.CoNLLUWord` for each token.

        Returns:
            A :class:`~hanlp_common.conll.CoNLLSentence`.
        """
        if conllu:
            headings = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
        else:
            headings = ['ID', 'FORM', 'LEMMA', 'CPOS', 'POS', 'FEATS', 'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL']
        words: List[Union[CoNLLWord, CoNLLUWord]] = []
        # Transpose the column-wise dict into one tuple of cells per token.
        for cells in zip(*[d[f] for f in headings]):
            words.append(CoNLLUWord(*cells) if conllu else CoNLLWord(*cells))
        return CoNLLSentence(words)

    def to_markdown(self, headings: Union[str, List[str]] = 'auto') -> str:
        r"""Convert into markdown string.

        Args:
            headings: ``auto`` to automatically detect the word type. When passed a list of string, they are treated as
                headings for each field.

        Returns:
            A markdown representation of this sentence.
        """
        cells = [str(word).split('\t') for word in self]
        if headings == 'auto':
            # The type of the first word decides the headings.
            if isinstance(self[0], CoNLLWord):
                headings = ['ID', 'FORM', 'LEMMA', 'CPOS', 'POS', 'FEATS', 'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL']
            else:  # conllu
                headings = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
                for each in cells:
                    # Swap the ASCII pipe in the DEPS column for a look-alike character
                    # (presumably to keep it from being read as a markdown column separator).
                    each[8] = each[8].replace('|', '⎮')
        alignment = [('^', '>'), ('^', '<'), ('^', '<'), ('^', '<'), ('^', '<'), ('^', '<'), ('^', '>'), ('^', '<'),
                     ('^', '<'), ('^', '<')]
        text = markdown_table(headings, cells, alignment=alignment)
        return text

    def to_tree(self, extras: List[str] = None, main_pos=True) -> str:
        """Convert into a pretty tree string which can be printed to show the tree structure.

        Args:
            extras: Extra table to be aligned to this tree. Its first row extends the header and row ``i + 1`` extends
                the row of the ``i``-th word.
            main_pos: Use the main pos (cpos or upos) or the minor pos (pos or xpos).

        Returns:
            A pretty tree string along with extra table if passed any.
        """
        arrows = []
        for word in self:  # type: Union[CoNLLWord, CoNLLUWord]
            # Words with an empty head or a head of 0 get no arrow.
            if word.head:
                arrows.append({'from': word.head - 1, 'to': word.id - 1})
        tree = pretty_tree_horizontal(arrows)
        rows = [['Dep Tree', 'Token', 'Relation']]
        # Optional columns are shown only when every word provides a value.
        has_lem = all(x.lemma for x in self)
        has_pos = all(x.get_pos() for x in self)
        if has_lem:
            rows[0].append('Lemma')
        if has_pos:
            rows[0].append('PoS')
        if extras:
            rows[0].extend(extras[0])
        for i, (word, arc) in enumerate(zip(self, tree)):
            cell_per_word = [arc, word.form, word.deprel]
            if has_lem:
                cell_per_word.append(word.lemma)
            if has_pos:
                cell_per_word.append(word.get_pos(main_pos))
            if extras:
                cell_per_word.extend(extras[i + 1])
            rows.append(cell_per_word)
        return make_table(rows, insert_header=True)

    @property
    def projective(self):
        """
        ``True`` if this tree is projective.
        """
        return isprojective([x.head for x in self])
class CoNLLSentenceList(list):
    """A list of sentences whose string form separates the sentences with a blank line."""

    def __str__(self) -> str:
        return '\n\n'.join(str(x) for x in self)


def sanitize_conll_int_value(value: Union[str, int]):
    """Normalize the value of an integer CoNLL column.

    Args:
        value: The raw value of the column.

    Returns:
        ``None`` and integers are returned untouched, ``'_'`` becomes ``None`` and any other string is converted
        by :class:`int`. Values of other types (e.g. a list of heads) are returned as they are.
    """
    if value is None or isinstance(value, int):
        return value
    if value == '_':
        return None
    if isinstance(value, str):
        return int(value)
    return value


def isprojective(sequence):
    r"""
    Checks if a dependency tree is projective. This also works for partial annotation.

    Besides the obvious crossing arcs, the examples below illustrate two non-projective cases
    which are hard to detect in the scenario of partial annotation.

    Args:
        sequence (list[int]):
            A list of head indices. Negative values and ``None`` denote un-annotated tokens and are ignored.

    Returns:
        ``True`` if the tree is projective, ``False`` otherwise.

    Examples:
        >>> isprojective([2, -1, 1])  # -1 denotes un-annotated cases
        False
        >>> isprojective([3, -1, 2])
        False
    """
    # (head, dependent) pairs of annotated tokens, dependents being numbered from 1
    pairs = [(h, d) for d, h in enumerate(sequence, 1) if h is not None and h >= 0]
    for i, (hi, di) in enumerate(pairs):
        for hj, dj in pairs[i + 1:]:
            (li, ri), (lj, rj) = sorted([hi, di]), sorted([hj, dj])
            # The head of arc j lies within the span of arc i while the head of arc i is the dependent of arc j
            if li <= hj <= ri and hi == dj:
                return False
            # The same situation with the two arcs swapped
            if lj <= hi <= rj and hj == di:
                return False
            # An endpoint of arc j falls strictly inside the span of arc i and neither span contains the other
            if (li < lj < ri or li < rj < ri) and (li - lj) * (ri - rj) > 0:
                return False
    return True