Source code for hanlp.components.mtl.tasks.ner.tag_ner

# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-12-03 14:35
import logging
from typing import Union, List, Dict, Any, Iterable, Callable, Set, Sequence

import torch
from hanlp_trie import DictInterface
from torch.utils.data import DataLoader

from hanlp.common.dataset import SamplerBuilder, PadSequenceDataLoader
from hanlp.common.transform import VocabDict
from hanlp.components.mtl.tasks import Task
from hanlp.components.ner.transformer_ner import TransformerNamedEntityRecognizer
from hanlp.layers.crf.crf import CRF
from hanlp.layers.scalar_mix import ScalarMixWithDropoutBuilder
from hanlp.metrics.metric import Metric
from hanlp.metrics.mtl import MetricDict
from hanlp_common.util import merge_locals_kwargs


class LinearCRFDecoder(torch.nn.Module):
    def __init__(self,
                 hidden_size,
                 num_labels,
                 secondary_encoder=None,
                 crf=False) -> None:
        super().__init__()
        self.secondary_encoder = secondary_encoder
        self.classifier = torch.nn.Linear(hidden_size, num_labels)
        self.crf = CRF(num_labels) if crf else None

    def forward(self, contextualized_embeddings: torch.FloatTensor, batch: Dict[str, torch.Tensor], mask=None):
        if self.secondary_encoder:
            contextualized_embeddings = self.secondary_encoder(contextualized_embeddings, mask=mask)
        return self.classifier(contextualized_embeddings)


[docs]class TaggingNamedEntityRecognition(Task, TransformerNamedEntityRecognizer):

    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=1e-3,
                 separate_optimizer=False,
                 max_seq_len=None,
                 sent_delimiter=None,
                 char_level=False,
                 hard_constraint=False,
                 tagging_scheme=None,
                 crf=False,
                 delimiter_in_entity=None,
                 merge_types: List[str] = None,
                 secondary_encoder=None,
                 token_key='token',
                 dict_whitelist: Union[DictInterface, Union[Dict[str, Any], Set[str]]] = None,
                 dict_blacklist: Union[DictInterface, Union[Dict[str, Any], Set[str]]] = None,
                 dict_tags: Union[
                     DictInterface, Union[Dict[Union[str, Sequence[str]], Union[str, Sequence[str]]]]] = None,
                 **kwargs) -> None:
        r"""A simple tagger using a linear layer with an optional CRF (:cite:`lafferty2001conditional`) layer for
        NER task. It can utilize whitelist gazetteers which is dict mapping from entity name to entity type.
        During decoding, it performs longest-prefix-matching of these words to override the prediction from
        underlying statistical model. It also uses a blacklist to mask out mis-predicted  entities.

        .. Note:: For algorithm beginners, longest-prefix-matching is the prerequisite to understand what dictionary can
            do and what it can't do. The tutorial in `this book <http://nlp.hankcs.com/book.php>`_ can be very helpful.

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
            sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can
                be split here.
            char_level: Whether the sequence length is measured at char level, which is never the case for
                lemmatization.
            hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter``
                in a sentence, it will be split at a token anyway.
            token_key: The key to tokens in dataset. This should always be set to ``token`` in MTL.
            crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
            delimiter_in_entity: The delimiter between tokens in entity, which is used to rebuild entity by joining
                tokens during decoding.
            merge_types: The types of consecutive entities to be merged.
            secondary_encoder: An optional secondary encoder to provide enhanced representation by taking the hidden
                states from the main encoder as input.
            token_key: The key to tokens in dataset. This should always be set to ``token`` in MTL.
            dict_whitelist: A :class:`dict` or a :class:`~hanlp_trie.dictionary.DictInterface` of gazetteers to be
                included into the final results.
            dict_blacklist: A :class:`set` or a :class:`~hanlp_trie.dictionary.DictInterface` of badcases to be
                excluded from the final results.
            **kwargs:
        """
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()
        self.secondary_encoder = secondary_encoder
        self.dict_whitelist = dict_whitelist
        self.dict_blacklist = dict_blacklist
        self.dict_tags = dict_tags

[docs]    def build_dataloader(self,
                         data,
                         transform: Callable = None,
                         training=False,
                         device=None,
                         logger: logging.Logger = None,
                         cache=False,
                         gradient_accumulation=1,
                         **kwargs) -> DataLoader:
        args = dict((k, self.config[k]) for k in
                    ['delimiter', 'max_seq_len', 'sent_delimiter', 'char_level', 'hard_constraint'] if k in self.config)
        dataset = self.build_dataset(data, cache=cache, transform=transform, **args)
        dataset.append_transform(self.vocabs)
        dataset.purge_cache()
        if self.vocabs.mutable:
            self.build_vocabs(dataset, logger)
        return PadSequenceDataLoader(
            batch_sampler=self.sampler_builder.build(
                self.compute_lens(data, dataset),
                shuffle=training, gradient_accumulation=gradient_accumulation),
            device=device,
            dataset=dataset)

    def compute_loss(self,
                     batch: Dict[str, Any],
                     output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                     criterion) -> Union[torch.FloatTensor, Dict[str, torch.FloatTensor]]:
        return TransformerNamedEntityRecognizer.compute_loss(self, criterion, output, batch['tag_id'], batch['mask'])

    def decode_output(self,
                      output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                      mask: torch.BoolTensor,
                      batch: Dict[str, Any],
                      decoder,
                      **kwargs) -> Union[Dict[str, Any], Any]:
        return TransformerNamedEntityRecognizer.decode_output(self, output, batch['mask'], batch, decoder)

    def update_metrics(self,
                       batch: Dict[str, Any],
                       output: Union[torch.Tensor, Dict[str, torch.Tensor], Iterable[torch.Tensor], Any],
                       prediction: Dict[str, Any],
                       metric: Union[MetricDict, Metric]):
        return TransformerNamedEntityRecognizer.update_metrics(self, metric, output, batch['tag_id'], batch['mask'],
                                                               batch, prediction)

[docs]    def build_model(self, encoder_size, training=True, **kwargs) -> torch.nn.Module:
        return LinearCRFDecoder(encoder_size, len(self.vocabs['tag']), self.secondary_encoder, self.config.crf)

[docs]    def build_metric(self, **kwargs):
        return TransformerNamedEntityRecognizer.build_metric(self, **kwargs)

[docs]    def input_is_flat(self, data) -> bool:
        return TransformerNamedEntityRecognizer.input_is_flat(self, data)

    def prediction_to_result(self, prediction: Dict[str, Any], batch: Dict[str, Any]) -> Union[List, Dict]:
        return TransformerNamedEntityRecognizer.prediction_to_human(self, prediction, self.vocabs['tag'].idx_to_token,
                                                                    batch)
HanLP Documentation

Source code for hanlp.components.mtl.tasks.ner.tag_ner