# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-11-29 17:48
import json
from typing import Union, List, Optional, Dict, Any, Tuple
from urllib.error import HTTPError
from urllib.parse import urlencode
from urllib.request import Request, urlopen
from hanlp_common.document import Document
try:
    # noinspection PyUnresolvedReferences
    import requests


    def _post(url, form: Dict[str, Any], headers: Dict[str, Any], timeout=60, verify=True) -> str:
        """POST ``form`` as a JSON body using ``requests`` and return the response text.

        Args:
            url: The endpoint to post to.
            form: The payload, serialized to JSON by ``requests``.
            headers: Extra HTTP headers to send.
            timeout: Maximum waiting time in seconds.
            verify: Forwarded to ``requests.post`` as its ``verify`` argument.

        Returns:
            The body of the response as text.

        Raises:
            HTTPError: If the status code of the response is not ``200``. The response text is used as the
                message of the exception.
        """
        response = requests.post(url, json=form, headers=headers, timeout=timeout, verify=verify)
        if response.status_code != 200:
            raise HTTPError(url, response.status_code, response.text, response.headers, None)
        return response.text
except ImportError:
    import ssl


    def _post(url, form: Dict[str, Any], headers: Dict[str, Any], timeout=60, verify=True) -> str:
        """POST ``form`` as a JSON body using ``urllib`` and return the decoded response body.

        This variant is defined only when ``requests`` cannot be imported.

        Args:
            url: The endpoint to post to.
            form: The payload, serialized with ``json.dumps``.
            headers: Extra HTTP headers to send. They are added after the default ``Content-Type`` so a caller
                can override it.
            timeout: Maximum waiting time in seconds.
            verify: A falsy value disables hostname checking and certificate verification.

        Returns:
            The body of the response decoded as text.
        """
        request = Request(url, json.dumps(form).encode())
        # ``requests.post(json=...)`` labels the body as JSON, while urllib would fall back to
        # ``application/x-www-form-urlencoded``. Label it explicitly so both variants send the same request.
        request.add_header('Content-Type', 'application/json')
        for k, v in headers.items():
            request.add_header(k, v)
        ctx = None
        if not verify:
            ctx = ssl.create_default_context()
            ctx.check_hostname = False
            ctx.verify_mode = ssl.CERT_NONE
        # Close the connection as soon as the body has been read.
        with urlopen(request, timeout=timeout, context=ctx) as response:
            return response.read().decode()
class HanLPClient(object):
    """A client which sends HTTP requests to a RESTful HanLP server and decodes its JSON responses."""

    def __init__(self, url: str, auth: str = None, language=None, timeout=60, verify=True) -> None:
        """

        Args:
            url (str): An API endpoint to a service provider.
            auth (str): An auth key licenced from a service provider. When ``None``, the value of the
                ``HANLP_AUTH`` environment variable is used instead, if it is set.
            language (str): The default language for each :func:`~hanlp_restful.HanLPClient.parse` call.
                Contact the service provider for the list of languages supported.
                Conventionally, ``zh`` is used for Chinese, ``en`` for English, ``ja`` for Japanese and ``mul`` for
                multilingual. Leave ``None`` to use the default language on server.
            timeout (int): Maximum waiting time in seconds for a request.
            verify (bool): ``True`` to enable SSL cert verification. You can also pass ``verify`` the path to a CA_BUNDLE
                file or directory with certificates of trusted CAs (``requests`` required).
        """
        super().__init__()
        self._language = language
        self._timeout = timeout
        self._url = url
        if auth is None:
            import os
            auth = os.getenv('HANLP_AUTH', None)
        self._auth = auth
        self._verify = verify

    def parse(self,
              text: Union[str, List[str]] = None,
              tokens: List[List[str]] = None,
              tasks: Optional[Union[str, List[str]]] = None,
              skip_tasks: Optional[Union[str, List[str]]] = None,
              language: str = None,
              ) -> 'Document':
        """
        Parse a piece of text.

        Args:
            text: A document (str), or a list of sentences (List[str]).
            tokens: A list of sentences where each sentence is a list of tokens.
            tasks: The tasks to predict. Use ``tasks=[...]`` to run selected tasks only. Dependent tasks will be
                automatically selected.
            skip_tasks: The tasks to skip. Use ``skip_tasks='tok/fine'`` to enable coarse tokenization for all tasks.
                Use ``tasks=['tok/coarse', ...]`` and ``skip_tasks='tok/fine'`` to enable coarse tokenization for
                selected tasks.
            language: The language of input text or tokens. ``None`` to use the default language on server.

        Returns:
            A :class:`~hanlp_common.document.Document`.

        Examples::

            # Use tasks=[...] to run selected tasks only
            HanLP('晓美焰来到自然语义科技公司', tasks=['pos', 'ner'])

            # Use skip_tasks='tok/fine' to enable coarse tokenization for all tasks
            HanLP('晓美焰来到自然语义科技公司', skip_tasks='tok/fine')

            # Use tasks=['tok/coarse', ...] and skip_tasks='tok/fine' to enable
            # coarse tokenization for selected tasks
            HanLP('晓美焰来到自然语义科技公司', tasks=['tok/coarse','pos'],skip_tasks='tok/fine')

        Raises:
            AssertionError: If neither ``text`` nor ``tokens`` is specified.
            HTTPError: Any errors happening on the Internet side or the server side. Refer to the ``code`` and ``msg``
                of the exception for more details. A list of common errors :

                - ``400 Bad Request`` indicates that the server cannot process the request due to a client
                  fault (e.g., text too long, language unsupported).
                - ``401 Unauthorized`` indicates that the request lacks **valid** ``auth`` credentials for the API.
                - ``422 Unprocessable Entity`` indicates that the content type of the request entity is not in
                  proper json format.
                - ``429 Too Many Requests`` indicates the user has sent too many requests in a given
                  amount of time ("rate limiting").
        """
        assert text or tokens, 'At least one of text or tokens has to be specified.'
        response = self._send_post_json(self._url + '/parse', {
            'text': text,
            'tokens': tokens,
            'tasks': tasks,
            'skip_tasks': skip_tasks,
            'language': language or self._language
        })
        return Document(response)

    def __call__(self,
                 text: Union[str, List[str]] = None,
                 tokens: List[List[str]] = None,
                 tasks: Optional[Union[str, List[str]]] = None,
                 skip_tasks: Optional[Union[str, List[str]]] = None,
                 language: str = None,
                 ) -> 'Document':
        """
        A shortcut of :meth:`~hanlp_restful.HanLPClient.parse`.
        """
        return self.parse(text, tokens, tasks, skip_tasks, language)

    def about(self) -> Dict[str, Any]:
        """Get the information about server and your client.

        Returns:
            A dict containing your rate limit and server version etc.
        """
        info = self._send_get_json(self._url + '/about', {})
        return Document(info)

    def _send_post(self, url, form: Dict[str, Any]):
        """POST ``form`` serialized as JSON to ``url`` with the auth header attached and return the raw text."""
        request = Request(url, json.dumps(form).encode())
        self._add_headers(request)
        return self._fire_request(request)

    def _fire_request(self, request):
        """Send ``request`` through ``urlopen`` and return the decoded response body.

        ``self._timeout`` bounds the waiting time. A falsy ``self._verify`` disables hostname checking and
        certificate verification, the same way the urllib based ``_post`` treats its ``verify`` argument.
        """
        context = None
        if not self._verify:
            # Imported here because the module only imports ``ssl`` when ``requests`` is unavailable.
            import ssl
            context = ssl.create_default_context()
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        # Close the connection as soon as the body has been read.
        with urlopen(request, timeout=self._timeout, context=context) as response:
            return response.read().decode()

    def _send_post_json(self, url, form: Dict[str, Any]):
        """POST ``form`` to ``url`` through ``_post`` and return the response decoded from JSON."""
        headers = dict()
        if self._auth:
            headers['Authorization'] = f'Basic {self._auth}'
        return json.loads(_post(url, form, headers, self._timeout, verify=self._verify))

    def _send_get(self, url, form: Dict[str, Any]):
        """GET ``url`` with ``form`` encoded as its query string, auth header attached, and return the raw text."""
        request = Request(url + '?' + urlencode(form))
        self._add_headers(request)
        return self._fire_request(request)

    def _add_headers(self, request):
        """Attach the ``Authorization`` header to ``request`` when an auth key is available."""
        if self._auth:
            request.add_header('Authorization', f'Basic {self._auth}')

    def _send_get_json(self, url, form: Dict[str, Any]):
        """GET ``url`` with ``form`` as query string and return the response decoded from JSON."""
        return json.loads(self._send_get(url, form))

    def text_style_transfer(self, text: Union[str, List[str]], target_style: str, language: str = None) \
            -> Union[str, List[str]]:
        """ Text style transfer aims to change the style of the input text to the target style while preserving its
        content.

        Args:
            text: Source text.
            target_style: Target style.
            language: The language of input text. ``None`` to use the default language.

        Returns:
            Text or a list of text of the target style.

        Examples::

            # ``style`` stands for a target style accepted by your server.
            HanLP.text_style_transfer(['国家对中石油抱有很大的期望.', '要用创新去推动高质量的发展。'],
                                      target_style=style)
        """
        response = self._send_post_json(self._url + '/text_style_transfer',
                                        {'text': text, 'target_style': target_style,
                                         'language': language or self._language})
        return response

    def semantic_textual_similarity(self, text: Union[Tuple[str, str], List[Tuple[str, str]]], language: str = None) \
            -> Union[float, List[float]]:
        """ Semantic textual similarity deals with determining how similar two pieces of texts are.

        Args:
            text: A pair or pairs of text.
            language: The language of input text. ``None`` to use the default language.

        Returns:
            Similarities.

        Examples::

            HanLP.semantic_textual_similarity([
                ('看图猜一电影名', '看图猜电影'),
                ('无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'),
                ('北京到上海的动车票', '上海到北京的动车票'),
            ])
            # Output:
            [
                0.9764469,  # Similarity of ('看图猜一电影名', '看图猜电影')
                0.0,        # Similarity of ('无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用')
                0.0034587   # Similarity of ('北京到上海的动车票', '上海到北京的动车票')
            ]
        """
        response = self._send_post_json(self._url + '/semantic_textual_similarity',
                                        {'text': text, 'language': language or self._language})
        return response

    def coreference_resolution(self, text: Optional[str] = None, tokens: Optional[List[List[str]]] = None,
                               speakers: Optional[List[str]] = None, language: Optional[str] = None) -> Union[
        Dict[str, Union[List[str], List[List[Tuple[str, int, int]]]]], List[List[Tuple[str, int, int]]]]:
        r""" Coreference resolution is the task of clustering mentions in text that refer to the same underlying
        real world entities.

        Args:
            text: A piece of text, usually a document without tokenization.
            tokens: A list of sentences where each sentence is a list of tokens.
            speakers: A list of speakers where each speaker is a ``str`` representing the speaker's ID, e.g., ``Tom``.
            language: The language of input text. ``None`` to use the default language.

        Returns:
            When ``text`` is specified, return the clusters and tokens. Otherwise just the clusters. In this case,
            you need to ``sum(tokens, [])`` in order to match the span indices with tokens.

        Examples::

            HanLP.coreference_resolution('我姐送我她的猫。我很喜欢它。')
            # Output:
            {'clusters': [
                          [['我', 0, 1], ['我', 3, 4], ['我', 8, 9]], # 指代说话人
                          [['我姐', 0, 2], ['她', 4, 5]],             # 指代说话人的姐姐
                          [['她的猫', 4, 7], ['它', 11, 12]]],        # 指代说话人的姐姐的猫
             'tokens': ['我', '姐', '送', '我', '她', '的', '猫', '。',
                        '我', '很', '喜欢', '它', '。']}

            HanLP.coreference_resolution(
                tokens=[['我', '姐', '送', '我', '她', '的', '猫', '。'],
                        ['我', '很', '喜欢', '它', '。']])
            # Output:
            [
                [['我', 0, 1], ['我', 3, 4], ['我', 8, 9]], # 指代说话人
                [['我姐', 0, 2], ['她', 4, 5]],             # 指代说话人的姐姐
                [['她的猫', 4, 7], ['它', 11, 12]]]         # 指代说话人的姐姐的猫

        .. image:: https://file.hankcs.com/img/coref_demo_small.png
            :alt: Coreference resolution visualization
        """
        response = self._send_post_json(self._url + '/coreference_resolution',
                                        {'text': text, 'tokens': tokens, 'speakers': speakers,
                                         'language': language or self._language})
        return response

    def tokenize(self, text: Union[str, List[str]], coarse: Optional[bool] = None, language=None) -> List[List[str]]:
        """ Split a document into sentences and tokenize them. Note that it is always faster to tokenize a whole
        document than to tokenize each sentence one by one. So avoid calling this method sentence by sentence but put
        sentences into a ``list`` and pass them to the ``text`` argument.

        Args:
            text: A document (``str``), or a list of sentences (``List[str]``).
            coarse: Whether to perform coarse-grained or fine-grained tokenization. Chinese and Japanese supported.
            language: The language of input text. ``None`` to use the default language.

        Returns:
            A list of tokenized sentences.

        Raises:
            NotImplementedError: If ``coarse`` is requested for a language other than ``zh`` or ``ja``.

        Examples::

            # Avoid tokenizing sentence by sentence, it is expensive:
            HanLP.tokenize('商品和服务。')
            [['商品', '和', '服务', '。']]
            HanLP.tokenize('阿婆主来到北京立方庭参观自然语义科技公司')
            [['阿婆主', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司']]

            # Instead, the following codes are much faster:
            HanLP.tokenize('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司')
            [['商品', '和', '服务', '。'],
             ['阿婆主', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司']]

            # To tokenize with coarse-grained standard:
            HanLP.tokenize('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司', coarse=True)
            [['商品', '和', '服务', '。'],
             ['阿婆主', '来到', '北京', '立方庭', '参观', '自然语义科技公司']]

            # To tokenize pre-segmented sentences:
            HanLP.tokenize(['商品和服务。', '当下雨天地面积水分外严重'])
            [['商品', '和', '服务', '。'],
             ['当', '下雨天', '地面', '积水', '分', '外', '严重']]

            # Multilingual tokenization by specifying language='mul':
            HanLP.tokenize(
                ['In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques '
                 'to production environment.',
                 '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
                 '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。'], language='mul')
            [['In', '2021', ',', 'HanLPv2.1', 'delivers', 'state-of-the-art', 'multilingual',
              'NLP', 'techniques', 'to', 'production', 'environment', '.'],
             ['2021', '年', '、', 'HanLPv2.1', 'は', '次', '世代', 'の', '最', '先端', '多',
              '言語', 'NLP', '技術', 'を', '本番', '環境', 'に', '導入', 'します', '。'],
             ['2021', '年', 'HanLPv2.1', '为', '生产', '环境', '带来', '次世代', '最', '先进的',
              '多', '语种', 'NLP', '技术', '。']]
        """
        language = language or self._language
        # Reject unsupported languages locally, before any request is sent.
        if coarse and language and language not in {'zh', 'ja'}:
            raise NotImplementedError(f'Coarse tokenization not supported for {language}. Please set language="zh" or "ja".')
        doc = self.parse(text=text, tasks='tok/coarse' if coarse is True else 'tok', language=language)
        # Return the first value stored in the parsed document.
        return next(iter(doc.values()))

    def abstract_meaning_representation(self,
                                        text: Union[str, List[str]] = None,
                                        tokens: List[List[str]] = None,
                                        language: str = None,
                                        visualization: str = None,
                                        ) -> List[Dict]:
        """Abstract Meaning Representation (AMR) captures “who is doing what to whom” in a sentence. Each sentence is
        represented as a rooted, directed, acyclic graph consisting of nodes (concepts) and edges (relations).

        Args:
            text: A document (str), or a list of sentences (List[str]).
            tokens: A list of sentences where each sentence is a list of tokens.
            language: The language of input text or tokens. ``None`` to use the default language on server.
            visualization: Set to `dot` or `svg` to obtain corresponding visualization.

        Returns:
            Graphs in meaning representation format.

        Raises:
            AssertionError: If neither ``text`` nor ``tokens`` is specified.

        Examples::

            HanLP.abstract_meaning_representation('男孩希望女孩相信他。')
            HanLP.abstract_meaning_representation('The boy wants the girl to believe him.',
                                                  language='en')

        .. image:: https://hanlp.hankcs.com/backend/v2/amr_svg?tokens=%E7%94%B7%E5%AD%A9%20%E5%B8%8C%E6%9C%9B%20%E5%A5%B3%E5%AD%A9%20%E7%9B%B8%E4%BF%A1%20%E4%BB%96%20%E3%80%82&language=zh&scale=1
            :alt: Abstract Meaning Representation

        .. image:: https://hanlp.hankcs.com/backend/v2/amr_svg?tokens=The%20boy%20wants%20the%20girl%20to%20believe%20him%20.&language=en&scale=1
            :alt: Abstract Meaning Representation
        """
        assert text or tokens, 'At least one of text or tokens has to be specified.'
        return self._send_post_json(self._url + '/abstract_meaning_representation', {
            'text': text,
            'tokens': tokens,
            'language': language or self._language,
            'visualization': visualization,
        })

    def abstractive_summarization(
            self,
            text: str,
            language: str = None,
    ) -> str:
        r""" Abstractive Summarization is the task of generating a short and concise summary that captures the
        salient ideas of the source text. The generated summaries potentially contain new phrases and sentences that
        may not appear in the source text.

        Args:
            text: The text content of the document.
            language: The language of input text or tokens. ``None`` to use the default language on server.

        Returns:
            The response of the server decoded from JSON.

        Raises:
            AssertionError: If ``text`` is empty.
        """
        assert text, 'Text has to be non-empty.'
        return self._send_post_json(self._url + '/abstractive_summarization', {
            'text': text,
            'language': language or self._language,
        })

    def grammatical_error_correction(self, text: Union[str, List[str]], language: str = None) \
            -> Union[str, List[str]]:
        """ Grammatical Error Correction (GEC) is the task of correcting different kinds of errors in text such as
        spelling, punctuation, grammatical, and word choice errors.

        Args:
            text: Text potentially containing different kinds of errors such as spelling, punctuation,
                grammatical, and word choice errors.
            language: The language of input text. ``None`` to use the default language.

        Returns:
            Corrected text.
        """
        response = self._send_post_json(self._url + '/grammatical_error_correction',
                                        {'text': text,
                                         'language': language or self._language})
        return response

    def text_classification(self, text: Union[str, List[str]], model, topk=False, prob=False) -> Union[
        str, Dict[str, float], List[Union[str, Dict[str, float]]]]:
        """
        Text classification is the task of assigning a sentence or document an appropriate category.
        The categories depend on the chosen dataset and can range from topics.

        Args:
            text: A document or a list of documents.
            model: The model to use for prediction.
            topk: ``True`` or ``int`` to return the top-k labels.
            prob: Return also probabilities.

        Returns:
            Classification results.
        """
        response = self._send_post_json(self._url + '/text_classification',
                                        {'text': text, 'model': model, 'topk': topk, 'prob': prob})
        return response

    def sentiment_analysis(self, text: Union[str, List[str]], language=None) -> Union[float, List[float]]:
        """
        Sentiment analysis is the task of classifying the polarity of a given text. For instance,
        a text-based tweet can be categorized into either "positive", "negative", or "neutral".

        Args:
            text: A document or a list of documents.
            language (str): The language of input text. Contact the service provider for the list of languages
                supported. Conventionally, ``zh`` is used for Chinese and ``mul`` for multilingual.
                Leave ``None`` to use the default language of this client, or of the server if the client has none.

        Returns:
            Sentiment polarity as a numerical value which measures how positive the sentiment is.
        """
        response = self._send_post_json(self._url + '/sentiment_analysis',
                                        {'text': text, 'language': language or self._language})
        return response

    def language_identification(self, text: Union[str, List[str]], topk=False, prob=False) -> Union[
        str, Dict[str, float], List[Union[str, Dict[str, float]]]]:
        """
        Identify the language of a given text. This is a shortcut of
        :meth:`~hanlp_restful.HanLPClient.text_classification` with the ``lid`` model.

        Args:
            text: A document or a list of documents.
            topk: ``True`` or ``int`` to return the top-k languages.
            prob: Return also probabilities.

        Returns:
            Identified language in `ISO 639-1 codes`_.

        Examples::

            HanLP.language_identification(
                'In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques.')

            lang, prob = HanLP.language_identification(
                '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', prob=True)
            ('ja', 0.9976244568824768)

            HanLP.language_identification(
                '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=2)
            ['zh', 'ja']

            HanLP.language_identification(
                '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=3, prob=True)
            {'zh': 0.3952908217906952, 'en': 0.37189167737960815, 'ja': 0.056213412433862686}

        .. _ISO 639-1 codes:
           https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
        """
        return self.text_classification(text, 'lid', topk, prob)