class Vocab(Serializable):
    def __init__(self, idx_to_token: List[str] = None, token_to_idx: Dict = None, mutable=True, pad_token=PAD,
                 unk_token=UNK) -> None:
        """Vocabulary base class which converts tokens to indices and vice versa.

        Args:
            idx_to_token: id to token mapping.
            token_to_idx: token to id mapping.
            mutable: ``True`` to allow adding new tokens, ``False`` to map OOV to ``unk``.
            pad_token: The token representing padding.
            unk_token: The token representing OOV.
        """
        super().__init__()
        if idx_to_token:
            t2i = dict((token, idx) for idx, token in enumerate(idx_to_token))
            if token_to_idx:
                t2i.update(token_to_idx)
            token_to_idx = t2i
        if token_to_idx is None:
            token_to_idx = {}
            if pad_token is not None:
                token_to_idx[pad_token] = len(token_to_idx)
            if unk_token is not None:
                token_to_idx[unk_token] = token_to_idx.get(unk_token, len(token_to_idx))
        self.token_to_idx = token_to_idx
        self.idx_to_token: List[str] = None
        self.mutable = mutable
        self.pad_token = pad_token
        self.unk_token = unk_token

    def __setitem__(self, token: str, idx: int):
        assert self.mutable, 'Update an immutable Vocab object is not allowed'
        self.token_to_idx[token] = idx

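    # Added illustration (not part of the original source): with the default PAD/UNK constants,
    # a freshly constructed vocab already contains the two special tokens, so the first real
    # token receives index 2.
    #   >>> vocab = Vocab()
    #   >>> vocab.pad_idx, vocab.unk_idx
    #   (0, 1)
    #   >>> vocab['hello']   # mutable by default, so the token is added on lookup
    #   2
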
    def __getitem__(self, key: Union[str, int, List]) -> Union[int, str, List]:
        """Get the index/indices associated with a token or a list of tokens, or vice versa.

        Args:
            key: ``str`` for token(s) and ``int`` for index/indices.

        Returns:
            Associated indices or tokens.
        """
        if isinstance(key, str):
            return self.get_idx(key)
        elif isinstance(key, int):
            return self.get_token(key)
        elif isinstance(key, list):
            if len(key) == 0:
                return []
            elif isinstance(key[0], str):
                return [self.get_idx(x) for x in key]
            elif isinstance(key[0], int):
                return [self.get_token(x) for x in key]

    def add(self, token: str) -> int:
        """Try to add a token to the vocab and return its id. If the token is already present, its id is returned
        and the vocab is not updated. If the vocab is locked, an assertion failure occurs.

        Args:
            token: A new or existing token.

        Returns:
            Its associated id.
        """
        assert self.mutable, 'It is not allowed to call add on an immutable Vocab'
        assert isinstance(token, str), f'Token type must be str but got {type(token)} from {token}'
        assert token is not None, 'Token must not be None'
        idx = self.token_to_idx.get(token, None)
        if idx is None:
            idx = len(self.token_to_idx)
            self.token_to_idx[token] = idx
        return idx

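    # Added illustration (not part of the original source): ``__getitem__`` dispatches on the key
    # type, and ``add`` is idempotent for tokens that are already present.
    #   >>> vocab = Vocab()
    #   >>> vocab.add('cat')          # new token
    #   2
    #   >>> vocab.add('cat')          # already present: same id, vocab unchanged
    #   2
    #   >>> vocab[['cat', 'dog']]     # list of str -> list of ids ('dog' gets added)
    #   [2, 3]
    #   >>> vocab[2]                  # int -> token
    #   'cat'
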
    def update(self, tokens: Iterable[str]) -> None:
        """Update the vocab with these tokens by adding them to the vocab one by one.

        Args:
            tokens (Iterable[str]): A list of tokens.
        """
        assert self.mutable, 'It is not allowed to update an immutable Vocab'
        for token in tokens:
            self.add(token)

    def get_idx(self, token: str) -> int:
        """Get the idx of a token. If the token is not present, it is added to the vocab while the vocab is mutable;
        otherwise the id of UNK is returned.

        Args:
            token: A token.

        Returns:
            The id of that token.
        """
        assert isinstance(token, str), 'token has to be `str`'
        idx = self.token_to_idx.get(token, None)
        if idx is None:
            if self.mutable:
                idx = len(self.token_to_idx)
                self.token_to_idx[token] = idx
            else:
                idx = self.token_to_idx.get(self.unk_token, None)
        return idx

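    # Added illustration (not part of the original source): while the vocab is mutable an unknown
    # token is inserted silently; after ``lock()`` the same lookup falls back to the UNK index.
    #   >>> vocab = Vocab()
    #   >>> vocab.get_idx('new')        # mutable: token is inserted
    #   2
    #   >>> vocab.lock().get_idx('oov') # locked: OOV maps to UNK
    #   1
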
    def get_token(self, idx: int) -> str:
        """Get the token using its index.

        Args:
            idx: The index to a token.

        Returns:
            The token corresponding to that index.
        """
        if self.idx_to_token:
            return self.idx_to_token[idx]

        if self.mutable:
            for token in self.token_to_idx:
                if self.token_to_idx[token] == idx:
                    return token

    def lock(self):
        """Lock this vocab up so that it won't accept new tokens.

        Returns:
            Itself.
        """
        if self.locked:
            return self
        self.mutable = False
        self.build_idx_to_token()
        return self

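    # Added illustration (not part of the original source), assuming ``build_idx_to_token`` (not
    # shown in this excerpt) fills ``idx_to_token`` in index order, so index -> token lookups
    # become a list access instead of a linear scan over ``token_to_idx``.
    #   >>> vocab = Vocab(); vocab.add('cat'); vocab.lock()
    #   >>> vocab.get_token(2)
    #   'cat'
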
    def unlock(self):
        """Unlock this vocab so that new tokens can be added in.

        Returns:
            Itself.
        """
        if not self.locked:
            return self
        self.mutable = True
        self.idx_to_token = None
        return self

    @property
    def locked(self):
        """``True`` indicates this vocab is locked."""
        return not self.mutable

    @property
    def unk_idx(self):
        """The index of ``UNK`` token."""
        if self.unk_token is None:
            return None
        else:
            return self.token_to_idx.get(self.unk_token, None)

    @property
    def pad_idx(self):
        """The index of ``PAD`` token."""
        if self.pad_token is None:
            return None
        else:
            return self.token_to_idx.get(self.pad_token, None)

    @property
    def tokens(self):
        """A set of all tokens in this vocab."""
        return self.token_to_idx.keys()

    def __str__(self) -> str:
        return self.token_to_idx.__str__()

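    # Added illustration (not part of the original source): ``locked`` mirrors ``mutable``, and
    # ``unlock()`` drops the cached ``idx_to_token`` so new tokens can be appended again.
    #   >>> vocab = Vocab().lock()
    #   >>> vocab.locked
    #   True
    #   >>> vocab.unlock().locked
    #   False
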
    def summary(self, verbose=True) -> str:
        """Get or print a summary of this vocab.

        Args:
            verbose: ``True`` to print the summary to stdout.

        Returns:
            Summary in text form.
        """
        report = '[{}] = '.format(len(self))
        report += str(list(self.token_to_idx.keys())[:min(50, len(self))])
        if verbose:
            print(report)
        return report

    def to_dict(self) -> dict:
        """Convert this vocab to a dict so that it can be json serialized.

        Returns:
            A dict.
        """
        idx_to_token = self.idx_to_token
        pad_token = self.pad_token
        unk_token = self.unk_token
        mutable = self.mutable
        # At this point ``locals()`` holds exactly the four fields above plus ``self``, which is dropped.
        items = locals().copy()
        items.pop('self')
        return items

    def copy_from(self, item: dict):
        """Copy properties from a dict so that it can be json de-serialized.

        Args:
            item: A dict holding ``idx_to_token`` and related properties.

        Returns:
            Itself.
        """
        for key, value in item.items():
            setattr(self, key, value)
        self.token_to_idx = {k: v for v, k in enumerate(self.idx_to_token)}
        return self

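    # Added illustration (not part of the original source): ``to_dict``/``copy_from`` round-trip a
    # vocab through plain Python objects; since ``copy_from`` rebuilds ``token_to_idx`` from
    # ``idx_to_token``, the vocab should be locked (``idx_to_token`` populated) before dumping.
    #   >>> vocab = Vocab(); vocab.add('cat'); vocab.lock()
    #   >>> restored = Vocab().copy_from(vocab.to_dict())
    #   >>> restored['cat']
    #   2
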
    def lower(self):
        """Convert all tokens to lower case.

        Returns:
            Itself.
        """
        self.unlock()
        token_to_idx = self.token_to_idx
        self.token_to_idx = {}
        for token in token_to_idx.keys():
            self.add(token.lower())
        return self

    @property
    def first_token(self):
        """The first token in this vocab."""
        if self.idx_to_token:
            return self.idx_to_token[0]
        if self.token_to_idx:
            return next(iter(self.token_to_idx))
        return None

    def merge(self, other):
        """Merge this with another vocab in place.

        Args:
            other (Vocab): Another vocab.
        """
        for word, idx in other.token_to_idx.items():
            self.get_idx(word)

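    # Added illustration (not part of the original source): ``lower()`` rebuilds the mapping with
    # lower-cased tokens, and ``merge`` pulls every token of another vocab into this one.
    #   >>> a = Vocab(); a.add('Cat'); a.lower()['cat']
    #   2
    #   >>> b = Vocab(); b.add('dog'); a.merge(b); a['dog']
    #   3
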
    @property
    def safe_pad_token(self) -> str:
        """Get the pad token safely. It always returns a pad token, which is the pad token or the first token
        if the pad token is not present in the vocab.
        """
        if self.pad_token:
            return self.pad_token
        if self.first_token:
            return self.first_token
        return PAD

    @property
    def safe_pad_token_idx(self) -> int:
        """Get the idx of the pad token safely. It always returns an index, which corresponds to the pad token
        or the first token if the pad token is not present in the vocab.
        """
        return self.token_to_idx.get(self.safe_pad_token, 0)

    @property
    def safe_unk_token(self) -> str:
        """Get the unk token safely. It always returns an unk token, which is the unk token or the first token
        if the unk token is not present in the vocab.
        """
        if self.unk_token:
            return self.unk_token
        if self.first_token:
            return self.first_token
        return UNK

    def __repr__(self) -> str:
        if self.idx_to_token is not None:
            return self.idx_to_token.__repr__()
        return self.token_to_idx.__repr__()

    def extend(self, tokens: Iterable[str]):
        # ``__call__`` (not shown in this excerpt) maps tokens to indices, adding unseen tokens
        # while the vocab is mutable.
        self.unlock()
        self(tokens)

    def reload_idx_to_token(self, idx_to_token: List[str], pad_idx=0, unk_idx=1):
        # Rebuild both directions of the mapping from a plain token list.
        self.idx_to_token = idx_to_token
        self.token_to_idx = dict((s, i) for i, s in enumerate(idx_to_token))
        if pad_idx is not None:
            self.pad_token = idx_to_token[pad_idx]
        if unk_idx is not None:
            self.unk_token = idx_to_token[unk_idx]

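    # Added illustration (not part of the original source): the ``safe_*`` properties fall back to
    # the first token (or the global constants) when PAD/UNK are missing, e.g. in a label vocab.
    #   >>> vocab = Vocab(pad_token=None, unk_token=None); vocab.update(['B', 'I', 'O'])
    #   >>> vocab.safe_pad_token      # no pad token, so the first token is used
    #   'B'
    #   >>> vocab.safe_pad_token_idx
    #   0
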
    def set_unk_as_safe_unk(self):
        """Set ``self.unk_token = self.safe_unk_token``. It's useful when the dev/test set contains OOV labels."""
        self.unk_token = self.safe_unk_token

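# --- Added usage sketch (illustration only, not part of the original module). ---
# It assumes the PAD/UNK constants and ``Vocab.build_idx_to_token``/``Vocab.__len__`` from the
# surrounding module, which are not shown in this excerpt; the helper name below is hypothetical.
def _vocab_usage_sketch():
    vocab = Vocab()                       # PAD -> 0, UNK -> 1
    vocab.update(['the', 'cat', 'sat'])   # 'the' -> 2, 'cat' -> 3, 'sat' -> 4
    vocab.lock()                          # freeze: OOV lookups now map to UNK
    ids = vocab[['the', 'dog']]           # [2, vocab.unk_idx]
    assert ids == [2, vocab.unk_idx]
    return vocab.summary(verbose=False)
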
class CustomVocab(Vocab):
    def to_dict(self) -> dict:
        d = super().to_dict()
        d['type'] = classpath_of(self)
        return d


class LowercaseVocab(CustomVocab):
    def get_idx(self, token: str) -> int:
        idx = self.token_to_idx.get(token, None)
        if idx is None:
            idx = self.token_to_idx.get(token.lower(), None)
        if idx is None:
            if self.mutable:
                idx = len(self.token_to_idx)
                self.token_to_idx[token] = idx
            else:
                idx = self.token_to_idx.get(self.unk_token, None)
        return idx


class VocabWithNone(CustomVocab):
    def get_idx(self, token: str) -> int:
        if token is None:
            return -1
        return super().get_idx(token)


class VocabWithFrequency(CustomVocab):
    def __init__(self, counter: Counter = None, min_occur_cnt=0, pad_token=PAD, unk_token=UNK, specials=None) -> None:
        super().__init__(None, None, True, pad_token, unk_token)
        if specials:
            for each in specials:
                counter.pop(each, None)
                self.add(each)
        self.frequencies = [1] * len(self)
        if counter:
            for token, freq in counter.most_common():
                if freq >= min_occur_cnt:
                    self.add(token)
                    self.frequencies.append(freq)
        self.lock()

    def to_dict(self) -> dict:
        d = super().to_dict()
        d['frequencies'] = self.frequencies
        return d

    def copy_from(self, item: dict):
        super().copy_from(item)
        self.frequencies = item['frequencies']

    def get_frequency(self, token):
        idx = self.get_idx(token)
        if idx is not None:
            return self.frequencies[idx]
        return 0


class VocabCounter(CustomVocab):
    def __init__(self, idx_to_token: List[str] = None, token_to_idx: Dict = None, mutable=True, pad_token=PAD,
                 unk_token=UNK) -> None:
        super().__init__(idx_to_token, token_to_idx, mutable, pad_token, unk_token)
        self.counter = Counter()

    def get_idx(self, token: str) -> int:
        if self.mutable:
            self.counter[token] += 1
        return super().get_idx(token)

    def trim(self, min_frequency):
        assert self.mutable
        specials = {self.unk_token, self.pad_token}
        survivors = list((token, freq) for token, freq in self.counter.most_common()
                         if freq >= min_frequency and token not in specials)
        survivors = [(x, -1) for x in specials if x] + survivors
        self.counter = Counter(dict(survivors))
        self.token_to_idx = dict()
        self.idx_to_token = None
        for token, freq in survivors:
            idx = len(self.token_to_idx)
            self.token_to_idx[token] = idx

    def copy_from(self, item: dict):
        super().copy_from(item)
        # Restore counts from the serialized {token: count} dict produced by ``to_dict``.
        self.counter = Counter(item['counter']) if 'counter' in item else Counter()

    def to_dict(self) -> dict:
        d = super().to_dict()
        d['counter'] = dict(self.counter.items())
        return d


class Vocab3D(CustomVocab):
    def __call__(self, some_token: Union[str, Iterable[str], Iterable[Iterable[str]]]) \
            -> Union[int, List[int], List[List[int]]]:
        """Convert tokens to indices; unlike the base class, nested (up to 3D) lists are supported.

        Args:
            some_token: Tokens organised in 1 to 3 dimensions.

        Returns:
            The corresponding index or (nested) list of indices.
        """
        if isinstance(some_token, (list, tuple, set)):
            indices = []
            if len(some_token) and isinstance(some_token[0], (list, tuple, set)):
                for sent in some_token:
                    inside = []
                    for token in sent:
                        inside.append(self.get_idx(token))
                    indices.append(inside)
                return indices
            for token in some_token:
                if isinstance(token, str):
                    indices.append(self.get_idx(token))
                else:
                    indices.append([self.get_idx(x) for x in token])
            return indices
        else:
            return self.get_idx(some_token)


def create_label_vocab() -> Vocab:
    return Vocab(pad_token=None, unk_token=None)
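

# --- Added usage sketch for the subclasses (illustration only, not part of the original module). ---
# It assumes ``Counter`` from ``collections`` is imported by the surrounding module; the helper
# name below is hypothetical.
def _subclass_usage_sketch():
    # VocabWithFrequency records per-token counts and locks itself on construction.
    freq_vocab = VocabWithFrequency(Counter({'apple': 5, 'rare': 1}), min_occur_cnt=2)
    assert freq_vocab.get_frequency('apple') == 5

    # Vocab3D accepts nested (up to 3D) token lists.
    nested_ids = Vocab3D()([['a', 'b'], ['c']])   # e.g. [[2, 3], [4]]

    # Label vocabs have neither PAD nor UNK, so label indices start at 0.
    labels = create_label_vocab()
    assert labels.get_idx('positive') == 0
    return nested_ids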