3
db                 @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlZddlZddl	Z
dd Zdd	 Zd
d Zdd Zdd Zdd Zdd Zdd ZG dd deZG dd deZG dd deZdd Zdd Zd d! ZdS )"zTokenization classes.    )absolute_import)division)print_functionNc       
      C   s   |sdS t jd|}|dkr dS |jd}ddddg}dd	d
g}d}||krb|  rbd}d}d}d}	||kr~| r~d}d}d}d}	|rtd|||||	f dS )zHChecks whether the casing config is consistent with the checkpoint name.Nz$^.*?([A-Za-z0-9_-]+)/bert_model.ckpt   zuncased_L-24_H-1024_A-16zuncased_L-12_H-768_A-12zmultilingual_L-12_H-768_A-12zchinese_L-12_H-768_A-12zcased_L-12_H-768_A-12zcased_L-24_H-1024_A-16zmulti_cased_L-12_H-768_A-12FTFalseZ
lowercasedTrueZcaseda  You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. However, `%s` seems to be a %s model, so you should pass in `--do_lower_case=%s` so that the fine-tuning matches how the model was pre-training. If this error is wrong, please just comment out this check.)rematchgroup
ValueError)
do_lower_caseinit_checkpointm
model_nameZlower_modelsZcased_modelsZis_bad_configZactual_flagZ	case_nameZopposite_flag r   ;D:\pycharm\NER-BERT-BiLSTM-CRF--master\bert\tokenization.py validate_case_matches_checkpoint  s4    
r   c             C   s   t jr<t| tr| S t| tr*| jddS tdt|  nDt jrxt| trX| jddS t| t	rf| S tdt|  ntddS )zGConverts `text` to Unicode (if it's not already), assuming utf-8 input.zutf-8ignorezUnsupported string type: %sz#Not running on Python2 or Python 3?N)
sixPY3
isinstancestrbytesdecoder   typePY2unicode)textr   r   r   convert_to_unicode  s    



r   c             C   s   t jr<t| tr| S t| tr*| jddS tdt|  nBt jrvt| trP| S t| t	rd| j
dS tdt|  ntddS )zAReturns text encoded in a way suitable for print or `tf.logging`.zutf-8r   zUnsupported string type: %sz#Not running on Python2 or Python 3?N)r   r   r   r   r   r   r   r   r   r   encode)r   r   r   r   printable_text  s    




r    c          
   C   sZ   t j }d}tjj| d6}x.t|j }|s0P |j }|||< |d7 }qW W dQ R X |S )z*Loads a vocabulary file into a dictionary.r   rr   N)collectionsOrderedDicttfgfileGFiler   readlinestrip)
vocab_filevocabindexreadertokenr   r   r   
load_vocab  s    r.   c             C   s8   g }x.|D ]&}|dkr"|j d q
|j | |  q
W |S )z4Converts a sequence of [tokens|ids] using the vocab.z","r   )append)r*   itemsoutputitemr   r   r   convert_by_vocab  s    
r3   c             C   s
   t | |S )N)r3   )r*   tokensr   r   r   convert_tokens_to_ids"  s    r5   c             C   s
   t | |S )N)r3   )	inv_vocabidsr   r   r   convert_ids_to_tokens&  s    r8   c             C   s   | j  } | sg S | j }|S )z@Runs basic whitespace cleaning and splitting on a piece of text.)r(   split)r   r4   r   r   r   whitespace_tokenize*  s
    r:   c               @   s2   e Zd ZdZdddZdd Zdd Zd	d
 ZdS )FullTokenizerzRuns end-to-end tokenziation.Tc             C   s>   t || _dd | jj D | _t|d| _t| jd| _d S )Nc             S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>8  s    z*FullTokenizer.__init__.<locals>.<dictcomp>)r   )r*   )r.   r*   r0   r6   BasicTokenizerbasic_tokenizerWordpieceTokenizerwordpiece_tokenizer)selfr)   r   r   r   r   __init__6  s    
zFullTokenizer.__init__c             C   s>   g }x4| j j|D ]$}x| jj|D ]}|j| q$W qW |S )N)rA   tokenizerC   r/   )rD   r   split_tokensr-   Z	sub_tokenr   r   r   rF   <  s
    zFullTokenizer.tokenizec             C   s   t | j|S )N)r3   r*   )rD   r4   r   r   r   r5   D  s    z#FullTokenizer.convert_tokens_to_idsc             C   s   t | j|S )N)r3   r6   )rD   r7   r   r   r   r8   G  s    z#FullTokenizer.convert_ids_to_tokensN)T)__name__
__module____qualname____doc__rE   rF   r5   r8   r   r   r   r   r;   3  s
   
r;   c               @   sJ   e Zd ZdZdddZdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dS )r@   zDRuns basic tokenization (punctuation splitting, lower casing, etc.).Tc             C   s
   || _ dS )zbConstructs a BasicTokenizer.

    Args:
      do_lower_case: Whether to lower case the input.
    N)r   )rD   r   r   r   r   rE   N  s    zBasicTokenizer.__init__c             C   sp   t |}| j|}| j|}t|}g }x4|D ],}| jrJ|j }| j|}|j| j| q.W tdj	|}|S )zTokenizes a piece of text. )
r   _clean_text_tokenize_chinese_charsr:   r   lower_run_strip_accentsextend_run_split_on_puncjoin)rD   r   Zorig_tokensrG   r-   output_tokensr   r   r   rF   V  s    



zBasicTokenizer.tokenizec             C   sF   t jd|}g }x*|D ]"}t j|}|dkr.q|j| qW dj|S )z$Strips accents from a piece of text.ZNFDZMn )unicodedata	normalizecategoryr/   rS   )rD   r   r1   charcatr   r   r   rP   n  s    

z!BasicTokenizer._run_strip_accentsc             C   s~   t |}d}d}g }xZ|t|k rn|| }t|rD|j|g d}n |rR|jg  d}|d j| |d7 }qW dd |D S )z&Splits punctuation on a piece of text.r   TFr   c             S   s   g | ]}d j |qS )rU   )rS   )r<   xr   r   r   
<listcomp>  s    z5BasicTokenizer._run_split_on_punc.<locals>.<listcomp>)listlen_is_punctuationr/   )rD   r   charsiZstart_new_wordr1   rY   r   r   r   rR   y  s    
z!BasicTokenizer._run_split_on_puncc             C   sX   g }xH|D ]@}t |}| j|r@|jd |j| |jd q
|j| q
W dj|S )z)Adds whitespace around any CJK character.rL   rU   )ord_is_chinese_charr/   rS   )rD   r   r1   rY   cpr   r   r   rN     s    



z&BasicTokenizer._tokenize_chinese_charsc             C   s   |dkr|dks|dkr |dks|dkr0|dks|dkr@|dks|d	krP|d
ks|dkr`|dks|dkrp|dks|dkr|dkrdS dS )z6Checks whether CP is the codepoint of a CJK character.i N  i  i 4  iM  i   iߦ i  i? i@ i i  i i   i  i  i TFr   )rD   re   r   r   r   rd     s    
zBasicTokenizer._is_chinese_charc             C   s\   g }xL|D ]D}t |}|dks
|dks
t|r0q
t|rD|jd q
|j| q
W dj|S )zBPerforms invalid character removal and whitespace cleanup on text.r   i  rL   rU   )rc   _is_control_is_whitespacer/   rS   )rD   r   r1   rY   re   r   r   r   rM     s    
zBasicTokenizer._clean_textN)T)rH   rI   rJ   rK   rE   rF   rP   rR   rN   rd   rM   r   r   r   r   r@   K  s   
r@   c               @   s"   e Zd ZdZd	ddZdd ZdS )
rB   zRuns WordPiece tokenziation.[UNK]   c             C   s   || _ || _|| _d S )N)r*   	unk_tokenmax_input_chars_per_word)rD   r*   rj   rk   r   r   r   rE     s    zWordpieceTokenizer.__init__c             C   s   t |}g }xt|D ]}t|}t|| jkr>|j| j qd}d}g }x|t|k rt|}d}	xF||k rdj||| }
|dkrd|
 }
|
| jkr|
}	P |d8 }qfW |	dkrd}P |j|	 |}qLW |r|j| j q|j	| qW |S )a  Tokenizes a piece of text into its word pieces.

    This uses a greedy longest-match-first algorithm to perform tokenization
    using the given vocabulary.

    For example:
      input = "unaffable"
      output = ["un", "##aff", "##able"]

    Args:
      text: A single token or whitespace separated tokens. This should have
        already been passed through `BasicTokenizer.

    Returns:
      A list of wordpiece tokens.
    Fr   NrU   z##r   T)
r   r:   r^   r_   rk   r/   rj   rS   r*   rQ   )rD   r   rT   r-   ra   Zis_badstartZ
sub_tokensendZ
cur_substrsubstrr   r   r   rF     s<    


zWordpieceTokenizer.tokenizeN)rh   ri   )rH   rI   rJ   rK   rE   rF   r   r   r   r   rB     s   
rB   c             C   s>   | dks | dks | dks | dkr$dS t j| }|dkr:dS dS )z1Checks whether `chars` is a whitespace character.rL   	
TZsF)rV   rX   )rY   rZ   r   r   r   rg     s     
rg   c             C   s6   | dks| dks| dkrdS t j| }|dkr2dS dS )	z.Checks whether `chars` is a control character.ro   rp   rq   FCcCfT)rs   rt   )rV   rX   )rY   rZ   r   r   r   rf     s    
rf   c             C   sh   t | }|dkr|dksH|dkr(|dksH|dkr8|dksH|dkrL|dkrLd	S tj| }|jd
rdd	S dS )z2Checks whether `chars` is a punctuation character.!   /   :   @   [   `   {   ~   TPF)rc   rV   rX   
startswith)rY   re   rZ   r   r   r   r`     s      

r`   )rK   
__future__r   r   r   r"   r   rV   r   
tensorflowr$   r   r   r    r.   r3   r5   r8   r:   objectr;   r@   rB   rg   rf   r`   r   r   r   r   <module>  s,   2	s>