# coding=utf-8
"""Tokenization classes."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import re
import unicodedata

import six
import tensorflow as tf


def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
  """Checks whether the casing config is consistent with the checkpoint name."""

  # The casing information is not stored in the checkpoint itself, so it has
  # to be inferred heuristically from the published checkpoint names.
  if not init_checkpoint:
    return

  m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
  if m is None:
    return

  model_name = m.group(1)

  lower_models = [
      "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
      "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
  ]

  cased_models = [
      "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
      "multi_cased_L-12_H-768_A-12"
  ]

  is_bad_config = False
  if model_name in lower_models and not do_lower_case:
    is_bad_config = True
    actual_flag = "False"
    case_name = "lowercased"
    opposite_flag = "True"

  if model_name in cased_models and do_lower_case:
    is_bad_config = True
    actual_flag = "True"
    case_name = "cased"
    opposite_flag = "False"

  if is_bad_config:
    raise ValueError(
        "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
        "However, `%s` seems to be a %s model, so you "
        "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
        "how the model was pre-trained. If this error is wrong, please "
        "just comment out this check." % (actual_flag, init_checkpoint,
                                          model_name, case_name,
                                          opposite_flag))
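

# Illustrative behavior of the check above (comment-only sketch; the
# checkpoint paths use the standard Google release names):
#
#   validate_case_matches_checkpoint(
#       do_lower_case=False,
#       init_checkpoint="uncased_L-12_H-768_A-12/bert_model.ckpt")
#   # -> raises ValueError: an uncased checkpoint needs --do_lower_case=True
#
#   validate_case_matches_checkpoint(
#       do_lower_case=True,
#       init_checkpoint="uncased_L-12_H-768_A-12/bert_model.ckpt")
#   # -> returns silently: the casing flag matches the checkpoint name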


def convert_to_unicode(text):
  """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
  if six.PY3:
    if isinstance(text, str):
      return text
    elif isinstance(text, bytes):
      return text.decode("utf-8", "ignore")
    else:
      raise ValueError("Unsupported string type: %s" % (type(text)))
  elif six.PY2:
    if isinstance(text, str):
      return text.decode("utf-8", "ignore")
    elif isinstance(text, unicode):  # `unicode` only exists on Python 2
      return text
    else:
      raise ValueError("Unsupported string type: %s" % (type(text)))
  else:
    raise ValueError("Not running on Python2 or Python 3?")


def printable_text(text):
  """Returns text encoded in a way suitable for print or `tf.logging`."""

  # These functions want `str` for both Python2 and Python3, but in one case
  # it's a Unicode string and in the other it's a byte string.
  if six.PY3:
    if isinstance(text, str):
      return text
    elif isinstance(text, bytes):
      return text.decode("utf-8", "ignore")
    else:
      raise ValueError("Unsupported string type: %s" % (type(text)))
  elif six.PY2:
    if isinstance(text, str):
      return text
    elif isinstance(text, unicode):
      return text.encode("utf-8")
    else:
      raise ValueError("Unsupported string type: %s" % (type(text)))
  else:
    raise ValueError("Not running on Python2 or Python 3?")


def load_vocab(vocab_file):
  """Loads a vocabulary file into a dictionary."""
  vocab = collections.OrderedDict()
  index = 0
  with tf.io.gfile.GFile(vocab_file, "r") as reader:
    while True:
      token = convert_to_unicode(reader.readline())
      if not token:
        break
      token = token.strip()
      vocab[token] = index
      index += 1
  return vocab


def convert_by_vocab(vocab, items):
  """Converts a sequence of [tokens|ids] using the vocab."""
  output = []
  for item in items:
    # This fork special-cases the literal token '","' (a quoted comma that
    # survives CSV-style preprocessing). The exact value appended here is not
    # recoverable from the compiled file; mapping the token to the plain
    # comma's vocab entry is an assumption.
    if item == '","':
      output.append(vocab[","])
    else:
      output.append(vocab[item])
  return output


def convert_tokens_to_ids(vocab, tokens):
  return convert_by_vocab(vocab, tokens)


def convert_ids_to_tokens(inv_vocab, ids):
  return convert_by_vocab(inv_vocab, ids)


def whitespace_tokenize(text):
  """Runs basic whitespace cleaning and splitting on a piece of text."""
  text = text.strip()
  if not text:
    return []
  tokens = text.split()
  return tokens


class FullTokenizer(object):
  """Runs end-to-end tokenization."""

  def __init__(self, vocab_file, do_lower_case=True):
    self.vocab = load_vocab(vocab_file)
    self.inv_vocab = {v: k for k, v in self.vocab.items()}
    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

  def tokenize(self, text):
    split_tokens = []
    for token in self.basic_tokenizer.tokenize(text):
      for sub_token in self.wordpiece_tokenizer.tokenize(token):
        split_tokens.append(sub_token)
    return split_tokens

  def convert_tokens_to_ids(self, tokens):
    return convert_by_vocab(self.vocab, tokens)

  def convert_ids_to_tokens(self, ids):
    return convert_by_vocab(self.inv_vocab, ids)
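

# Minimal usage sketch for FullTokenizer (the vocab path and sentence are
# illustrative; the sub-word splits produced depend on the vocab file used):
#
#   tokenizer = FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
#   tokens = tokenizer.tokenize(u"Tokenization example")
#   ids = tokenizer.convert_tokens_to_ids(tokens)
#   assert tokenizer.convert_ids_to_tokens(ids) == tokens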


class BasicTokenizer(object):
  """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

  def __init__(self, do_lower_case=True):
    """Constructs a BasicTokenizer.

    Args:
      do_lower_case: Whether to lower case the input.
    """
    self.do_lower_case = do_lower_case

  def tokenize(self, text):
    """Tokenizes a piece of text."""
    text = convert_to_unicode(text)
    text = self._clean_text(text)

    # Adds whitespace around CJK characters so that Chinese is effectively
    # tokenized character-by-character; English text is unaffected.
    text = self._tokenize_chinese_chars(text)

    orig_tokens = whitespace_tokenize(text)
    split_tokens = []
    for token in orig_tokens:
      if self.do_lower_case:
        token = token.lower()
        token = self._run_strip_accents(token)
      split_tokens.extend(self._run_split_on_punc(token))

    output_tokens = whitespace_tokenize(" ".join(split_tokens))
    return output_tokens

  def _run_strip_accents(self, text):
    """Strips accents from a piece of text."""
    text = unicodedata.normalize("NFD", text)
    output = []
    for char in text:
      cat = unicodedata.category(char)
      if cat == "Mn":  # skip combining marks left over from NFD decomposition
        continue
      output.append(char)
    return "".join(output)

  def _run_split_on_punc(self, text):
    """Splits punctuation on a piece of text."""
    chars = list(text)
    i = 0
    start_new_word = True
    output = []
    while i < len(chars):
      char = chars[i]
      if _is_punctuation(char):
        # Each punctuation character becomes its own token.
        output.append([char])
        start_new_word = True
      else:
        if start_new_word:
          output.append([])
        start_new_word = False
        output[-1].append(char)
      i += 1

    return ["".join(x) for x in output]

  def _tokenize_chinese_chars(self, text):
    """Adds whitespace around any CJK character."""
    output = []
    for char in text:
      cp = ord(char)
      if self._is_chinese_char(cp):
        output.append(" ")
        output.append(char)
        output.append(" ")
      else:
        output.append(char)
    return "".join(output)

  def _is_chinese_char(self, cp):
    """Checks whether CP is the codepoint of a CJK character."""
    # This defines a "chinese character" as anything in the CJK Unicode block:
    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    if ((cp >= 0x4E00 and cp <= 0x9FFF) or
        (cp >= 0x3400 and cp <= 0x4DBF) or
        (cp >= 0x20000 and cp <= 0x2A6DF) or
        (cp >= 0x2A700 and cp <= 0x2B73F) or
        (cp >= 0x2B740 and cp <= 0x2B81F) or
        (cp >= 0x2B820 and cp <= 0x2CEAF) or
        (cp >= 0xF900 and cp <= 0xFAFF) or
        (cp >= 0x2F800 and cp <= 0x2FA1F)):
      return True

    return False

  def _clean_text(self, text):
    """Performs invalid character removal and whitespace cleanup on text."""
    output = []
    for char in text:
      cp = ord(char)
      if cp == 0 or cp == 0xFFFD or _is_control(char):
        continue
      if _is_whitespace(char):
        output.append(" ")
      else:
        output.append(char)
    return "".join(output)


class WordpieceTokenizer(object):
  """Runs WordPiece tokenization."""

  def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
    self.vocab = vocab
    self.unk_token = unk_token
    self.max_input_chars_per_word = max_input_chars_per_word

  def tokenize(self, text):
    """Tokenizes a piece of text into its word pieces.

    This uses a greedy longest-match-first algorithm to perform tokenization
    using the given vocabulary.

    For example:
      input = "unaffable"
      output = ["un", "##aff", "##able"]

    Args:
      text: A single token or whitespace separated tokens. This should have
        already been passed through `BasicTokenizer`.

    Returns:
      A list of wordpiece tokens.
    """

    text = convert_to_unicode(text)

    output_tokens = []
    for token in whitespace_tokenize(text):
      chars = list(token)
      if len(chars) > self.max_input_chars_per_word:
        output_tokens.append(self.unk_token)
        continue

      is_bad = False
      start = 0
      sub_tokens = []
      while start < len(chars):
        # Find the longest vocab entry that matches at `start`.
        end = len(chars)
        cur_substr = None
        while start < end:
          substr = "".join(chars[start:end])
          if start > 0:
            substr = "##" + substr
          if substr in self.vocab:
            cur_substr = substr
            break
          end -= 1
        if cur_substr is None:
          is_bad = True
          break
        sub_tokens.append(cur_substr)
        start = end

      if is_bad:
        output_tokens.append(self.unk_token)
      else:
        output_tokens.extend(sub_tokens)
    return output_tokens


def _is_whitespace(char):
  """Checks whether `char` is a whitespace character."""
  # \t, \n, and \r are technically control characters but we treat them
  # as whitespace since they are generally considered as such.
  if char == " " or char == "\t" or char == "\n" or char == "\r":
    return True
  cat = unicodedata.category(char)
  if cat == "Zs":
    return True
  return False


def _is_control(char):
  """Checks whether `char` is a control character."""
  # These are technically control characters but we count them as whitespace
  # characters.
  if char == "\t" or char == "\n" or char == "\r":
    return False
  cat = unicodedata.category(char)
  if cat in ("Cc", "Cf"):
    return True
  return False


def _is_punctuation(char):
  """Checks whether `char` is a punctuation character."""
  cp = ord(char)
  # We treat all non-letter/number ASCII as punctuation. Characters such as
  # "^", "$", and "`" are not in the Unicode Punctuation class but we treat
  # them as punctuation anyway, for consistency.
  if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
      (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
    return True
  cat = unicodedata.category(char)
  if cat.startswith("P"):
    return True
  return False
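

if __name__ == "__main__":
  # Self-contained smoke test (an illustrative addition, not part of the
  # original module): write a tiny vocab file so the tokenizers can run end
  # to end without a real BERT checkpoint.
  import os
  import tempfile

  _demo_vocab = [
      "[PAD]", "[UNK]", "[CLS]", "[SEP]", ",", "want", "##want", "##ed",
      "un", "runn", "##ing"
  ]
  with tempfile.NamedTemporaryFile(
      "w", suffix=".txt", delete=False) as _vocab_file:
    _vocab_file.write("".join(t + "\n" for t in _demo_vocab))

  _tokenizer = FullTokenizer(vocab_file=_vocab_file.name, do_lower_case=True)
  assert _tokenizer.tokenize(u"UNwanted,running") == [
      "un", "##want", "##ed", ",", "runn", "##ing"
  ]
  os.unlink(_vocab_file.name)
  print("smoke test passed")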