"""Tokenization classes."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import re
import unicodedata

import six
import tensorflow as tf


def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
  """Checks whether the casing config is consistent with the checkpoint name."""

  # The casing is not stored in the checkpoint itself, so it is inferred
  # heuristically from the checkpoint directory name and compared against the
  # `do_lower_case` flag passed in by the user.
  if not init_checkpoint:
    return

  m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
  if m is None:
    return

  model_name = m.group(1)

  lower_models = [
      "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
      "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
  ]

  cased_models = [
      "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
      "multi_cased_L-12_H-768_A-12"
  ]

  is_bad_config = False
  if model_name in lower_models and not do_lower_case:
    is_bad_config = True
    actual_flag = "False"
    case_name = "lowercased"
    opposite_flag = "True"

  if model_name in cased_models and do_lower_case:
    is_bad_config = True
    actual_flag = "True"
    case_name = "cased"
    opposite_flag = "False"

  if is_bad_config:
    raise ValueError(
        "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
        "However, `%s` seems to be a %s model, so you should pass in "
        "`--do_lower_case=%s` so that the fine-tuning matches how the model "
        "was pre-training. If this error is wrong, please just comment out "
        "this check." % (actual_flag, init_checkpoint, model_name, case_name,
                         opposite_flag))


def convert_to_unicode(text):
  """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
  if six.PY3:
    if isinstance(text, str):
      return text
    elif isinstance(text, bytes):
      return text.decode("utf-8", "ignore")
    else:
      raise ValueError("Unsupported string type: %s" % (type(text)))
  elif six.PY2:
    if isinstance(text, str):
      return text.decode("utf-8", "ignore")
    elif isinstance(text, unicode):
      return text
    else:
      raise ValueError("Unsupported string type: %s" % (type(text)))
  else:
    raise ValueError("Not running on Python2 or Python 3?")


def printable_text(text):
  """Returns text encoded in a way suitable for print or `tf.logging`."""

  # These functions want `str` for both Python2 and Python3, but in one case
  # it is a Unicode string and in the other it is a byte string.
  if six.PY3:
    if isinstance(text, str):
      return text
    elif isinstance(text, bytes):
      return text.decode("utf-8", "ignore")
    else:
      raise ValueError("Unsupported string type: %s" % (type(text)))
  elif six.PY2:
    if isinstance(text, str):
      return text
    elif isinstance(text, unicode):
      return text.encode("utf-8")
    else:
      raise ValueError("Unsupported string type: %s" % (type(text)))
  else:
    raise ValueError("Not running on Python2 or Python 3?")


def load_vocab(vocab_file):
  """Loads a vocabulary file into a dictionary."""
  vocab = collections.OrderedDict()
  index = 0
  with tf.io.gfile.GFile(vocab_file, "r") as reader:
    while True:
      token = convert_to_unicode(reader.readline())
      if not token:
        break
      token = token.strip()
      vocab[token] = index
      index += 1
  return vocab


def convert_by_vocab(vocab, items):
  """Converts a sequence of [tokens|ids] using the vocab."""
  output = []
  for item in items:
    # This copy of the module special-cases the literal token '","' and maps
    # it straight to id 0 (as recovered from the compiled module) instead of
    # looking it up in the vocab.
    if item == '","':
      output.append(0)
    else:
      output.append(vocab[item])
  return output


def convert_tokens_to_ids(vocab, tokens):
  return convert_by_vocab(vocab, tokens)


def convert_ids_to_tokens(inv_vocab, ids):
  return convert_by_vocab(inv_vocab, ids)


def whitespace_tokenize(text):
  """Runs basic whitespace cleaning and splitting on a piece of text."""
  text = text.strip()
  if not text:
    return []
  tokens = text.split()
  return tokens


class FullTokenizer(object):
  """Runs end-to-end tokenization."""

  def __init__(self, vocab_file, do_lower_case=True):
    self.vocab = load_vocab(vocab_file)
    self.inv_vocab = {v: k for k, v in self.vocab.items()}
    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

  def tokenize(self, text):
    split_tokens = []
    for token in self.basic_tokenizer.tokenize(text):
      for sub_token in self.wordpiece_tokenizer.tokenize(token):
        split_tokens.append(sub_token)
    return split_tokens

  def convert_tokens_to_ids(self, tokens):
    return convert_by_vocab(self.vocab, tokens)

  def convert_ids_to_tokens(self, ids):
    return convert_by_vocab(self.inv_vocab, ids)


class BasicTokenizer(object):
  """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

  def __init__(self, do_lower_case=True):
    """Constructs a BasicTokenizer.

    Args:
      do_lower_case: Whether to lower case the input.
    """
    self.do_lower_case = do_lower_case

  def tokenize(self, text):
    """Tokenizes a piece of text."""
    text = convert_to_unicode(text)
    text = self._clean_text(text)

    # Put whitespace around CJK characters so they become individual tokens.
    text = self._tokenize_chinese_chars(text)

    orig_tokens = whitespace_tokenize(text)
    split_tokens = []
    for token in orig_tokens:
      if self.do_lower_case:
        token = token.lower()
        token = self._run_strip_accents(token)
      split_tokens.extend(self._run_split_on_punc(token))

    output_tokens = whitespace_tokenize(" ".join(split_tokens))
    return output_tokens

  def _run_strip_accents(self, text):
    """Strips accents from a piece of text."""
    text = unicodedata.normalize("NFD", text)
    output = []
    for char in text:
      cat = unicodedata.category(char)
      if cat == "Mn":
        continue
      output.append(char)
    return "".join(output)

  def _run_split_on_punc(self, text):
    """Splits punctuation on a piece of text."""
    chars = list(text)
    i = 0
    start_new_word = True
    output = []
    while i < len(chars):
      char = chars[i]
      if _is_punctuation(char):
        output.append([char])
        start_new_word = True
      else:
        if start_new_word:
          output.append([])
        start_new_word = False
        output[-1].append(char)
      i += 1

    return ["".join(x) for x in output]

  def _tokenize_chinese_chars(self, text):
    """Adds whitespace around any CJK character."""
    output = []
    for char in text:
      cp = ord(char)
      if self._is_chinese_char(cp):
        output.append(" ")
        output.append(char)
        output.append(" ")
      else:
        output.append(char)
    return "".join(output)

  def _is_chinese_char(self, cp):
    """Checks whether CP is the codepoint of a CJK character."""
    # Covers the CJK Unified Ideographs blocks and their extensions.
    if ((cp >= 0x4E00 and cp <= 0x9FFF) or
        (cp >= 0x3400 and cp <= 0x4DBF) or
        (cp >= 0x20000 and cp <= 0x2A6DF) or
        (cp >= 0x2A700 and cp <= 0x2B73F) or
        (cp >= 0x2B740 and cp <= 0x2B81F) or
        (cp >= 0x2B820 and cp <= 0x2CEAF) or
        (cp >= 0xF900 and cp <= 0xFAFF) or
        (cp >= 0x2F800 and cp <= 0x2FA1F)):
      return True

    return False

  def _clean_text(self, text):
    """Performs invalid character removal and whitespace cleanup on text."""
    output = []
    for char in text:
      cp = ord(char)
      if cp == 0 or cp == 0xfffd or _is_control(char):
        continue
      if _is_whitespace(char):
        output.append(" ")
      else:
        output.append(char)
    return "".join(output)
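

# A quick illustration (added for clarity; not in the original module) of what
# BasicTokenizer does to mixed input: accents are stripped when lower-casing
# is enabled and punctuation is split into its own tokens.
#
#   BasicTokenizer(do_lower_case=True).tokenize(u"Héllo, WORLD!")
#   # -> ["hello", ",", "world", "!"]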


class WordpieceTokenizer(object):
  """Runs WordPiece tokenization."""

  def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
    self.vocab = vocab
    self.unk_token = unk_token
    self.max_input_chars_per_word = max_input_chars_per_word

  def tokenize(self, text):
    """Tokenizes a piece of text into its word pieces.

    This uses a greedy longest-match-first algorithm to perform tokenization
    using the given vocabulary.

    For example:
      input = "unaffable"
      output = ["un", "##aff", "##able"]

    Args:
      text: A single token or whitespace separated tokens. This should have
        already been passed through `BasicTokenizer`.

    Returns:
      A list of wordpiece tokens.
    """

    text = convert_to_unicode(text)

    output_tokens = []
    for token in whitespace_tokenize(text):
      chars = list(token)
      if len(chars) > self.max_input_chars_per_word:
        output_tokens.append(self.unk_token)
        continue

      is_bad = False
      start = 0
      sub_tokens = []
      while start < len(chars):
        end = len(chars)
        cur_substr = None
        while start < end:
          substr = "".join(chars[start:end])
          if start > 0:
            substr = "##" + substr
          if substr in self.vocab:
            cur_substr = substr
            break
          end -= 1
        if cur_substr is None:
          is_bad = True
          break
        sub_tokens.append(cur_substr)
        start = end

      if is_bad:
        output_tokens.append(self.unk_token)
      else:
        output_tokens.extend(sub_tokens)
    return output_tokens


def _is_whitespace(char):
  """Checks whether `char` is a whitespace character."""
  # \t, \n and \r are technically control characters but are treated as
  # whitespace since they are generally considered as such.
  if char == " " or char == "\t" or char == "\n" or char == "\r":
    return True
  cat = unicodedata.category(char)
  if cat == "Zs":
    return True
  return False


def _is_control(char):
  """Checks whether `char` is a control character."""
  # \t, \n and \r are counted as whitespace, not control characters.
  if char == "\t" or char == "\n" or char == "\r":
    return False
  cat = unicodedata.category(char)
  if cat in ("Cc", "Cf"):
    return True
  return False


def _is_punctuation(char):
  """Checks whether `char` is a punctuation character."""
  cp = ord(char)
  # All non-letter/number ASCII is treated as punctuation, even characters
  # such as "^", "$" and "`" that are not in the Unicode Punctuation class.
  if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
      (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
    return True
  cat = unicodedata.category(char)
  if cat.startswith("P"):
    return True
  return False
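

# Minimal end-to-end usage sketch (added for illustration; not part of the
# original module). "vocab.txt" is a placeholder path; point it at a real
# BERT WordPiece vocabulary file before running.
if __name__ == "__main__":
  _tokenizer = FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
  _tokens = _tokenizer.tokenize(u"John lives in New York City.")
  _ids = _tokenizer.convert_tokens_to_ids(_tokens)
  print(_tokens)
  print(_ids)
  # Round-trip through the inverse vocabulary to recover the tokens.
  print(_tokenizer.convert_ids_to_tokens(_ids))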