o
    Où2iµ  ã                   @   s„   d dl Z d dlZd dlZd dlmZmZmZ d dlmZ ej	dddZ
dd„ Zd	d
„ Zdd„ Zddd„ZG dd„ deƒZdd„ ZdS )é    N)Úcreate_dicoÚcreate_mappingÚzero_digits)Útokenizationz=./pretrained_model/chinese-bert_wwm_L-12_H-768_A-12/vocab.txtF)Ú
vocab_fileZdo_lower_casec                 C   sð   g }g }d}t  | dd¡D ]U}|d7 }|rt| ¡ ƒn| ¡ }|s7t|ƒdkr6d|d d vr4| |¡ g }q|d dkrJd|dd…  }| ¡ }n| ¡ }zt|ƒd	ksWJ ‚| |¡ W q   Y qt|ƒdkrvd|d d vrv| |¡ |S )
zv
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    r   ÚrÚutf8é   ZDOCSTARTÚ ú$Né   )ÚcodecsÚopenr   ÚrstripÚlenÚappendÚsplit)ÚpathÚlowerÚzerosÚ	sentencesÚsentenceÚnumÚlineÚword© r   úB/home/tuu73405/projects/NER-BERT-BiLSTM-CRF--master/data_helper.pyÚload_sentences   s2   
€

r   c                 C   s\   dd„ | D ƒ}t |ƒ}t|ƒd |d< t|ƒd |d< t|ƒ\}}tdt|ƒ ƒ |||fS )zI
    Create a dictionary and a mapping of tags, sorted by frequency.
    c                 S   s   g | ]	}d d„ |D ƒ‘qS )c                 S   ó   g | ]}|d  ‘qS ©éÿÿÿÿr   )Ú.0Úcharr   r   r   Ú
<listcomp>4   ó    z*tag_mapping.<locals>.<listcomp>.<listcomp>r   )r!   Úsr   r   r   r#   4   s    ztag_mapping.<locals>.<listcomp>r	   ú[SEP]r   ú[CLS]z!Found %i unique named entity tags)r   r   r   Úprint)r   ÚtagsZdicoÚ	tag_to_idÚ	id_to_tagr   r   r   Útag_mapping0   s   
r,   c                 C   s®  |   d¡}|  d¡}g }g }t|ƒD ]*\}	}
| |
¡}| |¡ ||	 }tt|ƒƒD ]}|dkr6| |¡ q*| d¡ q*qt|ƒ|d krU|d|d … }|d|d … }g }g }g }| d¡ | d¡ | |d ¡ t|ƒD ]\}	}| |¡ | d¡ z| |||	  ¡ W qp   Y qp| d¡ | d¡ | |d ¡ | |¡}dgt|ƒ }t|ƒ|k rÑ| d¡ | d¡ | d¡ | d¡ | d¡ t|ƒ|k s²||||fS )	uR   
    å°†ä¸€ä¸ªæ ·æœ¬è¿›è¡Œåˆ†æžï¼Œç„¶åŽå°†å­—è½¬åŒ–ä¸ºid, æ ‡ç­¾è½¬åŒ–ä¸ºlb
    r
   r   ÚXr	   r   r'   r&   z**NULL**)r   Ú	enumerateÚtokenizeÚextendÚranger   r   Zconvert_tokens_to_ids)Ú	char_liner*   Úmax_seq_lengthÚ	tokenizerÚ
label_lineÚ	text_listÚ
label_listÚtokensÚlabelsÚir   ÚtokenZlabel_1ÚmZntokensÚsegment_idsÚ	label_idsÚ	input_idsÚ
input_maskr   r   r   Úconvert_single_example?   sT   



ü











úrA   Tc                    s¤   ‡ fdd„}g }| D ]E}dd„ |D ƒ}d  |¡}	t |	¡}
|r'dd„ |D ƒ}ndd„ |D ƒ}d  |¡}t |¡}t|
||t|d\}}}}| |||||g¡ q
|S )	zš
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    c                    s   ˆ r|   ¡ S | S ©N©r   )ÚxrC   r   r   Úf}   s   zprepare_dataset.<locals>.fc                 S   ó   g | ]}|d    ¡ ‘qS ©r   ©Ústrip©r!   Úwr   r   r   r#      ó    z#prepare_dataset.<locals>.<listcomp>r
   c                 S   r   r   r   rJ   r   r   r   r#   †   r$   c                 S   ó   g | ]}d ‘qS ©ÚOr   ©r!   Ú_r   r   r   r#   ˆ   ó    ©r2   r*   r3   r4   r5   )Újoinr   Úconvert_to_unicoderA   r4   r   )r   r3   r*   r   ÚtrainrE   Údatar%   Ústringr2   Útextr)   r9   ÚidsÚmaskr=   r>   r   rC   r   Úprepare_datasetv   s&   



ür\   c                   @   s>   e Zd Zdd„ Zdd„ Zedd„ ƒZedd„ ƒZdd
d„ZdS )ÚBatchManagerc                 C   s   |   ||¡| _t| jƒ| _d S rB   )Úsort_and_padÚ
batch_datar   Úlen_data)ÚselfrW   Ú
batch_sizer   r   r   Ú__init__™   s   zBatchManager.__init__c              
   C   sZ   t t t|ƒ| ¡ƒ}tƒ }t|ƒD ]}| |  |t || ƒt |d | ƒ… ¡¡ q|S )Nr	   )ÚintÚmathÚceilr   Úlistr1   r   Úarrange_batch)ra   rW   rb   Z	num_batchr_   r:   r   r   r   r^      s
   .zBatchManager.sort_and_padc                 C   sh   g }g }g }g }g }| D ] \}}}}	}
|  |¡ |  |¡ |  |¡ |  |	¡ |  |
¡ q|||||gS )u^   
        æŠŠbatchæ•´ç†ä¸ºä¸€ä¸ª[5, ]çš„æ•°ç»„
        :param batch:
        :return:
        )r   )ÚbatchÚstringsr=   Úcharsr[   ÚtargetsrX   Zseg_idsr"   ÚmskÚtargetr   r   r   rh   ¥   s   



zBatchManager.arrange_batchc                 C   s   g }g }g }g }t dd„ | D ƒƒ}| D ].}|\}}}	}
}dg|t|ƒ  }| || ¡ | |	| ¡ | |
| ¡ | || ¡ q||||gS )Nc                 S   s   g | ]}t |d  ƒ‘qS rG   )r   )r!   r   r   r   r   r#   ¿   rL   z)BatchManager.pad_data.<locals>.<listcomp>r   )Úmaxr   r   )rW   rj   rk   Úsegsrl   Ú
max_lengthr   rX   r=   r"   Úsegrn   Úpaddingr   r   r   Úpad_data¹   s   zBatchManager.pad_dataFc                 c   s2    |r	t  | j¡ t| jƒD ]}| j| V  qd S rB   )ÚrandomÚshuffler_   r1   r`   )ra   rv   Úidxr   r   r   Ú
iter_batchÉ   s   €ÿzBatchManager.iter_batchN)F)	Ú__name__Ú
__module__Ú__qualname__rc   r^   Ústaticmethodrh   rt   rx   r   r   r   r   r]   —   s    

r]   c                 C   s´   dd„ | D ƒ}d  |¡}t |¡}dd„ |D ƒ}d  |¡}t |¡}t|||t|d\}}	}
}ddl}| |
d|f¡}
| |d|f¡}| |	d|f¡}	| |d|f¡}||
||	|gS )	za
    Take sentence data and return an input for
    the training or the evaluation function.
    c                 S   rF   rG   rH   rJ   r   r   r   r#   Õ   rL   z#input_from_line.<locals>.<listcomp>r
   c                 S   rM   rN   r   rP   r   r   r   r#   Û   rR   rS   r   Nr	   )rT   r   rU   rA   r4   ÚnumpyÚreshape)r   r3   r*   rX   r2   rY   r)   r9   rZ   r[   r=   r>   Únpr   r   r   Úinput_from_lineÐ   s$   



ür€   )FT)r   re   ru   Úutilsr   r   r   Úmodelsr   ZFullTokenizerr4   r   r,   rA   r\   Úobjectr]   r€   r   r   r   r   Ú<module>   s   ÿ!
7!9