B
    àN3iè  ã               @   s„   d dl Z d dlZd dlZd dlmZmZmZ d dlmZ ej	dddZ
dd„ Zd	d
„ Zdd„ Zddd„ZG dd„ deƒZdd„ ZdS )é    N)Úcreate_dicoÚcreate_mappingÚzero_digits)Útokenizationz=./pretrained_model/chinese-bert_wwm_L-12_H-768_A-12/vocab.txtF)Ú
vocab_fileZdo_lower_casec             C   sú   g }g }d}xÂt  | dd¡D ]°}|d7 }|r8t| ¡ ƒn| ¡ }|spt|ƒdkrÌd|d d krj| |¡ g }q|d dkr–d|dd…  }| ¡ }n| ¡ }yt|ƒd	ks°t‚| |¡ W q   wY qX qW t|ƒdkröd|d d krö| |¡ |S )
zv
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    r   ÚrÚutf8é   ZDOCSTARTú ú$Né   )ÚcodecsÚopenr   ÚrstripÚlenÚappendÚsplitÚAssertionError)ÚpathÚlowerÚzerosÚ	sentencesÚsentenceÚnumÚlineÚword© r   úB/home/tuu73405/projects/NER-BERT-BiLSTM-CRF--master/data_helper.pyÚload_sentences   s0    


r   c             C   s\   dd„ | D ƒ}t |ƒ}t|ƒd |d< t|ƒd |d< t|ƒ\}}tdt|ƒ ƒ |||fS )zI
    Create a dictionary and a mapping of tags, sorted by frequency.
    c             S   s   g | ]}d d„ |D ƒ‘qS )c             S   s   g | ]}|d  ‘qS )éÿÿÿÿr   )Ú.0Úcharr   r   r   ú
<listcomp>2   s    z*tag_mapping.<locals>.<listcomp>.<listcomp>r   )r    Úsr   r   r   r"   2   s    ztag_mapping.<locals>.<listcomp>r	   z[SEP]r   z[CLS]z!Found %i unique named entity tags)r   r   r   Úprint)r   ÚtagsZdicoÚ	tag_to_idÚ	id_to_tagr   r   r   Útag_mapping.   s    r(   c             C   s¼  |   d¡}|  d¡}g }g }xdt|ƒD ]X\}	}
| |
¡}| |¡ ||	 }x2tt|ƒƒD ]"}|dkrp| |¡ qX| d¡ qXW q&W t|ƒ|d kr²|d|d … }|d|d … }g }g }g }| d¡ | d¡ | |d ¡ xLt|ƒD ]@\}	}| |¡ | d¡ y| |||	  ¡ W qê   Y qêX qêW | d¡ | d¡ | |d ¡ | |¡}dgt|ƒ }xFt|ƒ|k r®| d¡ | d¡ | d¡ | d¡ | d¡ qjW ||||fS )	z
    r
   r   ÚXr	   r   z[CLS]z[SEP]z**NULL**)r   Ú	enumerateÚtokenizeÚextendÚranger   r   Zconvert_tokens_to_ids)Ú	char_liner&   Úmax_seq_lengthÚ	tokenizerÚ
label_lineZ	text_listZ
label_listÚtokensÚlabelsÚir   ÚtokenZlabel_1ÚmZntokensÚsegment_idsÚ	label_idsÚ	input_idsÚ
input_maskr   r   r   Úconvert_single_example=   sP    















r;   Tc                s¨   ‡ fdd„}g }x’| D ]Š}dd„ |D ƒ}d  |¡}	t |	¡}
|rPdd„ |D ƒ}ndd„ |D ƒ}d  |¡}t |¡}t|
||t|d\}}}}| |||||g¡ qW |S )	zš
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    c                s   ˆ r|   ¡ S | S )N)r   )Úx)r   r   r   Úfz   s    zprepare_dataset.<locals>.fc             S   s   g | ]}|d    ¡ ‘qS )r   )Ústrip)r    Úwr   r   r   r"   ~   s    z#prepare_dataset.<locals>.<listcomp>r
   c             S   s   g | ]}|d  ‘qS )r   r   )r    r?   r   r   r   r"   ƒ   s    c             S   s   g | ]}d ‘qS )ÚOr   )r    Ú_r   r   r   r"   …   s    )r.   r&   r/   r0   r1   )Újoinr   Úconvert_to_unicoder;   r0   r   )r   r/   r&   r   Útrainr=   Údatar#   Ústringr.   Útextr%   r3   ÚidsÚmaskr7   r8   r   )r   r   Úprepare_datasets   s$    




rJ   c               @   s>   e Zd Zdd„ Zdd„ Zedd„ ƒZedd„ ƒZdd
d„ZdS )ÚBatchManagerc             C   s   |   ||¡| _t| jƒ| _d S )N)Úsort_and_padÚ
batch_datar   Úlen_data)ÚselfrE   Ú
batch_sizer   r   r   Ú__init__–   s    zBatchManager.__init__c          
   C   s^   t t t|ƒ| ¡ƒ}tƒ }x<t|ƒD ]0}| |  |t || ƒt |d | ƒ… ¡¡ q&W |S )Nr	   )ÚintÚmathÚceilr   Úlistr-   r   Úarrange_batch)rO   rE   rP   Z	num_batchrM   r4   r   r   r   rL   š   s
    0zBatchManager.sort_and_padc             C   sl   g }g }g }g }g }xH| D ]@\}}}}	}
|  |¡ |  |¡ |  |¡ |  |	¡ |  |
¡ qW |||||gS )u^   
        æŠŠbatchæ•´ç†ä¸ºä¸€ä¸ª[5, ]çš„æ•°ç»„
        :param batch:
        :return:
        )r   )ÚbatchÚstringsr7   ÚcharsrI   ÚtargetsrF   Zseg_idsr!   ÚmskÚtargetr   r   r   rV   ¢   s    



zBatchManager.arrange_batchc             C   s”   g }g }g }g }t dd„ | D ƒƒ}xd| D ]\}|\}}}	}
}dg|t|ƒ  }| || ¡ | |	| ¡ | |
| ¡ | || ¡ q(W ||||gS )Nc             S   s   g | ]}t |d  ƒ‘qS )r   )r   )r    r   r   r   r   r"   ¼   s    z)BatchManager.pad_data.<locals>.<listcomp>r   )Úmaxr   r   )rE   rX   rY   ZsegsrZ   Ú
max_lengthr   rF   r7   r!   Úsegr\   Úpaddingr   r   r   Úpad_data¶   s    
zBatchManager.pad_dataFc             c   s4   |rt  | j¡ xt| jƒD ]}| j| V  qW d S )N)ÚrandomÚshufflerM   r-   rN   )rO   rc   Úidxr   r   r   Ú
iter_batchÆ   s    zBatchManager.iter_batchN)F)	Ú__name__Ú
__module__Ú__qualname__rQ   rL   ÚstaticmethodrV   ra   re   r   r   r   r   rK   ”   s
   rK   c             C   s´   dd„ | D ƒ}d  |¡}t |¡}dd„ |D ƒ}d  |¡}t |¡}t|||t|d\}}	}
}ddl}| |
d|f¡}
| |d|f¡}| |	d|f¡}	| |d|f¡}||
||	|gS )	za
    Take sentence data and return an input for
    the training or the evaluation function.
    c             S   s   g | ]}|d    ¡ ‘qS )r   )r>   )r    r?   r   r   r   r"   Ò   s    z#input_from_line.<locals>.<listcomp>r
   c             S   s   g | ]}d ‘qS )r@   r   )r    rA   r   r   r   r"   Ø   s    )r.   r&   r/   r0   r1   r   Nr	   )rB   r   rC   r;   r0   ÚnumpyÚreshape)r   r/   r&   rF   r.   rG   r%   r3   rH   rI   r7   r8   Únpr   r   r   Úinput_from_lineÍ   s"    



rm   )FT)r   rS   rb   Úutilsr   r   r   Úmodelsr   ZFullTokenizerr0   r   r(   r;   rJ   ÚobjectrK   rm   r   r   r   r   Ú<module>   s   !6
!9