3
>d                 @   s   d dl Z d dlZd dlZd dlmZmZmZ d dlmZ ej	dddZ
dd Zd	d
 Zdd ZdddZG dd deZdd ZdS )    N)create_dicocreate_mappingzero_digits)tokenizationzbD:\pycharm\NER-BERT-BiLSTM-CRF--master\pretrained_model\chinese-bert_wwm_L-12_H-768_A-12\vocab.txtF)
vocab_fileZdo_lower_casec          
   C   s   g }g }d}xt j| ddD ]}|d7 }|r8t|j n|j }|spt|dkrd|d d krj|j| g }q|d dkrd|dd  }|j }n|j }yt|d	kst|j| W q   wY qX qW t|dkrd|d d kr|j| |S )
zv
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    r   rutf8   ZDOCSTART $N   )codecsopenr   rstriplenappendsplitAssertionError)pathlowerzeros	sentencessentencenumlineword r   5D:\pycharm\NER-BERT-BiLSTM-CRF--master\data_helper.pyload_sentences   s0    


r   c             C   s\   dd | D }t |}t|d |d< t|d |d< t|\}}tdt|  |||fS )zI
    Create a dictionary and a mapping of tags, sorted by frequency.
    c             S   s   g | ]}d d |D qS )c             S   s   g | ]}|d qS )r	   r   ).0charr   r   r   
<listcomp>4   s    z*tag_mapping.<locals>.<listcomp>.<listcomp>r   )r    sr   r   r   r"   4   s    ztag_mapping.<locals>.<listcomp>r	   z[SEP]r   z[CLS]z!Found %i unique named entity tags)r   r   r   print)r   tagsZdico	tag_to_id	id_to_tagr   r   r   tag_mapping0   s    r(   c             C   s  | j d}|j d}g }g }xdt|D ]X\}	}
|j|
}|j| ||	 }x2tt|D ]"}|dkrp|j| qX|jd qXW q&W t||d kr|d|d  }|d|d  }g }g }g }|jd |jd |j|d  xLt|D ]@\}	}|j| |jd y|j|||	   W q   Y qX qW |jd |jd |j|d  |j|}dgt| }xFt||k r|jd |jd |jd |jd |jd qjW ||||fS )	uR   
    将一个样本进行分析，然后将字转化为id, 标签转化为lb
    r
   r   Xr	   r   z[CLS]z[SEP]z**NULL**)r   	enumeratetokenizeextendranger   r   Zconvert_tokens_to_ids)	char_liner&   max_seq_length	tokenizer
label_lineZ	text_list
label_listtokenslabelsir   tokenZlabel_1mZntokenssegment_ids	label_ids	input_ids
input_maskr   r   r   convert_single_example?   sP    















r<   Tc                s    fdd}g }x| D ]}dd |D }dj |}	tj|	}
|rPdd |D }ndd |D }dj |}tj|}t|
||t|d\}}}}|j|||||g qW |S )	z
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    c                s    r| j  S | S )N)r   )x)r   r   r   f}   s    zprepare_dataset.<locals>.fc             S   s   g | ]}|d  j  qS )r   )strip)r    wr   r   r   r"      s    z#prepare_dataset.<locals>.<listcomp>r
   c             S   s   g | ]}|d qS )r	   r   r   )r    r@   r   r   r   r"      s    c             S   s   g | ]}d qS )Or   )r    _r   r   r   r"      s    )r.   r&   r/   r0   r1   )joinr   convert_to_unicoder<   r0   r   )r   r/   r&   r   trainr>   datar#   stringr.   textr%   r4   idsmaskr8   r9   r   )r   r   prepare_datasetv   s$    




rK   c               @   s>   e Zd Zdd Zdd Zedd Zedd Zdd
dZdS )BatchManagerc             C   s   | j ||| _t| j| _d S )N)sort_and_pad
batch_datar   len_data)selfrF   
batch_sizer   r   r   __init__   s    zBatchManager.__init__c             C   s^   t tjt|| }t }x<t|D ]0}|j| j|t || t |d |   q&W |S )Nr	   )intmathceilr   listr-   r   arrange_batch)rP   rF   rQ   Z	num_batchrN   r5   r   r   r   rM      s
    0zBatchManager.sort_and_padc             C   sl   g }g }g }g }g }xH| D ]@\}}}}	}
|j | |j | |j | |j |	 |j |
 qW |||||gS )u^   
        把batch整理为一个[5, ]的数组
        :param batch:
        :return:
        )r   )batchstringsr8   charsrJ   targetsrG   Zseg_idsr!   msktargetr   r   r   rW      s    



zBatchManager.arrange_batchc             C   s   g }g }g }g }t dd | D }xd| D ]\}|\}}}	}
}dg|t|  }|j||  |j|	|  |j|
|  |j||  q(W ||||gS )Nc             S   s   g | ]}t |d  qS )r   )r   )r    r   r   r   r   r"      s    z)BatchManager.pad_data.<locals>.<listcomp>r   )maxr   r   )rF   rY   rZ   Zsegsr[   
max_lengthr   rG   r8   r!   segr]   paddingr   r   r   pad_data   s    
zBatchManager.pad_dataFc             c   s4   |rt j| j xt| jD ]}| j| V  qW d S )N)randomshufflerN   r-   rO   )rP   rd   idxr   r   r   
iter_batch   s    zBatchManager.iter_batchN)F)	__name__
__module____qualname__rR   rM   staticmethodrW   rb   rf   r   r   r   r   rL      s
   rL   c             C   s   dd | D }dj |}tj|}dd |D }dj |}tj|}t|||t|d\}}	}
}ddl}|j|
d|f}
|j|d|f}|j|	d|f}	|j|d|f}||
||	|gS )	za
    Take sentence data and return an input for
    the training or the evaluation function.
    c             S   s   g | ]}|d  j  qS )r   )r?   )r    r@   r   r   r   r"      s    z#input_from_line.<locals>.<listcomp>r
   c             S   s   g | ]}d qS )rA   r   )r    rB   r   r   r   r"      s    )r.   r&   r/   r0   r1   r   Nr	   )rC   r   rD   r<   r0   numpyreshape)r   r/   r&   rG   r.   rH   r%   r4   rI   rJ   r8   r9   npr   r   r   input_from_line   s"    



rn   )FT)r   rT   rc   utilsr   r   r   modelsr   ZFullTokenizerr0   r   r(   r<   rK   objectrL   rn   r   r   r   r   <module>   s   !7
!9