# coding=utf-8
"""The main BERT model and related functions."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import copy
import json
import math
import re

import numpy as np
import six
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()


class BertConfig(object):
  """Configuration for `BertModel`."""

  def __init__(self,
               vocab_size,
               hidden_size=768,
               num_hidden_layers=12,
               num_attention_heads=12,
               intermediate_size=3072,
               hidden_act="gelu",
               hidden_dropout_prob=0.1,
               attention_probs_dropout_prob=0.1,
               max_position_embeddings=512,
               type_vocab_size=16,
               initializer_range=0.02):
    """Constructs BertConfig.

    Args:
      vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
      hidden_size: Size of the encoder layers and the pooler layer.
      num_hidden_layers: Number of hidden layers in the Transformer encoder.
      num_attention_heads: Number of attention heads for each attention layer in
        the Transformer encoder.
      intermediate_size: The size of the "intermediate" (i.e., feed-forward)
        layer in the Transformer encoder.
      hidden_act: The non-linear activation function (function or string) in the
        encoder and pooler.
      hidden_dropout_prob: The dropout probability for all fully connected
        layers in the embeddings, encoder, and pooler.
      attention_probs_dropout_prob: The dropout ratio for the attention
        probabilities.
      max_position_embeddings: The maximum sequence length that this model might
        ever be used with. Typically set this to something large just in case
        (e.g., 512 or 1024 or 2048).
      type_vocab_size: The vocabulary size of the `token_type_ids` passed into
        `BertModel`.
      initializer_range: The stdev of the truncated_normal_initializer for
        initializing all weight matrices.
    """
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_act = hidden_act
    self.intermediate_size = intermediate_size
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.max_position_embeddings = max_position_embeddings
    self.type_vocab_size = type_vocab_size
    self.initializer_range = initializer_range
zBertConfig.__init__c                 C   s,   t dd}t|D ]	\}}||j|< q
|S )zAConstructs a `BertConfig` from a Python dictionary of parameters.N)r   )r   six	iteritems__dict__)clsZjson_objectconfigkeyvaluer   r   r   	from_dictU   s   
zBertConfig.from_dictc                 C   sH   t jj|d}| }W d   n1 sw   Y  | t|S )z9Constructs a `BertConfig` from a json file of parameters.rN)tfiogfileGFilereadr$   jsonloads)r    	json_filereadertextr   r   r   from_json_file]   s   
zBertConfig.from_json_filec                 C   s   t | j}|S )z0Serializes this instance to a Python dictionary.)copydeepcopyr   )r   outputr   r   r   to_dictd   s   zBertConfig.to_dictc                 C   s   t j|  dddd S )z*Serializes this instance to a JSON string.   T)indent	sort_keys
)r+   dumpsr4   r   r   r   r   to_json_stringi   s   zBertConfig.to_json_stringN)
r   r   r   r   r	   r
   r
   r   r   r   )
__name__
__module____qualname____doc__r   classmethodr$   r0   r4   r;   r   r   r   r   r   "   s&    
0

r   c                   @   sJ   e Zd ZdZ				dddZdd Zdd	 Zd
d Zdd Zdd Z	dS )	BertModela  BERT model ("Bidirectional Encoder Representations from Transformers").

  Example usage:

  ```python
  # Already been converted into WordPiece token ids
  input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
  input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
  token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])

  config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
    num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)

  model = modeling.BertModel(config=config, is_training=True,
    input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)

  label_embeddings = tf.get_variable(...)
  pooled_output = model.get_pooled_output()
  logits = tf.matmul(pooled_output, label_embeddings)
  ...
  ```
  """

  def __init__(self,
               config,
               is_training,
               input_ids,
               input_mask=None,
               token_type_ids=None,
               use_one_hot_embeddings=False,
               scope=None):
    """Constructor for BertModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model. Controls
        whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings.
      scope: (optional) variable scope. Defaults to "bert".

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    config = copy.deepcopy(config)
    if not is_training:
      config.hidden_dropout_prob = 0.0
      config.attention_probs_dropout_prob = 0.0

    input_shape = get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if input_mask is None:
      input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

    if token_type_ids is None:
      token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

    with tf.variable_scope(scope, default_name="bert"):
      with tf.variable_scope("embeddings"):
        # Perform embedding lookup on the word ids.
        (self.embedding_output, self.embedding_table) = embedding_lookup(
            input_ids=input_ids,
            vocab_size=config.vocab_size,
            embedding_size=config.hidden_size,
            initializer_range=config.initializer_range,
            word_embedding_name="word_embeddings",
            use_one_hot_embeddings=use_one_hot_embeddings)

        # Add positional embeddings and token type embeddings, then layer
        # normalize and perform dropout.
        self.embedding_output = embedding_postprocessor(
            input_tensor=self.embedding_output,
            use_token_type=True,
            token_type_ids=token_type_ids,
            token_type_vocab_size=config.type_vocab_size,
            token_type_embedding_name="token_type_embeddings",
            use_position_embeddings=True,
            position_embedding_name="position_embeddings",
            initializer_range=config.initializer_range,
            max_position_embeddings=config.max_position_embeddings,
            dropout_prob=config.hidden_dropout_prob)

      with tf.variable_scope("encoder"):
        # Convert the 2D mask of shape [batch_size, seq_length] to a 3D mask
        # of shape [batch_size, seq_length, seq_length] for the attention
        # scores.
        attention_mask = create_attention_mask_from_input_mask(
            input_ids, input_mask)

        # Run the stacked transformer; `sequence_output` has shape
        # [batch_size, seq_length, hidden_size].
        self.all_encoder_layers = transformer_model(
            input_tensor=self.embedding_output,
            attention_mask=attention_mask,
            hidden_size=config.hidden_size,
            num_hidden_layers=config.num_hidden_layers,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            intermediate_act_fn=get_activation(config.hidden_act),
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            initializer_range=config.initializer_range,
            do_return_all_layers=True)

      self.sequence_output = self.all_encoder_layers[-1]

      # The "pooler" converts the encoded sequence tensor to a fixed-size
      # [batch_size, hidden_size] tensor by taking the hidden state of the
      # first token and passing it through a dense layer.
      with tf.variable_scope("pooler"):
        first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
        self.pooled_output = tf.keras.layers.Dense(
            units=config.hidden_size,
            activation=tf.tanh,
            kernel_initializer=create_initializer(config.initializer_range))(
                first_token_tensor)

  def get_pooled_output(self):
    return self.pooled_output

  def get_sequence_output(self):
    """Gets final hidden layer of encoder.

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
      to the final hidden of the transformer encoder.
    """
    return self.sequence_output

  def get_all_encoder_layers(self):
    return self.all_encoder_layers

  def get_embedding_output(self):
    """Gets output of the embedding lookup (i.e., input to the transformer).

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
      to the output of the embedding layer, after summing the word
      embeddings with the positional embeddings and the token type embeddings,
      then performing layer normalization. This is the input to the transformer.
    """
    return self.embedding_output

  def get_embedding_table(self):
    return self.embedding_table


def gelu(x):
  """Gaussian Error Linear Unit.

  This is a smoother version of the RELU.
  Original paper: https://arxiv.org/abs/1606.08415
  Args:
    x: float Tensor to perform activation.

  Returns:
    `x` with the GELU activation applied.
  """
  cdf = 0.5 * (1.0 + tf.tanh(
      (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
  return x * cdf


def get_activation(activation_string):
  """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`.

  Args:
    activation_string: String name of the activation function.

  Returns:
    A Python function corresponding to the activation function. If
    `activation_string` is None, empty, or "linear", this will return None.
    If `activation_string` is not a string, it will return `activation_string`.

  Raises:
    ValueError: The `activation_string` does not correspond to a known
      activation.
  """

  # Anything that is not a string is assumed to already be an activation
  # function, so it is returned as-is.
  if not isinstance(activation_string, six.string_types):
    return activation_string

  if not activation_string:
    return None

  act = activation_string.lower()
  if act == "linear":
    return None
  elif act == "relu":
    return tf.nn.relu
  elif act == "gelu":
    return gelu
  elif act == "tanh":
    return tf.tanh
  else:
    raise ValueError("Unsupported activation: %s" % act)


def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
  """Compute the union of the current variables and checkpoint variables."""
  assignment_map = {}
  initialized_variable_names = {}

  name_to_variable = collections.OrderedDict()
  for var in tvars:
    name = var.name
    m = re.match("^(.*):\\d+$", name)
    if m is not None:
      name = m.group(1)
    name_to_variable[name] = var

  init_vars = tf.train.list_variables(init_checkpoint)

  assignment_map = collections.OrderedDict()
  for x in init_vars:
    (name, var) = (x[0], x[1])
    if name not in name_to_variable:
      continue
    assignment_map[name] = name
    initialized_variable_names[name] = 1
    initialized_variable_names[name + ":0"] = 1

  return (assignment_map, initialized_variable_names)


def dropout(input_tensor, dropout_prob):
  """Perform dropout.

  Args:
    input_tensor: float Tensor.
    dropout_prob: Python float. The probability of dropping out a value (NOT of
      *keeping* a dimension as in `tf.nn.dropout`).

  Returns:
    A version of `input_tensor` with dropout applied.
  """
  if dropout_prob is None or dropout_prob == 0.0:
    return input_tensor

  output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
  return output


def layer_norm(input_tensor, name=None):
  """Run layer normalization on the last dimension of the tensor."""
  return tf.contrib.layers.layer_norm(
      inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)


# This second definition overrides the `tf.contrib`-based one above so the
# module also works where `tf.contrib` is unavailable.
def layer_norm(input_tensor, name=None):  # noqa: F811
  """Run layer normalization on the last dimension of the tensor."""
  layer_norma = tf.keras.layers.LayerNormalization(
      axis=-1, center=True, scale=True, epsilon=1e-12, name=name)
  return layer_norma(input_tensor)


def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
  """Runs layer normalization followed by dropout."""
  output_tensor = layer_norm(input_tensor, name)
  output_tensor = dropout(output_tensor, dropout_prob)
  return output_tensor


def create_initializer(initializer_range=0.02):
  """Creates a `truncated_normal_initializer` with the given range."""
  return tf.truncated_normal_initializer(stddev=initializer_range)


def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=False):
  """Looks up words embeddings for id tensor.
  Args:
    input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
      ids.
    vocab_size: int. Size of the embedding vocabulary.
    embedding_size: int. Width of the word embeddings.
    initializer_range: float. Embedding initialization range.
    word_embedding_name: string. Name of the embedding table.
    use_one_hot_embeddings: bool. If True, use one-hot method for word
      embeddings. If False, use `tf.gather()`.

  Returns:
    float Tensor of shape [batch_size, seq_length, embedding_size].
  r5   r_   r`   r   rG   initializerdepthr   )rG   ndimsr&   expand_dimsget_variablery   reshapeone_hotmatmulgatherrf   )rM   r   rN   r   rO   rP   rm   flat_input_idsZone_hot_input_idsr3   r~   r   r   r   rk     s"   rk   r   rQ   TrR   r   r
   c
                 C   s^  t | dd}
|
d }|
d }|
d }| }|rK|du rtdtj|||gt|d}t|d	g}tj||d
}t||}t||||g}||7 }|rt||}t	|gG tj|||gt|d}t
|ddg|d	g}t|j }g }t|d D ]}|d q|||g t||}||7 }W d   n1 sw   Y  t||	}|S )a  Performs various post-processing on a word embedding tensor.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length,
      embedding_size].
    use_token_type: bool. Whether to add embeddings for `token_type_ids`.
    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      Must be specified if `use_token_type` is True.
    token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
    token_type_embedding_name: string. The name of the embedding table variable
      for token type ids.
    use_position_embeddings: bool. Whether to add position embeddings for the
      position of each token in the sequence.
    position_embedding_name: string. The name of the embedding table variable
      for positional embeddings.
    initializer_range: float. Range of the weight initialization.
    max_position_embeddings: int. Maximum sequence length that might ever be
      used with this model. This can be longer than the sequence length of
      input_tensor, but cannot be shorter.
    dropout_prob: float. Dropout probability applied to the final output tensor.

  Returns:
    float tensor with same shape as `input_tensor`.

  Raises:
    ValueError: One of the tensor shapes or input values is invalid.
  """
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  output = input_tensor

  if use_token_type:
    if token_type_ids is None:
      raise ValueError("`token_type_ids` must be specified if"
                       "`use_token_type` is True.")
    token_type_table = tf.get_variable(
        name=token_type_embedding_name,
        shape=[token_type_vocab_size, width],
        initializer=create_initializer(initializer_range))
    # This vocabulary is small, so one-hot multiplication is used here since it
    # is faster for a small vocabulary.
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])
    output += token_type_embeddings

  if use_position_embeddings:
    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
    with tf.control_dependencies([assert_op]):
      full_position_embeddings = tf.get_variable(
          name=position_embedding_name,
          shape=[max_position_embeddings, width],
          initializer=create_initializer(initializer_range))
      # The position embedding table is learned up to `max_position_embeddings`;
      # slice out only the part that is actually used.
      position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                     [seq_length, -1])
      num_dims = len(output.shape.as_list())

      # Broadcast the position embeddings over the batch dimension.
      position_broadcast_shape = []
      for _ in range(num_dims - 2):
        position_broadcast_shape.append(1)
      position_broadcast_shape.extend([seq_length, width])
      position_embeddings = tf.reshape(position_embeddings,
                                       position_broadcast_shape)
      output += position_embeddings

  output = layer_norm_and_dropout(output, dropout_prob)
  return output


def create_attention_mask_from_input_mask(from_tensor, to_mask):
  """Create 3D attention mask from a 2D tensor mask.
  Args:
    from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
    to_mask: int32 Tensor of shape [batch_size, to_seq_length].

  Returns:
    float Tensor of shape [batch_size, from_seq_length, to_seq_length].
  r5   r   rC   r   rE   rF   )rf   r&   castr   float32rg   )	from_tensorZto_mask
from_shaper   from_seq_lengthto_shapeto_seq_lengthZbroadcast_onesmaskr   r   r   ro     s   
ro   rE   rB   c                 C   s  dd }t | ddgd}t |ddgd}t|t|kr tdt|dkr3|d }|d }|d }nt|dkrI|d	u sE|d	u sE|d	u rItd
t| }t|}tjjj|| |dt|	d|}tjjj|| |dt|	d|}tjjj|| |dt|	d|}||||||}||||||}tj	||dd}t
|dtt| }|d	urtj|dgd}dt|tj d }||7 }tj|}t||}t|||||g}t|g d}t	||}t|g d}|
rt||| || g}|S t||||| g}|S )az  Performs multi-headed attention from `from_tensor` to `to_tensor`.

  This is an implementation of multi-headed attention based on "Attention
  is all you Need". If `from_tensor` and `to_tensor` are the same, then
  this is self-attention. Each timestep in `from_tensor` attends to the
  corresponding sequence in `to_tensor`, and returns a fixed-with vector.

  This function first projects `from_tensor` into a "query" tensor and
  `to_tensor` into "key" and "value" tensors. These are (effectively) a list
  of tensors of length `num_attention_heads`, where each tensor is of shape
  [batch_size, seq_length, size_per_head].

  Then, the query and key tensors are dot-producted and scaled. These are
  softmaxed to obtain attention probabilities. The value tensors are then
  interpolated by these probabilities, then concatenated back to a single
  tensor and returned.

  In practice, the multi-headed attention are done with transposes and
  reshapes rather than actual separate tensors.

  Args:
    from_tensor: float Tensor of shape [batch_size, from_seq_length,
      from_width].
    to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
    attention_mask: (optional) int32 Tensor of shape [batch_size,
      from_seq_length, to_seq_length]. The values should be 1 or 0. The
      attention scores will effectively be set to -infinity for any positions in
      the mask that are 0, and will be unchanged for positions that are 1.
    num_attention_heads: int. Number of attention heads.
    size_per_head: int. Size of each attention head.
    query_act: (optional) Activation function for the query transform.
    key_act: (optional) Activation function for the key transform.
    value_act: (optional) Activation function for the value transform.
    attention_probs_dropout_prob: (optional) float. Dropout probability of the
      attention probabilities.
    initializer_range: float. Range of the weight initializer.
    do_return_2d_tensor: bool. If True, the output will be of shape [batch_size
      * from_seq_length, num_attention_heads * size_per_head]. If False, the
      output will be of shape [batch_size, from_seq_length, num_attention_heads
      * size_per_head].
    batch_size: (Optional) int. If the input is 2D, this might be the batch size
      of the 3D version of the `from_tensor` and `to_tensor`.
    from_seq_length: (Optional) If the input is 2D, this might be the seq length
      of the 3D version of the `from_tensor`.
    to_seq_length: (Optional) If the input is 2D, this might be the seq length
      of the 3D version of the `to_tensor`.

  Returns:
    float Tensor of shape [batch_size, from_seq_length,
      num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
      true, this will be of shape [batch_size * from_seq_length,
      num_attention_heads * size_per_head]).

  Raises:
    ValueError: Any of the arguments or tensor shapes are invalid.
  """

  def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                           seq_length, width):
    output_tensor = tf.reshape(
        input_tensor, [batch_size, seq_length, num_attention_heads, width])
    output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
    return output_tensor

  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

  if len(from_shape) != len(to_shape):
    raise ValueError(
        "The rank of `from_tensor` must match the rank of `to_tensor`.")

  if len(from_shape) == 3:
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]
    to_seq_length = to_shape[1]
  elif len(from_shape) == 2:
    if (batch_size is None or from_seq_length is None or
        to_seq_length is None):
      raise ValueError(
          "When passing in rank 2 tensors to attention_layer, the values "
          "for `batch_size`, `from_seq_length`, and `to_seq_length` "
          "must all be specified.")

  # Scalar dimensions referenced below:
  #   B = batch size, F = `from_tensor` seq length, T = `to_tensor` seq length,
  #   N = `num_attention_heads`, H = `size_per_head`.
  from_tensor_2d = reshape_to_matrix(from_tensor)
  to_tensor_2d = reshape_to_matrix(to_tensor)

  # `query_layer` = [B*F, N*H]
  query_layer = tf.keras.layers.Dense(
      units=num_attention_heads * size_per_head,
      activation=query_act,
      name="query",
      kernel_initializer=create_initializer(initializer_range))(from_tensor_2d)

  # `key_layer` = [B*T, N*H]
  key_layer = tf.keras.layers.Dense(
      units=num_attention_heads * size_per_head,
      activation=key_act,
      name="key",
      kernel_initializer=create_initializer(initializer_range))(to_tensor_2d)

  # `value_layer` = [B*T, N*H]
  value_layer = tf.keras.layers.Dense(
      units=num_attention_heads * size_per_head,
      activation=value_act,
      name="value",
      kernel_initializer=create_initializer(initializer_range))(to_tensor_2d)

  # `query_layer` = [B, N, F, H], `key_layer` = [B, N, T, H]
  query_layer = transpose_for_scores(query_layer, batch_size,
                                     num_attention_heads, from_seq_length,
                                     size_per_head)
  key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                   to_seq_length, size_per_head)

  # Dot-product "query" and "key" to get the raw attention scores, scaled by
  # 1/sqrt(size_per_head). `attention_scores` = [B, N, F, T]
  attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
  attention_scores = tf.multiply(attention_scores,
                                 1.0 / math.sqrt(float(size_per_head)))

  if attention_mask is not None:
    # `attention_mask` = [B, 1, F, T]
    attention_mask = tf.expand_dims(attention_mask, axis=[1])

    # The mask is 1.0 for positions to attend to and 0.0 for masked positions;
    # adding -10000.0 to the masked scores before the softmax effectively
    # removes those positions.
    adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
    attention_scores += adder

  # Normalize the attention scores to probabilities.
  attention_probs = tf.nn.softmax(attention_scores)

  # This drops out entire tokens to attend to, as in the original Transformer.
  attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

  # `value_layer` = [B, N, T, H]
  value_layer = tf.reshape(
      value_layer,
      [batch_size, to_seq_length, num_attention_heads, size_per_head])
  value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

  # `context_layer` = [B, N, F, H] -> [B, F, N, H]
  context_layer = tf.matmul(attention_probs, value_layer)
  context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

  if do_return_2d_tensor:
    context_layer = tf.reshape(
        context_layer,
        [batch_size * from_seq_length, num_attention_heads * size_per_head])
  else:
    context_layer = tf.reshape(
        context_layer,
        [batch_size, from_seq_length, num_attention_heads * size_per_head])

  return context_layer


def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
  """Multi-headed, multi-layer Transformer from "Attention is All You Need".

  This is almost an exact implementation of the original Transformer encoder.

  See the original paper:
  https://arxiv.org/abs/1706.03762

  Also see:
  https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
    attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
      seq_length], with 1 for positions that can be attended to and 0 in
      positions that should not be.
    hidden_size: int. Hidden size of the Transformer.
    num_hidden_layers: int. Number of layers (blocks) in the Transformer.
    num_attention_heads: int. Number of attention heads in the Transformer.
    intermediate_size: int. The size of the "intermediate" (a.k.a., feed
      forward) layer.
    intermediate_act_fn: function. The non-linear activation function to apply
      to the output of the intermediate/feed-forward layer.
    hidden_dropout_prob: float. Dropout probability for the hidden layers.
    attention_probs_dropout_prob: float. Dropout probability of the attention
      probabilities.
    initializer_range: float. Range of the initializer (stddev of truncated
      normal).
    do_return_all_layers: Whether to also return all layers or just the final
      layer.

  Returns:
    float Tensor of shape [batch_size, seq_length, hidden_size], the final
    hidden layer of the Transformer.

  Raises:
    ValueError: A Tensor shape or parameter is invalid.
  """
  if hidden_size % num_attention_heads != 0:
    raise ValueError(
        "The hidden size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, num_attention_heads))

  attention_head_size = int(hidden_size / num_attention_heads)
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  input_width = input_shape[2]

  # The Transformer adds residuals on all layers, so the input width needs to
  # equal the hidden size.
  if input_width != hidden_size:
    raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                     (input_width, hidden_size))

  # Keep the representation as a 2D tensor to avoid reshaping back and forth
  # between 2D and 3D, which can be expensive on TPUs.
  prev_output = reshape_to_matrix(input_tensor)

  all_layer_outputs = []
  for layer_idx in range(num_hidden_layers):
    with tf.variable_scope("layer_%d" % layer_idx):
      layer_input = prev_output

      with tf.variable_scope("attention"):
        attention_heads = []
        with tf.variable_scope("self"):
          attention_head = attention_layer(
              from_tensor=layer_input,
              to_tensor=layer_input,
              attention_mask=attention_mask,
              num_attention_heads=num_attention_heads,
              size_per_head=attention_head_size,
              attention_probs_dropout_prob=attention_probs_dropout_prob,
              initializer_range=initializer_range,
              do_return_2d_tensor=True,
              batch_size=batch_size,
              from_seq_length=seq_length,
              to_seq_length=seq_length)
          attention_heads.append(attention_head)

        attention_output = None
        if len(attention_heads) == 1:
          attention_output = attention_heads[0]
        else:
          # With additional sequences, concatenate them to the self-attention
          # head before the projection.
          attention_output = tf.concat(attention_heads, axis=-1)

        # Linear projection back to `hidden_size`, then residual + layer norm.
        with tf.variable_scope("output"):
          attention_output = tf.keras.layers.Dense(
              units=hidden_size,
              kernel_initializer=create_initializer(initializer_range))(
                  attention_output)
          attention_output = dropout(attention_output, hidden_dropout_prob)
          attention_output = layer_norm(attention_output + layer_input)

      # The activation is only applied to the "intermediate" hidden layer.
      with tf.variable_scope("intermediate"):
        intermediate_output = tf.keras.layers.Dense(
            units=intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=create_initializer(initializer_range))(
                attention_output)

      # Down-project back to `hidden_size`, then residual + layer norm.
      with tf.variable_scope("output"):
        layer_output = tf.keras.layers.Dense(
            units=hidden_size,
            kernel_initializer=create_initializer(initializer_range))(
                intermediate_output)
        layer_output = dropout(layer_output, hidden_dropout_prob)
        layer_output = layer_norm(layer_output + attention_output)
        prev_output = layer_output
        all_layer_outputs.append(layer_output)

  if do_return_all_layers:
    final_outputs = []
    for layer_output in all_layer_outputs:
      final_output = reshape_from_matrix(layer_output, input_shape)
      final_outputs.append(final_output)
    return final_outputs
  else:
    final_output = reshape_from_matrix(prev_output, input_shape)
    return final_output


def get_shape_list(tensor, expected_rank=None, name=None):
  """Returns a list of the shape of tensor, preferring static dimensions.

  Args:
    tensor: A tf.Tensor object to find the shape of.
    expected_rank: (optional) int. The expected rank of `tensor`. If this is
      specified and the `tensor` has a different rank, and exception will be
      thrown.
    name: Optional name of the tensor for the error message.

  Returns:
    A list of dimensions of the shape of tensor. All static dimensions will
    be returned as python integers, and dynamic dimensions will be returned
    as tf.Tensor scalars.
  """
  if name is None:
    name = tensor.name

  if expected_rank is not None:
    assert_rank(tensor, expected_rank, name)

  shape = tensor.shape.as_list()

  non_static_indexes = []
  for (index, dim) in enumerate(shape):
    if dim is None:
      non_static_indexes.append(index)

  if not non_static_indexes:
    return shape

  # Fall back to the dynamic shape for dimensions not known statically.
  dyn_shape = tf.shape(tensor)
  for index in non_static_indexes:
    shape[index] = dyn_shape[index]
  return shape


def reshape_to_matrix(input_tensor):
  """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
  ndims = input_tensor.shape.ndims
  if ndims < 2:
    raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
                     (input_tensor.shape))
  if ndims == 2:
    return input_tensor

  width = input_tensor.shape[-1]
  output_tensor = tf.reshape(input_tensor, [-1, width])
  return output_tensor


def reshape_from_matrix(output_tensor, orig_shape_list):
  """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
  if len(orig_shape_list) == 2:
    return output_tensor

  output_shape = get_shape_list(output_tensor)

  orig_dims = orig_shape_list[0:-1]
  width = output_shape[-1]

  return tf.reshape(output_tensor, orig_dims + [width])


def assert_rank(tensor, expected_rank, name=None):
  """Raises an exception if the tensor rank is not of the expected rank.
  Args:
    tensor: A tf.Tensor to check the rank of.
    expected_rank: Python integer or list of integers, expected rank.
    name: Optional name of the tensor for the error message.

  Raises:
    ValueError: If the expected shape doesn't match the actual shape.
  NTzkFor the tensor `%s` in scope `%s`, the actual rank `%d` (shape = %s) is not equal to the expected rank `%s`)
r   r   r   integer_typesrG   r   r&   get_variable_scoper   str)r   rD   r   Zexpected_rank_dictr   actual_rank
scope_namer   r   r   r     s    


r   r   )r   )r   r   rL   F)	FNr   rQ   TrR   r   r   r
   )NrE   r   NNNrB   r   FNNN)NN)&r?   
__future__r   r   r   r   r1   r+   r   r   numpyr   r   
tensorflowr&   tensorflow.compat.v1r   r   disable_v2_behaviorobjectr   rA   r	   rq   r   r   r   r   ry   rk   rn   ro   r   rp   rf   r   r   r   r   r   r   r   <module>   s   L "%





1
`$
 Y
 
"%