import six
import tensorflow as tf

from easytransfer import layers

from .modeling_utils import PreTrainedModel
from .modeling_bert import BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP


def assert_rank(tensor, expected_rank, name=None):
    """Raises an exception if the tensor rank is not of the expected rank.

    Args:
      tensor: A tf.Tensor to check the rank of.
      expected_rank: Python integer or list of integers, expected rank.
      name: Optional name of the tensor for the error message.

    Raises:
      ValueError: If the expected shape doesn't match the actual shape.
    """
    if name is None:
        name = tensor.name

    if isinstance(expected_rank, six.integer_types):
        allowed_ranks = {expected_rank}
    else:
        allowed_ranks = set(expected_rank)

    actual_rank = tensor.shape.ndims
    if actual_rank not in allowed_ranks:
        scope_name = tf.get_variable_scope().name
        # `%s` (not `%d`) for the rank: a tensor of unknown static rank has
        # `ndims is None`, and `%d` would raise TypeError instead of the
        # documented ValueError.
        raise ValueError(
            "For the tensor `%s` in scope `%s`, the actual rank "
            "`%s` (shape = %s) is not equal to the expected rank `%s`" %
            (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))


def get_shape_list(tensor, expected_rank=None, name=None):
    """Returns a list of the shape of tensor, preferring static dimensions.

    Args:
      tensor: A tf.Tensor object to find the shape of.
      expected_rank: (optional) int. The expected rank of `tensor`. If this is
        specified and the `tensor` has a different rank, an exception will be
        thrown.
      name: Optional name of the tensor for the error message.

    Returns:
      A list of dimensions of the shape of tensor. All static dimensions will
      be returned as python integers, and dynamic dimensions will be returned
      as tf.Tensor scalars.
    """
    if name is None:
        name = tensor.name

    if expected_rank is not None:
        assert_rank(tensor, expected_rank, name)

    shape = tensor.shape.as_list()

    # Positions whose size is only known at graph-execution time.
    non_static_indexes = [index for index, dim in enumerate(shape) if dim is None]
    if not non_static_indexes:
        return shape

    # Fill the unknown positions with scalar tensors taken from the dynamic shape.
    dyn_shape = tf.shape(tensor)
    for index in non_static_indexes:
        shape[index] = dyn_shape[index]
    return shape


def dropout(input_tensor, dropout_prob):
    """Perform dropout.

    Args:
      input_tensor: float Tensor.
      dropout_prob: Python float. The probability of dropping out a value (NOT
        of *keeping* a dimension as in `tf.nn.dropout`).

    Returns:
      A version of `input_tensor` with dropout applied. The input is returned
      unchanged when `dropout_prob` is None or 0.0.
    """
    if dropout_prob is None or dropout_prob == 0.0:
        return input_tensor

    # The second positional argument of the TF1 `tf.nn.dropout` is the keep
    # probability, hence the `1.0 - dropout_prob`.
    output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
    return output


def layer_norm(input_tensor, name=None):
    """Run layer normalization on the last dimension of the tensor."""
    return tf.contrib.layers.layer_norm(
        inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)


def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
    """Runs layer normalization followed by dropout."""
    output_tensor = layer_norm(input_tensor, name)
    output_tensor = dropout(output_tensor, dropout_prob)
    return output_tensor


def create_initializer(initializer_range=0.02):
    """Creates a `truncated_normal_initializer` with the given range."""
    return tf.truncated_normal_initializer(stddev=initializer_range)


def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=False):
    """Looks up words embeddings for id tensor.

    Args:
      input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
        ids.
      vocab_size: int. Size of the embedding vocabulary.
      embedding_size: int. Width of the word embeddings.
      initializer_range: float. Embedding initialization range.
      word_embedding_name: string. Name of the embedding table.
      use_one_hot_embeddings: bool. If True, use one-hot method for word
        embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better
        for TPUs.

    Returns:
      A tuple `(output, embedding_table)`: `output` is a float Tensor of shape
      [batch_size, seq_length, embedding_size] and `embedding_table` is the
      [vocab_size, embedding_size] variable the lookup was performed on.
    """
    # This function assumes that the input is of shape [batch_size, seq_length,
    # num_inputs].
    #
    # If the input is a 2D tensor of shape [batch_size, seq_length], we
    # reshape to [batch_size, seq_length, 1].
    if input_ids.shape.ndims == 2:
        input_ids = tf.expand_dims(input_ids, axis=[-1])

    embedding_table = tf.get_variable(
        name=word_embedding_name,
        shape=[vocab_size, embedding_size],
        initializer=create_initializer(initializer_range))

    if use_one_hot_embeddings:
        flat_input_ids = tf.reshape(input_ids, [-1])
        one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
        output = tf.matmul(one_hot_input_ids, embedding_table)
    else:
        output = tf.nn.embedding_lookup(embedding_table, input_ids)

    input_shape = get_shape_list(input_ids)

    # Fold the trailing `num_inputs` axis into the embedding axis.
    output = tf.reshape(output,
                        input_shape[0:-1] + [input_shape[-1] * embedding_size])
    return (output, embedding_table)


def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            use_history_answer_embedding=True,
                            history_answer_marker=None,
                            history_answer_embedding_vocab_size=2,
                            history_answer_embedding_name='history_answer_embedding',
                            dropout_prob=0.1):
    """Performs various post-processing on a word embedding tensor.

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length,
        embedding_size].
      use_token_type: bool. Whether to add embeddings for `token_type_ids`.
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
        Must be specified if `use_token_type` is True.
      token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
      token_type_embedding_name: string. The name of the embedding table variable
        for token type ids.
      use_position_embeddings: bool. Whether to add position embeddings for the
        position of each token in the sequence.
      position_embedding_name: string. The name of the embedding table variable
        for positional embeddings.
      initializer_range: float. Range of the weight initialization.
      max_position_embeddings: int. Maximum sequence length that might ever be
        used with this model. This can be longer than the sequence length of
        input_tensor, but cannot be shorter.
      use_history_answer_embedding: bool. Whether to add embeddings for
        `history_answer_marker`. Note that this defaults to True.
      history_answer_marker: (optional) integer Tensor that is flattened and
        reshaped to [batch_size, seq_length]. Must be specified if
        `use_history_answer_embedding` is True.
      history_answer_embedding_vocab_size: int. Number of distinct values in
        `history_answer_marker`.
      history_answer_embedding_name: string. The name of the embedding table
        variable for the history answer marker.
      dropout_prob: float. Dropout probability applied to the final output tensor.

    Returns:
      float tensor with same shape as `input_tensor`.

    Raises:
      ValueError: One of the tensor shapes or input values is invalid.
    """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            raise ValueError("`token_type_ids` must be specified if "
                             "`use_token_type` is True.")
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # This vocab will be small so we always do one-hot here, since it is always
        # faster for a small vocabulary.
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings,
                                           [batch_size, seq_length, width])
        output += token_type_embeddings

    if use_history_answer_embedding:
        if history_answer_marker is None:
            raise ValueError("`history_answer_marker` must be specified if "
                             "`use_history_answer_embedding` is True.")
        history_answer_marker_table = tf.get_variable(
            name=history_answer_embedding_name,
            shape=[history_answer_embedding_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # Same one-hot + matmul lookup as used for the token type table above.
        flat_history_answer_marker = tf.reshape(history_answer_marker, [-1])
        one_hot_ids = tf.one_hot(flat_history_answer_marker,
                                 depth=history_answer_embedding_vocab_size)
        history_answer_embedding = tf.matmul(one_hot_ids, history_answer_marker_table)
        history_answer_embedding = tf.reshape(history_answer_embedding,
                                              [batch_size, seq_length, width])
        output += history_answer_embedding

    if use_position_embeddings:
        full_position_embeddings = tf.get_variable(
            name=position_embedding_name,
            shape=[max_position_embeddings, width],
            initializer=create_initializer(initializer_range))
        # Since the position embedding table is a learned variable, we create it
        # using a (long) sequence length `max_position_embeddings`. The actual
        # sequence length might be shorter than this, for faster training of
        # tasks that do not have long sequences.
        #
        # So `full_position_embeddings` is effectively an embedding table
        # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
        # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
        # perform a slice.
        position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                       [seq_length, -1])
        num_dims = len(output.shape.as_list())

        # Only the last two dimensions are relevant (`seq_length` and `width`), so
        # we broadcast among the first dimensions, which is typically just
        # the batch size.
        position_broadcast_shape = [1] * (num_dims - 2) + [seq_length, width]
        position_embeddings = tf.reshape(position_embeddings,
                                         position_broadcast_shape)
        output += position_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)
    return output


class BertHAEBackbone(layers.Layer):
    """BERT backbone whose input embeddings include a history answer embedding.

    The embedding stage sums word, token type, history answer marker and
    position embeddings, then applies layer normalization and dropout. The
    result is passed through `layers.Encoder`, and a tanh dense "pooler" is
    applied to the first position of the final encoder output.
    """

    def __init__(self, config, **kwargs):
        """Creates the encoder and pooler sub-layers from `config`.

        Args:
          config: model configuration; this class reads `hidden_size`,
            `initializer_range`, `vocab_size`, `type_vocab_size`,
            `max_position_embeddings` and `hidden_dropout_prob` from it.
          **kwargs: forwarded to the `layers.Layer` constructor.
        """
        super(BertHAEBackbone, self).__init__(config, **kwargs)
        self.config = config
        self.encoder = layers.Encoder(config, name="encoder")
        self.pooler = layers.Dense(
            units=config.hidden_size,
            activation='tanh',
            kernel_initializer=layers.get_initializer(config.initializer_range),
            name="pooler/dense")

    def call(self, inputs,
             input_mask=None,
             segment_ids=None,
             history_answer_marker=None,
             training=False):
        """Runs embeddings, encoder and pooler.

        Args:
          inputs: either the `input_ids` tensor, or a tuple/list of
            `(input_ids, input_mask, segment_ids, history_answer_marker)` in
            which trailing entries may be omitted. Entries present in the
            tuple/list take precedence over the keyword arguments.
          input_mask: (optional) int32 Tensor of shape [batch_size, seq_length];
            defaults to all ones.
          segment_ids: (optional) int32 Tensor of shape [batch_size, seq_length];
            defaults to all zeros.
          history_answer_marker: (optional) int32 Tensor of shape
            [batch_size, seq_length]; defaults to all zeros.
          training: Python bool. Forwarded to the encoder and used to decide
            whether embedding dropout is applied.

        Returns:
          A tuple `(sequence_output, pooled_output)` where `sequence_output` is
          `encoder_outputs[0][-1]` (presumably the last layer's hidden states;
          confirm against `layers.Encoder`) and `pooled_output` is the pooler
          applied to its first sequence position.
        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            input_mask = inputs[1] if len(inputs) > 1 else input_mask
            segment_ids = inputs[2] if len(inputs) > 2 else segment_ids
            history_answer_marker = inputs[3] if len(inputs) > 3 else history_answer_marker
        else:
            input_ids = inputs

        input_shape = layers.get_shape_list(input_ids)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

        if segment_ids is None:
            segment_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

        if history_answer_marker is None:
            history_answer_marker = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

        # Dropout must only be active while training; applying it at
        # eval/predict time would make inference outputs noisy.
        embedding_dropout_prob = self.config.hidden_dropout_prob if training else 0.0

        with tf.variable_scope("embeddings"):
            # Perform embedding lookup on the word ids.
            (embedding_output, embedding_table) = embedding_lookup(
                input_ids=input_ids,
                vocab_size=self.config.vocab_size,
                embedding_size=self.config.hidden_size,
                initializer_range=self.config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=False)

            # Add positional, token type and history answer embeddings, then
            # layer normalize and perform dropout.
            embedding_output = embedding_postprocessor(
                input_tensor=embedding_output,
                use_token_type=True,
                token_type_ids=segment_ids,
                token_type_vocab_size=self.config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=self.config.initializer_range,
                max_position_embeddings=self.config.max_position_embeddings,
                use_history_answer_embedding=True,
                history_answer_marker=history_answer_marker,
                history_answer_embedding_vocab_size=2,
                history_answer_embedding_name='history_answer_embedding',
                dropout_prob=embedding_dropout_prob)

        attention_mask = layers.get_attn_mask_bert(input_ids, input_mask)
        encoder_outputs = self.encoder([embedding_output, attention_mask], training=training)
        sequence_output = encoder_outputs[0][-1]
        # Pool by taking the first sequence position of the final output.
        pooled_output = self.pooler(sequence_output[:, 0])
        outputs = (sequence_output, pooled_output)
        return outputs


class BertHAEPretrainedModel(PreTrainedModel):
    """Pretrained-model wrapper around `BertHAEBackbone`.

    Reuses the BERT config class and the BERT model/config archive maps.
    """

    config_class = BertConfig
    pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP
    pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP

    def __init__(self, config, **kwargs):
        """Builds the backbone under the layer name "bert"."""
        super(BertHAEPretrainedModel, self).__init__(config, **kwargs)
        self.bert_hae = BertHAEBackbone(config, name="bert")

    def call(self, inputs, **kwargs):
        """Runs the backbone in the mode given by `kwargs['mode']`.

        Args:
          inputs: forwarded unchanged to `BertHAEBackbone.call`.
          **kwargs: must contain `mode`, a `tf.estimator.ModeKeys` value.

        Returns:
          The `(sequence_output, pooled_output)` tuple from the backbone.

        Raises:
          KeyError: If `mode` is not supplied.
        """
        training = kwargs['mode'] == tf.estimator.ModeKeys.TRAIN
        return self.bert_hae(inputs, training=training)