# coding=utf-8
# Copyright (c) 2019 Alibaba PAI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
from easytransfer.app_zoo.base import ApplicationModel
from easytransfer import preprocessors, model_zoo
import easytransfer.layers as layers
from easytransfer.losses import matching_embedding_margin_loss, mean_square_error, softmax_cross_entropy
from easytransfer.evaluators import match_eval_metrics
from easytransfer.preprocessors.deeptext_preprocessor import DeepTextPreprocessor
class BaseTextMatch(ApplicationModel):
""" Basic Text Match Model """
def __init__(self, **kwargs):
super(BaseTextMatch, self).__init__(**kwargs)
    @staticmethod
def default_model_params():
""" Get default model required parameters
Returns:
default_param_dict (`dict`): key/value pair of default model required parameters
"""
default_param_dict = {
"num_labels": 2
}
return default_param_dict
    def build_logits(self, features, mode=None):
""" Building the graph logic of text match model
"""
raise NotImplementedError
    def build_loss(self, logits, labels):
""" Building loss for training text match model
"""
if self.config.num_labels < 2:
return mean_square_error(labels, logits)
else:
return softmax_cross_entropy(labels, depth=self.config.num_labels, logits=logits)
    def build_eval_metrics(self, logits, labels):
        """ Building evaluation metrics while evaluating
        Args:
            logits (`Tensor`): Model output logits (or matching scores)
            labels (`Tensor`): Ground-truth label ids
        Returns:
            ret_dict (`dict`): A dict of tf.metrics ops
                1. (`mse`) for regression
                2. (`accuracy`, `auc`, `f1`) for binary classification
                3. (`accuracy`, `macro-f1`, `micro-f1`) for multi-class classification
        """
return match_eval_metrics(logits, labels, self.config.num_labels)
    def build_predictions(self, predict_output):
        """ Building the general text match model prediction dict.
        Args:
            predict_output (`tuple`): (logits, ...), only the first element is used
        Returns:
            ret_dict (`dict`): A dict with (`predictions`, `probabilities`, `logits`)
        """
logits = predict_output[0]
if isinstance(logits, list):
logits = logits[0]
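        # Two prediction paths: 2-D logits come from a classification head and go through softmax,
        # while 1-D scores (e.g. the two-tower cosine similarity) are mapped from [-1, 1] to [0, 1].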
if len(logits.shape) == 2:
predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
probs = tf.nn.softmax(logits, axis=1)
else:
probs = (logits + 1.0) / 2.0
predictions = tf.cast(logits > 0.5, dtype=tf.int32)
ret_dict = {
"predictions": predictions,
"probabilities": probs,
"logits": logits,
}
return ret_dict
def _add_word_embeddings(self, vocab_size, embed_size, pretrained_word_embeddings=None, trainable=False):
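        # Create the word embedding table, optionally initialized from pretrained vectors;
        # the caller controls `trainable` (False freezes the table during training).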
with tf.name_scope("input_representations"):
if pretrained_word_embeddings is not None:
tf.logging.info("Initialize word embedding from pretrained")
word_embedding_initializer = tf.constant_initializer(pretrained_word_embeddings)
else:
word_embedding_initializer = layers.get_initializer(0.02)
word_embeddings = tf.get_variable("word_embeddings",
[vocab_size, embed_size],
dtype=tf.float32, initializer=word_embedding_initializer,
trainable=trainable)
return word_embeddings
class BertTextMatch(BaseTextMatch):
""" Text Match model based on BERT-like pretrained models
.. highlight:: python
.. code-block:: python
default_param_dict = {
"pretrain_model_name_or_path": "pai-bert-base-zh",
"num_labels": 2,
"dropout_rate": 0.1
}
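
    A minimal usage sketch (illustrative only; ``config`` and ``features`` are assumed to come
    from the surrounding app_zoo / Estimator plumbing and are not defined in this module):

    .. highlight:: python
    .. code-block:: python

        model = BertTextMatch(user_defined_config=config)  # assumed config carrying the params above
        logits, label_ids = model.build_logits(features, mode=tf.estimator.ModeKeys.TRAIN)
        loss = model.build_loss(logits, label_ids)
        metrics = model.build_eval_metrics(logits, label_ids)
        predictions = model.build_predictions((logits, label_ids))
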
"""
def __init__(self, **kwargs):
super(BertTextMatch, self).__init__(**kwargs)
    @staticmethod
def get_input_tensor_schema():
return "input_ids:int:64,input_mask:int:64,segment_ids:int:64,label_id:int:1"
    @staticmethod
def get_received_tensor_schema():
return "input_ids:int:64,input_mask:int:64,segment_ids:int:64"
    @staticmethod
def default_model_params():
""" Get default model required parameters
Returns:
default_param_dict (`dict`): key/value pair of default model required parameters
"""
default_param_dict = {
"pretrain_model_name_or_path": "pai-bert-base-zh",
"num_labels": 2,
"dropout_rate": 0.1
}
return default_param_dict
    def build_logits(self, features, mode=None):
        """ Building the BERT text match graph
        Args:
            features (`OrderedDict`): A dict mapping raw inputs to tensors
            mode (`str`): A `tf.estimator.ModeKeys` value indicating whether the model is training
        Returns:
            logits (`Tensor`): The output of the last dense layer, shape [None, num_labels]
            label_ids (`Tensor`): label ids, shape [None]
        """
bert_preprocessor = preprocessors.get_preprocessor(self.config.pretrain_model_name_or_path,
user_defined_config=self.config)
input_ids, input_mask, segment_ids, label_ids = bert_preprocessor(features)
bert_backbone = model_zoo.get_pretrained_model(self.config.pretrain_model_name_or_path)
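        # The backbone returns (sequence_output, pooled_output); only the pooled [CLS]
        # representation feeds the classification head.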
_, pool_output = bert_backbone([input_ids, input_mask, segment_ids], mode=mode)
is_training = (mode == tf.estimator.ModeKeys.TRAIN)
pool_output = tf.layers.dropout(
pool_output, rate=self.config.dropout_rate, training=is_training)
logits = layers.Dense(self.config.num_labels,
kernel_initializer=layers.get_initializer(0.02),
name='app/ez_dense')(pool_output)
self.check_and_init_from_checkpoint(mode)
return logits, label_ids
class BertTextMatchTwoTower(BaseTextMatch):
    """ Text Match model based on BERT-like pretrained models, using a two-tower architecture to learn sentence embeddings
.. highlight:: python
.. code-block:: python
default_param_dict = {
"pretrain_model_name_or_path": "pai-bert-base-zh",
"num_labels": 2
}
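
    ``build_logits`` returns ``[cosine_score, emb_a, emb_b]`` together with the label ids, so the
    two tower embeddings can be exported for retrieval. A minimal sketch (``config`` and
    ``features`` are assumptions, not defined in this module):

    .. highlight:: python
    .. code-block:: python

        model = BertTextMatchTwoTower(user_defined_config=config)
        outputs, label_ids = model.build_logits(features, mode=tf.estimator.ModeKeys.TRAIN)
        loss = model.build_loss(outputs, label_ids)      # margin loss over the two tower embeddings
        predictions = model.build_predictions(outputs)   # cosine score mapped to [0, 1]
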
"""
def __init__(self, **kwargs):
super(BertTextMatchTwoTower, self).__init__(**kwargs)
    @staticmethod
def get_input_tensor_schema():
return "input_ids_a:int:64,input_mask_a:int:64,segment_ids_a:int:64," \
"input_ids_b:int:64,input_mask_b:int:64,segment_ids_b:int:64,label_ids:int:1"
    @staticmethod
def get_received_tensor_schema():
return "input_ids_a:int:64,input_mask_a:int:64,segment_ids_a:int:64," \
"input_ids_b:int:64,input_mask_b:int:64,segment_ids_b:int:64"
    @staticmethod
def default_model_params():
""" Get default model required parameters
Returns:
default_param_dict (`dict`): key/value pair of default model required parameters
"""
default_param_dict = {
"pretrain_model_name_or_path": "pai-bert-base-zh",
"num_labels": 2
}
return default_param_dict
    def build_logits(self, features, mode=None):
        """ Building the BERT two-tower text match graph
        Args:
            features (`OrderedDict`): A dict mapping raw inputs to tensors
            mode (`str`): A `tf.estimator.ModeKeys` value indicating whether the model is training
        Returns:
            outputs (`list`): [cosine_score, pool_output_a, pool_output_b]
            label_id (`Tensor`): label ids, shape [None]
        """
bert_preprocessor = preprocessors.get_preprocessor(self.config.pretrain_model_name_or_path,
is_paired=True,
user_defined_config=self.config)
input_ids_a, input_mask_a, \
segment_ids_a, input_ids_b, input_mask_b, segment_ids_b, label_id = bert_preprocessor(features)
with tf.variable_scope('text_match_bert_two_tower', reuse=tf.AUTO_REUSE):
bert_backbone = model_zoo.get_pretrained_model(self.config.pretrain_model_name_or_path)
_, pool_output_a = bert_backbone([input_ids_a, input_mask_a, segment_ids_a], mode=mode)
_, pool_output_b = bert_backbone([input_ids_b, input_mask_b, segment_ids_b], mode=mode)
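        # Score each pair by the cosine similarity of the two pooled tower outputs; both
        # embeddings are also returned so the margin loss (and embedding export) can use them.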
logits = self._cosine(pool_output_a, pool_output_b)
self.check_and_init_from_checkpoint(mode)
return [logits, pool_output_a, pool_output_b], label_id
@staticmethod
def _cosine(q, a):
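        # Cosine similarity of the two pooled vectors; the 1e-8 term guards against division by zero.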
pooled_len_1 = tf.sqrt(tf.reduce_sum(q * q, 1))
pooled_len_2 = tf.sqrt(tf.reduce_sum(a * a, 1))
pooled_mul_12 = tf.reduce_sum(q * a, 1)
score = tf.div(pooled_mul_12, pooled_len_1 * pooled_len_2 + 1e-8, name="scores")
return score
    def build_loss(self, outputs, label_id):
""" Building loss for training two tower text match model
"""
_, emb1, emb2 = outputs
return matching_embedding_margin_loss(emb1, emb2)
class DAMTextMatch(BaseTextMatch):
    """ Text Match model based on the DAM model
    Ankur P. Parikh, Oscar Täckström, Dipanjan Das, Jakob Uszkoreit.
    `A Decomposable Attention Model for Natural Language Inference <https://arxiv.org/abs/1606.01933/>`_,
    *EMNLP*, 2016.
.. highlight:: python
.. code-block:: python
default_param_dict = {
"max_vocab_size": 20000,
"embedding_size": 300,
"hidden_size": 200,
"num_labels": 2,
"first_sequence_length": 50,
"second_sequence_length": 50,
"pretrain_word_embedding_name_or_path": "",
"fix_embedding": False
}
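
    To start from pretrained word vectors, point ``pretrain_word_embedding_name_or_path`` at an
    embedding file and optionally freeze the table (the path below is only an illustration):

    .. highlight:: python
    .. code-block:: python

        param_dict = DAMTextMatch.default_model_params()
        param_dict["pretrain_word_embedding_name_or_path"] = "./glove.300d.txt"  # hypothetical path
        param_dict["fix_embedding"] = True  # keep the embedding table frozen during training
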
"""
def __init__(self, **kwargs):
super(DAMTextMatch, self).__init__(**kwargs)
self.pre_build_vocab = self.config.mode.startswith("train")
    @staticmethod
def get_input_tensor_schema():
return "input_ids_a:int:64,input_mask_a:int:64,input_ids_b:int:64,input_mask_b:int:64,label_id:int:1"
    @staticmethod
def get_received_tensor_schema():
return "input_ids_a:int:64,input_mask_a:int:64,input_ids_b:int:64,input_mask_b:int:64"
    @staticmethod
def default_model_params():
""" Get default model required parameters
Returns:
default_param_dict (`dict`): key/value pair of default model required parameters
"""
default_param_dict = {
"max_vocab_size": 20000,
"embedding_size": 300,
"hidden_size": 200,
"num_labels": 2,
"first_sequence_length": 50,
"second_sequence_length": 50,
"pretrain_word_embedding_name_or_path": "",
"fix_embedding": False
}
return default_param_dict
    def build_logits(self, features, mode=None):
        """ Building the DAM text match graph
        Args:
            features (`OrderedDict`): A dict mapping raw inputs to tensors
            mode (`str`): A `tf.estimator.ModeKeys` value indicating whether the model is training
        Returns:
            logits (`Tensor`): The output of the last dense layer, shape [None, num_labels]
            label_ids (`Tensor`): label ids, shape [None]
        """
text_preprocessor = DeepTextPreprocessor(self.config, mode=mode)
text_a_indices, text_a_masks, text_b_indices, text_b_masks, label_ids = text_preprocessor(features)
is_training = (mode == tf.estimator.ModeKeys.TRAIN)
word_embeddings = self._add_word_embeddings(vocab_size=text_preprocessor.vocab.size,
embed_size=self.config.embedding_size,
pretrained_word_embeddings=text_preprocessor.pretrained_word_embeddings,
trainable=not self.config.fix_embedding)
a_embeds = tf.nn.embedding_lookup(word_embeddings, text_a_indices)
b_embeds = tf.nn.embedding_lookup(word_embeddings, text_b_indices)
dam_output_features = layers.DAMEncoder(self.config.hidden_size)(
[a_embeds, b_embeds, text_a_masks, text_b_masks], training=is_training)
dam_output_features = tf.layers.dropout(
dam_output_features, rate=0.2, training=is_training, name='dam_out_features_dropout')
dam_output_features = layers.Dense(self.config.hidden_size,
activation=tf.nn.relu,
kernel_initializer=layers.get_initializer(0.02),
name='dam_out_features_projection')(dam_output_features)
logits = layers.Dense(self.config.num_labels,
kernel_initializer=layers.get_initializer(0.02),
name='output_layer')(dam_output_features)
self.check_and_init_from_checkpoint(mode)
return logits, label_ids
class DAMPlusTextMatch(BaseTextMatch):
    """ Text Match model based on the DAM Plus model, developed by the Alibaba PAI Group
.. highlight:: python
.. code-block:: python
default_param_dict = {
"max_vocab_size": 20000,
"embedding_size": 300,
"hidden_size": 200,
"num_labels": 2,
"first_sequence_length": 50,
"second_sequence_length": 50,
"pretrain_word_embedding_name_or_path": "",
"fix_embedding": False
}
"""
def __init__(self, **kwargs):
super(DAMPlusTextMatch, self).__init__(**kwargs)
self.pre_build_vocab = self.config.mode.startswith("train")
    @staticmethod
def get_input_tensor_schema():
return "input_ids_a:int:64,input_mask_a:int:64,input_ids_b:int:64,input_mask_b:int:64,label_id:int:1"
    @staticmethod
def get_received_tensor_schema():
return "input_ids_a:int:64,input_mask_a:int:64,input_ids_b:int:64,input_mask_b:int:64"
    @staticmethod
def default_model_params():
""" Get default model required parameters
Returns:
default_param_dict (`dict`): key/value pair of default model required parameters
"""
default_param_dict = {
"max_vocab_size": 20000,
"embedding_size": 300,
"hidden_size": 200,
"num_labels": 2,
"first_sequence_length": 50,
"second_sequence_length": 50,
"pretrain_word_embedding_name_or_path": "",
"fix_embedding": False
}
return default_param_dict
    def build_logits(self, features, mode=None):
        """ Building the DAMPlus text match graph
        Args:
            features (`OrderedDict`): A dict mapping raw inputs to tensors
            mode (`str`): A `tf.estimator.ModeKeys` value indicating whether the model is training
        Returns:
            logits (`Tensor`): The output of the last dense layer, shape [None, num_labels]
            label_ids (`Tensor`): label ids, shape [None]
        """
text_preprocessor = DeepTextPreprocessor(self.config, mode=mode)
text_a_indices, text_a_masks, text_b_indices, text_b_masks, label_ids = text_preprocessor(features)
is_training = (mode == tf.estimator.ModeKeys.TRAIN)
word_embeddings = self._add_word_embeddings(vocab_size=text_preprocessor.vocab.size,
embed_size=self.config.embedding_size,
pretrained_word_embeddings=text_preprocessor.pretrained_word_embeddings,
trainable=not self.config.fix_embedding)
a_embeds = tf.nn.embedding_lookup(word_embeddings, text_a_indices)
b_embeds = tf.nn.embedding_lookup(word_embeddings, text_b_indices)
dam_output_features = layers.DAMEncoder(self.config.hidden_size)(
[a_embeds, b_embeds, text_a_masks, text_b_masks], training=is_training)
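        # The "Plus" variant runs a BiCNN branch alongside the DAM branch; the two projected
        # feature vectors are concatenated before the output layer.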
bcnn_output_features = layers.BiCNNEncoder(self.config.hidden_size // 2)(
[a_embeds, b_embeds, text_a_masks, text_b_masks])
dam_output_features = tf.layers.dropout(
dam_output_features, rate=0.2, training=is_training, name='dam_out_features_dropout')
dam_output_features = layers.Dense(self.config.hidden_size,
activation=tf.nn.relu,
kernel_initializer=layers.get_initializer(0.02),
name='dam_out_features_projection')(dam_output_features)
bcnn_output_features = tf.layers.dropout(
bcnn_output_features, rate=0.2, training=is_training, name='dam_out_features_dropout')
bcnn_output_features = layers.Dense(self.config.hidden_size,
activation=tf.nn.relu,
kernel_initializer=layers.get_initializer(0.02),
name='dam_out_features_projection')(bcnn_output_features)
output_features = tf.concat([dam_output_features, bcnn_output_features], axis=1)
logits = layers.Dense(self.config.num_labels,
kernel_initializer=layers.get_initializer(0.02),
name='output_layer')(output_features)
self.check_and_init_from_checkpoint(mode)
return logits, label_ids
class BiCNNTextMatch(BaseTextMatch):
    """ Text Match model based on the BiCNN model, developed by the Alibaba PAI Group
.. highlight:: python
.. code-block:: python
default_param_dict = {
"max_vocab_size": 20000,
"embedding_size": 300,
"hidden_size": 200,
"num_labels": 2,
"first_sequence_length": 50,
"second_sequence_length": 50,
"pretrain_word_embedding_name_or_path": "",
"fix_embedding": False
}
"""
def __init__(self, **kwargs):
super(BiCNNTextMatch, self).__init__(**kwargs)
self.pre_build_vocab = self.config.mode.startswith("train")
    @staticmethod
def get_input_tensor_schema():
return "input_ids_a:int:64,input_mask_a:int:64,input_ids_b:int:64,input_mask_b:int:64,label_id:int:1"
    @staticmethod
def get_received_tensor_schema():
return "input_ids_a:int:64,input_mask_a:int:64,input_ids_b:int:64,input_mask_b:int:64"
    @staticmethod
def default_model_params():
""" Get default model required parameters
Returns:
default_param_dict (`dict`): key/value pair of default model required parameters
"""
default_param_dict = {
"max_vocab_size": 20000,
"embedding_size": 300,
"hidden_size": 200,
"num_labels": 2,
"first_sequence_length": 50,
"second_sequence_length": 50,
"pretrain_word_embedding_name_or_path": "",
"fix_embedding": False
}
return default_param_dict
    def build_logits(self, features, mode=None):
        """ Building the BiCNN text match graph
        Args:
            features (`OrderedDict`): A dict mapping raw inputs to tensors
            mode (`str`): A `tf.estimator.ModeKeys` value indicating whether the model is training
        Returns:
            logits (`Tensor`): The output of the last dense layer, shape [None, num_labels]
            label_ids (`Tensor`): label ids, shape [None]
        """
text_preprocessor = DeepTextPreprocessor(self.config, mode=mode)
text_a_indices, text_a_masks, text_b_indices, text_b_masks, label_ids = text_preprocessor(features)
is_training = (mode == tf.estimator.ModeKeys.TRAIN)
word_embeddings = self._add_word_embeddings(vocab_size=text_preprocessor.vocab.size,
embed_size=self.config.embedding_size,
pretrained_word_embeddings=text_preprocessor.pretrained_word_embeddings,
trainable=not self.config.fix_embedding)
a_embeds = tf.nn.embedding_lookup(word_embeddings, text_a_indices)
b_embeds = tf.nn.embedding_lookup(word_embeddings, text_b_indices)
bcnn_output_features = layers.BiCNNEncoder(self.config.hidden_size)(
[a_embeds, b_embeds, text_a_masks, text_b_masks])
bcnn_output_features = tf.layers.dropout(
bcnn_output_features, rate=0.2, training=is_training, name='dam_out_features_dropout')
bcnn_output_features = layers.Dense(self.config.hidden_size,
activation=tf.nn.relu,
kernel_initializer=layers.get_initializer(0.02),
name='dam_out_features_projection')(bcnn_output_features)
logits = layers.Dense(self.config.num_labels,
kernel_initializer=layers.get_initializer(0.02),
name='output_layer')(bcnn_output_features)
self.check_and_init_from_checkpoint(mode)
return logits, label_ids
class HCNNTextMatch(BaseTextMatch):
    """ Text Match model based on the Hybrid Context CNN (HCNN) model
    Minghui Qiu, Yang Liu, Feng Ji, Wei Zhou, Jun Huang, et al.
    `Transfer Learning for Context-Aware Question Matching in Information-seeking
    Conversation Systems <https://www.aclweb.org/anthology/P18-2034//>`_,
    *ACL*, 2018.
.. highlight:: python
.. code-block:: python
default_param_dict = {
"max_vocab_size": 20000,
"embedding_size": 300,
"hidden_size": 300,
"num_labels": 2,
"first_sequence_length": 64,
"second_sequence_length": 64,
"pretrain_word_embedding_name_or_path": "",
"fix_embedding": False,
"l2_reg": 0.0004,
"filter_size": 4,
}
"""
def __init__(self, **kwargs):
super(HCNNTextMatch, self).__init__(**kwargs)
self.pre_build_vocab = self.config.mode.startswith("train")
    @staticmethod
def get_input_tensor_schema():
return "input_ids_a:int:64,input_mask_a:int:64,input_ids_b:int:64,input_mask_b:int:64,label_id:int:1"
    @staticmethod
def get_received_tensor_schema():
return "input_ids_a:int:64,input_mask_a:int:64,input_ids_b:int:64,input_mask_b:int:64"
    @staticmethod
def default_model_params():
""" Get default model required parameters
Returns:
default_param_dict (`dict`): key/value pair of default model required parameters
"""
default_param_dict = {
"max_vocab_size": 20000,
"embedding_size": 300,
"hidden_size": 200,
"num_labels": 2,
"first_sequence_length": 64,
"second_sequence_length": 64,
"pretrain_word_embedding_name_or_path": "",
"fix_embedding": False,
"l2_reg": 0.0004,
"filter_size": 4,
}
return default_param_dict
    def build_logits(self, features, mode=None):
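        """ Building the HCNN text match graph
        Args:
            features (`OrderedDict`): A dict mapping raw inputs to tensors
            mode (`str`): A `tf.estimator.ModeKeys` value indicating whether the model is training
        Returns:
            logits (`Tensor`): The output of the last dense layer, shape [None, num_labels]
            label_ids (`Tensor`): label ids, shape [None]
        """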
text_preprocessor = DeepTextPreprocessor(self.config, mode=mode)
text_a_indices, text_a_masks, text_b_indices, text_b_masks, label_ids = text_preprocessor(features)
is_training = (mode == tf.estimator.ModeKeys.TRAIN)
word_embeddings = self._add_word_embeddings(vocab_size=text_preprocessor.vocab.size,
embed_size=self.config.embedding_size,
pretrained_word_embeddings=text_preprocessor.pretrained_word_embeddings,
trainable=not self.config.fix_embedding)
a_embeds = tf.nn.embedding_lookup(word_embeddings, text_a_indices)
b_embeds = tf.nn.embedding_lookup(word_embeddings, text_b_indices)
hcnn_output_features = layers.HybridCNNEncoder(
num_filters=self.config.hidden_size,
l2_reg=self.config.l2_reg,
filter_size=self.config.filter_size)([a_embeds, b_embeds, text_a_masks, text_b_masks])
hcnn_output_features = tf.layers.dropout(
hcnn_output_features, rate=0.2, training=is_training, name='dam_out_features_dropout')
hcnn_output_features = layers.Dense(self.config.hidden_size,
activation=tf.nn.relu,
kernel_initializer=layers.get_initializer(0.02),
name='dam_out_features_projection')(hcnn_output_features)
logits = layers.Dense(self.config.num_labels,
kernel_initializer=layers.get_initializer(0.02),
name='output_layer')(hcnn_output_features)
self.check_and_init_from_checkpoint(mode)
return logits, label_ids