
# coding=utf-8
# Copyright (c) 2019 Alibaba PAI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import tensorflow as tf
from easytransfer import preprocessors, model_zoo
from easytransfer.app_zoo.base import ApplicationModel
from easytransfer.evaluators import classification_eval_metrics, multi_label_eval_metrics, regression_eval_metrics
import easytransfer.layers as layers
from easytransfer.losses import mean_square_error, multi_label_sigmoid_cross_entropy, softmax_cross_entropy
from easytransfer.preprocessors.deeptext_preprocessor import DeepTextPreprocessor


class BaseTextClassify(ApplicationModel):
    def __init__(self, **kwargs):
        """ Basic Text Classification Model """
        super(BaseTextClassify, self).__init__(**kwargs)

    @staticmethod
    def default_model_params():
        """ The default parameters of the Text Classification Model """
        raise NotImplementedError

    def build_logits(self, features, mode=None):
        """ Building graph of the Text Classification Model """
        raise NotImplementedError

    def build_loss(self, logits, labels):
        """ Building loss for training the Text Classification Model """
        if hasattr(self.config, "multi_label") and self.config.multi_label:
            return multi_label_sigmoid_cross_entropy(labels, self.config.num_labels, logits)
        elif self.config.num_labels == 1:
            return mean_square_error(labels, logits)
        else:
            return softmax_cross_entropy(labels, self.config.num_labels, logits)

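    # The dispatch above lets one config flag pick the task type:
    #   multi_label=True  -> per-label sigmoid cross-entropy (multi-label)
    #   num_labels == 1   -> mean squared error (regression)
    #   otherwise         -> softmax cross-entropy (single-label classification)
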
    def build_eval_metrics(self, logits, labels):
        """ Building evaluation metrics while evaluating

        Args:
            logits (`Tensor`): shape of [None, num_labels]
            labels (`Tensor`): shape of [None]
        Returns:
            ret_dict (`dict`): A dict with (`py_accuracy`, `py_micro_f1`, `py_macro_f1`) tf.metrics op
        """
        if hasattr(self.config, "multi_label") and self.config.multi_label:
            return multi_label_eval_metrics(logits, labels, self.config.num_labels)
        elif self.config.num_labels == 1:
            return regression_eval_metrics(logits, labels)
        else:
            return classification_eval_metrics(logits, labels, self.config.num_labels)

    def build_predictions(self, predict_output):
        """ Building prediction dict of the Text Classification Model

        Args:
            predict_output (`tuple`): (logits, _)
        Returns:
            ret_dict (`dict`): A dict with (`predictions`, `probabilities`, `logits`)
        """
        if hasattr(self.config, "multi_label") and self.config.multi_label:
            return self._build_multi_label_predictions(predict_output)
        else:
            return self._build_single_label_predictions(predict_output)

    def _build_single_label_predictions(self, predict_output):
        logits, _ = predict_output
        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
        probs = tf.nn.softmax(logits, axis=1)
        ret_dict = {
            "predictions": predictions,
            "probabilities": probs,
            "logits": logits,
        }
        return ret_dict

    def _build_multi_label_predictions(self, predict_output):
        logits, _ = predict_output
        probs = tf.sigmoid(logits)
        predictions = tf.cast(probs > 0.5, tf.int32)
        ret_dict = {
            "predictions": predictions,
            "probabilities": probs,
            "logits": logits,
        }
        return ret_dict

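# A minimal standalone sketch of the two post-processing branches above; the
# 2x3 logits tensor is made up for illustration and is not part of this file:
#
#   logits = tf.constant([[2.0, -1.0, 0.5], [0.1, 0.2, -0.3]])
#   # single-label: argmax class index plus softmax probabilities
#   predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
#   probabilities = tf.nn.softmax(logits, axis=-1)
#   # multi-label: an independent sigmoid per label, thresholded at 0.5
#   ml_probabilities = tf.sigmoid(logits)
#   ml_predictions = tf.cast(ml_probabilities > 0.5, tf.int32)
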

class BertTextClassify(BaseTextClassify):
    """ BERT Text Classification Model

    .. highlight:: python
    .. code-block:: python

        default_param_dict = {
            "pretrain_model_name_or_path": "pai-bert-base-zh",
            "multi_label": False,
            "num_labels": 2,
            "max_num_labels": 5,
            "dropout_rate": 0.1
        }
    """
    def __init__(self, **kwargs):
        super(BertTextClassify, self).__init__(**kwargs)

    @staticmethod
    def get_input_tensor_schema():
        return "input_ids:int:64,input_mask:int:64,segment_ids:int:64,label_id:int:1"

    @staticmethod
    def get_received_tensor_schema():
        return "input_ids:int:64,input_mask:int:64,segment_ids:int:64"

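    # Both schema strings above appear to be comma-separated "name:dtype:length"
    # triples, e.g. "input_ids:int:64" is an int feature of sequence length 64;
    # the received (serving-time) schema is the input schema minus the label.
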
    @staticmethod
    def default_model_params():
        """ Get default model required parameters

        Returns:
            default_param_dict (`dict`): key/value pair of default model required parameters
        """
        default_param_dict = {
            "pretrain_model_name_or_path": "pai-bert-base-zh",
            "multi_label": False,
            "num_labels": 2,
            "max_num_labels": 5,
            "dropout_rate": 0.1
        }
        return default_param_dict

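    # A hedged usage sketch: the defaults can be copied and overridden before
    # being passed into the application config (the 5-way task is made up):
    #
    #   params = BertTextClassify.default_model_params()
    #   params["num_labels"] = 5        # e.g. a 5-way sentiment task
    #   params["dropout_rate"] = 0.2
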
    def build_logits(self, features, mode=None):
        """ Building graph of BERT Text Classifier

        Args:
            features (`OrderedDict`): A dict mapping raw input to tensors
            mode (`str`): a tf.estimator.ModeKeys value telling the model whether it is training
        Returns:
            logits (`Tensor`): The output after the last dense layer. Shape of [None, num_labels]
            label_ids (`Tensor`): label_ids, shape of [None]
        """
        multi_label_flag = self.config.multi_label if hasattr(self.config, "multi_label") else False
        preprocessor = preprocessors.get_preprocessor(self.config.pretrain_model_name_or_path,
                                                      multi_label=multi_label_flag,
                                                      user_defined_config=self.config)
        input_ids, input_mask, segment_ids, labels = preprocessor(features)
        bert_backbone = model_zoo.get_pretrained_model(self.config.pretrain_model_name_or_path)
        _, pool_output = bert_backbone([input_ids, input_mask, segment_ids], mode=mode)
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        pool_output = tf.layers.dropout(
            pool_output, rate=self.config.dropout_rate, training=is_training)
        logits = layers.Dense(self.config.num_labels,
                              kernel_initializer=layers.get_initializer(0.02),
                              name='app/ez_dense')(pool_output)
        self.check_and_init_from_checkpoint(mode)
        return logits, labels

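    # Note: tf.layers.dropout is only active when training=True, so the pooled
    # BERT output is regularized in TRAIN mode and passes through unchanged at
    # eval and predict time.
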

class TextCNNClassify(BaseTextClassify):
    """ TextCNN Text Classification Model """
    def __init__(self, **kwargs):
        super(TextCNNClassify, self).__init__(**kwargs)
        self.pre_build_vocab = self.config.mode.startswith("train")

    @staticmethod
    def get_input_tensor_schema():
        return "input_ids_a:int:64,input_mask_a:int:64,input_ids_b:int:64,input_mask_b:int:64,label_id:int:1"

    @staticmethod
    def get_received_tensor_schema():
        return "input_ids_a:int:64,input_mask_a:int:64,input_ids_b:int:64,input_mask_b:int:64"

    @staticmethod
    def default_model_params():
        """ Get default model required parameters

        Returns:
            default_param_dict (`dict`): key/value pair of default model required parameters
        """
        default_param_dict = {
            "max_vocab_size": 30000,
            "embedding_size": 300,
            "num_filters": "100,100,100",
            "filter_sizes": "3,4,5",
            "dropout_rate": 0.5,
            "pretrain_word_embedding_name_or_path": "",
            "fix_embedding": False
        }
        return default_param_dict

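    # With the defaults above, the TextCNN encoder presumably pairs
    # num_filters[i] with filter_sizes[i]: three parallel convolutions of
    # kernel widths 3, 4 and 5 with 100 filters each, whose max-pooled
    # outputs concatenate into a 300-dim feature vector.
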
    def build_logits(self, features, mode=None):
        """ Building graph of the TextCNN Text Classifier

        Args:
            features (`OrderedDict`): A dict mapping raw input to tensors
            mode (`str`): a tf.estimator.ModeKeys value telling the model whether it is training
        Returns:
            logits (`Tensor`): The output after the last dense layer. Shape of [None, num_labels]
            label_ids (`Tensor`): label_ids, shape of [None]
        """
        text_preprocessor = DeepTextPreprocessor(self.config, mode=mode)
        text_indices, text_masks, _, _, label_ids = text_preprocessor(features)
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        word_embeddings = self._add_word_embeddings(
            vocab_size=text_preprocessor.vocab.size,
            embed_size=self.config.embedding_size,
            pretrained_word_embeddings=text_preprocessor.pretrained_word_embeddings,
            trainable=not self.config.fix_embedding)
        text_embeds = tf.nn.embedding_lookup(word_embeddings, text_indices)
        output_features = layers.TextCNNEncoder(
            num_filters=self.config.num_filters,
            filter_sizes=self.config.filter_sizes,
            embed_size=self.config.embedding_size,
            max_seq_len=self.config.sequence_length,
        )([text_embeds, text_masks], training=is_training)
        output_features = tf.layers.dropout(
            output_features, rate=self.config.dropout_rate,
            training=is_training, name='output_features')
        logits = layers.Dense(self.config.num_labels,
                              kernel_initializer=layers.get_initializer(0.02),
                              name='output_layer')(output_features)
        self.check_and_init_from_checkpoint(mode)
        return logits, label_ids

    def _add_word_embeddings(self, vocab_size, embed_size,
                             pretrained_word_embeddings=None, trainable=False):
        with tf.name_scope("input_representations"):
            if pretrained_word_embeddings is not None:
                tf.logging.info("Initialize word embedding from pretrained")
                word_embedding_initializer = tf.constant_initializer(pretrained_word_embeddings)
            else:
                word_embedding_initializer = layers.get_initializer(0.02)
            word_embeddings = tf.get_variable("word_embeddings",
                                              [vocab_size, embed_size],
                                              dtype=tf.float32,
                                              initializer=word_embedding_initializer,
                                              trainable=trainable)
        return word_embeddings
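
# A standalone sketch of the initializer choice in `_add_word_embeddings`; the
# toy 10x4 embedding table is made up for illustration:
#
#   import numpy as np
#   pretrained = np.random.rand(10, 4).astype(np.float32)
#   with tf.variable_scope("demo"):
#       emb = tf.get_variable(
#           "word_embeddings", [10, 4], dtype=tf.float32,
#           initializer=tf.constant_initializer(pretrained),
#           trainable=False)  # fix_embedding=True freezes the table
#   ids = tf.constant([[1, 2, 3]])
#   vectors = tf.nn.embedding_lookup(emb, ids)  # shape (1, 3, 4)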