# coding=utf-8
# Copyright (c) 2019 Alibaba PAI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
from easytransfer import preprocessors, model_zoo
from easytransfer.app_zoo.base import ApplicationModel
from easytransfer.evaluators import classification_eval_metrics, multi_label_eval_metrics, regression_eval_metrics
import easytransfer.layers as layers
from easytransfer.losses import mean_square_error, multi_label_sigmoid_cross_entropy, softmax_cross_entropy
from easytransfer.preprocessors.deeptext_preprocessor import DeepTextPreprocessor


class BaseTextClassify(ApplicationModel):
    def __init__(self, **kwargs):
        """ Basic Text Classification Model """
        super(BaseTextClassify, self).__init__(**kwargs)

    @staticmethod
    def default_model_params():
        """ The default parameter values of the Text Classification Model """
        raise NotImplementedError

    def build_logits(self, features, mode=None):
        """ Building graph of the Text Classification Model
        """
        raise NotImplementedError

    def build_loss(self, logits, labels):
        """ Building loss for training the Text Classification Model
        """
        if hasattr(self.config, "multi_label") and self.config.multi_label:
            return multi_label_sigmoid_cross_entropy(labels, self.config.num_labels, logits)
        elif self.config.num_labels == 1:
            return mean_square_error(labels, logits)
        else:
            return softmax_cross_entropy(labels, self.config.num_labels, logits)

    def build_eval_metrics(self, logits, labels):
        """ Building evaluation metrics while evaluating

        Args:
            logits (`Tensor`): shape of [None, num_labels]
            labels (`Tensor`): shape of [None]
        Returns:
            ret_dict (`dict`): A dict of (`py_accuracy`, `py_micro_f1`, `py_macro_f1`) tf.metrics ops
        """
        if hasattr(self.config, "multi_label") and self.config.multi_label:
            return multi_label_eval_metrics(logits, labels, self.config.num_labels)
        elif self.config.num_labels == 1:
            return regression_eval_metrics(logits, labels)
        else:
            return classification_eval_metrics(logits, labels, self.config.num_labels)
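
    # Loss / metric dispatch shared by build_loss and build_eval_metrics above:
    #   * config.multi_label is True -> multi-label sigmoid cross-entropy + multi-label metrics
    #   * config.num_labels == 1     -> mean squared error + regression metrics (regression head)
    #   * otherwise                  -> softmax cross-entropy + accuracy / micro-F1 / macro-F1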

    def build_predictions(self, predict_output):
        """ Building prediction dict of the Text Classification Model

        Args:
            predict_output (`tuple`): (logits, _)
        Returns:
            ret_dict (`dict`): A dict with (`predictions`, `probabilities`, `logits`)
        """
        if hasattr(self.config, "multi_label") and self.config.multi_label:
            return self._build_multi_label_predictions(predict_output)
        else:
            return self._build_single_label_predictions(predict_output)

    def _build_single_label_predictions(self, predict_output):
        logits, _ = predict_output
        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
        probs = tf.nn.softmax(logits, axis=1)
        ret_dict = {
            "predictions": predictions,
            "probabilities": probs,
            "logits": logits,
        }
        return ret_dict

    def _build_multi_label_predictions(self, predict_output):
        logits, _ = predict_output
        probs = tf.sigmoid(logits)
        predictions = tf.cast(probs > 0.5, tf.int32)
        ret_dict = {
            "predictions": predictions,
            "probabilities": probs,
            "logits": logits,
        }
        return ret_dict
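
    # Illustrative sketch (not part of the library code): how the multi-label branch above
    # turns logits into hard 0/1 predictions. The literal logit values are made up; the
    # snippet is runnable standalone under TensorFlow 1.x:
    #
    #     import tensorflow as tf
    #     logits = tf.constant([[2.0, -1.0, 0.3]])    # one example, three labels
    #     probs = tf.sigmoid(logits)                  # independent per-label probabilities
    #     preds = tf.cast(probs > 0.5, tf.int32)      # threshold each label at 0.5
    #     with tf.Session() as sess:
    #         print(sess.run(preds))                  # -> [[1 0 1]]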


class BertTextClassify(BaseTextClassify):
    """ BERT Text Classification Model

    .. highlight:: python
    .. code-block:: python

        default_param_dict = {
            "pretrain_model_name_or_path": "pai-bert-base-zh",
            "multi_label": False,
            "num_labels": 2,
            "max_num_labels": 5,
            "dropout_rate": 0.1
        }
    """
    def __init__(self, **kwargs):
        super(BertTextClassify, self).__init__(**kwargs)

    @staticmethod
    def get_input_tensor_schema():
        return "input_ids:int:64,input_mask:int:64,segment_ids:int:64,label_id:int:1"

    @staticmethod
    def get_received_tensor_schema():
        return "input_ids:int:64,input_mask:int:64,segment_ids:int:64"

    @staticmethod
    def default_model_params():
        """ Get default model required parameters

        Returns:
            default_param_dict (`dict`): key/value pairs of default model required parameters
        """
        default_param_dict = {
            "pretrain_model_name_or_path": "pai-bert-base-zh",
            "multi_label": False,
            "num_labels": 2,
            "max_num_labels": 5,
            "dropout_rate": 0.1
        }
        return default_param_dict

    def build_logits(self, features, mode=None):
        """ Building graph of the BERT Text Classifier

        Args:
            features (`OrderedDict`): A dict mapping raw input to tensors
            mode (`str`): the tf.estimator mode key, which tells the model whether it is in training
        Returns:
            logits (`Tensor`): The output after the last dense layer. Shape of [None, num_labels]
            label_ids (`Tensor`): label_ids, shape of [None]
        """
        multi_label_flag = self.config.multi_label if hasattr(self.config, "multi_label") else False
        preprocessor = preprocessors.get_preprocessor(self.config.pretrain_model_name_or_path,
                                                      multi_label=multi_label_flag,
                                                      user_defined_config=self.config)
        input_ids, input_mask, segment_ids, labels = preprocessor(features)
        bert_backbone = model_zoo.get_pretrained_model(self.config.pretrain_model_name_or_path)
        _, pool_output = bert_backbone([input_ids, input_mask, segment_ids], mode=mode)
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        pool_output = tf.layers.dropout(
            pool_output, rate=self.config.dropout_rate, training=is_training)
        logits = layers.Dense(self.config.num_labels,
                              kernel_initializer=layers.get_initializer(0.02),
                              name='app/ez_dense')(pool_output)
        self.check_and_init_from_checkpoint(mode)
        return logits, labels
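
    # Forward pass above, in order: the preprocessor converts raw text features into
    # input_ids / input_mask / segment_ids, the BERT backbone returns its pooled output
    # (shape [batch_size, hidden_size]), and dropout followed by a num_labels-wide dense
    # layer produces the classification logits.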


class TextCNNClassify(BaseTextClassify):
    """ TextCNN Text Classification Model """
    def __init__(self, **kwargs):
        super(TextCNNClassify, self).__init__(**kwargs)
        self.pre_build_vocab = self.config.mode.startswith("train")

    @staticmethod
    def get_input_tensor_schema():
        return "input_ids_a:int:64,input_mask_a:int:64,input_ids_b:int:64,input_mask_b:int:64,label_id:int:1"

    @staticmethod
    def get_received_tensor_schema():
        return "input_ids_a:int:64,input_mask_a:int:64,input_ids_b:int:64,input_mask_b:int:64"

    @staticmethod
    def default_model_params():
        """ Get default model required parameters

        Returns:
            default_param_dict (`dict`): key/value pairs of default model required parameters
        """
        default_param_dict = {
            "max_vocab_size": 30000,
            "embedding_size": 300,
            "num_filters": "100,100,100",
            "filter_sizes": "3,4,5",
            "dropout_rate": 0.5,
            "pretrain_word_embedding_name_or_path": "",
            "fix_embedding": False
        }
        return default_param_dict

    def build_logits(self, features, mode=None):
        """ Building graph of the TextCNN Text Classifier

        Args:
            features (`OrderedDict`): A dict mapping raw input to tensors
            mode (`str`): the tf.estimator mode key, which tells the model whether it is in training
        Returns:
            logits (`Tensor`): The output after the last dense layer. Shape of [None, num_labels]
            label_ids (`Tensor`): label_ids, shape of [None]
        """
        text_preprocessor = DeepTextPreprocessor(self.config, mode=mode)
        text_indices, text_masks, _, _, label_ids = text_preprocessor(features)
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        word_embeddings = self._add_word_embeddings(vocab_size=text_preprocessor.vocab.size,
                                                    embed_size=self.config.embedding_size,
                                                    pretrained_word_embeddings=text_preprocessor.pretrained_word_embeddings,
                                                    trainable=not self.config.fix_embedding)
        text_embeds = tf.nn.embedding_lookup(word_embeddings, text_indices)
        output_features = layers.TextCNNEncoder(num_filters=self.config.num_filters,
                                                filter_sizes=self.config.filter_sizes,
                                                embed_size=self.config.embedding_size,
                                                max_seq_len=self.config.sequence_length,
                                                )([text_embeds, text_masks], training=is_training)
        output_features = tf.layers.dropout(
            output_features, rate=self.config.dropout_rate, training=is_training, name='output_features')
        logits = layers.Dense(self.config.num_labels,
                              kernel_initializer=layers.get_initializer(0.02),
                              name='output_layer')(output_features)
        self.check_and_init_from_checkpoint(mode)
        return logits, label_ids
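
    # With the default settings, filter_sizes "3,4,5" and num_filters "100,100,100" describe
    # three parallel convolution branches over the word embeddings; in the standard TextCNN
    # design the pooled branch outputs are concatenated into a single feature vector before
    # the dropout and dense output layer above.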

    def _add_word_embeddings(self, vocab_size, embed_size, pretrained_word_embeddings=None, trainable=False):
        with tf.name_scope("input_representations"):
            if pretrained_word_embeddings is not None:
                tf.logging.info("Initialize word embedding from pretrained")
                word_embedding_initializer = tf.constant_initializer(pretrained_word_embeddings)
            else:
                word_embedding_initializer = layers.get_initializer(0.02)
            word_embeddings = tf.get_variable("word_embeddings",
                                              [vocab_size, embed_size],
                                              dtype=tf.float32, initializer=word_embedding_initializer,
                                              trainable=trainable)
        return word_embeddings
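
# Illustrative sketch (not part of the library code): the embedding-table pattern used by
# _add_word_embeddings and build_logits above, runnable standalone under TensorFlow 1.x.
# The toy vocabulary size, embedding size, and token indices are made-up values.
#
#     import numpy as np
#     import tensorflow as tf
#     pretrained = np.random.rand(10, 4).astype(np.float32)        # [vocab_size, embed_size]
#     embeddings = tf.get_variable(
#         "toy_word_embeddings", [10, 4], dtype=tf.float32,
#         initializer=tf.constant_initializer(pretrained),          # start from pretrained vectors
#         trainable=False)                                          # i.e. fix_embedding=True
#     looked_up = tf.nn.embedding_lookup(embeddings, [[1, 2, 3]])   # -> shape [1, 3, 4]
#     with tf.Session() as sess:
#         sess.run(tf.global_variables_initializer())
#         print(sess.run(looked_up).shape)                          # (1, 3, 4)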