# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
# Copyright (c) 2019 Alibaba PAI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
from copy import deepcopy
import os
import six
import uuid
import numpy as np
import tensorflow as tf
from .preprocessor import Preprocessor, PreprocessorConfig
from .tokenization import convert_to_unicode, printable_text
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
# The SQuAD annotations are character based. We first project them to
# whitespace-tokenized words. But then after WordPiece tokenization, we can
# often find a "better match". For example:
#
# Question: What year was John Smith born?
# Context: The leader was John Smith (1895-1943).
# Answer: 1895
#
# The original whitespace-tokenized answer will be "(1895-1943).". However,
# after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
# the exact answer, 1895.
#
# However, this is not always possible. Consider the following:
#
# Question: What country is the top exporter of electronics?
# Context: The Japanese electronics industry is the largest in the world.
# Answer: Japan
#
# In this case, the annotator chose "Japan" as a character sub-span of
# the word "Japanese". Since our WordPiece tokenizer does not split
# "Japanese", we just use "Japanese" as the annotation. This is fairly rare
# in SQuAD, but does happen.
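# Illustrative sketch (hypothetical WordPiece output, not taken from a real
# tokenizer run): for the example above, suppose
#   doc_tokens = ["(", "1895", "-", "1943", ")", "."]
# and the annotated span is (input_start=0, input_end=5). If
# tokenizer.tokenize("1895") == ["1895"], the nested loops below find the
# sub-span (1, 1), whose joined text "1895" equals tok_answer_text, and
# return it instead of the looser (0, 5) span.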
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
# Because of the sliding window approach taken to scoring documents, a single
# token can appear in multiple documents. E.g.
# Doc: the man went to the store and bought a gallon of milk
# Span A: the man went to the
# Span B: to the store and bought
# Span C: and bought a gallon of
# ...
#
# Now the word 'bought' will have two scores from spans B and C. We only
# want to consider the score with "maximum context", which we define as
# the *minimum* of its left and right context (the *sum* of left and
# right context will always be the same, of course).
#
# In the example the maximum context for 'bought' would be span C since
# it has 1 left context and 3 right context, while span B has 4 left context
# and 0 right context.
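# Worked example (using the spans above, each of length 5 tokens):
#   Span B ("to the store and bought"): 'bought' has 4 tokens of left context
#     and 0 of right context, so score = min(4, 0) + 0.01 * 5 = 0.05.
#   Span C ("and bought a gallon of"): 'bought' has 1 token of left context
#     and 3 of right context, so score = min(1, 3) + 0.01 * 5 = 1.05.
# Span C therefore wins, matching the explanation above.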
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
class ComprehensionPreprocessorConfig(PreprocessorConfig):
def __init__(self, **kwargs):
super(ComprehensionPreprocessorConfig, self).__init__(**kwargs)
self.input_schema = kwargs.get("input_schema")
self.sequence_length = kwargs.get("sequence_length")
self.first_sequence = kwargs.get("first_sequence")
self.second_sequence = kwargs.get("second_sequence")
self.label_name = kwargs.get("label_name")
self.label_enumerate_values = kwargs.get("label_enumerate_values")
class Example(object):
"""A single training/test example for single-turn text comprehension.
For examples without an answer, the start and end position are -1.
"""
def __init__(self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None,
is_impossible=False):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (printable_text(self.qas_id))
s += ", question_text: %s" % (
printable_text(self.question_text))
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.end_position:
s += ", end_position: %d" % (self.end_position)
if self.is_impossible:
s += ", is_impossible: %r" % (self.is_impossible)
return s
class ComprehensionPreprocessor(Preprocessor):
""" Preprocessor for single-turn text comprehension
"""
config_class = ComprehensionPreprocessorConfig
def __init__(self, config, thread_num=1, **kwargs):
super(ComprehensionPreprocessor, self).__init__(config, thread_num=thread_num, **kwargs)
self.config = config
self.max_seq_length = config.sequence_length
self.context_col_name = config.first_sequence
self.max_query_length = int(config.max_query_length)
self.doc_stride = int(config.doc_stride) if hasattr(config, "doc_stride") else 128
self.query_col_name = config.second_sequence
self.answer_col_name = config.label_name
self.input_tensor_names = []
if "/" in config.pretrain_model_name_or_path:
dirname = os.path.dirname(config.pretrain_model_name_or_path)
self.language = dirname.split("-")[-1]
else:
self.language = config.pretrain_model_name_or_path.split("-")[-1]
input_schema = config.input_schema
self.input_tensor_names = []
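# input_schema is a comma-separated list of column specs; only the column
# name before the first ':' is used here. For instance, a hypothetical
# schema "context:str:1,query:str:1,answer:str:1" yields
# input_tensor_names == ["context", "query", "answer"].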
for schema in input_schema.split(","):
name = schema.split(":")[0]
self.input_tensor_names.append(name)
self.example_count = 0
def convert_example_to_features(self, items):
""" Convert single example to multiple input features
Args:
items (`dict`): inputs from the reader
Returns:
features (`list`): list of `InputFeatures`
"""
paragraph_text = convert_to_unicode(items[self.context_col_name])
question_id_list = convert_to_unicode(items[self.query_col_name]).split("||")
questions = list(zip(question_id_list[::2], question_id_list[1::2]))
if self.answer_col_name in self.input_tensor_names:
answer_starts_list = convert_to_unicode(items[self.answer_col_name]).split("||")
answers = list(zip(answer_starts_list[::2], [int(t) for t in answer_starts_list[1::2]]))
is_training = True
else:
answers = list()
is_training = False
if self.mode.startswith("predict"):
is_training = False
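# Split the paragraph into whitespace-delimited doc_tokens and record, for
# every character, the index of the token it belongs to. Illustrative
# example (hypothetical input): for paragraph_text = "John Smith",
# doc_tokens == ["John", "Smith"] and
# char_to_word_offset == [0, 0, 0, 0, 0, 1, 1, 1, 1, 1].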
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
examples = list()
for idx, (question_text, qas_id) in enumerate(questions):
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
orig_answer_text, answer_offset = answers[idx]
is_impossible = (answer_offset == -1)
if not is_impossible:
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset + answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means that, in training mode, not every example is
# guaranteed to be preserved.
actual_text = " ".join(
doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(
whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
tf.logging.warning("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
else:
start_position = -1
end_position = -1
orig_answer_text = ""
example = Example(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
is_impossible=is_impossible)
examples.append(example)
features = list()
for (example_index, example) in enumerate(examples):
query_tokens = self.config.tokenizer.tokenize(example.question_text)
if len(query_tokens) > self.max_query_length:
query_tokens = query_tokens[0:self.max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = self.config.tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
tok_start_position = None
tok_end_position = None
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, self.config.tokenizer,
example.orig_answer_text)
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = self.max_seq_length - len(query_tokens) - 3
# We can have documents that are longer than the maximum sequence length.
# To deal with this we do a sliding window approach, where we take chunks
# of up to our max length with a stride of `doc_stride`.
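# Illustrative sketch (hypothetical sizes): with len(all_doc_tokens) == 500,
# max_tokens_for_doc == 381 and doc_stride == 128, the loop below produces
# two overlapping spans: DocSpan(start=0, length=381) and
# DocSpan(start=128, length=372).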
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name
"DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, self.doc_stride)
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in query_tokens:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
is_max_context = _check_is_max_context(doc_spans, doc_span_index,
split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = self.config.tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < self.max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
assert len(input_ids) == self.max_seq_length
assert len(input_mask) == self.max_seq_length
assert len(segment_ids) == self.max_seq_length
start_position = None
end_position = None
if is_training and not example.is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
out_of_span = False
if not (tok_start_position >= doc_start and
tok_end_position <= doc_end):
out_of_span = True
if out_of_span:
start_position = 0
end_position = 0
else:
doc_offset = len(query_tokens) + 2  # [CLS] + query tokens + [SEP]
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
if is_training and example.is_impossible:
start_position = 0
end_position = 0
unique_id = str(uuid.uuid4())
if self.example_count < 20:
tf.logging.info("*** Example ***")
tf.logging.info("unique_id: %s" % (unique_id))
tf.logging.info("example_index: %s" % (example_index))
tf.logging.info("doc_span_index: %s" % (doc_span_index))
tf.logging.info("tokens: %s" % " ".join(
[printable_text(x) for x in tokens]))
tf.logging.info("token_to_orig_map: %s" % " ".join(
["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)]))
tf.logging.info("token_is_max_context: %s" % " ".join([
"%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context)
]))
tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
tf.logging.info(
"input_mask: %s" % " ".join([str(x) for x in input_mask]))
tf.logging.info(
"segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
if is_training and example.is_impossible:
tf.logging.info("impossible example")
if is_training and not example.is_impossible:
answer_text = " ".join(tokens[start_position:(end_position + 1)])
tf.logging.info("start_position: %d" % (start_position))
tf.logging.info("end_position: %d" % (end_position))
tf.logging.info(
"answer: %s" % (printable_text(answer_text)))
self.example_count += 1
feature = InputFeatures(
unique_id=unique_id,
qas_id=example.qas_id,
example_index=example_index,
doc_span_index=doc_span_index,
doc_tokens=doc_tokens,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
start_position=start_position,
end_position=end_position,
is_impossible=example.is_impossible)
features.append(feature)
return features
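# Illustrative input-format sketch (hypothetical column names; the real names
# come from config.first_sequence / second_sequence / label_name): a single
# reader row such as
#   context column: "The leader was John Smith (1895-1943)."
#   query column:   "What year was John Smith born?||q1"
#                   (question text / qas_id pairs joined by "||")
#   answer column:  "1895||27"
#                   (answer text / character offset pairs joined by "||")
# yields one Example and, after windowing, one or more InputFeatures.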
def call(self, inputs):
# On-the-fly preprocessing is not supported; just return the raw inputs.
items = []
for name in self.input_tensor_names:
items.append(inputs[name])
return items
def process(self, inputs):
if isinstance(inputs, dict):
inputs = [inputs]
all_feature_list = []
for idx, example in enumerate(inputs):
feature_list = self.convert_example_to_features(example)
for feature in feature_list:
for key, val in example.items():
setattr(feature, key, val)
all_feature_list.extend(feature_list)
ret = dict()
for key in all_feature_list[0].__dict__.keys():
ret[key] = list()
for features in all_feature_list:
ret[key].append(getattr(features, key))
for key, val in ret.items():
ret[key] = np.array(val)
return ret
class CQAExample(object):
"""A single training/test example for multi-turn comprehension."""
def __init__(self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None,
history_answer_marker=None,
metadata=None):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.history_answer_marker = history_answer_marker
self.metadata = metadata
class MultiTurnComprehensionPreprocessor(Preprocessor):
""" Preprocessor for multi-turn text comprehension
"""
config_class = ComprehensionPreprocessorConfig
def __init__(self, config, **kwargs):
super(MultiTurnComprehensionPreprocessor, self).__init__(config, **kwargs)
self.config = config
self.doc_stride = int(config.doc_stride) if hasattr(config, "doc_stride") else 128
self.max_seq_length = int(config.sequence_length) if hasattr(config, "sequence_length") else 384
self.max_query_length = int(config.max_query_length) if hasattr(config, "max_query_length") else 64
self.max_considered_history_turns = int(config.max_considered_history_turns) \
if hasattr(config, "max_considered_history_turns") else 11
self.context_col_name = config.first_sequence
self.query_col_name = config.second_sequence
self.answer_col_name = config.label_name
if "/" in config.pretrain_model_name_or_path:
dirname = os.path.dirname(config.pretrain_model_name_or_path)
self.language = dirname.split("-")[-1]
else:
self.language = config.pretrain_model_name_or_path.split("-")[-1]
self.input_tensor_names = []
input_schema = config.input_schema
for schema in input_schema.split(","):
name = schema.split(":")[0]
self.input_tensor_names.append(name)
@staticmethod
def convert_examples_to_example_variations(examples, max_considered_history_turns):
# An example is "question + passage + markers (M3 + M4) + markers_list (M3, M4)";
# an example variation is "question + passage + markers (M3)". That is, each
# example variation carries exactly one history-answer marker, because we want
# to make a binary choice for every variation and then combine all variations
# to form an example. See the illustrative sketch below.
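# Illustrative sketch (hypothetical turn count): if an example has three
# history turns with per-turn markers [M1, M2, M3] and
# max_considered_history_turns >= 3, this function emits three variations,
# each a deep copy of the example whose history_answer_marker (and the
# corresponding metadata entries) holds exactly one of M1, M2 or M3.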
new_examples = []
for example in examples:
# if the example is the first question in the dialog, it does not contain history answers,
# so we simply append it.
if len(example.metadata['tok_history_answer_markers']) == 0:
example.metadata['history_turns'] = []
new_examples.append(example)
else:
for history_turn, marker, history_turn_text in zip(
example.metadata['history_turns'][- max_considered_history_turns:],
example.metadata['tok_history_answer_markers'][- max_considered_history_turns:],
example.metadata['history_turns_text'][- max_considered_history_turns:]):
each_new_example = deepcopy(example)
each_new_example.history_answer_marker = marker
each_new_example.metadata['history_turns'] = [history_turn]
each_new_example.metadata['tok_history_answer_markers'] = [marker]
each_new_example.metadata['history_turns_text'] = [history_turn_text]
new_examples.append(each_new_example)
return new_examples
def convert_example_to_features(self, example):
""" Convert single example to multiple input features
Args:
example (`dict`): inputs from the reader
Returns:
features (`list`): list of `CQAInputFeatures`
"""
paragraph_text = convert_to_unicode(example[self.context_col_name])
question_id_list = convert_to_unicode(example[self.query_col_name]).split("||")
questions = list(zip(question_id_list[::2], question_id_list[1::2]))
answer_starts_list = convert_to_unicode(example[self.answer_col_name]).split("||")
answers = list(zip(answer_starts_list[::2], [int(t) for t in answer_starts_list[1::2]]))
if len(answers) != len(questions):
assert len(questions) == len(answers) + 1, \
"The number of questions must equal the number of answers or exceed it by exactly one."
answers.append(("", -1))
is_training = False
else:
is_training = True
# Build paragraph doc tokens
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
# Prepare question-answer list
qas = []
for i, (question, answer) in enumerate(zip(questions, answers)):
metadata = {'turn': i + 1, 'history_turns': [], 'tok_history_answer_markers': [],
'history_turns_text': []}
end_index = i
question_with_histories = ''
start_index = 0  # we read all the histories whether we use RL or not; appropriate selections are made afterwards
history_answer_marker = []
for history_turn, (each_answer, each_question) in enumerate(
zip(answers[start_index: end_index], questions[start_index: end_index])):
# [history_answer_start, history_answer_end, history_answer_text]
each_marker = [each_answer[1], each_answer[1] + len(each_answer[0]), each_answer[0]]
history_answer_marker.append(each_marker)
metadata['history_turns'].append(history_turn + start_index + 1)
metadata['history_turns_text'].append((each_question[0], each_answer[0])) # [(q1, a1), (q2, a2), ...]
# add the current question
question_with_histories += question[0]
qas.append({'id': question[1], 'question': question_with_histories,
'answers': [{'answer_start': answer[1], 'text': answer[0]}],
'history_answer_marker': history_answer_marker, 'metadata': metadata})
examples = list()
for qa in qas:
qas_id = qa["id"]
question_text = qa["question"]
# if is_training:
# we read in the ground-truth answer both during training and prediction, because we need to compute acc and f1 at prediction time.
if len(qa["answers"]) != 1:
raise ValueError(
"For training, each question should have exactly 1 answer.")
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset + answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means that, in training mode, not every example is
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(
whitespace_tokenize(orig_answer_text))
if is_training and actual_text.find(cleaned_answer_text) == -1:
tf.logging.warning("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
# we construct a tok_history_answer_marker to store the aggregated history answer markers for a question.
# we also construct each_tok_history_answer_marker to store a single history answer marker.
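# Illustrative sketch (hypothetical positions): with len(doc_tokens) == 6 and a
# history answer spanning word positions 2-3, the resulting marker list is
# [0, 0, 1, 1, 0, 0]; overlapping history answers are OR-ed together in
# tok_history_answer_marker.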
tok_history_answer_marker = [0] * len(doc_tokens)
for marker_index, marker in enumerate(qa['history_answer_marker']):
each_tok_history_answer_marker = [0] * len(doc_tokens)
history_orig_answer_text = marker[2]
history_answer_offset = marker[0]
history_answer_length = len(history_orig_answer_text)
history_start_position = char_to_word_offset[history_answer_offset]
history_end_position = char_to_word_offset[history_answer_offset + history_answer_length - 1]
history_actual_text = " ".join(doc_tokens[history_start_position:(history_end_position + 1)])
history_cleaned_answer_text = " ".join(whitespace_tokenize(history_orig_answer_text))
if history_actual_text.find(history_cleaned_answer_text) != -1:
tok_history_answer_marker = tok_history_answer_marker[: history_start_position] + \
[1] * (history_end_position - history_start_position + 1) + \
tok_history_answer_marker[history_end_position + 1:]
each_tok_history_answer_marker = each_tok_history_answer_marker[: history_start_position] + \
[1] * (history_end_position - history_start_position + 1) + \
each_tok_history_answer_marker[history_end_position + 1:]
assert len(tok_history_answer_marker) == len(doc_tokens)
assert len(each_tok_history_answer_marker) == len(doc_tokens)
qa['metadata']['tok_history_answer_markers'].append(each_tok_history_answer_marker)
else:
tf.logging.warning("Could not find history answer: '%s' vs. '%s'", history_actual_text,
history_cleaned_answer_text)
example = CQAExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
history_answer_marker=tok_history_answer_marker,
metadata=qa['metadata'])
examples.append(example)
features = []
for (example_index, example) in enumerate(examples):
variations = self.convert_examples_to_example_variations([example], self.max_considered_history_turns)
for example in variations:
metadata = example.metadata
query_tokens = self.config.tokenizer.tokenize(example.question_text)
if len(query_tokens) > self.max_query_length:
query_tokens = query_tokens[0:self.max_query_length]
history_answer_marker = example.history_answer_marker
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
all_history_answer_marker = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = self.config.tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
all_history_answer_marker.append(history_answer_marker[i])
# we do this for both training and prediction, because we also need the start/end positions at test time to compute acc and f1
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, self.config.tokenizer,
example.orig_answer_text)
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = self.max_seq_length - len(query_tokens) - 3
# We can have documents that are longer than the maximum sequence length.
# To deal with this we do a sliding window approach, where we take chunks
# of up to our max length with a stride of `doc_stride`.
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name
"DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, self.doc_stride)
for (doc_span_index, doc_span) in enumerate(doc_spans):
marker = []
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
tokens.append("[CLS]")
marker.append(0)
segment_ids.append(0)
for token in query_tokens:
tokens.append(token)
marker.append(0)
segment_ids.append(0)
tokens.append("[SEP]")
marker.append(0)
segment_ids.append(0)
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
is_max_context = _check_is_max_context(doc_spans, doc_span_index,
split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
marker.append(all_history_answer_marker[split_token_index])
segment_ids.append(1)
tokens.append("[SEP]")
marker.append(0)
segment_ids.append(1)
input_ids = self.config.tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < self.max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
marker.append(0)
assert len(input_ids) == self.max_seq_length
assert len(input_mask) == self.max_seq_length
assert len(segment_ids) == self.max_seq_length
assert len(marker) == self.max_seq_length
if is_training:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
if (example.start_position < doc_start or
example.end_position < doc_start or
example.start_position > doc_end or example.end_position > doc_end):
continue
doc_offset = len(query_tokens) + 2  # [CLS] + query tokens + [SEP]
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
else:
start_position = -1
end_position = -1
features.append(
CQAInputFeatures(
unique_id=str(uuid.uuid4()),
example_index=example_index,
doc_span_index=doc_span_index,
doc_tokens=doc_tokens,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
start_position=start_position,
end_position=end_position,
history_answer_marker=marker,
metadata=metadata,
qas_id=example.qas_id))
return features
def call(self, inputs):
# HAE does not support on-the-fly mode; just return the raw inputs.
items = []
for name in self.input_tensor_names:
items.append(inputs[name])
return items
def process(self, inputs):
if isinstance(inputs, dict):
inputs = [inputs]
all_feature_list = []
for idx, example in enumerate(inputs):
feature_list = self.convert_example_to_features(example)
for feature in feature_list:
for key, val in example.items():
setattr(feature, key, val)
all_feature_list.extend(feature_list)
ret = dict()
for key in all_feature_list[0].__dict__.keys():
ret[key] = list()
for features in all_feature_list:
ret[key].append(getattr(features, key))
for key, val in ret.items():
ret[key] = np.array(val)
return ret
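# Illustrative multi-turn input-format sketch (hypothetical values): the query
# column holds all turns as "q1 text||id1||q2 text||id2||...", and the answer
# column holds "a1 text||start1||a2 text||start2||...". When there is exactly
# one fewer answer than questions, the last turn is treated as the prediction
# target and a placeholder ("", -1) answer is appended; otherwise the
# preprocessor runs in training mode.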