[tf]nlp任务中使用 tf.data

读取数据

比较好的方法是使用tf.data.Dataset.from_generator读取数据，因为这样允许从任意一个迭代器（生成器）中读取数据，可以更灵活地对数据进行预处理等等。

def generator_fn():
    """Yield a toy ``(tokens, length)`` pair for each of the digits 0 and 1.

    The sentence is split on whitespace, every token is encoded to bytes,
    and the second element of the pair is the number of tokens.
    """
    for number in (0, 1):
        tokens = 'I am digit {}'.format(number).split()
        encoded = [token.encode() for token in tokens]
        yield encoded, len(tokens)

虽然有很多读取数据的方法，比如tf.data.TextLineDataset是从text文本中读取数据，比如tf.data.Dataset.from_tensor_slices是从np array中读取数据的，tf.data.TFRecordDataset是从TF records中读取数据的，但是作为一个NLP的研究人员，除非要使用上面三个读取方式中的一个特定函数来获得模型性能上的提升，否则为了灵活性起见还是使用tf.data.Dataset.from_generator最好。

# Signature of one generated example: a variable-length vector of strings
# paired with a scalar int32 (the token count).
types = (tf.string, tf.int32)
shapes = ([None], ())

dataset = tf.data.Dataset.from_generator(
    generator_fn, output_types=types, output_shapes=shapes)

测试是否正常

The tf.enable_eager_execution() must be called at program startup, just after your import tensorflow as tf

 import tensorflow as tf
 # Eager mode has to be enabled at program startup, right after the import.
 tf.enable_eager_execution()

 # With eager execution on, the dataset is iterated like a plain Python
 # iterable; each element unpacks into a (words, size) pair of tensors.
 for tf_words, tf_size in dataset:
     print(tf_words, tf_size)
 >>> tf.Tensor([b'I' b'am' b'digit' b'0'], shape=(4,), dtype=string) tf.Tensor(4, shape=(), dtype=int32)
 >>> tf.Tensor([b'I' b'am' b'digit' b'1'], shape=(4,), dtype=string) tf.Tensor(4, shape=(), dtype=int32)

使用一种old school的方式tf.Session()，但是这种方式需要先创建一个iterator。
然后创建一个取下一个元素的op（get_next），每运行一次这个op就取出一个元素，同时迭代器向后移动一位。

 # Graph mode: build the "next element" op once, then run it repeatedly.
 next_element = dataset.make_one_shot_iterator().get_next()
 with tf.Session() as sess:
     # Fetch two consecutive elements.
     for _ in range(2):
         print(sess.run(next_element))  # Each call moves the iterator to its next position
 >>> (array([b'I', b'am', b'digit', b'0'], dtype=object), 4)
 >>> (array([b'I', b'am', b'digit', b'1'], dtype=object), 4)

读取文件和进行分词

使用tf.data.Dataset.from_generator()最大的好处就是可以用纯python的方式进行文本的预处理，而不用想方设法找tf中的对应函数。

 def parse_fn(line_words, line_tags):
     """Turn one line of words and its line of tags into a training example.

     Both lines are split on whitespace and every token is encoded to bytes.

     Returns:
         ``((words, number_of_words), tags)``.

     Raises:
         AssertionError: if the two lines hold a different number of tokens.
     """
     # Encode in Bytes for TF
     encoded_words = [token.encode() for token in line_words.strip().split()]
     encoded_tags = [label.encode() for label in line_tags.strip().split()]
     n_words = len(encoded_words)
     assert n_words == len(encoded_tags), "Words and tags lengths don't match"
     features = (encoded_words, n_words)
     return features, encoded_tags

 def generator_fn(words, tags):
     """Lazily yield parsed examples from two line-aligned text files.

     ``words`` and ``tags`` are file paths. The files are read in lockstep,
     one line from each at a time, and every pair of lines is passed to
     ``parse_fn``. Iteration stops as soon as either file runs out of lines.
     """
     with Path(words).open('r') as f_words:
         with Path(tags).open('r') as f_tags:
             yield from map(parse_fn, f_words, f_tags)

然后使用input_fn构建dataset，并接下来将和tf.estimator配合进行使用。其中的函数在我的另外一篇博客中都有。

`prefetch` ensures that a batch of data is pre-loaded on the computing device, so that the device does not suffer from data starvation.

 def input_fn(words, tags, params=None, shuffle_and_repeat=False):
     """Build the padded, batched dataset for a words file / tags file pair.

     Args:
         words: path of the words file, forwarded to ``generator_fn``.
         tags: path of the tags file, forwarded to ``generator_fn``.
         params: optional dict of settings. ``batch_size`` falls back to 20;
             ``buffer`` and ``epochs`` are read only when
             ``shuffle_and_repeat`` is true and must then be present.
         shuffle_and_repeat: when true, shuffle with ``params['buffer']`` and
             repeat ``params['epochs']`` times before batching.

     Returns:
         The dataset after ``padded_batch`` and ``prefetch(1)``; each element
         has the nested structure ``((words, nwords), tags)``.
     """
     if params is None:
         params = {}

     # Structure of one example: ((tokens, token_count), tags).
     types = ((tf.string, tf.int32), tf.string)
     shapes = (([None], ()), [None])
     # Padding values, laid out in the same nested structure as the shapes.
     defaults = (('<pad>', 0), 'O')

     examples = functools.partial(generator_fn, words, tags)
     dataset = tf.data.Dataset.from_generator(
         examples, output_shapes=shapes, output_types=types)

     if shuffle_and_repeat:
         dataset = dataset.shuffle(params['buffer'])
         dataset = dataset.repeat(params['epochs'])

     batch_size = params.get('batch_size', 20)
     dataset = dataset.padded_batch(batch_size, shapes, defaults)
     return dataset.prefetch(1)

从运行结果可以看到，pad起到了应有的作用。

运行结果

tf.estimator

提供一个高级的用于训练测试和预测的方法，在使用之前需要定义两个组件。
一个模型文件model_fn(features, labels, mode, params) ->tf.estimator.EstimatorSpec
- 前面两个都是训练中需要的tensor。
- mode：是一个string，用于指定model_fn是用于预测，测试还是训练。
- params：是一个字典，用于存放超参。
input_fn：就是之前我们所定义的返回tf.data.Dataset的函数，它产生的features和labels这两个tensor会被传给model_fn用于训练。

def model_fn(features, labels, mode, params):
    """Skeleton of an Estimator ``model_fn`` that dispatches on ``mode``.

    Args:
        features: input passed to the inference graph.
        labels: targets used for the loss, the metrics and the train op.
        mode: compared against ``tf.estimator.ModeKeys``.
        params: hyper-parameters (not read by this skeleton).

    Returns:
        A ``tf.estimator.EstimatorSpec`` carrying predictions (PREDICT),
        loss and metrics (EVAL), or loss and train op (TRAIN).

    Raises:
        NotImplementedError: if ``mode`` is none of the three known modes.
    """
    # Define the inference graph
    graph_outputs = some_tensorflow_applied_to(features)

    # Prediction needs neither loss nor metrics, so it returns first.
    if mode == tf.estimator.ModeKeys.PREDICT:
        # Extract the predictions
        predictions = some_dict_from(graph_outputs)
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # Compute loss, metrics, tensorboard summaries
    loss = compute_loss_from(graph_outputs, labels)
    metrics = compute_metrics_from(graph_outputs, labels)

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode, loss=loss, eval_metric_ops=metrics)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # Get train operator
        train_op = compute_train_op_from(graph_outputs, labels)
        return tf.estimator.EstimatorSpec(
            mode, loss=loss, train_op=train_op)

    raise NotImplementedError('Unknown mode {}'.format(mode))

一个具体的例子说明

tf.contrib.lookup.index_table_from_file用于在tensorflow graph中将字符串（strings）映射为ids。

Here, params['words'] is the path to a file containing one lexeme (= an element of my vocabulary) per line. I use Tensorflow built-in lookup tables to map token strings to lexeme ids. We also use the same convention to store the vocabulary of tags.

# Read the hyper-parameters and unpack the inputs of the model.
dropout = params['dropout']
# `features` is unpacked as a pair: the words and the number of words.
words, nwords = features
# True only in TRAIN mode; passed to the dropout layers further down.
training = (mode == tf.estimator.ModeKeys.TRAIN)
# Lookup table built from the vocabulary file, with one extra bucket for
# out-of-vocabulary words.
vocab_words = tf.contrib.lookup.index_table_from_file(
    params['words'], num_oov_buckets=1)
with Path(params['tags']).open() as f:
    # Line indices of every tag except 'O'; handed to the precision,
    # recall and F1 metrics.
    indices = [idx for idx, tag in enumerate(f) if tag.strip() != 'O']
    # NOTE(review): the +1 presumably accounts for a single 'O' line in the
    # tag file — confirm against the vocabulary file.
    num_tags = len(indices) + 1

创建word embedding。
可以加载预训练的词向量。

# Map every word string to its integer id through the lookup table.
word_ids = vocab_words.lookup(words)
glove = np.load(params['glove'])['embeddings']  # np.array
# Stack one all-zero row of width params['dim'] below the loaded vectors.
variable = np.vstack([glove, [[0.]*params['dim']]])  # For unknown words
# trainable=False: the embedding matrix is not updated by the optimizer.
variable = tf.Variable(variable, dtype=tf.float32, trainable=False)
embeddings = tf.nn.embedding_lookup(variable, word_ids)
# Dropout is driven by the `training` flag.
embeddings = tf.layers.dropout(embeddings, rate=dropout, training=training)

我们使用最为有效的lstm cell方式，它将所有的LSTM操作都放在一个CUDA kernel里面进行

# Bidirectional LSTM built from two fused cells of size params['lstm_size'].
t = tf.transpose(embeddings, perm=[1, 0, 2])  # Make time-major
lstm_cell_fw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
lstm_cell_bw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
# Wrap the second cell so that it serves as the backward direction.
lstm_cell_bw = tf.contrib.rnn.TimeReversedFusedRNN(lstm_cell_bw)
# Both directions read the same input; only their outputs are kept, the
# second return value is discarded.
output_fw, _ = lstm_cell_fw(t, dtype=tf.float32, sequence_length=nwords)
output_bw, _ = lstm_cell_bw(t, dtype=tf.float32, sequence_length=nwords)
# Join forward and backward outputs along the last axis.
output = tf.concat([output_fw, output_bw], axis=-1)
output = tf.transpose(output, perm=[1, 0, 2])  # Make batch-major
output = tf.layers.dropout(output, rate=dropout, training=training)

LSTMBlockFusedCell要求输入是time-major（时间维在前）的，所以要先使用tf.transpose交换batch维和time维，计算完成后再转置回batch-major。

This is an extremely efficient LSTM implementation, that uses a single TF op for the entire LSTM. It should be both faster and more memory-efficient than LSTMBlockCell defined above.

加入CRF

# Project the LSTM output to one score per tag.
logits = tf.layers.dense(output, num_tags)
# Square (num_tags x num_tags) variable handed to the CRF decoder.
crf_params = tf.get_variable("crf", [num_tags, num_tags], dtype=tf.float32)
# Decode the predicted tag ids; the second return value is not used.
pred_ids, _ = tf.contrib.crf.crf_decode(logits, crf_params, nwords)
# NOTE(review): the metric and training snippets that follow read `tags` and
# `loss`, which are not defined anywhere in the visible code — presumably
# the label lookup and the CRF loss belong here; confirm against the full
# model_fn.

测度和使用tensorboard

import tf_metrics

# Metrics
# Mask derived from the sequence lengths, passed as weights to every metric.
weights = tf.sequence_mask(nwords)
# `indices` restricts precision / recall / F1 to the tags collected earlier
# (every tag except 'O').
metrics = {
    'acc': tf.metrics.accuracy(tags, pred_ids, weights),
    'precision': tf_metrics.precision(tags, pred_ids, num_tags, indices, weights),
    'recall': tf_metrics.recall(tags, pred_ids, num_tags, indices, weights),
    'f1': tf_metrics.f1(tags, pred_ids, num_tags, indices, weights),
}
# TensorBoard summaries
for metric_name, op in metrics.items():
    # Each metric is indexed like a pair and element 1 is what gets logged
    # (presumably the update op — confirm).
    tf.summary.scalar(metric_name, op[1])

评估模型

if mode == tf.estimator.ModeKeys.EVAL:
    return tf.estimator.EstimatorSpec(
        mode, loss=loss, eval_metric_ops=metrics)

elif mode == tf.estimator.ModeKeys.TRAIN:
    train_op = tf.train.AdamOptimizer().minimize(
        loss, global_step=tf.train.get_or_create_global_step())
    return tf.estimator.EstimatorSpec(
        mode, loss=loss, train_op=train_op)

实例化Estimator

# Hyper-parameters and resource paths; every path lives under DATADIR.
params = {
    'dim': 300,
    'dropout': 0.5,
    'num_oov_buckets': 1,
    'epochs': 25,
    'batch_size': 20,
    'buffer': 15000,
    'lstm_size': 100,
    'words': str(Path(DATADIR) / 'vocab.words.txt'),
    'chars': str(Path(DATADIR) / 'vocab.chars.txt'),
    'tags': str(Path(DATADIR) / 'vocab.tags.txt'),
    'glove': str(Path(DATADIR) / 'glove.npz'),
}
# Save a checkpoint every 120 seconds.
cfg = tf.estimator.RunConfig(save_checkpoints_secs=120)
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    model_dir='results/model',
    config=cfg,
    params=params,
)

Train an Estimator with early stopping

因为我们的函数中只有后面几个参数不同没有必要再写一个函数，因此我们使用functools.partial对函数在不同数据集合上进行包装。
早停法训练，获得F1最高值的模型，使用tf.contrib.estimator.stop_if_no_increase_hook

# 1. Define our input_fn
# Bind the file paths and the hyper-parameters up front, so that each
# resulting callable needs no further arguments.
train_inpf = functools.partial(input_fn, 'words.train.txt', 'tags.train.txt',
                               params, shuffle_and_repeat=True)
eval_inpf = functools.partial(input_fn, 'words.testa.txt', 'tags.testa.txt',
                              params)

# 2. Create a hook
# Create the evaluation directory before building the hook (presumably the
# hook reads evaluation results from it — confirm).
Path(estimator.eval_dir()).mkdir(parents=True, exist_ok=True)
# Early stopping on the 'f1' metric.
hook = tf.contrib.estimator.stop_if_no_increase_hook(
    estimator, 'f1', 500, min_steps=8000, run_every_secs=120)
# Train on the bound `train_inpf`: the raw `input_fn` cannot be called
# without its `words` and `tags` arguments.
train_spec = tf.estimator.TrainSpec(input_fn=train_inpf, hooks=[hook])
eval_spec = tf.estimator.EvalSpec(input_fn=eval_inpf, throttle_secs=120)

# 3. Train with early stopping
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)