TensorRTをtensorflowとpycudaと一緒に使っているこのチュートリアルをやってみた。

TensorRT EngineをTensorFlowで構築する¶

UFF ToolkitはTensorFlow modelsのUFFへの変換を可能にする。UFF parserは、これらのUFF modelsからTensorRT enginesを構築できる。今回のtutorialでは、手書き数字を分類するLeNet5 modelを訓練し、その後推論用にTensorRT Engineをビルドする。TensorRTをインポートする前に、まずTensorFlowをインポートする必要がある。GPUとの間のデータ転送にはPyCUDAを、データの格納にはNumPyを使う。最後に、serialized frozen TensorFlow modelのグラフをUFFへ変換するためにUFF toolkitをインポートする。

import tensorflow as tf
import tensorrt as trt
from tensorrt.parsers import uffparser
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from random import randint # generate a random test case
from PIL import Image
from matplotlib.pyplot import imshow # To show test case
import time
import os
import uff

TensorFlowにおけるtraining modelsの詳細はこのサイトを参照。先ず、便宜のために、ハイパーパラメーターとヘルパーファンクションを定義してから、network, loss metrics, training/test steps, input nodes, data loaderを定義する。

# Hyperparameters and model constants shared by the training pipeline.
STARTER_LEARNING_RATE = 1e-4  # initial learning rate fed to the decay schedule
BATCH_SIZE = 10  # examples fetched per training/evaluation step
NUM_CLASSES = 10  # number of output classes (handwritten digits)
MAX_STEPS = 3000  # number of training iterations
IMAGE_SIZE = 28  # input images are IMAGE_SIZE x IMAGE_SIZE
IMAGE_PIXELS = IMAGE_SIZE ** 2
OUTPUT_NAMES = ["fc2/Relu"]  # graph output node(s) kept when freezing the model

Conv2d layerをpaddingしていることに留意する。TensorRTはレイヤーにsymmetric paddingを要求する。

def WeightsVariable(shape):
    """Create a weight variable of the given shape.

    The initial value is sampled from a truncated normal distribution with
    a standard deviation of 0.1; the sampling op is named 'weights'.
    """
    initial_value = tf.truncated_normal(shape, stddev=0.1, name='weights')
    return tf.Variable(initial_value)

def BiasVariable(shape):
    """Create a bias variable of the given shape, initialised to 0.1.

    The constant op holding the initial value is named 'biases'.
    """
    initial_value = tf.constant(0.1, shape=shape, name='biases')
    return tf.Variable(initial_value)

def Conv2d(x, W, b, strides=1):
    """Explicitly padded 2-D convolution followed by bias add and ReLU.

    The input is zero-padded symmetrically by half the filter size on each
    spatial axis and the convolution itself runs with 'VALID' padding, so
    the padding is always symmetric.

    Args:
        x: Input tensor; the padding matrix and strides assume the default
            NHWC layout of ``tf.nn.conv2d``.
        W: Filter variable whose first two dimensions are the filter height
            and width.
        b: Bias added to the convolution output.
        strides: Stride applied to both spatial dimensions.

    Returns:
        The ReLU-activated convolution output.
    """
    filter_size = W.get_shape().as_list()
    # Pad each spatial axis from its own filter dimension so that non-square
    # filters are padded correctly as well.
    pad_h = filter_size[0] // 2
    pad_w = filter_size[1] // 2
    pad_mat = np.array([[0, 0], [pad_h, pad_h], [pad_w, pad_w], [0, 0]])
    x = tf.pad(x, pad_mat)
    x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='VALID')
    x = tf.nn.bias_add(x, b)
    return tf.nn.relu(x)

def MaxPool2x2(x, k=2):
    """Max-pool ``x`` with a k x k window and stride k, without padding.

    Args:
        x: Input tensor; the window and strides assume the default NHWC
            layout of ``tf.nn.max_pool``.
        k: Window size and stride for both spatial dimensions.

    Returns:
        The pooled tensor ('VALID' padding, so no implicit padding is added).
    """
    return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1], padding='VALID')

def network(images):
    """Build the LeNet5-style classifier graph and return its output tensor.

    Layout: two conv + 2x2 max-pool stages (32 and 64 filters of size 5x5)
    followed by two fully connected layers (1024 units, then NUM_CLASSES).

    Args:
        images: Input image batch; the 7 * 7 * 64 flatten size matches
            28x28 single-channel inputs.

    Returns:
        The output of the last fully connected layer. The final ReLU is kept
        because its op ("fc2/Relu") is the node listed in OUTPUT_NAMES.
    """
    # Convolution 1
    with tf.name_scope('conv1'):
        weights = WeightsVariable([5, 5, 1, 32])
        biases = BiasVariable([32])
        # Conv2d already applies the ReLU activation.
        conv1 = Conv2d(images, weights, biases)
        pool1 = MaxPool2x2(conv1)

    # Convolution 2
    with tf.name_scope('conv2'):
        weights = WeightsVariable([5, 5, 32, 64])
        biases = BiasVariable([64])
        conv2 = Conv2d(pool1, weights, biases)
        pool2 = MaxPool2x2(conv2)
        pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])

    # Fully Connected 1
    with tf.name_scope('fc1'):
        weights = WeightsVariable([7 * 7 * 64, 1024])
        biases = BiasVariable([1024])
        fc1 = tf.nn.relu(tf.matmul(pool2_flat, weights) + biases)

    # Fully Connected 2
    with tf.name_scope('fc2'):
        weights = WeightsVariable([1024, NUM_CLASSES])
        biases = BiasVariable([NUM_CLASSES])
        fc2 = tf.nn.relu(tf.matmul(fc1, weights) + biases)

    return fc2

def loss_metrics(logits, labels):
    """Return the mean sparse softmax cross-entropy of ``logits`` vs ``labels``.

    The per-example loss op is named 'softmax' and the reduced mean
    'softmax_mean'.
    """
    per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits, name='softmax')
    mean_loss = tf.reduce_mean(per_example_loss, name='softmax_mean')
    return mean_loss

def training(loss):
    """Build the op that performs one optimisation step on ``loss``.

    Registers scalar summaries for the loss and the learning rate, creates a
    non-trainable 'global_step' counter, and minimises the loss with a
    momentum optimizer (momentum 0.9) whose learning rate follows a
    staircase exponential decay (factor 0.75 every 100000 steps).

    Returns:
        The training op; running it also increments the global step.
    """
    tf.summary.scalar('loss', loss)
    step_counter = tf.Variable(0, name='global_step', trainable=False)
    decayed_rate = tf.train.exponential_decay(
        STARTER_LEARNING_RATE,
        step_counter,
        decay_steps=100000,
        decay_rate=0.75,
        staircase=True)
    tf.summary.scalar('learning_rate', decayed_rate)
    momentum_opt = tf.train.MomentumOptimizer(decayed_rate, momentum=0.9)
    return momentum_opt.minimize(loss, global_step=step_counter)

def evaluation(logits, labels):
    """Return an int32 tensor counting the top-1 correct predictions."""
    is_hit = tf.nn.in_top_k(logits, labels, 1)
    hit_count = tf.reduce_sum(tf.cast(is_hit, tf.int32))
    return hit_count

def do_eval(sess,
            eval_correct,
            images_placeholder,
            labels_placeholder,
            data_set,
            summary):
    """Evaluate one epoch of ``data_set`` and report precision @ 1.

    Args:
        sess: Session in which the model was trained.
        eval_correct: Tensor returning the number of correct predictions.
        images_placeholder: Placeholder fed with the image batch.
        labels_placeholder: Placeholder fed with the label batch.
        data_set: Data set exposing ``num_examples`` and ``next_batch``.
        summary: Merged summary op evaluated together with ``eval_correct``.

    Returns:
        A serialized summary: the summary of the last evaluated batch with a
        'precision' scalar appended, suitable for ``FileWriter.add_summary``.

    Raises:
        ValueError: If ``data_set`` holds fewer than BATCH_SIZE examples.
    """
    steps_per_epoch = data_set.num_examples // BATCH_SIZE
    if steps_per_epoch == 0:
        # Without this guard the precision below would divide by zero.
        raise ValueError('data_set has fewer than BATCH_SIZE (%d) examples'
                         % BATCH_SIZE)
    num_examples = steps_per_epoch * BATCH_SIZE

    true_count = 0
    log = None
    for _ in range(steps_per_epoch):
        feed_dict = fill_feed_dict(data_set,
                                   images_placeholder,
                                   labels_placeholder)
        log, correctness = sess.run([summary, eval_correct], feed_dict=feed_dict)
        true_count += correctness
    precision = float(true_count) / num_examples
    print('Num examples %d, Num Correct: %d Precision @ 1: %0.04f' %
          (num_examples, true_count, precision))

    # Record the precision directly in the returned summary. Creating a
    # tf.summary.scalar op here would only add a new, never-evaluated node to
    # the graph on every call, so the value would never reach the log.
    summary_proto = tf.Summary()
    summary_proto.ParseFromString(log)
    summary_proto.value.add(tag='precision', simple_value=precision)
    return summary_proto.SerializeToString()

def placeholder_inputs(batch_size):
    """Create the image and label input placeholders.

    The batch dimension is left unspecified so the same graph accepts any
    batch size. The image placeholder must be created first: it gets the
    default op name "Placeholder", which is the input name registered with
    the UFF parser.

    Args:
        batch_size: Unused; kept for interface compatibility.

    Returns:
        A tuple ``(images_placeholder, labels_placeholder)``: float32 images
        of shape (None, IMAGE_SIZE, IMAGE_SIZE, 1) and int32 labels of shape
        (None,).
    """
    images_placeholder = tf.placeholder(
        tf.float32, shape=(None, IMAGE_SIZE, IMAGE_SIZE, 1))
    # (None,) is a 1-tuple; a bare (None) is just None, i.e. an unknown shape.
    labels_placeholder = tf.placeholder(tf.int32, shape=(None,))
    return images_placeholder, labels_placeholder

def fill_feed_dict(data_set, images_pl, labels_pl):
    """Fetch the next BATCH_SIZE examples and map them to the placeholders.

    Images are reshaped to (-1, 28, 28, 1) before being fed.

    Returns:
        A feed dict keyed by ``images_pl`` and ``labels_pl``.
    """
    batch_images, batch_labels = data_set.next_batch(BATCH_SIZE)
    reshaped_images = np.reshape(batch_images, (-1, 28, 28, 1))
    return {images_pl: reshaped_images, labels_pl: batch_labels}

トレーニングノードが取り除かれたfrozen modelを返す関数の中にトレーニングパイプラインを定義する。

def run_training(data_sets):
    """Train the network and return a frozen graph without training nodes.

    Builds the model in a fresh graph, trains it for MAX_STEPS steps, writes
    summaries and checkpoints under /tmp/tensorflow/mnist/log, and evaluates
    on the validation split every 1000 steps and after the last step.

    Args:
        data_sets: Object exposing ``train`` and ``validation`` data sets.

    Returns:
        The GraphDef obtained by converting variables to constants for
        OUTPUT_NAMES and then removing training nodes.
    """
    log_dir = "/tmp/tensorflow/mnist/log"
    with tf.Graph().as_default() as graph:
        images_placeholder, labels_placeholder = placeholder_inputs(BATCH_SIZE)
        logits = network(images_placeholder)
        loss = loss_metrics(logits, labels_placeholder)
        train_op = training(loss)
        eval_correct = evaluation(logits, labels_placeholder)
        summary = tf.summary.merge_all()
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
        config = tf.ConfigProto(gpu_options=gpu_options)
        summary_writer = tf.summary.FileWriter(log_dir, graph=graph)
        test_writer = tf.summary.FileWriter(os.path.join(log_dir, "validation"),
                                            graph=graph)
        try:
            # The context manager closes the session once the frozen graph
            # has been extracted, releasing the GPU memory it holds.
            with tf.Session(config=config) as sess:
                sess.run(init)
                for step in range(MAX_STEPS):
                    start_time = time.time()
                    feed_dict = fill_feed_dict(data_sets.train,
                                               images_placeholder,
                                               labels_placeholder)
                    _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)
                    duration = time.time() - start_time
                    if step % 100 == 0:
                        print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
                        summary_str = sess.run(summary, feed_dict=feed_dict)
                        summary_writer.add_summary(summary_str, step)
                        summary_writer.flush()
                    if (step + 1) % 1000 == 0 or (step + 1) == MAX_STEPS:
                        checkpoint_file = os.path.join(log_dir, "model.ckpt")
                        saver.save(sess, checkpoint_file, global_step=step)
                        print('Validation Data Eval:')
                        log = do_eval(sess,
                                      eval_correct,
                                      images_placeholder,
                                      labels_placeholder,
                                      data_sets.validation,
                                      summary)
                        test_writer.add_summary(log, step)

                # Freezing needs the live session to read variable values.
                graphdef = graph.as_graph_def()
                frozen_graph = tf.graph_util.convert_variables_to_constants(sess,
                                                                            graphdef,
                                                                            OUTPUT_NAMES)
                return tf.graph_util.remove_training_nodes(frozen_graph)
        finally:
            # Closing flushes pending events (the validation writer was never
            # flushed otherwise) and releases the file handles.
            summary_writer.close()
            test_writer.close()

TensorFlow MNIST data loaderをロードして訓練を開始する。TensorBoardで訓練を視覚化できるようにモデルにはサマリーが含まれている。

# Load the MNIST data sets via the TensorFlow contrib loader.
MNIST_DATASETS = tf.contrib.learn.datasets.load_dataset("mnist")
# Train the model; the result is the frozen graph returned by run_training.
tf_model = run_training(MNIST_DATASETS)

Extracting MNIST-data/train-images-idx3-ubyte.gz
Extracting MNIST-data/train-labels-idx1-ubyte.gz
Extracting MNIST-data/t10k-images-idx3-ubyte.gz
Extracting MNIST-data/t10k-labels-idx1-ubyte.gz
Step 0: loss = 2.18 (0.115 sec)
Step 100: loss = 2.02 (0.003 sec)
Step 200: loss = 1.55 (0.003 sec)
Step 300: loss = 1.91 (0.002 sec)
Step 400: loss = 1.08 (0.002 sec)
Step 500: loss = 1.45 (0.003 sec)
Step 600: loss = 1.34 (0.003 sec)
Step 700: loss = 1.74 (0.002 sec)
Step 800: loss = 1.08 (0.002 sec)
Step 900: loss = 1.34 (0.002 sec)
Validation Data Eval:
Num examples 5000, Num Correct: 3534 Precision @ 1: 0.7068
Step 1000: loss = 0.96 (0.006 sec)
Step 1100: loss = 1.29 (0.003 sec)
Step 1200: loss = 0.99 (0.002 sec)
Step 1300: loss = 1.31 (0.003 sec)
Step 1400: loss = 1.21 (0.003 sec)
Step 1500: loss = 0.79 (0.002 sec)
Step 1600: loss = 0.26 (0.002 sec)
Step 1700: loss = 0.49 (0.003 sec)
Step 1800: loss = 1.21 (0.003 sec)
Step 1900: loss = 0.76 (0.003 sec)
Validation Data Eval:
Num examples 5000, Num Correct: 4187 Precision @ 1: 0.8374
Step 2000: loss = 0.60 (0.006 sec)
Step 2100: loss = 1.52 (0.002 sec)
Step 2200: loss = 1.65 (0.003 sec)
Step 2300: loss = 0.96 (0.003 sec)
Step 2400: loss = 0.95 (0.002 sec)
Step 2500: loss = 0.48 (0.003 sec)
Step 2600: loss = 1.01 (0.003 sec)
Step 2700: loss = 0.95 (0.003 sec)
Step 2800: loss = 0.56 (0.003 sec)
Step 2900: loss = 1.51 (0.003 sec)
Validation Data Eval:
Num examples 5000, Num Correct: 4366 Precision @ 1: 0.8732
INFO:tensorflow:Froze 8 variables.
Converted 8 variables to const ops.

TensorFlow ModelをUFFへ変換¶

訓練したモデルをserialized UFF modelに変換する。モデル変換には最低限model streamと名前を定義する必要がある。

# Convert the frozen TensorFlow graph to a serialized UFF model. Reusing
# OUTPUT_NAMES keeps the output node(s) in sync with the ones used when the
# graph was frozen.
uff_model = uff.from_tensorflow(tf_model, OUTPUT_NAMES)

Using output node fc2/Relu
Converting to UFF graph
No. nodes: 28

UFFモデルをTensorRTにインポートしてエンジン構築¶

TensorRT engineをビルドできるUFFモデルストリームが用意できたので、先ずはTensorRT用のlogger作成から始める。

# Console logger for TensorRT, configured with the ERROR severity level.
G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)

次に、UFFパーサーを作って所望のインプット/アウトプットノードを指定する。

# Create the UFF parser and declare the graph's input and output nodes.
parser = uffparser.create_uff_parser()
# "Placeholder" is the default op name of the image placeholder; the input
# dimensions are given as (1, 28, 28).
parser.register_input("Placeholder", (1,28,28), 0)
# Take the output node from OUTPUT_NAMES so it matches the frozen graph.
parser.register_output(OUTPUT_NAMES[0])

True

次に、logger、parser、uff model streamといくつかの設定(最大バッチサイズと最大ワークスペースサイズ)を、エンジンをビルドするユーティリティ関数に渡す。

# Build the TensorRT engine from the UFF model; the trailing arguments
# (1 and 1 << 20) are the maximum batch size and maximum workspace size.
engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser, 1, 1 << 20)

ここでパーサーは用済みなので破棄することができる。

# The parser is not used after the engine has been built, so release it.
parser.destroy()

次に、TensorFlow dataloaderからテストケースを得る(それをFP32に変換する)。

# Take a single example from the MNIST test split.
img, label = MNIST_DATASETS.test.next_batch(1)
img = img[0]
# Convert input data to Float32
img = img.astype(np.float32)
label = label[0]
# Display the test image as a 28x28 picture (notebook inline rendering).
%matplotlib inline
imshow(img.reshape(28,28))

<matplotlib.image.AxesImage at 0x7f6917a27c18>

それから、エンジン用のランタイムと実行コンテキストを作成する。

# Create the inference runtime and an execution context for the engine.
runtime = trt.infer.create_infer_runtime(G_LOGGER)
context = engine.create_execution_context()

次に、GPUとCPUに推論結果を保存するための記憶領域をメモリに割り当てる。これらの割り当てサイズは、インプット/予想アウトプット×バッチサイズになる。

# Host buffer that receives the 10 float32 output values.
output = np.empty(10, dtype = np.float32)

# Allocate device memory for the input image and the output (batch size 1)
d_input = cuda.mem_alloc(1 * img.nbytes)
d_output = cuda.mem_alloc(1 * output.nbytes)

エンジンは、バインディング(GPUメモリへのポインター)を要求する。PyCUDAは、メモリ割り当ての結果をintにキャスティングすることでこの事を可能にしてくれる。

# Device allocations cast to int, passed to the engine as its bindings
# (input first, then output).
bindings = [int(d_input), int(d_output)]

推論実行のためにcudaストリームを作成する。

# CUDA stream on which the copies and the inference are enqueued.
stream = cuda.Stream()

次に、データをGPUに転送して推論を実行し、結果をCPUに返す。

# Transfer input data to device
cuda.memcpy_htod_async(d_input, img, stream)
# Execute model (batch size 1) on the same stream
context.enqueue(1, bindings, stream.handle, None)
# Transfer predictions back
cuda.memcpy_dtoh_async(output, d_output, stream)
# Synchronize: wait for the work queued on the stream before reading `output`
stream.synchronize()

推論結果を得るのにnp.argmaxを使うことができる。

# Compare the true label with the index of the largest output value.
print("Test Case: " + str(label))
print ("Prediction: " + str(np.argmax(output)))

Test Case: 1
Prediction: 1

エンジンは後で使えるように保存することもできる。

# Serialize the engine and write it to ./tf_mnist.engine for later reuse.
trt.utils.write_engine_to_file("./tf_mnist.engine", engine.serialize())

True

tensorrt.utils.load_engineを使って後でエンジンをロードすることが可能。

# Load the engine back from the file written above.
new_engine = trt.utils.load_engine(G_LOGGER, "./tf_mnist.engine")

最後に、コンテクスト、エンジン、ランタイムを破棄する。

# Release the TensorRT objects: the context first, then both engines, and
# the runtime last.
context.destroy()
engine.destroy()
new_engine.destroy()
runtime.destroy()

PyCUDA、TensorRT、TensorFlowを集中的に勉強しようと思っている今日この頃。