TFLearn 在给定模型精度时候提前终止训练

拿来主义：看我的代码，我是在模型acc和验证数据集val_acc都达到99.8%时候才终止训练。

import numpy as np
import tflearn
from tflearn.layers.core import dropout
from tflearn.layers.normalization import batch_normalization
from tflearn.data_utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import sys




class EarlyStoppingCallback(tflearn.callbacks.Callback):
    def __init__(self, val_acc_thresh):
        """ Note: We are free to define our init function however we please. """
        # Store a validation accuracy threshold, which we can compare against
        # the current validation accuracy at, say, each epoch, each batch step, etc.
        self.val_acc_thresh = val_acc_thresh

    def on_epoch_end(self, training_state):
        """ 
        This is the final method called in trainer.py in the epoch loop. 
        We can stop training and leave without losing any information with a simple exception.  
        """
        #print dir(training_state)
        print("Terminating training at the end of epoch", training_state.epoch)
        if training_state.val_acc >= self.val_acc_thresh and training_state.acc_value >= self.val_acc_thresh:
            raise StopIteration

    def on_train_end(self, training_state):
        """
        Furthermore, tflearn will then immediately call this method after we terminate training, 
        (or when training ends regardless). This would be a good time to store any additional 
        information that tflearn doesn't store already.
        """
        print("Successfully left training! Final model accuracy:", training_state.acc_value)

if __name__ == "__main__":
    training_data = []
    with open("feature_with_dnn_todo.dat") as f:
        training_data = [parse_line(line) for line in f]

    X = training_data
    org_labels = [1 if int(x[0])==2.0 else 0 for x in X]
    labels = to_categorical(org_labels, nb_classes=2)
    data = [x[1:] for x in X]
    input_dim = len(data[0])

    X = data
    Y = labels

    print "X len:", len(X), "Y len:", len(Y)
    trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.2, random_state=42)
    print trainX[0]
    print trainY[0]
    print testX[-1]
    print testY[-1]

    # Build neural network   
    net = tflearn.input_data(shape=[None, input_dim])
    #  RMSProp | epoch: 100 | loss: 0.25209 - acc: 0.9109 | val_loss: 0.19742 - val_acc: 0.9392 -- iter: 14084/14084 remove unwanted_cols 2
    # | RMSProp | epoch: 100 | loss: 0.29420 - acc: 0.9075 | val_loss: 0.14464 - val_acc: 0.9551 -- iter: 14084/14084
    net = batch_normalization(net)
    dense1 = tflearn.fully_connected(net, 64, activation='tanh',
                                 regularizer='L2', weight_decay=0.001)
    dropout1 = tflearn.dropout(dense1, 0.8)
    dense2 = tflearn.fully_connected(dropout1, 64, activation='tanh',
                                 regularizer='L2', weight_decay=0.001)
    dropout2 = tflearn.dropout(dense2, 0.8)
    softmax = tflearn.fully_connected(dropout2, 2, activation='softmax')

    # Regression using SGD with learning rate decay and Top-3 accuracy
    net = tflearn.regression(softmax, optimizer="rmsprop", learning_rate=0.001, loss='categorical_crossentropy')

    """
    #| Adam | epoch: 100 | loss: 0.15578 - acc: 0.9419 | val_loss: 0.16620 - val_acc: 0.9392 -- iter: 14084/14084
    net = batch_normalization(net)
    net = tflearn.fully_connected(net, input_dim) 
    net = tflearn.fully_connected(net, 128, activation='tanh') 
    net = dropout(net, 0.5)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                     loss='categorical_crossentropy', name='target')
    """
    # Define model
    model = tflearn.DNN(net)
    # Start training (apply gradient descent algorithm)
    # Initialize our callback with desired accuracy threshold.  
    early_stopping_cb = EarlyStoppingCallback(val_acc_thresh=0.998)
    try:
        model.fit(trainX, trainY, validation_set=(testX, testY), n_epoch=500, batch_size=8, show_metric=True, callbacks=early_stopping_cb)
    except StopIteration as e:
        print "pass"
    filename = 'dns_tunnel998.tflearn'
    model.save(filename)
    model.load(filename)
    #model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True, batch_size=1024, n_epoch=5)
    #model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True, batch_size=1024, n_epoch=5)
    y_predict_list = model.predict(X)
    y_predict = []
    for i in y_predict_list:
        #print  i[0]
        if i[0] >= 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    print(classification_report(org_labels, y_predict))
    print confusion_matrix(org_labels, y_predict)

The EarlyStoppingCallback Class

I show a proof-of-concept version of early stopping below. This is the simplest possible case: just stop training after the first epoch no matter what. It is up to the user to decide the conditions they want to trigger the stopping on.

class EarlyStoppingCallback(tflearn.callbacks.Callback):
    def __init__(self, val_acc_thresh):
        """ Note: We are free to define our init function however we please. """
        # Store a validation accuracy threshold, which we can compare against
        # the current validation accuracy at, say, each epoch, each batch step, etc.
        self.val_acc_thresh = val_acc_thresh
    
    def on_epoch_end(self, training_state):
        """ 
        This is the final method called in trainer.py in the epoch loop. 
        We can stop training and leave without losing any information with a simple exception.  
        """
        print("Terminating training at the end of epoch", training_state.epoch)
        raise StopIteration
    
    def on_train_end(self, training_state):
        """
        Furthermore, tflearn will then immediately call this method after we terminate training, 
        (or when training ends regardless). This would be a good time to store any additional 
        information that tflearn doesn't store already.
        """
        print("Successfully left training! Final model accuracy:", training_state.acc_value)
       
        
# Initialize our callback with desired accuracy threshold.  
early_stopping_cb = EarlyStoppingCallback(val_acc_thresh=0.5)

Result: Train the Model and Stop Early

try:
    # Give it to our trainer and let it fit the data. 
    trainer.fit(feed_dicts={X: trainX, Y: trainY}, 
                val_feed_dicts={X: testX, Y: testY}, 
                n_epoch=1, 
                show_metric=True, # Calculate accuracy and display at every step.
                callbacks=early_stopping_cb)
except StopIteration:
    print("Caught callback exception. Returning control to user program.")

Training Step: 860  | total loss: [1m[32m1.73372[0m[0m
| Optimizer | epoch: 002 | loss: 1.73372 - acc: 0.8196 | val_loss: 1.87058 - val_acc: 0.8011 -- iter: 55000/55000
Training Step: 860  | total loss: [1m[32m1.73372[0m[0m
| Optimizer | epoch: 002 | loss: 1.73372 - acc: 0.8196 | val_loss: 1.87058 - val_acc: 0.8011 -- iter: 55000/55000
--
Terminating training at the end of epoch 2
Successfully left training! Final model accuracy: 0.8196054697036743
Caught callback exception. Returning control to user program.

Appendix

For my own reference, this is the code I started with before tinkering with the early stopping solution above.

from __future__ import division, print_function, absolute_import

import os
import sys
import tempfile
import urllib
import collections
import math

import numpy as np
import tensorflow as tf
from scipy.io import arff

import tflearn
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score
from tflearn.data_utils import shuffle, to_categorical
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.normalization import local_response_normalization, batch_normalization
from tflearn.layers.estimator import regression
import tflearn.datasets.mnist as mnist


# Load the data and handle any preprocessing here.
X, Y, testX, testY = mnist.load_data(one_hot=True)
X, Y  = shuffle(X, Y)
X     = X.reshape([-1, 28, 28, 1])
testX = testX.reshape([-1, 28, 28, 1])

# Define our network architecture: a simple 2-layer network of the form
# InputImages -> Fully Connected -> Softmax
out_readin1          = input_data(shape=[None,28,28,1])
out_fully_connected2 = fully_connected(out_readin1, 10)
out_softmax3         = fully_connected(out_fully_connected2, 10, activation='softmax')

hash='f0c188c3777519fb93f1a825ca758a0c'
scriptid='MNIST-f0c188c3777519fb93f1a825ca758a0c'

# Define our training metrics. 
network = regression(out_softmax3, 
                     optimizer='adam', 
                     learning_rate=0.01, 
                     loss='categorical_crossentropy', 
                     name='target')
model = tflearn.DNN(network, tensorboard_verbose=3)
try:
    model.fit(X, Y, n_epoch=1, validation_set=(testX, testY), 
          snapshot_epoch=False, 
          show_metric=True, 
          run_id=scriptid,callbacks=early_stopping_cb)
except StopIteration:
    print("Caught callback exception. Returning control to user program.")


prediction = model.predict(testX)
auc=roc_auc_score(testY, prediction, average='macro', sample_weight=None)
accuracy=model.evaluate(testX,testY)

print("Accuracy:", accuracy)
print("ROC AUC Score:", auc)

Training Step: 860  | total loss: [1m[32m0.30941[0m[0m
| Adam | epoch: 001 | loss: 0.30941 - acc: 0.9125 -- iter: 55000/55000
Terminating training at the end of epoch 1
Successfully left training! Final model accuracy: 0.9125033020973206
Caught callback exception. Returning control to user program.
Accuracy: [0.90410000000000001]
ROC AUC Score: 0.992379719297

参考：http://mckinziebrandon.me/TensorflowNotebooks/2016/11/19/tflearn-only.html

TFLearn

19 Nov 2016

Examples::Extending Tensorflow::Trainer

import tensorflow as tf
import tflearn
import tflearn.datasets.mnist as mnist

trainX, trainY, testX, testY = mnist.load_data(one_hot=True)

hdf5 not supported (please install/reinstall h5py)
Extracting mnist/train-images-idx3-ubyte.gz
Extracting mnist/train-labels-idx1-ubyte.gz
Extracting mnist/t10k-images-idx3-ubyte.gz
Extracting mnist/t10k-labels-idx1-ubyte.gz

Define the Architecture (Basic Tensorflow)

# Because I don't feel like retyping stuff.
def tfp(shape):
    return tf.placeholder("float", shape)
def tfrn(shape, name):
    return tf.Variable(tf.random_normal(shape), name=name)

# Define the inputs/outputs/weights as usual.
X, Y       = tfp([None, 784]), tfp([None, 10])
W1, W2, W3 = tfrn([784, 256], 'W1'), tfrn([256, 256], 'W2'), tfrn([256, 10], 'W3')
b1, b2, b3 = tfrn([256], 'b1'), tfrn([256], 'b2'), tfrn([10], 'b3')

# Multilayer perceptron.
def dnn(x):
    x = tf.tanh(tf.add(tf.matmul(x, W1), b1))
    x = tf.tanh(tf.add(tf.matmul(x, W2), b2))
    x = tf.add(tf.matmul(x, W3), b3)
    return x
net = dnn(X)

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(net, Y))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
accuracy = tf.reduce_mean(tf.cast( 
        tf.equal( tf.argmax(net, 1), tf.argmax(Y, 1) ), tf.float32), 
        name='acc')

Using a TFLearn Trainer

trainop = tflearn.TrainOp(loss=loss, optimizer=optimizer, metric=accuracy, batch_size=128)
trainer = tflearn.Trainer(train_ops=trainop, tensorboard_verbose=1)

trainer.fit({X: trainX, Y: trainY}, val_feed_dicts={X: testX, Y: testY}, 
           n_epoch=2, show_metric=True)

Training Step: 860  | total loss: [1m[32m1.73376[0m[0m
| Optimizer | epoch: 002 | loss: 1.73376 - acc: 0.8053 | val_loss: 1.78279 - val_acc: 0.8015 -- iter: 55000/55000
Training Step: 860  | total loss: [1m[32m1.73376[0m[0m
| Optimizer | epoch: 002 | loss: 1.73376 - acc: 0.8053 | val_loss: 1.78279 - val_acc: 0.8015 -- iter: 55000/55000
--

Training Callbacks

One suggestion for early stopping with tflearn (made by owner of tflearn repository) is to define a custom callback that raises an exception when we want to stop training. I’ve written a small snippet below as an example.

class EarlyStoppingCallback(tflearn.callbacks.Callback):
    def __init__(self, acc_thresh):
        """
        Args:
            acc_thresh - if our accuracy > acc_thresh, terminate training.
        """
        self.acc_thresh = acc_thresh
        self.accs = []
    
    def on_epoch_end(self, training_state):
        """ """
        self.accs.append(training_state.global_acc)
        if training_state.val_acc is not None and training_state.val_acc < self.acc_thresh:
            raise StopIteration

cb = EarlyStoppingCallback(acc_thresh=0.5)
trainer.fit({X: trainX, Y: trainY}, val_feed_dicts={X: testX, Y: testY}, 
           n_epoch=3, show_metric=True, snapshot_epoch=False,
            callbacks=cb)

Training Step: 3965  | total loss: [1m[32m0.33810[0m[0m
| Optimizer | epoch: 010 | loss: 0.33810 - acc: 0.9455 -- iter: 55000/55000
GOODBYE



---------------------------------------------------------------------------

StopIteration                             Traceback (most recent call last)

<ipython-input-24-9c383c6f5a8b> in <module>()
      2 trainer.fit({X: trainX, Y: trainY}, val_feed_dicts={X: testX, Y: testY}, 
      3            n_epoch=3, show_metric=True, snapshot_epoch=False,
----> 4             callbacks=cb)


/usr/local/lib/python3.5/dist-packages/tflearn/helpers/trainer.py in fit(self, feed_dicts, n_epoch, val_feed_dicts, show_metric, snapshot_step, snapshot_epoch, shuffle_all, dprep_dict, daug_dict, excl_trainops, run_id, callbacks)
    315 
    316                     # Epoch end
--> 317                     caller.on_epoch_end(self.training_state)
    318 
    319             finally:


/usr/local/lib/python3.5/dist-packages/tflearn/callbacks.py in on_epoch_end(self, training_state)
     67     def on_epoch_end(self, training_state):
     68         for callback in self.callbacks:
---> 69             callback.on_epoch_end(training_state)
     70 
     71     def on_train_end(self, training_state):


<ipython-input-23-d44cbdbc0814> in on_epoch_end(self, training_state)
     13         if True:
     14             print("GOODBYE")
---> 15             raise StopIteration


StopIteration:

cb.accs

[None]

参考：

Early Stopping with TensorFlow and TFLearn

20 Nov 2016

import tensorflow as tf
import tflearn
import tflearn.datasets.mnist as mnist

trainX, trainY, testX, testY = mnist.load_data(one_hot=True)

hdf5 not supported (please install/reinstall h5py)
Extracting mnist/train-images-idx3-ubyte.gz
Extracting mnist/train-labels-idx1-ubyte.gz
Extracting mnist/t10k-images-idx3-ubyte.gz
Extracting mnist/t10k-labels-idx1-ubyte.gz

n_features = 784
n_hidden = 256
n_classes = 10

# Define the inputs/outputs/weights as usual.
X = tf.placeholder("float", [None, n_features])
Y = tf.placeholder("float", [None, n_classes])

# Define the connections/weights and biases between layers.
W1 = tf.Variable(tf.random_normal([n_features, n_hidden]), name='W1')
W2 = tf.Variable(tf.random_normal([n_hidden, n_hidden]), name='W2')
W3 = tf.Variable(tf.random_normal([n_hidden, n_classes]), name='W3')

b1 = tf.Variable(tf.random_normal([n_hidden]), name='b1')
b2 = tf.Variable(tf.random_normal([n_hidden]), name='b2')
b3 = tf.Variable(tf.random_normal([n_classes]), name='b3')

# Define the operations throughout the network.
net = tf.tanh(tf.add(tf.matmul(X, W1), b1))
net = tf.tanh(tf.add(tf.matmul(net, W2), b2))
net = tf.add(tf.matmul(net, W3), b3)

# Define the optimization problem.
loss      = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(net, Y))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
accuracy  = tf.reduce_mean(tf.cast(
        tf.equal(tf.argmax(net, 1), tf.argmax(Y, 1) ), tf.float32), name='acc')

Early Stopping

Training Setup

In tflearn, we can train our model with a tflearn.Trainer object: “Generic class to handle any TensorFlow graph training. It requires the use of TrainOp to specify all optimization parameters.”

TrainOp represents a set of operation used for optimizing a network.
Example: Time to initialize our trainer to work with our MNIST network. Below we create a TrainOp object that is then used for the purpose of telling our trainer
1. Our loss function. (softmax cross entropy with logits)
2. Our optimizer. (GradientDescentOptimizer)
3. Our evaluation [tensor] metric. (classification accuracy)

trainop = tflearn.TrainOp(loss=loss, optimizer=optimizer, metric=accuracy, batch_size=128)
trainer = tflearn.Trainer(train_ops=trainop, tensorboard_verbose=1)

Callbacks

The Callbacks interface describes a set of methods that we can implement ourselves that will be called during runtime. Below are our options, where here we will be primarily concerned with the on_epoch_end() method. * __ Methods __ :

    def on_train_begin(self, training_state):
    def on_epoch_begin(self, training_state):
    def on_batch_begin(self, training_state):
    def on_sub_batch_begin(self, training_state):
    def on_sub_batch_end(self, training_state, train_index=0):
    def on_batch_end(self, training_state, snapshot=False):
    def on_epoch_end(self, training_state):
    def on_train_end(self, training_state):

TrainingState: Notice that each method requires us to pass a training_state object as an argument. These useful helpers will be able to provide us with the information we need to determine when to stop training. Below is a list of the instance variables we can access with a training_state object:
- self.epoch
- self.step
- self.current_iter
- self.acc_value
- self.loss_value
- self.val_acc
- self.val_loss
- self.best_accuracy
- self.global_acc
- self.global_loss
Implementing our Callback: Let’s say we want to stop training when the validation accuracy reaches a certain threshold. Below, we implement the code required to define such a callback and fit the MNIST data.

class EarlyStoppingCallback(tflearn.callbacks.Callback):
    def __init__(self, val_acc_thresh):
        """ Note: We are free to define our init function however we please. """
        self.val_acc_thresh = val_acc_thresh
    
    def on_epoch_end(self, training_state):
        """ """
        # Apparently this can happen.
        if training_state.val_acc is None: return
        if training_state.val_acc > self.val_acc_thresh:
            raise StopIteration

# Initializae our callback.
early_stopping_cb = EarlyStoppingCallback(val_acc_thresh=0.5)
# Give it to our trainer and let it fit the data. 
trainer.fit(feed_dicts={X: trainX, Y: trainY}, 
            val_feed_dicts={X: testX, Y: testY}, 
            n_epoch=2, 
            show_metric=True, # Calculate accuracy and display at every step.
            snapshot_epoch=False,
            callbacks=early_stopping_cb)

Training Step: 1720  | total loss: [1m[32m0.81290[0m[0m
| Optimizer | epoch: 004 | loss: 0.81290 - acc_2: 0.8854 -- iter: 55000/55000

Using tf.contrib.learn instead

Iris data loading/tutorial prep

Note: can also load via: ```python import csv import random import numpy as np from sklearn import datasets from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.33, random_state=42) iris = datasets.load_iris() print(iris.data.shape) print(“Xt”, X_train.shape, “Yt”, y_train.shape) ```

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# Suppress the massive amount of warnings.
tf.logging.set_verbosity(tf.logging.ERROR)

# Data sets
IRIS_TRAINING = "iris_training.csv"
IRIS_TEST = "iris_test.csv"

# Load datasets.
training_set = tf.contrib.learn.datasets.base.load_csv_with_header(filename=IRIS_TRAINING,
                                                       target_dtype=np.int, 
                                                        features_dtype=np.float32)
test_set = tf.contrib.learn.datasets.base.load_csv_with_header(filename=IRIS_TEST,
                                                   target_dtype=np.int, 
                                                    features_dtype=np.float32)

# Specify that all features have real-value data
feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)]

# Build 3 layer DNN with 10, 20, 10 units respectively.
classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                            hidden_units=[10, 20, 10],
                                            n_classes=3,
                                            model_dir="/tmp/iris_model")

# Fit model.
classifier.fit(x=X_train,
               y=y_train,
               steps=2000)

# Evaluate accuracy.
accuracy_score = classifier.evaluate(x=X_test, y=y_test)["accuracy"]
print('Accuracy: {0:f}'.format(accuracy_score))

# Classify two new flower samples.
new_samples = np.array([[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=np.float32)

y = classifier.predict(new_samples)
print('Predictions: {}'.format(str(y)))

Accuracy: 0.980000
Predictions: [1 1]

Validation Monitors

# Vanilla version
validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(test_set.data,
                                                                 test_set.target,
                                                                 every_n_steps=50)

classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                            hidden_units=[10, 20, 10],
                                            n_classes=3,
                                            model_dir="/tmp/iris_model",
                                            config=tf.contrib.learn.RunConfig(
                                                save_checkpoints_secs=1))

classifier.fit(x=training_set.data,
               y=training_set.target,
               steps=2000,
               monitors=[validation_monitor])

Estimator(params={'dropout': None, 'hidden_units': [10, 20, 10], 'weight_column_name': None, 'feature_columns': [_RealValuedColumn(column_name='', dimension=4, default_value=None, dtype=tf.float32, normalizer=None)], 'optimizer': 'Adagrad', 'n_classes': 3, 'activation_fn': <function relu at 0x7f8568caa598>, 'num_ps_replicas': 0, 'gradient_clip_norm': None, 'enable_centered_bias': True})

Customizing the Evaluation Metrics and Stopping Early

If we run the code below, it stops early! Warning: You’re going to see a lot of WARNING print outputs from tf. I guess this tutorial is a bit out of date. But that’s not what we care abot here, we just want that early stopping! The important output to notice is

INFO:tensorflow:Validation (step 22556): accuracy = 0.966667, global_step = 22535, loss = 0.2767
INFO:tensorflow:Stopping. Best step: 22356 with loss = 0.2758353650569916.

validation_metrics = {"accuracy": tf.contrib.metrics.streaming_accuracy,
                      "precision": tf.contrib.metrics.streaming_precision,
                      "recall": tf.contrib.metrics.streaming_recall}

validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
    test_set.data,
    test_set.target,
    every_n_steps=50,
    #metrics=validation_metrics,
    early_stopping_metric='loss',
    early_stopping_metric_minimize=True,
    early_stopping_rounds=200)

tf.logging.set_verbosity(tf.logging.ERROR)
classifier.fit(x=training_set.data,
               y=training_set.target,
               steps=2000,
               monitors=[validation_monitor])

Estimator(params={'dropout': None, 'hidden_units': [10, 20, 10], 'weight_column_name': None, 'feature_columns': [_RealValuedColumn(column_name='', dimension=4, default_value=None, dtype=tf.float32, normalizer=None)], 'optimizer': 'Adagrad', 'n_classes': 3, 'activation_fn': <function relu at 0x7f8568caa598>, 'num_ps_replicas': 0, 'gradient_clip_norm': None, 'enable_centered_bias': True})