Source code for bigdl.examples.keras.imdb_cnn_lstm
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# IMDB sentiment classification using a recurrent convolutional network on BigDL
# Reference: https://github.com/fchollet/keras/blob/1.2.2/examples/imdb_cnn_lstm.py
# The Keras version we support and test is Keras 1.2.2 with the TensorFlow backend.
# See README.md for how to run this example.
from optparse import OptionParser
import sys


def load_imdb():
    """
    Load the IMDB dataset via Keras and pad each review to a length of 100 words.
    The numpy arrays returned here are converted to an RDD of Sample in the main program.
    """
    from keras.preprocessing import sequence
    from keras.datasets import imdb
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=20000)
    X_train = sequence.pad_sequences(X_train, maxlen=100)
    X_test = sequence.pad_sequences(X_test, maxlen=100)
    return X_train, y_train, X_test, y_test


def build_keras_model():
    """
    Define a recurrent convolutional model in Keras 1.2.2
    """
    from keras.models import Sequential
    from keras.layers import Dense, Dropout, Activation
    from keras.layers import Embedding
    from keras.layers import LSTM
    from keras.layers import Convolution1D, MaxPooling1D

    keras_model = Sequential()
    keras_model.add(Embedding(20000, 128, input_length=100))
    keras_model.add(Dropout(0.25))
    keras_model.add(Convolution1D(nb_filter=64,
                                  filter_length=5,
                                  border_mode='valid',
                                  activation='relu',
                                  subsample_length=1))
    keras_model.add(MaxPooling1D(pool_length=4))
    keras_model.add(LSTM(70))
    keras_model.add(Dense(1))
    keras_model.add(Activation('sigmoid'))
    return keras_model


if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("-b", "--batchSize", type=int, dest="batchSize", default="32")
    parser.add_option("-m", "--max_epoch", type=int, dest="max_epoch", default="2")
    (options, args) = parser.parse_args(sys.argv)

    keras_model = build_keras_model()
    hdf5_path = "/tmp/imdb.h5"
    keras_model.save(hdf5_path)
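    # The Keras model saved above is converted below: BigDL reads the HDF5 file
    # and rebuilds an equivalent BigDL model with the same architecture and weights.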

    from bigdl.util.common import *
    from bigdl.nn.layer import *
    from bigdl.optim.optimizer import *
    from bigdl.nn.criterion import *

    # Load the saved HDF5 file (architecture and weights) into a BigDL model
    bigdl_model = Model.load_keras(hdf5_path=hdf5_path)

    # Create the SparkContext and initialize the BigDL engine
    sc = get_spark_context(conf=create_spark_conf())
    redire_spark_logs()
    show_bigdl_info_logs()
    init_engine()

    # Convert the numpy arrays into RDDs of Sample for distributed training
    X_train, y_train, X_test, y_test = load_imdb()
    train_data = to_sample_rdd(X_train, y_train)
    test_data = to_sample_rdd(X_test, y_test)
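
    # Train the converted model distributedly: binary cross entropy loss
    # (BCECriterion) with the Adam update rule, for max_epoch epochs.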
    optimizer = Optimizer(
        model=bigdl_model,
        training_rdd=train_data,
        criterion=BCECriterion(),
        optim_method=Adam(),
        end_trigger=MaxEpoch(options.max_epoch),
        batch_size=options.batchSize)

    # Evaluate Top1Accuracy on the held-out test set at the end of every epoch
    optimizer.set_validation(
        batch_size=options.batchSize,
        val_rdd=test_data,
        trigger=EveryEpoch(),
        val_method=[Top1Accuracy()])

    optimizer.optimize()
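    # Note: in the BigDL Python API this example targets, optimize() also returns
    # the trained model, so it could be captured as e.g.
    #     trained_model = optimizer.optimize()
    # if further evaluation or prediction on the driver is needed.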