Keras - IMDB Sentiment Analysis with Keras and TensorFlow
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 6 08:49:48 2019

@author: onno
"""

# IMPORT MODULES
# TURN ON the GPU !!!
# If importing dataset from outside - like this IMDB - Internet must be "connected"
import os
from operator import itemgetter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
get_ipython().magic(u'matplotlib inline')
plt.style.use('ggplot')

import tensorflow as tf
from keras import models, regularizers, layers, optimizers, losses, metrics
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils, to_categorical
from keras.datasets import imdb

print(os.getcwd())
print("Modules imported \n")
print("Files in current directory:")
from subprocess import check_output
print(check_output(["ls", "./input"]).decode("utf8"))  # check the files available in the directory

# LOAD IMDB DATA
# (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)
import numpy as np

# save np.load
np_load_old = np.load

# modify the default parameters of np.load
np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)

# call load_data with allow_pickle implicitly set to true
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

# restore np.load for future normal usage
np.load = np_load_old

print("train_data ", train_data.shape)
print("train_labels ", train_labels.shape)
print("_"*100)
print("test_data ", test_data.shape)
print("test_labels ", test_labels.shape)
print("_"*100)
print("Maximum value of a word index ")
print(max([max(sequence) for sequence in train_data]))
print("Maximum length num words of review in train ")
print(max([len(sequence) for sequence in train_data]))

# See an actual review in words
# Reverse from integers to words using the DICTIONARY (given by keras...need to do nothing to create it)
word_index = imdb.get_word_index()
reverse_word_index = dict(
    [(value, key) for (key, value) in word_index.items()])
decoded_review = ' '.join(
    [reverse_word_index.get(i - 3, '?') for i in train_data[123]])
print(decoded_review)

# VECTORIZE as one cannot feed integers into a NN
# Encoding the integer sequences into a binary matrix - one hot encoder basically
# From integers representing words, at various lengths - to a normalized one hot encoded tensor (matrix) of 10k columns
def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results

x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
print("x_train ", x_train.shape)
print("x_test ", x_test.shape)

# VECTORIZE the labels too - NO INTEGERS only floats into a tensor...(rare exceptions)
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')
print("y_train ", y_train.shape)
print("y_test ", y_test.shape)

# Set a VALIDATION set
x_val = x_train[:10000]
partial_x_train = x_train[10000:]
y_val = y_train[:10000]
partial_y_train = y_train[10000:]
print("x_val ", x_val.shape)
print("partial_x_train ", partial_x_train.shape)
print("y_val ", y_val.shape)
print("partial_y_train ", partial_y_train.shape)

# NN MODEL
# Use of DROPOUT
model = models.Sequential()
model.add(layers.Dense(16, kernel_regularizer=regularizers.l1(0.001),
                       activation='relu', input_shape=(10000,)))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(16, kernel_regularizer=regularizers.l1(0.001), activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation='sigmoid'))

# Use of REGULARIZATION
#model = models.Sequential()
#model.add(layers.Dense(16, kernel_regularizer=regularizers.l1_l2(l1=0.001, l2=0.001), activation='relu', input_shape=(10000,)))
#model.add(layers.Dense(16, kernel_regularizer=regularizers.l1_l2(l1=0.001, l2=0.001), activation='relu'))
#model.add(layers.Dense(1, activation='sigmoid'))

# REGULARIZERS L1 L2
#regularizers.l1(0.001)
#regularizers.l2(0.001)
#regularizers.l1_l2(l1=0.001, l2=0.001)

# OPTIMIZERS
#model.compile(optimizer=optimizers.RMSprop(lr=0.001), loss=losses.binary_crossentropy, metrics=[metrics.binary_accuracy])
#model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# FIT / TRAIN model
NumEpochs = 10
BatchSize = 512

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(partial_x_train, partial_y_train,
                    epochs=NumEpochs, batch_size=BatchSize,
                    validation_data=(x_val, y_val))

results = model.evaluate(x_test, y_test)
print("_"*100)
print("Test Loss and Accuracy")
print("results ", results)

history_dict = history.history
history_dict.keys()

# VALIDATION LOSS curves
plt.clf()
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
epochs = range(1, (len(history_dict['loss']) + 1))
plt.plot(epochs, loss_values, 'bo', label='Training loss')
plt.plot(epochs, val_loss_values, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# VALIDATION ACCURACY curves
plt.clf()
acc_values = history_dict['acc']
val_acc_values = history_dict['val_acc']
epochs = range(1, (len(history_dict['acc']) + 1))
plt.plot(epochs, acc_values, 'bo', label='Training acc')
plt.plot(epochs, val_acc_values, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# PREDICT
model.predict(x_test)
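The final model.predict(x_test) call returns raw sigmoid probabilities that the script does not use further. A minimal follow-up sketch, assuming the script above has already been run in the same session (it reuses model, x_test, test_data, y_test and reverse_word_index) and assuming a 0.5 decision threshold for the sigmoid output, showing one way to turn those probabilities into labels and inspect a single decoded review:

# Hedged sketch: relies on variables defined by the script above
# (model, x_test, test_data, y_test, reverse_word_index);
# the 0.5 threshold on the sigmoid output is an assumption.
probs = model.predict(x_test)                         # shape (25000, 1), values in [0, 1]
pred_labels = (probs > 0.5).astype('int32').ravel()   # 1 = positive, 0 = negative

# Overall agreement with the true test labels
accuracy = np.mean(pred_labels == y_test.astype('int32'))
print("Thresholded test accuracy:", accuracy)

# Decode one test review back to words (indices are offset by 3, as above)
sample_idx = 0
decoded = ' '.join(reverse_word_index.get(i - 3, '?') for i in test_data[sample_idx])
print("Predicted:", pred_labels[sample_idx], "| Actual:", int(y_test[sample_idx]))
print(decoded[:300], "...")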