# Importing tools
from keras.datasets import imdb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D
For this task we will be using the Large Movie Review Dataset. This dataset contains 50,000 reviews from IMDb, split evenly between positive and negative reviews. A negative review has a score ≤ 4 out of 10, and a positive review has a score ≥ 7 out of 10; neutral reviews are not included. The dataset is divided into training and test sets of 25,000 labeled reviews each. Conveniently, Keras ships this dataset built in, so we can load it directly.
# load the data, keeping only the 5,000 most frequent words
data = imdb.load_data(num_words=5000)
UNPACK THE TRAIN AND TEST SPLITS
(X_train, Y_train), (X_test, Y_test) = data
DISPLAY A SAMPLE
print('========= Review =========')
print(X_train[6])
print('========= Label ==========')
print(Y_train[6])
The data has already been pre-processed: every word in a review has been mapped to an integer that reflects the word's frequency rank. The integer 0 is reserved for padding, 1 for the start marker, and 2 for unknown words, and the ranks themselves are offset by 3, so 4 represents the most frequent word, 5 the second most frequent, and so on. The label is also an integer (0 for negative, 1 for positive).
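To make this concrete, here is a minimal sketch that decodes our sample review back into words. It assumes the default Keras offsets described above; imdb.get_word_index() returns the raw word-to-rank mapping.

# decode a review: get_word_index() maps each word to its frequency rank,
# and the dataset shifts every rank by 3 to make room for the reserved indices
word_index = imdb.get_word_index()
reverse_index = {rank + 3: word for word, rank in word_index.items()}
reverse_index.update({0: '<pad>', 1: '<start>', 2: '<unk>'})
print(' '.join(reverse_index.get(i, '<unk>') for i in X_train[6]))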
In order to feed this data into our network, all input documents must have the same length. Since the reviews vary heavily in length, we will truncate each review to 500 words (by default, pad_sequences drops words from the beginning of longer reviews) and pad shorter reviews with zeros. Keras offers a set of preprocessing routines that can easily do this for us: the pad_sequences function.
X_train = sequence.pad_sequences(X_train, maxlen=500)  # pad/truncate every review to exactly 500 integers
X_test = sequence.pad_sequences(X_test, maxlen=500)    # using pad_sequences from keras.preprocessing.sequence
print(X_train.shape)  # check the shapes: (25000, 500)
print(X_test.shape)   # (25000, 500)
MODEL PARAMETERS
input_dim = 5000: the size of the vocabulary, matching num_words above
output_dim = 32: the dimension of the embedding vectors; we use 32 for this task, but other values are worth trying
input_length = 500: the length of each input sequence after padding
model = Sequential()
model.add(Embedding(5000, 32, input_length=500))  # learn a 32-dimensional vector per word
model.add(Conv1D(128, 3, padding='valid', activation='relu', strides=1))  # 128 filters over 3-word windows
model.add(GlobalMaxPooling1D())  # keep the strongest response of each filter
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # probability that the review is positive
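A quick sanity check before training: model.summary() prints each layer with its output shape and parameter count.

model.summary()  # inspect layer shapes and parameter counts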
We will be tuning the parameters below to get better results from our model (it's a pity that I can't show you all the trouble that I went through).
PARAMETERS
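One aid for this kind of experimentation (my own suggestion, not something the run below uses) is Keras's EarlyStopping callback, which halts training once the validation metric stops improving; it would be passed to model.fit via callbacks=[early_stop].

from keras.callbacks import EarlyStopping
# stop training if validation accuracy has not improved for 2 epochs
early_stop = EarlyStopping(monitor='val_acc', patience=2)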
# compile the model and set the training parameters
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
hist = model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=10, batch_size=64)
print(hist.history['acc'])      # training accuracy per epoch
print(hist.history['val_acc'])  # validation accuracy per epoch
# plot the accuracies at each epoch
train_accuracies = hist.history['acc']
val_accuracies = hist.history['val_acc']
plt.figure()
plt.plot(train_accuracies, c='r', label='acc')
plt.plot(val_accuracies, c='b', label='val_acc')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()
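The history object also records the loss at each epoch under the 'loss' and 'val_loss' keys; plotting those the same way is a quick extra check for overfitting.

plt.figure()
plt.plot(hist.history['loss'], c='r', label='loss')
plt.plot(hist.history['val_loss'], c='b', label='val_loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()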
# evaluate the model on the test set; not too bad
scores = model.evaluate(X_test, Y_test)
print('Test accuracy:', scores[1])
prediction = model.predict(X_test)
y_pred = (prediction > 0.5)  # threshold the sigmoid outputs at 0.5
from sklearn.metrics import f1_score, confusion_matrix
print('F1-score: {0}'.format(f1_score(Y_test, y_pred)))  # true labels first, then predictions
print('Confusion matrix:')
print(confusion_matrix(Y_test, y_pred))
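As a final illustration (a sketch of my own, not part of the original run), we can score a brand-new review. The hypothetical encode_review helper mirrors the dataset's encoding: frequency rank plus the offset of 3, unknown or out-of-vocabulary words mapped to 2, a start marker of 1, and zero-padding to 500 tokens.

# encode_review is a hypothetical helper reproducing the dataset's encoding
word_index = imdb.get_word_index()

def encode_review(text, num_words=5000, maxlen=500):
    ids = [1]  # start marker
    for w in text.lower().split():
        rank = word_index.get(w)
        # keep words inside the 5,000-word vocabulary; map the rest to <unk> (2)
        ids.append(rank + 3 if rank is not None and rank + 3 < num_words else 2)
    return sequence.pad_sequences([ids], maxlen=maxlen)

sample = encode_review('this movie was a wonderful surprise')
print(model.predict(sample))  # values near 1 suggest a positive review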