from __future__ import absolute_import, division, print_function
# TODO: Swallow output
!pip -q install pydot_ng
#!pip -q install graphviz
#!apt install graphviz > /dev/null
import tensorflow as tf
import tensorflow.contrib.eager as tfe
import numpy as np
import matplotlib.pyplot as plt
from IPython import display
%matplotlib inline
try:
    tf.enable_eager_execution()
    print('Running in Eager mode.')
except ValueError:
    print('Already running in Eager mode')
import keras
from keras.models import Sequential
from keras.utils import np_utils
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras.datasets import cifar10
from keras import regularizers
from keras.callbacks import LearningRateScheduler
When modelling an image using a regular feed-forward network, we find that the number of model parameters grows extremely quickly, because every input pixel is connected to every neuron in the first hidden layer.
QUESTION: How many parameters would there be in a feed-forward network with 2 hidden layers consisting of 512 and 256 neurons respectively, an output size of 10 and an input image of shape [32, 32, 3]? (Note that we represent each pixel in a colour image using three real numbers for the Red, Green and Blue values, hence the 32x32x3 shape.)
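If you want to check your answer: a fully-connected layer with n_in inputs and n_out outputs has n_in * n_out weights plus n_out biases. Here is a short sketch (not part of the practical) that adds these up:
# Count the parameters of a feed-forward network: each layer contributes a
# weight matrix (n_in * n_out) plus a bias vector (n_out).
layer_sizes = [32 * 32 * 3, 512, 256, 10]  # input, two hidden layers, output
total = sum(n_in * n_out + n_out
            for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:]))
print(total)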
ConvNets address this model parameter issue by exploiting structure in the inputs to the network (in particular, by making the assumption that the input is a 3-D volume, which applies to images for example, where the third dimension consists of the three RGB channels). The two key differences between a ConvNet and a feed-forward network are:
1. Local connectivity: each neuron is connected only to a small, spatially local patch of the previous layer, rather than to every input.
2. Parameter sharing: the same filter weights are reused at every spatial position, so the number of parameters does not grow with the image's width and height.
ConvNet architectures were key to the tremendous success of deep learning in machine vision. In particular, the first deep learning model to win the ImageNet competition in 2012 was called AlexNet (after Alex Krizhevsky, one of its inventors). It had 5 convolutional layers followed by 3 fully-connected layers. Later winners included GoogLeNet and ResNet. If you're curious, have a look at this link for a great summary of different ConvNet architectures.
The hyper-parameters of a convolutional layer are as follows:
1. The number of filters, which determines the depth of the output volume.
2. The kernel (filter) size, i.e. the spatial extent of each filter (e.g. 3x3).
3. The stride, i.e. how many pixels the filter moves between applications.
4. The padding, i.e. whether the input is padded (e.g. with zeros) to control the spatial size of the output.
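To make this concrete, here is an illustrative Keras layer (not one of the practical's models) with each hyper-parameter spelled out:
# An illustrative Conv2D layer: 48 filters of size 3x3, moving 1 pixel at a
# time, with zero-padding ('same') so the output keeps the input's spatial size.
example_conv = tf.keras.layers.Conv2D(filters=48, kernel_size=(3, 3), strides=(1, 1), padding='same')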
Now that we understand convolutional, max-pooling and feed-forward layers, we can combine these as building blocks to build a ConvNet classifier for images. For this practical, we will use the colour image dataset CIFAR10 (pronounced "seefar ten") which consists of 50,000 training images and 10,000 test images. As we did in Practical 1, we take 10,000 images from the training set to form a validation set and visualise some example images.
cifar = tf.keras.datasets.cifar10
(train_images, train_labels), (test_images, test_labels) = cifar.load_data()
cifar_labels = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
# Take the last 10000 images from the training set to form a validation set
train_labels = train_labels.squeeze()
validation_images = train_images[-10000:, :, :]
validation_labels = train_labels[-10000:]
train_images = train_images[:-10000, :, :]
train_labels = train_labels[:-10000]
What are the shapes and data-types of train_images and train_labels?
print('train_images.shape = {}, data-type = {}'.format(train_images.shape, train_images.dtype))
print('train_labels.shape = {}, data-type = {}'.format(train_labels.shape, train_labels.dtype))
print('validation_images.shape = {}, data-type = {}'.format(validation_images.shape, validation_images.dtype))
print('validation_labels.shape = {}, data-type = {}'.format(validation_labels.shape, validation_labels.dtype))
Run the cell below multiple times to see various images. (They might look a bit blurry because we've blown up the small images.)
plt.figure(figsize=(10, 10))
for i in range(25):
    plt.subplot(5, 5, i + 1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    img_index = np.random.randint(0, 40000)
    plt.imshow(train_images[img_index])
    plt.xlabel(cifar_labels[train_labels[img_index]])
Finally, we build a simple convolutional architecture to classify the CIFAR images. We will build a mini version of the AlexNet architecture, which consists of 5 convolutional layers with max-pooling, followed by 3 fully-connected layers at the end. In order to investigate the effect each of these two parts has on the number of parameters, we'll build the model in two stages.
First, the convolutional layers + max-pooling:
# Define the convolutional part of the model architecture using Keras Layers.
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(filters=48, kernel_size=(3, 3), activation=tf.nn.relu, input_shape=(32, 32, 3), padding='same'),
    tf.keras.layers.MaxPooling2D(pool_size=(3, 3)),
    tf.keras.layers.Conv2D(filters=128, kernel_size=(3, 3), activation=tf.nn.relu, padding='same'),
    tf.keras.layers.MaxPooling2D(pool_size=(3, 3)),
    tf.keras.layers.Conv2D(filters=192, kernel_size=(3, 3), activation=tf.nn.relu, padding='same'),
    tf.keras.layers.Conv2D(filters=192, kernel_size=(3, 3), activation=tf.nn.relu, padding='same'),
    tf.keras.layers.Conv2D(filters=128, kernel_size=(3, 3), activation=tf.nn.relu, padding='same'),
    tf.keras.layers.MaxPooling2D(pool_size=(3, 3)),
])
How many parameters are there in the convolutional part of the architecture? We can easily inspect this using the model summary function in Keras:
model.summary()
Now we add the fully-connected part. Note that we also add "Dropout" after the first fully-connected layer. Dropout is a regularization technique which randomly zeroes out ("drops") neuron activations during training, and it was one of the key innovations of the AlexNet paper in 2012.
model.add(tf.keras.layers.Flatten()) # Flatten "squeezes" a 3-D volume down into a single vector.
model.add(tf.keras.layers.Dense(1024, activation=tf.nn.relu))
model.add(tf.keras.layers.Dropout(rate=0.5))
model.add(tf.keras.layers.Dense(1024, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(10, activation=tf.nn.softmax))
model.summary()
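To get a feel for what Dropout does, here is a small illustrative check (not part of the original practical; it assumes eager execution, as enabled above) that runs a Dropout layer in training mode:
# In training mode, Dropout zeroes each entry with probability `rate` and
# scales the surviving entries by 1 / (1 - rate) to preserve the expected sum.
drop = tf.keras.layers.Dropout(rate=0.5)
print(drop(tf.ones([1, 10]), training=True))  # roughly half zeros, the rest 2.0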
You might have wondered what values we are using for the initial values of the weights and biases in our model. The short answer is that we typically use random initialization. In this case, we have just been using the default keras initializers for each layer, which are usually sufficient.
The longer answer is that just using completely random numbers does not always work best in practice and that there are a number of common initialization schemes (which are available in most deep learning frameworks such as TensorFlow and Keras).
Let's consider a few examples:
When using the ReLU activation, it is common to initialize the biases with small positive numbers, because this encourages the ReLU activations to start off in the "on" state, which helps to counteract the dying ReLU problem.
The deeper neural networks become, the more likely it is that gradients will either shrink to the point that they vanish, or grow to the point that they overflow (the vanishing and exploding gradients problems). To help combat this, we can initialize our weights to have a (model-specific) appropriate scale. One method for doing this is called Xavier or Glorot initialization.
The Xavier initialization scheme was designed with the traditional activations Sigmoid and TanH in mind, and does not work as well for ReLU activations. An alternative is He initialization, which is a modification of Xavier initialization for ReLU activations.
This blog goes into more detail on He and Xavier initialization. The Keras documentation lists a number of common schemes.
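For instance, here is a sketch of how initializers can be set explicitly on a Keras layer (the particular choices are illustrative, not what our model above uses):
# He initialization for the weights (suited to ReLU) and a small positive
# constant for the biases, as discussed above.
dense = tf.keras.layers.Dense(1024,
                              activation=tf.nn.relu,
                              kernel_initializer=tf.keras.initializers.he_normal(),
                              bias_initializer=tf.keras.initializers.Constant(0.1))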
Let's build a flow-diagram of the model we've constructed to see how information flows between the different layers.
tf.keras.utils.plot_model(model, to_file='small_lenet.png', show_shapes=True, show_layer_names=True)
display.display(display.Image('small_lenet.png'))
In the last practical we wrote out the dataset pipeline, loss function and training loop to give you a good appreciation for how they work. This time, we use the training loop built into Keras. For simple, standard datasets like CIFAR, doing it this way works fine, but it's important to know what goes on under the hood, because you may need to write some or all of the steps out manually when working with more complex datasets!
batch_size = 128
num_epochs = 10 # The number of epochs (full passes through the data) to train for
# Compiling the model adds a loss function, optimiser and metrics to track during training
model.compile(optimizer=tf.train.AdamOptimizer(),
              loss=tf.keras.losses.sparse_categorical_crossentropy,
              metrics=['accuracy'])
# The fit function allows you to fit the compiled model to some training data
model.fit(x=train_images.astype('uint8'),
          y=train_labels.astype('uint8'),
          batch_size=batch_size,
          epochs=num_epochs,
          validation_data=(validation_images.astype('uint8'), validation_labels.astype('uint8')))
print('Training complete')
Finally, we evaluate how well the model does on the held-out test set.
metric_values = model.evaluate(x=test_images, y=test_labels)
print('Final TEST performance')
for metric_value, metric_name in zip(metric_values, model.metrics_names):
    print('{}: {}'.format(metric_name, metric_value))
Note that we achieved roughly 80% training set accuracy, but our test accuracy is only around 67%. What do you think may be the reason for this?
We now use our trained model to classify a sample of 25 images from the test set. We pass these 25 images to the model.predict function, which returns a [25, 10] dimensional matrix. The entry at position (i, j) of this matrix contains the probability that image i belongs to class j. We obtain the most-likely prediction using the np.argmax function, which returns the index of the maximum entry along the columns. Finally, we plot the result with the prediction and prediction probability labelled underneath the image and the true label on the side.
img_indices = np.random.randint(0, len(test_images), size=[25])
sample_test_images = test_images[img_indices]
sample_test_labels = [cifar_labels[i] for i in test_labels[img_indices].squeeze()]
predictions = model.predict(sample_test_images)
max_prediction = np.argmax(predictions, axis=1)
prediction_probs = np.max(predictions, axis=1)
plt.figure(figsize=(10, 10))
for i, (img, prediction, prob, true_label) in enumerate(
        zip(sample_test_images, max_prediction, prediction_probs, sample_test_labels)):
    plt.subplot(5, 5, i + 1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(img)
    plt.xlabel('{} ({:0.3f})'.format(cifar_labels[prediction], prob))
    plt.ylabel('{}'.format(true_label))
Deep neural networks are not considered to be very good at estimating the uncertainty in their predictions. However, knowing your model's uncertainty can be very important for many applications. For example, consider a deep learning tool for diagnosing diseases: in this case, a false negative could have massive impacts on a person's life! We would really like to know how confident our model is in its prediction. This is a budding field of research; see this blog for a nice introduction.
Deciding on the architecture for a CNN, i.e. the combination of convolution, pooling, dense, and other layers, can be tricky and often can seem arbitrary. On top of that, one also has to make decisions such as what kind of pooling, which activation functions, and what size of convolution to use, among other things. For new and old practitioners of deep learning, these choices can be overwhelming.
However, by examining existing successful CNN architectures we can learn a lot about what works and what doesn't. (We can even apply these existing architectures to our problems since many deep learning libraries, such as TensorFlow and Keras, have them built in and it is even possible to fine-tune pre-trained models to our specific problem using transfer learning.)
This article describes many of the most successful CNN architectures in recent years, including ResNet, Inception and VGG. For a more detailed and technical description of these models and more see these slides. Reading through these resources should give you insights into why these architectures are successful as well as best practices and current trends for CNNs that will help you design your own architectures.
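As an illustration of that last point (a sketch, assuming your environment can download the weights), tf.keras.applications provides several of these architectures pre-trained on ImageNet:
# Load ResNet50 without its classification head and freeze it, so it can be
# used as a fixed feature extractor for transfer learning.
base_model = tf.keras.applications.ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
base_model.trainable = False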
Implement BATCH NORMALISATION (Tensorflow documentation and research paper) to improve the model's generalisation.
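As a starting point, one common placement (an illustrative sketch, not the only option) is to insert batch normalisation between a convolution and its activation:
# Convolution (with no activation of its own) -> batch normalisation -> ReLU.
bn_block = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(filters=48, kernel_size=(3, 3), padding='same', input_shape=(32, 32, 3)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation(tf.nn.relu),
])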
Why do modern architectures use less max-pooling?
Here's some more information on ConvNets:
train_images.shape
def model_initializer(padding, activation, size, filters_main, other_filters, maxpooling=False):
    '''Build a Sequential ConvNet from the given hyper-parameters.

    If maxpooling is True, the model uses max-pooling layers between its
    convolutions; otherwise it uses batch normalisation and dropout instead.
    '''
    # Check whether maxpooling is set to True (if so, add pooling layers to the CNN).
    if maxpooling:
        # Build the model with max-pooling layers.
        model_ = tf.keras.models.Sequential([
            tf.keras.layers.Conv2D(filters=filters_main, kernel_size=size, activation=activation, input_shape=(32, 32, 3), padding=padding),
            tf.keras.layers.MaxPooling2D(pool_size=size),
            tf.keras.layers.Conv2D(filters=other_filters, kernel_size=size, activation=activation, padding=padding),
            tf.keras.layers.MaxPooling2D(pool_size=size),
            tf.keras.layers.Conv2D(filters=other_filters, kernel_size=size, activation=activation, padding=padding),
            tf.keras.layers.Conv2D(filters=other_filters, kernel_size=size, activation=activation, padding=padding),
            tf.keras.layers.Conv2D(filters=other_filters, kernel_size=size, activation=activation, padding=padding),
            tf.keras.layers.MaxPooling2D(pool_size=size),
            tf.keras.layers.Flatten(),  # Flatten "squeezes" a 3-D volume down into a single vector.
            tf.keras.layers.Dense(10, activation=tf.nn.softmax),  # Classifier head: map the features to the 10 class probabilities.
        ])
    else:
        # Otherwise, build the model without max-pooling layers.
        model_ = tf.keras.models.Sequential([
            tf.keras.layers.Conv2D(filters=filters_main, kernel_size=size, activation=activation, input_shape=(32, 32, 3), padding=padding),
            # Add batch normalisation layers, which often speed up training and can improve accuracy.
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Conv2D(filters=other_filters, kernel_size=size, activation=activation, padding=padding),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(rate=0.5),
            tf.keras.layers.Flatten(),  # Flatten "squeezes" a 3-D volume down into a single vector.
            tf.keras.layers.Dense(10, activation=tf.nn.softmax),
        ])
    return model_
# Create a function to visualise the model.
def visualize_model(model):
    # Render the model architecture to an image file, then display it inline.
    tf.keras.utils.plot_model(model, to_file='small_lenet.png', show_shapes=True, show_layer_names=True)
    display.display(display.Image('small_lenet.png'))
# Initialise the parameters explicitly to avoid unnecessary mistakes.
size = (1, 1)
filters_main = 5
padding = 'valid'
activation = 'relu'
other_filters = 192
model_ = model_initializer(padding, activation, size, filters_main, other_filters, maxpooling=True)
visualize_model(model_)
# Inspect the layers and parameter counts through the summary method.
model_.summary()
batch_size = 128
num_epochs = 10 # The number of epochs (full passes through the data) to train for
# Compiling the model adds a loss function, optimiser and metrics to track during training
model_.compile(optimizer=tf.train.AdamOptimizer(),
               loss=tf.keras.losses.sparse_categorical_crossentropy,
               metrics=['accuracy'])
# The fit function allows you to fit the compiled model to some training data
model_.fit(x=train_images.astype('uint8'),
           y=train_labels.astype('uint8'),
           batch_size=batch_size,
           epochs=num_epochs,
           validation_data=(validation_images.astype('uint8'), validation_labels.astype('uint8')))
print('Training complete')
# Reset the parameters and turn off max-pooling.
size = (3, 3)
filters_main = 128
padding = 'valid'
activation = 'relu'
other_filters = 192
model_1 = model_initializer(padding, activation, size, filters_main, other_filters, maxpooling=False)
# Plot the model using the visualize_model function created earlier.
visualize_model(model_1)
# Inspect the model: no pooling layers this time.
model_1.summary()
batch_size = 125  # The number of examples in each training batch
num_epochs = 10 # The number of epochs (full passes through the data) to train for
# Compiling the model adds a loss function, optimiser and metrics to track during training
model_1.compile(optimizer=tf.train.AdamOptimizer(),
                loss=tf.keras.losses.sparse_categorical_crossentropy,
                metrics=['accuracy'])
# Now let's train the model
model_1.fit(x=train_images,
            y=train_labels,
            batch_size=batch_size,
            epochs=num_epochs,
            validation_data=(validation_images, validation_labels))
print('Training complete')
It's learning, just a bit slowly.
# Evaluate the model using its evaluate method.
metric_values = model_1.evaluate(x=test_images, y=test_labels)
# Display the final results.
print('Final TEST performance')
for metric_value, metric_name in zip(metric_values, model_1.metrics_names):
    print('{}: {}'.format(metric_name, metric_value))
# Collect some random samples from the test set.
img_indices = np.random.randint(0, len(test_images), size=[25])
sample_test_images = test_images[img_indices]
sample_test_labels = [cifar_labels[i] for i in test_labels[img_indices].squeeze()]
# Run the model on the collected samples.
predictions = model_1.predict(sample_test_images)
max_prediction = np.argmax(predictions, axis=1)
prediction_probs = np.max(predictions, axis=1)
# Display the predicted label (with its probability) against the true label.
plt.figure(figsize=(10, 10))
for i, (img, prediction, prob, true_label) in enumerate(
        zip(sample_test_images, max_prediction, prediction_probs, sample_test_labels)):
    plt.subplot(5, 5, i + 1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(img)
    plt.xlabel('{} ({:0.3f})'.format(cifar_labels[prediction], prob))
    plt.ylabel('{}'.format(true_label))
Because max-pooling summarises a neighbourhood by keeping only its maximum value, it discards information (such as the precise position of a feature) that could be useful to later layers.
Also, as the author of the article linked below concludes, pooling works, but not well enough that you don't need something else; and if you have that something else, you don't need pooling. Modern architectures therefore often downsample with strided convolutions instead.
https://principlesofdeeplearning.com/index.php/2018/08/27/is-pooling-dead-in-convolutional-networks/
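For example, here is an illustrative sketch (not from the practical) of downsampling with a strided convolution instead of max-pooling:
# A stride of 2 halves the spatial dimensions, like 2x2 pooling, but with
# learnable weights instead of a fixed max operation.
downsample = tf.keras.layers.Conv2D(filters=128, kernel_size=(3, 3), strides=(2, 2), padding='same')
print(downsample(tf.zeros([1, 32, 32, 48])).shape)  # (1, 16, 16, 128)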
A 1x1 convolution simply maps an input pixel, with all its channels, to an output pixel, without looking at anything around it. It is often used to reduce the number of depth channels, since it is often very slow to multiply volumes with extremely large depths. https://stats.stackexchange.com/questions/194142/what-does-1x1-convolution-mean-in-a-neural-network
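A quick illustrative shape-check (assuming eager execution, as enabled above):
# A 1x1 convolution leaves the spatial size unchanged but reduces the depth
# from 192 channels to 32.
reduce_depth = tf.keras.layers.Conv2D(filters=32, kernel_size=(1, 1))
print(reduce_depth(tf.zeros([1, 8, 8, 192])).shape)  # (1, 8, 8, 32)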
Simply put: a residual (skip) connection adds the output of an earlier layer directly to the output of a later layer.
As a simple equation: a standard connection computes y = f(x).
A residual connection instead computes y = f(x) + x, where x is the input to the block f.
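Here is a minimal sketch of a residual block using the Keras functional API (the shapes and layer choices are illustrative):
# y = f(x) + x, where f is two convolutions; the add is the skip connection.
inputs = tf.keras.layers.Input(shape=(32, 32, 64))
f_x = tf.keras.layers.Conv2D(64, (3, 3), padding='same', activation=tf.nn.relu)(inputs)
f_x = tf.keras.layers.Conv2D(64, (3, 3), padding='same')(f_x)
outputs = tf.keras.layers.Activation(tf.nn.relu)(tf.keras.layers.add([f_x, inputs]))
residual_block = tf.keras.models.Model(inputs=inputs, outputs=outputs)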