4.2 - Convolutional Neural Networks#

The course material requires a TensorFlow version lower than the default one used in Google Colab. Run the following cell to downgrade TensorFlow accordingly.

!wget -nc --no-cache -O init.py -q https://raw.githubusercontent.com/rramosp/2021.deeplearning/main/content/init.py
import init; init.init(force_download=False);
/content/init.py:2: SyntaxWarning: invalid escape sequence '\S'
  course_id = '\S*deeplearning\S*'
replicating local resources
import tensorflow as tf
from time import time
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from local.lib import mlutils
%matplotlib inline

Image analytics tasks#

from IPython.display import Image
Image(filename='local/imgs/imgs_tasks.jpeg', width=800)

Explore COCO Dataset#


Convolutional Neural Networks#

see video series Tensorflow and Deep Learning without a PhD

see convolutions summary | filter activation demo | confusion matrix

see The 9 Deep Learning Papers You Should Know

RECOMMENDATION#

First level filters and activations maps#

the filters in the middle are applied to the image on the left. Observe, for instance, in which parts of the image the seventh filter of the first row activates (the second-to-last one in that row).

Image(filename='local/imgs/cnn_swan.png', width=800)
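as a minimal sketch of how such an activation map arises (using a hypothetical hand-crafted vertical-edge kernel and a random placeholder image, not the course's learned filters), we can convolve a single filter over an image and see where it fires:

# hedged sketch: one hand-crafted 3x3 vertical-edge kernel applied to a placeholder image
img = np.random.rand(1, 32, 32, 1).astype("float32")          # batch of one grayscale image
kernel = np.array([[-1., 0., 1.],
                   [-2., 0., 2.],
                   [-1., 0., 1.]], dtype="float32").reshape(3, 3, 1, 1)
activation = tf.nn.relu(tf.nn.conv2d(img, kernel, strides=1, padding="SAME"))
plt.imshow(activation.numpy()[0, :, :, 0], cmap=plt.cm.Greys_r)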

Hierarchy of filters and activation maps#

Image(filename='local/imgs/cnn_features.png', width=600)
Image(filename='local/imgs/conv1.jpg', width=800)
Image(filename='local/imgs/conv2.jpg', width=800)

other examples of first-level filters

Image(filename='local/imgs/cnn_features2.png', width=600)

We have a small image dataset based on CIFAR-10, where each image is 32x32x3.

!wget -nc https://s3.amazonaws.com/rlx/mini_cifar.h5
--2025-09-03 22:13:45--  https://s3.amazonaws.com/rlx/mini_cifar.h5
Resolving s3.amazonaws.com (s3.amazonaws.com)... 3.5.21.44, 52.217.172.48, 3.5.24.43, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|3.5.21.44|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14803609 (14M) [binary/octet-stream]
Saving to: ‘mini_cifar.h5’

mini_cifar.h5       100%[===================>]  14.12M  66.7MB/s    in 0.2s    

2025-09-03 22:13:45 (66.7 MB/s) - ‘mini_cifar.h5’ saved [14803609/14803609]
import h5py
with h5py.File('mini_cifar.h5','r') as h5f:
    x_cifar = h5f["x"][:]
    y_cifar = h5f["y"][:]
mlutils.show_labeled_image_mosaic(x_cifar, y_cifar)
../_images/5d89ad64f72c0076c5272adb8397bd44c2d1a850b9c5f3728c35c4acf2b9c437.png
print (np.min(x_cifar), np.max(x_cifar))
0.0 1.0
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_cifar, y_cifar, test_size=.25)
print (x_train.shape, y_train.shape, x_test.shape, y_test.shape)
print ("\ndistribution of train classes")
print (pd.Series(y_train).value_counts())
print ("\ndistribution of test classes")
print (pd.Series(y_test).value_counts())
(2253, 32, 32, 3) (2253,) (751, 32, 32, 3) (751,)

distribution of train classes
2    788
0    750
1    715
Name: count, dtype: int64

distribution of test classes
1    259
0    255
2    237
Name: count, dtype: int64
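the split above is purely random, so the class counts can drift a little between train and test; if exact proportions matter, train_test_split accepts a stratify argument (an optional variant, not used in the rest of this notebook):

# optional: preserve class proportions in both splits
x_train, x_test, y_train, y_test = train_test_split(x_cifar, y_cifar, test_size=.25, stratify=y_cifar)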

build a Keras model

def get_conv_model_A(num_classes, img_size=32, compile=True):
    tf.keras.backend.clear_session()
    print ("using",num_classes,"classes")
    inputs = tf.keras.Input(shape=(img_size,img_size,3), name="input_1")
    layers = tf.keras.layers.Conv2D(15,(3,3), activation="relu", padding="SAME")(inputs)
    layers = tf.keras.layers.Flatten()(layers)
    layers = tf.keras.layers.Dense(16, activation=tf.nn.relu)(layers)
    layers = tf.keras.layers.Dropout(0.2)(layers)
    predictions = tf.keras.layers.Dense(num_classes, activation=tf.nn.softmax, name="output_1")(layers)
    model = tf.keras.Model(inputs = inputs, outputs=predictions)
    if compile:
        model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])
    return model
num_classes = len(np.unique(y_cifar))
model = get_conv_model_A(num_classes)
using 3 classes

observe the initialized weights and their shapes

weights = model.get_weights()
for i in weights:
    print (i.shape)
(3, 3, 3, 15)
(15,)
(15360, 16)
(16,)
(16, 3)
(3,)
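each pair of shapes is a kernel followed by its bias: (3, 3, 3, 15) is a 3x3 kernel over 3 input channels producing 15 filters, and (15360, 16) is the dense layer acting on the flattened 32*32*15 = 15360 activations. A quick sketch to check the parameter counts that will appear in the model summary below:

# params per layer = prod(kernel shape) + bias size -> 420, 245776, 51
for kernel, bias in zip(weights[0::2], weights[1::2]):
    print(kernel.shape, "->", np.prod(kernel.shape) + bias.shape[0], "params")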

we keep a copy of the first-layer filters so we can later compare them with the same filters after training.

initial_w0 = model.get_weights()[0].copy()
y_test.shape, y_train.shape, x_test.shape, x_train.shape
((751,), (2253,), (751, 32, 32, 3), (2253, 32, 32, 3))
num_classes = len(np.unique(y_cifar))

def train(model, batch_size, epochs, model_name=""):
    # log each run under a unique timestamped folder so runs can be compared in TensorBoard
    tensorboard = tf.keras.callbacks.TensorBoard(log_dir="logs/"+model_name+"_"+"{}".format(time()))
    model.fit(x_train, y_train, epochs=epochs, callbacks=[tensorboard],
              batch_size=batch_size,
              validation_data=(x_test, y_test))
    # evaluate on the test set and return {metric name: value}
    metrics = model.evaluate(x_test, y_test)
    return {k: v for k, v in zip(model.metrics_names, metrics)}

observe the shapes of the model weights obtained above and try to see how they relate to the output shapes and the number of parameters in the summary below

model = get_conv_model_A(num_classes)
model.summary()
using 3 classes
Model: "functional"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                     Output Shape                  Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_1 (InputLayer)            │ (None, 32, 32, 3)      │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv2d (Conv2D)                 │ (None, 32, 32, 15)     │           420 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ flatten (Flatten)               │ (None, 15360)          │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense (Dense)                   │ (None, 16)             │       245,776 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout (Dropout)               │ (None, 16)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ output_1 (Dense)                │ (None, 3)              │            51 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 246,247 (961.90 KB)
 Trainable params: 246,247 (961.90 KB)
 Non-trainable params: 0 (0.00 B)
train(model, batch_size=32, epochs=10, model_name="model_A")
Epoch 1/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 4s 33ms/step - accuracy: 0.4473 - loss: 1.2124 - val_accuracy: 0.6405 - val_loss: 0.8304
Epoch 2/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 19ms/step - accuracy: 0.6213 - loss: 0.8343 - val_accuracy: 0.6578 - val_loss: 0.7908
Epoch 3/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 3s 19ms/step - accuracy: 0.6574 - loss: 0.7616 - val_accuracy: 0.6897 - val_loss: 0.7229
Epoch 4/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 17ms/step - accuracy: 0.6527 - loss: 0.7387 - val_accuracy: 0.7031 - val_loss: 0.7145
Epoch 5/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 17ms/step - accuracy: 0.6950 - loss: 0.6999 - val_accuracy: 0.6937 - val_loss: 0.7106
Epoch 6/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 17ms/step - accuracy: 0.7206 - loss: 0.6592 - val_accuracy: 0.6897 - val_loss: 0.7089
Epoch 7/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 17ms/step - accuracy: 0.7170 - loss: 0.6470 - val_accuracy: 0.7044 - val_loss: 0.6813
Epoch 8/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 19ms/step - accuracy: 0.7434 - loss: 0.5900 - val_accuracy: 0.7044 - val_loss: 0.6998
Epoch 9/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 3s 21ms/step - accuracy: 0.7374 - loss: 0.5816 - val_accuracy: 0.7403 - val_loss: 0.6335
Epoch 10/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 1s 19ms/step - accuracy: 0.7540 - loss: 0.5684 - val_accuracy: 0.7350 - val_loss: 0.6252
24/24 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.7377 - loss: 0.6379
{'loss': 0.6252090930938721, 'compile_metrics': 0.7350199818611145}
test_preds = model.predict(x_test).argmax(axis=1)
mlutils.plot_confusion_matrix(y_test, test_preds, classes=np.r_[0,1,2], normalize=True)
24/24 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step
Normalized confusion matrix
[[0.56078431 0.14509804 0.29411765]
 [0.05019305 0.86872587 0.08108108]
 [0.11392405 0.10970464 0.77637131]]
<Axes: title={'center': 'Normalized confusion matrix'}, xlabel='Predicted label', ylabel='True label'>
../_images/f7b067f19f2a89b63a24aa14b35d50799b48f07c99e37dbecc4af8503d9b821c.png
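mlutils.plot_confusion_matrix is a course helper; the same numbers can be reproduced directly with sklearn (a hedged equivalent, since the helper's internals are not shown here):

# row-normalized confusion matrix: each row shows how a true class spreads over predictions
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, test_preds)
print(cm / cm.sum(axis=1, keepdims=True))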

observe the output in TensorBoard

tensorboard --logdir logs
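in Colab or Jupyter the same dashboard can also be opened inline with the TensorBoard notebook extension:

%load_ext tensorboard
%tensorboard --logdir logs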

first-layer filters before training

mlutils.display_imgs(initial_w0)
../_images/998b61adc771b60f97e71e2eb3d27dd442ab08321bdd3d7226f2e34186eb514e.png

and after training

w0 = model.get_weights()[0]
print (w0.shape)
mlutils.display_imgs(w0)
(3, 3, 3, 15)
../_images/1f298fddcd540da43116d6ba7503a8d30e9b1835201733d40a531506daf6e0fe.png
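since we kept initial_w0, a small sketch to quantify how much training actually moved the first-layer filters:

# mean absolute change per filter between initialization and after training
delta = np.abs(w0 - initial_w0).mean(axis=(0, 1, 2))
print("mean absolute change per filter:", np.round(delta, 3))
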
idxs = np.random.permutation(len(x_test))[:5]
preds = model.predict(x_test[idxs])
mlutils.show_preds(x_test[idxs],y_test[idxs], preds)
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 53ms/step
../_images/8cbd7e6ac14a310cb0c09bf00159aeeb24e28ef773c5583e4c7015d3387379f4.png ../_images/39c13266e36407a351e21e4498752748da690ff431753fb902fb1c665913de2c.png ../_images/4329c37816b5ca21cfb04802a1d94717c6c6af299b589fa02d6f740a83c9b1fe.png ../_images/97d54e8bd85bf10079b8b45d1d147e8a7523d5157937f4fe29497e48040e4761.png ../_images/d7aca469f46f30c7bc1b7baa0afaa2f9b40b4ab195843a31beee678d653844a1.png

Let’s try a more complex network#

def get_conv_model_B(num_classes, img_size=32, compile=True):
    tf.keras.backend.clear_session()
    print ("using",num_classes,"classes")
    inputs = tf.keras.Input(shape=(img_size,img_size,3), name="input_1")
    layers = tf.keras.layers.Conv2D(15,(5,5), activation="relu")(inputs)
    layers = tf.keras.layers.MaxPool2D((2,2))(layers)
    layers = tf.keras.layers.Conv2D(60,(5,5), activation="relu")(layers)
    layers = tf.keras.layers.Flatten()(layers)
    layers = tf.keras.layers.Dense(16, activation=tf.nn.relu)(layers)
    layers = tf.keras.layers.Dropout(0.2)(layers)
    predictions = tf.keras.layers.Dense(num_classes, activation=tf.nn.softmax, name="output_1")(layers)
    model = tf.keras.Model(inputs = inputs, outputs=predictions)
    if compile:
        model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])
    return model
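with no padding (the Keras default, padding="valid") a convolution shrinks each spatial dimension to i - k + 1, and a 2x2 max-pool halves it; a quick sanity check of the shapes that the summary below reports:

# spatial sizes through model B (valid padding: o = i - k + 1)
size = 32
size = size - 5 + 1    # conv 5x5    -> 28
size = size // 2       # maxpool 2x2 -> 14
size = size - 5 + 1    # conv 5x5    -> 10
print("flatten size:", size * size * 60)   # 10*10*60 = 6000
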
model = get_conv_model_B(num_classes)
model.summary()
train(model, batch_size=32, epochs=10, model_name="model_B")
using 3 classes
Model: "functional"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                     Output Shape                  Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_1 (InputLayer)            │ (None, 32, 32, 3)      │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv2d (Conv2D)                 │ (None, 28, 28, 15)     │         1,140 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ max_pooling2d (MaxPooling2D)    │ (None, 14, 14, 15)     │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv2d_1 (Conv2D)               │ (None, 10, 10, 60)     │        22,560 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ flatten (Flatten)               │ (None, 6000)           │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense (Dense)                   │ (None, 16)             │        96,016 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout (Dropout)               │ (None, 16)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ output_1 (Dense)                │ (None, 3)              │            51 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 119,767 (467.84 KB)
 Trainable params: 119,767 (467.84 KB)
 Non-trainable params: 0 (0.00 B)
Epoch 1/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 4s 37ms/step - accuracy: 0.4160 - loss: 1.0678 - val_accuracy: 0.5885 - val_loss: 0.9273
Epoch 2/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 4s 52ms/step - accuracy: 0.5649 - loss: 0.9118 - val_accuracy: 0.6125 - val_loss: 0.8346
Epoch 3/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 4s 34ms/step - accuracy: 0.5947 - loss: 0.8255 - val_accuracy: 0.6911 - val_loss: 0.7538
Epoch 4/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 2s 34ms/step - accuracy: 0.6596 - loss: 0.7524 - val_accuracy: 0.6937 - val_loss: 0.7103
Epoch 5/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 3s 34ms/step - accuracy: 0.6510 - loss: 0.7328 - val_accuracy: 0.7137 - val_loss: 0.6942
Epoch 6/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 4s 53ms/step - accuracy: 0.6763 - loss: 0.7173 - val_accuracy: 0.7217 - val_loss: 0.6599
Epoch 7/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 4s 35ms/step - accuracy: 0.7023 - loss: 0.6559 - val_accuracy: 0.7417 - val_loss: 0.6539
Epoch 8/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 2s 34ms/step - accuracy: 0.7208 - loss: 0.6330 - val_accuracy: 0.7590 - val_loss: 0.6169
Epoch 9/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 3s 35ms/step - accuracy: 0.7530 - loss: 0.5603 - val_accuracy: 0.7590 - val_loss: 0.5748
Epoch 10/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 4s 53ms/step - accuracy: 0.7677 - loss: 0.5238 - val_accuracy: 0.7523 - val_loss: 0.6104
24/24 ━━━━━━━━━━━━━━━━━━━━ 0s 12ms/step - accuracy: 0.7349 - loss: 0.6249
{'loss': 0.6103829145431519, 'compile_metrics': 0.7523302435874939}
w0 = model.get_weights()[0]
print (w0.shape)
mlutils.display_imgs(w0)
(5, 5, 3, 15)
../_images/845910d8a39ad7c2de11903772a1f9278897a5981765f60da46167ed2c978886.png

or with larger filters#

def get_conv_model_C(num_classes, img_size=32, compile=True):
    tf.keras.backend.clear_session()
    print ("using",num_classes,"classes")
    inputs = tf.keras.Input(shape=(img_size,img_size,3), name="input_1")
    layers = tf.keras.layers.Conv2D(96,(11,11), activation="relu")(inputs)
    layers = tf.keras.layers.MaxPool2D((2,2))(layers)
    layers = tf.keras.layers.Conv2D(60,(11,11), activation="relu")(layers)
    layers = tf.keras.layers.Flatten()(layers)
    layers = tf.keras.layers.Dense(16, activation=tf.nn.relu)(layers)
    layers = tf.keras.layers.Dropout(0.2)(layers)
    predictions = tf.keras.layers.Dense(num_classes, activation=tf.nn.softmax, name="output_1")(layers)
    model = tf.keras.Model(inputs = inputs, outputs=predictions)
    if compile:
        model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])
    return model
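the same arithmetic shows how aggressive two 11x11 valid convolutions are on 32x32 inputs: the second conv collapses the feature maps to a single spatial position, as the summary below confirms.

# spatial sizes through model C: 32 -> 22 -> 11 -> 1
size = 32
size = size - 11 + 1   # conv 11x11  -> 22
size = size // 2       # maxpool 2x2 -> 11
size = size - 11 + 1   # conv 11x11  -> 1
print("spatial size after second conv:", size)
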
model = get_conv_model_C(num_classes)
model.summary()
train(model, batch_size=32, epochs=10, model_name="model_C")
using 3 classes
Model: "functional"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                     Output Shape                  Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_1 (InputLayer)            │ (None, 32, 32, 3)      │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv2d (Conv2D)                 │ (None, 22, 22, 96)     │        34,944 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ max_pooling2d (MaxPooling2D)    │ (None, 11, 11, 96)     │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv2d_1 (Conv2D)               │ (None, 1, 1, 60)       │       697,020 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ flatten (Flatten)               │ (None, 60)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense (Dense)                   │ (None, 16)             │           976 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout (Dropout)               │ (None, 16)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ output_1 (Dense)                │ (None, 3)              │            51 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 732,991 (2.80 MB)
 Trainable params: 732,991 (2.80 MB)
 Non-trainable params: 0 (0.00 B)
Epoch 1/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 8s 99ms/step - accuracy: 0.3655 - loss: 1.1214 - val_accuracy: 0.5140 - val_loss: 0.9933
Epoch 2/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 8s 114ms/step - accuracy: 0.5219 - loss: 0.9886 - val_accuracy: 0.5965 - val_loss: 0.8874
Epoch 3/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 10s 115ms/step - accuracy: 0.5585 - loss: 0.9041 - val_accuracy: 0.6138 - val_loss: 0.8526
Epoch 4/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 11s 120ms/step - accuracy: 0.5946 - loss: 0.8621 - val_accuracy: 0.6431 - val_loss: 0.8478
Epoch 5/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 7s 97ms/step - accuracy: 0.5931 - loss: 0.8814 - val_accuracy: 0.6471 - val_loss: 0.8161
Epoch 6/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 11s 110ms/step - accuracy: 0.6173 - loss: 0.8420 - val_accuracy: 0.6471 - val_loss: 0.8194
Epoch 7/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 11s 115ms/step - accuracy: 0.6380 - loss: 0.8193 - val_accuracy: 0.6485 - val_loss: 0.8030
Epoch 8/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 10s 115ms/step - accuracy: 0.6524 - loss: 0.7944 - val_accuracy: 0.6485 - val_loss: 0.7796
Epoch 9/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 7s 98ms/step - accuracy: 0.6609 - loss: 0.7736 - val_accuracy: 0.6658 - val_loss: 0.7591
Epoch 10/10
71/71 ━━━━━━━━━━━━━━━━━━━━ 10s 98ms/step - accuracy: 0.6640 - loss: 0.7502 - val_accuracy: 0.6551 - val_loss: 0.7783
24/24 ━━━━━━━━━━━━━━━━━━━━ 1s 42ms/step - accuracy: 0.6572 - loss: 0.7931
{'loss': 0.7782898545265198, 'compile_metrics': 0.6551265120506287}
w0 = model.get_weights()[0]
print (w0.shape)
mlutils.display_imgs(w0)
(11, 11, 3, 96)
../_images/5411e8c4c19febaa4a558bc869f0a29a86fa76bff81f68df843e14062c36bcce.png
bi = np.random.randint(len(x_test))
plt.imshow(x_test[bi])
plt.axis("off");
../_images/48d8885b0f9278650961cb86a1bcd33f40ded916aeecec7796e92f7a760fb4d6.png
def output_at_layer(X, model, layer_name):
    # build a sub-model that ends at the requested layer and return its activations for X
    from tensorflow.keras.models import Model
    return Model(inputs=model.input, outputs=model.get_layer(layer_name).output)(X).numpy()
acts = output_at_layer(x_test[bi:bi+1], model, "conv2d")[0]
acts.shape
(22, 22, 96)
plt.figure(figsize=(10,10))
for i in range(acts.shape[-1]):
    plt.subplot(10,10,i+1)
    plt.imshow(acts[:,:,i], cmap=plt.cm.Greys_r )
    plt.axis("off")
../_images/598ae6d5a20a9f0bc92a6c466c2acabc418f88dce101b5f38f32224daf1815f7.png
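to see which of the 96 filters respond most strongly to this particular image, we can rank them by mean activation (a small sketch over the acts array computed above):

# indices of the five filters with the highest mean activation on this image
strongest = np.argsort(acts.mean(axis=(0, 1)))[::-1][:5]
print("most active filters:", strongest)
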
idxs = np.random.permutation(len(x_test))[:5]
preds = model.predict(x_test[idxs])
mlutils.show_preds(x_test[idxs],y_test[idxs], preds)
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 86ms/step
../_images/f3de2a443a300e8fce5b0fa5ab1dfbf90eacb245844f7e385ad1ef00ef6b9288.png ../_images/5c193fb9b65b0627300c6cb9b99182473245d8b9a58a6e00a3a032bd46d8e41d.png ../_images/92ba60ba9fcb20741a3cbd8e18ab4041960ab0e7e2006685fcf4ccd669f972a3.png ../_images/8eb575eee3b31e8eb641f767f8562582b28020499d1522a96b44018f75b91766.png ../_images/55b4f10744a8de71a7e114491f3fd5b668b691978b821b02fa44fb58d5f43cdd.png

see Class activation maps: https://jacobgil.github.io/deeplearning/class-activation-maps