5.9 CNN-LSTM architectures#

This type of architecture is useful for different applications, for instance, action recognition in video sequences. In order to show its use, we are going to create a synthetic dataset.

The dataset is composed of videos where a point moves through the frames forming four different patterns: a constant point, a point ascending from the bottom-left corner to the top-right corner, a point descending from the top-left corner to the bottom-right corner, and a point following a sine function.

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from keras.utils import np_utils
from tensorflow.keras.layers import LSTM, Conv2D, Dense, TimeDistributed, MaxPooling2D, Flatten
from tensorflow.keras.models import Sequential
import math
# 20 time stamps covering (pi/10, 2*pi] and one noisy sine trajectory over them.
t = np.linspace(math.pi / 10, 2 * math.pi, num=20)
y = (np.sin(t) + 0.05 * np.random.randn(1, 20)).reshape(-1)
# Frame counter shared with the animation callbacks (1-based).
i = 1
# Pixel position of the point in the first frame: column t2, row y2.
t2 = i * 6
y2 = int(np.round((y[i - 1] + 1) * 23 + 4))
def f(t2, y2):
    """Return a 50x130 frame with a bright 6x6 square centred near (row y2, column t2).

    Parameters
    ----------
    t2 : int
        Column (horizontal) position of the point.
    y2 : int
        Row (vertical) position of the point.

    Returns
    -------
    numpy.ndarray
        Array of shape (50, 130) that is 0 everywhere except the square
        rows [y2-3, y2+3) x columns [t2-3, t2+3), which is set to 255.
        The square is clipped to the frame; a point entirely outside the
        frame yields an all-zero image.
    """
    frame = np.zeros((50, 130))
    # Clamp both ends at 0: a negative index would otherwise be interpreted
    # as "count from the end" and either wrap the square to the opposite
    # border or produce an empty slice. Upper bounds are clipped by slicing.
    row_start, row_stop = max(y2 - 3, 0), max(y2 + 3, 0)
    col_start, col_stop = max(t2 - 3, 0), max(t2 + 3, 0)
    frame[row_start:row_stop, col_start:col_stop] = 255
    return frame

def updatefig(*args):
    """Animation callback: draw the next frame of the sine pattern.

    Reads the module-level trajectory ``y`` and frame counter ``i``,
    advances ``i`` (wrapping back to 1 when it reaches 20) and updates the
    module-level image artist ``im`` with the new frame.

    Returns
    -------
    tuple
        One-element tuple holding ``im``, as required when the animation
        is created with ``blit=True``.
    """
    global y, i
    if i == 20:
        i = 1
    t2 = 6*i
    y2 = int(np.round(23*(y[i-1]+1)+4))
    i += 1
    # Push the new frame into the displayed image; without this call the
    # animation keeps showing its initial frame.
    im.set_array(f(t2, y2))
    return im,

def updatefig2(*args):
    """Animation callback: draw the next frame of the linear (ascending) pattern.

    Uses the module-level frame counter ``i`` (wrapping back to 1 when it
    reaches 20): the column grows by 6 pixels per frame and the row by 2
    pixels per frame plus unit-variance Gaussian noise. The module-level
    image artist ``im`` is updated with the new frame.

    Returns
    -------
    tuple
        One-element tuple holding ``im``, as required when the animation
        is created with ``blit=True``.
    """
    global y, i
    if i == 20:
        i = 1
    t2 = 6*(i+1)
    # randn() yields a scalar; int() on a 1-element array (randn(1)) is
    # deprecated in recent NumPy versions.
    y2 = int(np.round(2*(i+1) + 2 + np.random.randn()))
    i += 1
    # Push the new frame into the displayed image; without this call the
    # animation keeps showing its initial frame.
    im.set_array(f(t2, y2))
    return im,
# Show the first frame, then let FuncAnimation call updatefig for each of the
# 20 frames (interval=50). `im` is the artist the callback returns, and `ani`
# keeps a reference to the animation object.
fig = plt.figure()
im = plt.imshow(f(t2, y2))
ani = animation.FuncAnimation(fig, updatefig, interval=50, frames=20, blit=True)

This is an example of the sine pattern.

# Starting position for the linear pattern, using the same formula as
# updatefig2: column 6*(i+1), row 2*(i+1)+2 plus Gaussian noise.
t2 = 6*(i+1)
# randn() yields a scalar; int() on a 1-element array (randn(1)) is
# deprecated in recent NumPy versions.
y2 = int(np.round(2*(i+1) + 2 + np.random.randn()))

# New figure and image artist; updatefig2 is called for each of the 20 frames.
fig = plt.figure()
im = plt.imshow(f(t2, y2))
ani = animation.FuncAnimation(fig, updatefig2, interval=50, frames=20, blit=True)

The data must have the form [n_samples,n_times,n_rows,n_columns,n_channels]

# Training set: 4 classes x 20 videos, each video 20 frames of 50x130x1.
# Every frame comes from f(column, row) and is scaled from {0, 255} to {0, 1}.
# np.random.randn() (scalar) is used for the noise because int() on a
# 1-element array is deprecated in recent NumPy versions.

#Class sin
Videos1 = np.zeros((20,20,50,130,1))
for j in range(20):
    # A fresh noisy sine trajectory for every video.
    y = np.sin(t) + 0.05*np.random.randn(1, 20)
    y = y.flatten()
    for i in range(20):
        t2 = 6*(i+1)
        y2 = int(np.round(23*(y[i]+1)+4))
        Videos1[j,i,:,:,0] = f(t2,y2)/255
#Class constant
Videos2 = np.zeros((20,20,50,130,1))
for j in range(20):
    for i in range(20):
        # The point jitters around the frame centre: column 65 of 130,
        # row 25 of 50. (Row 65 would lie outside the 50-row frame and
        # leave every frame of this class blank.)
        t2 = int(np.round(65 + np.random.randn()))
        y2 = int(np.round(25 + np.random.randn()))
        Videos2[j,i,:,:,0] = f(t2,y2)/255
#Class ascending
Videos3 = np.zeros((20,20,50,130,1))
for j in range(20):
    for i in range(20):
        t2 = 6*(i+1)
        # Row index grows by 2 per frame, plus noise.
        y2 = int(np.round(2*(i+1) + 2 + np.random.randn()))
        Videos3[j,i,:,:,0] = f(t2,y2)/255
#Class descending
Videos4 = np.zeros((20,20,50,130,1))
for j in range(20):
    for i in range(20):
        t2 = 6*(i+1)
        # Row index shrinks by 2 per frame, plus noise.
        y2 = int(np.round(2*(20-i)+ 2 + np.random.randn()))
        Videos4[j,i,:,:,0] = f(t2,y2)/255
# Stack the classes along the sample axis, in class order.
Videos = np.concatenate((Videos1,Videos2,Videos3,Videos4),axis=0)
(80, 20, 50, 130, 1)
# Class ids 0..3, each repeated for the 20 consecutive videos of its class.
Y = np.repeat(np.arange(4, dtype=float), 20)
# One-hot encode the labels; the number of columns is the number of classes.
y_trainOHE = np_utils.to_categorical(Y)
nb_classes = y_trainOHE.shape[1]

There are three ways to define the network:#

# Schematic outline only: "..." is a placeholder for arguments omitted in the text.
# First variant: the CNN is its own Sequential model, wrapped as a whole in
# TimeDistributed inside the outer model.
# define CNN model
cnn = Sequential()
# define LSTM model
model = Sequential()
model.add(TimeDistributed(cnn, ...))
# Second variant: a single Sequential model; presumably the CNN layers and the
# LSTM layers are added to it directly (the layers are not shown here).
model = Sequential()
# define CNN model
# define LSTM model

Let’s define our architecture:

# Frame geometry of the synthetic videos.
rows = 50
columns = 130
channels = 1
frames = 20  # time steps (frames) per video

# CNN-LSTM: the same small CNN is applied to every frame through
# TimeDistributed, and the per-frame feature vectors are fed to an LSTM.
model1 = Sequential()
# The input shape belongs to the first TimeDistributed wrapper and includes
# the time axis. padding='same' keeps the spatial size through each
# convolution, which the 8x8 kernels need after two poolings.
model1.add(TimeDistributed(Conv2D(filters=5, kernel_size=(4, 4), padding='same'),
                           input_shape=(frames, rows, columns, channels)))
model1.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))
model1.add(TimeDistributed(Conv2D(filters=5, kernel_size=(8, 8), padding='same')))
model1.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))
model1.add(TimeDistributed(Conv2D(filters=5, kernel_size=(8, 8), padding='same')))
model1.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))
# Flatten each frame's feature maps so the LSTM gets (frames, features).
model1.add(TimeDistributed(Flatten()))

# add the LSTM layer, and a final Dense layer
model1.add(LSTM(units=5, activation='relu', stateful=False))
model1.add(Dense(4, activation='softmax'))
model1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model1.summary()
# NOTE(review): no fit() call is visible in this section; the model has to be
# trained on (Videos, y_trainOHE) before its predictions are meaningful.
WARNING:tensorflow:Layer lstm will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU
Model: "sequential"
Layer (type)                 Output Shape              Param #   
time_distributed (TimeDistri (None, 20, 50, 130, 5)    85        
time_distributed_1 (TimeDist (None, 20, 25, 65, 5)     0         
time_distributed_2 (TimeDist (None, 20, 25, 65, 5)     1605      
time_distributed_3 (TimeDist (None, 20, 12, 32, 5)     0         
time_distributed_4 (TimeDist (None, 20, 12, 32, 5)     1605      
time_distributed_5 (TimeDist (None, 20, 6, 16, 5)      0         
time_distributed_6 (TimeDist (None, 20, 480)           0         
lstm (LSTM)                  (None, 5)                 9720      
dense (Dense)                (None, 4)                 24        
Total params: 13,039
Trainable params: 13,039
Non-trainable params: 0

Let’s create a new set of videos to validate the model:

# Validation set: generated exactly like the training set (4 classes x 20
# videos, 20 frames of 50x130x1 each) but with fresh random noise.
# np.random.randn() (scalar) is used for the noise because int() on a
# 1-element array is deprecated in recent NumPy versions.

#Class sin
Videos1 = np.zeros((20,20,50,130,1))
for j in range(20):
    # A fresh noisy sine trajectory for every video.
    y = np.sin(t) + 0.05*np.random.randn(1, 20)
    y = y.flatten()
    for i in range(20):
        t2 = 6*(i+1)
        y2 = int(np.round(23*(y[i]+1)+4))
        Videos1[j,i,:,:,0] = f(t2,y2)/255
#Class constant
Videos2 = np.zeros((20,20,50,130,1))
for j in range(20):
    for i in range(20):
        # The point jitters around the frame centre: column 65 of 130,
        # row 25 of 50. (Row 65 would lie outside the 50-row frame and
        # leave every frame of this class blank.)
        t2 = int(np.round(65 + np.random.randn()))
        y2 = int(np.round(25 + np.random.randn()))
        Videos2[j,i,:,:,0] = f(t2,y2)/255
#Class ascending
Videos3 = np.zeros((20,20,50,130,1))
for j in range(20):
    for i in range(20):
        t2 = 6*(i+1)
        # Row index grows by 2 per frame, plus noise.
        y2 = int(np.round(2*(i+1) + 2 + np.random.randn()))
        Videos3[j,i,:,:,0] = f(t2,y2)/255
#Class descending
Videos4 = np.zeros((20,20,50,130,1))
for j in range(20):
    for i in range(20):
        t2 = 6*(i+1)
        # Row index shrinks by 2 per frame, plus noise.
        y2 = int(np.round(2*(20-i)+ 2 + np.random.randn()))
        Videos4[j,i,:,:,0] = f(t2,y2)/255
# Stack the classes along the sample axis, in the same order as the labels Y.
VideosTest = np.concatenate((Videos1,Videos2,Videos3,Videos4),axis=0)
# Predicted class = index of the largest output score for each video.
y_est = np.argmax(model1.predict(VideosTest),axis=1)
# Fraction of correct predictions; np.mean avoids hard-coding the number
# of test videos.
print('accuracy testing = {}'.format(np.mean(y_est==Y)))
accuracy testing = 1.0

3) Convolutional LSTM#

from IPython.display import Image
# Display the figure stored at this relative path, rendered with width=600.
Image(filename='local/imgs/ConvLSTM.png', width=600)

Image taken from here

ConvLSTMs are similar to LSTMs, but the internal matrix multiplications are replaced by convolutions. The object that flows through the cell is a 3D tensor instead of just a 1D feature vector, as in ‘peephole’ LSTMs.

from tensorflow.keras.layers import ConvLSTM2D, BatchNormalization
frames = 20  # time steps (frames) per video

# Convolutional LSTM: the recurrence itself is convolutional, so the layers
# consume the 5-D video tensor directly.
model2 = Sequential()
# padding='same' keeps the spatial size; return_sequences=True keeps the time
# axis so the next TimeDistributed pooling / ConvLSTM2D layer still sees a
# sequence of frames.
model2.add(ConvLSTM2D(filters=5, kernel_size=(4, 4), padding='same',
                      input_shape=(frames, rows, columns, channels),
                      return_sequences=True))
model2.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))
model2.add(ConvLSTM2D(filters=5, kernel_size=(8, 8), padding='same',
                      return_sequences=True))
model2.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))
# The last recurrent layer returns only its final state (one feature map per
# video), so the following pooling is a plain 2-D pooling.
model2.add(ConvLSTM2D(filters=5, kernel_size=(8, 8), padding='same',
                      return_sequences=False))
model2.add(MaxPooling2D(pool_size=(2, 2)))
# Flatten the feature maps before the classification layer.
model2.add(Flatten())

model2.add(Dense(4, activation='softmax'))
model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model2.summary()
# NOTE(review): no fit() call is visible in this section; the model has to be
# trained on (Videos, y_trainOHE) before its predictions are meaningful.
Model: "sequential_1"
Layer (type)                 Output Shape              Param #   
conv_lst_m2d (ConvLSTM2D)    (None, 20, 50, 130, 5)    1940      
time_distributed_7 (TimeDist (None, 20, 25, 65, 5)     0         
conv_lst_m2d_1 (ConvLSTM2D)  (None, 20, 25, 65, 5)     12820     
time_distributed_8 (TimeDist (None, 20, 12, 32, 5)     0         
conv_lst_m2d_2 (ConvLSTM2D)  (None, 12, 32, 5)         12820     
max_pooling2d_5 (MaxPooling2 (None, 6, 16, 5)          0         
flatten_1 (Flatten)          (None, 480)               0         
dense_1 (Dense)              (None, 4)                 1924      
Total params: 29,504
Trainable params: 29,504
Non-trainable params: 0
# Predicted class = index of the largest output score for each video.
y_est = np.argmax(model2.predict(VideosTest),axis=1)
# Fraction of correct predictions; np.mean avoids hard-coding the number
# of test videos.
print('accuracy testing = {}'.format(np.mean(y_est==Y)))
accuracy testing = 1.0