2.8 - Weights initialization#

!wget -nc --no-cache -O init.py -q https://raw.githubusercontent.com/rramosp/2021.deeplearning/main/content/init.py
import init; init.init(force_download=False);

follow the explanation here#

https://adventuresinmachinelearning.com/weight-initialization-tutorial-tensorflow/
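before diving in, here is a small standalone sketch (not from the tutorial) of the core issue: for \(d\) inputs with unit variance, the standard deviation of a pre-activation \(z = x \cdot w\) grows like \(\sigma\sqrt{d}\), so a weight scale that is harmless for a narrow layer saturates a wide one, which is why Glorot/He-style initializations shrink \(\sigma\) by \(1/\sqrt{d}\)

import numpy as np

rng = np.random.default_rng(0)
fan_in = 784
x = rng.normal(size=fan_in)                       # one unit-variance input vector

for sigma in [1.0, 0.1, 1.0 / np.sqrt(fan_in)]:  # last value ~ 0.036, Glorot-like
    w = rng.normal(0, sigma, size=(fan_in, 1000)) # 1000 independent units
    z = x @ w                                     # their pre-activations
    print("sigma=%.4f  std(z)=%6.2f" % (sigma, z.std()))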

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import Image
%matplotlib inline
import tensorflow as tf
tf.__version__
'2.19.0'
mnist = pd.read_csv("local/data/mnist1.5k.csv.gz", compression="gzip", header=None).values
X = mnist[:,1:785]/255.
y = mnist[:,0]
print("dimensions of the images and the classes", X.shape, y.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
y_train_oh = np.eye(10)[y_train]
y_test_oh  = np.eye(10)[y_test]
print(X_train.shape, y_train_oh.shape)
dimensions of the images and the classes (1500, 784) (1500,)
(1200, 784) (1200, 10)

load data and train a simple model#

from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Flatten, concatenate, Input
from tensorflow.keras.backend import clear_session
from tensorflow import keras
import tensorflow as tf
import tensorflow.keras.backend as K
tf.keras.backend.set_floatx('float32')
def get_model(input_dim=784, output_dim=10, layer_sizes=[10]*6, activation="relu", sigma=1):
    inputs = Input(shape=(input_dim,))

    # only the first layer gets the custom N(0, sigma) initialization;
    # the remaining layers keep the Keras defaults (Glorot uniform kernels, zero biases)
    init1k = keras.initializers.RandomNormal(mean=.0, stddev=sigma, seed=None)
    init1b = keras.initializers.RandomNormal(mean=.0, stddev=sigma, seed=None)

    layer = Dense(layer_sizes[0], activation=activation, name="Layer_%02d_Input"%(0),
                    kernel_initializer=init1k,
                    bias_initializer=init1b,
                    dtype=tf.float64
                )(inputs)

    layers = [inputs, layer]
    for i, hsize in enumerate(layer_sizes[1:]):
        layer = Dense(hsize, activation=activation, name="Layer_%02d_Hidden"%(i+1), dtype=tf.float64)(layer)
        layers.append(layer)

    outputs = Dense(output_dim, activation="softmax", name="Layer_%02d_Output"%(len(layer_sizes)), dtype=tf.float64)(layer)

    layers.append(outputs)

    model = Model(inputs, outputs)

    model.compile(optimizer='adam', loss=tf.keras.losses.categorical_crossentropy, metrics=['accuracy'])
    return model, layers

def get_gradients_functions(model, layers):
    T_input   = layers[0]
    T_outputs = layers[1:]
    T_weights = model.trainable_weights

    # one sub-model per intermediate tensor, to read layer outputs directly
    F_outputs = [Model(model.input, out) for out in T_outputs]

    # one function per trainable variable, returning its gradient of the loss
    def build_gradient_functions(model):
        r = []
        for i in range(len(model.trainable_variables)):
            def f(X, y, i=i):  # bind the loop index at definition time
                v = model.trainable_variables[i]
                with tf.GradientTape() as t:
                    loss = model.loss(y, model(X))
                return t.gradient(loss, v).numpy()
            r.append(f)
        return r

    F_gradients = build_gradient_functions(model)

    return T_input, T_outputs, T_weights, F_outputs, F_gradients
model, layers = get_model(layer_sizes=[20,15,15], activation="sigmoid", sigma=20)
model.summary()
Model: "functional_18"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input (InputLayer)              │ (None, 784)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ Layer_00_Input (Dense)          │ (None, 20)             │        15,700 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ Layer_01_Hidden (Dense)         │ (None, 15)             │           315 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ Layer_02_Hidden (Dense)         │ (None, 15)             │           240 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ Layer_03_Output (Dense)         │ (None, 10)             │           160 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 16,415 (128.24 KB)
 Trainable params: 16,415 (128.24 KB)
 Non-trainable params: 0 (0.00 B)
model.get_config()
{'name': 'functional_18',
 'trainable': True,
 'layers': [{'module': 'keras.layers',
   'class_name': 'InputLayer',
   'config': {'batch_shape': (None, 784),
    'dtype': 'float32',
    'sparse': False,
    'ragged': False,
    'name': 'input'},
   'registered_name': None,
   'name': 'input',
   'inbound_nodes': []},
  {'module': 'keras.layers',
   'class_name': 'Dense',
   'config': {'name': 'Layer_00_Input',
    'trainable': True,
    'dtype': {'module': 'keras',
     'class_name': 'DTypePolicy',
     'config': {'name': 'float64'},
     'registered_name': None},
    'units': 20,
    'activation': 'sigmoid',
    'use_bias': True,
    'kernel_initializer': {'module': 'keras.initializers',
     'class_name': 'RandomNormal',
     'config': {'seed': None, 'mean': 0.0, 'stddev': 20},
     'registered_name': None},
    'bias_initializer': {'module': 'keras.initializers',
     'class_name': 'RandomNormal',
     'config': {'seed': None, 'mean': 0.0, 'stddev': 20},
     'registered_name': None},
    'kernel_regularizer': None,
    'bias_regularizer': None,
    'kernel_constraint': None,
    'bias_constraint': None},
   'registered_name': None,
   'build_config': {'input_shape': (None, 784)},
   'name': 'Layer_00_Input',
   'inbound_nodes': [{'args': ({'class_name': '__keras_tensor__',
       'config': {'shape': (None, 784),
        'dtype': 'float32',
        'keras_history': ['input', 0, 0]}},),
     'kwargs': {}}]},
  {'module': 'keras.layers',
   'class_name': 'Dense',
   'config': {'name': 'Layer_01_Hidden',
    'trainable': True,
    'dtype': {'module': 'keras',
     'class_name': 'DTypePolicy',
     'config': {'name': 'float64'},
     'registered_name': None},
    'units': 15,
    'activation': 'sigmoid',
    'use_bias': True,
    'kernel_initializer': {'module': 'keras.initializers',
     'class_name': 'GlorotUniform',
     'config': {'seed': None},
     'registered_name': None},
    'bias_initializer': {'module': 'keras.initializers',
     'class_name': 'Zeros',
     'config': {},
     'registered_name': None},
    'kernel_regularizer': None,
    'bias_regularizer': None,
    'kernel_constraint': None,
    'bias_constraint': None},
   'registered_name': None,
   'build_config': {'input_shape': (None, 20)},
   'name': 'Layer_01_Hidden',
   'inbound_nodes': [{'args': ({'class_name': '__keras_tensor__',
       'config': {'shape': (None, 20),
        'dtype': 'float64',
        'keras_history': ['Layer_00_Input', 0, 0]}},),
     'kwargs': {}}]},
  {'module': 'keras.layers',
   'class_name': 'Dense',
   'config': {'name': 'Layer_02_Hidden',
    'trainable': True,
    'dtype': {'module': 'keras',
     'class_name': 'DTypePolicy',
     'config': {'name': 'float64'},
     'registered_name': None},
    'units': 15,
    'activation': 'sigmoid',
    'use_bias': True,
    'kernel_initializer': {'module': 'keras.initializers',
     'class_name': 'GlorotUniform',
     'config': {'seed': None},
     'registered_name': None},
    'bias_initializer': {'module': 'keras.initializers',
     'class_name': 'Zeros',
     'config': {},
     'registered_name': None},
    'kernel_regularizer': None,
    'bias_regularizer': None,
    'kernel_constraint': None,
    'bias_constraint': None},
   'registered_name': None,
   'build_config': {'input_shape': (None, 15)},
   'name': 'Layer_02_Hidden',
   'inbound_nodes': [{'args': ({'class_name': '__keras_tensor__',
       'config': {'shape': (None, 15),
        'dtype': 'float64',
        'keras_history': ['Layer_01_Hidden', 0, 0]}},),
     'kwargs': {}}]},
  {'module': 'keras.layers',
   'class_name': 'Dense',
   'config': {'name': 'Layer_03_Output',
    'trainable': True,
    'dtype': {'module': 'keras',
     'class_name': 'DTypePolicy',
     'config': {'name': 'float64'},
     'registered_name': None},
    'units': 10,
    'activation': 'softmax',
    'use_bias': True,
    'kernel_initializer': {'module': 'keras.initializers',
     'class_name': 'GlorotUniform',
     'config': {'seed': None},
     'registered_name': None},
    'bias_initializer': {'module': 'keras.initializers',
     'class_name': 'Zeros',
     'config': {},
     'registered_name': None},
    'kernel_regularizer': None,
    'bias_regularizer': None,
    'kernel_constraint': None,
    'bias_constraint': None},
   'registered_name': None,
   'build_config': {'input_shape': (None, 15)},
   'name': 'Layer_03_Output',
   'inbound_nodes': [{'args': ({'class_name': '__keras_tensor__',
       'config': {'shape': (None, 15),
        'dtype': 'float64',
        'keras_history': ['Layer_02_Hidden', 0, 0]}},),
     'kwargs': {}}]}],
 'input_layers': [['input', 0, 0]],
 'output_layers': [['Layer_03_Output', 0, 0]]}
T_input, T_outputs, T_weights, F_outputs, F_gradients = get_gradients_functions(model, layers)
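as a quick sanity check (a sketch using the objects just built above), the gradient functions can reveal variables whose gradients have already vanished under this sigma=20 initialization:

# mean absolute gradient of each trainable variable on a small batch;
# values near zero indicate vanishing gradients for that variable
for i, grad_fn in enumerate(F_gradients):
    g = grad_fn(X_train[:32], y_train_oh[:32])
    print("var %2d  shape=%-12s  mean|grad| = %.2e" % (i, str(g.shape), np.abs(g).mean()))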
scale_X=.2
shift_X =.5
!rm -rf log
tb_callback = keras.callbacks.TensorBoard(log_dir='./log/winit', histogram_freq=1,  write_graph=True, write_images=True)
model.fit((X_train-shift_X)*scale_X, y_train_oh, epochs=30, batch_size=32,
          validation_data=((X_test-shift_X)*scale_X, y_test_oh),
          )  # add callbacks=[tb_callback] to log to TensorBoard
Epoch 1/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 2s 12ms/step - accuracy: 0.0850 - loss: 2.5021 - val_accuracy: 0.1033 - val_loss: 2.3299
Epoch 2/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.1063 - loss: 2.3428 - val_accuracy: 0.1033 - val_loss: 2.2992
Epoch 3/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step - accuracy: 0.0879 - loss: 2.3143 - val_accuracy: 0.0967 - val_loss: 2.2909
Epoch 4/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - accuracy: 0.1268 - loss: 2.2931 - val_accuracy: 0.1200 - val_loss: 2.2884
Epoch 5/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step - accuracy: 0.1319 - loss: 2.2814 - val_accuracy: 0.1033 - val_loss: 2.2885
Epoch 6/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - accuracy: 0.1289 - loss: 2.2806 - val_accuracy: 0.1033 - val_loss: 2.2863
Epoch 7/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - accuracy: 0.1460 - loss: 2.2638 - val_accuracy: 0.1033 - val_loss: 2.2823
Epoch 8/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - accuracy: 0.1229 - loss: 2.2705 - val_accuracy: 0.1033 - val_loss: 2.2763
Epoch 9/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.1300 - loss: 2.2560 - val_accuracy: 0.1233 - val_loss: 2.2700
Epoch 10/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.1543 - loss: 2.2479 - val_accuracy: 0.1433 - val_loss: 2.2592
Epoch 11/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.1591 - loss: 2.2397 - val_accuracy: 0.1867 - val_loss: 2.2486
Epoch 12/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.1890 - loss: 2.2206 - val_accuracy: 0.1700 - val_loss: 2.2354
Epoch 13/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.1962 - loss: 2.1983 - val_accuracy: 0.2000 - val_loss: 2.2182
Epoch 14/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.2300 - loss: 2.1754 - val_accuracy: 0.1833 - val_loss: 2.1980
Epoch 15/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.2335 - loss: 2.1482 - val_accuracy: 0.2033 - val_loss: 2.1789
Epoch 16/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.2487 - loss: 2.1239 - val_accuracy: 0.2233 - val_loss: 2.1545
Epoch 17/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.2959 - loss: 2.1074 - val_accuracy: 0.2667 - val_loss: 2.1342
Epoch 18/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.3413 - loss: 2.0760 - val_accuracy: 0.2600 - val_loss: 2.1130
Epoch 19/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.2686 - loss: 2.0881 - val_accuracy: 0.2633 - val_loss: 2.0932
Epoch 20/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.3225 - loss: 2.0440 - val_accuracy: 0.2800 - val_loss: 2.0766
Epoch 21/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.3454 - loss: 2.0276 - val_accuracy: 0.2800 - val_loss: 2.0597
Epoch 22/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.3476 - loss: 1.9927 - val_accuracy: 0.2833 - val_loss: 2.0468
Epoch 23/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.3469 - loss: 1.9790 - val_accuracy: 0.2800 - val_loss: 2.0316
Epoch 24/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.3576 - loss: 1.9553 - val_accuracy: 0.2833 - val_loss: 2.0216
Epoch 25/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.3610 - loss: 1.9595 - val_accuracy: 0.2933 - val_loss: 2.0090
Epoch 26/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.3498 - loss: 1.9542 - val_accuracy: 0.3033 - val_loss: 1.9964
Epoch 27/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.3587 - loss: 1.9293 - val_accuracy: 0.2933 - val_loss: 1.9896
Epoch 28/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.3531 - loss: 1.9243 - val_accuracy: 0.3000 - val_loss: 1.9783
Epoch 29/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.3844 - loss: 1.9110 - val_accuracy: 0.3033 - val_loss: 1.9681
Epoch 30/30
38/38 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.3483 - loss: 1.8975 - val_accuracy: 0.3133 - val_loss: 1.9597
<keras.src.callbacks.history.History at 0x7da22420c5c0>

Effects of different initializations#

understand carefully the following function

check the notebook on Inspecting model internals to understand get_gradients_functions and the objects it returns

tf.experimental.numpy.experimental_enable_numpy_behavior()

def train_experiment(model, layers, sigma, X_train, X_test):
    T_input, T_outputs, T_weights, F_outputs, F_gradients = get_gradients_functions(model, layers)

    # snapshot layer-0 weights, outputs and gradients before training
    w0_before = model.get_weights()[0].reshape(-1)
    o0_before = F_outputs[0](X_train).numpy().reshape(-1)
    g0_before = F_gradients[0](X_train, y_train_oh).reshape(-1)

    model.fit(X_train, y_train_oh, epochs=30, batch_size=32,
              validation_data=(X_test, y_test_oh), verbose=0)

    # and the same snapshot after training
    w0_after = model.get_weights()[0].reshape(-1)
    o0_after = F_outputs[0](X_train).numpy().reshape(-1)
    g0_after = F_gradients[0](X_train, y_train_oh).reshape(-1)

    acc, val_acc = model.history.history["accuracy"], model.history.history["val_accuracy"]

    plt.figure(figsize=(20,3))
    plt.subplot(141)
    plt.plot(acc, label="train_acc")
    plt.plot(val_acc, label="test_acc")
    plt.legend();
    plt.grid()
    plt.title("sigma=%.2f"%(sigma))
    plt.xlabel("epoch")
    plt.subplot(142)
    plt.hist(w0_after, bins=30, density=True, label="after", alpha=.5);
    plt.hist(w0_before, bins=30, density=True, label="before", alpha=.5);
    plt.legend();
    plt.title("layer 0 weights")

    plt.subplot(143)
    plt.hist(o0_after, bins=30, density=True, label="after", alpha=.5);
    plt.hist(o0_before, bins=30, density=True, label="before", alpha=.5);
    plt.legend();
    plt.title("layer 0 outputs")

    plt.subplot(144)

    def get_percentile(k, perc=90):
        p = np.percentile(np.abs(k), [perc])[0]
        return k[(k>-p)&(k<p)]

    plt.hist(get_percentile(g0_after), bins=30, density=True, label="after", alpha=.5);
    plt.hist(get_percentile(g0_before), bins=30, density=True, label="before", alpha=.5);
    plt.legend();
    plt.title("layer 0 gradients")

initializing with a standard normal (\(\mu=0\) and \(\sigma=1\))#

histograms show weights, outputs and gradients before and after training

in good configurations (see the numeric sketch after this list):

  • weights move during training

  • gradients are spread around zero before training

  • outputs before training are spread out rather than saturated
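a minimal sketch of how the output-saturation property could be checked numerically; saturation_report is a hypothetical helper, not part of the course code:

# hypothetical helper: fraction of first-layer sigmoid units whose output
# is already saturated (below .05 or above .95) before any training
def saturation_report(model, X):
    first = Model(model.input, model.layers[1].output)  # first Dense layer
    o0 = np.asarray(first(X)).reshape(-1)
    frac = np.mean((o0 < .05) | (o0 > .95))
    print("saturated layer-0 units: %5.1f%%" % (100 * frac))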

model, layers = get_model(layer_sizes=[20,15,15], activation="sigmoid", sigma=1.)
train_experiment(model, layers, sigma=1, X_train=X_train, X_test=X_test)
../_images/e66a9719a67de5c92602c4da472670dd7cc639039b9e33c46014b0e75eb478a3.png

initializing with a small \(\sigma\)#

sigma= 0.1
model, layers = get_model(layer_sizes=[20,15,15], activation="sigmoid", sigma=sigma)
train_experiment(model, layers, sigma=sigma, X_train=X_train, X_test=X_test)
../_images/b9989cd4bf9f7d7e42fb0475729fc5704d4939a9ae14f6cab21553353183fe76.png
running the same experiment a second time shows the variability across random initializations:

sigma= 0.1
model, layers = get_model(layer_sizes=[20,15,15], activation="sigmoid", sigma=sigma)
train_experiment(model, layers, sigma=sigma, X_train=X_train, X_test=X_test)
../_images/bf58e47025cdd3a9997987d93ed3761de815c31fda0838726abffe903f67e07b.png

initializing with a large \(\sigma\)#

observe how the gradients are very concentrated around zero at the beginning of training
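this is sigmoid saturation at work: \(\text{sigmoid}'(z) = s(z)(1-s(z))\) peaks at 0.25 and decays exponentially with \(|z|\), so pre-activations pushed far from zero by large weights receive almost no gradient; a standalone check:

# sigmoid'(z) collapses as |z| grows, which starves backprop of signal
s = lambda z: 1. / (1. + np.exp(-z))
for z in [0., 2., 5., 10., 30.]:
    print("z=%5.1f   sigmoid'(z)=%.2e" % (z, s(z) * (1 - s(z))))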

sigma= 10
model, layers = get_model(layer_sizes=[20,15,15], activation="sigmoid", sigma=sigma)
train_experiment(model, layers, sigma=sigma, X_train=X_train, X_test=X_test)
../_images/891f062bcabb5dcf8100f094fe212ae914d4c80ebd8c89d0a363850ae7ff34d2.png

initializing with a small \(\sigma\) but with large values for input data#

Recall that \(XW+b\) is what enters the sigmoid function. If it is large, it lands far from the linear regime around zero, where the sigmoid saturates and gradients vanish. It can be large because of \(W\) (large initialization \(\sigma\)) or because of \(X\).
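a standalone check of this claim with shapes matching the first layer (784 inputs, \(\sigma=0.1\)); the variable names are illustrative only:

# std of the pre-activation XW depends on the input scale as much as on sigma:
# inputs in [-50, 50] with sigma=0.1 saturate the sigmoid just like a huge sigma would
rng = np.random.default_rng(0)
W = rng.normal(0, 0.1, size=(784, 20))        # "small sigma" weights
for scale, shift in [(1., 0.), (100., -50.)]: # raw inputs vs blown-up inputs
    Xs = rng.uniform(0, 1, size=(1000, 784)) * scale + shift
    print("inputs in [%6.1f, %6.1f]  std(XW) = %6.2f" % (shift, shift + scale, (Xs @ W).std()))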

sigma= 0.1
model, layers = get_model(layer_sizes=[20,15,15], activation="sigmoid", sigma=sigma)
train_experiment(model, layers, sigma=sigma, X_train=X_train*100-50, X_test=X_test*100-50)
../_images/051f8ed9be33e2e6dd1bb21594b85d27d03b319c90feeb73b05670a55759fd03.png