LAB 5.3 - Transformer - BERT

!wget -nc --no-cache -O -q
import init; init.init(force_download=False); init.get_weblink() 
from local.lib.rlxmoocapi import submit, session
import inspect
session.LoginSequence(endpoint=init.endpoint, course_id=init.course_id, lab_id="L05.03", varname="student");
import sys
if 'google.colab' in sys.modules:
    print ("setting tensorflow version in colab")
    %tensorflow_version 2.x
import tensorflow as tf
import tensorflow_hub as hub
print("TF version: ", tf.__version__)
print("Hub version: ", hub.__version__)


Let’s take a look at the following example of how to use a BERT model from TensorFlow Hub.

We are going to use the same sentiment-analysis dataset as in LAB 5.2.

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import math
# Load the airline tweets; only the raw text and its sentiment label are needed.
data = pd.read_csv('local/data/Tweets.csv')
data = data[['text','airline_sentiment']]
import re
pd.options.mode.chained_assignment = None  # default='warn'
# Binary task: drop the neutral class.
data = data[data.airline_sentiment != "neutral"]

# Text normalization. Raw-string patterns keep `\s` from being parsed as a
# (invalid) string escape before it reaches the regex engine.
_MENTION_RE = re.compile(r'@[^\s]+')  # @handles, i.e. the name of the airline
# The class used to read `A-z`, a range that also spans `[ \ ] ^ _` and the
# backtick, so those symbols survived the clean-up. `A-Z` is what was meant.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')


def _normalize_tweet(text):
    """Lower-case a tweet, drop @mentions and non-alphanumerics, blank out 'rt'."""
    text = _MENTION_RE.sub('', text.lower())
    text = _NON_ALNUM_RE.sub('', text)
    # Applied here instead of mutating rows yielded by `iterrows()`: pandas
    # does not guarantee that such writes reach the DataFrame.
    # NOTE(review): this replaces every 'rt' substring, including the ones
    # inside ordinary words; confirm whether only the retweet marker is meant.
    return text.replace('rt', ' ')


data['text'] = data['text'].apply(_normalize_tweet)

# Class balance after removing the neutral tweets.
Np = np.sum(data['airline_sentiment'].values == 'positive')
Nn = np.sum(data['airline_sentiment'].values == 'negative')
print(f'Number of positive samples={Np}')
print(f'Number of negative samples={Nn}')

We can use a pre-trained BERT from tensorflow hub.

bert_layer = hub.KerasLayer("",
!pip install bert-for-tf2
import bert
FullTokenizer = bert.bert_tokenization.FullTokenizer
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)
all_sentences = data['text'].values
all_words = []
maximo = 0
for sent in all_sentences:
    temp = []
    i = 0
    for w in tokenizer.tokenize(sent):
    if i> maximo:
        maximo = i
max_seq_length = maximo + 2  # Your choice here.

BERT model requires three inputs: ids, mask and segments.

  • ids: correspond to the tokenized word sequence.

  • mask: marks the real tokens with 1 and the padding positions with 0, so that the padding is ignored.

  • segments: is used for the NSP training phase (0 for the first sentence, 1 for the second).

def get_masks(tokens, max_seq_length):
    """Build the padding mask for one tokenized sentence.

    Args:
        tokens: sequence of tokens for a single sentence.
        max_seq_length: length of the returned list.

    Returns:
        A list of ``max_seq_length`` integers: 1 for every entry of
        ``tokens``, followed by 0 for every padding position.

    Raises:
        IndexError: if ``tokens`` has more than ``max_seq_length`` entries.
    """
    n_tokens = len(tokens)
    n_padding = max_seq_length - n_tokens
    if n_padding < 0:
        raise IndexError("Token length more than max seq length!")
    return [1] * n_tokens + [0] * n_padding

def get_segments(tokens, max_seq_length):
    """Build the segment ids for one tokenized sentence (or sentence pair).

    Every token up to and including the first ``"[SEP]"`` gets id 0; every
    token after it gets id 1. The result is padded with 0 up to
    ``max_seq_length``.

    Args:
        tokens: sequence of tokens, including the special ``[CLS]``/``[SEP]``
            markers.
        max_seq_length: length of the returned list.

    Returns:
        A list of ``max_seq_length`` integers.

    Raises:
        IndexError: if ``tokens`` has more than ``max_seq_length`` entries.
    """
    if len(tokens) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        # Record the id before switching: the separator still belongs to the
        # segment it closes.
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def get_ids(tokens, tokenizer, max_seq_length):
    """Map tokens to vocabulary ids and pad with 0 up to ``max_seq_length``.

    Args:
        tokens: sequence of tokens for a single sentence.
        tokenizer: object exposing ``convert_tokens_to_ids(tokens)``.
        max_seq_length: length of the returned list.

    Returns:
        A list of ``max_seq_length`` token ids.

    Raises:
        IndexError: if there are more ids than ``max_seq_length``. This is the
            same contract as ``get_masks`` and ``get_segments``; previously an
            over-long list was returned silently.
    """
    token_ids = list(tokenizer.convert_tokens_to_ids(tokens))
    if len(token_ids) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return token_ids + [0] * (max_seq_length - len(token_ids))

Take a look at the ids, mask and segments for a toy example:

s = "This is a nice sentence."
stokens = tokenizer.tokenize(s)
stokens = ["[CLS]"] + stokens + ["[SEP]"]

input_ids = get_ids(stokens, tokenizer, max_seq_length)
input_masks = get_masks(stokens, max_seq_length)
input_segments = get_segments(stokens, max_seq_length)
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=[pooled_output, sequence_output])
pool_embs, all_embs = model.predict([np.array(input_ids).reshape(1,-1),np.array(input_masks).reshape(1,-1),np.array(input_segments).reshape(1,-1)])

BERT provides a 768-dimensional embedding for each token in the given sentence. Note that it gives you two different outputs: pool_embs and all_embs. all_embs holds the embeddings of the whole sequence (one vector per token), and pool_embs is the embedding of the initial [CLS] token. It’s “pooled” from all input tokens in the sense that the multiple attention layers force it to depend on all the other tokens.


Now, let’s create the three input arrays for the whole dataset:

# One row per tokenized sentence, one column per (padded) token position.
array_shape = (len(all_words), max_seq_length)
input_ids = np.zeros(array_shape)
input_masks = np.zeros(array_shape)
input_segments = np.zeros(array_shape)
for row, sentence_tokens in enumerate(all_words):
    # Each helper returns a plain list of length max_seq_length.
    input_ids[row, :] = np.array(get_ids(sentence_tokens, tokenizer, max_seq_length)).reshape(1, -1)
    input_masks[row, :] = np.array(get_masks(sentence_tokens, max_seq_length)).reshape(1, -1)
    input_segments[row, :] = np.array(get_segments(sentence_tokens, max_seq_length)).reshape(1, -1)
from sklearn.preprocessing import LabelEncoder
y = data['airline_sentiment'].values
Encoder = LabelEncoder()
Y = Encoder.fit_transform(y)
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Input, Dropout
from sklearn.model_selection import train_test_split
input_ids_tr, input_ids_te, input_masks_tr, input_masks_te, input_segments_tr, input_segments_te, y_tr, y_te = train_test_split(input_ids, input_masks, input_segments, Y, test_size=0.2, random_state=2018)
input_word_ids = Input(shape=(max_seq_length,), dtype=tf.int32,
input_mask = Input(shape=(max_seq_length,), dtype=tf.int32,
segment_ids = Input(shape=(max_seq_length,), dtype=tf.int32,
#bert_layer = hub.KerasLayer("",
#                            trainable=False)
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

recurrent = LSTM(128,activation='tanh')(sequence_output)
dense1 = Dense(10,activation='relu')(recurrent)
drop1 = Dropout(0.3)(dense1)
dense2 = Dense(1,activation='sigmoid')(drop1)

model2 = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=[dense2])
model2.compile(optimizer='adam', loss="binary_crossentropy", metrics=["accuracy"])[input_ids_tr,input_masks_tr,input_segments_tr], y_tr, validation_split=0.1,batch_size=32, epochs=3, verbose=1)
y_pred = np.round(model2.predict([input_ids_te,input_masks_te,input_segments_te]))
from sklearn.metrics import recall_score, accuracy_score
def especi_score(y_te, y_pred):
    """Specificity: fraction of the class-0 samples that were predicted as 0.

    Args:
        y_te: 1-D NumPy array of true labels.
        y_pred: 1-D NumPy array of predicted labels, aligned with ``y_te``.

    Returns:
        Number of correctly predicted class-0 samples divided by the number
        of class-0 samples in ``y_te``.
    """
    is_negative = y_te == 0
    n_negatives = np.sum(is_negative)
    n_true_negatives = np.sum(y_te[is_negative] == y_pred[is_negative])
    return n_true_negatives / n_negatives
print('Accuracy = {}'.format(accuracy_score(y_te,y_pred)))
print('Sensitivity = {}'.format(recall_score(y_te,y_pred)))
print('Especificity = {}'.format(especi_score(y_te,y_pred.flatten())))

Task 1:

Create a layer class TokenAndPositionEmbedding similar to the one used in the course material U5.08, but in this case define it so that it returns word embedding vectors plus non-trainable sinusoidal positional embeddings, which must be set according to the embedding dimension and the maximum length of the sentences.

The sinusoidal positional embedding can be estimated as:

\[PE_{(pos,2i)} = \sin\left(\frac{pos}{10000^{2i/d_e}}\right), \qquad PE_{(pos,2i+1)} = \cos\left(\frac{pos}{10000^{2i/d_e}}\right)\]

where \(pos\) is the position of the token in the sequence, \(d_e\) is the embedding dimension and \(i \in [0, d_e/2)\) indexes the pairs of embedding components. Remember to reserve \(pos = 0\) for the padding token, whose positional encoding must be the zero vector.

For a \(d_e = 100\) and sequences with a maximum length of 200, the positional embedding matrix must look like this:

from IPython.display import Image
Image(filename='local/imgs/positional_embeddings.png', width=600)
../_images/U5 LAB 03 - Sentiment Analysis using BERT_38_0.png
def sinusoidal_position_table(maxlen, embed_dim):
    """Return the fixed ``(maxlen, embed_dim)`` sinusoidal position matrix.

    Row ``pos`` holds ``sin(pos / 10000**(2i / embed_dim))`` in column ``2i``
    and the matching cosine in column ``2i + 1``. Row 0 is left as the zero
    vector, as the task statement asks for position 0.

    Args:
        maxlen: number of positions (rows).
        embed_dim: embedding dimension (columns).

    Returns:
        A ``float32`` NumPy array of shape ``(maxlen, embed_dim)``.
    """
    positions = np.arange(maxlen, dtype=np.float64)[:, np.newaxis]
    # Columns 2i and 2i+1 share one frequency, hence the integer division.
    exponents = 2.0 * (np.arange(embed_dim) // 2) / embed_dim
    angles = positions / np.power(10000.0, exponents)
    table = np.zeros((maxlen, embed_dim), dtype=np.float32)
    table[1:, 0::2] = np.sin(angles[1:, 0::2])
    table[1:, 1::2] = np.cos(angles[1:, 1::2])
    return table


def TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim):
    """Create a layer that adds fixed sinusoidal position codes to token embeddings.

    Args:
        maxlen: maximum sequence length, i.e. number of encoded positions.
        vocab_size: size of the token vocabulary.
        embed_dim: dimension of both the token and the position embeddings.

    Returns:
        A Keras layer instance. Called on integer token ids whose last axis
        is the sequence axis, it returns the trainable token embeddings plus
        the non-trainable position embeddings.

    NOTE(review): positions are numbered 0..seq_len-1 along the sequence
    axis, so the first slot gets the all-zero row. Confirm this matches the
    intended handling of padding tokens.
    """
    keras_layers = tf.keras.layers

    class _TokenAndPositionEmbedding(keras_layers.Layer):
        def __init__(self, maxlen, vocab_size, embed_dim):
            super().__init__()
            self.token_emb = keras_layers.Embedding(
                input_dim=vocab_size, output_dim=embed_dim
            )
            # Frozen lookup table initialised with the sinusoidal matrix.
            self.pos_emb = keras_layers.Embedding(
                input_dim=maxlen,
                output_dim=embed_dim,
                embeddings_initializer=tf.keras.initializers.Constant(
                    sinusoidal_position_table(maxlen, embed_dim)
                ),
                trainable=False,
            )

        def call(self, x):
            seq_len = tf.shape(x)[-1]
            positions = tf.range(start=0, limit=seq_len, delta=1)
            pos_embeddings = self.pos_emb(positions)
            token_embeddings = self.token_emb(x)
            # Position codes broadcast over the batch axis.
            return token_embeddings + pos_embeddings

    return _TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)

Submit your solution:

student.submit_task(namespace=globals(), task_id='T1');

Task 2:

Create a function ‘get_transformer_model’ that defines a DL model for the sentiment analysis prediction problem. The model must have the following architecture:

  • The former token + positional embedding layer created in task 1

  • Two Transformer Blocks (remember to take a look at the class material)

  • A GlobalAveragePooling1D layer

  • A dropout(rate) layer

  • A dense layer with 20 neurons

  • A dropout(rate) layer

  • One Output layer

def get_transformer_model(maxlen, vocab_size, embed_dim, num_heads, ff_dim, rate=0.1):
    """Build the Transformer sentiment classifier requested in Task 2.

    Exercise stub: as written, the function returns the ``...`` placeholder.
    The task statement asks for, in order: the token + positional embedding
    layer from Task 1, two Transformer blocks, a GlobalAveragePooling1D
    layer, a dropout layer, a dense layer with 20 neurons, a second dropout
    layer and one output layer.

    Args:
        maxlen: length of the input sequences.
        vocab_size: size of the dictionary.
        embed_dim: embedding dimension.
        num_heads: number of heads of the MultiHeadAttention layer.
        ff_dim: number of neurons in the dense layer of the Transformer block.
        rate: drop rate shared by all the dropout layers.

    Returns:
        The value bound to ``model``.
    """
    # maxlen: sequence length
    # vocab_size: dictionary size
    # embed_dim: embedding dimension
    # num_heads: number of heads for the MultiHeadAttention layer
    # ff_dim: number of neurons in the dense layer of the transformer block
    # rate: drop rate for all dropout layers
    model = ...

    return model

The following is the architecture you should get for the given parameters:

from IPython.display import Image
../_images/U5 LAB 03 - Sentiment Analysis using BERT_45_0.png

Submit your solution:

student.submit_task(namespace=globals(), task_id='T2');

Let’s test the model:

Warning: Run this part only if you have already passed Tasks 1 and 2.

# Removing Stop Words
import nltk
from nltk.corpus import stopwords
all_sentences = data['text'].values
stop_words = stopwords.words('english')
# A set makes the per-word membership test O(1); the list form rescanned the
# whole stop-word list for every word of every tweet.
stop_word_set = set(stop_words)

# Tokenize each tweet and keep the words that are neither stop words nor
# purely numeric.
all_words = [
    [w for w in nltk.word_tokenize(sent) if w not in stop_word_set and not w.isdigit()]
    for sent in all_sentences
]
from local.lib.DataPreparationRNN import preprocessed_seq
tokenizer, X = preprocessed_seq(data['text'].values)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
y = data['airline_sentiment'].values
Encoder = LabelEncoder()
Y = Encoder.fit_transform(y)
X_tr, X_te, y_tr, y_te = train_test_split(X, Y, test_size=0.2, random_state=2018)
classes = np.unique(Y)
class_weight_vect = compute_class_weight('balanced',classes=classes,y=y_tr)
class_weight = {classes[0]: class_weight_vect[0],
                classes[1]: class_weight_vect[1]}
embed_dim = 32
num_heads = 4
ff_dim = 32
model = get_transformer_model(X.shape[1], tokenizer.num_words, embed_dim, num_heads, ff_dim, rate=0.1)
opt = tf.keras.optimizers.Adam(
    learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False,

model.compile(optimizer=opt, loss="binary_crossentropy", metrics=["accuracy"])
history =, y_tr, validation_split=0.1,batch_size=32,epochs=20, class_weight=class_weight) #
from sklearn.metrics import recall_score, accuracy_score
def especi_score(y_te, y_pred):
    """Return the specificity (true-negative rate) of a binary prediction.

    Args:
        y_te: 1-D NumPy array of true labels, where 0 is the negative class.
        y_pred: 1-D NumPy array of predicted labels, aligned with ``y_te``.

    Returns:
        The share of samples labelled 0 in ``y_te`` that ``y_pred`` also
        labels 0.
    """
    negative_mask = (y_te == 0)
    hits = y_te[negative_mask] == y_pred[negative_mask]
    return np.sum(hits) / np.sum(negative_mask)
y_pred = np.round(model.predict(X_te))
sensitivity = recall_score(y_te,y_pred)
accuracy = accuracy_score(y_te,y_pred)
especificity = especi_score(y_te,y_pred.flatten())


Repeat the previous experiment, but train with equally weighted classes instead; compare the results with the ones you just got.