05.04 - PARTICIPATE IN KAGGLE

05.04 - PARTICIPATE IN KAGGLE#

!wget --no-cache -O init.py -q https://raw.githubusercontent.com/rramosp/ai4eng.v1/main/content/init.py
import init; init.init(force_download=False); init.get_weblink()

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import local.lib.mlutils
import pandas as pd
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
%matplotlib inline

We use Titanic data in Kaggle #

Register to Kaggle
Enter the competition Titanic Data at Kaggle
Download the train.csv and test.csv files
UPLOAD THE FILES to your notebook environment (in Colab, open the Files tab and upload)

d = pd.read_csv("train.csv")
print (d.shape)

(891, 12)

d.head()

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S

Understand which columns contain NaN values, and how many

# Print, right-aligned in a 20-character field, each column name followed by
# the number of missing (NaN) entries in that column.
for column_name in d.columns:
    missing_count = np.sum(d[column_name].isna())
    print("%20s" % column_name, missing_count)

         PassengerId 0
            Survived 0
              Pclass 0
                Name 0
                 Sex 0
                 Age 177
               SibSp 0
               Parch 0
              Ticket 0
                Fare 0
               Cabin 687
            Embarked 2

d.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

plt.hist(d.Age.dropna().values, bins=30);

../_images/9dbe3e3f7e802e005abd09a1ad3ee40a838d0636919b5575e0589775f379d017.png

Remove uninformative columns

# Remove, in place and in this order, the columns we do not use as features.
for unused_column in ("PassengerId", "Name", "Ticket", "Cabin"):
    del d[unused_column]

Fix NaN values

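Observe that we choose a different filling policy for each column: missing `Embarked` values become a new category `"N"`, while missing `Age` values are replaced by the mean age.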

# Categorical column: missing values become their own category "N"
# rather than being assigned to one of the existing ports.
d["Embarked"] = d.Embarked.fillna("N")
# Numerical column: missing values are replaced by the mean of the column.
d["Age"]      = d.Age.fillna(d.Age.mean())
d.head()

	Survived	Pclass	Sex	Age	SibSp	Parch	Fare	Embarked
0	0	3	male	22.0	1	0	7.2500	S
1	1	1	female	38.0	1	0	71.2833	C
2	1	3	female	26.0	0	0	7.9250	S
3	1	1	female	35.0	1	0	53.1000	S
4	0	3	male	35.0	0	0	8.0500	S

plt.hist(d.Age.dropna().values, bins=30);

../_images/0b958dcb1487bba4b0042aa7a92f910e5e7608ed039f9b3b07bf55108176a968.png

Turn categorical columns into a one-hot encoding

def to_onehot(x):
    """Encode a 1-D sequence of categorical values as a one-hot matrix.

    The distinct values of ``x`` are taken in the sorted order returned by
    ``np.unique``; column ``j`` of the result is 1 on the rows whose value
    equals the ``j``-th distinct value and 0 elsewhere.

    Parameters
    ----------
    x : 1-D array-like of categorical values.

    Returns
    -------
    Integer numpy array of shape ``(len(x), number_of_distinct_values)``.
    An empty ``x`` yields an array of shape ``(0, 0)``.
    """
    # return_inverse gives, for every element of x, the position of its value
    # in `values`; this replaces one argwhere scan per element.
    values, codes = np.unique(x, return_inverse=True)
    # reshape(-1) keeps the index array flat regardless of numpy version.
    return np.eye(len(values))[codes.reshape(-1)].astype(int)
    
k = to_onehot(d.Embarked.values)
k[:5]

array([[0, 0, 0, 1],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 0, 1]])

def replace_columns_with_onehot(d, col):
    """Return a new DataFrame where column ``col`` is replaced by its one-hot encoding.

    The indicator columns are named ``<col>_0``, ``<col>_1``, ... (one per
    column produced by ``to_onehot``) and are placed before the remaining
    columns of ``d``. The input frame ``d`` is not modified.
    """
    onehot = to_onehot(d[col].values)
    onehot_names = ["%s_%d" % (col, idx) for idx in range(onehot.shape[1])]
    # Reuse d's index so the join below lines the rows up one-to-one.
    onehot_frame = pd.DataFrame(onehot, columns=onehot_names, index=d.index)
    combined = onehot_frame.join(d)
    # The original categorical column is now redundant.
    del combined[col]
    return combined

d.head()

	Survived	Pclass	Sex	Age	SibSp	Parch	Fare	Embarked
0	0	3	male	22.0	1	0	7.2500	S
1	1	1	female	38.0	1	0	71.2833	C
2	1	3	female	26.0	0	0	7.9250	S
3	1	1	female	35.0	1	0	53.1000	S
4	0	3	male	35.0	0	0	8.0500	S

d = replace_columns_with_onehot(d, "Embarked")
d.head()

	Embarked_0	Embarked_1	Embarked_2	Embarked_3	Survived	Pclass	Sex	Age	SibSp	Parch	Fare
0	0	0	0	1	0	3	male	22.0	1	0	7.2500
1	1	0	0	0	1	1	female	38.0	1	0	71.2833
2	0	0	0	1	1	3	female	26.0	0	0	7.9250
3	0	0	0	1	1	1	female	35.0	1	0	53.1000
4	0	0	0	1	0	3	male	35.0	0	0	8.0500

d = replace_columns_with_onehot(d, "Sex")
d.head()

	Sex_0	Sex_1	Embarked_0	Embarked_1	Embarked_2	Embarked_3	Survived	Pclass	Age	SibSp	Parch	Fare
0	0	1	0	0	0	1	0	3	22.0	1	0	7.2500
1	1	0	1	0	0	0	1	1	38.0	1	0	71.2833
2	1	0	0	0	0	1	1	3	26.0	0	0	7.9250
3	1	0	0	0	0	1	1	1	35.0	1	0	53.1000
4	0	1	0	0	0	1	0	3	35.0	0	0	8.0500

d.shape, d.values.sum()

((891, 12), 60142.86312352941)

Put all transformations together#

def clean_titanic(d):
    """Return a cleaned, fully numeric-ready copy of a raw Titanic DataFrame.

    Steps, in order:
      1. drop the ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin`` columns,
      2. fill missing ``Embarked`` values with the extra category ``"N"``,
      3. fill missing ``Fare`` and ``Age`` values with the column mean,
      4. replace ``Embarked`` and ``Sex`` by one-hot indicator columns.

    The input frame is left untouched, so the function can be called again
    on the same raw data (e.g. when re-running a notebook cell).

    Parameters
    ----------
    d : pandas.DataFrame with at least the columns named above.

    Returns
    -------
    A new pandas.DataFrame with the transformations applied.
    """
    # Work on a copy: deleting columns directly on `d` would alter the
    # caller's DataFrame and make a second call fail with KeyError.
    d = d.copy()
    for unused_column in ("PassengerId", "Name", "Ticket", "Cabin"):
        del d[unused_column]
    d["Embarked"] = d.Embarked.fillna("N")
    d["Fare"] = d.Fare.fillna(d.Fare.mean())
    d["Age"] = d.Age.fillna(d.Age.mean())
    d = replace_columns_with_onehot(d, "Embarked")
    d = replace_columns_with_onehot(d, "Sex")
    return d

transform train and test data together

Observe that the test data does not have a `Survived` column: that is the value we must predict and submit to Kaggle.

dtr = pd.read_csv("train.csv")
dts = pd.read_csv("test.csv")
lentr = len(dtr)
dtr.shape, dts.shape

((891, 12), (418, 11))

dts.head()

	PassengerId	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	892	3	Kelly, Mr. James	male	34.5	0	0	330911	7.8292	NaN	Q
1	893	3	Wilkes, Mrs. James (Ellen Needs)	female	47.0	1	0	363272	7.0000	NaN	S
2	894	2	Myles, Mr. Thomas Francis	male	62.0	0	0	240276	9.6875	NaN	Q
3	895	3	Wirz, Mr. Albert	male	27.0	0	0	315154	8.6625	NaN	S
4	896	3	Hirvonen, Mrs. Alexander (Helga E Lindqvist)	female	22.0	1	1	3101298	12.2875	NaN	S

get data ready for training

# Every column except the target "Survived" is used as model input.
source_cols = [i for i in dtr.columns if i!="Survived"]
# Stack train on top of test so that both go through the same cleaning:
# the one-hot encoding then sees the same set of categories for both parts.
# Note that the means used to fill NaNs are therefore computed over train and test together.
all_data = pd.concat((dtr[source_cols], dts[source_cols]))
# Renumber rows 0..n-1 so positions in the stacked frame are unambiguous.
all_data.index = range(len(all_data))
all_data = clean_titanic(all_data)

# The first `lentr` rows are the training part; the remaining rows are the test part.
Xtr, ytr = all_data.iloc[:lentr].values, dtr["Survived"].values
Xts      = all_data.iloc[lentr:].values

print (Xtr.shape, ytr.shape)
print (Xts.shape)

(891, 11) (891,)
(418, 11)

cross validate for model selection

# Compare two candidate classifiers by their cross-validated scores on the training data.
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
# Both models are created with the library's default hyperparameters;
# each cross_val_score call prints one score per fold.
rf = RandomForestClassifier()
print (cross_val_score(rf, Xtr, ytr))

# NOTE(review): the features are not rescaled before fitting the SVC, which
# presumably contributes to its lower scores — confirm before drawing conclusions.
svc = SVC()
print (cross_val_score(svc, Xtr, ytr))

[0.77094972 0.81460674 0.84831461 0.79213483 0.83707865]
[0.59217877 0.71348315 0.69101124 0.68539326 0.69101124]

now train with full dataset and generate submission for Kaggle

rf.fit(Xtr, ytr)
preds_ts = rf.predict(Xts)
preds_ts

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1])

get predictions ready to submit to Kaggle

see https://www.kaggle.com/c/titanic#evaluation for file format

# Build the two-column frame Kaggle expects (PassengerId, Survived).
# The columns are assembled from plain arrays so that ids and predictions are
# paired strictly by position (no reliance on index alignment) and each
# column keeps its own dtype instead of going through a row-wise frame + transpose.
submission = pd.DataFrame({
    "PassengerId": dts.PassengerId.values,
    "Survived": preds_ts,
})
submission.head()

	PassengerId	Survived
0	892	0
1	893	0
2	894	0
3	895	1
4	896	0

submission.to_csv("titanic_kaggle.csv", index=False)

!head titanic_kaggle.csv

PassengerId,Survived
892,0
893,0
894,0
895,1
896,0
897,0
898,0
899,0
900,1

05.04 - PARTICIPATE IN KAGGLE

Contents

05.04 - PARTICIPATE IN KAGGLE#

We use Titanic data in Kaggle#

Put all transformations together#

We use Titanic data in Kaggle #