06.03 - PCA, NMF IN PRACTICE¶
!wget --no-cache -O init.py -q https://raw.githubusercontent.com/rramosp/ai4eng.v1/main/content/init.py
import init; init.init(force_download=False); init.get_weblink()
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
Dimensionality reduction for classification tasks¶
mnist = pd.read_csv("local/data/mnist1.5k.csv.gz", compression="gzip", header=None).values
d=mnist[:,1:785]
c=mnist[:,0]
print ("dimension de las imagenes y las clases", d.shape, c.shape)
dimension de las imagenes y las clases (1500, 784) (1500,)
plt.imshow(d[9].reshape(28,28), cmap=plt.cm.gray)
<matplotlib.image.AxesImage at 0x7fa29c7058e0>
perm = np.random.permutation(range(d.shape[0]))[0:50]
random_imgs = d[perm]
random_labels = c[perm]
fig = plt.figure(figsize=(10,6))
for i in range(random_imgs.shape[0]):
    ax = fig.add_subplot(5, 10, i+1)
    plt.imshow(random_imgs[i].reshape(28,28), interpolation="nearest", cmap=plt.cm.Greys_r)
    ax.set_title(int(random_labels[i]))
    ax.set_xticklabels([])
    ax.set_yticklabels([])
Principal Component Analysis¶
from sklearn.decomposition import PCA
mnist = pd.read_csv("local/data/mnist1.5k.csv.gz", compression="gzip", header=None).values
X=mnist[:,1:785]
y=mnist[:,0]
pca = PCA(n_components=10)
Xp = pca.fit_transform(X)
X.shape, Xp.shape
((1500, 784), (1500, 10))
for i in np.unique(y):
    print (i, np.sum(y==i))
0 150
1 157
2 186
3 125
4 151
5 138
6 152
7 154
8 141
9 146
from sklearn.model_selection import train_test_split
Xtr, Xts, ytr, yts = train_test_split(X,y,test_size=.3)
Xtr.shape, Xts.shape, ytr.shape, yts.shape
((1050, 784), (450, 784), (1050,), (450,))
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
# baseline: Gaussian Naive Bayes on the raw 784-pixel representation
dt = GaussianNB()
dt.fit(Xtr, ytr)
dt.score(Xtr, ytr), dt.score(Xts, yts)
(0.72, 0.6177777777777778)
cs = range(10,200,5)
dtr, dts = [], []
for n_components in cs:
    print (".", end="")
    # fit PCA on the training set only, then project both train and test
    pca = PCA(n_components=n_components)
    pca.fit(Xtr)
    Xt_tr = pca.transform(Xtr)
    Xt_ts = pca.transform(Xts)
    dt.fit(Xt_tr, ytr)
    ypreds_tr = dt.predict(Xt_tr)
    ypreds_ts = dt.predict(Xt_ts)
    dtr.append(np.mean(ytr==ypreds_tr))
    dts.append(np.mean(yts==ypreds_ts))
......................................
len(dtr), len(dts)
(38, 38)
plt.plot(cs, dtr, label="train")
plt.plot(cs, dts, label="test")
plt.xlabel("n components")
plt.ylabel("accuracy")
plt.legend()
<matplotlib.legend.Legend at 0x7f5316c9c790>
best_cs = cs[np.argmax(dts)]
best_cs
60
classification in the new representation space¶
pca = PCA(n_components=best_cs)
pca.fit(Xtr)
Xt_tr = pca.transform(Xtr)
Xt_ts = pca.transform(Xts)
dt.fit(Xt_tr,ytr)
ypreds_tr = dt.predict(Xt_tr)
ypreds_ts = dt.predict(Xt_ts)
ypreds_tr.shape, ypreds_ts.shape
np.mean(ytr==ypreds_tr),np.mean(yts==ypreds_ts)
(0.9047619047619048, 0.8355555555555556)
pipelines¶
we must be careful when using transformations for classification: they have to be fitted (in an unsupervised way) using only the training data
from sklearn.pipeline import Pipeline
estimator = Pipeline([("pca", PCA(n_components=best_cs)), ("naive", dt)])
estimator.fit(Xtr, ytr)
estimator.score(Xtr, ytr), estimator.score(Xts, yts)
(0.9057142857142857, 0.8333333333333334)
from sklearn.model_selection import cross_val_score
pip = Pipeline([("PCA", PCA(n_components=best_cs)), ("gaussian", GaussianNB())])
scores = cross_val_score(pip, X,y, cv=5 )
print ("%.2f +/- %.4f"%(np.mean(scores), np.std(scores)))
0.84 +/- 0.0168
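To see why the PCA fit must happen only inside each training fold, here is a minimal sketch (reusing X, y and best_cs from above) contrasting a leaky setup, where PCA is fitted on all the data before cross-validation, with the pipeline used above, which refits PCA on each training fold:
# leaky: the transformation sees the test folds before cross-validation
X_leaky = PCA(n_components=best_cs).fit_transform(X)
leaky_scores = cross_val_score(GaussianNB(), X_leaky, y, cv=5)
# correct: the pipeline refits PCA inside each training fold
pip_clean = Pipeline([("pca", PCA(n_components=best_cs)), ("nb", GaussianNB())])
clean_scores = cross_val_score(pip_clean, X, y, cv=5)
print ("leaky %.3f   clean %.3f" % (np.mean(leaky_scores), np.mean(clean_scores)))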
we obtain the principal components¶
cols=20
plt.figure(figsize=(15,3))
for i in range(len(pca.components_)):
    plt.subplot(int(np.ceil(len(pca.components_)/15.)), 15, i+1)
    plt.imshow(pca.components_[i].reshape(28,28), cmap=plt.cm.Greys_r)
    plt.xticks([]); plt.yticks([])
we verify the reconstruction with the principal components¶
pca = PCA(n_components=best_cs)
pca.fit(Xtr)
Xp = pca.transform(X)
plt.figure(figsize=(10,6))
for i in range(6):
    plt.subplot(3,6,i+1)
    k = np.random.randint(len(X))
    # approximate reconstruction as a weighted sum of the principal components
    # (this leaves out the mean image that PCA subtracts before projecting)
    plt.imshow((np.sum(pca.components_*Xp[k].reshape(-1,1), axis=0)).reshape(28,28), cmap=plt.cm.Greys_r)
    plt.xticks([]); plt.yticks([])
    plt.subplot(3,6,6+i+1)
    plt.imshow(X[k].reshape(28,28), cmap=plt.cm.Greys_r)
    plt.xticks([]); plt.yticks([])
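The sum above omits the mean image that PCA subtracts before projecting; a minimal sketch of the complete reconstruction, adding pca.mean_ back (equivalent to pca.inverse_transform):
k = np.random.randint(len(X))
Xrec = Xp[k] @ pca.components_ + pca.mean_    # full reconstruction, including the mean image
plt.figure(figsize=(4,2))
plt.subplot(1,2,1); plt.imshow(Xrec.reshape(28,28), cmap=plt.cm.Greys_r); plt.xticks([]); plt.yticks([])
plt.subplot(1,2,2); plt.imshow(X[k].reshape(28,28), cmap=plt.cm.Greys_r); plt.xticks([]); plt.yticks([])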
observe the new representation of the first image¶
X[0]
array([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 188, 255, 94, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 191, 250, 253, 93, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 123, 248, 253, 167, 10, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 80, 247, 253, 208, 13, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 29, 207, 253, 235, 77, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 54, 209, 253, 253, 88, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 93, 254, 253, 238, 170,
17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 23, 210, 254, 253,
159, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 209, 253,
254, 240, 81, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27,
253, 253, 254, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
20, 206, 254, 254, 198, 7, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 168, 253, 253, 196, 7, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 20, 203, 253, 248, 76, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 22, 188, 253, 245, 93, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 103, 253, 253, 191, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 89, 240, 253, 195, 25, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 15, 220, 253, 253, 80,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 94, 253, 253,
253, 94, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 89,
251, 253, 250, 131, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 214, 218, 95, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0])
Xp[0]
array([-602.96544279, 709.91417575, 48.46393261, -94.3578192 ,
-247.5969329 , -429.06089585, -197.60517121, 531.22635527,
439.83674877, -143.65598588, -350.95470187, 172.60373135,
-172.73696352, -281.72027853, -126.34179342, 161.73658414,
-81.78527347, -413.79147679, 47.59742414, -237.42715084,
223.18870607, -258.81025144, 43.07849287, -144.98480884,
-56.84875879, 15.95569482, -37.29026752, -169.61029386,
290.04985091, -18.09769242, -156.47270457, 115.19250934,
-92.38887583, 97.42047216, -165.2477172 , 195.55535288,
202.84586027, 19.77436206, -103.13657262, -196.06966884,
124.32311744, 31.23761748, -38.67664165, 35.31148678,
70.87604485, 186.20659945, -128.96485438, 66.94065907,
-149.08882182, -46.65103309, -4.09849795, 111.30817383,
-52.76418032, -40.0893952 , -70.94692439, -105.78083452,
-23.79724802, 152.12095565, 59.00594154, 7.56383813])
Each of these 60 values is the coordinate of the first image along one of the principal components shown above.
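As a quick check (a sketch, assuming the pca object fitted above with best_cs components is still in scope), these coordinates are simply the centered image projected onto the principal components:
manual = (X[0] - pca.mean_) @ pca.components_.T   # project the centered image onto the components
print (np.allclose(manual, Xp[0]))                # expected to print True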
Non negative matrix factorization¶
We decompose a matrix \(V \in \mathbb{R}_+^{m\times n}\) into the product \(W \times H\), with \(W \in \mathbb{R}_+^{m\times r}\) and \(H \in \mathbb{R}_+^{r\times n}\), under the constraint that every entry is non-negative (\(\in \mathbb{R}_+\)), so that:
The rows of \(H\) are the basis components, and the decomposition is obtained by posing it as a constrained mathematical optimization problem.
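Without regularization, the decomposition is the solution of the constrained problem:
$$\begin{split} \underset{W,H}{argmin}\;& ||V-W\times H||\\ s.t.&\;W,H \in \mathbb{R}_+ \end{split}$$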
from IPython.display import Image
Image(filename='local/imgs/nmf.png')
we obtain the decomposition¶
from sklearn.decomposition import NMF
X=mnist[:,1:785]; y=mnist[:,0]
nmf = NMF(n_components=15, init="random")
Xn = nmf.fit_transform(X)
cols=20
plt.figure(figsize=(15,3))
for i in range(len(nmf.components_)):
    plt.subplot(int(np.ceil(len(nmf.components_)/15.)), 15, i+1)
    plt.imshow(np.abs(nmf.components_[i].reshape(28,28)), cmap=plt.cm.Greys_r)
    plt.xticks([]); plt.yticks([])
Xn[0,:]
array([0. , 0. , 4.95514551, 0. , 0. ,
6.08083653, 1.91867923, 0.09777909, 0. , 0. ,
0. , 8.73915575, 0. , 3.50861945, 0. ])
we verify the reconstruction¶
plt.figure(figsize=(10,6))
for i in range(6):
    plt.subplot(3,6,i+1)
    k = np.random.randint(len(X))
    # reconstruction: weighted sum of the basis components, X[k] ≈ Xn[k] @ H
    plt.imshow(np.abs(np.sum(nmf.components_*Xn[k].reshape(-1,1), axis=0)).reshape(28,28), cmap=plt.cm.Greys_r)
    plt.xticks([]); plt.yticks([])
    plt.subplot(3,6,6+i+1)
    plt.imshow(X[k].reshape(28,28), cmap=plt.cm.Greys_r)
    plt.xticks([]); plt.yticks([])
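To put a number on the reconstruction quality, a small sketch computing the relative error of the full dataset reconstruction X ≈ Xn × H:
Xrec = Xn @ nmf.components_                       # reconstruct all images at once
rel_err = np.linalg.norm(X - Xrec) / np.linalg.norm(X)
print ("relative reconstruction error: %.3f" % rel_err)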
we classify in the new representation space¶
print (np.mean(cross_val_score(GaussianNB(), X,y, cv=5 )))
print (np.mean(cross_val_score(GaussianNB(), Xn,y, cv=5 )))
0.5953333333333334
0.7733333333333333
the first image in the new representation space¶
note that all of its components are non-negative
plt.imshow(X[0].reshape(28,28))
<matplotlib.image.AxesImage at 0x7f5315e32190>
Xn[0]
array([0. , 0. , 4.95514551, 0. , 0. ,
6.08083653, 1.91867923, 0.09777909, 0. , 0. ,
0. , 8.73915575, 0. , 3.50861945, 0. ])
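A one-line check that the whole transformed dataset is non-negative:
print (np.all(Xn >= 0))    # True: NMF never produces negative coefficients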
cols=20
plt.figure(figsize=(15,3))
for i in range(len(nmf.components_)):
    plt.subplot(int(np.ceil(len(nmf.components_)/15.)), 15, i+1)
    plt.imshow(np.abs(nmf.components_[i].reshape(28,28)), cmap=plt.cm.Greys_r)
    plt.xticks([]); plt.yticks([])
NMF for face recognition¶
import numpy as np
faces = np.load("local/data/faces.npy")
plt.figure(figsize=(15,2))
for i in range(30):
    plt.subplot(2,15,i+1)
    plt.imshow(faces[np.random.randint(len(faces))].reshape(19,19), cmap=plt.cm.Greys_r)
    plt.xticks([]); plt.yticks([])
nmf = NMF(n_components=30, init="random")
faces_n = nmf.fit_transform(faces)
cols=20
plt.figure(figsize=(15,2))
for i in range(len(nmf.components_)):
    plt.subplot(int(np.ceil(len(nmf.components_)/15.)), 15, i+1)
    plt.imshow(np.abs(nmf.components_[i].reshape(19,19)), cmap=plt.cm.Greys)
    plt.xticks([]); plt.yticks([])
we force sparsity in the components by extending the optimization problem with the \(L_1\) norm of the basis components.
we could also force sparsity in the new representation:
$$\begin{split} \underset{W,H}{argmin}\;& ||V-W\times H|| + ||W||_1\\ s.t.&\;W,H \in \mathbb{R}_+ \end{split}$$
nmf = NMF(n_components=30, init="nndsvd", alpha=1000, l1_ratio=1)
faces_n = nmf.fit_transform(faces)
cols=20
plt.figure(figsize=(15,2))
print (np.sum(nmf.components_))
for i in range(len(nmf.components_)):
    plt.subplot(int(np.ceil(len(nmf.components_)/15.)), 15, i+1)
    plt.imshow(np.abs(nmf.components_[i].reshape(19,19)), cmap=plt.cm.Greys)
    plt.xticks([]); plt.yticks([])
14650.57060382788
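A rough way to see the effect of the \(L_1\) penalty (a sketch; the 1e-6 threshold is arbitrary) is the fraction of near-zero entries in the basis components, which should grow as alpha grows:
sparsity = np.mean(nmf.components_ < 1e-6)   # fraction of (near-)zero entries in H
print ("fraction of zero entries in the components: %.2f" % sparsity)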