# スタンフォード/CS131/宿題6-5 復元エラーとk近傍法+主成分分析

スポンサーリンク

## Reconstruction error and captured variance¶

We can plot the reconstruction error with respect to the dimension of the projected space.

The reconstruction gets better with more components.

We can see in the plot that the inflexion point is around dimension 200 or 300. This means that using this number of components is a good compromise between good reconstruction and low dimension.

import numpy as np
from skimage import io
import scipy
import scipy.linalg
from collections import defaultdict
import matplotlib.pyplot as plt
from matplotlib import rc

%matplotlib inline
plt.rcParams['figure.figsize'] = (15.0, 12.0) # set default size of plots
plt.rcParams["font.size"] = "17"
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

X_train, y_train, classes_train = load_dataset('faces', train=True, as_gray=True)
X_test, y_test, classes_test = load_dataset('faces', train=False, as_gray=True)
assert classes_train == classes_test
classes = classes_train
num_classes = len(classes)

# Flatten the image data into rows
# we now have one 4096 dimensional featue vector for each example
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))

class PCA(object):
"""Class implementing Principal component analysis (PCA).
Steps to perform PCA on a matrix of features X:
1. Fit the training data using method fit (either with eigen decomposition of SVD)
2. Project X into a lower dimensional space using method transform
3. Optionally reconstruct the original X (as best as possible) using method reconstruct
"""
def __init__(self):
self.W_pca = None
self.mean = None

def fit(self, X, method='svd'):
"""Fit the training data X using the chosen method.
Will store the projection matrix in self.W_pca and the mean of the data in self.mean
Args:
X: numpy array of shape (N, D). Each of the N rows represent a data point.
Each data point contains D features.
method: Method to solve PCA. Must be one of 'svd' or 'eigen'.
"""
_, D = X.shape
self.mean = None   # empirical mean, has shape (D,)
X_centered = None  # zero-centered data
# 1. Compute the mean and store it in self.mean
# 2. Apply either method to X_centered
self.mean = np.mean(X, axis=0)
X_centered = X - self.mean
if method == 'svd':
self.W_pca = self._svd(X_centered)[0]
else:
self.W_pca = self._eigen_decomp(X_centered)[0]
# Make sure that X_centered has mean zero
assert np.allclose(X_centered.mean(), 0.0)
# Make sure that self.mean is set and has the right shape
assert self.mean is not None and self.mean.shape == (D,)
# Make sure that self.W_pca is set and has the right shape
assert self.W_pca is not None and self.W_pca.shape == (D, D)
# Each column of self.W_pca should have norm 1 (each one is an eigenvector)
for i in range(D):
assert np.allclose(np.linalg.norm(self.W_pca[:, i]), 1.0)

def _svd(self, X):
"""Performs Singular Value Decomposition (SVD) of X.
Args:
X: Zero-centered data array, each ROW containing a data point.
Numpy array of shape (N, D).
Returns:
vecs: right singular vectors. Numpy array of shape (D, D)
vals: singular values. Numpy array of shape (K,) where K = min(N, D)
"""
vecs = None  # shape (D, D)
N, D = X.shape
vals = None  # shape (K,)
# Here, compute the SVD of X
# Make sure to return vecs as the matrix of vectors where each column is a singular vector
_, vals, vecs = np.linalg.svd(X)
assert vecs.shape == (D, D)
K = min(N, D)
assert vals.shape == (K,)
return vecs.T, vals

def transform(self, X, n_components):
"""Center and project X onto a lower dimensional space using self.W_pca.
Args:
X: numpy array of shape (N, D). Each row is an example with D features.
n_components: number of principal components..
Returns:
X_proj: numpy array of shape (N, n_components).
"""
N, _ = X.shape
X_proj = None
# We need to modify X in two steps:
#     1. first substract the mean stored during fit
#     2. then project onto a subspace of dimension n_components using self.W_pca
X_proj = np.dot(X-self.mean,self.W_pca[:,:n_components])
assert X_proj.shape == (N, n_components), "X_proj doesn't have the right shape"
return X_proj

def reconstruct(self, X_proj):
"""Do the exact opposite of method transform: try to reconstruct the original features.
Given the X_proj of shape (N, n_components) obtained from the output of transform,
we try to reconstruct the original X.
Args:
X_proj: numpy array of shape (N, n_components). Each row is an example with D features.
Returns:
X: numpy array of shape (N, D).
"""
N, n_components = X_proj.shape
X = None
# Steps:
#     1. project back onto the original space of dimension D
#     2. add the mean that we substracted in transform
X = np.dot(X_proj,self.W_pca[:,:n_components].T) \
+ self.mean
return X

pca = PCA()
pca.fit(X_train)
# Plot reconstruction errors for different k
N = X_train.shape[0]
d = X_train.shape[1]

ns = range(1, d, 100)
errors = []

for n in ns:
X_proj = pca.transform(X_train, n)
X_rec = pca.reconstruct(X_proj)

# Compute reconstruction error
error = np.mean((X_rec - X_train) ** 2)
errors.append(error)

plt.plot(ns, errors)
plt.xlabel('Number of Components')
plt.ylabel('Reconstruction Error')
plt.show()


We can do the same process to see how much variance is captured by the projection.

Again, we see that the inflexion point is around 200 or 300 dimensions.
この場合も前回と同じように、変曲点は200か300次元周辺であることを視認できる。

u, s = pca._svd(X_train - X_train.mean(axis=0))
# Plot captured variance
ns = range(1, d, 100)
var_cap = []

for n in ns:
var_cap.append(np.sum(s[:n] ** 2)/np.sum(s ** 2))

plt.plot(ns, var_cap)
plt.xlabel('Number of Components')
plt.ylabel('Variance Captured')
plt.show()

スポンサーリンク

## kNN with PCA¶

Performing kNN on raw features (the pixels of the image) does not yield very good results.

Computing the distance between images in the image space is not a very good metric for actual proximity of images.
For instance, an image of person A with a dark background will be close to an image of B with a dark background, although these people are not the same.

Using a technique like PCA can help discover the real interesting features and perform kNN on them could give better accuracy.
PCAのような技術の利用が、実際に関心のある特徴の発見を促し、それらの特徴にKNNを施すことがより良好な正確度を与えてくれる。

However, we observe here that PCA doesn’t really help to disentangle the features and obtain useful distance metrics between the different classes. We basically obtain the same performance as with raw features.
しかしながら、ここでは、PCAが、実際には、特徴を探し出して異なるクラス間の有効な距離尺度を得る手助けにならないことに気付かされる。おおむね原特徴によるのと同じ性能しか得られない。

from k_nearest_neighbor import *

num_test = X_test.shape[0]

# We computed the best k and n for you
best_k = 20
best_n = 500

# PCA
pca = PCA()
pca.fit(X_train)
X_proj = pca.transform(X_train, best_n)
X_test_proj = pca.transform(X_test, best_n)

# kNN
dists = compute_distances(X_test_proj, X_proj)
y_test_pred = predict_labels(dists, y_train, k=best_k)

# Compute and display the accuracy
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))

Got 42 / 160 correct => accuracy: 0.262500


スポンサーリンク
スポンサーリンク

フォローする