In [21]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import decomposition
from sklearn.linear_model import RidgeCV
# Import the 3 dimensionality reduction methods
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
In [36]:
#import the MNIST (Modified National Institute of Standards and Technology) dataset, a collection of
#handwritten digits stored as images, each normalized to 28x28 pixels, which gives us 784 predictors.
#I will use PCA and t-SNE to perform dimensionality reduction on this dataset.
#The 'curse of dimensionality' refers to the problem that the number of observations needed to cover the
#feature space grows exponentially with the number of features, which quickly becomes infeasible;
#this is why reducing the number of features matters (see the quick illustration in the next cell).

data = pd.read_csv(r'/Users/shivambadkas/Downloads/train.csv')    # labeled training set (785 columns)
target = pd.read_csv(r'/Users/shivambadkas/Downloads/test.csv')   # unlabeled test set (784 pixel columns, no 'label')
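A quick back-of-the-envelope illustration of the exponential growth mentioned above; the choice of 10 bins per axis is an arbitrary assumption, only the shape of the growth matters.

In [ ]:
#Covering the feature space at a fixed resolution needs bins**d cells, so the number of observations
#required to populate them explodes as the dimension d grows.
bins = 10
for d in (1, 2, 3, 10, 784):
    print(f'd = {d:>3}: {bins}**{d} grid cells needed to cover the space')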
In [37]:
#see data info. 
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Columns: 785 entries, label to pixel783
dtypes: int64(785)
memory usage: 251.5 MB
In [38]:
target.head()
Out[38]:
pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 pixel9 ... pixel774 pixel775 pixel776 pixel777 pixel778 pixel779 pixel780 pixel781 pixel782 pixel783
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 784 columns

In [23]:
y = data['label']
X = data.drop(['label'], axis=1)
In [29]:
#PCA reduces dimensionality by projecting the original features onto a smaller set of principal components:
#new, uncorrelated columns that are linear combinations of the old columns, ordered by how much of the
#variance they explain (a manual sketch of this is shown after this cell).
#First scale the data so every pixel column has zero mean and unit variance.
from sklearn.preprocessing import StandardScaler
X_scaled = StandardScaler().fit_transform(X)

pca = decomposition.PCA(n_components=2)
X_pca_scaled = pca.fit_transform(X_scaled)

print('Projecting %d-dimensional data to 2D' % X_scaled.shape[1])

plt.figure(figsize=(12,10))
plt.scatter(X_pca_scaled[:, 0], X_pca_scaled[:, 1], c=data['label'], alpha=0.7, s=40);
plt.colorbar()
plt.title('MNIST. PCA projection');
Projecting 784-dimensional data to 2D
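To make "new columns that summarize the old columns" concrete, here is a minimal sketch of PCA done by hand with an SVD of the centered data; the 2000-row subset and the comparison against a freshly fitted sklearn PCA are illustrative assumptions, not part of the original analysis.

In [ ]:
#PCA 'by hand': the rows of Vt from the SVD of the centered data are the principal components
#(directions in the 784-dimensional pixel space), and projecting onto the first two of them
#is exactly what pca.fit_transform does.
X_small = X_scaled[:2000]
X_centered = X_small - X_small.mean(axis=0)
U, S, Vt = np.linalg.svd(X_centered, full_matrices=False)
manual_proj = X_centered @ Vt[:2].T              # project onto the first two components

sk_proj = decomposition.PCA(n_components=2, svd_solver='full').fit_transform(X_small)
#the two projections should agree up to a sign flip per component
print(np.allclose(np.abs(manual_proj), np.abs(sk_proj), atol=1e-6))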
In [33]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca_result = pca.fit_transform(X)

print(pca.explained_variance_ratio_)
[0.09748938 0.07160266]
In [34]:
print(X.shape)
print(pca_result.shape)
(42000, 784)
(42000, 2)
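The two retained components explain only about 17% of the variance (0.097 + 0.072 from the output above), so a lot of information is thrown away. Below is a minimal sketch of what that loss looks like, using PCA.inverse_transform to map the 2-D projection back to pixel space; showing the first 5 digits is an arbitrary choice.

In [ ]:
#Reconstruct digits from just the 2 retained components and compare with the originals;
#with ~17% of the variance kept, the reconstructions are very blurry.
X_reconstructed = pca.inverse_transform(pca_result)

fig, axes = plt.subplots(2, 5, figsize=(10, 4))
for i in range(5):
    axes[0, i].imshow(X.iloc[i].values.reshape(28, 28), cmap='gray')
    axes[0, i].set_title(f'original ({y.iloc[i]})')
    axes[1, i].imshow(X_reconstructed[i].reshape(28, 28), cmap='gray')
    axes[1, i].set_title('2-component PCA')
for ax in axes.ravel():
    ax.axis('off')
plt.show()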
In [41]:
#this plot shows that approximately 150 principal components are enough to explain more than 95% of the variance
#in the dataset (a programmatic check follows in the next cell)
pca = PCA(300)
pca_full = pca.fit(X)

plt.plot(np.cumsum(pca_full.explained_variance_ratio_))
plt.xlabel('# of components')
plt.ylabel('Cumulative explained variance')
Out[41]:
Text(0, 0.5, 'Cumulative explained variance')
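A minimal check of the ~150-component figure, reusing the pca_full fit from the previous cell; note that sklearn can also do this directly by passing a float, e.g. PCA(n_components=0.95).

In [ ]:
#find the smallest number of components whose cumulative explained variance reaches 95%
cum_var = np.cumsum(pca_full.explained_variance_ratio_)
n_95 = np.argmax(cum_var >= 0.95) + 1
print(f'{n_95} components explain {cum_var[n_95 - 1]:.2%} of the variance')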
In [45]:
#t-SNE (t-distributed Stochastic Neighbor Embedding) is a non-linear method and is much slower than PCA,
#so run it on the first 1000 rows only
data_1000 = X_scaled[0:1000,:]
labels_1000 = y[0:1000]

model = TSNE(n_components=2, random_state=0)

tsne_data = model.fit_transform(data_1000)

tsne_data = np.vstack((tsne_data.T, labels_1000)).T
tsne_df = pd.DataFrame(data = tsne_data, columns=("Dim_1", "Dim_2", "label"))
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/manifold/_t_sne.py:795: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.
  warnings.warn(
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/manifold/_t_sne.py:805: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
  warnings.warn(
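The FutureWarnings above only note that TSNE's defaults will change in scikit-learn 1.2; a minimal sketch of passing those arguments explicitly (using the new defaults the warnings mention) so the embedding does not silently change across versions:

In [ ]:
#pass init and learning_rate explicitly to pin the behaviour and silence the FutureWarnings
model = TSNE(n_components=2, random_state=0, init='pca', learning_rate='auto')
tsne_data_explicit = model.fit_transform(data_1000)
print(tsne_data_explicit.shape)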
In [47]:
#Plotting the results
import seaborn as sns
sns.FacetGrid(tsne_df, hue='label', height=6).map(plt.scatter, 'Dim_1', 'Dim_2').add_legend()
plt.show()
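LDA was imported at the top as the third dimensionality reduction method but not used above; a minimal sketch of applying it to the same 1000-row subset follows. Unlike PCA and t-SNE it is supervised, i.e. it uses the labels to choose the projection; the plot styling simply mirrors the PCA scatter above.

In [ ]:
#LDA looks for the 2 directions that best separate the digit classes
lda = LDA(n_components=2)
lda_data = lda.fit_transform(data_1000, labels_1000)

plt.figure(figsize=(12, 10))
plt.scatter(lda_data[:, 0], lda_data[:, 1], c=labels_1000, alpha=0.7, s=40)
plt.colorbar()
plt.title('MNIST. LDA projection');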
In [ ]: