import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn import decomposition
# Import the dimensionality reduction methods used below (PCA and t-SNE)
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
# Import the MNIST (Modified National Institute of Standards and Technology)
# dataset, a collection of handwritten digits in image form. Each image is
# normalized to 28x28 pixels, which gives us 784 predictors. I will use PCA
# and t-SNE to perform dimensionality reduction on this dataset.
# The 'curse of dimensionality' refers to the problem that the number of
# observations needed grows exponentially as the number of features increases,
# which quickly becomes infeasible; this is why reducing the number of features matters.
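# A small numeric illustration of this (an added sketch, plain numpy): as the
# dimension grows, the gap between the nearest and farthest point shrinks
# relative to the average distance, so neighborhoods become less informative.
rng = np.random.default_rng(0)
for d in (2, 10, 100, 784):
    pts = rng.random((500, d))                        # 500 uniform points in d dimensions
    dists = np.linalg.norm(pts - pts[0], axis=1)[1:]  # distances from one reference point
    spread = (dists.max() - dists.min()) / dists.mean()
    print('d=%4d  relative spread of distances: %.3f' % (d, spread))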
data = pd.read_csv(r'/Users/shivambadkas/Downloads/train.csv')  # labeled training set
test = pd.read_csv(r'/Users/shivambadkas/Downloads/test.csv')   # unlabeled test set
#see data info.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Columns: 785 entries, label to pixel783
dtypes: int64(785)
memory usage: 251.5 MB
test.head()
   pixel0  pixel1  pixel2  pixel3  pixel4  ...  pixel779  pixel780  pixel781  pixel782  pixel783
0       0       0       0       0       0  ...         0         0         0         0         0
1       0       0       0       0       0  ...         0         0         0         0         0
2       0       0       0       0       0  ...         0         0         0         0         0
3       0       0       0       0       0  ...         0         0         0         0         0
4       0       0       0       0       0  ...         0         0         0         0         0

[5 rows x 784 columns]
y = data['label']
X = data.drop(['label'], axis=1)
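# As a quick sanity check (an added sketch), reshape one flattened row back to
# 28x28 to see the digit that the 784 pixel columns encode.
plt.figure(figsize=(3, 3))
plt.imshow(X.iloc[0].values.reshape(28, 28), cmap='gray')
plt.title('label = %d' % y.iloc[0])
plt.axis('off')
plt.show()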
# PCA reduces dimensionality by projecting the original features onto a smaller
# set of principal components: new, uncorrelated columns (linear combinations of
# the original pixels) ordered by how much of the data's variance they capture.
# Scale the data first so every pixel contributes on the same range.
from sklearn.preprocessing import StandardScaler
X_scaled = StandardScaler().fit_transform(X)
pca = decomposition.PCA(n_components=2)
X_pca_scaled = pca.fit_transform(X_scaled)
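# To make "new columns that summarize the old ones" concrete, here is an added
# sketch (not part of the original analysis): the principal components are the
# top eigenvectors of the covariance matrix of the scaled data, and the
# eigenvalue ratios should match sklearn's explained_variance_ratio_.
cov = np.cov(X_scaled, rowvar=False)          # 784x784 covariance matrix
eigvals, eigvecs = np.linalg.eigh(cov)        # eigh: symmetric input, ascending order
top2 = eigvecs[:, ::-1][:, :2]                # the two largest-variance directions
manual_2d = X_scaled @ top2                   # manual 2-D projection; matches X_pca_scaled up to sign
print(eigvals[::-1][:2] / eigvals.sum())      # should agree with pca.explained_variance_ratio_
print(pca.explained_variance_ratio_)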
print('Projecting %d-dimensional data to 2D' % X_scaled.shape[1])
plt.figure(figsize=(12,10))
plt.scatter(X_pca_scaled[:, 0], X_pca_scaled[:, 1], c=data['label'], alpha=0.7, s=40);
plt.colorbar()
plt.title('MNIST. PCA projection');
Projecting 784-dimensional data to 2D
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(X)
print(pca.explained_variance_ratio_)
[0.09748938 0.07160266]
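# The first two components together explain only about 17% of the variance
# (0.097 + 0.072), so the 2-D projection above is a very coarse summary.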
print(X.shape)
print(pca_result.shape)
(42000, 784)
(42000, 2)
# This plot shows that roughly 150 principal components are enough to explain
# more than 95% of the variance in the dataset.
pca = PCA(300)
pca_full = pca.fit(X)
plt.plot(np.cumsum(pca_full.explained_variance_ratio_))
plt.xlabel('# of components')
plt.ylabel('Cumulative explained variance')
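# Rather than eyeballing the curve, we can read the 95% point off the cumulative
# sum, or let sklearn pick the count directly (an added sketch; PCA accepts a
# variance fraction as n_components).
cum = np.cumsum(pca_full.explained_variance_ratio_)
n_95 = int(np.argmax(cum >= 0.95)) + 1        # first component count reaching 95%
print('%d components explain 95%% of the variance' % n_95)
pca_95 = PCA(n_components=0.95).fit(X)        # equivalent: sklearn picks the count
print(pca_95.n_components_)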
#t-SNE (t-distributed stochastic neighbor embedding)
data_1000 = X_scaled[0:1000,:]
labels_1000 = y[0:1000]
# Pass init and learning_rate explicitly (the current defaults) to silence the
# FutureWarnings about their defaults changing in sklearn 1.2.
model = TSNE(n_components=2, random_state=0, init='random', learning_rate=200.0)
tsne_data = model.fit_transform(data_1000)
tsne_data = np.vstack((tsne_data.T, labels_1000)).T
tsne_df = pd.DataFrame(data = tsne_data, columns=("Dim_1", "Dim_2", "label"))
#Plotting the results
import seaborn as sns
sns.FacetGrid(tsne_df, hue='label', height=6).map(plt.scatter, 'Dim_1', 'Dim_2').add_legend()
plt.show()
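# A common speed-up worth noting (an added sketch, not part of the original run):
# t-SNE scales poorly with input dimension, so in practice it is often fed a
# ~50-component PCA projection instead of raw pixels; the embedding is usually
# similar but much faster to compute.
pca_50 = PCA(n_components=50).fit_transform(data_1000)
tsne_fast = TSNE(n_components=2, random_state=0, init='random',
                 learning_rate=200.0).fit_transform(pca_50)
plt.figure(figsize=(6, 6))
plt.scatter(tsne_fast[:, 0], tsne_fast[:, 1], c=labels_1000, s=15, cmap='tab10')
plt.colorbar()
plt.title('MNIST. t-SNE on 50 PCA components')
plt.show()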