In [19]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import pandas as pd
from sklearn.decomposition import PCA
import pickle
%matplotlib inline
In [2]:
# Load the down-sampled user behavior dataset (1000-user sample;
# filename suggests non-null features and 20-min / 5-view filters —
# confirm the exact filtering in the upstream preprocessing notebook).
user_profile = pd.read_pickle('../data_user_view_buy/user_profile_items_nonnull_features_20_mins_5_views_v2_sample1000.pkl')
user_sample = user_profile.user_id.unique()

# Report total row count and distinct-user count, then preview the frame.
n_rows = len(user_profile)
n_users = len(user_sample)
print(n_rows)
print(n_users)
user_profile.head()
Out[2]:
In [4]:
# Load per-item neural-network feature vectors; the frame carries a
# `spu_id` column and a `features` column (used below to build the
# item-by-feature matrices).
nn_features_path = "../data_nn_features/spu_fea_sample1000.pkl"
spu_fea = pd.read_pickle(nn_features_path)
In [7]:
# Sub-sample possible items. NOTE: np.random.choice samples WITH
# replacement by default, so item_sample may contain duplicates —
# presumably acceptable for this stability check; confirm if not.
np.random.seed(1000)
item_sample = np.random.choice(user_profile.view_spu.unique(), size=3000)

# Build the item x feature matrix for the sampled items.
# Series.as_matrix() was removed in pandas 1.0 — use .values instead.
n_features = len(spu_fea.features.values[0])
X_item_feature = np.empty((len(item_sample), n_features))
for ii, item_spu in enumerate(item_sample):
    # take the first matching row's feature vector for this item
    X_item_feature[ii, :] = spu_fea.loc[spu_fea.spu_id == item_spu, 'features'].values[0]

# Fit PCA on this first random sample (compared against a second seed below).
pca1 = PCA()
pca1.fit(X_item_feature)
Out[7]:
In [8]:
# Second independent sub-sample (different seed) to check how stable the
# principal components are under item re-sampling. np.random.choice
# samples WITH replacement by default — duplicates are possible.
np.random.seed(2000)
item_sample = np.random.choice(user_profile.view_spu.unique(), size=3000)

# Build the item x feature matrix for the sampled items.
# Series.as_matrix() was removed in pandas 1.0 — use .values instead.
n_features = len(spu_fea.features.values[0])
X_item_feature = np.empty((len(item_sample), n_features))
for ii, item_spu in enumerate(item_sample):
    # take the first matching row's feature vector for this item
    X_item_feature[ii, :] = spu_fea.loc[spu_fea.spu_id == item_spu, 'features'].values[0]

# Fit PCA on the second sample.
pca2 = PCA()
pca2.fit(X_item_feature)
Out[8]:
In [9]:
# Compare the loadings of the first 10 components across the two random
# item sub-samples: correlations near +/-1 mean those components are
# stable with respect to which items were sampled.
for comp_idx in range(10):
    corr = np.corrcoef(pca1.components_[comp_idx], pca2.components_[comp_idx])[0, 1]
    print(corr)
In [10]:
# Build the item x feature matrix for ALL unique viewed items.
item_sample = user_profile.view_spu.unique()

# Index the feature vectors by spu_id once instead of re-scanning spu_fea
# with a boolean mask for every item (the original loop was O(items * rows)).
# drop_duplicates keeps the first row per spu_id, matching the original's
# `[0]` first-match behavior. Series.as_matrix() was removed in pandas 1.0,
# so use .values / np.stack instead.
feature_by_spu = spu_fea.drop_duplicates('spu_id').set_index('spu_id')['features']
X_item_feature = np.stack(feature_by_spu.loc[item_sample].values).astype(np.float64)
In [11]:
# Sanity check: expect (n_unique_items, n_nn_features).
X_item_feature.shape
Out[11]:
In [12]:
# Fit a full PCA (all components kept) on the complete item-feature
# matrix; ending the cell with the fitted estimator displays its repr,
# just as the original `pca_all.fit(...)` return value did.
pca_all = PCA().fit(X_item_feature)
pca_all
Out[12]:
In [15]:
# Persist the fitted PCA so it can be reloaded without refitting. Use a
# context manager so the file handle is closed (the original inline
# open() leaked the handle).
with open("../data_nn_features/pca_all_items_sample1000.pkl", "wb") as fh:
    pickle.dump(pca_all, fh)
In [17]:
# Reload the fitted PCA from disk. The with-block closes the file handle
# (an inline open() would leak it). NOTE: pickle.load can execute
# arbitrary code — only load pickle files you produced yourself.
with open('../data_nn_features/pca_all_items_sample1000.pkl', 'rb') as fh:
    pca_all = pickle.load(fh)
In [36]:
# Scree plot: cumulative explained variance across PCA components.
# explained_variance_ratio_ is a fraction in [0, 1], so label it as such
# (the original y-label said "percent", which the plotted values are not).
fig, ax = plt.subplots()
ax.plot(pca_all.explained_variance_ratio_.cumsum())
ax.set_ylabel('cumulative fraction of explained variance')
ax.set_xlabel('component #')
ax.set_xlim([0, 500])
Out[36]:
In [37]:
%%bash
#jupyter nbconvert --to Plotting_Sequences_in_low_dimensions.ipynb && mv Plotting_Sequences_in_low_dimensions.slides.html ../notebook_slides/Plotting_Sequences_in_low_dimensions_v1.slides.html
# Export this notebook to HTML into ../notebook_htmls and archive a
# versioned (v1) copy of the .ipynb into ../notebook_versions.
jupyter nbconvert --to html Dimensionality_Reduction_on_Features.ipynb && mv Dimensionality_Reduction_on_Features.html ../notebook_htmls/Dimensionality_Reduction_on_Features_v1.html
cp Dimensionality_Reduction_on_Features.ipynb ../notebook_versions/Dimensionality_Reduction_on_Features_v1.ipynb
In [ ]: