• load in the evaluation dataset
  • sub-sample a large set of item features
  • calculate PCA and save it out for loading elsewhere.

In [19]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import pandas as pd
from sklearn.decomposition import PCA
import pickle
%matplotlib inline

In [2]:
# load smaller user behavior dataset
user_profile = pd.read_pickle('../data_user_view_buy/user_profile_items_nonnull_features_20_mins_5_views_v2_sample1000.pkl')
user_sample = user_profile.user_id.unique()
print(len(user_profile))
print(len(user_sample))
user_profile.head()


40141
961
Out[2]:
row | user_id | buy_spu | buy_sn | buy_ct3 | view_spu | view_sn | view_ct3 | time_interval | view_cnt | view_seconds | index | spu_id | view_spu_count | drop
1226 | 3787002243 | 14994645517246498 | 10006541 | 334 | 221597279072104476 | 10005949 | 334 | 389705 | 4 | 98 | 87616.0 | 2.215973e+17 | 11 | 0
1227 | 3787002243 | 14994645517246498 | 10006541 | 334 | 29349802448457744 | 10004542 | 334 | 7342 | 1 | 10 | 73880.0 | 2.934980e+16 | 11 | 0
2261 | 132681117 | 6831894477217820 | 10001155 | 334 | 27661017974767638 | 10004048 | 334 | 32451 | 1 | 6 | 21676.0 | 2.766102e+16 | 63 | 0
2262 | 132681117 | 6831894477217820 | 10001155 | 334 | 288869796837830741 | 10000351 | 334 | 33318 | 2 | 30 | 6475.0 | 2.888698e+17 | 63 | 0
2263 | 132681117 | 6831894477217820 | 10001155 | 334 | 301817675098247170 | 10020640 | 334 | 428149 | 1 | 9 | 25612.0 | 3.018177e+17 | 63 | 0

In [4]:
# read NN features: one 2048-d feature vector per item (spu_id)
spu_fea = pd.read_pickle("../data_nn_features/spu_fea_sample1000.pkl")

How similar are the PCs computed on two random sub-samples of the data?


In [7]:
# sub-sample possible items
# (np.random.choice samples with replacement by default, so duplicates are possible)
np.random.seed(1000)
item_sample = np.random.choice(user_profile.view_spu.unique(), size=3000)

# build item x feature matrix (rows: sampled items, columns: NN feature dimensions)
X_item_feature = np.empty((len(item_sample), len(spu_fea.features.iloc[0])))
for ii, item_spu in enumerate(item_sample):
    X_item_feature[ii, :] = spu_fea.loc[spu_fea.spu_id == item_spu, 'features'].values[0]

# calculate PCs
pca1 = PCA()
pca1.fit(X_item_feature)


Out[7]:
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)
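
An equivalent, loop-free way to build the matrix (a minimal sketch, not run here; it assumes every sampled view_spu occurs exactly once as a spu_id in spu_fea):

In [ ]:
# index spu_fea by spu_id, look up all sampled items at once, and stack the vectors
X_alt = np.vstack(spu_fea.set_index('spu_id').loc[item_sample, 'features'].values)
assert X_alt.shape == X_item_feature.shape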

In [8]:
# sub-sample a second, independent set of items (different seed)
np.random.seed(2000)
item_sample = np.random.choice(user_profile.view_spu.unique(), size=3000)

# build the item x feature matrix for the second sample
X_item_feature = np.empty((len(item_sample), len(spu_fea.features.iloc[0])))
for ii, item_spu in enumerate(item_sample):
    X_item_feature[ii, :] = spu_fea.loc[spu_fea.spu_id == item_spu, 'features'].values[0]

# calculate PCs
pca2 = PCA()
pca2.fit(X_item_feature)


Out[8]:
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [9]:
# correlate corresponding components from the two fits
# (component signs are arbitrary in PCA, so the magnitude |r| is what matters)
for i in range(10):
    print(np.corrcoef(pca1.components_[i, :], pca2.components_[i, :])[0, 1])


0.989596783159
0.976819415357
-0.611981301025
0.64546023461
-0.791966038508
-0.784539319488
-0.907286692406
0.460205498172
0.424413489535
0.683556540565
  • The first two components replicate almost exactly (|r| ≈ 0.98 to 0.99). PCA component signs are arbitrary, so the negative values still indicate similar directions; even so, |r| drops to roughly 0.4 to 0.9 after component 2. This is less surprising than it first appears: when neighboring eigenvalues are close, the corresponding components can rotate freely between sub-samples. A sign-invariant comparison is sketched below.
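
A sign-invariant check (a minimal sketch, not executed here; it assumes pca1 and pca2 from the cells above; sklearn stores components_ as unit-norm rows, so a dot product is a cosine similarity):

In [ ]:
# |cosine| similarity between the first 10 components of the two fits
n = 10
C = np.abs(pca1.components_[:n].dot(pca2.components_[:n].T))
print(np.round(np.diag(C), 3))     # matched components, ignoring sign
print(np.round(C.max(axis=1), 3))  # best match for each pca1 component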

Calculate PCA on all viewed items


In [10]:
# build the item x feature matrix for all unique viewed items

item_sample = user_profile.view_spu.unique()

X_item_feature = np.empty((len(item_sample), len(spu_fea.features.iloc[0])))
for ii, item_spu in enumerate(item_sample):
    X_item_feature[ii, :] = spu_fea.loc[spu_fea.spu_id == item_spu, 'features'].values[0]

In [11]:
X_item_feature.shape


Out[11]:
(17797, 2048)

In [12]:
# calculate PCs on the full item set
pca_all = PCA()
pca_all.fit(X_item_feature)


Out[12]:
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [15]:
# save the fitted PCA for reuse in other notebooks
with open("../data_nn_features/pca_all_items_sample1000.pkl", "wb") as f:
    pickle.dump(pca_all, f)

In [17]:
# reload to confirm the pickle round-trips
with open('../data_nn_features/pca_all_items_sample1000.pkl', 'rb') as f:
    pca_all = pickle.load(f)
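
As a quick sanity check (a sketch, not run here; it assumes X_item_feature from the cell above), the reloaded model can project items into PC space:

In [ ]:
# project items onto the learned components; result is (n_items, n_components)
X_pc = pca_all.transform(X_item_feature)
print(X_pc.shape)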

In [36]:
# cumulative explained variance across components
plt.plot(pca_all.explained_variance_ratio_.cumsum())
plt.ylabel('cumulative fraction of explained variance')
plt.xlabel('component #')
plt.xlim([0, 500])


Out[36]:
(0, 500)
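
To read the curve off numerically (a small sketch; the 0.90 threshold is an arbitrary example, not a choice made in the original):

In [ ]:
# number of components needed to reach 90% cumulative explained variance
cumvar = pca_all.explained_variance_ratio_.cumsum()
print(int(np.searchsorted(cumvar, 0.90)) + 1)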

In [37]:
%%bash 
#jupyter nbconvert --to slides Plotting_Sequences_in_low_dimensions.ipynb && mv Plotting_Sequences_in_low_dimensions.slides.html ../notebook_slides/Plotting_Sequences_in_low_dimensions_v1.slides.html
jupyter nbconvert --to html Dimensionality_Reduction_on_Features.ipynb && mv Dimensionality_Reduction_on_Features.html ../notebook_htmls/Dimensionality_Reduction_on_Features_v1.html
cp Dimensionality_Reduction_on_Features.ipynb ../notebook_versions/Dimensionality_Reduction_on_Features_v1.ipynb


[NbConvertApp] Converting notebook Dimensionality_Reduction_on_Features.ipynb to html
[NbConvertApp] Writing 285891 bytes to Dimensionality_Reduction_on_Features.html
