In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
In [2]:
os.chdir('/Users/Walkon302/Desktop/deep-learning-models-master/view2buy')
In [3]:
# Read the preprocessed file from the view2buy folder, containing the user profiles and item features
df = pd.read_pickle('user_fea_for_eval.pkl')
In [4]:
# Drop the first column, which holds the data in its original raw format
df.drop('0', axis=1, inplace=True)
In [5]:
# Check the data
df.head()
Out[5]:
In [6]:
# Keep only the first 100,000 rows
df = df.iloc[0:100000, :]
In [7]:
# Calculate the average viewing time (seconds) per (user_id, buy_spu) pair
avg_view_sec = pd.DataFrame(df.groupby(['user_id', 'buy_spu'])['view_secondes'].mean())
In [8]:
# Reset the index and rename the column
avg_view_sec.reset_index(inplace=True)
avg_view_sec.rename(columns={'view_secondes': 'avg_view_sec'}, inplace=True)
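In [ ]:
# A minimal sketch on toy data (not from the original dataset) of what the
# groupby above computes: one average viewing time per (user_id, buy_spu)
# pair, averaged over that user's views of the item they eventually bought.
toy = pd.DataFrame({'user_id': [1, 1, 2],
                    'buy_spu': ['a', 'a', 'b'],
                    'view_secondes': [10.0, 30.0, 5.0]})
print(toy.groupby(['user_id', 'buy_spu'])['view_secondes'].mean())
# (1, a) -> 20.0, (2, b) -> 5.0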
In [9]:
# Check the data
avg_view_sec.head()
Out[9]:
In [10]:
# Merge the per-pair average view time back into the data
df = pd.merge(df, avg_view_sec, on=['user_id', 'buy_spu'])
In [11]:
# Weight each view by its viewing time relative to the pair's average
df['weight_of_view'] = df['view_secondes'] / df['avg_view_sec']
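In [ ]:
# Continuing the toy sketch above: after merging the per-pair average back
# in, weight_of_view is > 1 for views longer than that pair's average and
# < 1 for shorter ones.
toy_avg = toy.groupby(['user_id', 'buy_spu'])['view_secondes'].mean().reset_index()
toy_avg = toy_avg.rename(columns={'view_secondes': 'avg_view_sec'})
toy = pd.merge(toy, toy_avg, on=['user_id', 'buy_spu'])
toy['weight_of_view'] = toy['view_secondes'] / toy['avg_view_sec']
print(toy)
# user 1's views get weights 10/20 = 0.5 and 30/20 = 1.5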
In [12]:
df.head()
Out[12]:
In [13]:
# Extract the view and buy feature-vector columns
view_item_vec = df['view_features']
buy_item_vec = df['buy_features']
In [14]:
print('view_item', len(view_item_vec), 'buy_item', len(buy_item_vec))
In [ ]:
# Create the t-SNE model; scikit-learn's default Barnes-Hut implementation
# only supports n_components < 4, so reducing to 10 dimensions requires
# method='exact'
model = TSNE(n_components=10, method='exact', random_state=0)
In [121]:
%%time
# Time t-SNE on the first 250 samples
a = pd.DataFrame()
for j in view_item_vec.iloc[0:250]:
    a = pd.concat([a, pd.DataFrame(j).transpose()], axis=0)
vt = model.fit_transform(a)
In [114]:
%%time
# Time t-SNE on the first 500 samples
a = pd.DataFrame()
for j in view_item_vec.iloc[0:500]:
    a = pd.concat([a, pd.DataFrame(j).transpose()], axis=0)
vt = model.fit_transform(a)
In [113]:
%%time
# Time t-SNE on the first 1000 samples
a = pd.DataFrame()
for j in view_item_vec.iloc[0:1000]:
    a = pd.concat([a, pd.DataFrame(j).transpose()], axis=0)
vt = model.fit_transform(a)
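In [ ]:
%%time
# A sketch of a faster way to build the same input matrix (assuming each
# element of view_item_vec is a 1-D feature vector): stacking once with
# np.vstack avoids copying the growing DataFrame on every pd.concat call,
# so the t-SNE fit itself dominates the timing.
X = np.vstack(view_item_vec.iloc[0:1000].values)
vt = model.fit_transform(X)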
In [17]:
# Create the PCA model
model = PCA(n_components=200, random_state=0)
In [18]:
%%time
# Collect the view feature vectors into a single 2-D numpy array
view_item = []
for i in view_item_vec:
    view_item.append(i)
view_item = np.array(view_item)
In [19]:
%%time
# Fit PCA on the view vectors and reduce them to 200 dimensions
pca_view_vec = model.fit_transform(view_item)
In [20]:
# 200 PCA components explain about 85% of the variance; beyond that
# (e.g., 300 components) my machine runs out of memory (8 GB)
sum(model.explained_variance_ratio_)
Out[20]:
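In [ ]:
# A minimal sketch of letting scikit-learn choose the dimensionality instead
# of hard-coding 200: with 0 < n_components < 1 and svd_solver='full', PCA
# keeps the smallest number of components that explains that fraction of the
# variance. Note the full SVD may hit the same 8 GB memory limit here.
pca_85 = PCA(n_components=0.85, svd_solver='full')
pca_85.fit(view_item)
print(pca_85.n_components_, sum(pca_85.explained_variance_ratio_))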
In [22]:
%%time
# Collect the buy feature vectors into a single 2-D numpy array
buy_item = []
for i in buy_item_vec:
    buy_item.append(i)
buy_item = np.array(buy_item)
In [23]:
%%time
# Refit PCA on the buy vectors and reduce them to 200 dimensions
pca_buy_vec = model.fit_transform(buy_item)
In [24]:
# Insert the PCA results into the dataframe as list columns
df['pca_view'] = pca_view_vec.tolist()
df['pca_buy'] = pca_buy_vec.tolist()
In [25]:
# Check the data
df.head()
Out[25]:
In [15]:
# Load the previously saved weighted dataframe
df = pd.read_pickle('df_weighted.pkl')
In [123]:
# Calculate the weighted pca_view
df['weighted_view_pca'] = df.apply(lambda x: [y*x['weight_of_view'] for y in x['pca_view']], axis=1)
In [16]:
# Calculate the weighted pca_buy
df['weighted_buy_pca'] = df.apply(lambda x: [y*x['weight_of_view'] for y in x['pca_buy']], axis=1)
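In [ ]:
# A vectorized sketch of the same weighting: broadcasting the weight column
# over a 2-D array avoids the per-row Python lambda in df.apply. Assumes the
# pca_view / pca_buy columns hold equal-length lists, as built above.
w = df['weight_of_view'].values[:, None]
df['weighted_view_pca'] = (np.array(df['pca_view'].tolist()) * w).tolist()
df['weighted_buy_pca'] = (np.array(df['pca_buy'].tolist()) * w).tolist()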
In [20]:
# Check the data
df.head()
Out[20]:
In [ ]:
# Save the final dataframe
df.to_pickle('top100k_user_pca.pkl')