In [1]:
import matplotlib
matplotlib.use('nbagg')
%matplotlib inline
In [2]:
import graphlab as gl
In [3]:
# Load the Yelp review dump. It is read with read_csv and no header row, which
# yields a single column named X1 (presumably one JSON record per line --
# confirm against the data file). X1 and then the nested 'votes' field are
# unpacked into top-level columns, using an empty column-name prefix.
reviews = (
    gl.SFrame.read_csv(
        '../data/yelp/yelp_training_set_review.json',
        header=False,
    )
    .unpack('X1', column_name_prefix='')
    .unpack('votes', column_name_prefix='')
)

# Combined vote count per review across the three vote kinds.
reviews['total_votes'] = (
    reviews['funny'] + reviews['cool'] + reviews['useful']
)

# Last expression of the cell: display the resulting SFrame.
reviews
Out[3]:
In [4]:
from sklearn.decomposition import PCA
import pandas as pd
In [5]:
# Extract the three vote-count columns and convert them to a pandas DataFrame;
# this frame is what the PCA and the 3-D plot below operate on.
data = reviews.select_columns(['funny', 'cool', 'useful']).to_dataframe()
In [6]:
# Fit a 3-component PCA on the vote counts. fit() returns the estimator
# itself, so construction and fitting can be chained; the bare name on the
# last line displays the fitted estimator as the cell output.
pca = PCA(n_components=3).fit(data)
pca
Out[6]:
In [7]:
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
def plot_figs(ax, data, pca, step=20):
    """Scatter mean-centred vote counts in 3-D and overlay the PCA axes.

    Parameters
    ----------
    ax : 3-D axes object
        Target axes; must support ``set_zlabel``, ``scatter`` and ``plot``
        with ``xs``/``ys``/``zs`` keywords. It is drawn on in place.
    data : pandas.DataFrame
        Must contain the columns ``funny``, ``cool`` and ``useful``.
    pca : fitted PCA-like object
        Must expose ``explained_variance_ratio_`` and ``components_``
        computed on the same three columns (in that order).
    step : int, default 20
        Only every ``step``-th row is drawn in the scatter, to keep the
        figure light. The default reproduces the previous fixed stride.

    Returns
    -------
    None
    """
    columns = ('funny', 'cool', 'useful')

    ax.set_xlabel('funny')
    ax.set_ylabel('cool')
    ax.set_zlabel('useful')

    # Centre each column on its mean so the point cloud is plotted around the
    # origin, which is where the component axes below start.
    centred = [data[name] - data[name].mean() for name in columns]
    ax.scatter(*(series.iloc[::step] for series in centred), marker='+', alpha=1)

    pca_score = pca.explained_variance_ratio_
    V = pca.components_

    # Rows of V are components; transposing puts one component per column.
    # Each column is scaled by its explained-variance ratio relative to the
    # smallest ratio (times 2), so more important components draw longer.
    axis_ends = 2 * V.T * pca_score / pca_score.min()

    # One line from the origin per component: 1st red, 2nd green, 3rd yellow.
    # zip() stops at the shorter input, so a PCA fitted with fewer than three
    # components simply draws fewer axes.
    for endpoint, color in zip(axis_ends.T, ('r', 'g', 'y')):
        x_end, y_end, z_end = endpoint
        ax.plot(xs=(0, x_end), ys=(0, y_end), zs=(0, z_end),
                color=color, linewidth=4)
In [8]:
# Camera position for the 3-D view: elevation and azimuth passed to view_init.
elev, azim = 30, 20

# A single large 3-D subplot on figure number 1.
fig = plt.figure(num=1, figsize=(16, 12))
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.view_init(elev=elev, azim=azim)

# Draw the vote-count scatter and the fitted PCA axes onto it.
plot_figs(ax, data, pca)
In [9]:
# Repeat the PCA with the star rating added as a fourth feature.
# NOTE(review): this rebinds `pca`, so the 3-component model fitted earlier is
# no longer reachable under that name; re-running the 3-D plotting cell after
# this one would hand it this 4-component model instead.
features_with_stars = ['funny', 'cool', 'useful', 'stars']
pca = PCA(n_components=4).fit(reviews[features_with_stars].to_dataframe())

# Share of total variance captured by each component ...
print(pca.explained_variance_ratio_)
# ... and the component vectors themselves, shown as the cell output.
pca.components_
Out[9]: