In [1]:
import pandas as pd
import numpy as np
import gzip
import matplotlib.pyplot as plt
import seaborn  # imported only for its plot-styling side effect

def parse(path):
    # Each line of the gzipped file is one record, stored as a Python dict literal.
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    # Materialize the record stream into a positionally indexed DataFrame.
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')
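eval on file contents is risky; if the lines really are Python dict literals (as in the McAuley Amazon dumps this parser is usually paired with), ast.literal_eval is a safer drop-in. A sketch, not what the notebook ran:
In [ ]:
import ast
import gzip

def parse_safe(path):
    # literal_eval only accepts Python literals, so a malformed or
    # malicious line cannot execute arbitrary code.
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield ast.literal_eval(l.decode('utf-8'))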
In [14]:
reviews = getDF('reviews_Baby.json.gz')
meta = getDF('meta_Baby.json.gz')
In [15]:
img_feat = pd.read_csv('rcnn_image_features_stephan_1.tar')  # precomputed RCNN image features, one row per product
In [16]:
# Per-product (asin) averages: mean review timestamp and mean star rating.
reviewsDF = reviews.set_index('asin').groupby(level = 0)[['unixReviewTime', 'overall']].mean()
del reviews
In [17]:
meta = meta[['price','asin','title']].set_index('asin')
In [18]:
df = meta.merge(reviewsDF, how = 'inner', left_index = True, right_index = True).dropna(how = 'any')
df.head()
Out[18]:
In [19]:
df['time'] = pd.to_datetime(df['unixReviewTime'], unit = 's')
df = df.drop('unixReviewTime',axis = 1)
In [20]:
plt.figure()
df.set_index('time')['price'].groupby(pd.Grouper(freq='6M')).count().plot(kind='bar')
plt.ylabel('Frequency')
plt.show()
In [29]:
plt.figure()
np.log(df['price']).plot.hist(bins = 50)
plt.xlabel('ln(price)')
plt.show()
In [30]:
# Keep only products whose mean review date falls after 2013-01-01.
df = df[df['time'] > '2013-01-01']
df.shape
# 33378 rows remain
Out[30]:
In [44]:
plt.figure()
df[df['title'].str.lower().str.contains('sock')]['price'].plot.hist(bins = 20)
plt.xlabel('Price')
plt.show()
In [49]:
plt.figure()
df[df['title'].str.lower().str.contains('stroller')]['price'].plot.hist(bins = 20)
plt.xlabel('Price')
plt.show()
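The two cells above repeat the same filter-and-plot pattern; a small helper (hypothetical, not part of the original notebook) makes further keyword checks one-liners:
In [ ]:
def price_hist_for(keyword, bins = 20):
    # Histogram of prices for products whose title contains `keyword` (case-insensitive).
    mask = df['title'].str.lower().str.contains(keyword)
    plt.figure()
    df.loc[mask, 'price'].plot.hist(bins = bins)
    plt.xlabel('Price')
    plt.show()

price_hist_for('crib')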
In [52]:
import nltk
from nltk.collocations import *
from nltk.corpus import stopwords
words = nltk.word_tokenize(' '.join(df['title']))
print(len(words))
stopset = set(stopwords.words('english'))
bigram_measures = nltk.collocations.BigramAssocMeasures()
# Test membership against the prebuilt set: O(1) per word, instead of
# rebuilding the stopword list on every iteration.
filtered_words = [w for w in words if w not in stopset]
print(len(filtered_words))
In [53]:
import string
# Drop bare punctuation tokens left over from tokenization.
filtered_words = [x for x in filtered_words if x not in string.punctuation]
print(len(filtered_words))
In [54]:
finder = BigramCollocationFinder.from_words(filtered_words)
finder.nbest(bigram_measures.raw_freq, 20)
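raw_freq surfaces the most common pairs; ranking by PMI with a minimum-count floor tends to surface more distinctive collocations. A quick variant on the finder built above (note that apply_freq_filter mutates the finder in place):
In [ ]:
# Ignore bigrams seen fewer than 5 times, then rank by pointwise mutual information.
finder.apply_freq_filter(5)
finder.nbest(bigram_measures.pmi, 20)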
Out[54]:
In [58]:
word_fd = nltk.FreqDist(filtered_words)
word_fd.plot(35,cumulative=False)
In [59]:
words_only = [w for w in filtered_words if w.isalpha()]
unique = set([w.lower() for w in words_only])
len(unique)
Out[59]:
In [60]:
word_fd = nltk.FreqDist(filtered_words)
plt.figure()
plt.plot(list(range(len(word_fd))),np.log(sorted(word_fd.values(),reverse = True)))
plt.xlabel('# Words')
plt.ylabel('Log Frequency')
plt.show()
In [61]:
word_fd = nltk.FreqDist(filtered_words)
plt.figure()
plt.plot(list(range(len(word_fd))),np.array(sorted(word_fd.values(),reverse = True)).cumsum())
plt.xlabel('# Words')
plt.ylabel('Cumulative Frequency')
plt.show()
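The near-linear decay in the log-frequency plot is the classic Zipf pattern; a log-log rank/frequency plot (an added check, not in the original) makes the power-law comparison direct:
In [ ]:
freqs = np.array(sorted(word_fd.values(), reverse = True))
ranks = np.arange(1, len(freqs) + 1)
plt.figure()
plt.loglog(ranks, freqs)
plt.xlabel('Rank (log scale)')
plt.ylabel('Frequency (log scale)')
plt.show()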
In [63]:
# The feature CSV's first column (named after the source file) holds the asin,
# so indexing on it lets the image features join df on product id.
df = img_feat.set_index('rcnn_image_features.csv').merge(df, how = 'inner', left_index = True, right_index = True).dropna(how = 'any')
In [64]:
price = df['price']
rating = df['overall']
df = df.drop(['price','overall','time'],axis = 1)
In [65]:
del img_feat
In [66]:
df.shape
Out[66]:
In [67]:
from sklearn.feature_extraction.text import CountVectorizer
countvect = CountVectorizer(analyzer = 'word', tokenizer = nltk.word_tokenize,
                            stop_words = 'english', max_features = 1000)
In [68]:
from scipy.sparse import hstack
allFeatures = hstack((df.drop('title',axis = 1).values, countvect.fit_transform(df['title'])))
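hstack keeps the combined matrix sparse, which matters once 1000 title features sit alongside the image columns. Swapping raw counts for tf-idf weights is a natural variant to try (a sketch, not something this notebook ran):
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Same tokenizer and vocabulary size as the CountVectorizer above,
# but term counts are replaced with tf-idf weights.
tfidf = TfidfVectorizer(analyzer = 'word', tokenizer = nltk.word_tokenize,
                        stop_words = 'english', max_features = 1000)
allFeatures_tfidf = hstack((df.drop('title', axis = 1).values,
                            tfidf.fit_transform(df['title'])))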
In [69]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
X_train, X_test, y_train, y_test = train_test_split(allFeatures,price, test_size = 0.2, random_state = 1)
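Before tuning anything, a mean-predictor baseline pins down what an R^2 of zero looks like on this split (an added sanity check, not in the original notebook):
In [ ]:
from sklearn.dummy import DummyRegressor
# Predicts the training-set mean price for every product; test R^2 should be near 0.
baseline = DummyRegressor(strategy = 'mean').fit(X_train, y_train)
baseline.score(X_test, y_test)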
In [62]:
gb = GradientBoostingRegressor(random_state = 1, n_estimators = 100)
param_grid = {'learning_rate': [0.01, 0.1, 1, 10],
              'max_depth': [3, 5, 7]}
In [63]:
gs = GridSearchCV(estimator = gb, param_grid = param_grid, scoring = 'r2', cv = 3, n_jobs = 3, verbose = 10).fit(X_train, y_train)
In [64]:
import pickle
pickle.dump(gs, open('gsgd','wb'))
In [65]:
gs = pickle.load(open('gsgd','rb'))
In [67]:
pd.DataFrame(gs.cv_results_)
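The full cv_results_ table is worth scanning, but the headline numbers are the best configuration and its held-out performance (a follow-up check; the values depend on the fitted search object):
In [ ]:
print(gs.best_params_, gs.best_score_)
# Held-out R^2 of the refit best estimator.
r2_score(y_test, gs.predict(X_test))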
Out[67]:
In [70]:
# Second pass: finer learning rates and deeper trees, probing past the first grid's edge.
gb = GradientBoostingRegressor(random_state = 1, n_estimators = 100)
param_grid = {'learning_rate': [0.01, 0.05, 0.1, 0.5],
              'max_depth': [8, 10, 13]}
In [ ]:
gs2 = GridSearchCV(estimator = gb, param_grid = param_grid, scoring = 'r2', cv = 3, n_jobs = 3, verbose = 10).fit(X_train, y_train)