In [1]:
import pandas as pd
import numpy as np
import struct
import gzip
import matplotlib.pyplot as plt
import seaborn
    
def parse(path):
    # Each line of the gzipped dump is one record in loose (Python-literal) JSON,
    # hence eval rather than json.loads. The context manager closes the file.
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield eval(l)

def getDF(path):
    # Collect records into an integer-keyed dict, then build a DataFrame from it.
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')
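
The loader above runs eval on every line, which is the quickest way to read the loose-JSON Amazon dumps but executes whatever the file contains. A minimal, hedged alternative, assuming the same one-record-per-line gzipped files (parse_safe and get_df_safe are illustrative names, not part of the original notebook):

import ast
import json

def parse_safe(path):
    # Text mode so json/ast receive str rather than bytes.
    with gzip.open(path, 'rt') as g:
        for line in g:
            try:
                yield json.loads(line)          # strict-JSON lines
            except json.JSONDecodeError:
                yield ast.literal_eval(line)    # Python-literal lines, parsed without executing code

def get_df_safe(path):
    # Building the frame from a list skips the intermediate integer-keyed dict.
    return pd.DataFrame(list(parse_safe(path)))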

In [14]:
reviews = getDF('reviews_Baby.json.gz')
meta = getDF('meta_Baby.json.gz')

In [15]:
img_feat = pd.read_csv('rcnn_image_features_stephan_1.tar')

In [16]:
reviewsDF = reviews.set_index('asin').groupby(level=0)[['unixReviewTime', 'overall']].mean()
del reviews

In [17]:
meta = meta[['price','asin','title']].set_index('asin')

In [18]:
df = meta.merge(reviewsDF, how = 'inner', left_index = True, right_index = True).dropna(how = 'any')
df.head()


Out[18]:
              price                                              title  unixReviewTime   overall
asin
0188399313    69.99  Lifefactory 4oz BPA Free Glass Baby Bottles - ...    1.369613e+09  5.000000
0188399518    15.95                           Planetwise Flannel Wipes    1.382789e+09  3.500000
0188399399    10.95                              Planetwise Wipe Pouch    1.365466e+09  5.000000
0316967297   109.95                Annas Dream Full Quilt with 2 Shams    1.371168e+09  4.500000
0615447279    16.95  Stop Pacifier Sucking without tears with Thumb...    1.348464e+09  4.333333

In [19]:
df['time'] = pd.to_datetime(df['unixReviewTime'], unit = 's')
df = df.drop('unixReviewTime',axis = 1)

In [20]:
plt.figure()
df.set_index('time')['price'].groupby(pd.Grouper(freq='6M')).count().plot(kind='bar')
plt.ylabel('Frequency')
plt.show()



In [29]:
plt.figure()
np.log(df['price']).plot.hist(bins=50)
plt.xlabel('ln(price)')
plt.show()



In [30]:
df = df[df['time'] > '2013-01-01']
df.shape
#33378


Out[30]:
(33378, 4)

In [44]:
plt.figure()
df[df.apply(lambda x: 'sock' in x['title'].lower(), axis = 1)]['price'].plot.hist(bins = 20)
plt.xlabel('Price')
plt.show()



In [49]:
plt.figure()
df[df.apply(lambda x: 'stroller' in x['title'].lower(), axis = 1)]['price'].plot.hist(bins = 20)
plt.xlabel('Price')
plt.show()
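
The keyword filters above work row by row through apply; pandas' vectorized string matching does the same job more directly. A hedged sketch (assumes every title is a string; na=False guards against missing values):

sock_mask = df['title'].str.contains('sock', case=False, na=False)
stroller_mask = df['title'].str.contains('stroller', case=False, na=False)

plt.figure()
df.loc[stroller_mask, 'price'].plot.hist(bins=20)
plt.xlabel('Price')
plt.show()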



In [52]:
import nltk
from nltk.collocations import *
from nltk.corpus import stopwords
words = nltk.word_tokenize(' '.join(df['title']))
print(len(words))
stopset = set(stopwords.words('english'))
bigram_measures = nltk.collocations.BigramAssocMeasures()
filtered_words = [w for w in words if w not in stopset]
print(len(filtered_words))


360191
344911
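
Note that NLTK's English stopword list is lowercase, so capitalized stopwords in the titles ("The", "And", "With", ...) survive the filter above. A hedged, case-insensitive variant (filtered_words_ci is an illustrative name):

filtered_words_ci = [w for w in words if w.lower() not in stopset]
print(len(filtered_words_ci))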

In [53]:
import string
filtered_words = [x for x in filtered_words if x not in string.punctuation]
print(len(filtered_words))


293700

In [54]:
finder = BigramCollocationFinder.from_words(filtered_words)
finder.nbest(bigram_measures.raw_freq, 20)


Out[54]:
[('Bedding', 'Set'),
 ('Car', 'Seat'),
 ('Diaper', 'Bag'),
 ('2', 'Pack'),
 ('Carter', "'s"),
 ('Crib', 'Bedding'),
 ('Jojo', 'Designs'),
 ('Sweet', 'Jojo'),
 ('Cloth', 'Diaper'),
 ('Summer', 'Infant'),
 ('One', 'Size'),
 ('BPA', 'Free'),
 ('Safety', '1st'),
 ('Changing', 'Pad'),
 ('Crib', 'Sheet'),
 ('Lambs', 'amp'),
 ('amp', 'Ivy'),
 ('3', 'Pack'),
 ('Gift', 'Set'),
 ('4', 'Piece')]
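
Raw frequency naturally surfaces common brand and pack phrases. Ranking by pointwise mutual information after a minimum-frequency cut is a standard alternative; a hedged sketch over the same tokens (the cutoff of 10 is an arbitrary illustration):

pmi_finder = BigramCollocationFinder.from_words(filtered_words)
pmi_finder.apply_freq_filter(10)     # ignore bigrams seen fewer than 10 times
pmi_finder.nbest(bigram_measures.pmi, 20)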

In [58]:
word_fd = nltk.FreqDist(filtered_words)
word_fd.plot(35,cumulative=False)



In [59]:
words_only = [w for w in filtered_words if w.isalpha()]
unique = set([w.lower() for w in words_only])
len(unique)


Out[59]:
13978

In [60]:
word_fd = nltk.FreqDist(filtered_words)
plt.figure()
plt.plot(list(range(len(word_fd))),np.log(sorted(word_fd.values(),reverse = True)))
plt.xlabel('# Words')
plt.ylabel('Log Frequency')
plt.show()



In [61]:
word_fd = nltk.FreqDist(filtered_words)
plt.figure()
plt.plot(list(range(len(word_fd))),np.array(sorted(word_fd.values(),reverse = True)).cumsum())
plt.xlabel('# Words')
plt.ylabel('Cumulative Frequency')
plt.show()
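
The two plots above show log frequency against rank and the cumulative count; the classic Zipf check is a log-log rank/frequency plot, where a roughly straight line indicates power-law behaviour. A hedged sketch:

freqs = np.array(sorted(word_fd.values(), reverse=True))
plt.figure()
plt.loglog(np.arange(1, len(freqs) + 1), freqs)
plt.xlabel('Rank (log scale)')
plt.ylabel('Frequency (log scale)')
plt.show()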



In [63]:
df = img_feat.set_index('rcnn_image_features.csv').merge(df, how = 'inner', left_index = True, right_index = True).dropna(how = 'any')

In [64]:
price = df['price']
rating = df['overall']
df = df.drop(['price','overall','time'],axis = 1)

In [65]:
del img_feat

In [66]:
df.shape


Out[66]:
(33378, 4097)

In [67]:
from sklearn.feature_extraction.text import CountVectorizer
countvect = CountVectorizer(analyzer='word', tokenizer=nltk.word_tokenize,
                            stop_words='english', max_features=1000)

In [68]:
from scipy.sparse import hstack
allFeatures = hstack((df.drop('title',axis = 1).values, countvect.fit_transform(df['title'])))
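
scipy.sparse.hstack returns a COO matrix by default; converting to CSR up front is optional but makes row slicing and shape checks explicit rather than relying on downstream conversions. A hedged sketch:

allFeatures = allFeatures.tocsr()
# Rows should match df; columns = the image-feature columns plus the 1000 title counts.
allFeatures.shape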

In [69]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

X_train, X_test, y_train, y_test = train_test_split(allFeatures,price, test_size = 0.2, random_state = 1)

In [62]:
gb = GradientBoostingRegressor(random_state = 1, n_estimators = 100)
param_grid = {'learning_rate': [0.01, 0.1, 1, 10],
             'max_depth': [3,5, 7]}

In [63]:
gs = GridSearchCV(estimator = gb, param_grid = param_grid, scoring = 'r2', cv = 3, n_jobs = 3, verbose = 10).fit(X_train, y_train)


Fitting 3 folds for each of 12 candidates, totalling 36 fits
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed: 10.2min
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed: 51.2min
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed: 61.0min
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed: 110.7min
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed: 149.9min
[Parallel(n_jobs=3)]: Done  36 out of  36 | elapsed: 201.3min finished
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py:672: RuntimeWarning: overflow encountered in square
  array_means[:, np.newaxis]) ** 2,

In [64]:
import pickle
pickle.dump(gs, open('gsgd','wb'))

In [65]:
gs = pickle.load(open('gsgd','rb'))
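
pickle works here; joblib is the commonly recommended alternative for scikit-learn objects, since it handles the large NumPy arrays inside fitted estimators more efficiently. A hedged sketch (the 'gsgd.joblib' filename is illustrative):

import joblib
joblib.dump(gs, 'gsgd.joblib')
gs = joblib.load('gsgd.joblib')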

In [67]:
pd.DataFrame(gs.cv_results_)


Out[67]:
mean_fit_time mean_score_time mean_test_score mean_train_score param_learning_rate param_max_depth params rank_test_score split0_test_score split0_train_score split1_test_score split1_train_score split2_test_score split2_train_score std_fit_time std_score_time std_test_score std_train_score
0 566.670623 0.114586 2.786318e-01 3.243149e-01 0.01 3 {'learning_rate': 0.01, 'max_depth': 3} 6 2.746436e-01 3.206852e-01 2.756077e-01 3.257986e-01 2.856449e-01 3.264610e-01 0.880525 7.364516e-03 0.004974 0.002581
1 986.255134 0.119795 3.309793e-01 4.812772e-01 0.01 5 {'learning_rate': 0.01, 'max_depth': 5} 5 3.311018e-01 4.812723e-01 3.247332e-01 4.761615e-01 3.371035e-01 4.863977e-01 1.923422 7.367271e-03 0.005051 0.004179
2 1485.886665 0.145834 3.545946e-01 5.994504e-01 0.01 7 {'learning_rate': 0.01, 'max_depth': 7} 4 3.550735e-01 6.018290e-01 3.471884e-01 5.952919e-01 3.615226e-01 6.012302e-01 7.834503 7.365808e-03 0.005862 0.002951
3 562.740034 0.114584 4.494290e-01 6.398482e-01 0.1 3 {'learning_rate': 0.1, 'max_depth': 3} 3 4.584442e-01 6.370217e-01 4.378416e-01 6.420067e-01 4.520016e-01 6.405162e-01 0.943872 7.366033e-03 0.008606 0.002089
4 989.149054 0.139613 4.848307e-01 8.158985e-01 0.1 5 {'learning_rate': 0.1, 'max_depth': 5} 2 4.922831e-01 8.141078e-01 4.711908e-01 8.157892e-01 4.910190e-01 8.177984e-01 0.990014 1.433218e-03 0.009659 0.001509
5 1452.639853 0.145834 4.886593e-01 9.063788e-01 0.1 7 {'learning_rate': 0.1, 'max_depth': 7} 1 4.976960e-01 9.064331e-01 4.791010e-01 9.076069e-01 4.891809e-01 9.050965e-01 4.112546 7.365639e-03 0.007600 0.001026
6 565.015646 0.120319 1.792066e-02 8.619998e-01 1 3 {'learning_rate': 1, 'max_depth': 3} 7 4.932588e-02 8.617218e-01 -3.529163e-02 8.642543e-01 3.973017e-02 8.600232e-01 0.675001 7.765183e-03 0.037831 0.001738
7 970.525649 0.140625 -1.198566e-01 9.623524e-01 1 5 {'learning_rate': 1, 'max_depth': 5} 8 -1.098837e-01 9.611571e-01 -1.242398e-01 9.622157e-01 -1.254470e-01 9.636842e-01 0.890323 3.893359e-07 0.007069 0.001036
8 1368.443473 0.145834 -1.649937e-01 9.907511e-01 1 7 {'learning_rate': 1, 'max_depth': 7} 9 -1.382360e-01 9.904348e-01 -2.061176e-01 9.913164e-01 -1.506260e-01 9.905020e-01 5.178201 7.365527e-03 0.029516 0.000401
9 563.856970 0.104178 -1.796564e+190 -1.918447e+190 10 3 {'learning_rate': 10, 'max_depth': 3} 10 -1.592175e+190 -1.868740e+190 -1.839157e+190 -1.942958e+190 -1.958380e+190 -1.943643e+190 1.106056 7.351815e-03 inf inf
10 981.543549 0.119794 -2.579831e+190 -2.902438e+190 10 5 {'learning_rate': 10, 'max_depth': 5} 11 -2.521610e+190 -2.819105e+190 -2.814671e+190 -2.901231e+190 -2.403191e+190 -2.986977e+190 0.944962 7.363448e-03 inf inf
11 1519.095625 0.130208 -4.126538e+190 -3.936789e+190 10 7 {'learning_rate': 10, 'max_depth': 7} 12 -3.475897e+190 -3.940914e+190 -5.155596e+190 -3.937756e+190 -3.748079e+190 -3.931695e+190 18.480392 7.365415e-03 inf inf
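
Per the table, learning_rate=0.1 with max_depth=7 ranks first on mean CV R². A hedged sketch reading the winner off the fitted search and scoring it on the 20% held-out split created earlier (uses the r2_score import from above):

print(gs.best_params_)     # expected from the table: {'learning_rate': 0.1, 'max_depth': 7}
print(gs.best_score_)      # mean CV R^2 of the best candidate (~0.489 per the table)
print(r2_score(y_test, gs.best_estimator_.predict(X_test)))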

In [70]:
gb = GradientBoostingRegressor(random_state = 1, n_estimators = 100)
param_grid = {'learning_rate': [0.01, 0.05, 0.1, 0.5],
             'max_depth': [8, 10, 13]}

In [ ]:
gs2 = GridSearchCV(estimator = gb, param_grid = param_grid, scoring = 'r2', cv = 3, n_jobs = 3, verbose = 10).fit(X_train, y_train)


Fitting 3 folds for each of 12 candidates, totalling 36 fits
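
Once this second search finishes, a hedged way to compare it against the first on the same held-out split (assumes gs2 has been fitted to completion):

for name, search in [('first grid', gs), ('deeper grid', gs2)]:
    preds = search.best_estimator_.predict(X_test)
    print(name, search.best_params_, r2_score(y_test, preds))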