In [1]:
import ast
import gzip
import struct

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

    
def parse(path):
    """Yield one record per line from a gzipped dump of Python-dict literals.

    The Amazon review/metadata dumps store one dict repr per line (single
    quotes, not strict JSON), so each line is parsed as a Python literal.

    Parameters
    ----------
    path : str
        Path to the ``.json.gz`` file.

    Yields
    ------
    dict
        One parsed record per line.
    """
    # Fixes vs original: the file handle was never closed (leak), and eval()
    # would execute arbitrary code from an untrusted file. ast.literal_eval
    # only accepts literals, which is all this data format contains.
    with gzip.open(path, 'rb') as g:
        for line in g:
            yield ast.literal_eval(line.decode('utf-8'))

def getDF(path):
    """Load a gzipped record-per-line dump into a pandas DataFrame.

    Parameters
    ----------
    path : str
        Path to a ``.json.gz`` file understood by :func:`parse`.

    Returns
    -------
    pd.DataFrame
        One row per record, indexed 0..n-1; columns are the union of the
        record keys.
    """
    # Building the record list once and handing it to the DataFrame
    # constructor replaces the original int-keyed dict +
    # from_dict(orient='index') round-trip; the resulting frame is the same.
    return pd.DataFrame(list(parse(path)))

In [2]:
# Dataset locations for the Amazon "Baby" category.
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR
# so the notebook runs on other machines.
root_dir = "/home/stephanos/Downloads/courses/HackOnData-2017/Amazon/baby/"

# Reviews, product metadata, and precomputed RCNN image-feature CSVs.
reviews_file = root_dir + 'reviews_Baby.json.gz'
meta_file = root_dir + 'meta_Baby.json.gz'
rcnn_image_features = root_dir + 'rcnn_image_features.csv'
rcnn_image_features_resized = root_dir + 'rcnn_image_features_resized.csv'

In [3]:
# Parse the gzipped review and metadata dumps into DataFrames
# (slow: streams and literal-evals every line of each file).
reviews = getDF(reviews_file)
meta = getDF(meta_file)

In [4]:
img_feat = pd.read_csv(rcnn_image_features_resized)

In [5]:
img_feat


Out[5]:
asin 1 2 3 4 5 6 7 8 9 ... 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096
0 B007X64EKG 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.429399 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.821456 5.219251 0.120819
1 B000A1ELMW 1.122768 0.000000 0.000000 0.000000 0.000000 0.000000 2.514104 0.000000 0.000000 ... 0.000000 2.423971 0.000000 0.000557 0.000000 0.000000 0.000000 2.157165 4.932326 0.000000
2 B003953CUS 0.000000 1.989970 0.000000 0.000000 0.000000 0.000000 1.119017 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.124479 1.334983
3 B00415NIJA 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.911995 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.075507
4 B00608N3PK 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.774086 0.000000 0.000000 0.000000 0.739931 0.282007 1.235082
5 B00EWOOPXI 0.000000 0.000000 0.000000 0.000000 0.279954 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.535782 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
6 B004V4ISBO 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.378674 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
7 B00064ATVK 0.000000 1.930494 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.086297 0.000000 0.000000 0.000000 0.000000 0.000000 0.600964 0.288226
8 B008MCX1H2 0.000000 1.204601 0.000000 0.000000 0.000000 0.000000 0.443854 0.881599 0.000000 ... 0.000000 0.000000 0.638428 0.000000 0.000000 0.000000 0.000000 0.311544 2.472051 0.000000
9 B00CMCQMMY 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.548564 0.000000 0.448372 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.932648 0.000000
10 B003JQL722 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.056412 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.609630
11 B007RW30EW 1.421745 2.033537 0.000000 0.000000 0.000000 0.000000 0.154374 1.097668 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.592911 2.991863 0.000000
12 B001CWFLVI 0.000000 0.440103 0.000000 0.000000 0.000000 0.000000 1.871655 0.000000 0.478948 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.657174
13 B007JTSWG4 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.051539 0.681220 0.000000
14 B000P26UIE 0.995920 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.582713 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
15 B00CFRLXWK 1.125746 0.000000 0.000000 0.000000 0.000000 0.000000 0.843812 0.923384 0.000000 ... 0.000000 0.000000 0.620415 0.342110 0.000000 0.347874 0.000000 0.000000 0.822951 0.715261
16 B004BANAGQ 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.307469 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.078823 0.000000 2.441527
17 B00266QLR2 0.000000 0.481290 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.941329 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.075080
18 B0035RQTT6 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.307877 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.356649 0.000000 0.000000 1.156820 5.070184 0.000000
19 B003FI4Q5E 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.125257
20 B008CM58BE 0.873164 0.000000 1.738323 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.283211 1.025279 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.151452
21 B00ATJMGN6 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
22 B00C25TXTA 0.000000 0.979070 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.818404 ... 0.000000 0.000000 0.250654 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
23 B001FBKG66 1.246704 0.000000 0.000000 0.000000 0.000000 0.000000 0.897307 0.128968 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.130653 0.000000 0.609288 0.626373
24 B0037Y25RC 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25 B000N351VM 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.724777 0.300108 0.000000 ... 0.000000 0.096456 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.516119 0.000000
26 B004FH4064 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.259952 ... 0.000000 0.000000 2.281197 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
27 B000V749W2 0.522684 2.240635 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.993432
28 B004C03UIS 0.000000 0.578808 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.211645
29 B008QPRPZY 1.026948 1.818366 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 1.003787 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
71206 B007SBXB1E 0.000000 0.767302 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
71207 B00771Z9N8 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.915328
71208 B00BJCO99Q 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.558754 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.974170
71209 B001RQQFZA 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.239755 0.000000 0.000000 0.369784 0.000000
71210 B000M52H1I 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.269695
71211 B0070O5YIM 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.324051 0.000000 0.269457 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.221421 0.883831 0.000000
71212 B00ED1WMBC 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 1.236745 0.000000 0.000000 0.291496 0.000000
71213 B003OUWIO4 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.772455 2.558613 0.000000
71214 B00DNFQAHG 0.000000 1.653914 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.300097 0.449351
71215 B004L0VR8E 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.399411 0.000000 0.000000 ... 1.339897 0.000000 0.000000 0.504692 0.072557 0.000000 0.000000 0.000000 0.503945 0.000000
71216 B00DGLVFJK 0.000000 0.192061 0.000000 0.000000 1.911973 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.286662 0.000000
71217 B001TH88BG 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.032130 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.128995 4.581048
71218 B00E3OIMKY 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.124517 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.174799 0.000000 0.000000 0.000000 0.000000 1.092291 0.000000
71219 B007KHQBW2 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.369802 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.604163 2.528894 0.810196
71220 B00ESL3TTG 0.000000 0.902281 0.000000 0.000000 0.000000 0.000000 0.600929 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.069776
71221 B007BG1LQ8 0.996784 2.774849 0.000000 0.000000 0.000000 0.000000 2.256393 0.000000 0.000000 ... 0.000000 0.000000 1.333417 0.861594 0.000000 0.000000 0.000000 1.666098 1.062779 0.863270
71222 B000H1HZ2S 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.341946 0.343959 1.408976 ... 0.000000 0.613971 0.000000 0.000000 0.000000 0.000000 0.000000 1.472055 2.411797 1.892486
71223 B006P56PRG 1.271194 0.000000 0.039354 0.823794 0.000000 0.082198 1.100674 0.000000 0.282259 ... 0.000000 0.000000 0.120218 0.000000 0.000000 0.000000 0.804746 0.000000 1.452349 0.000000
71224 B00AV2GBRS 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.527505 0.000000 ... 0.000000 0.691327 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.003949
71225 B007GRCJ72 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.059582 0.020294 0.000000 ... 0.000000 0.024062 0.199907 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
71226 B000H210EQ 1.246475 1.414785 0.000000 0.000000 0.000000 0.000000 0.037084 0.000000 0.000000 ... 0.000000 0.000000 0.684998 0.000000 0.000000 0.000000 0.353153 0.000000 0.000000 0.000000
71227 B004D2AZVK 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 4.544969 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
71228 B004J35L6C 0.000000 0.936775 0.000000 0.000000 0.000000 0.000000 0.284587 0.000000 0.000000 ... 0.000000 0.000000 0.937153 0.000000 0.000000 0.000000 0.000000 0.000000 0.513180 0.000000
71229 B00006AKYR 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.271163 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.483101
71230 B007BH1W52 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.032861 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.231445 0.000000
71231 B00EUI1L7O 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.028777 1.468933 0.000000 0.000000 0.000000 0.000000 0.000000 0.636482 0.000000
71232 B00BH76VOY 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.198771 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
71233 B003V3QZIE 0.000000 0.000000 0.000000 0.000000 0.701298 0.000000 0.405440 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.098799 2.046280
71234 B0037NZ2H8 0.000000 0.000000 0.000000 0.000000 0.007007 0.000000 0.638287 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.780315
71235 B00474TJCK 0.000000 0.000000 0.000000 0.162207 0.000000 0.516881 2.589649 0.000000 0.087725 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.156647 0.000000

71236 rows × 4097 columns


In [6]:
# Per-product (asin) mean review time and mean star rating.
# Fixes: selecting multiple groupby columns with a bare list is deprecated in
# pandas — use a double-bracket list — and the 'mean' aggregator replaces
# np.average (identical for unweighted data, and pandas-native).
reviewsDF = reviews.set_index('asin').groupby(level = 0)[['unixReviewTime', 'overall']].mean()
# Free the raw reviews frame; kernel memory persists across cells.
del reviews

In [7]:
meta = meta[['price','asin','title']].set_index('asin')

In [8]:
# Inner-join metadata with the per-product review aggregates on the asin
# index, dropping products missing any field (e.g. no price).
df = meta.merge(reviewsDF, how = 'inner', left_index = True, right_index = True).dropna(how = 'any')
df.head()


Out[8]:
price title unixReviewTime overall
asin
0188399313 69.99 Lifefactory 4oz BPA Free Glass Baby Bottles - ... 1.369613e+09 5.000000
0188399518 15.95 Planetwise Flannel Wipes 1.382789e+09 3.500000
0188399399 10.95 Planetwise Wipe Pouch 1.365466e+09 5.000000
0316967297 109.95 Annas Dream Full Quilt with 2 Shams 1.371168e+09 4.500000
0615447279 16.95 Stop Pacifier Sucking without tears with Thumb... 1.348464e+09 4.333333

In [9]:
# Convert the epoch-seconds review time to a proper datetime column, then
# discard the raw unix timestamp in one chained step.
df = df.assign(time=pd.to_datetime(df['unixReviewTime'], unit='s')).drop(columns='unixReviewTime')

In [10]:
# Number of products with a review-time in each half-year bucket.
# Fix: pd.TimeGrouper was deprecated and later removed from pandas;
# pd.Grouper(freq=...) is the supported equivalent.
plt.figure()
df.set_index('time')['price'].groupby(pd.Grouper(freq='6M')).count().plot(kind='bar')
plt.show()



In [11]:
# Prices are heavy-tailed, so plot the distribution of log-price.
plt.figure()
np.log(df['price']).plot.hist(bins = 50,)
plt.xlabel('ln(price)')
plt.show()



In [11]:
# Restrict to products whose mean review time is 2013 or later.
# NOTE(review): this cell shares execution count In[11] with the previous one —
# the notebook was run out of order; restart and run all before publishing.
df = df[df['time'] > '2013-01-01']
df.shape


Out[11]:
(33378, 4)

In [12]:
import nltk
#nltk.download()
from nltk.collocations import *
from nltk.corpus import stopwords

# Tokenize all product titles into one flat word list.
words = nltk.word_tokenize(' '.join(df['title']))
print(len(words))
# Fix: `stopset` was built but never used — the original comprehension called
# stopwords.words('english') (rebuilding the list) on every token, making the
# filter O(words x stopwords). Membership in the precomputed set is O(1) and
# gives identical results.
stopset = set(stopwords.words('english'))
bigram_measures = nltk.collocations.BigramAssocMeasures()
filtered_words = [w for w in words if w not in stopset]
print(len(filtered_words))


360191
344911

In [14]:
import string
# Drop punctuation tokens. Note: `x not in string.punctuation` is a SUBSTRING
# test on the punctuation string, so multi-character tokens are only removed
# if they happen to occur verbatim inside string.punctuation — converting the
# right-hand side to a set would change behavior for such tokens.
filtered_words = [x for x in filtered_words if x not in string.punctuation]
print(len(filtered_words))


293700

In [15]:
# Top-20 most frequent bigrams (raw frequency) in the filtered title tokens.
finder = BigramCollocationFinder.from_words(filtered_words)
finder.nbest(bigram_measures.raw_freq, 20)


Out[15]:
[('Bedding', 'Set'),
 ('Car', 'Seat'),
 ('Diaper', 'Bag'),
 ('2', 'Pack'),
 ('Carter', "'s"),
 ('Crib', 'Bedding'),
 ('Jojo', 'Designs'),
 ('Sweet', 'Jojo'),
 ('Cloth', 'Diaper'),
 ('Summer', 'Infant'),
 ('One', 'Size'),
 ('BPA', 'Free'),
 ('Safety', '1st'),
 ('Changing', 'Pad'),
 ('Crib', 'Sheet'),
 ('Lambs', 'amp'),
 ('amp', 'Ivy'),
 ('3', 'Pack'),
 ('Gift', 'Set'),
 ('4', 'Piece')]

In [16]:
# Frequency plot of the 50 most common tokens.
word_fd = nltk.FreqDist(filtered_words)
word_fd.plot(50,cumulative=False)



In [17]:
# Keep purely alphabetic tokens, then count distinct case-folded words.
words_only = [token for token in filtered_words if token.isalpha()]
unique = {token.lower() for token in words_only}
len(unique)


Out[17]:
13978

In [18]:
# Zipf-style view: log frequency of each token by rank.
# NOTE(review): word_fd is recomputed here (and again in the next cell) —
# it is identical to the FreqDist built two cells above.
word_fd = nltk.FreqDist(filtered_words)
plt.figure()
plt.plot(list(range(len(word_fd))),np.log(sorted(word_fd.values(),reverse = True)))
plt.xlabel('# Words')
plt.ylabel('Log Frequency')
plt.show()



In [19]:
# Cumulative token-frequency curve: how many occurrences the top-k ranked
# words cover. (word_fd recomputed again — same as above.)
word_fd = nltk.FreqDist(filtered_words)
plt.figure()
plt.plot(list(range(len(word_fd))),np.array(sorted(word_fd.values(),reverse = True)).cumsum())
plt.xlabel('# Words')
plt.ylabel('Cumulative Frequency')
plt.show()



In [13]:
df = img_feat.set_index('asin').merge(df, how = 'inner', left_index = True, right_index = True).dropna(how = 'any')

In [14]:
# Split off the regression targets (price, rating); df keeps only the
# feature columns afterwards.
price = df['price']
rating = df['overall']
df = df.drop(['price','overall','time'],axis = 1)

In [15]:
del img_feat

In [16]:
df.shape


Out[16]:
(33378, 4097)

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
countvect = CountVectorizer(analyzer = 'word', tokenizer = nltk.word_tokenize
                , stop_words = 'english', min_df=5, binary=True)

In [19]:
# Document-frequency histogram: for each count c, how many vocabulary terms
# appear in exactly c titles.
title_counts = countvect.fit_transform(df['title'])

from collections import defaultdict

d = defaultdict(set)
for w in countvect.vocabulary_:
    # column index of term w in the sparse document-term matrix
    c = title_counts[:,countvect.vocabulary_.get(w)].count_nonzero()
    d[c].add(w)

[(k,len(v)) for k,v in sorted(d.items())]


Out[19]:
[(5, 574),
 (6, 478),
 (7, 413),
 (8, 263),
 (9, 252),
 (10, 218),
 (11, 203),
 (12, 165),
 (13, 138),
 (14, 118),
 (15, 122),
 (16, 83),
 (17, 89),
 (18, 86),
 (19, 77),
 (20, 75),
 (21, 73),
 (22, 55),
 (23, 48),
 (24, 67),
 (25, 61),
 (26, 50),
 (27, 49),
 (28, 42),
 (29, 30),
 (30, 42),
 (31, 30),
 (32, 32),
 (33, 33),
 (34, 39),
 (35, 24),
 (36, 28),
 (37, 32),
 (38, 23),
 (39, 32),
 (40, 17),
 (41, 34),
 (42, 20),
 (43, 16),
 (44, 24),
 (45, 13),
 (46, 17),
 (47, 15),
 (48, 19),
 (49, 29),
 (50, 6),
 (51, 15),
 (52, 20),
 (53, 11),
 (54, 17),
 (55, 11),
 (56, 18),
 (57, 17),
 (58, 8),
 (59, 15),
 (60, 11),
 (61, 10),
 (62, 11),
 (63, 15),
 (64, 10),
 (65, 10),
 (66, 8),
 (67, 13),
 (68, 10),
 (69, 8),
 (70, 6),
 (71, 11),
 (72, 11),
 (73, 11),
 (74, 8),
 (75, 9),
 (76, 8),
 (77, 14),
 (78, 9),
 (79, 10),
 (80, 10),
 (81, 12),
 (82, 12),
 (83, 2),
 (84, 11),
 (85, 7),
 (86, 7),
 (87, 4),
 (88, 3),
 (89, 6),
 (90, 5),
 (91, 7),
 (92, 7),
 (93, 5),
 (94, 4),
 (95, 7),
 (96, 6),
 (97, 4),
 (98, 2),
 (99, 10),
 (100, 2),
 (101, 6),
 (102, 8),
 (103, 2),
 (104, 3),
 (105, 3),
 (106, 4),
 (107, 2),
 (108, 3),
 (109, 4),
 (110, 2),
 (111, 2),
 (112, 6),
 (113, 4),
 (114, 4),
 (115, 6),
 (116, 5),
 (117, 6),
 (118, 3),
 (119, 5),
 (120, 4),
 (121, 1),
 (122, 1),
 (123, 1),
 (124, 2),
 (125, 2),
 (126, 2),
 (127, 3),
 (128, 7),
 (129, 1),
 (130, 5),
 (131, 2),
 (132, 3),
 (133, 2),
 (134, 3),
 (135, 2),
 (136, 7),
 (137, 3),
 (138, 3),
 (139, 1),
 (140, 2),
 (141, 3),
 (142, 2),
 (143, 2),
 (144, 4),
 (145, 3),
 (146, 3),
 (147, 3),
 (148, 3),
 (149, 1),
 (150, 1),
 (151, 3),
 (152, 1),
 (153, 3),
 (154, 2),
 (155, 2),
 (156, 2),
 (157, 4),
 (158, 1),
 (159, 2),
 (160, 2),
 (161, 2),
 (162, 5),
 (163, 2),
 (164, 2),
 (165, 2),
 (166, 2),
 (167, 2),
 (168, 3),
 (169, 2),
 (170, 3),
 (171, 2),
 (172, 2),
 (173, 4),
 (174, 3),
 (175, 1),
 (176, 1),
 (180, 3),
 (182, 1),
 (183, 2),
 (184, 2),
 (185, 1),
 (186, 4),
 (187, 2),
 (188, 1),
 (189, 1),
 (191, 2),
 (192, 1),
 (194, 1),
 (195, 1),
 (196, 2),
 (197, 1),
 (198, 2),
 (199, 1),
 (200, 1),
 (201, 3),
 (202, 2),
 (203, 2),
 (204, 2),
 (206, 1),
 (207, 1),
 (211, 1),
 (212, 4),
 (213, 2),
 (214, 1),
 (217, 1),
 (218, 1),
 (219, 1),
 (220, 3),
 (221, 1),
 (222, 2),
 (223, 1),
 (224, 1),
 (225, 2),
 (226, 3),
 (228, 2),
 (229, 2),
 (233, 2),
 (234, 1),
 (236, 1),
 (237, 2),
 (238, 1),
 (241, 1),
 (243, 1),
 (245, 2),
 (246, 1),
 (248, 2),
 (250, 2),
 (251, 1),
 (252, 1),
 (253, 1),
 (254, 1),
 (255, 1),
 (256, 1),
 (257, 1),
 (261, 1),
 (263, 1),
 (265, 1),
 (267, 1),
 (269, 2),
 (271, 1),
 (273, 1),
 (274, 1),
 (280, 1),
 (283, 2),
 (284, 2),
 (285, 1),
 (287, 1),
 (288, 1),
 (290, 2),
 (291, 1),
 (292, 1),
 (293, 1),
 (297, 1),
 (298, 1),
 (300, 1),
 (302, 1),
 (303, 1),
 (304, 1),
 (305, 1),
 (306, 2),
 (308, 1),
 (310, 1),
 (311, 2),
 (314, 2),
 (324, 2),
 (325, 1),
 (326, 3),
 (327, 2),
 (329, 1),
 (331, 1),
 (332, 1),
 (334, 1),
 (335, 3),
 (336, 2),
 (340, 1),
 (346, 2),
 (348, 2),
 (349, 1),
 (353, 1),
 (357, 2),
 (360, 1),
 (363, 1),
 (368, 1),
 (375, 1),
 (381, 2),
 (389, 2),
 (390, 1),
 (392, 1),
 (404, 1),
 (405, 1),
 (414, 2),
 (424, 2),
 (425, 1),
 (426, 1),
 (437, 1),
 (440, 1),
 (441, 1),
 (454, 1),
 (455, 1),
 (456, 1),
 (462, 1),
 (468, 1),
 (470, 1),
 (472, 1),
 (475, 3),
 (476, 2),
 (480, 1),
 (481, 1),
 (490, 1),
 (503, 2),
 (507, 1),
 (508, 1),
 (509, 1),
 (516, 1),
 (529, 1),
 (535, 1),
 (541, 1),
 (542, 1),
 (563, 1),
 (570, 1),
 (573, 1),
 (623, 1),
 (656, 1),
 (660, 1),
 (669, 1),
 (678, 1),
 (679, 1),
 (687, 1),
 (688, 2),
 (695, 3),
 (717, 1),
 (723, 1),
 (741, 1),
 (753, 1),
 (769, 1),
 (792, 1),
 (795, 1),
 (815, 1),
 (856, 1),
 (882, 1),
 (894, 1),
 (902, 1),
 (958, 1),
 (961, 1),
 (988, 1),
 (994, 1),
 (1007, 1),
 (1039, 1),
 (1081, 1),
 (1087, 1),
 (1096, 1),
 (1116, 1),
 (1132, 1),
 (1205, 1),
 (1211, 1),
 (1273, 1),
 (1274, 1),
 (1424, 1),
 (1519, 1),
 (1566, 1),
 (1613, 1),
 (1766, 1),
 (1867, 1),
 (1881, 1),
 (1964, 1),
 (2023, 1),
 (2189, 1),
 (2363, 1),
 (2410, 1),
 (3156, 1),
 (4095, 1),
 (4134, 1),
 (4542, 1),
 (4543, 1),
 (6588, 1),
 (8150, 1),
 (12562, 1)]

In [32]:
countvect.fit_transform(df['title']).toarray()


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-32-d53001041c62> in <module>()
----> 1 countvect.fit_transform(df['title']).toarray().count_nonzeros()

AttributeError: 'numpy.ndarray' object has no attribute 'count_nonzeros'

In [34]:
from scipy.sparse import hstack
# Final design matrix: the 4096 image-feature columns (df minus 'title')
# stacked with the binary title bag-of-words, then densified with .toarray()
# (memory-heavy: 33k rows x 9400 columns per the shape shown below).
allFeatures = hstack((df.drop('title',axis = 1).values, countvect.fit_transform(df['title']) )).toarray()

In [36]:
allFeatures.shape


Out[36]:
(33378, 9400)

In [37]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

# 80/20 train/test split with a fixed seed; the target is price.
X_train, X_test, y_train, y_test = train_test_split(allFeatures, price, test_size = 0.2, random_state = 1)

In [38]:
# Gradient-boosting model and its hyperparameter search space.
gb = GradientBoostingRegressor(random_state = 1, n_estimators = 100)
param_grid = {'learning_rate': [0.01, 0.1, 0.3],
             'max_depth': [7]}

# Earlier runs also tried learning rates 0.05 / 0.08 and max_depth 10.

In [ ]:
gs = GridSearchCV(estimator = gb, param_grid = param_grid, scoring = 'r2', cv = 3, n_jobs = 4, verbose = 10).fit(X_train, y_train)


Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] learning_rate=0.01, max_depth=7 .................................
[CV] learning_rate=0.01, max_depth=7 .................................
[CV] learning_rate=0.01, max_depth=7 .................................
[CV] learning_rate=0.1, max_depth=7 ..................................
[CV] ... learning_rate=0.1, max_depth=7, score=0.512798, total=38.1min
[CV] learning_rate=0.1, max_depth=7 ..................................
[CV] .. learning_rate=0.01, max_depth=7, score=0.351887, total=45.6min
[CV] learning_rate=0.1, max_depth=7 ..................................
[CV] .. learning_rate=0.01, max_depth=7, score=0.357887, total=45.8min
[CV] learning_rate=0.3, max_depth=7 ..................................
[Parallel(n_jobs=4)]: Done   3 out of   9 | elapsed: 45.9min remaining: 91.8min
[CV] .. learning_rate=0.01, max_depth=7, score=0.361162, total=46.7min
[CV] learning_rate=0.3, max_depth=7 ..................................
[Parallel(n_jobs=4)]: Done   4 out of   9 | elapsed: 46.9min remaining: 58.6min
[CV] ... learning_rate=0.1, max_depth=7, score=0.486881, total=37.7min
[CV] learning_rate=0.3, max_depth=7 ..................................
[Parallel(n_jobs=4)]: Done   5 out of   9 | elapsed: 76.1min remaining: 60.8min
[CV] ... learning_rate=0.3, max_depth=7, score=0.459171, total=33.6min
[Parallel(n_jobs=4)]: Done   6 out of   9 | elapsed: 79.5min remaining: 39.8min
[CV] ... learning_rate=0.3, max_depth=7, score=0.434054, total=34.0min
[Parallel(n_jobs=4)]: Done   7 out of   9 | elapsed: 80.9min remaining: 23.1min
[CV] ... learning_rate=0.1, max_depth=7, score=0.503045, total=37.1min
[CV] ... learning_rate=0.3, max_depth=7, score=0.457423, total=25.8min
[Parallel(n_jobs=4)]: Done   9 out of   9 | elapsed: 101.8min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   9 out of   9 | elapsed: 101.8min finished

In [64]:
import pickle
# Cache the fitted grid search so the ~100-minute fit isn't repeated.
# NOTE(review): the file handle is never closed — prefer a `with open(...)`.
pickle.dump(gs, open('gsgd','wb'))

In [65]:
gs = pickle.load(open('gsgd','rb'))

In [67]:
pd.DataFrame(gs.cv_results_)


Out[67]:
mean_fit_time mean_score_time mean_test_score mean_train_score param_learning_rate param_max_depth params rank_test_score split0_test_score split0_train_score split1_test_score split1_train_score split2_test_score split2_train_score std_fit_time std_score_time std_test_score std_train_score
0 566.670623 0.114586 2.786318e-01 3.243149e-01 0.01 3 {'learning_rate': 0.01, 'max_depth': 3} 6 2.746436e-01 3.206852e-01 2.756077e-01 3.257986e-01 2.856449e-01 3.264610e-01 0.880525 7.364516e-03 0.004974 0.002581
1 986.255134 0.119795 3.309793e-01 4.812772e-01 0.01 5 {'learning_rate': 0.01, 'max_depth': 5} 5 3.311018e-01 4.812723e-01 3.247332e-01 4.761615e-01 3.371035e-01 4.863977e-01 1.923422 7.367271e-03 0.005051 0.004179
2 1485.886665 0.145834 3.545946e-01 5.994504e-01 0.01 7 {'learning_rate': 0.01, 'max_depth': 7} 4 3.550735e-01 6.018290e-01 3.471884e-01 5.952919e-01 3.615226e-01 6.012302e-01 7.834503 7.365808e-03 0.005862 0.002951
3 562.740034 0.114584 4.494290e-01 6.398482e-01 0.1 3 {'learning_rate': 0.1, 'max_depth': 3} 3 4.584442e-01 6.370217e-01 4.378416e-01 6.420067e-01 4.520016e-01 6.405162e-01 0.943872 7.366033e-03 0.008606 0.002089
4 989.149054 0.139613 4.848307e-01 8.158985e-01 0.1 5 {'learning_rate': 0.1, 'max_depth': 5} 2 4.922831e-01 8.141078e-01 4.711908e-01 8.157892e-01 4.910190e-01 8.177984e-01 0.990014 1.433218e-03 0.009659 0.001509
5 1452.639853 0.145834 4.886593e-01 9.063788e-01 0.1 7 {'learning_rate': 0.1, 'max_depth': 7} 1 4.976960e-01 9.064331e-01 4.791010e-01 9.076069e-01 4.891809e-01 9.050965e-01 4.112546 7.365639e-03 0.007600 0.001026
6 565.015646 0.120319 1.792066e-02 8.619998e-01 1 3 {'learning_rate': 1, 'max_depth': 3} 7 4.932588e-02 8.617218e-01 -3.529163e-02 8.642543e-01 3.973017e-02 8.600232e-01 0.675001 7.765183e-03 0.037831 0.001738
7 970.525649 0.140625 -1.198566e-01 9.623524e-01 1 5 {'learning_rate': 1, 'max_depth': 5} 8 -1.098837e-01 9.611571e-01 -1.242398e-01 9.622157e-01 -1.254470e-01 9.636842e-01 0.890323 3.893359e-07 0.007069 0.001036
8 1368.443473 0.145834 -1.649937e-01 9.907511e-01 1 7 {'learning_rate': 1, 'max_depth': 7} 9 -1.382360e-01 9.904348e-01 -2.061176e-01 9.913164e-01 -1.506260e-01 9.905020e-01 5.178201 7.365527e-03 0.029516 0.000401
9 563.856970 0.104178 -1.796564e+190 -1.918447e+190 10 3 {'learning_rate': 10, 'max_depth': 3} 10 -1.592175e+190 -1.868740e+190 -1.839157e+190 -1.942958e+190 -1.958380e+190 -1.943643e+190 1.106056 7.351815e-03 inf inf
10 981.543549 0.119794 -2.579831e+190 -2.902438e+190 10 5 {'learning_rate': 10, 'max_depth': 5} 11 -2.521610e+190 -2.819105e+190 -2.814671e+190 -2.901231e+190 -2.403191e+190 -2.986977e+190 0.944962 7.363448e-03 inf inf
11 1519.095625 0.130208 -4.126538e+190 -3.936789e+190 10 7 {'learning_rate': 10, 'max_depth': 7} 12 -3.475897e+190 -3.940914e+190 -5.155596e+190 -3.937756e+190 -3.748079e+190 -3.931695e+190 18.480392 7.365415e-03 inf inf

In [ ]: