In [6]:
from os import listdir
from os.path import isfile, join
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import codecs
# Hard-coded absolute directory path; none of the cells visible in this
# part of the notebook read it. NOTE(review): presumably the blog corpus
# location used elsewhere -- confirm before removing or relocating.
mypath ="/Anaconda/blogs"
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from time import time
import sys
import scipy.sparse as sp
import pylab as pl
import cPickle
import sqlite3

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
# Restore the pickled random-forest classifier from the working directory.
# The context manager closes the file as soon as unpickling finishes.
with open('RF_classifier2.pkl', 'rb') as model_file:
    RF_loaded = cPickle.load(model_file)

In [5]:
# Show the hyper-parameters of the unpickled classifier.
print(RF_loaded)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=32, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=4,
            oob_score=False, random_state=None, verbose=True,
            warm_start=False)

In [27]:
import os

import pandas as pd
import sqlite3

# Location of the scraped review database, relative to the notebook.
# Built with os.path.join so the same cell works on Windows and POSIX
# (the previous literal used Windows-only backslash separators).
db_path = os.path.join("Scrape-Amazon", "amazon", "Book.db")

# sqlite3.connect() silently creates an empty database when the file is
# missing, which would surface later as a confusing "no such table" error
# and leave a stray file behind. Fail early with a clear message instead.
if not os.path.isfile(db_path):
    raise IOError("Review database not found: %s" % db_path)

# Read sqlite query results into a pandas DataFrame
con = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query("SELECT * from Review", con)
finally:
    # Release the database handle even if the query raises.
    con.close()

# verify that result of SQL query is stored in the dataframe
print(df.head())


   id             rid              rname         pid  \
0   1  R3A9TW37U9J9C3    College Stealth  0385535597   
1   2   REDTPL6B4HNHN      Connor Gibson  0385535597   
2   3   RB04JM6NDDGFL            Panda31  0385535597   
3   4  R37N2QMX2MYD5T  WhatIThinkAboutIt  006219867X   
4   5  R16ENI6A3DH1DN                Dan  0385535597   

                                              review  
0  This book is a hard read; not because of Lexil...  
1  This book is rapidly making waves, after the N...  
2  An enthralling and comprehensive story of the ...  
3  My 3-year-old loves Pete the Cat books and is ...  
4  This is the most exhaustive, best researched, ...  

In [28]:
# Display the review table loaded from the database; the notebook renders
# only the leading and trailing rows of the frame.
df


Out[28]:
id rid rname pid review
0 1 R3A9TW37U9J9C3 College Stealth 0385535597 This book is a hard read; not because of Lexil...
1 2 REDTPL6B4HNHN Connor Gibson 0385535597 This book is rapidly making waves, after the N...
2 3 RB04JM6NDDGFL Panda31 0385535597 An enthralling and comprehensive story of the ...
3 4 R37N2QMX2MYD5T WhatIThinkAboutIt 006219867X My 3-year-old loves Pete the Cat books and is ...
4 5 R16ENI6A3DH1DN Dan 0385535597 This is the most exhaustive, best researched, ...
5 6 R2LHYLHQSHW2UK Jackie Cooper 0996275460 Filled with Captivating, Complex and Intricate...
6 7 RMEC81XLHVUBI Rebecca Monk Dezan 006219867X The Pete the Cat books without the collaborati...
7 8 RFSLZU7RD8ZAP Alex 1632154560 This entire compendium is a steal. It's highwa...
8 9 R1R8CRQDKLDBJC Antoinette Klein 0679805273 Dr. Seuss isn't just for the pre-school set, b...
9 10 R5NOHMGFOJEG3 Laura Butterfield 0451469828 This is an adorable addition to the Llama Llam...
10 11 R1ZHRZ96DN5G4T Amazon Customer 0385535597 A lot has been written about the Koch Brothers...
11 12 R14AKNOON6D35J Jackie Cooper 1574219952 This book includes 30 full paged illustrations...
12 13 R1EZ6YBUP7ZFGS Graymouser65 0786965606 I am going to try to not duplicate the informa...
13 14 R18SRHT1IPWDXH Tonya Weber 0996275460 Love it! Printed only on one side.
14 15 R2YV74WS97I1WW Shannon S. Ash 006219867X First, I'd better be honest. We LOVE Pete the ...
15 16 RO6RX4477U4IM C.H. Smith 1632154560 If you love the show, you'll love this too. Wa...
16 17 R1ZFUNAJ8QCAJ3 Danielle 0679805273 As I embark on yet ANOTHER new chapter of my l...
17 18 R1FUGAQEI3B92Z Amazon Customer 0451469828 I bought this for 2 of my 2 year old grand kid...
18 19 RC9RWEGZ2RDT9 R. L. Chacona 0385535597 This excellent book investigates the perfidy t...
19 20 R1WLMH31PDL745 Jody E 1574219952 I really love this Good Vibes Coloring Book!!...
20 21 R39QTM4RH9KHVY ĴĴ 1594746036 To be honest, when I first started reading "Mi...
21 22 REWCMZ146FIFA Anders 0786965606 So, the fifth edition of the venerable Dungeon...
22 23 R1TLE1BS2A94LO John D. Harris 1616149981 The back-jacket summary drew me in and proved ...
23 24 R1W5LUJHHE79Y6 Gursimran. 144947425X I bought this book because I love to hear the ...
24 25 R9UTXOOER51LL Krista 0996275460 Pros:\n- Neat paper\n- Not as pix-elated as si...
25 26 R3HWBTPXMRILUH S. Hamilton 006219867X We miss the Pete books written by Eric Litwin....
26 27 R2X2GMWP0ELOS6 H. P. 1632154560 The Walking Dead, Compendium 3 collects comics...
27 28 R2I1Y8HYZ6VXJ1 Donald Mitchell 0679805273 Researchers constantly find that reading to ch...
28 29 R22OA8ABXDCGKL Jackie Cooper 0451469828 My 2 year old granddaughter loves this book an...
29 30 R1FEQQRL539BX4 Carbonlord 0385535597 Starting back in 2010, Jane Mayer had publishe...
... ... ... ... ... ...
10849 10850 R1N2VV15SVXO3J Virginia Hinojosa 1302900153 My daughter is a fan and loves the Punisher, t...
10850 10851 R28BCE7A2FYMI0 Amazon Customer 1743216742 It is full of useful information and great to ...
10851 10852 R3A8BD79XAHGZU Amazon Customer 1440593345 Great job Mr. Stewart! Very much enjoyed revie...
10852 10853 R2VAS4EFH987V5 Amazon Customer 1451695195 I find this book by Eben Alexander to be both ...
10853 10854 R6EMJE70DBDPE RatherBeTraveling 158979799X This has been a great resource for my small ho...
10854 10855 R220ISGEW9RJ82 Erin (The Hardcover Lover) 1250007224 I have to start my review off by stating that ...
10855 10856 R1JYBSKNRVP5G7 Peggy A. Holloway 1439190275 I have been a devout follower of a low-carb li...
10856 10857 R146AN1OGR724V Chris R 156148640X I bought this hoping to be inspired to bring m...
10857 10858 R229O5NEQ388QQ Annie 1616204516 I waited with great anticipation for this book...
10858 10859 R3OGULDO1E6CKT Wildisland 0062270451 My "child" is heading off to college, and beca...
10859 10860 R9M0XGF41LS4Y Edmund Harriss 0470894520 A great overview of mathematics education rese...
10860 10861 R3VXX4GA1EPLM3 Michael E. Fleming 1484710800 My son loves this book. He is 11 and a huge G...
10861 10862 R2NRM3VNLI31DC Daniel Hopp 1302900153 Overall - covers are great. Artwork is meh.
10862 10863 RL9YBLYX7HR0X Graziella Sy 1743216742 This was very helpful when I first traveled to...
10863 10864 R38N5CSEVGUCDB CBP 1440593345 This book has many clever, useful ideas. I wi...
10864 10865 R3KTHD5E1E22IV Rosiedoll123 1439190275 I am 5'3" 25 years old when I started atkins I...
10865 10866 R23F37XSMWCMNH Thomas D. Kehoe 156148640X I bought a slow cooker after hearing gourmets ...
10866 10867 R1SF9QLTSOQCF Bondalini 1616204516 Very readable and well-written, just didn't ha...
10867 10868 R2Q25O3ZJ7A62P Ross Williams 0062270451 One key point the book makes is that "explosiv...
10868 10869 R2HB6DATK2XADC Book Lover in SC 0470894520 My undergraduate students are studying this b...
10869 10870 R1RH21O1TEVXCQ maroon5lover<3 1484710800 This book is awesome for all Gravity Falls fan...
10870 10871 R1MQGOWL4LQSSJ Lindsey Stephens 1743216742 The book is interesting and has given me good ...
10871 10872 R3OV8XJZI1Q1Q8 Jesse L.Belville 1440593345 interesting, informatitve and leads you to man...
10872 10873 RNWJ7I84X98NY armywifeandmom 156148640X When I saw that this cookbook had 1,400 recipe...
10873 10874 R15SN8BR4LCKEB Jocelyn 0062270451 I absolutely love the approach presented in th...
10874 10875 RQLHWHU1JH11D Cherise King 0470894520 This book is fantastic! Jo Boaler has worked h...
10875 10876 R39X3OONTQQFRW Christina 1743216742 Very helpful if one wants to see traditional s...
10876 10877 R1LMMXL5T7D5JV Sheena 156148640X Hi,\n\nI just got this cookbook in the mail to...
10877 10878 R2VHW3B0L4CBCM randomreviewer 0062270451 After a lot of time not being helped by doctor...
10878 10879 R198RZ2UC199WV m.s.o. 0062270451 My explosive child is now 22 years old. We sta...

10879 rows × 5 columns


In [29]:
# Pull the review text column out of the frame as a plain Python list,
# one entry per row, in row order.
reviews = df.review.tolist()

In [30]:
# Restore the pickled vectorizer (presumably a fitted TF-IDF vectorizer,
# judging by the file name) so new text can be mapped to features.
with open('TFIDF_Vectorizer.pkl', 'rb') as vectorizer_file:
    Vect_loaded = cPickle.load(vectorizer_file)

In [31]:
# Turn the raw review strings into feature vectors with the unpickled
# vectorizer; transform() is used (not fit) so the loaded object is applied as-is.
Review_Vectorized = Vect_loaded.transform(reviews)

In [32]:
# Sanity check: dimensions of the vectorized review matrix.
print(Review_Vectorized.shape)


(10879, 585606)

In [33]:
# Classify every vectorized review with the loaded random forest and report
# how long the prediction call took.
print("Predicting the outcomes of the testing set")
t0 = time()  # start the clock immediately before predict() so only inference is timed
pred = RF_loaded.predict(Review_Vectorized)
print("done in %fs" % (time() - t0))
print(pred)


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.3s finished
Predicting the outcomes of the testing set
done in 0.564000s
[2 2 2 ..., 2 2 2]

In [34]:
# Count how many reviews received each predicted class label and print one
# line per age group. The label -> age-range pairing below is taken as-is
# from the messages in this cell.
predicted_labels = np.array(pred)
age_group_captions = (
    (1, "Number of Reviewrs in Age group: 13-17"),
    (2, "Number of Reviewrs in Age group: 17-33"),
    (3, "Number of Reviewrs in Age group: 33 - "),
)
for class_label, caption in age_group_captions:
    print("%s %s" % (caption, np.sum(predicted_labels == class_label)))


Number of Reviewrs in Age group: 13-17 4
Number of Reviewrs in Age group: 17-33 10875
Number of Reviewrs in Age group: 33 -  0

In [ ]:
# Load the second pickled classifier (presumably Naive Bayes, per the file
# name) under its own name. It was previously assigned to Vect_loaded, which
# overwrote the vectorizer and would break any re-run of the transform cell.
# NOTE(review): cPickle.load can execute arbitrary code -- only load pickle
# files produced by a trusted source.
with open('NB_Classifier.pkl', 'rb') as fid:
    NB_loaded = cPickle.load(fid)