notebook.community

Edit and run



In [1]:

    
%matplotlib inline
import datetime
from collections import Counter

import pandas as pd

from sentiment_classification import run_network
from utils import clean_text
from utils import calc_ratios



In [2]:

    
from IPython.core.display import HTML
css = open('table.css').read() + open('notebook.css').read()
HTML('<style>{}</style>'.format(css))









    Out[2]:



In [3]:

    
reviews = pd.read_csv("reviews.csv", encoding="utf-8")
reviews.head(5)









    Out[3]:







  
    
      
      userID
      productID
      userName
      reviewText
      summary
      unixReviewTime
      reviewTime
    
  
  
    
      0
      A2HD75EMZR8QLN
      0700099867
      123
      Installing the game was a struggle (because of...
      Pay to unlock content? I don't think so.
      1341792000
      07 9, 2012
    
    
      1
      A3UR8NLLY1ZHCX
      0700099867
      Alejandro Henao "Electronic Junky"
      If you like rally cars get this game you will ...
      Good rally game
      1372550400
      06 30, 2013
    
    
      2
      A1INA0F5CWW3J4
      0700099867
      Amazon Shopper "Mr.Repsol"
      1st shipment received a book instead of the ga...
      Wrong key
      1403913600
      06 28, 2014
    
    
      3
      A1DLMTOTHQ4AST
      0700099867
      ampgreen
      I got this version instead of the PS3 version,...
      awesome game, if it did not crash frequently !!
      1315958400
      09 14, 2011
    
    
      4
      A361M14PU2GUEG
      0700099867
      Angry Ryan "Ryan A. Forrest"
      I had Dirt 2 on Xbox 360 and it was an okay ga...
      DIRT 3
      1308009600
      06 14, 2011



In [4]:

    
ratings = pd.read_csv("ratings.csv", encoding="utf-8")
ratings.head(5)









    Out[4]:







  
    
      
      userID
      productID
      ratings
      timestamp
    
  
  
    
      0
      A24SSUT5CSW8BH
      0078764343
      5.0
      1377302400
    
    
      1
      AK3V0HEBJMQ7J
      0078764343
      4.0
      1372896000
    
    
      2
      A10BECPH7W8HM7
      043933702X
      5.0
      1404950400
    
    
      3
      A2PRV9OULX1TWP
      043933702X
      5.0
      1386115200
    
    
      4
      AE7GUHCDQQ4UI
      043933702X
      1.0
      1366156800

How many reviews are there?



In [5]:

    
len(reviews)









    Out[5]:





231780

Who has the most review data?



In [6]:

    
reviews[reviews["userID"] == reviews["userID"].value_counts().idxmax()].userName.head(1)









    Out[6]:





1400    Lisa Shea "be the change you wish to see in t...
Name: userName, dtype: object

Convert unixReviewTime field to date field and add it dataframe with column name date



In [7]:

    
reviews["date"] = pd.to_datetime(reviews["unixReviewTime"], unit='s')

What is the very first reviewText?



In [8]:

    
reviews.sort_values(by="unixReviewTime", ascending=True).head(1)[["reviewText","date"]]









    Out[8]:







  
    
      
      reviewText
      date
    
  
  
    
      4410
      I'm having the most fun I've ever had on PlayS...
      1999-10-14

What is the name of person that made first review?



In [9]:

    
reviews.sort_values(by="unixReviewTime", ascending=True).head(1)[["userName","date"]]









    Out[9]:







  
    
      
      userName
      date
    
  
  
    
      4410
      "kwobooks"
      1999-10-14

What is the id of video games that is the last reviewed?



In [10]:

    
reviews.sort_values(by="unixReviewTime", ascending=False).head(1)[["productID","date"]]









    Out[10]:







  
    
      
      productID
      date
    
  
  
    
      218837
      B00BMFIXT2
      2014-07-22

Make summaries lowercase and remove punctuations using clean_text function



In [11]:

    
reviews = clean_text(reviews, "summary")

What is the most occurrent summary in all data?



In [12]:

    
reviews["summary"].value_counts().nlargest(10)









    Out[12]:





great game      3749
awesome         1402
great           1302
good game       1259
love it          930
awesome game     796
fun game         766
fun              760
amazing          608
good             604
Name: summary, dtype: int64

What is the most occurrent summary in 2011?



In [13]:

    
reviews[reviews.date.dt.year == 2011]["summary"].value_counts().nlargest(10)









    Out[13]:





great game      316
awesome         102
great            83
good game        82
amazing          67
awesome game     67
fun game         67
love it          63
fun              63
wow              36
Name: summary, dtype: int64

What is the most occurent word in summaries in 2000?



In [14]:

    
all_counter = Counter(reviews[reviews.date.dt.year == 2000]["summary"].str.split(" ").sum())
all_counter.most_common(10)









    Out[14]:





[('the', 413),
 ('game', 403),
 ('a', 356),
 ('great', 201),
 ('of', 173),
 ('best', 151),
 ('for', 140),
 ('good', 140),
 ('but', 137),
 ('is', 129)]

What is the most occurent word in reviewTexts in 2000?



In [15]:

    
counter = Counter(reviews[reviews.date.dt.year == 2000]["reviewText"].str.split(" ").sum())
counter.most_common(10)









    Out[15]:





[(u'', 18114),
 (u'the', 17314),
 (u'and', 9467),
 (u'a', 9045),
 (u'to', 8865),
 (u'of', 7549),
 (u'is', 6833),
 (u'you', 6007),
 (u'game', 4659),
 (u'I', 4492)]

What is the most occurent word in reviewTexts before 2000?



In [16]:

    
counter = Counter(reviews[reviews.date.dt.year < 2000]["reviewText"].str.split(" ").sum())
counter.most_common(10)









    Out[16]:





[(u'', 561),
 (u'the', 380),
 (u'and', 220),
 (u'a', 211),
 (u'to', 193),
 (u'is', 156),
 (u'of', 149),
 (u'you', 133),
 (u'I', 104),
 (u'game', 97)]

Filter out reviews if there are older than 5 years



In [17]:

    
reviews = reviews[reviews.date > datetime.datetime.now() - datetime.timedelta(days=5*365)]

Join reviews and ratings dataframe on productID and userID



In [18]:

    
review_ratings = pd.merge(reviews,ratings, on=["productID","userID"])

Create a column named binary_ratings and for each row fill POSITIVE if ratings is bigger than 3 else fill NEGATIVE



In [19]:

    
review_ratings["binary_ratings"] = review_ratings["ratings"].apply(lambda x: "POSITIVE" if x > 3 else "NEGATIVE")

Find the most occurrent words in POSITIVE and NEGATIVE ratings



In [20]:

    
counters = review_ratings.groupby("binary_ratings").apply(lambda x: Counter(x.summary.str.split(" ").sum()))

Calculate the ratios of counter values



In [21]:

    
total_counts = counters.sum()
ratios = counters.apply(calc_ratios, total_counts=total_counts)



In [22]:

    
ratios









    Out[22]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
    
    
      binary_ratings
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      NEGATIVE
      (unplayable, 3.98898404656)
      (garbage, 3.62434093298)
      (ugh, 2.99573227355)
      (waste, 2.98946266054)
      (disappointment, 2.84490938382)
      (trash, 2.83321334406)
      (ehhhh, 2.7080502011)
      (underwhelming, 2.7080502011)
      (worst, 2.69968195143)
      (disappointing, 2.64552984412)
    
    
      POSITIVE
      (excelente, 4.45434729625)
      (she, 4.45434729625)
      (loving, 4.31748811354)
      (five, 4.14906946191)
      (four, 4.04305126783)
      (classics, 3.93182563272)
      (rocks, 3.86072971104)
      (excellent, 3.84275537092)
      (loves, 3.84160054113)
      (outstanding, 3.79548918917)

Run network with review_ratings dataframe and fields



In [23]:

    
mlp = run_network(review_ratings, "summary", "binary_ratings")









    



Progress:97.9% Speed(reviews/sec):5815. #Correct:73750 #Trained:87501 Training Accuracy:84.2%

Predict whether the "good" word is positive or negative



In [24]:

    
mlp.predict("good")









    Out[24]:





'POSITIVE'

Predict whether the "Bad" word is positive or negative



In [25]:

    
mlp.predict("Bad")









    Out[25]:





'NEGATIVE'

Predict "unplayable"



In [26]:

    
mlp.predict("unplayable")









    Out[26]:





'NEGATIVE'

Predict "excelente"



In [27]:

    
mlp.predict("excelente")









    Out[27]:





'POSITIVE'

Predict "excel"



In [28]:

    
mlp.predict("excel")









    Out[28]:





'POSITIVE'

Predict "playable"



In [29]:

    
mlp.predict("playable")









    Out[29]:





'NEGATIVE'



In [ ]:



In [ ]:

	userID	productID	userName	reviewText	summary	unixReviewTime	reviewTime
0	A2HD75EMZR8QLN	0700099867	123	Installing the game was a struggle (because of...	Pay to unlock content? I don't think so.	1341792000	07 9, 2012
1	A3UR8NLLY1ZHCX	0700099867	Alejandro Henao "Electronic Junky"	If you like rally cars get this game you will ...	Good rally game	1372550400	06 30, 2013
2	A1INA0F5CWW3J4	0700099867	Amazon Shopper "Mr.Repsol"	1st shipment received a book instead of the ga...	Wrong key	1403913600	06 28, 2014
3	A1DLMTOTHQ4AST	0700099867	ampgreen	I got this version instead of the PS3 version,...	awesome game, if it did not crash frequently !!	1315958400	09 14, 2011
4	A361M14PU2GUEG	0700099867	Angry Ryan "Ryan A. Forrest"	I had Dirt 2 on Xbox 360 and it was an okay ga...	DIRT 3	1308009600	06 14, 2011

	userID	productID	ratings	timestamp
0	A24SSUT5CSW8BH	0078764343	5.0	1377302400
1	AK3V0HEBJMQ7J	0078764343	4.0	1372896000
2	A10BECPH7W8HM7	043933702X	5.0	1404950400
3	A2PRV9OULX1TWP	043933702X	5.0	1386115200
4	AE7GUHCDQQ4UI	043933702X	1.0	1366156800

	0	1	2	3	4	5	6	7	8	9
binary_ratings
NEGATIVE	(unplayable, 3.98898404656)	(garbage, 3.62434093298)	(ugh, 2.99573227355)	(waste, 2.98946266054)	(disappointment, 2.84490938382)	(trash, 2.83321334406)	(ehhhh, 2.7080502011)	(underwhelming, 2.7080502011)	(worst, 2.69968195143)	(disappointing, 2.64552984412)
POSITIVE	(excelente, 4.45434729625)	(she, 4.45434729625)	(loving, 4.31748811354)	(five, 4.14906946191)	(four, 4.04305126783)	(classics, 3.93182563272)	(rocks, 3.86072971104)	(excellent, 3.84275537092)	(loves, 3.84160054113)	(outstanding, 3.79548918917)