In [1]:
%matplotlib inline
import datetime
from collections import Counter

import pandas as pd

from sentiment_classification import run_network
from utils import clean_text
from utils import calc_ratios

In [2]:
from IPython.core.display import HTML
css = open('table.css').read() + open('notebook.css').read()
HTML('<style>{}</style>'.format(css))


Out[2]:

In [3]:
reviews = pd.read_csv("reviews.csv", encoding="utf-8")
reviews.head(5)


Out[3]:
userID productID userName reviewText summary unixReviewTime reviewTime
0 A2HD75EMZR8QLN 0700099867 123 Installing the game was a struggle (because of... Pay to unlock content? I don't think so. 1341792000 07 9, 2012
1 A3UR8NLLY1ZHCX 0700099867 Alejandro Henao "Electronic Junky" If you like rally cars get this game you will ... Good rally game 1372550400 06 30, 2013
2 A1INA0F5CWW3J4 0700099867 Amazon Shopper "Mr.Repsol" 1st shipment received a book instead of the ga... Wrong key 1403913600 06 28, 2014
3 A1DLMTOTHQ4AST 0700099867 ampgreen I got this version instead of the PS3 version,... awesome game, if it did not crash frequently !! 1315958400 09 14, 2011
4 A361M14PU2GUEG 0700099867 Angry Ryan "Ryan A. Forrest" I had Dirt 2 on Xbox 360 and it was an okay ga... DIRT 3 1308009600 06 14, 2011

In [4]:
ratings = pd.read_csv("ratings.csv", encoding="utf-8")
ratings.head(5)


Out[4]:
userID productID ratings timestamp
0 A24SSUT5CSW8BH 0078764343 5.0 1377302400
1 AK3V0HEBJMQ7J 0078764343 4.0 1372896000
2 A10BECPH7W8HM7 043933702X 5.0 1404950400
3 A2PRV9OULX1TWP 043933702X 5.0 1386115200
4 AE7GUHCDQQ4UI 043933702X 1.0 1366156800

How many reviews are there?


In [5]:
len(reviews)


Out[5]:
231780

Who has the most review data?


In [6]:
reviews[reviews["userID"] == reviews["userID"].value_counts().idxmax()].userName.head(1)


Out[6]:
1400    Lisa Shea "be the change you wish to see in t...
Name: userName, dtype: object

Convert unixReviewTime field to date field and add it dataframe with column name date


In [7]:
reviews["date"] = pd.to_datetime(reviews["unixReviewTime"], unit='s')

What is the very first reviewText?


In [8]:
reviews.sort_values(by="unixReviewTime", ascending=True).head(1)[["reviewText","date"]]


Out[8]:
reviewText date
4410 I'm having the most fun I've ever had on PlayS... 1999-10-14

What is the name of person that made first review?


In [9]:
reviews.sort_values(by="unixReviewTime", ascending=True).head(1)[["userName","date"]]


Out[9]:
userName date
4410 "kwobooks" 1999-10-14

What is the id of video games that is the last reviewed?


In [10]:
reviews.sort_values(by="unixReviewTime", ascending=False).head(1)[["productID","date"]]


Out[10]:
productID date
218837 B00BMFIXT2 2014-07-22

Make summaries lowercase and remove punctuations using clean_text function


In [11]:
reviews = clean_text(reviews, "summary")

What is the most occurrent summary in all data?


In [12]:
reviews["summary"].value_counts().nlargest(10)


Out[12]:
great game      3749
awesome         1402
great           1302
good game       1259
love it          930
awesome game     796
fun game         766
fun              760
amazing          608
good             604
Name: summary, dtype: int64

What is the most occurrent summary in 2011?


In [13]:
reviews[reviews.date.dt.year == 2011]["summary"].value_counts().nlargest(10)


Out[13]:
great game      316
awesome         102
great            83
good game        82
amazing          67
awesome game     67
fun game         67
love it          63
fun              63
wow              36
Name: summary, dtype: int64

What is the most occurent word in summaries in 2000?


In [14]:
all_counter = Counter(reviews[reviews.date.dt.year == 2000]["summary"].str.split(" ").sum())
all_counter.most_common(10)


Out[14]:
[('the', 413),
 ('game', 403),
 ('a', 356),
 ('great', 201),
 ('of', 173),
 ('best', 151),
 ('for', 140),
 ('good', 140),
 ('but', 137),
 ('is', 129)]

What is the most occurent word in reviewTexts in 2000?


In [15]:
counter = Counter(reviews[reviews.date.dt.year == 2000]["reviewText"].str.split(" ").sum())
counter.most_common(10)


Out[15]:
[(u'', 18114),
 (u'the', 17314),
 (u'and', 9467),
 (u'a', 9045),
 (u'to', 8865),
 (u'of', 7549),
 (u'is', 6833),
 (u'you', 6007),
 (u'game', 4659),
 (u'I', 4492)]

What is the most occurent word in reviewTexts before 2000?


In [16]:
counter = Counter(reviews[reviews.date.dt.year < 2000]["reviewText"].str.split(" ").sum())
counter.most_common(10)


Out[16]:
[(u'', 561),
 (u'the', 380),
 (u'and', 220),
 (u'a', 211),
 (u'to', 193),
 (u'is', 156),
 (u'of', 149),
 (u'you', 133),
 (u'I', 104),
 (u'game', 97)]

Filter out reviews if there are older than 5 years


In [17]:
reviews = reviews[reviews.date > datetime.datetime.now() - datetime.timedelta(days=5*365)]

Join reviews and ratings dataframe on productID and userID


In [18]:
review_ratings = pd.merge(reviews,ratings, on=["productID","userID"])

Create a column named binary_ratings and for each row fill POSITIVE if ratings is bigger than 3 else fill NEGATIVE


In [19]:
review_ratings["binary_ratings"] = review_ratings["ratings"].apply(lambda x: "POSITIVE" if x > 3 else "NEGATIVE")

Find the most occurrent words in POSITIVE and NEGATIVE ratings


In [20]:
counters = review_ratings.groupby("binary_ratings").apply(lambda x: Counter(x.summary.str.split(" ").sum()))

Calculate the ratios of counter values


In [21]:
total_counts = counters.sum()
ratios = counters.apply(calc_ratios, total_counts=total_counts)

In [22]:
ratios


Out[22]:
0 1 2 3 4 5 6 7 8 9
binary_ratings
NEGATIVE (unplayable, 3.98898404656) (garbage, 3.62434093298) (ugh, 2.99573227355) (waste, 2.98946266054) (disappointment, 2.84490938382) (trash, 2.83321334406) (ehhhh, 2.7080502011) (underwhelming, 2.7080502011) (worst, 2.69968195143) (disappointing, 2.64552984412)
POSITIVE (excelente, 4.45434729625) (she, 4.45434729625) (loving, 4.31748811354) (five, 4.14906946191) (four, 4.04305126783) (classics, 3.93182563272) (rocks, 3.86072971104) (excellent, 3.84275537092) (loves, 3.84160054113) (outstanding, 3.79548918917)

Run network with review_ratings dataframe and fields


In [23]:
mlp = run_network(review_ratings, "summary", "binary_ratings")


Progress:97.9% Speed(reviews/sec):5815. #Correct:73750 #Trained:87501 Training Accuracy:84.2%

Predict whether the "good" word is positive or negative


In [24]:
mlp.predict("good")


Out[24]:
'POSITIVE'

Predict whether the "Bad" word is positive or negative


In [25]:
mlp.predict("Bad")


Out[25]:
'NEGATIVE'

Predict "unplayable"


In [26]:
mlp.predict("unplayable")


Out[26]:
'NEGATIVE'

Predict "excelente"


In [27]:
mlp.predict("excelente")


Out[27]:
'POSITIVE'

Predict "excel"


In [28]:
mlp.predict("excel")


Out[28]:
'POSITIVE'

Predict "playable"


In [29]:
mlp.predict("playable")


Out[29]:
'NEGATIVE'

In [ ]:


In [ ]: