In [1]:
%matplotlib inline
import datetime
from collections import Counter
import pandas as pd
from sentiment_classification import run_network
from utils import clean_text
from utils import calc_ratios
In [2]:
from IPython.display import HTML

# Inject the project's stylesheets into the notebook page.
# Each file is read inside a `with` block so its handle is closed right away
# instead of being left open until garbage collection.
style_chunks = []
for css_path in ('table.css', 'notebook.css'):
    with open(css_path) as css_file:
        style_chunks.append(css_file.read())
css = ''.join(style_chunks)
HTML('<style>{}</style>'.format(css))
Out[2]:
In [3]:
# Load the review records from the UTF-8 encoded CSV and preview the first five rows.
reviews = pd.read_csv("reviews.csv", encoding="utf-8")
reviews.head()
Out[3]:
In [4]:
# Load the rating records from the UTF-8 encoded CSV and preview the first five rows.
ratings = pd.read_csv("ratings.csv", encoding="utf-8")
ratings.head()
Out[4]:
In [5]:
# Number of review rows loaded.
reviews.shape[0]
Out[5]:
In [6]:
# Name of the user who wrote the most reviews: find the most frequent userID,
# then show the userName from the first of that user's rows.
top_reviewer_id = reviews["userID"].value_counts().idxmax()
reviews.loc[reviews["userID"] == top_reviewer_id, "userName"].head(1)
Out[6]:
In [7]:
# Convert the epoch-seconds column into pandas datetimes and store it as "date".
review_timestamps = pd.to_datetime(reviews["unixReviewTime"], unit="s")
reviews["date"] = review_timestamps
In [8]:
# Text and date of the earliest review (smallest unixReviewTime).
oldest_first = reviews.sort_values("unixReviewTime")
oldest_first[["reviewText", "date"]].head(1)
Out[8]:
In [9]:
# Author and date of the earliest review (smallest unixReviewTime).
oldest_first = reviews.sort_values("unixReviewTime")
oldest_first[["userName", "date"]].head(1)
Out[9]:
In [10]:
# Product and date of the most recent review (largest unixReviewTime).
newest_first = reviews.sort_values("unixReviewTime", ascending=False)
newest_first[["productID", "date"]].head(1)
Out[10]:
In [11]:
# Rebind `reviews` to whatever the project helper clean_text returns for the
# "summary" column.
# NOTE(review): clean_text comes from utils and is not visible here — presumably it
# normalises the summary text; confirm exactly what it changes and whether it
# returns a new frame or mutates the one passed in.
reviews = clean_text(reviews, "summary")
In [12]:
# Ten most frequent summary strings across all reviews.
summary_frequencies = reviews["summary"].value_counts()
summary_frequencies.nlargest(10)
Out[12]:
In [13]:
# Ten most frequent summary strings among reviews dated 2011.
written_in_2011 = reviews["date"].dt.year == 2011
reviews.loc[written_in_2011, "summary"].value_counts().nlargest(10)
Out[13]:
In [14]:
# Ten most frequent words in the summaries of reviews dated 2000.
# Words are counted in one pass rather than by concatenating the per-review
# lists with .sum(): that concatenation re-copies the growing list for every
# row, and on an empty selection it yields the integer 0, which Counter cannot
# iterate. Rows whose split result is missing (no usable summary text) are
# dropped so they cannot break the iteration.
summaries_2000 = reviews.loc[reviews["date"].dt.year == 2000, "summary"]
all_counter = Counter(
    word
    for words in summaries_2000.str.split(" ").dropna()
    for word in words
)
all_counter.most_common(10)
Out[14]:
In [15]:
# Ten most frequent words in the full text of reviews dated 2000.
# Words are counted in one pass rather than by concatenating the per-review
# lists with .sum(): that concatenation re-copies the growing list for every
# row, and on an empty selection it yields the integer 0, which Counter cannot
# iterate. Rows whose split result is missing (no usable review text) are
# dropped so they cannot break the iteration.
# NOTE(review): only "summary" is passed to clean_text in this notebook —
# confirm whether reviewText should get the same treatment before counting.
review_texts_2000 = reviews.loc[reviews["date"].dt.year == 2000, "reviewText"]
counter = Counter(
    word
    for words in review_texts_2000.str.split(" ").dropna()
    for word in words
)
counter.most_common(10)
Out[15]:
In [16]:
# Ten most frequent words in the full text of reviews dated before 2000.
# Words are counted in one pass rather than by concatenating the per-review
# lists with .sum(): that concatenation re-copies the growing list for every
# row, and on an empty selection it yields the integer 0, which Counter cannot
# iterate. Rows whose split result is missing (no usable review text) are
# dropped so they cannot break the iteration.
# NOTE(review): only "summary" is passed to clean_text in this notebook —
# confirm whether reviewText should get the same treatment before counting.
review_texts_pre_2000 = reviews.loc[reviews["date"].dt.year < 2000, "reviewText"]
counter = Counter(
    word
    for words in review_texts_pre_2000.str.split(" ").dropna()
    for word in words
)
counter.most_common(10)
Out[16]:
In [17]:
# Keep only the reviews from the five years leading up to the newest review.
# The window is anchored to the newest date in the data instead of the wall
# clock, so the same rows are selected whenever the notebook is re-run; a
# cutoff based on "now" drifts with the run date and eventually selects nothing
# from a static dataset. DateOffset(years=5) also steps back exact calendar
# years, unlike a fixed 5 * 365 days, which ignores leap days.
# NOTE(review): this changes which rows are kept compared with a "now"-based
# cutoff — confirm the data-relative window is the intended one.
RECENT_WINDOW = pd.DateOffset(years=5)
recent_cutoff = reviews["date"].max() - RECENT_WINDOW
reviews = reviews[reviews["date"] > recent_cutoff]
In [18]:
# Pair every review with its rating: inner join on the (productID, userID) key.
review_ratings = reviews.merge(ratings, how="inner", on=["productID", "userID"])
In [19]:
# Collapse the numeric rating into two labels: strictly above 3 is "POSITIVE",
# everything else is "NEGATIVE".
is_positive = review_ratings["ratings"].gt(3)
review_ratings["binary_ratings"] = is_positive.map({True: "POSITIVE", False: "NEGATIVE"})
In [20]:
# One word Counter per binary_ratings label, built from the summary text.
# Words are counted in one pass rather than by concatenating the per-review
# lists with .sum(), which re-copies the growing list for every row. Rows whose
# split result is missing (no usable summary text) are dropped so they cannot
# break the iteration.
counters = review_ratings.groupby("binary_ratings").apply(
    lambda group: Counter(
        word
        for words in group["summary"].str.split(" ").dropna()
        for word in words
    )
)
In [21]:
# Combine the per-label counters into overall totals, then hand each label's
# counter to calc_ratios together with those totals.
total_counts = counters.sum()
ratios = counters.apply(
    lambda label_counter: calc_ratios(label_counter, total_counts=total_counts)
)
In [22]:
# Display the calc_ratios result for each binary_ratings label.
ratios
Out[22]:
In [23]:
# Build the model by calling the project helper run_network on the merged frame,
# passing "summary" and "binary_ratings" as column names.
# NOTE(review): run_network lives in sentiment_classification and is not visible
# here — presumably the first column is the input text, the second the label, and
# the return value is a trained classifier; confirm, and check whether it needs a
# fixed random seed for reproducible results.
mlp = run_network(review_ratings, "summary", "binary_ratings")
In [24]:
# Probe the model with the single word "good".
mlp.predict("good")
Out[24]:
In [25]:
# Probe the model with the capitalised word "Bad".
# NOTE(review): whether predict normalises case is not visible here — confirm the
# capital letter is handled the same way as the training text.
mlp.predict("Bad")
Out[25]:
In [26]:
# Probe the model with the single word "unplayable".
mlp.predict("unplayable")
Out[26]:
In [27]:
# Probe the model with "excelente".
# NOTE(review): presumably a non-English or misspelled variant of "excellent",
# used to see how the model treats unusual words — confirm intent.
mlp.predict("excelente")
Out[27]:
In [28]:
# Probe the model with the single word "excel".
mlp.predict("excel")
Out[28]:
In [29]:
# Probe the model with the single word "playable".
mlp.predict("playable")
Out[29]:
In [ ]:
In [ ]: