In [1]:
%matplotlib inline
import pandas as pd
In [2]:
from IPython.display import HTML

# Read both stylesheets inside a context manager so the file handles are closed
# as soon as the text has been read, instead of being left open until garbage
# collection.  The concatenated CSS is then injected into the notebook page.
with open('table.css') as table_css, open('notebook.css') as notebook_css:
    css = table_css.read() + notebook_css.read()
HTML('<style>{}</style>'.format(css))
Out[2]:
In [3]:
# Load the ratings table from the working directory and preview its first rows.
ratings = pd.read_csv(filepath_or_buffer="ratings.csv", encoding="utf-8")
ratings.iloc[:5]
Out[3]:
In [4]:
# Total number of rows in the ratings table.
ratings.shape[0]
Out[4]:
In [5]:
# Number of distinct product identifiers (missing values are not counted).
ratings.loc[:, "productID"].nunique(dropna=True)
Out[5]:
In [6]:
# Rows whose (userID, productID) pair already appeared earlier in the table.
ratings.loc[ratings.duplicated(subset=["userID", "productID"])]
Out[6]:
In [7]:
# Number of distinct user identifiers (missing values are not counted).
ratings.loc[:, "userID"].nunique(dropna=True)
Out[7]:
In [8]:
# Five rows with the smallest timestamps (ascending sort is the default).
ratings.sort_values("timestamp").iloc[:5]
Out[8]:
In [9]:
# Five rows with the largest timestamps.
ratings.sort_values("timestamp", ascending=False).iloc[:5]
Out[9]:
In [10]:
# Replace the timestamp column with datetimes, interpreting the stored values as
# seconds since the Unix epoch.  Later cells rely on the `.dt` accessor.
ratings["timestamp"] = ratings["timestamp"].pipe(pd.to_datetime, unit="s")
In [11]:
# Rating counts per calendar year, largest counts first (top five).
(
    ratings["timestamp"]
    .dt.year
    .value_counts()
    .sort_values(ascending=False)
    .iloc[:5]
)
Out[11]:
In [12]:
# Rating counts per month name, largest counts first.
(
    ratings["timestamp"]
    .dt.strftime("%B")
    .value_counts()
    .sort_values(ascending=False)
)
Out[12]:
In [13]:
# Rating counts per weekday name, largest counts first.
(
    ratings["timestamp"]
    .dt.strftime("%A")
    .value_counts()
    .sort_values(ascending=False)
)
Out[13]:
In [14]:
# Mean of the `ratings` column over the whole table.
ratings["ratings"].mean()
Out[14]:
In [15]:
# Mean rating per user, shown as a one-column frame (first five users).
ratings.groupby("userID")[["ratings"]].mean().iloc[:5]
Out[15]:
In [16]:
# Mean rating per product, shown as a one-column frame (first five products).
ratings.groupby("productID")[["ratings"]].mean().iloc[:5]
Out[16]:
In [17]:
# The five users with the highest mean rating.
(
    ratings
    .groupby("userID")[["ratings"]]
    .mean()
    .nlargest(n=5, columns="ratings")
)
Out[17]:
In [18]:
# The five products with the highest mean rating.
(
    ratings
    .groupby("productID")[["ratings"]]
    .mean()
    .nlargest(n=5, columns="ratings")
)
Out[18]:
In [19]:
# The five users with the lowest mean rating.
(
    ratings
    .groupby("userID")[["ratings"]]
    .mean()
    .nsmallest(n=5, columns="ratings")
)
Out[19]:
In [20]:
# The five products with the lowest mean rating.
(
    ratings
    .groupby("productID")[["ratings"]]
    .mean()
    .nsmallest(n=5, columns="ratings")
)
Out[20]:
In [21]:
# For every user, test whether the number of distinct timestamps on that user's
# ratings equals the number of distinct timestamps in the whole table.
# The previous version compared the bound methods `.unique` and `.nunique`
# without calling them, so it always evaluated to False regardless of the data.
# NOTE(review): the intended question is inferred from the expression -- confirm.
ratings.groupby(by="userID").timestamp.nunique() == ratings["timestamp"].nunique()
Out[21]:
In [22]:
# For every product, test whether the number of distinct timestamps on that
# product's ratings equals the number of distinct timestamps in the whole table.
# The previous version compared two uncalled `.unique` methods, so it always
# evaluated to False regardless of the data.
# NOTE(review): the intended question is inferred from the expression -- confirm.
ratings.groupby(by="productID").timestamp.nunique() == ratings["timestamp"].nunique()
Out[22]:
In [23]:
# Mean rating per user as a Series (first five users).
ratings.groupby("userID")["ratings"].mean().iloc[:5]
Out[23]:
In [24]:
# Mean rating per product as a Series (first five products).
ratings.groupby("productID")["ratings"].mean().iloc[:5]
Out[24]:
In [25]:
# Number of ratings whose year is strictly earlier than 2000
# (ratings made during 2000 itself are not included).
int((ratings["timestamp"].dt.year < 2000).sum())
Out[25]:
In [26]:
# Number of ratings whose year is strictly later than 2000
# (ratings made during 2000 itself are not included).
int((ratings["timestamp"].dt.year > 2000).sum())
Out[26]:
In [27]:
# Number of ratings made in the years 2001 through 2013: the year must be
# strictly greater than 2000 and strictly less than 2014.
int(ratings["timestamp"].dt.year.between(2001, 2013).sum())
Out[27]:
In [28]:
# Rating counts per calendar year, largest counts first (top five).
(
    ratings.timestamp
    .dt.year
    .value_counts()
    .sort_values(ascending=False)
    .iloc[:5]
)
Out[28]:
In [29]:
# Column means of (timestamp, ratings) grouped by the year of each rating,
# ordered by year (first five years).
(
    ratings.loc[:, ["timestamp", "ratings"]]
    .groupby(ratings["timestamp"].dt.year)
    .mean()
    .sort_index()
    .iloc[:5]
)
Out[29]:
In [30]:
# Number of ratings submitted by one specific user.
int(ratings["userID"].eq("A00010181745VTMHSO9TO").sum())
Out[30]:
In [31]:
# Number of ratings received by one specific product.
int(ratings["productID"].eq("0439339987").sum())
Out[31]:
In [32]:
from datetime import datetime

# Count the ratings made on 24 August 2013.
# The timestamps are truncated to midnight before the comparison, so a rating
# that carries a time-of-day component is still matched by its calendar day.
# The previous exact-equality test only matched rows stamped exactly 00:00:00.
date = datetime.strptime('24.08.2013', "%d.%m.%Y")
len(ratings[ratings["timestamp"].dt.normalize() == date])
Out[32]:
In [33]:
# Bar chart of how often each rating value occurs.
# The `sort_columns` keyword was deprecated in pandas 1.5 and removed in 2.0, so
# passing it makes this call fail on current pandas.  It is documented as sorting
# DataFrame column names, so dropping it should leave the bar order unchanged.
ratings["ratings"].value_counts().plot(kind="bar", title="Counts Of Ratings")
Out[33]:
In [34]:
# Bar chart of each rating value's share of all rows (count divided by the
# total number of rows, i.e. a fraction between 0 and 1).
# The `sort_columns` keyword was deprecated in pandas 1.5 and removed in 2.0, so
# passing it makes this call fail on current pandas; it has been dropped.
(
    ratings["ratings"].value_counts() / float(len(ratings["ratings"]))
).plot(kind="bar", title="Percentage Of Ratings")
Out[34]:
In [35]:
# Fraction of all rows whose rating is exactly 3.0.
int(ratings["ratings"].eq(3.0).sum()) / float(ratings.shape[0])
Out[35]:
In [36]:
# The rating value that occurs most often, together with its count.
ratings.ratings.value_counts().nlargest(n=1)
Out[36]:
In [37]:
# The rating value that occurs least often, together with its count.
ratings.ratings.value_counts().nsmallest(n=1)
Out[37]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: