In [3]:
import pandas as pd
# TODO HSC: specify the dtypes at loading
# Read the raw export. A bare "-" cell is parsed as missing (NaN) and the file
# is decoded as ISO-8859-1 rather than the default UTF-8.
df = pd.read_csv(
    "../input/input.csv",
    encoding="ISO-8859-1",
    na_values=["-"],
)
# Show the first rows as the cell's rich output.
df.head()
Out[3]:
In [4]:
# Plain-text overview of the raw frame: the inferred dtype of every column,
# followed by the summary statistics from describe().
print(df.dtypes, df.describe(), sep="\n")
In [5]:
# Split the raw frame by action type. The explicit .copy() makes each subset an
# independent DataFrame, so later in-place edits of `comments` or `likes`
# (e.g. dropping columns) do not raise SettingWithCopyWarning.
comments = df[df.TypeAction == "Comment"].copy()
likes = df[df.TypeAction == "Like"].copy()
# Sanity check: number of rows whose TypeAction is neither "Comment" nor "Like".
print(len(df) - len(comments) - len(likes))
In [6]:
# Print the distinct values of three columns of the likes subset, one array
# per line, to see which of them actually carry information.
print(
    *(
        likes[column].unique()
        for column in ("User_name", "CommentsForItem_message", "Sentiment")
    ),
    sep="\n",
)
In [7]:
# Drop the columns that are not useful for likes:
#   - CommentsForItem_message and Sentiment, which the original author found to
#     be empty for likes;
#   - TypeAction, which is constant ("Like") after the filter, and Action_id,
#     which the original author judged to carry no information.
# `columns=` is used because pandas >= 2.0 no longer accepts `axis` as a
# positional argument (the old `drop(labels, 1, ...)` form raises TypeError).
# The result is rebound instead of using inplace=True, which avoids
# SettingWithCopyWarning on the filtered frame.
likes = likes.drop(
    columns=['CommentsForItem_message', 'Sentiment', 'TypeAction', 'Action_id']
)
# Summary statistics of the remaining columns as the cell output.
likes.describe()
Out[7]:
In [12]:
# Distinct sentiment labels found on comment rows.
print(comments["Sentiment"].unique())
# Summary statistics of the comments subset as the cell output.
comments.describe()
Out[12]:
In [9]:
# Persist both subsets as CSV files under ../output. With the default to_csv
# settings the DataFrame index is written as the first column.
comments.to_csv(path_or_buf="../output/comments.csv")
likes.to_csv(path_or_buf="../output/likes.csv")