In [3]:
import pandas as pd

# TODO HSC: specify the dtypes at loading
df = pd.read_csv("../input/input.csv", na_values=["-"], encoding = "ISO-8859-1")

df.head()


/opt/conda/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2705: DtypeWarning: Columns (5) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Out[3]:
Action_id TypeAction Page_name Feed_message User_name CommentsForItem_message Sentiment Action_created_time_qvdate
0 Comment/1364017066952510_1364097753611108 Comment Resident Evil [UK ONLY] To celebrate the release of Resident... Ronald Briand Muy buena :) (Y) 0.9880 1/5/2017
1 Comment/10154797365350629_10154797373500629 Comment Bandai Namco We continue our Tales Of look back with Tales ... Jon Imboden I may be in the minority here, but I LOVE the ... 0.9540 12/27/2016
2 Comment/1365217000165850_1365754593445424 Comment Resident Evil Are you afraid of the dark? Because you should... Claudia Alejandra Olguín mira Roberto Zeballos, para probar en tu PEQU... 0.9527 1/7/2017
3 Comment/1364017066952510_1364061950281355 Comment Resident Evil [UK ONLY] To celebrate the release of Resident... David Lawrence Sarah Amadeus :O 0.9402 1/5/2017
4 Comment/1365217000165850_1225502404193085 Comment Resident Evil Are you afraid of the dark? Because you should... Leandro Guilherme Aeee Paulo Lucas vou te dar de presente no seu... 0.9382 1/8/2017

In [4]:
print(df.dtypes)
print(df.describe())


Action_id                      object
TypeAction                     object
Page_name                      object
Feed_message                   object
User_name                      object
CommentsForItem_message        object
Sentiment                     float64
Action_created_time_qvdate     object
dtype: object
         Sentiment
count  3222.000000
mean     -0.032501
std       0.396417
min      -0.971000
25%            NaN
50%            NaN
75%            NaN
max       0.988000
/opt/conda/lib/python3.5/site-packages/numpy/lib/function_base.py:3403: RuntimeWarning: Invalid value encountered in median
  RuntimeWarning)

In [5]:
comments = df[df.TypeAction == "Comment"]
likes = df[df.TypeAction == "Like"]

#print(comments.head())
#print(likes.head())

print(len(df) - len(comments) - len(likes))


4

In [6]:
print(likes.User_name.unique())
print(likes.CommentsForItem_message.unique())
print(likes.Sentiment.unique())


[nan 'Aaron Turness' 'Marcelo J. Biott' ..., 'Michael Tong'
 'JadieKit Louise Brown' 'Mauro Mazzariol']
[nan]
[ nan]

In [7]:
# let's drop the commentsForItem and Sentiment columns, as they are empty for the likes...
likes.drop(['CommentsForItem_message', 'Sentiment'], 1,inplace=True)

# also drop the columns with no information
likes.drop(['TypeAction', 'Action_id'], 1,inplace=True)

likes.describe()


/opt/conda/lib/python3.5/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
/opt/conda/lib/python3.5/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[7]:
Page_name Feed_message User_name Action_created_time_qvdate
count 56033 50987 58059 58089
unique 11 103 56315 7
top Resident Evil Are you afraid of the dark? Because you should... ??? 1/4/2017
freq 28556 12177 312 25313

In [12]:
print(comments.Sentiment.unique())

comments.describe()


[ 0.988   0.954   0.9527 ..., -0.9493 -0.971      nan]
/opt/conda/lib/python3.5/site-packages/numpy/lib/function_base.py:3403: RuntimeWarning: Invalid value encountered in median
  RuntimeWarning)
Out[12]:
Sentiment
count 3222.000000
mean -0.032501
std 0.396417
min -0.971000
25% NaN
50% NaN
75% NaN
max 0.988000

In [9]:
comments.to_csv("../output/comments.csv")
likes.to_csv("../output/likes.csv")