In [5]:
import pandas as pd
import datetime
import numpy as np
import scipy as sp
import os
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
# Global plotting defaults: larger text and a wide default canvas for all figures.
font = dict(size=18)
matplotlib.rc('font', **font)
matplotlib.rc('figure', figsize=(12.0, 6.0))
# Disabled working-directory switch kept for reference; the relative "../data/..."
# paths in the loading cells resolve against the current working directory.
#os.chdir("/root/Envs/btc-analysis/btc-price-analysis")
# strptime layout used to parse the timestamps of the CSV files.
time_format = "%Y-%m-%dT%H:%M:%SZ"
In [8]:
# Load the scored NYT bitcoin headlines, indexed by their 'time' column.
# The timestamps are parsed with a single vectorised pd.to_datetime call rather
# than read_csv's `date_parser` hook: that hook is deprecated since pandas 2.0
# and invoked strptime once per row. Parsing the index directly also removes
# the positional parse_dates=[0], which silently relied on 'time' being the
# first column of the file.
score_data = pd.read_csv("../data/nyt_bitcoin_with_score.csv", index_col='time')
# Same strict layout as before: a malformed timestamp string still raises.
score_data.index = pd.to_datetime(score_data.index, format=time_format)
score_data.head()
Out[8]:
In [9]:
# Distinct values found in the `sentiment` column.
score_data["sentiment"].unique()
Out[9]:
In [10]:
# Bar chart of how many rows fall under each `sentiment` value
# (rot=0 keeps the x tick labels horizontal).
(
    score_data
    .groupby("sentiment")["sentiment"]
    .count()
    .plot(kind='bar', rot=0)
)
Out[10]:
The ratings are overwhelmingly negative! Is this specific to bitcoin news? To double-check, run the same analysis on news whose headlines include "internet".
In [10]:
# Load the scored NYT "internet" headlines, indexed by their 'time' column.
# Timestamps are parsed with one vectorised pd.to_datetime call instead of
# read_csv's `date_parser` hook (deprecated since pandas 2.0, and it called
# strptime once per row). Parsing the index directly also drops the positional
# parse_dates=[0], which relied on 'time' being the first column of the file.
internet_news = pd.read_csv("../data/nyt_internet_with_score.csv", index_col='time')
# Same strict layout as before: a malformed timestamp string still raises.
internet_news.index = pd.to_datetime(internet_news.index, format=time_format)
internet_news.head()
Out[10]:
In [12]:
# Number of rows per `sentiment` value in the "internet" headlines.
internet_news.groupby("sentiment")["sentiment"].count()
Out[12]:
So it seems that most news is classified as negative by the Stanford classifier, whatever the topic. How about other classifiers?
In [8]:
# Load the indico-scored NYT bitcoin headlines, indexed by their 'time' column.
# Timestamps are parsed with one vectorised pd.to_datetime call instead of
# read_csv's `date_parser` hook (deprecated since pandas 2.0, and it called
# strptime once per row). Parsing the index directly also drops the positional
# parse_dates=[0], which relied on 'time' being the first column of the file.
indico_news = pd.read_csv("../data/indico_nyt_bitcoin.csv", index_col='time')
# Same strict layout as before: a malformed timestamp string still raises.
indico_news.index = pd.to_datetime(indico_news.index, format=time_format)
indico_news.head()
Out[8]:
In [9]:
# Summary statistics of the `indico_score` column.
indico_news["indico_score"].describe()
Out[9]:
Distribution
In [6]:
# Histogram of the `indico_score` column (default binning).
indico_news["indico_score"].plot(kind='hist')
Out[6]:
The distribution of the indico score looks roughly normal, which is more plausible than the heavily negative Stanford distribution. So maybe we should try using the indico score instead?
In [11]:
# Weekly mean of every numeric column, drawn as a line chart.
# The `how=` keyword was removed from resample(); the aggregation is now a
# method call on the resampler. Numeric columns are selected up front because
# the old how='mean' silently skipped non-numeric columns, whereas .mean() on
# recent pandas raises on them. 'W' is the canonical spelling of the weekly
# alias previously written as 'w'.
indico_news.select_dtypes(include=['number']).resample('W').mean().plot()
Out[11]:
Let's try again with news about "internet".
In [12]:
# Load the indico-scored NYT "internet" headlines, indexed by their 'time' column.
# NOTE(review): this rebinds `indico_news`, replacing the bitcoin frame loaded
# earlier; every later `indico_news` cell therefore refers to the "internet" data.
# Timestamps are parsed with one vectorised pd.to_datetime call instead of
# read_csv's `date_parser` hook (deprecated since pandas 2.0, and it called
# strptime once per row). Parsing the index directly also drops the positional
# parse_dates=[0], which relied on 'time' being the first column of the file.
indico_news = pd.read_csv("../data/indico_nyt_internet.csv", index_col='time')
# Same strict layout as before: a malformed timestamp string still raises.
indico_news.index = pd.to_datetime(indico_news.index, format=time_format)
indico_news.head()
Out[12]:
In [5]:
# Histogram of the `indico_score` column using 20 bins.
indico_news["indico_score"].plot(kind='hist', bins=20)
Out[5]:
Again, the distribution looks roughly normal. I am not sure whether this is a reasonable distribution for sentiment about a topic like this: the topic is not particularly neutral, so we should probably expect the distribution to be positively skewed.
This needs to be further studied for validity.
In [13]:
# Weekly mean of `indico_score`, drawn as a line chart.
# The `how=` keyword was removed from resample(); the aggregation is now a
# method call on the resampler. 'W' is the canonical spelling of the weekly
# alias previously written as 'w'.
indico_news["indico_score"].resample('W').mean().plot()
Out[13]:
In [6]:
# Summary statistics of the `indico_score` column.
indico_news["indico_score"].describe()
Out[6]:
In [4]:
# Per-week count of non-null entries in every column of the scored bitcoin news.
# The `how=` keyword was removed from resample(); the aggregation is now a
# method call on the resampler. 'W' is the canonical spelling of the weekly
# alias previously written as 'w'. fillna(0) is kept so that a week without
# any rows is reported as zero regardless of the pandas version.
weekly_news_count = score_data.resample('W').count().fillna(0)
weekly_news_count["sentiment"].describe()
Out[4]:
In [5]:
# Line chart of the per-week `sentiment` counts over time.
weekly_news_count["sentiment"].plot()
Out[5]:
In [6]:
# Histogram of the per-week `sentiment` counts.
weekly_news_count["sentiment"].plot(kind='hist')
Out[6]:
In [18]:
# Mean of the numeric score columns per resampling bin; empty bins become 0.
# The `how=` keyword was removed from resample(); the aggregation is now a
# method call on the resampler. Numeric columns are selected up front because
# the old how='mean' silently skipped non-numeric columns, whereas .mean() on
# recent pandas raises on them. 'D' is the canonical spelling of the daily
# alias previously written as 'd'.
# NOTE(review): despite the name `weekly_score`, the bins are daily — confirm
# whether a weekly frequency was intended.
# NOTE(review): fillna(0) makes a bin without news indistinguishable from a bin
# whose mean score is genuinely 0 — TODO confirm 0 cannot occur as a real mean.
weekly_score = score_data.select_dtypes(include=['number']).resample('D').mean().fillna(0)
weekly_score.head()
Out[18]:
In [19]:
# Histogram of the resampled mean `sentimentValue`.
weekly_score["sentimentValue"].plot(kind='hist')
Out[19]:
In [20]:
# Line chart of every column of the resampled scores over time
# ('line' is the default plot kind, spelled out here for clarity).
weekly_score.plot(kind='line')
Out[20]:
There is no bitcoin news for about half of the time span. Therefore we try the keyword "internet" instead.
In [21]:
# Share of resampled bins whose mean sentimentValue is exactly 0, which the
# fillna(0) applied when building `weekly_score` uses as the "no news" marker.
# The ratio is computed from a boolean mask, so it is a plain float and no longer
# depends on float(DataFrame.count()), which only worked for a one-column frame.
# NOTE(review): `weekly_score` is built with a daily resample, so "weeks" in the
# message below may not match the actual bin size — confirm the intended frequency.
no_news_mask = weekly_score["sentimentValue"] == 0
# An empty frame yields NaN instead of a ZeroDivisionError.
missing_news = 100.0 * no_news_mask.sum() / len(weekly_score) if len(weekly_score) else float('nan')
# print() with a single argument behaves the same on Python 2 and Python 3.
print("Percentage of weeks without news: %f%%" % missing_news)