In [1]:
import pandas as pd
import datetime
import numpy as np
import scipy as sp
import os
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
from bokeh import plotting
# for hover tooltip
from bokeh.models import HoverTool
# Global matplotlib styling: 18pt text and a wide 12x6 default figure.
font = dict(size=18)
matplotlib.rc('font', **font)
matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
# Folder (relative to this notebook) that holds the input CSV files.
input_folder = "../data"
Read in news headline data with sentiment score
In [2]:
# Timestamp layout of the news CSV (ISO-8601 with a literal trailing "Z").
time_format = "%Y-%m-%dT%H:%M:%SZ"

# Load the headlines and rename the columns via `names`.
# `header=0` replaces the original `header=True`, which current pandas rejects
# with a TypeError (a bool is not a valid row number).
# NOTE(review): this assumes the CSV starts with exactly one header line -- confirm
# against the data file.
news_raw = pd.read_csv(os.path.join(input_folder, "alchemy_nyt_bitcoin.csv"),
                       header=0, names=['time', 'headline', 'score'],
                       index_col=0)
# Parse the index explicitly instead of through the deprecated `date_parser` hook;
# the fixed format yields the same naive timestamps as `strptime` did.
news_raw.index = pd.to_datetime(news_raw.index, format=time_format)

news_data = (
    news_raw
    .drop_duplicates(keep='last')  # `keep='last'` supersedes the removed `take_last=True`
    .iloc[::-1]                    # reverse the original order of index
)
# news_data = news_data['5/1/2014':'2/1/2014']
news_data.head()
Out[2]:
Construct a DataFrame with the same index as the news data, an empty `price` column, and a `bit` marker column set to 1
In [3]:
# Skeleton frame aligned with the news timestamps:
#   price -- left empty (NaN) so it can be filled by interpolation later;
#            NaN rather than None keeps the column numeric.
#   bit   -- constant 1, marking the row as a news row.
# Built directly from the index instead of slicing `news_data` and mutating the
# slice in place, so `news_data` is never touched.
news_index = pd.DataFrame({'price': np.nan, 'bit': 1},
                          index=news_data.index,
                          columns=['price', 'bit'])
news_index.head()
Out[3]:
Read in price data, remove duplicates
In [4]:
# Timestamp layout of the price CSV (no trailing "Z", unlike the news data).
time_format = "%Y-%m-%dT%H:%M:%S"
# The price file is read without a header row; `names` supplies the column labels.
raw_price = pd.read_csv(os.path.join(input_folder, "price.csv"), names=['time', 'price'],
                        index_col='time')
# Parse the index explicitly instead of through the deprecated `date_parser` hook.
raw_price.index = pd.to_datetime(raw_price.index, format=time_format)
# When a timestamp occurs more than once keep only its last row.  This replaces
# the temporary `time_index` column + `drop_duplicates(take_last=True)` dance,
# whose `take_last` argument no longer exists in pandas.
raw_price = raw_price[~raw_price.index.duplicated(keep='last')]
# raw_price = raw_price['5/1/2014':'2/1/2014']
raw_price.head()
Out[4]:
Sampling the price data
In [5]:
# Down-sample the raw prices into 12-hour bars and keep each bar's closing value
# (column 3 of the open/high/low/close result in the original code).
# `resample(..., how='ohlc')` and `.ix` were both removed from pandas, so the
# aggregation is requested through the resampler and the column picked by name.
price_data = (
    raw_price['price']
    .resample('12h')
    .ohlc()['close']
    .to_frame(name='price')
)
price_data.head()
Out[5]:
Use price data to interpolate the price point of each piece of news
In [6]:
# Interleave the (price-less) news rows with the sampled price rows in time order.
news_filled = pd.concat([news_index, price_data]).sort_index()
# Fill each news row's price by time-weighted interpolation between the
# surrounding price samples.  The result is assigned back explicitly: calling
# `interpolate(inplace=True)` on a `.loc[:, 'price']` selection operates on a
# temporary object and is not guaranteed to update `news_filled`.
# `astype(float)` turns any `None` placeholders into NaN so the column is numeric.
news_filled['price'] = news_filled['price'].astype(float).interpolate(method='time')
# calculate how much the price has been affected in the nearest future
# (difference to the next row, which is either a price sample or another headline)
news_filled['diff'] = news_filled['price'].shift(-1) - news_filled['price']
# Rows that came from `price_data` have no 'bit' value, so dropping NaN rows
# leaves only news rows that have both a price and a diff; the marker column
# is not needed afterwards.
news_filled = news_filled.dropna(axis=0).drop('bit', axis=1)
# Re-attach headline and sentiment score by timestamp.
news_price = pd.merge(news_filled, news_data, how='left', left_index=True, right_index=True)
news_price.head()
Out[6]:
In [7]:
# Summary statistics of the price change attached to each headline.
news_price.loc[:, 'diff'].describe()
Out[7]:
In [8]:
# Histogram (50 bins) of the price change attached to each headline.
price_changes = news_price['diff']
price_changes.plot(kind='hist', bins=50)
Out[8]:
Apparently most headlines have a very limited effect on the price within the next 12 hours. What we are interested in, however, are the ones with a bigger effect. We should color each news item according to how many standard deviations its price change lies from the mean.
In [9]:
import math  # not used in this cell; kept in case other cells rely on the name
# Express every price change as a whole number of standard deviations from the
# mean: 0 = within half a std of the mean, +/-1 = about one std away, and so on.
step = news_price['diff'].std()
center = news_price['diff'].mean()
# Vectorised arithmetic instead of np.vectorize over a Python lambda; the
# explicit cast keeps the bins as integers.
news_price['color_index'] = ((news_price['diff'] - center) / step).round().astype(int)
news_price.head()
Out[9]:
In [10]:
# Distinct std-deviation bins that actually occur, in ascending order.
color_bins = news_price['color_index'].unique()
sorted(color_bins)
Out[10]:
In [11]:
# Histogram (15 bins) of the std-deviation bin assigned to each headline.
bin_values = news_price['color_index']
bin_values.plot(kind='hist', bins=15)
Out[11]:
In [12]:
# Diverging palette with 19 entries: 9 reds (strongest first), one neutral shade
# in the middle, then 9 blues (strongest last).
colors = ['#cc0000', '#d01515', '#d42b2b', '#d94040', '#dd5555', '#e16a6a', '#e68080', '#ea9595', '#f2bfbf',
          '#f2eaf4',
          '#bfbfff', '#9595ff', '#8080ff', '#6a6aff', '#5555ff', '#4040ff', '#2b2bff', '#1515ff', '#0000ff'
          ]
# Bin 0 maps to the neutral middle entry; positive bins move towards the reds
# (index 0) and negative bins towards the blues (last index).
# The position is clamped to the palette bounds.  The previous `% 18` wrapped
# around instead: bin -9 got the strongest red rather than the strongest blue,
# and bins >= +10 jumped into the blues.
neutral_pos = len(colors) // 2
palette_pos = np.clip(neutral_pos - news_price['color_index'].astype(int), 0, len(colors) - 1)
news_price['color_type'] = [colors[pos] for pos in palette_pos]
news_price.head()
Out[12]:
In [13]:
# Pairwise correlation between the price change and the sentiment score.
news_price.loc[:, ['diff', 'score']].corr()
Out[13]:
Plot it out
In [16]:
# Interactive bokeh chart: the sampled price as a line, with one circle per news
# headline placed at its interpolated price and a hover tooltip showing the
# headline, its sentiment score and the following price change.
# Create a set of tools to use ("hover" is required for the tooltips below)
TOOLS="pan,wheel_zoom,box_zoom,reset,hover"
# We need to put these data into a ColumnDataSource
# The column names (headline, score, diff) are what the "@..." tooltip fields
# further down refer to.
tip_source = plotting.ColumnDataSource(
data=dict(
x=news_price.index,
y=news_price['price'],
headline=news_price.loc[:,'headline'],
score=news_price.loc[:,'score'],
diff=news_price.loc[:,'diff']
)
)
# Send bokeh output to the notebook.
plotting.output_notebook()
# create a figure
p1 = plotting.figure(title="Test",
x_axis_label="Date",
y_axis_label="Price",
x_axis_type="datetime",
plot_width=1000, plot_height=600,
tools=TOOLS)
# Tick label formats for the first item laid out below the plot
# (presumably the datetime x-axis -- confirm against the bokeh version in use).
p1.below[0].formatter.formats = dict(years=['%Y'], months=['%b %Y'], days=['%d %b %Y'])
# make plot
## price line plot (12-hour sampled prices from price_data)
p1.line(
price_data.index, price_data['price'],
# color='#A6CEE3',
legend='BTC Price')
## news headline scatter plot
# Coordinates and fill colours are passed directly from news_price, while
# tip_source is passed as the glyph's data source.
# NOTE(review): passing both raw arrays and `source=` depends on the bokeh
# version -- confirm it is still accepted before upgrading.
p1.circle(news_price.index, news_price['price'],
source=tip_source,
legend="News",
fill_color=news_price['color_type'],
size=9)
# hover tooltip
# Select the HoverTool created by "hover" in TOOLS and set what it displays.
hover = p1.select(dict(type=HoverTool))
hover.tooltips = [("Headline", "@headline"),
("Sentiment score", "@score"),
("Price change($)", "@diff")
]
plotting.show(p1)