In [1]:
import pandas as pd
import datetime
import numpy as np
import scipy as sp
import os
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
from bokeh import plotting
# for hover tooltip
from bokeh.models import HoverTool
font = {'size'   : 18}
matplotlib.rc('font', **font)
matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
input_folder = "../data"

Read in news headline data with sentiment score


In [2]:
time_format = "%Y-%m-%dT%H:%M:%SZ"
news_data = pd.read_csv(os.path.join(input_folder, "alchemy_nyt_bitcoin.csv"), 
                        header=True, names=['time', 'headline', 'score'],
                        index_col=0, parse_dates=[0], 
                        date_parser=lambda x: datetime.datetime.strptime(x, time_format)) \
            .drop_duplicates(take_last=True) \
            .iloc[::-1] # reverse the original order of index
# news_data = news_data['5/1/2014':'2/1/2014']
news_data.head()


Out[2]:
headline score
time
2015-03-30 12:51:17 Former U.S. Agents Charged for Bitcoin Theft D... -0.393392
2015-03-30 12:51:14 Federal Agents Charged With Stealing Digital C... 0.000000
2015-03-23 17:22:25 Nasdaq to Provide Trading Technology for Bitco... 0.243726
2015-03-18 11:16:17 In Boon for Bitcoin, UK to Regulate Digital Cu... 0.260152
2015-03-18 07:36:30 Swedish Crowdfunding Platform Launches Bitcoin... -0.238072

Copy and construct a DF with same index as news data, empty price slots and 1 as 'bit' label


In [3]:
news_index = news_data[:]
news_index.loc[:,'price'] = None
news_index.drop('headline', axis=1, inplace=True)
news_index.drop('score', axis=1, inplace=True)
news_index.loc[:,'bit'] = 1
news_index.head()


/root/Envs/btc-project/local/lib/python2.7/site-packages/pandas/core/indexing.py:244: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
/root/Envs/btc-project/local/lib/python2.7/site-packages/pandas/core/indexing.py:411: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
Out[3]:
price bit
time
2015-03-30 12:51:17 None 1
2015-03-30 12:51:14 None 1
2015-03-23 17:22:25 None 1
2015-03-18 11:16:17 None 1
2015-03-18 07:36:30 None 1

Read in price data, remove duplicates


In [4]:
time_format = "%Y-%m-%dT%H:%M:%S"
raw_price = pd.read_csv(os.path.join(input_folder, "price.csv"), names=['time', 'price'], 
                   index_col='time', parse_dates=[0], 
                   date_parser=lambda x: datetime.datetime.strptime(x, time_format))
raw_price['time_index'] = raw_price.index
raw_price.drop_duplicates(subset='time_index', take_last=True, inplace=True)
del raw_price['time_index']
# raw_price = raw_price['5/1/2014':'2/1/2014']
raw_price.head()


Out[4]:
price
time
2015-04-19 02:14:30 224.38
2015-04-19 02:08:34 224.04
2015-04-19 01:58:34 224.04
2015-04-19 01:48:34 224.03
2015-04-19 01:39:40 224.03

Sampling the price data


In [5]:
price_data = pd.DataFrame(raw_price.resample('12h', how='ohlc').ix[:, 3])
price_data.columns = ['price']
price_data.head()


Out[5]:
price
time
2011-11-01 12:00:00 3.18
2011-11-02 00:00:00 3.24
2011-11-02 12:00:00 3.25
2011-11-03 00:00:00 3.22
2011-11-03 12:00:00 3.19

Use price data to interpolate the price point of each piece of news


In [6]:
news_filled = pd.concat([news_index, price_data]).sort_index()
news_filled.loc[:,'price'].interpolate(method='time', inplace=True)
# calclate how much the price has been affected in the nearest future
news_filled['diff'] = news_filled['price'].shift(-1) - news_filled['price']
news_filled.dropna(axis=0, inplace=True)
news_filled.drop('bit', axis=1, inplace=True)
news_price = pd.merge(news_filled, news_data, how='left', left_index=True, right_index=True)
news_price.head()


Out[6]:
price diff headline score
time
2012-01-16 01:01:41 6.992585 -0.292585 'Good Wife' Watch: Jason Biggs, Jim Cramer and... 0.441828
2012-04-12 14:30:13 4.883741 -0.023741 Canada Seeks to Turn Coins Into Digital Currency -0.455343
2013-03-12 20:28:27 45.650558 0.549442 Today's Scuttlebot: Bitcoin Problem and Tracki... -0.269256
2013-04-08 00:00:00 183.750000 3.210000 Bubble or No, This Virtual Currency Is a Lot o... 0.000000
2013-04-11 16:42:24 104.151444 -2.778475 Bitcoin Has Real-World Investors -0.228388

In [7]:
news_price['diff'].describe()


Out[7]:
count    341.000000
mean       0.096439
std       13.247217
min      -61.126478
25%       -2.887945
50%       -0.077642
75%        1.841223
max      113.891018
Name: diff, dtype: float64

In [8]:
news_price['diff'].plot(kind='hist', bins=50)


Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f805cbb4350>

Apparently most of them has a very limited effect on the price within next 12 hours. But what we are interested in are the ones with bigger effect. We should color those news based on how much std they are away from the mean.


In [9]:
import math
step = news_price['diff'].std()
center = news_price['diff'].mean()
news_price['color_index'] = np.vectorize(lambda t: round((t - center)/step))(news_price['diff'])
news_price.head()


Out[9]:
price diff headline score color_index
time
2012-01-16 01:01:41 6.992585 -0.292585 'Good Wife' Watch: Jason Biggs, Jim Cramer and... 0.441828 -0
2012-04-12 14:30:13 4.883741 -0.023741 Canada Seeks to Turn Coins Into Digital Currency -0.455343 -0
2013-03-12 20:28:27 45.650558 0.549442 Today's Scuttlebot: Bitcoin Problem and Tracki... -0.269256 0
2013-04-08 00:00:00 183.750000 3.210000 Bubble or No, This Virtual Currency Is a Lot o... 0.000000 0
2013-04-11 16:42:24 104.151444 -2.778475 Bitcoin Has Real-World Investors -0.228388 -0

In [10]:
sorted(news_price['color_index'].unique())


Out[10]:
[-5.0, -4.0, -3.0, -2.0, -1.0, -0.0, 1.0, 2.0, 3.0, 4.0, 7.0, 9.0]

In [11]:
news_price['color_index'].plot(kind='hist', bins=15)


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f805c9ef610>

In [12]:
colors = ['#cc0000', '#d01515', '#d42b2b', '#d94040', '#dd5555', '#e16a6a', '#e68080', '#ea9595', '#f2bfbf',
          '#f2eaf4', 
          '#bfbfff', '#9595ff', '#8080ff', '#6a6aff', '#5555ff', '#4040ff', '#2b2bff', '#1515ff', '#0000ff'
          ]
news_price['color_type'] = np.vectorize(lambda c: colors[(-1*int(c)+9)%18])(news_price['color_index'])
news_price.head()


Out[12]:
price diff headline score color_index color_type
time
2012-01-16 01:01:41 6.992585 -0.292585 'Good Wife' Watch: Jason Biggs, Jim Cramer and... 0.441828 -0 #f2eaf4
2012-04-12 14:30:13 4.883741 -0.023741 Canada Seeks to Turn Coins Into Digital Currency -0.455343 -0 #f2eaf4
2013-03-12 20:28:27 45.650558 0.549442 Today's Scuttlebot: Bitcoin Problem and Tracki... -0.269256 0 #f2eaf4
2013-04-08 00:00:00 183.750000 3.210000 Bubble or No, This Virtual Currency Is a Lot o... 0.000000 0 #f2eaf4
2013-04-11 16:42:24 104.151444 -2.778475 Bitcoin Has Real-World Investors -0.228388 -0 #f2eaf4

In [13]:
news_price[['diff', 'score']].corr()


Out[13]:
diff score
diff 1.000000 -0.108742
score -0.108742 1.000000

Plot it out


In [16]:
# Create a set of tools to use
TOOLS="pan,wheel_zoom,box_zoom,reset,hover"

# We need to put these data into a ColumnDataSource
tip_source = plotting.ColumnDataSource(
    data=dict(
        x=news_price.index,
        y=news_price['price'],
        headline=news_price.loc[:,'headline'],
        score=news_price.loc[:,'score'],
        diff=news_price.loc[:,'diff']        
    )
)

plotting.output_notebook()
# create a figure
p1 = plotting.figure(title="Test",
                    x_axis_label="Date",
                    y_axis_label="Price",
                    x_axis_type="datetime",
                    plot_width=1000, plot_height=600,
                    tools=TOOLS)
p1.below[0].formatter.formats = dict(years=['%Y'], months=['%b %Y'], days=['%d %b %Y'])
# make plot
## price line plot
p1.line(
    price_data.index, price_data['price'],
#     color='#A6CEE3',
    legend='BTC Price')
## news headline scatter plot
p1.circle(news_price.index, news_price['price'], 
          source=tip_source,
          legend="News", 
          fill_color=news_price['color_type'],
          size=9)
# hover tooltip
hover = p1.select(dict(type=HoverTool))
hover.tooltips = [("Headline", "@headline"),
                  ("Sentiment score", "@score"),
                  ("Price change($)", "@diff")
                 ]

plotting.show(p1)


BokehJS successfully loaded.

Warning: BokehJS previously loaded