EDA for FOMC Meeting Minutes

And testing out spaCy

Goal

Test out spaCy and build a pipeline for parsing FOMC minutes and statements and turning them into predictions.


In [5]:
import matplotlib.pyplot as plt   # Import matplotlib
# This line is necessary for the plot to appear in a Jupyter notebook
%matplotlib inline
# %pylab pulls numpy and matplotlib into the interactive namespace
%pylab inline
pylab.rcParams['figure.figsize'] = (15, 9)   # Control the default size of figures in this notebook
import glob

from collections import Counter, defaultdict

import pandas as pd
from pandas_datareader import data
from matplotlib.dates import DateFormatter, WeekdayLocator, DayLocator, MONDAY
from matplotlib.finance import candlestick_ohlc  # note: later matplotlib versions moved this into the separate mpl_finance package

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.linear_model import LogisticRegression as LR

from FOMC import FOMC
from yahoo_finance import Currency, Share
from spacy.en import English
import pickle
import datetime as dt
from __future__ import print_function
from pprint import pprint


Populating the interactive namespace from numpy and matplotlib

In [6]:
fomc = FOMC()
df = fomc.get_statements()


Getting links...
There are 167 statements..
.....Getting articles - Multi-threaded......
.............................................................................................................................................................

In [8]:
fomc.pick_df()


Writing to ../data/minutes.pickle

In [ ]:
# fomc.pick_df('../data/minutes_df.pickle')

In [ ]:
with open(r'../data/minutes_df.pickle', 'rb') as f:
    minutes_df = pickle.load(f)

In [ ]:
print(minutes_df.index[70])

In [ ]:
print(minutes_df.loc['2017-03-15'][0])

In [ ]:
nlp = English()
doc = nlp(unicode(minutes_df.loc['2017-03-15'][0]))
doc.__class__

In [ ]:
doc.sents

In [ ]:
for sent in doc.sents:
    print('new: ', sent)

In [ ]:
VXX = Share('VXX')  # Volatility
float(VXX.get_price()) - float(VXX.get_prev_close())

In [ ]:
VXX_historical = VXX.get_historical('2009-01-01', '2010-12-31')

In [ ]:
VXX_historical[0]

In [ ]:
str(dt.date.today())

In [ ]:
# We will look at stock prices over the past few years, starting at January 1, 2014
start = dt.datetime(2014,1,1)
end = dt.date.today()
 
# Let's get Apple stock data; Apple's ticker symbol is AAPL
# First argument is the series we want, second is the source ("yahoo" for Yahoo! Finance), third is the start date, fourth is the end date
apple = data.DataReader("AAPL", "yahoo", start, end)
 
type(apple)

In [ ]:
apple

In [ ]:
def pandas_candlestick_ohlc(dat, stick = "day", otherseries = None):
    """
    :param dat: pandas DataFrame object with datetime64 index, and float columns "Open", "High", "Low", and "Close", likely created via DataReader from "yahoo"
    :param stick: A string or number indicating the period of time covered by a single candlestick. Valid string inputs include "day", "week", "month", and "year", ("day" default), and any numeric input indicates the number of trading days included in a period
    :param otherseries: An iterable that will be coerced into a list, containing the columns of dat that hold other series to be plotted as lines
 
    This will show a Japanese candlestick plot for stock data stored in dat, also plotting other series if passed.
    """
    mondays = WeekdayLocator(MONDAY)        # major ticks on the mondays
    alldays = DayLocator()              # minor ticks on the days
    dayFormatter = DateFormatter('%d')      # e.g., 12
 
    # Create a new DataFrame which includes OHLC data for each period specified by stick input
    transdat = dat.loc[:,["Open", "High", "Low", "Close"]]
    if (type(stick) == str):
        if stick == "day":
            plotdat = transdat
            stick = 1 # Used for plotting
        elif stick in ["week", "month", "year"]:
            if stick == "week":
                transdat["week"] = pd.to_datetime(transdat.index).map(lambda x: x.isocalendar()[1]) # Identify weeks
            elif stick == "month":
                transdat["month"] = pd.to_datetime(transdat.index).map(lambda x: x.month) # Identify months
            transdat["year"] = pd.to_datetime(transdat.index).map(lambda x: x.isocalendar()[0]) # Identify years
            grouped = transdat.groupby(list(set(["year",stick]))) # Group by year and other appropriate variable
            plotdat = pd.DataFrame({"Open": [], "High": [], "Low": [], "Close": []}) # Create empty data frame containing what will be plotted
            for name, group in grouped:
                plotdat = plotdat.append(pd.DataFrame({"Open": group.iloc[0,0],
                                            "High": max(group.High),
                                            "Low": min(group.Low),
                                            "Close": group.iloc[-1,3]},
                                           index = [group.index[0]]))
            if stick == "week": stick = 5
            elif stick == "month": stick = 30
            elif stick == "year": stick = 365
 
    elif (type(stick) == int and stick >= 1):
        transdat["stick"] = [np.floor(i / stick) for i in range(len(transdat.index))]
        grouped = transdat.groupby("stick")
        plotdat = pd.DataFrame({"Open": [], "High": [], "Low": [], "Close": []}) # Create empty data frame containing what will be plotted
        for name, group in grouped:
            plotdat = plotdat.append(pd.DataFrame({"Open": group.iloc[0,0],
                                        "High": max(group.High),
                                        "Low": min(group.Low),
                                        "Close": group.iloc[-1,3]},
                                       index = [group.index[0]]))
 
    else:
        raise ValueError('Valid inputs to argument "stick" include the strings "day", "week", "month", "year", or a positive integer')
 
 
    # Set plot parameters, including the axis object ax used for plotting
    fig, ax = plt.subplots()
    fig.subplots_adjust(bottom=0.2)
    if plotdat.index[-1] - plotdat.index[0] < pd.Timedelta('730 days'):
        weekFormatter = DateFormatter('%b %d')  # e.g., Jan 12
        ax.xaxis.set_major_locator(mondays)
        ax.xaxis.set_minor_locator(alldays)
    else:
        weekFormatter = DateFormatter('%b %d, %Y')
    ax.xaxis.set_major_formatter(weekFormatter)
 
    ax.grid(True)
 
    # Create the candlestick chart
    candlestick_ohlc(ax, list(zip(list(date2num(plotdat.index.tolist())), plotdat["Open"].tolist(), plotdat["High"].tolist(),
                      plotdat["Low"].tolist(), plotdat["Close"].tolist())),
                      colorup = "black", colordown = "red", width = stick * .4)
 
    # Plot other series (such as moving averages) as lines
    if otherseries is not None:
        if type(otherseries) != list:
            otherseries = [otherseries]
        dat.loc[:,otherseries].plot(ax = ax, lw = 1.3, grid = True)
 
    ax.xaxis_date()
    ax.autoscale_view()
    plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right')
 
    plt.show()
 
apple["Adj Close"].plot(grid = True) # Plot the adjusted closing price of AAPL
pandas_candlestick_ohlc(apple)

In [ ]:
doc = nlp(unicode("Apples and oranges are similar. Boots and hippos aren't."))

apples = doc[0]
oranges = doc[2]
boots = doc[6]
hippos = doc[8]

In [ ]:
apples.similarity(oranges)

In [ ]:
import spacy
nlp = spacy.load('en')
doc = nlp(u'They told us to duck.')
for word in doc:
    print(word.text, word.lemma, word.lemma_, word.tag, word.tag_, word.pos, word.pos_)

In [ ]:
def find_person_occurences(processed_text):
    """
    Return a list of PERSON entities in `processed_text` with their occurrence counts.

    :param processed_text: spaCy-parsed document
    :return: list of tuples of the form
        [('elizabeth', 622), ('darcy', 312), ('jane', 286), ('bennet', 266)]
    """
    
    characters = Counter()
    for ent in processed_text.ents:
        if ent.label_ == 'PERSON':
            characters[ent.lemma_] += 1
            
    return characters.most_common()

# The remaining helpers are thin wrappers around the generic find_occurences
def find_occurences(processed_text, labels):
    """Count entities in `processed_text` whose label is in `labels`."""
    characters = Counter()
    for ent in processed_text.ents:
        if ent.label_ in labels:
            characters[ent.lemma_] += 1
    return characters.most_common()

def find_place_occurences(processed_text):
    return find_occurences(processed_text, ['GPE'])

def find_rate_occurences(processed_text):
    return find_occurences(processed_text, ['CARDINAL', 'PERCENT'])

def find_date_occurences(processed_text):
    return find_occurences(processed_text, ['DATE'])

def find_org_occurences(processed_text):
    return find_occurences(processed_text, ['ORG'])

In [ ]:
find_occurences(doc, ['MONEY','ORG'])

In [ ]:
for ent in doc.ents:
    print(ent.lemma_, ent.label_)

In [ ]:
doc = nlp(unicode(minutes_df.iloc[0,0]))
find_person_occurences(doc)

In [ ]:
print(doc.text)

In [ ]:
list(doc.noun_chunks)

In [ ]:
# Process sentences 'Hello, world. Natural Language Processing in 10 lines of code.' using spaCy
doc = nlp(u'Hello, world. Natural Language Processing in 10 lines of code.')

# Get first token of the processed document
token = doc[0]
print(token)

# Print sentences (one sentence per line)
for sent in doc.sents:
    print(sent)
    
print()
# For each token, print corresponding part of speech tag
for token in doc:
    print('{} - {}'.format(token, token.pos_))

In [ ]:
doc1 = nlp(unicode(minutes_df.iloc[0,0]))
doc2 = nlp(unicode(minutes_df.iloc[1,0]))
doc99 = nlp(unicode(minutes_df.iloc[-1,0]))

In [ ]:
doc1.similarity(doc99)

In [ ]:
word = nlp(unicode('marry'))[0]

In [ ]:
doc = nlp(unicode("her mother was talking to that one person (Lady Lucas) freely, openly, and of nothing else but her expectation that Jane would soon be married to Mr. Bingley."))

In [ ]:
VERB_LEMMA = "marry"
for ent in doc.ents:
    if ent.label_ == 'PERSON':
        print(ent.root.head.lemma_,'.')

Price data for engineering the Y values


In [ ]:
def plot_trend_data(ax, name, series):
    ax.plot(series.index, series)
    ax.set_title("{}".format(name))
def fit_moving_average_trend(series, window=6):
    return series.rolling(window=window,center=False).mean()
def plot_moving_average_trend(ax, name, series, window=6):
    moving_average_trend = fit_moving_average_trend(series, window)
    plot_trend_data(ax, name, series)
    ax.plot(series.index, moving_average_trend, color='green')

In [ ]:
prices = dict()
col_names = ['date', 'open', 'high', 'low', 'close', 'volume', 'count', 'WAP']
for filename in glob.glob('../data/*.csv'):
    this_file = filename.split('/')[-1].split('.')[0]
    prices[this_file] = pd.read_csv(filename, parse_dates=['date'], infer_datetime_format=True,
                                    names=col_names).drop_duplicates()
    prices[this_file].set_index('date', inplace=True)
    # Timestamps were recorded in Pacific time; convert them to (naive) Eastern time so they line up with FOMC release times
    prices[this_file].index = prices[this_file].index.tz_localize('America/Los_Angeles').tz_convert('America/New_York').tz_localize(None)
    prices[this_file]['close-MA-4'] = fit_moving_average_trend(prices[this_file]['close'], window=4)

In [ ]:
prices.keys()

How many observations are there in each?


In [ ]:
for key in prices.keys():
    print(len(prices[key]), "observations in {}".format(key))

Let's plot the prices to check for anomalies and outliers (usually misprints)


In [ ]:
for key in prices.keys():
    if len(key) > 8:
        plt.plot(prices[key].index, prices[key]['close'])
        plt.title(key)
        plt.show()

In [ ]:
for key in prices.keys():
    if key[:3] in ['USD','EUR']:
        plt.plot(prices[key].index, prices[key]['close'])
        plt.title(key)
        plt.show()

In [ ]:
fig, axs = plt.subplots(4, figsize=(14, 6))
plot_moving_average_trend(axs[0], 'open', prices['SPY-USD-TRADES']['open'][:100], window=10)
plot_moving_average_trend(axs[1], 'high', prices['SPY-USD-TRADES']['high'][:100], window=10)
plot_moving_average_trend(axs[2], 'low', prices['SPY-USD-TRADES']['low'][:100], window=10)
plot_moving_average_trend(axs[3], 'close', prices['SPY-USD-TRADES']['close'][:100], window=10)
plt.tight_layout()

In [ ]:
prices['SHY-USD-TRADES']['close'].plot()

Looks like the data is pretty clean now. Let's gather the y variables


In [ ]:
# For each statement date, take two timestamps (13:30 and 15:30) that bracket the afternoon statement release
pre_post_FOMC_time_selector = []
for date in minutes_df.index:
    pre_post_FOMC_time_selector.extend(pd.date_range(date.replace(hour=13, minute=30), periods=2, freq='2 H'))

In [ ]:
prices_FOMC = dict()
for key in prices.keys():
    prices_FOMC[key] = prices[key].loc[pre_post_FOMC_time_selector][['close-MA-4']].dropna()

In [ ]:
prices_FOMC['SHY-USD-TRADES'].head()

In [ ]:
this_df = prices_FOMC['SHY-USD-TRADES']

In [ ]:
this_df.head()

In [ ]:
# Within each meeting day, diff the pre- and post-release values; positive means the price rose after the release
y = this_df.groupby(this_df.index.date).diff().dropna()

In [ ]:
sum(y > 0)

In [ ]:
sum(y < 0)

We seem to have a pretty balanced number of up days and down days.

Now let's convert everything to target variables


In [ ]:
y_dfs = dict()
for key in prices_FOMC:
    y_dfs[key] = prices_FOMC[key].groupby(prices_FOMC[key].index.date).diff().dropna() 
    y_dfs[key]['fomc-close-MA-4-pct'] = y_dfs[key]['close-MA-4'] / prices[key].loc[y_dfs[key].index]['close']
    y_dfs[key].index = y_dfs[key].index.normalize()

In [ ]:
y_dfs['SPY-USD-TRADES']

Time to TF-IDF!!!

Let's build the X matrix first


In [ ]:
tfidf_vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(minutes_df['statements'].values.tolist())

In [ ]:
tfidf_vectorizer.vocabulary_

In [ ]:
tfidf_matrix.todense()

In [ ]:
tfidf_matrix.shape

In [ ]:
minutes_list = minutes_df['statements'].values.tolist()

In [ ]:
minutes_list[0]

In [ ]:
minutes_df.iloc[0]

In [ ]:
minutes_df.__class__

In [ ]:
type(minutes_df)

Tuning classifiers


In [ ]:
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV

# Grid search over an RBF-kernel SVM (X_train / y_train still need to be assembled; see the sketch below)
svm_grid = {'C': [0.01, 0.1, 1, 8, 9, 10, 11, 12, 15, 30, 15000],
            'gamma': [0.1, 1, 2, 3, 4, 5]}

rbf = SVC(kernel='rbf')
svm_gridsearch = GridSearchCV(rbf, svm_grid, n_jobs=-1, verbose=True, scoring="accuracy", cv=10)
fit = svm_gridsearch.fit(X_train, y_train)

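The grid search above assumes X_train and y_train already exist, but they are never built in this notebook. A minimal sketch of one way to assemble them (not the author's code; the variable names, the zero threshold for an "up" move, and the 80/20 chronological split are all assumptions) is to line up the TF-IDF rows, which follow the order of minutes_df, with the SPY post-statement reaction stored in y_dfs:

In [ ]:
# Hypothetical sketch: align TF-IDF features with the SPY post-statement reaction.
# Assumes minutes_df is in chronological order and y_dfs was built as above.
import numpy as np

y_spy = y_dfs['SPY-USD-TRADES']['fomc-close-MA-4-pct']
statement_dates = minutes_df.index.normalize()

# Keep only the statements for which we have a price reaction
mask = statement_dates.isin(y_spy.index)
row_idx = np.where(mask)[0]

X = tfidf_matrix[row_idx]                                      # TF-IDF rows for the matching statements
y = (y_spy.loc[statement_dates[mask]] > 0).astype(int).values  # 1 = price rose after the release

# Chronological split: fit on earlier meetings, hold out the most recent ones
split = int(0.8 * X.shape[0])
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

With that in place, the grid search above (or a cross-validation based on the imported TimeSeriesSplit) can be run on X_train and y_train.
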
Digging into the classes
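
This section was left as a stub in the notebook. As a possible starting point (a sketch, not the author's analysis), we can check how the post-statement moves split into up, down, and flat classes for each instrument, using the y_dfs dictionary built above:

In [ ]:
# Hypothetical sketch: class balance of post-statement moves for each instrument
for key in sorted(y_dfs.keys()):
    moves = y_dfs[key]['close-MA-4'].dropna()
    ups = int((moves > 0).sum())
    downs = int((moves < 0).sum())
    flats = int((moves == 0).sum())
    print('{:<25} up: {:>3}  down: {:>3}  flat: {:>3}'.format(key, ups, downs, flats))

A heavily imbalanced instrument would call for a scoring metric other than plain accuracy in the grid search above.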


In [4]:
string = "{}/{}".format(unicode(1),2)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-c4b10730261e> in <module>()
----> 1 string = "{}/{}".format(unicode(1),2)

NameError: name 'unicode' is not defined

In [3]:
string


Out[3]:
'1/2'
