In [5]:
from __future__ import print_function  # must precede all other code in a Python 2 cell

import matplotlib.pyplot as plt
# This line is necessary for the plot to appear in a Jupyter notebook
%matplotlib inline
# Control the default size of figures in this Jupyter notebook
plt.rcParams['figure.figsize'] = (15, 9)  # Change the size of plots

import datetime as dt
import glob
import pickle
from collections import Counter, defaultdict
from pprint import pprint

import numpy as np
import pandas as pd
from pandas_datareader import data
from matplotlib.dates import DateFormatter, WeekdayLocator, DayLocator, MONDAY, date2num
from matplotlib.finance import candlestick_ohlc  # moved to the mpl_finance package in matplotlib >= 2.0
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression as LR
from spacy.en import English  # spaCy 1.x API
from yahoo_finance import Currency, Share

from FOMC import FOMC
In [6]:
fomc = FOMC()
df = fomc.get_statements()
In [8]:
fomc.pick_df()
In [ ]:
# fomc.pick_df('../data/minutes_df.pickle')
In [ ]:
with open(r'../data/minutes_df.pickle', 'rb') as f:
    minutes_df = pickle.load(f)
In [ ]:
print(minutes_df.index[70])
In [ ]:
print(minutes_df.loc['2017-03-15'][0])
In [ ]:
nlp = English()
doc = nlp(unicode(minutes_df.loc['2017-03-15'][0]))
doc.__class__
In [ ]:
doc.sents
In [ ]:
for sent in doc.sents:
    print('new: ', sent)
In [ ]:
VXX = Share('VXX') # Volatility
float(VXX.get_price()) - float(VXX.get_prev_close())
In [ ]:
VXX_historical = VXX.get_historical('2009-01-01', '2010-12-31')
In [ ]:
VXX_historical[0]
In [ ]:
str(dt.date.today())
In [ ]:
# We will look at stock prices over the past few years, starting at January 1, 2014
start = dt.datetime(2014, 1, 1)
end = dt.date.today()
# Let's get Apple stock data; Apple's ticker symbol is AAPL
# First argument is the series we want, second is the source ("yahoo" for Yahoo! Finance), third is the start date, fourth is the end date
apple = data.DataReader("AAPL", "yahoo", start, end)
type(apple)
In [ ]:
apple
In [ ]:
def pandas_candlestick_ohlc(dat, stick="day", otherseries=None):
    """
    :param dat: pandas DataFrame object with datetime64 index, and float columns "Open", "High", "Low", and "Close", likely created via DataReader from "yahoo"
    :param stick: A string or number indicating the period of time covered by a single candlestick. Valid string inputs include "day", "week", "month", and "year" ("day" default); any numeric input indicates the number of trading days included in a period
    :param otherseries: An iterable that will be coerced into a list, containing the columns of dat that hold other series to be plotted as lines

    This will show a Japanese candlestick plot for stock data stored in dat, also plotting other series if passed.
    """
    mondays = WeekdayLocator(MONDAY)    # major ticks on the Mondays
    alldays = DayLocator()              # minor ticks on the days
    dayFormatter = DateFormatter('%d')  # e.g., 12

    # Create a new DataFrame which includes OHLC data for each period specified by stick input
    transdat = dat.loc[:, ["Open", "High", "Low", "Close"]]
    if isinstance(stick, str):
        if stick == "day":
            plotdat = transdat
            stick = 1  # Used for plotting
        elif stick in ["week", "month", "year"]:
            if stick == "week":
                transdat["week"] = pd.to_datetime(transdat.index).map(lambda x: x.isocalendar()[1])  # Identify weeks
            elif stick == "month":
                transdat["month"] = pd.to_datetime(transdat.index).map(lambda x: x.month)  # Identify months
            transdat["year"] = pd.to_datetime(transdat.index).map(lambda x: x.isocalendar()[0])  # Identify years
            grouped = transdat.groupby(["year", stick])  # Group by year and the other appropriate variable
            plotdat = pd.DataFrame({"Open": [], "High": [], "Low": [], "Close": []})  # Will contain what is plotted
            for name, group in grouped:
                plotdat = plotdat.append(pd.DataFrame({"Open": group.iloc[0, 0],
                                                       "High": max(group.High),
                                                       "Low": min(group.Low),
                                                       "Close": group.iloc[-1, 3]},
                                                      index=[group.index[0]]))
            if stick == "week": stick = 5
            elif stick == "month": stick = 30
            elif stick == "year": stick = 365
    elif isinstance(stick, int) and stick >= 1:
        transdat["stick"] = [np.floor(i / stick) for i in range(len(transdat.index))]
        grouped = transdat.groupby("stick")
        plotdat = pd.DataFrame({"Open": [], "High": [], "Low": [], "Close": []})  # Will contain what is plotted
        for name, group in grouped:
            plotdat = plotdat.append(pd.DataFrame({"Open": group.iloc[0, 0],
                                                   "High": max(group.High),
                                                   "Low": min(group.Low),
                                                   "Close": group.iloc[-1, 3]},
                                                  index=[group.index[0]]))
    else:
        raise ValueError('Valid inputs to argument "stick" include the strings "day", "week", "month", "year", or a positive integer')

    # Set plot parameters, including the axis object ax used for plotting
    fig, ax = plt.subplots()
    fig.subplots_adjust(bottom=0.2)
    if plotdat.index[-1] - plotdat.index[0] < pd.Timedelta('730 days'):
        weekFormatter = DateFormatter('%b %d')  # e.g., Jan 12
        ax.xaxis.set_major_locator(mondays)
        ax.xaxis.set_minor_locator(alldays)
    else:
        weekFormatter = DateFormatter('%b %d, %Y')
    ax.xaxis.set_major_formatter(weekFormatter)
    ax.grid(True)

    # Create the candlestick chart
    candlestick_ohlc(ax, list(zip(list(date2num(plotdat.index.tolist())),
                                  plotdat["Open"].tolist(), plotdat["High"].tolist(),
                                  plotdat["Low"].tolist(), plotdat["Close"].tolist())),
                     colorup="black", colordown="red", width=stick * .4)

    # Plot other series (such as moving averages) as lines
    if otherseries is not None:
        if not isinstance(otherseries, list):
            otherseries = [otherseries]
        dat.loc[:, otherseries].plot(ax=ax, lw=1.3, grid=True)

    ax.xaxis_date()
    ax.autoscale_view()
    plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment='right')
    plt.show()

apple["Adj Close"].plot(grid=True)  # Plot the adjusted closing price of AAPL
pandas_candlestick_ohlc(apple)
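In [ ]:
# A sketch of the otherseries parameter (not in the original): overlay a 20-day
# moving average on the candlesticks. The "20d" column name is our own choice here.
apple["20d"] = apple["Close"].rolling(window=20, center=False).mean()
pandas_candlestick_ohlc(apple, otherseries="20d")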
In [ ]:
doc = nlp(unicode("Apples and oranges are similar. Boots and hippos aren't."))
apples = doc[0]
oranges = doc[2]
boots = doc[6]
hippos = doc[8]
In [ ]:
apples.similarity(oranges)
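In [ ]:
# For contrast (not in the original): unrelated tokens should score much lower
# than the apples/oranges pair above.
boots.similarity(hippos)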
In [ ]:
import spacy
nlp = spacy.load('en')
doc = nlp(u'They told us to duck.')
for word in doc:
    print(word.text, word.lemma, word.lemma_, word.tag, word.tag_, word.pos, word.pos_)
In [ ]:
def find_person_occurences(processed_text):
    """
    Return a list of person entities from `processed_text` with corresponding occurrence counts.

    :param processed_text: spaCy NLP parsed document
    :return: list of tuples in the form
             [('elizabeth', 622), ('darcy', 312), ('jane', 286), ('bennet', 266)]
    """
    characters = Counter()
    for ent in processed_text.ents:
        if ent.label_ == 'PERSON':
            characters[ent.lemma_] += 1
    return characters.most_common()

def find_place_occurences(processed_text):
    characters = Counter()
    for ent in processed_text.ents:
        if ent.label_ == 'GPE':
            characters[ent.lemma_] += 1
    return characters.most_common()

def find_rate_occurences(processed_text):
    characters = Counter()
    for ent in processed_text.ents:
        if ent.label_ in ['CARDINAL', 'PERCENT']:
            characters[ent.lemma_] += 1
    return characters.most_common()

def find_date_occurences(processed_text):
    characters = Counter()
    for ent in processed_text.ents:
        if ent.label_ == 'DATE':
            characters[ent.lemma_] += 1
    return characters.most_common()

def find_org_occurences(processed_text):
    characters = Counter()
    for ent in processed_text.ents:
        if ent.label_ == 'ORG':
            characters[ent.lemma_] += 1
    return characters.most_common()

def find_occurences(processed_text, list_):
    # Generalizes the helpers above: count entities whose label is in list_
    characters = Counter()
    for ent in processed_text.ents:
        if ent.label_ in list_:
            characters[ent.lemma_] += 1
    return characters.most_common()
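In [ ]:
# Sanity check (not in the original): the generic helper should reproduce the
# specialized PERSON helper on the same document.
find_occurences(doc, ['PERSON']) == find_person_occurences(doc)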
In [ ]:
find_occurences(doc, ['MONEY','ORG'])
In [ ]:
for ent in doc.ents:
    print(ent.lemma_, ent.label_)
In [ ]:
doc = nlp(unicode(minutes_df.iloc[0, 0]))
find_person_occurences(doc)
In [ ]:
print(doc.text)
In [ ]:
list(doc.noun_chunks)
In [ ]:
# Process sentences 'Hello, world. Natural Language Processing in 10 lines of code.' using spaCy
doc = nlp(u'Hello, world. Natural Language Processing in 10 lines of code.')
# Get first token of the processed document
token = doc[0]
print(token)
# Print sentences (one sentence per line)
for sent in doc.sents:
    print(sent)
print()
# For each token, print corresponding part of speech tag
for token in doc:
    print('{} - {}'.format(token, token.pos_))
In [ ]:
doc1 = nlp(unicode(minutes_df.iloc[0,0]))
doc2 = nlp(unicode(minutes_df.iloc[1,0]))
doc99 = nlp(unicode(minutes_df.iloc[-1,0]))
In [ ]:
doc1.similarity(doc99)
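In [ ]:
# A small sketch (not in the original): pairwise similarities among the parsed
# statements loaded above, to see how close adjacent statements are.
print(doc1.similarity(doc2))
print(doc2.similarity(doc99))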
In [ ]:
word = nlp(unicode('marry'))[0]
In [ ]:
doc = nlp(unicode("her mother was talking to that one person (Lady Lucas) freely, openly, and of nothing else but her expectation that Jane would soon be married to Mr. Bingley."))
In [ ]:
VERB_LEMMA = "marry"
for ent in doc.ents:
    # Keep only PERSON entities whose syntactic head is the verb "marry"
    if ent.label_ == 'PERSON' and ent.root.head.lemma_ == VERB_LEMMA:
        print(ent.text)
In [ ]:
def plot_trend_data(ax, name, series):
    ax.plot(series.index, series)
    ax.set_title("{}".format(name))

def fit_moving_average_trend(series, window=6):
    return series.rolling(window=window, center=False).mean()

def plot_moving_average_trend(ax, name, series, window=6):
    moving_average_trend = fit_moving_average_trend(series, window)
    plot_trend_data(ax, name, series)
    ax.plot(series.index, moving_average_trend, color='green')
In [ ]:
prices = dict()
col_names = ['date', 'open', 'high', 'low', 'close', 'volume', 'count', 'WAP']
for filename in glob.glob('../data/*.csv'):
    this_file = filename.split('/')[-1].split('.')[0]
    prices[this_file] = pd.read_csv(filename, parse_dates=['date'],
                                    infer_datetime_format=True, names=col_names).drop_duplicates()
    prices[this_file].set_index('date', inplace=True)
    # Timestamps arrive in Pacific time; shift them to Eastern, then drop the tz info
    prices[this_file].index = prices[this_file].index.tz_localize('America/Los_Angeles').tz_convert('America/New_York').tz_localize(None)
    prices[this_file]['close-MA-4'] = fit_moving_average_trend(prices[this_file]['close'], window=4)
In [ ]:
prices.keys()
In [ ]:
for key in prices.keys():
    print(len(prices[key]), "observations in {}".format(key))
In [ ]:
for key in prices.keys():
    if len(key) > 8:
        plt.plot(prices[key].index, prices[key]['close'])
        plt.title(key)
        plt.show()
In [ ]:
for key in prices.keys():
    if key[:3] in ['USD', 'EUR']:
        plt.plot(prices[key].index, prices[key]['close'])
        plt.title(key)
        plt.show()
In [ ]:
fig, axs = plt.subplots(4, figsize=(14, 6))
plot_moving_average_trend(axs[0], 'open', prices['SPY-USD-TRADES']['open'][:100], window=10)
plot_moving_average_trend(axs[1], 'high', prices['SPY-USD-TRADES']['high'][:100], window=10)
plot_moving_average_trend(axs[2], 'low', prices['SPY-USD-TRADES']['low'][:100], window=10)
plot_moving_average_trend(axs[3], 'close', prices['SPY-USD-TRADES']['close'][:100], window=10)
plt.tight_layout()
In [ ]:
prices['SHY-USD-TRADES']['close'].plot()
In [ ]:
pre_post_FOMC_time_selector = []
for date in minutes_df.index:
    # Two timestamps per FOMC date: 13:30 and 15:30 Eastern, bracketing the 2 pm release
    pre_post_FOMC_time_selector.extend(pd.date_range(date.replace(hour=13, minute=30), periods=2, freq='2 H'))
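In [ ]:
# Quick check (not in the original): each FOMC date should contribute a 13:30
# and a 15:30 timestamp.
pre_post_FOMC_time_selector[:4]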
In [ ]:
prices_FOMC = dict()
for key in prices.keys():
    prices_FOMC[key] = prices[key].loc[pre_post_FOMC_time_selector][['close-MA-4']].dropna()
In [ ]:
prices_FOMC['SHY-USD-TRADES'].head()
In [ ]:
this_df = prices_FOMC['SHY-USD-TRADES']
In [ ]:
this_df.head()
In [ ]:
# Within each FOMC day, diff() gives the 15:30 value minus the 13:30 value
y = this_df.groupby(this_df.index.date).diff().dropna()
In [ ]:
sum(y > 0)
In [ ]:
sum(y < 0)
In [ ]:
y_dfs = dict()
for key in prices_FOMC:
    y_dfs[key] = prices_FOMC[key].groupby(prices_FOMC[key].index.date).diff().dropna()
    # Convert the raw pre/post move into a fraction of the close at that timestamp
    y_dfs[key]['fomc-close-MA-4-pct'] = y_dfs[key]['close-MA-4'] / prices[key].loc[y_dfs[key].index]['close']
    y_dfs[key].index = y_dfs[key].index.normalize()
In [ ]:
y_dfs['SPY-USD-TRADES']
In [ ]:
tfidf_vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(minutes_df['statements'].values.tolist())
In [ ]:
tfidf_vectorizer.vocabulary_
In [ ]:
tfidf_matrix.todense()
In [ ]:
tfidf_matrix.shape
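In [ ]:
# A sketch (not in the original): the ten highest-weighted TF-IDF terms in the
# first statement. get_feature_names() is the accessor in this era of
# scikit-learn; newer versions use get_feature_names_out().
terms = np.array(tfidf_vectorizer.get_feature_names())
row = tfidf_matrix[0].toarray().ravel()
for i in row.argsort()[::-1][:10]:
    print(terms[i], round(row[i], 4))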
In [ ]:
minutes_list = minutes_df['statements'].values.tolist()
In [ ]:
minutes_list[0]
In [ ]:
minutes_df.iloc[0]
In [ ]:
minutes_df.__class__
In [ ]:
type(minutes_df)
In [ ]:
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
# NOTE: `rbf` was undefined in the original; X_train / y_train must be built upstream
rbf = SVC(kernel='rbf')
svm_grid = {'C': [0.01, 0.1, 1, 8, 9, 10, 11, 12, 15, 30, 15000],
            'gamma': [0.1, 1, 2, 3, 4, 5]}
# TimeSeriesSplit (imported above) keeps folds in temporal order; the original used cv=10
svm_gridsearch = GridSearchCV(rbf, svm_grid, n_jobs=-1, verbose=True,
                              scoring="accuracy", cv=TimeSeriesSplit(n_splits=10))
fit = svm_gridsearch.fit(X_train, y_train)
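In [ ]:
# Usage sketch (not in the original): once the search finishes, inspect the
# winning configuration and its cross-validated accuracy.
print(svm_gridsearch.best_params_)
print(svm_gridsearch.best_score_)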
In [4]:
string = "{}/{}".format(1, 2)
In [3]:
string
Out[3]:
'1/2'