The site provides collaboration and a Python IDE directly in the browser, which is how users interact with their data.
Both of us are brand new to the site, so we are exploring it in the order of the help file.
Symbols:
symbol('goog') # single
symbols('goog', 'fb') # multiple
sid(24) # security ID, unique to Quantopian -- 24 is AAPL
Ordering: market, limit, stop, stop limit
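A sketch of how those order types look inside an algorithm (the limit_price and stop_price keyword names are from the help file; double-check the exact signatures there):
In [ ]:
# The four order types, as they would appear in handle_data().
order(symbol('goog'), 100)                              # market: buy 100 shares at the market price
order(symbol('goog'), 100, limit_price=550.00)          # limit: buy only at 550 or better
order(symbol('goog'), -100, stop_price=500.00)          # stop: sell if the price falls to 500
order(symbol('goog'), -100, limit_price=495.00, stop_price=500.00)  # stop limit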
The API is thin -- it's really just about trading -- so we will instead explore machine learning using existing data outside of the Quantopian environment; the rest of this notebook is a how-to. They also provide zipline, their backtesting library, as open-source code (GitHub repo / PyPI page) that can run outside of Quantopian.
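For reference, a zipline algorithm is just a pair of functions; this is roughly the buy-and-record example from their tutorial (how you run it -- the TradingAlgorithm class or the newer zipline CLI -- depends on the installed version):
In [ ]:
# Skeleton of a zipline algorithm, following the tutorial's buy-and-record
# example. The data access style (data[symbol('AAPL')].price) is from the
# version current when this was written and may differ in newer releases.
from zipline.api import order, record, symbol

def initialize(context):
    # One-time setup before the backtest starts; nothing needed here.
    pass

def handle_data(context, data):
    # Called once per bar: buy 10 shares of AAPL and record its price.
    order(symbol('AAPL'), 10)
    record(AAPL=data[symbol('AAPL')].price)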
In [ ]:
import numpy as np
import pandas as pd
import pandas.io.data as web   # remote data access (later split out into pandas-datareader)
import matplotlib.pyplot as plt
%matplotlib inline
import requests
import datetime

from database import Database  # local helper module wrapping this project's stock-returns table
db = Database()
In [ ]:
# Ticker symbols for Fortune 500 companies in the Chicago area
chicago_companies = ['ADM', 'BA', 'WGN', 'UAL', 'SHLD',
                     'MDLZ', 'ALL', 'MCD', 'EXC', 'ABT',
                     'ABBV', 'KRFT', 'ITW', 'NAV', 'CDW',
                     'RRD', 'GWW', 'DFS', 'DOV', 'MSI',
                     'TEN', 'INGR', 'NI', 'TEG', 'CF',
                     'ORI', 'USTR', 'LKQ']
In [ ]:
url = "http://ichart.yahoo.com/table.csv?s={stock}&a={strt_month}&b={strt_day}&c={strt_year}&d={end_month}&e={end_day}&f={end_year}&g=w&ignore=.csv"
params = {"stock":'ADM', "strt_month": 1, 'strt_day':1, 'strt_year': 2010,
"end_month": 1, "end_day": 1, "end_year": 2014}
new_url = url.format(**params)
print url.format(**params)
data_new = web.DataReader(chicago_companies, 'yahoo', datetime.datetime(2010,1,1), datetime.datetime(2014,1,1))
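The constructed URL is not used below, but it could also be read straight into a DataFrame, assuming the ichart endpoint keeps returning CSV:
In [ ]:
# Read ADM's weekly history directly from the URL built above.
adm_weekly = pd.read_csv(new_url)
adm_weekly.head()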
In [ ]:
# All of the stock data is contained in a 3D data structure called a Panel
data_new
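A couple of ways to slice the Panel (per the pandas remote-data docs: items are the price fields, the major axis is the date, and the minor axis is the ticker):
In [ ]:
# One field across every ticker, or every field for one ticker.
print data_new['Adj Close'].head()   # DataFrame of adjusted closes, dates x tickers
data_new.minor_xs('ADM').head()      # DataFrame of all price fields for ADM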
In [ ]:
ADM = pd.read_csv('data/table (4).csv')
ADM.head()
In [ ]:
ADM = ADM.sort_index(by='Date')
ADM['Short'] = pd.rolling_mean(ADM['Close'], 4)    # short moving average of the close
ADM['Long'] = pd.rolling_mean(ADM['Close'], 12)    # long moving average of the close
ADM.plot(x='Date', y=['Close', 'Long', 'Short'])
In [ ]:
# 1.0 where the short moving average is above the long one -- a crude buy signal
buy_signals = np.where(pd.rolling_mean(ADM['Close'], 4) > pd.rolling_mean(ADM['Close'], 12), 1.0, 0.0)
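As a quick sanity check of the signal (no trading costs, and the position changes on the bar after the crossover), we can compare holding ADM only while the short average is above the long one against simply holding it:
In [ ]:
# Hold ADM while the short moving average is above the long one (enter on the
# next bar) and compare cumulative growth against buy-and-hold.
ADM['Signal'] = buy_signals
ADM['Return'] = ADM['Close'].pct_change()
ADM['Strategy'] = ADM['Return'] * ADM['Signal'].shift(1)
ADM['BuyHoldGrowth'] = (1 + ADM['Return'].fillna(0)).cumprod()
ADM['StrategyGrowth'] = (1 + ADM['Strategy'].fillna(0)).cumprod()
ADM.plot(x='Date', y=['BuyHoldGrowth', 'StrategyGrowth'])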
Scikit-learn's home page divides up the space of machine learning well, but the Mahout algorithms page has a more comprehensive list. Drawing from both:
The S&P 500 dataset is great for us to quickly explore regression, clustering, and principal component analysis. We can also backtest here using Quantopian's zipline library.
We can calculate our own 'Beta' by pulling the S&P 500 index values from Yahoo with pandas and then regressing each of our S&P 500 components against it. The NASDAQ defines Beta as the coefficient found by regressing the individual stock's returns in excess of the 90-day Treasury rate on the S&P 500's returns in excess of the same rate. The NASDAQ cautions that Beta can change over time and that it may differ for positive and negative moves.
The equation for the regression will be:
(Return - Treas90) = Beta * (SP500 - Treas90) + alpha
In [ ]:
date_range = db.select_one("SELECT min(dt), max(dt) FROM return;", columns=("start", "end"))
chicago_companies = ['ADM', 'BA', 'MDLZ', 'ALL', 'MCD', 'EXC', 'ABT',
                     'ABBV', 'KRFT', 'ITW', 'GWW', 'DFS', 'DOV', 'MSI',
                     'NI', 'TEG', 'CF']
chicago_returns = db.select('SELECT dt, "{}" FROM return ORDER BY dt;'.format(
                                '", "'.join(c.lower() for c in chicago_companies)),
                            columns=["Date"] + chicago_companies)
chi_dates = [row.pop("Date") for row in chicago_returns]
chicago_returns = pd.DataFrame(chicago_returns, index=chi_dates)
# Market (S&P 500) and 90-day Treasury (^IRX) series from Yahoo over the same dates
sp500 = web.DataReader('^GSPC', 'yahoo', date_range['start'], date_range['end'])
sp500['sp500'] = sp500['Adj Close'].diff() / sp500['Adj Close']
treas90 = web.DataReader('^IRX', 'yahoo', date_range['start'], date_range['end'])
treas90['treas90'] = treas90['Adj Close'].diff() / treas90['Adj Close']
chicago_returns = chicago_returns.join(sp500['sp500'])
chicago_returns = chicago_returns.join(treas90['treas90'])
chicago_returns.drop(chicago_returns.head(1).index, inplace=True)   # first row has no prior value
chicago_returns = chicago_returns.sub(chicago_returns['treas90'], axis=0)   # excess returns over the 90-day rate
chicago_returns.replace([np.inf, -np.inf], np.nan, inplace=True)
chicago_returns = chicago_returns / 100   # returns in the database are stored in percent
In [ ]:
# For ordinary least squares regression
from pandas.stats.api import ols
In [ ]:
regressions = {}
for y in chicago_returns.columns:
    if y not in ('sp500', 'treas90'):
        df = chicago_returns.dropna(subset=[y, 'sp500'])[[y, 'sp500']]
        regressions[y] = ols(y=df[y], x=df['sp500'])
regressions
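As a quick cross-check with plain numpy: a degree-1 polyfit on the same two columns returns [slope, intercept], which should match [Beta, alpha] from the ols result (using ADM as the example):
In [ ]:
# np.polyfit returns the coefficients highest degree first: [Beta, alpha].
check = chicago_returns.dropna(subset=['ADM', 'sp500'])
np.polyfit(check['sp500'], check['ADM'], 1)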
In [ ]:
symbols = sorted(regressions.keys())
data = []
for s in symbols:
    data.append(dict(alpha=regressions[s].beta[1], beta=regressions[s].beta[0]))
betas = pd.DataFrame(data=data, index=symbols)
betas
In [ ]:
betas.values[0,0]
In [ ]:
fig = plt.figure()
fig.suptitle('Betas vs Alphas', fontsize=14, fontweight='bold')
ax = fig.add_subplot(111)
fig.subplots_adjust(top=0.85)
ax.set_xlabel('Beta')
ax.set_ylabel('Alpha')
for i in range(len(betas.index)):
    ax.text(betas.values[i, 1], betas.values[i, 0], betas.index[i])
ax.set_ylim([0, 0.2])
ax.set_xlim([0.75, 1.2])
In [ ]:
betas.describe()
In [ ]:
help(ax.text)
We can use both clustering and principal component analysis to reduce the dimensionality of the problem.
In [ ]:
import scipy.spatial.distance as dist
import scipy.cluster.hierarchy as hclust
In [ ]:
# Steps:
#   1. Spatial distance: scipy.spatial.distance.pdist(X)
#      http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html#scipy.spatial.distance.pdist
#   2. Perform the hierarchical clustering: scipy.cluster.hierarchy.linkage(distance_matrix)
#      https://docs.scipy.org/doc/scipy-0.15.1/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
#   3. Plot the dendrogram: hclust.dendrogram(links, labels=chicago_returns.columns)
chicago_dist = dist.pdist(chicago_returns.dropna().transpose(), 'euclidean')
links = hclust.linkage(chicago_dist)
In [ ]:
chicago_dist
In [ ]:
plt.figure(figsize=(5,10))
#data_link = hclust.linkage(chicago_dist, method='single', metric='euclidean')
den = hclust.dendrogram(links,labels=chicago_returns.columns, orientation="left")
plt.ylabel('Samples', fontsize=9)
plt.xlabel('Distance')
plt.suptitle('Stocks clustered by similarity', fontweight='bold', fontsize=14);
In [ ]:
with open("sp500_columns.txt") as infile:
sp_companies = infile.read().strip().split("\n")
returns = db.select( ('SELECT dt, "{}" FROM return '
'WHERE dt BETWEEN \'2012-01-01\' AND \'2012-12-31\''
'ORDER BY dt;').format(
'", "'.join((c.lower() for c in sp_companies))),
columns=["Date"] + sp_companies)
sp_dates = [row.pop("Date") for row in returns]
returns = pd.DataFrame(returns, index=sp_dates)
In [ ]:
#Calculate distance and cluster
sp_dist = dist.pdist(returns.dropna().transpose(), 'euclidean')
sp_links = hclust.linkage(sp_dist, method='single', metric='euclidean')
In [ ]:
plt.figure(figsize=(10,180))
#data_link = hclust.linkage(chicago_dist, method='single', metric='euclidean')
den = hclust.dendrogram(sp_links,labels=returns.columns, orientation="left")
plt.ylabel('Samples', fontsize=9)
plt.xlabel('Distance')
plt.suptitle('Stocks clustered by similarity', fontweight='bold', fontsize=14)
In [ ]:
returns.transpose().dropna().shape
In [ ]:
from scipy.cluster.vq import whiten
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
In [ ]:
pca = PCA(n_components=2)
pca.fit(returns.transpose().dropna())
princ_comp_returns = pca.transform(returns.transpose().dropna())
princ_comp_returns.shape
In [ ]:
normalize = whiten(returns.transpose().dropna())   # scale each column to unit variance
km = KMeans(n_clusters=2)
km.fit(normalize)
centroids = km.cluster_centers_
#idx = vq(normalize, centroids)
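To see which ticker fell into which cluster, pair the labels with the ticker names (the rows of the whitened matrix are in the same order as returns.transpose().dropna()):
In [ ]:
# Map each ticker to its k-means cluster label.
tickers = returns.transpose().dropna().index
pd.Series(km.labels_, index=tickers)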
In [ ]:
pca = PCA(n_components=2)
pca.fit(returns.transpose().dropna())
princ_comp = pca.transform(centroids)
princ_comp
In [ ]:
colors = km.labels_
plt.scatter(princ_comp_returns[:,0], princ_comp_returns[:, 1], c= colors)
The Modern Portfolio Theory code finds an 'efficient frontier': for any desired return, the minimum-risk linear combination of the available stocks. If pairs of stocks are very highly correlated, they should first be grouped into categories to reduce the risk of trying to invert a (nearly) singular covariance matrix.
The return of a portfolio is equal to the sum of the individual returns of the stocks multiplied by their percent allocations:
In [6]:
%%latex
\begin{aligned}
r_{portfolio} =
\left(\begin{matrix}w_1 & w_2 & \ldots & w_n \end{matrix}\right) \, \cdot \,
\left(\begin{matrix}r_1\\ r_2 \\ \vdots \\ r_n \end{matrix}\right)
\end{aligned}
The variance of the portfolio combines each stock's own variance with the covariance between every pair of stocks:
In [5]:
%%latex
\begin{aligned}
\sigma^2( x_1 + x_2) = \sigma^2(x_1) + \sigma^2(x_2) + 2 \, \cdot \, cov(x_1, x_2),
\qquad cov(x_1, x_2) = \rho_{1,2} \, \cdot \, \sigma(x_1) \, \cdot \, \sigma(x_2)
\\[0.25in]
\sigma^2( portfolio ) =
\left(\begin{matrix}w_1 & w_2 & \ldots & w_n \end{matrix}\right) \, \cdot \,
\left(\begin{matrix}cov(x_1,x_1) & cov(x_1, x_2) &\ldots & cov(x_1, x_n)\\
cov(x_2, x_1) & cov(x_2, x_2) &\ldots & cov(x_2, x_n) \\
\vdots & \vdots & \ddots & \vdots \\
cov(x_n, x_1)& cov(x_n, x_2) & \ldots & cov(x_n,x_n) \end{matrix}\right)
\left(\begin{matrix}w_1 \\ w_2 \\ \vdots \\ w_n \end{matrix}\right)
\end{aligned}
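A small numpy version of the two formulas, with equal weights standing in for an actual allocation (the sp500 and treas90 helper columns from earlier are dropped):
In [ ]:
# Portfolio return and variance: r_p = w . r and var_p = w . Cov . w
stocks = chicago_returns.drop(['sp500', 'treas90'], axis=1).dropna()
w = np.ones(len(stocks.columns)) / len(stocks.columns)   # equal weights summing to 1
r_portfolio = np.dot(w, stocks.mean())
var_portfolio = np.dot(w, np.dot(stocks.cov(), w))
r_portfolio, np.sqrt(var_portfolio)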
In [ ]:
import mpt
In [ ]:
date_range = {"start": datetime.date(2009,1,1), "end": datetime.date(2012,12,31)}
chicago_companies = ['ADM', 'BA', 'MDLZ', 'ALL', 'MCD', 'EXC', 'ABT',
'KRFT', 'ITW', 'GWW', 'DFS', 'DOV', 'MSI',
'NI', 'TEG', 'CF' ]
chicago_returns = db.select(('SELECT dt, "{}" FROM return'
' WHERE dt BETWEEN ''%s'' AND ''%s'''
' ORDER BY dt;').format(
'", "'.join((c.lower() for c in chicago_companies))),
columns=["Date"] + chicago_companies,
args=[date_range['start'], date_range['end']])
# At this point chicago_returns is an array of dictionaries
# [{"Date":datetime.date(2009,1,1), "ADM":0.1, "BA":0.2, etc ... , "CF": 0.1},
# {"Date":datetime.date(2009,1,2), "ADM":0.2, "BA":0.1, etc ..., "CF":0.1},
# ...,
# {"Date":datetime.date(2012,12,31), "ADM":0.2, "BA":0.1, etc ..., "CF":0.1}]
# Pull out the dates to mak them the indices in the Pandas DataFrame
chi_dates = [row.pop("Date") for row in chicago_returns]
chicago_returns = pd.DataFrame(chicago_returns, index=chi_dates)
chicago_returns = chicago_returns / 100
treas90 = web.DataReader('^IRX', 'yahoo', date_range['start'], date_range['end'])
treas90['treas90'] = treas90['Adj Close'].diff() / treas90['Adj Close']
chicago_returns = chicago_returns.join(treas90['treas90'])
chicago_returns.drop(chicago_returns.head(1).index, inplace=True)
chicago_returns.replace([np.inf, -np.inf], np.nan, inplace=True)
result = mpt.get_efficient_frontier(chicago_returns)
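The mpt module is our own local code; as a rough idea of what get_efficient_frontier could look like, here is a hypothetical sketch with scipy.optimize (a long-only minimum-variance problem at each target return -- not the actual mpt implementation):
In [ ]:
# Hypothetical frontier: minimize w'Cov w subject to w'mu = target and sum(w) = 1.
from scipy.optimize import minimize

def sketch_frontier(returns_df, n_points=20):
    clean = returns_df.drop('treas90', axis=1).dropna()
    mu = clean.mean().values * 250        # annualized mean returns
    cov = clean.cov().values * 250        # annualized covariance matrix
    n = len(mu)
    risks, rets = [], []
    for target in np.linspace(mu.min(), mu.max(), n_points):
        cons = ({'type': 'eq', 'fun': lambda w: w.sum() - 1},
                {'type': 'eq', 'fun': lambda w, t=target: w.dot(mu) - t})
        res = minimize(lambda w: w.dot(cov).dot(w), np.ones(n) / n,
                       method='SLSQP', bounds=[(0, 1)] * n, constraints=cons)
        if res.success:
            risks.append(np.sqrt(res.fun))   # standard deviation at this target return
            rets.append(target)
    return {"risks": risks, "returns": rets}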
In [ ]:
chicago_returns.std()
In [ ]:
fig = plt.figure()
fig.suptitle("MPT Efficient Frontier", fontsize=14, fontweight='bold')
ax = fig.add_subplot(111)
fig.subplots_adjust(top=0.85)
ax.set_xlabel('Risk')
ax.set_ylabel('Target return (annual)')
sds = list(chicago_returns.std()*np.sqrt(250))
mus = list(chicago_returns.mean()*250)
syms = list(chicago_returns.columns)
#for i in range(len(sds)):
# plt.text(sds[i], mus[i], syms[i], bbox={'facecolor':'slateblue', 'alpha':0.5, 'pad':10})
ax.plot(result["risks"], result["returns"], linewidth=2)
ax.scatter(sds, mus, alpha=0.5)
I could not use the pip install of plotly's Python API because it was missing almost all of the submodules (at least for Python 2.7). Instead, install from GitHub:
git clone https://github.com/plotly/python-api.git
cd python-api
python setup.py install
Also configure plotly for your account, per the instructions here. My configuration:
python -c "import plotly; plotly.tools.set_credentials_file(username='tanya.schlusser', api_key='9fjjjaguyh', stream_ids=['n8gn5rg5ag', 'dlsytpqrr0'])"