In [1]:
from pandas_datareader import data, wb
In [2]:
# Accumulator shared by the download loop below:
# ticker symbol -> {'source': fetch method name, 'data': price DataFrame}.
tickers_data = {}
In [3]:
from tqdm import tqdm

# Merval (Buenos Aires) ticker symbols to download.
tickers = ['APBR', 'PAMP', 'YPFD', 'GGAL', 'ERAR', 'CRES', 'COME', 'ALUA', 'FRAN', 'MIRG',
           'BMA', 'TRAN', 'TS', 'JMIN', 'EDN', 'TGSU2', 'SAMI', 'AGRO', 'TECO2', 'PESA',
           'CEPU', 'CTIO', 'CECO2', 'AUSO', 'PETR', 'CELU', 'TGNO4']

for ticker in tqdm(tickers):
    # Skip tickers already fetched so the cell is safe to re-run after failures.
    if ticker in tickers_data:
        continue
    # Try Google first, then fall back to Yahoo.
    for method in ('get_data_google', 'get_data_yahoo'):
        try:
            tickers_data[ticker] = {
                'source': method,
                'data': getattr(data, method)(ticker)
            }
            # BUGFIX: stop at the first source that works.  Without this break
            # a successful Google fetch was overwritten by the Yahoo fallback
            # (and 'source' mislabeled) whenever both sources responded.
            break
        except Exception:
            # Best-effort: a source may be down or lack the symbol; try the next.
            continue
In [4]:
import pickle

# Persist the downloaded data so later sessions can skip the network fetch.
# BUGFIX: pickle protocol 2 is a binary format, so the file must be opened in
# binary mode ('wb').  Text mode ('w') corrupts the stream on Windows under
# Python 2 and raises a TypeError outright on Python 3.
with open('tickers_data.pkl', 'wb') as f:
    pickle.dump(tickers_data, f, 2)
In [5]:
# nbagg backend for interactive figures inside the notebook.
# NOTE(review): %pylab star-imports numpy and matplotlib.pyplot into the global
# namespace (figure, plot, title, choice, np, ...) — later cells rely on those
# names, so replacing it with explicit imports would require touching them all.
%matplotlib nbagg
%pylab
In [98]:
from statsmodels.tsa.filters.hp_filter import hpfilter

# For each downloaded ticker: holding-period return over 2017 plus an
# HP-smoothed version of the close series (for less noisy correlations).
returns = []
# BUGFIX: .items() instead of .iteritems() — iteritems() is Python 2 only,
# while items() behaves the same here on both Python 2 and 3.
for ticker, ticker_data in tickers_data.items():
    close = ticker_data['data']['2017-01-01':].Close
    if len(close) == 0:
        # No rows in the window (failed download or stale series) — skip.
        continue
    returns.append(
        {
            # Simple holding-period return over the window.
            'return': (close.iloc[-1] - close.iloc[0]) / close.iloc[0],
            # hpfilter returns (cycle, trend); keep the smoothed trend.
            # lamb=50 — smoothing strength, presumably tuned by eye for daily
            # data; TODO confirm against the standard daily-frequency choice.
            'close': hpfilter(close, 50)[1],
            'close_orig': close,
            'ticker': ticker
        }
    )
In [99]:
# Eyeball the HP filter: plot one randomly chosen ticker's smoothed close
# on top of the original close series.
figure()
picked = choice(returns)
title(picked['ticker'])
picked['close'].plot()
picked['close_orig'].plot()
Out[99]:
In [100]:
def corr(s1, s2, start):
    """Pearson correlation of the day-over-day changes of two price series.

    Both series are restricted to dates >= `start` and normalized to their
    first value; correlating the first differences (rather than the levels)
    avoids the spurious correlation of two series that merely trend together.
    """
    window1 = s1[start:]
    window2 = s2[start:]
    normalized1 = window1 / window1.iloc[0]
    normalized2 = window2 / window2.iloc[0]
    # Levels-based alternative kept for reference: normalized1.corr(normalized2)
    return normalized1.diff().corr(normalized2.diff())
# All ordered ticker pairs — including self-pairs and both (a, b) / (b, a)
# orientations, because the matrix-building cell below looks each pair up
# in a single direction.
corrs = [
    {
        't1': left['ticker'],
        't2': right['ticker'],
        'corr': corr(left['close'], right['close'], '2016-01-01')
    }
    for left in returns
    for right in returns
]
In [101]:
import pandas as pd
# NOTE(review): this rebinds `returns` from the list of dicts built above to a
# DataFrame — hidden-state hazard: the correlation loop (cell 100) must run
# BEFORE this cell, and downstream cells expect the DataFrame form.  A distinct
# name (e.g. returns_df) would be safer, but later cells reference `returns`.
returns = pd.DataFrame(returns)
# Rank tickers by holding-period return, best first (last expression displays).
returns.sort_values('return', ascending=False)[['ticker', 'return']]
Out[101]:
In [102]:
# Same name-reuse pattern as the previous cell: `corrs` list -> DataFrame,
# sorted by correlation descending.
corrs = pd.DataFrame(corrs)
corrs = corrs.sort_values('corr', ascending=False)
# Tickers most correlated with PAMP, shown next to their own 2017 return
# (relies on `returns` already being the DataFrame built in the previous cell).
corrs[corrs.t1 == 'PAMP'].merge(returns, left_on='t2', right_on='ticker')[['t2', 'corr', 'return']]
Out[102]:
In [104]:
from sklearn.cluster import k_means
# Elbow plot: k-means inertia for k = 2..9 on the correlation matrix M.
# NOTE(review): M is built in a LATER cell (In [112]) — this notebook only
# works when executed out of order; a fresh Restart-&-Run-All fails here.
figure()
# k_means returns (centroids, labels, inertia); [-1] picks the inertia.
plot(range(2, 10), [k_means(M, i)[-1] for i in range(2, 10)], '-o')
Out[104]:
In [105]:
from sklearn.cluster import k_means
# Map ticker name -> k-means cluster id, k=5 presumably chosen from the elbow
# plot above — TODO confirm.  k_means returns (centroids, labels, inertia);
# [1] picks the per-row labels.
# NOTE(review): `names` and `M` are defined in the NEXT cell (In [112]) —
# another out-of-order dependency; this cell fails on a fresh top-to-bottom run.
clusters = dict(zip(names, k_means(M, 5)[1]))
In [112]:
from collections import defaultdict

# Order tickers by their total correlation against all other tickers, so the
# heatmap below groups strongly co-moving names together.
names = sorted(corrs.t1.unique(), key=lambda x: corrs['corr'][corrs.t1 == x].sum())

# If k-means clusters are available (cell In [105]), refine the ordering so
# that members of the same cluster end up adjacent in the matrix.
# BUGFIX: the guard used to be `if 'avg_cluster_distance' in globals()`, but
# avg_cluster_distance is only ever defined INSIDE this branch, so the branch
# could never execute — the cluster-aware reordering was dead code.  The real
# data dependency is `clusters`, so that is what is checked now.
if 'clusters' in globals():
    avg_cluster_distance = defaultdict(list)
    # .items() instead of the Python-2-only .iteritems().
    for name, cl_id in clusters.items():
        avg_cluster_distance[cl_id].append(corrs['corr'][corrs.t1 == name].sum())
    # Collapse each cluster's member sums to its mean, then sort tickers by
    # their cluster's mean so whole clusters move together.
    avg_cluster_distance = {cl_id: np.mean(v) for cl_id, v in avg_cluster_distance.items()}
    names = sorted(names, key=lambda x: avg_cluster_distance[clusters[x]])

# Dense pairwise-correlation matrix in the chosen ticker order.
M = []
for name1 in names:
    row = []
    for name2 in names:
        if name1 == name2:
            # Zero the diagonal so self-correlation (1.0) doesn't dominate
            # the colormap scale.
            row.append(0)
        else:
            c = corrs[(corrs.t1 == name1) & (corrs.t2 == name2)]
            row.append(c['corr'].iloc[0])
    M.append(row)
In [111]:
figure()
# Heatmap of the pairwise correlation matrix; rows/columns follow the
# cluster-aware ordering built above, so correlated groups appear as blocks.
imshow(M, interpolation='nearest', origin='lower')
xticks(range(len(names)), names, rotation=45)
yticks(range(len(names)), names, rotation=45)
colorbar()
title('Correlacion stocks del Merval')
Out[111]:
In [109]:
import networkx as nx

# Build an undirected graph with one weighted edge per positively-correlated
# ticker pair (weight = correlation rounded to 2 decimals for readability).
edges = []
for i, row in corrs.iterrows():
    # BUGFIX: skip self-pairs — every ticker correlates 1.0 with itself, and
    # the resulting self-loops carried no information and cluttered the
    # exported graph.
    if row.t1 == row.t2:
        continue
    if row['corr'] > 0:
        edges.append((row.t1, row.t2, round(float(row['corr']), 2)))
g = nx.Graph()
# The graph is undirected, so the (a, b) / (b, a) duplicates from the full
# cross product collapse into a single edge.
g.add_weighted_edges_from(edges)
In [110]:
# Export the correlation graph for external visualization (e.g. Gephi).
nx.write_graphml(g, 'g.graphml')
In [95]:
# Leftover debugging: shows the working directory (where g.graphml was written).
!pwd
In [114]:
figure()
# Spot check: FRAN close price since 2016 (ticker hardcoded for inspection).
tickers_data['FRAN']['data']['2016-01-01':].Close.plot()
Out[114]: