In [1]:
from pandas_datareader import data, wb

In [2]:
# In-memory cache of downloaded quotes, keyed by ticker symbol. The download
# loop skips any ticker that is already a key here, so re-running this cell
# empties the cache and forces every ticker to be fetched again.
tickers_data = {}

In [3]:
from tqdm import tqdm

# Symbols to download; the order only affects the progress bar.
tickers = ['APBR', 'PAMP', 'YPFD', 'GGAL', 'ERAR', 'CRES', 'COME', 'ALUA', 'FRAN', 'MIRG',
'BMA', 'TRAN', 'TS', 'JMIN', 'EDN', 'TGSU2', 'SAMI', 'AGRO', 'TECO2', 'PESA',
'CEPU', 'CTIO', 'CECO2', 'AUSO', 'PETR', 'CELU', 'TGNO4']


for ticker in tqdm(tickers):
    # Already fetched by an earlier run of this cell: do not hit the network again.
    if ticker in tickers_data:
        continue

    # Try each pandas_datareader entry point in order and keep the FIRST one
    # that answers. Without the `break` the second source was always queried
    # too, and silently replaced the first result whenever both succeeded.
    for method in ('get_data_google', 'get_data_yahoo'):
        try:
            frame = getattr(data, method)(ticker)
        except Exception:
            # Best effort: this source could not serve the ticker, try the next.
            continue

        tickers_data[ticker] = {
            'source': method,  # name of the entry point that produced `data`
            'data': frame
        }
        break


100%|██████████| 27/27 [01:40<00:00,  3.42s/it]

In [4]:
import pickle
# Persist the download cache so the quotes can be reloaded without refetching.
# Pickle protocol 2 is a binary format, so the file must be opened in binary
# mode ('wb'); text mode can corrupt the stream on platforms that translate
# newlines and is rejected outright by Python 3.
with open('tickers_data.pkl', 'wb') as pickle_file:
    pickle.dump(tickers_data, pickle_file, 2)

In [5]:
# Select the nbAgg (interactive, in-notebook) matplotlib backend.
%matplotlib nbagg

# Populate the interactive namespace from numpy and matplotlib. Later cells
# call figure(), plot(), imshow(), choice(), etc. unqualified and rely on this.
# The recorded output warns that this import clobbered the variable `f`.
%pylab


Using matplotlib backend: nbAgg
Populating the interactive namespace from numpy and matplotlib
/Library/Python/2.7/site-packages/IPython/core/magics/pylab.py:161: UserWarning: pylab import has clobbered these variables: ['f']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"

In [98]:
from statsmodels.tsa.filters.hp_filter import hpfilter

# One record per ticker that has quotes on or after 2017-01-01:
#   'return'     - relative change between the first and last close in the window
#   'close'      - second element of hpfilter's result (the trend component,
#                  per the statsmodels documentation), smoothing parameter 50
#   'close_orig' - the unsmoothed close series for the same window
#   'ticker'     - the symbol
returns = []
for symbol, entry in tickers_data.items():
    close = entry['data'].loc['2017-01-01':].Close
    if close.empty:
        # No quotes inside the window; there is nothing to measure.
        continue

    first_price, last_price = close.iloc[0], close.iloc[-1]
    smoothed = hpfilter(close, 50)[1]

    returns.append({
        'return': (last_price - first_price) / first_price,
        'close': smoothed,
        'close_orig': close,
        'ticker': symbol,
    })

In [99]:
# Sanity check: pick one ticker record at random and overlay its smoothed
# close on the raw close in a fresh figure titled with the symbol.
figure()
t = choice(returns)
title(t['ticker'])
smoothed_close, raw_close = t['close'], t['close_orig']
smoothed_close.plot()
raw_close.plot()


Out[99]:
<matplotlib.axes._subplots.AxesSubplot at 0x1146a9f90>

In [100]:
def corr(s1, s2, start):
    """Correlate the period-to-period changes of two series from `start` on.

    Each series is sliced as ``series[start:]``, divided by its own first
    remaining value, and differenced; the result is the ``Series.corr`` of the
    two differenced series (changes are compared, not levels).

    Parameters
    ----------
    s1, s2 : pandas.Series
        Series to compare; they must support ``[start:]`` slicing.
    start : slice bound
        Lower bound applied to both series (the caller in this notebook passes
        a date string).

    Returns
    -------
    float
        Correlation coefficient of the two normalised, differenced series.

    Raises
    ------
    IndexError
        If either series is empty after slicing (there is no first value).
    """
    def _normalised_changes(series):
        window = series[start:]
        return window.div(window.iloc[0]).diff()

    changes_1 = _normalised_changes(s1)
    changes_2 = _normalised_changes(s2)
    return changes_1.corr(changes_2)

# Correlation of smoothed closes for every ordered pair of tickers, including
# each ticker with itself and both (a, b) and (b, a).
# NOTE(review): the series stored in `returns` appear to be cut at 2017-01-01
# when they are built, so the '2016-01-01' start probably selects everything;
# confirm that is intended.
corrs = [
    {
        't1': t1['ticker'],
        't2': t2['ticker'],
        'corr': corr(t1['close'], t2['close'], '2016-01-01')
    }
    for t1 in returns
    for t2 in returns
]

In [101]:
import pandas as pd
# Turn the per-ticker records into a table. This rebinds `returns` from a list
# of dicts to a DataFrame, so cells that iterate the list must run before it.
returns = pd.DataFrame(returns)

# Tickers ranked from best to worst return over the window.
ranked = returns.sort_values(by='return', ascending=False)
ranked[['ticker', 'return']]


Out[101]:
ticker return
8 TGNO4 0.786207
18 PESA 0.568966
3 PAMP 0.543103
24 AUSO 0.537234
10 GGAL 0.482124
23 TGSU2 0.444270
0 YPFD 0.366300
7 BMA 0.265382
15 CECO2 0.263158
17 COME 0.254902
14 TECO2 0.249153
6 ERAR 0.189744
22 EDN 0.163116
21 CRES 0.161179
16 JMIN 0.153846
2 MIRG 0.091796
4 ALUA 0.082524
12 CTIO 0.040816
5 AGRO 0.032877
19 SAMI 0.014976
20 APBR -0.110138
1 TS -0.137315
13 FRAN -0.147922
9 CELU -0.167614
11 PETR -0.745136

In [102]:
# Pairwise correlations as a table, strongest first. This rebinds `corrs` from
# a list of dicts to a DataFrame.
corrs = pd.DataFrame(corrs).sort_values('corr', ascending=False)

# Every ticker's correlation with PAMP, shown next to that ticker's own return.
pamp_pairs = corrs[corrs.t1 == 'PAMP']
pamp_with_returns = pamp_pairs.merge(returns, left_on='t2', right_on='ticker')
pamp_with_returns[['t2', 'corr', 'return']]


Out[102]:
t2 corr return
0 PAMP 1.000000 0.543103
1 PESA 0.976001 0.568966
2 YPFD 0.743478 0.366300
3 TS 0.708021 -0.137315
4 EDN 0.487973 0.163116
5 PETR 0.441737 -0.745136
6 ALUA 0.440038 0.082524
7 GGAL 0.349228 0.482124
8 JMIN 0.311200 0.153846
9 APBR 0.296299 -0.110138
10 ERAR 0.258614 0.189744
11 CECO2 0.257261 0.263158
12 TGNO4 0.244543 0.786207
13 CELU 0.239365 -0.167614
14 AGRO 0.197500 0.032877
15 AUSO 0.185255 0.537234
16 TGSU2 0.180537 0.444270
17 BMA 0.074827 0.265382
18 CTIO -0.024119 0.040816
19 TECO2 -0.131469 0.249153
20 MIRG -0.165242 0.091796
21 FRAN -0.197521 -0.147922
22 COME -0.572795 0.254902
23 SAMI -0.738557 0.014976
24 CRES -0.840984 0.161179

In [104]:
from sklearn.cluster import k_means
# Elbow plot: last element of k_means' result (the inertia, per the
# scikit-learn documentation) for 2..9 clusters over the rows of M.
# k_means starts from random centroids, so the seed is pinned; unseeded, the
# curve changed on every execution of this cell.
figure()
cluster_counts = range(2, 10)
inertias = [k_means(M, k, random_state=0)[-1] for k in cluster_counts]
plot(cluster_counts, inertias, '-o')


Out[104]:
[<matplotlib.lines.Line2D at 0x114ffbc50>]

In [105]:
from sklearn.cluster import k_means
# Map each ticker in `names` to the label k_means gives its row of M
# (5 clusters). Element [1] of the k_means result is the per-row label array.
# The seed is pinned so the assignment, and everything ordered by it, is
# reproducible; unseeded, the clusters could differ on every run.
clusters = dict(zip(names, k_means(M, 5, random_state=0)[1]))

In [112]:
from collections import defaultdict

def _total_corr(name):
    """Sum of the 'corr' column over every row of `corrs` whose t1 is `name`."""
    return corrs['corr'][corrs.t1 == name].sum()


# Initial ordering: tickers sorted by their summed correlation, ascending.
names = sorted(corrs.t1.unique(), key=_total_corr)

# NOTE(review): this guard tests for the very name the branch defines, so on a
# fresh kernel the branch never runs unless `avg_cluster_distance` was created
# some other way. The body needs `clusters`; confirm whether the guard was
# meant to test for that instead.
if 'avg_cluster_distance' in globals():
    # Collect the summed correlation of every member of each cluster ...
    totals_by_cluster = defaultdict(list)
    for name, cl_id in clusters.items():
        totals_by_cluster[cl_id].append(_total_corr(name))
    # ... and reduce each cluster to the mean over its members.
    avg_cluster_distance = {cl_id: np.mean(v) for cl_id, v in totals_by_cluster.items()}

    # Stable re-sort: tickers of the same cluster become adjacent and keep
    # their summed-correlation order within the cluster.
    names = sorted(names, key=lambda x: avg_cluster_distance[clusters[x]])

# Build one (t1, t2) -> corr lookup instead of scanning the whole DataFrame for
# every matrix entry. drop_duplicates keeps the first row per pair, matching
# the previous "first matching row" behaviour. A pair absent from `corrs`
# raises KeyError here.
first_rows = corrs.drop_duplicates(subset=['t1', 't2'])
pair_corr = dict(zip(zip(first_rows.t1, first_rows.t2), first_rows['corr']))

# Square matrix in `names` order; the diagonal is forced to 0 rather than the
# self-correlation stored in `corrs`.
M = [
    [0 if name1 == name2 else pair_corr[(name1, name2)] for name2 in names]
    for name1 in names
]

In [111]:
# Heat map of the correlation matrix M, with both axes labelled by ticker in
# `names` order and a colour scale on the side.
figure()
imshow(M, interpolation='nearest', origin='lower')

tick_positions = range(len(names))
for set_ticks in (xticks, yticks):
    set_ticks(tick_positions, names, rotation=45)

colorbar()
title('Correlacion stocks del Merval')


Out[111]:
<matplotlib.text.Text at 0x116787f50>

In [109]:
import networkx as nx
# Weighted edge list: one (t1, t2, weight) triple for every row of `corrs`
# with a strictly positive correlation, the weight rounded to two decimals.
edges = [
    (pair.t1, pair.t2, round(float(pair.corr), 2))
    for pair in corrs.itertuples(index=False)
    if pair.corr > 0
]

# Undirected graph built from those weighted edges.
g = nx.Graph()
g.add_weighted_edges_from(edges)

In [110]:
# Write graph `g` in GraphML format to 'g.graphml'; the path is relative, so
# the file lands in the kernel's current working directory.
nx.write_graphml(g, 'g.graphml')

In [95]:
# Shell escape: print the kernel's current working directory, which is where
# the relative output paths used in this notebook resolve.
!pwd


/Users/przivic/prog

In [114]:
# Closing price of FRAN from 2016-01-01 onwards, drawn in its own figure.
figure()
fran_quotes = tickers_data['FRAN']['data']
fran_quotes['2016-01-01':].Close.plot()


Out[114]:
<matplotlib.axes._subplots.AxesSubplot at 0x115eaff90>