In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = DataFrame({'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
               'data': np.random.randn(8),
               'weights': np.random.rand(8)})
df


Out[2]:
category data weights
0 a 1.016068 0.273315
1 a -1.611363 0.969514
2 a -2.169703 0.301117
3 a -1.501790 0.822700
4 b -1.184955 0.779356
5 b -1.291867 0.312252
6 b 0.913494 0.686205
7 b -0.385359 0.344992

In [3]:
grouped = df.groupby('category')
get_wavg = lambda g: np.average(g['data'], weights=g['weights'])
grouped.apply(get_wavg)


Out[3]:
category
a   -1.340881
b   -0.392401
dtype: float64

In [4]:
np.average?

In [7]:
close_px = pd.read_csv('ch09/stock_px.csv', parse_dates=True, index_col=0)
close_px[-4:]


Out[7]:
AAPL MSFT XOM SPX
2011-10-11 400.29 27.00 76.27 1195.54
2011-10-12 402.19 26.96 77.16 1207.25
2011-10-13 408.43 27.18 76.37 1203.66
2011-10-14 422.00 27.27 78.11 1224.58

In [8]:
rets = close_px.pct_change().dropna()
spx_corr = lambda x: x.corrwith(x['SPX'])
by_year = rets.groupby(lambda x: x.year)
by_year.apply(spx_corr)


Out[8]:
AAPL MSFT XOM SPX
2003 0.541124 0.745174 0.661265 1.0
2004 0.374283 0.588531 0.557742 1.0
2005 0.467540 0.562374 0.631010 1.0
2006 0.428267 0.406126 0.518514 1.0
2007 0.508118 0.658770 0.786264 1.0
2008 0.681434 0.804626 0.828303 1.0
2009 0.707103 0.654902 0.797921 1.0
2010 0.710105 0.730118 0.839057 1.0
2011 0.691931 0.800996 0.859975 1.0

In [13]:
# 苹果和微软的年度相关系数
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))


Out[13]:
2003    0.480868
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

In [12]:
DataFrame.corrwith?

In [17]:
import statsmodels.api as sm
def regress(data, yvar, xvars):
    Y = data[yvar]
    X = data[xvars]
    X['intercept'] = 1.
    # 最小二乘法(Ordinary Least Squares, OLS)
    result = sm.OLS(Y, X).fit()
    return result.params

In [18]:
by_year.apply(regress, 'AAPL', ['SPX'])


Out[18]:
SPX intercept
2003 1.195406 0.000710
2004 1.363463 0.004201
2005 1.766415 0.003246
2006 1.645496 0.000080
2007 1.198761 0.003438
2008 0.968016 -0.001110
2009 0.879103 0.002954
2010 1.052608 0.001261
2011 0.806605 0.001514

In [22]:
regress(rets.loc[:, :], 'AAPL', ['SPX'])


d:\Anaconda2\lib\site-packages\ipykernel\__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[22]:
SPX          1.025987
intercept    0.001896
dtype: float64

In [23]:
sm.OLS?

In [25]:
Y = [1,2,3,4,5,6,7,8]
X = [2,3,4,5,6,7,8,9]
X = sm.add_constant(X)
model = sm.OLS(Y,X)
results = model.fit()
results.params


Out[25]:
array([-1.,  1.])

In [ ]: