w4-practice-02--fund-index



In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np

In [2]:
# Load the daily closing prices from the bundled CSV into a DataFrame.
close_prices = pd.read_csv(filepath_or_buffer='resources/close_prices.csv')

In [3]:
# Preview the first ten rows to check the column layout.
close_prices.iloc[:10]


Out[3]:
date AXP BA CAT CSCO CVX DD DIS GE GS ... PFE PG T TRV UNH UTX V VZ WMT XOM
0 2013-09-23 76.440002 117.510002 85.029999 24.270000 125.519997 59.409999 64.750000 24.280001 165.250000 ... 28.799999 79.279999 34.220001 86.379997 71.820000 109.419998 196.240005 47.980000 76.419998 87.750000
1 2013-09-24 76.070000 119.000000 85.110001 24.139999 124.489998 59.319997 64.320000 24.320000 162.970001 ... 28.709999 78.620003 34.090000 85.870003 72.320000 110.000000 193.339996 47.270000 75.750000 87.360001
2 2013-09-25 75.989998 118.510002 84.500000 24.430000 124.070000 59.319997 64.449997 24.230000 162.309998 ... 28.490000 77.720001 34.049999 85.980003 71.980003 109.260002 191.559998 46.950001 74.650002 87.139999
3 2013-09-26 76.320000 119.379997 84.199997 23.770000 123.489998 59.509996 65.239998 24.250000 162.289993 ... 28.520000 78.050003 34.230000 85.830002 72.160004 109.660004 193.559998 47.669998 74.620003 87.070000
4 2013-09-27 75.889999 118.739998 83.800003 23.330000 122.639999 59.009995 65.190002 24.049999 159.850006 ... 28.879999 77.209999 33.980000 85.410004 71.989998 109.360001 193.050003 47.000000 74.360001 86.900002
5 2013-09-30 75.519997 117.500000 83.400002 23.430000 121.500000 58.560000 64.489998 23.889999 158.210007 ... 28.730000 75.589996 33.820000 84.769997 71.610001 107.820000 191.100006 46.669998 73.959999 86.040001
6 2013-10-01 75.930000 117.750000 83.760002 23.240000 121.320000 58.689999 64.830002 24.170000 159.000000 ... 28.889999 76.160004 34.060001 84.660004 72.580002 107.379997 193.220001 46.990002 73.589996 86.000000
7 2013-10-02 74.580002 117.839996 84.070000 23.320000 120.830002 58.989999 64.879997 24.330000 158.669998 ... 29.010000 75.930000 33.939999 84.480003 72.570000 104.980003 191.820007 46.790001 73.720001 86.080002
8 2013-10-03 74.019997 115.239998 83.970001 23.010000 118.250000 57.759995 64.019997 24.100000 156.850006 ... 28.770000 75.839996 33.639999 84.059998 72.519997 103.690002 188.649994 47.009998 73.160004 85.500000
9 2013-10-04 74.309998 117.199997 84.199997 23.020000 118.129997 58.649998 65.300003 24.049999 156.550003 ... 29.000000 76.019997 33.750000 84.680000 72.989998 104.269997 190.479996 47.099998 72.800003 86.320000

10 rows × 31 columns


In [4]:
# Fit a 10-component PCA on the columns from 'AXP' onward, then show the
# share of variance explained by each component.
pca = PCA(n_components=10).fit(close_prices.loc[:, 'AXP':].to_numpy())
pca.explained_variance_ratio_


Out[4]:
array([0.73897118, 0.11007169, 0.04995088, 0.0287492 , 0.02215448,
       0.01931577, 0.00674853, 0.00614091, 0.00320594, 0.00305611])

In [5]:
# Explained-variance ratios in descending order (ascending sort, then reversed).
np.sort(pca.explained_variance_ratio_)[::-1]


Out[5]:
array([0.73897118, 0.11007169, 0.04995088, 0.0287492 , 0.02215448,
       0.01931577, 0.00674853, 0.00614091, 0.00320594, 0.00305611])

In [6]:
# Smallest number of leading principal components whose cumulative explained
# variance reaches the target share. PCA reports the ratios in decreasing
# order (Out[5] is identical to Out[4]), so no re-sorting is needed.
variance_target = 0.9
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
target_reached = cumulative_variance >= variance_target
if not target_reached.any():
    # Without this guard the result would be a count larger than the number
    # of fitted components, which is meaningless.
    raise ValueError(
        "fitted components explain less than the requested share of variance"
    )
# argmax gives the index of the first True; +1 converts the index to a count.
count = int(np.argmax(target_reached)) + 1

print("Количество признаков: " + str(count))


Количество признаков: 4

In [7]:
# Project the prices onto the principal components and keep the first one.
# The model was fitted on a plain ndarray (`.values`), so pass the same kind
# of input here; handing `transform` a DataFrame instead makes recent
# scikit-learn versions warn about feature names missing at fit time.
comp1 = pd.DataFrame(pca.transform(close_prices.loc[:, 'AXP':].values))[0]

In [8]:
# Load the Dow Jones index series from the bundled CSV into a DataFrame.
djia = pd.read_csv(filepath_or_buffer='resources/djia_index.csv')

In [9]:
# Preview the first ten rows of the index data.
djia.iloc[:10]


Out[9]:
date ^DJI
0 2013-09-23 15401.379883
1 2013-09-24 15334.589844
2 2013-09-25 15273.259766
3 2013-09-26 15328.299805
4 2013-09-27 15258.240234
5 2013-09-30 15129.669922
6 2013-10-01 15191.700195
7 2013-10-02 15133.139648
8 2013-10-03 14996.480469
9 2013-10-04 15072.580078

In [13]:
# Correlation matrix between the first principal component and the index.
# Rows are paired by position, not by date.
dji = djia.loc[:, '^DJI']
np.corrcoef(comp1, y=dji)


Out[13]:
array([[1.        , 0.90965222],
       [0.90965222, 1.        ]])

In [23]:
# Find the ticker with the largest weight in the first principal component.
comp0_w = pd.Series(pca.components_[0])
# Position of the largest weight; the default integer index makes the label
# returned by idxmax equal to the position.
comp0_w_top = int(comp0_w.idxmax())
# Look the name up in the same column slice the PCA was fitted on, rather
# than offsetting into all columns by a hard-coded count of leading
# non-feature columns.
feature_columns = close_prices.loc[:, 'AXP':].columns
company = feature_columns[comp0_w_top]
print(company)


V