Rolling apply and mapping functions (preparing for ML)

https://youtu.be/uLqmM6ExPvo?list=PLQVvvaa0QuDc-3szzjeP6N6b0aDrrKyL-


In [31]:
import quandl;
import pandas as pd;

import pickle;

import matplotlib.pyplot as plt;
from matplotlib import style;
style.use("ggplot");

import numpy as np;

from statistics import mean;

In [32]:
api_key = open("quandlapikey.txt", "r").read();

def mortgage_30y_resampled():
    df = quandl.get("FMAC/MORTG", trim_start = "1975-01-01", authtoken = api_key);       
    df["Value"] = (df["Value"] - df["Value"][0]) / df["Value"][0] * 100.0;
    df.columns = ["M30"];
    return df.resample("M").mean();

def state_list():
    fiddy_states = pd.read_html("https://simple.wikipedia.org/wiki/List_of_U.S._states");
    return fiddy_states[0][0][1:];

def grap_initial_state_data_start_pct():
    states = state_list();
    main_df = pd.DataFrame();
    for ab in states:
        querry = "FMAC/HPI_" + ab;
        df = quandl.get(querry, authtoken = api_key);
        df.columns = [ab];        
        df[ab] = (df[ab] - df[ab][0]) / df[ab][0] * 100.0; # <-------
        if main_df.empty:
            main_df = df;
        else:
            main_df = main_df.join(df);

    pickle_out = open("./data/fiddy_states.pickle", "wb");
    pickle.dump(main_df, pickle_out);
    pickle_out.close();
    
def HPI_Benchmark():
    df = quandl.get("FMAC/HPI_USA", authtoken = api_key);
    df.columns = ["US"];        
    df["US"] = (df["US"] - df["US"][0]) / df["US"][0] * 100.0; # <-------
    return df;

In [33]:
def sp500_data():
    df = quandl.get("YAHOO/INDEX_GSPC", trim_start = "1975-01-01", authtoken = api_key);
    df["Adjusted Close"] = (df["Adjusted Close"] - df["Adjusted Close"][0]) / df["Adjusted Close"][0] * 100.0; # <-------
    df = df.resample("M").mean();
    df.rename(columns={"Adjusted Close":"sp500"}, inplace = True);
    df = df["sp500"];
    return df;

In [34]:
df = sp500_data();
print(df.head());


Date
1975-01-31     3.323491
1975-02-28    14.049322
1975-03-31    19.367785
1975-04-30    20.636734
1975-05-31    28.287322
Freq: M, Name: sp500, dtype: float64

In [35]:
def gdp_data():
    df = quandl.get("BCB/4385", trim_start = "1975-01-01", authtoken = api_key);
    df["Value"] = (df["Value"] - df["Value"][0]) / df["Value"][0] * 100.0; # <-------
    df = df.resample("M").mean();
    df.rename(columns={"Value":"GDP"}, inplace = True);
    df = df["GDP"];
    return df;

In [36]:
def us_unemployment():
    df = quandl.get("ECPI/JOB_G", trim_start = "1975-01-01", authtoken = api_key);
    df["Unemployment Rate"] = (df["Unemployment Rate"] - df["Unemployment Rate"][0]) / df["Unemployment Rate"][0] * 100.0; # <-------
    df = df.resample("1D").mean();
    df = df.resample("M").mean();
    return df;

In [37]:
sp500 = sp500_data();
US_GDP = gdp_data();
US_uneployment = us_unemployment();

m30 = mortgage_30y_resampled();
HPI_data = pd.read_pickle("./data/fiddy_states.pickle");
HPI_bench = HPI_Benchmark();

In [38]:
HPI = HPI_data.join([HPI_bench, m30, US_uneployment, US_GDP, sp500]);
print(HPI.head());
print(HPI.corr().head());
# we have nans!!


                  AL        AK        AZ        AR        CA        CO  \
Date                                                                     
1975-01-31  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
1975-02-28  0.626500  1.485775  1.688504  0.846192  0.356177  1.159639   
1975-03-31  1.358575  3.006473  3.261346  1.581956  1.575690  2.299449   
1975-04-30  2.254726  4.593530  4.475810  2.183669  3.573196  3.359028   
1975-05-31  3.107829  6.327600  5.139617  2.786248  5.241395  4.226895   

                  CT        DE         FL        GA    ...            VA  \
Date                                                   ...                 
1975-01-31  0.000000  0.000000   0.000000  0.000000    ...      0.000000   
1975-02-28  2.123926  0.142451   3.938796 -0.902841    ...      0.987288   
1975-03-31  3.719898  0.387918   9.798243 -1.282758    ...      1.707474   
1975-04-30  4.616778  0.891619  16.974819 -1.068371    ...      2.238392   
1975-05-31  4.901787  1.752086  17.891884 -0.676830    ...      2.684036   

                  WA         WV        WI        WY        US       M30  \
Date                                                                      
1975-01-31  0.000000   0.000000  0.000000  0.000000  0.000000  0.000000   
1975-02-28  0.397561   2.495069  1.404920  1.438502  0.639523 -3.393425   
1975-03-31  1.015723   5.093474  2.791371  2.962512  1.681678 -5.620361   
1975-04-30  1.757887   7.772685  4.034165  4.545011  3.047044 -6.468717   
1975-05-31  2.426439  10.469784  5.047628  6.080901  3.922540 -5.514316   

            Unemployment Rate  GDP      sp500  
Date                                           
1975-01-31           0.000000  NaN   3.323491  
1975-02-28           0.000000  NaN  14.049322  
1975-03-31           6.172840  NaN  19.367785  
1975-04-30           8.641975  NaN  20.636734  
1975-05-31          11.111111  NaN  28.287322  

[5 rows x 55 columns]
          AL        AK        AZ        AR        CA        CO        CT  \
AL  1.000000  0.946418  0.937592  0.995119  0.942310  0.965541  0.953146   
AK  0.946418  1.000000  0.910237  0.967983  0.921818  0.960382  0.884600   
AZ  0.937592  0.910237  1.000000  0.936454  0.976776  0.919486  0.917688   
AR  0.995119  0.967983  0.936454  1.000000  0.945774  0.976929  0.944746   
CA  0.942310  0.921818  0.976776  0.945774  1.000000  0.938870  0.942463   

          DE        FL        GA    ...           VA        WA        WV  \
AL  0.982824  0.929395  0.978346    ...     0.975277  0.985060  0.982097   
AK  0.938068  0.900713  0.899525    ...     0.962226  0.958805  0.972376   
AZ  0.948801  0.994380  0.949167    ...     0.958568  0.956881  0.919514   
AR  0.977921  0.928081  0.968410    ...     0.979944  0.984041  0.990395   
CA  0.967911  0.985869  0.951573    ...     0.979699  0.965064  0.936562   

          WI        WY        US       M30  Unemployment Rate       GDP  \
AL  0.991161  0.946695  0.983404 -0.796453          -0.313968  0.679765   
AK  0.930831  0.987373  0.950294 -0.734748          -0.077318  0.831059   
AZ  0.935717  0.909381  0.969580 -0.673961          -0.363260  0.461065   
AR  0.986244  0.964090  0.984300 -0.788178          -0.274777  0.738729   
CA  0.945139  0.919537  0.986693 -0.733805          -0.309076  0.513147   

       sp500  
AL  0.912874  
AK  0.894500  
AZ  0.857531  
AR  0.919558  
CA  0.873121  

[5 rows x 55 columns]

In [39]:
HPI.dropna(inplace = True);
print(HPI.head());
print(HPI.corr().head());


                   AL         AK          AZ         AR          CA  \
Date                                                                  
1990-01-31  97.377466  62.960664  125.925384  90.013079  407.639219   
1990-02-28  97.306825  64.745987  125.954481  90.292939  409.393746   
1990-03-31  97.640926  68.843816  125.677184  90.749096  412.358848   
1990-04-30  98.347335  75.545290  125.537118  91.190719  414.727667   
1990-05-31  99.135672  84.269489  125.820526  91.693828  415.785301   

                    CO          CT          DE          FL          GA  \
Date                                                                     
1990-01-31  128.607770  289.087700  202.214111  133.668588  116.794322   
1990-02-28  128.817810  286.729447  203.585164  133.302517  116.293285   
1990-03-31  129.016696  283.918663  203.903611  133.217207  115.513095   
1990-04-30  129.123827  280.649292  203.811564  133.568341  115.303254   
1990-05-31  129.785821  277.504604  203.813374  134.063840  115.679836   

               ...              VA          WA         WV          WI  \
Date           ...                                                      
1990-01-31     ...      179.224441  225.298844  68.476763  102.969030   
1990-02-28     ...      178.921132  233.971612  68.825487  104.289628   
1990-03-31     ...      179.104137  242.490216  69.416380  105.932415   
1990-04-30     ...      180.099895  250.784574  70.283645  107.582128   
1990-05-31     ...      181.135479  258.186412  71.468265  108.807348   

                   WY          US        M30  Unemployment Rate        GDP  \
Date                                                                         
1990-01-31  74.740767  198.780437   4.984093         -33.333333   0.000000   
1990-02-28  75.353989  199.407324   8.165429         -34.567901  -2.697505   
1990-03-31  76.287112  200.269331   8.907741         -35.802469  -2.065278   
1990-04-30  77.735727  201.307039   9.968187         -33.333333 -10.874318   
1990-05-31  79.764564  202.362724  11.134677         -33.333333   1.006296   

                 sp500  
Date                    
1990-01-31  384.083446  
1990-02-28  370.529148  
1990-03-31  381.937898  
1990-04-30  381.529238  
1990-05-31  398.718477  

[5 rows x 55 columns]
          AL        AK        AZ        AR        CA        CO        CT  \
AL  1.000000  0.972248  0.901313  0.994846  0.877794  0.961665  0.927413   
AK  0.972248  1.000000  0.839755  0.981834  0.855834  0.922324  0.930099   
AZ  0.901313  0.839755  1.000000  0.886004  0.958469  0.842439  0.912187   
AR  0.994846  0.981834  0.886004  1.000000  0.880304  0.969541  0.932475   
CA  0.877794  0.855834  0.958469  0.880304  1.000000  0.845729  0.968665   

          DE        FL        GA    ...           VA        WA        WV  \
AL  0.956129  0.886165  0.936673    ...     0.953386  0.986904  0.987465   
AK  0.962020  0.841722  0.851612    ...     0.963128  0.966984  0.992558   
AZ  0.921635  0.991186  0.934123    ...     0.927210  0.935524  0.850917   
AR  0.956638  0.879779  0.925560    ...     0.959097  0.977613  0.994293   
CA  0.949543  0.984781  0.911907    ...     0.958405  0.917135  0.848311   

          WI        WY        US       M30  Unemployment Rate       GDP  \
AL  0.984120  0.976964  0.959813 -0.797310           0.065500  0.655117   
AK  0.947880  0.995386  0.937370 -0.850842           0.259508  0.778482   
AZ  0.898546  0.835989  0.954788 -0.569351          -0.190969  0.392354   
AR  0.988203  0.981145  0.961420 -0.831055           0.104737  0.681297   
CA  0.895063  0.835664  0.975569 -0.620295          -0.027545  0.410782   

       sp500  
AL  0.806501  
AK  0.731637  
AZ  0.709411  
AR  0.797588  
CA  0.644640  

[5 rows x 55 columns]

In [40]:
HPI.to_pickle("./data/HPI.pickle");

in fact, this notebook starts here


In [41]:
housing_data = pd.read_pickle("./data/HPI.pickle");
housing_data = housing_data.pct_change();
print(housing_data.head());


                  AL        AK        AZ        AR        CA        CO  \
Date                                                                     
1990-01-31       NaN       NaN       NaN       NaN       NaN       NaN   
1990-02-28 -0.000725  0.028356  0.000231  0.003109  0.004304  0.001633   
1990-03-31  0.003433  0.063291 -0.002202  0.005052  0.007243  0.001544   
1990-04-30  0.007235  0.097343 -0.001114  0.004866  0.005745  0.000830   
1990-05-31  0.008016  0.115483  0.002258  0.005517  0.002550  0.005127   

                  CT        DE        FL        GA    ...           VA  \
Date                                                  ...                
1990-01-31       NaN       NaN       NaN       NaN    ...          NaN   
1990-02-28 -0.008158  0.006780 -0.002739 -0.004290    ...    -0.001692   
1990-03-31 -0.009803  0.001564 -0.000640 -0.006709    ...     0.001023   
1990-04-30 -0.011515 -0.000451  0.002636 -0.001817    ...     0.005560   
1990-05-31 -0.011205  0.000009  0.003710  0.003266    ...     0.005750   

                  WA        WV        WI        WY        US       M30  \
Date                                                                     
1990-01-31       NaN       NaN       NaN       NaN       NaN       NaN   
1990-02-28  0.038495  0.005093  0.012825  0.008205  0.003154  0.638298   
1990-03-31  0.036409  0.008585  0.015752  0.012383  0.004323  0.090909   
1990-04-30  0.034205  0.012494  0.015573  0.018989  0.005182  0.119048   
1990-05-31  0.029515  0.016855  0.011389  0.026099  0.005244  0.117021   

            Unemployment Rate       GDP     sp500  
Date                                               
1990-01-31                NaN       NaN       NaN  
1990-02-28           0.037037      -inf -0.035290  
1990-03-31           0.035714 -0.234375  0.030790  
1990-04-30          -0.068966  4.265306 -0.001070  
1990-05-31           0.000000 -1.092539  0.045054  

[5 rows x 55 columns]

In [42]:
housing_data.replace([np.inf, -np.inf], np.nan, inplace = True);
housing_data.dropna(inplace = True);
print(housing_data.head());


                  AL        AK        AZ        AR        CA        CO  \
Date                                                                     
1990-03-31  0.003433  0.063291 -0.002202  0.005052  0.007243  0.001544   
1990-04-30  0.007235  0.097343 -0.001114  0.004866  0.005745  0.000830   
1990-05-31  0.008016  0.115483  0.002258  0.005517  0.002550  0.005127   
1990-06-30  0.004616  0.103961  0.003969  0.006417  0.003595  0.007326   
1990-07-31  0.000083  0.069205  0.001866  0.006427  0.004748  0.003203   

                  CT        DE        FL        GA    ...           VA  \
Date                                                  ...                
1990-03-31 -0.009803  0.001564 -0.000640 -0.006709    ...     0.001023   
1990-04-30 -0.011515 -0.000451  0.002636 -0.001817    ...     0.005560   
1990-05-31 -0.011205  0.000009  0.003710  0.003266    ...     0.005750   
1990-06-30 -0.007019 -0.000411  0.003081  0.002858    ...     0.003002   
1990-07-31 -0.003224 -0.003613  0.002929  0.001878    ...     0.002215   

                  WA        WV        WI        WY        US       M30  \
Date                                                                     
1990-03-31  0.036409  0.008585  0.015752  0.012383  0.004323  0.090909   
1990-04-30  0.034205  0.012494  0.015573  0.018989  0.005182  0.119048   
1990-05-31  0.029515  0.016855  0.011389  0.026099  0.005244  0.117021   
1990-06-30  0.018036  0.017798  0.009686  0.027659  0.005103 -0.304762   
1990-07-31  0.008122  0.012048  0.009039  0.021413  0.003712 -0.164384   

            Unemployment Rate       GDP     sp500  
Date                                               
1990-03-31           0.035714 -0.234375  0.030790  
1990-04-30          -0.068966  4.265306 -0.001070  
1990-05-31           0.000000 -1.092539  0.045054  
1990-06-30           0.074074  3.115183  0.036200  
1990-07-31          -0.103448  0.441476 -0.001226  

[5 rows x 55 columns]

In [43]:
housing_data["US_HPI_future"] = housing_data["US"].shift(-1);
print(housing_data[["US_HPI_future", "US"]].head());


            US_HPI_future        US
Date                               
1990-03-31       0.005182  0.004323
1990-04-30       0.005244  0.005182
1990-05-31       0.005103  0.005244
1990-06-30       0.003712  0.005103
1990-07-31       0.000489  0.003712

In [44]:
def create_labels(cur_hpi, fut_hpi):
    if fut_hpi > cur_hpi:
        return 1;
    else:
        return 0;

housing_data["label"] = list(map(create_labels, housing_data["US"], housing_data["US_HPI_future"])); # wow
#pd.Series.map may be useful also
print(housing_data[["US_HPI_future", "US", "label"]].head());


            US_HPI_future        US  label
Date                                      
1990-03-31       0.005182  0.004323      1
1990-04-30       0.005244  0.005182      1
1990-05-31       0.005103  0.005244      0
1990-06-30       0.003712  0.005103      0
1990-07-31       0.000489  0.003712      0

In [45]:
def moving_average(values):
    return mean(values);

housing_data["ma_apply_example"] = housing_data["M30"].rolling(window = 10).apply(moving_average);
print(housing_data[["M30", "ma_apply_example"]]);


                 M30  ma_apply_example
Date                                  
1990-03-31  0.090909               NaN
1990-04-30  0.119048               NaN
1990-05-31  0.117021               NaN
1990-06-30 -0.304762               NaN
1990-07-31 -0.164384               NaN
1990-08-31  0.098361               NaN
1990-09-30  0.119403               NaN
1990-10-31  0.000000               NaN
1990-11-30 -0.226667               NaN
1990-12-31 -0.586207         -0.073728
1991-01-31 -0.125000         -0.095319
1991-02-28 -1.285714         -0.235795
1991-03-31 -2.166667         -0.464164
1991-04-30 -0.142857         -0.447973
1991-05-31 -0.333333         -0.464868
1991-06-30  3.750000         -0.099704
1991-07-31 -0.210526         -0.132697
1991-08-31 -2.266667         -0.359364
1991-09-30  1.210526         -0.215644
1991-10-31  0.357143         -0.121310
1991-11-30  0.263158         -0.082494
1991-12-31  0.291667          0.075244
1992-01-31  0.075269          0.299438
1992-02-29 -0.330000          0.280724
1992-03-31 -0.268657          0.287191
1992-04-30  0.183673         -0.069441
1992-05-31  0.310345         -0.017354
1992-06-30  0.210526          0.230365
1992-07-31  0.413043          0.150617
1992-08-31  0.115385          0.126441
...              ...               ...
2009-07-31  0.049875          0.025387
2009-08-31  0.007126          0.030820
2009-09-30  0.030660          0.030480
2009-10-31  0.025172          0.010243
2009-11-30  0.015625          0.005220
2009-12-31 -0.010989          0.005723
2010-01-31 -0.022222          0.000478
2010-02-28  0.009091         -0.002902
2010-03-31  0.004505         -0.001370
2010-04-30 -0.029148          0.007969
2010-05-31  0.048499          0.007832
2010-06-30  0.033040          0.010423
2010-07-31  0.038380          0.011195
2010-08-31  0.026694          0.011347
2010-09-30  0.016000          0.011385
2010-10-31  0.023622          0.014846
2010-11-30 -0.013462          0.015722
2010-12-31 -0.079922          0.006821
2011-01-31 -0.010593          0.005311
2011-02-28 -0.040685          0.004157
2011-03-31  0.024554          0.001763
2011-04-30  0.000000         -0.001541
2011-05-31  0.043573         -0.001022
2011-06-30  0.027140         -0.000977
2011-07-31 -0.008130         -0.003390
2011-08-31  0.057377         -0.000015
2011-09-30  0.031008          0.004432
2011-10-31  0.007519          0.013176
2011-11-30  0.014925          0.015728
2011-12-31  0.005515          0.020348

[262 rows x 2 columns]

In [ ]: