In [4]:
BASEDIR = "/home/dmoore/python/sparsegp/experiments/"
import os,sys
import numpy as np

if '/home/dmoore/python/' not in sys.path:
    sys.path.append('/home/dmoore/python/')

from sparsegp.experiments.code.datasets import mkdir_p

In [9]:
import pandas as pd
def standardizeX(dfX, ignore_cols=()):
    # Rescale each column of dfX in place to roughly a 0-100 range,
    # skipping any columns named in ignore_cols.
    for col in dfX.columns:
        if col in ignore_cols: continue

        mx = dfX[col].max()
        mn = dfX[col].min()
        print col, mx, mn
        dfX[col] = (dfX[col] - mn) / ((mx - mn) / 100.0)
        print col, mx, mn, (mx - mn)
        
def save_test_train_split(Xframe, yframe, dataset, seed=0, n_train=None):
    # Randomly split the data into train/test sets (default 70% train),
    # standardize the targets using training-set statistics only, and write
    # the four CSVs under BASEDIR/datasets/<dataset>/.
    n = Xframe.shape[0]

    n_train = int(n*.7) if n_train is None else n_train

    np.random.seed(seed)
    p = np.random.permutation(n)

    train_idx = p[:n_train]
    test_idx = p[n_train:]

    X_train = Xframe.iloc[train_idx,:]
    y_train = yframe.iloc[train_idx,:]

    X_test = Xframe.iloc[test_idx,:]
    y_test = yframe.iloc[test_idx,:]

    # standardize targets by the training mean and std
    mu = np.mean(y_train)
    sigma = np.std(y_train)
    y_train = (y_train - mu)/sigma
    y_test = (y_test - mu)/sigma

    savedir = os.path.join(BASEDIR, 'datasets', dataset)
    mkdir_p(savedir)

    X_train.to_csv(os.path.join(savedir, 'X_train.txt'), index=False)
    y_train.to_csv(os.path.join(savedir, 'y_train.txt'), index=False)

    X_test.to_csv(os.path.join(savedir, 'X_test.txt'), index=False)
    y_test.to_csv(os.path.join(savedir, 'y_test.txt'), index=False)

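(Note: standardizeX rescales each non-ignored column to roughly a 0-100 range in place, and save_test_train_split writes each split as four plain CSVs. For reference, a minimal sketch of a loader an experiment script might use to read a split back in; load_split is just an illustrative name, not part of the existing codebase:)

def load_split(dataset):
    # illustrative helper: read back the CSVs written by save_test_train_split
    d = os.path.join(BASEDIR, 'datasets', dataset)
    return (pd.read_csv(os.path.join(d, 'X_train.txt')),
            pd.read_csv(os.path.join(d, 'y_train.txt')),
            pd.read_csv(os.path.join(d, 'X_test.txt')),
            pd.read_csv(os.path.join(d, 'y_test.txt')))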
In [61]:
h1 = pd.read_csv(os.path.join(BASEDIR, 'raw_data/cal_housing_data.csv'))
h2 = pd.read_csv(os.path.join(BASEDIR, 'raw_data/cal_housing_dbf.csv'))
h2 = h2[np.isfinite(h2['BLCKGR'])]
h2 = h2[h2['H061A001'] > 0]
print h1
print h2


       lon    lat  age  income  units  population  households   rooms   value
0  -122.23  37.88   41     880    129         322         126  8.3252  452600
1  -122.22  37.86   21    7099   1106        2401        1138  8.3014  358500
2  -122.24  37.85   52    1467    190         496         177  7.2574  352100
3  -122.25  37.85   52    1274    235         558         219  5.6431  341300
4  -122.25  37.85   52    1627    280         565         259  3.8462  342200
5  -122.25  37.85   52     919    213         413         193  4.0368  269700
6  -122.25  37.84   52    2535    489        1094         514  3.6591  299200
7  -122.25  37.84   52    3104    687        1157         647  3.1200  241400
8  -122.26  37.84   42    2555    665        1206         595  2.0804  226700
9  -122.25  37.84   52    3549    707        1551         714  3.6912  261100
10 -122.26  37.85   52    2202    434         910         402  3.2031  281500
11 -122.26  37.85   52    3503    752        1504         734  3.2705  241800
12 -122.26  37.85   52    2491    474        1098         468  3.0750  213500
13 -122.26  37.84   52     696    191         345         174  2.6736  191300
14 -122.26  37.85   52    2643    626        1212         620  1.9167  159200
15 -122.26  37.85   50    1120    283         697         264  2.1250  140000
16 -122.27  37.85   52    1966    347         793         331  2.7750  152500
17 -122.27  37.85   52    1228    293         648         303  2.1202  155500
18 -122.26  37.84   50    2239    455         990         419  1.9911  158700
19 -122.27  37.84   52    1503    298         690         275  2.6033  162900
20 -122.27  37.85   40     751    184         409         166  1.3578  147500
21 -122.27  37.85   42    1639    367         929         366  1.7135  159800
22 -122.27  37.84   52    2436    541        1015         478  1.7250  113900
23 -122.27  37.84   52    1688    337         853         325  2.1806   99700
24 -122.27  37.84   52    2224    437        1006         422  2.6000  132600
25 -122.28  37.85   41     535    123         317         119  2.4038  107500
26 -122.28  37.85   49    1130    244         607         239  2.4597   93800
27 -122.28  37.85   52    1898    421        1102         397  1.8080  105500
28 -122.28  37.84   50    2082    492        1131         473  1.6424  108900
29 -122.28  37.84   52     729    160         395         155  1.6875  132000
30 -122.28  37.84   49    1916    447         863         378  1.9274  122300
31 -122.28  37.84   52    2153    481        1168         441  1.9615  115200
32 -122.27  37.84   48    1922    409        1026         335  1.7969  110400
33 -122.27  37.83   49    1655    366         754         329  1.3750  104900
34 -122.27  37.83   51    2665    574        1258         536  2.7303  109700
35 -122.27  37.83   49    1215    282         570         264  1.4861   97200
36 -122.27  37.83   48    1798    432         987         374  1.0972  104500
37 -122.28  37.83   52    1511    390         901         403  1.4103  103900
38 -122.26  37.83   52    1470    330         689         309  3.4800  191400
39 -122.26  37.83   52    2432    715        1377         696  2.5898  176000
40 -122.26  37.83   52    1665    419         946         395  2.0978  155400
41 -122.26  37.83   51     936    311         517         249  1.2852  150000
42 -122.26  37.84   49     713    202         462         189  1.0250  118800
43 -122.26  37.84   52     950    202         467         198  3.9643  188800
44 -122.26  37.83   52    1443    311         660         292  3.0125  184400
45 -122.26  37.83   52    1656    420         718         382  2.6768  182300
46 -122.26  37.83   50    1125    322         616         304  2.0260  142500
47 -122.27  37.82   43    1007    312         558         253  1.7348  137500
48 -122.26  37.82   40     624    195         423         160  0.9506  187500
49 -122.27  37.82   40     946    375         700         352  1.7750  112500
50 -122.27  37.82   21     896    453         735         438  0.9218  171900
51 -122.27  37.82   43    1868    456        1061         407  1.5045   93800
52 -122.27  37.82   41    3221    853        1959         720  1.1108   97500
53 -122.27  37.82   52    1630    456        1162         400  1.2475  104200
54 -122.28  37.82   52    1170    235         701         233  1.6098   87500
55 -122.28  37.82   52     945    243         576         220  1.4113   83100
56 -122.28  37.82   52    1238    288         622         259  1.5057   87500
57 -122.28  37.82   52    1489    335         728         244  0.8172   85300
58 -122.28  37.82   52    1387    341        1074         304  1.2171   80300
59 -122.29  37.82    2     158     43          94          57  2.5625   60000
       ...    ...  ...     ...    ...         ...         ...     ...     ...

[20640 rows x 9 columns]
     FIPS     TRACT         BLOCKGR  SUMLEV  STATEFP  CNTY  COUSUBFP  PLACEFP  \
901  6001  60014001  060014001.00:1     150        6     1       NaN      NaN   
902  6001  60014001  060014001.00:2     150        6     1       NaN      NaN   
904  6001  60014002  060014002.00:1     150        6     1       NaN      NaN   
905  6001  60014002  060014002.00:2     150        6     1       NaN      NaN   
906  6001  60014002  060014002.00:3     150        6     1       NaN      NaN   
907  6001  60014002  060014002.00:4     150        6     1       NaN      NaN   
909  6001  60014003  060014003.00:1     150        6     1       NaN      NaN   
910  6001  60014003  060014003.00:2     150        6     1       NaN      NaN   
911  6001  60014003  060014003.00:3     150        6     1       NaN      NaN   
912  6001  60014003  060014003.00:4     150        6     1       NaN      NaN   
914  6001  60014004  060014004.00:1     150        6     1       NaN      NaN   
915  6001  60014004  060014004.00:2     150        6     1       NaN      NaN   
916  6001  60014004  060014004.00:3     150        6     1       NaN      NaN   
917  6001  60014004  060014004.00:4     150        6     1       NaN      NaN   
919  6001  60014005  060014005.00:1     150        6     1       NaN      NaN   
920  6001  60014005  060014005.00:2     150        6     1       NaN      NaN   
921  6001  60014005  060014005.00:3     150        6     1       NaN      NaN   
922  6001  60014005  060014005.00:4     150        6     1       NaN      NaN   
924  6001  60014006  060014006.00:1     150        6     1       NaN      NaN   
925  6001  60014006  060014006.00:2     150        6     1       NaN      NaN   
927  6001  60014007  060014007.00:1     150        6     1       NaN      NaN   
928  6001  60014007  060014007.00:2     150        6     1       NaN      NaN   
929  6001  60014007  060014007.00:3     150        6     1       NaN      NaN   
930  6001  60014007  060014007.00:4     150        6     1       NaN      NaN   
931  6001  60014007  060014007.00:5     150        6     1       NaN      NaN   
933  6001  60014008  060014008.00:1     150        6     1       NaN      NaN   
934  6001  60014008  060014008.00:2     150        6     1       NaN      NaN   
935  6001  60014008  060014008.00:3     150        6     1       NaN      NaN   
936  6001  60014008  060014008.00:4     150        6     1       NaN      NaN   
938  6001  60014009  060014009.00:1     150        6     1       NaN      NaN   
939  6001  60014009  060014009.00:2     150        6     1       NaN      NaN   
940  6001  60014009  060014009.00:3     150        6     1       NaN      NaN   
942  6001  60014010  060014010.00:1     150        6     1       NaN      NaN   
943  6001  60014010  060014010.00:2     150        6     1       NaN      NaN   
944  6001  60014010  060014010.00:3     150        6     1       NaN      NaN   
945  6001  60014010  060014010.00:4     150        6     1       NaN      NaN   
946  6001  60014010  060014010.00:5     150        6     1       NaN      NaN   
947  6001  60014010  060014010.00:6     150        6     1       NaN      NaN   
949  6001  60014011  060014011.00:1     150        6     1       NaN      NaN   
950  6001  60014011  060014011.00:2     150        6     1       NaN      NaN   
951  6001  60014011  060014011.00:3     150        6     1       NaN      NaN   
952  6001  60014011  060014011.00:4     150        6     1       NaN      NaN   
955  6001  60014012  060014012.00:1     150        6     1       NaN      NaN   
956  6001  60014012  060014012.00:2     150        6     1       NaN      NaN   
957  6001  60014012  060014012.00:3     150        6     1       NaN      NaN   
958  6001  60014012  060014012.00:4     150        6     1       NaN      NaN   
960  6001  60014013  060014013.00:1     150        6     1       NaN      NaN   
961  6001  60014013  060014013.00:2     150        6     1       NaN      NaN   
963  6001  60014013  060014013.00:4     150        6     1       NaN      NaN   
965  6001  60014014  060014014.00:1     150        6     1       NaN      NaN   
966  6001  60014014  060014014.00:2     150        6     1       NaN      NaN   
967  6001  60014014  060014014.00:3     150        6     1       NaN      NaN   
969  6001  60014015  060014015.00:1     150        6     1       NaN      NaN   
970  6001  60014015  060014015.00:2     150        6     1       NaN      NaN   
971  6001  60014015  060014015.00:3     150        6     1       NaN      NaN   
973  6001  60014016  060014016.00:1     150        6     1       NaN      NaN   
974  6001  60014016  060014016.00:2     150        6     1       NaN      NaN   
977  6001  60014017  060014017.00:1     150        6     1       NaN      NaN   
978  6001  60014017  060014017.00:2     150        6     1       NaN      NaN   
980  6001  60014017  060014017.00:4     150        6     1       NaN      NaN   
      ...       ...             ...     ...      ...   ...       ...      ...   

     TRACTBNA  BLCKGR  LOGRECNU  P0010001  P0050001  P0060001  P0060002  \
901      4001       1     46052       322       134       322         0   
902      4001       2     46053      2401      1110      2401         0   
904      4002       1     46054       496       188       496         0   
905      4002       2     46055       558       221       558         0   
906      4002       3     46056       565       240       565         0   
907      4002       4     46057       413       227       413         0   
909      4003       1     46058      1094       492      1094         0   
910      4003       2     46059      1157       578      1157         0   
911      4003       3     46060      1206       652      1206         0   
912      4003       4     46061      1551       740      1551         0   
914      4004       1     46062       910       415       910         0   
915      4004       2     46063      1504       757      1504         0   
916      4004       3     46064      1098       446      1098         0   
917      4004       4     46065       345       165       345         0   
919      4005       1     46066      1212       584      1212         0   
920      4005       2     46067       697       266       697         0   
921      4005       3     46068       793       330       793         0   
922      4005       4     46069       648       301       648         0   
924      4006       1     46070       990       477       990         0   
925      4006       2     46071       690       233       690         0   
927      4007       1     46072       409       195       409         0   
928      4007       2     46073       929       363       929         0   
929      4007       3     46074      1015       464      1015         0   
930      4007       4     46075       853       305       853         0   
931      4007       5     46076      1006       394      1006         0   
933      4008       1     46077       317       110       317         0   
934      4008       2     46078       607       186       607         0   
935      4008       3     46079      1102       391      1102         0   
936      4008       4     46080      1131       487      1131         0   
938      4009       1     46081       395       139       395         0   
939      4009       2     46082       863       362       863         0   
940      4009       3     46083      1168       448      1168         0   
942      4010       1     46084      1026       327      1026         0   
943      4010       2     46085       754       287       754         0   
944      4010       3     46086      1258       516      1258         0   
945      4010       4     46087       570       249       570         0   
946      4010       5     46088       987       385       987         0   
947      4010       6     46089       901       444       901         0   
949      4011       1     46090       689       294       689         0   
950      4011       2     46091      1377       697      1377         0   
951      4011       3     46092       946       344       946         0   
952      4011       4     46093       517       241       517         0   
955      4012       1     46095       467       184       467         0   
956      4012       2     46096       660       269       660         0   
957      4012       3     46097       718       416       718         0   
958      4012       4     46098       616       311       616         0   
960      4013       1     46099       558       251       558         0   
961      4013       2     46100       423       186       423         0   
963      4013       4     46102       735       469       735         0   
965      4014       1     46103      1061       460      1061         0   
966      4014       2     46104      1959       666      1959         0   
967      4014       3     46105      1162       379      1162         0   
969      4015       1     46106       701       223       701         0   
970      4015       2     46107       576       192       576         0   
971      4015       3     46108       622       259       622         0   
973      4016       1     46109       728       323       728         0   
974      4016       2     46110      1074       287      1074         0   
977      4017       1     46112       554       136       554         0   
978      4017       2     46113        86        13        86         0   
980      4017       4     46115       377       161       377         0   
          ...     ...       ...       ...       ...       ...       ...   

     P0060003  P0060004  P0100001  P0120001  P0120002      
901         0         0         0       298         0 ...  
902         0         0        78      2020       111 ...  
904         0         0         0       404        47 ...  
905         0         0        19       496        19 ...  
906         0         0        54       454         0 ...  
907         0         0        39       343        23 ...  
909         0         0        48       977        23 ...  
910         0         0        54       873        84 ...  
911         0         0       142       546       454 ...  
912         0         0       127      1201       148 ...  
914         0         0        42       722        70 ...  
915         0         0        60      1004       322 ...  
916         0         0        87       561       383 ...  
917         0         0        15       181        85 ...  
919         0         0        73       574       465 ...  
920         0         0        50       155       446 ...  
921         0         0        16       100       621 ...  
922         0         0        18       259       326 ...  
924         0         0        19       281       642 ...  
925         0         0        87       102       455 ...  
927         0         0        36        35       334 ...  
928         0         0        16       123       739 ...  
929         0         0        97       157       733 ...  
930         0         0        23        67       753 ...  
931         0         0         0       121       832 ...  
933         0         0         0        49       261 ...  
934         0         0         0        71       440 ...  
935         0         0        52        64       923 ...  
936         0         0        34       315       697 ...  
938         0         0        18        48       329 ...  
939         0         0        24       124       666 ...  
940         0         0        21       276       822 ...  
942         0         0         8        38       980 ...  
943         0         0        61       165       528 ...  
944         0         0       104       121       938 ...  
945         0         0        20        75       441 ...  
946         0         0        73        12       747 ...  
947         0         0        17        18       866 ...  
949         0         0        13       388       178 ...  
950         0         0       209       515       573 ...  
951         0         0        94       159       380 ...  
952         0         0        32       152       218 ...  
955         0         0        18       269       168 ...  
956         0         0        23       370       146 ...  
957         0         0        19       486       179 ...  
958         0         0        25       296       195 ...  
960         0         0        20       167       290 ...  
961         0         0        35       102       248 ...  
963         0         0        21       245       309 ...  
965         0         0         0        31       878 ...  
966         0         0       102       179      1606 ...  
967         0         0       125        66       839 ...  
969         0         0        76        42       583 ...  
970         0         0        29        51       496 ...  
971         0         0        23        56       504 ...  
973         0         0        30        22       654 ...  
974         0         0        54        99       668 ...  
977         0         0       192        68       254 ...  
978         0         0        45         0        41 ...  
980         0         0        13        10       354 ...  
          ...       ...       ...       ...       ...      

[20077 rows x 99 columns]

In [62]:
(h2[h2['H061A001'] == 14999])['P0050001']


Out[62]:
3788     114
4600      50
4682     117
5241     213
15615    505
18885    616
26229    331
Name: P0050001, dtype: float64

In [63]:
h1[h1['value'] == 14999]


Out[63]:
           lon    lat  age  income  units  population  households   rooms  value
2521   -122.74  39.71   16     255     73          85          38  1.6607  14999
2799   -117.02  36.40   19     619    239         490         164  2.1000  14999
9188   -117.86  34.24   52     803    267         628         225  4.1932  14999
19802  -123.17  40.31   36      98     28          18           8  0.5360  14999

4 rows × 9 columns


In [64]:
print (h2['H061A001'] > 0).sum()


20077

In [65]:
precip = pd.read_csv(os.path.join(BASEDIR, 'raw_data/NCAR_pinfill/ppt.complete.Y101'),sep=r'\s*', names=['sta', 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'missing'], index_col=0)
precip_full = precip[precip['missing'] == 0]   # keep rows where the 'missing' flag is zero
print precip_full


        jan  feb  mar  apr  may  jun  jul  aug  sep  oct  nov  dec  missing
sta                                                                        
010008   94  174  126   78   58  113  151  109   61  198  131   99        0
010063  118  139  131  132  109  119   75   67   81  110  162  126        0
010140  123   88  125  116   50  105   55  151   71  236  164  202        0
010184   86   99  155  177  102   73   51  150   49  202  211  103        0
010252  196  203  143  124  168   96  229  184   34  424  162  150        0
010369  107  267  151   79   36  139   55  119   82  301  185  139        0
010390  107   81  109  146   81   60  102  106   89  203  138   77        0
010402  169  216  180  160  170   55  233  272   36  254  199  168        0
010440  128  117  105  113   24   28  131   85  103  223  109  167        0
010505  122   98  131  121   48  160   69   60  101  232  181  107        0
010583  158  113  188  193  353  108  179  339   70  272  217  184        0
010655   96  122   92  122   46   69   76   94  193  177  139   80        0
010764  106  212  101  105  163   84   22   53  105  326  202   82        0
010831   98  111   92  112   80   78   46   38  140  302  177  125        0
010957  139  137  110  108   33   97  115  114  104  264  137   84        0
011080  186  224  211  190  211   73  199  258   50  507  188  136        0
011084  165  193  171  224  237  116  167  292   23  380  194   81        0
011099  148  115  102   68   81  101  132   93  145  200  205   88        0
011225   75  176   90  120   90   93   27   72  113  167  210   46        0
011301  127   84  150  130   94   30   93  143   26  251  151  201        0
011324   95  155  136   59   23   54   75  150   53  245   86  132        0
011377  122  160  145  150   51  144   77   44   85  209  140  135        0
011525  114  197  139  186   94   49   46  123   81  242  203  123        0
011566  216  126  189  179   88   64  120  195   42  166  119  193        0
011620  103  344  127  162   39   58   44  139   89  250  170  118        0
011690  157  122  109  127   71   56  115  120   34  210  148  208        0
011803  176   47  299  191  136  155  232  105   74  211  237  114        0
012245  113  149  187  128  103   54   61   88   60  248  146  125        0
012632   95  125  110  135   64   95   96  115   55  183  156  114        0
012758  183  188  165  131  130   73  126  300   62  303  151  141        0
012813  138   72  255  170  153  115  220  244   77  308  295  123        0
012883  120  132  203  150   45  131   67   95   81  164  149  110        0
013160   79  129  165  145   59   45   54  109  116  146  162  121        0
013271  135  130  147  135   77  132   99  165    0  316  129   72        0
013511   79  155  130  188  122   65   54  131   64  254   95  118        0
013519  109  109  134  114   52   48  139  103   15  327  115  159        0
013620  155  205  138  238  131  142  158   83   95  142  171  129        0
013761  111  216  133  114   74   97  134  131   43  154  113   61        0
013775  100  213  142  112   77   72   57   57  149  255  193   97        0
013816  143  132  132   90   69  161   51  207   36  308  111  108        0
013842   85  167  116  128   21   86   46   71   90  278  152  116        0
013899  165  166  136  242  132   93  131   66   63  129  123   96        0
013930   91  132  132   97   69   90  105  111   51  170  159   78        0
014064   89  178   84  114   86  117  105   35  117  173  131  105        0
014209   83  118  102  123   71   66   99   66  141  291  168   89        0
014226  140  150  147  161   64  130   63   33  106  204  195  121        0
014431  165  181  160  145  105   95  108  172   39  320  189   71        0
014619   92  100   96  117   89  112   12  114   94  239  178  119        0
014798   64  123  140  173  109   50   59   49   38  146  106   82        0
015112  125  230  139  146   69   68   62  160   83  242  143  128        0
015121  127  107  150  137   33   69   35  135  161  224  120  183        0
015420  123  110  143   70   89   15   92  166   35  220  148  192        0
015478  180   76  217  189  307   84  118  193   29  221  204  225        0
015635  117  178  113  162  184   83   66   50  113  200  124  125        0
015658   90  178  171  133   44   48  198   99   81  216  154   91        0
016121  133  153  121  104   90  106   91  111  163  313  144   90        0
016129   94  134  122   63   32   52   52   99   51  391  118  113        0
016246  105  129  151  135   58   89   41  111  113  314  172  105        0
016334  127  101  178  186  170   58  130  103   45  208  144  147        0
016478   98  130  119  103   59   85   56   55  108  271  161   95        0
        ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...      ...

[5776 rows x 13 columns]

In [66]:
precip_sta = pd.read_csv(os.path.join(BASEDIR, 'raw_data/NCAR_pinfill/METAinfo'),sep=r'\s*', names=['sta', 'lon', 'lat', 'elev'], index_col=0, header=1)
precip_joined = precip_full.join(precip_sta, how='inner')
months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
precip_joined['annual'] = precip_joined[months].sum(axis=1)
print precip_joined


        jan  feb  mar  apr  may  jun  jul  aug  sep  oct  nov  dec  missing  \
sta                                                                           
010063  118  139  131  132  109  119   75   67   81  110  162  126        0   
010140  123   88  125  116   50  105   55  151   71  236  164  202        0   
010184   86   99  155  177  102   73   51  150   49  202  211  103        0   
010252  196  203  143  124  168   96  229  184   34  424  162  150        0   
010369  107  267  151   79   36  139   55  119   82  301  185  139        0   
010390  107   81  109  146   81   60  102  106   89  203  138   77        0   
010402  169  216  180  160  170   55  233  272   36  254  199  168        0   
010440  128  117  105  113   24   28  131   85  103  223  109  167        0   
010505  122   98  131  121   48  160   69   60  101  232  181  107        0   
010583  158  113  188  193  353  108  179  339   70  272  217  184        0   
010655   96  122   92  122   46   69   76   94  193  177  139   80        0   
010764  106  212  101  105  163   84   22   53  105  326  202   82        0   
010831   98  111   92  112   80   78   46   38  140  302  177  125        0   
010957  139  137  110  108   33   97  115  114  104  264  137   84        0   
011080  186  224  211  190  211   73  199  258   50  507  188  136        0   
011084  165  193  171  224  237  116  167  292   23  380  194   81        0   
011099  148  115  102   68   81  101  132   93  145  200  205   88        0   
011225   75  176   90  120   90   93   27   72  113  167  210   46        0   
011301  127   84  150  130   94   30   93  143   26  251  151  201        0   
011324   95  155  136   59   23   54   75  150   53  245   86  132        0   
011377  122  160  145  150   51  144   77   44   85  209  140  135        0   
011525  114  197  139  186   94   49   46  123   81  242  203  123        0   
011566  216  126  189  179   88   64  120  195   42  166  119  193        0   
011620  103  344  127  162   39   58   44  139   89  250  170  118        0   
011690  157  122  109  127   71   56  115  120   34  210  148  208        0   
011803  176   47  299  191  136  155  232  105   74  211  237  114        0   
012245  113  149  187  128  103   54   61   88   60  248  146  125        0   
012632   95  125  110  135   64   95   96  115   55  183  156  114        0   
012758  183  188  165  131  130   73  126  300   62  303  151  141        0   
012813  138   72  255  170  153  115  220  244   77  308  295  123        0   
012883  120  132  203  150   45  131   67   95   81  164  149  110        0   
013160   79  129  165  145   59   45   54  109  116  146  162  121        0   
013271  135  130  147  135   77  132   99  165    0  316  129   72        0   
013511   79  155  130  188  122   65   54  131   64  254   95  118        0   
013519  109  109  134  114   52   48  139  103   15  327  115  159        0   
013620  155  205  138  238  131  142  158   83   95  142  171  129        0   
013761  111  216  133  114   74   97  134  131   43  154  113   61        0   
013775  100  213  142  112   77   72   57   57  149  255  193   97        0   
013816  143  132  132   90   69  161   51  207   36  308  111  108        0   
013842   85  167  116  128   21   86   46   71   90  278  152  116        0   
013899  165  166  136  242  132   93  131   66   63  129  123   96        0   
013930   91  132  132   97   69   90  105  111   51  170  159   78        0   
014064   89  178   84  114   86  117  105   35  117  173  131  105        0   
014209   83  118  102  123   71   66   99   66  141  291  168   89        0   
014226  140  150  147  161   64  130   63   33  106  204  195  121        0   
014431  165  181  160  145  105   95  108  172   39  320  189   71        0   
014619   92  100   96  117   89  112   12  114   94  239  178  119        0   
014798   64  123  140  173  109   50   59   49   38  146  106   82        0   
015112  125  230  139  146   69   68   62  160   83  242  143  128        0   
015121  127  107  150  137   33   69   35  135  161  224  120  183        0   
015420  123  110  143   70   89   15   92  166   35  220  148  192        0   
015478  180   76  217  189  307   84  118  193   29  221  204  225        0   
015635  117  178  113  162  184   83   66   50  113  200  124  125        0   
015658   90  178  171  133   44   48  198   99   81  216  154   91        0   
016121  133  153  121  104   90  106   91  111  163  313  144   90        0   
016129   94  134  122   63   32   52   52   99   51  391  118  113        0   
016246  105  129  151  135   58   89   41  111  113  314  172  105        0   
016334  127  101  178  186  170   58  130  103   45  208  144  147        0   
016478   98  130  119  103   59   85   56   55  108  271  161   95        0   
016508  140  132  132  126   88  108   60   90   95  214  127  168        0   
        ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...      ...   

          lon    lat  elev  annual  
sta                                 
010063 -87.18  34.22   244    1369  
010140 -87.42  32.23    55    1486  
010184 -88.28  33.23    50    1458  
010252 -86.50  31.32   107    2113  
010369 -85.83  33.27   308    1660  
010390 -86.98  34.80   217    1299  
010402 -87.48  31.17    92    2112  
010440 -86.68  32.47    61    1333  
010505 -87.35  33.45    85    1430  
010583 -87.78  30.88    82    2374  
010655 -86.88  34.70   183    1306  
010764 -87.00  33.40   137    1561  
010831 -86.75  33.57   192    1399  
010957 -86.17  34.22   329    1442  
011080 -87.03  31.12    49    2433  
011084 -87.05  31.07    26    2243  
011099 -85.73  34.93   201    1478  
011225 -86.72  33.45   140    1279  
011301 -87.32  32.03    72    1480  
011324 -85.68  32.83   207    1263  
011377 -87.53  33.90   131    1462  
011525 -87.25  32.90   139    1597  
011566 -88.25  31.47    61    1697  
011620 -86.33  33.28   127    1643  
011690 -87.52  31.53    55    1477  
011803 -88.22  30.38     3    1977  
012245 -87.83  32.52    31    1462  
012632 -87.80  33.25    77    1343  
012758 -86.97  31.43   104    1953  
012813 -87.88  30.55     7    2170  
012883 -87.83  33.68   110    1447  
013160 -88.13  32.83    38    1330  
013271 -86.73  31.65    85    1537  
013511 -87.58  32.70    67    1455  
013519 -86.63  31.83   134    1424  
013620 -87.62  34.23   290    1787  
013761 -85.33  31.35   113    1381  
013775 -85.60  33.65   259    1524  
013816 -86.32  31.95   181    1548  
013842 -85.40  33.53   360    1356  
013899 -87.93  34.33   259    1542  
013930 -87.45  33.25    59    1285  
014064 -86.58  34.70   185    1334  
014209 -85.75  33.82   217    1417  
014226 -87.27  33.90   162    1514  
014431 -86.18  31.23    82    1750  
014619 -86.55  33.55   194    1362  
014798 -88.18  32.58    49    1139  
015112 -87.27  32.68    52    1595  
015121 -87.22  32.47    61    1481  
015420 -87.42  32.10    35    1403  
015478 -88.23  30.68    66    2043  
015635 -87.30  34.48   197    1515  
015658 -87.63  33.00    36    1503  
016121 -86.48  33.95   262    1619  
016129 -85.38  32.63   226    1321  
016246 -86.67  33.77   244    1523  
016334 -88.02  32.23    26    1597  
016478 -86.70  33.68   185    1340  
016508 -86.93  32.65    73    1480  
          ...    ...   ...     ...  

[5775 rows x 17 columns]

In [67]:
precip_X = precip_joined[['lon', 'lat', 'elev']]
standardizeX(precip_X, ignore_cols=['lon', 'lat'])
mkdir_p(os.path.join(BASEDIR, 'datasets/precip_all/'))
precip_annual = precip_joined[['annual']]
precip_annual = (precip_annual - precip_annual.mean())/precip_annual.std()

save_test_train_split(precip_X, precip_annual, "precip_all", 0, n_train=5000)


for month in ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']:
    mkdir_p(os.path.join(BASEDIR, 'datasets/precip_%s/' % month))
    precip_month = precip_joined[[month]]
    precip_month = (precip_month - precip_month.mean())/precip_month.std()
    save_test_train_split(precip_X, precip_month, "precip_%s" % month, 0, n_train=5000)


elev 3537 -59
elev 3537 -59 3596

In [68]:
precip_X.head()
#precip_X['lon'].astype('float')


Out[68]:
          lon    lat       elev
sta
010063 -87.18  34.22   8.426029
010140 -87.42  32.23   3.170189
010184 -88.28  33.23   3.031146
010252 -86.50  31.32   4.616240
010369 -85.83  33.27  10.205784

5 rows × 3 columns


In [69]:
tco = pd.read_csv(os.path.join(BASEDIR, 'raw_data/tco.csv'), names=['lon', 'lat', 'elev', 'tco'])
print tco


        lon   lat  elev  tco
0  -179.375 -89.5     0    0
1  -178.125 -89.5     0    0
2  -176.875 -89.5     0    0
3  -175.625 -89.5     0    0
4  -174.375 -89.5     0    0
5  -173.125 -89.5     0    0
6  -171.875 -89.5     0    0
7  -170.625 -89.5     0    0
8  -169.375 -89.5     0    0
9  -168.125 -89.5     0    0
10 -166.875 -89.5     0    0
11 -165.625 -89.5     0    0
12 -164.375 -89.5     0    0
13 -163.125 -89.5     0    0
14 -161.875 -89.5     0    0
15 -160.625 -89.5     0    0
16 -159.375 -89.5     0    0
17 -158.125 -89.5     0    0
18 -156.875 -89.5     0    0
19 -155.625 -89.5     0    0
20 -154.375 -89.5     0    0
21 -153.125 -89.5     0    0
22 -151.875 -89.5     0    0
23 -150.625 -89.5     0    0
24 -149.375 -89.5     0    0
25 -148.125 -89.5     0    0
26 -146.875 -89.5     0    0
27 -145.625 -89.5     0    0
28 -144.375 -89.5     0    0
29 -143.125 -89.5     0    0
30 -141.875 -89.5     0    0
31 -140.625 -89.5     0    0
32 -139.375 -89.5     0    0
33 -138.125 -89.5     0    0
34 -136.875 -89.5     0    0
35 -135.625 -89.5     0    0
36 -134.375 -89.5     0    0
37 -133.125 -89.5     0    0
38 -131.875 -89.5     0    0
39 -130.625 -89.5     0    0
40 -129.375 -89.5     0    0
41 -128.125 -89.5     0    0
42 -126.875 -89.5     0    0
43 -125.625 -89.5     0    0
44 -124.375 -89.5     0    0
45 -123.125 -89.5     0    0
46 -121.875 -89.5     0    0
47 -120.625 -89.5     0    0
48 -119.375 -89.5     0    0
49 -118.125 -89.5     0    0
50 -116.875 -89.5     0    0
51 -115.625 -89.5     0    0
52 -114.375 -89.5     0    0
53 -113.125 -89.5     0    0
54 -111.875 -89.5     0    0
55 -110.625 -89.5     0    0
56 -109.375 -89.5     0    0
57 -108.125 -89.5     0    0
58 -106.875 -89.5     0    0
59 -105.625 -89.5     0    0
        ...   ...   ...  ...

[51840 rows x 4 columns]

In [70]:
tco_X = tco[['lon', 'lat']]
tco_y = tco[['tco']]
tco_y = (tco_y - tco_y.mean()) / tco_y.std()

save_test_train_split(tco_X, tco_y, "tco", n_train=15000)

In [71]:
tco_y.hist()


---------------------------------------------------------------------------
TclError                                  Traceback (most recent call last)
<ipython-input-71-f907b5411566> in <module>()
----> 1 tco_y.hist()

/home/dmoore/.virtualenvs/sparsegp/local/lib/python2.7/site-packages/pandas/tools/plotting.pyc in hist_frame(data, column, by, grid, xlabelsize, xrot, ylabelsize, yrot, ax, sharex, sharey, figsize, layout, **kwds)
   2067                 cols += 1
   2068     fig, axes = _subplots(nrows=rows, ncols=cols, ax=ax, squeeze=False,
-> 2069                           sharex=sharex, sharey=sharey, figsize=figsize)
   2070 
   2071     for i, col in enumerate(com._try_sort(data.columns)):

/home/dmoore/.virtualenvs/sparsegp/local/lib/python2.7/site-packages/pandas/tools/plotting.pyc in _subplots(nrows, ncols, sharex, sharey, squeeze, subplot_kw, ax, secondary_y, data, **fig_kw)
   2431 
   2432     if ax is None:
-> 2433         fig = plt.figure(**fig_kw)
   2434     else:
   2435         fig = ax.get_figure()

/home/dmoore/.virtualenvs/sparsegp/local/lib/python2.7/site-packages/matplotlib/pyplot.pyc in figure(num, figsize, dpi, facecolor, edgecolor, frameon, FigureClass, **kwargs)
    421                                         frameon=frameon,
    422                                         FigureClass=FigureClass,
--> 423                                         **kwargs)
    424 
    425         if figLabel:

/home/dmoore/.virtualenvs/sparsegp/local/lib/python2.7/site-packages/matplotlib/backends/backend_tkagg.pyc in new_figure_manager(num, *args, **kwargs)
     77     FigureClass = kwargs.pop('FigureClass', Figure)
     78     figure = FigureClass(*args, **kwargs)
---> 79     return new_figure_manager_given_figure(num, figure)
     80 
     81 

/home/dmoore/.virtualenvs/sparsegp/local/lib/python2.7/site-packages/matplotlib/backends/backend_tkagg.pyc in new_figure_manager_given_figure(num, figure)
     85     """
     86     _focus = windowing.FocusManager()
---> 87     window = Tk.Tk()
     88     window.withdraw()
     89 

/usr/lib/python2.7/lib-tk/Tkinter.pyc in __init__(self, screenName, baseName, className, useTk, sync, use)
   1686                 baseName = baseName + ext
   1687         interactive = 0
-> 1688         self.tk = _tkinter.create(screenName, baseName, className, interactive, wantobjects, useTk, sync, use)
   1689         if useTk:
   1690             self._loadtk()

TclError: no display name and no $DISPLAY environment variable
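(The TclError above comes from matplotlib's default Tk backend requiring a display. A minimal workaround sketch, assuming a headless session: select the non-interactive Agg backend before pyplot is imported, and write the figure to a file instead of showing it. The filename is just illustrative.)

import matplotlib
matplotlib.use('Agg')            # must run before pyplot is first imported
import matplotlib.pyplot as plt
tco_y.hist()                     # draws onto a new figure
plt.savefig('tco_hist.png')      # save to disk rather than displaying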

In [72]:
h1 = pd.read_csv(os.path.join(BASEDIR, 'raw_data/cal_housing_data.csv'))
h1_norm = (h1 - h1.mean()) / h1.std()
print h1_norm


         lon       lat       age    income     units  population  households  \
0  -1.327803  1.052523  0.982119 -0.804800 -0.970683   -0.974405   -0.977009   
1  -1.322812  1.043159 -0.607004  2.045841  1.348617    0.861418    1.669921   
2  -1.332794  1.038478  1.856137 -0.535733 -0.825875   -0.820757   -0.843616   
3  -1.337785  1.038478  1.856137 -0.624199 -0.719049   -0.766010   -0.733764   
4  -1.337785  1.038478  1.856137 -0.462393 -0.612224   -0.759828   -0.629142   
5  -1.337785  1.038478  1.856137 -0.786923 -0.771275   -0.894049   -0.801768   
6  -1.337785  1.033796  1.856137 -0.046187 -0.116079   -0.292704    0.037822   
7  -1.337785  1.033796  1.856137  0.214629  0.353953   -0.237073    0.385689   
8  -1.342777  1.033796  1.061575 -0.037020  0.301727   -0.193805    0.249681   
9  -1.337785  1.033796  1.856137  0.418606  0.401431    0.110841    0.560930   
10 -1.342777  1.038478  1.856137 -0.198827 -0.246643   -0.455182   -0.255119   
11 -1.342777  1.038478  1.856137  0.397521  0.508257    0.069339    0.613241   
12 -1.342777  1.038478  1.856137 -0.066356 -0.151687   -0.289172   -0.082493   
13 -1.342777  1.033796  1.856137 -0.889141 -0.823501   -0.954095   -0.851463   
14 -1.342777  1.038478  1.856137  0.003317  0.209145   -0.188507    0.315069   
15 -1.342777  1.038478  1.697224 -0.694789 -0.605102   -0.643268   -0.616064   
16 -1.347768  1.038478  1.856137 -0.307003 -0.453173   -0.558497   -0.440823   
17 -1.347768  1.038478  1.856137 -0.645285 -0.581363   -0.686537   -0.514058   
18 -1.342777  1.033796  1.697224 -0.181867 -0.196792   -0.384540   -0.210655   
19 -1.347768  1.033796  1.856137 -0.519231 -0.569494   -0.649449   -0.587293   
20 -1.347768  1.038478  0.902663 -0.863930 -0.840118   -0.897581   -0.872387   
21 -1.347768  1.038478  1.061575 -0.456892 -0.405695   -0.438405   -0.349279   
22 -1.347768  1.033796  1.856137 -0.091567  0.007364   -0.362464   -0.056338   
23 -1.347768  1.033796  1.856137 -0.434432 -0.476912   -0.505515   -0.456516   
24 -1.347768  1.033796  1.856137 -0.188742 -0.239522   -0.370411   -0.202808   
25 -1.352759  1.038478  0.982119 -0.962939 -0.984926   -0.978820   -0.995318   
26 -1.352759  1.038478  1.617768 -0.690206 -0.697684   -0.722741   -0.681453   
27 -1.352759  1.038478  1.856137 -0.338173 -0.277504   -0.285640   -0.268197   
28 -1.352759  1.033796  1.697224 -0.253832 -0.108957   -0.260032   -0.069416   
29 -1.352759  1.033796  1.856137 -0.874014 -0.897092   -0.909944   -0.901158   
30 -1.352759  1.033796  1.617768 -0.329922 -0.215783   -0.496685   -0.317892   
31 -1.352759  1.033796  1.856137 -0.221287 -0.135070   -0.227360   -0.153113   
32 -1.347768  1.033796  1.538312 -0.327172 -0.305991   -0.352751   -0.430361   
33 -1.347768  1.029114  1.617768 -0.449558 -0.408069   -0.592935   -0.446054   
34 -1.347768  1.029114  1.776680  0.013402  0.085702   -0.147887    0.095364   
35 -1.347768  1.029114  1.617768 -0.651244 -0.607476   -0.755413   -0.616064   
36 -1.347768  1.029114  1.538312 -0.384010 -0.251391   -0.387189   -0.328354   
37 -1.352759  1.029114  1.856137 -0.515564 -0.351095   -0.463130   -0.252504   
38 -1.342777  1.029114  1.856137 -0.534358 -0.493529   -0.650332   -0.498365   
39 -1.342777  1.029114  1.856137 -0.093400  0.420422   -0.042807    0.513850   
40 -1.342777  1.029114  1.856137 -0.444974 -0.282252   -0.423393   -0.273428   
41 -1.342777  1.029114  1.776680 -0.779131 -0.538633   -0.802214   -0.655297   
42 -1.342777  1.033796  1.617768 -0.881348 -0.797388   -0.850781   -0.812230   
43 -1.342777  1.033796  1.856137 -0.772713 -0.797388   -0.846365   -0.788690   
44 -1.342777  1.029114  1.856137 -0.546734 -0.538633   -0.675940   -0.542829   
45 -1.342777  1.029114  1.856137 -0.449100 -0.279878   -0.624724   -0.307430   
46 -1.342777  1.029114  1.697224 -0.692497 -0.512520   -0.714794   -0.511442   
47 -1.347768  1.024432  1.141031 -0.746586 -0.536259   -0.766010   -0.644835   
48 -1.342777  1.024432  0.902663 -0.922144 -0.814005   -0.885219   -0.888081   
49 -1.347768  1.024432  0.902663 -0.774547 -0.386703   -0.640619   -0.385896   
50 -1.347768  1.024432 -0.607004 -0.797466 -0.201539   -0.609713   -0.160960   
51 -1.347768  1.024432  1.141031 -0.351924 -0.194418   -0.321845   -0.242042   
52 -1.347768  1.024432  0.982119  0.268259  0.748020    0.471118    0.576623   
53 -1.347768  1.024432  1.856137 -0.461018 -0.194418   -0.232658   -0.260350   
54 -1.352759  1.024432  1.856137 -0.671871 -0.719049   -0.639736   -0.697146   
55 -1.352759  1.024432  1.856137 -0.775005 -0.700058   -0.750115   -0.731148   
56 -1.352759  1.024432  1.856137 -0.640701 -0.593233   -0.709495   -0.629142   
57 -1.352759  1.024432  1.856137 -0.525649 -0.481659   -0.615894   -0.668375   
58 -1.352759  1.024432  1.856137 -0.572403 -0.467416   -0.310365   -0.511442   
59 -1.357750  1.024432 -2.116671 -1.135747 -1.174838   -1.175736   -1.157482   
         ...       ...       ...       ...       ...         ...         ...   

       rooms     value  
0   2.344709  2.129580  
1   2.332181  1.314124  
2   1.782656  1.258663  
3   0.932945  1.165072  
4  -0.012881  1.172871  
5   0.087445  0.544598  
6  -0.111364  0.800240  
7  -0.395127  0.299354  
8  -0.942336  0.171967  
9  -0.094467  0.470071  
10 -0.351386  0.646855  
11 -0.315909  0.302821  
12 -0.418814  0.057577  
13 -0.630096 -0.134804  
14 -1.028502 -0.412978  
15 -0.918860 -0.579362  
16 -0.576723 -0.471039  
17 -0.921387 -0.445041  
18 -0.989341 -0.417311  
19 -0.667100 -0.380914  
20 -1.322688 -0.514368  
21 -1.135460 -0.407778  
22 -1.129407 -0.805540  
23 -0.889595 -0.928595  
24 -0.668837 -0.643489  
25 -0.772110 -0.861002  
26 -0.742686 -0.979724  
27 -1.085718 -0.878333  
28 -1.172884 -0.848869  
29 -1.149145 -0.648689  
30 -1.022870 -0.732747  
31 -1.004921 -0.794275  
32 -1.091561 -0.835871  
33 -1.313634 -0.883533  
34 -0.600252 -0.841937  
35 -1.255155 -0.950260  
36 -1.459859 -0.886999  
37 -1.295054 -0.892199  
38 -0.205636 -0.133938  
39 -0.674206 -0.267392  
40 -0.933178 -0.445908  
41 -1.360902 -0.492703  
42 -1.497862 -0.763078  
43  0.049283 -0.156469  
44 -0.451711 -0.194599  
45 -0.628412 -0.212797  
46 -0.970971 -0.557697  
47 -1.124248 -0.601026  
48 -1.537024 -0.167734  
49 -1.103088 -0.817672  
50 -1.552183 -0.302922  
51 -1.245470 -0.979724  
52 -1.452700 -0.947660  
53 -1.380746 -0.889599  
54 -1.190044 -1.034318  
55 -1.294527 -1.072448  
56 -1.244838 -1.034318  
57 -1.607241 -1.053383  
58 -1.396747 -1.096713  
59 -0.688576 -1.272629  
         ...       ...  

[20640 rows x 9 columns]

In [86]:
housing_age_X = h1[['income', 'value']]
standardizeX(housing_age_X, ignore_cols=['lon', 'lat'])
housing_age_y = h1_norm[['age']]
save_test_train_split(housing_age_X, housing_age_y, "housing_age", n_train=18000)


housing_inc_X = h1[['age', 'value']]
standardizeX(housing_inc_X, ignore_cols=['lon', 'lat'])
housing_inc_y = h1_norm[['income']]
save_test_train_split(housing_inc_X, housing_inc_y, "housing_inc", n_train=18000)


housing_val_X = h1[['age', 'income']]
standardizeX(housing_val_X, ignore_cols=['lon', 'lat'])
housing_val_y = h1_norm[['value']]
save_test_train_split(housing_val_X, housing_val_y, "housing_val", n_train=18000)


income 39320.0 2.0
income 39320.0 2.0 39318.0
value 500001.0 14999.0
value 500001.0 14999.0 485002.0
age 52.0 1.0
age 52.0 1.0 51.0
value 500001.0 14999.0
value 500001.0 14999.0 485002.0
age 52.0 1.0
age 52.0 1.0 51.0
income 39320.0 2.0
income 39320.0 2.0 39318.0

In [87]:
# pool all California snow-station files into a single frame
bigsta = None

for stafile in os.listdir(os.path.join(BASEDIR, 'raw_data/california_snow/')):
    if not stafile.endswith('.csv'): continue

    sta = pd.read_csv(os.path.join(BASEDIR, 'raw_data/california_snow/' + stafile))
    sta['daynum'] = sta.index      # row index within the station's file, used as a day counter
    sta = sta.dropna()
    sta['snow'] = sta['snow'].astype('float')
    if bigsta is None:
        bigsta = sta
    else:
        bigsta = pd.concat([bigsta, sta])
#print bigsta
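(Since each per-station frame keeps its own 0-based index, the pooled bigsta has repeated index labels, visible in the head() printed in a later cell. This is harmless here because the split uses iloc, but an optional one-liner would give the pooled frame fresh labels:)

bigsta = bigsta.reset_index(drop=True)   # optional: unique 0..N-1 row labels for the pooled frame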

In [91]:
bigsta['snow'] = (bigsta['snow'] - bigsta['snow'].mean()) / bigsta['snow'].std()

In [92]:
bigsta_sorted = bigsta.sort(columns=['daynum', 'lon', 'lat', 'elev'])

snow_X = bigsta_sorted[['daynum', 'lon', 'lat', 'elev']]
standardizeX(snow_X, ignore_cols=['daynum', 'lon', 'lat'])


snow_y = bigsta_sorted[['snow']]

save_test_train_split(snow_X, snow_y, "snow", n_train=15000)


elev 11400.0 5100.0
elev 11400.0 5100.0 6300.0

In [77]:
print bigsta_sorted.head()


         date      lon     lat   elev      snow  daynum
0  2011-11-01  118.177 -36.483  10150 -0.804404       0
0  2011-11-01  118.268 -36.200   8300 -0.770013       0
0  2011-11-01  118.288 -36.367   8900 -1.025080       0
0  2011-11-01  118.293 -36.127   7650 -0.820167       0
0  2011-11-01  118.345 -36.563  10700 -0.838795       0

[5 rows x 6 columns]

In [13]:
import scipy.io
sdir = os.path.join(BASEDIR, 'raw_data')
# byteswap().newbyteorder(): convert the loaded MAT arrays to native byte order
sarcos_train = scipy.io.loadmat(os.path.join(sdir, 'sarcos_inv.mat'))['sarcos_inv'].byteswap().newbyteorder()
sarcos_test = scipy.io.loadmat(os.path.join(sdir, 'sarcos_inv_test.mat'))['sarcos_inv_test'].byteswap().newbyteorder()

print sarcos_train


[[ 0.019478 -0.134218  0.027439 ..., -0.424812 -0.907553  8.090739]
 [ 0.017279 -0.137077  0.026999 ..., -0.228739 -1.235817  7.762475]
 [ 0.016336 -0.140878  0.02725  ...,  0.244491 -1.70088   7.289678]
 ..., 
 [-0.551577 -0.445141  0.07741  ..., -1.274436 -2.375086  1.892345]
 [-0.541838 -0.436124  0.0743   ..., -1.577874 -2.317157  2.490944]
 [-0.530622 -0.426197  0.069838 ..., -0.788937 -2.259228  2.857827]]

In [14]:
sarcos_train_X = sarcos_train[:, :21]   # columns 0-20: input features
sarcos_train_y = sarcos_train[:, 21]    # column 21: regression target
sarcos_test_X = sarcos_test[:, :21]
sarcos_test_y = sarcos_test[:, 21]

sarcos_train_X = np.array(sarcos_train_X, copy=True, dtype=float, order="C")[:16384,:]
sarcos_train_y = np.array(sarcos_train_y, copy=True, dtype=float, order="C")[:16384]
sarcos_test_X = np.array(sarcos_test_X, copy=True, dtype=float, order="C")
sarcos_test_y = np.array(sarcos_test_y, copy=True, dtype=float, order="C")

train_X_mean = np.reshape(np.mean(sarcos_train_X, axis=0), (1, -1))
train_X_std = np.reshape(np.std(sarcos_train_X, axis=0), (1, -1))
train_y_mean = np.mean(sarcos_train_y)
train_y_std = np.std(sarcos_train_y)
    
# scale inputs so each feature has standard deviation 50 (comparable to the
# ~0-100 scale used by standardizeX); standardize targets with train statistics
sarcos_train_X = (sarcos_train_X - train_X_mean)/(train_X_std / 50.0)
sarcos_test_X = (sarcos_test_X - train_X_mean)/(train_X_std / 50.0)
sarcos_train_y = (sarcos_train_y - train_y_mean)/train_y_std
sarcos_test_y = (sarcos_test_y - train_y_mean)/train_y_std



out_dir = os.path.join(BASEDIR, 'datasets/sarcos/')
mkdir_p(out_dir)
trainXframe = pd.DataFrame(sarcos_train_X)
trainXframe.to_csv(os.path.join(out_dir, 'X_train.txt'), index=False)
trainyframe = pd.DataFrame(sarcos_train_y)
trainyframe.to_csv(os.path.join(out_dir, 'y_train.txt'), index=False)

testXframe = pd.DataFrame(sarcos_test_X)
testXframe.to_csv(os.path.join(out_dir, 'X_test.txt'), index=False)
testyframe = pd.DataFrame(sarcos_test_y)
testyframe.to_csv(os.path.join(out_dir, 'y_test.txt'), index=False)
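(A quick optional sanity check, assuming the files above were just written: the SARCOS train split should come back as 16384 rows by 21 input columns. chkX/chky are illustrative names only.)

chkX = pd.read_csv(os.path.join(out_dir, 'X_train.txt'))
chky = pd.read_csv(os.path.join(out_dir, 'y_train.txt'))
print chkX.shape, chky.shape   # expect (16384, 21) and (16384, 1)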

In [80]:
# wiggle preprocessing


ycols = ['amp_%02d' % freq for freq in range(60)]
X = pd.read_csv(os.path.join(BASEDIR, 'raw_data/wiggle_X_good.txt'), names=['lon', 'lat', 'depth'], sep=r'\s*')
Y = pd.read_csv(os.path.join(BASEDIR, 'raw_data/wiggle_Y_good.txt'), names=ycols, sep=r'\s*')

In [81]:
n = Y.shape[0]
for i in range(n):
    # normalize each row (its 60 amplitude values) to unit L2 norm
    Y.ix[i,:] = Y.ix[i,:] / np.linalg.norm(Y.ix[i,:], 2)
    
Y = (Y - Y.mean())/Y.std()
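(The row loop above could equivalently be written as one vectorized expression; a hedged sketch, with Y_unit as an illustrative name only:)

Y_unit = Y.div(np.sqrt((Y ** 2).sum(axis=1)), axis=0)   # divide each row by its L2 norm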

In [82]:
X['lon'] = X['lon'] % 360 - 180

In [83]:
XY = X.merge(Y, left_index=True, right_index=True)
XY=XY.dropna()
XY = XY.sort(columns=['lon', 'lat', 'depth'])

In [84]:
XX = XY[['lon', 'lat', 'depth']]
standardizeX(XX, ignore_cols=['lon', 'lat'])
YY = XY[ycols]


depth 700.0 0.0
depth 700.0 0.0 700.0

In [85]:
save_test_train_split(XX, YY[['amp_05']], "wiggle_5", n_train=10000)
save_test_train_split(XX, YY[['amp_15']], "wiggle_15", n_train=10000)
save_test_train_split(XX, YY[['amp_20']], "wiggle_20", n_train=10000)
save_test_train_split(XX, YY[['amp_45']], "wiggle_45", n_train=10000)
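(Equivalently, the four frequency bands above could be written as a loop; a small sketch using the same column and dataset naming:)

for band in [5, 15, 20, 45]:
    save_test_train_split(XX, YY[['amp_%02d' % band]], "wiggle_%d" % band, n_train=10000)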

In [111]:
X = pd.read_csv(os.path.join(BASEDIR, 'raw_data/X_FITZ.txt'), names=['lon', 'lat', 'depth', 'dist', 'azi', 'mb'], sep=r'\s*')
y = pd.read_csv(os.path.join(BASEDIR, 'raw_data/y_FITZ.txt'), names=['amp_transfer'], sep=r'\s*')

y = (y - y.mean())/y.std()
X = X[['lon', 'lat', 'depth']]
save_test_train_split(X, y, "seismic_fitz", n_train=3500)

In [108]:
X = pd.read_csv(os.path.join(BASEDIR, 'raw_data/X_AS12.txt'), names=['lon', 'lat', 'depth', 'dist', 'azi', 'mb'], sep=r'\s*')
y = pd.read_csv(os.path.join(BASEDIR, 'raw_data/y_AS12.txt'), names=['amp_transfer'], sep=r'\s*')


y = (y - y.mean())/y.std()
X = X[['lon', 'lat', 'depth']]
save_test_train_split(X, y, "seismic_as12", n_train=20000)X = pd.read_csv(os.path.join(BASEDIR, 'raw_data/X_AS12.txt'), names=['lon', 'lat', 'depth', 'dist', 'azi', 'mb'], sep=r'\s*')
y = pd.read_csv(os.path.join(BASEDIR, 'raw_data/y_AS12.txt'), names=['amp_transfer'], sep=r'\s*')


y = (y - y.mean())/y.std()
X = X[['lon', 'lat', 'depth']]
save_test_train_split(X, y, "seismic_as12", n_train=20000)


(25877, 3) (25877, 1) 25877 20000
(20000, 3)        amp_transfer
6756      -0.764122
8870       0.032346
25545      1.684493
25141     -0.565302
25134      0.214456
9608      -0.088577
6038      -0.322334
9135      -0.839262
21437      0.848224
21726     -0.737662
15384     -0.122362
3101       0.448063
2606       0.484243
5244      -0.013923
3833       0.105997
18001      1.089783
9285       0.879798
24452      0.364199
1844      -0.393160
8330      -2.051111
18811     -0.340341
1603      -0.721587
17636      2.164483
25009      0.270061
4682       0.533632
10944      0.108581
17915     -1.086216
14535      0.497035
9619      -0.482938
4436      -0.445580
6441       1.133402
6895      -0.882243
25168      2.202363
8953      -1.045123
14708      2.913464
11734      1.074340
21465     -0.164889
8771       0.030962
13207     -1.768228
18522      0.225052
6863      -1.196633
21416     -0.146108
13526     -1.330761
20080     -0.171786
7647       0.940515
2508      -0.707555
4889      -1.199541
3075       1.107323
9995      -1.203096
791        1.180000
1026      -0.901203
15693      0.871947
18998     -0.342705
2459      -0.719490
23189     -1.201105
21847      0.827051
18237      0.494443
21826      0.542124
8509       0.350043
2201       1.172106
                ...

[20000 rows x 1 columns]
(5877, 3)
(5877, 1)

In [10]:
X = pd.read_csv(os.path.join(BASEDIR, 'raw_data/tt_ASAR_X.txt'), names=['lon', 'lat', 'depth'], sep=r'\s*')
y = pd.read_csv(os.path.join(BASEDIR, 'raw_data/tt_ASAR_y.txt'), names=['tt_residual'], sep=r'\s*')

X = X[['lon', 'lat', 'depth']].iloc[:20000,:]
y = y.iloc[:20000]
save_test_train_split(X, y, "seismic_tt_ASAR", n_train=16000)

In [11]:
X = pd.read_csv(os.path.join(BASEDIR, 'raw_data/tt_FITZ_X.txt'), names=['lon', 'lat', 'depth'], sep=r'\s*')
y = pd.read_csv(os.path.join(BASEDIR, 'raw_data/tt_FITZ_y.txt'), names=['tt_residual'], sep=r'\s*')

X = X[['lon', 'lat', 'depth']].iloc[:20000,:]
y = y.iloc[:20000]
save_test_train_split(X, y, "seismic_tt_FITZ", n_train=16000)

In [15]:
X = pd.read_csv(os.path.join(BASEDIR, 'raw_data/tt_ASAR_X.txt'), names=['lon', 'lat', 'depth'], sep=r'\s*')
y = pd.read_csv(os.path.join(BASEDIR, 'raw_data/tt_ASAR_y.txt'), names=['tt_residual'], sep=r'\s*')

save_test_train_split(X, y, "seismic_tt_ASAR_50", n_train=45000)

In [ ]: