In [4]:
BASEDIR = "/home/dmoore/python/sparsegp/experiments/"

import os, sys
import numpy as np

# Make the sparsegp package importable without adding duplicate path entries.
if '/home/dmoore/python/' not in sys.path:
    sys.path.append('/home/dmoore/python/')

from sparsegp.experiments.code.datasets import mkdir_p
In [9]:
import pandas as pd
def standardizeX(dfX, ignore_cols=[]):
for col in dfX.columns:
if col in ignore_cols: continue
mx = dfX[col].max()
mn = dfX[col].min()
print col, mx, mn
dfX[col] = (dfX[col] - mn) / ((mx-mn)/100.0 )
print col, mx, mn, (mx-mn)
def save_test_train_split(Xframe, yframe, dataset, seed=0, n_train=None):
    """Split X/y into train/test sets, standardize y by the *training*
    statistics, and write the four pieces as CSV files under
    BASEDIR/datasets/<dataset>/.

    Xframe, yframe: DataFrames with matching row counts.
    dataset: name of the output directory under BASEDIR/datasets.
    seed: RNG seed so the split is deterministic.
    n_train: number of training rows; defaults to 70% of the data.
    """
    n = Xframe.shape[0]
    if n_train is None:
        n_train = int(n * 0.7)

    # Use a local RandomState instead of np.random.seed so we don't
    # clobber the global RNG; the permutation is identical to the one
    # np.random.seed(seed); np.random.permutation(n) would produce.
    rng = np.random.RandomState(seed)
    p = rng.permutation(n)
    train_idx = p[:n_train]
    test_idx = p[n_train:]

    X_train = Xframe.iloc[train_idx, :]
    y_train = yframe.iloc[train_idx, :]
    X_test = Xframe.iloc[test_idx, :]
    y_test = yframe.iloc[test_idx, :]

    # Standardize y using training-set statistics only (no test leakage).
    mu = np.mean(y_train)
    sigma = np.std(y_train)
    y_train = (y_train - mu) / sigma
    y_test = (y_test - mu) / sigma

    savedir = os.path.join(BASEDIR, 'datasets', dataset)
    mkdir_p(savedir)
    X_train.to_csv(os.path.join(savedir, 'X_train.txt'), index=False)
    y_train.to_csv(os.path.join(savedir, 'y_train.txt'), index=False)
    X_test.to_csv(os.path.join(savedir, 'X_test.txt'), index=False)
    y_test.to_csv(os.path.join(savedir, 'y_test.txt'), index=False)
In [61]:
h1 = pd.read_csv(os.path.join(BASEDIR, 'raw_data/cal_housing_data.csv'))
h2 = pd.read_csv(os.path.join(BASEDIR, 'raw_data/cal_housing_dbf.csv'))
# Keep only rows with a finite block-group id and a positive H061A001 value.
blckgr_ok = np.isfinite(h2['BLCKGR'])
h2 = h2[blckgr_ok]
h2 = h2[h2['H061A001'] > 0]
print(h1)
print(h2)
In [62]:
# Inspect the P0050001 column where H061A001 hits the 14999 cap.
h2.loc[h2['H061A001'] == 14999, 'P0050001']
Out[62]:
In [63]:
# Rows where the housing value equals the 14999 cap.
h1.loc[h1['value'] == 14999]
Out[63]:
In [63]:
In [64]:
# Fully parenthesized: the original `print (expr).sum()` is a Python 2
# print statement; under Python 3 it would parse as print(expr).sum().
print((h2['H061A001'] > 0).sum())
In [65]:
precip = pd.read_csv(os.path.join(BASEDIR, 'raw_data/NCAR_pinfill/ppt.complete.Y101'),
                     sep=r'\s*',  # NOTE(review): r'\s+' is the usual whitespace pattern -- confirm parsing
                     names=['sta', 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
                            'aug', 'sep', 'oct', 'nov', 'dec', 'missing'],
                     index_col=0)
# Keep stations with no missing months.  The original compared against the
# literal 000000000000, which is octal zero in Python 2 and a SyntaxError in
# Python 3; plain 0 is the same value.
precip_full = precip[precip['missing'] == 0]
print(precip_full)
In [66]:
precip_sta = pd.read_csv(os.path.join(BASEDIR, 'raw_data/NCAR_pinfill/METAinfo'),
                         sep=r'\s*', names=['sta', 'lon', 'lat', 'elev'],
                         index_col=0, header=1)
precip_joined = precip_full.join(precip_sta, how='inner')
# Annual total = sum of the twelve monthly columns (NaN propagates, exactly
# as the original chained additions did).
month_cols = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
              'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
precip_joined['annual'] = sum(precip_joined[m] for m in month_cols)
print(precip_joined)
In [66]:
In [67]:
# Explicit .copy(): standardizeX mutates its argument in place, and mutating
# a column selection of precip_joined risks SettingWithCopy ambiguity.
precip_X = precip_joined[['lon', 'lat', 'elev']].copy()
standardizeX(precip_X, ignore_cols=['lon', 'lat'])
mkdir_p(os.path.join(BASEDIR, 'datasets/precip_all/'))
# z-score the annual target.
precip_annual = precip_joined[['annual']]
precip_annual = (precip_annual - precip_annual.mean()) / precip_annual.std()
save_test_train_split(precip_X, precip_annual, "precip_all", 0, n_train=5000)
# One dataset per month, each with its own z-scored target.
for month in ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
              'jul', 'aug', 'sep', 'oct', 'nov', 'dec']:
    mkdir_p(os.path.join(BASEDIR, 'datasets/precip_%s/' % month))
    precip_month = precip_joined[[month]]
    precip_month = (precip_month - precip_month.mean()) / precip_month.std()
    save_test_train_split(precip_X, precip_month, "precip_%s" % month, 0, n_train=5000)
In [68]:
precip_X.head()  # quick look at the rescaled features
#precip_X['lon'].astype('float')
Out[68]:
In [69]:
# Total-column-ozone data: lon/lat/elev inputs plus the tco measurement.
tco = pd.read_csv(os.path.join(BASEDIR, 'raw_data/tco.csv'),
                  names=['lon', 'lat', 'elev', 'tco'])
print(tco)
In [70]:
# 2-d inputs (lon, lat) and a z-scored target column.
tco_X = tco.loc[:, ['lon', 'lat']]
tco_y = tco.loc[:, ['tco']]
tco_y = tco_y.sub(tco_y.mean()).div(tco_y.std())
save_test_train_split(tco_X, tco_y, "tco", n_train=15000)
In [71]:
tco_y.hist()  # visual check of the standardized target's distribution
In [72]:
# Reload the raw housing table and z-score every column.
h1 = pd.read_csv(os.path.join(BASEDIR, 'raw_data/cal_housing_data.csv'))
h1_norm = h1.sub(h1.mean()).div(h1.std())
print(h1_norm)
In [86]:
# The original pasted the same three-line recipe three times; drive it from a
# table instead.  For each dataset: X = the other two raw columns rescaled to
# [0, 100] by standardizeX, y = the z-scored target column from h1_norm.
# (ignore_cols lists lon/lat for symmetry with the other datasets, but those
# columns are not among these features, so nothing is actually skipped.)
for ds_name, target, feature_cols in [
        ("housing_age", "age", ['income', 'value']),
        ("housing_inc", "income", ['age', 'value']),
        ("housing_val", "value", ['age', 'income'])]:
    ds_X = h1[feature_cols].copy()  # copy: standardizeX mutates in place
    standardizeX(ds_X, ignore_cols=['lon', 'lat'])
    ds_y = h1_norm[[target]]
    save_test_train_split(ds_X, ds_y, ds_name, n_train=18000)
In [87]:
bigsta = None
import os
# Collect per-station frames in a list and concatenate once; the original
# concatenated inside the loop, which is quadratic in the number of files.
sta_frames = []
snow_dir = os.path.join(BASEDIR, 'raw_data/california_snow/')
for stafile in os.listdir(snow_dir):
    if not stafile.endswith('.csv'):
        continue
    sta = pd.read_csv(os.path.join(snow_dir, stafile))
    sta['daynum'] = sta.index  # row position within the station file
    sta = sta.dropna()
    # The original called astype('float') without assigning the result (a
    # no-op); assign it so the cast actually takes effect.
    sta['snow'] = sta['snow'].astype('float')
    sta_frames.append(sta)
if sta_frames:
    bigsta = pd.concat(sta_frames)
#print bigsta
In [91]:
# z-score snow depth across all stations pooled together
bigsta['snow'] = (bigsta['snow'] - bigsta['snow'].mean()) / bigsta['snow'].std()
In [92]:
# DataFrame.sort was deprecated and removed in pandas 0.20; sort_values is
# the direct replacement.
bigsta_sorted = bigsta.sort_values(by=['daynum', 'lon', 'lat', 'elev'])
# .copy() because standardizeX mutates its argument in place.
snow_X = bigsta_sorted[['daynum', 'lon', 'lat', 'elev']].copy()
standardizeX(snow_X, ignore_cols=['daynum', 'lon', 'lat'])
snow_y = bigsta_sorted[['snow']]
save_test_train_split(snow_X, snow_y, "snow", n_train=15000)
In [77]:
# Parenthesized single-argument print: identical output under Python 2.
print(bigsta_sorted.head())
In [13]:
import scipy.io
sdir = os.path.join(BASEDIR, 'raw_data')
# NOTE(review): byteswap().newbyteorder() suggests the .mat arrays arrive in
# non-native byte order -- confirm against the files.
sarcos_train = scipy.io.loadmat(os.path.join(sdir, 'sarcos_inv.mat'))['sarcos_inv'].byteswap().newbyteorder()
sarcos_test = scipy.io.loadmat(os.path.join(sdir, 'sarcos_inv_test.mat'))['sarcos_inv_test'].byteswap().newbyteorder()
print(sarcos_train)
In [14]:
# Column 21 is the regression target; the first 21 columns are the inputs.
sarcos_train_X = sarcos_train[:, :21]
sarcos_train_y = sarcos_train[:, 21]
sarcos_test_X = sarcos_test[:, :21]
sarcos_test_y = sarcos_test[:, 21]

# Contiguous float copies; cap the training set at 16384 rows.
sarcos_train_X = np.array(sarcos_train_X, copy=True, dtype=float, order="C")[:16384, :]
sarcos_train_y = np.array(sarcos_train_y, copy=True, dtype=float, order="C")[:16384]
sarcos_test_X = np.array(sarcos_test_X, copy=True, dtype=float, order="C")
sarcos_test_y = np.array(sarcos_test_y, copy=True, dtype=float, order="C")

# keepdims=True keeps the (1, d) row shape the reshape((1, -1)) produced.
train_X_mean = np.mean(sarcos_train_X, axis=0, keepdims=True)
train_X_std = np.std(sarcos_train_X, axis=0, keepdims=True)
train_y_mean = np.mean(sarcos_train_y)
train_y_std = np.std(sarcos_train_y)

# Inputs scaled so one training std spans 50 units; target z-scored.
# Both splits use *training* statistics only.
sarcos_train_X = (sarcos_train_X - train_X_mean) / (train_X_std / 50.0)
sarcos_test_X = (sarcos_test_X - train_X_mean) / (train_X_std / 50.0)
sarcos_train_y = (sarcos_train_y - train_y_mean) / train_y_std
sarcos_test_y = (sarcos_test_y - train_y_mean) / train_y_std

out_dir = os.path.join(BASEDIR, 'datasets/sarcos/')
mkdir_p(out_dir)
# One loop instead of four copy-pasted frame/to_csv pairs.
for fname, arr in [('X_train.txt', sarcos_train_X),
                   ('y_train.txt', sarcos_train_y),
                   ('X_test.txt', sarcos_test_X),
                   ('y_test.txt', sarcos_test_y)]:
    pd.DataFrame(arr).to_csv(os.path.join(out_dir, fname), index=False)
In [79]:
In [80]:
# wiggle preprocessing: 60 amplitude columns (amp_00 .. amp_59) per row.
ycols = ['amp_%02d' % freq for freq in range(60)]
# NOTE(review): sep=r'\s*' also matches the empty string; r'\s+' is the
# conventional whitespace separator -- confirm these files parse as intended.
X = pd.read_csv(os.path.join(BASEDIR, 'raw_data/wiggle_X_good.txt'), names=['lon', 'lat', 'depth'], sep=r'\s*')
Y = pd.read_csv(os.path.join(BASEDIR, 'raw_data/wiggle_Y_good.txt'), names=ycols, sep=r'\s*')
In [81]:
# Normalize each row of Y to unit L2 norm, then z-score each column.
# The original used .ix, which was deprecated and later removed from pandas;
# .iloc is the positional equivalent (the index here is the default range).
n = Y.shape[0]
for i in range(n):
    Y.iloc[i, :] = Y.iloc[i, :] / np.linalg.norm(Y.iloc[i, :], 2)
Y = (Y - Y.mean()) / Y.std()
In [82]:
# NOTE(review): lon % 360 - 180 maps values into [-180, 180) but also shifts
# them by 180 degrees; the usual wrap is (lon + 180) % 360 - 180.  Left as-is
# -- confirm against the raw data's longitude convention.
X['lon'] = X['lon'] % 360 - 180
In [83]:
# Join features and targets row-by-row, drop incomplete rows, then sort.
XY = X.merge(Y, left_index=True, right_index=True)
XY = XY.dropna()
# DataFrame.sort was removed in pandas 0.20; sort_values is equivalent.
XY = XY.sort_values(by=['lon', 'lat', 'depth'])
In [84]:
# Spatial features rescaled in place (depth only; lon/lat preserved).
XX = XY.loc[:, ['lon', 'lat', 'depth']]
standardizeX(XX, ignore_cols=['lon', 'lat'])
YY = XY.loc[:, ycols]
In [85]:
# One dataset per selected frequency band (copy-pasted 4x in the original).
for freq in (5, 15, 20, 45):
    save_test_train_split(XX, YY[['amp_%02d' % freq]],
                          "wiggle_%d" % freq, n_train=10000)
In [111]:
# FITZ station: z-score the amplitude-transfer target, keep spatial features.
X = pd.read_csv(os.path.join(BASEDIR, 'raw_data/X_FITZ.txt'),
                names=['lon', 'lat', 'depth', 'dist', 'azi', 'mb'], sep=r'\s*')
y = pd.read_csv(os.path.join(BASEDIR, 'raw_data/y_FITZ.txt'),
                names=['amp_transfer'], sep=r'\s*')
y = y.sub(y.mean()).div(y.std())
X = X.loc[:, ['lon', 'lat', 'depth']]
save_test_train_split(X, y, "seismic_fitz", n_train=3500)
In [108]:
# The original cell pasted this block twice, with the two copies fused onto a
# single line (a syntax error).  One clean copy is equivalent: the split is
# deterministic (fixed seed) and writes the same files.
X = pd.read_csv(os.path.join(BASEDIR, 'raw_data/X_AS12.txt'),
                names=['lon', 'lat', 'depth', 'dist', 'azi', 'mb'], sep=r'\s*')
y = pd.read_csv(os.path.join(BASEDIR, 'raw_data/y_AS12.txt'),
                names=['amp_transfer'], sep=r'\s*')
y = (y - y.mean()) / y.std()
X = X[['lon', 'lat', 'depth']]
save_test_train_split(X, y, "seismic_as12", n_train=20000)
In [10]:
# ASAR travel-time residuals, capped at the first 20k rows.
X = pd.read_csv(os.path.join(BASEDIR, 'raw_data/tt_ASAR_X.txt'),
                names=['lon', 'lat', 'depth'], sep=r'\s*')
y = pd.read_csv(os.path.join(BASEDIR, 'raw_data/tt_ASAR_y.txt'),
                names=['tt_residual'], sep=r'\s*')
X = X.loc[:, ['lon', 'lat', 'depth']].iloc[:20000, :]
y = y.iloc[:20000]
save_test_train_split(X, y, "seismic_tt_ASAR", n_train=16000)
In [11]:
# FITZ travel-time residuals, capped at the first 20k rows.
X = pd.read_csv(os.path.join(BASEDIR, 'raw_data/tt_FITZ_X.txt'),
                names=['lon', 'lat', 'depth'], sep=r'\s*')
y = pd.read_csv(os.path.join(BASEDIR, 'raw_data/tt_FITZ_y.txt'),
                names=['tt_residual'], sep=r'\s*')
X = X.loc[:, ['lon', 'lat', 'depth']].iloc[:20000, :]
y = y.iloc[:20000]
save_test_train_split(X, y, "seismic_tt_FITZ", n_train=16000)
In [15]:
# Larger ASAR travel-time dataset: full table, 45k training rows.
X = pd.read_csv(os.path.join(BASEDIR, 'raw_data/tt_ASAR_X.txt'),
                names=['lon', 'lat', 'depth'], sep=r'\s*')
y = pd.read_csv(os.path.join(BASEDIR, 'raw_data/tt_ASAR_y.txt'),
                names=['tt_residual'], sep=r'\s*')
save_test_train_split(X, y, "seismic_tt_ASAR_50", n_train=45000)
In [ ]: