In [1]:
FN = '160313-fuse'
In [2]:
FN0 = '160306-patient'
FN1 = '160306-train.std'
In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
import json
import os
In [4]:
from utils import Dataset, params, Nt, Nv, Ns, temp_dir, out_dir, awscp
Na = Nt+Nv+Ns
ystd = np.array([ 43.24805568, 59.27725076]) # fix a bug
Nt, Nv, Ns
Out[4]:
In [5]:
if out_dir.startswith('s3://'):
lscmd = 'aws s3 ls ' + out_dir+'/'
else:
lscmd = 'ls -1 ' + out_dir+'/'
In [6]:
awscp(FN0+'.pkl', verbose=True)
patient = pd.read_pickle(os.path.join(temp_dir,FN0+'.pkl'))
In [7]:
Ntraining = len(patient[['Systole','Diastole']].dropna())
assert Ntraining == Nt or Ntraining == Nt+Nv
Ntraining
Out[7]:
In [8]:
Y = patient[['Systole','Diastole']].values.astype(float)
Y = Y[:Ntraining]
Y.shape,Y.min(axis=0),Y.mean(axis=0)
Out[8]:
In [9]:
def yfix(yp):
mask = np.where(yp[:,1] < yp[:,0])
yp[mask,:] = yp[mask,::-1]
yp = np.clip(yp,3,600)
ypm = np.sqrt(yp[:,0]*yp[:,1])
yp[:,0] = np.clip(yp[:,0],ypm*np.sqrt(0.1),ypm)
yp[:,1] = np.clip(yp[:,1],ypm,ypm/np.sqrt(0.1))
return yp
In [10]:
# each item in FNX has the following structure
# (<file name>,-1) = load file as is. If file contains "*" then look for latest iteration
# (<file name>,0) = look for last iteration
# (<file name>,<seed>) = add seed to file name and look for last iteration
# (<file name>,(<seed>,(None,<split>)) = add seed and split and look for latest iteration
# (<file name>,(<seed>,(<iteration>,<split>)) = add seed itetation and split and load
# <file name> can be a tupple in which case the first element is the file name to read and the second is to label it
FNX = []
for seed in range(1000,1020):
for isplit in range(3):
FNX.append((FN1,(seed,(None,isplit)),False))
In [11]:
import cPickle as pickle
try:
with open(os.path.join(temp_dir,FN+'.files.pkl'),'rb') as fp:
files = pickle.load(fp)
print "there are %d files already downloaded in %s"%(len(files), temp_dir)
except:
files = []
In [12]:
def ffilter(res, pattern, itersplit):
res = [r.split()[-1] for r in res]
res = [f for f in res if f.find(pattern) >= 0]
# filter wanted iteration and split
if itersplit is not None and itersplit[0] is not None:
res = [r for r in res if r.endswith('.iter-%d.%d.pkl'%itersplit)]
res = sorted(res,key=lambda x: -int(x.split('.')[-3].split('-')[1]))
if itersplit is not None and itersplit[0] is None:
res = [r for r in res if r.endswith('.%d.pkl'%itersplit[1])]
res = res[-1:]
return res
In [13]:
import os.path
# files =[]
df = {}
df_count = {}
df_var = {}
df_var_count = {}
itests = {}
for fnseed in FNX:
if len(fnseed) == 2:
fn, seed = fnseed
bug_fix = True
else:
fn, seed, bug_fix = fnseed
if isinstance(seed,int):
itersplit = None
else:
seed, itersplit = seed
fn1 = fn
if isinstance(fn, tuple):
fn, fn1 = fn
if seed >= 0:
didls = False
if seed:
pattern = '%s.seed%d.yp.iter-'%(fn,seed)
else:
pattern = '%s.yp.iter-'%(fn)
# default pattern in case pattern is not found
pattern1 = '%s.yp.iter-'%(fn)
# look for pattern in files we already know about
res = ffilter(files, pattern, itersplit)
if not res:
res = ffilter(files, pattern1, itersplit)
if res:
print "Default",fn
# if not found, look for pattern in S3
if not res:
didls = True
fres = !{lscmd}
res = ffilter(fres, pattern, itersplit)
if not res:
res = ffilter(fres, pattern1, itersplit)
if res:
print "Default S3",fn
if not res:
print 'cant find',fnseed
# !{lscmd} | grep {pattern}
continue
fname = res[0]
if didls:
print "S3",fname
awscp(fname)
with open(os.path.join(temp_dir, fname),'rb') as fp:
yp_corrected, Y_itest, itest, yp_test = pickle.load(fp)
itest = np.concatenate((itest,np.arange(Ntraining,Na)))
yp = np.vstack((yp_corrected, yp_test))
else:
# load file directly
res = !{lscmd} | grep {fn}
res = sorted(res,key=lambda x: -int(x.split('.')[-3].split('-')[1]))
if not res:
print 'cant find',fnseed
break
fname = res[0]
awscp(fname)
if fname.endswith('.npy'):
itest = np.arange(Na)
yp = np.load(os.path.join(temp_dir, fname))
else:
assert fname.endswith('.pkl')
with open(os.path.join(temp_dir, fname),'rb') as fp:
yp_corrected, Y_itest, itest, yp_test = pickle.load(fp)
itest = np.concatenate((itest,np.arange(Ntraining,Na)))
yp = np.vstack((yp_corrected, yp_test))
# make sure predictions make sense
ypprb = yfix(yp[:,:2])
# fld = '%s.%d'%(fn1,seed)
fld = fname
if fld not in df:
df[fld] = np.zeros((Na,2))
df_count[fld] = np.zeros(Na)
if yp.shape[1] == 4:
df_var[fld] = np.zeros((Na,2))
df_var_count[fld] = np.zeros(Na)
itests[fname] = itest
df_count[fld][itest] += 1
df[fld][itest,:] += ypprb
if yp.shape[1] == 4:
df_var_count[fld][itest] += 1
if bug_fix:
# /ystd is a BUG FIX in 160221-train-std-clip
df_var[fld][itest,:] += yp[:,2:]*yp[:,2:]/ystd
else:
df_var[fld][itest,:] += yp[:,2:]*yp[:,2:]
In [14]:
fnames = df.keys()
len(fnames)
Out[14]:
In [15]:
# save files stored in temp_dir to save time on re-run
with open(os.path.join(temp_dir,FN+'.files.pkl'),'wb') as fp:
pickle.dump(fnames, fp, -1)
In [16]:
df0 = pd.DataFrame(dict((k,v[:,0]/df_count[k]) for k,v in df.items()),index=range(Na))
df1 = pd.DataFrame(dict((k,v[:,1]/df_count[k]) for k,v in df.items()),index=range(Na))
dfx = pd.concat((df0,df1),keys=('sys','dia')).swaplevel(0, 1, axis=0).sort_index(level=0)
In [17]:
df0 = pd.DataFrame(dict((k,v[:,0]/df_count[k]) for k,v in df_var.items()),index=range(Na))
df1 = pd.DataFrame(dict((k,v[:,1]/df_count[k]) for k,v in df_var.items()),index=range(Na))
dfx_var = pd.concat((df0,df1),keys=('sys','dia')).swaplevel(0, 1, axis=0).sort_index(level=0)
In [18]:
dfy = pd.concat((pd.DataFrame({'y':Y[:,0]}),pd.DataFrame({'y':Y[:,1]})),keys=('sys','dia')).swaplevel(0, 1, axis=0).sort_index(level=0)
In [19]:
def score(cols):
return np.sqrt(np.mean((dfx[cols].mean(axis=1).values[:Ntraining*2] - dfy.values[:,0])**2))
remove= []
print score(fnames)
for k in fnames:
vals = [(score([c for j, c in enumerate(fnames) if j != i and j not in remove]), i)
for i, _ in enumerate(fnames)]
vals.sort(key=lambda x: x[0])
val,j = vals[0]
if val!=val:
break
if j in remove:
break
remove.append(j)
print j,val, fnames[j]
In [20]:
fnames_mean = [f for i, f in enumerate(fnames) if i not in remove]
fnames_mean
Out[20]:
In [21]:
y_mean = dfx[fnames_mean].mean(axis=1).values
In [22]:
y_mean = y_mean.reshape((-1,2))
In [23]:
fnames_var = dfx_var.columns
len(fnames_var),len(fnames)
Out[23]:
In [24]:
def loss(cols):
err2 = np.square(dfx[cols].mean(axis=1).values[:Ntraining*2] - dfy.values[:,0]).reshape((-1,2))
var = dfx_var[cols].mean(axis=1).values[:Ntraining*2].reshape((-1,2))
loss = np.log(var) + err2/var
return loss.mean()
remove= []
print score(fnames_var)
for k in fnames_var:
vals = [(loss([c for j, c in enumerate(fnames_var) if j != i and j not in remove]), i)
for i, _ in enumerate(fnames_var)]
vals.sort(key=lambda x: x[0])
val,j = vals[0]
if val!=val:
break
if j in remove:
break
remove.append(j)
print j,val, fnames_var[j]
In [25]:
fnames_var = [f for i, f in enumerate(fnames_var) if i not in remove]
fnames_var
Out[25]:
In [26]:
y_std = np.sqrt(dfx_var[fnames_var].mean(axis=1).values.reshape((-1,2)))
In [27]:
for p in [0,268,414,415,483]:
print Y[p],y_mean[p]
In [28]:
def plot_outliers(Y,y_mean,y_std,mask=None):
plt.scatter(Y[:],y_mean[:Ntraining],alpha=0.1,color='b');
plt.errorbar(Y[:],y_mean[:Ntraining], yerr=y_std[:Ntraining], fmt='.',alpha=0.1,color='b')
if mask is None:
mask = np.where(np.abs(Y[:]-y_mean[:Ntraining])/(Y[:]+100) > 0.2)[0]
print mask
# plt.scatter(Y[mask],y_mean[mask],alpha=0.3,color='r')
plt.errorbar(Y[mask],y_mean[mask], yerr=y_std[mask], fmt='.',alpha=0.3,color='r')
plt.figure(figsize=(14,4))
plt.subplot(121)
plot_outliers(Y[:,0],y_mean[:,0],y_std[:,0])
plt.subplot(122)
plot_outliers(Y[:,1],y_mean[:,1],y_std[:,0]);
In [29]:
plt.figure(figsize=(14,4))
plt.subplot(121)
plt.hist(y_mean[:,0])
plt.hist(y_mean[:,1],alpha=0.3);
plt.subplot(122)
plt.hist(y_std[:,0]/y_mean[:,0])
plt.hist(y_std[:,1]/y_mean[:,1],alpha=0.3);
In [30]:
plt.figure(figsize=(14,4))
for m in range(2):
plt.subplot(1,2,1+m)
plt.hist(Y[:Ntraining,m],bins=50,histtype='step',normed=True,label='true')
plt.hist(y_mean[:Ntraining,m],histtype='step',normed=True,label='train')
plt.hist(y_mean[Ntraining:,m],histtype='step',normed=True,label='test');
plt.legend()
In [31]:
plt.figure(figsize=(14,4))
for m in range(2):
plt.subplot(1,2,1+m)
plt.hist(y_std[:Ntraining,m],histtype='step',normed=True,label='train')
plt.hist(y_std[Ntraining:,m],histtype='step',normed=True,label='test');
plt.legend()
In [32]:
plt.figure(figsize=(14,4))
plt.scatter(y_mean[:Ntraining,0],y_mean[:Ntraining,1]);
plt.scatter(y_mean[Ntraining:,0],y_mean[Ntraining:,1],color='r');
In [33]:
plt.figure(figsize=(14,4))
plt.scatter(y_mean[:Ntraining,0],y_std[:Ntraining,0]/y_mean[:Ntraining,0]);
plt.scatter(y_mean[Ntraining:,0],y_std[Ntraining:,0]/y_mean[Ntraining:,0],color='r');
In [34]:
plt.figure(figsize=(14,4))
plt.scatter(y_mean[:Ntraining,1],y_std[:Ntraining,1]/y_mean[:Ntraining,1]);
plt.scatter(y_mean[Ntraining:,1],y_std[Ntraining:,1]/y_mean[Ntraining:,1],color='r');
In [35]:
import scipy.stats
def myplt(err,fit=scipy.stats.t):
# err -= err.mean()
param = fit.fit(err) # distribution fitting
# now, param[0] and param[1] are the mean and
# the standard deviation of the fitted distribution
x = np.linspace(-15,15,100)
# fitted distribution
pdf_fitted = fit.pdf(x,df=param[0],loc=param[1],scale=param[2])
plt.plot(x,pdf_fitted,'r-')
plt.hist(err,normed=1,alpha=.3,bins=40)
return param
betagamma=[]
for m in range(2):
plt.subplot(121+m)
res = myplt(((Y[:,m]-y_mean[:Ntraining,m])/y_std[:Ntraining,m]))
print res
betagamma.append(res[1])
In [36]:
plt.figure(figsize=(14,4))
plt.subplot(121)
m=0
plt.scatter(np.clip(np.abs(Y[:,m]-y_mean[:Ntraining,m]),0,400),y_std[:Ntraining,m]);
plt.subplot(122)
m=1
plt.scatter(np.clip(np.abs(Y[:,m]-y_mean[:Ntraining,m]),0,400),y_std[:Ntraining,m]);
# plt.scatter(y_mean[Ntraining:,1],y_std[Ntraining:,1]/y_mean[Ntraining:,1],color='r');
Out[36]:
In [37]:
np.where(y_std < 0),np.where(y_mean < 0)
Out[37]:
In [38]:
times = ['meanmeanTriggerTime_sax','stdmeanTriggerTime_sax']
age = patient.PatientAge_sax.values
logage = np.log(age)
sex = patient.PatientSex_sax.values
Xaux = np.array([sex,np.log(age)]+[patient[t].values for t in times]).T
In [39]:
Xaux.shape
Out[39]:
In [40]:
from sklearn.base import BaseEstimator
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
class Ridge(BaseEstimator):
def __init__(self, alpha=72, nrm=True, naux=2):
self.alpha = alpha
self.nrm = nrm
self.naux = naux
def fit(self,X,y):
self.lrs = []
for measure in range(2):
lr = linear_model.Ridge(alpha = self.alpha)
if self.nrm:
clf = Pipeline([('nrm',StandardScaler()),('lr',lr)])
else:
clf = lr
clf.fit(X[:,:self.naux],y[:,measure])
self.lrs.append(clf)
def predict(self,X):
lrs_predict = []
for measure in range(2):
lr_predict = self.lrs[measure].predict(X[:,:self.naux])
lrs_predict.append(lr_predict)
lrs_predict = np.array(lrs_predict).T
return lrs_predict
def score(self, X, y):
return -np.sqrt(np.mean(np.square(y - self.predict(X)),axis=0)).mean()
In [41]:
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
clfg = GridSearchCV(Ridge(),param_grid={'alpha':np.logspace(1,1.5,8),'naux':[1,2,3,4]},n_jobs=-1,cv=10) #,verbose=1000)
clfg.fit(Xaux[:Ntraining,:],Y)
clfg.best_params_,-clfg.best_score_
Out[41]:
In [42]:
Yaux = np.clip(clfg.predict(Xaux),1,600)
In [43]:
plt.figure(figsize=(14,4))
plt.subplot(121)
plt.scatter(Y[:,0],Yaux[:Ntraining,0])
plt.subplot(122)
plt.scatter(Y[:,1],Yaux[:Ntraining,1])
Out[43]:
In [44]:
plt.figure(figsize=(14,4))
for m in range(2):
plt.subplot(121+m)
plt.scatter(Yaux[:Ntraining,m],(Y[:,m]-y_mean[:Ntraining,m]))
In [45]:
x = np.hstack((y_mean[:Ntraining,:1],Yaux[:Ntraining,:1]))
x.shape
Out[45]:
In [46]:
import statsmodels.api as sm
mymodel = sm.OLS(Y[:,0],x,'y',['ym','aux']).fit()
In [47]:
mymodel.summary()
Out[47]:
In [48]:
Yaux.shape
Out[48]:
In [49]:
X = np.hstack((y_mean, y_std, Yaux))
NF = X.shape[1] # stat of auxiliary data
X.shape,NF
Out[49]:
In [50]:
Xtrain = X[:Ntraining]
Ytrain = Y[:Ntraining]
Xtrain.shape, Ytrain.shape
Out[50]:
In [51]:
def line_score(x,v):
scr = (x - ((np.arange(600) - v) >= 0))
return (scr*scr).mean()
import itertools
def score(y_true, y_pred):
scores = []
for yt, yp in itertools.izip(y_true, y_pred):
scores.append(line_score(yp[0], yt[0]))
scores.append(line_score(yp[1], yt[1]))
return np.mean(scores)
In [52]:
from sklearn.base import BaseEstimator
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
In [53]:
class Student(BaseEstimator):
M = 600
domain = np.arange(M)
def __init__(self, beta=1., gamma=1., delta=7):
self.beta = beta
self.gamma = gamma
self.delta=delta
def fit(self,X,y):
return self
def predict_llk(self,X):
N = len(X)
domain = np.tile(self.domain,N).reshape((N,self.M))
y = np.zeros((N,2,self.M))
stdscale = [self.beta, self.gamma]
for i in range(N):
for m in range(2):
y[i,m,:] = scipy.stats.t.logpdf((self.domain - X[i,m])/(stdscale[m]*X[i,m+2]), # /np.sqrt(X[i,4])
df=self.delta)
y = y - y.max(axis=2,keepdims=True)
return y
def predict(self,X):
y = self.predict_llk(X)
y = np.exp(y)
y /= y.sum(axis=2,keepdims=True)
y = y.cumsum(axis=2)
return y
def score(self, X, y):
y_pred = self.predict(X)
return 1.-score(y, y_pred)
In [54]:
1.-Student().score(Xtrain,Ytrain)
Out[54]:
In [55]:
# clfg = GridSearchCV(Student(),param_grid={'beta':[0.83684210526315783], # np.linspace(0.7,0.9,20),
# 'gamma':[0.77368421052631575], #np.linspace(0.7,0.9,20),
# 'delta':np.linspace(10,100,20)},n_jobs=-1,cv=10)
# clfg.fit(Xtrain,Ytrain)
# clfg.best_params_,1.-clfg.best_score_
In [56]:
class Normal(BaseEstimator):
M = 600
domain = np.arange(M)
def __init__(self, beta=0.8368421, gamma=0.773684210, aux=-0.086):
self.beta = beta
self.gamma = gamma
self.aux = aux
def fit(self,X,y):
return self
def predict_llk(self,X):
N = len(X)
domain = np.tile(self.domain,N).reshape((N,self.M))
y = np.zeros((N,2,self.M))
stdscale = [self.beta, self.gamma]
for m in range(2):
std = stdscale[m]*X[:,m+2,None] + self.aux*np.sqrt(X[:,m+4,None])# *np.sqrt(X[:,4,None])
y[:,m,:] = -(domain - X[:,m,None])**2/(2.*std**2)
y = y - y.max(axis=2,keepdims=True)
return y
def predict(self,X):
y = self.predict_llk(X)
y = np.exp(y)
y /= y.sum(axis=2,keepdims=True)
y = y.cumsum(axis=2)
return y
def score(self, X, y):
y_pred = self.predict(X)
return 1.-score(y, y_pred)
In [57]:
1.-Normal(aux=0).score(Xtrain,Ytrain)
Out[57]:
In [58]:
def objective(args):
clf = Normal(**args)
return 1.-clf.score(Xtrain,Ytrain)
In [59]:
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
best = fmin(objective,
space={
'beta':hp.loguniform('beta', -1, 0),
'gamma':hp.loguniform('gamma', -1, 0),
'aux':hp.normal('aux', 0, 0.1),
},
algo=tpe.suggest,
max_evals=1000)
best, objective(best)
Out[59]:
In [60]:
clf = Normal(**best)
clf.fit(Xtrain,Ytrain)
Out[60]:
In [61]:
Xtest = X[Ntraining:].copy()
In [62]:
y_pred = clf.predict(Xtest)
In [63]:
sub_columns = ['P%d'%p for p in range(600)]
sub_index = ['%d_%s'%(i,m) for i in range(Ntraining+1,Na+1) for m in ['Diastole', 'Systole']]
sub = pd.DataFrame(np.zeros((2*(Na-Ntraining), 600)), index=sub_index, columns=sub_columns)
sub.index.name = 'Id'
In [64]:
sub[sub.columns] = y_pred[:,::-1,:].reshape((-1,600))
fname = FN+'.csv.gz'
sub.to_csv(os.path.join(temp_dir, fname),compression='gzip')
awscp(fname, upload=True)
In [65]:
!cp {os.path.join(temp_dir, fname)} /Users/udi/Desktop/
Private LB 0.015064