In [1]:
import pandas as pd;
import SETTINGS as sts;
import fitting_models
import analysis
import train_pred
import numpy as np

In [5]:
reload(train_pred)


Out[5]:
<module 'train_pred' from 'train_pred.py'>

In [3]:
#####Load all data, same in the test code, but we can filter out train in train and test for test
info = pd.read_csv(sts.output_dir + '/info.csv')
ch4_data = { int(r[0]):(r[1], r[2]) for _,r in
      pd.read_csv('tencia_scripts/output/ch4_volumes_map.csv', header=False).iterrows()};
#Tencia's cnn result

#Qi's cnn result
qifiles = [ 'v1_p2090_size256_ss100_nocL_tag3_2norm',\
    'v2_p2090_size196_ss100_nocL_tag6_2norm',\
    'v1_p2090_size256_ss150_nocL_tag10_2norm',\
    'v2_p2090_size196_ss150_nocL_tag11_2norm',\
    'v1_p2090_size256_ss200_nocL_tag5_2norm',\
    'v2_p2090_size196_ss200_nocL_tag7_2norm',\
    'v1_p2090_size256_ss75_nocL_tag12_2norm',\
    'v2_p2090_size196_ss75_nocL_tag13_2norm',\
    ];
qi_areas = [analysis.get_cnn_results(sts.output_dir +"/areas_map_{}.csv".format(v)) for v in qifiles];
qi_cnts = [analysis.get_cnn_results(sts.output_dir +"/contour_portion_{}.csv".format(v)) for v in qifiles];

train_true = pd.read_csv(sts.data_kaggle + '/train.csv');#append validate result to here once released
validate = train_true[train_true.Id>300];
train_true = train_true[train_true.Id<=300];

Ntrain = train_true.shape[0];#500, becomes 700 when validate data released
print("number of train cases is {}".format(Ntrain));


number of train cases is 300

In [4]:
#### train models,
########### default models
sa_predict = train_pred.train_sex_age_model(info, train_true);
ch4_predict = train_pred.train_ch4_model(ch4_data, train_true);
pick = [0,1]; 
qi_best,qi_best_cont = analysis.take_best_contour([qi_areas[i] for i in pick],[qi_cnts[i] for i in pick],method=1);
oneslice_pred = train_pred.train_oneslice_model(qi_best,train_true);
# fit the fall back model, a combination of oneslice_model and 4-ch model, 
# if it still fails use the sex-age model
# 0.6 * oneslice_predict + 0.4 * ch4_predict; (use fixed 0.6, 0.4)
default_pred = train_pred.build_default_model(oneslice_pred, ch4_predict, sa_predict);
analysis.evaluate_pred(default_pred, train_true);


 ------ train sex age model :
use fitted values, no fitting
score is 0.0393119122528
 ---- train ch4 model :
278 cases are used to fit the model
cases are missing in train: 1,12,19,36,60,66,74,85,100,101,116,135,144,166,167,176,191,200,212,216,225,260
fitting parameters [ 0.86105281  6.51470415  0.33565059  0.85714133  0.07211585  8.58795949]
fitting score 0.0176765079141
score is 0.0176765079141
 --- train oneslice model :
not implemented yet, use default to fit
score is 0.0155012381802
 --- building default model :
score is 0.0138794226048

In [8]:
tencia_files = ['pMS','p1090'];
tencia_areas = [analysis.get_cnn_results('tencia_scripts/output/areas_map_{}.csv'.format(x)) for x in tencia_files];
tencia_best = analysis.take_best(tencia_areas, method=2);
tencia_predict = train_pred.train_sax_model(tencia_best, train_true, version = 1);
#0.01048


 ---- train sax model :
length <=4
length <=4
length <=4
length <=4
length <=4
length <=4
length <=4
length <=4
length <=4
length <=4
length <=4
length <=4
length <=4
length <=4
length <=4
length <=4
length <=4
300 cases are used to fit the model
fitting parameters [ 1.00479942  0.99180158  0.05414734  4.00012514]
fitting score 0.0104732086834
score is 0.0104732086834

In [9]:
pick = [0,1];
reload(train_pred)
qi_best,qi_best_cont = analysis.take_best_contour([qi_areas[i] for i in pick],[qi_cnts[i] for i in pick],method=1);
qi_sax_pred = train_pred.train_sax_model(qi_best,train_true, version = 2, cleaner=[0,1,2]);
qi_sax_cnt_pred = train_pred.train_sax_cnt_model(qi_best, qi_best_cont, train_true, version = 2,cleaner=[0,1,2]);
qi_sax_filter_pred = train_pred.train_sax_cnt_filter_model(qi_best,qi_best_cont,train_true,cleaner=[0,1,2]);
#0,1: 0.009323, 0.00977, 0.009313


 ---- train sax model :
length <=4
length <=4
length <=4
length <=4
length <=4
length <=4
volume 3.7128 is too small, set to 4
300 cases are used to fit the model
fitting parameters [ 1.04224151  0.87434184  0.08081928  2.16647547  0.05295136  2.71257429]
fitting score 0.00922685434573
score is 0.00922685434573
 ---- train sax countour model :
length <=4
length <=4
length <=4
length <=4
length <=4
length <=4
volume 3.7128 is too small, set to 4
300 cases are used to fit the model
fitting parameters [ 0.45742235  0.56393121  0.06372138  3.99949918  0.04501477  3.99527151]
fitting score 0.00974057241091
score is 0.00974057241091
 ---- train sax countour filter model :
length <=4
length <=4
length <=4
length <=4
length <=4
length <=4
volume 3.7128 is too small, set to 4
300 cases are used to fit the model
fitting parameters [ 1.15870022  0.90017129  0.05437383  3.99950635  0.047496    3.99385422]
fitting score 0.00923522675788
score is 0.00923522675788

In [136]:
reload(analysis)
pick = [2,3];
reload(train_pred)
reload(fitting_models)
qi_best,qi_best_cont = analysis.take_best_contour([qi_areas[i] for i in pick],[qi_cnts[i] for i in pick],method=1);

result = analysis.get_preliminary_volume_features(qi_best,qi_best_cont,cleaner=[0,1,2]);
# result = analysis.get_preliminary_volume_cnt_filter(qi_best,qi_best_cont,cleaner=[0,1,2]);

sax_model = fitting_models.SaxFeatureModel();
sax_model.fit(result,train_true);
sax_predict = sax_model.predict(result);
analysis.evaluate_pred(sax_predict, train_true);


length <=4
volume 586.382 is too big, set to 580
length <=4
length <=4
length <=4
length <=4
length <=4
300 cases are used to fit the model
fitting parameters [ 0.27857085 -0.26132326  1.33486723  1.68689225 -2.37452227  0.76255337
  4.45235356]
fitting score 0.00975271640129
score is 0.00975271640129

In [137]:
analysis.evaluate_pred(sax_predict, validate);
analysis.evaluate_pred(qi_sax_filter_pred2, validate);


score is 0.0106138414263
score is 0.0105953784163

In [62]:
X =[];
for c in result:
    X.append(result.get(c));
X = np.asarray(X).reshape((-1,4));

In [117]:
np.sum(X[:,3]>1.3)


Out[117]:
70

In [10]:
pick = [2,3];
qi_best,qi_best_cont = analysis.take_best_contour([qi_areas[i] for i in pick],[qi_cnts[i] for i in pick],method=3);
qi_sax_pred2 = train_pred.train_sax_model(qi_best,train_true, version = 2,cleaner=[0,1,2]);
qi_sax_cnt_pred2 = train_pred.train_sax_cnt_model(qi_best, qi_best_cont, train_true, version = 2,cleaner=[0,1,2]);
qi_sax_filter_pred2 = train_pred.train_sax_cnt_filter_model(qi_best,qi_best_cont,train_true,cleaner=[0,1,2]);
#0.00954,0.0106,0.0968


 ---- train sax model :
length <=4
length <=4
length <=4
length <=4
length <=4
300 cases are used to fit the model
fitting parameters [ 1.46320678  1.15553243  0.08932317  1.9810633   0.05198473  3.08080125]
fitting score 0.00949225401491
score is 0.00949225401491
 ---- train sax countour model :
length <=4
length <=4
length <=4
length <=4
length <=4
300 cases are used to fit the model
fitting parameters [ 0.6649228   0.80098251  0.07321033  3.99754751  0.05216178  3.99733347]
fitting score 0.0105702374438
score is 0.0105702374438
 ---- train sax countour filter model :
length <=4
length <=4
length <=4
length <=4
length <=4
300 cases are used to fit the model
fitting parameters [ 1.42213647  1.23801426  0.06721213  4.00052412  0.04858318  3.99610735]
fitting score 0.00960771806751
score is 0.00960771806751

In [11]:
pick = [4,5];
qi_best,qi_best_cont = analysis.take_best_contour([qi_areas[i] for i in pick],[qi_cnts[i] for i in pick],method=1);
qi_sax_pred3 = train_pred.train_sax_model(qi_best,train_true, version = 2,cleaner=[0,1,2]);
qi_sax_cnt_pred3 = train_pred.train_sax_cnt_model(qi_best, qi_best_cont, train_true, version = 2,cleaner=[0,1,2]);
qi_sax_filter_pred3 = train_pred.train_sax_cnt_filter_model(qi_best,qi_best_cont,train_true,cleaner=[0,1,2]);
#0.00944,0.00958,0.00943


 ---- train sax model :
volume 2.298 is too small, set to 4
volume 582.445 is too big, set to 580
length <=4
length <=4
length <=4
length <=4
300 cases are used to fit the model
fitting parameters [ 1.37153141  1.1291468   0.06192786  3.99498324  0.04618147  3.99698685]
fitting score 0.00932574186619
score is 0.00932574186619
 ---- train sax countour model :
volume 2.298 is too small, set to 4
length <=4
length <=4
length <=4
length <=4
300 cases are used to fit the model
fitting parameters [ 0.67875272  0.78012469  0.05782141  3.99983309  0.04654852  3.99673419]
fitting score 0.00951856704943
score is 0.00951856704943
 ---- train sax countour filter model :
volume 2.298 is too small, set to 4
volume 582.445 is too big, set to 580
length <=4
length <=4
length <=4
length <=4
300 cases are used to fit the model
fitting parameters [ 1.531951    1.16518033  0.06392761  3.99947066  0.0474817   3.99174552]
fitting score 0.00930264366878
score is 0.00930264366878

In [12]:
pick = [6,7];
qi_best,qi_best_cont = analysis.take_best_contour([qi_areas[i] for i in pick],[qi_cnts[i] for i in pick],method=1);
qi_sax_pred4 = train_pred.train_sax_model(qi_best,train_true, version = 2,cleaner=[0,1,2]);
qi_sax_cnt_pred4 = train_pred.train_sax_cnt_model(qi_best, qi_best_cont, train_true, version = 2,cleaner=[0,1,2]);
qi_sax_filter_pred4 = train_pred.train_sax_cnt_filter_model(qi_best,qi_best_cont,train_true,cleaner=[0,1,2]);
#0.00959


 ---- train sax model :
length <=4
length <=4
length <=4
length <=4
length <=4
300 cases are used to fit the model
fitting parameters [ 1.16989609  1.03810506  0.06294464  3.99626325  0.04821035  3.99756062]
fitting score 0.00954991771869
score is 0.00954991771869
 ---- train sax countour model :
length <=4
length <=4
length <=4
length <=4
length <=4
300 cases are used to fit the model
fitting parameters [ 0.58945593  0.72845071  0.06463916  4.00005477  0.05108148  3.99609428]
fitting score 0.0101470311452
score is 0.0101470311452
 ---- train sax countour filter model :
length <=4
length <=4
length <=4
length <=4
length <=4
300 cases are used to fit the model
fitting parameters [ 1.0685861   1.09247361  0.07148267  4.0024348   0.04916844  3.99205676]
fitting score 0.0095641095916
score is 0.0095641095916

In [13]:
# fit the combined model based on the cnn-sax models.
# when it fails, fall to the previously fitted fall back model
print(" --------- average models --");
reload(fitting_models)
#qi_sax_cnt_pred4,qi_sax_filter_pred4,
all_models = [qi_sax_pred,qi_sax_cnt_pred,\
             qi_sax_pred2,qi_sax_cnt_pred2,\
             qi_sax_pred3,qi_sax_cnt_pred3,\
             qi_sax_filter_pred,qi_sax_filter_pred2,qi_sax_filter_pred3,\
             tencia_predict,default_pred,qi_sax_pred4]; #validate = 0.0984

# ave_model = fitting_models.AverageModel(2e-4);
# all_models = [qi_sax_filter_pred, qi_sax_filter_pred2,qi_sax_filter_pred3,qi_sax_filter_pred4];
# ave_model.fit(all_models, train_true)
# qi_sax_filter_pred_ave = ave_model.predict(all_models);
# analysis.evaluate_pred(qi_sax_filter_pred_ave,train_true)
# all_models = [qi_sax_cnt_pred, qi_sax_cnt_pred2,qi_sax_cnt_pred3,qi_sax_cnt_pred4];
# ave_model.fit(all_models, train_true)
# qi_sax_cnt_pred_ave = ave_model.predict(all_models);
# analysis.evaluate_pred(qi_sax_cnt_pred_ave,train_true)
# all_models = [qi_sax_pred, qi_sax_pred2,qi_sax_pred3,qi_sax_pred4];
# ave_model.fit(all_models, train_true)
# qi_sax_pred_ave = ave_model.predict(all_models);
# analysis.evaluate_pred(qi_sax_pred_ave,train_true)
# all_models = [qi_sax_pred_ave,tencia_predict,default_pred,qi_sax_filter_pred_ave,qi_sax_cnt_pred_ave];
reload(fitting_models)
ave_model = fitting_models.AverageModel(1.0e-4);
ave_model.fit(all_models,train_true);
ave_model.set(ave_model.p*1.1)
#N = len(all_models);
#ave_model.set(np.array([np.sqrt(N)]*N)*0.95)
ave_pred = ave_model.predict(all_models);

#apply default model when it fails
final_pred = analysis.fill_default(ave_pred, default_pred);
analysis.evaluate_pred(final_pred, train_true);
#analysis.evaluate_pred(final_pred, validate);

#this is for the test part, the test cases are also calculated in the above trainning process
#we might want to save the parameters of the models, and code another script to calculate the test cases. These fittings are very quick, so I'll leave it here and work on some other things first. 
#analysis.make_submit(final_pred, 501, 700, "v2"); #inclusive 500-700

#1-300 train: 0.008920
#201-500:validate: 0.009846


 --------- average models --
combine # predictions:700,700,700,700,700,700,700,700,700,700,700,700
init score :0.00892194088431
fitting parameters [ 3.28909099  2.62825157  3.80375984  3.92005645  3.25201615  2.36624647
  3.24030371  3.95696622  3.12327609  4.06444979  4.22283368  3.66943437]
fitting score 0.00888071067091
combine # predictions:700,700,700,700,700,700,700,700,700,700,700,700
score is 0.00884976414184

In [14]:
analysis.evaluate_pred(final_pred, validate); #0.00978


score is 0.00987173461514

In [16]:
reload(analysis)
analysis.evaluate_pred(final_pred, train_true);


score is 0.00893311843184

In [138]:
final_pred.get(212)


Out[138]:
array([ 1.82121781,  2.45188259,  8.87643828,  3.66078859])

In [111]:
default_pred.get(595)


Out[111]:
array([  94.29886892,   12.54399006,  255.17018585,   26.67766744])

In [112]:
default_pred.get(599)


Out[112]:
array([  37.95278885,    8.00086614,  114.42115483,   16.06057827])

In [47]:
case = 595;
print final_pred.get(case)
print oneslice_pred.get(case),ch4_predict.get(case),sa_predict.get(case)
print train_true[train_true.Id==case]
print validate[validate.Id==case]


[  93.84516574   14.25015755  252.66617114   25.75961421]
[ 103.12696935   14.25015755  262.36889918   26.67766744] [  79.92246034   14.35163556  238.11207909   25.75961421] [  75.   35.  181.   45.]
Empty DataFrame
Columns: [Id, Systole, Diastole]
Index: []
Empty DataFrame
Columns: [Id, Systole, Diastole]
Index: []

In [145]:
print qi_sax_pred.get(case)


[  7.35303078   3.17128321  23.85628402   4.53848263]

In [80]:
ch4_predict.get(2)


Out[80]:
array([  59.94360765,   12.3034449 ,  170.25612443,   15.00278353])

In [140]:
np.percentile([1,2],80)


Out[140]:
1.8

In [ ]: