In [1]:
from __future__ import print_function  # cells below mix print statements and print() calls
import pandas as pd;
import SETTINGS as sts;
import fitting_models
import analysis
import train_pred
import numpy as np
In [5]:
reload(train_pred)
Out[5]:
In [3]:
##### Load all data; the test code does the same load, but here we filter to the train cases for training (the test code filters to the test cases).
info = pd.read_csv(sts.output_dir + '/info.csv')
# Tencia's CNN result: 4-chamber (ch4) volumes per case
ch4_data = {int(r[0]): (r[1], r[2]) for _, r in
            pd.read_csv('tencia_scripts/output/ch4_volumes_map.csv', header=None).iterrows()};
# Qi's CNN results
qifiles = ['v1_p2090_size256_ss100_nocL_tag3_2norm',
           'v2_p2090_size196_ss100_nocL_tag6_2norm',
           'v1_p2090_size256_ss150_nocL_tag10_2norm',
           'v2_p2090_size196_ss150_nocL_tag11_2norm',
           'v1_p2090_size256_ss200_nocL_tag5_2norm',
           'v2_p2090_size196_ss200_nocL_tag7_2norm',
           'v1_p2090_size256_ss75_nocL_tag12_2norm',
           'v2_p2090_size196_ss75_nocL_tag13_2norm',
           ];
qi_areas = [analysis.get_cnn_results(sts.output_dir +"/areas_map_{}.csv".format(v)) for v in qifiles];
qi_cnts = [analysis.get_cnn_results(sts.output_dir +"/contour_portion_{}.csv".format(v)) for v in qifiles];
train_true = pd.read_csv(sts.data_kaggle + '/train.csv');  # append the validate labels here once released
validate = train_true[train_true.Id>300];
train_true = train_true[train_true.Id<=300];
Ntrain = train_true.shape[0];  # 300 after the split above; train.csv has 500 cases, 700 once the validate labels are appended
print("number of train cases is {}".format(Ntrain));
In [4]:
#### Train models
########### Default models
sa_predict = train_pred.train_sex_age_model(info, train_true);
ch4_predict = train_pred.train_ch4_model(ch4_data, train_true);
pick = [0,1];
qi_best,qi_best_cont = analysis.take_best_contour([qi_areas[i] for i in pick],[qi_cnts[i] for i in pick],method=1);
oneslice_pred = train_pred.train_oneslice_model(qi_best,train_true);
# Fit the fallback model: a fixed-weight combination of the oneslice model and the 4-ch model,
#   0.6 * oneslice_pred + 0.4 * ch4_predict  (weights fixed at 0.6/0.4, not fitted);
# if that also fails for a case, use the sex-age model.
default_pred = train_pred.build_default_model(oneslice_pred, ch4_predict, sa_predict);
analysis.evaluate_pred(default_pred, train_true);
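build_default_model's internals live in train_pred; as a reading aid, here is a minimal sketch of the fixed 0.6/0.4 blend described above, assuming each prediction is a dict-like map from case Id to a (systole, diastole) volume pair (the helper name and that representation are assumptions, not the actual train_pred API):
def blend_fixed(pred_a, pred_b, w=0.6):
    # hypothetical stand-in: w*pred_a + (1-w)*pred_b per case, keeping whichever
    # prediction exists when the other is missing
    out = {}
    for case in set(pred_a) | set(pred_b):
        a, b = pred_a.get(case), pred_b.get(case)
        if a is not None and b is not None:
            out[case] = tuple(w*x + (1.0-w)*y for x, y in zip(a, b))
        else:
            out[case] = a if a is not None else b
    return out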
In [8]:
tencia_files = ['pMS','p1090'];
tencia_areas = [analysis.get_cnn_results('tencia_scripts/output/areas_map_{}.csv'.format(x)) for x in tencia_files];
tencia_best = analysis.take_best(tencia_areas, method=2);
tencia_predict = train_pred.train_sax_model(tencia_best, train_true, version = 1);
#0.01048
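The scores noted in these cells (0.01048 here) are CRPS values, the competition metric. A minimal sketch for a single case and target, assuming the prediction is a CDF sampled at the 600 integer volumes 0..599 mL; evaluate_pred presumably averages this over all cases and both systole and diastole:
def crps_one(cdf, true_volume):
    # mean squared gap between the predicted CDF and the true step function
    v = np.arange(600)                       # volume grid, mL
    step = (v >= true_volume).astype(float)  # H(v - V_true)
    return np.mean((cdf - step)**2)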
In [9]:
pick = [0,1];
reload(train_pred)
qi_best,qi_best_cont = analysis.take_best_contour([qi_areas[i] for i in pick],[qi_cnts[i] for i in pick],method=1);
qi_sax_pred = train_pred.train_sax_model(qi_best,train_true, version = 2, cleaner=[0,1,2]);
qi_sax_cnt_pred = train_pred.train_sax_cnt_model(qi_best, qi_best_cont, train_true, version = 2,cleaner=[0,1,2]);
qi_sax_filter_pred = train_pred.train_sax_cnt_filter_model(qi_best,qi_best_cont,train_true,cleaner=[0,1,2]);
# picks [0,1]: 0.009323, 0.00977, 0.009313 (sax, sax_cnt, sax_filter)
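take_best_contour's exact selection rule is in analysis; a plausible sketch of the idea, assuming it keeps, case by case, the CNN output whose contour-portion score is largest (the method flag and the tie handling here are assumptions):
def take_best_contour_sketch(areas_list, cnts_list):
    # per case, keep the model that covered the largest contour portion
    best_areas, best_cnts = {}, {}
    for case in areas_list[0]:
        scores = [cnts.get(case, 0.0) for cnts in cnts_list]
        i = int(np.argmax(scores))
        best_areas[case] = areas_list[i][case]
        best_cnts[case] = scores[i]
    return best_areas, best_cnts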
In [136]:
reload(analysis)
pick = [2,3];
reload(train_pred)
reload(fitting_models)
qi_best,qi_best_cont = analysis.take_best_contour([qi_areas[i] for i in pick],[qi_cnts[i] for i in pick],method=1);
result = analysis.get_preliminary_volume_features(qi_best,qi_best_cont,cleaner=[0,1,2]);
# result = analysis.get_preliminary_volume_cnt_filter(qi_best,qi_best_cont,cleaner=[0,1,2]);
sax_model = fitting_models.SaxFeatureModel();
sax_model.fit(result,train_true);
sax_predict = sax_model.predict(result);
analysis.evaluate_pred(sax_predict, train_true);
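SaxFeatureModel is defined in fitting_models and isn't shown here; since the features below reshape to four columns per case (and assuming result maps each case Id to its feature tuple), a plausible stand-in is a least-squares map from those features to the true volumes. Point estimates only; the real model presumably also fits an uncertainty:
from sklearn.linear_model import LinearRegression

train_ids = set(train_true.Id)
cases = [c for c in sorted(result) if c in train_ids]
X_feat = np.asarray([result[c] for c in cases]).reshape((-1, 4))
y = train_true.set_index('Id').loc[cases, ['Systole', 'Diastole']].values
lin = LinearRegression().fit(X_feat, y)
vol_pred = lin.predict(X_feat)  # columns: systole, diastole (mL)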
In [137]:
analysis.evaluate_pred(sax_predict, validate);
analysis.evaluate_pred(qi_sax_filter_pred2, validate);
In [62]:
X = [];
for c in result:
    X.append(result.get(c));
X = np.asarray(X).reshape((-1,4));
In [117]:
np.sum(X[:,3]>1.3)  # how many cases have the 4th feature above 1.3
Out[117]:
In [10]:
pick = [2,3];
qi_best,qi_best_cont = analysis.take_best_contour([qi_areas[i] for i in pick],[qi_cnts[i] for i in pick],method=3);
qi_sax_pred2 = train_pred.train_sax_model(qi_best,train_true, version = 2,cleaner=[0,1,2]);
qi_sax_cnt_pred2 = train_pred.train_sax_cnt_model(qi_best, qi_best_cont, train_true, version = 2,cleaner=[0,1,2]);
qi_sax_filter_pred2 = train_pred.train_sax_cnt_filter_model(qi_best,qi_best_cont,train_true,cleaner=[0,1,2]);
# picks [2,3] (method=3): 0.00954, 0.0106, 0.00968 (sax, sax_cnt, sax_filter)
In [11]:
pick = [4,5];
qi_best,qi_best_cont = analysis.take_best_contour([qi_areas[i] for i in pick],[qi_cnts[i] for i in pick],method=1);
qi_sax_pred3 = train_pred.train_sax_model(qi_best,train_true, version = 2,cleaner=[0,1,2]);
qi_sax_cnt_pred3 = train_pred.train_sax_cnt_model(qi_best, qi_best_cont, train_true, version = 2,cleaner=[0,1,2]);
qi_sax_filter_pred3 = train_pred.train_sax_cnt_filter_model(qi_best,qi_best_cont,train_true,cleaner=[0,1,2]);
# picks [4,5]: 0.00944, 0.00958, 0.00943 (sax, sax_cnt, sax_filter)
In [12]:
pick = [6,7];
qi_best,qi_best_cont = analysis.take_best_contour([qi_areas[i] for i in pick],[qi_cnts[i] for i in pick],method=1);
qi_sax_pred4 = train_pred.train_sax_model(qi_best,train_true, version = 2,cleaner=[0,1,2]);
qi_sax_cnt_pred4 = train_pred.train_sax_cnt_model(qi_best, qi_best_cont, train_true, version = 2,cleaner=[0,1,2]);
qi_sax_filter_pred4 = train_pred.train_sax_cnt_filter_model(qi_best,qi_best_cont,train_true,cleaner=[0,1,2]);
# picks [6,7]: 0.00959
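The last four training cells differ only in the pick pair and one method flag; if we end up sweeping more CNN pairs, the same calls compress into a loop (a sketch, equivalent to the cells above):
sax_preds, cnt_preds, filter_preds = [], [], []
for pick, method in (([0,1], 1), ([2,3], 3), ([4,5], 1), ([6,7], 1)):
    qb, qbc = analysis.take_best_contour([qi_areas[i] for i in pick],
                                         [qi_cnts[i] for i in pick], method=method)
    sax_preds.append(train_pred.train_sax_model(qb, train_true, version=2, cleaner=[0,1,2]))
    cnt_preds.append(train_pred.train_sax_cnt_model(qb, qbc, train_true, version=2, cleaner=[0,1,2]))
    filter_preds.append(train_pred.train_sax_cnt_filter_model(qb, qbc, train_true, cleaner=[0,1,2]))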
In [13]:
# Fit the combined model from the CNN-SAX models;
# when it fails on a case, fall back to the previously fitted fallback model.
print(" --------- average models --");
reload(fitting_models)
# excluded: qi_sax_cnt_pred4, qi_sax_filter_pred4
all_models = [qi_sax_pred, qi_sax_cnt_pred,
              qi_sax_pred2, qi_sax_cnt_pred2,
              qi_sax_pred3, qi_sax_cnt_pred3,
              qi_sax_filter_pred, qi_sax_filter_pred2, qi_sax_filter_pred3,
              tencia_predict, default_pred, qi_sax_pred4];  # validate = 0.00984
# ave_model = fitting_models.AverageModel(2e-4);
# all_models = [qi_sax_filter_pred, qi_sax_filter_pred2,qi_sax_filter_pred3,qi_sax_filter_pred4];
# ave_model.fit(all_models, train_true)
# qi_sax_filter_pred_ave = ave_model.predict(all_models);
# analysis.evaluate_pred(qi_sax_filter_pred_ave,train_true)
# all_models = [qi_sax_cnt_pred, qi_sax_cnt_pred2,qi_sax_cnt_pred3,qi_sax_cnt_pred4];
# ave_model.fit(all_models, train_true)
# qi_sax_cnt_pred_ave = ave_model.predict(all_models);
# analysis.evaluate_pred(qi_sax_cnt_pred_ave,train_true)
# all_models = [qi_sax_pred, qi_sax_pred2,qi_sax_pred3,qi_sax_pred4];
# ave_model.fit(all_models, train_true)
# qi_sax_pred_ave = ave_model.predict(all_models);
# analysis.evaluate_pred(qi_sax_pred_ave,train_true)
# all_models = [qi_sax_pred_ave,tencia_predict,default_pred,qi_sax_filter_pred_ave,qi_sax_cnt_pred_ave];
ave_model = fitting_models.AverageModel(1.0e-4);
ave_model.fit(all_models,train_true);
ave_model.set(ave_model.p*1.1)  # scale the fitted parameters up by 10%
#N = len(all_models);
#ave_model.set(np.array([np.sqrt(N)]*N)*0.95)
ave_pred = ave_model.predict(all_models);
# fall back to the default model for cases where the averaged model fails
final_pred = analysis.fill_default(ave_pred, default_pred);
analysis.evaluate_pred(final_pred, train_true);
#analysis.evaluate_pred(final_pred, validate);
# This is the test part; the test cases were already computed during the training runs above.
# We might want to save the model parameters and write a separate script to compute the test
# cases, but these fittings are very quick, so I'll leave it here and work on other things first.
#analysis.make_submit(final_pred, 501, 700, "v2");  # inclusive 501-700
# 1-300 train: 0.008920
# 301-500 validate: 0.009846
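AverageModel's parameterization (the 1.0e-4 regularizer and the p vector scaled above) is in fitting_models; as a reading aid, here is a minimal stand-in for the combine-then-fallback shape, again assuming dict-like predictions of (systole, diastole) pairs and a plain mean in place of the fitted weights:
def combine_with_fallback(models, default):
    # average whichever models produced a value for a case; if none did,
    # fall back to the default prediction (the role of fill_default)
    out = {}
    for case in default:
        vals = [m.get(case) for m in models]
        vals = [v for v in vals if v is not None]
        out[case] = tuple(np.mean(np.asarray(vals, dtype=float), axis=0)) if vals else default[case]
    return out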
In [14]:
analysis.evaluate_pred(final_pred, validate); #0.00978
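make_submit lives in analysis; for reference, the submission format is one row per Id_Diastole / Id_Systole with 600 cumulative-probability columns P0..P599. A sketch, assuming we have a length-600 CDF per case and target (that input layout is an assumption, not make_submit's actual signature):
def write_submission_sketch(pred_cdfs, first_id, last_id, path):
    # pred_cdfs: hypothetical dict of (case_id, 'Diastole'|'Systole') -> length-600 CDF
    cols = ['P{}'.format(i) for i in range(600)]
    index, rows = [], []
    for case in range(first_id, last_id + 1):   # inclusive, e.g. 501..700
        for target in ('Diastole', 'Systole'):
            index.append('{}_{}'.format(case, target))
            rows.append(pred_cdfs[(case, target)])
    pd.DataFrame(rows, index=pd.Index(index, name='Id'), columns=cols).to_csv(path)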
In [16]:
reload(analysis)
analysis.evaluate_pred(final_pred, train_true);
In [138]:
final_pred.get(212)
Out[138]:
In [111]:
default_pred.get(595)
Out[111]:
In [112]:
default_pred.get(599)
Out[112]:
In [47]:
case = 595;
print(final_pred.get(case))
print(oneslice_pred.get(case), ch4_predict.get(case), sa_predict.get(case))
print(train_true[train_true.Id==case])
print(validate[validate.Id==case])
In [145]:
print(qi_sax_pred.get(case))
In [80]:
ch4_predict.get(2)
Out[80]:
In [140]:
np.percentile([1,2],80)
Out[140]:
1.8