In [2]:
# Notebook setup: working directory, third-party libraries and project config.
import sys, os
#sys.path.insert(0, os.getcwd() + '//..')
# NOTE(review): hard-coded absolute, machine-specific path. Presumably this is what
# makes the project package `qml_workdir` importable below -- confirm, and prefer a
# configurable/relative location.
os.chdir('d:/ml/mlbootcamp5')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Current seaborn colour palette (not referenced by the cells visible in this chunk).
color = sns.color_palette()
from datetime import datetime
# Project-local package; `config` supplies QML_DATA_DIR and the train/test file-name
# masks that every dataset cell below formats with a DATA_ID.
import qml_workdir.classes
from qml_workdir.classes.config import config
%matplotlib inline
In [3]:
# Row ids of the train / test splits, read back from the index-only CSV files.
# NOTE(review): these files are written by the raw-data cell below, so on a fresh
# checkout that cell has to run first.
ids_train, ids_test = (
    pd.read_csv(config.QML_DATA_DIR + ids_file, index_col='id').index
    for ids_file in ("ids_train.csv", "ids_test.csv")
)
In [3]:
# Parse the raw files: ';'-separated, indexed by `id`, the literal 'None' means missing.
raw_read_options = dict(delimiter=";", index_col="id", na_values=['None'])
train_raw = pd.read_csv(config.QML_DATA_DIR + "raw/train.csv", **raw_read_options)
test_raw = pd.read_csv(config.QML_DATA_DIR + "raw/test.csv", **raw_read_options)

# Index-only files: they record which ids belong to which split.
pd.DataFrame(index=train_raw.index).to_csv(config.QML_DATA_DIR + "ids_train.csv")
pd.DataFrame(index=test_raw.index).to_csv(config.QML_DATA_DIR + "ids_test.csv")

# The target column is stored on its own and removed from the feature frame.
train_raw[['cardio']].to_csv(config.QML_DATA_DIR + "train_y.csv")
train_raw = train_raw.drop(['cardio'], axis=1)

# Train and test features stacked into one frame; the ids are re-read from disk
# so later cells can split that frame again.
all_raw = pd.concat([train_raw, test_raw])
ids_train, ids_test = (
    pd.read_csv(config.QML_DATA_DIR + ids_file, index_col='id').index
    for ids_file in ("ids_train.csv", "ids_test.csv")
)
In [ ]:
In [36]:
# Dataset 1: raw features, missing values kept; `gender` is one-hot encoded and the
# two indicator columns are renamed gender_1 -> gender_female, gender_2 -> gender_male.
DATA_ID = 1
gender_column_names = {'gender_1': 'gender_female', 'gender_2': 'gender_male'}
all = pd.get_dummies(all_raw.copy(), columns=['gender'])
all = all.rename(columns=gender_column_names)
for split_ids, file_mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                             (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[split_ids].to_csv(file_mask.format(DATA_ID))
In [ ]:
In [ ]:
In [40]:
# Dataset 2: dataset 1 with every missing value replaced by 0.
DATA_ID = 2
train, test = (
    pd.read_csv(file_mask.format(1), index_col='id')
    for file_mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test]).fillna(0)
for split_ids, file_mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                             (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[split_ids].to_csv(file_mask.format(DATA_ID))
In [ ]:
In [ ]:
In [42]:
# Dataset 3: dataset 2 plus `imt` = weight / (height/100)^2
# (a body-mass index if weight is in kg and height in cm -- TODO confirm units).
DATA_ID = 3
train, test = (
    pd.read_csv(file_mask.format(2), index_col='id')
    for file_mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])
height_scaled = all['height'] / 100
all['imt'] = all['weight'] / height_scaled / height_scaled
for split_ids, file_mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                             (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[split_ids].to_csv(file_mask.format(DATA_ID))
In [ ]:
In [ ]:
In [73]:
# Dataset 4: dataset 1 plus hand-written cleaning of height, weight and blood pressure.
#
# Fixes applied in this revision:
#   * `.ix` (removed in pandas 1.0) is replaced by the equivalent `.loc`.
#   * Every rule computes its row mask ONCE and uses it both for the value fix and
#     for `ap_error`.  Before, the inline rules re-evaluated the condition after the
#     fix, so it usually no longer matched and corrected rows were never flagged.
#   * Rules whose left-hand mask used `>=` while the right-hand side used `>` now
#     share one mask (a value exactly on the boundary raised a length mismatch).
#   * The duplicated `pd.concat` was removed.
# The rules are order dependent: each one sees the output of the previous ones.
DATA_ID = 4
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(1), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(1), index_col='id')
all = pd.concat([train, test])

# age / 365.25 -- presumably days converted to years (TODO confirm unit of `age`).
all['age_years'] = all['age'] / 365.25

# Height: values above 210 become the largest height below 210; values below 110
# are raised to 110 and remembered in `height_low`.
all.loc[all['height'] > 210, 'height'] = all.loc[all['height'] < 210, 'height'].max()
all['height_low'] = np.int32(all['height'] < 110)
all.loc[all['height'] < 110, 'height'] = 110

# Weight: values below 40 are raised to 40 and remembered in `weight_low`.
all['weight_low'] = np.int32(all['weight'] < 40)
all.loc[all['weight'] < 40, 'weight'] = 40

# Keep the ordinal columns next to their one-hot encoding.
all['cholesterol_all'] = all['cholesterol']
all['gluc_all'] = all['gluc']
all = pd.get_dummies(all, columns=['cholesterol', 'gluc'])

# ap_error: row was touched by any rule below; ap_error_swap: hi/lo were swapped.
all['ap_error'] = 0
all['ap_error_swap'] = 0

# --- ap_hi with too many digits: divide and truncate -------------------------------
mask = all['ap_hi'] > 10000
all.loc[mask, 'ap_hi'] = np.int64(all.loc[mask, 'ap_hi'] / 100)
all.loc[mask, 'ap_error'] = 1

mask = all['ap_hi'] > 1000
all.loc[mask, 'ap_hi'] = np.int64(all.loc[mask, 'ap_hi'] / 10)
all.loc[mask, 'ap_error'] = 1

# --- negative ap_hi: flip the sign -------------------------------------------------
mask = all['ap_hi'] <= -100
all.loc[mask, 'ap_hi'] = np.int64(-all.loc[mask, 'ap_hi'])
all.loc[mask, 'ap_error'] = 1

mask = all['ap_hi'] < 0
all.loc[mask, 'ap_hi'] = np.int64(-all.loc[mask, 'ap_hi'])
all.loc[mask, 'ap_error'] = 1

# --- ap_hi == 1 and ap_lo above 1000: rebuild both values from the digits of ap_lo --
mask = (all['ap_hi'] == 1) & (all['ap_lo'] > 1000)
new_hi = np.int64(all.loc[mask, 'ap_hi'] * 100 + all.loc[mask, 'ap_lo'] / 100)
new_lo = np.int64((all.loc[mask, 'ap_lo'] / 10) % 10) * 10
all.loc[mask, 'ap_hi'], all.loc[mask, 'ap_lo'] = new_hi, new_lo
all.loc[mask, 'ap_error'] = 1

# --- ap_lo == 0 and a large ap_hi: split the digits of ap_hi over both columns ------
mask = (all['ap_hi'] > 600) & (all['ap_lo'] == 0)
new_hi = np.int64(all.loc[mask, 'ap_hi'] / 10)
new_lo = np.int64(all.loc[mask, 'ap_hi'] % 10) * 10
all.loc[mask, 'ap_hi'], all.loc[mask, 'ap_lo'] = new_hi, new_lo
all.loc[mask, 'ap_error'] = 1

mask = (all['ap_hi'] > 240) & (all['ap_hi'] <= 600) & (all['ap_lo'] == 0)
new_hi = np.int64(100 + all.loc[mask, 'ap_hi'] / 10)
new_lo = np.int64(all.loc[mask, 'ap_hi'] % 10) * 10
all.loc[mask, 'ap_hi'], all.loc[mask, 'ap_lo'] = new_hi, new_lo
all.loc[mask, 'ap_error'] = 1

# --- remaining implausible ap_hi bands ---------------------------------------------
mask = (all['ap_hi'] >= 400) & (all['ap_hi'] < 500)
all.loc[mask, 'ap_hi'] = np.int64(all.loc[mask, 'ap_hi']) - 300
all.loc[mask, 'ap_error'] = 1

mask = (all['ap_hi'] >= 900) & (all['ap_hi'] < 1000)
all.loc[mask, 'ap_hi'] = np.int64(all.loc[mask, 'ap_hi'] / 100) * 10
all.loc[mask, 'ap_error'] = 1

mask = (all['ap_hi'] >= 700) & (all['ap_hi'] < 800)
all.loc[mask, 'ap_hi'] = np.int64(all.loc[mask, 'ap_hi'] / 100) * 10 + 100
all.loc[mask, 'ap_error'] = 1

# --- negative ap_lo: flip the sign -------------------------------------------------
mask = all['ap_lo'] < 0
all.loc[mask, 'ap_lo'] = np.int64(-all.loc[mask, 'ap_lo'])
all.loc[mask, 'ap_error'] = 1

# --- ap_lo == 0: derive ap_lo (and sometimes ap_hi) from ap_hi ----------------------
mask = (all['ap_hi'] < 100) & (all['ap_lo'] == 0)
new_hi = np.int64(all.loc[mask, 'ap_hi'] * 10)
new_lo = np.int64(all.loc[mask, 'ap_hi'] * 10 - 40)
all.loc[mask, 'ap_hi'], all.loc[mask, 'ap_lo'] = new_hi, new_lo
all.loc[mask, 'ap_error'] = 1

mask = (all['ap_hi'] > 100) & (all['ap_lo'] == 0) & (all['ap_hi'] % 10 == 0)
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_hi'] - 40)
all.loc[mask, 'ap_error'] = 1

mask = (all['ap_hi'] > 100) & (all['ap_lo'] == 0) & (all['ap_hi'] % 10 != 0) & (all['ap_hi'] % 10 >= 2)
new_hi = np.int64(all.loc[mask, 'ap_hi'] / 10) * 10
new_lo = np.int64(all.loc[mask, 'ap_hi'] % 10) * 10
all.loc[mask, 'ap_hi'], all.loc[mask, 'ap_lo'] = new_hi, new_lo
all.loc[mask, 'ap_error'] = 1

mask = (all['ap_hi'] > 100) & (all['ap_lo'] == 0) & (all['ap_hi'] % 10 != 0) & (all['ap_hi'] % 10 < 2)
new_hi = np.int64(all.loc[mask, 'ap_hi'] / 10) * 10
new_lo = np.int64(all.loc[mask, 'ap_hi'] % 10) * 100
all.loc[mask, 'ap_hi'], all.loc[mask, 'ap_lo'] = new_hi, new_lo
all.loc[mask, 'ap_error'] = 1

# --- ap_lo with too many digits: cascade from the largest threshold down ------------
mask = all['ap_lo'] >= 10000
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] / 100)
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] >= 4500
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] / 100)
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] >= 4000
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] / 100) + 100
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] >= 1500
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] / 10) % 100
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] >= 1200
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] / 10)
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] >= 1150
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] / 100) * 10
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] >= 1100
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] / 10) * 10 % 1000
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] >= 1000
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] / 100) * 10
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] >= 500
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] / 10)
all.loc[mask, 'ap_error'] = 1

# --- ap_lo with too few digits -----------------------------------------------------
mask = (all['ap_hi'] >= 100) & (all['ap_lo'] < 10) & (all['ap_lo'] > 1)
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo']) * 10
all.loc[mask, 'ap_error'] = 1

mask = (all['ap_hi'] > 100) & (all['ap_lo'] >= 10) & (all['ap_lo'] < 19)
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo']) * 10
all.loc[mask, 'ap_error'] = 1

mask = (all['ap_hi'] >= 100) & (all['ap_lo'] == 1)
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo']) * 100
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] < 10
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] * 10)
all.loc[mask, 'ap_error'] = 1

mask = (all['ap_hi'] >= 100) & (all['ap_lo'] == 1)
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_hi'] - 40)
all.loc[mask, 'ap_error'] = 1

mask = (all['ap_hi'] >= 100) & (all['ap_lo'] <= 40)
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_hi'] - 40)
all.loc[mask, 'ap_error'] = 1

# --- ap_hi with too few digits -----------------------------------------------------
mask = all['ap_hi'] <= 10
all.loc[mask, 'ap_hi'] = np.int64(all.loc[mask, 'ap_lo'] + 40)
all.loc[mask, 'ap_error'] = 1

mask = all['ap_hi'] <= 20
all.loc[mask, 'ap_hi'] = np.int64(all.loc[mask, 'ap_hi'] * 10)
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] <= 10
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] * 10)
all.loc[mask, 'ap_error'] = 1

# --- last resort: fixed 120/80, derived ap_lo, and hi/lo swap -----------------------
mask = (all['ap_lo'] <= 50) & (all['ap_hi'] <= 50)
all.loc[mask, 'ap_hi'], all.loc[mask, 'ap_lo'] = 120, 80
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] < 40
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_hi'] - 30)
all.loc[mask, 'ap_error'] = 1

mask = all['ap_hi'] == all['ap_lo']
all.loc[mask, 'ap_hi'], all.loc[mask, 'ap_lo'] = 120, 80
all.loc[mask, 'ap_error'] = 1

mask = all['ap_hi'] < all['ap_lo']
# Both right-hand sides are copies taken before either column is written.
new_hi, new_lo = all.loc[mask, 'ap_lo'], all.loc[mask, 'ap_hi']
all.loc[mask, 'ap_hi'], all.loc[mask, 'ap_lo'] = new_hi, new_lo
all.loc[mask, 'ap_error'] = 1
all.loc[mask, 'ap_error_swap'] = 1

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [ ]:
In [74]:
# Dataset 5: dataset 4 plus `imt` = weight / (height/100)^2
# (a body-mass index if weight is in kg and height in cm -- TODO confirm units).
DATA_ID = 5
train, test = (
    pd.read_csv(file_mask.format(4), index_col='id')
    for file_mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])
height_scaled = all['height'] / 100
all['imt'] = all['weight'] / height_scaled / height_scaled
for split_ids, file_mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                             (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[split_ids].to_csv(file_mask.format(DATA_ID))
In [ ]:
In [ ]:
In [76]:
# Dataset 6: dataset 5 plus a 7-level class derived from `imt` (ordinal copy and
# one-hot columns), with missing smoke/alco/active filled in.
#
# Fixes: `.ix` (removed in pandas 1.0) -> `.loc`; the chained
# `all[col].fillna(..., inplace=True)` calls, which do not modify `all` under pandas
# Copy-on-Write, are replaced by one DataFrame-level fillna.
DATA_ID = 6
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(5), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(5), index_col='id')
all = pd.concat([train, test])

# Each (lower, upper, label) assigns `label` to lower <= imt < upper.  Rows below 16
# (and rows with a missing imt, which match no comparison) keep class 0.
all['imt_class'] = 0
imt_class_bounds = [(16, 18.5, 1), (18.5, 25, 2), (25, 30, 3), (30, 35, 4), (35, 40, 5)]
for lower, upper, label in imt_class_bounds:
    all.loc[(lower <= all['imt']) & (all['imt'] < upper), 'imt_class'] = label
all.loc[40 <= all['imt'], 'imt_class'] = 6

# Keep the ordinal value next to its one-hot encoding.
all['imt_class_all'] = all['imt_class']
all = pd.get_dummies(all, columns=['imt_class'])

# Missing habits: smoke/alco default to 0, active defaults to 1.
all = all.fillna({'smoke': 0, 'alco': 0, 'active': 1})

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [8]:
Out[8]:
In [ ]:
In [ ]:
In [11]:
# Dataset 7: dataset 6 plus product, ratio, sum and difference for every unordered
# pair of the columns in `cols` (a column paired with itself is included).
DATA_ID = 7
train, test = (
    pd.read_csv(file_mask.format(6), index_col='id')
    for file_mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])
from itertools import combinations_with_replacement
cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'smoke', 'alco', 'active',
        'age_years', 'cholesterol_all', 'gluc_all', 'imt', 'imt_class_all']
for c1, c2 in combinations_with_replacement(cols, 2):
    left, right = all[c1], all[c2]
    pair_name = "{}_{}".format(c1, c2)
    # The divisor is shifted by one when the column's minimum is not strictly positive.
    divisor = right if right.min() > 0 else right + 1
    all["x_" + pair_name] = left * right
    all["div_" + pair_name] = left / divisor
    all["plus_" + pair_name] = left + right
    all["min_" + pair_name] = left - right
for split_ids, file_mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                             (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[split_ids].to_csv(file_mask.format(DATA_ID))
In [ ]:
In [9]:
# Read the saved test split of dataset 7 back for inspection.
test = pd.read_csv(
    config.QML_TEST_X_FILE_MASK.format(7),
    index_col='id',
)
In [10]:
# Number of feature columns in `test`.
test.shape[1]
Out[10]:
In [3]:
In [ ]:
In [ ]:
In [4]:
# Dataset 8: same cleaning as dataset 4, except for how low weights are repaired.
# Author's note (translated): "I assumed that 2 and 1 are a misrecognised 7, and
# that 3 is an 8."
#
# Fixes applied in this revision:
#   * The second weight rule took its right-hand side from rows with weight < 30,
#     which the first rule had just emptied; index alignment then wrote NaN into
#     every weight in [30, 40).  Both sides now use the same mask.
#   * `.ix` (removed in pandas 1.0) is replaced by the equivalent `.loc`.
#   * Every rule computes its row mask ONCE and uses it both for the value fix and
#     for `ap_error`.  Before, the inline rules re-evaluated the condition after the
#     fix, so it usually no longer matched and corrected rows were never flagged.
#   * Rules whose left-hand mask used `>=` while the right-hand side used `>` now
#     share one mask (a value exactly on the boundary raised a length mismatch).
#   * The duplicated `pd.concat` was removed.
# The rules are order dependent: each one sees the output of the previous ones.
DATA_ID = 8
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(1), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(1), index_col='id')
all = pd.concat([train, test])

# age / 365.25 -- presumably days converted to years (TODO confirm unit of `age`).
all['age_years'] = all['age'] / 365.25

# Height: values above 210 become the largest height below 210; values below 110
# are raised to 110 and remembered in `height_low`.
all.loc[all['height'] > 210, 'height'] = all.loc[all['height'] < 210, 'height'].max()
all['height_low'] = np.int32(all['height'] < 110)
all.loc[all['height'] < 110, 'height'] = 110

# Weight: flag values below 40, then replace the tens digit -- below 30 becomes
# 70 + last digit, the remaining values below 40 become 80 + last digit.
all['weight_low'] = np.int32(all['weight'] < 40)
mask = all['weight'] < 30
all.loc[mask, 'weight'] = all.loc[mask, 'weight'] % 10 + 70
mask = all['weight'] < 40
all.loc[mask, 'weight'] = all.loc[mask, 'weight'] % 10 + 80

# Keep the ordinal columns next to their one-hot encoding.
all['cholesterol_all'] = all['cholesterol']
all['gluc_all'] = all['gluc']
all = pd.get_dummies(all, columns=['cholesterol', 'gluc'])

# ap_error: row was touched by any rule below; ap_error_swap: hi/lo were swapped.
all['ap_error'] = 0
all['ap_error_swap'] = 0

# --- ap_hi with too many digits: divide and truncate -------------------------------
mask = all['ap_hi'] > 10000
all.loc[mask, 'ap_hi'] = np.int64(all.loc[mask, 'ap_hi'] / 100)
all.loc[mask, 'ap_error'] = 1

mask = all['ap_hi'] > 1000
all.loc[mask, 'ap_hi'] = np.int64(all.loc[mask, 'ap_hi'] / 10)
all.loc[mask, 'ap_error'] = 1

# --- negative ap_hi: flip the sign -------------------------------------------------
mask = all['ap_hi'] <= -100
all.loc[mask, 'ap_hi'] = np.int64(-all.loc[mask, 'ap_hi'])
all.loc[mask, 'ap_error'] = 1

mask = all['ap_hi'] < 0
all.loc[mask, 'ap_hi'] = np.int64(-all.loc[mask, 'ap_hi'])
all.loc[mask, 'ap_error'] = 1

# --- ap_hi == 1 and ap_lo above 1000: rebuild both values from the digits of ap_lo --
mask = (all['ap_hi'] == 1) & (all['ap_lo'] > 1000)
new_hi = np.int64(all.loc[mask, 'ap_hi'] * 100 + all.loc[mask, 'ap_lo'] / 100)
new_lo = np.int64((all.loc[mask, 'ap_lo'] / 10) % 10) * 10
all.loc[mask, 'ap_hi'], all.loc[mask, 'ap_lo'] = new_hi, new_lo
all.loc[mask, 'ap_error'] = 1

# --- ap_lo == 0 and a large ap_hi: split the digits of ap_hi over both columns ------
mask = (all['ap_hi'] > 600) & (all['ap_lo'] == 0)
new_hi = np.int64(all.loc[mask, 'ap_hi'] / 10)
new_lo = np.int64(all.loc[mask, 'ap_hi'] % 10) * 10
all.loc[mask, 'ap_hi'], all.loc[mask, 'ap_lo'] = new_hi, new_lo
all.loc[mask, 'ap_error'] = 1

mask = (all['ap_hi'] > 240) & (all['ap_hi'] <= 600) & (all['ap_lo'] == 0)
new_hi = np.int64(100 + all.loc[mask, 'ap_hi'] / 10)
new_lo = np.int64(all.loc[mask, 'ap_hi'] % 10) * 10
all.loc[mask, 'ap_hi'], all.loc[mask, 'ap_lo'] = new_hi, new_lo
all.loc[mask, 'ap_error'] = 1

# --- remaining implausible ap_hi bands ---------------------------------------------
mask = (all['ap_hi'] >= 400) & (all['ap_hi'] < 500)
all.loc[mask, 'ap_hi'] = np.int64(all.loc[mask, 'ap_hi']) - 300
all.loc[mask, 'ap_error'] = 1

mask = (all['ap_hi'] >= 900) & (all['ap_hi'] < 1000)
all.loc[mask, 'ap_hi'] = np.int64(all.loc[mask, 'ap_hi'] / 100) * 10
all.loc[mask, 'ap_error'] = 1

mask = (all['ap_hi'] >= 700) & (all['ap_hi'] < 800)
all.loc[mask, 'ap_hi'] = np.int64(all.loc[mask, 'ap_hi'] / 100) * 10 + 100
all.loc[mask, 'ap_error'] = 1

# --- negative ap_lo: flip the sign -------------------------------------------------
mask = all['ap_lo'] < 0
all.loc[mask, 'ap_lo'] = np.int64(-all.loc[mask, 'ap_lo'])
all.loc[mask, 'ap_error'] = 1

# --- ap_lo == 0: derive ap_lo (and sometimes ap_hi) from ap_hi ----------------------
mask = (all['ap_hi'] < 100) & (all['ap_lo'] == 0)
new_hi = np.int64(all.loc[mask, 'ap_hi'] * 10)
new_lo = np.int64(all.loc[mask, 'ap_hi'] * 10 - 40)
all.loc[mask, 'ap_hi'], all.loc[mask, 'ap_lo'] = new_hi, new_lo
all.loc[mask, 'ap_error'] = 1

mask = (all['ap_hi'] > 100) & (all['ap_lo'] == 0) & (all['ap_hi'] % 10 == 0)
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_hi'] - 40)
all.loc[mask, 'ap_error'] = 1

mask = (all['ap_hi'] > 100) & (all['ap_lo'] == 0) & (all['ap_hi'] % 10 != 0) & (all['ap_hi'] % 10 >= 2)
new_hi = np.int64(all.loc[mask, 'ap_hi'] / 10) * 10
new_lo = np.int64(all.loc[mask, 'ap_hi'] % 10) * 10
all.loc[mask, 'ap_hi'], all.loc[mask, 'ap_lo'] = new_hi, new_lo
all.loc[mask, 'ap_error'] = 1

mask = (all['ap_hi'] > 100) & (all['ap_lo'] == 0) & (all['ap_hi'] % 10 != 0) & (all['ap_hi'] % 10 < 2)
new_hi = np.int64(all.loc[mask, 'ap_hi'] / 10) * 10
new_lo = np.int64(all.loc[mask, 'ap_hi'] % 10) * 100
all.loc[mask, 'ap_hi'], all.loc[mask, 'ap_lo'] = new_hi, new_lo
all.loc[mask, 'ap_error'] = 1

# --- ap_lo with too many digits: cascade from the largest threshold down ------------
mask = all['ap_lo'] >= 10000
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] / 100)
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] >= 4500
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] / 100)
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] >= 4000
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] / 100) + 100
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] >= 1500
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] / 10) % 100
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] >= 1200
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] / 10)
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] >= 1150
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] / 100) * 10
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] >= 1100
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] / 10) * 10 % 1000
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] >= 1000
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] / 100) * 10
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] >= 500
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] / 10)
all.loc[mask, 'ap_error'] = 1

# --- ap_lo with too few digits -----------------------------------------------------
mask = (all['ap_hi'] >= 100) & (all['ap_lo'] < 10) & (all['ap_lo'] > 1)
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo']) * 10
all.loc[mask, 'ap_error'] = 1

mask = (all['ap_hi'] > 100) & (all['ap_lo'] >= 10) & (all['ap_lo'] < 19)
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo']) * 10
all.loc[mask, 'ap_error'] = 1

mask = (all['ap_hi'] >= 100) & (all['ap_lo'] == 1)
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo']) * 100
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] < 10
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] * 10)
all.loc[mask, 'ap_error'] = 1

mask = (all['ap_hi'] >= 100) & (all['ap_lo'] == 1)
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_hi'] - 40)
all.loc[mask, 'ap_error'] = 1

mask = (all['ap_hi'] >= 100) & (all['ap_lo'] <= 40)
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_hi'] - 40)
all.loc[mask, 'ap_error'] = 1

# --- ap_hi with too few digits -----------------------------------------------------
mask = all['ap_hi'] <= 10
all.loc[mask, 'ap_hi'] = np.int64(all.loc[mask, 'ap_lo'] + 40)
all.loc[mask, 'ap_error'] = 1

mask = all['ap_hi'] <= 20
all.loc[mask, 'ap_hi'] = np.int64(all.loc[mask, 'ap_hi'] * 10)
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] <= 10
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_lo'] * 10)
all.loc[mask, 'ap_error'] = 1

# --- last resort: fixed 120/80, derived ap_lo, and hi/lo swap -----------------------
mask = (all['ap_lo'] <= 50) & (all['ap_hi'] <= 50)
all.loc[mask, 'ap_hi'], all.loc[mask, 'ap_lo'] = 120, 80
all.loc[mask, 'ap_error'] = 1

mask = all['ap_lo'] < 40
all.loc[mask, 'ap_lo'] = np.int64(all.loc[mask, 'ap_hi'] - 30)
all.loc[mask, 'ap_error'] = 1

mask = all['ap_hi'] == all['ap_lo']
all.loc[mask, 'ap_hi'], all.loc[mask, 'ap_lo'] = 120, 80
all.loc[mask, 'ap_error'] = 1

mask = all['ap_hi'] < all['ap_lo']
# Both right-hand sides are copies taken before either column is written.
new_hi, new_lo = all.loc[mask, 'ap_lo'], all.loc[mask, 'ap_hi']
all.loc[mask, 'ap_hi'], all.loc[mask, 'ap_lo'] = new_hi, new_lo
all.loc[mask, 'ap_error'] = 1
all.loc[mask, 'ap_error_swap'] = 1

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [ ]:
In [5]:
# Dataset 9: dataset 8 plus `imt` = weight / (height/100)^2
# (a body-mass index if weight is in kg and height in cm -- TODO confirm units).
DATA_ID = 9
train, test = (
    pd.read_csv(file_mask.format(8), index_col='id')
    for file_mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])
height_scaled = all['height'] / 100
all['imt'] = all['weight'] / height_scaled / height_scaled
for split_ids, file_mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                             (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[split_ids].to_csv(file_mask.format(DATA_ID))
In [ ]:
In [ ]:
In [8]:
# Dataset 10: cleaned dataset 8 with missing habits filled by constants (fillna).
# The chained `all[col].fillna(..., inplace=True)` calls were replaced by one
# DataFrame-level fillna: under pandas Copy-on-Write the chained form leaves `all`
# unchanged.
DATA_ID = 10
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(8), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(8), index_col='id')
all = pd.concat([train, test])
# smoke/alco default to 0, active defaults to 1.
all = all.fillna({'smoke': 0, 'alco': 0, 'active': 1})
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [ ]:
In [ ]:
In [10]:
# Dataset 11: cleaned dataset 8 with missing habits filled by the column mean.
# The chained `all[col].fillna(..., inplace=True)` calls were replaced by one
# DataFrame-level fillna: under pandas Copy-on-Write the chained form leaves `all`
# unchanged.
DATA_ID = 11
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(8), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(8), index_col='id')
all = pd.concat([train, test])
# Means are taken over train and test together, ignoring the missing entries.
habit_means = {column: all[column].mean() for column in ('smoke', 'alco', 'active')}
all = all.fillna(habit_means)
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [ ]:
In [27]:
# Dataset 12: dataset 9 plus a 7-level class derived from `imt` (ordinal copy and
# one-hot columns).  `.ix` (removed in pandas 1.0) is replaced by `.loc`.
DATA_ID = 12
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(9), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(9), index_col='id')
all = pd.concat([train, test])

# Each (lower, upper, label) assigns `label` to lower <= imt < upper.  Rows below 16
# (and rows with a missing imt, which match no comparison) keep class 0.
all['imt_class'] = 0
imt_class_bounds = [(16, 18.5, 1), (18.5, 25, 2), (25, 30, 3), (30, 35, 4), (35, 40, 5)]
for lower, upper, label in imt_class_bounds:
    all.loc[(lower <= all['imt']) & (all['imt'] < upper), 'imt_class'] = label
all.loc[40 <= all['imt'], 'imt_class'] = 6

# Keep the ordinal value next to its one-hot encoding.
all['imt_class_all'] = all['imt_class']
all = pd.get_dummies(all, columns=['imt_class'])

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [ ]:
In [43]:
# Dataset 13: dataset 6 with the one-hot indicator columns listed below removed.
DATA_ID = 13
train, test = (
    pd.read_csv(file_mask.format(6), index_col='id')
    for file_mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
one_hot_columns = (
    ['gender_female']
    + ['imt_class_{}'.format(level) for level in range(7)]
    + ['cholesterol_{}'.format(level) for level in (1, 2, 3)]
    + ['gluc_{}'.format(level) for level in (1, 2, 3)]
)
all = pd.concat([train, test]).drop(labels=one_hot_columns, axis=1)
for split_ids, file_mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                             (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[split_ids].to_csv(file_mask.format(DATA_ID))
In [44]:
# Display the column index of `all` as left behind by whichever dataset cell ran last.
all.columns
Out[44]:
In [ ]:
In [ ]:
In [ ]:
# Dataset 14: dataset 6 plus pairwise features, gender interactions and log pressures.
# NOTE(review): block structure was reconstructed from a flattened export; it is
# assumed that `if c1 != c2:` guards the ratio, sum and difference -- confirm.
DATA_ID = 14
train, test = (
    pd.read_csv(file_mask.format(6), index_col='id')
    for file_mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])
from itertools import combinations_with_replacement
cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'smoke', 'alco', 'active',
        'age_years', 'cholesterol_all', 'gluc_all', 'imt', 'imt_class_all']
for c1, c2 in combinations_with_replacement(cols, 2):
    left, right = all[c1], all[c2]
    # The product is built for every pair, a column with itself included.
    all["x__{}__{}".format(c1, c2)] = left * right
    if c1 == c2:
        continue
    # The divisor is shifted by one when the column's minimum is not strictly positive.
    divisor = right if right.min() > 0 else right + 1
    all["div__{}__{}".format(c1, c2)] = left / divisor
    all["plus__{}__{}".format(c1, c2)] = left + right
    all["minus__{}__{}".format(c1, c2)] = left - right
# Interaction of every base column with each gender indicator.
for col in cols:
    for gender_column in ('gender_male', 'gender_female'):
        all["x__{}__{}".format(col, gender_column)] = all[col] * all[gender_column]
# Natural logs of both pressures and their difference.
log_hi = np.log(all['ap_hi'])
all['log__ap_hi'] = log_hi
log_lo = np.log(all['ap_lo'])
all['log__ap_lo'] = log_lo
all['minus__log_ap_hi__log_ap_lo'] = all['log__ap_hi'] - all['log__ap_lo']
for split_ids, file_mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                             (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[split_ids].to_csv(file_mask.format(DATA_ID))
all.shape[1]
In [ ]:
In [53]:
In [ ]:
# Dataset 15: the subset of dataset 6 columns listed in `cols`
# (per the original note, a feature selection labelled "hyperopt 02").
DATA_ID = 15
train, test = (
    pd.read_csv(file_mask.format(6), index_col='id')
    for file_mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
cols = [
    'active', 'age', 'alco', 'ap_error_swap', 'ap_hi', 'ap_lo',
    'cholesterol_2', 'cholesterol_3', 'cholesterol_all', 'gender_female',
    'gluc_1', 'gluc_2', 'gluc_3', 'gluc_all', 'height', 'imt',
    'imt_class_4', 'imt_class_6', 'smoke', 'weight',
]
all = pd.concat([train, test]).loc[:, cols]
for split_ids, file_mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                             (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[split_ids].to_csv(file_mask.format(DATA_ID))
In [ ]:
In [ ]:
In [4]:
# Dataset 16: dataset 6 without `height` and `cholesterol_2`
# (per the original note, feature selection "del 01").
DATA_ID = 16
train, test = (
    pd.read_csv(file_mask.format(6), index_col='id')
    for file_mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test]).drop(labels=['height', 'cholesterol_2'], axis=1)
for split_ids, file_mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                             (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[split_ids].to_csv(file_mask.format(DATA_ID))
In [ ]:
In [ ]:
In [11]:
# Dataset 17: dataset 6 plus the product age * gluc_all (feature selection "add 03").
# The chained `all[col].fillna(..., inplace=True)` calls were replaced by one
# DataFrame-level fillna: under pandas Copy-on-Write the chained form leaves `all`
# unchanged.
DATA_ID = 17
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])
all['x__age__gluc_all'] = all['age'] * all['gluc_all']
# smoke/alco default to 0, active defaults to 1.
all = all.fillna({'smoke': 0, 'alco': 0, 'active': 1})
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [16]:
# Dataset 18: dataset 6 plus two products (feature selection "add 04").
# The chained `all[col].fillna(..., inplace=True)` calls were replaced by one
# DataFrame-level fillna: under pandas Copy-on-Write the chained form leaves `all`
# unchanged.
DATA_ID = 18
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])
all['x__age__gluc_all'] = all['age'] * all['gluc_all']
all['x__ap_hi__cholesterol_all'] = all['ap_hi'] * all['cholesterol_all']
# smoke/alco default to 0, active defaults to 1.
all = all.fillna({'smoke': 0, 'alco': 0, 'active': 1})
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [17]:
# Dataset 19: dataset 6 plus two products and one ratio (feature selection "add 05").
# The chained `all[col].fillna(..., inplace=True)` calls were replaced by one
# DataFrame-level fillna: under pandas Copy-on-Write the chained form leaves `all`
# unchanged.
DATA_ID = 19
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])
all['x__age__gluc_all'] = all['age'] * all['gluc_all']
all['x__ap_hi__cholesterol_all'] = all['ap_hi'] * all['cholesterol_all']
# The divisor is shifted by one when imt_class_all has a minimum that is not
# strictly positive.  As before, the ratio is computed ahead of the fill below.
imt_class = all['imt_class_all']
all['div__smoke__imt_class_all'] = all['smoke'] / (imt_class if imt_class.min() > 0 else imt_class + 1)
# smoke/alco default to 0, active defaults to 1.
all = all.fillna({'smoke': 0, 'alco': 0, 'active': 1})
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [ ]:
In [41]:
# Dataset 20: dataset 6 plus pairwise and three-way combinations of the columns in
# `cols`.  Products and ratios use the raw columns; sums and differences use the
# mean-centred, range-scaled `<name>_norm` copies.
DATA_ID = 20
train, test = (
    pd.read_csv(file_mask.format(6), index_col='id')
    for file_mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])
from itertools import combinations
cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', #'smoke', 'alco', 'active',
        'cholesterol_all', 'gluc_all', 'imt']
# Maps every base column to the name of its normalised copy.
cols_norm = {c: c + '_norm' for c in cols}
for c in cols:
    all[cols_norm[c]] = (all[c] - all[c].mean()) / (all[c].max() - all[c].min())
for c1, c2 in combinations(cols, 2):
    n1, n2 = cols_norm[c1], cols_norm[c2]
    a, b = all[c1], all[c2]
    all["x__{}__{}".format(c1, c2)] = a * b
    # The divisor is shifted by one when the column's minimum is not strictly positive.
    all["div__{}__{}".format(c1, c2)] = a / (b if b.min() > 0 else b + 1)
    all["plus__{}__{}".format(n1, n2)] = all[n1] + all[n2]
    all["minus__{}__{}".format(n1, n2)] = all[n1] - all[n2]
print(0)
for c1, c2, c3 in combinations(cols, 3):
    raw_names = "{}__{}__{}".format(c1, c2, c3)
    norm_names = "{}__{}__{}".format(cols_norm[c1], cols_norm[c2], cols_norm[c3])
    a, b, c = all[c1], all[c2], all[c3]
    na, nb, nc = all[cols_norm[c1]], all[cols_norm[c2]], all[cols_norm[c3]]
    all["x__" + raw_names] = a * b * c
    # Every sign pattern of the three normalised columns except all-negative.
    all["plus__" + norm_names] = na + nb + nc
    all["minus1__" + norm_names] = na + nb - nc
    all["minus2__" + norm_names] = na - nb + nc
    all["minus3__" + norm_names] = - na + nb + nc
    all["minus4__" + norm_names] = na - nb - nc
    all["minus5__" + norm_names] = - na + nb - nc
    all["minus6__" + norm_names] = - na - nb + nc
    # Every multiply/divide pattern of the raw columns with at least one division.
    all["div1__" + raw_names] = 1 * a * b / c
    all["div2__" + raw_names] = 1 * a / b * c
    all["div3__" + raw_names] = 1 / a * b * c
    all["div4__" + raw_names] = 1 * a / b / c
    all["div5__" + raw_names] = 1 / a * b / c
    all["div6__" + raw_names] = 1 / a / b * c
print(1)
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
print(2)
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
all.shape[1]
Out[41]:
In [ ]:
In [ ]:
In [4]:
# Dataset 21: dataset 6 plus two products and age / height / gluc_all
# (feature selection "add 06", candidate div4__age__height__gluc_all).
# The chained `all[col].fillna(..., inplace=True)` calls were replaced by one
# DataFrame-level fillna: under pandas Copy-on-Write the chained form leaves `all`
# unchanged.
DATA_ID = 21
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])
all['x__age__gluc_all'] = all['age'] * all['gluc_all']
all['x__ap_hi__cholesterol_all'] = all['ap_hi'] * all['cholesterol_all']
all['div4__age__height__gluc_all'] = 1 * all['age'] / all['height'] / all['gluc_all']
# smoke/alco default to 0, active defaults to 1.
all = all.fillna({'smoke': 0, 'alco': 0, 'active': 1})
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [5]:
# Dataset 22: dataset 6 plus two products and age * ap_hi * gluc_all
# (feature selection "add 06", candidate x__age__ap_hi__gluc_all).
# The chained `all[col].fillna(..., inplace=True)` calls were replaced by one
# DataFrame-level fillna: under pandas Copy-on-Write the chained form leaves `all`
# unchanged.
DATA_ID = 22
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])
all['x__age__gluc_all'] = all['age'] * all['gluc_all']
all['x__ap_hi__cholesterol_all'] = all['ap_hi'] * all['cholesterol_all']
all['x__age__ap_hi__gluc_all'] = all['age'] * all['ap_hi'] * all['gluc_all']
# smoke/alco default to 0, active defaults to 1.
all = all.fillna({'smoke': 0, 'alco': 0, 'active': 1})
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [7]:
# Dataset 23: dataset 6 plus two products and one signed sum of normalised columns
# (feature selection "add 06", candidate minus6 of ap_hi, ap_lo, cholesterol_all).
# Changes: the normalised series are held in a local dict instead of being added to
# `all` and deleted again (saved columns are unchanged); the chained
# `all[col].fillna(..., inplace=True)` calls, which leave `all` unchanged under
# pandas Copy-on-Write, became one DataFrame-level fillna.
DATA_ID = 23
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])
# Mean-centred, range-scaled copies: (x - mean) / (max - min).
norm = {
    c: (all[c] - all[c].mean()) / (all[c].max() - all[c].min())
    for c in ('ap_hi', 'ap_lo', 'cholesterol_all')
}
all['x__age__gluc_all'] = all['age'] * all['gluc_all']
all['x__ap_hi__cholesterol_all'] = all['ap_hi'] * all['cholesterol_all']
all['minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm'] = -norm['ap_hi'] - norm['ap_lo'] + norm['cholesterol_all']
# smoke/alco default to 0, active defaults to 1.
all = all.fillna({'smoke': 0, 'alco': 0, 'active': 1})
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
INFO:root:032 0.5380299708136911 div6__height__gluc_all__imt
INFO:root:032 0.5380492890506681 div1__age__weight__cholesterol_all
In [ ]:
In [9]:
# Dataset 24: dataset 6 plus two products and 1 / height / gluc_all * imt
# (feature selection "add 06", candidate div6__height__gluc_all__imt).
# Changes: the `_norm` columns that were computed and deleted without ever being
# used are gone (saved columns are unchanged); the chained
# `all[col].fillna(..., inplace=True)` calls, which leave `all` unchanged under
# pandas Copy-on-Write, became one DataFrame-level fillna.
DATA_ID = 24
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])
all['x__age__gluc_all'] = all['age'] * all['gluc_all']
all['x__ap_hi__cholesterol_all'] = all['ap_hi'] * all['cholesterol_all']
all['div6__height__gluc_all__imt'] = 1 / all['height'] / all['gluc_all'] * all['imt']
# smoke/alco default to 0, active defaults to 1.
all = all.fillna({'smoke': 0, 'alco': 0, 'active': 1})
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [11]:
# Dataset 25: dataset 6 plus two products and age * weight / cholesterol_all
# (feature selection "add 06", candidate div1__age__weight__cholesterol_all).
# Changes: the `_norm` columns that were computed and deleted without ever being
# used are gone (saved columns are unchanged); the chained
# `all[col].fillna(..., inplace=True)` calls, which leave `all` unchanged under
# pandas Copy-on-Write, became one DataFrame-level fillna.
DATA_ID = 25
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])
all['x__age__gluc_all'] = all['age'] * all['gluc_all']
all['x__ap_hi__cholesterol_all'] = all['ap_hi'] * all['cholesterol_all']
all['div1__age__weight__cholesterol_all'] = 1 * all['age'] * all['weight'] / all['cholesterol_all']
# smoke/alco default to 0, active defaults to 1.
all = all.fillna({'smoke': 0, 'alco': 0, 'active': 1})
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [14]:
# Dataset 26 = dataset 6 + selected interaction features (selection round 07).
DATA_ID = 26
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
# Train and test are stacked so every feature is computed over both parts at once.
all = pd.concat([train, test])

# Mean-centred, range-scaled copies of the columns that feed the 'plus__'
# feature. They are kept in a local dict instead of being added to the frame
# and deleted again; only the three columns actually used are normalised.
norm = {
    c: (all[c] - all[c].mean()) / (all[c].max() - all[c].min())
    for c in ('age', 'ap_hi', 'gluc_all')
}

# Engineered features. The prefix names the operation, the rest names the inputs.
all['x__age__gluc_all'] = all['age'] * all['gluc_all']
all['x__ap_hi__cholesterol_all'] = all['ap_hi'] * all['cholesterol_all']
all['div6__height__gluc_all__imt'] = 1 / all['height'] / all['gluc_all'] * all['imt']
all['plus__age_norm__ap_hi_norm__gluc_all_norm'] = norm['age'] + norm['ap_hi'] + norm['gluc_all']

# Fill missing habit flags. The result is assigned back to the frame:
# `all[col].fillna(..., inplace=True)` only modifies a temporary Series when
# pandas Copy-on-Write is active, leaving the NaNs in the written files.
all['smoke'] = all['smoke'].fillna(0)
all['alco'] = all['alco'].fillna(0)
all['active'] = all['active'].fillna(1)

# Split back into train / test by the saved id indexes and persist.
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [15]:
# Dataset 27 = dataset 6 + selected interaction features (selection round 08):
# the dataset-26 features plus x__age__weight.
DATA_ID = 27
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
# Train and test are stacked so every feature is computed over both parts at once.
all = pd.concat([train, test])

# Mean-centred, range-scaled copies of the columns that feed the 'plus__'
# feature. They are kept in a local dict instead of being added to the frame
# and deleted again; only the three columns actually used are normalised.
norm = {
    c: (all[c] - all[c].mean()) / (all[c].max() - all[c].min())
    for c in ('age', 'ap_hi', 'gluc_all')
}

# Engineered features. The prefix names the operation, the rest names the inputs.
all['x__age__gluc_all'] = all['age'] * all['gluc_all']
all['x__ap_hi__cholesterol_all'] = all['ap_hi'] * all['cholesterol_all']
all['div6__height__gluc_all__imt'] = 1 / all['height'] / all['gluc_all'] * all['imt']
all['plus__age_norm__ap_hi_norm__gluc_all_norm'] = norm['age'] + norm['ap_hi'] + norm['gluc_all']
all['x__age__weight'] = all['age'] * all['weight']

# Fill missing habit flags. The result is assigned back to the frame:
# `all[col].fillna(..., inplace=True)` only modifies a temporary Series when
# pandas Copy-on-Write is active, leaving the NaNs in the written files.
all['smoke'] = all['smoke'].fillna(0)
all['alco'] = all['alco'].fillna(0)
all['active'] = all['active'].fillna(1)

# Split back into train / test by the saved id indexes and persist.
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [ ]:
In [60]:
# Dataset 28 = a fixed column subset of dataset 20 (feature selection 08, batch).
DATA_ID = 28
train, test = (
    pd.read_csv(file_mask.format(20), index_col='id')
    for file_mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])

# Columns to keep, in output order: raw / cleaned inputs first, then the
# engineered features that are retained.
cols = (
    ['age', 'height', 'weight', 'ap_hi', 'ap_lo',
     'smoke', 'alco', 'active',
     'gender_male', 'height_low', 'weight_low',
     'cholesterol_all', 'gluc_all']
    + ['cholesterol_{}'.format(level) for level in (1, 2, 3)]
    + ['gluc_{}'.format(level) for level in (1, 2, 3)]
    + ['ap_error', 'ap_error_swap', 'imt', 'imt_class_all']
    + ['imt_class_{}'.format(klass) for klass in range(7)]
    + ['x__age__gluc_all',
       'x__ap_hi__cholesterol_all',
       'div6__height__gluc_all__imt',
       'plus__age_norm__ap_hi_norm__gluc_all_norm',
       'x__age__weight',
       'div1__age__weight__cholesterol_all',
       'div6__age__weight__cholesterol_all',
       'plus__height_norm__weight_norm__gluc_all_norm',
       'div1__ap_hi__ap_lo__cholesterol_all',
       'div6__ap_hi__ap_lo__cholesterol_all',
       'plus__age_norm__gluc_all_norm__imt_norm',
       'minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
       'minus1__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
       'minus6__age_norm__ap_lo_norm__cholesterol_all_norm',
       'minus1__age_norm__ap_lo_norm__cholesterol_all_norm',
       'div6__height__weight__ap_lo',
       'div2__ap_lo__cholesterol_all__gluc_all',
       'x__age__ap_hi__gluc_all',
       'div5__ap_lo__cholesterol_all__gluc_all']
)
all = all[cols]

# Split back into train / test by the saved id indexes and persist.
for ids, file_mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                       (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[ids].to_csv(file_mask.format(DATA_ID))
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [14]:
# Dataset 29 = dataset 28 + a table-driven risk score (cholesterol mapping v1).
# NOTE(review): the table looks like a SCORE-style cardiovascular risk chart
# (sex x smoking x age band x systolic pressure x cholesterol) - confirm source.
DATA_ID = 29
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(28), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(28), index_col='id')
all = pd.concat([train, test])

# The score lives in ONE column: it is created with the default 0 and filled
# in below, so rows that match no table cell keep 0. (The original created
# this column but wrote the values into a second column, 'score_scale_val',
# leaving this one constant and the other one NaN for unmatched rows.)
score_col = 'score_scale_val_v1'
all[score_col] = 0

# Map the cholesterol levels of the data onto the five cholesterol columns
# (c4..c8) of the table. Several columns share a level on purpose: columns are
# applied in the order listed in the loop below, so the LAST column for a
# level is the one that sticks (v1: level 1 -> c6, level 2 -> c7, level 3 -> c8).
chol4 = all['cholesterol_all'] == 1
chol5 = all['cholesterol_all'] == 1
chol6 = all['cholesterol_all'] == 1
chol7 = all['cholesterol_all'] == 2
chol8 = all['cholesterol_all'] == 3

male = all['gender_male'] == 1
female = all['gender_male'] == 0
smoke = all['smoke'] == 1
not_smoke = all['smoke'] == 0

# Age bands in years; 'age' is divided by 365.25, i.e. presumably stored in days.
age_years = all['age'] / 365.25
age40 = (age_years >= 40) & (age_years < 45)
age45 = (age_years >= 45) & (age_years < 55)
age55 = (age_years >= 55) & (age_years < 60)
age60 = (age_years >= 60) & (age_years < 65)
age65 = (age_years >= 65)

# Systolic pressure bands. The 160 band runs up to 180 so that 170-179 is not
# left without a band (the original upper bound of 170 skipped that range).
ap120 = (all['ap_hi'] >= 120) & (all['ap_hi'] < 140)
ap140 = (all['ap_hi'] >= 140) & (all['ap_hi'] < 160)
ap160 = (all['ap_hi'] >= 160) & (all['ap_hi'] < 180)
ap180 = (all['ap_hi'] >= 180)

# Each row: [gender mask, smoking mask, age mask, pressure mask, c4, c5, c6, c7, c8]
data = [
    [female, not_smoke, age65, ap180, 7, 8, 9, 10, 12],
    [female, not_smoke, age65, ap160, 5, 5, 6, 7, 8],
    [female, not_smoke, age65, ap140, 3, 3, 4, 5, 6],
    [female, not_smoke, age65, ap120, 2, 2, 3, 3, 4],
    [female, not_smoke, age60, ap180, 4, 4, 5, 6, 7],
    [female, not_smoke, age60, ap160, 3, 3, 3, 4, 5],
    [female, not_smoke, age60, ap140, 2, 2, 2, 3, 3],
    [female, not_smoke, age60, ap120, 1, 1, 2, 2, 2],
    [female, not_smoke, age55, ap180, 2, 2, 3, 3, 4],
    [female, not_smoke, age55, ap160, 1, 2, 2, 2, 3],
    [female, not_smoke, age55, ap140, 1, 1, 1, 1, 2],
    [female, not_smoke, age55, ap120, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap180, 1, 1, 1, 2, 2],
    [female, not_smoke, age45, ap160, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap140, 0, 1, 1, 1, 1],
    [female, not_smoke, age45, ap120, 0, 0, 1, 1, 1],
    # female / not_smoke / age40: no values entered (ap180, ap160, ap140, ap120)
    # -> those rows keep the default 0.
    #######################################################
    [female, smoke, age65, ap180, 13, 15, 17, 19, 22],
    [female, smoke, age65, ap160, 9, 10, 12, 13, 16],
    [female, smoke, age65, ap140, 6, 7, 8, 9, 11],
    [female, smoke, age65, ap120, 4, 5, 5, 6, 7],
    [female, smoke, age60, ap180, 8, 9, 10, 11, 13],
    [female, smoke, age60, ap160, 5, 6, 7, 8, 9],
    [female, smoke, age60, ap140, 3, 4, 5, 5, 6],
    [female, smoke, age60, ap120, 2, 3, 3, 4, 4],
    [female, smoke, age55, ap180, 4, 5, 5, 6, 7],
    [female, smoke, age55, ap160, 3, 3, 4, 4, 5],
    [female, smoke, age55, ap140, 2, 2, 2, 3, 3],
    [female, smoke, age55, ap120, 1, 1, 2, 2, 2],
    [female, smoke, age45, ap180, 2, 2, 3, 3, 4],
    [female, smoke, age45, ap160, 1, 2, 2, 2, 3],
    [female, smoke, age45, ap140, 1, 1, 1, 1, 2],
    [female, smoke, age45, ap120, 1, 1, 1, 1, 1],
    [female, smoke, age40, ap180, 0, 0, 0, 1, 1],
    # female / smoke / age40: no values entered for ap160, ap140, ap120
    # -> those rows keep the default 0.
    #######################################################
    [male, not_smoke, age65, ap180, 14, 16, 19, 22, 26],
    [male, not_smoke, age65, ap160, 9, 11, 13, 15, 16],
    [male, not_smoke, age65, ap140, 6, 8, 9, 11, 13],
    [male, not_smoke, age65, ap120, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap180, 9, 11, 13, 15, 18],
    [male, not_smoke, age60, ap160, 6, 7, 9, 10, 12],
    [male, not_smoke, age60, ap140, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap120, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap180, 6, 7, 8, 10, 12],
    [male, not_smoke, age55, ap160, 4, 5, 6, 7, 8],
    [male, not_smoke, age55, ap140, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap120, 2, 2, 3, 3, 4],
    [male, not_smoke, age45, ap180, 4, 4, 5, 6, 7],
    [male, not_smoke, age45, ap160, 2, 3, 3, 4, 5],
    [male, not_smoke, age45, ap140, 2, 2, 2, 3, 3],
    [male, not_smoke, age45, ap120, 1, 1, 2, 2, 2],
    [male, not_smoke, age40, ap180, 1, 1, 1, 2, 2],
    [male, not_smoke, age40, ap160, 1, 1, 1, 1, 1],
    [male, not_smoke, age40, ap140, 0, 1, 1, 1, 1],
    [male, not_smoke, age40, ap120, 0, 0, 1, 1, 1],
    #######################################################
    [male, smoke, age65, ap180, 26, 30, 35, 41, 47],
    [male, smoke, age65, ap160, 18, 21, 25, 29, 34],
    [male, smoke, age65, ap140, 13, 15, 17, 20, 24],
    [male, smoke, age65, ap120, 9, 10, 12, 14, 17],
    [male, smoke, age60, ap180, 18, 21, 24, 28, 33],
    [male, smoke, age60, ap160, 12, 14, 17, 20, 24],
    [male, smoke, age60, ap140, 8, 10, 12, 14, 17],
    [male, smoke, age60, ap120, 6, 7, 8, 10, 12],
    [male, smoke, age55, ap180, 12, 13, 16, 19, 22],
    [male, smoke, age55, ap160, 8, 9, 11, 13, 16],
    [male, smoke, age55, ap140, 5, 6, 8, 9, 11],
    [male, smoke, age55, ap120, 4, 4, 5, 6, 8],
    [male, smoke, age45, ap180, 7, 8, 10, 12, 14],
    [male, smoke, age45, ap160, 5, 6, 7, 8, 10],
    [male, smoke, age45, ap140, 3, 4, 5, 6, 7],
    [male, smoke, age45, ap120, 2, 3, 3, 4, 5],
    [male, smoke, age40, ap180, 2, 2, 3, 3, 4],
    [male, smoke, age40, ap160, 1, 2, 2, 2, 3],
    [male, smoke, age40, ap140, 1, 1, 1, 2, 2],
    [male, smoke, age40, ap120, 1, 1, 1, 1, 1],
]

for gender, sm, age, aphi, c4, c5, c6, c7, c8 in data:
    # The age mask is part of the cell: without it every age band of a
    # (gender, smoking, pressure) group overwrote the previous one and the
    # score did not depend on age at all.
    cell = gender & sm & age & aphi
    # `.loc` replaces `.ix`, which was removed in pandas 1.0.
    for chol, value in ((chol4, c4), (chol5, c5), (chol6, c6), (chol7, c7), (chol8, c8)):
        all.loc[cell & chol, score_col] = value

# Split back into train / test by the saved id indexes and persist.
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [15]:
# Dataset 30 = dataset 28 + a table-driven risk score (cholesterol mapping v2).
# NOTE(review): the table looks like a SCORE-style cardiovascular risk chart
# (sex x smoking x age band x systolic pressure x cholesterol) - confirm source.
DATA_ID = 30
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(28), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(28), index_col='id')
all = pd.concat([train, test])

# The score lives in ONE column: it is created with the default 0 and filled
# in below, so rows that match no table cell keep 0. (The original created
# this column but wrote the values into a second column, 'score_scale_val',
# leaving this one constant and the other one NaN for unmatched rows.)
score_col = 'score_scale_val_v2'
all[score_col] = 0

# Map the cholesterol levels of the data onto the five cholesterol columns
# (c4..c8) of the table. Several columns share a level on purpose: columns are
# applied in the order listed in the loop below, so the LAST column for a
# level is the one that sticks (v2: level 1 -> c5, level 2 -> c7, level 3 -> c8).
chol4 = all['cholesterol_all'] == 1
chol5 = all['cholesterol_all'] == 1
chol6 = all['cholesterol_all'] == 2
chol7 = all['cholesterol_all'] == 2
chol8 = all['cholesterol_all'] == 3

male = all['gender_male'] == 1
female = all['gender_male'] == 0
smoke = all['smoke'] == 1
not_smoke = all['smoke'] == 0

# Age bands in years; 'age' is divided by 365.25, i.e. presumably stored in days.
age_years = all['age'] / 365.25
age40 = (age_years >= 40) & (age_years < 45)
age45 = (age_years >= 45) & (age_years < 55)
age55 = (age_years >= 55) & (age_years < 60)
age60 = (age_years >= 60) & (age_years < 65)
age65 = (age_years >= 65)

# Systolic pressure bands. The 160 band runs up to 180 so that 170-179 is not
# left without a band (the original upper bound of 170 skipped that range).
ap120 = (all['ap_hi'] >= 120) & (all['ap_hi'] < 140)
ap140 = (all['ap_hi'] >= 140) & (all['ap_hi'] < 160)
ap160 = (all['ap_hi'] >= 160) & (all['ap_hi'] < 180)
ap180 = (all['ap_hi'] >= 180)

# Each row: [gender mask, smoking mask, age mask, pressure mask, c4, c5, c6, c7, c8]
data = [
    [female, not_smoke, age65, ap180, 7, 8, 9, 10, 12],
    [female, not_smoke, age65, ap160, 5, 5, 6, 7, 8],
    [female, not_smoke, age65, ap140, 3, 3, 4, 5, 6],
    [female, not_smoke, age65, ap120, 2, 2, 3, 3, 4],
    [female, not_smoke, age60, ap180, 4, 4, 5, 6, 7],
    [female, not_smoke, age60, ap160, 3, 3, 3, 4, 5],
    [female, not_smoke, age60, ap140, 2, 2, 2, 3, 3],
    [female, not_smoke, age60, ap120, 1, 1, 2, 2, 2],
    [female, not_smoke, age55, ap180, 2, 2, 3, 3, 4],
    [female, not_smoke, age55, ap160, 1, 2, 2, 2, 3],
    [female, not_smoke, age55, ap140, 1, 1, 1, 1, 2],
    [female, not_smoke, age55, ap120, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap180, 1, 1, 1, 2, 2],
    [female, not_smoke, age45, ap160, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap140, 0, 1, 1, 1, 1],
    [female, not_smoke, age45, ap120, 0, 0, 1, 1, 1],
    # female / not_smoke / age40: no values entered (ap180, ap160, ap140, ap120)
    # -> those rows keep the default 0.
    #######################################################
    [female, smoke, age65, ap180, 13, 15, 17, 19, 22],
    [female, smoke, age65, ap160, 9, 10, 12, 13, 16],
    [female, smoke, age65, ap140, 6, 7, 8, 9, 11],
    [female, smoke, age65, ap120, 4, 5, 5, 6, 7],
    [female, smoke, age60, ap180, 8, 9, 10, 11, 13],
    [female, smoke, age60, ap160, 5, 6, 7, 8, 9],
    [female, smoke, age60, ap140, 3, 4, 5, 5, 6],
    [female, smoke, age60, ap120, 2, 3, 3, 4, 4],
    [female, smoke, age55, ap180, 4, 5, 5, 6, 7],
    [female, smoke, age55, ap160, 3, 3, 4, 4, 5],
    [female, smoke, age55, ap140, 2, 2, 2, 3, 3],
    [female, smoke, age55, ap120, 1, 1, 2, 2, 2],
    [female, smoke, age45, ap180, 2, 2, 3, 3, 4],
    [female, smoke, age45, ap160, 1, 2, 2, 2, 3],
    [female, smoke, age45, ap140, 1, 1, 1, 1, 2],
    [female, smoke, age45, ap120, 1, 1, 1, 1, 1],
    [female, smoke, age40, ap180, 0, 0, 0, 1, 1],
    # female / smoke / age40: no values entered for ap160, ap140, ap120
    # -> those rows keep the default 0.
    #######################################################
    [male, not_smoke, age65, ap180, 14, 16, 19, 22, 26],
    [male, not_smoke, age65, ap160, 9, 11, 13, 15, 16],
    [male, not_smoke, age65, ap140, 6, 8, 9, 11, 13],
    [male, not_smoke, age65, ap120, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap180, 9, 11, 13, 15, 18],
    [male, not_smoke, age60, ap160, 6, 7, 9, 10, 12],
    [male, not_smoke, age60, ap140, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap120, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap180, 6, 7, 8, 10, 12],
    [male, not_smoke, age55, ap160, 4, 5, 6, 7, 8],
    [male, not_smoke, age55, ap140, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap120, 2, 2, 3, 3, 4],
    [male, not_smoke, age45, ap180, 4, 4, 5, 6, 7],
    [male, not_smoke, age45, ap160, 2, 3, 3, 4, 5],
    [male, not_smoke, age45, ap140, 2, 2, 2, 3, 3],
    [male, not_smoke, age45, ap120, 1, 1, 2, 2, 2],
    [male, not_smoke, age40, ap180, 1, 1, 1, 2, 2],
    [male, not_smoke, age40, ap160, 1, 1, 1, 1, 1],
    [male, not_smoke, age40, ap140, 0, 1, 1, 1, 1],
    [male, not_smoke, age40, ap120, 0, 0, 1, 1, 1],
    #######################################################
    [male, smoke, age65, ap180, 26, 30, 35, 41, 47],
    [male, smoke, age65, ap160, 18, 21, 25, 29, 34],
    [male, smoke, age65, ap140, 13, 15, 17, 20, 24],
    [male, smoke, age65, ap120, 9, 10, 12, 14, 17],
    [male, smoke, age60, ap180, 18, 21, 24, 28, 33],
    [male, smoke, age60, ap160, 12, 14, 17, 20, 24],
    [male, smoke, age60, ap140, 8, 10, 12, 14, 17],
    [male, smoke, age60, ap120, 6, 7, 8, 10, 12],
    [male, smoke, age55, ap180, 12, 13, 16, 19, 22],
    [male, smoke, age55, ap160, 8, 9, 11, 13, 16],
    [male, smoke, age55, ap140, 5, 6, 8, 9, 11],
    [male, smoke, age55, ap120, 4, 4, 5, 6, 8],
    [male, smoke, age45, ap180, 7, 8, 10, 12, 14],
    [male, smoke, age45, ap160, 5, 6, 7, 8, 10],
    [male, smoke, age45, ap140, 3, 4, 5, 6, 7],
    [male, smoke, age45, ap120, 2, 3, 3, 4, 5],
    [male, smoke, age40, ap180, 2, 2, 3, 3, 4],
    [male, smoke, age40, ap160, 1, 2, 2, 2, 3],
    [male, smoke, age40, ap140, 1, 1, 1, 2, 2],
    [male, smoke, age40, ap120, 1, 1, 1, 1, 1],
]

for gender, sm, age, aphi, c4, c5, c6, c7, c8 in data:
    # The age mask is part of the cell: without it every age band of a
    # (gender, smoking, pressure) group overwrote the previous one and the
    # score did not depend on age at all.
    cell = gender & sm & age & aphi
    # `.loc` replaces `.ix`, which was removed in pandas 1.0.
    for chol, value in ((chol4, c4), (chol5, c5), (chol6, c6), (chol7, c7), (chol8, c8)):
        all.loc[cell & chol, score_col] = value

# Split back into train / test by the saved id indexes and persist.
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [17]:
# Dataset 31 = dataset 28 + a table-driven risk score (cholesterol mapping v3).
# NOTE(review): the table looks like a SCORE-style cardiovascular risk chart
# (sex x smoking x age band x systolic pressure x cholesterol) - confirm source.
DATA_ID = 31
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(28), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(28), index_col='id')
all = pd.concat([train, test])

# The score lives in ONE column: it is created with the default 0 and filled
# in below, so rows that match no table cell keep 0. (The original created
# this column but wrote the values into a second column, 'score_scale_val',
# leaving this one constant and the other one NaN for unmatched rows.)
score_col = 'score_scale_val_v3'
all[score_col] = 0

# Map the cholesterol levels of the data onto the five cholesterol columns
# (c4..c8) of the table. Several columns share a level on purpose: columns are
# applied in the order listed in the loop below, so the LAST column for a
# level is the one that sticks (v3: level 1 -> c5, level 2 -> c6, level 3 -> c8).
chol4 = all['cholesterol_all'] == 1
chol5 = all['cholesterol_all'] == 1
chol6 = all['cholesterol_all'] == 2
chol7 = all['cholesterol_all'] == 3
chol8 = all['cholesterol_all'] == 3

male = all['gender_male'] == 1
female = all['gender_male'] == 0
smoke = all['smoke'] == 1
not_smoke = all['smoke'] == 0

# Age bands in years; 'age' is divided by 365.25, i.e. presumably stored in days.
age_years = all['age'] / 365.25
age40 = (age_years >= 40) & (age_years < 45)
age45 = (age_years >= 45) & (age_years < 55)
age55 = (age_years >= 55) & (age_years < 60)
age60 = (age_years >= 60) & (age_years < 65)
age65 = (age_years >= 65)

# Systolic pressure bands. The 160 band runs up to 180 so that 170-179 is not
# left without a band (the original upper bound of 170 skipped that range).
ap120 = (all['ap_hi'] >= 120) & (all['ap_hi'] < 140)
ap140 = (all['ap_hi'] >= 140) & (all['ap_hi'] < 160)
ap160 = (all['ap_hi'] >= 160) & (all['ap_hi'] < 180)
ap180 = (all['ap_hi'] >= 180)

# Each row: [gender mask, smoking mask, age mask, pressure mask, c4, c5, c6, c7, c8]
data = [
    [female, not_smoke, age65, ap180, 7, 8, 9, 10, 12],
    [female, not_smoke, age65, ap160, 5, 5, 6, 7, 8],
    [female, not_smoke, age65, ap140, 3, 3, 4, 5, 6],
    [female, not_smoke, age65, ap120, 2, 2, 3, 3, 4],
    [female, not_smoke, age60, ap180, 4, 4, 5, 6, 7],
    [female, not_smoke, age60, ap160, 3, 3, 3, 4, 5],
    [female, not_smoke, age60, ap140, 2, 2, 2, 3, 3],
    [female, not_smoke, age60, ap120, 1, 1, 2, 2, 2],
    [female, not_smoke, age55, ap180, 2, 2, 3, 3, 4],
    [female, not_smoke, age55, ap160, 1, 2, 2, 2, 3],
    [female, not_smoke, age55, ap140, 1, 1, 1, 1, 2],
    [female, not_smoke, age55, ap120, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap180, 1, 1, 1, 2, 2],
    [female, not_smoke, age45, ap160, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap140, 0, 1, 1, 1, 1],
    [female, not_smoke, age45, ap120, 0, 0, 1, 1, 1],
    # female / not_smoke / age40: no values entered (ap180, ap160, ap140, ap120)
    # -> those rows keep the default 0.
    #######################################################
    [female, smoke, age65, ap180, 13, 15, 17, 19, 22],
    [female, smoke, age65, ap160, 9, 10, 12, 13, 16],
    [female, smoke, age65, ap140, 6, 7, 8, 9, 11],
    [female, smoke, age65, ap120, 4, 5, 5, 6, 7],
    [female, smoke, age60, ap180, 8, 9, 10, 11, 13],
    [female, smoke, age60, ap160, 5, 6, 7, 8, 9],
    [female, smoke, age60, ap140, 3, 4, 5, 5, 6],
    [female, smoke, age60, ap120, 2, 3, 3, 4, 4],
    [female, smoke, age55, ap180, 4, 5, 5, 6, 7],
    [female, smoke, age55, ap160, 3, 3, 4, 4, 5],
    [female, smoke, age55, ap140, 2, 2, 2, 3, 3],
    [female, smoke, age55, ap120, 1, 1, 2, 2, 2],
    [female, smoke, age45, ap180, 2, 2, 3, 3, 4],
    [female, smoke, age45, ap160, 1, 2, 2, 2, 3],
    [female, smoke, age45, ap140, 1, 1, 1, 1, 2],
    [female, smoke, age45, ap120, 1, 1, 1, 1, 1],
    [female, smoke, age40, ap180, 0, 0, 0, 1, 1],
    # female / smoke / age40: no values entered for ap160, ap140, ap120
    # -> those rows keep the default 0.
    #######################################################
    [male, not_smoke, age65, ap180, 14, 16, 19, 22, 26],
    [male, not_smoke, age65, ap160, 9, 11, 13, 15, 16],
    [male, not_smoke, age65, ap140, 6, 8, 9, 11, 13],
    [male, not_smoke, age65, ap120, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap180, 9, 11, 13, 15, 18],
    [male, not_smoke, age60, ap160, 6, 7, 9, 10, 12],
    [male, not_smoke, age60, ap140, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap120, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap180, 6, 7, 8, 10, 12],
    [male, not_smoke, age55, ap160, 4, 5, 6, 7, 8],
    [male, not_smoke, age55, ap140, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap120, 2, 2, 3, 3, 4],
    [male, not_smoke, age45, ap180, 4, 4, 5, 6, 7],
    [male, not_smoke, age45, ap160, 2, 3, 3, 4, 5],
    [male, not_smoke, age45, ap140, 2, 2, 2, 3, 3],
    [male, not_smoke, age45, ap120, 1, 1, 2, 2, 2],
    [male, not_smoke, age40, ap180, 1, 1, 1, 2, 2],
    [male, not_smoke, age40, ap160, 1, 1, 1, 1, 1],
    [male, not_smoke, age40, ap140, 0, 1, 1, 1, 1],
    [male, not_smoke, age40, ap120, 0, 0, 1, 1, 1],
    #######################################################
    [male, smoke, age65, ap180, 26, 30, 35, 41, 47],
    [male, smoke, age65, ap160, 18, 21, 25, 29, 34],
    [male, smoke, age65, ap140, 13, 15, 17, 20, 24],
    [male, smoke, age65, ap120, 9, 10, 12, 14, 17],
    [male, smoke, age60, ap180, 18, 21, 24, 28, 33],
    [male, smoke, age60, ap160, 12, 14, 17, 20, 24],
    [male, smoke, age60, ap140, 8, 10, 12, 14, 17],
    [male, smoke, age60, ap120, 6, 7, 8, 10, 12],
    [male, smoke, age55, ap180, 12, 13, 16, 19, 22],
    [male, smoke, age55, ap160, 8, 9, 11, 13, 16],
    [male, smoke, age55, ap140, 5, 6, 8, 9, 11],
    [male, smoke, age55, ap120, 4, 4, 5, 6, 8],
    [male, smoke, age45, ap180, 7, 8, 10, 12, 14],
    [male, smoke, age45, ap160, 5, 6, 7, 8, 10],
    [male, smoke, age45, ap140, 3, 4, 5, 6, 7],
    [male, smoke, age45, ap120, 2, 3, 3, 4, 5],
    [male, smoke, age40, ap180, 2, 2, 3, 3, 4],
    [male, smoke, age40, ap160, 1, 2, 2, 2, 3],
    [male, smoke, age40, ap140, 1, 1, 1, 2, 2],
    [male, smoke, age40, ap120, 1, 1, 1, 1, 1],
]

for gender, sm, age, aphi, c4, c5, c6, c7, c8 in data:
    # The age mask is part of the cell: without it every age band of a
    # (gender, smoking, pressure) group overwrote the previous one and the
    # score did not depend on age at all.
    cell = gender & sm & age & aphi
    # `.loc` replaces `.ix`, which was removed in pandas 1.0.
    for chol, value in ((chol4, c4), (chol5, c5), (chol6, c6), (chol7, c7), (chol8, c8)):
        all.loc[cell & chol, score_col] = value

# Split back into train / test by the saved id indexes and persist.
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [41]:
# Dataset 32 = dataset 28 + a hand-made multiplicative score.
DATA_ID = 32
train, test = (
    pd.read_csv(file_mask.format(28), index_col='id')
    for file_mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])

# The 0/1 flags are shifted to 1/2 so that a zero flag does not wipe out the
# whole product.
sex_factor = all['gender_male'] + 1
smoke_factor = all['smoke'] + 1
all['score_scale_val_manual'] = (
    all['cholesterol_all'] * sex_factor * smoke_factor * all['age'] * all['ap_hi']
)

# Split back into train / test by the saved id indexes and persist.
for ids, file_mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                       (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[ids].to_csv(file_mask.format(DATA_ID))
In [44]:
# Dataset 35 = dataset 28 + a table-driven risk score
# (cholesterol mapping: level 1 vs. levels 2 and 3 merged).
# NOTE(review): the table looks like a SCORE-style cardiovascular risk chart
# (sex x smoking x age band x systolic pressure x cholesterol) - confirm source.
DATA_ID = 35
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(28), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(28), index_col='id')
all = pd.concat([train, test])

# The score lives in ONE column: it is created with the default 0 and filled
# in below, so rows that match no table cell keep 0. (The original created
# this column but wrote the values into a second column, 'score_scale_val',
# leaving this one constant and the other one NaN for unmatched rows.)
score_col = 'score_scale_val_v3'
all[score_col] = 0

# Map the cholesterol levels of the data onto the five cholesterol columns
# (c4..c8) of the table. Several columns share a level on purpose: columns are
# applied in the order listed in the loop below, so the LAST column for a
# level is the one that sticks (here: level 1 -> c7, levels 2 and 3 -> c8).
chol4 = all['cholesterol_all'] == 1
chol5 = all['cholesterol_all'] == 1
chol6 = all['cholesterol_all'] == 1
chol7 = all['cholesterol_all'] == 1
chol8 = (all['cholesterol_all'] == 2) | (all['cholesterol_all'] == 3)

male = all['gender_male'] == 1
female = all['gender_male'] == 0
smoke = all['smoke'] == 1
not_smoke = all['smoke'] == 0

# Age bands in years; 'age' is divided by 365.25, i.e. presumably stored in days.
age_years = all['age'] / 365.25
age40 = (age_years >= 40) & (age_years < 45)
age45 = (age_years >= 45) & (age_years < 55)
age55 = (age_years >= 55) & (age_years < 60)
age60 = (age_years >= 60) & (age_years < 65)
age65 = (age_years >= 65)

# Systolic pressure bands. The 160 band runs up to 180 so that 170-179 is not
# left without a band (the original upper bound of 170 skipped that range).
ap120 = (all['ap_hi'] >= 120) & (all['ap_hi'] < 140)
ap140 = (all['ap_hi'] >= 140) & (all['ap_hi'] < 160)
ap160 = (all['ap_hi'] >= 160) & (all['ap_hi'] < 180)
ap180 = (all['ap_hi'] >= 180)

# Each row: [gender mask, smoking mask, age mask, pressure mask, c4, c5, c6, c7, c8]
data = [
    [female, not_smoke, age65, ap180, 7, 8, 9, 10, 12],
    [female, not_smoke, age65, ap160, 5, 5, 6, 7, 8],
    [female, not_smoke, age65, ap140, 3, 3, 4, 5, 6],
    [female, not_smoke, age65, ap120, 2, 2, 3, 3, 4],
    [female, not_smoke, age60, ap180, 4, 4, 5, 6, 7],
    [female, not_smoke, age60, ap160, 3, 3, 3, 4, 5],
    [female, not_smoke, age60, ap140, 2, 2, 2, 3, 3],
    [female, not_smoke, age60, ap120, 1, 1, 2, 2, 2],
    [female, not_smoke, age55, ap180, 2, 2, 3, 3, 4],
    [female, not_smoke, age55, ap160, 1, 2, 2, 2, 3],
    [female, not_smoke, age55, ap140, 1, 1, 1, 1, 2],
    [female, not_smoke, age55, ap120, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap180, 1, 1, 1, 2, 2],
    [female, not_smoke, age45, ap160, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap140, 0, 1, 1, 1, 1],
    [female, not_smoke, age45, ap120, 0, 0, 1, 1, 1],
    # female / not_smoke / age40: no values entered (ap180, ap160, ap140, ap120)
    # -> those rows keep the default 0.
    #######################################################
    [female, smoke, age65, ap180, 13, 15, 17, 19, 22],
    [female, smoke, age65, ap160, 9, 10, 12, 13, 16],
    [female, smoke, age65, ap140, 6, 7, 8, 9, 11],
    [female, smoke, age65, ap120, 4, 5, 5, 6, 7],
    [female, smoke, age60, ap180, 8, 9, 10, 11, 13],
    [female, smoke, age60, ap160, 5, 6, 7, 8, 9],
    [female, smoke, age60, ap140, 3, 4, 5, 5, 6],
    [female, smoke, age60, ap120, 2, 3, 3, 4, 4],
    [female, smoke, age55, ap180, 4, 5, 5, 6, 7],
    [female, smoke, age55, ap160, 3, 3, 4, 4, 5],
    [female, smoke, age55, ap140, 2, 2, 2, 3, 3],
    [female, smoke, age55, ap120, 1, 1, 2, 2, 2],
    [female, smoke, age45, ap180, 2, 2, 3, 3, 4],
    [female, smoke, age45, ap160, 1, 2, 2, 2, 3],
    [female, smoke, age45, ap140, 1, 1, 1, 1, 2],
    [female, smoke, age45, ap120, 1, 1, 1, 1, 1],
    [female, smoke, age40, ap180, 0, 0, 0, 1, 1],
    # female / smoke / age40: no values entered for ap160, ap140, ap120
    # -> those rows keep the default 0.
    #######################################################
    [male, not_smoke, age65, ap180, 14, 16, 19, 22, 26],
    [male, not_smoke, age65, ap160, 9, 11, 13, 15, 16],
    [male, not_smoke, age65, ap140, 6, 8, 9, 11, 13],
    [male, not_smoke, age65, ap120, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap180, 9, 11, 13, 15, 18],
    [male, not_smoke, age60, ap160, 6, 7, 9, 10, 12],
    [male, not_smoke, age60, ap140, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap120, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap180, 6, 7, 8, 10, 12],
    [male, not_smoke, age55, ap160, 4, 5, 6, 7, 8],
    [male, not_smoke, age55, ap140, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap120, 2, 2, 3, 3, 4],
    [male, not_smoke, age45, ap180, 4, 4, 5, 6, 7],
    [male, not_smoke, age45, ap160, 2, 3, 3, 4, 5],
    [male, not_smoke, age45, ap140, 2, 2, 2, 3, 3],
    [male, not_smoke, age45, ap120, 1, 1, 2, 2, 2],
    [male, not_smoke, age40, ap180, 1, 1, 1, 2, 2],
    [male, not_smoke, age40, ap160, 1, 1, 1, 1, 1],
    [male, not_smoke, age40, ap140, 0, 1, 1, 1, 1],
    [male, not_smoke, age40, ap120, 0, 0, 1, 1, 1],
    #######################################################
    [male, smoke, age65, ap180, 26, 30, 35, 41, 47],
    [male, smoke, age65, ap160, 18, 21, 25, 29, 34],
    [male, smoke, age65, ap140, 13, 15, 17, 20, 24],
    [male, smoke, age65, ap120, 9, 10, 12, 14, 17],
    [male, smoke, age60, ap180, 18, 21, 24, 28, 33],
    [male, smoke, age60, ap160, 12, 14, 17, 20, 24],
    [male, smoke, age60, ap140, 8, 10, 12, 14, 17],
    [male, smoke, age60, ap120, 6, 7, 8, 10, 12],
    [male, smoke, age55, ap180, 12, 13, 16, 19, 22],
    [male, smoke, age55, ap160, 8, 9, 11, 13, 16],
    [male, smoke, age55, ap140, 5, 6, 8, 9, 11],
    [male, smoke, age55, ap120, 4, 4, 5, 6, 8],
    [male, smoke, age45, ap180, 7, 8, 10, 12, 14],
    [male, smoke, age45, ap160, 5, 6, 7, 8, 10],
    [male, smoke, age45, ap140, 3, 4, 5, 6, 7],
    [male, smoke, age45, ap120, 2, 3, 3, 4, 5],
    [male, smoke, age40, ap180, 2, 2, 3, 3, 4],
    [male, smoke, age40, ap160, 1, 2, 2, 2, 3],
    [male, smoke, age40, ap140, 1, 1, 1, 2, 2],
    [male, smoke, age40, ap120, 1, 1, 1, 1, 1],
]

for gender, sm, age, aphi, c4, c5, c6, c7, c8 in data:
    # The age mask is part of the cell: without it every age band of a
    # (gender, smoking, pressure) group overwrote the previous one and the
    # score did not depend on age at all.
    cell = gender & sm & age & aphi
    # `.loc` replaces `.ix`, which was removed in pandas 1.0.
    for chol, value in ((chol4, c4), (chol5, c5), (chol6, c6), (chol7, c7), (chol8, c8)):
        all.loc[cell & chol, score_col] = value

# Split back into train / test by the saved id indexes and persist.
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [42]:
# Dataset 33 = dataset 28 + age bucketed into 5-year bins.
DATA_ID = 33
train, test = (
    pd.read_csv(file_mask.format(28), index_col='id')
    for file_mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])

# Floor-divide age by the length of five years of 365.25 days each
# (presumably 'age' is stored in days).
five_years = 365.25 * 5
all['age_5year'] = all['age'].floordiv(five_years)

# Split back into train / test by the saved id indexes and persist.
for ids, file_mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                       (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[ids].to_csv(file_mask.format(DATA_ID))
In [43]:
# Dataset 34 = dataset 28 + position of the age within the current year.
DATA_ID = 34
train, test = (
    pd.read_csv(file_mask.format(28), index_col='id')
    for file_mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])

# Remainder of age modulo one 365.25-day year. NOTE(review): despite the column
# name this is the part of a year already elapsed, not the time remaining until
# the next birthday (presumably 'age' is stored in days) - confirm intent.
year_length = 365.25
all['time_to_birthday'] = all['age'].mod(year_length)

# Split back into train / test by the saved id indexes and persist.
for ids, file_mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                       (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[ids].to_csv(file_mask.format(DATA_ID))
In [ ]:
In [ ]:
In [ ]:
In [61]:
# Dataset 36 = dataset 28 + a table-driven risk score
# (cholesterol columns applied in reverse order, c8 first).
# NOTE(review): the table looks like a SCORE-style cardiovascular risk chart
# (sex x smoking x age band x systolic pressure x cholesterol) - confirm source.
DATA_ID = 36
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(28), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(28), index_col='id')
all = pd.concat([train, test])

# The score lives in ONE column: it is created with the default 0 and filled
# in below, so rows that match no table cell keep 0. (The original created
# this column but wrote the values into a second column, 'score_scale_val',
# leaving this one constant and the other one NaN for unmatched rows.)
score_col = 'score_scale_val_v3'
all[score_col] = 0

# Map the cholesterol levels of the data onto the five cholesterol columns
# (c4..c8) of the table. Several columns share a level on purpose: columns are
# applied from c8 down to c4 in the loop below, so the LAST column applied for
# a level is the one that sticks (here: level 1 -> c4, level 2 -> c7,
# level 3 -> c8).
chol4 = all['cholesterol_all'] == 1
chol5 = all['cholesterol_all'] == 1
chol6 = all['cholesterol_all'] == 1
chol7 = all['cholesterol_all'] == 2
chol8 = (all['cholesterol_all'] == 3)

male = all['gender_male'] == 1
female = all['gender_male'] == 0
smoke = all['smoke'] == 1
not_smoke = all['smoke'] == 0

# Age bands in years; 'age' is divided by 365.25, i.e. presumably stored in days.
age_years = all['age'] / 365.25
age40 = (age_years >= 40) & (age_years < 45)
age45 = (age_years >= 45) & (age_years < 55)
age55 = (age_years >= 55) & (age_years < 60)
age60 = (age_years >= 60) & (age_years < 65)
age65 = (age_years >= 65)

# Systolic pressure bands. The 160 band runs up to 180 so that 170-179 is not
# left without a band (the original upper bound of 170 skipped that range).
ap120 = (all['ap_hi'] >= 120) & (all['ap_hi'] < 140)
ap140 = (all['ap_hi'] >= 140) & (all['ap_hi'] < 160)
ap160 = (all['ap_hi'] >= 160) & (all['ap_hi'] < 180)
ap180 = (all['ap_hi'] >= 180)

# Each row: [gender mask, smoking mask, age mask, pressure mask, c4, c5, c6, c7, c8]
data = [
    [female, not_smoke, age65, ap180, 7, 8, 9, 10, 12],
    [female, not_smoke, age65, ap160, 5, 5, 6, 7, 8],
    [female, not_smoke, age65, ap140, 3, 3, 4, 5, 6],
    [female, not_smoke, age65, ap120, 2, 2, 3, 3, 4],
    [female, not_smoke, age60, ap180, 4, 4, 5, 6, 7],
    [female, not_smoke, age60, ap160, 3, 3, 3, 4, 5],
    [female, not_smoke, age60, ap140, 2, 2, 2, 3, 3],
    [female, not_smoke, age60, ap120, 1, 1, 2, 2, 2],
    [female, not_smoke, age55, ap180, 2, 2, 3, 3, 4],
    [female, not_smoke, age55, ap160, 1, 2, 2, 2, 3],
    [female, not_smoke, age55, ap140, 1, 1, 1, 1, 2],
    [female, not_smoke, age55, ap120, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap180, 1, 1, 1, 2, 2],
    [female, not_smoke, age45, ap160, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap140, 0, 1, 1, 1, 1],
    [female, not_smoke, age45, ap120, 0, 0, 1, 1, 1],
    # female / not_smoke / age40: no values entered (ap180, ap160, ap140, ap120)
    # -> those rows keep the default 0.
    #######################################################
    [female, smoke, age65, ap180, 13, 15, 17, 19, 22],
    [female, smoke, age65, ap160, 9, 10, 12, 13, 16],
    [female, smoke, age65, ap140, 6, 7, 8, 9, 11],
    [female, smoke, age65, ap120, 4, 5, 5, 6, 7],
    [female, smoke, age60, ap180, 8, 9, 10, 11, 13],
    [female, smoke, age60, ap160, 5, 6, 7, 8, 9],
    [female, smoke, age60, ap140, 3, 4, 5, 5, 6],
    [female, smoke, age60, ap120, 2, 3, 3, 4, 4],
    [female, smoke, age55, ap180, 4, 5, 5, 6, 7],
    [female, smoke, age55, ap160, 3, 3, 4, 4, 5],
    [female, smoke, age55, ap140, 2, 2, 2, 3, 3],
    [female, smoke, age55, ap120, 1, 1, 2, 2, 2],
    [female, smoke, age45, ap180, 2, 2, 3, 3, 4],
    [female, smoke, age45, ap160, 1, 2, 2, 2, 3],
    [female, smoke, age45, ap140, 1, 1, 1, 1, 2],
    [female, smoke, age45, ap120, 1, 1, 1, 1, 1],
    [female, smoke, age40, ap180, 0, 0, 0, 1, 1],
    # female / smoke / age40: no values entered for ap160, ap140, ap120
    # -> those rows keep the default 0.
    #######################################################
    [male, not_smoke, age65, ap180, 14, 16, 19, 22, 26],
    [male, not_smoke, age65, ap160, 9, 11, 13, 15, 16],
    [male, not_smoke, age65, ap140, 6, 8, 9, 11, 13],
    [male, not_smoke, age65, ap120, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap180, 9, 11, 13, 15, 18],
    [male, not_smoke, age60, ap160, 6, 7, 9, 10, 12],
    [male, not_smoke, age60, ap140, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap120, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap180, 6, 7, 8, 10, 12],
    [male, not_smoke, age55, ap160, 4, 5, 6, 7, 8],
    [male, not_smoke, age55, ap140, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap120, 2, 2, 3, 3, 4],
    [male, not_smoke, age45, ap180, 4, 4, 5, 6, 7],
    [male, not_smoke, age45, ap160, 2, 3, 3, 4, 5],
    [male, not_smoke, age45, ap140, 2, 2, 2, 3, 3],
    [male, not_smoke, age45, ap120, 1, 1, 2, 2, 2],
    [male, not_smoke, age40, ap180, 1, 1, 1, 2, 2],
    [male, not_smoke, age40, ap160, 1, 1, 1, 1, 1],
    [male, not_smoke, age40, ap140, 0, 1, 1, 1, 1],
    [male, not_smoke, age40, ap120, 0, 0, 1, 1, 1],
    #######################################################
    [male, smoke, age65, ap180, 26, 30, 35, 41, 47],
    [male, smoke, age65, ap160, 18, 21, 25, 29, 34],
    [male, smoke, age65, ap140, 13, 15, 17, 20, 24],
    [male, smoke, age65, ap120, 9, 10, 12, 14, 17],
    [male, smoke, age60, ap180, 18, 21, 24, 28, 33],
    [male, smoke, age60, ap160, 12, 14, 17, 20, 24],
    [male, smoke, age60, ap140, 8, 10, 12, 14, 17],
    [male, smoke, age60, ap120, 6, 7, 8, 10, 12],
    [male, smoke, age55, ap180, 12, 13, 16, 19, 22],
    [male, smoke, age55, ap160, 8, 9, 11, 13, 16],
    [male, smoke, age55, ap140, 5, 6, 8, 9, 11],
    [male, smoke, age55, ap120, 4, 4, 5, 6, 8],
    [male, smoke, age45, ap180, 7, 8, 10, 12, 14],
    [male, smoke, age45, ap160, 5, 6, 7, 8, 10],
    [male, smoke, age45, ap140, 3, 4, 5, 6, 7],
    [male, smoke, age45, ap120, 2, 3, 3, 4, 5],
    [male, smoke, age40, ap180, 2, 2, 3, 3, 4],
    [male, smoke, age40, ap160, 1, 2, 2, 2, 3],
    [male, smoke, age40, ap140, 1, 1, 1, 2, 2],
    [male, smoke, age40, ap120, 1, 1, 1, 1, 1],
]

for gender, sm, age, aphi, c4, c5, c6, c7, c8 in data:
    # The age mask is part of the cell: without it every age band of a
    # (gender, smoking, pressure) group overwrote the previous one and the
    # score did not depend on age at all.
    cell = gender & sm & age & aphi
    # `.loc` replaces `.ix`, which was removed in pandas 1.0.
    # Reverse order (c8 -> c4) is deliberate for this dataset, see above.
    for chol, value in ((chol8, c8), (chol7, c7), (chol6, c6), (chol5, c5), (chol4, c4)):
        all.loc[cell & chol, score_col] = value

# Split back into train / test by the saved id indexes and persist.
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [46]:
# Dataset 37: dataset 28 plus a table-driven risk score stored in `score_scale_val_v3`.
# NOTE(review): the table looks like a SCORE-style cardiovascular risk chart - confirm the source.
DATA_ID = 37
SCORE_COL = 'score_scale_val_v3'
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(28), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(28), index_col='id')
all = pd.concat([train, test])
all[SCORE_COL] = 0  # rows matched by no table entry keep a score of 0

# Cholesterol masks for the five score columns (c4..c8) of the table.
# In this variant category 1 feeds c4..c7 and categories 2 and 3 both feed c8.
chol4 = all['cholesterol_all'] == 1
chol5 = all['cholesterol_all'] == 1
chol6 = all['cholesterol_all'] == 1
chol7 = all['cholesterol_all'] == 1
chol8 = (all['cholesterol_all'] == 2) | (all['cholesterol_all'] == 3)

male = all['gender_male'] == 1
female = all['gender_male'] == 0
smoke = all['smoke'] == 1
not_smoke = all['smoke'] == 0

# `age` is divided by 365.25 to get years before banding.
age_years = all['age'] / 365.25
age40 = (age_years >= 40) & (age_years < 45)
age45 = (age_years >= 45) & (age_years < 55)
age55 = (age_years >= 55) & (age_years < 60)
age60 = (age_years >= 60) & (age_years < 65)
age65 = (age_years >= 65)

# Systolic pressure bands; the 160 band runs up to the start of the 180 band
# so that readings of 170-179 are not left without a score.
ap120 = (all['ap_hi'] >= 120) & (all['ap_hi'] < 140)
ap140 = (all['ap_hi'] >= 140) & (all['ap_hi'] < 160)
ap160 = (all['ap_hi'] >= 160) & (all['ap_hi'] < 180)
ap180 = (all['ap_hi'] >= 180)

# Each row: [gender mask, smoking mask, age mask, pressure mask, c4, c5, c6, c7, c8].
data = [
    [female, not_smoke, age65, ap180, 7, 8, 9, 10, 12],
    [female, not_smoke, age65, ap160, 5, 5, 6, 7, 8],
    [female, not_smoke, age65, ap140, 3, 3, 4, 5, 6],
    [female, not_smoke, age65, ap120, 2, 2, 3, 3, 4],
    [female, not_smoke, age60, ap180, 4, 4, 5, 6, 7],
    [female, not_smoke, age60, ap160, 3, 3, 3, 4, 5],
    [female, not_smoke, age60, ap140, 2, 2, 2, 3, 3],
    [female, not_smoke, age60, ap120, 1, 1, 2, 2, 2],
    [female, not_smoke, age55, ap180, 2, 2, 3, 3, 4],
    [female, not_smoke, age55, ap160, 1, 2, 2, 2, 3],
    [female, not_smoke, age55, ap140, 1, 1, 1, 1, 2],
    [female, not_smoke, age55, ap120, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap180, 1, 1, 1, 2, 2],
    [female, not_smoke, age45, ap160, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap140, 0, 1, 1, 1, 1],
    [female, not_smoke, age45, ap120, 0, 0, 1, 1, 1],
    # female / not_smoke / age40: no scores entered, rows keep the default
    #######################################################
    [female, smoke, age65, ap180, 13, 15, 17, 19, 22],
    [female, smoke, age65, ap160, 9, 10, 12, 13, 16],
    [female, smoke, age65, ap140, 6, 7, 8, 9, 11],
    [female, smoke, age65, ap120, 4, 5, 5, 6, 7],
    [female, smoke, age60, ap180, 8, 9, 10, 11, 13],
    [female, smoke, age60, ap160, 5, 6, 7, 8, 9],
    [female, smoke, age60, ap140, 3, 4, 5, 5, 6],
    [female, smoke, age60, ap120, 2, 3, 3, 4, 4],
    [female, smoke, age55, ap180, 4, 5, 5, 6, 7],
    [female, smoke, age55, ap160, 3, 3, 4, 4, 5],
    [female, smoke, age55, ap140, 2, 2, 2, 3, 3],
    [female, smoke, age55, ap120, 1, 1, 2, 2, 2],
    [female, smoke, age45, ap180, 2, 2, 3, 3, 4],
    [female, smoke, age45, ap160, 1, 2, 2, 2, 3],
    [female, smoke, age45, ap140, 1, 1, 1, 1, 2],
    [female, smoke, age45, ap120, 1, 1, 1, 1, 1],
    [female, smoke, age40, ap180, 0, 0, 0, 1, 1],
    # female / smoke / age40 below ap180: no scores entered, rows keep the default
    #######################################################
    [male, not_smoke, age65, ap180, 14, 16, 19, 22, 26],
    [male, not_smoke, age65, ap160, 9, 11, 13, 15, 16],
    [male, not_smoke, age65, ap140, 6, 8, 9, 11, 13],
    [male, not_smoke, age65, ap120, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap180, 9, 11, 13, 15, 18],
    [male, not_smoke, age60, ap160, 6, 7, 9, 10, 12],
    [male, not_smoke, age60, ap140, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap120, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap180, 6, 7, 8, 10, 12],
    [male, not_smoke, age55, ap160, 4, 5, 6, 7, 8],
    [male, not_smoke, age55, ap140, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap120, 2, 2, 3, 3, 4],
    [male, not_smoke, age45, ap180, 4, 4, 5, 6, 7],
    [male, not_smoke, age45, ap160, 2, 3, 3, 4, 5],
    [male, not_smoke, age45, ap140, 2, 2, 2, 3, 3],
    [male, not_smoke, age45, ap120, 1, 1, 2, 2, 2],
    [male, not_smoke, age40, ap180, 1, 1, 1, 2, 2],
    [male, not_smoke, age40, ap160, 1, 1, 1, 1, 1],
    [male, not_smoke, age40, ap140, 0, 1, 1, 1, 1],
    [male, not_smoke, age40, ap120, 0, 0, 1, 1, 1],
    #######################################################
    [male, smoke, age65, ap180, 26, 30, 35, 41, 47],
    [male, smoke, age65, ap160, 18, 21, 25, 29, 34],
    [male, smoke, age65, ap140, 13, 15, 17, 20, 24],
    [male, smoke, age65, ap120, 9, 10, 12, 14, 17],
    [male, smoke, age60, ap180, 18, 21, 24, 28, 33],
    [male, smoke, age60, ap160, 12, 14, 17, 20, 24],
    [male, smoke, age60, ap140, 8, 10, 12, 14, 17],
    [male, smoke, age60, ap120, 6, 7, 8, 10, 12],
    [male, smoke, age55, ap180, 12, 13, 16, 19, 22],
    [male, smoke, age55, ap160, 8, 9, 11, 13, 16],
    [male, smoke, age55, ap140, 5, 6, 8, 9, 11],
    [male, smoke, age55, ap120, 4, 4, 5, 6, 8],
    [male, smoke, age45, ap180, 7, 8, 10, 12, 14],
    [male, smoke, age45, ap160, 5, 6, 7, 8, 10],
    [male, smoke, age45, ap140, 3, 4, 5, 6, 7],
    [male, smoke, age45, ap120, 2, 3, 3, 4, 5],
    [male, smoke, age40, ap180, 2, 2, 3, 3, 4],
    [male, smoke, age40, ap160, 1, 2, 2, 2, 3],
    [male, smoke, age40, ap140, 1, 1, 1, 2, 2],
    [male, smoke, age40, ap120, 1, 1, 1, 1, 1],
]

for gender, sm, age, aphi, c4, c5, c6, c7, c8 in data:
    # The age band is part of the lookup key: without it every table row for a
    # given gender/smoking/pressure cell would overwrite the previous one.
    row_mask = gender & sm & age & aphi
    # Written from c8 down to c4: where cholesterol masks coincide, the
    # lowest-numbered column is assigned last and therefore wins.
    for chol, score in ((chol8, c8), (chol7, c7), (chol6, c6), (chol5, c5), (chol4, c4)):
        all.loc[row_mask & chol, SCORE_COL] = score

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [69]:
# Dataset 38: dataset 28 plus a table-driven risk score stored in `score_scale_val`.
# NOTE(review): the table looks like a SCORE-style cardiovascular risk chart - confirm the source.
DATA_ID = 38
SCORE_COL = 'score_scale_val'
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(28), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(28), index_col='id')
all = pd.concat([train, test])
all[SCORE_COL] = 0  # rows matched by no table entry keep a score of 0

# Cholesterol masks for the five score columns (c4..c8) of the table.
# Category 1 feeds c4..c6, category 2 feeds c7 and category 3 feeds c8.
chol4 = all['cholesterol_all'] == 1
chol5 = all['cholesterol_all'] == 1
chol6 = all['cholesterol_all'] == 1
chol7 = all['cholesterol_all'] == 2
chol8 = (all['cholesterol_all'] == 3)

male = all['gender_male'] == 1
female = all['gender_male'] == 0
smoke = all['smoke'] == 1
not_smoke = all['smoke'] == 0

# `age` is divided by 365.25 to get years before banding.
age_years = all['age'] / 365.25
age40 = (age_years >= 40) & (age_years < 45)
age45 = (age_years >= 45) & (age_years < 55)
age55 = (age_years >= 55) & (age_years < 60)
age60 = (age_years >= 60) & (age_years < 65)
age65 = (age_years >= 65)

# Systolic pressure bands; the 160 band runs up to the start of the 180 band
# so that readings of 170-179 are not left without a score.
ap120 = (all['ap_hi'] >= 120) & (all['ap_hi'] < 140)
ap140 = (all['ap_hi'] >= 140) & (all['ap_hi'] < 160)
ap160 = (all['ap_hi'] >= 160) & (all['ap_hi'] < 180)
ap180 = (all['ap_hi'] >= 180)

# Each row: [gender mask, smoking mask, age mask, pressure mask, c4, c5, c6, c7, c8].
data = [
    [female, not_smoke, age65, ap180, 7, 8, 9, 10, 12],
    [female, not_smoke, age65, ap160, 5, 5, 6, 7, 8],
    [female, not_smoke, age65, ap140, 3, 3, 4, 5, 6],
    [female, not_smoke, age65, ap120, 2, 2, 3, 3, 4],
    [female, not_smoke, age60, ap180, 4, 4, 5, 6, 7],
    [female, not_smoke, age60, ap160, 3, 3, 3, 4, 5],
    [female, not_smoke, age60, ap140, 2, 2, 2, 3, 3],
    [female, not_smoke, age60, ap120, 1, 1, 2, 2, 2],
    [female, not_smoke, age55, ap180, 2, 2, 3, 3, 4],
    [female, not_smoke, age55, ap160, 1, 2, 2, 2, 3],
    [female, not_smoke, age55, ap140, 1, 1, 1, 1, 2],
    [female, not_smoke, age55, ap120, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap180, 1, 1, 1, 2, 2],
    [female, not_smoke, age45, ap160, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap140, 0, 1, 1, 1, 1],
    [female, not_smoke, age45, ap120, 0, 0, 1, 1, 1],
    [female, not_smoke, age40, ap180, 0, 0, 0, 0, 0],
    [female, not_smoke, age40, ap160, 0, 0, 0, 0, 0],
    [female, not_smoke, age40, ap140, 0, 0, 0, 0, 0],
    [female, not_smoke, age40, ap120, 0, 0, 0, 0, 0],
    #######################################################
    [female, smoke, age65, ap180, 13, 15, 17, 19, 22],
    [female, smoke, age65, ap160, 9, 10, 12, 13, 16],
    [female, smoke, age65, ap140, 6, 7, 8, 9, 11],
    [female, smoke, age65, ap120, 4, 5, 5, 6, 7],
    [female, smoke, age60, ap180, 8, 9, 10, 11, 13],
    [female, smoke, age60, ap160, 5, 6, 7, 8, 9],
    [female, smoke, age60, ap140, 3, 4, 5, 5, 6],
    [female, smoke, age60, ap120, 2, 3, 3, 4, 4],
    [female, smoke, age55, ap180, 4, 5, 5, 6, 7],
    [female, smoke, age55, ap160, 3, 3, 4, 4, 5],
    [female, smoke, age55, ap140, 2, 2, 2, 3, 3],
    [female, smoke, age55, ap120, 1, 1, 2, 2, 2],
    [female, smoke, age45, ap180, 2, 2, 3, 3, 4],
    [female, smoke, age45, ap160, 1, 2, 2, 2, 3],
    [female, smoke, age45, ap140, 1, 1, 1, 1, 2],
    [female, smoke, age45, ap120, 1, 1, 1, 1, 1],
    [female, smoke, age40, ap180, 0, 0, 0, 1, 1],
    [female, smoke, age40, ap160, 0, 0, 0, 0, 0],
    [female, smoke, age40, ap140, 0, 0, 0, 0, 0],
    [female, smoke, age40, ap120, 0, 0, 0, 0, 0],
    #######################################################
    [male, not_smoke, age65, ap180, 14, 16, 19, 22, 26],
    [male, not_smoke, age65, ap160, 9, 11, 13, 15, 16],
    [male, not_smoke, age65, ap140, 6, 8, 9, 11, 13],
    [male, not_smoke, age65, ap120, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap180, 9, 11, 13, 15, 18],
    [male, not_smoke, age60, ap160, 6, 7, 9, 10, 12],
    [male, not_smoke, age60, ap140, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap120, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap180, 6, 7, 8, 10, 12],
    [male, not_smoke, age55, ap160, 4, 5, 6, 7, 8],
    [male, not_smoke, age55, ap140, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap120, 2, 2, 3, 3, 4],
    [male, not_smoke, age45, ap180, 4, 4, 5, 6, 7],
    [male, not_smoke, age45, ap160, 2, 3, 3, 4, 5],
    [male, not_smoke, age45, ap140, 2, 2, 2, 3, 3],
    [male, not_smoke, age45, ap120, 1, 1, 2, 2, 2],
    [male, not_smoke, age40, ap180, 1, 1, 1, 2, 2],
    [male, not_smoke, age40, ap160, 1, 1, 1, 1, 1],
    [male, not_smoke, age40, ap140, 0, 1, 1, 1, 1],
    [male, not_smoke, age40, ap120, 0, 0, 1, 1, 1],
    #######################################################
    [male, smoke, age65, ap180, 26, 30, 35, 41, 47],
    [male, smoke, age65, ap160, 18, 21, 25, 29, 34],
    [male, smoke, age65, ap140, 13, 15, 17, 20, 24],
    [male, smoke, age65, ap120, 9, 10, 12, 14, 17],
    [male, smoke, age60, ap180, 18, 21, 24, 28, 33],
    [male, smoke, age60, ap160, 12, 14, 17, 20, 24],
    [male, smoke, age60, ap140, 8, 10, 12, 14, 17],
    [male, smoke, age60, ap120, 6, 7, 8, 10, 12],
    [male, smoke, age55, ap180, 12, 13, 16, 19, 22],
    [male, smoke, age55, ap160, 8, 9, 11, 13, 16],
    [male, smoke, age55, ap140, 5, 6, 8, 9, 11],
    [male, smoke, age55, ap120, 4, 4, 5, 6, 8],
    [male, smoke, age45, ap180, 7, 8, 10, 12, 14],
    [male, smoke, age45, ap160, 5, 6, 7, 8, 10],
    [male, smoke, age45, ap140, 3, 4, 5, 6, 7],
    [male, smoke, age45, ap120, 2, 3, 3, 4, 5],
    [male, smoke, age40, ap180, 2, 2, 3, 3, 4],
    [male, smoke, age40, ap160, 1, 2, 2, 2, 3],
    [male, smoke, age40, ap140, 1, 1, 1, 2, 2],
    [male, smoke, age40, ap120, 1, 1, 1, 1, 1],
]

for gender, sm, age, aphi, c4, c5, c6, c7, c8 in data:
    # The age band is part of the lookup key: without it every table row for a
    # given gender/smoking/pressure cell would overwrite the previous one.
    row_mask = gender & sm & age & aphi
    # Written from c8 down to c4: where cholesterol masks coincide, the
    # lowest-numbered column is assigned last and therefore wins.
    for chol, score in ((chol8, c8), (chol7, c7), (chol6, c6), (chol5, c5), (chol4, c4)):
        all.loc[row_mask & chol, SCORE_COL] = score

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [ ]:
In [47]:
In [52]:
# Display the column index of `all`, i.e. whichever frame the most recently run cell assigned to it.
all.columns
Out[52]:
In [ ]:
# Dataset 40 = dataset 38 + products, quotients and signed sums of every
# pair and every triple of the base columns listed in `cols`.
DATA_ID = 40
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(38), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(38), index_col='id')
all = pd.concat([train, test])

from itertools import combinations

cols = [
    'age', 'height', 'weight', 'ap_hi', 'ap_lo',  # 'smoke', 'alco', 'active',
    'cholesterol_all', 'gluc_all', 'imt',
]

# Mean-centred, range-scaled copy of each base column; the plus/minus features use these.
cols_norm = {base: base + '_norm' for base in cols}
for base, norm_name in cols_norm.items():
    centred = all[base] - all[base].mean()
    all[norm_name] = centred / (all[base].max() - all[base].min())

# Pairwise features.
for c1, c2 in combinations(cols, 2):
    n1, n2 = cols_norm[c1], cols_norm[c2]
    left, right = all[c1], all[c2]
    # The denominator is shifted by 1 whenever its minimum is not strictly positive.
    denominator = right if right.min() > 0 else right + 1
    all["x__{}__{}".format(c1, c2)] = left * right
    all["div__{}__{}".format(c1, c2)] = left / denominator
    all["plus__{}__{}".format(n1, n2)] = all[n1] + all[n2]
    all["minus__{}__{}".format(n1, n2)] = all[n1] - all[n2]
print(0)

# Sign patterns of the three-way sums of normalised columns, in column-creation order.
signed_sums = [
    ("plus", (1, 1, 1)),
    ("minus1", (1, 1, -1)),
    ("minus2", (1, -1, 1)),
    ("minus3", (-1, 1, 1)),
    ("minus4", (1, -1, -1)),
    ("minus5", (-1, 1, -1)),
    ("minus6", (-1, -1, 1)),
]

# Three-way features.
for c1, c2, c3 in combinations(cols, 3):
    raw_key = "{}__{}__{}".format(c1, c2, c3)
    norm_key = "{}__{}__{}".format(cols_norm[c1], cols_norm[c2], cols_norm[c3])
    s1, s2, s3 = all[c1], all[c2], all[c3]
    z1, z2, z3 = all[cols_norm[c1]], all[cols_norm[c2]], all[cols_norm[c3]]

    all["x__" + raw_key] = s1 * s2 * s3
    for label, (w1, w2, w3) in signed_sums:
        all[label + "__" + norm_key] = w1 * z1 + w2 * z2 + w3 * z3
    # Quotient variants; the raw columns are used as denominators without any shift.
    all["div1__" + raw_key] = s1 * s2 / s3
    all["div2__" + raw_key] = s1 / s2 * s3
    all["div3__" + raw_key] = 1 / s1 * s2 * s3
    all["div4__" + raw_key] = s1 / s2 / s3
    all["div5__" + raw_key] = 1 / s1 * s2 / s3
    all["div6__" + raw_key] = 1 / s1 / s2 * s3
print(1)

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
print(2)
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
all.shape[1]
Out[ ]:
In [ ]:
In [ ]:
In [ ]:
In [59]:
# Dataset 41 = dataset 36 + one-hot encoded KMeans cluster labels for k = 15, 7 and 3.
DATA_ID = 41
KMEANS_SEED = 0  # fixed so the cluster assignment is reproducible between runs
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(36), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(36), index_col='id')
all = pd.concat([train, test])

from sklearn.cluster import KMeans

# Missing values are replaced by 0 for clustering only; `all` itself keeps its NaNs.
# Built once, before any label column is added, so every model sees the same features.
cluster_input = all.fillna(0)
for k in (15, 7, 3):
    # `precompute_distances` / `n_jobs` are not passed: current scikit-learn no
    # longer accepts them. `n_init=10` pins the number of restarts explicitly.
    model = KMeans(n_clusters=k, n_init=10, random_state=KMEANS_SEED).fit(cluster_input)
    all['k{}'.format(k)] = model.predict(cluster_input)

all = pd.get_dummies(all, columns=["k15", "k7", "k3"])
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[59]:
In [ ]:
In [ ]:
In [ ]:
In [66]:
# Dataset 42 = dataset 36 with missing smoke / alco / active filled with 0 / 0 / 1.
DATA_ID = 42
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(36), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(36), index_col='id')
# Fill on the frame itself with a per-column mapping: an in-place fillna on a
# selected column is not guaranteed to write back into the parent frame.
all = pd.concat([train, test]).fillna({'smoke': 0, 'alco': 0, 'active': 1})
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[66]:
In [ ]:
In [ ]:
In [ ]:
In [67]:
# Dataset 43 = dataset 42 + one-hot encoded KMeans cluster labels (k = 15, 7, 3),
# clustered on the base feature subset selected into `all2`.
DATA_ID = 43
KMEANS_SEED = 0  # fixed so the cluster assignment is reproducible between runs
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(42), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(42), index_col='id')
all = pd.concat([train, test])

from sklearn.cluster import KMeans

all2 = all[[
    'age', 'height', 'weight', 'ap_hi', 'ap_lo', 'smoke', 'alco', 'active',
    'gender_male', 'height_low', 'weight_low', 'cholesterol_all',
    'gluc_all', 'ap_error', 'ap_error_swap', 'imt', 'imt_class_all',
]]

for k in (15, 7, 3):
    # `precompute_distances` / `n_jobs` are not passed: current scikit-learn no
    # longer accepts them. `n_init=10` pins the number of restarts explicitly.
    model = KMeans(n_clusters=k, n_init=10, random_state=KMEANS_SEED).fit(all2)
    all['k{}'.format(k)] = model.predict(all2)

all = pd.get_dummies(all, columns=["k15", "k7", "k3"])
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[67]:
In [76]:
# Dataset 44 = dataset 42 + one-hot encoded KMeans cluster labels (k = 7, 3),
# clustered on the base feature subset after range-normalising its non-binary columns.
# The k = 15 clustering is intentionally left out of this variant.
DATA_ID = 44
KMEANS_SEED = 0  # fixed so the cluster assignment is reproducible between runs
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(42), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(42), index_col='id')
all = pd.concat([train, test])

from sklearn.cluster import KMeans

# Explicit copy: the columns are rescaled below and that must not touch `all`.
all2 = all[[
    'age', 'height', 'weight', 'ap_hi', 'ap_lo', 'smoke', 'alco', 'active',
    'gender_male', 'height_low', 'weight_low', 'cholesterol_all',
    'gluc_all', 'ap_error', 'ap_error_swap', 'imt', 'imt_class_all',
]].copy()

# Mean-centre and divide by the value range, column by column.
norm_cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol_all', 'gluc_all', 'imt', 'imt_class_all']
for c in norm_cols:
    all2[c] = (all2[c] - all2[c].mean()) / (all2[c].max() - all2[c].min())

for k in (7, 3):
    # `precompute_distances` / `n_jobs` are not passed: current scikit-learn no
    # longer accepts them. `n_init=10` pins the number of restarts explicitly.
    model = KMeans(n_clusters=k, n_init=10, random_state=KMEANS_SEED).fit(all2)
    all['k{}'.format(k)] = model.predict(all2)

all = pd.get_dummies(all, columns=["k7", "k3"])
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[76]:
In [ ]:
In [ ]:
In [107]:
# Dataset 45 = dataset 43 + a "<column>__gender__scale" feature for each listed
# column: the value standardised (mean 0, std 1) within the row's own gender group.
DATA_ID = 45
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(43), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(43), index_col='id')
all = pd.concat([train, test])

is_male = all['gender_male'] == 1
is_female = all['gender_male'] == 0
is_smoke = all['smoke'] == 1

scale_cols = [
    'age', 'height', 'weight', 'ap_hi', 'ap_lo',
    'cholesterol_all',
    'gluc_all', 'imt', 'imt_class_all',
    'x__age__gluc_all',
    'x__ap_hi__cholesterol_all', 'div6__height__gluc_all__imt',
    'plus__age_norm__ap_hi_norm__gluc_all_norm', 'x__age__weight',
    'div1__age__weight__cholesterol_all',
    'div6__age__weight__cholesterol_all',
    'plus__height_norm__weight_norm__gluc_all_norm',
    'div1__ap_hi__ap_lo__cholesterol_all',
    'div6__ap_hi__ap_lo__cholesterol_all',
    'plus__age_norm__gluc_all_norm__imt_norm',
    'minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
    'minus1__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
    'minus6__age_norm__ap_lo_norm__cholesterol_all_norm',
    'minus1__age_norm__ap_lo_norm__cholesterol_all_norm',
    'div6__height__weight__ap_lo', 'div2__ap_lo__cholesterol_all__gluc_all',
    'x__age__ap_hi__gluc_all', 'div5__ap_lo__cholesterol_all__gluc_all',
    'score_scale_val',
]

for c in scale_cols:
    scaled_name = c + '__gender__scale'
    # Male rows first, then female rows; rows in neither group stay NaN.
    for in_group in (is_male, is_female):
        values = all.loc[in_group, c]
        # Group statistics are taken from this column only, instead of
        # aggregating the whole (growing) frame for every column.
        all.loc[in_group, scaled_name] = (values - values.mean()) / values.std()

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[107]:
In [ ]:
In [105]:
# Inspection only: mean of every column of `all` per (gender_male, smoke) group.
all.groupby(['gender_male', 'smoke']).mean()
Out[105]:
In [86]:
# Inspection only: standard deviation of every column of `all` per gender_male group.
all.groupby(['gender_male']).std()
Out[86]:
In [ ]:
In [108]:
# Inspection only: column names of `all` as a plain Python list.
list(all.columns)
Out[108]:
In [ ]:
In [111]:
# Dataset 46 = dataset 43, except that the test rows take their alco / smoke /
# active values from the test file of dataset 1 again.
DATA_ID = 46
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(43), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(43), index_col='id')
test_na = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(1), index_col='id')

# Overwrite the three columns of the test part only; the train part is left as loaded.
for subjective_col in ('alco', 'smoke', 'active'):
    test[subjective_col] = test_na[subjective_col]

all = pd.concat([train, test])
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
all.shape[1]
Out[111]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [112]:
# Dataset 47 = the columns of dataset 20 listed in `cols`, with missing
# smoke / alco / active filled with 0 / 0 / 1.
DATA_ID = 47
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(20), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(20), index_col='id')
cols = [
    'age',
    'height',
    'weight',
    'ap_hi',
    'ap_lo',
    'smoke',
    'alco',
    'active',
    'gender_male',
    'height_low',
    'weight_low',
    'cholesterol_all',
    'gluc_all',
    'cholesterol_1',
    'cholesterol_2',
    'cholesterol_3',
    'gluc_1',
    'gluc_2',
    'gluc_3',
    'ap_error',
    'ap_error_swap',
    'imt',
    'imt_class_all',
    'imt_class_0',
    'imt_class_1',
    'imt_class_2',
    'imt_class_3',
    'imt_class_4',
    'imt_class_5',
    'imt_class_6',
    'x__age__gluc_all',
    'x__ap_hi__cholesterol_all',
    'div6__height__gluc_all__imt',
    'plus__age_norm__ap_hi_norm__gluc_all_norm',
    'x__age__weight',
    'div1__age__weight__cholesterol_all',
    'div6__age__weight__cholesterol_all',
    'plus__height_norm__weight_norm__gluc_all_norm',
    'div1__ap_hi__ap_lo__cholesterol_all',
    'div6__ap_hi__ap_lo__cholesterol_all',
    'plus__age_norm__gluc_all_norm__imt_norm',
    'minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
    'minus1__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
    'minus6__age_norm__ap_lo_norm__cholesterol_all_norm',
    'minus1__age_norm__ap_lo_norm__cholesterol_all_norm',
    'div6__height__weight__ap_lo',
    'div2__ap_lo__cholesterol_all__gluc_all',
    'x__age__ap_hi__gluc_all',
    'div5__ap_lo__cholesterol_all__gluc_all',
]
# Select, then fill on the resulting frame with a per-column mapping: an in-place
# fillna on a column of a sliced frame is not guaranteed to write back into it.
all = pd.concat([train, test])[cols].fillna({'smoke': 0, 'alco': 0, 'active': 1})
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [113]:
# Dataset 48: dataset 47 plus a table-driven risk score stored in `score_scale_val`.
# NOTE(review): the table looks like a SCORE-style cardiovascular risk chart - confirm the source.
DATA_ID = 48
SCORE_COL = 'score_scale_val'
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(47), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(47), index_col='id')
all = pd.concat([train, test])
all[SCORE_COL] = 0  # rows matched by no table entry keep a score of 0

# Cholesterol masks for the five score columns (c4..c8) of the table.
# Category 1 feeds c4..c6, category 2 feeds c7 and category 3 feeds c8.
chol4 = all['cholesterol_all'] == 1
chol5 = all['cholesterol_all'] == 1
chol6 = all['cholesterol_all'] == 1
chol7 = all['cholesterol_all'] == 2
chol8 = (all['cholesterol_all'] == 3)

male = all['gender_male'] == 1
female = all['gender_male'] == 0
smoke = all['smoke'] == 1
not_smoke = all['smoke'] == 0

# `age` is divided by 365.25 to get years before banding.
age_years = all['age'] / 365.25
age40 = (age_years >= 40) & (age_years < 45)
age45 = (age_years >= 45) & (age_years < 55)
age55 = (age_years >= 55) & (age_years < 60)
age60 = (age_years >= 60) & (age_years < 65)
age65 = (age_years >= 65)

# Systolic pressure bands; the 160 band runs up to the start of the 180 band
# so that readings of 170-179 are not left without a score.
ap120 = (all['ap_hi'] >= 120) & (all['ap_hi'] < 140)
ap140 = (all['ap_hi'] >= 140) & (all['ap_hi'] < 160)
ap160 = (all['ap_hi'] >= 160) & (all['ap_hi'] < 180)
ap180 = (all['ap_hi'] >= 180)

# Each row: [gender mask, smoking mask, age mask, pressure mask, c4, c5, c6, c7, c8].
data = [
    [female, not_smoke, age65, ap180, 7, 8, 9, 10, 12],
    [female, not_smoke, age65, ap160, 5, 5, 6, 7, 8],
    [female, not_smoke, age65, ap140, 3, 3, 4, 5, 6],
    [female, not_smoke, age65, ap120, 2, 2, 3, 3, 4],
    [female, not_smoke, age60, ap180, 4, 4, 5, 6, 7],
    [female, not_smoke, age60, ap160, 3, 3, 3, 4, 5],
    [female, not_smoke, age60, ap140, 2, 2, 2, 3, 3],
    [female, not_smoke, age60, ap120, 1, 1, 2, 2, 2],
    [female, not_smoke, age55, ap180, 2, 2, 3, 3, 4],
    [female, not_smoke, age55, ap160, 1, 2, 2, 2, 3],
    [female, not_smoke, age55, ap140, 1, 1, 1, 1, 2],
    [female, not_smoke, age55, ap120, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap180, 1, 1, 1, 2, 2],
    [female, not_smoke, age45, ap160, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap140, 0, 1, 1, 1, 1],
    [female, not_smoke, age45, ap120, 0, 0, 1, 1, 1],
    # female / not_smoke / age40: no scores entered, rows keep the default
    #######################################################
    [female, smoke, age65, ap180, 13, 15, 17, 19, 22],
    [female, smoke, age65, ap160, 9, 10, 12, 13, 16],
    [female, smoke, age65, ap140, 6, 7, 8, 9, 11],
    [female, smoke, age65, ap120, 4, 5, 5, 6, 7],
    [female, smoke, age60, ap180, 8, 9, 10, 11, 13],
    [female, smoke, age60, ap160, 5, 6, 7, 8, 9],
    [female, smoke, age60, ap140, 3, 4, 5, 5, 6],
    [female, smoke, age60, ap120, 2, 3, 3, 4, 4],
    [female, smoke, age55, ap180, 4, 5, 5, 6, 7],
    [female, smoke, age55, ap160, 3, 3, 4, 4, 5],
    [female, smoke, age55, ap140, 2, 2, 2, 3, 3],
    [female, smoke, age55, ap120, 1, 1, 2, 2, 2],
    [female, smoke, age45, ap180, 2, 2, 3, 3, 4],
    [female, smoke, age45, ap160, 1, 2, 2, 2, 3],
    [female, smoke, age45, ap140, 1, 1, 1, 1, 2],
    [female, smoke, age45, ap120, 1, 1, 1, 1, 1],
    [female, smoke, age40, ap180, 0, 0, 0, 1, 1],
    # female / smoke / age40 below ap180: no scores entered, rows keep the default
    #######################################################
    [male, not_smoke, age65, ap180, 14, 16, 19, 22, 26],
    [male, not_smoke, age65, ap160, 9, 11, 13, 15, 16],
    [male, not_smoke, age65, ap140, 6, 8, 9, 11, 13],
    [male, not_smoke, age65, ap120, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap180, 9, 11, 13, 15, 18],
    [male, not_smoke, age60, ap160, 6, 7, 9, 10, 12],
    [male, not_smoke, age60, ap140, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap120, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap180, 6, 7, 8, 10, 12],
    [male, not_smoke, age55, ap160, 4, 5, 6, 7, 8],
    [male, not_smoke, age55, ap140, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap120, 2, 2, 3, 3, 4],
    [male, not_smoke, age45, ap180, 4, 4, 5, 6, 7],
    [male, not_smoke, age45, ap160, 2, 3, 3, 4, 5],
    [male, not_smoke, age45, ap140, 2, 2, 2, 3, 3],
    [male, not_smoke, age45, ap120, 1, 1, 2, 2, 2],
    [male, not_smoke, age40, ap180, 1, 1, 1, 2, 2],
    [male, not_smoke, age40, ap160, 1, 1, 1, 1, 1],
    [male, not_smoke, age40, ap140, 0, 1, 1, 1, 1],
    [male, not_smoke, age40, ap120, 0, 0, 1, 1, 1],
    #######################################################
    [male, smoke, age65, ap180, 26, 30, 35, 41, 47],
    [male, smoke, age65, ap160, 18, 21, 25, 29, 34],
    [male, smoke, age65, ap140, 13, 15, 17, 20, 24],
    [male, smoke, age65, ap120, 9, 10, 12, 14, 17],
    [male, smoke, age60, ap180, 18, 21, 24, 28, 33],
    [male, smoke, age60, ap160, 12, 14, 17, 20, 24],
    [male, smoke, age60, ap140, 8, 10, 12, 14, 17],
    [male, smoke, age60, ap120, 6, 7, 8, 10, 12],
    [male, smoke, age55, ap180, 12, 13, 16, 19, 22],
    [male, smoke, age55, ap160, 8, 9, 11, 13, 16],
    [male, smoke, age55, ap140, 5, 6, 8, 9, 11],
    [male, smoke, age55, ap120, 4, 4, 5, 6, 8],
    [male, smoke, age45, ap180, 7, 8, 10, 12, 14],
    [male, smoke, age45, ap160, 5, 6, 7, 8, 10],
    [male, smoke, age45, ap140, 3, 4, 5, 6, 7],
    [male, smoke, age45, ap120, 2, 3, 3, 4, 5],
    [male, smoke, age40, ap180, 2, 2, 3, 3, 4],
    [male, smoke, age40, ap160, 1, 2, 2, 2, 3],
    [male, smoke, age40, ap140, 1, 1, 1, 2, 2],
    [male, smoke, age40, ap120, 1, 1, 1, 1, 1],
]

for gender, sm, age, aphi, c4, c5, c6, c7, c8 in data:
    # The age band is part of the lookup key: without it every table row for a
    # given gender/smoking/pressure cell would overwrite the previous one.
    row_mask = gender & sm & age & aphi
    # Written from c8 down to c4: where cholesterol masks coincide, the
    # lowest-numbered column is assigned last and therefore wins.
    for chol, score in ((chol8, c8), (chol7, c7), (chol6, c6), (chol5, c5), (chol4, c4)):
        all.loc[row_mask & chol, SCORE_COL] = score

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [126]:
# Dataset 49: dataset 47 plus a table-driven risk score stored in `score_scale_val`.
# Unlike the zero-filled variant, rows matched by no table entry are left as NaN here.
# NOTE(review): the table looks like a SCORE-style cardiovascular risk chart - confirm the source.
DATA_ID = 49
SCORE_COL = 'score_scale_val'
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(47), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(47), index_col='id')
all = pd.concat([train, test])
# Create the column up front so it exists even if no row matches the table.
all[SCORE_COL] = np.nan

# Cholesterol masks for the five score columns (c4..c8) of the table.
# Category 1 feeds c4..c6, category 2 feeds c7 and category 3 feeds c8.
chol4 = all['cholesterol_all'] == 1
chol5 = all['cholesterol_all'] == 1
chol6 = all['cholesterol_all'] == 1
chol7 = all['cholesterol_all'] == 2
chol8 = (all['cholesterol_all'] == 3)

male = all['gender_male'] == 1
female = all['gender_male'] == 0
smoke = all['smoke'] == 1
not_smoke = all['smoke'] == 0

# `age` is divided by 365.25 to get years before banding.
age_years = all['age'] / 365.25
age40 = (age_years >= 40) & (age_years < 45)
age45 = (age_years >= 45) & (age_years < 55)
age55 = (age_years >= 55) & (age_years < 60)
age60 = (age_years >= 60) & (age_years < 65)
age65 = (age_years >= 65)

# Systolic pressure bands; the 160 band runs up to the start of the 180 band
# so that readings of 170-179 are not left without a score.
ap120 = (all['ap_hi'] >= 120) & (all['ap_hi'] < 140)
ap140 = (all['ap_hi'] >= 140) & (all['ap_hi'] < 160)
ap160 = (all['ap_hi'] >= 160) & (all['ap_hi'] < 180)
ap180 = (all['ap_hi'] >= 180)

# Each row: [gender mask, smoking mask, age mask, pressure mask, c4, c5, c6, c7, c8].
data = [
    [female, not_smoke, age65, ap180, 7, 8, 9, 10, 12],
    [female, not_smoke, age65, ap160, 5, 5, 6, 7, 8],
    [female, not_smoke, age65, ap140, 3, 3, 4, 5, 6],
    [female, not_smoke, age65, ap120, 2, 2, 3, 3, 4],
    [female, not_smoke, age60, ap180, 4, 4, 5, 6, 7],
    [female, not_smoke, age60, ap160, 3, 3, 3, 4, 5],
    [female, not_smoke, age60, ap140, 2, 2, 2, 3, 3],
    [female, not_smoke, age60, ap120, 1, 1, 2, 2, 2],
    [female, not_smoke, age55, ap180, 2, 2, 3, 3, 4],
    [female, not_smoke, age55, ap160, 1, 2, 2, 2, 3],
    [female, not_smoke, age55, ap140, 1, 1, 1, 1, 2],
    [female, not_smoke, age55, ap120, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap180, 1, 1, 1, 2, 2],
    [female, not_smoke, age45, ap160, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap140, 0, 1, 1, 1, 1],
    [female, not_smoke, age45, ap120, 0, 0, 1, 1, 1],
    # female / not_smoke / age40: no scores entered, rows keep the default
    #######################################################
    [female, smoke, age65, ap180, 13, 15, 17, 19, 22],
    [female, smoke, age65, ap160, 9, 10, 12, 13, 16],
    [female, smoke, age65, ap140, 6, 7, 8, 9, 11],
    [female, smoke, age65, ap120, 4, 5, 5, 6, 7],
    [female, smoke, age60, ap180, 8, 9, 10, 11, 13],
    [female, smoke, age60, ap160, 5, 6, 7, 8, 9],
    [female, smoke, age60, ap140, 3, 4, 5, 5, 6],
    [female, smoke, age60, ap120, 2, 3, 3, 4, 4],
    [female, smoke, age55, ap180, 4, 5, 5, 6, 7],
    [female, smoke, age55, ap160, 3, 3, 4, 4, 5],
    [female, smoke, age55, ap140, 2, 2, 2, 3, 3],
    [female, smoke, age55, ap120, 1, 1, 2, 2, 2],
    [female, smoke, age45, ap180, 2, 2, 3, 3, 4],
    [female, smoke, age45, ap160, 1, 2, 2, 2, 3],
    [female, smoke, age45, ap140, 1, 1, 1, 1, 2],
    [female, smoke, age45, ap120, 1, 1, 1, 1, 1],
    [female, smoke, age40, ap180, 0, 0, 0, 1, 1],
    # female / smoke / age40 below ap180: no scores entered, rows keep the default
    #######################################################
    [male, not_smoke, age65, ap180, 14, 16, 19, 22, 26],
    [male, not_smoke, age65, ap160, 9, 11, 13, 15, 16],
    [male, not_smoke, age65, ap140, 6, 8, 9, 11, 13],
    [male, not_smoke, age65, ap120, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap180, 9, 11, 13, 15, 18],
    [male, not_smoke, age60, ap160, 6, 7, 9, 10, 12],
    [male, not_smoke, age60, ap140, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap120, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap180, 6, 7, 8, 10, 12],
    [male, not_smoke, age55, ap160, 4, 5, 6, 7, 8],
    [male, not_smoke, age55, ap140, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap120, 2, 2, 3, 3, 4],
    [male, not_smoke, age45, ap180, 4, 4, 5, 6, 7],
    [male, not_smoke, age45, ap160, 2, 3, 3, 4, 5],
    [male, not_smoke, age45, ap140, 2, 2, 2, 3, 3],
    [male, not_smoke, age45, ap120, 1, 1, 2, 2, 2],
    [male, not_smoke, age40, ap180, 1, 1, 1, 2, 2],
    [male, not_smoke, age40, ap160, 1, 1, 1, 1, 1],
    [male, not_smoke, age40, ap140, 0, 1, 1, 1, 1],
    [male, not_smoke, age40, ap120, 0, 0, 1, 1, 1],
    #######################################################
    [male, smoke, age65, ap180, 26, 30, 35, 41, 47],
    [male, smoke, age65, ap160, 18, 21, 25, 29, 34],
    [male, smoke, age65, ap140, 13, 15, 17, 20, 24],
    [male, smoke, age65, ap120, 9, 10, 12, 14, 17],
    [male, smoke, age60, ap180, 18, 21, 24, 28, 33],
    [male, smoke, age60, ap160, 12, 14, 17, 20, 24],
    [male, smoke, age60, ap140, 8, 10, 12, 14, 17],
    [male, smoke, age60, ap120, 6, 7, 8, 10, 12],
    [male, smoke, age55, ap180, 12, 13, 16, 19, 22],
    [male, smoke, age55, ap160, 8, 9, 11, 13, 16],
    [male, smoke, age55, ap140, 5, 6, 8, 9, 11],
    [male, smoke, age55, ap120, 4, 4, 5, 6, 8],
    [male, smoke, age45, ap180, 7, 8, 10, 12, 14],
    [male, smoke, age45, ap160, 5, 6, 7, 8, 10],
    [male, smoke, age45, ap140, 3, 4, 5, 6, 7],
    [male, smoke, age45, ap120, 2, 3, 3, 4, 5],
    [male, smoke, age40, ap180, 2, 2, 3, 3, 4],
    [male, smoke, age40, ap160, 1, 2, 2, 2, 3],
    [male, smoke, age40, ap140, 1, 1, 1, 2, 2],
    [male, smoke, age40, ap120, 1, 1, 1, 1, 1],
]

for gender, sm, age, aphi, c4, c5, c6, c7, c8 in data:
    # The age band is part of the lookup key: without it every table row for a
    # given gender/smoking/pressure cell would overwrite the previous one.
    row_mask = gender & sm & age & aphi
    # Written from c8 down to c4: where cholesterol masks coincide, the
    # lowest-numbered column is assigned last and therefore wins.
    for chol, score in ((chol8, c8), (chol7, c7), (chol6, c6), (chol5, c5), (chol4, c4)):
        all.loc[row_mask & chol, SCORE_COL] = score

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [141]:
# Dataset 50 = dataset 49 + one-hot encoded KMeans cluster labels for k = 15, 7 and 3.
DATA_ID = 50
KMEANS_SEED = 0  # fixed so the cluster assignment is reproducible between runs
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(49), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(49), index_col='id')
all = pd.concat([train, test])

from sklearn.cluster import KMeans

# Missing values are replaced by 0 for clustering only; `all` itself keeps its NaNs.
# Built once, before any label column is added, so every model sees the same features.
cluster_input = all.fillna(0)
for k in (15, 7, 3):
    # `precompute_distances` / `n_jobs` are not passed: current scikit-learn no
    # longer accepts them. `n_init=10` pins the number of restarts explicitly.
    model = KMeans(n_clusters=k, n_init=10, random_state=KMEANS_SEED).fit(cluster_input)
    all['k{}'.format(k)] = model.predict(cluster_input)

all = pd.get_dummies(all, columns=["k15", "k7", "k3"])
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[141]:
In [142]:
# Dataset 51: dataset 50 plus z-scores of selected features computed within
# gender, gender x cholesterol, gender x glucose and gender x age-band groups.
DATA_ID = 51
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(50), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(50), index_col='id')
all = pd.concat([train, test])

# Temporary age band from age // 365.25: 1 = below 40, then 5-wide bands,
# 6 = 60 and above. Rows with a missing age keep NaN.
age_year = all['age'] // 365.25
age_bands = [(-np.inf, 40), (40, 45), (45, 50), (50, 55), (55, 60), (60, np.inf)]
for band, (lower, upper) in enumerate(age_bands, start=1):
    # .loc replaces DataFrame.ix, which was removed in pandas 1.0.
    all.loc[(age_year >= lower) & (age_year < upper), 'age_cat'] = band

scale_columns = [
    'age', 'height', 'weight', 'ap_hi', 'ap_lo',
    'cholesterol_all',
    'gluc_all', 'imt', 'imt_class_all',
    'x__age__gluc_all',
    'x__ap_hi__cholesterol_all', 'div6__height__gluc_all__imt',
    'plus__age_norm__ap_hi_norm__gluc_all_norm', 'x__age__weight',
    'div1__age__weight__cholesterol_all',
    'div6__age__weight__cholesterol_all',
    'plus__height_norm__weight_norm__gluc_all_norm',
    'div1__ap_hi__ap_lo__cholesterol_all',
    'div6__ap_hi__ap_lo__cholesterol_all',
    'plus__age_norm__gluc_all_norm__imt_norm',
    'minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
    'minus1__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
    'minus6__age_norm__ap_lo_norm__cholesterol_all_norm',
    'minus1__age_norm__ap_lo_norm__cholesterol_all_norm',
    'div6__height__weight__ap_lo', 'div2__ap_lo__cholesterol_all__gluc_all',
    'x__age__ap_hi__gluc_all', 'div5__ap_lo__cholesterol_all__gluc_all',
    'score_scale_val',
]

genders = (1, 0)  # gender_male == 1 first, then 0 (same order as before)
# (new-column suffix, group-by keys, group labels to scale, source column to skip)
groupings = [
    ('___gender__scale', ['gender_male'],
     [(g,) for g in genders], None),
    ('___gender_chol__scale', ['gender_male', 'cholesterol_all'],
     [(g, level) for g in genders for level in (1, 2, 3)], 'cholesterol_all'),
    ('___gender_gluc__scale', ['gender_male', 'gluc_all'],
     [(g, level) for g in genders for level in (1, 2, 3)], 'gluc_all'),
    ('___gender_age__scale', ['gender_male', 'age_cat'],
     [(g, band) for g in genders for band in range(1, 7)], 'age'),
]

# Row masks depend only on the grouping columns, so build them once up front.
group_masks = {
    suffix: [
        np.logical_and.reduce([(all[key] == value).values for key, value in zip(keys, label)])
        for label in labels
    ]
    for suffix, keys, labels, _ in groupings
}

for c in scale_columns:
    for suffix, keys, labels, skipped in groupings:
        if c == skipped:
            # A feature is not scaled within groups derived from itself.
            continue
        # Aggregate only this column, once per grouping, instead of
        # re-aggregating the whole frame for every single group.
        stats = all.groupby(keys)[c].agg(['mean', 'std'])
        for label, mask in zip(labels, group_masks[suffix]):
            row = label if len(label) > 1 else label[0]
            # Only the listed groups are scaled; other rows stay NaN. A listed
            # group that is absent from the data raises KeyError, as before.
            all.loc[mask, c + suffix] = \
                (all.loc[mask, c] - stats.loc[row, 'mean']) / stats.loc[row, 'std']

del all['age_cat']  # helper column only, not part of the saved dataset

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[142]:
In [143]:
# all.groupby(['gender_male', 'age_cat']).mean()
In [144]:
# all.groupby(['gender_male', 'age_cat']).mean()['age'].loc[1, 2]
In [ ]:
In [128]:
In [145]:
# Dataset 52: dataset 50 extended with one gender-scaled feature taken from dataset 51.
DATA_ID = 52
train, test = (
    pd.read_csv(mask.format(50), index_col='id')
    for mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])

train51, test51 = (
    pd.read_csv(mask.format(51), index_col='id')
    for mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all51 = pd.concat([train51, test51])

# Column assignment aligns the donor values on the `id` index.
borrowed = 'div6__height__gluc_all__imt___gender__scale'
all[borrowed] = all51[borrowed]

for ids, mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                  (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[ids].to_csv(mask.format(DATA_ID))
len(all.columns)
Out[145]:
In [ ]:
In [18]:
# Dataset 53: dataset 52 with every one-hot k-means column (k15_*, k7_*, k3_*) removed.
DATA_ID = 53
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(52), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(52), index_col='id')
all = pd.concat([train, test])

# k15_0..k15_14, k7_0..k7_6, k3_0..k3_2 -- generated instead of spelled out.
cluster_dummies = [
    '{}_{}'.format(prefix, idx)
    for prefix, size in (('k15', 15), ('k7', 7), ('k3', 3))
    for idx in range(size)
]
for i in cluster_dummies:
    del all[i]

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[18]:
In [19]:
# Dataset 54: dataset 53 plus the one-hot k-means columns stored in dataset 43.
DATA_ID = 54
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(53), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(53), index_col='id')
all = pd.concat([train, test])

train43 = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(43), index_col='id')
test43 = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(43), index_col='id')
all43 = pd.concat([train43, test43])

# k15_0..k15_14, k7_0..k7_6, k3_0..k3_2 -- generated instead of spelled out.
cluster_dummies = [
    '{}_{}'.format(prefix, idx)
    for prefix, size in (('k15', 15), ('k7', 7), ('k3', 3))
    for idx in range(size)
]
for i in cluster_dummies:
    # Per-column assignment aligns the dataset-43 values on the `id` index.
    all[i] = all43[i]

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[19]:
In [ ]:
In [ ]:
In [8]:
# Dataset 55: dataset 53 plus one-hot k-means clusters (k = 15, 7, 3) fitted on
# the base features, including the smoke / alco / active flags.
DATA_ID = 55
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(53), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(53), index_col='id')
all = pd.concat([train, test])

# Clustering input; list selection yields a copy, so the cluster columns added
# to `all` below never leak into a later fit.
all2 = all[['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'smoke', 'alco', 'active',
            'gender_male', 'height_low', 'weight_low', 'cholesterol_all',
            'gluc_all', 'ap_error', 'ap_error_swap', 'imt', 'imt_class_all',
            ]]

from sklearn.cluster import KMeans

for n_clusters in (15, 7, 3):
    # random_state makes the saved cluster labels reproducible between runs.
    # precompute_distances / n_jobs are omitted: both were removed from KMeans
    # in scikit-learn 1.0 and are not needed for the clustering itself.
    kmeans = KMeans(n_clusters=n_clusters, random_state=1000)
    all['k{}'.format(n_clusters)] = kmeans.fit(all2).predict(all2)
all = pd.get_dummies(all, columns=["k15", "k7", "k3"])

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[8]:
In [9]:
# Dataset 56: dataset 53 plus one-hot k-means clusters (k = 15, 7, 3) fitted on
# the base features without the smoke / alco / active flags.
DATA_ID = 56
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(53), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(53), index_col='id')
all = pd.concat([train, test])

# Clustering input; list selection yields a copy, so the cluster columns added
# to `all` below never leak into a later fit.
all2 = all[['age', 'height', 'weight', 'ap_hi', 'ap_lo',
            'gender_male', 'height_low', 'weight_low', 'cholesterol_all',
            'gluc_all', 'ap_error', 'ap_error_swap', 'imt', 'imt_class_all',
            ]]

from sklearn.cluster import KMeans

for n_clusters in (15, 7, 3):
    # random_state makes the saved cluster labels reproducible between runs.
    # precompute_distances / n_jobs are omitted: both were removed from KMeans
    # in scikit-learn 1.0 and are not needed for the clustering itself.
    kmeans = KMeans(n_clusters=n_clusters, random_state=1000)
    all['k{}'.format(n_clusters)] = kmeans.fit(all2).predict(all2)
all = pd.get_dummies(all, columns=["k15", "k7", "k3"])

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[9]:
In [12]:
# Dataset 57: dataset 53 plus one-hot k-means clusters for k = 7 and k = 3 only.
DATA_ID = 57
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(53), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(53), index_col='id')
all = pd.concat([train, test])

# Clustering input; list selection yields a copy, so the cluster columns added
# to `all` below never leak into a later fit.
all2 = all[['age', 'height', 'weight', 'ap_hi', 'ap_lo',
            'gender_male', 'height_low', 'weight_low', 'cholesterol_all',
            'gluc_all', 'ap_error', 'ap_error_swap', 'imt', 'imt_class_all',
            ]]

from sklearn.cluster import KMeans

for n_clusters in (7, 3):
    # random_state makes the saved cluster labels reproducible between runs.
    # precompute_distances / n_jobs are omitted: both were removed from KMeans
    # in scikit-learn 1.0 and are not needed for the clustering itself.
    kmeans = KMeans(n_clusters=n_clusters, random_state=1000)
    all['k{}'.format(n_clusters)] = kmeans.fit(all2).predict(all2)
all = pd.get_dummies(all, columns=["k7", "k3"])

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[12]:
In [13]:
# Dataset 58: dataset 53 plus one-hot k-means clusters for k = 15 and k = 3 only.
DATA_ID = 58
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(53), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(53), index_col='id')
all = pd.concat([train, test])

# Clustering input; list selection yields a copy, so the cluster columns added
# to `all` below never leak into a later fit.
all2 = all[['age', 'height', 'weight', 'ap_hi', 'ap_lo',
            'gender_male', 'height_low', 'weight_low', 'cholesterol_all',
            'gluc_all', 'ap_error', 'ap_error_swap', 'imt', 'imt_class_all',
            ]]

from sklearn.cluster import KMeans

for n_clusters in (15, 3):
    # random_state makes the saved cluster labels reproducible between runs.
    # precompute_distances / n_jobs are omitted: both were removed from KMeans
    # in scikit-learn 1.0 and are not needed for the clustering itself.
    kmeans = KMeans(n_clusters=n_clusters, random_state=1000)
    all['k{}'.format(n_clusters)] = kmeans.fit(all2).predict(all2)
all = pd.get_dummies(all, columns=["k15", "k3"])

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[13]:
In [14]:
# Dataset 59: dataset 53 plus one-hot k-means clusters for k = 15 and k = 7 only.
DATA_ID = 59
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(53), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(53), index_col='id')
all = pd.concat([train, test])

# Clustering input; list selection yields a copy, so the cluster columns added
# to `all` below never leak into a later fit.
all2 = all[['age', 'height', 'weight', 'ap_hi', 'ap_lo',
            'gender_male', 'height_low', 'weight_low', 'cholesterol_all',
            'gluc_all', 'ap_error', 'ap_error_swap', 'imt', 'imt_class_all',
            ]]

from sklearn.cluster import KMeans

for n_clusters in (15, 7):
    # random_state makes the saved cluster labels reproducible between runs.
    # precompute_distances / n_jobs are omitted: both were removed from KMeans
    # in scikit-learn 1.0 and are not needed for the clustering itself.
    kmeans = KMeans(n_clusters=n_clusters, random_state=1000)
    all['k{}'.format(n_clusters)] = kmeans.fit(all2).predict(all2)
all = pd.get_dummies(all, columns=["k15", "k7"])

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[14]:
In [ ]:
In [4]:
# Dataset 60: dataset 56 plus z-scores of selected features computed within
# gender, gender x cholesterol, gender x glucose and gender x age-band groups.
DATA_ID = 60
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(56), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(56), index_col='id')
all = pd.concat([train, test])

# Temporary age band from age // 365.25: 1 = below 40, then 5-wide bands,
# 6 = 60 and above. Rows with a missing age keep NaN.
age_year = all['age'] // 365.25
age_bands = [(-np.inf, 40), (40, 45), (45, 50), (50, 55), (55, 60), (60, np.inf)]
for band, (lower, upper) in enumerate(age_bands, start=1):
    # .loc replaces DataFrame.ix, which was removed in pandas 1.0.
    all.loc[(age_year >= lower) & (age_year < upper), 'age_cat'] = band

scale_columns = [
    'age', 'height', 'weight', 'ap_hi', 'ap_lo',
    'cholesterol_all',
    'gluc_all', 'imt', 'imt_class_all',
    'x__age__gluc_all',
    'x__ap_hi__cholesterol_all', 'div6__height__gluc_all__imt',
    'plus__age_norm__ap_hi_norm__gluc_all_norm', 'x__age__weight',
    'div1__age__weight__cholesterol_all',
    'div6__age__weight__cholesterol_all',
    'plus__height_norm__weight_norm__gluc_all_norm',
    'div1__ap_hi__ap_lo__cholesterol_all',
    'div6__ap_hi__ap_lo__cholesterol_all',
    'plus__age_norm__gluc_all_norm__imt_norm',
    'minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
    'minus1__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
    'minus6__age_norm__ap_lo_norm__cholesterol_all_norm',
    'minus1__age_norm__ap_lo_norm__cholesterol_all_norm',
    'div6__height__weight__ap_lo', 'div2__ap_lo__cholesterol_all__gluc_all',
    'x__age__ap_hi__gluc_all', 'div5__ap_lo__cholesterol_all__gluc_all',
    'score_scale_val',
]

genders = (1, 0)  # gender_male == 1 first, then 0 (same order as before)
# (new-column suffix, group-by keys, group labels to scale, source column to skip)
groupings = [
    ('___gender__scale', ['gender_male'],
     [(g,) for g in genders], None),
    ('___gender_chol__scale', ['gender_male', 'cholesterol_all'],
     [(g, level) for g in genders for level in (1, 2, 3)], 'cholesterol_all'),
    ('___gender_gluc__scale', ['gender_male', 'gluc_all'],
     [(g, level) for g in genders for level in (1, 2, 3)], 'gluc_all'),
    ('___gender_age__scale', ['gender_male', 'age_cat'],
     [(g, band) for g in genders for band in range(1, 7)], 'age'),
]

# Row masks depend only on the grouping columns, so build them once up front.
group_masks = {
    suffix: [
        np.logical_and.reduce([(all[key] == value).values for key, value in zip(keys, label)])
        for label in labels
    ]
    for suffix, keys, labels, _ in groupings
}

for c in scale_columns:
    for suffix, keys, labels, skipped in groupings:
        if c == skipped:
            # A feature is not scaled within groups derived from itself.
            continue
        # Aggregate only this column, once per grouping, instead of
        # re-aggregating the whole frame for every single group.
        stats = all.groupby(keys)[c].agg(['mean', 'std'])
        for label, mask in zip(labels, group_masks[suffix]):
            row = label if len(label) > 1 else label[0]
            # Only the listed groups are scaled; other rows stay NaN. A listed
            # group that is absent from the data raises KeyError, as before.
            all.loc[mask, c + suffix] = \
                (all.loc[mask, c] - stats.loc[row, 'mean']) / stats.loc[row, 'std']

del all['age_cat']  # helper column only, not part of the saved dataset

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[4]:
In [ ]:
In [7]:
# Dataset 61: dataset 56 plus the gender/age-scaled glucose feature from dataset 60.
DATA_ID = 61
train, test = (
    pd.read_csv(mask.format(56), index_col='id')
    for mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])

train60, test60 = (
    pd.read_csv(mask.format(60), index_col='id')
    for mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all60 = pd.concat([train60, test60])

# Column assignment aligns the donor values on the `id` index.
borrowed = 'gluc_all___gender_age__scale'
all[borrowed] = all60[borrowed]

for ids, mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                  (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[ids].to_csv(mask.format(DATA_ID))
len(all.columns)
Out[7]:
In [ ]:
In [8]:
# Dataset 62: dataset 56 without the gender-scaled div6__height__gluc_all__imt feature.
DATA_ID = 62
train, test = (
    pd.read_csv(mask.format(56), index_col='id')
    for mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])

removed = 'div6__height__gluc_all__imt___gender__scale'
del all[removed]

for ids, mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                  (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[ids].to_csv(mask.format(DATA_ID))
len(all.columns)
Out[8]:
In [ ]:
In [15]:
# Dataset 63: dataset 62 plus two gender/cholesterol-scaled features from dataset 60.
DATA_ID = 63
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(62), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(62), index_col='id')
all = pd.concat([train, test])

train60 = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(60), index_col='id')
test60 = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(60), index_col='id')
all60 = pd.concat([train60, test60])

# Copy the donor columns one by one; each assignment aligns on the `id` index.
for borrowed in ('score_scale_val___gender_chol__scale',
                 'div6__height__gluc_all__imt___gender_chol__scale'):
    all[borrowed] = all60[borrowed]

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[15]:
In [16]:
# Dataset 64: dataset 62 plus the scaled score and the gender/age-scaled imt from dataset 60.
DATA_ID = 64
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(62), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(62), index_col='id')
all = pd.concat([train, test])

train60 = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(60), index_col='id')
test60 = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(60), index_col='id')
all60 = pd.concat([train60, test60])

# Copy the donor columns one by one; each assignment aligns on the `id` index.
for borrowed in ('score_scale_val___gender_chol__scale',
                 'imt___gender_age__scale'):
    all[borrowed] = all60[borrowed]

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[16]:
In [ ]:
In [26]:
# Dataset 65: dataset 63 plus the first five principal components of the
# NaN-filled (0) feature matrix.
DATA_ID = 65
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(63), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(63), index_col='id')
all = pd.concat([train, test])

from sklearn.decomposition import PCA

pca = PCA(n_components=5, random_state=1000)
pca_res = pca.fit_transform(all.fillna(0))
# One new column per component: pca_0 .. pca_4.
for component in range(5):
    all['pca_{}'.format(component)] = [row[component] for row in pca_res]

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[26]:
In [25]:
# Scratch check: first element of every row of `pca_res`, which must already exist in the kernel.
[i[0] for i in pca_res]
Out[25]:
In [ ]:
In [29]:
# Dataset 66: dataset 62 plus three group-scaled features taken from dataset 60.
DATA_ID = 66
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(62), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(62), index_col='id')
all = pd.concat([train, test])

train60 = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(60), index_col='id')
test60 = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(60), index_col='id')
all60 = pd.concat([train60, test60])

# Copy the donor columns one by one; each assignment aligns on the `id` index.
for borrowed in ('gluc_all___gender_age__scale',
                 'cholesterol_all___gender_age__scale',
                 'div6__height__gluc_all__imt___gender_chol__scale'):
    all[borrowed] = all60[borrowed]

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[29]:
In [ ]:
# Dataset 67: dataset 62 without the score_scale_val feature.
DATA_ID = 67
train, test = (
    pd.read_csv(mask.format(62), index_col='id')
    for mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])

removed = 'score_scale_val'
del all[removed]

for ids, mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                  (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[ids].to_csv(mask.format(DATA_ID))
len(all.columns)
In [ ]:
In [37]:
# Dataset 68 (variant built from dataset 62): adds two group-scaled features from dataset 60.
DATA_ID = 68
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(62), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(62), index_col='id')
all = pd.concat([train, test])

train60 = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(60), index_col='id')
test60 = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(60), index_col='id')
all60 = pd.concat([train60, test60])

# Copy the donor columns one by one; each assignment aligns on the `id` index.
for borrowed in ('cholesterol_all___gender_age__scale',
                 'div6__height__gluc_all__imt___gender_chol__scale'):
    all[borrowed] = all60[borrowed]

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[37]:
In [32]:
# Scratch check: per-column count of missing values in whichever `test` frame was loaded last.
test.isnull().sum()
Out[32]:
In [ ]:
In [38]:
# Dataset 68 (variant built from dataset 66): drops `height` and the
# height/weight/ap_lo ratio feature.
# NOTE(review): an earlier cell also saves under DATA_ID = 68 (built from
# dataset 62); running this cell overwrites those files -- confirm which
# variant id 68 is meant to be.
DATA_ID = 68
train, test = (
    pd.read_csv(mask.format(66), index_col='id')
    for mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])

for removed in ('height', 'div6__height__weight__ap_lo'):
    del all[removed]

for ids, mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                  (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[ids].to_csv(mask.format(DATA_ID))
len(all.columns)
Out[38]:
In [39]:
# Dataset 69: dataset 66 without the height/weight/ap_lo ratio feature.
DATA_ID = 69
train, test = (
    pd.read_csv(mask.format(66), index_col='id')
    for mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])

removed = 'div6__height__weight__ap_lo'
del all[removed]

for ids, mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                  (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[ids].to_csv(mask.format(DATA_ID))
len(all.columns)
Out[39]:
In [ ]:
In [43]:
# Dataset 266: dataset 66 with NaNs replaced by 0 and the continuous features
# mean-centred and divided by their range (max - min).
DATA_ID = 266
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(66), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(66), index_col='id')
all = pd.concat([train, test]).fillna(0)

cols = [
    'age',
    'height',
    'weight',
    'ap_hi',
    'ap_lo',
    'cholesterol_all',
    'gluc_all',
    'imt',
    'imt_class_all',
    'x__age__gluc_all',
    'x__ap_hi__cholesterol_all',
    'div6__height__gluc_all__imt',
    'plus__age_norm__ap_hi_norm__gluc_all_norm',
    'x__age__weight',
    'div1__age__weight__cholesterol_all',
    'div6__age__weight__cholesterol_all',
    'plus__height_norm__weight_norm__gluc_all_norm',
    'div1__ap_hi__ap_lo__cholesterol_all',
    'div6__ap_hi__ap_lo__cholesterol_all',
    'plus__age_norm__gluc_all_norm__imt_norm',
    'minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
    'minus1__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
    'minus6__age_norm__ap_lo_norm__cholesterol_all_norm',
    'minus1__age_norm__ap_lo_norm__cholesterol_all_norm',
    'div6__height__weight__ap_lo',
    'div2__ap_lo__cholesterol_all__gluc_all',
    'x__age__ap_hi__gluc_all',
    'div5__ap_lo__cholesterol_all__gluc_all',
    'score_scale_val',
    'gluc_all___gender_age__scale',
    'cholesterol_all___gender_age__scale',
    'div6__height__gluc_all__imt___gender_chol__scale',
]
for c in cols:
    centred = all[c] - all[c].mean()
    value_range = all[c].max() - all[c].min()
    all[c] = centred / value_range

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[43]:
In [47]:
# Dataset 269: dataset 69 with NaNs replaced by 0 and the continuous features
# mean-centred and divided by their range (max - min).
DATA_ID = 269
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(69), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(69), index_col='id')
all = pd.concat([train, test]).fillna(0)

cols = [
    'age',
    'height',
    'weight',
    'ap_hi',
    'ap_lo',
    'cholesterol_all',
    'gluc_all',
    'imt',
    'imt_class_all',
    'x__age__gluc_all',
    'x__ap_hi__cholesterol_all',
    'div6__height__gluc_all__imt',
    'plus__age_norm__ap_hi_norm__gluc_all_norm',
    'x__age__weight',
    'div1__age__weight__cholesterol_all',
    'div6__age__weight__cholesterol_all',
    'plus__height_norm__weight_norm__gluc_all_norm',
    'div1__ap_hi__ap_lo__cholesterol_all',
    'div6__ap_hi__ap_lo__cholesterol_all',
    'plus__age_norm__gluc_all_norm__imt_norm',
    'minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
    'minus1__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
    'minus6__age_norm__ap_lo_norm__cholesterol_all_norm',
    'minus1__age_norm__ap_lo_norm__cholesterol_all_norm',
    'div2__ap_lo__cholesterol_all__gluc_all',
    'x__age__ap_hi__gluc_all',
    'div5__ap_lo__cholesterol_all__gluc_all',
    'score_scale_val',
    'gluc_all___gender_age__scale',
    'cholesterol_all___gender_age__scale',
    'div6__height__gluc_all__imt___gender_chol__scale',
]
for c in cols:
    centred = all[c] - all[c].mean()
    value_range = all[c].max() - all[c].min()
    all[c] = centred / value_range

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[47]:
In [49]:
# Dataset 219: dataset 19 with missing values imputed and the continuous
# features mean-centred and divided by their range (max - min).
DATA_ID = 219
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(19), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(19), index_col='id')
all = pd.concat([train, test])

# Per-column defaults first (note `active` defaults to 1), then 0 for the rest.
# The dict form replaces chained `all['col'].fillna(..., inplace=True)`, which
# does not modify the frame under pandas Copy-on-Write; the blanket fillna(0)
# would then silently turn missing `active` values into 0 instead of 1.
all = all.fillna({'smoke': 0, 'alco': 0, 'active': 1}).fillna(0)

cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo',
        'age_years',
        'cholesterol_all', 'gluc_all', 'imt', 'imt_class_all', 'x__age__gluc_all', 'x__ap_hi__cholesterol_all',
        'div__smoke__imt_class_all']
for c in cols:
    all[c] = (all[c] - all[c].mean()) / (all[c].max() - all[c].min())

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[49]:
In [ ]:
In [ ]:
# Dataset 203: dataset 3 with missing values imputed and the continuous
# features mean-centred and divided by their range (max - min).
DATA_ID = 203
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(3), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(3), index_col='id')
all = pd.concat([train, test])

# Per-column defaults first (note `active` defaults to 1), then 0 for the rest.
# The dict form replaces chained `all['col'].fillna(..., inplace=True)`, which
# does not modify the frame under pandas Copy-on-Write; the blanket fillna(0)
# would then silently turn missing `active` values into 0 instead of 1.
all = all.fillna({'smoke': 0, 'alco': 0, 'active': 1}).fillna(0)
print(all.columns)

cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo',
        'cholesterol', 'gluc', 'imt']
for c in cols:
    all[c] = (all[c] - all[c].mean()) / (all[c].max() - all[c].min())

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
In [ ]:
In [63]:
# Dataset 70: dataset 69 plus `lin`, the prediction of a Huber-loss linear model
# fitted on the normalised dataset 269.
DATA_ID = 70
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(69), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(69), index_col='id')
all = pd.concat([train, test])

train2 = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(269), index_col='id')
test2 = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(269), index_col='id')
target = pd.read_csv(config.QML_TRAIN_Y_FILE_MASK.format(269), index_col='id')
all2 = pd.concat([train2, test2])

from sklearn.linear_model import SGDRegressor

# random_state makes the saved `lin` feature reproducible between runs.
# NOTE(review): `n_iter` was replaced by `max_iter` in newer scikit-learn
# releases -- switch when the environment is upgraded.
sgd = SGDRegressor(loss='huber', n_iter=100, random_state=1000)
# Flatten the one-column target frame: the estimator expects a 1-D y.
# Rows of `train2` and `target` are matched by position -- assumes both files
# list the ids in the same order; TODO confirm.
sgd.fit(train2, target.values.ravel())
# NOTE(review): train rows are scored by a model fitted on those same rows, so
# `lin` is an in-sample prediction for train but out-of-sample for test;
# consider out-of-fold predictions to avoid target leakage.
# The prediction array is assigned by position; `all` and `all2` are both
# train-then-test concatenations.
all['lin'] = sgd.predict(all2)

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[63]:
In [ ]:
In [72]:
# mortido (after-contest test)
# Dataset 300: dataset 69 with every column of an external train/test pair
# added (or overwritten when the name already exists).
DATA_ID = 300
train, test = (
    pd.read_csv(mask.format(69), index_col='id')
    for mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])

# NOTE(review): absolute, machine-specific locations.
train2 = pd.read_csv('d:/temp/train.csv', index_col='id', delimiter=';')
test2 = pd.read_csv('d:/temp/test.csv', index_col='id', delimiter=';')
all2 = pd.concat([train2, test2])

# Column-by-column copy; each assignment aligns on the `id` index.
for c in list(all2.columns):
    all[c] = all2[c]

for ids, mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                  (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[ids].to_csv(mask.format(DATA_ID))
len(all.columns)
Out[72]:
In [71]:
# Scratch check: column labels of whichever `all2` frame is currently in the kernel.
all2.columns
Out[71]:
In [ ]: