In [1]:
import sys, os
#sys.path.insert(0, os.getcwd() + '//..')
os.chdir('d:/ml/mlbootcamp5')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
from datetime import datetime
import qml_workdir.classes
from qml_workdir.classes.config import config
%matplotlib inline
In [2]:
# Row-id indexes of the train and test splits, read back from the id files
# stored in QML_DATA_DIR.
ids_train, ids_test = (
    pd.read_csv(config.QML_DATA_DIR + file_name, index_col='id').index
    for file_name in ("ids_train.csv", "ids_test.csv")
)
In [3]:
# Load the raw ';'-separated files; the literal string 'None' is parsed as NaN.
raw_csv_options = dict(delimiter=";", index_col="id", na_values=['None'])
train_raw = pd.read_csv(config.QML_DATA_DIR + "raw/train.csv", **raw_csv_options)
test_raw = pd.read_csv(config.QML_DATA_DIR + "raw/test.csv", **raw_csv_options)

# Persist the row ids of each split (index only, every column dropped).
train_raw.drop(columns=train_raw.columns).to_csv(config.QML_DATA_DIR + "ids_train.csv")
test_raw.drop(columns=test_raw.columns).to_csv(config.QML_DATA_DIR + "ids_test.csv")

# Persist the target column on its own, then remove it from the train features.
non_target_columns = [name for name in train_raw.columns if name != 'cardio']
train_raw.drop(columns=non_target_columns).to_csv(config.QML_DATA_DIR + "train_y.csv")
train_raw.drop(columns=['cardio'], inplace=True)

# Train and test features stacked into one frame for joint preprocessing.
all_raw = pd.concat([train_raw, test_raw])

# Re-read the ids that were just written so later cells can split `all_raw`.
ids_train, ids_test = (
    pd.read_csv(config.QML_DATA_DIR + file_name, index_col='id').index
    for file_name in ("ids_train.csv", "ids_test.csv")
)
In [ ]:
In [36]:
# Dataset 1: raw features with missing values kept; `gender` is one-hot
# encoded and its dummy columns get readable names.
DATA_ID = 1
gender_names = {'gender_1': 'gender_female', 'gender_2': 'gender_male'}
all = pd.get_dummies(all_raw.copy(), columns=['gender']).rename(columns=gender_names)

# Write the train and test parts to their dataset files.
for split_ids, file_mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                             (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[split_ids].to_csv(file_mask.format(DATA_ID))
In [ ]:
In [ ]:
In [40]:
# Dataset 2: dataset 1 with every missing value replaced by 0.
DATA_ID = 2
train, test = (
    pd.read_csv(file_mask.format(1), index_col='id')
    for file_mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test]).fillna(0)

for split_ids, file_mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                             (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[split_ids].to_csv(file_mask.format(DATA_ID))
In [ ]:
In [ ]:
In [42]:
# Dataset 3: dataset 2 plus `imt` = weight / (height / 100) ** 2, computed as
# two successive divisions.
DATA_ID = 3
train, test = (
    pd.read_csv(file_mask.format(2), index_col='id')
    for file_mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])

height_scaled = all['height'] / 100
all['imt'] = all['weight'] / height_scaled / height_scaled

for split_ids, file_mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                             (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[split_ids].to_csv(file_mask.format(DATA_ID))
In [4]:
# Reload dataset 2 (train + test stacked) for ad-hoc inspection.
train, test = (
    pd.read_csv(file_mask.format(2), index_col='id')
    for file_mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])
In [14]:
# Median weight for each value of `gender_male`.
weight_by_gender = all.groupby('gender_male')['weight']
weight_by_gender.median()
#len(all[all['weight']<40])
Out[14]:
In [16]:
# Dataset 104: dataset 1 plus rule-based repair of height, weight and blood
# pressure values, with indicator columns recording which rows were touched.
#
# Changes from the previous version of this cell:
#   * `.ix` (removed in pandas 1.0) is replaced by `.loc`.
#   * Every rule computes its mask ONCE and uses it for reading, writing and
#     flagging. Before, the `ap_error` mask was recomputed after the value had
#     already been repaired, so most repaired rows were never flagged, and
#     three ap_lo rules read with `>` but wrote with `>=`.
#   * The rules are order-dependent: each one sees the output of the previous.
DATA_ID = 104


def _as_int(values):
    """Truncate toward zero and return a plain int64 array.

    Returning a bare array keeps the assignments positional, exactly like the
    original `np.int64(...)` calls.
    """
    return np.asarray(values).astype(np.int64)


def _hi(mask):
    """Current `ap_hi` values of the masked rows."""
    return all.loc[mask, 'ap_hi']


def _lo(mask):
    """Current `ap_lo` values of the masked rows."""
    return all.loc[mask, 'ap_lo']


def _repair(mask, ap_hi=None, ap_lo=None):
    """Overwrite ap_hi / ap_lo on the masked rows and flag them in `ap_error`.

    Replacement values are evaluated by the caller before this function runs,
    so a rule may derive both new values from the old ones.
    """
    if ap_hi is not None:
        all.loc[mask, 'ap_hi'] = ap_hi
    if ap_lo is not None:
        all.loc[mask, 'ap_lo'] = ap_lo
    all.loc[mask, 'ap_error'] = 1


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(1), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(1), index_col='id')
all = pd.concat([train, test])

all['age_years'] = all['age'] / 365.25

# Height: cap values above 210 at the largest value below 210, flag values
# below 130 and replace them with a fixed per-gender value.
all.loc[all['height'] > 210, 'height'] = all.loc[all['height'] < 210, 'height'].max()
all['height_low'] = (all['height'] < 130).astype(np.int32)
all.loc[(all['height'] < 130) & (all['gender_male'] == 1), 'height'] = 170
all.loc[(all['height'] < 130) & (all['gender_male'] == 0), 'height'] = 161

# Weight: flag values below 40 and replace them with a fixed per-gender value.
all['weight_low'] = (all['weight'] < 40).astype(np.int32)
all.loc[(all['weight'] < 40) & (all['gender_male'] == 1), 'weight'] = 75
all.loc[(all['weight'] < 40) & (all['gender_male'] == 0), 'weight'] = 70

# Keep the ordinal columns and add one-hot versions next to them.
all['cholesterol_all'] = all['cholesterol']
all['gluc_all'] = all['gluc']
all = pd.get_dummies(all, columns=['cholesterol', 'gluc'])

all['ap_error'] = 0
all['ap_error_swap'] = 0

# --- ap_hi: oversized magnitudes and negative signs ---------------------------
mask = all['ap_hi'] > 10000
_repair(mask, ap_hi=_as_int(_hi(mask) / 100))
mask = all['ap_hi'] > 1000
_repair(mask, ap_hi=_as_int(_hi(mask) / 10))
mask = all['ap_hi'] <= -100
_repair(mask, ap_hi=_as_int(-_hi(mask)))
mask = all['ap_hi'] < 0
_repair(mask, ap_hi=_as_int(-_hi(mask)))

# --- rows where ap_hi / ap_lo are rebuilt from each other's digits ------------
mask = (all['ap_hi'] == 1) & (all['ap_lo'] > 1000)
_repair(mask,
        ap_hi=_as_int(_hi(mask) * 100 + _lo(mask) / 100),
        ap_lo=_as_int((_lo(mask) / 10) % 10) * 10)

mask = (all['ap_hi'] > 600) & (all['ap_lo'] == 0)
_repair(mask,
        ap_hi=_as_int(_hi(mask) / 10),
        ap_lo=_as_int(_hi(mask) % 10) * 10)

mask = (all['ap_hi'] > 240) & (all['ap_hi'] <= 600) & (all['ap_lo'] == 0)
_repair(mask,
        ap_hi=_as_int(100 + _hi(mask) / 10),
        ap_lo=_as_int(_hi(mask) % 10) * 10)

mask = (all['ap_hi'] >= 400) & (all['ap_hi'] < 500)
_repair(mask, ap_hi=_as_int(_hi(mask)) - 300)

mask = (all['ap_hi'] >= 900) & (all['ap_hi'] < 1000)
_repair(mask, ap_hi=_as_int(_hi(mask) / 100) * 10)

mask = (all['ap_hi'] >= 700) & (all['ap_hi'] < 800)
_repair(mask, ap_hi=_as_int(_hi(mask) / 100) * 10 + 100)

mask = all['ap_lo'] < 0
_repair(mask, ap_lo=_as_int(-_lo(mask)))

# --- ap_lo == 0: derive ap_lo from ap_hi --------------------------------------
mask = (all['ap_hi'] < 100) & (all['ap_lo'] == 0)
_repair(mask,
        ap_hi=_as_int(_hi(mask) * 10),
        ap_lo=_as_int(_hi(mask) * 10 - 40))

mask = (all['ap_hi'] > 100) & (all['ap_lo'] == 0) & (all['ap_hi'] % 10 == 0)
_repair(mask, ap_lo=_as_int(_hi(mask) - 40))

mask = ((all['ap_hi'] > 100) & (all['ap_lo'] == 0)
        & (all['ap_hi'] % 10 != 0) & (all['ap_hi'] % 10 >= 2))
_repair(mask,
        ap_hi=_as_int(_hi(mask) / 10) * 10,
        ap_lo=_as_int(_hi(mask) % 10) * 10)

mask = ((all['ap_hi'] > 100) & (all['ap_lo'] == 0)
        & (all['ap_hi'] % 10 != 0) & (all['ap_hi'] % 10 < 2))
_repair(mask,
        ap_hi=_as_int(_hi(mask) / 10) * 10,
        ap_lo=_as_int(_hi(mask) % 10) * 100)

# --- ap_lo: oversized magnitudes, largest thresholds first --------------------
mask = all['ap_lo'] >= 10000
_repair(mask, ap_lo=_as_int(_lo(mask) / 100))
mask = all['ap_lo'] >= 4500
_repair(mask, ap_lo=_as_int(_lo(mask) / 100))
mask = all['ap_lo'] >= 4000
_repair(mask, ap_lo=_as_int(_lo(mask) / 100) + 100)
mask = all['ap_lo'] >= 1500
_repair(mask, ap_lo=_as_int(_lo(mask) / 10) % 100)
mask = all['ap_lo'] >= 1200
_repair(mask, ap_lo=_as_int(_lo(mask) / 10))
mask = all['ap_lo'] >= 1150
_repair(mask, ap_lo=_as_int(_lo(mask) / 100) * 10)
mask = all['ap_lo'] >= 1100
_repair(mask, ap_lo=_as_int(_lo(mask) / 10) * 10 % 1000)
mask = all['ap_lo'] >= 1000
_repair(mask, ap_lo=_as_int(_lo(mask) / 100) * 10)
mask = all['ap_lo'] >= 500
_repair(mask, ap_lo=_as_int(_lo(mask) / 10))

# --- ap_lo: undersized magnitudes ---------------------------------------------
mask = (all['ap_hi'] >= 100) & (all['ap_lo'] < 10) & (all['ap_lo'] > 1)
_repair(mask, ap_lo=_as_int(_lo(mask)) * 10)

mask = (all['ap_hi'] > 100) & (all['ap_lo'] >= 10) & (all['ap_lo'] < 19)
_repair(mask, ap_lo=_as_int(_lo(mask)) * 10)

mask = (all['ap_hi'] >= 100) & (all['ap_lo'] == 1)
_repair(mask, ap_lo=_as_int(_lo(mask)) * 100)

mask = all['ap_lo'] < 10
_repair(mask, ap_lo=_as_int(_lo(mask) * 10))

mask = (all['ap_hi'] >= 100) & (all['ap_lo'] == 1)
_repair(mask, ap_lo=_as_int(_hi(mask) - 40))

mask = (all['ap_hi'] >= 100) & (all['ap_lo'] <= 40)
_repair(mask, ap_lo=_as_int(_hi(mask) - 40))

# --- ap_hi: undersized magnitudes ---------------------------------------------
mask = all['ap_hi'] <= 10
_repair(mask, ap_hi=_as_int(_lo(mask) + 40))
mask = all['ap_hi'] <= 20
_repair(mask, ap_hi=_as_int(_hi(mask) * 10))
mask = all['ap_lo'] <= 10
_repair(mask, ap_lo=_as_int(_lo(mask) * 10))

# --- last-resort defaults and ordering ----------------------------------------
mask = (all['ap_lo'] <= 50) & (all['ap_hi'] <= 50)
_repair(mask, ap_hi=120, ap_lo=80)

mask = all['ap_lo'] < 40
_repair(mask, ap_lo=_as_int(_hi(mask) - 30))

mask = all['ap_hi'] == all['ap_lo']
_repair(mask, ap_hi=120, ap_lo=80)

# ap_hi below ap_lo: swap the two readings and record the swap separately.
mask = all['ap_hi'] < all['ap_lo']
_repair(mask, ap_hi=_as_int(_lo(mask)), ap_lo=_as_int(_hi(mask)))
all.loc[mask, 'ap_error_swap'] = 1

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [ ]:
In [17]:
# Dataset 105: dataset 104 plus `imt` = weight / (height / 100) ** 2, computed
# as two successive divisions.
DATA_ID = 105
train, test = (
    pd.read_csv(file_mask.format(104), index_col='id')
    for file_mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])

height_scaled = all['height'] / 100
all['imt'] = all['weight'] / height_scaled / height_scaled

for split_ids, file_mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                             (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[split_ids].to_csv(file_mask.format(DATA_ID))
In [ ]:
In [ ]:
In [18]:
# Dataset 106: dataset 105 plus an `imt` class (0..6), kept both as the
# ordinal column `imt_class_all` and as one-hot `imt_class_*` columns.
# `.ix` was removed in pandas 1.0, so the class assignment uses `.loc`.
DATA_ID = 106
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(105), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(105), index_col='id')
all = pd.concat([train, test])

# (inclusive lower bound, exclusive upper bound) for classes 1..6;
# None means the class has no upper bound. Anything below 16 stays class 0.
imt_class_bounds = [(16, 18.5), (18.5, 25), (25, 30), (30, 35), (35, 40), (40, None)]

all['imt_class'] = 0
for class_id, (lower, upper) in enumerate(imt_class_bounds, start=1):
    in_class = all['imt'] >= lower
    if upper is not None:
        in_class = in_class & (all['imt'] < upper)
    all.loc[in_class, 'imt_class'] = class_id

all['imt_class_all'] = all['imt_class']
all = pd.get_dummies(all, columns=['imt_class'])

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [19]:
# Dataset 120: dataset 106 plus arithmetic combinations of every pair and
# triple of the base numeric columns.
#
# The new columns are collected in a dict and attached with ONE concat.
# Inserting roughly 900 columns one at a time fragments the frame (pandas
# emits PerformanceWarning) and is needlessly slow. Column names, order and
# values are the same as before. The print(0/1/2) progress markers were
# dropped.
from itertools import combinations

DATA_ID = 120
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(106), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(106), index_col='id')
all = pd.concat([train, test])

cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo',  # 'smoke', 'alco', 'active',
        'cholesterol_all', 'gluc_all', 'imt']
cols_norm = {c: c + '_norm' for c in cols}

new_cols = {}

# Mean-centred, range-scaled copy of each base column; the sums and
# differences below are built from these so the operands share a scale.
for c in cols:
    new_cols[cols_norm[c]] = (all[c] - all[c].mean()) / (all[c].max() - all[c].min())

for c1, c2 in combinations(cols, 2):
    n1, n2 = cols_norm[c1], cols_norm[c2]
    # Shift the denominator by 1 when it can reach zero or below.
    denominator = all[c2] if all[c2].min() > 0 else all[c2] + 1
    new_cols["x__{}__{}".format(c1, c2)] = all[c1] * all[c2]
    new_cols["div__{}__{}".format(c1, c2)] = all[c1] / denominator
    new_cols["plus__{}__{}".format(n1, n2)] = new_cols[n1] + new_cols[n2]
    new_cols["minus__{}__{}".format(n1, n2)] = new_cols[n1] - new_cols[n2]

for c1, c2, c3 in combinations(cols, 3):
    n1, n2, n3 = cols_norm[c1], cols_norm[c2], cols_norm[c3]
    a, b, c = all[c1], all[c2], all[c3]
    na, nb, nc = new_cols[n1], new_cols[n2], new_cols[n3]

    new_cols["x__{}__{}__{}".format(c1, c2, c3)] = a * b * c
    new_cols["plus__{}__{}__{}".format(n1, n2, n3)] = na + nb + nc
    # All six sign patterns with at least one minus.
    new_cols["minus1__{}__{}__{}".format(n1, n2, n3)] = na + nb - nc
    new_cols["minus2__{}__{}__{}".format(n1, n2, n3)] = na - nb + nc
    new_cols["minus3__{}__{}__{}".format(n1, n2, n3)] = - na + nb + nc
    new_cols["minus4__{}__{}__{}".format(n1, n2, n3)] = na - nb - nc
    new_cols["minus5__{}__{}__{}".format(n1, n2, n3)] = - na + nb - nc
    new_cols["minus6__{}__{}__{}".format(n1, n2, n3)] = - na - nb + nc
    # All six numerator/denominator placements with at least one division.
    new_cols["div1__{}__{}__{}".format(c1, c2, c3)] = 1 * a * b / c
    new_cols["div2__{}__{}__{}".format(c1, c2, c3)] = 1 * a / b * c
    new_cols["div3__{}__{}__{}".format(c1, c2, c3)] = 1 / a * b * c
    new_cols["div4__{}__{}__{}".format(c1, c2, c3)] = 1 * a / b / c
    new_cols["div5__{}__{}__{}".format(c1, c2, c3)] = 1 / a * b / c
    new_cols["div6__{}__{}__{}".format(c1, c2, c3)] = 1 / a / b * c

# Dict insertion order is preserved, so the columns land in the same order as
# when they were inserted one by one.
all = pd.concat([all, pd.DataFrame(new_cols)], axis=1)

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[19]:
In [20]:
1
Out[20]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [21]:
# Dataset 147: a selected subset of dataset 120's columns, with missing
# values in the subjective columns filled (smoke=0, alco=0, active=1).
#
# The frame is read from the dataset-120 files instead of reusing whatever
# `all` happens to hold in the kernel, so the cell works after a restart.
# Only the selected columns are parsed, which keeps the read cheap.
# The fill goes through DataFrame.fillna with a per-column mapping; the old
# `all[col].fillna(..., inplace=True)` acted on a column taken from a slice,
# which is chained assignment and does nothing under copy-on-write.
DATA_ID = 147
cols = [
    'age',
    'height',
    'weight',
    'ap_hi',
    'ap_lo',
    'smoke',
    'alco',
    'active',
    'gender_male',
    'height_low',
    'weight_low',
    'cholesterol_all',
    'gluc_all',
    'cholesterol_1',
    'cholesterol_2',
    'cholesterol_3',
    'gluc_1',
    'gluc_2',
    'gluc_3',
    'ap_error',
    'ap_error_swap',
    'imt',
    'imt_class_all',
    'imt_class_0',
    'imt_class_1',
    'imt_class_2',
    'imt_class_3',
    'imt_class_4',
    'imt_class_5',
    'imt_class_6',
    'x__age__gluc_all',
    'x__ap_hi__cholesterol_all',
    'div6__height__gluc_all__imt',
    'plus__age_norm__ap_hi_norm__gluc_all_norm',
    'x__age__weight',
    'div1__age__weight__cholesterol_all',
    'div6__age__weight__cholesterol_all',
    'plus__height_norm__weight_norm__gluc_all_norm',
    'div1__ap_hi__ap_lo__cholesterol_all',
    'div6__ap_hi__ap_lo__cholesterol_all',
    'plus__age_norm__gluc_all_norm__imt_norm',
    'minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
    'minus1__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
    'minus6__age_norm__ap_lo_norm__cholesterol_all_norm',
    'minus1__age_norm__ap_lo_norm__cholesterol_all_norm',
    'div6__height__weight__ap_lo',
    'div2__ap_lo__cholesterol_all__gluc_all',
    'x__age__ap_hi__gluc_all',
    'div5__ap_lo__cholesterol_all__gluc_all',
]

train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(120), index_col='id',
                    usecols=['id'] + cols)
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(120), index_col='id',
                   usecols=['id'] + cols)

# `usecols` keeps the file's column order; indexing by `cols` restores ours.
all = pd.concat([train, test])[cols]
all = all.fillna({'smoke': 0, 'alco': 0, 'active': 1})

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [22]:
# Dataset 149: dataset 147 plus `score_scale_val`, looked up from a table
# keyed by gender, smoking, age band, ap_hi band and cholesterol level.
#
# Fixes relative to the previous version of this cell:
#   * The age band of each table row is now part of the row's mask. It was
#     unpacked but never used, so every row overwrote the result of the
#     earlier ones regardless of age.
#   * The ap160 band now runs up to 180 (it stopped at 170, leaving ap_hi in
#     170..179 without any score).
#   * `.ix` (removed in pandas 1.0) is replaced by `.loc`.
# Rows matching no table entry keep NaN, as before.
DATA_ID = 149
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(147), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(147), index_col='id')
all = pd.concat([train, test])

# Each table row carries five score columns (c4..c8). `cholesterol_all` has
# three levels, mapped to c4 / c7 / c8. The previous code also assigned c5 and
# c6 to level 1, but both were immediately overwritten by c4.
chol_level1 = all['cholesterol_all'] == 1
chol_level2 = all['cholesterol_all'] == 2
chol_level3 = all['cholesterol_all'] == 3

male = all['gender_male'] == 1
female = all['gender_male'] == 0

smoke = all['smoke'] == 1
not_smoke = all['smoke'] == 0

age_years = all['age'] / 365.25
age40 = (age_years >= 40) & (age_years < 45)
age45 = (age_years >= 45) & (age_years < 55)
age55 = (age_years >= 55) & (age_years < 60)
age60 = (age_years >= 60) & (age_years < 65)
age65 = (age_years >= 65)

ap120 = (all['ap_hi'] >= 120) & (all['ap_hi'] < 140)
ap140 = (all['ap_hi'] >= 140) & (all['ap_hi'] < 160)
ap160 = (all['ap_hi'] >= 160) & (all['ap_hi'] < 180)
ap180 = (all['ap_hi'] >= 180)

# Columns: gender, smoking, age band, ap_hi band, c4, c5, c6, c7, c8.
data = [
    [female, not_smoke, age65, ap180, 7, 8, 9, 10, 12],
    [female, not_smoke, age65, ap160, 5, 5, 6, 7, 8],
    [female, not_smoke, age65, ap140, 3, 3, 4, 5, 6],
    [female, not_smoke, age65, ap120, 2, 2, 3, 3, 4],
    [female, not_smoke, age60, ap180, 4, 4, 5, 6, 7],
    [female, not_smoke, age60, ap160, 3, 3, 3, 4, 5],
    [female, not_smoke, age60, ap140, 2, 2, 2, 3, 3],
    [female, not_smoke, age60, ap120, 1, 1, 2, 2, 2],
    [female, not_smoke, age55, ap180, 2, 2, 3, 3, 4],
    [female, not_smoke, age55, ap160, 1, 2, 2, 2, 3],
    [female, not_smoke, age55, ap140, 1, 1, 1, 1, 2],
    [female, not_smoke, age55, ap120, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap180, 1, 1, 1, 2, 2],
    [female, not_smoke, age45, ap160, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap140, 0, 1, 1, 1, 1],
    [female, not_smoke, age45, ap120, 0, 0, 1, 1, 1],
    # No values were entered for female / not_smoke / age40 (any ap band).
    #######################################################
    [female, smoke, age65, ap180, 13, 15, 17, 19, 22],
    [female, smoke, age65, ap160, 9, 10, 12, 13, 16],
    [female, smoke, age65, ap140, 6, 7, 8, 9, 11],
    [female, smoke, age65, ap120, 4, 5, 5, 6, 7],
    [female, smoke, age60, ap180, 8, 9, 10, 11, 13],
    [female, smoke, age60, ap160, 5, 6, 7, 8, 9],
    [female, smoke, age60, ap140, 3, 4, 5, 5, 6],
    [female, smoke, age60, ap120, 2, 3, 3, 4, 4],
    [female, smoke, age55, ap180, 4, 5, 5, 6, 7],
    [female, smoke, age55, ap160, 3, 3, 4, 4, 5],
    [female, smoke, age55, ap140, 2, 2, 2, 3, 3],
    [female, smoke, age55, ap120, 1, 1, 2, 2, 2],
    [female, smoke, age45, ap180, 2, 2, 3, 3, 4],
    [female, smoke, age45, ap160, 1, 2, 2, 2, 3],
    [female, smoke, age45, ap140, 1, 1, 1, 1, 2],
    [female, smoke, age45, ap120, 1, 1, 1, 1, 1],
    [female, smoke, age40, ap180, 0, 0, 0, 1, 1],
    # No values were entered for female / smoke / age40 with ap160, ap140, ap120.
    #######################################################
    [male, not_smoke, age65, ap180, 14, 16, 19, 22, 26],
    [male, not_smoke, age65, ap160, 9, 11, 13, 15, 16],
    [male, not_smoke, age65, ap140, 6, 8, 9, 11, 13],
    [male, not_smoke, age65, ap120, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap180, 9, 11, 13, 15, 18],
    [male, not_smoke, age60, ap160, 6, 7, 9, 10, 12],
    [male, not_smoke, age60, ap140, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap120, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap180, 6, 7, 8, 10, 12],
    [male, not_smoke, age55, ap160, 4, 5, 6, 7, 8],
    [male, not_smoke, age55, ap140, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap120, 2, 2, 3, 3, 4],
    [male, not_smoke, age45, ap180, 4, 4, 5, 6, 7],
    [male, not_smoke, age45, ap160, 2, 3, 3, 4, 5],
    [male, not_smoke, age45, ap140, 2, 2, 2, 3, 3],
    [male, not_smoke, age45, ap120, 1, 1, 2, 2, 2],
    [male, not_smoke, age40, ap180, 1, 1, 1, 2, 2],
    [male, not_smoke, age40, ap160, 1, 1, 1, 1, 1],
    [male, not_smoke, age40, ap140, 0, 1, 1, 1, 1],
    [male, not_smoke, age40, ap120, 0, 0, 1, 1, 1],
    #######################################################
    [male, smoke, age65, ap180, 26, 30, 35, 41, 47],
    [male, smoke, age65, ap160, 18, 21, 25, 29, 34],
    [male, smoke, age65, ap140, 13, 15, 17, 20, 24],
    [male, smoke, age65, ap120, 9, 10, 12, 14, 17],
    [male, smoke, age60, ap180, 18, 21, 24, 28, 33],
    [male, smoke, age60, ap160, 12, 14, 17, 20, 24],
    [male, smoke, age60, ap140, 8, 10, 12, 14, 17],
    [male, smoke, age60, ap120, 6, 7, 8, 10, 12],
    [male, smoke, age55, ap180, 12, 13, 16, 19, 22],
    [male, smoke, age55, ap160, 8, 9, 11, 13, 16],
    [male, smoke, age55, ap140, 5, 6, 8, 9, 11],
    [male, smoke, age55, ap120, 4, 4, 5, 6, 8],
    [male, smoke, age45, ap180, 7, 8, 10, 12, 14],
    [male, smoke, age45, ap160, 5, 6, 7, 8, 10],
    [male, smoke, age45, ap140, 3, 4, 5, 6, 7],
    [male, smoke, age45, ap120, 2, 3, 3, 4, 5],
    [male, smoke, age40, ap180, 2, 2, 3, 3, 4],
    [male, smoke, age40, ap160, 1, 2, 2, 2, 3],
    [male, smoke, age40, ap140, 1, 1, 1, 2, 2],
    [male, smoke, age40, ap120, 1, 1, 1, 1, 1],
]

# Start from NaN so rows that match no table entry are explicit.
all['score_scale_val'] = np.nan
for gender, sm, age, aphi, c4, _c5, _c6, c7, c8 in data:
    table_cell = gender & sm & age & aphi
    all.loc[table_cell & chol_level3, 'score_scale_val'] = c8
    all.loc[table_cell & chol_level2, 'score_scale_val'] = c7
    all.loc[table_cell & chol_level1, 'score_scale_val'] = c4

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
In [ ]:
In [ ]:
In [ ]:
In [23]:
# Dataset 150: dataset 149 plus one-hot k-means cluster memberships for
# k = 15, 7 and 3, fitted on every column with NaN replaced by 0.
#
# Changes: `precompute_distances` and `n_jobs` are no longer passed (both were
# removed from KMeans in scikit-learn 1.0), a fixed `random_state` makes the
# clusters reproducible, `n_init=10` pins the number of restarts to the old
# default, and the NaN-filled frame is built once instead of six times.
from sklearn.cluster import KMeans

DATA_ID = 150
KMEANS_SEED = 0

train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(149), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(149), index_col='id')
all = pd.concat([train, test])

kmeans_input = all.fillna(0)

# Fit every model before adding any label column, so all three clusterings
# see the same input features.
cluster_labels = {}
for n_clusters in (15, 7, 3):
    model = KMeans(n_clusters=n_clusters, n_init=10, random_state=KMEANS_SEED)
    model.fit(kmeans_input)
    cluster_labels['k{}'.format(n_clusters)] = model.predict(kmeans_input)

for label_col, labels in cluster_labels.items():
    all[label_col] = labels

all = pd.get_dummies(all, columns=["k15", "k7", "k3"])

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[23]:
In [24]:
# Dataset 151: dataset 150 plus, for each listed column, its z-score within
# several groupings: by gender, gender x cholesterol, gender x glucose and
# gender x age category. A column is not scaled within a grouping that is
# built from that same column.
#
# Changes relative to the previous version of this cell:
#   * `.ix` (removed in pandas 1.0) is gone; the per-group statistics come
#     from GroupBy.transform, which replaces ~30 hand-written assignments per
#     column and no longer fails when a group combination is absent.
#   * Group statistics are computed for the one column being scaled, not for
#     the whole (growing) frame on every assignment.
#   * The age category is kept as a standalone Series instead of a temporary
#     column that had to be deleted afterwards.
#   * New columns are attached with one concat; names and order are unchanged.
DATA_ID = 151
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(150), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(150), index_col='id')
all = pd.concat([train, test])

scaled_source_cols = [
    'age', 'height', 'weight', 'ap_hi', 'ap_lo',
    'cholesterol_all',
    'gluc_all', 'imt', 'imt_class_all',
    'x__age__gluc_all',
    'x__ap_hi__cholesterol_all', 'div6__height__gluc_all__imt',
    'plus__age_norm__ap_hi_norm__gluc_all_norm', 'x__age__weight',
    'div1__age__weight__cholesterol_all',
    'div6__age__weight__cholesterol_all',
    'plus__height_norm__weight_norm__gluc_all_norm',
    'div1__ap_hi__ap_lo__cholesterol_all',
    'div6__ap_hi__ap_lo__cholesterol_all',
    'plus__age_norm__gluc_all_norm__imt_norm',
    'minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
    'minus1__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
    'minus6__age_norm__ap_lo_norm__cholesterol_all_norm',
    'minus1__age_norm__ap_lo_norm__cholesterol_all_norm',
    'div6__height__weight__ap_lo', 'div2__ap_lo__cholesterol_all__gluc_all',
    'x__age__ap_hi__gluc_all', 'div5__ap_lo__cholesterol_all__gluc_all',
    'score_scale_val',
]

# Age category 1..6 from whole years: <40, 40-44, 45-49, 50-54, 55-59, >=60.
age_year = all['age'] // 365.25
age_cat = pd.Series(np.digitize(age_year, [40, 45, 50, 55, 60]) + 1,
                    index=all.index, name='age_cat')

gender = all['gender_male']

# (column-name suffix, grouping keys, source column to skip for this grouping)
groupings = [
    ('___gender__scale', [gender], None),
    ('___gender_chol__scale', [gender, all['cholesterol_all']], 'cholesterol_all'),
    ('___gender_gluc__scale', [gender, all['gluc_all']], 'gluc_all'),
    ('___gender_age__scale', [gender, age_cat], 'age'),
]

scaled_cols = {}
for c in scaled_source_cols:
    for suffix, keys, skipped_col in groupings:
        if c == skipped_col:
            continue
        grouped = all[c].groupby(keys)
        # (value - group mean) / group sample std, broadcast back to the rows.
        scaled_cols[c + suffix] = (all[c] - grouped.transform('mean')) / grouped.transform('std')

all = pd.concat([all, pd.DataFrame(scaled_cols)], axis=1)

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[24]:
In [143]:
# all.groupby(['gender_male', 'age_cat']).mean()
In [144]:
# all.groupby(['gender_male', 'age_cat']).mean()['age'].loc[1, 2]
In [26]:
1
Out[26]:
In [27]:
3
Out[27]:
In [28]:
# Dataset 152: dataset 150 plus a single gender-scaled column taken from
# dataset 151.
DATA_ID = 152
borrowed_col = 'div6__height__gluc_all__imt___gender__scale'

train, test = (
    pd.read_csv(file_mask.format(150), index_col='id')
    for file_mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])

train51, test51 = (
    pd.read_csv(file_mask.format(151), index_col='id')
    for file_mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all51 = pd.concat([train51, test51])

# Column assignment aligns the two frames on the `id` index.
all[borrowed_col] = all51[borrowed_col]

for split_ids, file_mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                             (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[split_ids].to_csv(file_mask.format(DATA_ID))
len(all.columns)
Out[28]:
In [ ]:
In [29]:
# Dataset 153: dataset 152 without the one-hot k-means membership columns
# (k15_0..k15_14, k7_0..k7_6, k3_0..k3_2).
DATA_ID = 153
train, test = (
    pd.read_csv(file_mask.format(152), index_col='id')
    for file_mask in (config.QML_TRAIN_X_FILE_MASK, config.QML_TEST_X_FILE_MASK)
)
all = pd.concat([train, test])

cluster_dummy_cols = [
    'k{}_{}'.format(n_clusters, label)
    for n_clusters in (15, 7, 3)
    for label in range(n_clusters)
]
all = all.drop(columns=cluster_dummy_cols)

for split_ids, file_mask in ((ids_train, config.QML_TRAIN_X_FILE_MASK),
                             (ids_test, config.QML_TEST_X_FILE_MASK)):
    all.loc[split_ids].to_csv(file_mask.format(DATA_ID))
len(all.columns)
Out[29]:
In [ ]:
In [ ]:
In [30]:
# Dataset 156: dataset 153 plus one-hot k-means cluster memberships for
# k = 15, 7 and 3, fitted on a fixed subset of the base columns.
#
# Changes: `precompute_distances` and `n_jobs` are no longer passed (both were
# removed from KMeans in scikit-learn 1.0), a fixed `random_state` makes the
# clusters reproducible, and `n_init=10` pins the number of restarts to the
# old default.
from sklearn.cluster import KMeans

DATA_ID = 156
KMEANS_SEED = 0

train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(153), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(153), index_col='id')
all = pd.concat([train, test])

# Columns the clustering is fitted on.
all2 = all[['age', 'height', 'weight', 'ap_hi', 'ap_lo',
            'gender_male', 'height_low', 'weight_low', 'cholesterol_all',
            'gluc_all', 'ap_error', 'ap_error_swap', 'imt', 'imt_class_all',
            ]]

cluster_labels = {}
for n_clusters in (15, 7, 3):
    model = KMeans(n_clusters=n_clusters, n_init=10, random_state=KMEANS_SEED)
    model.fit(all2)
    cluster_labels['k{}'.format(n_clusters)] = model.predict(all2)

for label_col, labels in cluster_labels.items():
    all[label_col] = labels

all = pd.get_dummies(all, columns=["k15", "k7", "k3"])

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))
len(all.columns)
Out[30]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: