In [2]:
import sys, os
#sys.path.insert(0, os.getcwd() + '//..')
os.chdir('d:/ml/mlbootcamp5')


import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

from datetime import datetime

import qml_workdir.classes

from qml_workdir.classes.config import config


%matplotlib inline

In [3]:
ids_train = pd.read_csv(config.QML_DATA_DIR + "ids_train.csv", index_col='id').index
ids_test = pd.read_csv(config.QML_DATA_DIR + "ids_test.csv", index_col='id').index

In [3]:
train_raw = pd.read_csv(config.QML_DATA_DIR + "raw/train.csv", delimiter=";", index_col="id", na_values=['None'])
test_raw = pd.read_csv(config.QML_DATA_DIR + "raw/test.csv", delimiter=";", index_col="id", na_values=['None'])

train_raw.drop(train_raw.columns, axis=1).to_csv(config.QML_DATA_DIR + "ids_train.csv")
test_raw.drop(test_raw.columns, axis=1).to_csv(config.QML_DATA_DIR + "ids_test.csv")

train_raw.drop(set(train_raw.columns)-set(['cardio']), axis=1).to_csv(config.QML_DATA_DIR + "train_y.csv")

train_raw.drop(['cardio'], axis=1, inplace=True)

all_raw = pd.concat([train_raw, test_raw])


ids_train = pd.read_csv(config.QML_DATA_DIR + "ids_train.csv", index_col='id').index
ids_test = pd.read_csv(config.QML_DATA_DIR + "ids_test.csv", index_col='id').index

In [ ]:


In [36]:
#raw with na
DATA_ID = 1
all = all_raw.copy()

all = pd.get_dummies(all, columns=['gender']).rename(columns={'gender_1': 'gender_female', 'gender_2': 'gender_male'})


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [ ]:


In [40]:
#raw filled na
DATA_ID = 2

train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(1), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(1), index_col='id')
all = pd.concat([train, test])

all.fillna(0, inplace=True)


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [ ]:


In [42]:
#with imt
DATA_ID = 3

train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(2), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(2), index_col='id')
all = pd.concat([train, test])

all['imt'] = all['weight'] / (all['height']/100)/ (all['height']/100)


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [ ]:


In [73]:
#raw + processed data
DATA_ID = 4

train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(1), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(1), index_col='id')
all = pd.concat([train, test])
all = pd.concat([train, test])



all['age_years'] = all['age'] / 365.25
all.ix[all['height']>210, 'height'] = all[all['height']<210]['height'].max()
all['height_low'] = np.int32(all['height']<110)
all.ix[all['height']<110, 'height'] = 110

all['weight_low'] = np.int32(all['weight']<40)
all.ix[all['weight']<40, 'weight'] = 40

all['cholesterol_all'] = all['cholesterol']
all['gluc_all'] = all['gluc']

all = pd.get_dummies(all, columns=['cholesterol', 'gluc'])

all['ap_error'] = 0
all['ap_error_swap'] = 0

all.ix[all['ap_hi']>10000, 'ap_hi'] = np.int64(all.ix[all['ap_hi']>10000, 'ap_hi']/100)
all.ix[all['ap_hi']>10000, 'ap_error'] = 1


all.ix[all['ap_hi']>1000, 'ap_hi'] = np.int64(all.ix[all['ap_hi']>1000, 'ap_hi']/10)
all.ix[all['ap_hi']>1000, 'ap_error'] = 1


all.ix[all['ap_hi']<=-100, 'ap_hi'] = np.int64(-all.ix[all['ap_hi']<=-100, 'ap_hi'])
all.ix[all['ap_hi']<=-100, 'ap_error'] = 1

all.ix[all['ap_hi']<0, 'ap_hi'] = np.int64(-all.ix[all['ap_hi']<0, 'ap_hi'])
all.ix[all['ap_hi']<0, 'ap_error'] = 1


ix = (all['ap_hi']==1) & (all['ap_lo']>1000)
all.ix[ix, 'ap_hi'], all.ix[ix, 'ap_lo'] = \
np.int64( all.ix[ix, 'ap_hi']*100 + all.ix[ix, 'ap_lo']/100), np.int64(((all.ix[ix, 'ap_lo']/10)%10))*10
all.ix[ix, 'ap_error'] = 1

ix = (all['ap_hi']>600) & (all['ap_lo']==0)
all.ix[ix, 'ap_hi'], all.ix[ix, 'ap_lo'] = \
np.int64(all.ix[ix, 'ap_hi']/10), np.int64(all.ix[ix, 'ap_hi']%10)*10
all.ix[ix, 'ap_error'] = 1

ix = (all['ap_hi']>240) &(all['ap_hi']<=600) & (all['ap_lo']==0)
all.ix[ix, 'ap_hi'], all.ix[ix, 'ap_lo'] = \
np.int64(100 + all.ix[ix, 'ap_hi']/10), np.int64(all.ix[ix, 'ap_hi']%10)*10
all.ix[ix, 'ap_error'] = 1


ix = (all['ap_hi']>=400) &(all['ap_hi']<500)
all.ix[ix, 'ap_hi'] = np.int64(all.ix[ix, 'ap_hi'])-300
all.ix[ix, 'ap_error'] = 1


ix = (all['ap_hi']>=900) &(all['ap_hi']<1000)
all.ix[ix, 'ap_hi'] = np.int64(all.ix[ix, 'ap_hi']/100)*10
all.ix[ix, 'ap_error'] = 1


ix = (all['ap_hi']>=700) &(all['ap_hi']<800)
all.ix[ix, 'ap_hi'] = np.int64(all.ix[ix, 'ap_hi']/100)*10+100
all.ix[ix, 'ap_error'] = 1


all.ix[all['ap_lo']<0, 'ap_lo'] = np.int64(-all.ix[all['ap_lo']<0, 'ap_lo'])
all.ix[all['ap_lo']<0, 'ap_error'] = 1


ix = (all['ap_hi']<100) & (all['ap_lo']==0)
all.ix[ix, 'ap_hi'], all.ix[ix, 'ap_lo'] = \
np.int64(all.ix[ix, 'ap_hi']*10), np.int64(all.ix[ix, 'ap_hi']*10-40)
all.ix[ix, 'ap_error'] = 1


ix = (all['ap_hi']>100) & (all['ap_lo']==0) & (all['ap_hi']%10==0)
all.ix[ix, 'ap_lo'] = np.int64(all.ix[ix, 'ap_hi'] - 40)
all.ix[ix, 'ap_error'] = 1


ix = (all['ap_hi']>100) & (all['ap_lo']==0) & (all['ap_hi']%10!=0) & (all['ap_hi']%10>=2)
all.ix[ix, 'ap_hi'], all.ix[ix, 'ap_lo'] = \
np.int64(all.ix[ix, 'ap_hi']/10)*10, np.int64(all.ix[ix, 'ap_hi']%10)*10
all.ix[ix, 'ap_error'] = 1


ix = (all['ap_hi']>100) & (all['ap_lo']==0) & (all['ap_hi']%10!=0) & (all['ap_hi']%10<2)
all.ix[ix, 'ap_hi'], all.ix[ix, 'ap_lo'] = \
np.int64(all.ix[ix, 'ap_hi']/10)*10, np.int64(all.ix[ix, 'ap_hi']%10)*100
all.ix[ix, 'ap_error'] = 1


all.ix[all['ap_lo']>=10000, 'ap_lo'] = np.int64(all.ix[all['ap_lo']>=10000, 'ap_lo']/100)
all.ix[all['ap_lo']>=10000, 'ap_error'] = 1


all.ix[all['ap_lo']>=4500, 'ap_lo'] = np.int64(all.ix[all['ap_lo']>4500, 'ap_lo']/100)
all.ix[all['ap_lo']>=4500, 'ap_error'] = 1


all.ix[all['ap_lo']>=4000, 'ap_lo'] = np.int64(all.ix[all['ap_lo']>4000, 'ap_lo']/100)+100
all.ix[all['ap_lo']>=4000, 'ap_error'] = 1


all.ix[all['ap_lo']>=1500, 'ap_lo'] = np.int64(all.ix[all['ap_lo']>1500, 'ap_lo']/10)%100
all.ix[all['ap_lo']>=1500, 'ap_error'] = 1


all.ix[all['ap_lo']>=1200, 'ap_lo'] = np.int64(all.ix[all['ap_lo']>=1200, 'ap_lo']/10)
all.ix[all['ap_lo']>=1200, 'ap_error'] = 1


all.ix[all['ap_lo']>=1150, 'ap_lo'] = np.int64(all.ix[all['ap_lo']>=1150, 'ap_lo']/100)*10
all.ix[all['ap_lo']>=1150, 'ap_error'] = 1


all.ix[all['ap_lo']>=1100, 'ap_lo'] = np.int64(all.ix[all['ap_lo']>=1100, 'ap_lo']/10)*10%1000
all.ix[all['ap_lo']>=1100, 'ap_error'] = 1


all.ix[all['ap_lo']>=1000, 'ap_lo'] = np.int64(all.ix[all['ap_lo']>=1000, 'ap_lo']/100)*10
all.ix[all['ap_lo']>=1000, 'ap_error'] = 1


all.ix[all['ap_lo']>=500, 'ap_lo'] = np.int64(all.ix[all['ap_lo']>=500, 'ap_lo']/10)
all.ix[all['ap_lo']>=500, 'ap_error'] = 1


ix = (all['ap_hi']>=100) & (all['ap_lo']<10)& (all['ap_lo']>1)
all.ix[ix, 'ap_lo'] = np.int64(all.ix[ix, 'ap_lo'])*10
all.ix[ix, 'ap_error'] = 1


ix = (all['ap_hi']>100) & (all['ap_lo']>=10)& (all['ap_lo']<19)
all.ix[ix, 'ap_lo'] = np.int64(all.ix[ix, 'ap_lo'])*10
all.ix[ix, 'ap_error'] = 1


ix = (all['ap_hi']>=100) & (all['ap_lo']==1)
all.ix[ix, 'ap_lo'] = np.int64(all.ix[ix, 'ap_lo'])*100
all.ix[ix, 'ap_error'] = 1


all.ix[all['ap_lo']<10, 'ap_lo'] = np.int64(all.ix[all['ap_lo']<10, 'ap_lo']*10)
all.ix[all['ap_lo']<10, 'ap_error'] = 1


ix = (all['ap_hi']>=100) & (all['ap_lo']==1)
all.ix[ix, 'ap_lo'] = np.int64(all.ix[ix, 'ap_hi'] - 40)
all.ix[ix, 'ap_error'] = 1


ix = (all['ap_hi']>=100) & (all['ap_lo']<=40)
all.ix[ix, 'ap_lo'] = np.int64(all.ix[ix, 'ap_hi'] - 40)
all.ix[ix, 'ap_error'] = 1


all.ix[all['ap_hi']<=10, 'ap_hi'] = np.int64(all.ix[all['ap_hi']<=10, 'ap_lo']+40)
all.ix[all['ap_hi']<=10, 'ap_error'] = 1


all.ix[all['ap_hi']<=20, 'ap_hi'] = np.int64(all.ix[all['ap_hi']<=20, 'ap_hi']*10)
all.ix[all['ap_hi']<=20, 'ap_error'] = 1


all.ix[all['ap_lo']<=10, 'ap_lo'] = np.int64(all.ix[all['ap_lo']<=10, 'ap_lo']*10)
all.ix[all['ap_lo']<=10, 'ap_error'] = 1



ix = (all['ap_lo']<=50) & (all['ap_hi']<=50)
all.ix[ix, 'ap_hi'], all.ix[ix, 'ap_lo'] = 120, 80
all.ix[ix, 'ap_error'] = 1


all.ix[all['ap_lo']<40, 'ap_lo'] = np.int64(all.ix[all['ap_lo']<40, 'ap_hi']-30)
all.ix[all['ap_lo']<40, 'ap_error'] = 1


ix = (all['ap_hi']==all['ap_lo'])
all.ix[ix, 'ap_hi'], all.ix[ix, 'ap_lo'] = 120, 80
all.ix[ix, 'ap_error'] = 1


ix = (all['ap_hi']<all['ap_lo'])
all.ix[ix, 'ap_hi'], all.ix[ix, 'ap_lo'] = all.ix[ix, 'ap_lo'], all.ix[ix, 'ap_hi']
all.ix[ix, 'ap_error'] = 1
all.ix[ix, 'ap_error_swap'] = 1



all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [ ]:


In [74]:
#with imt
DATA_ID = 5

train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(4), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(4), index_col='id')
all = pd.concat([train, test])

all['imt'] = all['weight'] / (all['height']/100)/ (all['height']/100)


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [ ]:


In [76]:
#with imt stat
DATA_ID = 6

train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(5), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(5), index_col='id')
all = pd.concat([train, test])

all['imt_class'] = 0

all.ix[(16<=all['imt']) & (all['imt']<18.5) , 'imt_class'] = 1
all.ix[(18.5<=all['imt']) & (all['imt']<25) , 'imt_class'] = 2
all.ix[(25<=all['imt']) & (all['imt']<30) , 'imt_class'] = 3
all.ix[(30<=all['imt']) & (all['imt']<35) , 'imt_class'] = 4
all.ix[(35<=all['imt']) & (all['imt']<40) , 'imt_class'] =5
all.ix[(40<=all['imt']) , 'imt_class'] = 6

all['imt_class_all'] = all['imt_class']
all = pd.get_dummies(all, columns=['imt_class'])

all['smoke'].fillna(0, inplace=True)
all['alco'].fillna(0, inplace=True)
all['active'].fillna(1, inplace=True)


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [8]:



Out[8]:
Index(['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'smoke', 'alco', 'active',
       'gender_female', 'gender_male', 'age_years', 'height_low', 'weight_low',
       'cholesterol_all', 'gluc_all', 'cholesterol_1', 'cholesterol_2',
       'cholesterol_3', 'gluc_1', 'gluc_2', 'gluc_3', 'ap_error',
       'ap_error_swap', 'imt', 'imt_class_all', 'imt_class_0', 'imt_class_1',
       'imt_class_2', 'imt_class_3', 'imt_class_4', 'imt_class_5',
       'imt_class_6'],
      dtype='object')

In [ ]:


In [ ]:


In [11]:
#combinations
DATA_ID = 7


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])

from itertools import combinations_with_replacement

cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'smoke', 'alco', 'active',
       'age_years', 'cholesterol_all', 'gluc_all', 'imt', 'imt_class_all']

for c1, c2 in combinations_with_replacement(cols, 2):
    all["x_{}_{}".format(c1,c2)] = all[c1] * all[c2]
    all["div_{}_{}".format(c1,c2)] = all[c1] / (all[c2] if all[c2].min()>0 else all[c2]+1)
    all["plus_{}_{}".format(c1,c2)] = all[c1] + all[c2]
    all["min_{}_{}".format(c1,c2)] = all[c1] - all[c2]

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [9]:
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(7), index_col='id')

In [10]:
len(test.columns)


Out[10]:
396

In [3]:


In [ ]:


In [ ]:


In [4]:
#raw + processed data (как 4 только ""Я считал, что 2 и 1 это ошибочно распознанная 7, а 3 - это 8.)
DATA_ID = 8

train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(1), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(1), index_col='id')
all = pd.concat([train, test])
all = pd.concat([train, test])



all['age_years'] = all['age'] / 365.25
all.ix[all['height']>210, 'height'] = all[all['height']<210]['height'].max()
all['height_low'] = np.int32(all['height']<110)
all.ix[all['height']<110, 'height'] = 110

all['weight_low'] = np.int32(all['weight']<40)
all.ix[all['weight']<30, 'weight'] = all.ix[all['weight']<30, 'weight'] %10 +70
all.ix[all['weight']<40, 'weight'] = all.ix[all['weight']<30, 'weight'] %10 +80

all['cholesterol_all'] = all['cholesterol']
all['gluc_all'] = all['gluc']

all = pd.get_dummies(all, columns=['cholesterol', 'gluc'])

all['ap_error'] = 0
all['ap_error_swap'] = 0

all.ix[all['ap_hi']>10000, 'ap_hi'] = np.int64(all.ix[all['ap_hi']>10000, 'ap_hi']/100)
all.ix[all['ap_hi']>10000, 'ap_error'] = 1


all.ix[all['ap_hi']>1000, 'ap_hi'] = np.int64(all.ix[all['ap_hi']>1000, 'ap_hi']/10)
all.ix[all['ap_hi']>1000, 'ap_error'] = 1


all.ix[all['ap_hi']<=-100, 'ap_hi'] = np.int64(-all.ix[all['ap_hi']<=-100, 'ap_hi'])
all.ix[all['ap_hi']<=-100, 'ap_error'] = 1

all.ix[all['ap_hi']<0, 'ap_hi'] = np.int64(-all.ix[all['ap_hi']<0, 'ap_hi'])
all.ix[all['ap_hi']<0, 'ap_error'] = 1


ix = (all['ap_hi']==1) & (all['ap_lo']>1000)
all.ix[ix, 'ap_hi'], all.ix[ix, 'ap_lo'] = \
np.int64( all.ix[ix, 'ap_hi']*100 + all.ix[ix, 'ap_lo']/100), np.int64(((all.ix[ix, 'ap_lo']/10)%10))*10
all.ix[ix, 'ap_error'] = 1

ix = (all['ap_hi']>600) & (all['ap_lo']==0)
all.ix[ix, 'ap_hi'], all.ix[ix, 'ap_lo'] = \
np.int64(all.ix[ix, 'ap_hi']/10), np.int64(all.ix[ix, 'ap_hi']%10)*10
all.ix[ix, 'ap_error'] = 1

ix = (all['ap_hi']>240) &(all['ap_hi']<=600) & (all['ap_lo']==0)
all.ix[ix, 'ap_hi'], all.ix[ix, 'ap_lo'] = \
np.int64(100 + all.ix[ix, 'ap_hi']/10), np.int64(all.ix[ix, 'ap_hi']%10)*10
all.ix[ix, 'ap_error'] = 1


ix = (all['ap_hi']>=400) &(all['ap_hi']<500)
all.ix[ix, 'ap_hi'] = np.int64(all.ix[ix, 'ap_hi'])-300
all.ix[ix, 'ap_error'] = 1


ix = (all['ap_hi']>=900) &(all['ap_hi']<1000)
all.ix[ix, 'ap_hi'] = np.int64(all.ix[ix, 'ap_hi']/100)*10
all.ix[ix, 'ap_error'] = 1


ix = (all['ap_hi']>=700) &(all['ap_hi']<800)
all.ix[ix, 'ap_hi'] = np.int64(all.ix[ix, 'ap_hi']/100)*10+100
all.ix[ix, 'ap_error'] = 1


all.ix[all['ap_lo']<0, 'ap_lo'] = np.int64(-all.ix[all['ap_lo']<0, 'ap_lo'])
all.ix[all['ap_lo']<0, 'ap_error'] = 1


ix = (all['ap_hi']<100) & (all['ap_lo']==0)
all.ix[ix, 'ap_hi'], all.ix[ix, 'ap_lo'] = \
np.int64(all.ix[ix, 'ap_hi']*10), np.int64(all.ix[ix, 'ap_hi']*10-40)
all.ix[ix, 'ap_error'] = 1


ix = (all['ap_hi']>100) & (all['ap_lo']==0) & (all['ap_hi']%10==0)
all.ix[ix, 'ap_lo'] = np.int64(all.ix[ix, 'ap_hi'] - 40)
all.ix[ix, 'ap_error'] = 1


ix = (all['ap_hi']>100) & (all['ap_lo']==0) & (all['ap_hi']%10!=0) & (all['ap_hi']%10>=2)
all.ix[ix, 'ap_hi'], all.ix[ix, 'ap_lo'] = \
np.int64(all.ix[ix, 'ap_hi']/10)*10, np.int64(all.ix[ix, 'ap_hi']%10)*10
all.ix[ix, 'ap_error'] = 1


ix = (all['ap_hi']>100) & (all['ap_lo']==0) & (all['ap_hi']%10!=0) & (all['ap_hi']%10<2)
all.ix[ix, 'ap_hi'], all.ix[ix, 'ap_lo'] = \
np.int64(all.ix[ix, 'ap_hi']/10)*10, np.int64(all.ix[ix, 'ap_hi']%10)*100
all.ix[ix, 'ap_error'] = 1


all.ix[all['ap_lo']>=10000, 'ap_lo'] = np.int64(all.ix[all['ap_lo']>=10000, 'ap_lo']/100)
all.ix[all['ap_lo']>=10000, 'ap_error'] = 1


all.ix[all['ap_lo']>=4500, 'ap_lo'] = np.int64(all.ix[all['ap_lo']>4500, 'ap_lo']/100)
all.ix[all['ap_lo']>=4500, 'ap_error'] = 1


all.ix[all['ap_lo']>=4000, 'ap_lo'] = np.int64(all.ix[all['ap_lo']>4000, 'ap_lo']/100)+100
all.ix[all['ap_lo']>=4000, 'ap_error'] = 1


all.ix[all['ap_lo']>=1500, 'ap_lo'] = np.int64(all.ix[all['ap_lo']>1500, 'ap_lo']/10)%100
all.ix[all['ap_lo']>=1500, 'ap_error'] = 1


all.ix[all['ap_lo']>=1200, 'ap_lo'] = np.int64(all.ix[all['ap_lo']>=1200, 'ap_lo']/10)
all.ix[all['ap_lo']>=1200, 'ap_error'] = 1


all.ix[all['ap_lo']>=1150, 'ap_lo'] = np.int64(all.ix[all['ap_lo']>=1150, 'ap_lo']/100)*10
all.ix[all['ap_lo']>=1150, 'ap_error'] = 1


all.ix[all['ap_lo']>=1100, 'ap_lo'] = np.int64(all.ix[all['ap_lo']>=1100, 'ap_lo']/10)*10%1000
all.ix[all['ap_lo']>=1100, 'ap_error'] = 1


all.ix[all['ap_lo']>=1000, 'ap_lo'] = np.int64(all.ix[all['ap_lo']>=1000, 'ap_lo']/100)*10
all.ix[all['ap_lo']>=1000, 'ap_error'] = 1


all.ix[all['ap_lo']>=500, 'ap_lo'] = np.int64(all.ix[all['ap_lo']>=500, 'ap_lo']/10)
all.ix[all['ap_lo']>=500, 'ap_error'] = 1


ix = (all['ap_hi']>=100) & (all['ap_lo']<10)& (all['ap_lo']>1)
all.ix[ix, 'ap_lo'] = np.int64(all.ix[ix, 'ap_lo'])*10
all.ix[ix, 'ap_error'] = 1


ix = (all['ap_hi']>100) & (all['ap_lo']>=10)& (all['ap_lo']<19)
all.ix[ix, 'ap_lo'] = np.int64(all.ix[ix, 'ap_lo'])*10
all.ix[ix, 'ap_error'] = 1


ix = (all['ap_hi']>=100) & (all['ap_lo']==1)
all.ix[ix, 'ap_lo'] = np.int64(all.ix[ix, 'ap_lo'])*100
all.ix[ix, 'ap_error'] = 1


all.ix[all['ap_lo']<10, 'ap_lo'] = np.int64(all.ix[all['ap_lo']<10, 'ap_lo']*10)
all.ix[all['ap_lo']<10, 'ap_error'] = 1


ix = (all['ap_hi']>=100) & (all['ap_lo']==1)
all.ix[ix, 'ap_lo'] = np.int64(all.ix[ix, 'ap_hi'] - 40)
all.ix[ix, 'ap_error'] = 1


ix = (all['ap_hi']>=100) & (all['ap_lo']<=40)
all.ix[ix, 'ap_lo'] = np.int64(all.ix[ix, 'ap_hi'] - 40)
all.ix[ix, 'ap_error'] = 1


all.ix[all['ap_hi']<=10, 'ap_hi'] = np.int64(all.ix[all['ap_hi']<=10, 'ap_lo']+40)
all.ix[all['ap_hi']<=10, 'ap_error'] = 1


all.ix[all['ap_hi']<=20, 'ap_hi'] = np.int64(all.ix[all['ap_hi']<=20, 'ap_hi']*10)
all.ix[all['ap_hi']<=20, 'ap_error'] = 1


all.ix[all['ap_lo']<=10, 'ap_lo'] = np.int64(all.ix[all['ap_lo']<=10, 'ap_lo']*10)
all.ix[all['ap_lo']<=10, 'ap_error'] = 1



ix = (all['ap_lo']<=50) & (all['ap_hi']<=50)
all.ix[ix, 'ap_hi'], all.ix[ix, 'ap_lo'] = 120, 80
all.ix[ix, 'ap_error'] = 1


all.ix[all['ap_lo']<40, 'ap_lo'] = np.int64(all.ix[all['ap_lo']<40, 'ap_hi']-30)
all.ix[all['ap_lo']<40, 'ap_error'] = 1


ix = (all['ap_hi']==all['ap_lo'])
all.ix[ix, 'ap_hi'], all.ix[ix, 'ap_lo'] = 120, 80
all.ix[ix, 'ap_error'] = 1


ix = (all['ap_hi']<all['ap_lo'])
all.ix[ix, 'ap_hi'], all.ix[ix, 'ap_lo'] = all.ix[ix, 'ap_lo'], all.ix[ix, 'ap_hi']
all.ix[ix, 'ap_error'] = 1
all.ix[ix, 'ap_error_swap'] = 1



all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [ ]:


In [5]:
#with imt
DATA_ID = 9

train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(8), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(8), index_col='id')
all = pd.concat([train, test])

all['imt'] = all['weight'] / (all['height']/100)/ (all['height']/100)


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [ ]:


In [8]:
#cleaned + dillna
DATA_ID = 10

train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(8), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(8), index_col='id')
all = pd.concat([train, test])

all['smoke'].fillna(0, inplace=True)
all['alco'].fillna(0, inplace=True)
all['active'].fillna(1, inplace=True)


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [ ]:


In [ ]:


In [10]:
#cleaned + dillna
DATA_ID = 11

train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(8), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(8), index_col='id')
all = pd.concat([train, test])

all['smoke'].fillna(all['smoke'].mean(), inplace=True)
all['alco'].fillna(all['alco'].mean(), inplace=True)
all['active'].fillna(all['active'].mean(), inplace=True)


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [ ]:


In [27]:
#with imt stat
DATA_ID = 12

train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(9), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(9), index_col='id')
all = pd.concat([train, test])

all['imt_class'] = 0

all.ix[(16<=all['imt']) & (all['imt']<18.5) , 'imt_class'] = 1
all.ix[(18.5<=all['imt']) & (all['imt']<25) , 'imt_class'] = 2
all.ix[(25<=all['imt']) & (all['imt']<30) , 'imt_class'] = 3
all.ix[(30<=all['imt']) & (all['imt']<35) , 'imt_class'] = 4
all.ix[(35<=all['imt']) & (all['imt']<40) , 'imt_class'] =5
all.ix[(40<=all['imt']) , 'imt_class'] = 6

all['imt_class_all'] = all['imt_class']
all = pd.get_dummies(all, columns=['imt_class'])


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [ ]:


In [43]:
#6 without one hot
DATA_ID = 13

train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])

all.drop(labels=['gender_female', 'imt_class_0', 'imt_class_1',
       'imt_class_2', 'imt_class_3', 'imt_class_4', 'imt_class_5',
       'imt_class_6', 'cholesterol_1', 'cholesterol_2',
       'cholesterol_3', 'gluc_1', 'gluc_2', 'gluc_3'], axis=1, inplace=True)


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [44]:
all.columns


Out[44]:
Index(['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'smoke', 'alco', 'active',
       'gender_male', 'age_years', 'height_low', 'weight_low',
       'cholesterol_all', 'gluc_all', 'ap_error', 'ap_error_swap', 'imt',
       'imt_class_all'],
      dtype='object')

In [ ]:


In [ ]:


In [ ]:
#combinations
DATA_ID = 14


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])


from itertools import combinations_with_replacement

cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'smoke', 'alco', 'active',
       'age_years', 'cholesterol_all', 'gluc_all', 'imt', 'imt_class_all']

for c1, c2 in combinations_with_replacement(cols, 2):
    all["x__{}__{}".format(c1,c2)] = all[c1] * all[c2]
    
    if c1 != c2:
        all["div__{}__{}".format(c1,c2)] = all[c1] / (all[c2] if all[c2].min()>0 else all[c2]+1)
        all["plus__{}__{}".format(c1,c2)] = all[c1] + all[c2]
        all["minus__{}__{}".format(c1,c2)] = all[c1] - all[c2]
        
for col in cols:
    all["x__{}__{}".format(col,'gender_male')] = all[col] * all['gender_male']
    all["x__{}__{}".format(col,'gender_female')] = all[col] * all['gender_female']
    
all['log__ap_hi'] = np.log(all['ap_hi'])
all['log__ap_lo'] = np.log(all['ap_lo'])
all['minus__log_ap_hi__log_ap_lo'] = all['log__ap_hi'] - all['log__ap_lo']

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)

In [ ]:


In [53]:


In [ ]:
#6 + features selection hyperopt 02
DATA_ID = 15


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])



cols = ['active', 'age', 'alco', 'ap_error_swap', 'ap_hi', 'ap_lo', 'cholesterol_2', 'cholesterol_3', 'cholesterol_all', 'gender_female', 'gluc_1', 'gluc_2', 'gluc_3', 'gluc_all', 'height', 'imt', 'imt_class_4', 'imt_class_6', 'smoke', 'weight']

all = all[cols]

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [ ]:


In [4]:
#6 + features selection del 01
DATA_ID = 16


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])

all.drop(labels=['height', 'cholesterol_2'], axis=1, inplace=True)

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [ ]:


In [11]:
#6 + features selection add 03
DATA_ID = 17


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])

all['x__age__gluc_all'] = all['age']*all['gluc_all']
all['smoke'].fillna(0, inplace=True)
all['alco'].fillna(0, inplace=True)
all['active'].fillna(1, inplace=True)

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [16]:
#6 + features selection add 04 
DATA_ID = 18


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])

all['x__age__gluc_all'] = all['age']*all['gluc_all']
all['x__ap_hi__cholesterol_all'] = all['ap_hi']*all['cholesterol_all']
all['smoke'].fillna(0, inplace=True)
all['alco'].fillna(0, inplace=True)
all['active'].fillna(1, inplace=True)

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [17]:
#6 + features selection add 05
DATA_ID = 19


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])

all['x__age__gluc_all'] = all['age']*all['gluc_all']
all['x__ap_hi__cholesterol_all'] = all['ap_hi']*all['cholesterol_all']
all['div__smoke__imt_class_all'] = all['smoke']/(all['imt_class_all'] if all['imt_class_all'].min()>0 else all['imt_class_all']+1)
all['smoke'].fillna(0, inplace=True)
all['alco'].fillna(0, inplace=True)
all['active'].fillna(1, inplace=True)

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [ ]:


In [41]:
#6 + combinations
DATA_ID = 20


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])


from itertools import combinations

cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', #'smoke', 'alco', 'active',
        'cholesterol_all', 'gluc_all', 'imt']

cols_norm = {}
for c in cols:
    all[c+'_norm'] = (all[c] - all[c].mean()) / (all[c].max() - all[c].min())
    cols_norm[c] = c+'_norm'
    

for c1, c2 in combinations(cols, 2):
    all["x__{}__{}".format(c1,c2)] = all[c1] * all[c2]
    all["div__{}__{}".format(c1,c2)] = all[c1] / (all[c2] if all[c2].min()>0 else all[c2]+1)
    all["plus__{}__{}".format(cols_norm[c1],cols_norm[c2])] = all[cols_norm[c1]] + all[cols_norm[c2]]
    all["minus__{}__{}".format(cols_norm[c1],cols_norm[c2])] = all[cols_norm[c1]] - all[cols_norm[c2]]
print(0)
for c1, c2, c3 in combinations(cols, 3):
    all["x__{}__{}__{}".format(c1,c2,c3)] = all[c1] * all[c2] * all[c3]
    all["plus__{}__{}__{}".format(cols_norm[c1],cols_norm[c2],cols_norm[c3])] = all[cols_norm[c1]] + all[cols_norm[c2]] + all[cols_norm[c3]]

    all["minus1__{}__{}__{}".format(cols_norm[c1],cols_norm[c2],cols_norm[c3])] = \
        all[cols_norm[c1]] + all[cols_norm[c2]] - all[cols_norm[c3]]
    all["minus2__{}__{}__{}".format(cols_norm[c1],cols_norm[c2],cols_norm[c3])] = \
        all[cols_norm[c1]] - all[cols_norm[c2]] + all[cols_norm[c3]]
    all["minus3__{}__{}__{}".format(cols_norm[c1],cols_norm[c2],cols_norm[c3])] = \
       - all[cols_norm[c1]] + all[cols_norm[c2]] + all[cols_norm[c3]]
    all["minus4__{}__{}__{}".format(cols_norm[c1],cols_norm[c2],cols_norm[c3])] = \
        all[cols_norm[c1]] - all[cols_norm[c2]] - all[cols_norm[c3]]
    all["minus5__{}__{}__{}".format(cols_norm[c1],cols_norm[c2],cols_norm[c3])] = \
       - all[cols_norm[c1]] + all[cols_norm[c2]] - all[cols_norm[c3]]
    all["minus6__{}__{}__{}".format(cols_norm[c1],cols_norm[c2],cols_norm[c3])] = \
       - all[cols_norm[c1]] - all[cols_norm[c2]] + all[cols_norm[c3]]
        
    all["div1__{}__{}__{}".format(c1,c2,c3)] = 1 * all[c1] * all[c2] / all[c3]
    all["div2__{}__{}__{}".format(c1,c2,c3)] = 1 * all[c1] / all[c2] * all[c3]
    all["div3__{}__{}__{}".format(c1,c2,c3)] = 1 / all[c1] * all[c2] * all[c3]
    all["div4__{}__{}__{}".format(c1,c2,c3)] = 1 * all[c1] / all[c2] / all[c3]
    all["div5__{}__{}__{}".format(c1,c2,c3)] = 1 / all[c1] * all[c2] / all[c3]
    all["div6__{}__{}__{}".format(c1,c2,c3)] = 1 / all[c1] / all[c2] * all[c3]
        
    
print(1)
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
print(2)
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


0
1
2
Out[41]:
936

In [ ]:


In [ ]:


In [4]:
#6 + features selection add 06
DATA_ID = 21


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])

all['x__age__gluc_all'] = all['age']*all['gluc_all']
all['x__ap_hi__cholesterol_all'] = all['ap_hi']*all['cholesterol_all']
all['div4__age__height__gluc_all'] = 1 * all['age'] / all['height'] / all['gluc_all']
all['smoke'].fillna(0, inplace=True)
all['alco'].fillna(0, inplace=True)
all['active'].fillna(1, inplace=True)

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [5]:
#6 + features selection add 06
DATA_ID = 22


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])

all['x__age__gluc_all'] = all['age']*all['gluc_all']
all['x__ap_hi__cholesterol_all'] = all['ap_hi']*all['cholesterol_all']
all['x__age__ap_hi__gluc_all'] = all['age'] * all['ap_hi'] * all['gluc_all']
all['smoke'].fillna(0, inplace=True)
all['alco'].fillna(0, inplace=True)
all['active'].fillna(1, inplace=True)

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [7]:
#6 + features selection add 06
DATA_ID = 23


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])

for c in ['ap_hi', 'ap_lo', 'cholesterol_all']:
    all[c+'_norm'] = (all[c] - all[c].mean()) / (all[c].max() - all[c].min())

all['x__age__gluc_all'] = all['age']*all['gluc_all']
all['x__ap_hi__cholesterol_all'] = all['ap_hi']*all['cholesterol_all']
all['minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm'] = -all['ap_hi_norm'] - all['ap_lo_norm'] + all['cholesterol_all_norm']
all['smoke'].fillna(0, inplace=True)
all['alco'].fillna(0, inplace=True)
all['active'].fillna(1, inplace=True)

for c in ['ap_hi', 'ap_lo', 'cholesterol_all']:
    del all[c+'_norm']

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:
INFO:root:032	0.5380299708136911	div6__height__gluc_all__imt
INFO:root:032	0.5380492890506681	div1__age__weight__cholesterol_all

In [ ]:


In [9]:
#6 + features selection add 06
DATA_ID = 24


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])

for c in ['ap_hi', 'ap_lo', 'cholesterol_all']:
    all[c+'_norm'] = (all[c] - all[c].mean()) / (all[c].max() - all[c].min())

all['x__age__gluc_all'] = all['age']*all['gluc_all']
all['x__ap_hi__cholesterol_all'] = all['ap_hi']*all['cholesterol_all']
all['div6__height__gluc_all__imt'] = 1 / all['height'] / all['gluc_all'] * all['imt']
all['smoke'].fillna(0, inplace=True)
all['alco'].fillna(0, inplace=True)
all['active'].fillna(1, inplace=True)

for c in ['ap_hi', 'ap_lo', 'cholesterol_all']:
    del all[c+'_norm']

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [11]:
#6 + features selection add 06
DATA_ID = 25


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])

for c in ['ap_hi', 'ap_lo', 'cholesterol_all']:
    all[c+'_norm'] = (all[c] - all[c].mean()) / (all[c].max() - all[c].min())

all['x__age__gluc_all'] = all['age']*all['gluc_all']
all['x__ap_hi__cholesterol_all'] = all['ap_hi']*all['cholesterol_all']
all['div1__age__weight__cholesterol_all'] = 1 * all['age'] * all['weight'] / all['cholesterol_all']
all['smoke'].fillna(0, inplace=True)
all['alco'].fillna(0, inplace=True)
all['active'].fillna(1, inplace=True)

for c in ['ap_hi', 'ap_lo', 'cholesterol_all']:
    del all[c+'_norm']

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [14]:
#6 + features selection add 07
DATA_ID = 26


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])

cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol_all', 'gluc_all', 'imt']

for c in cols:
    all[c+'_norm'] = (all[c] - all[c].mean()) / (all[c].max() - all[c].min())

all['x__age__gluc_all'] = all['age']*all['gluc_all']
all['x__ap_hi__cholesterol_all'] = all['ap_hi']*all['cholesterol_all']
all['div6__height__gluc_all__imt'] = 1 / all['height'] / all['gluc_all'] * all['imt']
all['plus__age_norm__ap_hi_norm__gluc_all_norm'] = all['age_norm'] + all['ap_hi_norm'] + all['gluc_all_norm']


all['smoke'].fillna(0, inplace=True)
all['alco'].fillna(0, inplace=True)
all['active'].fillna(1, inplace=True)

for c in cols:
    del all[c+'_norm']

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [15]:
#6 + features selection add 08
DATA_ID = 27


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(6), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(6), index_col='id')
all = pd.concat([train, test])

cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol_all', 'gluc_all', 'imt']

for c in cols:
    all[c+'_norm'] = (all[c] - all[c].mean()) / (all[c].max() - all[c].min())

all['x__age__gluc_all'] = all['age']*all['gluc_all']
all['x__ap_hi__cholesterol_all'] = all['ap_hi']*all['cholesterol_all']
all['div6__height__gluc_all__imt'] = 1 / all['height'] / all['gluc_all'] * all['imt']
all['plus__age_norm__ap_hi_norm__gluc_all_norm'] = all['age_norm'] + all['ap_hi_norm'] + all['gluc_all_norm']
all['x__age__weight'] = all['age']*all['weight']

all['smoke'].fillna(0, inplace=True)
all['alco'].fillna(0, inplace=True)
all['active'].fillna(1, inplace=True)

for c in cols:
    del all[c+'_norm']

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [ ]:


In [60]:
#6 + features selection add 08 (batch)
DATA_ID = 28


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(20), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(20), index_col='id')
all = pd.concat([train, test])

cols =[
    'age',
    'height',
    'weight',
    'ap_hi',
    'ap_lo',
    'smoke',
    'alco',
    'active',
    'gender_male',
    'height_low',
    'weight_low',
    'cholesterol_all',
    'gluc_all',
    'cholesterol_1',
    'cholesterol_2',
    'cholesterol_3',
    'gluc_1',
    'gluc_2',
    'gluc_3',
    'ap_error',
    'ap_error_swap',
    'imt',
    'imt_class_all',
    'imt_class_0',
    'imt_class_1',
    'imt_class_2',
    'imt_class_3',
    'imt_class_4',
    'imt_class_5',
    'imt_class_6',
    'x__age__gluc_all',
    'x__ap_hi__cholesterol_all',
'div6__height__gluc_all__imt',
'plus__age_norm__ap_hi_norm__gluc_all_norm',
    'x__age__weight',
    'div1__age__weight__cholesterol_all',
'div6__age__weight__cholesterol_all',
'plus__height_norm__weight_norm__gluc_all_norm',
'div1__ap_hi__ap_lo__cholesterol_all',
'div6__ap_hi__ap_lo__cholesterol_all',
'plus__age_norm__gluc_all_norm__imt_norm',
'minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
'minus1__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
'minus6__age_norm__ap_lo_norm__cholesterol_all_norm',
'minus1__age_norm__ap_lo_norm__cholesterol_all_norm',
'div6__height__weight__ap_lo',
'div2__ap_lo__cholesterol_all__gluc_all',
'x__age__ap_hi__gluc_all',
'div5__ap_lo__cholesterol_all__gluc_all',
]

all = all[cols]



all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [14]:
#6 + score
DATA_ID = 29


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(28), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(28), index_col='id')
all = pd.concat([train, test])


all['score_scale_val_v1'] = 0

chol4 = all['cholesterol_all'] == 1
chol5 = all['cholesterol_all'] == 1
chol6 = all['cholesterol_all'] == 1
chol7 = all['cholesterol_all'] == 2
chol8 = all['cholesterol_all'] == 3

male = all['gender_male'] == 1
female = all['gender_male'] == 0

smoke = all['smoke'] == 1
not_smoke = all['smoke'] == 0

age40 = (all['age']/365.25 >=40) & (all['age']/365.25 < 45)
age45 = (all['age']/365.25 >=45) & (all['age']/365.25 < 55)
age55 = (all['age']/365.25 >=55) & (all['age']/365.25 < 60)
age60 = (all['age']/365.25 >=60) & (all['age']/365.25 < 65)
age65 = (all['age']/365.25 >=65)

ap120 = (all['ap_hi'] >= 120) & (all['ap_hi'] < 140)
ap140 = (all['ap_hi'] >= 140) & (all['ap_hi'] < 160)
ap160 = (all['ap_hi'] >= 160) & (all['ap_hi'] < 170)
ap180 = (all['ap_hi'] >= 180) 


data = [
    [female, not_smoke, age65, ap180, 7, 8, 9, 10, 12],
    [female, not_smoke, age65, ap160, 5, 5, 6, 7, 8],
    [female, not_smoke, age65, ap140, 3, 3, 4, 5, 6],
    [female, not_smoke, age65, ap120, 2, 2, 3, 3, 4],
    
    [female, not_smoke, age60, ap180, 4, 4, 5, 6, 7],
    [female, not_smoke, age60, ap160, 3, 3, 3, 4, 5],
    [female, not_smoke, age60, ap140, 2, 2, 2, 3, 3],
    [female, not_smoke, age60, ap120, 1, 1, 2, 2, 2],
    
    [female, not_smoke, age55, ap180, 2, 2, 3, 3, 4],
    [female, not_smoke, age55, ap160, 1, 2, 2, 2, 3],
    [female, not_smoke, age55, ap140, 1, 1, 1, 1, 2],
    [female, not_smoke, age55, ap120, 1, 1, 1, 1, 1],
    
    [female, not_smoke, age45, ap180, 1, 1, 1, 2, 2],
    [female, not_smoke, age45, ap160, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap140, 0, 1, 1, 1, 1],
    [female, not_smoke, age45, ap120, 0, 0, 1, 1, 1],
    
#     [female, not_smoke, age40, ap180, , , , , ],
#     [female, not_smoke, age40, ap160, , , , , ],
#     [female, not_smoke, age40, ap140, , , , , ],
#     [female, not_smoke, age40, ap120, , , , , ],
#######################################################
    [female, smoke, age65, ap180, 13, 15, 17, 19, 22],
    [female, smoke, age65, ap160, 9, 10, 12, 13, 16],
    [female, smoke, age65, ap140, 6, 7, 8, 9, 11],
    [female, smoke, age65, ap120, 4, 5, 5, 6, 7],
    
    [female, smoke, age60, ap180, 8, 9, 10, 11, 13],
    [female, smoke, age60, ap160, 5, 6, 7, 8, 9],
    [female, smoke, age60, ap140, 3, 4, 5, 5, 6],
    [female, smoke, age60, ap120, 2, 3, 3, 4, 4],
    
    [female, smoke, age55, ap180, 4, 5, 5, 6, 7],
    [female, smoke, age55, ap160, 3, 3, 4, 4, 5],
    [female, smoke, age55, ap140, 2, 2, 2, 3, 3],
    [female, smoke, age55, ap120, 1, 1, 2, 2, 2],
    
    [female, smoke, age45, ap180, 2, 2, 3, 3, 4],
    [female, smoke, age45, ap160, 1, 2, 2, 2, 3],
    [female, smoke, age45, ap140, 1, 1, 1, 1, 2],
    [female, smoke, age45, ap120, 1, 1, 1, 1, 1],
    
    [female, smoke, age40, ap180, 0, 0, 0, 1, 1],
#     [female, smoke, age40, ap160, , , , , ],
#     [female, smoke, age40, ap140, , , , , ],
#     [female, smoke, age40, ap120, , , , , ],
#######################################################
    [male, not_smoke, age65, ap180, 14, 16, 19, 22, 26],
    [male, not_smoke, age65, ap160, 9, 11, 13, 15, 16],
    [male, not_smoke, age65, ap140, 6, 8, 9, 11, 13],
    [male, not_smoke, age65, ap120, 4, 5, 6, 7, 9],
    
    [male, not_smoke, age60, ap180, 9, 11, 13, 15, 18],
    [male, not_smoke, age60, ap160, 6, 7, 9, 10, 12],
    [male, not_smoke, age60, ap140, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap120, 3, 3, 4, 5, 6],
    
    [male, not_smoke, age55, ap180, 6, 7, 8, 10, 12],
    [male, not_smoke, age55, ap160, 4, 5, 6, 7, 8],
    [male, not_smoke, age55, ap140, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap120, 2, 2, 3, 3, 4],
    
    [male, not_smoke, age45, ap180, 4, 4, 5, 6, 7],
    [male, not_smoke, age45, ap160, 2, 3, 3, 4, 5],
    [male, not_smoke, age45, ap140, 2, 2, 2, 3, 3],
    [male, not_smoke, age45, ap120, 1, 1, 2, 2, 2],
    
    [male, not_smoke, age40, ap180, 1, 1, 1, 2, 2],
    [male, not_smoke, age40, ap160, 1, 1, 1, 1, 1],
    [male, not_smoke, age40, ap140, 0, 1, 1, 1, 1],
    [male, not_smoke, age40, ap120, 0, 0, 1, 1, 1],
#######################################################
    [male, smoke, age65, ap180, 26, 30, 35, 41, 47],
    [male, smoke, age65, ap160, 18, 21, 25, 29, 34],
    [male, smoke, age65, ap140, 13, 15, 17, 20, 24],
    [male, smoke, age65, ap120, 9, 10, 12, 14, 17],
    
    [male, smoke, age60, ap180, 18, 21, 24, 28, 33],
    [male, smoke, age60, ap160, 12, 14, 17, 20, 24],
    [male, smoke, age60, ap140, 8, 10, 12, 14, 17],
    [male, smoke, age60, ap120, 6, 7, 8, 10, 12],
    
    [male, smoke, age55, ap180, 12, 13, 16, 19, 22],
    [male, smoke, age55, ap160, 8, 9, 11, 13, 16],
    [male, smoke, age55, ap140, 5, 6, 8, 9, 11],
    [male, smoke, age55, ap120, 4, 4, 5, 6, 8],
    
    [male, smoke, age45, ap180, 7, 8, 10, 12, 14],
    [male, smoke, age45, ap160, 5, 6, 7, 8, 10],
    [male, smoke, age45, ap140, 3, 4, 5, 6, 7],
    [male, smoke, age45, ap120, 2, 3, 3, 4, 5],
    
    [male, smoke, age40, ap180, 2, 2, 3, 3, 4],
    [male, smoke, age40, ap160, 1, 2, 2, 2, 3],
    [male, smoke, age40, ap140, 1, 1, 1, 2, 2],
    [male, smoke, age40, ap120, 1, 1, 1, 1, 1],
]


for gender, sm, age, aphi, c4, c5, c6, c7, c8 in data:
    all.ix[gender & sm & aphi & chol4 , 'score_scale_val'] = c4
    all.ix[gender & sm & aphi & chol5 , 'score_scale_val'] = c5
    all.ix[gender & sm & aphi & chol6 , 'score_scale_val'] = c6
    all.ix[gender & sm & aphi & chol7 , 'score_scale_val'] = c7
    all.ix[gender & sm & aphi & chol8 , 'score_scale_val'] = c8



all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [15]:
#6 + score
DATA_ID = 30


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(28), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(28), index_col='id')
all = pd.concat([train, test])


all['score_scale_val_v2'] = 0

chol4 = all['cholesterol_all'] == 1
chol5 = all['cholesterol_all'] == 1
chol6 = all['cholesterol_all'] == 2
chol7 = all['cholesterol_all'] == 2
chol8 = all['cholesterol_all'] == 3

male = all['gender_male'] == 1
female = all['gender_male'] == 0

smoke = all['smoke'] == 1
not_smoke = all['smoke'] == 0

age40 = (all['age']/365.25 >=40) & (all['age']/365.25 < 45)
age45 = (all['age']/365.25 >=45) & (all['age']/365.25 < 55)
age55 = (all['age']/365.25 >=55) & (all['age']/365.25 < 60)
age60 = (all['age']/365.25 >=60) & (all['age']/365.25 < 65)
age65 = (all['age']/365.25 >=65)

ap120 = (all['ap_hi'] >= 120) & (all['ap_hi'] < 140)
ap140 = (all['ap_hi'] >= 140) & (all['ap_hi'] < 160)
ap160 = (all['ap_hi'] >= 160) & (all['ap_hi'] < 170)
ap180 = (all['ap_hi'] >= 180) 


data = [
    [female, not_smoke, age65, ap180, 7, 8, 9, 10, 12],
    [female, not_smoke, age65, ap160, 5, 5, 6, 7, 8],
    [female, not_smoke, age65, ap140, 3, 3, 4, 5, 6],
    [female, not_smoke, age65, ap120, 2, 2, 3, 3, 4],
    
    [female, not_smoke, age60, ap180, 4, 4, 5, 6, 7],
    [female, not_smoke, age60, ap160, 3, 3, 3, 4, 5],
    [female, not_smoke, age60, ap140, 2, 2, 2, 3, 3],
    [female, not_smoke, age60, ap120, 1, 1, 2, 2, 2],
    
    [female, not_smoke, age55, ap180, 2, 2, 3, 3, 4],
    [female, not_smoke, age55, ap160, 1, 2, 2, 2, 3],
    [female, not_smoke, age55, ap140, 1, 1, 1, 1, 2],
    [female, not_smoke, age55, ap120, 1, 1, 1, 1, 1],
    
    [female, not_smoke, age45, ap180, 1, 1, 1, 2, 2],
    [female, not_smoke, age45, ap160, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap140, 0, 1, 1, 1, 1],
    [female, not_smoke, age45, ap120, 0, 0, 1, 1, 1],
    
#     [female, not_smoke, age40, ap180, , , , , ],
#     [female, not_smoke, age40, ap160, , , , , ],
#     [female, not_smoke, age40, ap140, , , , , ],
#     [female, not_smoke, age40, ap120, , , , , ],
#######################################################
    [female, smoke, age65, ap180, 13, 15, 17, 19, 22],
    [female, smoke, age65, ap160, 9, 10, 12, 13, 16],
    [female, smoke, age65, ap140, 6, 7, 8, 9, 11],
    [female, smoke, age65, ap120, 4, 5, 5, 6, 7],
    
    [female, smoke, age60, ap180, 8, 9, 10, 11, 13],
    [female, smoke, age60, ap160, 5, 6, 7, 8, 9],
    [female, smoke, age60, ap140, 3, 4, 5, 5, 6],
    [female, smoke, age60, ap120, 2, 3, 3, 4, 4],
    
    [female, smoke, age55, ap180, 4, 5, 5, 6, 7],
    [female, smoke, age55, ap160, 3, 3, 4, 4, 5],
    [female, smoke, age55, ap140, 2, 2, 2, 3, 3],
    [female, smoke, age55, ap120, 1, 1, 2, 2, 2],
    
    [female, smoke, age45, ap180, 2, 2, 3, 3, 4],
    [female, smoke, age45, ap160, 1, 2, 2, 2, 3],
    [female, smoke, age45, ap140, 1, 1, 1, 1, 2],
    [female, smoke, age45, ap120, 1, 1, 1, 1, 1],
    
    [female, smoke, age40, ap180, 0, 0, 0, 1, 1],
#     [female, smoke, age40, ap160, , , , , ],
#     [female, smoke, age40, ap140, , , , , ],
#     [female, smoke, age40, ap120, , , , , ],
#######################################################
    [male, not_smoke, age65, ap180, 14, 16, 19, 22, 26],
    [male, not_smoke, age65, ap160, 9, 11, 13, 15, 16],
    [male, not_smoke, age65, ap140, 6, 8, 9, 11, 13],
    [male, not_smoke, age65, ap120, 4, 5, 6, 7, 9],
    
    [male, not_smoke, age60, ap180, 9, 11, 13, 15, 18],
    [male, not_smoke, age60, ap160, 6, 7, 9, 10, 12],
    [male, not_smoke, age60, ap140, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap120, 3, 3, 4, 5, 6],
    
    [male, not_smoke, age55, ap180, 6, 7, 8, 10, 12],
    [male, not_smoke, age55, ap160, 4, 5, 6, 7, 8],
    [male, not_smoke, age55, ap140, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap120, 2, 2, 3, 3, 4],
    
    [male, not_smoke, age45, ap180, 4, 4, 5, 6, 7],
    [male, not_smoke, age45, ap160, 2, 3, 3, 4, 5],
    [male, not_smoke, age45, ap140, 2, 2, 2, 3, 3],
    [male, not_smoke, age45, ap120, 1, 1, 2, 2, 2],
    
    [male, not_smoke, age40, ap180, 1, 1, 1, 2, 2],
    [male, not_smoke, age40, ap160, 1, 1, 1, 1, 1],
    [male, not_smoke, age40, ap140, 0, 1, 1, 1, 1],
    [male, not_smoke, age40, ap120, 0, 0, 1, 1, 1],
#######################################################
    [male, smoke, age65, ap180, 26, 30, 35, 41, 47],
    [male, smoke, age65, ap160, 18, 21, 25, 29, 34],
    [male, smoke, age65, ap140, 13, 15, 17, 20, 24],
    [male, smoke, age65, ap120, 9, 10, 12, 14, 17],
    
    [male, smoke, age60, ap180, 18, 21, 24, 28, 33],
    [male, smoke, age60, ap160, 12, 14, 17, 20, 24],
    [male, smoke, age60, ap140, 8, 10, 12, 14, 17],
    [male, smoke, age60, ap120, 6, 7, 8, 10, 12],
    
    [male, smoke, age55, ap180, 12, 13, 16, 19, 22],
    [male, smoke, age55, ap160, 8, 9, 11, 13, 16],
    [male, smoke, age55, ap140, 5, 6, 8, 9, 11],
    [male, smoke, age55, ap120, 4, 4, 5, 6, 8],
    
    [male, smoke, age45, ap180, 7, 8, 10, 12, 14],
    [male, smoke, age45, ap160, 5, 6, 7, 8, 10],
    [male, smoke, age45, ap140, 3, 4, 5, 6, 7],
    [male, smoke, age45, ap120, 2, 3, 3, 4, 5],
    
    [male, smoke, age40, ap180, 2, 2, 3, 3, 4],
    [male, smoke, age40, ap160, 1, 2, 2, 2, 3],
    [male, smoke, age40, ap140, 1, 1, 1, 2, 2],
    [male, smoke, age40, ap120, 1, 1, 1, 1, 1],
]


for gender, sm, age, aphi, c4, c5, c6, c7, c8 in data:
    all.ix[gender & sm & aphi & chol4 , 'score_scale_val'] = c4
    all.ix[gender & sm & aphi & chol5 , 'score_scale_val'] = c5
    all.ix[gender & sm & aphi & chol6 , 'score_scale_val'] = c6
    all.ix[gender & sm & aphi & chol7 , 'score_scale_val'] = c7
    all.ix[gender & sm & aphi & chol8 , 'score_scale_val'] = c8



all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [17]:
#6 + score
DATA_ID = 31


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(28), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(28), index_col='id')
all = pd.concat([train, test])


all['score_scale_val_v3'] = 0

chol4 = all['cholesterol_all'] == 1
chol5 = all['cholesterol_all'] == 1
chol6 = all['cholesterol_all'] == 2
chol7 = all['cholesterol_all'] == 3
chol8 = all['cholesterol_all'] == 3

male = all['gender_male'] == 1
female = all['gender_male'] == 0

smoke = all['smoke'] == 1
not_smoke = all['smoke'] == 0

age40 = (all['age']/365.25 >=40) & (all['age']/365.25 < 45)
age45 = (all['age']/365.25 >=45) & (all['age']/365.25 < 55)
age55 = (all['age']/365.25 >=55) & (all['age']/365.25 < 60)
age60 = (all['age']/365.25 >=60) & (all['age']/365.25 < 65)
age65 = (all['age']/365.25 >=65)

ap120 = (all['ap_hi'] >= 120) & (all['ap_hi'] < 140)
ap140 = (all['ap_hi'] >= 140) & (all['ap_hi'] < 160)
ap160 = (all['ap_hi'] >= 160) & (all['ap_hi'] < 170)
ap180 = (all['ap_hi'] >= 180) 


data = [
    [female, not_smoke, age65, ap180, 7, 8, 9, 10, 12],
    [female, not_smoke, age65, ap160, 5, 5, 6, 7, 8],
    [female, not_smoke, age65, ap140, 3, 3, 4, 5, 6],
    [female, not_smoke, age65, ap120, 2, 2, 3, 3, 4],
    
    [female, not_smoke, age60, ap180, 4, 4, 5, 6, 7],
    [female, not_smoke, age60, ap160, 3, 3, 3, 4, 5],
    [female, not_smoke, age60, ap140, 2, 2, 2, 3, 3],
    [female, not_smoke, age60, ap120, 1, 1, 2, 2, 2],
    
    [female, not_smoke, age55, ap180, 2, 2, 3, 3, 4],
    [female, not_smoke, age55, ap160, 1, 2, 2, 2, 3],
    [female, not_smoke, age55, ap140, 1, 1, 1, 1, 2],
    [female, not_smoke, age55, ap120, 1, 1, 1, 1, 1],
    
    [female, not_smoke, age45, ap180, 1, 1, 1, 2, 2],
    [female, not_smoke, age45, ap160, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap140, 0, 1, 1, 1, 1],
    [female, not_smoke, age45, ap120, 0, 0, 1, 1, 1],
    
#     [female, not_smoke, age40, ap180, , , , , ],
#     [female, not_smoke, age40, ap160, , , , , ],
#     [female, not_smoke, age40, ap140, , , , , ],
#     [female, not_smoke, age40, ap120, , , , , ],
#######################################################
    [female, smoke, age65, ap180, 13, 15, 17, 19, 22],
    [female, smoke, age65, ap160, 9, 10, 12, 13, 16],
    [female, smoke, age65, ap140, 6, 7, 8, 9, 11],
    [female, smoke, age65, ap120, 4, 5, 5, 6, 7],
    
    [female, smoke, age60, ap180, 8, 9, 10, 11, 13],
    [female, smoke, age60, ap160, 5, 6, 7, 8, 9],
    [female, smoke, age60, ap140, 3, 4, 5, 5, 6],
    [female, smoke, age60, ap120, 2, 3, 3, 4, 4],
    
    [female, smoke, age55, ap180, 4, 5, 5, 6, 7],
    [female, smoke, age55, ap160, 3, 3, 4, 4, 5],
    [female, smoke, age55, ap140, 2, 2, 2, 3, 3],
    [female, smoke, age55, ap120, 1, 1, 2, 2, 2],
    
    [female, smoke, age45, ap180, 2, 2, 3, 3, 4],
    [female, smoke, age45, ap160, 1, 2, 2, 2, 3],
    [female, smoke, age45, ap140, 1, 1, 1, 1, 2],
    [female, smoke, age45, ap120, 1, 1, 1, 1, 1],
    
    [female, smoke, age40, ap180, 0, 0, 0, 1, 1],
#     [female, smoke, age40, ap160, , , , , ],
#     [female, smoke, age40, ap140, , , , , ],
#     [female, smoke, age40, ap120, , , , , ],
#######################################################
    [male, not_smoke, age65, ap180, 14, 16, 19, 22, 26],
    [male, not_smoke, age65, ap160, 9, 11, 13, 15, 16],
    [male, not_smoke, age65, ap140, 6, 8, 9, 11, 13],
    [male, not_smoke, age65, ap120, 4, 5, 6, 7, 9],
    
    [male, not_smoke, age60, ap180, 9, 11, 13, 15, 18],
    [male, not_smoke, age60, ap160, 6, 7, 9, 10, 12],
    [male, not_smoke, age60, ap140, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap120, 3, 3, 4, 5, 6],
    
    [male, not_smoke, age55, ap180, 6, 7, 8, 10, 12],
    [male, not_smoke, age55, ap160, 4, 5, 6, 7, 8],
    [male, not_smoke, age55, ap140, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap120, 2, 2, 3, 3, 4],
    
    [male, not_smoke, age45, ap180, 4, 4, 5, 6, 7],
    [male, not_smoke, age45, ap160, 2, 3, 3, 4, 5],
    [male, not_smoke, age45, ap140, 2, 2, 2, 3, 3],
    [male, not_smoke, age45, ap120, 1, 1, 2, 2, 2],
    
    [male, not_smoke, age40, ap180, 1, 1, 1, 2, 2],
    [male, not_smoke, age40, ap160, 1, 1, 1, 1, 1],
    [male, not_smoke, age40, ap140, 0, 1, 1, 1, 1],
    [male, not_smoke, age40, ap120, 0, 0, 1, 1, 1],
#######################################################
    [male, smoke, age65, ap180, 26, 30, 35, 41, 47],
    [male, smoke, age65, ap160, 18, 21, 25, 29, 34],
    [male, smoke, age65, ap140, 13, 15, 17, 20, 24],
    [male, smoke, age65, ap120, 9, 10, 12, 14, 17],
    
    [male, smoke, age60, ap180, 18, 21, 24, 28, 33],
    [male, smoke, age60, ap160, 12, 14, 17, 20, 24],
    [male, smoke, age60, ap140, 8, 10, 12, 14, 17],
    [male, smoke, age60, ap120, 6, 7, 8, 10, 12],
    
    [male, smoke, age55, ap180, 12, 13, 16, 19, 22],
    [male, smoke, age55, ap160, 8, 9, 11, 13, 16],
    [male, smoke, age55, ap140, 5, 6, 8, 9, 11],
    [male, smoke, age55, ap120, 4, 4, 5, 6, 8],
    
    [male, smoke, age45, ap180, 7, 8, 10, 12, 14],
    [male, smoke, age45, ap160, 5, 6, 7, 8, 10],
    [male, smoke, age45, ap140, 3, 4, 5, 6, 7],
    [male, smoke, age45, ap120, 2, 3, 3, 4, 5],
    
    [male, smoke, age40, ap180, 2, 2, 3, 3, 4],
    [male, smoke, age40, ap160, 1, 2, 2, 2, 3],
    [male, smoke, age40, ap140, 1, 1, 1, 2, 2],
    [male, smoke, age40, ap120, 1, 1, 1, 1, 1],
]


for gender, sm, age, aphi, c4, c5, c6, c7, c8 in data:
    all.ix[gender & sm & aphi & chol4 , 'score_scale_val'] = c4
    all.ix[gender & sm & aphi & chol5 , 'score_scale_val'] = c5
    all.ix[gender & sm & aphi & chol6 , 'score_scale_val'] = c6
    all.ix[gender & sm & aphi & chol7 , 'score_scale_val'] = c7
    all.ix[gender & sm & aphi & chol8 , 'score_scale_val'] = c8



all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [41]:
#6 + score manual
DATA_ID = 32


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(28), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(28), index_col='id')
all = pd.concat([train, test])


all['score_scale_val_manual'] = all['cholesterol_all'] * (all['gender_male']+1) * (all['smoke']+1) * all['age'] * all['ap_hi']



all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [44]:
#6 + score
DATA_ID = 35


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(28), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(28), index_col='id')
all = pd.concat([train, test])


all['score_scale_val_v3'] = 0

chol4 = all['cholesterol_all'] == 1
chol5 = all['cholesterol_all'] == 1
chol6 = all['cholesterol_all'] == 1
chol7 = all['cholesterol_all'] == 1
chol8 = (all['cholesterol_all'] == 2) | (all['cholesterol_all'] == 3)

male = all['gender_male'] == 1
female = all['gender_male'] == 0

smoke = all['smoke'] == 1
not_smoke = all['smoke'] == 0

age40 = (all['age']/365.25 >=40) & (all['age']/365.25 < 45)
age45 = (all['age']/365.25 >=45) & (all['age']/365.25 < 55)
age55 = (all['age']/365.25 >=55) & (all['age']/365.25 < 60)
age60 = (all['age']/365.25 >=60) & (all['age']/365.25 < 65)
age65 = (all['age']/365.25 >=65)

ap120 = (all['ap_hi'] >= 120) & (all['ap_hi'] < 140)
ap140 = (all['ap_hi'] >= 140) & (all['ap_hi'] < 160)
ap160 = (all['ap_hi'] >= 160) & (all['ap_hi'] < 170)
ap180 = (all['ap_hi'] >= 180) 


data = [
    [female, not_smoke, age65, ap180, 7, 8, 9, 10, 12],
    [female, not_smoke, age65, ap160, 5, 5, 6, 7, 8],
    [female, not_smoke, age65, ap140, 3, 3, 4, 5, 6],
    [female, not_smoke, age65, ap120, 2, 2, 3, 3, 4],
    
    [female, not_smoke, age60, ap180, 4, 4, 5, 6, 7],
    [female, not_smoke, age60, ap160, 3, 3, 3, 4, 5],
    [female, not_smoke, age60, ap140, 2, 2, 2, 3, 3],
    [female, not_smoke, age60, ap120, 1, 1, 2, 2, 2],
    
    [female, not_smoke, age55, ap180, 2, 2, 3, 3, 4],
    [female, not_smoke, age55, ap160, 1, 2, 2, 2, 3],
    [female, not_smoke, age55, ap140, 1, 1, 1, 1, 2],
    [female, not_smoke, age55, ap120, 1, 1, 1, 1, 1],
    
    [female, not_smoke, age45, ap180, 1, 1, 1, 2, 2],
    [female, not_smoke, age45, ap160, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap140, 0, 1, 1, 1, 1],
    [female, not_smoke, age45, ap120, 0, 0, 1, 1, 1],
    
#     [female, not_smoke, age40, ap180, , , , , ],
#     [female, not_smoke, age40, ap160, , , , , ],
#     [female, not_smoke, age40, ap140, , , , , ],
#     [female, not_smoke, age40, ap120, , , , , ],
#######################################################
    [female, smoke, age65, ap180, 13, 15, 17, 19, 22],
    [female, smoke, age65, ap160, 9, 10, 12, 13, 16],
    [female, smoke, age65, ap140, 6, 7, 8, 9, 11],
    [female, smoke, age65, ap120, 4, 5, 5, 6, 7],
    
    [female, smoke, age60, ap180, 8, 9, 10, 11, 13],
    [female, smoke, age60, ap160, 5, 6, 7, 8, 9],
    [female, smoke, age60, ap140, 3, 4, 5, 5, 6],
    [female, smoke, age60, ap120, 2, 3, 3, 4, 4],
    
    [female, smoke, age55, ap180, 4, 5, 5, 6, 7],
    [female, smoke, age55, ap160, 3, 3, 4, 4, 5],
    [female, smoke, age55, ap140, 2, 2, 2, 3, 3],
    [female, smoke, age55, ap120, 1, 1, 2, 2, 2],
    
    [female, smoke, age45, ap180, 2, 2, 3, 3, 4],
    [female, smoke, age45, ap160, 1, 2, 2, 2, 3],
    [female, smoke, age45, ap140, 1, 1, 1, 1, 2],
    [female, smoke, age45, ap120, 1, 1, 1, 1, 1],
    
    [female, smoke, age40, ap180, 0, 0, 0, 1, 1],
#     [female, smoke, age40, ap160, , , , , ],
#     [female, smoke, age40, ap140, , , , , ],
#     [female, smoke, age40, ap120, , , , , ],
#######################################################
    [male, not_smoke, age65, ap180, 14, 16, 19, 22, 26],
    [male, not_smoke, age65, ap160, 9, 11, 13, 15, 16],
    [male, not_smoke, age65, ap140, 6, 8, 9, 11, 13],
    [male, not_smoke, age65, ap120, 4, 5, 6, 7, 9],
    
    [male, not_smoke, age60, ap180, 9, 11, 13, 15, 18],
    [male, not_smoke, age60, ap160, 6, 7, 9, 10, 12],
    [male, not_smoke, age60, ap140, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap120, 3, 3, 4, 5, 6],
    
    [male, not_smoke, age55, ap180, 6, 7, 8, 10, 12],
    [male, not_smoke, age55, ap160, 4, 5, 6, 7, 8],
    [male, not_smoke, age55, ap140, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap120, 2, 2, 3, 3, 4],
    
    [male, not_smoke, age45, ap180, 4, 4, 5, 6, 7],
    [male, not_smoke, age45, ap160, 2, 3, 3, 4, 5],
    [male, not_smoke, age45, ap140, 2, 2, 2, 3, 3],
    [male, not_smoke, age45, ap120, 1, 1, 2, 2, 2],
    
    [male, not_smoke, age40, ap180, 1, 1, 1, 2, 2],
    [male, not_smoke, age40, ap160, 1, 1, 1, 1, 1],
    [male, not_smoke, age40, ap140, 0, 1, 1, 1, 1],
    [male, not_smoke, age40, ap120, 0, 0, 1, 1, 1],
#######################################################
    [male, smoke, age65, ap180, 26, 30, 35, 41, 47],
    [male, smoke, age65, ap160, 18, 21, 25, 29, 34],
    [male, smoke, age65, ap140, 13, 15, 17, 20, 24],
    [male, smoke, age65, ap120, 9, 10, 12, 14, 17],
    
    [male, smoke, age60, ap180, 18, 21, 24, 28, 33],
    [male, smoke, age60, ap160, 12, 14, 17, 20, 24],
    [male, smoke, age60, ap140, 8, 10, 12, 14, 17],
    [male, smoke, age60, ap120, 6, 7, 8, 10, 12],
    
    [male, smoke, age55, ap180, 12, 13, 16, 19, 22],
    [male, smoke, age55, ap160, 8, 9, 11, 13, 16],
    [male, smoke, age55, ap140, 5, 6, 8, 9, 11],
    [male, smoke, age55, ap120, 4, 4, 5, 6, 8],
    
    [male, smoke, age45, ap180, 7, 8, 10, 12, 14],
    [male, smoke, age45, ap160, 5, 6, 7, 8, 10],
    [male, smoke, age45, ap140, 3, 4, 5, 6, 7],
    [male, smoke, age45, ap120, 2, 3, 3, 4, 5],
    
    [male, smoke, age40, ap180, 2, 2, 3, 3, 4],
    [male, smoke, age40, ap160, 1, 2, 2, 2, 3],
    [male, smoke, age40, ap140, 1, 1, 1, 2, 2],
    [male, smoke, age40, ap120, 1, 1, 1, 1, 1],
]


for gender, sm, age, aphi, c4, c5, c6, c7, c8 in data:
    all.ix[gender & sm & aphi & chol4 , 'score_scale_val'] = c4
    all.ix[gender & sm & aphi & chol5 , 'score_scale_val'] = c5
    all.ix[gender & sm & aphi & chol6 , 'score_scale_val'] = c6
    all.ix[gender & sm & aphi & chol7 , 'score_scale_val'] = c7
    all.ix[gender & sm & aphi & chol8 , 'score_scale_val'] = c8



all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [42]:
#28 + age by 5 year
DATA_ID = 33


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(28), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(28), index_col='id')
all = pd.concat([train, test])


all['age_5year'] = all['age']  // (365.25 * 5)



all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [43]:
#28 + time to birthday
DATA_ID = 34


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(28), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(28), index_col='id')
all = pd.concat([train, test])


all['time_to_birthday'] = all['age']  % (365.25 )



all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [ ]:


In [ ]:


In [61]:
#6 + score
DATA_ID = 36


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(28), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(28), index_col='id')
all = pd.concat([train, test])


all['score_scale_val_v3'] = 0

chol4 = all['cholesterol_all'] == 1
chol5 = all['cholesterol_all'] == 1
chol6 = all['cholesterol_all'] == 1
chol7 = all['cholesterol_all'] == 2
chol8 = (all['cholesterol_all'] == 3) 

male = all['gender_male'] == 1
female = all['gender_male'] == 0

smoke = all['smoke'] == 1
not_smoke = all['smoke'] == 0

age40 = (all['age']/365.25 >=40) & (all['age']/365.25 < 45)
age45 = (all['age']/365.25 >=45) & (all['age']/365.25 < 55)
age55 = (all['age']/365.25 >=55) & (all['age']/365.25 < 60)
age60 = (all['age']/365.25 >=60) & (all['age']/365.25 < 65)
age65 = (all['age']/365.25 >=65)

ap120 = (all['ap_hi'] >= 120) & (all['ap_hi'] < 140)
ap140 = (all['ap_hi'] >= 140) & (all['ap_hi'] < 160)
ap160 = (all['ap_hi'] >= 160) & (all['ap_hi'] < 170)
ap180 = (all['ap_hi'] >= 180) 


data = [
    [female, not_smoke, age65, ap180, 7, 8, 9, 10, 12],
    [female, not_smoke, age65, ap160, 5, 5, 6, 7, 8],
    [female, not_smoke, age65, ap140, 3, 3, 4, 5, 6],
    [female, not_smoke, age65, ap120, 2, 2, 3, 3, 4],
    
    [female, not_smoke, age60, ap180, 4, 4, 5, 6, 7],
    [female, not_smoke, age60, ap160, 3, 3, 3, 4, 5],
    [female, not_smoke, age60, ap140, 2, 2, 2, 3, 3],
    [female, not_smoke, age60, ap120, 1, 1, 2, 2, 2],
    
    [female, not_smoke, age55, ap180, 2, 2, 3, 3, 4],
    [female, not_smoke, age55, ap160, 1, 2, 2, 2, 3],
    [female, not_smoke, age55, ap140, 1, 1, 1, 1, 2],
    [female, not_smoke, age55, ap120, 1, 1, 1, 1, 1],
    
    [female, not_smoke, age45, ap180, 1, 1, 1, 2, 2],
    [female, not_smoke, age45, ap160, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap140, 0, 1, 1, 1, 1],
    [female, not_smoke, age45, ap120, 0, 0, 1, 1, 1],
    
#     [female, not_smoke, age40, ap180, , , , , ],
#     [female, not_smoke, age40, ap160, , , , , ],
#     [female, not_smoke, age40, ap140, , , , , ],
#     [female, not_smoke, age40, ap120, , , , , ],
#######################################################
    [female, smoke, age65, ap180, 13, 15, 17, 19, 22],
    [female, smoke, age65, ap160, 9, 10, 12, 13, 16],
    [female, smoke, age65, ap140, 6, 7, 8, 9, 11],
    [female, smoke, age65, ap120, 4, 5, 5, 6, 7],
    
    [female, smoke, age60, ap180, 8, 9, 10, 11, 13],
    [female, smoke, age60, ap160, 5, 6, 7, 8, 9],
    [female, smoke, age60, ap140, 3, 4, 5, 5, 6],
    [female, smoke, age60, ap120, 2, 3, 3, 4, 4],
    
    [female, smoke, age55, ap180, 4, 5, 5, 6, 7],
    [female, smoke, age55, ap160, 3, 3, 4, 4, 5],
    [female, smoke, age55, ap140, 2, 2, 2, 3, 3],
    [female, smoke, age55, ap120, 1, 1, 2, 2, 2],
    
    [female, smoke, age45, ap180, 2, 2, 3, 3, 4],
    [female, smoke, age45, ap160, 1, 2, 2, 2, 3],
    [female, smoke, age45, ap140, 1, 1, 1, 1, 2],
    [female, smoke, age45, ap120, 1, 1, 1, 1, 1],
    
    [female, smoke, age40, ap180, 0, 0, 0, 1, 1],
#     [female, smoke, age40, ap160, , , , , ],
#     [female, smoke, age40, ap140, , , , , ],
#     [female, smoke, age40, ap120, , , , , ],
#######################################################
    [male, not_smoke, age65, ap180, 14, 16, 19, 22, 26],
    [male, not_smoke, age65, ap160, 9, 11, 13, 15, 16],
    [male, not_smoke, age65, ap140, 6, 8, 9, 11, 13],
    [male, not_smoke, age65, ap120, 4, 5, 6, 7, 9],
    
    [male, not_smoke, age60, ap180, 9, 11, 13, 15, 18],
    [male, not_smoke, age60, ap160, 6, 7, 9, 10, 12],
    [male, not_smoke, age60, ap140, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap120, 3, 3, 4, 5, 6],
    
    [male, not_smoke, age55, ap180, 6, 7, 8, 10, 12],
    [male, not_smoke, age55, ap160, 4, 5, 6, 7, 8],
    [male, not_smoke, age55, ap140, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap120, 2, 2, 3, 3, 4],
    
    [male, not_smoke, age45, ap180, 4, 4, 5, 6, 7],
    [male, not_smoke, age45, ap160, 2, 3, 3, 4, 5],
    [male, not_smoke, age45, ap140, 2, 2, 2, 3, 3],
    [male, not_smoke, age45, ap120, 1, 1, 2, 2, 2],
    
    [male, not_smoke, age40, ap180, 1, 1, 1, 2, 2],
    [male, not_smoke, age40, ap160, 1, 1, 1, 1, 1],
    [male, not_smoke, age40, ap140, 0, 1, 1, 1, 1],
    [male, not_smoke, age40, ap120, 0, 0, 1, 1, 1],
#######################################################
    [male, smoke, age65, ap180, 26, 30, 35, 41, 47],
    [male, smoke, age65, ap160, 18, 21, 25, 29, 34],
    [male, smoke, age65, ap140, 13, 15, 17, 20, 24],
    [male, smoke, age65, ap120, 9, 10, 12, 14, 17],
    
    [male, smoke, age60, ap180, 18, 21, 24, 28, 33],
    [male, smoke, age60, ap160, 12, 14, 17, 20, 24],
    [male, smoke, age60, ap140, 8, 10, 12, 14, 17],
    [male, smoke, age60, ap120, 6, 7, 8, 10, 12],
    
    [male, smoke, age55, ap180, 12, 13, 16, 19, 22],
    [male, smoke, age55, ap160, 8, 9, 11, 13, 16],
    [male, smoke, age55, ap140, 5, 6, 8, 9, 11],
    [male, smoke, age55, ap120, 4, 4, 5, 6, 8],
    
    [male, smoke, age45, ap180, 7, 8, 10, 12, 14],
    [male, smoke, age45, ap160, 5, 6, 7, 8, 10],
    [male, smoke, age45, ap140, 3, 4, 5, 6, 7],
    [male, smoke, age45, ap120, 2, 3, 3, 4, 5],
    
    [male, smoke, age40, ap180, 2, 2, 3, 3, 4],
    [male, smoke, age40, ap160, 1, 2, 2, 2, 3],
    [male, smoke, age40, ap140, 1, 1, 1, 2, 2],
    [male, smoke, age40, ap120, 1, 1, 1, 1, 1],
]


for gender, sm, age, aphi, c4, c5, c6, c7, c8 in data:
    all.ix[gender & sm & aphi & chol8 , 'score_scale_val'] = c8
    all.ix[gender & sm & aphi & chol7 , 'score_scale_val'] = c7
    all.ix[gender & sm & aphi & chol6 , 'score_scale_val'] = c6
    all.ix[gender & sm & aphi & chol5 , 'score_scale_val'] = c5
    all.ix[gender & sm & aphi & chol4 , 'score_scale_val'] = c4



all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [46]:
#6 + score
DATA_ID = 37


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(28), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(28), index_col='id')
all = pd.concat([train, test])


all['score_scale_val_v3'] = 0

chol4 = all['cholesterol_all'] == 1
chol5 = all['cholesterol_all'] == 1
chol6 = all['cholesterol_all'] == 1
chol7 = all['cholesterol_all'] == 1
chol8 = (all['cholesterol_all'] == 2) | (all['cholesterol_all'] == 3)

male = all['gender_male'] == 1
female = all['gender_male'] == 0

smoke = all['smoke'] == 1
not_smoke = all['smoke'] == 0

age40 = (all['age']/365.25 >=40) & (all['age']/365.25 < 45)
age45 = (all['age']/365.25 >=45) & (all['age']/365.25 < 55)
age55 = (all['age']/365.25 >=55) & (all['age']/365.25 < 60)
age60 = (all['age']/365.25 >=60) & (all['age']/365.25 < 65)
age65 = (all['age']/365.25 >=65)

ap120 = (all['ap_hi'] >= 120) & (all['ap_hi'] < 140)
ap140 = (all['ap_hi'] >= 140) & (all['ap_hi'] < 160)
ap160 = (all['ap_hi'] >= 160) & (all['ap_hi'] < 170)
ap180 = (all['ap_hi'] >= 180) 


data = [
    [female, not_smoke, age65, ap180, 7, 8, 9, 10, 12],
    [female, not_smoke, age65, ap160, 5, 5, 6, 7, 8],
    [female, not_smoke, age65, ap140, 3, 3, 4, 5, 6],
    [female, not_smoke, age65, ap120, 2, 2, 3, 3, 4],
    
    [female, not_smoke, age60, ap180, 4, 4, 5, 6, 7],
    [female, not_smoke, age60, ap160, 3, 3, 3, 4, 5],
    [female, not_smoke, age60, ap140, 2, 2, 2, 3, 3],
    [female, not_smoke, age60, ap120, 1, 1, 2, 2, 2],
    
    [female, not_smoke, age55, ap180, 2, 2, 3, 3, 4],
    [female, not_smoke, age55, ap160, 1, 2, 2, 2, 3],
    [female, not_smoke, age55, ap140, 1, 1, 1, 1, 2],
    [female, not_smoke, age55, ap120, 1, 1, 1, 1, 1],
    
    [female, not_smoke, age45, ap180, 1, 1, 1, 2, 2],
    [female, not_smoke, age45, ap160, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap140, 0, 1, 1, 1, 1],
    [female, not_smoke, age45, ap120, 0, 0, 1, 1, 1],
    
#     [female, not_smoke, age40, ap180, , , , , ],
#     [female, not_smoke, age40, ap160, , , , , ],
#     [female, not_smoke, age40, ap140, , , , , ],
#     [female, not_smoke, age40, ap120, , , , , ],
#######################################################
    [female, smoke, age65, ap180, 13, 15, 17, 19, 22],
    [female, smoke, age65, ap160, 9, 10, 12, 13, 16],
    [female, smoke, age65, ap140, 6, 7, 8, 9, 11],
    [female, smoke, age65, ap120, 4, 5, 5, 6, 7],
    
    [female, smoke, age60, ap180, 8, 9, 10, 11, 13],
    [female, smoke, age60, ap160, 5, 6, 7, 8, 9],
    [female, smoke, age60, ap140, 3, 4, 5, 5, 6],
    [female, smoke, age60, ap120, 2, 3, 3, 4, 4],
    
    [female, smoke, age55, ap180, 4, 5, 5, 6, 7],
    [female, smoke, age55, ap160, 3, 3, 4, 4, 5],
    [female, smoke, age55, ap140, 2, 2, 2, 3, 3],
    [female, smoke, age55, ap120, 1, 1, 2, 2, 2],
    
    [female, smoke, age45, ap180, 2, 2, 3, 3, 4],
    [female, smoke, age45, ap160, 1, 2, 2, 2, 3],
    [female, smoke, age45, ap140, 1, 1, 1, 1, 2],
    [female, smoke, age45, ap120, 1, 1, 1, 1, 1],
    
    [female, smoke, age40, ap180, 0, 0, 0, 1, 1],
#     [female, smoke, age40, ap160, , , , , ],
#     [female, smoke, age40, ap140, , , , , ],
#     [female, smoke, age40, ap120, , , , , ],
#######################################################
    [male, not_smoke, age65, ap180, 14, 16, 19, 22, 26],
    [male, not_smoke, age65, ap160, 9, 11, 13, 15, 16],
    [male, not_smoke, age65, ap140, 6, 8, 9, 11, 13],
    [male, not_smoke, age65, ap120, 4, 5, 6, 7, 9],
    
    [male, not_smoke, age60, ap180, 9, 11, 13, 15, 18],
    [male, not_smoke, age60, ap160, 6, 7, 9, 10, 12],
    [male, not_smoke, age60, ap140, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap120, 3, 3, 4, 5, 6],
    
    [male, not_smoke, age55, ap180, 6, 7, 8, 10, 12],
    [male, not_smoke, age55, ap160, 4, 5, 6, 7, 8],
    [male, not_smoke, age55, ap140, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap120, 2, 2, 3, 3, 4],
    
    [male, not_smoke, age45, ap180, 4, 4, 5, 6, 7],
    [male, not_smoke, age45, ap160, 2, 3, 3, 4, 5],
    [male, not_smoke, age45, ap140, 2, 2, 2, 3, 3],
    [male, not_smoke, age45, ap120, 1, 1, 2, 2, 2],
    
    [male, not_smoke, age40, ap180, 1, 1, 1, 2, 2],
    [male, not_smoke, age40, ap160, 1, 1, 1, 1, 1],
    [male, not_smoke, age40, ap140, 0, 1, 1, 1, 1],
    [male, not_smoke, age40, ap120, 0, 0, 1, 1, 1],
#######################################################
    [male, smoke, age65, ap180, 26, 30, 35, 41, 47],
    [male, smoke, age65, ap160, 18, 21, 25, 29, 34],
    [male, smoke, age65, ap140, 13, 15, 17, 20, 24],
    [male, smoke, age65, ap120, 9, 10, 12, 14, 17],
    
    [male, smoke, age60, ap180, 18, 21, 24, 28, 33],
    [male, smoke, age60, ap160, 12, 14, 17, 20, 24],
    [male, smoke, age60, ap140, 8, 10, 12, 14, 17],
    [male, smoke, age60, ap120, 6, 7, 8, 10, 12],
    
    [male, smoke, age55, ap180, 12, 13, 16, 19, 22],
    [male, smoke, age55, ap160, 8, 9, 11, 13, 16],
    [male, smoke, age55, ap140, 5, 6, 8, 9, 11],
    [male, smoke, age55, ap120, 4, 4, 5, 6, 8],
    
    [male, smoke, age45, ap180, 7, 8, 10, 12, 14],
    [male, smoke, age45, ap160, 5, 6, 7, 8, 10],
    [male, smoke, age45, ap140, 3, 4, 5, 6, 7],
    [male, smoke, age45, ap120, 2, 3, 3, 4, 5],
    
    [male, smoke, age40, ap180, 2, 2, 3, 3, 4],
    [male, smoke, age40, ap160, 1, 2, 2, 2, 3],
    [male, smoke, age40, ap140, 1, 1, 1, 2, 2],
    [male, smoke, age40, ap120, 1, 1, 1, 1, 1],
]


for gender, sm, age, aphi, c4, c5, c6, c7, c8 in data:
    all.ix[gender & sm & aphi & chol8 , 'score_scale_val'] = c8
    all.ix[gender & sm & aphi & chol7 , 'score_scale_val'] = c7
    all.ix[gender & sm & aphi & chol6 , 'score_scale_val'] = c6
    all.ix[gender & sm & aphi & chol5 , 'score_scale_val'] = c5
    all.ix[gender & sm & aphi & chol4 , 'score_scale_val'] = c4



all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [69]:
#6 + score
DATA_ID = 38


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(28), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(28), index_col='id')
all = pd.concat([train, test])


all['score_scale_val'] = 0

chol4 = all['cholesterol_all'] == 1
chol5 = all['cholesterol_all'] == 1
chol6 = all['cholesterol_all'] == 1
chol7 = all['cholesterol_all'] == 2
chol8 = (all['cholesterol_all'] == 3) 

male = all['gender_male'] == 1
female = all['gender_male'] == 0

smoke = all['smoke'] == 1
not_smoke = all['smoke'] == 0

age40 = (all['age']/365.25 >=40) & (all['age']/365.25 < 45)
age45 = (all['age']/365.25 >=45) & (all['age']/365.25 < 55)
age55 = (all['age']/365.25 >=55) & (all['age']/365.25 < 60)
age60 = (all['age']/365.25 >=60) & (all['age']/365.25 < 65)
age65 = (all['age']/365.25 >=65)

ap120 = (all['ap_hi'] >= 120) & (all['ap_hi'] < 140)
ap140 = (all['ap_hi'] >= 140) & (all['ap_hi'] < 160)
ap160 = (all['ap_hi'] >= 160) & (all['ap_hi'] < 170)
ap180 = (all['ap_hi'] >= 180) 


data = [
    [female, not_smoke, age65, ap180, 7, 8, 9, 10, 12],
    [female, not_smoke, age65, ap160, 5, 5, 6, 7, 8],
    [female, not_smoke, age65, ap140, 3, 3, 4, 5, 6],
    [female, not_smoke, age65, ap120, 2, 2, 3, 3, 4],
    
    [female, not_smoke, age60, ap180, 4, 4, 5, 6, 7],
    [female, not_smoke, age60, ap160, 3, 3, 3, 4, 5],
    [female, not_smoke, age60, ap140, 2, 2, 2, 3, 3],
    [female, not_smoke, age60, ap120, 1, 1, 2, 2, 2],
    
    [female, not_smoke, age55, ap180, 2, 2, 3, 3, 4],
    [female, not_smoke, age55, ap160, 1, 2, 2, 2, 3],
    [female, not_smoke, age55, ap140, 1, 1, 1, 1, 2],
    [female, not_smoke, age55, ap120, 1, 1, 1, 1, 1],
    
    [female, not_smoke, age45, ap180, 1, 1, 1, 2, 2],
    [female, not_smoke, age45, ap160, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap140, 0, 1, 1, 1, 1],
    [female, not_smoke, age45, ap120, 0, 0, 1, 1, 1],
    
    [female, not_smoke, age40, ap180, 0, 0, 0, 0, 0],
    [female, not_smoke, age40, ap160, 0, 0, 0, 0, 0],
    [female, not_smoke, age40, ap140, 0, 0, 0, 0, 0],
    [female, not_smoke, age40, ap120, 0, 0, 0, 0, 0],
#######################################################
    [female, smoke, age65, ap180, 13, 15, 17, 19, 22],
    [female, smoke, age65, ap160, 9, 10, 12, 13, 16],
    [female, smoke, age65, ap140, 6, 7, 8, 9, 11],
    [female, smoke, age65, ap120, 4, 5, 5, 6, 7],
    
    [female, smoke, age60, ap180, 8, 9, 10, 11, 13],
    [female, smoke, age60, ap160, 5, 6, 7, 8, 9],
    [female, smoke, age60, ap140, 3, 4, 5, 5, 6],
    [female, smoke, age60, ap120, 2, 3, 3, 4, 4],
    
    [female, smoke, age55, ap180, 4, 5, 5, 6, 7],
    [female, smoke, age55, ap160, 3, 3, 4, 4, 5],
    [female, smoke, age55, ap140, 2, 2, 2, 3, 3],
    [female, smoke, age55, ap120, 1, 1, 2, 2, 2],
    
    [female, smoke, age45, ap180, 2, 2, 3, 3, 4],
    [female, smoke, age45, ap160, 1, 2, 2, 2, 3],
    [female, smoke, age45, ap140, 1, 1, 1, 1, 2],
    [female, smoke, age45, ap120, 1, 1, 1, 1, 1],
    
    [female, smoke, age40, ap180, 0, 0, 0, 1, 1],
    [female, smoke, age40, ap160, 0, 0, 0, 0, 0],
    [female, smoke, age40, ap140, 0, 0, 0, 0, 0],
    [female, smoke, age40, ap120, 0, 0, 0, 0, 0],
#######################################################
    [male, not_smoke, age65, ap180, 14, 16, 19, 22, 26],
    [male, not_smoke, age65, ap160, 9, 11, 13, 15, 16],
    [male, not_smoke, age65, ap140, 6, 8, 9, 11, 13],
    [male, not_smoke, age65, ap120, 4, 5, 6, 7, 9],
    
    [male, not_smoke, age60, ap180, 9, 11, 13, 15, 18],
    [male, not_smoke, age60, ap160, 6, 7, 9, 10, 12],
    [male, not_smoke, age60, ap140, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap120, 3, 3, 4, 5, 6],
    
    [male, not_smoke, age55, ap180, 6, 7, 8, 10, 12],
    [male, not_smoke, age55, ap160, 4, 5, 6, 7, 8],
    [male, not_smoke, age55, ap140, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap120, 2, 2, 3, 3, 4],
    
    [male, not_smoke, age45, ap180, 4, 4, 5, 6, 7],
    [male, not_smoke, age45, ap160, 2, 3, 3, 4, 5],
    [male, not_smoke, age45, ap140, 2, 2, 2, 3, 3],
    [male, not_smoke, age45, ap120, 1, 1, 2, 2, 2],
    
    [male, not_smoke, age40, ap180, 1, 1, 1, 2, 2],
    [male, not_smoke, age40, ap160, 1, 1, 1, 1, 1],
    [male, not_smoke, age40, ap140, 0, 1, 1, 1, 1],
    [male, not_smoke, age40, ap120, 0, 0, 1, 1, 1],
#######################################################
    [male, smoke, age65, ap180, 26, 30, 35, 41, 47],
    [male, smoke, age65, ap160, 18, 21, 25, 29, 34],
    [male, smoke, age65, ap140, 13, 15, 17, 20, 24],
    [male, smoke, age65, ap120, 9, 10, 12, 14, 17],
    
    [male, smoke, age60, ap180, 18, 21, 24, 28, 33],
    [male, smoke, age60, ap160, 12, 14, 17, 20, 24],
    [male, smoke, age60, ap140, 8, 10, 12, 14, 17],
    [male, smoke, age60, ap120, 6, 7, 8, 10, 12],
    
    [male, smoke, age55, ap180, 12, 13, 16, 19, 22],
    [male, smoke, age55, ap160, 8, 9, 11, 13, 16],
    [male, smoke, age55, ap140, 5, 6, 8, 9, 11],
    [male, smoke, age55, ap120, 4, 4, 5, 6, 8],
    
    [male, smoke, age45, ap180, 7, 8, 10, 12, 14],
    [male, smoke, age45, ap160, 5, 6, 7, 8, 10],
    [male, smoke, age45, ap140, 3, 4, 5, 6, 7],
    [male, smoke, age45, ap120, 2, 3, 3, 4, 5],
    
    [male, smoke, age40, ap180, 2, 2, 3, 3, 4],
    [male, smoke, age40, ap160, 1, 2, 2, 2, 3],
    [male, smoke, age40, ap140, 1, 1, 1, 2, 2],
    [male, smoke, age40, ap120, 1, 1, 1, 1, 1],
]


for gender, sm, age, aphi, c4, c5, c6, c7, c8 in data:
    all.ix[gender & sm & aphi & chol8 , 'score_scale_val'] = c8
    all.ix[gender & sm & aphi & chol7 , 'score_scale_val'] = c7
    all.ix[gender & sm & aphi & chol6 , 'score_scale_val'] = c6
    all.ix[gender & sm & aphi & chol5 , 'score_scale_val'] = c5
    all.ix[gender & sm & aphi & chol4 , 'score_scale_val'] = c4



all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [ ]:


In [ ]:


In [47]:


In [52]:
all.columns


Out[52]:
Index(['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'smoke', 'alco', 'active',
       'gender_male', 'height_low', 'weight_low', 'cholesterol_all',
       'gluc_all', 'cholesterol_1', 'cholesterol_2', 'cholesterol_3', 'gluc_1',
       'gluc_2', 'gluc_3', 'ap_error', 'ap_error_swap', 'imt', 'imt_class_all',
       'imt_class_0', 'imt_class_1', 'imt_class_2', 'imt_class_3',
       'imt_class_4', 'imt_class_5', 'imt_class_6', 'x__age__gluc_all',
       'x__ap_hi__cholesterol_all', 'div6__height__gluc_all__imt',
       'plus__age_norm__ap_hi_norm__gluc_all_norm', 'x__age__weight',
       'div1__age__weight__cholesterol_all',
       'div6__age__weight__cholesterol_all',
       'plus__height_norm__weight_norm__gluc_all_norm',
       'div1__ap_hi__ap_lo__cholesterol_all',
       'div6__ap_hi__ap_lo__cholesterol_all',
       'plus__age_norm__gluc_all_norm__imt_norm',
       'minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
       'minus1__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
       'minus6__age_norm__ap_lo_norm__cholesterol_all_norm',
       'minus1__age_norm__ap_lo_norm__cholesterol_all_norm',
       'div6__height__weight__ap_lo', 'div2__ap_lo__cholesterol_all__gluc_all',
       'x__age__ap_hi__gluc_all', 'div5__ap_lo__cholesterol_all__gluc_all',
       'score_scale_val'],
      dtype='object')

In [ ]:
#38 + combinations
DATA_ID = 40


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(38), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(38), index_col='id')
all = pd.concat([train, test])


from itertools import combinations

cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', #'smoke', 'alco', 'active',
        'cholesterol_all', 'gluc_all', 'imt']

cols_norm = {}
for c in cols:
    all[c+'_norm'] = (all[c] - all[c].mean()) / (all[c].max() - all[c].min())
    cols_norm[c] = c+'_norm'
    

for c1, c2 in combinations(cols, 2):
    all["x__{}__{}".format(c1,c2)] = all[c1] * all[c2]
    all["div__{}__{}".format(c1,c2)] = all[c1] / (all[c2] if all[c2].min()>0 else all[c2]+1)
    all["plus__{}__{}".format(cols_norm[c1],cols_norm[c2])] = all[cols_norm[c1]] + all[cols_norm[c2]]
    all["minus__{}__{}".format(cols_norm[c1],cols_norm[c2])] = all[cols_norm[c1]] - all[cols_norm[c2]]
print(0)
for c1, c2, c3 in combinations(cols, 3):
    all["x__{}__{}__{}".format(c1,c2,c3)] = all[c1] * all[c2] * all[c3]
    all["plus__{}__{}__{}".format(cols_norm[c1],cols_norm[c2],cols_norm[c3])] = all[cols_norm[c1]] + all[cols_norm[c2]] + all[cols_norm[c3]]

    all["minus1__{}__{}__{}".format(cols_norm[c1],cols_norm[c2],cols_norm[c3])] = \
        all[cols_norm[c1]] + all[cols_norm[c2]] - all[cols_norm[c3]]
    all["minus2__{}__{}__{}".format(cols_norm[c1],cols_norm[c2],cols_norm[c3])] = \
        all[cols_norm[c1]] - all[cols_norm[c2]] + all[cols_norm[c3]]
    all["minus3__{}__{}__{}".format(cols_norm[c1],cols_norm[c2],cols_norm[c3])] = \
       - all[cols_norm[c1]] + all[cols_norm[c2]] + all[cols_norm[c3]]
    all["minus4__{}__{}__{}".format(cols_norm[c1],cols_norm[c2],cols_norm[c3])] = \
        all[cols_norm[c1]] - all[cols_norm[c2]] - all[cols_norm[c3]]
    all["minus5__{}__{}__{}".format(cols_norm[c1],cols_norm[c2],cols_norm[c3])] = \
       - all[cols_norm[c1]] + all[cols_norm[c2]] - all[cols_norm[c3]]
    all["minus6__{}__{}__{}".format(cols_norm[c1],cols_norm[c2],cols_norm[c3])] = \
       - all[cols_norm[c1]] - all[cols_norm[c2]] + all[cols_norm[c3]]
        
    all["div1__{}__{}__{}".format(c1,c2,c3)] = 1 * all[c1] * all[c2] / all[c3]
    all["div2__{}__{}__{}".format(c1,c2,c3)] = 1 * all[c1] / all[c2] * all[c3]
    all["div3__{}__{}__{}".format(c1,c2,c3)] = 1 / all[c1] * all[c2] * all[c3]
    all["div4__{}__{}__{}".format(c1,c2,c3)] = 1 * all[c1] / all[c2] / all[c3]
    all["div5__{}__{}__{}".format(c1,c2,c3)] = 1 / all[c1] * all[c2] / all[c3]
    all["div6__{}__{}__{}".format(c1,c2,c3)] = 1 / all[c1] / all[c2] * all[c3]
        
    
print(1)
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
print(2)
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


0
1
2
Out[ ]:
935

In [ ]:


In [ ]:


In [ ]:


In [59]:
#36 + kmean
DATA_ID = 41


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(36), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(36), index_col='id')
all = pd.concat([train, test])

from sklearn.cluster import KMeans

k15 = KMeans(n_clusters=15, precompute_distances = True, n_jobs=-1)
k15.fit(all.fillna(0))
k7 = KMeans(n_clusters=7, precompute_distances = True, n_jobs=-1)
k7.fit(all.fillna(0))
k3 = KMeans(n_clusters=3, precompute_distances = True, n_jobs=-1)
k3.fit(all.fillna(0))


k15_res = k15.predict(all.fillna(0))[None].T
k7_res =  k7.predict(all.fillna(0))[None].T
k3_res =  k3.predict(all.fillna(0))[None].T

all['k15'] = k15_res
all['k7'] = k7_res
all['k3'] = k3_res

all = pd.get_dummies(all, columns=["k15", "k7", "k3",])#


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[59]:
76

In [ ]:


In [ ]:


In [ ]:


In [66]:
#36 + fillna 001
DATA_ID = 42


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(36), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(36), index_col='id')
all = pd.concat([train, test])

all['smoke'].fillna(0, inplace=True)
all['alco'].fillna(0, inplace=True)
all['active'].fillna(1, inplace=True)

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[66]:
51

In [ ]:


In [ ]:


In [ ]:


In [67]:
#36 + kmean
DATA_ID = 43


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(42), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(42), index_col='id')
all = pd.concat([train, test])

from sklearn.cluster import KMeans

all2=all[['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'smoke', 'alco', 'active',
       'gender_male', 'height_low', 'weight_low', 'cholesterol_all',
       'gluc_all', 'ap_error', 'ap_error_swap', 'imt', 'imt_class_all',
       ]]

k15 = KMeans(n_clusters=15, precompute_distances = True, n_jobs=-1)
k15.fit(all2)
k7 = KMeans(n_clusters=7, precompute_distances = True, n_jobs=-1)
k7.fit(all2)
k3 = KMeans(n_clusters=3, precompute_distances = True, n_jobs=-1)
k3.fit(all2)


k15_res = k15.predict(all2)[None].T
k7_res =  k7.predict(all2)[None].T
k3_res =  k3.predict(all2)[None].T

all['k15'] = k15_res
all['k7'] = k7_res
all['k3'] = k3_res

all = pd.get_dummies(all, columns=["k15", "k7", "k3",])#


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[67]:
76

In [76]:
#36 + kmean
DATA_ID = 44


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(42), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(42), index_col='id')
all = pd.concat([train, test])

from sklearn.cluster import KMeans

all2=all[['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'smoke', 'alco', 'active',
       'gender_male', 'height_low', 'weight_low', 'cholesterol_all',
       'gluc_all', 'ap_error', 'ap_error_swap', 'imt', 'imt_class_all',
       ]]

for c in ['age', 'height', 'weight', 'ap_hi', 'ap_lo',         'cholesterol_all',       'gluc_all', 'imt', 'imt_class_all']:
    all2[c] = (all2[c]-all2[c].mean())/(all2[c].max()-all2[c].min())

#k15 = KMeans(n_clusters=15, precompute_distances = True, n_jobs=-1)
#k15.fit(all2)
k7 = KMeans(n_clusters=7, precompute_distances = True, n_jobs=-1)
k7.fit(all2)
k3 = KMeans(n_clusters=3, precompute_distances = True, n_jobs=-1)
k3.fit(all2)


#k15_res = k15.predict(all2)[None].T
k7_res =  k7.predict(all2)[None].T
k3_res =  k3.predict(all2)[None].T

#all['k15'] = k15_res
all['k7'] = k7_res
all['k3'] = k3_res

all = pd.get_dummies(all, columns=[ "k7", "k3",])#k15


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


d:\python36\lib\site-packages\ipykernel\__main__.py:17: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[76]:
61

In [ ]:


In [ ]:


In [107]:
#43 + stat feat for add
DATA_ID = 45


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(43), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(43), index_col='id')
all = pd.concat([train, test])

is_male = all['gender_male'] == 1
is_female = all['gender_male'] == 1

is_smoke = all['smoke'] == 1
is_smoke = all['smoke'] == 1

for c in ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 
       'cholesterol_all',
       'gluc_all',  'imt', 'imt_class_all',
        'x__age__gluc_all',
       'x__ap_hi__cholesterol_all', 'div6__height__gluc_all__imt',
       'plus__age_norm__ap_hi_norm__gluc_all_norm', 'x__age__weight',
       'div1__age__weight__cholesterol_all',
       'div6__age__weight__cholesterol_all',
       'plus__height_norm__weight_norm__gluc_all_norm',
       'div1__ap_hi__ap_lo__cholesterol_all',
       'div6__ap_hi__ap_lo__cholesterol_all',
       'plus__age_norm__gluc_all_norm__imt_norm',
       'minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
       'minus1__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
       'minus6__age_norm__ap_lo_norm__cholesterol_all_norm',
       'minus1__age_norm__ap_lo_norm__cholesterol_all_norm',
       'div6__height__weight__ap_lo', 'div2__ap_lo__cholesterol_all__gluc_all',
       'x__age__ap_hi__gluc_all', 'div5__ap_lo__cholesterol_all__gluc_all',
       'score_scale_val']:

    all.ix[all['gender_male'] == 1, c + '__gender__scale'] = \
        (all.ix[all['gender_male'] == 1, c] - all.groupby(['gender_male']).mean()[c].loc[1]) / \
        all.groupby(['gender_male']).std()[c].loc[1]
        
    all.ix[all['gender_male'] == 0, c + '__gender__scale'] = \
        (all.ix[all['gender_male'] == 0, c] - all.groupby(['gender_male']).mean()[c].loc[0]) / \
        all.groupby(['gender_male']).std()[c].loc[0]
        
        
        
        




all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[107]:
105

In [ ]:


In [105]:
all.groupby(['gender_male', 'smoke']).mean()


Out[105]:
age height weight ap_hi ap_lo alco active height_low weight_low cholesterol_all ... plus__age_norm__gluc_all_norm__imt_norm_gender_scale minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm_gender_scale minus1__ap_hi_norm__ap_lo_norm__cholesterol_all_norm_gender_scale minus6__age_norm__ap_lo_norm__cholesterol_all_norm_gender_scale minus1__age_norm__ap_lo_norm__cholesterol_all_norm_gender_scale div6__height__weight__ap_lo_gender_scale div2__ap_lo__cholesterol_all__gluc_all_gender_scale x__age__ap_hi__gluc_all_gender_scale div5__ap_lo__cholesterol_all__gluc_all_gender_scale score_scale_val_gender_scale
gender_male smoke
0 0.0 19511.488273 161.347626 72.450696 126.270679 81.074454 0.022137 0.807381 0.000611 0.001050 1.381977 ... 0.000205 -0.003134 0.003134 -0.004658 0.004658 0.002100 0.001129 -0.000352 -0.002767 -0.027219
1.0 19042.847443 161.776896 75.145679 126.549383 81.329806 0.196649 0.814815 0.000882 0.000882 1.514109 ... -0.011514 0.176286 -0.176286 0.261971 -0.261971 -0.118141 -0.063509 0.019807 0.155614 1.530311
1 0.0 19469.017126 169.801467 77.212309 128.452851 82.514091 0.048811 0.805405 0.000361 0.000325 1.324698 ... 0.014544 -0.014505 0.014505 -0.030008 0.030008 0.007961 0.009431 0.005548 -0.020078 -0.321040
1.0 19112.323879 170.431253 77.669408 128.913695 82.585764 0.303485 0.849136 0.000540 0.000270 1.376553 ... -0.054370 0.054224 -0.054224 0.112179 -0.112179 -0.029759 -0.035255 -0.020740 0.075057 1.186715

4 rows × 103 columns


In [86]:
all.groupby(['gender_male']).std()


Out[86]:
age height weight ap_hi ap_lo smoke alco active height_low weight_low ... k7_0 k7_1 k7_2 k7_3 k7_4 k7_5 k7_6 k3_0 k3_1 k3_2
gender_male
0 2434.060352 6.887387 14.183831 17.043531 9.726323 0.131009 0.156690 0.394258 0.024815 0.032348 ... 0.397102 0.393716 0.297547 0.341563 0.289127 0.342466 0.358297 0.473618 0.427112 0.493601
1 2534.849592 7.005219 14.170650 16.877038 9.487594 0.408058 0.303387 0.388599 0.019973 0.017705 ... 0.385405 0.381082 0.318908 0.358376 0.307452 0.324170 0.358043 0.473504 0.448586 0.485756

2 rows × 75 columns


In [ ]:


In [108]:
list(all.columns)


Out[108]:
['age',
 'height',
 'weight',
 'ap_hi',
 'ap_lo',
 'smoke',
 'alco',
 'active',
 'gender_male',
 'height_low',
 'weight_low',
 'cholesterol_all',
 'gluc_all',
 'cholesterol_1',
 'cholesterol_2',
 'cholesterol_3',
 'gluc_1',
 'gluc_2',
 'gluc_3',
 'ap_error',
 'ap_error_swap',
 'imt',
 'imt_class_all',
 'imt_class_0',
 'imt_class_1',
 'imt_class_2',
 'imt_class_3',
 'imt_class_4',
 'imt_class_5',
 'imt_class_6',
 'x__age__gluc_all',
 'x__ap_hi__cholesterol_all',
 'div6__height__gluc_all__imt',
 'plus__age_norm__ap_hi_norm__gluc_all_norm',
 'x__age__weight',
 'div1__age__weight__cholesterol_all',
 'div6__age__weight__cholesterol_all',
 'plus__height_norm__weight_norm__gluc_all_norm',
 'div1__ap_hi__ap_lo__cholesterol_all',
 'div6__ap_hi__ap_lo__cholesterol_all',
 'plus__age_norm__gluc_all_norm__imt_norm',
 'minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
 'minus1__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
 'minus6__age_norm__ap_lo_norm__cholesterol_all_norm',
 'minus1__age_norm__ap_lo_norm__cholesterol_all_norm',
 'div6__height__weight__ap_lo',
 'div2__ap_lo__cholesterol_all__gluc_all',
 'x__age__ap_hi__gluc_all',
 'div5__ap_lo__cholesterol_all__gluc_all',
 'score_scale_val_v3',
 'score_scale_val',
 'k15_0',
 'k15_1',
 'k15_2',
 'k15_3',
 'k15_4',
 'k15_5',
 'k15_6',
 'k15_7',
 'k15_8',
 'k15_9',
 'k15_10',
 'k15_11',
 'k15_12',
 'k15_13',
 'k15_14',
 'k7_0',
 'k7_1',
 'k7_2',
 'k7_3',
 'k7_4',
 'k7_5',
 'k7_6',
 'k3_0',
 'k3_1',
 'k3_2',
 'age__gender__scale',
 'height__gender__scale',
 'weight__gender__scale',
 'ap_hi__gender__scale',
 'ap_lo__gender__scale',
 'cholesterol_all__gender__scale',
 'gluc_all__gender__scale',
 'imt__gender__scale',
 'imt_class_all__gender__scale',
 'x__age__gluc_all__gender__scale',
 'x__ap_hi__cholesterol_all__gender__scale',
 'div6__height__gluc_all__imt__gender__scale',
 'plus__age_norm__ap_hi_norm__gluc_all_norm__gender__scale',
 'x__age__weight__gender__scale',
 'div1__age__weight__cholesterol_all__gender__scale',
 'div6__age__weight__cholesterol_all__gender__scale',
 'plus__height_norm__weight_norm__gluc_all_norm__gender__scale',
 'div1__ap_hi__ap_lo__cholesterol_all__gender__scale',
 'div6__ap_hi__ap_lo__cholesterol_all__gender__scale',
 'plus__age_norm__gluc_all_norm__imt_norm__gender__scale',
 'minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm__gender__scale',
 'minus1__ap_hi_norm__ap_lo_norm__cholesterol_all_norm__gender__scale',
 'minus6__age_norm__ap_lo_norm__cholesterol_all_norm__gender__scale',
 'minus1__age_norm__ap_lo_norm__cholesterol_all_norm__gender__scale',
 'div6__height__weight__ap_lo__gender__scale',
 'div2__ap_lo__cholesterol_all__gluc_all__gender__scale',
 'x__age__ap_hi__gluc_all__gender__scale',
 'div5__ap_lo__cholesterol_all__gluc_all__gender__scale',
 'score_scale_val__gender__scale']

In [ ]:


In [111]:
#43 + na subjactive
DATA_ID = 46


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(43), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(43), index_col='id')
test_na = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(1), index_col='id')

test.alco=test_na.alco
test.smoke=test_na.smoke
test.active=test_na.active
all = pd.concat([train, test])


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[111]:
76

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [112]:
#43 + na subjactive
DATA_ID = 47


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(20), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(20), index_col='id')
all = pd.concat([train, test])

cols =[
    'age',
    'height',
    'weight',
    'ap_hi',
    'ap_lo',
    'smoke',
    'alco',
    'active',
    'gender_male',
    'height_low',
    'weight_low',
    'cholesterol_all',
    'gluc_all',
    'cholesterol_1',
    'cholesterol_2',
    'cholesterol_3',
    'gluc_1',
    'gluc_2',
    'gluc_3',
    'ap_error',
    'ap_error_swap',
    'imt',
    'imt_class_all',
    'imt_class_0',
    'imt_class_1',
    'imt_class_2',
    'imt_class_3',
    'imt_class_4',
    'imt_class_5',
    'imt_class_6',
    'x__age__gluc_all',
    'x__ap_hi__cholesterol_all',
'div6__height__gluc_all__imt',
'plus__age_norm__ap_hi_norm__gluc_all_norm',
    'x__age__weight',
    'div1__age__weight__cholesterol_all',
'div6__age__weight__cholesterol_all',
'plus__height_norm__weight_norm__gluc_all_norm',
'div1__ap_hi__ap_lo__cholesterol_all',
'div6__ap_hi__ap_lo__cholesterol_all',
'plus__age_norm__gluc_all_norm__imt_norm',
'minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
'minus1__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
'minus6__age_norm__ap_lo_norm__cholesterol_all_norm',
'minus1__age_norm__ap_lo_norm__cholesterol_all_norm',
'div6__height__weight__ap_lo',
'div2__ap_lo__cholesterol_all__gluc_all',
'x__age__ap_hi__gluc_all',
'div5__ap_lo__cholesterol_all__gluc_all',
]

all = all[cols]

all['smoke'].fillna(0, inplace=True)
all['alco'].fillna(0, inplace=True)
all['active'].fillna(1, inplace=True)



all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [113]:
#6 + score
DATA_ID = 48


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(47), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(47), index_col='id')
all = pd.concat([train, test])


all['score_scale_val'] = 0

chol4 = all['cholesterol_all'] == 1
chol5 = all['cholesterol_all'] == 1
chol6 = all['cholesterol_all'] == 1
chol7 = all['cholesterol_all'] == 2
chol8 = (all['cholesterol_all'] == 3) 

male = all['gender_male'] == 1
female = all['gender_male'] == 0

smoke = all['smoke'] == 1
not_smoke = all['smoke'] == 0

age40 = (all['age']/365.25 >=40) & (all['age']/365.25 < 45)
age45 = (all['age']/365.25 >=45) & (all['age']/365.25 < 55)
age55 = (all['age']/365.25 >=55) & (all['age']/365.25 < 60)
age60 = (all['age']/365.25 >=60) & (all['age']/365.25 < 65)
age65 = (all['age']/365.25 >=65)

ap120 = (all['ap_hi'] >= 120) & (all['ap_hi'] < 140)
ap140 = (all['ap_hi'] >= 140) & (all['ap_hi'] < 160)
ap160 = (all['ap_hi'] >= 160) & (all['ap_hi'] < 170)
ap180 = (all['ap_hi'] >= 180) 


data = [
    [female, not_smoke, age65, ap180, 7, 8, 9, 10, 12],
    [female, not_smoke, age65, ap160, 5, 5, 6, 7, 8],
    [female, not_smoke, age65, ap140, 3, 3, 4, 5, 6],
    [female, not_smoke, age65, ap120, 2, 2, 3, 3, 4],
    
    [female, not_smoke, age60, ap180, 4, 4, 5, 6, 7],
    [female, not_smoke, age60, ap160, 3, 3, 3, 4, 5],
    [female, not_smoke, age60, ap140, 2, 2, 2, 3, 3],
    [female, not_smoke, age60, ap120, 1, 1, 2, 2, 2],
    
    [female, not_smoke, age55, ap180, 2, 2, 3, 3, 4],
    [female, not_smoke, age55, ap160, 1, 2, 2, 2, 3],
    [female, not_smoke, age55, ap140, 1, 1, 1, 1, 2],
    [female, not_smoke, age55, ap120, 1, 1, 1, 1, 1],
    
    [female, not_smoke, age45, ap180, 1, 1, 1, 2, 2],
    [female, not_smoke, age45, ap160, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap140, 0, 1, 1, 1, 1],
    [female, not_smoke, age45, ap120, 0, 0, 1, 1, 1],
    
#     [female, not_smoke, age40, ap180, , , , , ],
#     [female, not_smoke, age40, ap160, , , , , ],
#     [female, not_smoke, age40, ap140, , , , , ],
#     [female, not_smoke, age40, ap120, , , , , ],
#######################################################
    [female, smoke, age65, ap180, 13, 15, 17, 19, 22],
    [female, smoke, age65, ap160, 9, 10, 12, 13, 16],
    [female, smoke, age65, ap140, 6, 7, 8, 9, 11],
    [female, smoke, age65, ap120, 4, 5, 5, 6, 7],
    
    [female, smoke, age60, ap180, 8, 9, 10, 11, 13],
    [female, smoke, age60, ap160, 5, 6, 7, 8, 9],
    [female, smoke, age60, ap140, 3, 4, 5, 5, 6],
    [female, smoke, age60, ap120, 2, 3, 3, 4, 4],
    
    [female, smoke, age55, ap180, 4, 5, 5, 6, 7],
    [female, smoke, age55, ap160, 3, 3, 4, 4, 5],
    [female, smoke, age55, ap140, 2, 2, 2, 3, 3],
    [female, smoke, age55, ap120, 1, 1, 2, 2, 2],
    
    [female, smoke, age45, ap180, 2, 2, 3, 3, 4],
    [female, smoke, age45, ap160, 1, 2, 2, 2, 3],
    [female, smoke, age45, ap140, 1, 1, 1, 1, 2],
    [female, smoke, age45, ap120, 1, 1, 1, 1, 1],
    
    [female, smoke, age40, ap180, 0, 0, 0, 1, 1],
#     [female, smoke, age40, ap160, , , , , ],
#     [female, smoke, age40, ap140, , , , , ],
#     [female, smoke, age40, ap120, , , , , ],
#######################################################
    [male, not_smoke, age65, ap180, 14, 16, 19, 22, 26],
    [male, not_smoke, age65, ap160, 9, 11, 13, 15, 16],
    [male, not_smoke, age65, ap140, 6, 8, 9, 11, 13],
    [male, not_smoke, age65, ap120, 4, 5, 6, 7, 9],
    
    [male, not_smoke, age60, ap180, 9, 11, 13, 15, 18],
    [male, not_smoke, age60, ap160, 6, 7, 9, 10, 12],
    [male, not_smoke, age60, ap140, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap120, 3, 3, 4, 5, 6],
    
    [male, not_smoke, age55, ap180, 6, 7, 8, 10, 12],
    [male, not_smoke, age55, ap160, 4, 5, 6, 7, 8],
    [male, not_smoke, age55, ap140, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap120, 2, 2, 3, 3, 4],
    
    [male, not_smoke, age45, ap180, 4, 4, 5, 6, 7],
    [male, not_smoke, age45, ap160, 2, 3, 3, 4, 5],
    [male, not_smoke, age45, ap140, 2, 2, 2, 3, 3],
    [male, not_smoke, age45, ap120, 1, 1, 2, 2, 2],
    
    [male, not_smoke, age40, ap180, 1, 1, 1, 2, 2],
    [male, not_smoke, age40, ap160, 1, 1, 1, 1, 1],
    [male, not_smoke, age40, ap140, 0, 1, 1, 1, 1],
    [male, not_smoke, age40, ap120, 0, 0, 1, 1, 1],
#######################################################
    [male, smoke, age65, ap180, 26, 30, 35, 41, 47],
    [male, smoke, age65, ap160, 18, 21, 25, 29, 34],
    [male, smoke, age65, ap140, 13, 15, 17, 20, 24],
    [male, smoke, age65, ap120, 9, 10, 12, 14, 17],
    
    [male, smoke, age60, ap180, 18, 21, 24, 28, 33],
    [male, smoke, age60, ap160, 12, 14, 17, 20, 24],
    [male, smoke, age60, ap140, 8, 10, 12, 14, 17],
    [male, smoke, age60, ap120, 6, 7, 8, 10, 12],
    
    [male, smoke, age55, ap180, 12, 13, 16, 19, 22],
    [male, smoke, age55, ap160, 8, 9, 11, 13, 16],
    [male, smoke, age55, ap140, 5, 6, 8, 9, 11],
    [male, smoke, age55, ap120, 4, 4, 5, 6, 8],
    
    [male, smoke, age45, ap180, 7, 8, 10, 12, 14],
    [male, smoke, age45, ap160, 5, 6, 7, 8, 10],
    [male, smoke, age45, ap140, 3, 4, 5, 6, 7],
    [male, smoke, age45, ap120, 2, 3, 3, 4, 5],
    
    [male, smoke, age40, ap180, 2, 2, 3, 3, 4],
    [male, smoke, age40, ap160, 1, 2, 2, 2, 3],
    [male, smoke, age40, ap140, 1, 1, 1, 2, 2],
    [male, smoke, age40, ap120, 1, 1, 1, 1, 1],
]


for gender, sm, age, aphi, c4, c5, c6, c7, c8 in data:
    all.ix[gender & sm & aphi & chol8 , 'score_scale_val'] = c8
    all.ix[gender & sm & aphi & chol7 , 'score_scale_val'] = c7
    all.ix[gender & sm & aphi & chol6 , 'score_scale_val'] = c6
    all.ix[gender & sm & aphi & chol5 , 'score_scale_val'] = c5
    all.ix[gender & sm & aphi & chol4 , 'score_scale_val'] = c4



all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [126]:
#6 + score
DATA_ID = 49


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(47), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(47), index_col='id')
all = pd.concat([train, test])


#all['score_scale_val'] = 0

chol4 = all['cholesterol_all'] == 1
chol5 = all['cholesterol_all'] == 1
chol6 = all['cholesterol_all'] == 1
chol7 = all['cholesterol_all'] == 2
chol8 = (all['cholesterol_all'] == 3) 

male = all['gender_male'] == 1
female = all['gender_male'] == 0

smoke = all['smoke'] == 1
not_smoke = all['smoke'] == 0

age40 = (all['age']/365.25 >=40) & (all['age']/365.25 < 45)
age45 = (all['age']/365.25 >=45) & (all['age']/365.25 < 55)
age55 = (all['age']/365.25 >=55) & (all['age']/365.25 < 60)
age60 = (all['age']/365.25 >=60) & (all['age']/365.25 < 65)
age65 = (all['age']/365.25 >=65)

ap120 = (all['ap_hi'] >= 120) & (all['ap_hi'] < 140)
ap140 = (all['ap_hi'] >= 140) & (all['ap_hi'] < 160)
ap160 = (all['ap_hi'] >= 160) & (all['ap_hi'] < 170)
ap180 = (all['ap_hi'] >= 180) 


data = [
    [female, not_smoke, age65, ap180, 7, 8, 9, 10, 12],
    [female, not_smoke, age65, ap160, 5, 5, 6, 7, 8],
    [female, not_smoke, age65, ap140, 3, 3, 4, 5, 6],
    [female, not_smoke, age65, ap120, 2, 2, 3, 3, 4],
    
    [female, not_smoke, age60, ap180, 4, 4, 5, 6, 7],
    [female, not_smoke, age60, ap160, 3, 3, 3, 4, 5],
    [female, not_smoke, age60, ap140, 2, 2, 2, 3, 3],
    [female, not_smoke, age60, ap120, 1, 1, 2, 2, 2],
    
    [female, not_smoke, age55, ap180, 2, 2, 3, 3, 4],
    [female, not_smoke, age55, ap160, 1, 2, 2, 2, 3],
    [female, not_smoke, age55, ap140, 1, 1, 1, 1, 2],
    [female, not_smoke, age55, ap120, 1, 1, 1, 1, 1],
    
    [female, not_smoke, age45, ap180, 1, 1, 1, 2, 2],
    [female, not_smoke, age45, ap160, 1, 1, 1, 1, 1],
    [female, not_smoke, age45, ap140, 0, 1, 1, 1, 1],
    [female, not_smoke, age45, ap120, 0, 0, 1, 1, 1],
    
#     [female, not_smoke, age40, ap180, , , , , ],
#     [female, not_smoke, age40, ap160, , , , , ],
#     [female, not_smoke, age40, ap140, , , , , ],
#     [female, not_smoke, age40, ap120, , , , , ],
#######################################################
    [female, smoke, age65, ap180, 13, 15, 17, 19, 22],
    [female, smoke, age65, ap160, 9, 10, 12, 13, 16],
    [female, smoke, age65, ap140, 6, 7, 8, 9, 11],
    [female, smoke, age65, ap120, 4, 5, 5, 6, 7],
    
    [female, smoke, age60, ap180, 8, 9, 10, 11, 13],
    [female, smoke, age60, ap160, 5, 6, 7, 8, 9],
    [female, smoke, age60, ap140, 3, 4, 5, 5, 6],
    [female, smoke, age60, ap120, 2, 3, 3, 4, 4],
    
    [female, smoke, age55, ap180, 4, 5, 5, 6, 7],
    [female, smoke, age55, ap160, 3, 3, 4, 4, 5],
    [female, smoke, age55, ap140, 2, 2, 2, 3, 3],
    [female, smoke, age55, ap120, 1, 1, 2, 2, 2],
    
    [female, smoke, age45, ap180, 2, 2, 3, 3, 4],
    [female, smoke, age45, ap160, 1, 2, 2, 2, 3],
    [female, smoke, age45, ap140, 1, 1, 1, 1, 2],
    [female, smoke, age45, ap120, 1, 1, 1, 1, 1],
    
    [female, smoke, age40, ap180, 0, 0, 0, 1, 1],
#     [female, smoke, age40, ap160, , , , , ],
#     [female, smoke, age40, ap140, , , , , ],
#     [female, smoke, age40, ap120, , , , , ],
#######################################################
    [male, not_smoke, age65, ap180, 14, 16, 19, 22, 26],
    [male, not_smoke, age65, ap160, 9, 11, 13, 15, 16],
    [male, not_smoke, age65, ap140, 6, 8, 9, 11, 13],
    [male, not_smoke, age65, ap120, 4, 5, 6, 7, 9],
    
    [male, not_smoke, age60, ap180, 9, 11, 13, 15, 18],
    [male, not_smoke, age60, ap160, 6, 7, 9, 10, 12],
    [male, not_smoke, age60, ap140, 4, 5, 6, 7, 9],
    [male, not_smoke, age60, ap120, 3, 3, 4, 5, 6],
    
    [male, not_smoke, age55, ap180, 6, 7, 8, 10, 12],
    [male, not_smoke, age55, ap160, 4, 5, 6, 7, 8],
    [male, not_smoke, age55, ap140, 3, 3, 4, 5, 6],
    [male, not_smoke, age55, ap120, 2, 2, 3, 3, 4],
    
    [male, not_smoke, age45, ap180, 4, 4, 5, 6, 7],
    [male, not_smoke, age45, ap160, 2, 3, 3, 4, 5],
    [male, not_smoke, age45, ap140, 2, 2, 2, 3, 3],
    [male, not_smoke, age45, ap120, 1, 1, 2, 2, 2],
    
    [male, not_smoke, age40, ap180, 1, 1, 1, 2, 2],
    [male, not_smoke, age40, ap160, 1, 1, 1, 1, 1],
    [male, not_smoke, age40, ap140, 0, 1, 1, 1, 1],
    [male, not_smoke, age40, ap120, 0, 0, 1, 1, 1],
#######################################################
    [male, smoke, age65, ap180, 26, 30, 35, 41, 47],
    [male, smoke, age65, ap160, 18, 21, 25, 29, 34],
    [male, smoke, age65, ap140, 13, 15, 17, 20, 24],
    [male, smoke, age65, ap120, 9, 10, 12, 14, 17],
    
    [male, smoke, age60, ap180, 18, 21, 24, 28, 33],
    [male, smoke, age60, ap160, 12, 14, 17, 20, 24],
    [male, smoke, age60, ap140, 8, 10, 12, 14, 17],
    [male, smoke, age60, ap120, 6, 7, 8, 10, 12],
    
    [male, smoke, age55, ap180, 12, 13, 16, 19, 22],
    [male, smoke, age55, ap160, 8, 9, 11, 13, 16],
    [male, smoke, age55, ap140, 5, 6, 8, 9, 11],
    [male, smoke, age55, ap120, 4, 4, 5, 6, 8],
    
    [male, smoke, age45, ap180, 7, 8, 10, 12, 14],
    [male, smoke, age45, ap160, 5, 6, 7, 8, 10],
    [male, smoke, age45, ap140, 3, 4, 5, 6, 7],
    [male, smoke, age45, ap120, 2, 3, 3, 4, 5],
    
    [male, smoke, age40, ap180, 2, 2, 3, 3, 4],
    [male, smoke, age40, ap160, 1, 2, 2, 2, 3],
    [male, smoke, age40, ap140, 1, 1, 1, 2, 2],
    [male, smoke, age40, ap120, 1, 1, 1, 1, 1],
]


for gender, sm, age, aphi, c4, c5, c6, c7, c8 in data:
    all.ix[gender & sm & aphi & chol8 , 'score_scale_val'] = c8
    all.ix[gender & sm & aphi & chol7 , 'score_scale_val'] = c7
    all.ix[gender & sm & aphi & chol6 , 'score_scale_val'] = c6
    all.ix[gender & sm & aphi & chol5 , 'score_scale_val'] = c5
    all.ix[gender & sm & aphi & chol4 , 'score_scale_val'] = c4



all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

In [141]:
DATA_ID = 50


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(49), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(49), index_col='id')
all = pd.concat([train, test])



from sklearn.cluster import KMeans

k15 = KMeans(n_clusters=15, precompute_distances = True, n_jobs=-1)
k15.fit(all.fillna(0))
k7 = KMeans(n_clusters=7, precompute_distances = True, n_jobs=-1)
k7.fit(all.fillna(0))
k3 = KMeans(n_clusters=3, precompute_distances = True, n_jobs=-1)
k3.fit(all.fillna(0))


k15_res = k15.predict(all.fillna(0))[None].T
k7_res =  k7.predict(all.fillna(0))[None].T
k3_res =  k3.predict(all.fillna(0))[None].T

all['k15'] = k15_res
all['k7'] = k7_res
all['k3'] = k3_res

all = pd.get_dummies(all, columns=["k15", "k7", "k3",])#


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[141]:
75

In [142]:
#43 + stat feat for add
DATA_ID = 51


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(50), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(50), index_col='id')
all = pd.concat([train, test])

is_male = all['gender_male'] == 1
is_female = all['gender_male'] == 0

chol1 = all['cholesterol_all'] == 1
chol2 = all['cholesterol_all'] == 2
chol3 = all['cholesterol_all'] == 3

gluc1 = all['gluc_all'] == 1
gluc2 = all['gluc_all'] == 2
gluc3 = all['gluc_all'] == 3

age_year = all['age'] // 365.25
all.ix[age_year < 40 , 'age_cat'] = 1
all.ix[(age_year >= 40) & (age_year < 45) , 'age_cat'] = 2
all.ix[(age_year >= 45) & (age_year < 50) , 'age_cat'] = 3
all.ix[(age_year >= 50) & (age_year < 55) , 'age_cat'] = 4
all.ix[(age_year >= 55) & (age_year < 60) , 'age_cat'] = 5
all.ix[(age_year >= 60) , 'age_cat'] = 6

age1 = all['age_cat'] == 1
age2 = all['age_cat'] == 2
age3 = all['age_cat'] == 3
age4 = all['age_cat'] == 4
age5 = all['age_cat'] == 5
age6 = all['age_cat'] == 6

for c in ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 
       'cholesterol_all',
       'gluc_all',  'imt', 'imt_class_all',
        'x__age__gluc_all',
       'x__ap_hi__cholesterol_all', 'div6__height__gluc_all__imt',
       'plus__age_norm__ap_hi_norm__gluc_all_norm', 'x__age__weight',
       'div1__age__weight__cholesterol_all',
       'div6__age__weight__cholesterol_all',
       'plus__height_norm__weight_norm__gluc_all_norm',
       'div1__ap_hi__ap_lo__cholesterol_all',
       'div6__ap_hi__ap_lo__cholesterol_all',
       'plus__age_norm__gluc_all_norm__imt_norm',
       'minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
       'minus1__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
       'minus6__age_norm__ap_lo_norm__cholesterol_all_norm',
       'minus1__age_norm__ap_lo_norm__cholesterol_all_norm',
       'div6__height__weight__ap_lo', 'div2__ap_lo__cholesterol_all__gluc_all',
       'x__age__ap_hi__gluc_all', 'div5__ap_lo__cholesterol_all__gluc_all',
       'score_scale_val']:

    all.ix[is_male, c + '___gender__scale'] = \
        (all.ix[is_male, c] - all.groupby(['gender_male']).mean()[c].loc[1]) / \
        all.groupby(['gender_male']).std()[c].loc[1]
        
    all.ix[is_female, c + '___gender__scale'] = \
        (all.ix[is_female, c] - all.groupby(['gender_male']).mean()[c].loc[0]) / \
        all.groupby(['gender_male']).std()[c].loc[0]
        
    if c != 'cholesterol_all':
        group_by = all.groupby(['gender_male', 'cholesterol_all'])
        
        all.ix[is_male & chol1, c + '___gender_chol__scale'] = \
            (all.ix[is_male & chol1, c] - group_by.mean()[c].loc[1, 1]) / group_by.std()[c].loc[1, 1]

        all.ix[is_male & chol2, c + '___gender_chol__scale'] = \
            (all.ix[is_male & chol2, c] - group_by.mean()[c].loc[1, 2]) / group_by.std()[c].loc[1, 2]

        all.ix[is_male & chol3, c + '___gender_chol__scale'] = \
            (all.ix[is_male & chol3, c] - group_by.mean()[c].loc[1, 3]) / group_by.std()[c].loc[1, 3]


        all.ix[is_female & chol1, c + '___gender_chol__scale'] = \
            (all.ix[is_female & chol1, c] - group_by.mean()[c].loc[0, 1]) / group_by.std()[c].loc[0, 1]

        all.ix[is_female & chol2, c + '___gender_chol__scale'] = \
            (all.ix[is_female & chol2, c] - group_by.mean()[c].loc[0, 2]) / group_by.std()[c].loc[0, 2]

        all.ix[is_female & chol3, c + '___gender_chol__scale'] = \
            (all.ix[is_female & chol3, c] - group_by.mean()[c].loc[0, 3]) / group_by.std()[c].loc[0, 3]
            
    if c != 'gluc_all':
        group_by = all.groupby(['gender_male', 'gluc_all'])
        
        all.ix[is_male & gluc1, c + '___gender_gluc__scale'] = \
            (all.ix[is_male & gluc1, c] - group_by.mean()[c].loc[1, 1]) / group_by.std()[c].loc[1, 1]

        all.ix[is_male & gluc2, c + '___gender_gluc__scale'] = \
            (all.ix[is_male & gluc2, c] - group_by.mean()[c].loc[1, 2]) / group_by.std()[c].loc[1, 2]

        all.ix[is_male & gluc3, c + '___gender_gluc__scale'] = \
            (all.ix[is_male & gluc3, c] - group_by.mean()[c].loc[1, 3]) / group_by.std()[c].loc[1, 3]


        all.ix[is_female & gluc1, c + '___gender_gluc__scale'] = \
            (all.ix[is_female & gluc1, c] - group_by.mean()[c].loc[0, 1]) / group_by.std()[c].loc[0, 1]

        all.ix[is_female & gluc2, c + '___gender_gluc__scale'] = \
            (all.ix[is_female & gluc2, c] - group_by.mean()[c].loc[0, 2]) / group_by.std()[c].loc[0, 2]

        all.ix[is_female & gluc3, c + '___gender_gluc__scale'] = \
            (all.ix[is_female & gluc3, c] - group_by.mean()[c].loc[0, 3]) / group_by.std()[c].loc[0, 3]
            
    if c != 'age':
        group_by = all.groupby(['gender_male', 'age_cat'])
        
        all.ix[is_male & age1, c + '___gender_age__scale'] = \
            (all.ix[is_male & age1, c] - group_by.mean()[c].loc[1, 1]) / group_by.std()[c].loc[1, 1]
        all.ix[is_male & age2, c + '___gender_age__scale'] = \
            (all.ix[is_male & age2, c] - group_by.mean()[c].loc[1, 2]) / group_by.std()[c].loc[1, 2]
        all.ix[is_male & age3, c + '___gender_age__scale'] = \
            (all.ix[is_male & age3, c] - group_by.mean()[c].loc[1, 3]) / group_by.std()[c].loc[1, 3]
        all.ix[is_male & age4, c + '___gender_age__scale'] = \
            (all.ix[is_male & age4, c] - group_by.mean()[c].loc[1, 4]) / group_by.std()[c].loc[1, 4]
        all.ix[is_male & age5, c + '___gender_age__scale'] = \
            (all.ix[is_male & age5, c] - group_by.mean()[c].loc[1, 5]) / group_by.std()[c].loc[1, 5]
        all.ix[is_male & age6, c + '___gender_age__scale'] = \
            (all.ix[is_male & age6, c] - group_by.mean()[c].loc[1, 6]) / group_by.std()[c].loc[1, 6]

        all.ix[is_female & age1, c + '___gender_age__scale'] = \
            (all.ix[is_female & age1, c] - group_by.mean()[c].loc[0, 1]) / group_by.std()[c].loc[0, 1]
        all.ix[is_female & age2, c + '___gender_age__scale'] = \
            (all.ix[is_female & age2, c] - group_by.mean()[c].loc[0, 2]) / group_by.std()[c].loc[0, 2]
        all.ix[is_female & age3, c + '___gender_age__scale'] = \
            (all.ix[is_female & age3, c] - group_by.mean()[c].loc[0, 3]) / group_by.std()[c].loc[0, 3]
        all.ix[is_female & age4, c + '___gender_age__scale'] = \
            (all.ix[is_female & age4, c] - group_by.mean()[c].loc[0, 4]) / group_by.std()[c].loc[0, 4]
        all.ix[is_female & age5, c + '___gender_age__scale'] = \
            (all.ix[is_female & age5, c] - group_by.mean()[c].loc[0, 5]) / group_by.std()[c].loc[0, 5]
        all.ix[is_female & age6, c + '___gender_age__scale'] = \
            (all.ix[is_female & age6, c] - group_by.mean()[c].loc[0, 6]) / group_by.std()[c].loc[0, 6]

del all['age_cat']
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[142]:
188

In [143]:
# all.groupby(['gender_male', 'age_cat']).mean()

In [144]:
# all.groupby(['gender_male', 'age_cat']).mean()['age'].loc[1, 2]

In [ ]:


In [128]:


In [145]:
DATA_ID = 52


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(50), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(50), index_col='id')
all = pd.concat([train, test])

train51 = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(51), index_col='id')
test51 = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(51), index_col='id')
all51 = pd.concat([train51, test51])

all['div6__height__gluc_all__imt___gender__scale'] = all51['div6__height__gluc_all__imt___gender__scale']


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[145]:
76

In [ ]:


In [18]:
DATA_ID = 53


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(52), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(52), index_col='id')
all = pd.concat([train, test])

for i in ['k15_0',
 'k15_1',
 'k15_2',
 'k15_3',
 'k15_4',
 'k15_5',
 'k15_6',
 'k15_7',
 'k15_8',
 'k15_9',
 'k15_10',
 'k15_11',
 'k15_12',
 'k15_13',
 'k15_14',
 'k7_0',
 'k7_1',
 'k7_2',
 'k7_3',
 'k7_4',
 'k7_5',
 'k7_6',
 'k3_0',
 'k3_1',
 'k3_2']:
    del all[i]



all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[18]:
51

In [19]:
DATA_ID = 54


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(53), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(53), index_col='id')
all = pd.concat([train, test])

train43 = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(43), index_col='id')
test43 = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(43), index_col='id')
all43 = pd.concat([train43, test43])

for i in ['k15_0',
 'k15_1',
 'k15_2',
 'k15_3',
 'k15_4',
 'k15_5',
 'k15_6',
 'k15_7',
 'k15_8',
 'k15_9',
 'k15_10',
 'k15_11',
 'k15_12',
 'k15_13',
 'k15_14',
 'k7_0',
 'k7_1',
 'k7_2',
 'k7_3',
 'k7_4',
 'k7_5',
 'k7_6',
 'k3_0',
 'k3_1',
 'k3_2']:
    all[i] = all43[i]


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[19]:
76

In [ ]:


In [ ]:


In [8]:
DATA_ID = 55


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(53), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(53), index_col='id')
all = pd.concat([train, test])

all2=all[['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'smoke', 'alco', 'active',
       'gender_male', 'height_low', 'weight_low', 'cholesterol_all',
       'gluc_all', 'ap_error', 'ap_error_swap', 'imt', 'imt_class_all',
       ]]

from sklearn.cluster import KMeans

k15 = KMeans(n_clusters=15, precompute_distances = True, n_jobs=-1)
k15.fit(all2)
k7 = KMeans(n_clusters=7, precompute_distances = True, n_jobs=-1)
k7.fit(all2)
k3 = KMeans(n_clusters=3, precompute_distances = True, n_jobs=-1)
k3.fit(all2)


k15_res = k15.predict(all2)[None].T
k7_res =  k7.predict(all2)[None].T
k3_res =  k3.predict(all2)[None].T

all['k15'] = k15_res
all['k7'] = k7_res
all['k3'] = k3_res

all = pd.get_dummies(all, columns=["k15", "k7", "k3",])#

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[8]:
76

In [9]:
DATA_ID = 56


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(53), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(53), index_col='id')
all = pd.concat([train, test])

all2=all[['age', 'height', 'weight', 'ap_hi', 'ap_lo', 
       'gender_male', 'height_low', 'weight_low', 'cholesterol_all',
       'gluc_all', 'ap_error', 'ap_error_swap', 'imt', 'imt_class_all',
       ]]

from sklearn.cluster import KMeans

k15 = KMeans(n_clusters=15, precompute_distances = True, n_jobs=-1)
k15.fit(all2)
k7 = KMeans(n_clusters=7, precompute_distances = True, n_jobs=-1)
k7.fit(all2)
k3 = KMeans(n_clusters=3, precompute_distances = True, n_jobs=-1)
k3.fit(all2)


k15_res = k15.predict(all2)[None].T
k7_res =  k7.predict(all2)[None].T
k3_res =  k3.predict(all2)[None].T

all['k15'] = k15_res
all['k7'] = k7_res
all['k3'] = k3_res

all = pd.get_dummies(all, columns=["k15", "k7", "k3",])#

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[9]:
76

In [12]:
DATA_ID = 57


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(53), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(53), index_col='id')
all = pd.concat([train, test])

all2=all[['age', 'height', 'weight', 'ap_hi', 'ap_lo', 
       'gender_male', 'height_low', 'weight_low', 'cholesterol_all',
       'gluc_all', 'ap_error', 'ap_error_swap', 'imt', 'imt_class_all',
       ]]

from sklearn.cluster import KMeans

k7 = KMeans(n_clusters=7, precompute_distances = True, n_jobs=-1)
k7.fit(all2)
k3 = KMeans(n_clusters=3, precompute_distances = True, n_jobs=-1)
k3.fit(all2)


k7_res =  k7.predict(all2)[None].T
k3_res =  k3.predict(all2)[None].T

all['k7'] = k7_res
all['k3'] = k3_res

all = pd.get_dummies(all, columns=["k7", "k3",])#

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[12]:
61

In [13]:
DATA_ID = 58


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(53), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(53), index_col='id')
all = pd.concat([train, test])

all2=all[['age', 'height', 'weight', 'ap_hi', 'ap_lo', 
       'gender_male', 'height_low', 'weight_low', 'cholesterol_all',
       'gluc_all', 'ap_error', 'ap_error_swap', 'imt', 'imt_class_all',
       ]]

from sklearn.cluster import KMeans

k15 = KMeans(n_clusters=15, precompute_distances = True, n_jobs=-1)
k15.fit(all2)

k3 = KMeans(n_clusters=3, precompute_distances = True, n_jobs=-1)
k3.fit(all2)


k15_res = k15.predict(all2)[None].T

k3_res =  k3.predict(all2)[None].T

all['k15'] = k15_res

all['k3'] = k3_res

all = pd.get_dummies(all, columns=["k15",  "k3",])#

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[13]:
69

In [14]:
DATA_ID = 59


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(53), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(53), index_col='id')
all = pd.concat([train, test])

all2=all[['age', 'height', 'weight', 'ap_hi', 'ap_lo', 
       'gender_male', 'height_low', 'weight_low', 'cholesterol_all',
       'gluc_all', 'ap_error', 'ap_error_swap', 'imt', 'imt_class_all',
       ]]

from sklearn.cluster import KMeans

k15 = KMeans(n_clusters=15, precompute_distances = True, n_jobs=-1)
k15.fit(all2)
k7 = KMeans(n_clusters=7, precompute_distances = True, n_jobs=-1)
k7.fit(all2)



k15_res = k15.predict(all2)[None].T
k7_res =  k7.predict(all2)[None].T


all['k15'] = k15_res
all['k7'] = k7_res


all = pd.get_dummies(all, columns=["k15", "k7", ])#

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[14]:
73

In [ ]:


In [4]:
#43 + stat feat for add
DATA_ID = 60


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(56), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(56), index_col='id')
all = pd.concat([train, test])

is_male = all['gender_male'] == 1
is_female = all['gender_male'] == 0

chol1 = all['cholesterol_all'] == 1
chol2 = all['cholesterol_all'] == 2
chol3 = all['cholesterol_all'] == 3

gluc1 = all['gluc_all'] == 1
gluc2 = all['gluc_all'] == 2
gluc3 = all['gluc_all'] == 3

age_year = all['age'] // 365.25
all.ix[age_year < 40 , 'age_cat'] = 1
all.ix[(age_year >= 40) & (age_year < 45) , 'age_cat'] = 2
all.ix[(age_year >= 45) & (age_year < 50) , 'age_cat'] = 3
all.ix[(age_year >= 50) & (age_year < 55) , 'age_cat'] = 4
all.ix[(age_year >= 55) & (age_year < 60) , 'age_cat'] = 5
all.ix[(age_year >= 60) , 'age_cat'] = 6

age1 = all['age_cat'] == 1
age2 = all['age_cat'] == 2
age3 = all['age_cat'] == 3
age4 = all['age_cat'] == 4
age5 = all['age_cat'] == 5
age6 = all['age_cat'] == 6

for c in ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 
       'cholesterol_all',
       'gluc_all',  'imt', 'imt_class_all',
        'x__age__gluc_all',
       'x__ap_hi__cholesterol_all', 'div6__height__gluc_all__imt',
       'plus__age_norm__ap_hi_norm__gluc_all_norm', 'x__age__weight',
       'div1__age__weight__cholesterol_all',
       'div6__age__weight__cholesterol_all',
       'plus__height_norm__weight_norm__gluc_all_norm',
       'div1__ap_hi__ap_lo__cholesterol_all',
       'div6__ap_hi__ap_lo__cholesterol_all',
       'plus__age_norm__gluc_all_norm__imt_norm',
       'minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
       'minus1__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
       'minus6__age_norm__ap_lo_norm__cholesterol_all_norm',
       'minus1__age_norm__ap_lo_norm__cholesterol_all_norm',
       'div6__height__weight__ap_lo', 'div2__ap_lo__cholesterol_all__gluc_all',
       'x__age__ap_hi__gluc_all', 'div5__ap_lo__cholesterol_all__gluc_all',
       'score_scale_val']:

    all.ix[is_male, c + '___gender__scale'] = \
        (all.ix[is_male, c] - all.groupby(['gender_male']).mean()[c].loc[1]) / \
        all.groupby(['gender_male']).std()[c].loc[1]
        
    all.ix[is_female, c + '___gender__scale'] = \
        (all.ix[is_female, c] - all.groupby(['gender_male']).mean()[c].loc[0]) / \
        all.groupby(['gender_male']).std()[c].loc[0]
        
    if c != 'cholesterol_all':
        group_by = all.groupby(['gender_male', 'cholesterol_all'])
        
        all.ix[is_male & chol1, c + '___gender_chol__scale'] = \
            (all.ix[is_male & chol1, c] - group_by.mean()[c].loc[1, 1]) / group_by.std()[c].loc[1, 1]

        all.ix[is_male & chol2, c + '___gender_chol__scale'] = \
            (all.ix[is_male & chol2, c] - group_by.mean()[c].loc[1, 2]) / group_by.std()[c].loc[1, 2]

        all.ix[is_male & chol3, c + '___gender_chol__scale'] = \
            (all.ix[is_male & chol3, c] - group_by.mean()[c].loc[1, 3]) / group_by.std()[c].loc[1, 3]


        all.ix[is_female & chol1, c + '___gender_chol__scale'] = \
            (all.ix[is_female & chol1, c] - group_by.mean()[c].loc[0, 1]) / group_by.std()[c].loc[0, 1]

        all.ix[is_female & chol2, c + '___gender_chol__scale'] = \
            (all.ix[is_female & chol2, c] - group_by.mean()[c].loc[0, 2]) / group_by.std()[c].loc[0, 2]

        all.ix[is_female & chol3, c + '___gender_chol__scale'] = \
            (all.ix[is_female & chol3, c] - group_by.mean()[c].loc[0, 3]) / group_by.std()[c].loc[0, 3]
            
    if c != 'gluc_all':
        group_by = all.groupby(['gender_male', 'gluc_all'])
        
        all.ix[is_male & gluc1, c + '___gender_gluc__scale'] = \
            (all.ix[is_male & gluc1, c] - group_by.mean()[c].loc[1, 1]) / group_by.std()[c].loc[1, 1]

        all.ix[is_male & gluc2, c + '___gender_gluc__scale'] = \
            (all.ix[is_male & gluc2, c] - group_by.mean()[c].loc[1, 2]) / group_by.std()[c].loc[1, 2]

        all.ix[is_male & gluc3, c + '___gender_gluc__scale'] = \
            (all.ix[is_male & gluc3, c] - group_by.mean()[c].loc[1, 3]) / group_by.std()[c].loc[1, 3]


        all.ix[is_female & gluc1, c + '___gender_gluc__scale'] = \
            (all.ix[is_female & gluc1, c] - group_by.mean()[c].loc[0, 1]) / group_by.std()[c].loc[0, 1]

        all.ix[is_female & gluc2, c + '___gender_gluc__scale'] = \
            (all.ix[is_female & gluc2, c] - group_by.mean()[c].loc[0, 2]) / group_by.std()[c].loc[0, 2]

        all.ix[is_female & gluc3, c + '___gender_gluc__scale'] = \
            (all.ix[is_female & gluc3, c] - group_by.mean()[c].loc[0, 3]) / group_by.std()[c].loc[0, 3]
            
    if c != 'age':
        group_by = all.groupby(['gender_male', 'age_cat'])
        
        all.ix[is_male & age1, c + '___gender_age__scale'] = \
            (all.ix[is_male & age1, c] - group_by.mean()[c].loc[1, 1]) / group_by.std()[c].loc[1, 1]
        all.ix[is_male & age2, c + '___gender_age__scale'] = \
            (all.ix[is_male & age2, c] - group_by.mean()[c].loc[1, 2]) / group_by.std()[c].loc[1, 2]
        all.ix[is_male & age3, c + '___gender_age__scale'] = \
            (all.ix[is_male & age3, c] - group_by.mean()[c].loc[1, 3]) / group_by.std()[c].loc[1, 3]
        all.ix[is_male & age4, c + '___gender_age__scale'] = \
            (all.ix[is_male & age4, c] - group_by.mean()[c].loc[1, 4]) / group_by.std()[c].loc[1, 4]
        all.ix[is_male & age5, c + '___gender_age__scale'] = \
            (all.ix[is_male & age5, c] - group_by.mean()[c].loc[1, 5]) / group_by.std()[c].loc[1, 5]
        all.ix[is_male & age6, c + '___gender_age__scale'] = \
            (all.ix[is_male & age6, c] - group_by.mean()[c].loc[1, 6]) / group_by.std()[c].loc[1, 6]

        all.ix[is_female & age1, c + '___gender_age__scale'] = \
            (all.ix[is_female & age1, c] - group_by.mean()[c].loc[0, 1]) / group_by.std()[c].loc[0, 1]
        all.ix[is_female & age2, c + '___gender_age__scale'] = \
            (all.ix[is_female & age2, c] - group_by.mean()[c].loc[0, 2]) / group_by.std()[c].loc[0, 2]
        all.ix[is_female & age3, c + '___gender_age__scale'] = \
            (all.ix[is_female & age3, c] - group_by.mean()[c].loc[0, 3]) / group_by.std()[c].loc[0, 3]
        all.ix[is_female & age4, c + '___gender_age__scale'] = \
            (all.ix[is_female & age4, c] - group_by.mean()[c].loc[0, 4]) / group_by.std()[c].loc[0, 4]
        all.ix[is_female & age5, c + '___gender_age__scale'] = \
            (all.ix[is_female & age5, c] - group_by.mean()[c].loc[0, 5]) / group_by.std()[c].loc[0, 5]
        all.ix[is_female & age6, c + '___gender_age__scale'] = \
            (all.ix[is_female & age6, c] - group_by.mean()[c].loc[0, 6]) / group_by.std()[c].loc[0, 6]

del all['age_cat']
all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[4]:
188

In [ ]:


In [7]:
DATA_ID = 61


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(56), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(56), index_col='id')
all = pd.concat([train, test])

train60 = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(60), index_col='id')
test60 = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(60), index_col='id')
all60 = pd.concat([train60, test60])

all['gluc_all___gender_age__scale'] = all60['gluc_all___gender_age__scale']


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[7]:
77

In [ ]:


In [8]:
DATA_ID = 62


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(56), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(56), index_col='id')
all = pd.concat([train, test])

del all['div6__height__gluc_all__imt___gender__scale']


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[8]:
75

In [ ]:


In [15]:
DATA_ID = 63


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(62), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(62), index_col='id')
all = pd.concat([train, test])

train60 = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(60), index_col='id')
test60 = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(60), index_col='id')
all60 = pd.concat([train60, test60])

all['score_scale_val___gender_chol__scale'] = all60['score_scale_val___gender_chol__scale']
all['div6__height__gluc_all__imt___gender_chol__scale'] = all60['div6__height__gluc_all__imt___gender_chol__scale']


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[15]:
77

In [16]:
DATA_ID = 64


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(62), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(62), index_col='id')
all = pd.concat([train, test])

train60 = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(60), index_col='id')
test60 = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(60), index_col='id')
all60 = pd.concat([train60, test60])

all['score_scale_val___gender_chol__scale'] = all60['score_scale_val___gender_chol__scale']
all['imt___gender_age__scale'] = all60['imt___gender_age__scale']


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[16]:
77

In [ ]:


In [26]:
DATA_ID = 65


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(63), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(63), index_col='id')
all = pd.concat([train, test])



from sklearn.decomposition import PCA
pca = PCA(n_components=5, random_state=1000)
pca_res = pca.fit_transform(all.fillna(0))

all['pca_0'] = [i[0] for i in pca_res]
all['pca_1'] = [i[1] for i in pca_res]
all['pca_2'] = [i[2] for i in pca_res]
all['pca_3'] = [i[3] for i in pca_res]
all['pca_4'] = [i[4] for i in pca_res]



all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[26]:
82

In [25]:
[i[0] for i in pca_res]


Out[25]:
100000

In [ ]:


In [29]:
DATA_ID = 66


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(62), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(62), index_col='id')
all = pd.concat([train, test])

train60 = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(60), index_col='id')
test60 = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(60), index_col='id')
all60 = pd.concat([train60, test60])


all['gluc_all___gender_age__scale'] = all60['gluc_all___gender_age__scale']
all['cholesterol_all___gender_age__scale'] = all60['cholesterol_all___gender_age__scale']
all['div6__height__gluc_all__imt___gender_chol__scale'] = all60['div6__height__gluc_all__imt___gender_chol__scale']


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[29]:
78

In [ ]:
DATA_ID = 67


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(62), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(62), index_col='id')
all = pd.concat([train, test])

del all['score_scale_val']


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)

In [ ]:


In [37]:
DATA_ID = 68


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(62), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(62), index_col='id')
all = pd.concat([train, test])

train60 = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(60), index_col='id')
test60 = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(60), index_col='id')
all60 = pd.concat([train60, test60])


all['cholesterol_all___gender_age__scale'] = all60['cholesterol_all___gender_age__scale']
all['div6__height__gluc_all__imt___gender_chol__scale'] = all60['div6__height__gluc_all__imt___gender_chol__scale']


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[37]:
77

In [32]:
test.isnull().sum()


Out[32]:
age                 0
height              0
weight              0
ap_hi               0
ap_lo               0
cholesterol         0
gluc                0
smoke            3030
alco             3031
active           2897
gender_female       0
gender_male         0
dtype: int64

In [ ]:


In [38]:
DATA_ID = 68


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(66), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(66), index_col='id')
all = pd.concat([train, test])

del all['height']
del all['div6__height__weight__ap_lo']


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[38]:
76

In [39]:
DATA_ID = 69


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(66), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(66), index_col='id')
all = pd.concat([train, test])

del all['div6__height__weight__ap_lo']


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[39]:
77

In [ ]:


In [43]:
DATA_ID = 266


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(66), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(66), index_col='id')
all = pd.concat([train, test])

all = all.fillna(0)

cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 
       'cholesterol_all',
       'gluc_all',  'imt', 'imt_class_all',
       'x__age__gluc_all',
       'x__ap_hi__cholesterol_all', 'div6__height__gluc_all__imt',
       'plus__age_norm__ap_hi_norm__gluc_all_norm', 'x__age__weight',
       'div1__age__weight__cholesterol_all',
       'div6__age__weight__cholesterol_all',
       'plus__height_norm__weight_norm__gluc_all_norm',
       'div1__ap_hi__ap_lo__cholesterol_all',
       'div6__ap_hi__ap_lo__cholesterol_all',
       'plus__age_norm__gluc_all_norm__imt_norm',
       'minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
       'minus1__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
       'minus6__age_norm__ap_lo_norm__cholesterol_all_norm',
       'minus1__age_norm__ap_lo_norm__cholesterol_all_norm',
       'div6__height__weight__ap_lo', 'div2__ap_lo__cholesterol_all__gluc_all',
       'x__age__ap_hi__gluc_all', 'div5__ap_lo__cholesterol_all__gluc_all',
       'score_scale_val', 'gluc_all___gender_age__scale',
       'cholesterol_all___gender_age__scale',
       'div6__height__gluc_all__imt___gender_chol__scale']

for c in cols:
    all[c] = (all[c] - all[c].mean()) / (all[c].max()-all[c].min())


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[43]:
78

In [47]:
DATA_ID = 269


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(69), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(69), index_col='id')
all = pd.concat([train, test])

all = all.fillna(0)

cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 
       'cholesterol_all',
       'gluc_all',  'imt', 'imt_class_all',
       'x__age__gluc_all',
       'x__ap_hi__cholesterol_all', 'div6__height__gluc_all__imt',
       'plus__age_norm__ap_hi_norm__gluc_all_norm', 'x__age__weight',
       'div1__age__weight__cholesterol_all',
       'div6__age__weight__cholesterol_all',
       'plus__height_norm__weight_norm__gluc_all_norm',
       'div1__ap_hi__ap_lo__cholesterol_all',
       'div6__ap_hi__ap_lo__cholesterol_all',
       'plus__age_norm__gluc_all_norm__imt_norm',
       'minus6__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
       'minus1__ap_hi_norm__ap_lo_norm__cholesterol_all_norm',
       'minus6__age_norm__ap_lo_norm__cholesterol_all_norm',
       'minus1__age_norm__ap_lo_norm__cholesterol_all_norm',
       'div2__ap_lo__cholesterol_all__gluc_all',
       'x__age__ap_hi__gluc_all', 'div5__ap_lo__cholesterol_all__gluc_all',
       'score_scale_val', 'gluc_all___gender_age__scale',
       'cholesterol_all___gender_age__scale',
       'div6__height__gluc_all__imt___gender_chol__scale']

for c in cols:
    all[c] = (all[c] - all[c].mean()) / (all[c].max()-all[c].min())


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[47]:
77

In [49]:
DATA_ID = 219


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(19), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(19), index_col='id')
all = pd.concat([train, test])


all['smoke'].fillna(0, inplace=True)
all['alco'].fillna(0, inplace=True)
all['active'].fillna(1, inplace=True)
all = all.fillna(0)

all.columns
cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo',
        'age_years', 
       'cholesterol_all', 'gluc_all',  'imt', 'imt_class_all',  'x__age__gluc_all', 'x__ap_hi__cholesterol_all',
       'div__smoke__imt_class_all']

for c in cols:
    all[c] = (all[c] - all[c].mean()) / (all[c].max()-all[c].min())


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[49]:
35

In [ ]:


In [ ]:
DATA_ID = 203


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(3), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(3), index_col='id')
all = pd.concat([train, test])


all['smoke'].fillna(0, inplace=True)
all['alco'].fillna(0, inplace=True)
all['active'].fillna(1, inplace=True)
all = all.fillna(0)

print(all.columns)
cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo',
      
       'cholesterol', 'gluc',  'imt']

for c in cols:
    all[c] = (all[c] - all[c].mean()) / (all[c].max()-all[c].min())


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Index(['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc',
       'smoke', 'alco', 'active', 'gender_female', 'gender_male', 'imt'],
      dtype='object')

In [ ]:


In [63]:
DATA_ID = 70


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(69), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(69), index_col='id')
all = pd.concat([train, test])

train2 = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(269), index_col='id')
test2 = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(269), index_col='id')
target = pd.read_csv(config.QML_TRAIN_Y_FILE_MASK.format(269), index_col='id')
all2 = pd.concat([train2, test2])


from sklearn.linear_model import SGDRegressor
sgd = SGDRegressor(loss='huber', n_iter=100)
sgd.fit(train2, target)

all['lin'] = sgd.predict(all2)


all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


d:\python36\lib\site-packages\sklearn\utils\validation.py:526: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
Out[63]:
78

In [ ]:


In [72]:
#mortido (after contest test)
DATA_ID = 300


train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(69), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(69), index_col='id')
all = pd.concat([train, test])

train2 = pd.read_csv('d:/temp/train.csv', index_col='id', delimiter=';')
test2 = pd.read_csv('d:/temp/test.csv', index_col='id', delimiter=';')
all2 = pd.concat([train2, test2])

for c in all2.columns:
    all[c] = all2[c]

all.loc[ids_train].to_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID))
all.loc[ids_test].to_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID))

len(all.columns)


Out[72]:
82

In [71]:
all2.columns


Out[71]:
Index(['age_dif2', 'ap_hi_1', 'ap_lo_1', 'ap_hi_2', 'ap_lo_2'], dtype='object')

In [ ]: