In [1]:
import sys, os
#sys.path.insert(0, os.getcwd() + '//..')
# Switch the working directory to the project root before the project imports below.
# NOTE(review): this is a hard-coded absolute Windows path, so the notebook only
# runs on a machine that has this directory; presumably the `qml_workdir` import
# relies on it being the current directory -- confirm.
os.chdir('d:/ml/mlbootcamp5')


import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

from datetime import datetime

import qml_workdir.classes

from qml_workdir.classes.config import config


%matplotlib inline

In [2]:
# Index objects holding the 'id' values listed in ids_train.csv and ids_test.csv
# (both files live under config.QML_DATA_DIR).
ids_train, ids_test = (
    pd.read_csv(config.QML_DATA_DIR + file_name, index_col='id').index
    for file_name in ("ids_train.csv", "ids_test.csv")
)

In [16]:
# Number substituted into the train/test file-name masks below.
DATA_ID = 1

# Read the train and test feature tables for DATA_ID, indexed by their 'id' column.
# (Previously the literal 1 was passed to .format(), so changing DATA_ID had no effect.)
train = pd.read_csv(config.QML_TRAIN_X_FILE_MASK.format(DATA_ID), index_col='id')
test = pd.read_csv(config.QML_TEST_X_FILE_MASK.format(DATA_ID), index_col='id')

# Stack train on top of test so the cleaning below is applied to both at once.
# NOTE(review): the name `all` shadows the Python builtin; it is kept because the
# following cells refer to it.
all = pd.concat([train, test])

# `.ix` was removed from pandas; `.loc` with a boolean mask and a column label
# performs the same selection/assignment.

# 'age' divided by 365.25.
all['age_years'] = all['age'] / 365.25

# Heights above 210 are replaced by the largest height found below 210.
too_tall = all['height'] > 210
all.loc[too_tall, 'height'] = all.loc[all['height'] < 210, 'height'].max()

# Record which heights are below 110 first, then raise them to 110.
too_short = all['height'] < 110
all['height_low'] = too_short.astype(np.int32).values
all.loc[too_short, 'height'] = 110

# Same treatment for weights below 40.
too_light = all['weight'] < 40
all['weight_low'] = too_light.astype(np.int32).values
all.loc[too_light, 'weight'] = 40

# Keep the original categorical codes next to their one-hot encoded versions.
all['cholesterol_all'] = all['cholesterol']
all['gluc_all'] = all['gluc']

all = pd.get_dummies(all, columns=['cholesterol', 'gluc'])

# Flags filled in by the ap_hi / ap_lo cleaning steps that follow.
all['ap_error'] = 0
all['ap_error_swap'] = 0

# ap_hi: scale down oversized values and drop minus signs.
#
# Each mask is captured BEFORE the value is rewritten and reused for the flag.
# The previous version re-evaluated the condition after the rewrite, when the
# corrected rows no longer matched, so ap_error was never set for them.
# `.ix` (removed from pandas) is replaced by `.loc`.

# Values above 10000 are divided by 100 (fraction truncated).
mask = all['ap_hi'] > 10000
all.loc[mask, 'ap_hi'] = (all.loc[mask, 'ap_hi'].values / 100).astype(np.int64)
all.loc[mask, 'ap_error'] = 1

# Remaining values above 1000 are divided by 10 (fraction truncated).
mask = all['ap_hi'] > 1000
all.loc[mask, 'ap_hi'] = (all.loc[mask, 'ap_hi'].values / 10).astype(np.int64)
all.loc[mask, 'ap_error'] = 1

# Negative values are negated. The original handled "<= -100" and "< 0" in two
# identical steps; with the flag set correctly, one pass over "< 0" is equivalent.
mask = all['ap_hi'] < 0
all.loc[mask, 'ap_hi'] = (-all.loc[mask, 'ap_hi'].values).astype(np.int64)
all.loc[mask, 'ap_error'] = 1


# Joint ap_hi / ap_lo repairs.
#
# `.ix` (removed from pandas) is replaced by `.loc`. For every rule the old
# values are read into arrays first, so both new columns are derived from the
# values as they were before the rule ran (same as the original tuple assignment).

# ap_hi == 1 and ap_lo > 1000:
#   ap_hi <- trunc(ap_hi * 100 + ap_lo / 100), ap_lo <- trunc((ap_lo / 10) % 10) * 10
ix = (all['ap_hi'] == 1) & (all['ap_lo'] > 1000)
hi_old = all.loc[ix, 'ap_hi'].values
lo_old = all.loc[ix, 'ap_lo'].values
all.loc[ix, 'ap_hi'] = (hi_old * 100 + lo_old / 100).astype(np.int64)
all.loc[ix, 'ap_lo'] = ((lo_old / 10) % 10).astype(np.int64) * 10
all.loc[ix, 'ap_error'] = 1

# ap_hi > 600 and ap_lo == 0: the last digit of ap_hi (times 10) becomes ap_lo.
ix = (all['ap_hi'] > 600) & (all['ap_lo'] == 0)
hi_old = all.loc[ix, 'ap_hi'].values
all.loc[ix, 'ap_hi'] = (hi_old / 10).astype(np.int64)
all.loc[ix, 'ap_lo'] = (hi_old % 10).astype(np.int64) * 10
all.loc[ix, 'ap_error'] = 1

# 240 < ap_hi <= 600 and ap_lo == 0: as above, but 100 is added to the new ap_hi.
ix = (all['ap_hi'] > 240) & (all['ap_hi'] <= 600) & (all['ap_lo'] == 0)
hi_old = all.loc[ix, 'ap_hi'].values
all.loc[ix, 'ap_hi'] = (100 + hi_old / 10).astype(np.int64)
all.loc[ix, 'ap_lo'] = (hi_old % 10).astype(np.int64) * 10
all.loc[ix, 'ap_error'] = 1

# 400 <= ap_hi < 500: subtract 300.
ix = (all['ap_hi'] >= 400) & (all['ap_hi'] < 500)
all.loc[ix, 'ap_hi'] = all.loc[ix, 'ap_hi'].values.astype(np.int64) - 300
all.loc[ix, 'ap_error'] = 1

# 900 <= ap_hi < 1000: trunc(ap_hi / 100) * 10.
ix = (all['ap_hi'] >= 900) & (all['ap_hi'] < 1000)
all.loc[ix, 'ap_hi'] = (all.loc[ix, 'ap_hi'].values / 100).astype(np.int64) * 10
all.loc[ix, 'ap_error'] = 1

# 700 <= ap_hi < 800: trunc(ap_hi / 100) * 10 + 100.
ix = (all['ap_hi'] >= 700) & (all['ap_hi'] < 800)
all.loc[ix, 'ap_hi'] = (all.loc[ix, 'ap_hi'].values / 100).astype(np.int64) * 10 + 100
all.loc[ix, 'ap_error'] = 1

# Negative ap_lo is negated. The mask is captured before the rewrite; the
# previous version re-tested "ap_lo < 0" afterwards and therefore never set
# ap_error for these rows.
ix = all['ap_lo'] < 0
all.loc[ix, 'ap_lo'] = (-all.loc[ix, 'ap_lo'].values).astype(np.int64)
all.loc[ix, 'ap_error'] = 1

# ap_hi < 100 and ap_lo == 0: ap_hi <- ap_hi * 10, ap_lo <- ap_hi * 10 - 40.
ix = (all['ap_hi'] < 100) & (all['ap_lo'] == 0)
hi_old = all.loc[ix, 'ap_hi'].values
all.loc[ix, 'ap_hi'] = (hi_old * 10).astype(np.int64)
all.loc[ix, 'ap_lo'] = (hi_old * 10 - 40).astype(np.int64)
all.loc[ix, 'ap_error'] = 1

# ap_hi > 100, ap_lo == 0, ap_hi a multiple of 10: ap_lo <- ap_hi - 40.
ix = (all['ap_hi'] > 100) & (all['ap_lo'] == 0) & (all['ap_hi'] % 10 == 0)
all.loc[ix, 'ap_lo'] = (all.loc[ix, 'ap_hi'].values - 40).astype(np.int64)
all.loc[ix, 'ap_error'] = 1

# ap_hi > 100, ap_lo == 0, last digit of ap_hi >= 2:
#   ap_hi is rounded down to a multiple of 10, ap_lo <- last digit * 10.
last_digit = all['ap_hi'] % 10
ix = (all['ap_hi'] > 100) & (all['ap_lo'] == 0) & (last_digit != 0) & (last_digit >= 2)
hi_old = all.loc[ix, 'ap_hi'].values
all.loc[ix, 'ap_hi'] = (hi_old / 10).astype(np.int64) * 10
all.loc[ix, 'ap_lo'] = (hi_old % 10).astype(np.int64) * 10
all.loc[ix, 'ap_error'] = 1

# Same selection but last digit below 2 (and non-zero): ap_lo <- last digit * 100.
# The last digit is recomputed because the previous rule rewrote ap_hi.
last_digit = all['ap_hi'] % 10
ix = (all['ap_hi'] > 100) & (all['ap_lo'] == 0) & (last_digit != 0) & (last_digit < 2)
hi_old = all.loc[ix, 'ap_hi'].values
all.loc[ix, 'ap_hi'] = (hi_old / 10).astype(np.int64) * 10
all.loc[ix, 'ap_lo'] = (hi_old % 10).astype(np.int64) * 100
all.loc[ix, 'ap_error'] = 1


# ap_lo: scale down oversized values, from the largest threshold downwards.
#
# Fixes relative to the previous version:
#   * one mask per rule, captured before the rewrite and reused for the value
#     and for the flag -- previously the flag condition was re-evaluated after
#     the rewrite and matched nothing, so ap_error was never set here;
#   * the 4500 / 4000 / 1500 rules selected with ">=" on the left-hand side but
#     ">" on the right-hand side, so a value exactly on the threshold produced
#     a length mismatch between target and assigned values;
#   * `.ix` (removed from pandas) is replaced by `.loc`.


def _trunc_div(values, divisor):
    """Return values / divisor truncated to int64 (same as np.int64(values / divisor))."""
    return (values / divisor).astype(np.int64)


# (threshold, transformation) pairs; a rule applies to rows with ap_lo >= threshold
# at the time the rule runs, so the order matters.
ap_lo_rules = [
    (10000, lambda lo: _trunc_div(lo, 100)),
    (4500, lambda lo: _trunc_div(lo, 100)),
    (4000, lambda lo: _trunc_div(lo, 100) + 100),
    (1500, lambda lo: _trunc_div(lo, 10) % 100),
    (1200, lambda lo: _trunc_div(lo, 10)),
    (1150, lambda lo: _trunc_div(lo, 100) * 10),
    (1100, lambda lo: _trunc_div(lo, 10) * 10 % 1000),
    (1000, lambda lo: _trunc_div(lo, 100) * 10),
    (500, lambda lo: _trunc_div(lo, 10)),
]

for threshold, rescale in ap_lo_rules:
    mask = all['ap_lo'] >= threshold
    all.loc[mask, 'ap_lo'] = rescale(all.loc[mask, 'ap_lo'].values)
    all.loc[mask, 'ap_error'] = 1


# Repairs for implausibly small ap_lo / ap_hi values.
#
# `.ix` (removed from pandas) is replaced by `.loc`. Rules that previously
# re-evaluated their condition after rewriting the value now reuse the mask
# captured beforehand, so every corrected row gets ap_error = 1 (before, only
# rows that still matched after the rewrite -- e.g. ap_lo == 0 -- were flagged).

# ap_hi >= 100 and 1 < ap_lo < 10: ap_lo * 10.
ix = (all['ap_hi'] >= 100) & (all['ap_lo'] < 10) & (all['ap_lo'] > 1)
all.loc[ix, 'ap_lo'] = all.loc[ix, 'ap_lo'].values.astype(np.int64) * 10
all.loc[ix, 'ap_error'] = 1

# ap_hi > 100 and 10 <= ap_lo < 19: ap_lo * 10.
ix = (all['ap_hi'] > 100) & (all['ap_lo'] >= 10) & (all['ap_lo'] < 19)
all.loc[ix, 'ap_lo'] = all.loc[ix, 'ap_lo'].values.astype(np.int64) * 10
all.loc[ix, 'ap_error'] = 1

# ap_hi >= 100 and ap_lo == 1: ap_lo * 100.
ix = (all['ap_hi'] >= 100) & (all['ap_lo'] == 1)
all.loc[ix, 'ap_lo'] = all.loc[ix, 'ap_lo'].values.astype(np.int64) * 100
all.loc[ix, 'ap_error'] = 1

# Any remaining ap_lo < 10: ap_lo * 10.
mask = all['ap_lo'] < 10
all.loc[mask, 'ap_lo'] = (all.loc[mask, 'ap_lo'].values * 10).astype(np.int64)
all.loc[mask, 'ap_error'] = 1

# ap_hi >= 100 and ap_lo == 1: ap_lo <- ap_hi - 40.
# NOTE(review): the previous rule multiplies every ap_lo below 10 by 10, so this
# selection looks empty by the time it runs -- kept to preserve the rule order.
ix = (all['ap_hi'] >= 100) & (all['ap_lo'] == 1)
all.loc[ix, 'ap_lo'] = (all.loc[ix, 'ap_hi'].values - 40).astype(np.int64)
all.loc[ix, 'ap_error'] = 1

# ap_hi >= 100 and ap_lo <= 40: ap_lo <- ap_hi - 40.
ix = (all['ap_hi'] >= 100) & (all['ap_lo'] <= 40)
all.loc[ix, 'ap_lo'] = (all.loc[ix, 'ap_hi'].values - 40).astype(np.int64)
all.loc[ix, 'ap_error'] = 1

# ap_hi <= 10: ap_hi <- ap_lo + 40.
mask = all['ap_hi'] <= 10
all.loc[mask, 'ap_hi'] = (all.loc[mask, 'ap_lo'].values + 40).astype(np.int64)
all.loc[mask, 'ap_error'] = 1

# ap_hi <= 20: ap_hi * 10.
mask = all['ap_hi'] <= 20
all.loc[mask, 'ap_hi'] = (all.loc[mask, 'ap_hi'].values * 10).astype(np.int64)
all.loc[mask, 'ap_error'] = 1

# ap_lo <= 10: ap_lo * 10.
mask = all['ap_lo'] <= 10
all.loc[mask, 'ap_lo'] = (all.loc[mask, 'ap_lo'].values * 10).astype(np.int64)
all.loc[mask, 'ap_error'] = 1

# Both readings at or below 50: overwrite with the fixed pair 120 / 80.
ix = (all['ap_lo'] <= 50) & (all['ap_hi'] <= 50)
all.loc[ix, 'ap_hi'] = 120
all.loc[ix, 'ap_lo'] = 80
all.loc[ix, 'ap_error'] = 1

# ap_lo < 40: ap_lo <- ap_hi - 30.
mask = all['ap_lo'] < 40
all.loc[mask, 'ap_lo'] = (all.loc[mask, 'ap_hi'].values - 30).astype(np.int64)
all.loc[mask, 'ap_error'] = 1


# `.ix` (removed from pandas) is replaced by `.loc`.

# Rows where both readings are identical are overwritten with the fixed pair 120 / 80.
ix = (all['ap_hi'] == all['ap_lo'])
all.loc[ix, 'ap_hi'] = 120
all.loc[ix, 'ap_lo'] = 80
all.loc[ix, 'ap_error'] = 1

# Rows where ap_hi is below ap_lo get the two values exchanged. The right-hand
# side is materialised as an array before assignment, so both columns are taken
# from the values as they were before the swap.
ix = (all['ap_hi'] < all['ap_lo'])
all.loc[ix, ['ap_hi', 'ap_lo']] = all.loc[ix, ['ap_lo', 'ap_hi']].values
all.loc[ix, 'ap_error'] = 1
all.loc[ix, 'ap_error_swap'] = 1

In [15]:
# Show the ap_hi and ap_lo values of the rows where ap_hi is below ap_lo.
# `.ix` (removed from pandas) is replaced by `.loc`; the displayed value is
# still a tuple of two Series.
ix = (all['ap_hi'] < all['ap_lo'])
all.loc[ix, 'ap_hi'], all.loc[ix, 'ap_lo']


Out[15]:
(id
 681      120
 913       70
 3356      90
 4214      80
 4880      80
 5130      80
 6836      90
 6843      70
 6992      80
 7277      90
 7737      95
 8190      80
 12019     80
 12756     80
 12847     90
 13470     80
 14006     90
 14119     85
 15369     90
 15821     80
 16007     95
 19520     70
 19858     90
 20375     70
 21516     90
 22180     95
 24645     80
 24758     80
 25046     80
 26957     80
         ... 
 33552     70
 34871     80
 37329     95
 37445     70
 45254     60
 53710     70
 54795     80
 57548     80
 57702     80
 60111     80
 67443     80
 68429    140
 74101     95
 75103     80
 75960     90
 76248     80
 78523     80
 82483     80
 83193     80
 86627     90
 88055     60
 88522     95
 88602     70
 89839     90
 91594     70
 93758     80
 94426    116
 97779     80
 99068     90
 99353     80
 Name: ap_hi, dtype: int64, id
 681      150
 913      110
 3356     150
 4214     140
 4880     125
 5130     120
 6836     140
 6843     100
 6992     170
 7277     140
 7737     130
 8190     120
 12019    130
 12756    130
 12847    130
 13470    120
 14006    160
 14119     95
 15369    130
 15821    120
 16007    140
 19520    110
 19858    140
 20375    110
 21516    140
 22180    170
 24645    120
 24758    140
 25046    130
 26957    120
         ... 
 33552    110
 34871    120
 37329    150
 37445    120
 45254    100
 53710    110
 54795    120
 57548    120
 57702    115
 60111    120
 67443    120
 68429    190
 74101    130
 75103    120
 75960    160
 76248    120
 78523    120
 82483    120
 83193    120
 86627    150
 88055    110
 88522    170
 88602    110
 89839    140
 91594    110
 93758    140
 94426    120
 97779    120
 99068    140
 99353    120
 Name: ap_lo, dtype: int64)

In [278]:
# Look up ap_hi / ap_lo for id 81260. Columns are selected before the row so the
# resulting Series keeps the columns' common dtype.
all.loc[:, ['ap_hi', 'ap_lo']].loc[81260]


Out[278]:
ap_hi    70
ap_lo    40
Name: 81260, dtype: int64

In [9]:
# Display the frame ordered by ap_lo, ties broken by ap_hi (returns a sorted copy;
# `all` itself is left unchanged).
all.sort_values(by=['ap_lo', 'ap_hi'])


Out[9]:
age height weight ap_hi ap_lo smoke alco active gender_female gender_male ... weight_low cholesterol_all gluc_all cholesterol_1 cholesterol_2 cholesterol_3 gluc_1 gluc_2 gluc_3 app_error
id
19634 15892 157 73.0 60 40 0.0 0.0 0.0 1 0 ... 0 1 3 1 0 0 0 0 1 0
17067 15795 160 52.0 70 40 1.0 0.0 1.0 0 1 ... 0 1 1 1 0 0 1 0 0 0
65633 15399 158 47.0 70 40 0.0 0.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
81260 18336 155 76.0 70 40 0.0 1.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
42755 19136 158 61.0 70 40 NaN NaN 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
61612 19122 162 63.0 80 40 0.0 0.0 0.0 1 0 ... 0 2 2 0 1 0 0 1 0 0
47217 20919 149 54.0 90 40 0.0 0.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
65930 19670 175 73.0 130 40 0.0 0.0 0.0 0 1 ... 0 1 1 1 0 0 1 0 0 0
85793 18329 156 50.0 76 44 0.0 0.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
20791 14395 163 45.8 123 45 1.0 1.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
72974 17371 160 68.0 160 45 0.0 0.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
45450 18125 170 86.0 125 47 0.0 0.0 1.0 1 0 ... 0 2 1 0 1 0 1 0 0 0
86648 20972 150 64.0 80 49 0.0 0.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
92788 15611 155 60.0 95 49 0.0 0.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
96520 21993 153 77.0 115 49 0.0 0.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
10083 21235 154 61.0 70 50 0.0 1.0 0.0 1 0 ... 0 2 2 0 1 0 0 1 0 0
23761 14506 162 57.0 70 50 0.0 0.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
543 19517 147 41.0 80 50 0.0 0.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
6137 21654 155 52.0 80 50 0.0 0.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
16841 14662 156 44.0 80 50 0.0 0.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
40071 15858 163 61.0 80 50 1.0 1.0 1.0 0 1 ... 0 1 1 1 0 0 1 0 0 0
59595 14568 161 46.0 80 50 0.0 0.0 0.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
61755 18186 172 75.0 80 50 0.0 0.0 1.0 0 1 ... 0 2 2 0 1 0 0 1 0 0
84283 14608 160 58.0 80 50 0.0 0.0 1.0 1 0 ... 0 2 1 0 1 0 1 0 0 0
93753 14773 150 42.0 80 50 0.0 0.0 0.0 1 0 ... 0 2 2 0 1 0 0 1 0 0
11513 21273 163 69.0 80 50 0.0 0.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
14366 15459 152 47.0 80 50 0.0 0.0 NaN 1 0 ... 0 2 1 0 1 0 1 0 0 0
49751 16132 162 57.0 80 50 0.0 NaN 1.0 1 0 ... 0 2 1 0 1 0 1 0 0 0
99619 18214 166 65.0 80 50 0.0 0.0 NaN 1 0 ... 0 1 1 1 0 0 1 0 0 0
2769 17458 155 50.0 90 50 0.0 0.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
78488 17506 170 95.0 210 150 0.0 0.0 0.0 0 1 ... 0 1 1 1 0 0 1 0 0 0
14006 15987 152 53.0 90 160 0.0 1.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
62938 16333 175 89.0 90 160 0.0 0.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
75520 19118 174 99.0 90 160 0.0 0.0 1.0 0 1 ... 0 1 1 1 0 0 1 0 0 0
3777 16277 169 87.0 90 160 0.0 0.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
75960 23458 160 69.0 90 160 0.0 0.0 1.0 0 1 ... 0 1 1 1 0 0 1 0 0 0
91638 16804 164 88.0 95 160 0.0 1.0 1.0 0 1 ... 0 2 2 0 1 0 0 1 0 0
94387 20590 154 72.0 95 160 0.0 1.0 1.0 1 0 ... 0 3 3 0 0 1 0 0 1 0
78905 22369 148 55.0 100 160 0.0 0.0 1.0 1 0 ... 0 3 3 0 0 1 0 0 1 0
10591 21899 172 85.0 100 160 NaN NaN 0.0 0 1 ... 0 1 1 1 0 0 1 0 0 0
78442 19491 169 80.0 200 160 0.0 0.0 1.0 1 0 ... 0 3 3 0 0 1 0 0 1 0
94673 22551 169 88.0 200 160 0.0 0.0 0.0 1 0 ... 0 3 3 0 0 1 0 0 1 0
7054 22722 173 74.0 220 160 0.0 0.0 1.0 0 1 ... 0 2 2 0 1 0 0 1 0 0
6992 20501 160 69.0 80 170 0.0 0.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
24054 20027 160 93.0 90 170 0.0 0.0 1.0 1 0 ... 0 3 3 0 0 1 0 0 1 0
22180 22418 152 65.0 95 170 0.0 1.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
44592 22132 165 73.0 95 170 0.0 0.0 1.0 1 0 ... 0 1 3 1 0 0 0 0 1 0
88522 22042 154 52.0 95 170 0.0 0.0 1.0 1 0 ... 0 3 3 0 0 1 0 0 1 0
37746 15286 176 81.0 110 170 0.0 0.0 1.0 1 0 ... 0 1 2 1 0 0 0 1 0 0
6769 18961 158 74.0 200 170 0.0 0.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
44701 22801 163 115.0 200 170 0.0 0.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
68825 19136 164 98.0 95 180 0.0 1.0 1.0 1 0 ... 0 3 3 0 0 1 0 0 1 0
28449 22817 160 88.0 150 180 0.0 0.0 0.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
53070 18784 175 75.0 150 180 0.0 0.0 0.0 0 1 ... 0 1 1 1 0 0 1 0 0 0
62861 22652 163 70.0 200 180 0.0 0.0 0.0 0 1 ... 0 1 1 1 0 0 1 0 0 0
54282 21770 161 84.0 196 182 0.0 0.0 1.0 1 0 ... 0 2 2 0 1 0 0 1 0 0
91264 16136 164 69.0 130 190 0.0 0.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
68429 16789 158 85.0 140 190 NaN 0.0 1.0 1 0 ... 0 1 1 1 0 0 1 0 0 0
70400 22045 157 86.0 170 190 0.0 1.0 1.0 1 0 ... 0 2 2 0 1 0 0 1 0 0
22506 21848 165 95.0 172 190 0.0 0.0 1.0 1 0 ... 0 2 1 0 1 0 1 0 0 0

100000 rows × 22 columns


In [ ]:


In [6]:
# Distribution plot of the ap_lo column; the trailing ';' suppresses the repr output.
# NOTE(review): `distplot` is deprecated in recent seaborn releases -- confirm the
# installed version before re-running.
sns.distplot(all['ap_lo']);



In [46]:
# Distribution plot of the heights below 100.
# NOTE(review): the cleaning cell raises every height below 110 to 110, so on a
# top-to-bottom run this selection is empty; presumably this was executed
# before that cell -- confirm.
sns.distplot(all.loc[all['height'] < 100, 'height']);



In [ ]:


In [ ]: