In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [2]:
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
import xgboost as xgb
from sklearn import metrics
In [3]:
PATH = os.getcwd();
PATH
Out[3]:
In [4]:
df_raw = pd.read_csv(f'{PATH}\\AV_Mckin\\train_encoded.csv', low_memory= False)
df_test = pd.read_csv(f'{PATH}\\AV_Mckin\\test_encoded.csv', low_memory=False)
In [15]:
df_raw = pd.read_csv(f'{PATH}\\AV_Mckin\\train_ajEneEa.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}\\AV_Mckin\\test_v2akXPA.csv', low_memory=False)
In [16]:
df_raw.shape, df_test.shape
Out[16]:
In [17]:
df_raw.head(2)
Out[17]:
In [18]:
target = df_raw.stroke.values;
df_raw.info()
In [19]:
# Seed NumPy so the shuffled KFold splits in this cell are reproducible.
np.random.seed(13)
from sklearn.model_selection import KFold
def impact_coding(data, feature, target='y'):
    '''
    In this implementation we get the values and the dictionary as two different steps.
    This is just because initially we were ignoring the dictionary as a result variable.
    In this implementation the KFolds use shuffling. If you want reproducibility the cv
    could be moved to a parameter.
    '''
    n_folds = 10
    n_inner_folds = 5
    impact_coded = pd.Series()
    oof_default_mean = data[target].mean()  # Global mean to use by default (you could further tune this)
    kf = KFold(n_splits=n_folds, shuffle=True)
    oof_mean_cv = pd.DataFrame()
    split = 0
    for infold, oof in kf.split(data[feature]):
        impact_coded_cv = pd.Series()
        kf_inner = KFold(n_splits=n_inner_folds, shuffle=True)
        inner_split = 0
        inner_oof_mean_cv = pd.DataFrame()
        oof_default_inner_mean = data.iloc[infold][target].mean()
        for infold_inner, oof_inner in kf_inner.split(data.iloc[infold]):
            # The mean to apply to the inner oof split (a 1/n_folds % based on the rest)
            oof_mean = data.iloc[infold_inner].groupby(by=feature)[target].mean()
            impact_coded_cv = impact_coded_cv.append(data.iloc[infold].apply(
                lambda x: oof_mean[x[feature]]
                if x[feature] in oof_mean.index
                else oof_default_inner_mean, axis=1))
            # Also populate mapping (this has all group -> mean for all inner CV folds)
            inner_oof_mean_cv = inner_oof_mean_cv.join(pd.DataFrame(oof_mean), rsuffix=inner_split, how='outer')
            inner_oof_mean_cv.fillna(value=oof_default_inner_mean, inplace=True)
            inner_split += 1
        # Also populate mapping
        oof_mean_cv = oof_mean_cv.join(pd.DataFrame(inner_oof_mean_cv), rsuffix=split, how='outer')
        oof_mean_cv.fillna(value=oof_default_mean, inplace=True)
        split += 1
        impact_coded = impact_coded.append(data.iloc[oof].apply(
            lambda x: inner_oof_mean_cv.loc[x[feature]].mean()
            if x[feature] in inner_oof_mean_cv.index
            else oof_default_mean, axis=1))
    return impact_coded, oof_mean_cv.mean(axis=1), oof_default_mean
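For intuition, impact (target) coding boils down to replacing each category with an out-of-fold mean of the target; the sketch below shows the basic idea without the nested cross-validation above, on a hypothetical toy frame (illustrative only, not used elsewhere in this notebook).
In [ ]:
# Minimal sketch of impact/target coding without CV (illustration only).
# The nested-KFold version above exists so a row's encoding never sees its own target.
toy = pd.DataFrame({'cat': ['a', 'a', 'b', 'b', 'c'],
                    'y':   [1, 0, 1, 1, 0]})
global_mean = toy['y'].mean()                  # fallback for unseen categories
cat_means = toy.groupby('cat')['y'].mean()     # per-category target mean
toy['cat_encoded'] = toy['cat'].map(cat_means).fillna(global_mean)
toy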
In [20]:
features = df_raw.columns
numeric_features = []
categorical_features = []
for dtype, feature in zip(df_raw.dtypes, df_raw.columns):
    if dtype == object:
        categorical_features.append(feature)
    else:
        numeric_features.append(feature)
categorical_features
Out[20]:
In [22]:
%%time
# Apply the encoding to training and test data, and preserve the mapping
impact_coding_map = {}
for f in categorical_features:
    print("Impact coding for {}".format(f))
    df_raw["impact_encoded_{}".format(f)], impact_coding_mapping, default_coding = impact_coding(df_raw, f, 'stroke')
    impact_coding_map[f] = (impact_coding_mapping, default_coding)
    mapping, default_mean = impact_coding_map[f]
    df_test["impact_encoded_{}".format(f)] = df_test.apply(lambda x: mapping[x[f]]
                                                           if x[f] in mapping
                                                           else default_mean, axis=1)
In [167]:
df_raw.to_csv(f'{PATH}\\AV_Mckin\\train_encoded.csv',index = False)
df_test.to_csv(f'{PATH}\\AV_Mckin\\test_encoded.csv',index = False)
In [175]:
categorical_features_indices1 = np.where(df_raw.dtypes == 'category')[0]
In [176]:
categorical_features_indices1
Out[176]:
In [33]:
df_raw.fillna(method='bfill',inplace=True)
In [41]:
df_raw.drop('stroke',axis=1,inplace=True)
In [179]:
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(df_raw, target, train_size=0.8, random_state=1234)
In [188]:
# importing library and building model
from catboost import CatBoostClassifier
model = CatBoostClassifier(iterations=1000, depth=12, learning_rate=0.01, loss_function='CrossEntropy')
model.fit(X_train, y_train, cat_features=categorical_features_indices, eval_set=(X_validation, y_validation))
Out[188]:
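As a quick sanity check (a sketch; ROC AUC is the scoring used throughout this notebook, and this assumes sklearn's roc_auc_score), the hold-out AUC of the fitted CatBoost model can be read off its predicted probabilities:
In [ ]:
# Hold-out AUC for the fitted CatBoost model (sketch).
from sklearn.metrics import roc_auc_score
val_proba = model.predict_proba(X_validation)[:, 1]
print('Validation AUC:', roc_auc_score(y_validation, val_proba))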
In [189]:
df_test.fillna(method='ffill', inplace=True)
In [190]:
prediction_proba = model.predict_proba(df_test)
In [191]:
prediction_proba[:,1]
Out[191]:
In [33]:
def make_submission(probs):
    sample = pd.read_csv(f'{PATH}\\AV_Mckin\\sample_submission_1.csv')
    submit = sample.copy()
    submit['stroke'] = probs
    return submit
In [34]:
sample = pd.read_csv(f'{PATH}\\AV_Mckin\\sample_submission_1.csv')
In [ ]:
submit = make_submission(preds_xgb)
In [ ]:
submit.head(2)
In [ ]:
submit.to_csv(f'{PATH}\\AV_Mckin\\xgb.csv', index=False)
In [66]:
sns.set()
feature_list = ['hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
for column_name in feature_list:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    fig.suptitle(column_name, fontsize=16)
    sns.countplot(df_raw[column_name], ax=ax1)
    ax1.set_title("Train distribution")
    for tick in ax1.get_xticklabels():
        tick.set_rotation(45)
    sns.countplot(df_test[column_name], ax=ax2)
    ax2.set_title("Test distribution")
    for tick in ax2.get_xticklabels():
        tick.set_rotation(45)
In [88]:
df_raw['y'] = y
In [150]:
df_raw['is_age>40'] = np.where(df_raw['age']>=40,1,0)
df_test['is_age>40'] = np.where(df_test['age']>=40,1,0)
df_raw['healthy'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('bmi>=18.5 & bmi<=25').index
df_raw.iloc[my_query, -1] = 1
df_raw['under_weight'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('bmi<=18.5').index
df_raw.iloc[my_query, -1] = 1
df_raw['over_weight'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('bmi>25. & bmi<=29.9').index
df_raw.iloc[my_query, -1] = 1
df_raw['obese'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('bmi>=30').index
df_raw.iloc[my_query, -1] = 1
df_raw['normal_glucose'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('avg_glucose_level<=140').index
df_raw.iloc[my_query, -1] = 1
df_raw['pre_diabetes'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('avg_glucose_level>140 & avg_glucose_level<=199.').index
df_raw.iloc[my_query, -1] = 1
df_raw['diabetes'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('avg_glucose_level>=200').index
df_raw.iloc[my_query, -1] = 1
df_test['normal_glucose'] = np.zeros(df_test.shape[0])
my_query = df_test.query('avg_glucose_level<=140').index
df_test.iloc[my_query, -1] = 1
df_test['pre_diabetes'] = np.zeros(df_test.shape[0])
my_query = df_test.query('avg_glucose_level>140 & avg_glucose_level<=199.').index
df_test.iloc[my_query, -1] = 1
df_test['diabetes'] = np.zeros(df_test.shape[0])
my_query = df_test.query('avg_glucose_level>=200').index
df_test.iloc[my_query, -1] = 1
df_test['healthy'] = np.zeros(df_test.shape[0])
my_query = df_test.query('bmi>=18.5 & bmi<=25').index
df_test.iloc[my_query, -1] = 1
df_test['under_weight'] = np.zeros(df_test.shape[0])
my_query = df_test.query('bmi<=18.5').index
df_test.iloc[my_query, -1] = 1
df_test['over_weight'] = np.zeros(df_test.shape[0])
my_query = df_test.query('bmi>25. & bmi<=29.9').index
df_test.iloc[my_query, -1] = 1
df_test['obese'] = np.zeros(df_test.shape[0])
my_query = df_test.query('bmi>=30').index
df_test.iloc[my_query, -1] = 1
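The repeated query/iloc pattern above relies on the frames keeping their default RangeIndex (so query's index labels double as positions); a more explicit equivalent using boolean masks and a small helper would be the following sketch (not used below):
In [ ]:
# Equivalent BMI / glucose flags via boolean masks (sketch); avoids mixing
# label-based query indices with positional iloc assignment.
def add_flag(df, name, mask):
    df[name] = mask.astype(int)

for d in (df_raw, df_test):
    add_flag(d, 'healthy',        d['bmi'].between(18.5, 25))
    add_flag(d, 'under_weight',   d['bmi'] <= 18.5)
    add_flag(d, 'over_weight',    (d['bmi'] > 25) & (d['bmi'] <= 29.9))
    add_flag(d, 'obese',          d['bmi'] >= 30)
    add_flag(d, 'normal_glucose', d['avg_glucose_level'] <= 140)
    add_flag(d, 'pre_diabetes',   (d['avg_glucose_level'] > 140) & (d['avg_glucose_level'] <= 199))
    add_flag(d, 'diabetes',       d['avg_glucose_level'] >= 200)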
In [115]:
df_train['y'] = y
corr = df_train.corr()
# Create a mask to hide the upper triangle of the correlation matrix (which is symmetric)
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, mask=mask, vmax=1, center=0, annot=True, fmt='.1f',
square=True, linewidths=.5, cbar_kws={"shrink": .5});
df_train.drop('y',axis=1,inplace=True)
In [119]:
df_uniques = pd.melt(frame=df_raw, value_vars=['gender','ever_married',
'work_type', 'Residence_type', 'smoking_status'])
df_uniques = pd.DataFrame(df_uniques.groupby(['variable',
'value'])['value'].count()) \
.sort_index(level=[0, 1]) \
.rename(columns={'value': 'count'}) \
.reset_index()
sns.factorplot(x='variable', y='count', hue='value',
data=df_uniques, kind='bar',size=8);
In [121]:
df_uniques = pd.melt(frame=df_raw, value_vars=['gender','ever_married',
'work_type', 'Residence_type', 'smoking_status'], id_vars=['y'])
df_uniques = pd.DataFrame(df_uniques.groupby(['variable',
'value', 'y'])['value'].count()) \
.sort_index(level=[0, 1]) \
.rename(columns={'value': 'count'}) \
.reset_index()
sns.factorplot(x='variable', y='count', hue='value',
col= 'y', data=df_uniques, kind='bar',size=12);
In [109]:
# lmplot manages its own figure, so set the size through its own argument
sns.lmplot(x='age', y='bmi', hue='work_type', data=df_raw, size=8);
In [111]:
# lmplot manages its own figure, so set the size through its own argument
sns.lmplot(x='age', y='avg_glucose_level', hue='work_type', data=df_raw, size=10);
In [89]:
sns.lmplot(x='age', y='bmi', hue='y', data=df_raw,)
Out[89]:
In [102]:
sns.lmplot(x='age', y='avg_glucose_level', hue='y', data=df_raw,)
Out[102]:
In [67]:
from sklearn.ensemble import GradientBoostingClassifier  # GBM algorithm
from sklearn import cross_validation, metrics  # Additional sklearn functions
from sklearn.grid_search import GridSearchCV  # Performing grid search
In [77]:
train_cats(df_raw)
train_cats(df_test)
df_raw['y'] = target
df_test['y'] = target[:18601]  # dummy target values so proc_df can be applied to the test set
df_raw1, y, nas = proc_df(df_raw, 'y', max_n_cat=10,)
df_test1, _, _ = proc_df(df_test, 'y', na_dict=nas)
df_raw.drop('y', axis=1, inplace=True)
df_test.drop('y', axis=1, inplace=True)
In [79]:
def modelfit(alg, dtrain, predictors, performCV=True, printFeatureImportance=True, cv_folds=5):
    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], target)

    # Predict training set
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    # Perform cross-validation
    if performCV:
        cv_score = cross_validation.cross_val_score(alg, dtrain[predictors], target, cv=cv_folds, scoring='roc_auc')

    # Print model report
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(target, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(target, dtrain_predprob))
    if performCV:
        print("CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" % (np.mean(cv_score), np.std(cv_score), np.min(cv_score), np.max(cv_score)))

    # Print feature importance
    if printFeatureImportance:
        feat_imp = pd.Series(alg.feature_importances_, predictors).sort_values(ascending=False)
        plt.figure(figsize=(20, 20))
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')
In [80]:
#Choose all predictors except target & IDcols
predictors = df_raw1.columns
gbm0 = GradientBoostingClassifier(random_state=10)
modelfit(gbm0, df_raw1, predictors)
In [81]:
param_test1 = {'n_estimators':[20, 30, 40, 50, 60, 70, 80, 90]}
gsearch1 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.05, min_samples_split=500,
                                                             min_samples_leaf=50, max_depth=8,
                                                             max_features='sqrt', subsample=0.8, random_state=10),
                        param_grid=param_test1, scoring='roc_auc', n_jobs=-1, iid=False, cv=5)
gsearch1.fit(df_raw1[predictors], y)
Out[81]:
In [82]:
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_
Out[82]:
In [83]:
## Test 2
param_test2 = {'max_depth':[5, 7, 9, 11, 13, 15] ,'min_samples_split': [200, 400, 600, 800, 1000]}
gsearch2 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.05, n_estimators=70,
                                                             max_features='sqrt', subsample=0.8, random_state=10),
                        param_grid=param_test2, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch2.fit(df_raw1[predictors], y)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_
Out[83]:
In [84]:
#test 3
param_test3 = {'min_samples_split': [800, 1000, 1200, 1600] , 'min_samples_leaf': [30, 40, 70]}
gsearch3 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.05, n_estimators=70,
                                                             max_depth=7, max_features='sqrt', subsample=0.8, random_state=10),
                        param_grid=param_test3, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
In [85]:
gsearch3.fit(df_raw1[predictors], y)
Out[85]:
In [197]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000):
        display(df)
In [29]:
m = RandomForestRegressor(n_jobs=-1)
m.fit(df_raw.drop('stroke', axis=1), df_raw.stroke)
In [199]:
train_cats(df_raw)
In [10]:
display_all(df_raw.isnull().sum().sort_index()/len(df_raw))
In [201]:
df_raw['stroke'] = y
In [202]:
df, y, nas = proc_df(df_raw, 'stroke', max_n_cat= 20,)
In [203]:
m = RandomForestRegressor(n_jobs=-1)
m.fit(df, y)
m.score(df,y)
Out[203]:
In [37]:
def split_vals(a,n): return a[:n].copy(), a[n:].copy()
n_valid = 18601  # same size as the competition's test set
n_trn = len(df)-n_valid
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)
X_train.shape, y_train.shape, X_valid.shape
Out[37]:
In [38]:
def rmse(x, y): return math.sqrt(((x - y)**2).mean())

def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
           m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)
In [39]:
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train, y_train)
print_score(m)
In [40]:
m = RandomForestRegressor(n_estimators=1, max_depth=3, bootstrap=False, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m)
In [43]:
draw_tree(m.estimators_[0], df, precision=3)
In [44]:
m = RandomForestRegressor(n_estimators=1, bootstrap=False, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m)
In [45]:
preds = np.stack([t.predict(X_valid) for t in m.estimators_])
preds[:,0], np.mean(preds[:,0]), y_valid[0]
Out[45]:
In [46]:
preds.shape
Out[46]:
In [47]:
plt.plot([metrics.r2_score(y_valid, np.mean(preds[:i+1], axis=0)) for i in range(10)]);
In [48]:
m = RandomForestRegressor(n_estimators=20, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m)
In [49]:
m = RandomForestRegressor(n_estimators=40, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m)
In [50]:
m = RandomForestRegressor(n_estimators=80, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m)
In [51]:
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)
In [55]:
m = RandomForestRegressor(n_estimators=80, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)
In [56]:
fi = rf_feat_importance(m, df); fi[:10]
Out[56]:
In [57]:
fi.plot('cols', 'imp', figsize=(10,6), legend=False);
In [60]:
def plot_fi(fi): return fi.plot('cols', 'imp', 'barh', figsize=(12,7), legend=False)
In [61]:
plot_fi(fi[:30]);
In [62]:
to_keep = fi[fi.imp>0.005].cols; len(to_keep)
Out[62]:
In [63]:
df_keep = df[to_keep].copy()
X_train, X_valid = split_vals(df_keep, n_trn)
In [66]:
m = RandomForestRegressor(n_estimators=1000, max_features=0.5,
n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)
In [67]:
fi = rf_feat_importance(m, df_keep)
plot_fi(fi);
In [68]:
from scipy.cluster import hierarchy as hc
In [69]:
corr = np.round(scipy.stats.spearmanr(df_keep).correlation, 4)
corr_condensed = hc.distance.squareform(1-corr)
z = hc.linkage(corr_condensed, method='average')
fig = plt.figure(figsize=(16,10))
dendrogram = hc.dendrogram(z, labels=df_keep.columns, orientation='left', leaf_font_size=16)
plt.show()
In [70]:
def get_oob(df):
    m = RandomForestRegressor(n_estimators=1000, min_samples_leaf=5, max_features=0.6, n_jobs=-1, oob_score=True)
    x, _ = split_vals(df, n_trn)
    m.fit(x, y_train)
    return m.oob_score_
In [71]:
get_oob(df_keep)
Out[71]:
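A common follow-up to the OOB baseline above (a sketch in the spirit of the dendrogram analysis; it is slow with 1000 trees per fit) is to drop each column in turn and see which removals leave the OOB score roughly unchanged:
In [ ]:
# Drop one column at a time and compare OOB scores (sketch).
# Columns whose removal barely changes the score are candidates for pruning.
for c in df_keep.columns:
    print(c, get_oob(df_keep.drop(c, axis=1)))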
In [204]:
apply_cats(df_test,df_raw)
In [205]:
df_test.shape
Out[205]:
In [206]:
df.columns
Out[206]:
In [207]:
y_test = y[:18601]  # dummy target values for the test set (needed by proc_df)
In [208]:
df_test['y'] = y_test
In [209]:
df_test_1, _, _ = proc_df(df_test,'y',na_dict=nas,max_n_cat=20)
In [210]:
df_raw.head(2)
Out[210]:
In [92]:
df_raw.groupby('gender')['bmi'].mean()
Out[92]:
In [211]:
# Calculate the correlation matrix
corr = df.corr(method='pearson')
# Create a mask to hide the upper triangle of the correlation matrix (which is symmetric)
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr, mask=mask, vmax=1, center=0, annot=True, fmt='.1f',
square=True, linewidths=.5, cbar_kws={"shrink": .5});
In [102]:
df_raw.groupby(['work_type','gender'])['age'].describe()
Out[102]:
In [113]:
m
Out[113]:
In [122]:
m = RandomForestRegressor(n_estimators=1000,
n_jobs=-1, oob_score=True)
m.fit(df, y)
In [125]:
preds = m.predict(df_test_1)
In [130]:
categorical_features_indices = []
In [ ]:
# importing library and building model
from catboost import CatBoostClassifier
model = CatBoostClassifier(iterations=1000, depth=10, learning_rate=0.01, loss_function='CrossEntropy')
model.fit(X_train, y_train, cat_features=categorical_features_indices, eval_set=(X_validation, y_validation))
In [214]:
preds=m.predict(df_test_1)
In [5]:
df_raw.head(3)
Out[5]:
In [6]:
df_test.head(3)
Out[6]:
In [12]:
features = df_raw.columns
numeric_features = []
categorical_features = []
for dtype, feature in zip(df_raw.dtypes, df_raw.columns):
    if dtype == object:
        categorical_features.append(feature)
    else:
        numeric_features.append(feature)
categorical_features
Out[12]:
In [44]:
(43400 - 783) / 783  # ratio of negative to positive samples (class imbalance), used below as scale_pos_weight
Out[44]:
In [42]:
np.count_nonzero(y)  # number of positive cases in the target
Out[42]:
In [118]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
def runXGB(train_X, train_y, test_X, test_y=None, seed_val=1, depth=10):
    params = {}
    params["objective"] = "binary:logistic"
    params['eval_metric'] = 'logloss'
    params["eta"] = 0.01  # 0.00334
    params["subsample"] = 0.9
    params["silent"] = 0
    params["max_depth"] = depth
    params["seed"] = seed_val
    params["max_delta_step"] = 4
    params['scale_pos_weight'] = 54.427841634738186  # negatives / positives, computed above
    #params["gamma"] = 0.5
    num_rounds = 5000  # 3600
    plst = list(params.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=500)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)
    pred_test_y = model.predict(xgtest)
    return pred_test_y, model
In [119]:
df_raw['y'] = y
y = df_raw.y.values;
df_raw.drop('y',axis=1,inplace=True)
train_cats(df_raw);
apply_cats(df_test, df_raw)
df_raw['y'] = y
df_test['y'] = y[:18601]  # dummy target so proc_df can be applied to the test set
df_train , y, nas = proc_df(df_raw, 'y', max_n_cat=20)
df_test , _ , _ = proc_df(df_test, 'y', max_n_cat=20, na_dict= nas)
df_raw.drop('y',axis=1,inplace=True)
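Before fitting on the full training set, a hold-out check of the XGBoost setup could look like the sketch below (it reuses runXGB with an explicit validation split and sklearn's roc_auc_score; this is not part of the original run):
In [ ]:
# Hold-out validation of the XGBoost configuration (sketch).
from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(df_train, y, test_size=0.2, random_state=13)
val_preds, val_model = runXGB(X_tr, y_tr, X_val, y_val, depth=10)
print('Validation AUC:', roc_auc_score(y_val, val_preds))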
In [ ]:
%time preds_xgb, model = runXGB(df_train, y, df_test)
In [ ]:
xgb.plot_importance(model,max_num_features=20,importance_type='gain')
In [ ]:
len(df_train.columns)
In [ ]:
len(df_test.columns)
In [95]:
df_raw['is_age>60'] = np.where(df_raw['age']>=60,1,0)
df_raw['is_age>80'] = np.where(df_raw['age']>=80,1,0)
df_raw['is_age20&40'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('age>=20 & age<=40').index
df_raw.iloc[my_query, -1] = 1
df_raw['is_ageless20'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('age<20').index
df_raw.iloc[my_query, -1] = 1
df_test['is_age20&40'] = np.zeros(df_test.shape[0])
my_query = df_test.query('age>=20 & age<=40').index
df_test.iloc[my_query, -1] = 1
df_test['is_ageless20'] = np.zeros(df_test.shape[0])
my_query = df_test.query('age<20').index
df_test.iloc[my_query, -1] = 1
In [108]:
df_test['is_age20&40'] = np.zeros(df_test.shape[0])
my_query = df_test.query('age>=20 & age<=40').index
df_test.iloc[my_query, -1] = 1
df_test['is_ageless20'] = np.zeros(df_test.shape[0])
my_query = df_test.query('age<20').index
df_test.iloc[my_query, -1] = 1