In [2]:
```from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from IPython.display import display
from sklearn import metrics
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
sns.set()
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBRegressor
from sklearn.metrics import roc_auc_score

```
In [3]:
```import os
PATH = os.getcwd();
PATH = PATH+'\\AV_Mckin\\ods\\';
PATH

```
Out[3]:
```'D:\\Github\\fastai\\courses\\ml1\\AV_Mckin\\ods\\'

```
In [57]:
```df_train_org = pd.read_csv(f'{PATH}\\flight_delays_train.csv', low_memory=False)
df_test_org = pd.read_csv(f'{PATH}\\flight_delays_test.csv', low_memory=False)
df_raw = pd.read_csv(f'{PATH}\\impact_encoded_train.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}\\impact_encoded_test.csv', low_memory=False)

```
In [58]:
```df_raw.head(2)

```
Out[58]:
```
Month
DayofMonth
DayOfWeek
DepTime
UniqueCarrier
Origin
Dest
Distance
target
impact_encoded_Month
impact_encoded_DayofMonth
impact_encoded_DayOfWeek
impact_encoded_UniqueCarrier
impact_encoded_Origin
impact_encoded_Dest
0
c-8
c-21
c-7
1934
AA
ATL
DFW
732
0
0.202561
0.202928
0.191829
0.186803
0.258108
0.148708
1
c-4
c-20
c-3
1548
US
PIT
MCO
834
0
0.155567
0.213503
0.176398
0.167930
0.178860
0.177618

```
In [8]:
```from sklearn.model_selection import train_test_split

```
In [65]:
```def preprocess(X):
X["Flight"] = X["Origin"] + "-" + X["Dest"]
X["Hour"] = X["DepTime"] // 100
X['Mins'] = X['DepTime'] % 100
X["Month"] = X["Month"].apply(lambda x: int(x.split('-')[1]))
X["DayOfMonth"] = X["DayofMonth"].apply(lambda x: int(x.split('-')[1]))
X = X.drop(["DayofMonth"], axis=1)
X['DayOfWeek'] = X['DayOfWeek'].apply(lambda x: int(x.split('-')[1]))
X['weekend'] = np.where(X['DayOfWeek']>=6,1,0)
return X

```
In [66]:
```df_raw = preprocess(df_raw.copy())
df_test = preprocess(df_test.copy())

```
In [67]:
```y = df_raw.target.values
df_raw.drop('target',axis=1,inplace=True)
categorical_features_indices = np.where(df_raw.dtypes == object)[0]
categorical_features_indices

```
Out[67]:
```array([ 3, 4, 5, 13], dtype=int64)

```
In [68]:
```from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(df_raw, y, train_size=0.8, random_state=1234);

```
In [69]:
```#importing library and building model
from catboost import CatBoostClassifier
model=CatBoostClassifier(iterations=1000, depth=10, learning_rate=0.01, loss_function='CrossEntropy',\
)
model.fit(X_train, y_train,cat_features=categorical_features_indices,eval_set=(X_validation, y_validation))

```
Out[69]:
<catboost.core.CatBoostClassifier at 0x2048192cda0>

```
In [70]:
```prediction_proba = model.predict_proba(df_test)[:,1]

```
In [71]:
```model.get_feature_importance(X_train,y_train,cat_features=categorical_features_indices)

```
Out[71]:
```[2.4904738937880913,
0.9023635736328831,
26.784972512305057,
3.474099618979113,
2.489420620267185,
4.015760514421802,
3.6905698914829443,
6.242727532021042,
3.7215939749033424,
4.357634070441675,
3.5198557625898284,
6.269741184129969,
5.607910307964413,
1.9890194408451036,
18.474678710283758,
2.991705877361596,
2.528727859987967,
0.4487446545942221]

```
In [72]:
```pd.Series(prediction_proba,
name='dep_delayed_15min').to_csv(f'{PATH}cat_encoded_flights_ods.csv',
index_label='id', header=True)

```
In [77]:
```# This way we have randomness and are able to reproduce the behaviour within this cell.
np.random.seed(13)
from sklearn.model_selection import KFold
def impact_coding(data, feature, target='y'):
'''
In this implementation we get the values and the dictionary as two different steps.
This is just because initially we were ignoring the dictionary as a result variable.
In this implementation the KFolds use shuffling. If you want reproducibility the cv
could be moved to a parameter.
'''
n_folds = 10
n_inner_folds = 5
impact_coded = pd.Series()
oof_default_mean = data[target].mean() # Gobal mean to use by default (you could further tune this)
kf = KFold(n_splits=n_folds, shuffle=True)
oof_mean_cv = pd.DataFrame()
split = 0
for infold, oof in kf.split(data[feature]):
impact_coded_cv = pd.Series()
kf_inner = KFold(n_splits=n_inner_folds, shuffle=True)
inner_split = 0
inner_oof_mean_cv = pd.DataFrame()
oof_default_inner_mean = data.iloc[infold][target].mean()
for infold_inner, oof_inner in kf_inner.split(data.iloc[infold]):
# The mean to apply to the inner oof split (a 1/n_folds % based on the rest)
oof_mean = data.iloc[infold_inner].groupby(by=feature)[target].mean()
impact_coded_cv = impact_coded_cv.append(data.iloc[infold].apply(
lambda x: oof_mean[x[feature]]
if x[feature] in oof_mean.index
else oof_default_inner_mean
, axis=1))
# Also populate mapping (this has all group -> mean for all inner CV folds)
inner_oof_mean_cv = inner_oof_mean_cv.join(pd.DataFrame(oof_mean), rsuffix=inner_split, how='outer')
inner_oof_mean_cv.fillna(value=oof_default_inner_mean, inplace=True)
inner_split += 1
# Also populate mapping
oof_mean_cv = oof_mean_cv.join(pd.DataFrame(inner_oof_mean_cv), rsuffix=split, how='outer')
oof_mean_cv.fillna(value=oof_default_mean, inplace=True)
split += 1
impact_coded = impact_coded.append(data.iloc[oof].apply(
lambda x: inner_oof_mean_cv.loc[x[feature]].mean()
if x[feature] in inner_oof_mean_cv.index
else oof_default_mean
, axis=1))
return impact_coded, oof_mean_cv.mean(axis=1), oof_default_mean

```
In [81]:
```features = df_raw.columns
numeric_features = []
categorical_features = []
for dtype, feature in zip(df_raw.dtypes, df_raw.columns):
if dtype == object:
#print(column)
#print(train_data[column].describe())
categorical_features.append(feature)
else:
numeric_features.append(feature)
categorical_features

```
Out[81]:
```['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest']

```
In [88]:
```df_raw['target'] = y.map({'Y': 1, 'N': 0}).values

```
In [90]:
```%%time
# Apply the encoding to training and test data, and preserve the mapping
impact_coding_map = {}
for f in categorical_features:
print("Impact coding for {}".format(f))
df_raw["impact_encoded_{}".format(f)], impact_coding_mapping, default_coding = impact_coding(df_raw, f,'target')
impact_coding_map[f] = (impact_coding_mapping, default_coding)
mapping, default_mean = impact_coding_map[f]
df_test["impact_encoded_{}".format(f)] = df_test.apply(lambda x: mapping[x[f]]
if x[f] in mapping
else default_mean
, axis=1)
df_raw.to_csv(PATH + '\\impact_encoded_train.csv',index = False)
df_test.to_csv(PATH + '\\impact_encoded_test.csv',index = False)

```
```Impact coding for Month
Impact coding for DayofMonth
Impact coding for DayOfWeek
Impact coding for UniqueCarrier
Impact coding for Origin
Impact coding for Dest
Wall time: 13min 44s

```
In [32]:
```sub_df1 = pd.read_csv(f'{PATH}cat_encoded_flights_ods.csv');
sub_df2 = pd.read_csv(f'{PATH}cat_encoded_flights_ods.csv');

```
In [61]:
```preds = (sub_df1.dep_delayed_15min * 1.3 + sub_df2.dep_delayed_15min*2.5)/4

```
In [62]:
```max(preds)

```
Out[62]:
```0.90939239921497128

```
In [63]:
```preds1 = np.where(preds>.80, .84, preds)

```
In [64]:
```sub_df['dep_delayed_15min'] = preds

```
In [65]:
```sub_df.to_csv(f'{PATH}modified_flights.csv',index=None,)

```
In [ ]:
```

