In [1]:
    
%load_ext autoreload
%autoreload 2
%matplotlib inline
    
In [2]:
    
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics
    
In [12]:
    
# Data directory: the "AV_Lord" folder inside the current working directory.
# os.path.join uses the host OS separator instead of a hard-coded Windows
# backslash, so the same cell works on Windows, Linux and macOS.
PATH = os.path.join(os.getcwd(), "AV_Lord")
    
In [123]:
    
# Load the combined raw frame from the data directory.
# The path is built with os.path.join: the previous literal contained "\c",
# an invalid escape sequence that newer Python versions warn about.
df_raw = pd.read_feather(os.path.join(PATH, 'combined.raw'))
    
In [124]:
    
df_raw.shape
    
    Out[124]:
In [125]:
    
df_raw.head(1)
    
    Out[125]:
In [126]:
    
def display_all(df):
    """Render ``df`` with pandas' row and column display limits raised to 1000.

    The two display options are overridden only while ``display`` runs; the
    context manager restores the previous settings on exit.
    """
    raised_limits = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*raised_limits):
        display(df)
    
In [127]:
    
def disply_dtype_plot(df = None):
    """Draw a count plot of the columns of ``df`` grouped by dtype family.

    Only columns whose dtype compares equal to ``int64``, ``object`` or
    ``float64`` are counted; columns of any other dtype are left out.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame whose column dtypes are summarised. When ``None`` (the default)
        the function returns immediately without plotting.

    Returns
    -------
    None
        The chart is produced as a side effect of ``sns.countplot``.
    """
    if df is None:
        return
    # Checked in the same order as the original if/elif chain.
    dtype_labels = (
        ('int64', 'integer dtype'),
        ('object', 'object dtype'),
        ('float64', 'float dtype'),
    )
    labels = []
    # Iterate over df.dtypes instead of df[col].dtype so that duplicated column
    # names (where df[col] is a DataFrame) do not raise.
    for dtype in df.dtypes:
        for dtype_name, label in dtype_labels:
            if dtype == dtype_name:
                labels.append(label)
                break
    # Pass the labels by keyword: seaborn deprecated positional data vectors
    # in 0.11 and, from 0.12, only accepts `data` positionally.
    sns.countplot(x=labels)
disply_dtype_plot(df_raw)
    
    
In [128]:
    
df_raw.head(0)
    
    Out[128]:
In [129]:
    
df_raw.info()
    
    
In [130]:
    
# Multiply every column by 1 and rebind df_raw to the result.
# NOTE(review): presumably this casts boolean columns to integers — confirm;
# the reason is not visible in this notebook.
df_raw = df_raw * 1
    
In [131]:
    
train_cats(df_raw)
    
In [132]:
    
os.makedirs('tmp', exist_ok=True)
df_raw.to_feather('tmp/av_lord-raw')
    
In [133]:
    
df_raw = pd.read_feather('tmp/av_lord-raw')
    
In [134]:
    
# Split df_raw into a processed frame `df` and the 'is_click' target `y`.
# `nas` and `mapper` are kept because they are passed to the later proc_df
# call on the test frame (na_dict=nas, mapper=mapper).
df, y, nas, mapper = proc_df(df_raw, 'is_click', do_scale=True,max_n_cat=30)
    
In [135]:
    
sns.countplot(y)
    
    Out[135]:
    
In [139]:
    
# Overwrite the last 26 entries of the target with 0.
# NOTE(review): the reason for the hard-coded 26 is not visible here — confirm
# which rows these are and why their label is forced to 0.
y[-26:] = 0
    
In [229]:
    
#df.drop('is_open', axis=1, inplace=True)
m = RandomForestRegressor(n_jobs=-1)
m.fit(df, y)
m.score(df,y)
    
    Out[229]:
In [141]:
    
def print_score(m):
    """Print RMSE and ``score`` of model ``m`` on the train and validation sets.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid`` and
    ``y_valid`` and the notebook-level ``rmse`` helper. The printed list is
    ``[rmse_train, rmse_valid, score_train, score_valid]``, followed by
    ``m.oob_score_`` when the model has that attribute.

    NOTE(review): ``X_valid`` / ``y_valid`` are not assigned in the visible
    cells (the split cell uses ``X_validation`` / ``y_validation``), and this
    function is defined a second time further down — confirm which is intended.
    """
    splits = ((X_train, y_train), (X_valid, y_valid))
    errors = [rmse(m.predict(features), target) for features, target in splits]
    scores = [m.score(features, target) for features, target in splits]
    res = errors + scores
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)
    
In [ ]:
    
m = RandomForestRegressor(n_jobs=-1)
%%time m.fit(X_train, y_train)
print_score(m)
    
In [173]:
    
# Fraction of missing values per column of `test`, sorted by column name.
# The null counts are divided by len(test) (previously len(df_raw)) so that
# each value is a share of the rows of the frame being inspected.
display_all(test.isnull().sum().sort_index()/len(test))
    
    
In [178]:
    
display_all(df.columns)
    
    
In [ ]:
    
    
In [194]:
    
# Load the test set from the data directory; os.path.join keeps the path
# valid on any OS instead of relying on a Windows backslash.
test = pd.read_csv(os.path.join(PATH, 'test_BDIfz5B.csv'))
    
In [195]:
    
# Store the first 773858 entries of the training target `y` in a new 'y'
# column of `test`.
# NOTE(review): 773858 is hard-coded — presumably the number of rows in
# `test`; confirm, and confirm that attaching training labels to the test
# frame is intended.
test['y'] = y[:773858]
    
In [196]:
    
test.head(2)
    
    Out[196]:
In [197]:
    
add_datepart(test,'send_date')
    
In [198]:
    
test.drop('send_Elapsed',axis=1,inplace=True)
test.head(2)
    
    Out[198]:
In [199]:
    
np.unique(camp['campaign_id'])
    
    Out[199]:
In [200]:
    
test = test.merge(camp,on='campaign_id');
    
    
In [202]:
    
# Derived link / image features, added to `test` in place.
# link_diff: links that are not internal (total minus internal).
test['link_diff'] = test['total_links'] - test['no_of_internal_links']
# av_links: internal links as a percentage of all links.
test['av_links'] = (test['no_of_internal_links']/ test['total_links']) * 100
# img_per_section: images per section.
test['img_per_section'] = test['no_of_images']/ test['no_of_sections']
# link_diff_%: non-internal links as a percentage of all links.
test['link_diff_%'] = (test['total_links'] - test['no_of_internal_links'])/test['total_links'] * 100
# NOTE(review): rows with total_links == 0 or no_of_sections == 0 would yield
# inf/NaN in the ratios — confirm such rows cannot occur.
test.head(1)
    
    Out[202]:
In [205]:
    
test.to_feather('tmp/av_lord_test')
    
In [208]:
    
train_cats(test)
    
In [212]:
    
nas
    
    Out[212]:
In [214]:
    
test.columns
    
    Out[214]:
In [215]:
    
mapper
    
    Out[215]:
In [217]:
    
test.columns
    
    Out[217]:
In [218]:
    
# Process the test frame with the NA dictionary and scaler from training.
# The training frame was built with do_scale=True; in fastai 0.7 proc_df only
# applies `mapper` (and returns it as a fourth value) when do_scale is set, so
# it must be requested here too or the test features stay unscaled.
test, _, _, _ = proc_df(test, do_scale=True, max_n_cat=30, mapper=mapper, na_dict=nas)
    
In [219]:
    
test.columns
    
    Out[219]:
In [224]:
    
# Drop, in place, every column of `df` that is absent from `test`.
# Columns present only in `test` are left untouched by this line.
df.drop(list(set(df.columns) - set(test.columns)), axis=1,inplace=True)
    
In [228]:
    
len(test.columns)
    
    Out[228]:
In [227]:
    
len(df.columns)
    
    Out[227]:
In [51]:
    
print(df['img_per_sec'].value_counts())
sns.countplot(df['img_per_sec'],orient='h');
    
    
    
In [43]:
    
print(df['is_open'].value_counts())
sns.countplot(df['is_open']);
    
    
    
In [38]:
    
sns.countplot(df['no_of_images']);
    
    
In [39]:
    
sns.countplot(df['no_of_sections']);
    
    
In [37]:
    
sns.countplot(df['link_diff']);
    
    Out[37]:
    
In [81]:
    
train_cats(df)
    
In [82]:
    
apply_cats(test, df)
    
In [87]:
    
df.drop(['id', 'user_id'], axis=1, inplace=True);
test.drop(['id', 'user_id'], axis=1, inplace=True);
    
In [88]:
    
df.head(1)
    
    Out[88]:
In [89]:
    
test.head(1)
    
    Out[89]:
In [90]:
    
df.info()
    
    
In [99]:
    
categorical_features_indices = np.where(df.dtypes == 'category')[0]
    
In [100]:
    
categorical_features_indices
    
    Out[100]:
In [103]:
    
# Forward-fill missing values and rebind `df` to the filled frame.
# The previous form called fillna(inplace=True) on `df[:]`, a temporary object,
# so whether `df` itself changed depended on pandas' view/copy semantics (it
# does not under Copy-on-Write); fillna(method=...) is also deprecated.
df = df.ffill()
    
In [106]:
    
def rmse(x,y):
    """Return the root-mean-squared difference between ``x`` and ``y``.

    ``x - y`` must support ``** 2`` and expose a ``.mean()`` method (as numpy
    arrays and pandas Series do).
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())
def print_score(m):
    """Print RMSE and ``score`` of model ``m`` on the train and validation sets.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_valid``, ``y_valid``
    and ``rmse``. Output order: train RMSE, validation RMSE, train score,
    validation score, then ``m.oob_score_`` if the model has that attribute.

    NOTE(review): ``X_valid`` / ``y_valid`` are not assigned in the visible
    cells; the train/validation split cell names them ``X_validation`` /
    ``y_validation`` — confirm which names are intended.
    """
    train_rmse = rmse(m.predict(X_train), y_train)
    valid_rmse = rmse(m.predict(X_valid), y_valid)
    train_score = m.score(X_train, y_train)
    valid_score = m.score(X_valid, y_valid)
    res = [train_rmse, valid_rmse, train_score, valid_score]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)
    
In [109]:
    
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1, random_state=17, solver='lbfgs',class_weight='balanced',n_jobs=-1,max_iter=1000)
    
In [112]:
    
df.drop(['email_body','subject','email_url'], axis =1, inplace=True)
test.drop(['email_body','subject','email_url'], axis =1, inplace=True)
    
In [116]:
    
train_cats(df)
    
In [117]:
    
apply_cats(test,df)
    
In [121]:
    
categorical_features_indices = np.where(df.dtypes == 'category')[0]
    
In [122]:
    
categorical_features_indices
    
    Out[122]:
In [133]:
    
#importing library and building model
from catboost import CatBoostRegressor
#model=CatBoostClassifier(iterations=1000, depth=10,learning_rate=0.01, loss_function='CrossEntropy',\
                         #)
#model.fit(X_train, y_train,cat_features=categorical_features_indices,eval_set=(X_validation, y_validation))
    
In [128]:
    
# Re-process `df` with scaling; the target slot is discarded.
# This rebinds `df` and `mapper`, replacing the values produced by the earlier
# proc_df call on df_raw, and stores the NA dictionary under the name `nan`.
df, _, nan, mapper = proc_df(df,do_scale=True,max_n_cat=30)
    
In [129]:
    
from sklearn.model_selection import train_test_split
# 80/20 train/validation split with a fixed seed for repeatability.
# NOTE(review): `y_target` is not assigned in the visible cells (the target
# returned by proc_df is named `y`) — confirm where it comes from.
X_train, X_validation, y_train, y_validation = train_test_split(df, y_target, train_size=0.8, random_state=1234)
    
    
In [136]:
    
lr.fit(X_train,y_train)
    
    
In [139]:
    
df.isnull().head()
    
    Out[139]:
In [230]:
    
m
    
    Out[230]:
In [232]:
    
preds = m.predict(test)
    
In [239]:
    
preds
    
    Out[239]:
In [233]:
    
sample_sub['is_click'] = preds
    
In [234]:
    
sample_sub
    
    Out[234]:
In [241]:
    
def make_submission(probs, sample_path=None):
    """Build a submission frame from the sample-submission template.

    Parameters
    ----------
    probs : array-like
        Values written to the ``is_click`` column. Must be assignable to a
        column of the template (e.g. one value per template row).
    sample_path : str or path-like, optional
        CSV template to read. Defaults to ``sample_submission.csv`` inside the
        notebook-level ``PATH`` directory.

    Returns
    -------
    pandas.DataFrame
        The template with its ``is_click`` column set to ``probs``.
    """
    if sample_path is None:
        # os.path.join replaces the hard-coded '//' separator.
        sample_path = os.path.join(PATH, 'sample_submission.csv')
    # The frame is freshly read, so it can be modified without a copy.
    submit = pd.read_csv(sample_path)
    submit['is_click'] = probs
    return submit
    
In [242]:
    
submit = make_submission(preds)
    
In [243]:
    
submit.head(2)
    
    Out[243]:
In [244]:
    
# Write the submission next to the input data; os.path.join replaces the
# hand-built '//' separator.
submit.to_csv(os.path.join(PATH, 'av_cat_2.csv'), index=False)
    
In [ ]: