In [1]:
# Load the autoreload extension and set it to mode 2, and render matplotlib
# figures inline in the notebook output.
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [2]:
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics
In [12]:
# Directory holding the competition files: the "AV_Lord" folder under the
# current working directory. os.path.join picks the separator of the host OS
# instead of hard-coding a Windows backslash.
PATH = os.path.join(os.getcwd(), "AV_Lord")
In [123]:
# Load the combined raw dataset (feather format) from the data directory.
# The path is built with os.path.join because the original f-string contained
# "\c", an invalid escape sequence that newer Python versions warn about.
df_raw = pd.read_feather(os.path.join(PATH, 'combined.raw'))
In [124]:
# Number of rows and columns in the raw frame.
df_raw.shape
Out[124]:
In [125]:
# Peek at the first row of the raw frame.
df_raw.head(1)
Out[125]:
In [126]:
def display_all(df):
    """Display ``df`` with pandas' row and column display limits raised to 1000.

    The limits are only changed for the duration of the call; the previous
    option values are restored when the context manager exits.
    """
    wide_limits = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*wide_limits):
        display(df)
In [127]:
def disply_dtype_plot(df = None):
    """Draw a count plot of how many columns of ``df`` have each basic dtype.

    Only ``int64``, ``object`` and ``float64`` columns are counted; columns of
    any other dtype are left out of the chart. Does nothing (returns None)
    when ``df`` is None.
    """
    if df is None:
        return
    dtype_labels = (
        ('int64', 'integer dtype'),
        ('object', 'object dtype'),
        ('float64', 'float dtype'),
    )
    labels = []
    for dtype in df.dtypes:
        for dtype_name, label in dtype_labels:
            if dtype == dtype_name:
                labels.append(label)
                break
    # Pass the vector as `x=`: from seaborn 0.12 the only valid positional
    # argument of countplot is `data`, so a positional list is misread.
    sns.countplot(x=labels)
# Plot the dtype distribution of the raw frame's columns.
disply_dtype_plot(df_raw)
In [128]:
# Show only the column headers (zero rows).
df_raw.head(0)
Out[128]:
In [129]:
# Per-column summary of the raw frame.
df_raw.info()
In [130]:
# Multiply every column by 1.
# NOTE(review): presumably done to turn boolean columns into integers before
# modelling — confirm the intent.
df_raw = df_raw * 1
In [131]:
# fastai helper applied to the raw frame.
# NOTE(review): presumably converts string columns to pandas categoricals in
# place — confirm against fastai.structured.
train_cats(df_raw)
In [132]:
# Checkpoint the raw frame to a local tmp/ folder (created if it is missing).
os.makedirs('tmp', exist_ok=True)
df_raw.to_feather('tmp/av_lord-raw')
In [133]:
# Reload the raw frame from the tmp/av_lord-raw checkpoint.
df_raw = pd.read_feather('tmp/av_lord-raw')
In [134]:
# Process the raw frame with `is_click` as the target field, producing the
# feature frame `df` and target `y`. `nas` and `mapper` are kept because they
# are passed back into proc_df when the test frame is processed.
# NOTE(review): presumably nas = NA fill values and mapper = the fitted scaling
# mapper (do_scale=True) — confirm against fastai.structured.proc_df.
df, y, nas, mapper = proc_df(df_raw, 'is_click', do_scale=True,max_n_cat=30)
In [135]:
# Count plot of the target values. The vector is passed as `x=` because from
# seaborn 0.12 the only valid positional argument of countplot is `data`.
sns.countplot(x=y)
Out[135]:
In [139]:
# Force the last 26 target values to 0.
# NOTE(review): magic number with no explanation in the notebook — confirm why
# these rows are relabelled; this also mutates `y` in place, so re-running
# earlier plots will show different counts.
y[-26:] = 0
In [229]:
#df.drop('is_open', axis=1, inplace=True)
# Fit a random forest on the full processed frame and show its score on that
# same data (a training score, not a validation score).
# NOTE(review): no random_state is set, so results can differ between runs.
m = RandomForestRegressor(n_jobs=-1)
m.fit(df, y)
m.score(df,y)
Out[229]:
In [141]:
def print_score(m):
    """Print train/validation RMSE and score for a fitted model ``m``.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_validation`` and
    ``y_validation`` (the names produced by the ``train_test_split`` cell; the
    previous ``X_valid`` / ``y_valid`` names are never defined) and the
    notebook-level ``rmse`` helper.

    Prints ``[train RMSE, validation RMSE, train score, validation score]``,
    with ``m.oob_score_`` appended when the model has that attribute.
    """
    res = [
        rmse(m.predict(X_train), y_train),
        rmse(m.predict(X_validation), y_validation),
        m.score(X_train, y_train),
        m.score(X_validation, y_validation),
    ]
    # Only some fitted models expose an out-of-bag score.
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)
In [ ]:
m = RandomForestRegressor(n_jobs=-1)
%%time m.fit(X_train, y_train)
print_score(m)
In [173]:
# Fraction of missing values per column of the test frame, sorted by column
# name. The denominator is len(test) (not len(df_raw)) so the ratio is taken
# against the same frame the nulls were counted in.
display_all(test.isnull().sum().sort_index()/len(test))
In [178]:
# List every column of the processed training frame without truncation.
display_all(df.columns)
In [ ]:
In [194]:
# Load the competition test set (CSV) from the data directory.
test = pd.read_csv(f'{PATH}\\test_BDIfz5B.csv')
In [195]:
# Add a `y` column to the test frame from the first 773858 training targets.
# NOTE(review): these are training labels, not test labels; presumably a
# placeholder column, and 773858 is presumably the test row count — confirm.
test['y'] = y[:773858]
In [196]:
# Peek at the first two test rows.
test.head(2)
Out[196]:
In [197]:
# fastai helper applied to the `send_date` column of the test frame.
# NOTE(review): presumably expands the date into send_* part columns in place
# (the next cell drops `send_Elapsed`) — confirm against fastai.structured.
add_datepart(test,'send_date')
In [198]:
# Remove the `send_Elapsed` column from the test frame in place, then preview.
test.drop('send_Elapsed',axis=1,inplace=True)
test.head(2)
Out[198]:
In [199]:
# Distinct campaign ids in the campaign table.
# NOTE(review): `camp` is not defined anywhere in the visible notebook — it
# must be loaded before this cell for a fresh run to work.
np.unique(camp['campaign_id'])
Out[199]:
In [200]:
# Attach the campaign columns to each test row, joining on campaign_id.
# NOTE(review): merge defaults to an inner join, which drops test rows whose
# campaign_id is missing from `camp`; consider how='left' so the row count
# still matches the submission file — confirm.
test = test.merge(camp,on='campaign_id');
In [202]:
# Derived link and image ratio features on the test frame.
total_links = test['total_links']
internal_links = test['no_of_internal_links']
non_internal_links = total_links - internal_links

test['link_diff'] = non_internal_links                       # links that are not internal
test['av_links'] = (internal_links / total_links) * 100      # internal links as a percentage
test['img_per_section'] = test['no_of_images'] / test['no_of_sections']
test['link_diff_%'] = non_internal_links / total_links * 100  # non-internal links as a percentage
test.head(1)
Out[202]:
In [205]:
# Checkpoint the test frame to tmp/av_lord_test (the tmp/ folder must exist).
test.to_feather('tmp/av_lord_test')
In [208]:
# fastai helper applied to the test frame.
# NOTE(review): this presumably derives categories from the test data itself;
# a later cell uses apply_cats(test, df) to reuse the training categories —
# confirm which one is intended so category codes match between frames.
train_cats(test)
In [212]:
# Inspect `nas` returned by proc_df (later passed as na_dict for the test frame).
nas
Out[212]:
In [214]:
# Columns of the test frame before it is run through proc_df.
test.columns
Out[214]:
In [215]:
# Inspect `mapper` returned by proc_df on the training frame.
mapper
Out[215]:
In [217]:
# Re-check the test columns right before processing.
test.columns
Out[217]:
In [218]:
# Process the test frame with the NA handling (`nas`) and scaling `mapper`
# obtained from the training call to proc_df. do_scale=True is passed here as
# in the training call: fastai's proc_df only applies the mapper when scaling
# is enabled, and in that mode it returns a fourth value (the mapper), hence
# the four-way unpack.
test, _, _, _ = proc_df(test, do_scale=True, max_n_cat=30, mapper=mapper, na_dict=nas)
In [219]:
# Columns of the test frame after proc_df.
test.columns
Out[219]:
In [224]:
# Drop, in place, every training column that is absent from the test frame.
# NOTE(review): this neither reorders columns nor removes test-only columns;
# confirm both frames end up with identical, identically ordered columns
# before predicting.
df.drop(list(set(df.columns) - set(test.columns)), axis=1,inplace=True)
In [228]:
# Number of columns in the test frame (compare with the training frame).
len(test.columns)
Out[228]:
In [227]:
# Number of columns in the training frame (compare with the test frame).
len(df.columns)
Out[227]:
In [51]:
# Value frequencies of `img_per_sec`, printed and plotted.
# NOTE(review): the test-side feature cell names this column `img_per_section`;
# confirm `img_per_sec` exists in `df`. Also, the series is passed
# positionally together with orient='h' — if horizontal bars are intended,
# passing it as `y=` is presumably what is wanted; confirm against the
# installed seaborn version.
print(df['img_per_sec'].value_counts())
sns.countplot(df['img_per_sec'],orient='h');
In [43]:
# Value frequencies of `is_open`, printed and plotted. The series is passed as
# `x=` because from seaborn 0.12 the only valid positional argument of
# countplot is `data`.
print(df['is_open'].value_counts())
sns.countplot(x=df['is_open']);
In [38]:
# Count plot of `no_of_images`; passed as `x=` because from seaborn 0.12 the
# only valid positional argument of countplot is `data`.
sns.countplot(x=df['no_of_images']);
In [39]:
# Count plot of `no_of_sections`; passed as `x=` because from seaborn 0.12 the
# only valid positional argument of countplot is `data`.
sns.countplot(x=df['no_of_sections']);
In [37]:
# Count plot of `link_diff`; passed as `x=` because from seaborn 0.12 the only
# valid positional argument of countplot is `data`.
sns.countplot(x=df['link_diff']);
Out[37]:
In [81]:
# fastai helper applied to the training frame.
# NOTE(review): presumably converts string columns to categoricals in place —
# confirm against fastai.structured.
train_cats(df)
In [82]:
# fastai helper called with the test frame and the training frame.
# NOTE(review): presumably makes the test frame reuse the training frame's
# category definitions — confirm against fastai.structured.
apply_cats(test, df)
In [87]:
# Remove the `id` and `user_id` columns from both frames, in place.
df.drop(['id', 'user_id'], axis=1, inplace=True);
test.drop(['id', 'user_id'], axis=1, inplace=True);
In [88]:
# Peek at the first training row after the drops.
df.head(1)
Out[88]:
In [89]:
# Peek at the first test row after the drops.
test.head(1)
Out[89]:
In [90]:
# Per-column summary of the training frame.
df.info()
In [99]:
# Positional indices of the training columns whose dtype is 'category'.
categorical_features_indices = np.where(df.dtypes == 'category')[0]
In [100]:
# Show the categorical column indices.
categorical_features_indices
Out[100]:
In [103]:
# Forward-fill missing values in place. The fill is applied to `df` itself:
# filling the temporary `df[:]` slice only reaches `df` under some pandas
# versions / copy semantics, and `fillna(method=...)` is deprecated in favour
# of `ffill`.
df.ffill(inplace=True)
In [106]:
def rmse(x,y):
    """Return the root-mean-square error between ``x`` and ``y``.

    ``x - y`` must support ``** 2`` and ``.mean()`` (e.g. numpy arrays or
    pandas Series).
    """
    return math.sqrt(((x-y)**2).mean())


def print_score(m):
    """Print train/validation RMSE and score for a fitted model ``m``.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_validation`` and
    ``y_validation`` (the names produced by the ``train_test_split`` cell; the
    previous ``X_valid`` / ``y_valid`` names are never defined).

    Prints ``[train RMSE, validation RMSE, train score, validation score]``,
    with ``m.oob_score_`` appended when the model has that attribute.
    """
    res = [
        rmse(m.predict(X_train), y_train),
        rmse(m.predict(X_validation), y_validation),
        m.score(X_train, y_train),
        m.score(X_validation, y_validation),
    ]
    # Only some fitted models expose an out-of-bag score.
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)
In [109]:
# Logistic-regression model with balanced class weights and a fixed seed; it
# is only constructed here and fitted in a later cell.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1, random_state=17, solver='lbfgs',class_weight='balanced',n_jobs=-1,max_iter=1000)
In [112]:
# Remove the `email_body`, `subject` and `email_url` columns from both frames,
# in place.
df.drop(['email_body','subject','email_url'], axis =1, inplace=True)
test.drop(['email_body','subject','email_url'], axis =1, inplace=True)
In [116]:
# Re-run the fastai categorical helper on the training frame after the drops.
train_cats(df)
In [117]:
# Re-run the fastai helper with the test frame and the training frame.
# NOTE(review): presumably re-applies the training categories to the test
# frame — confirm against fastai.structured.
apply_cats(test,df)
In [121]:
# Recompute the positional indices of 'category' dtype columns after the drops.
categorical_features_indices = np.where(df.dtypes == 'category')[0]
In [122]:
# Show the recomputed categorical column indices.
categorical_features_indices
Out[122]:
In [133]:
#importing library and building model
# NOTE(review): CatBoostRegressor is imported, but the commented-out draft
# below uses CatBoostClassifier and an X_train/X_validation split — confirm
# which estimator is intended, or remove the dead draft.
from catboost import CatBoostRegressor
#model=CatBoostClassifier(iterations=1000, depth=10,learning_rate=0.01, loss_function='CrossEntropy',\
#)
#model.fit(X_train, y_train,cat_features=categorical_features_indices,eval_set=(X_validation, y_validation))
In [128]:
# Run proc_df on `df` again, this time without a target field.
# NOTE(review): this rebinds `df` and overwrites the `mapper` returned by the
# first proc_df call, so the cell is not idempotent and any later use of
# `mapper` sees the new one — confirm that is intended.
df, _, nan, mapper = proc_df(df,do_scale=True,max_n_cat=30)
In [129]:
# Split into 80% training and 20% validation rows with a fixed seed.
# NOTE(review): `y_target` is not defined in the visible notebook (the target
# returned by proc_df is `y`) — confirm which variable is meant.
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(df, y_target, train_size=0.8, random_state=1234)
In [136]:
# Fit the logistic-regression model on the training split.
lr.fit(X_train,y_train)
In [139]:
# Null mask for the first rows of the training frame.
df.isnull().head()
Out[139]:
In [230]:
# Show the current model object bound to `m`.
m
Out[230]:
In [232]:
# Predict on the processed test frame with the model bound to `m`.
preds = m.predict(test)
In [239]:
# Inspect the predictions.
preds
Out[239]:
In [233]:
# Write the predictions into the `is_click` column of `sample_sub`.
# NOTE(review): `sample_sub` is not defined in the visible notebook; the
# make_submission helper further down does the same job from the CSV file.
sample_sub['is_click'] = preds
In [234]:
# Inspect the filled-in `sample_sub` frame.
sample_sub
Out[234]:
In [241]:
def make_submission(probs):
    """Build a submission frame from the sample submission file.

    Reads ``sample_submission.csv`` from the notebook-level ``PATH`` directory
    and returns a copy of it whose ``is_click`` column is set to ``probs``.
    """
    template_file = f'{PATH}//sample_submission.csv'
    submission = pd.read_csv(template_file).copy()
    submission['is_click'] = probs
    return submission
In [242]:
# Build the submission frame from the test predictions.
submit = make_submission(preds)
In [243]:
# Peek at the first two submission rows.
submit.head(2)
Out[243]:
In [244]:
# Write the submission to av_cat_2.csv in the data directory, without the index.
submit.to_csv(PATH + '//av_cat_2.csv', index=False)
In [ ]: