av_lord_ml



In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics

In [12]:
PATH = os.getcwd();
PATH = PATH+"\\AV_Lord"

In [123]:
df_raw = pd.read_feather(f'{PATH}\combined.raw')

In [124]:
df_raw.shape


Out[124]:
(1023217, 29)

In [125]:
df_raw.head(1)


Out[125]:
id user_id campaign_id is_open is_click send_Year send_Month send_Week send_Day send_Dayofweek ... no_of_internal_links no_of_images no_of_sections email_body subject email_url link_diff img_per_sec link_diff_% img_per_section
0 42_14051 14051.0 42 0.0 0.0 2017.0 1.0 2.0 9.0 0.0 ... 79 13 4 September Newsletter\r\n \r\nDear AVians,\r\n ... [September] Exciting days ahead with DataHack ... http://r.newsletters.analyticsvidhya.com/7v3rd... 9 3.25 0.102273 3.25

1 rows × 29 columns


In [126]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000): 
        with pd.option_context("display.max_columns", 1000): 
            display(df)

In [127]:
def disply_dtype_plot(df = None):
    if df is None:
        return
    l = []
    cols = df.columns
    for i in cols:
        if df[i].dtype == 'int64':
            l.append('integer dtype')
        elif df[i].dtype == 'object':
            l.append('object dtype')
        elif df[i].dtype == 'float64':
            l.append('float dtype')
        else:
            pass
    sns.countplot(l)
    del l
disply_dtype_plot(df_raw)



In [128]:
df_raw.head(0)


Out[128]:
id user_id campaign_id is_open is_click send_Year send_Month send_Week send_Day send_Dayofweek ... no_of_internal_links no_of_images no_of_sections email_body subject email_url link_diff img_per_sec link_diff_% img_per_section

0 rows × 29 columns


In [129]:
df_raw.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1023217 entries, 0 to 1023216
Data columns (total 29 columns):
id                       1023191 non-null object
user_id                  1023191 non-null float64
campaign_id              1023217 non-null int64
is_open                  1023191 non-null float64
is_click                 1023191 non-null float64
send_Year                1023191 non-null float64
send_Month               1023191 non-null float64
send_Week                1023191 non-null float64
send_Day                 1023191 non-null float64
send_Dayofweek           1023191 non-null float64
send_Dayofyear           1023191 non-null float64
send_Is_month_end        1023191 non-null object
send_Is_month_start      1023191 non-null object
send_Is_quarter_end      1023191 non-null object
send_Is_quarter_start    1023191 non-null object
send_Is_year_end         1023191 non-null object
send_Is_year_start       1023191 non-null object
communication_type       1023217 non-null object
total_links              1023217 non-null int64
no_of_internal_links     1023217 non-null int64
no_of_images             1023217 non-null int64
no_of_sections           1023217 non-null int64
email_body               1023217 non-null object
subject                  1023217 non-null object
email_url                1023217 non-null object
link_diff                1023217 non-null int64
img_per_sec              1023217 non-null float64
link_diff_%              1023217 non-null float64
img_per_section          1023217 non-null float64
dtypes: float64(12), int64(6), object(11)
memory usage: 226.4+ MB

In [130]:
df_raw = df_raw * 1

In [131]:
train_cats(df_raw)

In [132]:
os.makedirs('tmp', exist_ok=True)
df_raw.to_feather('tmp/av_lord-raw')

In [133]:
df_raw = pd.read_feather('tmp/av_lord-raw')

In [134]:
df, y, nas, mapper = proc_df(df_raw, 'is_click', do_scale=True,max_n_cat=30)

In [135]:
sns.countplot(y)


Out[135]:
<matplotlib.axes._subplots.AxesSubplot at 0x27193e5ff28>

In [139]:
y[-26:] = 0

In [229]:
#df.drop('is_open', axis=1, inplace=True)
m = RandomForestRegressor(n_jobs=-1)
m.fit(df, y)
m.score(df,y)


Out[229]:
0.7547053047196518

In [141]:
def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
                m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)

In [ ]:
m = RandomForestRegressor(n_jobs=-1)
%%time m.fit(X_train, y_train)
print_score(m)

In [173]:
display_all(test.isnull().sum().sort_index()/len(df_raw))


campaign_id    0.0
id             0.0
send_date      0.0
user_id        0.0
dtype: float64

In [178]:
display_all(df.columns)


Index(['id', 'user_id', 'campaign_id', 'send_Year', 'send_Month', 'send_Week',
       'send_Day', 'send_Dayofweek', 'send_Dayofyear', 'total_links',
       'no_of_internal_links', 'no_of_images', 'no_of_sections', 'email_body',
       'subject', 'email_url', 'link_diff', 'img_per_sec', 'link_diff_%',
       'img_per_section', 'user_id_na', 'is_open_na', 'send_Year_na',
       'send_Month_na', 'send_Week_na', 'send_Day_na', 'send_Dayofweek_na',
       'send_Dayofyear_na', 'send_Is_month_end_0.0', 'send_Is_month_end_nan',
       'send_Is_month_start_0.0', 'send_Is_month_start_nan',
       'send_Is_quarter_end_0.0', 'send_Is_quarter_end_nan',
       'send_Is_quarter_start_0.0', 'send_Is_quarter_start_nan',
       'send_Is_year_end_0.0', 'send_Is_year_end_nan',
       'send_Is_year_start_0.0', 'send_Is_year_start_nan',
       'communication_type_Conference', 'communication_type_Corporate',
       'communication_type_Hackathon', 'communication_type_Newsletter',
       'communication_type_Others', 'communication_type_Upcoming Events',
       'communication_type_Webinar', 'communication_type_nan', 'av_links'],
      dtype='object')

In [ ]:

testset transforms


In [194]:
test = pd.read_csv(f'{PATH}\\test_BDIfz5B.csv')

In [195]:
test['y'] = y[:773858]

In [196]:
test.head(2)


Out[196]:
id campaign_id user_id send_date y
0 63_122715 63 122715 01-02-2018 22:35 0.0
1 56_76206 56 76206 02-01-2018 08:15 0.0

In [197]:
add_datepart(test,'send_date')

In [198]:
test.drop('send_Elapsed',axis=1,inplace=True)
test.head(2)


Out[198]:
id campaign_id user_id y send_Year send_Month send_Week send_Day send_Dayofweek send_Dayofyear send_Is_month_end send_Is_month_start send_Is_quarter_end send_Is_quarter_start send_Is_year_end send_Is_year_start
0 63_122715 63 122715 0.0 2018 1 1 2 1 2 False False False False False False
1 56_76206 56 76206 0.0 2018 2 5 1 3 32 False True False False False False

In [199]:
np.unique(camp['campaign_id'])


Out[199]:
array([29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
       55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80], dtype=int64)

In [200]:
test = test.merge(camp,on='campaign_id');


id campaign_id user_id y send_Year send_Month send_Week send_Day send_Dayofweek send_Dayofyear send_Is_month_end send_Is_month_start send_Is_quarter_end send_Is_quarter_start send_Is_year_end send_Is_year_start communication_type total_links no_of_internal_links no_of_images no_of_sections email_body subject email_url
0 63_122715 63 122715 0.0 2018 1 1 2 1 2 False False False False False False Newsletter 68 64 15 5 \r\nFebruary 2018 Newsletter\r\n \r\nDear AVia... AVbytes, Ultimate 2018 learning path and aweso... http://r.newsletters.analyticsvidhya.com/7whsu...
1 63_124394 63 124394 0.0 2018 1 1 2 1 2 False False False False False False Newsletter 68 64 15 5 \r\nFebruary 2018 Newsletter\r\n \r\nDear AVia... AVbytes, Ultimate 2018 learning path and aweso... http://r.newsletters.analyticsvidhya.com/7whsu...
2 63_95168 63 95168 0.0 2018 1 1 2 1 2 False False False False False False Newsletter 68 64 15 5 \r\nFebruary 2018 Newsletter\r\n \r\nDear AVia... AVbytes, Ultimate 2018 learning path and aweso... http://r.newsletters.analyticsvidhya.com/7whsu...
3 63_31556 63 31556 0.0 2018 1 1 2 1 2 False False False False False False Newsletter 68 64 15 5 \r\nFebruary 2018 Newsletter\r\n \r\nDear AVia... AVbytes, Ultimate 2018 learning path and aweso... http://r.newsletters.analyticsvidhya.com/7whsu...
4 63_138377 63 138377 0.0 2018 1 1 2 1 2 False False False False False False Newsletter 68 64 15 5 \r\nFebruary 2018 Newsletter\r\n \r\nDear AVia... AVbytes, Ultimate 2018 learning path and aweso... http://r.newsletters.analyticsvidhya.com/7whsu...
5 63_88136 63 88136 0.0 2018 1 1 2 1 2 False False False False False False Newsletter 68 64 15 5 \r\nFebruary 2018 Newsletter\r\n \r\nDear AVia... AVbytes, Ultimate 2018 learning path and aweso... http://r.newsletters.analyticsvidhya.com/7whsu...
6 63_18079 63 18079 0.0 2018 1 1 2 1 2 False False False False False False Newsletter 68 64 15 5 \r\nFebruary 2018 Newsletter\r\n \r\nDear AVia... AVbytes, Ultimate 2018 learning path and aweso... http://r.newsletters.analyticsvidhya.com/7whsu...
7 63_84118 63 84118 0.0 2018 1 1 2 1 2 False False False False False False Newsletter 68 64 15 5 \r\nFebruary 2018 Newsletter\r\n \r\nDear AVia... AVbytes, Ultimate 2018 learning path and aweso... http://r.newsletters.analyticsvidhya.com/7whsu...
8 63_45321 63 45321 0.0 2018 1 1 2 1 2 False False False False False False Newsletter 68 64 15 5 \r\nFebruary 2018 Newsletter\r\n \r\nDear AVia... AVbytes, Ultimate 2018 learning path and aweso... http://r.newsletters.analyticsvidhya.com/7whsu...
9 63_7092 63 7092 0.0 2018 1 1 2 1 2 False False False False False False Newsletter 68 64 15 5 \r\nFebruary 2018 Newsletter\r\n \r\nDear AVia... AVbytes, Ultimate 2018 learning path and aweso... http://r.newsletters.analyticsvidhya.com/7whsu...

In [202]:
test['link_diff'] = test['total_links'] - test['no_of_internal_links']
test['av_links'] = (test['no_of_internal_links']/ test['total_links']) * 100
test['img_per_section'] = test['no_of_images']/ test['no_of_sections']

test['link_diff_%'] = (test['total_links'] - test['no_of_internal_links'])/test['total_links'] * 100
test.head(1)


Out[202]:
id campaign_id user_id y send_Year send_Month send_Week send_Day send_Dayofweek send_Dayofyear ... no_of_internal_links no_of_images no_of_sections email_body subject email_url link_diff av_links img_per_section link_diff_%
0 63_122715 63 122715 0.0 2018 1 1 2 1 2 ... 64 15 5 \r\nFebruary 2018 Newsletter\r\n \r\nDear AVia... AVbytes, Ultimate 2018 learning path and aweso... http://r.newsletters.analyticsvidhya.com/7whsu... 4 94.117647 3.0 5.882353

1 rows × 28 columns


In [205]:
test.to_feather('tmp/av_lord_test')

In [208]:
train_cats(test)

In [212]:
nas


Out[212]:
{'is_open': 0.0,
 'send_Day': 10.0,
 'send_Dayofweek': 3.0,
 'send_Dayofyear': 67.0,
 'send_Month': 3.0,
 'send_Week': 10.0,
 'send_Year': 2017.0,
 'user_id': 120476.0}

In [214]:
test.columns


Out[214]:
Index(['id', 'campaign_id', 'user_id', 'y', 'send_Year', 'send_Month',
       'send_Week', 'send_Day', 'send_Dayofweek', 'send_Dayofyear',
       'send_Is_month_end', 'send_Is_month_start', 'send_Is_quarter_end',
       'send_Is_quarter_start', 'send_Is_year_end', 'send_Is_year_start',
       'communication_type', 'total_links', 'no_of_internal_links',
       'no_of_images', 'no_of_sections', 'email_body', 'subject', 'email_url',
       'link_diff', 'av_links', 'img_per_section', 'link_diff_%'],
      dtype='object')

In [215]:
mapper


Out[215]:
DataFrameMapper(default=False, df_out=False,
        features=[(['user_id'], StandardScaler(copy=True, with_mean=True, with_std=True)), (['campaign_id'], StandardScaler(copy=True, with_mean=True, with_std=True)), (['is_open'], StandardScaler(copy=True, with_mean=True, with_std=True)), (['send_Year'], StandardScaler(copy=True, with_mean=True, with_std=... with_std=True)), (['send_Dayofyear_na'], StandardScaler(copy=True, with_mean=True, with_std=True))],
        input_df=False, sparse=False)

In [217]:
test.columns


Out[217]:
Index(['id', 'campaign_id', 'user_id', 'y', 'send_Year', 'send_Month',
       'send_Week', 'send_Day', 'send_Dayofweek', 'send_Dayofyear',
       'send_Is_month_end', 'send_Is_month_start', 'send_Is_quarter_end',
       'send_Is_quarter_start', 'send_Is_year_end', 'send_Is_year_start',
       'communication_type', 'total_links', 'no_of_internal_links',
       'no_of_images', 'no_of_sections', 'email_body', 'subject', 'email_url',
       'link_diff', 'av_links', 'img_per_section', 'link_diff_%'],
      dtype='object')

In [218]:
test, _, _ = proc_df(test,max_n_cat=30,mapper=mapper,na_dict=nas)

In [219]:
test.columns


Out[219]:
Index(['id', 'campaign_id', 'user_id', 'y', 'send_Year', 'send_Month',
       'send_Week', 'send_Day', 'send_Dayofweek', 'send_Dayofyear',
       ...
       'email_url_http://r.newsletters.analyticsvidhya.com/7wra6vb5p4c.html?t=1520942329',
       'email_url_http://r.newsletters.analyticsvidhya.com/7wrjo7b5p4c.html?t=1520942329',
       'email_url_http://r.newsletters.analyticsvidhya.com/7ww0uvb5p4c.html?t=1520940826',
       'email_url_http://r.newsletters.analyticsvidhya.com/7wx2s7b5p4c.html?t=1520940826',
       'email_url_http://r.newsletters.analyticsvidhya.com/7wxlqvb5p4c.html?t=1520940826',
       'email_url_http://r.newsletters.analyticsvidhya.com/7wxv87b5p4c.html?t=1520940826',
       'email_url_http://r.newsletters.analyticsvidhya.com/7wz6mvb5p4c.html?t=1520940826',
       'email_url_http://r.newsletters.analyticsvidhya.com/7wzpljb5p4c.html?t=1520935115',
       'email_url_http://r.newsletters.analyticsvidhya.com/7x08k7b5p4c.html?t=1520935115',
       'email_url_nan'],
      dtype='object', length=113)

In [224]:
df.drop(list(set(df.columns) - set(test.columns)), axis=1,inplace=True)

In [228]:
len(test.columns)


Out[228]:
29

In [227]:
len(df.columns)


Out[227]:
29

In [51]:
print(df['img_per_sec'].value_counts())
sns.countplot(df['img_per_sec'],orient='h');


2.000000     211722
7.000000     139380
3.000000     132195
13.000000     85433
2.500000      82163
16.000000     81358
3.250000      81253
4.000000      76361
1.000000      50942
2.833333      39710
3.166667      39498
5.000000       3198
3.750000          2
3.500000          1
9.000000          1
Name: img_per_sec, dtype: int64

In [43]:
print(df['is_open'].value_counts())
sns.countplot(df['is_open']);


0.0    920401
1.0    102790
Name: is_open, dtype: int64

In [38]:
sns.countplot(df['no_of_images']);



In [39]:
sns.countplot(df['no_of_sections']);



In [37]:
sns.countplot(df['link_diff']);


Out[37]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f117841a20>

In [81]:
train_cats(df)

In [82]:
apply_cats(test, df)

In [87]:
df.drop(['id', 'user_id'], axis=1, inplace=True);
test.drop(['id', 'user_id'], axis=1, inplace=True);

In [88]:
df.head(1)


Out[88]:
campaign_id send_Year send_Month send_Week send_Day send_Dayofweek send_Dayofyear send_Is_month_end send_Is_month_start send_Is_quarter_end ... no_of_internal_links no_of_images no_of_sections email_body subject email_url link_diff link_diff_% img_per_section av_links
0 42 2017.0 1.0 2.0 9.0 0.0 9.0 False False False ... 79 13 4 September Newsletter\r\n \r\nDear AVians,\r\n ... [September] Exciting days ahead with DataHack ... http://r.newsletters.analyticsvidhya.com/7v3rd... 9 10.227273 3.25 89.772727

1 rows × 25 columns


In [89]:
test.head(1)


Out[89]:
campaign_id send_Year send_Month send_Week send_Day send_Dayofweek send_Dayofyear send_Is_month_end send_Is_month_start send_Is_quarter_end ... no_of_internal_links no_of_images no_of_sections email_body subject email_url link_diff av_links img_per_section link_diff_%
0 63 2018 1 1 2 1 2 False False False ... 64 15 5 \r\nFebruary 2018 Newsletter\r\n \r\nDear AVia... AVbytes, Ultimate 2018 learning path and aweso... http://r.newsletters.analyticsvidhya.com/7whsu... 4 94.117647 3.0 5.882353

1 rows × 25 columns


In [90]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1023217 entries, 0 to 1023216
Data columns (total 25 columns):
campaign_id              1023217 non-null int64
send_Year                1023191 non-null float64
send_Month               1023191 non-null float64
send_Week                1023191 non-null float64
send_Day                 1023191 non-null float64
send_Dayofweek           1023191 non-null float64
send_Dayofyear           1023191 non-null float64
send_Is_month_end        1023191 non-null category
send_Is_month_start      1023191 non-null category
send_Is_quarter_end      1023191 non-null category
send_Is_quarter_start    1023191 non-null category
send_Is_year_end         1023191 non-null category
send_Is_year_start       1023191 non-null category
communication_type       1023217 non-null category
total_links              1023217 non-null int64
no_of_internal_links     1023217 non-null int64
no_of_images             1023217 non-null int64
no_of_sections           1023217 non-null int64
email_body               1023217 non-null category
subject                  1023217 non-null category
email_url                1023217 non-null category
link_diff                1023217 non-null int64
link_diff_%              1023217 non-null float64
img_per_section          1023217 non-null float64
av_links                 1023217 non-null float64
dtypes: category(10), float64(9), int64(6)
memory usage: 126.9 MB

In [99]:
categorical_features_indices = np.where(df.dtypes == 'category')[0]

In [100]:
categorical_features_indices


Out[100]:
array([ 7,  8,  9, 10, 11, 12, 13, 18, 19, 20], dtype=int64)

In [103]:
df[:].fillna(method='ffill', inplace=True)

In [106]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
                m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)

In [109]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1, random_state=17, solver='lbfgs',class_weight='balanced',n_jobs=-1,max_iter=1000)

In [112]:
df.drop(['email_body','subject','email_url'], axis =1, inplace=True)
test.drop(['email_body','subject','email_url'], axis =1, inplace=True)

In [116]:
train_cats(df)

In [117]:
apply_cats(test,df)

In [121]:
categorical_features_indices = np.where(df.dtypes == 'category')[0]

In [122]:
categorical_features_indices


Out[122]:
array([ 7,  8,  9, 10, 11, 12, 13], dtype=int64)

In [133]:
#importing library and building model
from catboost import CatBoostRegressor
#model=CatBoostClassifier(iterations=1000, depth=10,learning_rate=0.01, loss_function='CrossEntropy',\
                         #)
#model.fit(X_train, y_train,cat_features=categorical_features_indices,eval_set=(X_validation, y_validation))

In [128]:
df, _, nan, mapper = proc_df(df,do_scale=True,max_n_cat=30)

In [129]:
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(df, y_target, train_size=0.8, random_state=1234)


C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.
  FutureWarning)

In [136]:
lr.fit(X_train,y_train)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-136-0903aefc4a63> in <module>()
----> 1 lr.fit(X_train,y_train)

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in fit(self, X, y, sample_weight)
   1214 
   1215         X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype,
-> 1216                          order="C")
   1217         check_classification_targets(y)
   1218         self.classes_ = np.unique(y)

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
    577     else:
    578         y = column_or_1d(y, warn=True)
--> 579         _assert_all_finite(y)
    580     if y_numeric and y.dtype.kind == 'O':
    581         y = y.astype(np.float64)

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py in _assert_all_finite(X)
     42             and not np.isfinite(X).all()):
     43         raise ValueError("Input contains NaN, infinity"
---> 44                          " or a value too large for %r." % X.dtype)
     45 
     46 

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [139]:
df.isnull().head()


Out[139]:
campaign_id send_Year send_Month send_Week send_Day send_Dayofweek send_Dayofyear total_links no_of_internal_links no_of_images ... send_Is_year_start_False send_Is_year_start_nan communication_type_Conference communication_type_Corporate communication_type_Hackathon communication_type_Newsletter communication_type_Others communication_type_Upcoming Events communication_type_Webinar communication_type_nan
0 False False False False False False False False False False ... False False False False False False False False False False
1 False False False False False False False False False False ... False False False False False False False False False False
2 False False False False False False False False False False ... False False False False False False False False False False
3 False False False False False False False False False False ... False False False False False False False False False False
4 False False False False False False False False False False ... False False False False False False False False False False

5 rows × 35 columns


In [230]:
m


Out[230]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [232]:
preds = m.predict(test)

In [239]:
preds


Out[239]:
array([ 0.5,  0.5,  0.6, ...,  0.6,  0.6,  0.6])

In [233]:
sample_sub['is_click'] = preds

In [234]:
sample_sub


Out[234]:
id is_click
0 63_122715 0.5
1 56_76206 0.5
2 57_96189 0.6
3 56_166917 0.5
4 56_172838 0.5
5 71_50253 0.6
6 76_128244 0.5
7 74_195513 0.5
8 67_74617 0.5
9 75_140599 0.5
10 63_124394 0.5
11 63_95168 0.5
12 74_90706 0.6
13 63_31556 0.5
14 57_166505 0.5
15 64_2089 0.5
16 74_133849 0.5
17 74_7486 0.5
18 57_159015 0.5
19 64_111872 0.5
20 56_176763 0.5
21 71_8316 0.5
22 69_133308 0.5
23 74_50984 0.5
24 76_64692 0.5
25 64_184811 0.5
26 57_81487 0.5
27 57_41624 0.5
28 71_95059 0.5
29 71_25044 0.5
... ... ...
773828 56_47638 0.6
773829 57_199696 0.6
773830 56_162289 0.6
773831 69_145085 0.6
773832 74_233054 0.6
773833 63_42227 0.6
773834 74_238916 0.6
773835 64_67184 0.6
773836 63_64538 0.6
773837 71_83603 0.6
773838 57_13091 0.6
773839 63_216162 0.6
773840 63_111877 0.6
773841 63_163766 0.6
773842 74_40948 0.6
773843 74_59284 0.6
773844 64_232176 0.6
773845 69_65847 0.6
773846 76_200174 0.6
773847 57_191488 0.6
773848 69_42153 0.6
773849 57_127883 0.6
773850 55_153170 0.6
773851 57_4034 0.6
773852 74_80370 0.6
773853 74_202929 0.6
773854 56_95437 0.6
773855 63_11063 0.6
773856 72_144182 0.6
773857 57_226299 0.6

773858 rows × 2 columns


In [241]:
def make_submission(probs):
    sample = pd.read_csv(f'{PATH}//sample_submission.csv')
    submit = sample.copy()
    submit['is_click'] = probs
    return submit

In [242]:
submit = make_submission(preds)

In [243]:
submit.head(2)


Out[243]:
id is_click
0 63_122715 0.5
1 56_76206 0.5

In [244]:
submit.to_csv(PATH + '//av_cat_2.csv', index=False)

In [ ]: