In [ ]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [ ]:
%%bash
ls -lh ../input

In [ ]:
%%bash
wc -l ../input/*

In [ ]:
# Share (in %) of the full train set (184,903,891 lines) covered by the 40M-row sample
40000000*100/184903891

In [ ]:
%%bash
head ../input/sample_submission.csv

In [ ]:
# Full reads kept for reference; train.csv is too large to load whole in kernel memory
#train = pd.read_csv("../input/train.csv")
#test = pd.read_csv("../input/test.csv")

path = "../input/"

dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }

print('loading train data...')
train_df = pd.read_csv(path + "train.csv", nrows=40000000, dtype=dtypes,
                       usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed'])

print('loading test data...')
test_df = pd.read_csv(path + "test.csv", dtype=dtypes,
                      usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id'])

sub = pd.read_csv("../input/sample_submission.csv")
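
In [ ]:
# Optional check (not in the original kernel): the dtypes dict above exists to
# shrink memory use, and memory_usage(deep=True) makes the effect visible.
print('train memory: {:.2f} GB'.format(train_df.memory_usage(deep=True).sum() / 1024**3))
print('test memory : {:.2f} GB'.format(test_df.memory_usage(deep=True).sum() / 1024**3))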

In [ ]:
import gc
gc.collect()

In [ ]:
print(test_df.shape)
test_df.head()

In [ ]:
len_train = len(train_df)
# DataFrame.append is deprecated; concatenate test onto train so the
# feature engineering below is applied to both at once
train_df = pd.concat([train_df, test_df], sort=False)

In [ ]:
del test_df
gc.collect()

print('Extracting new features...')
click_times = pd.to_datetime(train_df.click_time)  # parse once instead of three times
train_df['hour'] = click_times.dt.hour.astype('uint8')
train_df['day'] = click_times.dt.day.astype('uint8')
train_df['wday'] = click_times.dt.dayofweek.astype('uint8')
del click_times

gc.collect()

In [ ]:
print('grouping by ip-day-hour combination...')
gp = (train_df[['ip', 'day', 'hour', 'channel']]
      .groupby(['ip', 'day', 'hour'])[['channel']]
      .count().reset_index()
      .rename(columns={'channel': 'qty'}))
train_df = train_df.merge(gp, on=['ip','day','hour'], how='left')
del gp
gc.collect()

In [ ]:
print('group by ip-app combination...')
gp = (train_df[['ip', 'app', 'channel']]
      .groupby(['ip', 'app'])[['channel']]
      .count().reset_index()
      .rename(columns={'channel': 'ip_app_count'}))
train_df = train_df.merge(gp, on=['ip','app'], how='left')
del gp
gc.collect()

In [ ]:
print('group by ip-app-os combination...')
gp = (train_df[['ip', 'app', 'os', 'channel']]
      .groupby(['ip', 'app', 'os'])[['channel']]
      .count().reset_index()
      .rename(columns={'channel': 'ip_app_os_count'}))
print("merging...")
train_df = train_df.merge(gp, on=['ip','app', 'os'], how='left')
del gp
gc.collect()
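
In [ ]:
# The three count features above all follow one pattern: count rows per group,
# then merge the count back on the group keys. A small helper (my sketch, not
# part of the original kernel) makes that design explicit:
def add_count_feature(df, group_cols, new_col):
    # any always-present column works for counting; 'channel' matches the cells above
    gp = (df[group_cols + ['channel']]
          .groupby(group_cols)[['channel']]
          .count().reset_index()
          .rename(columns={'channel': new_col}))
    return df.merge(gp, on=group_cols, how='left')

# e.g. the 'qty' feature above is equivalent to:
# train_df = add_count_feature(train_df, ['ip', 'day', 'hour'], 'qty')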

In [ ]:
print("vars and data type: ")
train_df.info()
train_df['qty'] = train_df['qty'].astype('uint16')
train_df['ip_app_count'] = train_df['ip_app_count'].astype('uint16')
train_df['ip_app_os_count'] = train_df['ip_app_os_count'].astype('uint16')

In [ ]:
test_df = train_df.iloc[len_train:]
train_df = train_df.iloc[:len_train]

In [ ]:
train_df.head()

In [ ]:
train_df = train_df.sample(frac=1).reset_index(drop=True)  # shuffle before splitting off validation

In [ ]:
train_df.head()

In [ ]:
val_df = train_df.iloc[(len_train - 3000000):len_train]
train_df = train_df.iloc[:(len_train - 3000000)]

In [ ]:
print("train size: ", len(train_df))
print("valid size: ", len(val_df))
print("test size : ", len(test_df))

target = 'is_attributed'
predictors = ['app','device','os', 'channel', 'hour', 'day', 'wday', 'qty', 'ip_app_count', 'ip_app_os_count']
categorical = ['app', 'device', 'os', 'channel', 'hour', 'day', 'wday']

sub = pd.DataFrame()
sub['click_id'] = test_df['click_id'].astype('int')

In [ ]:
from sklearn.linear_model import LogisticRegressionCV
model = LogisticRegressionCV(cv=5, scoring="neg_log_loss", random_state=1, n_jobs=4
                             # also tried: penalty="l1", solver="liblinear",
                             # custom Cs grids, max_iter=1000, tol=1e-11
                             )
model.fit(train_df[predictors], train_df[target])

#---
# scores_ maps each class label to an (n_folds, n_Cs) array; average over folds
Cs = model.Cs_
sco = model.scores_[1].mean(axis=0)
#---
import matplotlib.pyplot as plt
plt.plot(Cs, sco)  # plot np.log10(Cs) instead for a log-scale view
plt.xlabel('C')
plt.ylabel('mean neg_log_loss')
plt.show()
sco.min()
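
In [ ]:
# Where the CV search landed: C_ holds the chosen regularization strength
# (one entry per class) and is a standard LogisticRegressionCV attribute.
model.C_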

In [ ]:
from sklearn.metrics import roc_auc_score
# sklearn's predict() takes no verbose/batch_size arguments (those belong to
# Keras); use predict_proba for a probability score that ROC AUC can rank
y_pred = model.predict_proba(val_df[predictors])[:, 1]
y_test = val_df[target]
roc_auc_score(y_test, y_pred)

In [ ]:
y_test.hist()

In [ ]:
y_pred[:6]
np.histogram(y_pred)  # numpy is imported as np

In [ ]:
# Snap near-certain predictions to exactly 1 and check the effect on AUC
almost1 = y_pred >= 0.96
y_pred[almost1] = 1

In [ ]:
y_pred[almost1]

In [ ]:
# Difference from the previously recorded validation AUC
roc_auc_score(y_test, y_pred) - 0.9235889250504977
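
In [ ]:
# lgb_modelfit_nocv is called in the next cell but never defined in this
# notebook; it comes from a widely shared public TalkingData kernel. The
# sketch below is a reconstruction with the same signature, using the
# lgb.train() keywords available in LightGBM 2.x (the Kaggle image of the
# time), not necessarily the original helper line for line.
import lightgbm as lgb

def lgb_modelfit_nocv(params, dtrain, dvalid, predictors, target,
                      objective='binary', metrics='auc',
                      early_stopping_rounds=50, verbose_eval=True,
                      num_boost_round=350, categorical_features=None):
    lgb_params = dict(params)
    lgb_params.update({'objective': objective, 'metric': metrics})

    xgtrain = lgb.Dataset(dtrain[predictors].values, label=dtrain[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features)
    xgvalid = lgb.Dataset(dvalid[predictors].values, label=dvalid[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features)

    evals_results = {}
    bst = lgb.train(lgb_params,
                    xgtrain,
                    valid_sets=[xgtrain, xgvalid],
                    valid_names=['train', 'valid'],
                    evals_result=evals_results,
                    num_boost_round=num_boost_round,
                    early_stopping_rounds=early_stopping_rounds,
                    verbose_eval=verbose_eval)

    print("best iteration : ", bst.best_iteration)
    print(metrics + " : ", evals_results['valid'][metrics][bst.best_iteration - 1])
    return bst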

In [ ]:
gc.collect()

print("Training...")
import time  # stdlib; needed for the timing below
start_time = time.time()


params = {
    'learning_rate': 0.19,
    #'is_unbalance': 'true', # replaced with scale_pos_weight argument
    'num_leaves': 15,  # 2^max_depth - 1
    'max_depth': 4,  # -1 means no limit
    'min_child_samples': 100,  # minimum number of samples in a leaf (min_data_in_leaf)
    'max_bin': 100,  # number of bucketed bins for feature values
    'subsample': .7,  # subsample ratio of the training instances
    'subsample_freq': 1,  # frequency of subsampling; <=0 disables it
    'colsample_bytree': 0.7,  # subsample ratio of columns when constructing each tree
    'min_child_weight': 0,  # minimum sum of instance weight (hessian) needed in a leaf
    'scale_pos_weight': 99  # because the training data is extremely unbalanced
}
bst = lgb_modelfit_nocv(params, 
                        train_df, 
                        val_df, 
                        predictors, 
                        target, 
                        objective='binary', 
                        metrics='auc',
                        early_stopping_rounds=50, 
                        verbose_eval=True, 
                        num_boost_round=350, 
                        categorical_features=categorical)

print('[{}]: model training time'.format(time.time() - start_time))
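
In [ ]:
# Optional check (not in the original kernel): split counts per feature show
# which of the engineered columns the booster actually used.
# feature_importance() is a standard LightGBM Booster method.
for name, imp in sorted(zip(predictors, bst.feature_importance()),
                        key=lambda t: -t[1]):
    print('{:20s} {}'.format(name, imp))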

In [ ]:
print("Predicting...")
sub['is_attributed'] = bst.predict(test_df[predictors])
print("writing...")
sub.to_csv('sub_lgb_balanced99.csv',index=False)
print("done...")
