Windows 10 Coin

train: (row: 1,347,190, columns: 1,085) test: (row: 374,136, columns: 1,084)

y value: if HasClicked == True, app 1.8%

How to run

  1. Put the train and test files in ..\input
  2. Put the script file in ..\script
  3. In Jupyter, run all and get submission file in the same script folder

In [2]:
# Timer and file info
import math
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc # We're gonna be clearing memory a lot
import matplotlib.pyplot as plt
import seaborn as sns
import random
#from ml_metrics import mapk
from datetime import datetime
import re
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import ensemble
from sklearn import model_selection
from sklearn.metrics import matthews_corrcoef, f1_score, classification_report, confusion_matrix, precision_score, recall_score
%matplotlib inline

# Timer
class Timer:
    def __init__(self, text=None):
        self.text = text
        
    def __enter__(self):
        self.cpu = time.clock()
        self.time = time.time()
        if self.text:
            print("{}...".format(self.text))
            print(datetime.now())
        return self

    def __exit__(self, *args):
        self.cpu = time.clock() - self.cpu
        self.time = time.time() - self.time
        if self.text:
            print("%s: cpu %0.2f, time %0.2f\n" % (self.text, self.cpu, self.time))

# Split to train and holdout sets with counts
def sample_train_holdout(_df, sample_count, holdout_count):   
    random.seed(7)
    sample_RowNumber = random.sample(list(_df['RowNumber']), (sample_count + holdout_count))
    train_RowNumber = random.sample(sample_RowNumber, sample_count)
    holdout_RowNumber = list(set(sample_RowNumber) - set(train_RowNumber))
    holdout = _df[_df['RowNumber'].isin(holdout_RowNumber)].copy()
    _df = _df[_df['RowNumber'].isin(train_RowNumber)]
    return _df, holdout 

# Sampling for train and holdout with imbalanced binary label
def trainHoldoutSampling(_df, _id, _label, _seed=7, t_tr=0.5, t_ho=0.5, f_tr=0.05, f_ho=0.5):
    random.seed(_seed)
    
    positive_id = list(_df[_df[_label]==True][_id].values)
    negative_id = list(_df[_df[_label]==False][_id].values)
    
    train_positive_id = random.sample(positive_id, int(len(positive_id) * t_tr))
    holdout_positive_id = random.sample(list(set(positive_id)-set(train_positive_id)), int(len(positive_id) * t_ho)) 
    train_negative_id = random.sample(negative_id, int(len(negative_id) * f_tr))
    holdout_negative_id = random.sample(list(set(negative_id)-set(train_negative_id)), int(len(negative_id) * f_ho))
    
    train_id = list(set(train_positive_id)|set(train_negative_id))
    holdout_id = list(set(holdout_positive_id)|set(holdout_negative_id))
    
    print('train count: {}, train positive count: {}'.format(len(train_id),len(train_positive_id)))
    print('holdout count: {}, holdout positive count: {}'.format(len(holdout_id),len(holdout_positive_id)))
    
    return _df[_df[_id].isin(train_id)], _df[_df[_id].isin(holdout_id)]

def datetime_features2(_df, _col):
    _format='%m/%d/%Y %I:%M:%S %p'
    _df[_col] = _df[_col].apply(lambda x: datetime.strptime(x, _format))
    
    colYear = _col+'Year'
    colMonth = _col+'Month'
    colDay = _col+'Day'
    colHour = _col+'Hour'
    colMinute = _col+'Minute'
    colSecond = _col+'Second'
    colYearMonthDay = _col+'YearMonthDay'
    colYearMonthDayHour = _col+'YearMonthDayHour' 
    colYearMonthDayHourMinute = _col+'YearMonthDayHourMinute' 
    colYearMonthDayHourMinuteSecond = _col+'YearMonthDayHourMinuteSecond' 
    
    _df[colYear] = _df[_col].apply(lambda x: x.year)
    _df[colMonth] = _df[_col].apply(lambda x: x.month)
    _df[colDay] = _df[_col].apply(lambda x: x.day)
    _df[colHour] = _df[_col].apply(lambda x: x.hour)
    _df[colMinute] = _df[_col].apply(lambda x: x.minute)
    _df[colSecond] = _df[_col].apply(lambda x: x.second)
    
    ymd = [colYear, colMonth, colDay]
    ymdh = [colYear, colMonth, colDay, colHour]
    ymdhm = [colYear, colMonth, colDay, colHour, colMinute]
    ymdhms = [colYear, colMonth, colDay, colHour, colMinute, colSecond]
    
    _df[colYearMonthDay] = _df[ymd].apply(lambda x: '_'.join(str(x)), axis=1)
    _df[colYearMonthDayHour] = _df[ymdh].apply(lambda x: '_'.join(str(x)), axis=1)
    _df[colYearMonthDayHourMinute] = _df[ymdhm].apply(lambda x: '_'.join(str(x)), axis=1)
    _df[colYearMonthDayHourMinuteSecond] = _df[ymdhms].apply(lambda x: '_'.join(str(x)), axis=1)
    
    return _df
    
# Change date column datetime type and add date time features
def datetime_features(_df, _col, isDelete = False):
    # 1. For years greater than 2017, create year folder with regex and change year to 2017 in datetime column
    # find and return 4 digit number (1st finding) in dataframe string columns
    year_col = _col + 'Year'
    _df[year_col] = _df[_col].apply(lambda x: int(re.findall(r"\D(\d{4})\D", " "+ str(x) +" ")[0]))
    years = sorted(list(_df[year_col].unique()))
    yearsGreaterThan2017 = sorted(i for i in years if i > 2017)

    # Two ways for strange year data (1) change it to 2017 temporarily (2) remove from data; we will go with (1)
    # because we cannot remove test rows anyway
    if isDelete:
        _df = _df[~_df[year_col].isin(yearsGreaterThan2017)]
    else:
        for i in yearsGreaterThan2017:
            print("replace ", i, " to 2017 for conversion")
            _df.loc[_df[year_col] == i, _col] = _df[_df[year_col] == i][_col].values[0].replace(str(i), "2017")
    
    # How to remove strange year rows
    # train = train[~train['year'].isin(yearsGreaterThan2017)]

    # 2. Convert string to datetime
    _df[_col] = pd.to_datetime(_df[_col])
    print(_col, "column conversion to datetime type is done")
    
    # 3. Add more date time features
    month_col = _col + 'Month'
    week_col = _col + 'Week'
    weekday_col = _col + 'Weekday'
    day_col = _col + 'Day'
    hour_col = _col + 'Hour'
    min_col = _col + 'Minute'
    second_col = _col + 'Second'
    #year_month_day_col = _col + 'YearMonthDay'
    #year_month_day_hour_col = _col + 'YearMonthDayHour'
    
    _df[month_col] = pd.DatetimeIndex(_df[_col]).month
    _df[week_col] = pd.DatetimeIndex(_df[_col]).week
    _df[weekday_col] = pd.DatetimeIndex(_df[_col]).weekday
    _df[day_col] = pd.DatetimeIndex(_df[_col]).day
    _df[hour_col] = pd.DatetimeIndex(_df[_col]).hour
    _df[minute_col] = pd.DatetimeIndex(_df[_col]).minute
    _df[second_col] = pd.DatetimeIndex(_df[_col]).second
    #_df[year_month_day_col] = _df[[year_col, month_col, day_col]].apply(lambda x: ''.join(str(x)), axis=1)
    #_df[year_month_day_hour_col] = _df[[year_col, month_col, day_col, hour_col]].apply(lambda x: ''.join(str(x)), axis=1)
    print("year, month, week, weekday, day, hour, minute, second features are added")
    
    return _df

# Delete rows with list condition for dataframe
def delRows(_df, _col, _list):
    _df = _df[~_df[_col].isin(_list)]
    return _df

import re

# Create new column using regex pattern for strings for dataframe
def addFeatureRegex(_df, _col, _newCol):
    _df[_newCol] = _df[_col].apply(lambda x: int(re.findall(r"\D(\d{4})\D", " "+ str(x) +" ")[0]))
    return _df

# Convert string to datetime type
def stringToDatetime(_df, _col):
    _df[_col] = _df[_col].astype('datetime64[ns]')
    return _df

# Add features from datetime
def addDatetimeFeatures(_df, _col):
    _df[_col + 'Year'] = pd.DatetimeIndex(_df[_col]).year
    _df[_col + 'Month'] = pd.DatetimeIndex(_df[_col]).month
    _df[_col + 'Week'] = pd.DatetimeIndex(_df[_col]).week
    _df[_col + 'Weekday'] = pd.DatetimeIndex(_df[_col]).weekday
    _df[_col + 'Day'] = pd.DatetimeIndex(_df[_col]).day
    _df[_col + 'Hour'] = pd.DatetimeIndex(_df[_col]).hour
    return _df

# Get categorical column names
def categoricalColumns(_df):
    cat_columns = _df.select_dtypes(['object']).columns
    print("Categorical column count:", len(cat_columns))
    print("First 5 values:", cat_columns[:5])
    return cat_columns

# Get column names starting with
def columnsStartingWith(_df, _str):
    sorted_list = sorted(i for i in list(_df) if i.startswith(_str))
    print("Column count:", len(sorted_list))
    print("First 5 values:", sorted_list[:5])    
    return sorted_list

# Get column names ending with
def columnsEndingWith(_df, _str):
    sorted_list = sorted(i for i in list(_df) if i.endswith(_str))
    print("Column count:", len(sorted_list))
    print("First 5 values:", sorted_list[:5])    
    return sorted_list

# Get constant columns
def constantColumns(_df):
    constant_list = []
    cols = list(_df) # same as _df.columns.values
    for col in cols:
        if len(_df[col].unique()) == 1:
            constant_list.append(col)
    print("Constant column count:", len(constant_list))
    print("First 5 values:", constant_list[:5])  
    return constant_list

# Add null columns
def makeNullColumns(_df, _cols):
    null_df = _df[_cols].isnull()
    null_df.columns = null_df.columns + 'Null'
    _df = pd.concat([_df, null_df], axis=1)
    return _df

# Union
def union(a, b):
    return list(set(a)|set(b))

def unique(a):
    return list(set(a))

# undersampling - sample rate 0.8 for 80% samling using isUndersampled column 
def underSampling(_df, _sample_rate):
    _df['isUnderSampled'] = 1
    _rand_num = 1/(1-_sample_rate)
    underSample = np.random.randint(_rand_num, size=len(_df[_df['HasClicked'] == 0]))
    _df.loc[_df['HasClicked'] == 0, 'isUnderSampled'] = underSample>0
    return _df

# Add column with value count
def valueCountColumn(_df, _col):
    _dict = dict([(i, a) for i, a in zip(_df[_col].value_counts().index, _df[_col].value_counts().values)])
    _df[_col+'ValueCount'] = _df[_col].apply(lambda x: _dict[x])
    return _df

# Add column with bool values to check if keyword is contained or not
def containColumn(_df, _col, _str):
    _df[_col+'Cotains'+_str] = _df[_col].str.contains(_str)
    return _df

# Feature engineering
def feature_engineering(_df):
    print("shape:", _df.shape)
    print("Add datetime features...")
    datetime_columns = ['BubbleShownTime', 'FirstUpdatedDate', 'OSOOBEDateTime']
    for col in datetime_columns:
        print(col)
        if _df[col].isnull().sum() > 0:
            _df[col] = _df[col].fillna('1/1/2017 11:11:11 AM')
        _df = datetime_features2(_df, col)

    print("shape:", _df.shape)

    gc.collect()
    
    # Null count
    print("Missing value count...")
    _df['CntNs'] = _df.isnull().sum(axis=1) 

    cols = ['AppCategoryNMinus1', 'AppCategoryNMinus2', 'AppCategoryNMinus3', 'AppCategoryNMinus4', 'AppCategoryNMinus5',
           'AppCategoryNMinus6', 'AppCategoryNMinus7', 'AppCategoryNMinus8']
    _df['AppCatCntNs'] = _df[cols].isnull().sum(axis=1)

    #_df[cols] = _df[cols].fillna("NA")
    #for col in cols:
    #    print(col)
    #    _df[col+'HighLevel'] = _df[col].apply(lambda x: str(x).split(':')[0])

    # Game segment parse with '.'
    # to-do: 2nd and 3rd parsed values to add as features later, some exception handling is needed
    print("Gamer segment parsing...")
    _df['GamerSegment1'] = _df['GamerSegment'].apply(lambda x: str(x).split('.')[0] if str(x).split('.') else 'Unknown')
    
    # Check creativeName contains keyword or not
    keywords = ['SL', 'TS', 'Week7', 'Week 7', 'Meet', 'Skype', 'Battery']
    for keyword in keywords:
        _df = containColumn(_df, 'creativeName', keyword)
    #_df['week7'] = _df['Week7'].values + _df['Week 7'].values
    #_df.drop(['Week7', 'Week 7'], axis = 1, inplace = True)
    
    # Convert categorical columns to numeric
    print("Convert categorical columns to numeric...")
    cat_columns = _df.select_dtypes(['object']).columns
    for cat_column in cat_columns:
        print(cat_column)
        if cat_column == 'creativeName':
            _df['creativeNameTest'] = _df['creativeName'].values
        _df[cat_column] = _df[cat_column].apply(lambda x: abs(hash(x)))
    gc.collect()
    
    # Replace missing values with -1
    print("Replace missing values with -1")
    _df = _df.fillna(-1)
        
    return _df

# Get best threshold value for F1 score
def f1_best_threshold(_actual, _pred):
    thresholds = np.linspace(0.01, 0.99, 1000)

    fc = np.array([f1_score(_actual, _pred>thr) for thr in thresholds])
    plt.plot(thresholds, fc)
    best_threshold = thresholds[fc.argmax()]
    print('f1 score:', fc.max())
    print('best threshold:', best_threshold)
    print('TF pred mean:', (_pred>best_threshold).mean())
    
    return best_threshold

In [3]:
start_time = datetime.now()

In [4]:
with Timer("Read train data..."):
    train = pd.read_csv('../input/CoinMlCompetitionSoftlandingTrainWithHeader.tsv', sep='\t') # (1347190, 1085)
    print(train.shape)
    
    print("Train and holdout sampling...")
    seed = 1
    train, holdout = trainHoldoutSampling(train, 'RowNumber', 'HasClicked', seed, t_tr=0.5, t_ho=0.5, f_tr=0.05, f_ho=0.5)
    gc.collect()
    
    # 100% t 10% f for the final


Read train data......
2017-05-22 20:59:57.987833
E:\Users\hyunor\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2717: DtypeWarning: Columns (2,8,17,33) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
(1347190, 1085)
Train and holdout sampling...
train count: 78457, train positive count: 12331
holdout count: 673595, holdout positive count: 12331
Read train data...: cpu 114.32, time 114.32


In [5]:
# Train feature engineering
with Timer("Train feature engineering..."):
    #train = feature_engineering(train, isDeleteOddDateRows=True)
    train = feature_engineering(train)
    
    train_y = train['HasClicked'].values
    print("train y mean:", train_y.mean())


Train feature engineering......
2017-05-22 21:09:21.042283
shape: (78457, 1085)
Add datetime features...
BubbleShownTime
FirstUpdatedDate
OSOOBEDateTime
shape: (78457, 1115)
Missing value count...
Gamer segment parsing...
Convert categorical columns to numeric...
BubbleShownTime
CampaignId
creativeName
IsVirtualDevice
ISOCountryShortName
DefaultInternetBrowser
OSOOBEDateTime
IsDomainJoined
IsAlwaysOnAlwaysConnectedCapable
DeviceTimezone
DisplayLanguage
GamerPCClassification
GamerSegment
AppCategoryNMinus1
AppCategoryNMinus2
AppCategoryNMinus3
AppCategoryNMinus4
AppCategoryNMinus5
AppCategoryNMinus6
AppCategoryNMinus7
AppCategoryNMinus8
BubbleShownTimeYearMonthDay
BubbleShownTimeYearMonthDayHour
BubbleShownTimeYearMonthDayHourMinute
BubbleShownTimeYearMonthDayHourMinuteSecond
FirstUpdatedDateYearMonthDay
FirstUpdatedDateYearMonthDayHour
FirstUpdatedDateYearMonthDayHourMinute
FirstUpdatedDateYearMonthDayHourMinuteSecond
OSOOBEDateTimeYearMonthDay
OSOOBEDateTimeYearMonthDayHour
OSOOBEDateTimeYearMonthDayHourMinute
OSOOBEDateTimeYearMonthDayHourMinuteSecond
GamerSegment1
creativeNameCotainsSL
creativeNameCotainsTS
creativeNameCotainsWeek7
creativeNameCotainsWeek 7
creativeNameCotainsMeet
creativeNameCotainsSkype
creativeNameCotainsBattery
Replace missing values with -1
train y mean: 0.157168895064
Train feature engineering...: cpu 293.06, time 293.05


In [6]:
# holdout feature engineering
with Timer("Holdout feature engineering..."):
    #holdout = feature_engineering(holdout, isDeleteOddDateRows=True)
    holdout = feature_engineering(holdout)
    
    y_holdout = holdout['HasClicked'].values
    print("holdout y mean:", y_holdout.mean())


Holdout feature engineering......
2017-05-22 21:14:14.104588
shape: (673595, 1085)
Add datetime features...
BubbleShownTime
FirstUpdatedDate
OSOOBEDateTime
shape: (673595, 1115)
Missing value count...
Gamer segment parsing...
Convert categorical columns to numeric...
BubbleShownTime
CampaignId
creativeName
IsVirtualDevice
ISOCountryShortName
DefaultInternetBrowser
OSOOBEDateTime
IsDomainJoined
IsAlwaysOnAlwaysConnectedCapable
DeviceTimezone
DisplayLanguage
GamerPCClassification
GamerSegment
AppCategoryNMinus1
AppCategoryNMinus2
AppCategoryNMinus3
AppCategoryNMinus4
AppCategoryNMinus5
AppCategoryNMinus6
AppCategoryNMinus7
AppCategoryNMinus8
BubbleShownTimeYearMonthDay
BubbleShownTimeYearMonthDayHour
BubbleShownTimeYearMonthDayHourMinute
BubbleShownTimeYearMonthDayHourMinuteSecond
FirstUpdatedDateYearMonthDay
FirstUpdatedDateYearMonthDayHour
FirstUpdatedDateYearMonthDayHourMinute
FirstUpdatedDateYearMonthDayHourMinuteSecond
OSOOBEDateTimeYearMonthDay
OSOOBEDateTimeYearMonthDayHour
OSOOBEDateTimeYearMonthDayHourMinute
OSOOBEDateTimeYearMonthDayHourMinuteSecond
GamerSegment1
creativeNameCotainsSL
creativeNameCotainsTS
creativeNameCotainsWeek7
creativeNameCotainsWeek 7
creativeNameCotainsMeet
creativeNameCotainsSkype
creativeNameCotainsBattery
Replace missing values with -1
holdout y mean: 0.0183062522733
Holdout feature engineering...: cpu 2786.80, time 2786.78


In [12]:
# New feature engineering and validation
isNewFeatureOn = True

if isNewFeatureOn:

    # Add column with value count
    def valueCountColumnDict(_df, _col):
        _dict = dict([(i, a) for i, a in zip(_df[_col].value_counts().index, _df[_col].value_counts().values)])
        _df[_col+'ValueCount'] = _df[_col].apply(lambda x: _dict[x])
        return _dict

    # Add column with bool values to check if keyword is contained or not
    #def containColumn(_df, _col, _str):
    #    _df[_col+'Cotains'+_str] = _df[_col].str.contains(_str)
    #    return _df

    #for keyword in ['SL']:
    #    train = containColumn(train, 'creativeNameTest', keyword)
    #    holdout = containColumn(holdout, 'creativeNameTest', keyword)

    from collections import Counter

    # add columsn for train and holdout sets together
    def valueCountTrainHoldout(_tr, _ho, _col):
        tr_dict = valueCountColumnDict(_tr, _col)
        ho_dict = valueCountColumnDict(_ho, _col)

        _dict = Counter(tr_dict) + Counter(ho_dict)

        _tr[_col + 'ValueCount'] = _tr[_col].apply(lambda x: _dict[x])
        _ho[_col + 'ValueCount'] = _ho[_col].apply(lambda x: _dict[x])

        print

        return _tr, _ho

    #val_count_cols = ['UniqueUserDeviceKey', 'CampaignId', 'ISOCountryShortName', 'DeviceTimezone', 
    #                  'DisplayLanguage', 'GamerSegment']
    val_count_cols = ['UniqueUserDeviceKey']
    for col in val_count_cols:
        print(col)
        train, holdout = valueCountTrainHoldout(train, holdout, col)

    #collections.Counter test
    #from collections import Counter
    #A = Counter({'a':1, 'b':2, 'c':3})
    #B = Counter({'b':3, 'c':4, 'd':5})
    #A + B
    #Counter({'c': 7, 'b': 5, 'd': 5, 'a': 1})

    # uudk count check
    #print(tr_uudk_dict[832614])
    #print(ho_uudk_dict[832614])
    #print(len(tr_uudk_dict))
    #print(len(ho_uudk_dict))
    #print(len(uudk_dict))
    #print(uudk_dict[832614])

    # uudk mean value - should be almost same
    print(train.UniqueUserDeviceKeyValueCount.mean())
    print(holdout.UniqueUserDeviceKeyValueCount.mean())

    #print(train.CampaignIdValueCount.mean())
    #print(holdout.CampaignIdValueCount.mean())


UniqueUserDeviceKey
1.3822221089259084
1.3321432017755477

In [ ]:
isNewFeatureOn = False

if isNewFeatureOn:
    
    # ymd and ymdh features
    
    #bs_cols = ['BubbleShownTimeYear', 'BubbleShownTimeMonth', 'BubbleShownTimeDay']
    #train[bs_cols]
    #train['bs_ymd'] = train[bs_cols].apply(lambda x: "_".join(x), axis = 1)
    #train['ymd'] = train['BubbleShownTimeYear'].astype(str) + train['BubbleShownTimeMonth'].astype(str) 
    #+ train['BubbleShownTimeDay'].astype(str)

    train['ymd'] = train[['BubbleShownTimeYear', 'BubbleShownTimeMonth', 
                          'BubbleShownTimeDay']].apply(lambda x : '{}-{}-{}'.format(x[0],x[1],x[2]), axis=1)
    train['ymdh'] = train[['BubbleShownTimeYear', 'BubbleShownTimeMonth', 'BubbleShownTimeDay', 
                           'BubbleShownTimeHour']].apply(lambda x : '{}-{}-{}-{}'.format(x[0],x[1],x[2],x[3]), axis=1)

    holdout['ymd'] = holdout[['BubbleShownTimeYear', 'BubbleShownTimeMonth', 
                          'BubbleShownTimeDay']].apply(lambda x : '{}-{}-{}'.format(x[0],x[1],x[2]), axis=1)
    holdout['ymdh'] = holdout[['BubbleShownTimeYear', 'BubbleShownTimeMonth', 'BubbleShownTimeDay', 
                           'BubbleShownTimeHour']].apply(lambda x : '{}-{}-{}-{}'.format(x[0],x[1],x[2],x[3]), axis=1)

In [13]:
isNewFeatureOn = True

if isNewFeatureOn:
    
    def countGroupByDict(_df, str1, str2):
        new_col = str1 + "_" + str2
        _df[new_col] = _df[[str1, str2]].apply(lambda x : '{}-{}'.format(x[0],x[1]), axis=1)
        _dict = valueCountColumnDict(_df, new_col)
        #_df.drop(new_col, axis=1, inplace=True)
        print("New value count column is added:", new_col)
        return _dict
    
    # group by country and campaign id
    def countGroupBy(_df, str1, str2):
        new_col = str1 + "_" + str2
        _df[new_col] = _df[[str1, str2]].apply(lambda x : '{}-{}'.format(x[0],x[1]), axis=1)
        _df = valueCountColumn(_df, new_col)
        _df.drop(new_col, axis=1, inplace=True)
        print("New value count column is added:", new_col)
        return _df

    #train = countGroupBy(train, 'BubbleShownTimeYearMonthDayHourMinuteSecond', 'UniqueUserDeviceKey')
    #holdout = countGroupBy(holdout, 'BubbleShownTimeYearMonthDayHourMinuteSecond', 'UniqueUserDeviceKey')
 
    def trainHoldoutCountGroupBy(_tr, _ho, col1, col2):
        tr_dict = countGroupByDict(_tr, col1, col2)
        ho_dict = countGroupByDict(_ho, col1, col2)

        _dict = Counter(tr_dict) + Counter(ho_dict)

        _tr[col1 + '_' + col2 + 'ValueCount'] = _tr[col1 + '_' + col2].apply(lambda x: _dict[x])
        _ho[col1 + '_' + col2 + 'ValueCount'] = _ho[col1 + '_' + col2].apply(lambda x: _dict[x])
        
        _tr.drop(col1 + '_' + col2, axis = 1, inplace = True)
        _ho.drop(col1 + '_' + col2, axis = 1, inplace = True)
        
        return _tr, _ho

    cols = ['BubbleShownTime', 'ISOCountryShortName', 'CampaignId', 'creativeName']
    for col in cols:
        train, holdout = trainHoldoutCountGroupBy(train, holdout, col, 'UniqueUserDeviceKey')
    
'''
    train_dict = countGroupByDict(train, 'BubbleShownTime', 'UniqueUserDeviceKey')
    holdout_dict = countGroupByDict(holdout, 'BubbleShownTime', 'UniqueUserDeviceKey')

    _dict = Counter(train_dict) + Counter(holdout_dict)
    
    train['BubbleShownTime_UniqueUserDeviceKeyValueCount'] = train['BubbleShownTime_UniqueUserDeviceKey'].apply(lambda x: _dict[x])
    holdout['BubbleShownTime_UniqueUserDeviceKeyValueCount'] = holdout['BubbleShownTime_UniqueUserDeviceKey'].apply(lambda x: _dict[x])    

    
    train_dict = countGroupByDict(train, 'ISOCountryShortName', 'UniqueUserDeviceKey')
    holdout_dict = countGroupByDict(holdout, 'ISOCountryShortName', 'UniqueUserDeviceKey')

    _dict = Counter(train_dict) + Counter(holdout_dict)
    
    train['ISOCountryShortName_UniqueUserDeviceKeyValueCount'] = train['ISOCountryShortName_UniqueUserDeviceKey'].apply(lambda x: _dict[x])
    holdout['ISOCountryShortName_UniqueUserDeviceKeyValueCount'] = holdout['ISOCountryShortName_UniqueUserDeviceKey'].apply(lambda x: _dict[x])    

    
    train_dict = countGroupByDict(train, 'CampaignId', 'UniqueUserDeviceKey')
    holdout_dict = countGroupByDict(holdout, 'CampaignId', 'UniqueUserDeviceKey')

    _dict = Counter(train_dict) + Counter(holdout_dict)
    
    train['CampaignId_UniqueUserDeviceKeyValueCount'] = train['CampaignId_UniqueUserDeviceKey'].apply(lambda x: _dict[x])
    holdout['CampaignId_UniqueUserDeviceKeyValueCount'] = holdout['CampaignId_UniqueUserDeviceKey'].apply(lambda x: _dict[x])    
   

    train_dict = countGroupByDict(train, 'ISOCountryShortName', 'UniqueUserDeviceKey')
    holdout_dict = countGroupByDict(holdout, 'ISOCountryShortName', 'UniqueUserDeviceKey')

    _dict = Counter(train_dict) + Counter(holdout_dict)
    
    train['ISOCountryShortName_UniqueUserDeviceKeyValueCount'] = train['ISOCountryShortName_UniqueUserDeviceKey'].apply(lambda x: _dict[x])
    holdout['ISOCountryShortName_UniqueUserDeviceKeyValueCount'] = holdout['ISOCountryShortName_UniqueUserDeviceKey'].apply(lambda x: _dict[x])    
   
'''    
    
    #train = countGroupBy(train, 'ISOCountryShortName', 'UniqueUserDeviceKey')
    #holdout = countGroupBy(holdout, 'ISOCountryShortName', 'UniqueUserDeviceKey')
    
    #train = countGroupBy(train, 'ISOCountryShortNameValueCount', 'CampaignId')
    #holdout = countGroupBy(holdout, 'ISOCountryShortNameValueCount', 'CampaignId')

    #train[bs_cols]
    #print(train['ymdhm'])
    #print(train['ymdhms'])


New value count column is added: BubbleShownTime_UniqueUserDeviceKey
New value count column is added: BubbleShownTime_UniqueUserDeviceKey
New value count column is added: ISOCountryShortName_UniqueUserDeviceKey
New value count column is added: ISOCountryShortName_UniqueUserDeviceKey
New value count column is added: CampaignId_UniqueUserDeviceKey
New value count column is added: CampaignId_UniqueUserDeviceKey
New value count column is added: creativeName_UniqueUserDeviceKey
New value count column is added: creativeName_UniqueUserDeviceKey
Out[13]:
"\n    train_dict = countGroupByDict(train, 'BubbleShownTime', 'UniqueUserDeviceKey')\n    holdout_dict = countGroupByDict(holdout, 'BubbleShownTime', 'UniqueUserDeviceKey')\n\n    _dict = Counter(train_dict) + Counter(holdout_dict)\n    \n    train['BubbleShownTime_UniqueUserDeviceKeyValueCount'] = train['BubbleShownTime_UniqueUserDeviceKey'].apply(lambda x: _dict[x])\n    holdout['BubbleShownTime_UniqueUserDeviceKeyValueCount'] = holdout['BubbleShownTime_UniqueUserDeviceKey'].apply(lambda x: _dict[x])    \n\n    \n    train_dict = countGroupByDict(train, 'ISOCountryShortName', 'UniqueUserDeviceKey')\n    holdout_dict = countGroupByDict(holdout, 'ISOCountryShortName', 'UniqueUserDeviceKey')\n\n    _dict = Counter(train_dict) + Counter(holdout_dict)\n    \n    train['ISOCountryShortName_UniqueUserDeviceKeyValueCount'] = train['ISOCountryShortName_UniqueUserDeviceKey'].apply(lambda x: _dict[x])\n    holdout['ISOCountryShortName_UniqueUserDeviceKeyValueCount'] = holdout['ISOCountryShortName_UniqueUserDeviceKey'].apply(lambda x: _dict[x])    \n\n    \n    train_dict = countGroupByDict(train, 'CampaignId', 'UniqueUserDeviceKey')\n    holdout_dict = countGroupByDict(holdout, 'CampaignId', 'UniqueUserDeviceKey')\n\n    _dict = Counter(train_dict) + Counter(holdout_dict)\n    \n    train['CampaignId_UniqueUserDeviceKeyValueCount'] = train['CampaignId_UniqueUserDeviceKey'].apply(lambda x: _dict[x])\n    holdout['CampaignId_UniqueUserDeviceKeyValueCount'] = holdout['CampaignId_UniqueUserDeviceKey'].apply(lambda x: _dict[x])    \n   \n\n    train_dict = countGroupByDict(train, 'ISOCountryShortName', 'UniqueUserDeviceKey')\n    holdout_dict = countGroupByDict(holdout, 'ISOCountryShortName', 'UniqueUserDeviceKey')\n\n    _dict = Counter(train_dict) + Counter(holdout_dict)\n    \n    train['ISOCountryShortName_UniqueUserDeviceKeyValueCount'] = train['ISOCountryShortName_UniqueUserDeviceKey'].apply(lambda x: _dict[x])\n    holdout['ISOCountryShortName_UniqueUserDeviceKeyValueCount'] = holdout['ISOCountryShortName_UniqueUserDeviceKey'].apply(lambda x: _dict[x])    \n   \n"

In [14]:
print(len(_dict))
print(len(train_dict))
print(len(holdout_dict))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-14-edd064618d5c> in <module>()
----> 1 print(len(_dict))
      2 print(len(train_dict))
      3 print(len(holdout_dict))

NameError: name '_dict' is not defined

In [15]:
#train.head()
print(train.BubbleShownTime_UniqueUserDeviceKeyValueCount.mean())
print(train.ISOCountryShortName_UniqueUserDeviceKeyValueCount.mean())

print(holdout.BubbleShownTime_UniqueUserDeviceKeyValueCount.mean())
print(holdout.ISOCountryShortName_UniqueUserDeviceKeyValueCount.mean())


1.0279516168092075
1.3822221089259084
1.0033328632189966
1.3321432017755477

In [16]:
# Get column groups and features
all_columns = train.columns
print("All columns:", len(all_columns))

# Remove constant columns for train (all included in time_ and days_ columns)
print("features without constant columns")
constant_columns = constantColumns(train)
features = list(set(all_columns) - set(constant_columns))
print("features:", len(features))

# With a lot of nulls, exclude time and days columns first and add later for improvement
print("features without time_ and days_ columns")
time_columns = columnsStartingWith(train, 'Time_')
days_columns = columnsStartingWith(train, 'Days_')
features = list(set(features) - set(time_columns))
features = list(set(features) - set(days_columns))

# Drop features
drop_features = ['HasClicked', 'RowNumber', 'BubbleShownTime', 'FirstUpdatedDate', 'OSOOBEDateTime', 
                 'creativeNameTest', 'ymd', 'ymdh', 'UniqueUserDeviceKey', 'ymd_UniqueUserDeviceKeyValueCount',
                'ymdh_ISOCountryShortNameValueCount', 'ymd_CampaignIdValueCountValueCount',
                'ymdh_CampaignIdValueCountValueCount', 'ISOCountryShortNameValueCount_CampaignIdValueCount',
                'BubbleShownTime_UniqueUserDeviceKey', 'BubbleShownTime_UniqueUserDeviceKeyValueCountValueCount',
                'ISOCountryShortName_UniqueUserDeviceKey'] 
features = list(set(features) - set(drop_features))

# Drop features
#HighLevel_features = columnsEndingWith(train, 'HighLevel') 
#features = list(set(features) - set(HighLevel_features))

# Drop features
#UniqueUserDeviceKey_features = ['UniqueUserDeviceKeyValueCount', 'CampaignIdValueCount', 'creativeNameValueCount'] 
#features = list(set(features) - set(UniqueUserDeviceKey_features))

# Feature importance > 50
imp_time_features = ['Time_Accessibility', 'Time_Browser', 'Time_Communications', 'Time_Content', 'Time_DevTools', 
                     'Time_Games', 'Time_Malware', 'Time_Media', 'Time_PersonalProductivity', 'Time_Readers', 
                     'Time_Search', 'Time_Social', 'Time_StudentAndLearning', 'Time_ModernApps', 
                     'Time_Games_Core', 'Time_Games_Casual', 'Time_windows_immersivecontrolpanel',
                     'Time_msascui_exe', 'Time_chrome_exe', 'Time_microsoft_windows_cortana', 'Time_lockapphost_exe']

features = list(set(features) | set(imp_time_features))

print("features:", len(features))


All columns: 1131
features without constant columns
Constant column count: 68
First 5 values: ['Time_microsoft_basicattractloop', 'Days_microsoft_basicattractloop', 'Time_5319275a_whatsapp', 'Days_5319275a_whatsapp', 'Time_5b04b775_356b_4aa0_aaf8_6491ffea5602']
features: 1063
features without time_ and days_ columns
Column count: 517
First 5 values: ['Time_0c72c7cd_217379cb4ae6f', 'Time_0c72c7cd_beta', 'Time_123freesolitaire_exe', 'Time_1cv8_exe', 'Time_2345explorer_exe']
Column count: 517
First 5 values: ['Days_0c72c7cd_217379cb4ae6f', 'Days_0c72c7cd_beta', 'Days_123freesolitaire_exe', 'Days_1cv8_exe', 'Days_2345explorer_exe']
features: 111

In [17]:
sorted(features)


Out[17]:
['AppCatCntNs',
 'AppCategoryNMinus1',
 'AppCategoryNMinus2',
 'AppCategoryNMinus3',
 'AppCategoryNMinus4',
 'AppCategoryNMinus5',
 'AppCategoryNMinus6',
 'AppCategoryNMinus7',
 'AppCategoryNMinus8',
 'BubbleShownTimeDay',
 'BubbleShownTimeHour',
 'BubbleShownTimeMinute',
 'BubbleShownTimeMonth',
 'BubbleShownTimeSecond',
 'BubbleShownTimeYear',
 'BubbleShownTimeYearMonthDay',
 'BubbleShownTimeYearMonthDayHour',
 'BubbleShownTimeYearMonthDayHourMinute',
 'BubbleShownTimeYearMonthDayHourMinuteSecond',
 'BubbleShownTime_UniqueUserDeviceKeyValueCount',
 'CampaignId',
 'CampaignId_UniqueUserDeviceKeyValueCount',
 'CntNs',
 'DefaultInternetBrowser',
 'DeviceTimezone',
 'DisplayLanguage',
 'FirstUpdatedDateDay',
 'FirstUpdatedDateHour',
 'FirstUpdatedDateMinute',
 'FirstUpdatedDateMonth',
 'FirstUpdatedDateSecond',
 'FirstUpdatedDateYear',
 'FirstUpdatedDateYearMonthDay',
 'FirstUpdatedDateYearMonthDayHour',
 'FirstUpdatedDateYearMonthDayHourMinute',
 'FirstUpdatedDateYearMonthDayHourMinuteSecond',
 'FrontFacingCameraResolution',
 'GamerPCClassification',
 'GamerSegment',
 'GamerSegment1',
 'HasFrontFacingCamera',
 'HasRearFacingCamera',
 'ISOCountryShortName',
 'ISOCountryShortName_UniqueUserDeviceKeyValueCount',
 'InternalBatteryNumberOfCharges',
 'InternalPrimaryDiagonalDisplaySizeInInches',
 'InternalPrimaryDisplayLogicalDPIX',
 'InternalPrimaryDisplayLogicalDPIY',
 'InternalPrimaryDisplayResolutionHorizontal',
 'InternalPrimaryDisplayResolutionVertical',
 'IsAlwaysOnAlwaysConnectedCapable',
 'IsCommercial',
 'IsDomainJoined',
 'IsEducation',
 'IsMalware',
 'IsPenCapable',
 'IsTouchEnabled',
 'IsVirtualDevice',
 'LifeTimeConversionCount',
 'LifeTimeImpressionCount',
 'NumberofExternalDisplays',
 'NumberofInternalDisplays',
 'OSOOBEDateTimeDay',
 'OSOOBEDateTimeHour',
 'OSOOBEDateTimeMinute',
 'OSOOBEDateTimeMonth',
 'OSOOBEDateTimeSecond',
 'OSOOBEDateTimeYear',
 'OSOOBEDateTimeYearMonthDay',
 'OSOOBEDateTimeYearMonthDayHour',
 'OSOOBEDateTimeYearMonthDayHourMinute',
 'OSOOBEDateTimeYearMonthDayHourMinuteSecond',
 'PrimaryDiskTotalCapacity',
 'ProcessorClockSpeed',
 'ProcessorCores',
 'ProcessorPhysicalCores',
 'RecentMonthConversionCount',
 'Time_Accessibility',
 'Time_Browser',
 'Time_Communications',
 'Time_Content',
 'Time_DevTools',
 'Time_Games',
 'Time_Games_Casual',
 'Time_Games_Core',
 'Time_Malware',
 'Time_Media',
 'Time_ModernApps',
 'Time_PersonalProductivity',
 'Time_Readers',
 'Time_Search',
 'Time_Social',
 'Time_StudentAndLearning',
 'Time_chrome_exe',
 'Time_lockapphost_exe',
 'Time_microsoft_windows_cortana',
 'Time_msascui_exe',
 'Time_windows_immersivecontrolpanel',
 'TotalDays',
 'TotalEngagementTimeInSec',
 'TotalPhysicalRAM',
 'UniqueUserDeviceKeyValueCount',
 'creativeName',
 'creativeNameCotainsBattery',
 'creativeNameCotainsMeet',
 'creativeNameCotainsSL',
 'creativeNameCotainsSkype',
 'creativeNameCotainsTS',
 'creativeNameCotainsWeek 7',
 'creativeNameCotainsWeek7',
 'creativeName_UniqueUserDeviceKeyValueCount']

In [12]:
train.head()


Out[12]:
RowNumber BubbleShownTime CampaignId creativeName UniqueUserDeviceKey LifeTimeImpressionCount LifeTimeConversionCount RecentMonthConversionCount IsVirtualDevice ISOCountryShortName ... creativeNameCotainsWeek 7 creativeNameCotainsMeet creativeNameCotainsSkype creativeNameCotainsBattery creativeNameTest UniqueUserDeviceKeyValueCount BubbleShownTime_UniqueUserDeviceKeyValueCount ISOCountryShortName_UniqueUserDeviceKeyValueCount CampaignId_UniqueUserDeviceKeyValueCount creativeName_UniqueUserDeviceKeyValueCount
8 9 1832587208742229165 3000000000265650 2785328259414446637 79974 22.0 0.0 0.0 0 3714697026978332222 ... 1 0 0 0 [SL] Edge - Switch - Chrome - Security - Week ... 1 1 1 1 1
19 20 2791887120101984283 111000000000270254 1232225600981039426 636220 28.0 0.0 0.0 0 3714697026978332222 ... 1 0 0 0 [SL] Edge - Switch - Chrome - Performance - We... 1 1 1 1 1
38 39 7341280477697335459 111000000000270254 1232225600981039426 704804 13.0 0.0 0.0 0 3714697026978332222 ... 1 0 0 0 [SL] Edge - Switch - Chrome - Performance - We... 1 1 1 1 1
49 50 3935258070067889319 36000000000261028 5179136934924494730 1316891 -1.0 -1.0 -1.0 0 8022880232484933194 ... 0 0 0 0 SL - Edge Web Notes-Non-Touch Devices - Re-signed 1 1 1 1 1
52 53 563808584753779797 3000000000265650 2785328259414446637 832614 4.0 0.0 0.0 0 2700715776491014538 ... 1 0 0 0 [SL] Edge - Switch - Chrome - Security - Week ... 2 1 2 1 1

5 rows × 1131 columns


In [13]:
'''
with Timer("Read test and feature engineering..."):
    # Read tsv file
    test = pd.read_csv('../input/CoinMlCompetitionSoftlandingEvaluateNoLabel.tsv', sep='\t', header = None)
    
    # Add header because test does not header
    test.columns = train.columns[0:1084]
    
    # Feature engineering - should not delete odd date rows
    test = feature_engineering(test, isDeleteOddDateRows=False)
'''


Out[13]:
'\nwith Timer("Read test and feature engineering..."):\n    # Read tsv file\n    test = pd.read_csv(\'../input/CoinMlCompetitionSoftlandingEvaluateNoLabel.tsv\', sep=\'\t\', header = None)\n    \n    # Add header because test does not header\n    test.columns = train.columns[0:1084]\n    \n    # Feature engineering - should not delete odd date rows\n    test = feature_engineering(test, isDeleteOddDateRows=False)\n'

In [14]:
'''
from sklearn.model_selection import train_test_split
with Timer('# train validation split'):
    #X_train, X_val, y_train, y_val = train_test_split(train[train.isUnderSampled == True][features], train_y[train.isUnderSampled == True], test_size=0.2, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(train[features], train_y, test_size=0.2, random_state=0)
    
    gc.collect()

    print(y_train.shape)
    print(X_train.shape)
    print(y_val.shape)
    print(X_val.shape)
    
    print(y_train.mean())
    print(y_val.mean())
    
    del train
    gc.collect()
'''


Out[14]:
"\nfrom sklearn.model_selection import train_test_split\nwith Timer('# train validation split'):\n    #X_train, X_val, y_train, y_val = train_test_split(train[train.isUnderSampled == True][features], train_y[train.isUnderSampled == True], test_size=0.2, random_state=0)\n    X_train, X_val, y_train, y_val = train_test_split(train[features], train_y, test_size=0.2, random_state=0)\n    \n    gc.collect()\n\n    print(y_train.shape)\n    print(X_train.shape)\n    print(y_val.shape)\n    print(X_val.shape)\n    \n    print(y_train.mean())\n    print(y_val.mean())\n    \n    del train\n    gc.collect()\n"

In [18]:
import lightgbm as lgb
#train_data = lgb.Dataset(X_train[X_train.isUnderSampled == True][features], label=X_train[X_train.isUnderSampled == True]['HasClicked'].values)

#train_data = lgb.Dataset(X_train[features], label=y_train)
#val_data = lgb.Dataset(X_val[features], y_val)

# use train holdout directly with t f ratio
train_data = lgb.Dataset(train[features], label=train_y)
val_data = lgb.Dataset(holdout[features], y_holdout)

#print(X_train[features].shape)
#print(X_val[features].shape)

print(train[features].shape)
print(holdout[features].shape)


(78457, 111)
(673595, 111)

In [27]:
with Timer("Trainig..."):

    random.seed(2007)

    params = {
        'task' : 'train',
        'boosting_type' : 'dart', #'gbdt'
        'objective' : 'binary',
        'metric' : 'auc', # 'binary_logloss'
        'is_training_metric': True,
        'max_bin': 255,
        'num_leaves' : 64,
        'learning_rate' : 0.02, # 0.05, #0.1,
        'feature_fraction' : 0.82,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'min_data_in_leaf': 10,
        'min_sum_hessian_in_leaf': 5,
        #'scale_pos_weight': 0.5,
        #'is_unbalance': True
        # 'num_threads': 16,
    }
    num_round = 10000


    bst = lgb.train(params, train_data, num_round, valid_sets=val_data, early_stopping_rounds=10)

    #val_preds = bst.predict(X_val[features], num_iteration=bst.best_iteration)
    holdout_preds = bst.predict(holdout[features], num_iteration=bst.best_iteration)
    #test_preds = bst.predict(test[features], num_iteration=bst.best_iteration)

    #0.7019 

    # Including all high level and ymd and ymdh
    # [297]	valid_0's auc:0.67564 F1 score: 0.096338028169, best thr: 0.325385385385, Click mean: 0.0343981839588

    # without ymd; f1 score not improved, so keep this
    # [201]	valid_0's auc:0.67772 F1 score: 0.0966780126125, best thr: 0.306746746747, Click mean: 0.0379598932823

    # With uniqueUserDeviceKey valueCount
    # [368]	valid_0's auc:0.664831 F1 score: 0.06x ???

    # Value counts
    # [525]	valid_0's auc:0.686445 f1 score: 0.104380886546 thr: 0.325875875876 Click mean: 0.0332386612486 (gain: 0.04)

    # Count UniqueUserDeviceKey
    # [505]	valid_0's auc:0.706443 f1 score: 0.128913201081 thr: 0.371491491491 Click mean: 0.0267462248702 (gain:0.024)

    # Count CampaignId
    # [544]	valid_0's auc:0.707357 f1 score: 0.13101569594 thr: 0.363643643644 Click mean: 0.0274719972684 (gain: 0.002)

    # Remove all time and days
    # [392]	valid_0's auc:0.703582 f1 score: 0.123669773283 thr: 0.378358358358 Click mean: 0.0266139148895

    # Include imp time features
    # [418]	valid_0's auc:0.706095 f1 score: 0.126989843694 thr: 0.386206206206 Click mean: 0.0229143624878 (loss: 0.004)

    # Group by CampaignId and day
    # [424]	valid_0's auc:0.705353


Trainig......
2017-05-23 00:00:44.164208
[1]	valid_0's auc: 0.672335
Train until valid scores didn't improve in 10 rounds.
[2]	valid_0's auc: 0.679391
[3]	valid_0's auc: 0.681683
[4]	valid_0's auc: 0.701499
[5]	valid_0's auc: 0.700029
[6]	valid_0's auc: 0.698579
[7]	valid_0's auc: 0.702342
[8]	valid_0's auc: 0.703215
[9]	valid_0's auc: 0.703751
[10]	valid_0's auc: 0.708059
[11]	valid_0's auc: 0.708368
[12]	valid_0's auc: 0.708683
[13]	valid_0's auc: 0.708032
[14]	valid_0's auc: 0.707205
[15]	valid_0's auc: 0.706577
[16]	valid_0's auc: 0.708681
[17]	valid_0's auc: 0.709849
[18]	valid_0's auc: 0.710854
[19]	valid_0's auc: 0.711635
[20]	valid_0's auc: 0.71198
[21]	valid_0's auc: 0.711956
[22]	valid_0's auc: 0.71212
[23]	valid_0's auc: 0.712069
[24]	valid_0's auc: 0.712328
[25]	valid_0's auc: 0.712436
[26]	valid_0's auc: 0.712748
[27]	valid_0's auc: 0.712856
[28]	valid_0's auc: 0.712968
[29]	valid_0's auc: 0.71304
[30]	valid_0's auc: 0.713065
[31]	valid_0's auc: 0.712903
[32]	valid_0's auc: 0.712817
[33]	valid_0's auc: 0.712936
[34]	valid_0's auc: 0.712752
[35]	valid_0's auc: 0.712735
[36]	valid_0's auc: 0.71283
[37]	valid_0's auc: 0.712864
[38]	valid_0's auc: 0.713257
[39]	valid_0's auc: 0.713517
[40]	valid_0's auc: 0.71358
[41]	valid_0's auc: 0.713543
[42]	valid_0's auc: 0.713804
[43]	valid_0's auc: 0.713841
[44]	valid_0's auc: 0.713831
[45]	valid_0's auc: 0.713941
[46]	valid_0's auc: 0.714014
[47]	valid_0's auc: 0.713964
[48]	valid_0's auc: 0.714163
[49]	valid_0's auc: 0.714191
[50]	valid_0's auc: 0.714239
[51]	valid_0's auc: 0.713889
[52]	valid_0's auc: 0.71399
[53]	valid_0's auc: 0.714026
[54]	valid_0's auc: 0.714121
[55]	valid_0's auc: 0.714467
[56]	valid_0's auc: 0.714475
[57]	valid_0's auc: 0.71463
[58]	valid_0's auc: 0.71465
[59]	valid_0's auc: 0.714701
[60]	valid_0's auc: 0.714714
[61]	valid_0's auc: 0.71467
[62]	valid_0's auc: 0.71499
[63]	valid_0's auc: 0.714775
[64]	valid_0's auc: 0.714758
[65]	valid_0's auc: 0.714746
[66]	valid_0's auc: 0.714511
[67]	valid_0's auc: 0.714654
[68]	valid_0's auc: 0.714769
[69]	valid_0's auc: 0.714727
[70]	valid_0's auc: 0.714728
[71]	valid_0's auc: 0.714864
[72]	valid_0's auc: 0.715034
[73]	valid_0's auc: 0.715051
[74]	valid_0's auc: 0.715061
[75]	valid_0's auc: 0.715092
[76]	valid_0's auc: 0.715114
[77]	valid_0's auc: 0.715075
[78]	valid_0's auc: 0.715081
[79]	valid_0's auc: 0.715017
[80]	valid_0's auc: 0.715152
[81]	valid_0's auc: 0.715138
[82]	valid_0's auc: 0.715342
[83]	valid_0's auc: 0.715361
[84]	valid_0's auc: 0.715372
[85]	valid_0's auc: 0.715387
[86]	valid_0's auc: 0.7154
[87]	valid_0's auc: 0.715248
[88]	valid_0's auc: 0.71528
[89]	valid_0's auc: 0.715297
[90]	valid_0's auc: 0.715339
[91]	valid_0's auc: 0.715357
[92]	valid_0's auc: 0.715412
[93]	valid_0's auc: 0.715383
[94]	valid_0's auc: 0.715386
[95]	valid_0's auc: 0.715379
[96]	valid_0's auc: 0.715389
[97]	valid_0's auc: 0.715321
[98]	valid_0's auc: 0.715353
[99]	valid_0's auc: 0.715311
[100]	valid_0's auc: 0.715314
[101]	valid_0's auc: 0.715331
[102]	valid_0's auc: 0.715315
Early stopping, best iteration is:
[92]	valid_0's auc: 0.715412
Trainig...: cpu 22.63, time 22.64


In [28]:
#val_best_threshold = f1_best_threshold(y_val, val_preds)
ho_best_threshold = f1_best_threshold(y_holdout, holdout_preds)


E:\Users\hyunor\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
f1 score: 0.247659929649
best threshold: 0.451441441441
TF pred mean: 0.00659446700168

In [17]:
#val_best_threshold = f1_best_threshold(y_val, val_preds)
ho_best_threshold = f1_best_threshold(y_holdout, holdout_preds)


f1 score: 0.239953065415
best threshold: 0.613303303303
TF pred mean: 0.0069982704741

In [ ]:
bst.feature_importance()

In [ ]:
#feature_list = X_val[features].columns.values
feature_list = holdout[features].columns.values
df_fi = pd.DataFrame(bst.feature_importance(), columns=['importance'])
df_fi['feature'] = feature_list
df_fi = df_fi.sort_values('importance', ascending = 0)
df_fi[df_fi.importance >= 10]

In [ ]:
df_fi = containColumn(df_fi, 'feature', 'Time_')

In [ ]:
df_fi[(df_fi.featureCotainsTime_==True) & (df_fi.importance > 50)]

In [ ]:
zeroImportance = df_fi[df_fi.importance == 0]['feature'].values
print(len(zeroImportance))

In [ ]:
# To do #
# Feature reduction by (1) removing zero imp or (2) under 10

In [ ]:
# Overll time complete
end_time = datetime.now()

total_time = end_time - start_time
print("Time to complete:", total_time)

In [ ]: