In [2]:
# Imports, timer, and helper functions
import math
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc # We're gonna be clearing memory a lot
import matplotlib.pyplot as plt
import seaborn as sns
import random
#from ml_metrics import mapk
from datetime import datetime
import re
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import ensemble
from sklearn import model_selection
from sklearn.metrics import matthews_corrcoef, f1_score, classification_report, confusion_matrix, precision_score, recall_score
%matplotlib inline
# Timer
class Timer:
    def __init__(self, text=None):
        self.text = text

    def __enter__(self):
        self.cpu = time.process_time()  # CPU time
        self.time = time.time()         # wall-clock time
        if self.text:
            print("{}...".format(self.text))
            print(datetime.now())
        return self

    def __exit__(self, *args):
        self.cpu = time.process_time() - self.cpu
        self.time = time.time() - self.time
        if self.text:
            print("%s: cpu %0.2f, time %0.2f\n" % (self.text, self.cpu, self.time))
# Split into train and holdout sets with given counts
def sample_train_holdout(_df, sample_count, holdout_count):
    random.seed(7)
    sample_RowNumber = random.sample(list(_df['RowNumber']), sample_count + holdout_count)
    train_RowNumber = random.sample(sample_RowNumber, sample_count)
    holdout_RowNumber = list(set(sample_RowNumber) - set(train_RowNumber))
    holdout = _df[_df['RowNumber'].isin(holdout_RowNumber)].copy()
    _df = _df[_df['RowNumber'].isin(train_RowNumber)]
    return _df, holdout
# Sampling for train and holdout with imbalanced binary label
def trainHoldoutSampling(_df, _id, _label, _seed=7, t_tr=0.5, t_ho=0.5, f_tr=0.05, f_ho=0.5):
    random.seed(_seed)
    positive_id = list(_df[_df[_label] == True][_id].values)
    negative_id = list(_df[_df[_label] == False][_id].values)
    train_positive_id = random.sample(positive_id, int(len(positive_id) * t_tr))
    holdout_positive_id = random.sample(list(set(positive_id) - set(train_positive_id)), int(len(positive_id) * t_ho))
    train_negative_id = random.sample(negative_id, int(len(negative_id) * f_tr))
    holdout_negative_id = random.sample(list(set(negative_id) - set(train_negative_id)), int(len(negative_id) * f_ho))
    train_id = list(set(train_positive_id) | set(train_negative_id))
    holdout_id = list(set(holdout_positive_id) | set(holdout_negative_id))
    print('train count: {}, train positive count: {}'.format(len(train_id), len(train_positive_id)))
    print('holdout count: {}, holdout positive count: {}'.format(len(holdout_id), len(holdout_positive_id)))
    return _df[_df[_id].isin(train_id)], _df[_df[_id].isin(holdout_id)]
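# Hedged usage sketch of trainHoldoutSampling on a toy frame ('Id' and 'Flag'
# are made-up names): keep 50%/50% of positives and 5%/50% of negatives.
_toy = pd.DataFrame({'Id': list(range(100)), 'Flag': [True] * 20 + [False] * 80})
_tr_demo, _ho_demo = trainHoldoutSampling(_toy, 'Id', 'Flag', _seed=7,
                                          t_tr=0.5, t_ho=0.5, f_tr=0.05, f_ho=0.5)
del _toy, _tr_demo, _ho_demo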
def datetime_features2(_df, _col):
    _format = '%m/%d/%Y %I:%M:%S %p'
    _df[_col] = _df[_col].apply(lambda x: datetime.strptime(x, _format))
    colYear = _col + 'Year'
    colMonth = _col + 'Month'
    colDay = _col + 'Day'
    colHour = _col + 'Hour'
    colMinute = _col + 'Minute'
    colSecond = _col + 'Second'
    colYearMonthDay = _col + 'YearMonthDay'
    colYearMonthDayHour = _col + 'YearMonthDayHour'
    colYearMonthDayHourMinute = _col + 'YearMonthDayHourMinute'
    colYearMonthDayHourMinuteSecond = _col + 'YearMonthDayHourMinuteSecond'
    _df[colYear] = _df[_col].apply(lambda x: x.year)
    _df[colMonth] = _df[_col].apply(lambda x: x.month)
    _df[colDay] = _df[_col].apply(lambda x: x.day)
    _df[colHour] = _df[_col].apply(lambda x: x.hour)
    _df[colMinute] = _df[_col].apply(lambda x: x.minute)
    _df[colSecond] = _df[_col].apply(lambda x: x.second)
    ymd = [colYear, colMonth, colDay]
    ymdh = [colYear, colMonth, colDay, colHour]
    ymdhm = [colYear, colMonth, colDay, colHour, colMinute]
    ymdhms = [colYear, colMonth, colDay, colHour, colMinute, colSecond]
    # join each row's component values with '_' (map(str, x) stringifies each component)
    _df[colYearMonthDay] = _df[ymd].apply(lambda x: '_'.join(map(str, x)), axis=1)
    _df[colYearMonthDayHour] = _df[ymdh].apply(lambda x: '_'.join(map(str, x)), axis=1)
    _df[colYearMonthDayHourMinute] = _df[ymdhm].apply(lambda x: '_'.join(map(str, x)), axis=1)
    _df[colYearMonthDayHourMinuteSecond] = _df[ymdhms].apply(lambda x: '_'.join(map(str, x)), axis=1)
    return _df
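# A minimal usage sketch of datetime_features2 (assumes the '%m/%d/%Y %I:%M:%S %p'
# format above; the 'When' column is made up for illustration):
_demo = pd.DataFrame({'When': ['1/1/2017 11:11:11 AM', '2/3/2017 01:02:03 PM']})
_demo = datetime_features2(_demo, 'When')
del _demo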
# Change date column to datetime type and add date/time features
def datetime_features(_df, _col, isDelete=False):
    # 1. For years greater than 2017, extract the year with a regex and change it to 2017 in the datetime column
    # find and return the first 4-digit number in the string column
    year_col = _col + 'Year'
    _df[year_col] = _df[_col].apply(lambda x: int(re.findall(r"\D(\d{4})\D", " " + str(x) + " ")[0]))
    years = sorted(list(_df[year_col].unique()))
    yearsGreaterThan2017 = sorted(i for i in years if i > 2017)
    # Two ways to handle odd year data: (1) change it to 2017 temporarily, or (2) remove it from the data.
    # We go with (1) because we cannot remove test rows anyway.
    if isDelete:
        _df = _df[~_df[year_col].isin(yearsGreaterThan2017)]
    else:
        for i in yearsGreaterThan2017:
            print("replace ", i, " to 2017 for conversion")
            _df.loc[_df[year_col] == i, _col] = _df[_df[year_col] == i][_col].values[0].replace(str(i), "2017")
    # How to remove odd year rows:
    # train = train[~train['year'].isin(yearsGreaterThan2017)]
    # 2. Convert string to datetime
    _df[_col] = pd.to_datetime(_df[_col])
    print(_col, "column conversion to datetime type is done")
    # 3. Add more date/time features
    month_col = _col + 'Month'
    week_col = _col + 'Week'
    weekday_col = _col + 'Weekday'
    day_col = _col + 'Day'
    hour_col = _col + 'Hour'
    minute_col = _col + 'Minute'
    second_col = _col + 'Second'
    #year_month_day_col = _col + 'YearMonthDay'
    #year_month_day_hour_col = _col + 'YearMonthDayHour'
    _df[month_col] = pd.DatetimeIndex(_df[_col]).month
    _df[week_col] = pd.DatetimeIndex(_df[_col]).week
    _df[weekday_col] = pd.DatetimeIndex(_df[_col]).weekday
    _df[day_col] = pd.DatetimeIndex(_df[_col]).day
    _df[hour_col] = pd.DatetimeIndex(_df[_col]).hour
    _df[minute_col] = pd.DatetimeIndex(_df[_col]).minute
    _df[second_col] = pd.DatetimeIndex(_df[_col]).second
    #_df[year_month_day_col] = _df[[year_col, month_col, day_col]].apply(lambda x: ''.join(map(str, x)), axis=1)
    #_df[year_month_day_hour_col] = _df[[year_col, month_col, day_col, hour_col]].apply(lambda x: ''.join(map(str, x)), axis=1)
    print("year, month, week, weekday, day, hour, minute, second features are added")
    return _df
# Delete rows with list condition for dataframe
def delRows(_df, _col, _list):
    _df = _df[~_df[_col].isin(_list)]
    return _df
# Create a new column by extracting the first 4-digit number with a regex
def addFeatureRegex(_df, _col, _newCol):
    _df[_newCol] = _df[_col].apply(lambda x: int(re.findall(r"\D(\d{4})\D", " " + str(x) + " ")[0]))
    return _df
# Convert string to datetime type
def stringToDatetime(_df, _col):
    _df[_col] = _df[_col].astype('datetime64[ns]')
    return _df
# Add features from datetime
def addDatetimeFeatures(_df, _col):
    _df[_col + 'Year'] = pd.DatetimeIndex(_df[_col]).year
    _df[_col + 'Month'] = pd.DatetimeIndex(_df[_col]).month
    _df[_col + 'Week'] = pd.DatetimeIndex(_df[_col]).week
    _df[_col + 'Weekday'] = pd.DatetimeIndex(_df[_col]).weekday
    _df[_col + 'Day'] = pd.DatetimeIndex(_df[_col]).day
    _df[_col + 'Hour'] = pd.DatetimeIndex(_df[_col]).hour
    return _df
# Get categorical column names
def categoricalColumns(_df):
    cat_columns = _df.select_dtypes(['object']).columns
    print("Categorical column count:", len(cat_columns))
    print("First 5 values:", cat_columns[:5])
    return cat_columns
# Get column names starting with
def columnsStartingWith(_df, _str):
    sorted_list = sorted(i for i in list(_df) if i.startswith(_str))
    print("Column count:", len(sorted_list))
    print("First 5 values:", sorted_list[:5])
    return sorted_list
# Get column names ending with
def columnsEndingWith(_df, _str):
    sorted_list = sorted(i for i in list(_df) if i.endswith(_str))
    print("Column count:", len(sorted_list))
    print("First 5 values:", sorted_list[:5])
    return sorted_list
# Get constant columns
def constantColumns(_df):
    constant_list = []
    cols = list(_df)  # same as _df.columns.values
    for col in cols:
        if len(_df[col].unique()) == 1:
            constant_list.append(col)
    print("Constant column count:", len(constant_list))
    print("First 5 values:", constant_list[:5])
    return constant_list
# Add null indicator columns
def makeNullColumns(_df, _cols):
    null_df = _df[_cols].isnull()
    null_df.columns = null_df.columns + 'Null'
    _df = pd.concat([_df, null_df], axis=1)
    return _df
# Union
def union(a, b):
    return list(set(a) | set(b))

def unique(a):
    return list(set(a))
# Undersampling - sample rate 0.8 keeps ~80% of negatives, flagged via the isUnderSampled column
def underSampling(_df, _sample_rate):
    _df['isUnderSampled'] = 1
    _rand_num = int(round(1 / (1 - _sample_rate)))
    underSample = np.random.randint(_rand_num, size=len(_df[_df['HasClicked'] == 0]))
    _df.loc[_df['HasClicked'] == 0, 'isUnderSampled'] = underSample > 0
    return _df
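# Note on the randint trick above: randint(k) is 0 with probability 1/k, so the
# '> 0' mask keeps roughly (k - 1) / k = _sample_rate of the negative rows.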
# Add column with value count
def valueCountColumn(_df, _col):
    _dict = dict([(i, a) for i, a in zip(_df[_col].value_counts().index, _df[_col].value_counts().values)])
    _df[_col + 'ValueCount'] = _df[_col].apply(lambda x: _dict[x])
    return _df
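# For reference, an equivalent idiomatic pandas one-liner (not used here):
# _df[_col + 'ValueCount'] = _df[_col].map(_df[_col].value_counts())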
# Add a boolean column indicating whether the keyword is contained
def containColumn(_df, _col, _str):
    _df[_col + 'Contains' + _str] = _df[_col].str.contains(_str)
    return _df
# Feature engineering
def feature_engineering(_df):
    print("shape:", _df.shape)
    print("Add datetime features...")
    datetime_columns = ['BubbleShownTime', 'FirstUpdatedDate', 'OSOOBEDateTime']
    for col in datetime_columns:
        print(col)
        if _df[col].isnull().sum() > 0:
            _df[col] = _df[col].fillna('1/1/2017 11:11:11 AM')
        _df = datetime_features2(_df, col)
    print("shape:", _df.shape)
    gc.collect()
    # Null count
    print("Missing value count...")
    _df['CntNs'] = _df.isnull().sum(axis=1)
    cols = ['AppCategoryNMinus1', 'AppCategoryNMinus2', 'AppCategoryNMinus3', 'AppCategoryNMinus4', 'AppCategoryNMinus5',
            'AppCategoryNMinus6', 'AppCategoryNMinus7', 'AppCategoryNMinus8']
    _df['AppCatCntNs'] = _df[cols].isnull().sum(axis=1)
    #_df[cols] = _df[cols].fillna("NA")
    #for col in cols:
    #    print(col)
    #    _df[col+'HighLevel'] = _df[col].apply(lambda x: str(x).split(':')[0])
    # Parse gamer segment on '.'
    # to-do: add the 2nd and 3rd parsed values as features later; some exception handling is needed
    print("Gamer segment parsing...")
    _df['GamerSegment1'] = _df['GamerSegment'].apply(lambda x: str(x).split('.')[0] if str(x).split('.') else 'Unknown')
    # Check whether creativeName contains each keyword
    keywords = ['SL', 'TS', 'Week7', 'Week 7', 'Meet', 'Skype', 'Battery']
    for keyword in keywords:
        _df = containColumn(_df, 'creativeName', keyword)
    #_df['week7'] = _df['Week7'].values + _df['Week 7'].values
    #_df.drop(['Week7', 'Week 7'], axis=1, inplace=True)
    # Convert categorical columns to numeric
    print("Convert categorical columns to numeric...")
    cat_columns = _df.select_dtypes(['object']).columns
    for cat_column in cat_columns:
        print(cat_column)
        if cat_column == 'creativeName':
            _df['creativeNameTest'] = _df['creativeName'].values
        _df[cat_column] = _df[cat_column].apply(lambda x: abs(hash(x)))
    gc.collect()
    # Replace missing values with -1
    print("Replace missing values with -1")
    _df = _df.fillna(-1)
    return _df
# Get the best threshold value for F1 score
def f1_best_threshold(_actual, _pred):
    thresholds = np.linspace(0.01, 0.99, 1000)
    fc = np.array([f1_score(_actual, _pred > thr) for thr in thresholds])
    plt.plot(thresholds, fc)
    best_threshold = thresholds[fc.argmax()]
    print('f1 score:', fc.max())
    print('best threshold:', best_threshold)
    print('TF pred mean:', (_pred > best_threshold).mean())
    return best_threshold
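# Hedged self-check of f1_best_threshold on synthetic labels and scores (kept
# commented out because it draws a plot; all values below are made up):
#_y = np.random.binomial(1, 0.1, 1000)
#_p = np.clip(_y * 0.4 + np.random.rand(1000) * 0.6, 0, 1)
#f1_best_threshold(_y, _p)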
In [3]:
start_time = datetime.now()
In [4]:
with Timer("Read train data..."):
train = pd.read_csv('../input/CoinMlCompetitionSoftlandingTrainWithHeader.tsv', sep='\t') # (1347190, 1085)
print(train.shape)
print("Train and holdout sampling...")
seed = 1
train, holdout = trainHoldoutSampling(train, 'RowNumber', 'HasClicked', seed, t_tr=0.5, t_ho=0.5, f_tr=0.05, f_ho=0.5)
gc.collect()
# 100% t 10% f for the final
In [5]:
# Train feature engineering
with Timer("Train feature engineering..."):
    #train = feature_engineering(train, isDeleteOddDateRows=True)
    train = feature_engineering(train)
    train_y = train['HasClicked'].values
    print("train y mean:", train_y.mean())
In [6]:
# Holdout feature engineering
with Timer("Holdout feature engineering..."):
    #holdout = feature_engineering(holdout, isDeleteOddDateRows=True)
    holdout = feature_engineering(holdout)
    y_holdout = holdout['HasClicked'].values
    print("holdout y mean:", y_holdout.mean())
In [12]:
# New feature engineering and validation
isNewFeatureOn = True
if isNewFeatureOn:
    # Add a column with each value's count and return the count dict
    def valueCountColumnDict(_df, _col):
        _dict = dict([(i, a) for i, a in zip(_df[_col].value_counts().index, _df[_col].value_counts().values)])
        _df[_col + 'ValueCount'] = _df[_col].apply(lambda x: _dict[x])
        return _dict
    # Add column with bool values to check if keyword is contained or not
    #def containColumn(_df, _col, _str):
    #    _df[_col + 'Contains' + _str] = _df[_col].str.contains(_str)
    #    return _df
    #for keyword in ['SL']:
    #    train = containColumn(train, 'creativeNameTest', keyword)
    #    holdout = containColumn(holdout, 'creativeNameTest', keyword)
    from collections import Counter
    # Add value count columns computed over the train and holdout sets together
    def valueCountTrainHoldout(_tr, _ho, _col):
        tr_dict = valueCountColumnDict(_tr, _col)
        ho_dict = valueCountColumnDict(_ho, _col)
        _dict = Counter(tr_dict) + Counter(ho_dict)
        _tr[_col + 'ValueCount'] = _tr[_col].apply(lambda x: _dict[x])
        _ho[_col + 'ValueCount'] = _ho[_col].apply(lambda x: _dict[x])
        return _tr, _ho
    #val_count_cols = ['UniqueUserDeviceKey', 'CampaignId', 'ISOCountryShortName', 'DeviceTimezone',
    #                  'DisplayLanguage', 'GamerSegment']
    val_count_cols = ['UniqueUserDeviceKey']
    for col in val_count_cols:
        print(col)
        train, holdout = valueCountTrainHoldout(train, holdout, col)
    # collections.Counter test
    #from collections import Counter
    #A = Counter({'a':1, 'b':2, 'c':3})
    #B = Counter({'b':3, 'c':4, 'd':5})
    #A + B
    #Counter({'c': 7, 'b': 5, 'd': 5, 'a': 1})
    # uudk count check
    #print(tr_uudk_dict[832614])
    #print(ho_uudk_dict[832614])
    #print(len(tr_uudk_dict))
    #print(len(ho_uudk_dict))
    #print(len(uudk_dict))
    #print(uudk_dict[832614])
    # uudk mean values - should be almost the same across train and holdout
    print(train.UniqueUserDeviceKeyValueCount.mean())
    print(holdout.UniqueUserDeviceKeyValueCount.mean())
    #print(train.CampaignIdValueCount.mean())
    #print(holdout.CampaignIdValueCount.mean())
In [ ]:
isNewFeatureOn = False
if isNewFeatureOn:
    # ymd and ymdh features
    #bs_cols = ['BubbleShownTimeYear', 'BubbleShownTimeMonth', 'BubbleShownTimeDay']
    #train[bs_cols]
    #train['bs_ymd'] = train[bs_cols].apply(lambda x: "_".join(x), axis=1)
    #train['ymd'] = train['BubbleShownTimeYear'].astype(str) + train['BubbleShownTimeMonth'].astype(str) \
    #               + train['BubbleShownTimeDay'].astype(str)
    train['ymd'] = train[['BubbleShownTimeYear', 'BubbleShownTimeMonth',
                          'BubbleShownTimeDay']].apply(lambda x: '{}-{}-{}'.format(x[0], x[1], x[2]), axis=1)
    train['ymdh'] = train[['BubbleShownTimeYear', 'BubbleShownTimeMonth', 'BubbleShownTimeDay',
                           'BubbleShownTimeHour']].apply(lambda x: '{}-{}-{}-{}'.format(x[0], x[1], x[2], x[3]), axis=1)
    holdout['ymd'] = holdout[['BubbleShownTimeYear', 'BubbleShownTimeMonth',
                              'BubbleShownTimeDay']].apply(lambda x: '{}-{}-{}'.format(x[0], x[1], x[2]), axis=1)
    holdout['ymdh'] = holdout[['BubbleShownTimeYear', 'BubbleShownTimeMonth', 'BubbleShownTimeDay',
                               'BubbleShownTimeHour']].apply(lambda x: '{}-{}-{}-{}'.format(x[0], x[1], x[2], x[3]), axis=1)
In [13]:
isNewFeatureOn = True
if isNewFeatureOn:
    def countGroupByDict(_df, str1, str2):
        new_col = str1 + "_" + str2
        _df[new_col] = _df[[str1, str2]].apply(lambda x: '{}-{}'.format(x[0], x[1]), axis=1)
        _dict = valueCountColumnDict(_df, new_col)
        #_df.drop(new_col, axis=1, inplace=True)
        print("New value count column is added:", new_col)
        return _dict
    # group by country and campaign id
    def countGroupBy(_df, str1, str2):
        new_col = str1 + "_" + str2
        _df[new_col] = _df[[str1, str2]].apply(lambda x: '{}-{}'.format(x[0], x[1]), axis=1)
        _df = valueCountColumn(_df, new_col)
        _df.drop(new_col, axis=1, inplace=True)
        print("New value count column is added:", new_col)
        return _df
    #train = countGroupBy(train, 'BubbleShownTimeYearMonthDayHourMinuteSecond', 'UniqueUserDeviceKey')
    #holdout = countGroupBy(holdout, 'BubbleShownTimeYearMonthDayHourMinuteSecond', 'UniqueUserDeviceKey')
    def trainHoldoutCountGroupBy(_tr, _ho, col1, col2):
        tr_dict = countGroupByDict(_tr, col1, col2)
        ho_dict = countGroupByDict(_ho, col1, col2)
        _dict = Counter(tr_dict) + Counter(ho_dict)
        _tr[col1 + '_' + col2 + 'ValueCount'] = _tr[col1 + '_' + col2].apply(lambda x: _dict[x])
        _ho[col1 + '_' + col2 + 'ValueCount'] = _ho[col1 + '_' + col2].apply(lambda x: _dict[x])
        _tr.drop(col1 + '_' + col2, axis=1, inplace=True)
        _ho.drop(col1 + '_' + col2, axis=1, inplace=True)
        return _tr, _ho
    cols = ['BubbleShownTime', 'ISOCountryShortName', 'CampaignId', 'creativeName']
    for col in cols:
        train, holdout = trainHoldoutCountGroupBy(train, holdout, col, 'UniqueUserDeviceKey')
    '''
    train_dict = countGroupByDict(train, 'BubbleShownTime', 'UniqueUserDeviceKey')
    holdout_dict = countGroupByDict(holdout, 'BubbleShownTime', 'UniqueUserDeviceKey')
    _dict = Counter(train_dict) + Counter(holdout_dict)
    train['BubbleShownTime_UniqueUserDeviceKeyValueCount'] = train['BubbleShownTime_UniqueUserDeviceKey'].apply(lambda x: _dict[x])
    holdout['BubbleShownTime_UniqueUserDeviceKeyValueCount'] = holdout['BubbleShownTime_UniqueUserDeviceKey'].apply(lambda x: _dict[x])
    train_dict = countGroupByDict(train, 'ISOCountryShortName', 'UniqueUserDeviceKey')
    holdout_dict = countGroupByDict(holdout, 'ISOCountryShortName', 'UniqueUserDeviceKey')
    _dict = Counter(train_dict) + Counter(holdout_dict)
    train['ISOCountryShortName_UniqueUserDeviceKeyValueCount'] = train['ISOCountryShortName_UniqueUserDeviceKey'].apply(lambda x: _dict[x])
    holdout['ISOCountryShortName_UniqueUserDeviceKeyValueCount'] = holdout['ISOCountryShortName_UniqueUserDeviceKey'].apply(lambda x: _dict[x])
    train_dict = countGroupByDict(train, 'CampaignId', 'UniqueUserDeviceKey')
    holdout_dict = countGroupByDict(holdout, 'CampaignId', 'UniqueUserDeviceKey')
    _dict = Counter(train_dict) + Counter(holdout_dict)
    train['CampaignId_UniqueUserDeviceKeyValueCount'] = train['CampaignId_UniqueUserDeviceKey'].apply(lambda x: _dict[x])
    holdout['CampaignId_UniqueUserDeviceKeyValueCount'] = holdout['CampaignId_UniqueUserDeviceKey'].apply(lambda x: _dict[x])
    train_dict = countGroupByDict(train, 'ISOCountryShortName', 'UniqueUserDeviceKey')
    holdout_dict = countGroupByDict(holdout, 'ISOCountryShortName', 'UniqueUserDeviceKey')
    _dict = Counter(train_dict) + Counter(holdout_dict)
    train['ISOCountryShortName_UniqueUserDeviceKeyValueCount'] = train['ISOCountryShortName_UniqueUserDeviceKey'].apply(lambda x: _dict[x])
    holdout['ISOCountryShortName_UniqueUserDeviceKeyValueCount'] = holdout['ISOCountryShortName_UniqueUserDeviceKey'].apply(lambda x: _dict[x])
    '''
    #train = countGroupBy(train, 'ISOCountryShortName', 'UniqueUserDeviceKey')
    #holdout = countGroupBy(holdout, 'ISOCountryShortName', 'UniqueUserDeviceKey')
    #train = countGroupBy(train, 'ISOCountryShortNameValueCount', 'CampaignId')
    #holdout = countGroupBy(holdout, 'ISOCountryShortNameValueCount', 'CampaignId')
    #train[bs_cols]
    #print(train['ymdhm'])
    #print(train['ymdhms'])
Out[13]:
In [14]:
# Debug prints from the inline version above; _dict, train_dict and holdout_dict
# are local to the helper functions, so these would raise a NameError here.
#print(len(_dict))
#print(len(train_dict))
#print(len(holdout_dict))
In [15]:
#train.head()
print(train.BubbleShownTime_UniqueUserDeviceKeyValueCount.mean())
print(train.ISOCountryShortName_UniqueUserDeviceKeyValueCount.mean())
print(holdout.BubbleShownTime_UniqueUserDeviceKeyValueCount.mean())
print(holdout.ISOCountryShortName_UniqueUserDeviceKeyValueCount.mean())
In [16]:
# Get column groups and features
all_columns = train.columns
print("All columns:", len(all_columns))
# Remove constant columns for train (all included in time_ and days_ columns)
print("features without constant columns")
constant_columns = constantColumns(train)
features = list(set(all_columns) - set(constant_columns))
print("features:", len(features))
# These columns have many nulls; exclude the Time_ and Days_ columns first and add important ones back later
print("features without Time_ and Days_ columns")
time_columns = columnsStartingWith(train, 'Time_')
days_columns = columnsStartingWith(train, 'Days_')
features = list(set(features) - set(time_columns))
features = list(set(features) - set(days_columns))
# Drop features
drop_features = ['HasClicked', 'RowNumber', 'BubbleShownTime', 'FirstUpdatedDate', 'OSOOBEDateTime',
'creativeNameTest', 'ymd', 'ymdh', 'UniqueUserDeviceKey', 'ymd_UniqueUserDeviceKeyValueCount',
'ymdh_ISOCountryShortNameValueCount', 'ymd_CampaignIdValueCountValueCount',
'ymdh_CampaignIdValueCountValueCount', 'ISOCountryShortNameValueCount_CampaignIdValueCount',
'BubbleShownTime_UniqueUserDeviceKey', 'BubbleShownTime_UniqueUserDeviceKeyValueCountValueCount',
'ISOCountryShortName_UniqueUserDeviceKey']
features = list(set(features) - set(drop_features))
# Drop features
#HighLevel_features = columnsEndingWith(train, 'HighLevel')
#features = list(set(features) - set(HighLevel_features))
# Drop features
#UniqueUserDeviceKey_features = ['UniqueUserDeviceKeyValueCount', 'CampaignIdValueCount', 'creativeNameValueCount']
#features = list(set(features) - set(UniqueUserDeviceKey_features))
# Feature importance > 50
imp_time_features = ['Time_Accessibility', 'Time_Browser', 'Time_Communications', 'Time_Content', 'Time_DevTools',
'Time_Games', 'Time_Malware', 'Time_Media', 'Time_PersonalProductivity', 'Time_Readers',
'Time_Search', 'Time_Social', 'Time_StudentAndLearning', 'Time_ModernApps',
'Time_Games_Core', 'Time_Games_Casual', 'Time_windows_immersivecontrolpanel',
'Time_msascui_exe', 'Time_chrome_exe', 'Time_microsoft_windows_cortana', 'Time_lockapphost_exe']
features = list(set(features) | set(imp_time_features))
print("features:", len(features))
In [17]:
sorted(features)
Out[17]:
In [12]:
train.head()
Out[12]:
In [13]:
'''
with Timer("Read test and feature engineering..."):
    # Read tsv file
    test = pd.read_csv('../input/CoinMlCompetitionSoftlandingEvaluateNoLabel.tsv', sep='\t', header=None)
    # Add header because the test file has no header row
    test.columns = train.columns[0:1084]
    # Feature engineering - should not delete odd date rows
    test = feature_engineering(test, isDeleteOddDateRows=False)
'''
Out[13]:
In [14]:
'''
from sklearn.model_selection import train_test_split
with Timer('# train validation split'):
    #X_train, X_val, y_train, y_val = train_test_split(train[train.isUnderSampled == True][features], train_y[train.isUnderSampled == True], test_size=0.2, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(train[features], train_y, test_size=0.2, random_state=0)
    gc.collect()
    print(y_train.shape)
    print(X_train.shape)
    print(y_val.shape)
    print(X_val.shape)
    print(y_train.mean())
    print(y_val.mean())
del train
gc.collect()
'''
Out[14]:
In [18]:
import lightgbm as lgb
#train_data = lgb.Dataset(X_train[X_train.isUnderSampled == True][features], label=X_train[X_train.isUnderSampled == True]['HasClicked'].values)
#train_data = lgb.Dataset(X_train[features], label=y_train)
#val_data = lgb.Dataset(X_val[features], y_val)
# use the train/holdout split directly, with the positive/negative sampling ratios
train_data = lgb.Dataset(train[features], label=train_y)
val_data = lgb.Dataset(holdout[features], y_holdout)
#print(X_train[features].shape)
#print(X_val[features].shape)
print(train[features].shape)
print(holdout[features].shape)
In [27]:
with Timer("Trainig..."):
random.seed(2007)
params = {
'task' : 'train',
'boosting_type' : 'dart', #'gbdt'
'objective' : 'binary',
'metric' : 'auc', # 'binary_logloss'
'is_training_metric': True,
'max_bin': 255,
'num_leaves' : 64,
'learning_rate' : 0.02, # 0.05, #0.1,
'feature_fraction' : 0.82,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'min_data_in_leaf': 10,
'min_sum_hessian_in_leaf': 5,
#'scale_pos_weight': 0.5,
#'is_unbalance': True
# 'num_threads': 16,
}
num_round = 10000
bst = lgb.train(params, train_data, num_round, valid_sets=val_data, early_stopping_rounds=10)
#val_preds = bst.predict(X_val[features], num_iteration=bst.best_iteration)
holdout_preds = bst.predict(holdout[features], num_iteration=bst.best_iteration)
#test_preds = bst.predict(test[features], num_iteration=bst.best_iteration)
#0.7019
# Including all high level and ymd and ymdh
# [297] valid_0's auc:0.67564 F1 score: 0.096338028169, best thr: 0.325385385385, Click mean: 0.0343981839588
# without ymd; f1 score not improved, so keep this
# [201] valid_0's auc:0.67772 F1 score: 0.0966780126125, best thr: 0.306746746747, Click mean: 0.0379598932823
# With uniqueUserDeviceKey valueCount
# [368] valid_0's auc:0.664831 F1 score: 0.06x ???
# Value counts
# [525] valid_0's auc:0.686445 f1 score: 0.104380886546 thr: 0.325875875876 Click mean: 0.0332386612486 (gain: 0.04)
# Count UniqueUserDeviceKey
# [505] valid_0's auc:0.706443 f1 score: 0.128913201081 thr: 0.371491491491 Click mean: 0.0267462248702 (gain:0.024)
# Count CampaignId
# [544] valid_0's auc:0.707357 f1 score: 0.13101569594 thr: 0.363643643644 Click mean: 0.0274719972684 (gain: 0.002)
# Remove all time and days
# [392] valid_0's auc:0.703582 f1 score: 0.123669773283 thr: 0.378358358358 Click mean: 0.0266139148895
# Include imp time features
# [418] valid_0's auc:0.706095 f1 score: 0.126989843694 thr: 0.386206206206 Click mean: 0.0229143624878 (loss: 0.004)
# Group by CampaignId and day
# [424] valid_0's auc:0.705353
In [28]:
#val_best_threshold = f1_best_threshold(y_val, val_preds)
ho_best_threshold = f1_best_threshold(y_holdout, holdout_preds)
In [ ]:
bst.feature_importance()
In [ ]:
#feature_list = X_val[features].columns.values
feature_list = holdout[features].columns.values
df_fi = pd.DataFrame(bst.feature_importance(), columns=['importance'])
df_fi['feature'] = feature_list
df_fi = df_fi.sort_values('importance', ascending=False)
df_fi[df_fi.importance >= 10]
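# Optional sketch: visualize the top features with seaborn (imported above);
# the top-20 cutoff is arbitrary.
#sns.barplot(x='importance', y='feature', data=df_fi.head(20))
#plt.show()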
In [ ]:
df_fi = containColumn(df_fi, 'feature', 'Time_')
In [ ]:
df_fi[(df_fi.featureContainsTime_ == True) & (df_fi.importance > 50)]
In [ ]:
zeroImportance = df_fi[df_fi.importance == 0]['feature'].values
print(len(zeroImportance))
In [ ]:
# To do:
# Feature reduction by (1) removing zero-importance features or (2) those with importance under 10
In [ ]:
# Overall time to complete
end_time = datetime.now()
total_time = end_time - start_time
print("Time to complete:", total_time)
In [ ]: