Xtr_app = csr_matrix(appp)
Xtrain = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label), format='csr')
Xtest = hstack((Xte_brand, Xte_model, Xte_app, Xte_label), format='csr')
print('All features: train shape {}, test shape {}'.format(Xtrain.shape, Xtest.shape))
targetencoder = LabelEncoder().fit(gatrain.group)
y = targetencoder.transform(gatrain.group)
nclasses = len(targetencoder.classes_)
def score(clf, random_state = 23):
    kf = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=random_state)
    pred = np.zeros((y.shape[0], nclasses))
    for itrain, itest in kf:
        Xtr, Xte = Xtrain[itrain, :], Xtrain[itest, :]
        ytr, yte = y[itrain], y[itest]
        clf.fit(Xtr, ytr)
        pred[itest, :] = clf.predict_proba(Xte)
        # Downsize to one fold only for kernels
        #print("{:.5f}".format(log_loss(yte, pred[itest,:])), end=' ')
        #return log_loss(yte, pred[itest, :])
    print("{:.5f}".format(log_loss(y, pred)), end=' ')
    return log_loss(y, pred)
Cs = np.logspace(-3, 0, 10)
print(Cs)
res = []
for C in Cs:
    res.append(score(LogisticRegression(C=C, multi_class='multinomial', solver='lbfgs')))
plt.semilogx(Cs, res, '-o');
import xgboost as xgb

dtrain = xgb.DMatrix(Xtrain, y)
params = {
    "eta": 0.3,
    "booster": "gbtree",
    "objective": "multi:softprob",
    "max_depth": 10,
    "silent": 1,
    "seed": 1233,
    "num_class": 12,
    "nthread": 16,
    "eval_metric": "mlogloss"
}
xgb.cv(params, dtrain, num_boost_round=10000, early_stopping_rounds=50,
       maximize=False, nfold=10, stratified=False, verbose_eval=10)
In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
#import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss
def save_sparse_csr(filename, array):
    np.savez(filename, data=array.data, indices=array.indices,
             indptr=array.indptr, shape=array.shape)
def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])
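A quick round-trip sketch (not one of the original cells) showing how the two helpers above are meant to be used together; the tiny matrix and the temporary filename '_roundtrip_check' are made up for illustration.
# Hypothetical round-trip check for save_sparse_csr / load_sparse_csr
_m = csr_matrix(np.array([[0., 1., 0.], [2., 0., 3.]]))
save_sparse_csr('_roundtrip_check', _m)          # np.savez appends the .npz extension
_m2 = load_sparse_csr('_roundtrip_check.npz')
assert (_m != _m2).nnz == 0                      # same sparsity pattern and values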
In [2]:
datadir = 'input/'
gatrain = pd.read_csv(os.path.join(datadir,'gender_age_train.csv'),
index_col='device_id')
gatest = pd.read_csv(os.path.join(datadir,'gender_age_test.csv'),
index_col = 'device_id')
phone = pd.read_csv(os.path.join(datadir,'phone_brand_device_model.csv'))
# Get rid of duplicate device ids in phone
phone = phone.drop_duplicates('device_id',keep='first').set_index('device_id')
events = pd.read_csv(os.path.join(datadir,'events.csv'),
parse_dates=['timestamp'], index_col='event_id')
appevents = pd.read_csv(os.path.join(datadir,'app_events.csv'),
usecols=['event_id','app_id','is_active'],
dtype={'is_active':bool})
applabels = pd.read_csv(os.path.join(datadir,'app_labels.csv'))
In [3]:
Xtr_app = load_sparse_csr('xtr_app.npz')
In [4]:
Xte_app = load_sparse_csr('xte_app.npz')
In [5]:
Xtr_app.shape
Out[5]:
In [6]:
gatrain['trainrow'] = np.arange(gatrain.shape[0])
gatest['testrow'] = np.arange(gatest.shape[0])
In [7]:
brandencoder = LabelEncoder().fit(phone.phone_brand)
phone['brand'] = brandencoder.transform(phone['phone_brand'])
gatrain['brand'] = phone['brand']
gatest['brand'] = phone['brand']
Xtr_brand = csr_matrix((np.ones(gatrain.shape[0]),
(gatrain.trainrow, gatrain.brand)))
Xte_brand = csr_matrix((np.ones(gatest.shape[0]),
(gatest.testrow, gatest.brand)))
print('Brand features: train shape {}, test shape {}'.format(Xtr_brand.shape, Xte_brand.shape))
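The csr_matrix((data, (rows, cols))) constructor used above is what turns the label-encoded brand into a one-hot matrix: each device contributes a single 1 at (its row number, its brand code), and the shape is inferred from the largest indices. A minimal standalone sketch with made-up values:
# Hypothetical illustration of the (data, (row, col)) construction:
# 3 devices with brand codes 2, 0, 2 -> a 3 x 3 one-hot matrix
rows = np.array([0, 1, 2])
cols = np.array([2, 0, 2])
demo = csr_matrix((np.ones(3), (rows, cols)))
demo.toarray()
# array([[0., 0., 1.],
#        [1., 0., 0.],
#        [0., 0., 1.]])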
In [8]:
m = phone.phone_brand.str.cat(phone.device_model)
modelencoder = LabelEncoder().fit(m)
phone['model'] = modelencoder.transform(m)
gatrain['model'] = phone['model']
gatest['model'] = phone['model']
Xtr_model = csr_matrix((np.ones(gatrain.shape[0]),
(gatrain.trainrow, gatrain.model)))
Xte_model = csr_matrix((np.ones(gatest.shape[0]),
(gatest.testrow, gatest.model)))
print('Model features: train shape {}, test shape {}'.format(Xtr_model.shape, Xte_model.shape))
In [9]:
appencoder = LabelEncoder().fit(appevents.app_id)
appevents['app'] = appencoder.transform(appevents.app_id)
napps = len(appencoder.classes_)
deviceapps = (appevents.merge(events[['device_id']], how='left',left_on='event_id',right_index=True)
.groupby(['device_id','app'])['app'].agg(['size'])
.merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
.merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
.reset_index())
deviceapps.head()
Out[9]:
In [10]:
d = deviceapps.dropna(subset=['trainrow'])
Xtr_app1 = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.app)),
shape=(gatrain.shape[0],napps))
d = deviceapps.dropna(subset=['testrow'])
Xte_app1 = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.app)),
shape=(gatest.shape[0],napps))
print('Apps data: train shape {}, test shape {}'.format(Xtr_app1.shape, Xte_app1.shape))
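Not every device has app events, so many rows of Xtr_app1 and Xte_app1 are all zeros; the explicit shape argument is what keeps those empty rows in the matrices. A hedged way to count the non-empty rows (a sketch, assuming the matrices built in the cell above):
# Devices that have at least one (device, app) entry
print('Train devices with app data: {} of {}'.format(
    int((Xtr_app1.getnnz(axis=1) > 0).sum()), Xtr_app1.shape[0]))
print('Test devices with app data:  {} of {}'.format(
    int((Xte_app1.getnnz(axis=1) > 0).sum()), Xte_app1.shape[0]))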
In [11]:
applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())]
applabels['app'] = appencoder.transform(applabels.app_id)
labelencoder = LabelEncoder().fit(applabels.label_id)
applabels['label'] = labelencoder.transform(applabels.label_id)
nlabels = len(labelencoder.classes_)
In [12]:
devicelabels = (deviceapps[['device_id','app']]
.merge(applabels[['app','label']])
.groupby(['device_id','label'])['app'].agg(['size'])
.merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
.merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
.reset_index())
devicelabels.head()
Out[12]:
In [13]:
d = devicelabels.dropna(subset=['trainrow'])
Xtr_label = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.label)),
shape=(gatrain.shape[0],nlabels))
d = devicelabels.dropna(subset=['testrow'])
Xte_label = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.label)),
shape=(gatest.shape[0],nlabels))
print('Labels data: train shape {}, test shape {}'.format(Xtr_label.shape, Xte_label.shape))
In [14]:
app_df_tr = pd.DataFrame(Xtr_app.toarray())
app_df_te = pd.DataFrame(Xte_app.toarray())
In [15]:
app_df_tr['summ'] = app_df_tr.sum(axis=1)
app_df_te['summ'] = app_df_te.sum(axis=1)
In [16]:
app_df_tr.index = gatrain.index
app_df_te.index = gatest.index
In [17]:
brand_df_tr = pd.DataFrame(Xtr_brand.toarray())
brand_df_te = pd.DataFrame(Xte_brand.toarray())
brand_df_tr.index = gatrain.index
brand_df_te.index = gatest.index
model_df_tr = pd.DataFrame(Xtr_model.toarray())
model_df_te = pd.DataFrame(Xte_model.toarray())
model_df_tr.index = gatrain.index
model_df_te.index = gatest.index
label_df_tr = pd.DataFrame(Xtr_label.toarray())
label_df_te = pd.DataFrame(Xte_label.toarray())
label_df_tr.index = gatrain.index
label_df_te.index = gatest.index
In [18]:
index_tr_app_events = app_df_tr[app_df_tr.summ>0].index
index_te_app_events = app_df_te[app_df_te.summ>0].index
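Densifying Xtr_app and Xte_app with .toarray() just to read off the row sums is memory-hungry. An equivalent sketch that stays sparse, assuming (as the index assignment in In [16] implies) that the rows of Xtr_app / Xte_app are aligned with gatrain / gatest and that all entries are non-negative:
# Same device selection without .toarray(): a device has app events
# iff its sparse row sum is positive
row_sums_tr = np.asarray(Xtr_app.sum(axis=1)).ravel()
row_sums_te = np.asarray(Xte_app.sum(axis=1)).ravel()
index_tr_app_events_alt = gatrain.index[row_sums_tr > 0]
index_te_app_events_alt = gatest.index[row_sums_te > 0]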
In [19]:
#with_app_events_tr_app = app_df_tr.ix[app_df_tr[app_df_tr.summ>0].index]
#with_app_events_te_app = app_df_te.ix[app_df_te[app_df_te.summ>0].index]
In [22]:
df1_tr = brand_df_tr.ix[index_tr_app_events]
df2_tr = model_df_tr.ix[index_tr_app_events]
df3_tr = app_df_tr.ix[index_tr_app_events]
df4_tr = label_df_tr.ix[index_tr_app_events]
In [23]:
df1_te = brand_df_te.ix[index_te_app_events]
df2_te = model_df_te.ix[index_te_app_events]
df3_te = app_df_te.ix[index_te_app_events]
df4_te = label_df_te.ix[index_te_app_events]
In [24]:
df_tr_app_events = pd.concat([df1_tr,df2_tr,df3_tr,df4_tr], axis=1, ignore_index=True)
df_te_app_events = pd.concat([df1_te,df2_te,df3_te,df4_te], axis=1, ignore_index=True)
In [30]:
rows1tr = pd.Series(df_tr_app_events.index)
rows1te = pd.Series(df_te_app_events.index)
In [31]:
rows1tr.to_csv('rows1tr.csv',index_label='index',header=True)
rows1te.to_csv('rows1te.csv',index_label='index',header=True)
In [33]:
df_tr_app_events_sparse = csr_matrix(df_tr_app_events)
df_te_app_events_sparse = csr_matrix(df_te_app_events)
In [34]:
save_sparse_csr('df_tr_app_events',df_tr_app_events_sparse)
save_sparse_csr('df_te_app_events',df_te_app_events_sparse)
In [35]:
events_data = pd.read_hdf('events_data.hdf5','table')
In [48]:
in_events_ids_tr = (set(events_data.device_id.unique())&set(gatrain.index))
in_events_ids_te = (set(events_data.device_id.unique())&set(gatest.index))
In [55]:
rows2tr = pd.Series(list(in_events_ids_tr-set(df_tr_app_events.index)))
rows2te = pd.Series(list(in_events_ids_te-set(df_te_app_events.index)))
In [61]:
rows2tr.to_csv('rows2tr.csv',index_label='index',header=True)
rows2te.to_csv('rows2te.csv',index_label='index',header=True)
In [68]:
len(set(gatest.index)-set(events_data.device_id.unique()))
Out[68]:
In [69]:
76899+df_te_app_events.shape[0]+rows2te.shape[0]
Out[69]:
In [70]:
no_events_ids_tr = (set(gatrain.index)-set(events_data.device_id.unique()))
no_events_ids_te = (set(gatest.index)-set(events_data.device_id.unique()))
In [74]:
rows3tr = pd.Series(list(no_events_ids_tr))
rows3te = pd.Series(list(no_events_ids_te))
In [75]:
rows3tr.to_csv('rows3tr.csv',index_label='index',header=True)
rows3te.to_csv('rows3te.csv',index_label='index',header=True)
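The three saved groups (devices with app events, devices with events but no app events, devices with no events at all) are meant to cover each train and test device exactly once, which is the same bookkeeping the 76899 check above does for the test set. A hedged sanity-check sketch, reusing the series defined above:
# Sketch: the three groups should together account for every device
print(len(rows1tr) + len(rows2tr) + len(rows3tr), gatrain.shape[0])
print(len(rows1te) + len(rows2te) + len(rows3te), gatest.shape[0])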
In [ ]: