# NOTE(review): this top section is a notebook tail pasted out of order.
# `appp` is not defined anywhere in the visible file (likely the dense
# app-feature array built elsewhere) -- confirm against the original notebook.
Xtr_app = csr_matrix(appp)

# Stack every one-hot feature group side by side into single sparse
# train / test design matrices.
Xtrain = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label), format='csr')
Xtest = hstack((Xte_brand, Xte_model, Xte_app, Xte_label), format='csr')
print('All features: train shape {}, test shape {}'.format(Xtrain.shape, Xtest.shape))

# Integer-encode the gender/age group target.
targetencoder = LabelEncoder().fit(gatrain.group)
y = targetencoder.transform(gatrain.group)
nclasses = len(targetencoder.classes_)

def score(clf, random_state=23):
    """Cross-validated multiclass log loss for `clf` on the global Xtrain / y.

    Collects out-of-fold predicted probabilities for every training row
    across a stratified 10-fold split, then scores them all at once.
    Uses the pre-0.18 sklearn StratifiedKFold(y, n_folds=...) API.
    """
    kf = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=random_state)
    pred = np.zeros((y.shape[0], nclasses))
    for itrain, itest in kf:
        Xtr, Xte = Xtrain[itrain, :], Xtrain[itest, :]
        ytr, yte = y[itrain], y[itest]
        clf.fit(Xtr, ytr)
        pred[itest, :] = clf.predict_proba(Xte)
        # Downsize to one fold only for kernels
        #print("{:.5f}".format(log_loss(yte, pred[itest,:])), end=' ')
        #return log_loss(yte, pred[itest, :])
    print("{:.5f}".format(log_loss(y, pred)), end=' ')
    return log_loss(y, pred)

# Sweep the inverse-regularization strength C for multinomial logistic
# regression and plot CV log loss against C on a log axis.
Cs = np.logspace(-3, 0, 10)
print(Cs)
res = []
for C in Cs:
    res.append(score(LogisticRegression(C=C, multi_class='multinomial', solver='lbfgs')))
plt.semilogx(Cs, res, '-o');

# Cross-validate an xgboost softprob model on the full stacked features.
import xgboost as xgb

dtrain = xgb.DMatrix(Xtrain, y)

params = {
    "eta": 0.3,
    "booster": "gbtree",
    "objective": "multi:softprob",
    "max_depth": 10,
    "silent": 1,
    "seed": 1233,
    "num_class": 12,
    "nthread": 16,
    "eval_metric": "mlogloss",
}

xgb.cv(params, dtrain, num_boost_round=10000, early_stopping_rounds=50,
       maximize=False, nfold=10, stratified=False, verbose_eval=10)


In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
#import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss

def save_sparse_csr(filename, array):
    """Persist a scipy CSR matrix to an .npz archive.

    Stores the three CSR component arrays plus the shape under fixed key
    names; the counterpart reader is load_sparse_csr. np.savez appends
    the .npz suffix when `filename` lacks one.
    """
    np.savez(filename,
             data=array.data,
             indices=array.indices,
             indptr=array.indptr,
             shape=array.shape)

def load_sparse_csr(filename):
    """Rebuild a CSR matrix from an .npz archive written by save_sparse_csr.

    Expects the archive keys data / indices / indptr / shape. `filename`
    must include the .npz suffix (np.load does not append it).
    """
    # np.load on an .npz returns a lazily-read NpzFile holding an open
    # zip handle; the original never closed it. The context manager
    # releases the handle once the component arrays are materialized.
    with np.load(filename) as loader:
        return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                          shape=loader['shape'])

In [2]:
datadir = 'input/'
# Train/test device tables, indexed by device_id; the train table carries
# the gender/age `group` target column used later for encoding.
gatrain = pd.read_csv(os.path.join(datadir,'gender_age_train.csv'),
                      index_col='device_id')
gatest = pd.read_csv(os.path.join(datadir,'gender_age_test.csv'),
                     index_col = 'device_id')
phone = pd.read_csv(os.path.join(datadir,'phone_brand_device_model.csv'))
# Get rid of duplicate device ids in phone
phone = phone.drop_duplicates('device_id',keep='first').set_index('device_id')
# Event stream (one row per event, keyed by event_id) and the per-event
# app records; only the columns actually used downstream are loaded.
events = pd.read_csv(os.path.join(datadir,'events.csv'),
                     parse_dates=['timestamp'], index_col='event_id')
appevents = pd.read_csv(os.path.join(datadir,'app_events.csv'), 
                        usecols=['event_id','app_id','is_active'],
                        dtype={'is_active':bool})
# app_id -> category label_id mapping.
applabels = pd.read_csv(os.path.join(datadir,'app_labels.csv'))

In [3]:
# Precomputed bag-of-apps features for the training devices (74645 x 7825).
Xtr_app = load_sparse_csr('xtr_app.npz')

In [4]:
# Matching precomputed app features for the test devices.
Xte_app = load_sparse_csr('xte_app.npz')

In [5]:
# Sanity check: rows must match gatrain (74645 devices).
Xtr_app.shape


Out[5]:
(74645, 7825)

In [6]:
# Record each device's row position so sparse matrices can later be built
# from (row, column) coordinate pairs.
gatrain['trainrow'] = np.arange(len(gatrain))
gatest['testrow'] = np.arange(len(gatest))

In [7]:
# One-hot phone brand: one column per encoded brand, a single 1 per device row.
brandencoder = LabelEncoder().fit(phone.phone_brand)
phone['brand'] = brandencoder.transform(phone['phone_brand'])
# Column assignment aligns on the shared device_id index.
gatrain['brand'] = phone['brand']
gatest['brand'] = phone['brand']
Xtr_brand = csr_matrix((np.ones(len(gatrain)),
                        (gatrain.trainrow, gatrain.brand)))
Xte_brand = csr_matrix((np.ones(len(gatest)),
                        (gatest.testrow, gatest.brand)))
print('Brand features: train shape {}, test shape {}'.format(Xtr_brand.shape, Xte_brand.shape))


Brand features: train shape (74645, 131), test shape (112071, 131)

In [8]:
# One-hot device model. Brand is prepended to the model string before
# encoding because different brands reuse the same model names.
m = phone.phone_brand.str.cat(phone.device_model)
modelencoder = LabelEncoder().fit(m)
phone['model'] = modelencoder.transform(m)
# Aligned on the device_id index, as with brand above.
gatrain['model'] = phone['model']
gatest['model'] = phone['model']
Xtr_model = csr_matrix((np.ones(len(gatrain)),
                        (gatrain.trainrow, gatrain.model)))
Xte_model = csr_matrix((np.ones(len(gatest)),
                        (gatest.testrow, gatest.model)))
print('Model features: train shape {}, test shape {}'.format(Xtr_model.shape, Xte_model.shape))


Model features: train shape (74645, 1667), test shape (112071, 1667)

In [9]:
# Integer-encode app ids so they can serve as sparse-matrix column indices.
appencoder = LabelEncoder().fit(appevents.app_id)
appevents['app'] = appencoder.transform(appevents.app_id)
napps = len(appencoder.classes_)
# Count occurrences per (device_id, app) pair across all events, then attach
# each device's train/test row number; the row columns are NaN when the
# device is not in that split.
deviceapps = (appevents.merge(events[['device_id']], how='left',left_on='event_id',right_index=True)
                       .groupby(['device_id','app'])['app'].agg(['size'])
                       .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                       .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                       .reset_index())
deviceapps.head()


Out[9]:
device_id app size trainrow testrow
0 -9222956879900151005 548 18 21594.0 NaN
1 -9222956879900151005 1096 18 21594.0 NaN
2 -9222956879900151005 1248 26 21594.0 NaN
3 -9222956879900151005 1545 12 21594.0 NaN
4 -9222956879900151005 1664 18 21594.0 NaN

In [10]:
# Device x app indicator matrices: entry (row, app) is set to 1 for each
# (device, app) pair observed in the events data. Rows whose device is not
# in the relevant split were NaN and are dropped first.
d = deviceapps.dropna(subset=['trainrow'])
Xtr_app1 = csr_matrix((np.ones(len(d)), (d.trainrow, d.app)),
                      shape=(gatrain.shape[0], napps))
d = deviceapps.dropna(subset=['testrow'])
Xte_app1 = csr_matrix((np.ones(len(d)), (d.testrow, d.app)),
                      shape=(gatest.shape[0], napps))
print('Apps data: train shape {}, test shape {}'.format(Xtr_app1.shape, Xte_app1.shape))


Apps data: train shape (74645, 19237), test shape (112071, 19237)

In [11]:
# Keep only labels for apps that actually occur in the events data, then
# integer-encode both app ids (same encoder as the app features) and label ids.
# .copy() materializes the filtered frame so the column assignments below do
# not operate on a .loc slice view (avoids pandas SettingWithCopyWarning and
# any ambiguity about whether the original frame is modified).
applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())].copy()
applabels['app'] = appencoder.transform(applabels.app_id)
labelencoder = LabelEncoder().fit(applabels.label_id)
applabels['label'] = labelencoder.transform(applabels.label_id)
nlabels = len(labelencoder.classes_)

In [12]:
# Roll app counts up to (device_id, label): how many of a device's app
# occurrences fall under each app-category label. Train/test row numbers are
# attached the same way as for deviceapps (NaN outside the split).
devicelabels = (deviceapps[['device_id','app']]
                .merge(applabels[['app','label']])
                .groupby(['device_id','label'])['app'].agg(['size'])
                .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                .reset_index())
devicelabels.head()


Out[12]:
device_id label size trainrow testrow
0 -9222956879900151005 117 1 21594.0 NaN
1 -9222956879900151005 120 1 21594.0 NaN
2 -9222956879900151005 126 1 21594.0 NaN
3 -9222956879900151005 138 2 21594.0 NaN
4 -9222956879900151005 147 2 21594.0 NaN

In [13]:
# Device x label indicator matrices, built exactly like the app matrices:
# a 1 wherever the device has at least one app carrying that label.
d = devicelabels.dropna(subset=['trainrow'])
Xtr_label = csr_matrix((np.ones(len(d)), (d.trainrow, d.label)),
                       shape=(gatrain.shape[0], nlabels))
d = devicelabels.dropna(subset=['testrow'])
Xte_label = csr_matrix((np.ones(len(d)), (d.testrow, d.label)),
                       shape=(gatest.shape[0], nlabels))
print('Labels data: train shape {}, test shape {}'.format(Xtr_label.shape, Xte_label.shape))


Labels data: train shape (74645, 492), test shape (112071, 492)

In [14]:
# Densify the app feature matrices into DataFrames.
# NOTE(review): 74645 x 7825 (and 112071 x 7825) dense float arrays are
# memory-heavy -- several GB; confirm the machine can hold them.
app_df_tr = pd.DataFrame(Xtr_app.toarray())
app_df_te = pd.DataFrame(Xte_app.toarray())

In [15]:
# Row totals: a device with summ == 0 has no app-event features at all.
app_df_tr['summ'] = app_df_tr.sum(axis=1)
app_df_te['summ'] = app_df_te.sum(axis=1)

In [16]:
# Re-key the dense app frames by device_id (row order already matches
# gatrain/gatest, so a positional index assignment is safe).
app_df_tr.index = gatrain.index
app_df_te.index = gatest.index

In [17]:
# Densify the brand / model / label matrices into DataFrames keyed by
# device_id. Passing index= to the constructor is equivalent to assigning
# .index afterwards, since row order matches gatrain/gatest.
brand_df_tr = pd.DataFrame(Xtr_brand.toarray(), index=gatrain.index)
brand_df_te = pd.DataFrame(Xte_brand.toarray(), index=gatest.index)

model_df_tr = pd.DataFrame(Xtr_model.toarray(), index=gatrain.index)
model_df_te = pd.DataFrame(Xte_model.toarray(), index=gatest.index)

label_df_tr = pd.DataFrame(Xtr_label.toarray(), index=gatrain.index)
label_df_te = pd.DataFrame(Xte_label.toarray(), index=gatest.index)

In [18]:
# device_ids that have at least one app-event feature (non-zero row total).
index_tr_app_events = app_df_tr.index[app_df_tr.summ > 0]
index_te_app_events = app_df_te.index[app_df_te.summ > 0]

In [19]:
#with_app_events_tr_app = app_df_tr.ix[app_df_tr[app_df_tr.summ>0].index]
#with_app_events_te_app = app_df_te.ix[app_df_te[app_df_te.summ>0].index]

In [22]:
# Restrict each feature frame to devices that have app events.
# .loc replaces the long-deprecated DataFrame.ix (removed in pandas 1.0);
# the indexer here is a set of device_id labels, so label-based .loc is
# the exact equivalent.
df1_tr = brand_df_tr.loc[index_tr_app_events]
df2_tr = model_df_tr.loc[index_tr_app_events]
df3_tr = app_df_tr.loc[index_tr_app_events]
df4_tr = label_df_tr.loc[index_tr_app_events]

In [23]:
# Same restriction for the test split; .loc replaces the removed .ix
# (label-based indexing on device_id).
df1_te = brand_df_te.loc[index_te_app_events]
df2_te = model_df_te.loc[index_te_app_events]
df3_te = app_df_te.loc[index_te_app_events]
df4_te = label_df_te.loc[index_te_app_events]

In [24]:
# Concatenate brand/model/app/label features side by side for devices with
# app events; ignore_index renumbers the columns 0..N-1 to avoid collisions.
df_tr_app_events = pd.concat([df1_tr,df2_tr,df3_tr,df4_tr], axis=1, ignore_index=True)
df_te_app_events = pd.concat([df1_te,df2_te,df3_te,df4_te], axis=1, ignore_index=True)

In [30]:
# device_ids of group 1: devices that have app events.
rows1tr = pd.Series(df_tr_app_events.index)
rows1te = pd.Series(df_te_app_events.index)

In [31]:
# Persist the group-1 device id lists for downstream notebooks.
rows1tr.to_csv('rows1tr.csv',index_label='index',header=True)
rows1te.to_csv('rows1te.csv',index_label='index',header=True)

In [33]:
# Convert the concatenated dense frames back to CSR for compact storage.
df_tr_app_events_sparse = csr_matrix(df_tr_app_events)
df_te_app_events_sparse = csr_matrix(df_te_app_events)

In [34]:
# Save the group-1 feature matrices (np.savez appends the .npz suffix).
save_sparse_csr('df_tr_app_events',df_tr_app_events_sparse)
save_sparse_csr('df_te_app_events',df_te_app_events_sparse)

In [35]:
# Precomputed event-level table; used below only for its device_id column.
events_data = pd.read_hdf('events_data.hdf5','table')

In [48]:
# Devices that appear in the events table, split by train/test membership.
in_events_ids_tr = set(events_data.device_id.unique()).intersection(gatrain.index)
in_events_ids_te = set(events_data.device_id.unique()).intersection(gatest.index)

In [55]:
# device_ids of group 2: devices that have events but no app events
# (present in events_data, absent from the app-event feature frames).
rows2tr = pd.Series(list(in_events_ids_tr-set(df_tr_app_events.index)))
rows2te = pd.Series(list(in_events_ids_te-set(df_te_app_events.index)))

In [61]:
# Persist the group-2 device id lists.
rows2tr.to_csv('rows2tr.csv',index_label='index',header=True)
rows2te.to_csv('rows2te.csv',index_label='index',header=True)

In [68]:
# Count of test devices with no events at all (group 3).
len(set(gatest.index)-set(events_data.device_id.unique()))


Out[68]:
76899

In [69]:
# Sanity check: the three groups partition the test set
# (no-events + app-events + events-only should equal 112071).
76899+df_te_app_events.shape[0]+rows2te.shape[0]


Out[69]:
112071

In [70]:
# Devices with no events at all (group 3), per split.
no_events_ids_tr = set(gatrain.index).difference(events_data.device_id.unique())
no_events_ids_te = set(gatest.index).difference(events_data.device_id.unique())

In [74]:
# device_ids of group 3 as Series (set iteration order is arbitrary).
rows3tr = pd.Series(list(no_events_ids_tr))
rows3te = pd.Series(list(no_events_ids_te))

In [75]:
# Persist the group-3 device id lists.
rows3tr.to_csv('rows3tr.csv',index_label='index',header=True)
rows3te.to_csv('rows3te.csv',index_label='index',header=True)

In [ ]: