思路:

拼接所有能合并的特征,连接statead,利用贝叶斯处理user缺省值

其他的能拆就拆,然后全部one-hot

对于AppID,先把一个用户的AppID连接在一起,然后使用tf-idf处理,得到App特征

上下两个合起来,裸跑LogisticRegression,生成submission,跑全量数据(17-28)在线上30%评分在0.10305左右。

没有代码重构


In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import numpy as np
import pickle
import math

In [2]:
#评分函数
# Scoring function: mean binary log-loss (cross-entropy).
import scipy as sp  # kept so other cells that reference `sp` still work


def logloss(act, pred):
    """Return the mean binary log-loss of probabilities `pred` against 0/1 labels `act`.

    Predictions are clipped to [epsilon, 1 - epsilon] to avoid log(0).
    NOTE: the original used `sp.maximum` / `sp.mean` / `sp.log` — scipy's
    deprecated top-level numpy aliases, removed in modern scipy; numpy is
    used directly here, producing identical values.
    """
    epsilon = 1e-15
    act = np.asarray(act)
    pred = np.clip(pred, epsilon, 1 - epsilon)
    return -np.mean(act * np.log(pred) + (1 - act) * np.log(1 - pred))

In [4]:
train = pd.read_csv('./pre/train.csv')
test = pd.read_csv('./pre/test.csv')

train.head()


Out[4]:
label clickTime conversionTime creativeID userID positionID connectionType telecomsOperator
0 0 170000 NaN 3089 2798058 293 1 1
1 0 170000 NaN 1259 463234 6161 1 2
2 0 170000 NaN 4465 1857485 7434 4 1
3 0 170000 NaN 1004 2038823 977 1 1
4 0 170000 NaN 1887 2015141 3688 1 1

In [25]:
# statead = pd.read_csv('./statead.csv')
# statead.head()


Out[25]:
creativeID brand successclick successconversion
0 4079 191 2890.0 10
1 4565 56 449000.0 11622
2 3170 1400 387.0 5
3 6566 1400 224.0 1
4 5187 56 141000.0 2277

In [27]:
test.head()


Out[27]:
instanceID label clickTime creativeID userID positionID connectionType telecomsOperator clickTime_day clickTime_hour clickTime_minute sitesetID_x positionType_x sitesetID_y positionType_y brand successclick successconversion
0 1 -1 310000 3745 1164848 3451 1 3 31 0 0 0 1 0 1 191 11946.0 18
1 2 -1 310000 2284 2127247 1613 1 3 31 0 0 0 1 0 1 191 3745.0 11
2 3 -1 310000 1456 2769125 5510 2 1 31 0 0 0 1 0 1 56 136000.0 3452
3 4 -1 310000 4565 9762 4113 2 3 31 0 0 0 1 0 1 56 449000.0 11622
4 5 -1 310000 49 2513636 3615 1 3 31 0 0 0 1 0 1 191 51.0 0

In [5]:
#时间离散化
# Discretize clickTime (integer encoded as DDHHMM) into day/hour/minute
# columns, using vectorized integer arithmetic instead of per-element map.
train['clickTime_day'] = train['clickTime'] // 10000
train['clickTime_hour'] = train['clickTime'] // 100 % 100
train['clickTime_minute'] = train['clickTime'] % 100

In [6]:
train.groupby(['clickTime_day'])['label'].value_counts()


Out[6]:
clickTime_day  label
17             0        287089
               1          7464
18             0        155890
               1          4101
19             0        100872
               1          3286
20             0        201406
               1          5056
21             0        301475
               1          7121
22             0        318550
               1          7371
23             0        280826
               1          7607
24             0        277860
               1          7382
25             0        259457
               1          7376
26             0        290089
               1          7736
27             0        289065
               1          7649
28             0        271291
               1          7343
29             0        294307
               1          7462
30             0        328089
               1          6308
Name: label, dtype: int64

In [108]:
# 将第28天作为验证集  (集第一次更新)
# proof = train[train.clickTime_day==28]
train = train[(train.clickTime_day>=17 ) & (train.clickTime_day<= 28)]

In [7]:
print test.shape,train.shape


(338489, 8) (3749528, 11)

In [8]:
#时间离散化
# Same DDHHMM discretization as for the training set, applied to `test`,
# done with vectorized integer division/modulo rather than `.map(lambda)`.
test['clickTime_day'] = test['clickTime'] // 10000
test['clickTime_hour'] = test['clickTime'] // 100 % 100
test['clickTime_minute'] = test['clickTime'] % 100

In [9]:
test.groupby(['clickTime_hour'])['label'].value_counts()


Out[9]:
clickTime_hour  label
0               -1        7545
1               -1        4785
2               -1        3159
3               -1        2478
4               -1        2643
5               -1        3329
6               -1        6014
7               -1       10554
8               -1       13330
9               -1       15433
10              -1       17187
11              -1       16355
12              -1       17028
13              -1       17776
14              -1       19110
15              -1       19717
16              -1       18122
17              -1       18839
18              -1       20482
19              -1       23049
20              -1       22770
21              -1       24124
22              -1       20210
23              -1       14450
Name: label, dtype: int64

In [3]:
#position直接加上去,LogisticRegression Logistic回归
#的训练得分 0.120106201117,可见position特征用处不大
position = pd.read_csv('./pre/position.csv')
# train = pd.merge(train,statead,position,on='positionID',how='left')
# test = pd.merge(test,statead,position,on='positionID',how='left')
position.head()


Out[3]:
positionID sitesetID positionType
0 2150 1 0
1 2579 1 0
2 3322 1 0
3 5726 1 0
4 4522 2 0

In [26]:
# train = pd.merge(train,statead,on='creativeID',how='left')
# test = pd.merge(test,statead,on='creativeID',how='left')

In [23]:
# statead.head()


Out[23]:
creativenessID brand successclick successconversion
0 4079 191 2890.0 10
1 4565 56 449000.0 11622
2 3170 1400 387.0 5
3 6566 1400 224.0 1
4 5187 56 141000.0 2277

In [73]:
# feature_name: the full feature list (numerical + categorical);
# categorical_feature: the columns that are ID-like / categorical.
# The target `label` and the leaky `conversionTime` column are excluded.
_excluded = {'label', 'conversionTime'}
feature_name = [col for col in train.columns if col not in _excluded]
categorical_feature = ['creativeID','userID','positionID','connectionType','telecomsOperator']

In [74]:
#去掉除label,convertiontime的第二次数据集
train_label = train['label']
train = train[feature_name]
test_label = test['label']
test = test[feature_name]

In [75]:
#添加appID特征(tfidf)
user_installedapps = pd.read_csv('./pre/user_installedapps.csv')
user_installedapps_count = user_installedapps.groupby('userID').agg(len).reset_index()#计数特征

In [76]:
user_installedapps.head()


Out[76]:
userID appID
0 1 357
1 1 360
2 1 362
3 1 365
4 1 375

In [77]:
user_installedapps_count.columns = ['userID','user_appID_count']
# Collapse each user's installed appIDs into one space-separated "app<ID>"
# document, e.g. userID 2798058 -> "app360 app361 app362 app375 ..." —
# this string later feeds TfidfVectorizer as that user's "text".
user_installedapps = user_installedapps.groupby('userID').agg(lambda x:' '.join(['app'+str(s) for s in x.values])).reset_index()

In [78]:
user_id_all = pd.concat([train.userID,test.userID],axis=0)
user_id_all = pd.DataFrame(user_id_all,columns=['userID'])
user_id_all.head()


Out[78]:
userID
0 2378636
1 1361396
2 1172949
3 1318946
4 1367085

In [79]:
#不同用户的先提取出来
user_installedapps = pd.merge(user_id_all.drop_duplicates(),user_installedapps,on='userID',how='left')
user_installedapps = user_installedapps.fillna('Missing')
#至此,user_installedapps处理完毕

In [80]:
tfv = TfidfVectorizer()
tfv.fit(user_installedapps.appID)


Out[80]:
TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [81]:
#按照顺序转化为tfidf特征
user_installedapps = pd.merge(user_id_all,user_installedapps,on='userID',how='left')
user_installedapps = user_installedapps.fillna('Missing')
user_installedapps_tfv = tfv.transform(user_installedapps.appID)

In [22]:
#保险起见,爱你,就储存吧
user_installedapps.to_csv('./pre/user-app.csv',index=None)

In [11]:
def featureManipulation(dtfm, colList, func):
    '''Impute, in place, every zero ("missing") value of every column in
    colList of dataframe dtfm.

    `func(dtfm, col)` must return a DataFrame indexed by candidate
    replacement values whose single column holds the number of zero cells
    to overwrite with each value.  Zero cells are chosen with random
    sampling, so the result is NOT deterministic.
    '''
    for col in colList:
        pr_col = func(dtfm, col)
        for row in pr_col.iterrows():
            # cells still holding the sentinel value 0 for this column
            zeroSample = dtfm[col][(dtfm[col] == 0)]
            replace = row[0]  # candidate replacement value (the index)
            num = row[1][col].astype(int)  # how many zeros to replace with it
            if num > len(zeroSample):
                # more replacements requested than zeros remain: cap and report
                print(replace)
                num = len(zeroSample)
            if num <= 0:
                continue
            # randomly pick `num` zero cells and overwrite them in place
            smpl = zeroSample.sample(num)
            smpl = smpl.replace(0, replace)
            dtfm[col].update(smpl)
    print(dtfm)


Out[11]:
0    0.247033
1    0.227382
3    0.183746
2    0.177069
4    0.087440
5    0.062026
6    0.010541
7    0.004763
Name: education, dtype: float64

In [12]:
# 这里是对user的例子
user = pd.read_csv('./pre/user.csv')
user.head()
def sln(dtfm, col):
    """Estimate how many zero cells of column `col` should be replaced by
    each observed non-zero value, proportionally to its empirical frequency.

    Returns a one-column DataFrame indexed by value with rounded counts.
    """
    nonzero = dtfm.loc[dtfm[col] > 0, col]
    n_zero = (dtfm[col] == 0).sum()
    counts = nonzero.value_counts() / len(nonzero) * n_zero
    return counts.apply(np.round).to_frame()
featureManipulation(user, ['age','gender','education','hometown','residence'], sln)


Out[12]:
0     0.104905
15    0.053536
16    0.042894
17    0.041247
19    0.041243
13    0.040872
20    0.040034
18    0.039410
14    0.038271
25    0.035427
21    0.034332
26    0.033149
22    0.032702
12    0.031670
23    0.031274
27    0.030278
28    0.028647
24    0.028334
11    0.028048
29    0.024487
30    0.019674
31    0.016924
33    0.015341
32    0.014844
10    0.013925
34    0.012143
35    0.011802
36    0.010457
37    0.009451
38    0.007586
        ...   
6     0.001399
5     0.001210
54    0.000782
55    0.000753
4     0.000698
56    0.000640
57    0.000631
58    0.000544
59    0.000486
60    0.000392
61    0.000329
62    0.000147
63    0.000104
66    0.000102
65    0.000095
64    0.000083
67    0.000078
80    0.000059
68    0.000052
69    0.000044
70    0.000044
71    0.000037
78    0.000032
73    0.000031
74    0.000031
76    0.000030
75    0.000029
72    0.000027
77    0.000026
79    0.000020
Name: age, dtype: float64

In [13]:
user.isnull().values.any()


Out[13]:
False

In [ ]:
# Split the 4-digit region codes (province*100 + city) into separate
# province and city columns, for both hometown and residence.
user['hometown_city'] = user['hometown'] % 100
user['hometown_province'] = (user['hometown'] // 100).astype('int')
user['residence_city'] = user['residence'] % 100
user['residence_province'] = (user['residence'] // 100).astype('int')

In [4]:
ad = pd.read_csv('./pre/ad.csv')
ad.head()


Out[4]:
creativeID adID camgaignID advertiserID appID appPlatform
0 4079 2318 147 80 14 2
1 4565 3593 632 3 465 1
2 3170 1593 205 54 389 1
3 6566 2390 205 54 389 1
4 5187 411 564 3 465 1

In [84]:
#合并特征
train = pd.merge(train,user_installedapps_count,on='userID',how='left')
train = pd.merge(train,user,on='userID',how='left')
train = pd.merge(train,ad,on='creativeID',how='left')

In [85]:
#验证集合并特征
test = pd.merge(test,user_installedapps_count,on='userID',how='left')
test = pd.merge(test,user,on='userID',how='left')
test = pd.merge(test,ad,on='creativeID',how='left')

In [86]:
train.head()


Out[86]:
clickTime creativeID userID positionID connectionType telecomsOperator clickTime_day clickTime_hour clickTime_minute sitesetID ... residence hometown_city hometown_province residence_city residence_province adID camgaignID advertiserID appID appPlatform
0 250000 5616 2378636 4292 1 1 25 0 0 0 ... 801 10 2 1 8 333 139 10 434 1
1 250000 3395 1361396 7219 1 1 25 0 0 0 ... 1301 1 13 1 13 2514 139 10 434 1
2 250000 784 1172949 3347 2 1 25 0 0 0 ... 1103 0 0 3 11 3242 375 80 14 2
3 250000 1456 1318946 4292 2 1 25 0 0 0 ... 1901 1 19 1 19 3379 411 3 465 1
4 250000 5747 1367085 3347 1 3 25 0 0 0 ... 609 9 6 9 6 1896 105 80 14 2

5 rows × 28 columns


In [87]:
#保险起见,爱你,就储存吧
train.to_csv('./pre/train28.csv',index=None)

In [88]:
test.head()


Out[88]:
clickTime creativeID userID positionID connectionType telecomsOperator clickTime_day clickTime_hour clickTime_minute sitesetID ... residence hometown_city hometown_province residence_city residence_province adID camgaignID advertiserID appID appPlatform
0 310000 3745 1164848 3451 1 3 31 0 0 0 ... 605 5 6 5 6 1166 430 80 14 2
1 310000 2284 2127247 1613 1 3 31 0 0 0 ... 2301 5 22 1 23 1388 325 80 14 2
2 310000 1456 2769125 5510 2 1 31 0 0 0 ... 1502 0 0 2 15 3379 411 3 465 1
3 310000 4565 9762 4113 2 3 31 0 0 0 ... 2407 5 24 7 24 3593 632 3 465 1
4 310000 49 2513636 3615 1 3 31 0 0 0 ... 307 7 3 7 3 1469 535 80 14 2

5 rows × 28 columns


In [89]:
#保险起见,爱你,就储存吧
test.to_csv('./pre/test26.csv',index=None)

In [90]:
train.dtypes


Out[90]:
clickTime               int64
creativeID              int64
userID                  int64
positionID              int64
connectionType          int64
telecomsOperator        int64
clickTime_day           int64
clickTime_hour          int64
clickTime_minute        int64
sitesetID               int64
positionType            int64
user_appID_count      float64
age                     int64
gender                  int64
education               int64
marriageStatus          int64
haveBaby                int64
hometown                int64
residence               int64
hometown_city           int64
hometown_province       int32
residence_city          int64
residence_province      int32
adID                    int64
camgaignID              int64
advertiserID            int64
appID                   int64
appPlatform             int64
dtype: object

In [91]:
train = train.fillna(0)
test = test.fillna(0)
train.dtypes


Out[91]:
clickTime               int64
creativeID              int64
userID                  int64
positionID              int64
connectionType          int64
telecomsOperator        int64
clickTime_day           int64
clickTime_hour          int64
clickTime_minute        int64
sitesetID               int64
positionType            int64
user_appID_count      float64
age                     int64
gender                  int64
education               int64
marriageStatus          int64
haveBaby                int64
hometown                int64
residence               int64
hometown_city           int64
hometown_province       int32
residence_city          int64
residence_province      int32
adID                    int64
camgaignID              int64
advertiserID            int64
appID                   int64
appPlatform             int64
dtype: object

In [92]:
train_user_appID_count =  train[['user_appID_count']]
test_user_appID_count =  test[['user_appID_count']]
del train['user_appID_count'],test['user_appID_count']

In [93]:
# One-hot encode train and test TOGETHER so both share the same category
# space (unseen test categories would otherwise break transform), then
# split the sparse result back by the train row count.
oneEnc = OneHotEncoder()
data_one = pd.concat([train,test])
data_one = oneEnc.fit_transform(data_one)
train_one = data_one[:train.shape[0]]
test_one = data_one[train.shape[0]:]

In [94]:
print train_one.shape
print user_installedapps_tfv[:train.shape[0]].shape
print train_user_appID_count.shape
print train.shape


(861372, 987038)
(861372, 111975)
(861372, 1)
(861372, 27)

In [95]:
train_user_appID_count.values


Out[95]:
array([[ 64.],
       [  0.],
       [ 67.],
       ..., 
       [ 64.],
       [  0.],
       [ 92.]])

In [100]:
# Append the per-user app tf-idf features to the one-hot matrices.
# FIX: capture the train row count BEFORE reassigning `train` — the
# original read `train.shape[0]` on the second line after `train` had
# already been replaced by the hstack result, which only worked because
# hstack happens to preserve the row count.
n_train = train.shape[0]
train = hstack([train_one, user_installedapps_tfv[:n_train]])
test = hstack([test_one, user_installedapps_tfv[n_train:]])




In [ ]:
# #输出训练集和测试集
# with open('train.pkl','w') as f:
#     pickle.dump(train,f)
# with open('test.pkl','w') as f:
#     pickle.dump(test,f)
# #读取训练集和测试集
# with open('train.pkl','r') as f:
#     train = pickle.load(f)
# with open('test.pkl','r') as f:
#     test = pickle.load(f)

In [97]:
from sklearn.linear_model import LogisticRegression
print 'LogisticRegression Logistic回归'
# Plain logistic regression on the sparse one-hot + tf-idf feature matrix.
lr = LogisticRegression(n_jobs=-1,random_state=2017)
lr.fit(train,train_label)
# Training-set score only (optimistic; no held-out validation here).
pred = lr.predict_proba(train)[:,1]
print '训练得分',logloss(train_label,pred)
# pred = lr.predict_proba(test)[:,1]
# print '验证得分',logloss(test_label,pred)


LogisticRegression Logistic回归
训练得分 0.0797782026628

In [101]:
# 取消了test线下测试
results = lr.predict_proba(test)[:,1]
# print '验证得分',logloss(test_label,pred)


验证得分
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-101-a15274632c27> in <module>()
      1 results = lr.predict_proba(test)[:,1]
----> 2 print '验证得分',logloss(test_label,pred)

<ipython-input-2-8353cd4f7e12> in logloss(act, pred)
      5   pred = sp.maximum(epsilon, pred)
      6   pred = sp.minimum(1-epsilon, pred)
----> 7   ll = -sp.mean(act*sp.log(pred) + sp.subtract(1,act)*sp.log(1-pred))
      8   return ll

C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\ops.py in wrapper(left, right, name, na_op)
    713                 lvalues = lvalues.values
    714 
--> 715         result = wrap_results(safe_na_op(lvalues, rvalues))
    716         return construct_result(
    717             left,

C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\ops.py in safe_na_op(lvalues, rvalues)
    674         try:
    675             with np.errstate(all='ignore'):
--> 676                 return na_op(lvalues, rvalues)
    677         except Exception:
    678             if isinstance(rvalues, ABCSeries):

C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\ops.py in na_op(x, y)
    650         try:
    651             result = expressions.evaluate(op, str_rep, x, y,
--> 652                                           raise_on_error=True, **eval_kwargs)
    653         except TypeError:
    654             if isinstance(y, (np.ndarray, ABCSeries, pd.Index)):

C:\ProgramData\Anaconda2\lib\site-packages\pandas\computation\expressions.py in evaluate(op, op_str, a, b, raise_on_error, use_numexpr, **eval_kwargs)
    208     if use_numexpr:
    209         return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error,
--> 210                          **eval_kwargs)
    211     return _evaluate_standard(op, op_str, a, b, raise_on_error=raise_on_error)
    212 

C:\ProgramData\Anaconda2\lib\site-packages\pandas\computation\expressions.py in _evaluate_numexpr(op, op_str, a, b, raise_on_error, truediv, reversed, **eval_kwargs)
    119 
    120     if result is None:
--> 121         result = _evaluate_standard(op, op_str, a, b, raise_on_error)
    122 
    123     return result

C:\ProgramData\Anaconda2\lib\site-packages\pandas\computation\expressions.py in _evaluate_standard(op, op_str, a, b, raise_on_error, **eval_kwargs)
     61         _store_test_result(False)
     62     with np.errstate(all='ignore'):
---> 63         return op(a, b)
     64 
     65 

ValueError: operands could not be broadcast together with shapes (338489,) (861372,) 

In [103]:
#输出结果
# a = pd.DataFrame({'instanceID':pd.read_csv('./pre/test.csv')['instanceID'],'prob':pred})
#输出
test1 = pd.read_csv('./pre/test.csv')
test1['prob'] = results
test1= test1[['instanceID','prob']]
test1.to_csv('./pre/submission.csv',index=None)
submission =  pd.read_csv('./pre/submission.csv')

In [ ]:
#import os
#os.system('shutdown -s')

In [104]:
submission


Out[104]:
instanceID prob
0 1 0.001197
1 2 0.006299
2 3 0.006338
3 4 0.013193
4 5 0.002560
5 6 0.011754
6 7 0.025130
7 8 0.012672
8 9 0.003862
9 10 0.010393
10 11 0.012863
11 12 0.008053
12 13 0.008431
13 14 0.013405
14 15 0.083680
15 16 0.001964
16 17 0.015090
17 18 0.006288
18 19 0.007944
19 20 0.013068
20 21 0.010861
21 22 0.015490
22 23 0.027935
23 24 0.005580
24 25 0.003735
25 26 0.008332
26 27 0.002650
27 28 0.002326
28 29 0.010389
29 30 0.010008
... ... ...
338459 338460 0.004043
338460 338461 0.013022
338461 338462 0.042818
338462 338463 0.011095
338463 338464 0.000447
338464 338465 0.018586
338465 338466 0.005391
338466 338467 0.006847
338467 338468 0.006535
338468 338469 0.057930
338469 338470 0.151786
338470 338471 0.012775
338471 338472 0.017977
338472 338473 0.034548
338473 338474 0.004017
338474 338475 0.149101
338475 338476 0.011452
338476 338477 0.018084
338477 338478 0.015030
338478 338479 0.204649
338479 338480 0.053037
338480 338481 0.012215
338481 338482 0.029022
338482 338483 0.036207
338483 338484 0.005004
338484 338485 0.018215
338485 338486 0.001162
338486 338487 0.021712
338487 338488 0.010121
338488 338489 0.003414

338489 rows × 2 columns