思路:

拼接所有能合并的特征,连接statead,利用贝叶斯处理user缺省值

其他的能拆就拆,然后全部one-hot

对于AppID,先把一个用户的AppID连接在一起,然后使用tf-idf处理,得到App特征

上下两个合起来,裸跑LogisticRegression,生成submission,跑全量数据(17-28)在线上30%评分在0.10305左右。

没有代码重构


In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import numpy as np
import pickle
import math

In [2]:
#评分函数
# Scoring function: mean binary log-loss (cross-entropy).
import scipy as sp  # kept so other cells that reference `sp` still work


def logloss(act, pred):
    """Return the mean binary log-loss of probabilities `pred` against 0/1 labels `act`.

    Predictions are clipped to [epsilon, 1 - epsilon] to avoid log(0).
    NOTE: the original used `sp.maximum` / `sp.mean` / `sp.log` — scipy's
    deprecated top-level numpy aliases, removed in modern scipy; numpy is
    used directly here, producing identical values.
    """
    epsilon = 1e-15
    act = np.asarray(act)
    pred = np.clip(pred, epsilon, 1 - epsilon)
    return -np.mean(act * np.log(pred) + (1 - act) * np.log(1 - pred))

In [4]:
train = pd.read_csv('./pre/train.csv')
test = pd.read_csv('./pre/test.csv')

train.head()


Out[4]:
label clickTime conversionTime creativeID userID positionID connectionType telecomsOperator
0 0 170000 NaN 3089 2798058 293 1 1
1 0 170000 NaN 1259 463234 6161 1 2
2 0 170000 NaN 4465 1857485 7434 4 1
3 0 170000 NaN 1004 2038823 977 1 1
4 0 170000 NaN 1887 2015141 3688 1 1

In [25]:
# statead = pd.read_csv('./statead.csv')
# statead.head()


Out[25]:
creativeID brand successclick successconversion
0 4079 191 2890.0 10
1 4565 56 449000.0 11622
2 3170 1400 387.0 5
3 6566 1400 224.0 1
4 5187 56 141000.0 2277

In [27]:
test.head()


Out[27]:
instanceID label clickTime creativeID userID positionID connectionType telecomsOperator clickTime_day clickTime_hour clickTime_minute sitesetID_x positionType_x sitesetID_y positionType_y brand successclick successconversion
0 1 -1 310000 3745 1164848 3451 1 3 31 0 0 0 1 0 1 191 11946.0 18
1 2 -1 310000 2284 2127247 1613 1 3 31 0 0 0 1 0 1 191 3745.0 11
2 3 -1 310000 1456 2769125 5510 2 1 31 0 0 0 1 0 1 56 136000.0 3452
3 4 -1 310000 4565 9762 4113 2 3 31 0 0 0 1 0 1 56 449000.0 11622
4 5 -1 310000 49 2513636 3615 1 3 31 0 0 0 1 0 1 191 51.0 0

In [5]:
#时间离散化
# Discretize clickTime (integer encoded as DDHHMM) into day/hour/minute
# columns, using vectorized integer arithmetic instead of per-element map.
train['clickTime_day'] = train['clickTime'] // 10000
train['clickTime_hour'] = train['clickTime'] // 100 % 100
train['clickTime_minute'] = train['clickTime'] % 100

In [6]:
train.groupby(['clickTime_day'])['label'].value_counts()


Out[6]:
clickTime_day  label
17             0        287089
               1          7464
18             0        155890
               1          4101
19             0        100872
               1          3286
20             0        201406
               1          5056
21             0        301475
               1          7121
22             0        318550
               1          7371
23             0        280826
               1          7607
24             0        277860
               1          7382
25             0        259457
               1          7376
26             0        290089
               1          7736
27             0        289065
               1          7649
28             0        271291
               1          7343
29             0        294307
               1          7462
30             0        328089
               1          6308
Name: label, dtype: int64

In [108]:
# 将第28天作为验证集  (集第一次更新)
# proof = train[train.clickTime_day==28]
train = train[(train.clickTime_day>=17 ) & (train.clickTime_day<= 28)]

In [7]:
print test.shape,train.shape


(338489, 8) (3749528, 11)

In [8]:
#时间离散化
# Same DDHHMM discretization as for the training set, applied to `test`,
# done with vectorized integer division/modulo rather than `.map(lambda)`.
test['clickTime_day'] = test['clickTime'] // 10000
test['clickTime_hour'] = test['clickTime'] // 100 % 100
test['clickTime_minute'] = test['clickTime'] % 100

In [9]:
test.groupby(['clickTime_hour'])['label'].value_counts()


Out[9]:
clickTime_hour  label
0               -1        7545
1               -1        4785
2               -1        3159
3               -1        2478
4               -1        2643
5               -1        3329
6               -1        6014
7               -1       10554
8               -1       13330
9               -1       15433
10              -1       17187
11              -1       16355
12              -1       17028
13              -1       17776
14              -1       19110
15              -1       19717
16              -1       18122
17              -1       18839
18              -1       20482
19              -1       23049
20              -1       22770
21              -1       24124
22              -1       20210
23              -1       14450
Name: label, dtype: int64

In [3]:
#position直接加上去,LogisticRegression Logistic回归
#的训练得分 0.120106201117,可见position特征用处不大
position = pd.read_csv('./pre/position.csv')
# train = pd.merge(train,statead,position,on='positionID',how='left')
# test = pd.merge(test,statead,position,on='positionID',how='left')
position.head()


Out[3]:
positionID sitesetID positionType
0 2150 1 0
1 2579 1 0
2 3322 1 0
3 5726 1 0
4 4522 2 0

In [26]:
# train = pd.merge(train,statead,on='creativeID',how='left')
# test = pd.merge(test,statead,on='creativeID',how='left')

In [23]:
# statead.head()


Out[23]:
creativenessID brand successclick successconversion
0 4079 191 2890.0 10
1 4565 56 449000.0 11622
2 3170 1400 387.0 5
3 6566 1400 224.0 1
4 5187 56 141000.0 2277

In [73]:
# feature_name: the full feature list (numerical + categorical);
# categorical_feature: the columns that are ID-like / categorical.
# The target `label` and the leaky `conversionTime` column are excluded.
_excluded = {'label', 'conversionTime'}
feature_name = [col for col in train.columns if col not in _excluded]
categorical_feature = ['creativeID','userID','positionID','connectionType','telecomsOperator']

In [74]:
#去掉除label,convertiontime的第二次数据集
train_label = train['label']
train = train[feature_name]
test_label = test['label']
test = test[feature_name]

In [75]:
#添加appID特征(tfidf)
user_installedapps = pd.read_csv('./pre/user_installedapps.csv')
user_installedapps_count = user_installedapps.groupby('userID').agg(len).reset_index()#计数特征

In [76]:
user_installedapps.head()


Out[76]:
userID appID
0 1 357
1 1 360
2 1 362
3 1 365
4 1 375

In [77]:
user_installedapps_count.columns = ['userID','user_appID_count']
# Collapse each user's installed appIDs into one space-separated "app<ID>"
# document, e.g. userID 2798058 -> "app360 app361 app362 app375 ..." —
# this string later feeds TfidfVectorizer as that user's "text".
user_installedapps = user_installedapps.groupby('userID').agg(lambda x:' '.join(['app'+str(s) for s in x.values])).reset_index()

In [78]:
user_id_all = pd.concat([train.userID,test.userID],axis=0)
user_id_all = pd.DataFrame(user_id_all,columns=['userID'])
user_id_all.head()


Out[78]:
userID
0 2378636
1 1361396
2 1172949
3 1318946
4 1367085

In [79]:
#不同用户的先提取出来
user_installedapps = pd.merge(user_id_all.drop_duplicates(),user_installedapps,on='userID',how='left')
user_installedapps = user_installedapps.fillna('Missing')
#至此,user_installedapps处理完毕

In [80]:
tfv = TfidfVectorizer()
tfv.fit(user_installedapps.appID)


Out[80]:
TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [81]:
#按照顺序转化为tfidf特征
user_installedapps = pd.merge(user_id_all,user_installedapps,on='userID',how='left')
user_installedapps = user_installedapps.fillna('Missing')
user_installedapps_tfv = tfv.transform(user_installedapps.appID)

In [22]:
#保险起见,爱你,就储存吧
user_installedapps.to_csv('./pre/user-app.csv',index=None)

In [11]:
def featureManipulation(dtfm, colList, func):
    '''Impute, in place, every zero ("missing") value of every column in
    colList of dataframe dtfm.

    `func(dtfm, col)` must return a DataFrame indexed by candidate
    replacement values whose single column holds the number of zero cells
    to overwrite with each value.  Zero cells are chosen with random
    sampling, so the result is NOT deterministic.
    '''
    for col in colList:
        pr_col = func(dtfm, col)
        for row in pr_col.iterrows():
            # cells still holding the sentinel value 0 for this column
            zeroSample = dtfm[col][(dtfm[col] == 0)]
            replace = row[0]  # candidate replacement value (the index)
            num = row[1][col].astype(int)  # how many zeros to replace with it
            if num > len(zeroSample):
                # more replacements requested than zeros remain: cap and report
                print(replace)
                num = len(zeroSample)
            if num <= 0:
                continue
            # randomly pick `num` zero cells and overwrite them in place
            smpl = zeroSample.sample(num)
            smpl = smpl.replace(0, replace)
            dtfm[col].update(smpl)
    print(dtfm)


Out[11]:
0    0.247033
1    0.227382
3    0.183746
2    0.177069
4    0.087440
5    0.062026
6    0.010541
7    0.004763
Name: education, dtype: float64

In [12]:
# 这里是对user的例子
user = pd.read_csv('./pre/user.csv')
user.head()
def sln(dtfm, col):
    """Estimate how many zero cells of column `col` should be replaced by
    each observed non-zero value, proportionally to its empirical frequency.

    Returns a one-column DataFrame indexed by value with rounded counts.
    """
    nonzero = dtfm.loc[dtfm[col] > 0, col]
    n_zero = (dtfm[col] == 0).sum()
    counts = nonzero.value_counts() / len(nonzero) * n_zero
    return counts.apply(np.round).to_frame()
featureManipulation(user, ['age','gender','education','hometown','residence'], sln)


Out[12]:
0     0.104905
15    0.053536
16    0.042894
17    0.041247
19    0.041243
13    0.040872
20    0.040034
18    0.039410
14    0.038271
25    0.035427
21    0.034332
26    0.033149
22    0.032702
12    0.031670
23    0.031274
27    0.030278
28    0.028647
24    0.028334
11    0.028048
29    0.024487
30    0.019674
31    0.016924
33    0.015341
32    0.014844
10    0.013925
34    0.012143
35    0.011802
36    0.010457
37    0.009451
38    0.007586
        ...   
6     0.001399
5     0.001210
54    0.000782
55    0.000753
4     0.000698
56    0.000640
57    0.000631
58    0.000544
59    0.000486
60    0.000392
61    0.000329
62    0.000147
63    0.000104
66    0.000102
65    0.000095
64    0.000083
67    0.000078
80    0.000059
68    0.000052
69    0.000044
70    0.000044
71    0.000037
78    0.000032
73    0.000031
74    0.000031
76    0.000030
75    0.000029
72    0.000027
77    0.000026
79    0.000020
Name: age, dtype: float64

In [13]:
user.isnull().values.any()


Out[13]:
False

In [ ]:
# Split the 4-digit region codes (province*100 + city) into separate
# province and city columns, for both hometown and residence.
user['hometown_city'] = user['hometown'] % 100
user['hometown_province'] = (user['hometown'] // 100).astype('int')
user['residence_city'] = user['residence'] % 100
user['residence_province'] = (user['residence'] // 100).astype('int')

In [4]:
ad = pd.read_csv('./pre/ad.csv')
ad.head()


Out[4]:
creativeID adID camgaignID advertiserID appID appPlatform
0 4079 2318 147 80 14 2
1 4565 3593 632 3 465 1
2 3170 1593 205 54 389 1
3 6566 2390 205 54 389 1
4 5187 411 564 3 465 1

In [84]:
#合并特征
train = pd.merge(train,user_installedapps_count,on='userID',how='left')
train = pd.merge(train,user,on='userID',how='left')
train = pd.merge(train,ad,on='creativeID',how='left')

In [85]:
#验证集合并特征
test = pd.merge(test,user_installedapps_count,on='userID',how='left')
test = pd.merge(test,user,on='userID',how='left')
test = pd.merge(test,ad,on='creativeID',how='left')

In [86]:
train.head()


Out[86]:
clickTime creativeID userID positionID connectionType telecomsOperator clickTime_day clickTime_hour clickTime_minute sitesetID ... residence hometown_city hometown_province residence_city residence_province adID camgaignID advertiserID appID appPlatform
0 250000 5616 2378636 4292 1 1 25 0 0 0 ... 801 10 2 1 8 333 139 10 434 1
1 250000 3395 1361396 7219 1 1 25 0 0 0 ... 1301 1 13 1 13 2514 139 10 434 1
2 250000 784 1172949 3347 2 1 25 0 0 0 ... 1103 0 0 3 11 3242 375 80 14 2
3 250000 1456 1318946 4292 2 1 25 0 0 0 ... 1901 1 19 1 19 3379 411 3 465 1
4 250000 5747 1367085 3347 1 3 25 0 0 0 ... 609 9 6 9 6 1896 105 80 14 2

5 rows × 28 columns


In [87]:
#保险起见,爱你,就储存吧
train.to_csv('./pre/train28.csv',index=None)

In [88]:
test.head()


Out[88]:
clickTime creativeID userID positionID connectionType telecomsOperator clickTime_day clickTime_hour clickTime_minute sitesetID ... residence hometown_city hometown_province residence_city residence_province adID camgaignID advertiserID appID appPlatform
0 310000 3745 1164848 3451 1 3 31 0 0 0 ... 605 5 6 5 6 1166 430 80 14 2
1 310000 2284 2127247 1613 1 3 31 0 0 0 ... 2301 5 22 1 23 1388 325 80 14 2
2 310000 1456 2769125 5510 2 1 31 0 0 0 ... 1502 0 0 2 15 3379 411 3 465 1
3 310000 4565 9762 4113 2 3 31 0 0 0 ... 2407 5 24 7 24 3593 632 3 465 1
4 310000 49 2513636 3615 1 3 31 0 0 0 ... 307 7 3 7 3 1469 535 80 14 2

5 rows × 28 columns


In [89]:
#保险起见,爱你,就储存吧
test.to_csv('./pre/test26.csv',index=None)

In [90]:
train.dtypes


Out[90]:
clickTime               int64
creativeID              int64
userID                  int64
positionID              int64
connectionType          int64
telecomsOperator        int64
clickTime_day           int64
clickTime_hour          int64
clickTime_minute        int64
sitesetID               int64
positionType            int64
user_appID_count      float64
age                     int64
gender                  int64
education               int64
marriageStatus          int64
haveBaby                int64
hometown                int64
residence               int64
hometown_city           int64
hometown_province       int32
residence_city          int64
residence_province      int32
adID                    int64
camgaignID              int64
advertiserID            int64
appID                   int64
appPlatform             int64
dtype: object

In [91]:
train = train.fillna(0)
test = test.fillna(0)
train.dtypes


Out[91]:
clickTime               int64
creativeID              int64
userID                  int64
positionID              int64
connectionType          int64
telecomsOperator        int64
clickTime_day           int64
clickTime_hour          int64
clickTime_minute        int64
sitesetID               int64
positionType            int64
user_appID_count      float64
age                     int64
gender                  int64
education               int64
marriageStatus          int64
haveBaby                int64
hometown                int64
residence               int64
hometown_city           int64
hometown_province       int32
residence_city          int64
residence_province      int32
adID                    int64
camgaignID              int64
advertiserID            int64
appID                   int64
appPlatform             int64
dtype: object

In [92]:
train_user_appID_count =  train[['user_appID_count']]
test_user_appID_count =  test[['user_appID_count']]
del train['user_appID_count'],test['user_appID_count']

In [93]:
# One-hot encode train and test TOGETHER so both share the same category
# space (unseen test categories would otherwise break transform), then
# split the sparse result back by the train row count.
oneEnc = OneHotEncoder()
data_one = pd.concat([train,test])
data_one = oneEnc.fit_transform(data_one)
train_one = data_one[:train.shape[0]]
test_one = data_one[train.shape[0]:]

In [94]:
print train_one.shape
print user_installedapps_tfv[:train.shape[0]].shape
print train_user_appID_count.shape
print train.shape


(861372, 987038)
(861372, 111975)
(861372, 1)
(861372, 27)

In [95]:
train_user_appID_count.values


Out[95]:
array([[ 64.],
       [  0.],
       [ 67.],
       ..., 
       [ 64.],
       [  0.],
       [ 92.]])

In [100]:
# Append the per-user app tf-idf features to the one-hot matrices.
# FIX: capture the train row count BEFORE reassigning `train` — the
# original read `train.shape[0]` on the second line after `train` had
# already been replaced by the hstack result, which only worked because
# hstack happens to preserve the row count.
n_train = train.shape[0]
train = hstack([train_one, user_installedapps_tfv[:n_train]])
test = hstack([test_one, user_installedapps_tfv[n_train:]])




In [ ]:
# #输出训练集和测试集
# with open('train.pkl','w') as f:
#     pickle.dump(train,f)
# with open('test.pkl','w') as f:
#     pickle.dump(test,f)
# #读取训练集和测试集
# with open('train.pkl','r') as f:
#     train = pickle.load(f)
# with open('test.pkl','r') as f:
#     test = pickle.load(f)

In [97]:
from sklearn.linear_model import LogisticRegression
print 'LogisticRegression Logistic回归'
# Plain logistic regression on the sparse one-hot + tf-idf feature matrix.
lr = LogisticRegression(n_jobs=-1,random_state=2017)
lr.fit(train,train_label)
# Training-set score only (optimistic; no held-out validation here).
pred = lr.predict_proba(train)[:,1]
print '训练得分',logloss(train_label,pred)
# pred = lr.predict_proba(test)[:,1]
# print '验证得分',logloss(test_label,pred)


LogisticRegression Logistic回归
训练得分 0.0797782026628

In [101]:
# 取消了test线下测试
results = lr.predict_proba(test)[:,1]
# print '验证得分',logloss(test_label,pred)


验证得分
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-101-a15274632c27> in <module>()
      1 results = lr.predict_proba(test)[:,1]
----> 2 print '验证得分',logloss(test_label,pred)

<ipython-input-2-8353cd4f7e12> in logloss(act, pred)
      5   pred = sp.maximum(epsilon, pred)
      6   pred = sp.minimum(1-epsilon, pred)
----> 7   ll = -sp.mean(act*sp.log(pred) + sp.subtract(1,act)*sp.log(1-pred))
      8   return ll

C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\ops.py in wrapper(left, right, name, na_op)
    713                 lvalues = lvalues.values
    714 
--> 715         result = wrap_results(safe_na_op(lvalues, rvalues))
    716         return construct_result(
    717             left,

C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\ops.py in safe_na_op(lvalues, rvalues)
    674         try:
    675             with np.errstate(all='ignore'):
--> 676                 return na_op(lvalues, rvalues)
    677         except Exception:
    678             if isinstance(rvalues, ABCSeries):

C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\ops.py in na_op(x, y)
    650         try:
    651             result = expressions.evaluate(op, str_rep, x, y,
--> 652                                           raise_on_error=True, **eval_kwargs)
    653         except TypeError:
    654             if isinstance(y, (np.ndarray, ABCSeries, pd.Index)):

C:\ProgramData\Anaconda2\lib\site-packages\pandas\computation\expressions.py in evaluate(op, op_str, a, b, raise_on_error, use_numexpr, **eval_kwargs)
    208     if use_numexpr:
    209         return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error,
--> 210                          **eval_kwargs)
    211     return _evaluate_standard(op, op_str, a, b, raise_on_error=raise_on_error)
    212 

C:\ProgramData\Anaconda2\lib\site-packages\pandas\computation\expressions.py in _evaluate_numexpr(op, op_str, a, b, raise_on_error, truediv, reversed, **eval_kwargs)
    119 
    120     if result is None:
--> 121         result = _evaluate_standard(op, op_str, a, b, raise_on_error)
    122 
    123     return result

C:\ProgramData\Anaconda2\lib\site-packages\pandas\computation\expressions.py in _evaluate_standard(op, op_str, a, b, raise_on_error, **eval_kwargs)
     61         _store_test_result(False)
     62     with np.errstate(all='ignore'):
---> 63         return op(a, b)
     64 
     65 

ValueError: operands could not be broadcast together with shapes (338489,) (861372,) 

In [103]:
#输出结果
# a = pd.DataFrame({'instanceID':pd.read_csv('./pre/test.csv')['instanceID'],'prob':pred})
#输出
test1 = pd.read_csv('./pre/test.csv')
test1['prob'] = results
test1= test1[['instanceID','prob']]
test1.to_csv('./pre/submission.csv',index=None)
submission =  pd.read_csv('./pre/submission.csv')

In [ ]:
#import os
#os.system('shutdown -s')

In [104]:
submission


Out[104]:
instanceID prob
0 1 0.001197
1 2 0.006299
2 3 0.006338
3 4 0.013193
4 5 0.002560
5 6 0.011754
6 7 0.025130
7 8 0.012672
8 9 0.003862
9 10 0.010393
10 11 0.012863
11 12 0.008053
12 13 0.008431
13 14 0.013405
14 15 0.083680
15 16 0.001964
16 17 0.015090
17 18 0.006288
18 19 0.007944
19 20 0.013068
20 21 0.010861
21 22 0.015490
22 23 0.027935
23 24 0.005580
24 25 0.003735
25 26 0.008332
26 27 0.002650
27 28 0.002326
28 29 0.010389
29 30 0.010008
... ... ...
338459 338460 0.004043
338460 338461 0.013022
338461 338462 0.042818
338462 338463 0.011095
338463 338464 0.000447
338464 338465 0.018586
338465 338466 0.005391
338466 338467 0.006847
338467 338468 0.006535
338468 338469 0.057930
338469 338470 0.151786
338470 338471 0.012775
338471 338472 0.017977
338472 338473 0.034548
338473 338474 0.004017
338474 338475 0.149101
338475 338476 0.011452
338476 338477 0.018084
338477 338478 0.015030
338478 338479 0.204649
338479 338480 0.053037
338480 338481 0.012215
338481 338482 0.029022
338482 338483 0.036207
338483 338484 0.005004
338484 338485 0.018215
338485 338486 0.001162
338486 338487 0.021712
338487 338488 0.010121
338488 338489 0.003414

338489 rows × 2 columns