思路:

拼接所有能合并的特征 能拆就拆 然后全部one-hot

对于AppID,先把一个用户的AppID连接在一起,然后使用tf-idf处理,得到App特征

上下两个合起来,裸跑LogisticRegression


In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import numpy as np
import pickle
import math
import cPickle
import xgboost as xgb

In [2]:
#评分函数
import scipy as sp
def logloss(act, pred):
  epsilon = 1e-15
  pred = sp.maximum(epsilon, pred)
  pred = sp.minimum(1-epsilon, pred)
  ll = -sp.mean(act*sp.log(pred) + sp.subtract(1,act)*sp.log(1-pred))
  return ll

In [8]:
train = pd.read_csv('./pre/train.csv')
test = pd.read_csv('./pre/test.csv')

In [25]:
# statead = pd.read_csv('./statead.csv')
# statead.head()


Out[25]:
creativeID brand successclick successconversion
0 4079 191 2890.0 10
1 4565 56 449000.0 11622
2 3170 1400 387.0 5
3 6566 1400 224.0 1
4 5187 56 141000.0 2277

In [9]:
#时间离散化
train['clickTime_day'] = train['clickTime'].map(lambda x:int(x/10000))
train['clickTime_hour'] = train['clickTime'].map(lambda x:int(x/100%100))
train['clickTime_minute'] = train['clickTime'].map(lambda x:int(x%100))

In [10]:
train.groupby(['clickTime_day'])['label'].value_counts()


Out[10]:
clickTime_day  label
17             0        287089
               1          7464
18             0        155890
               1          4101
19             0        100872
               1          3286
20             0        201406
               1          5056
21             0        301475
               1          7121
22             0        318550
               1          7371
23             0        280826
               1          7607
24             0        277860
               1          7382
25             0        259457
               1          7376
26             0        290089
               1          7736
27             0        289065
               1          7649
28             0        271291
               1          7343
29             0        294307
               1          7462
30             0        328089
               1          6308
Name: label, dtype: int64

In [11]:
# 将第28天作为验证集  (集第一次更新)
# proof = train[train.clickTime_day==28]
train = train[(train.clickTime_day>=17 ) & (train.clickTime_day<= 28)]

In [12]:
print test.shape,train.shape


(338489, 8) (3113362, 11)

In [13]:
#时间离散化
test['clickTime_day'] = test['clickTime'].map(lambda x:int(x/10000))
test['clickTime_hour'] = test['clickTime'].map(lambda x:int(x/100%100))
test['clickTime_minute'] = test['clickTime'].map(lambda x:int(x%100))

In [30]:
test.groupby(['clickTime_hour'])['label'].value_counts()


Out[30]:
clickTime_hour  label
0               -1        7545
1               -1        4785
2               -1        3159
3               -1        2478
4               -1        2643
5               -1        3329
6               -1        6014
7               -1       10554
8               -1       13330
9               -1       15433
10              -1       17187
11              -1       16355
12              -1       17028
13              -1       17776
14              -1       19110
15              -1       19717
16              -1       18122
17              -1       18839
18              -1       20482
19              -1       23049
20              -1       22770
21              -1       24124
22              -1       20210
23              -1       14450
Name: label, dtype: int64

In [14]:
#position直接加上去,LogisticRegression Logistic回归
#的训练得分 0.120106201117,可见position特征用处不大
position = pd.read_csv('./pre/position.csv')
# train = pd.merge(train,statead,position,on='positionID',how='left')
# test = pd.merge(test,statead,position,on='positionID',how='left')
position.head()


Out[14]:
positionID sitesetID positionType
0 2150 1 0
1 2579 1 0
2 3322 1 0
3 5726 1 0
4 4522 2 0

In [15]:
# train = pd.merge(train,statead,on='creativeID',how='left')
# test = pd.merge(test,statead,on='creativeID',how='left')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-15-7d2972eab77f> in <module>()
----> 1 train = pd.merge(train,statead,on='creativeID',how='left')
      2 test = pd.merge(test,statead,on='creativeID',how='left')

NameError: name 'statead' is not defined

In [23]:
# statead.head()


Out[23]:
creativenessID brand successclick successconversion
0 4079 191 2890.0 10
1 4565 56 449000.0 11622
2 3170 1400 387.0 5
3 6566 1400 224.0 1
4 5187 56 141000.0 2277

In [16]:
#numberical feature数字特征,feature_name总特征,categorical_feature 分类特征
#我们去掉label,convertiontime跑一次
feature_name = [a for a in train.columns if a not in ['label','conversionTime']]
categorical_feature = ['creativeID','userID','positionID','connectionType','telecomsOperator']

In [17]:
#去掉除label,convertiontime的第二次数据集
train_label = train['label']
train = train[feature_name]
test_label = test['label']
test = test[feature_name]

In [18]:
#添加appID特征(tfidf)
user_installedapps = pd.read_csv('./pre/user_installedapps.csv')
user_installedapps_count = user_installedapps.groupby('userID').agg(len).reset_index()#计数特征

In [19]:
user_installedapps.head()


Out[19]:
userID appID
0 1 357
1 1 360
2 1 362
3 1 365
4 1 375

In [20]:
user_installedapps_count.columns = ['userID','user_appID_count']
#2798058	app360 app361 app362 app375 app480 app481 app4  相当于app+value
user_installedapps = user_installedapps.groupby('userID').agg(lambda x:' '.join(['app'+str(s) for s in x.values])).reset_index()

In [21]:
user_id_all = pd.concat([train.userID,test.userID],axis=0)
user_id_all = pd.DataFrame(user_id_all,columns=['userID'])
user_id_all.head()


Out[21]:
userID
0 2798058
1 463234
2 1857485
3 2038823
4 2015141

In [22]:
#不同用户的先提取出来
user_installedapps = pd.merge(user_id_all.drop_duplicates(),user_installedapps,on='userID',how='left')
user_installedapps = user_installedapps.fillna('Missing')
#至此,user_installedapps处理完毕

In [23]:
tfv = TfidfVectorizer()
tfv.fit(user_installedapps.appID)


Out[23]:
TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [24]:
#按照顺序转化为tfidf特征
user_installedapps = pd.merge(user_id_all,user_installedapps,on='userID',how='left')
user_installedapps = user_installedapps.fillna('Missing')
user_installedapps_tfv = tfv.transform(user_installedapps.appID)

In [22]:
#保险起见,爱你,就储存吧
user_installedapps.to_csv('./pre/user-app.csv',index=None)

In [26]:
def featureManipulation(dtfm, colList, func):
    '''依次处理某一dataframe内__所有__col的__所有__零值'''
    for col in colList:
        pr_col = func(dtfm, col)
        for row in pr_col.iterrows():
            zeroSample = dtfm[col][(dtfm[col] == 0)]
            replace = row[0]
            num = row[1][col].astype(int)
            if num > len(zeroSample):
                print(replace)
                num = len(zeroSample)
            if num <= 0:
                continue
            smpl = zeroSample.sample(num)
            smpl = smpl.replace(0, replace)
            dtfm[col].update(smpl)
    print(dtfm)

In [27]:
# 这里是对user的例子
user = pd.read_csv('./pre/user.csv')
user.head()
def sln(dtfm, col):
    dtfm_col = dtfm[dtfm[col] > 0]
    pr_col = dtfm_col[col].value_counts()/len(dtfm_col[col])
    pr_col *= len(dtfm[col][(dtfm[col] == 0)])
    pr_col = pr_col.apply(np.round)
    pr_col = pr_col.to_frame()
    return pr_col
featureManipulation(user, ['age','gender','education','hometown','residence'], sln)


3207
3203
3206
          userID  age  gender  education  marriageStatus  haveBaby  hometown  \
0              1   42       1          1               2         0       512   
1              2   18       1          5               1         0      1403   
2              3   11       2          4               0         0      1006   
3              4   21       2          5               3         0       607   
4              5   22       2          3               0         0       101   
5              6   20       2          5               0         0       301   
6              7   17       1          5               0         0       313   
7              8   21       1          2               3         1      1607   
8              9   38       2          2               2         0      2203   
9             10   13       2          1               0         0       311   
10            11   17       2          3               2         1      2405   
11            12   15       2          1               0         0       116   
12            13   20       1          3               0         0       604   
13            14   32       1          2               0         0       803   
14            15   21       1          1               0         0       415   
15            16   19       1          1               0         0       510   
16            17   19       1          3               1         0      1403   
17            18   22       2          1               1         0       206   
18            19   28       1          2               1         0      1011   
19            20   47       1          3               2         0       203   
20            21   14       2          3               0         0      1812   
21            22   15       1          2               0         0       311   
22            23   16       1          3               0         0      1002   
23            24   20       1          2               0         0       403   
24            25   12       2          1               3         0      1003   
25            26   15       2          2               0         0       805   
26            27   34       2          2               0         1       705   
27            28   19       1          2               3         0       708   
28            29   18       1          1               0         0       309   
29            30   50       2          1               0         0      1401   
...          ...  ...     ...        ...             ...       ...       ...   
2805088  2805089   22       1          2               3         0       302   
2805089  2805090   18       1          1               0         0      2516   
2805090  2805091   31       1          4               2         0       314   
2805091  2805092   10       2          4               1         0      1504   
2805092  2805093   28       2          1               0         1       603   
2805093  2805094   35       2          2               2         1      2201   
2805094  2805095   27       2          2               1         0      2410   
2805095  2805096   17       1          4               0         0      1202   
2805096  2805097   28       2          3               2         1       306   
2805097  2805098   15       1          1               2         0       512   
2805098  2805099   21       1          6               1         0       606   
2805099  2805100   24       1          1               1         0      3001   
2805100  2805101   19       1          1               0         0       805   
2805101  2805102   16       2          3               0         0      1401   
2805102  2805103   50       1          1               2         0      1306   
2805103  2805104   36       1          2               2         0       115   
2805104  2805105   26       1          1               0         0       911   
2805105  2805106   15       2          1               1         0      1901   
2805106  2805107   13       1          5               1         0      2301   
2805107  2805108   26       2          5               1         0      2113   
2805108  2805109   15       1          5               1         0      1601   
2805109  2805110   18       1          2               1         0       804   
2805110  2805111   20       2          1               3         1      1504   
2805111  2805112   39       2          1               2         0      1101   
2805112  2805113   14       1          3               1         0       105   
2805113  2805114   25       1          4               0         0       404   
2805114  2805115   14       1          3               0         0      1901   
2805115  2805116   27       1          5               0         5       208   
2805116  2805117   34       2          1               2         1      1304   
2805117  2805118   51       1          2               0         0      1208   

         residence  
0              503  
1             1403  
2             1004  
3              607  
4             1301  
5             2301  
6              313  
7             1607  
8             2203  
9             2203  
10            1601  
11            2101  
12             601  
13             602  
14             513  
15            1802  
16            1403  
17             303  
18            1106  
19            1008  
20             104  
21            2404  
22             901  
23             212  
24            1003  
25             805  
26            1812  
27             503  
28             309  
29             218  
...            ...  
2805088        302  
2805089       2516  
2805090        303  
2805091       1504  
2805092       2407  
2805093       1501  
2805094       1602  
2805095        909  
2805096        306  
2805097        509  
2805098        601  
2805099       3001  
2805100        604  
2805101       2109  
2805102       1208  
2805103       2301  
2805104        911  
2805105       1901  
2805106       1901  
2805107       2113  
2805108       1601  
2805109        206  
2805110       1512  
2805111       1101  
2805112        805  
2805113       1204  
2805114       1901  
2805115       1901  
2805116       1101  
2805117       2601  

[2805118 rows x 8 columns]

In [13]:
user.isnull().values.any()


Out[13]:
False

In [28]:
user['hometown_city'] = user['hometown']%100
user['hometown_province'] = (user['hometown']/100).astype('int')
user['residence_city'] = user['residence']%100
user['residence_province'] = (user['residence']/100).astype('int')

In [29]:
ad = pd.read_csv('./pre/ad.csv')
ad.head()


Out[29]:
creativeID adID camgaignID advertiserID appID appPlatform
0 4079 2318 147 80 14 2
1 4565 3593 632 3 465 1
2 3170 1593 205 54 389 1
3 6566 2390 205 54 389 1
4 5187 411 564 3 465 1

In [30]:
#合并特征
train = pd.merge(train,user_installedapps_count,on='userID',how='left')
train = pd.merge(train,user,on='userID',how='left')
train = pd.merge(train,ad,on='creativeID',how='left')

In [31]:
#验证集合并特征
test = pd.merge(test,user_installedapps_count,on='userID',how='left')
test = pd.merge(test,user,on='userID',how='left')
test = pd.merge(test,ad,on='creativeID',how='left')

In [33]:
train.shape


Out[33]:
(3113362, 26)

In [34]:
#保险起见,爱你,就储存吧
train.to_csv('./pre/train17-28.csv',index=None)

In [51]:
train = pd.read_csv('./pre/train17-28.csv')
train


Out[51]:
clickTime creativeID userID positionID connectionType telecomsOperator clickTime_day clickTime_hour clickTime_minute user_appID_count ... residence hometown_city hometown_province residence_city residence_province adID camgaignID advertiserID appID appPlatform
0 170000 3089 2798058 293 1 1 17 0 0 44.0 ... 1301 7 2 1 13 1321 83 10 434 1
1 170000 1259 463234 6161 1 2 17 0 0 NaN ... 213 6 6 13 2 1535 685 80 14 2
2 170000 4465 1857485 7434 4 1 17 0 0 NaN ... 1502 5 8 2 15 147 460 3 465 1
3 170000 1004 2038823 977 1 1 17 0 0 69.0 ... 1001 6 5 1 10 411 564 3 465 1
4 170000 1887 2015141 3688 1 1 17 0 0 NaN ... 1001 1 10 1 10 369 144 84 360 1
5 170000 3293 1177829 3347 1 1 17 0 0 NaN ... 107 7 1 7 1 2891 685 80 14 2
6 170000 4793 1257450 1876 1 2 17 0 0 28.0 ... 1104 1 13 4 11 1225 484 80 14 2
7 170000 1456 764204 6086 1 3 17 0 0 78.0 ... 308 8 3 8 3 3379 411 3 465 1
8 170000 4465 1438585 2426 2 3 17 0 0 NaN ... 601 3 8 1 6 147 460 3 465 1
9 170000 985 1750317 5414 1 3 17 0 0 88.0 ... 2102 2 21 2 21 86 503 3 465 1
10 170000 3465 2079582 6086 1 3 17 0 0 57.0 ... 307 11 3 7 3 86 503 3 465 1
11 170000 2764 2551467 977 1 1 17 0 0 48.0 ... 1103 11 2 3 11 147 460 3 465 1
12 170000 997 2120928 5648 1 3 17 0 0 NaN ... 1402 2 14 2 14 2982 661 80 14 2
13 170000 997 2292736 3498 1 1 17 0 0 NaN ... 201 1 2 1 2 2982 661 80 14 2
14 170000 4465 1119329 4635 3 3 17 0 0 91.0 ... 3104 4 31 4 31 147 460 3 465 1
15 170000 3465 2256911 5013 1 1 17 0 0 NaN ... 505 15 4 5 5 86 503 3 465 1
16 170000 3846 554021 932 1 1 17 0 0 NaN ... 2701 1 6 1 27 493 379 84 360 1
17 170000 6490 1666508 5347 1 1 17 0 0 28.0 ... 2501 3 2 1 25 1469 535 80 14 2
18 170000 5187 1082944 6850 2 3 17 0 0 63.0 ... 101 1 19 1 1 411 564 3 465 1
19 170000 1456 2437627 6086 1 3 17 0 0 32.0 ... 2407 8 28 7 24 3379 411 3 465 1
20 170000 4686 1658851 1613 1 1 17 0 0 46.0 ... 206 4 2 6 2 834 107 80 14 2
21 170000 4250 2009709 2831 1 1 17 0 0 NaN ... 103 3 13 3 1 1638 440 3 465 1
22 170000 4465 1746056 4113 1 2 17 0 0 90.0 ... 1901 6 6 1 19 147 460 3 465 1
23 170000 3575 1111108 6905 1 3 17 0 0 NaN ... 802 2 8 2 8 2757 70 80 14 2
24 170000 3531 402769 3347 1 1 17 0 0 NaN ... 1104 7 8 4 11 2581 300 80 14 2
25 170000 5187 2428743 6086 1 1 17 0 0 55.0 ... 307 6 3 7 3 411 564 3 465 1
26 170000 3465 90746 2931 1 1 17 0 0 37.0 ... 204 1 4 4 2 86 503 3 465 1
27 170000 2137 217320 2579 1 1 17 0 0 73.0 ... 1501 3 15 1 15 2148 179 84 360 1
28 170000 4250 1443907 6193 1 1 17 0 0 76.0 ... 604 12 4 4 6 1638 440 3 465 1
29 170000 726 912898 2150 1 1 17 0 0 61.0 ... 512 3 13 12 5 1791 497 15 383 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3113332 282359 1456 361690 6772 1 1 28 23 59 NaN ... 206 15 2 6 2 3379 411 3 465 1
3113333 282359 5610 2119514 249 1 2 28 23 59 NaN ... 1009 10 6 9 10 411 564 3 465 1
3113334 282359 1004 2766816 977 2 3 28 23 59 125.0 ... 1705 2 24 5 17 411 564 3 465 1
3113335 282359 4565 2515408 7197 1 2 28 23 59 43.0 ... 206 6 15 6 2 3593 632 3 465 1
3113336 282359 376 1123842 2579 1 1 28 23 59 NaN ... 402 5 4 2 4 3102 649 84 360 1
3113337 282359 3012 585710 4818 1 2 28 23 59 65.0 ... 1306 2 6 6 13 1576 139 10 434 1
3113338 282359 4250 1292804 1530 1 1 28 23 59 56.0 ... 2403 1 24 3 24 1638 440 3 465 1
3113339 282359 1887 1012568 1245 1 1 28 23 59 86.0 ... 705 1 29 5 7 369 144 84 360 1
3113340 282359 3465 1583203 5024 1 1 28 23 59 47.0 ... 906 1 2 6 9 86 503 3 465 1
3113341 282359 410 780027 7149 1 1 28 23 59 NaN ... 914 14 9 14 9 2477 217 44 421 1
3113342 282359 3093 1575499 2692 2 1 28 23 59 136.0 ... 1501 3 15 1 15 86 503 3 465 1
3113343 282359 2553 2477457 3688 1 2 28 23 59 NaN ... 1401 4 12 1 14 2519 60 3 465 1
3113344 282359 2553 168331 7366 1 1 28 23 59 NaN ... 1503 3 15 3 15 2519 60 3 465 1
3113345 282359 3586 55185 4867 2 1 28 23 59 63.0 ... 801 1 8 1 8 2238 637 3 465 1
3113346 282359 5209 240597 2579 1 2 28 23 59 29.0 ... 1204 4 12 4 12 293 82 84 360 1
3113347 282359 5588 284474 2150 1 1 28 23 59 61.0 ... 304 4 3 4 3 1502 23 15 383 1
3113348 282359 5209 240597 2579 1 2 28 23 59 29.0 ... 1204 4 12 4 12 293 82 84 360 1
3113349 282359 2764 1731672 7350 1 1 28 23 59 NaN ... 602 2 6 2 6 147 460 3 465 1
3113350 282359 985 2612385 4867 1 1 28 23 59 50.0 ... 2511 1 4 11 25 86 503 3 465 1
3113351 282359 4565 2072371 7262 1 1 28 23 59 NaN ... 1601 4 8 1 16 3593 632 3 465 1
3113352 282359 2982 1460744 3365 1 1 28 23 59 86.0 ... 101 3 4 1 1 3379 411 3 465 1
3113353 282359 4565 179405 2599 1 1 28 23 59 20.0 ... 2406 6 24 6 24 3593 632 3 465 1
3113354 282359 4565 67345 5971 3 1 28 23 59 NaN ... 1109 6 12 9 11 3593 632 3 465 1
3113355 282359 6440 966364 846 1 2 28 23 59 NaN ... 2301 8 22 1 23 3379 411 3 465 1
3113356 282359 985 2031728 1465 2 3 28 23 59 69.0 ... 1709 9 17 9 17 86 503 3 465 1
3113357 282359 863 2087582 1923 1 1 28 23 59 59.0 ... 2903 2 21 3 29 2238 637 3 465 1
3113358 282359 4565 1442773 2891 1 3 28 23 59 NaN ... 605 5 6 5 6 3593 632 3 465 1
3113359 282359 4432 317835 4455 1 1 28 23 59 61.0 ... 902 6 14 2 9 1161 45 3 465 1
3113360 282359 1456 829587 3232 1 3 28 23 59 64.0 ... 1708 7 6 8 17 3379 411 3 465 1
3113361 282359 866 119946 3322 1 2 28 23 59 41.0 ... 2410 10 24 10 24 619 89 29 286 2

3113362 rows × 26 columns


In [35]:
test.shape


Out[35]:
(338489, 26)

In [36]:
#保险起见,爱你,就储存吧
test.to_csv('./pre/test17-28.csv',index=None)

In [49]:
test = pd.read_csv('./pre/test17-28.csv')
test


Out[49]:
clickTime creativeID userID positionID connectionType telecomsOperator clickTime_day clickTime_hour clickTime_minute user_appID_count ... residence hometown_city hometown_province residence_city residence_province adID camgaignID advertiserID appID appPlatform
0 310000 3745 1164848 3451 1 3 31 0 0 34.0 ... 605 5 6 5 6 1166 430 80 14 2
1 310000 2284 2127247 1613 1 3 31 0 0 NaN ... 2301 5 22 1 23 1388 325 80 14 2
2 310000 1456 2769125 5510 2 1 31 0 0 74.0 ... 1502 3 10 2 15 3379 411 3 465 1
3 310000 4565 9762 4113 2 3 31 0 0 NaN ... 2407 5 24 7 24 3593 632 3 465 1
4 310000 49 2513636 3615 1 3 31 0 0 NaN ... 307 7 3 7 3 1469 535 80 14 2
5 310000 3824 488035 3821 1 1 31 0 0 NaN ... 209 5 24 9 2 2792 252 79 391 1
6 310000 863 225561 4188 1 3 31 0 0 NaN ... 111 5 6 11 1 2238 637 3 465 1
7 310000 4469 1144620 4113 1 1 31 0 0 90.0 ... 1408 1 5 8 14 2853 138 79 391 1
8 310000 3745 1113275 3347 1 2 31 0 0 66.0 ... 504 10 5 4 5 1166 430 80 14 2
9 310000 4565 1361729 2426 2 2 31 0 0 92.0 ... 1201 1 12 1 12 3593 632 3 465 1
10 310000 4250 1749379 4292 1 3 31 0 0 NaN ... 602 14 4 2 6 1638 440 3 465 1
11 310000 1456 840178 6086 1 1 31 0 0 46.0 ... 504 5 14 4 5 3379 411 3 465 1
12 310000 51 1945411 977 4 2 31 0 0 82.0 ... 2506 3 13 6 25 1638 440 3 465 1
13 310000 4250 1789615 5875 1 2 31 0 0 62.0 ... 1002 2 10 2 10 1638 440 3 465 1
14 310000 5588 340733 2150 1 1 31 0 0 89.0 ... 2301 9 17 1 23 1502 23 15 383 1
15 310000 6001 1884781 1385 1 3 31 0 0 NaN ... 1303 3 13 3 13 1655 139 10 434 1
16 310000 4432 596931 6667 1 3 31 0 0 NaN ... 1402 2 14 2 14 1161 45 3 465 1
17 310000 4967 931181 6539 1 1 31 0 0 NaN ... 511 11 5 11 5 1166 430 80 14 2
18 310000 1704 2657412 2695 1 0 31 0 0 60.0 ... 702 2 2 2 7 571 402 79 391 1
19 310000 4565 1251533 7135 1 1 31 0 0 87.0 ... 1406 2 21 6 14 3593 632 3 465 1
20 310000 3690 1199429 6932 1 1 31 0 0 73.0 ... 1705 5 17 5 17 998 402 79 391 1
21 310000 5527 2455848 3228 1 3 31 0 0 NaN ... 305 1 26 5 3 972 138 79 391 1
22 310000 4565 1956857 5971 1 1 31 0 0 74.0 ... 707 8 2 7 7 3593 632 3 465 1
23 310000 1831 1191158 2461 1 3 31 0 0 44.0 ... 313 8 16 13 3 352 252 79 391 1
24 310000 6190 2272417 1465 1 1 31 0 0 40.0 ... 307 8 5 7 3 266 138 79 391 1
25 310000 2189 390752 3789 1 1 31 0 0 55.0 ... 1811 11 18 11 18 377 138 79 391 1
26 310000 594 1399698 6161 1 2 31 0 0 NaN ... 1102 14 1 2 11 2837 534 80 14 2
27 310000 4250 2297337 6193 2 1 31 0 0 53.0 ... 206 15 2 6 2 1638 440 3 465 1
28 310000 1092 1423913 5510 1 3 31 0 0 NaN ... 109 9 1 9 1 831 562 3 465 1
29 310000 4250 1206944 2426 2 1 31 0 0 NaN ... 202 5 2 2 2 1638 440 3 465 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
338459 312359 2389 1060910 1625 1 2 31 23 59 47.0 ... 1608 5 1 8 16 2899 252 79 391 1
338460 312359 4565 569989 4292 1 1 31 23 59 33.0 ... 1803 3 18 3 18 3593 632 3 465 1
338461 312359 863 478252 5030 1 3 31 23 59 NaN ... 609 4 6 9 6 2238 637 3 465 1
338462 312359 3871 2493985 2044 1 1 31 23 59 NaN ... 219 19 2 19 2 2807 402 79 391 1
338463 312359 2474 1712009 4818 1 2 31 23 59 82.0 ... 1310 10 13 10 13 1655 139 10 434 1
338464 312359 4257 1422501 2150 1 1 31 23 59 69.0 ... 1003 1 19 3 10 1608 69 79 391 1
338465 312359 3690 1238716 522 1 3 31 23 59 NaN ... 101 1 1 1 1 998 402 79 391 1
338466 312359 5187 2323692 5510 1 1 31 23 59 84.0 ... 408 2 16 8 4 411 564 3 465 1
338467 312359 3192 1020405 3322 1 2 31 23 59 83.0 ... 901 1 9 1 9 1124 237 7 336 2
338468 312359 4432 364120 4455 1 3 31 23 59 97.0 ... 1601 4 25 1 16 1161 45 3 465 1
338469 312359 863 666507 5013 1 1 31 23 59 63.0 ... 2301 1 23 1 23 2238 637 3 465 1
338470 312359 5497 1163363 2823 1 3 31 23 59 NaN ... 1901 6 4 1 19 3090 217 44 421 1
338471 312359 4785 2160374 3322 1 2 31 23 59 NaN ... 401 9 4 1 4 2871 613 20 328 2
338472 312359 1465 2452121 7619 1 2 31 23 59 7.0 ... 1404 4 14 4 14 3141 277 74 100 2
338473 312359 985 1257057 4867 1 1 31 23 59 5.0 ... 1012 1 14 12 10 86 503 3 465 1
338474 312359 1456 106972 4657 1 1 31 23 59 76.0 ... 1501 2 18 1 15 3379 411 3 465 1
338475 312359 750 1893932 3322 1 2 31 23 59 NaN ... 1103 3 11 3 11 750 141 87 116 2
338476 312359 5258 1670094 3347 2 3 31 23 59 NaN ... 901 4 9 1 9 2195 369 67 137 2
338477 312359 3869 399524 3789 1 1 31 23 59 41.0 ... 909 7 14 9 9 696 183 3 465 1
338478 312359 4077 2611278 2579 1 1 31 23 59 48.0 ... 305 1 21 5 3 2698 201 89 420 1
338479 312359 853 53208 2579 1 1 31 23 59 73.0 ... 1104 8 9 4 11 993 26 15 383 1
338480 312359 1092 585337 103 1 1 31 23 59 46.0 ... 2301 5 15 1 23 831 562 3 465 1
338481 312359 5943 610001 2868 1 1 31 23 59 38.0 ... 801 3 20 1 8 1638 440 3 465 1
338482 312359 2553 772834 4250 1 1 31 23 59 74.0 ... 810 10 8 10 8 2519 60 3 465 1
338483 312359 6496 1043859 4867 1 3 31 23 59 74.0 ... 102 4 2 2 1 933 504 3 465 1
338484 312359 1456 365374 4455 2 1 31 23 59 NaN ... 1011 11 10 11 10 3379 411 3 465 1
338485 312359 2962 2231643 4818 1 2 31 23 59 NaN ... 2204 1 22 4 22 146 138 79 391 1
338486 312359 4785 1673380 3322 1 3 31 23 59 53.0 ... 503 8 1 3 5 2871 613 20 328 2
338487 312359 4432 1303085 4501 0 1 31 23 59 NaN ... 204 4 2 4 2 1161 45 3 465 1
338488 312359 2982 490286 7149 2 1 31 23 59 NaN ... 1009 2 10 9 10 3379 411 3 465 1

338489 rows × 26 columns


In [6]:
#去掉除label,convertiontime的第二次数据集
train_label = train['label']
test_label = test['label']


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-6-9319c166a1ef> in <module>()
      1 #去掉除label,convertiontime的第二次数据集
----> 2 train_label = train['label']
      3 test_label = test['label']

C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
   2064         # get column
   2065         if self.columns.is_unique:
-> 2066             return self._get_item_cache(key)
   2067 
   2068         # duplicate columns & possible reduce dimensionality

C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
   1384         res = cache.get(item)
   1385         if res is None:
-> 1386             values = self._data.get(item)
   1387             res = self._box_item_values(item, values)
   1388             cache[item] = res

C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
   3541 
   3542             if not isnull(item):
-> 3543                 loc = self.items.get_loc(item)
   3544             else:
   3545                 indexer = np.arange(len(self.items))[isnull(self.items)]

C:\ProgramData\Anaconda2\lib\site-packages\pandas\indexes\base.py in get_loc(self, key, method, tolerance)
   2134                 return self._engine.get_loc(key)
   2135             except KeyError:
-> 2136                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2137 
   2138         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas\index.pyx in pandas.index.IndexEngine.get_loc (pandas\index.c:4433)()

pandas\index.pyx in pandas.index.IndexEngine.get_loc (pandas\index.c:4279)()

pandas\src\hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:13742)()

pandas\src\hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:13696)()

KeyError: 'label'

In [47]:
train.dtypes


Out[47]:
clickTime             int64
creativeID            int64
userID                int64
positionID            int64
connectionType        int64
telecomsOperator      int64
clickTime_day         int64
clickTime_hour        int64
clickTime_minute      int64
age                   int64
gender                int64
education             int64
marriageStatus        int64
haveBaby              int64
hometown              int64
residence             int64
hometown_city         int64
hometown_province     int32
residence_city        int64
residence_province    int32
adID                  int64
camgaignID            int64
advertiserID          int64
appID                 int64
appPlatform           int64
dtype: object

In [91]:
train = train.fillna(0)
test = test.fillna(0)
train.dtypes


Out[91]:
clickTime               int64
creativeID              int64
userID                  int64
positionID              int64
connectionType          int64
telecomsOperator        int64
clickTime_day           int64
clickTime_hour          int64
clickTime_minute        int64
sitesetID               int64
positionType            int64
user_appID_count      float64
age                     int64
gender                  int64
education               int64
marriageStatus          int64
haveBaby                int64
hometown                int64
residence               int64
hometown_city           int64
hometown_province       int32
residence_city          int64
residence_province      int32
adID                    int64
camgaignID              int64
advertiserID            int64
appID                   int64
appPlatform             int64
dtype: object

In [52]:
train_user_appID_count =  train[['user_appID_count']]
test_user_appID_count =  test[['user_appID_count']]
del train['user_appID_count'],test['user_appID_count']

In [93]:
oneEnc = OneHotEncoder()
data_one = pd.concat([train,test])
data_one = oneEnc.fit_transform(data_one)
train_one = data_one[:train.shape[0]]
test_one = data_one[train.shape[0]:]

In [94]:
print train_one.shape
print user_installedapps_tfv[:train.shape[0]].shape
print train_user_appID_count.shape
print train.shape


(861372, 987038)
(861372, 111975)
(861372, 1)
(861372, 27)

In [95]:
train_user_appID_count.values


Out[95]:
array([[ 64.],
       [  0.],
       [ 67.],
       ..., 
       [ 64.],
       [  0.],
       [ 92.]])

In [100]:
train = hstack([train_one,user_installedapps_tfv[:train.shape[0]]])
test = hstack([test_one,user_installedapps_tfv[train.shape[0]:]])




In [13]:
# #输出训练集和测试集
# with open('train.pkl','w') as f:
#     pickle.dump(train,f)
# with open('test.pkl','w') as f:
#     pickle.dump(test,f)
#读取训练集和测试集
with open('train.pkl','rb') as f:
    train = cPickle.load(f)
# with open('test.pkl','rb') as f:
#     test = cPickle.load(f)

In [15]:
print(test.predict(X[0:1]))


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-15-385b41b6d7b5> in <module>()
----> 1 print(test.predict(X[0:1]))

C:\ProgramData\Anaconda2\lib\site-packages\scipy\sparse\base.py in __getattr__(self, attr)
    557             return self.getnnz()
    558         else:
--> 559             raise AttributeError(attr + " not found")
    560 
    561     def transpose(self, axes=None, copy=False):

AttributeError: predict not found

In [97]:
# from sklearn.linear_model import LogisticRegression
# print 'LogisticRegression Logistic回归'
# lr = LogisticRegression(n_jobs=-1,random_state=2017)
# lr.fit(train,train_label)
# pred = lr.predict_proba(train)[:,1]
# print '训练得分',logloss(train_label,pred)
# # pred = lr.predict_proba(test)[:,1]
# # print '验证得分',logloss(test_label,pred)


LogisticRegression Logistic回归
训练得分 0.0797782026628

In [53]:
#模型参数设置
xlf = xgb.XGBRegressor(max_depth=5, 
                        learning_rate=0.01, 
                        n_estimators=2000, 
                        silent=True, 
                        objective='reg:linear', 
                        n_jobs=-1, 
                        gamma=0.1,
                        min_child_weight=1.1, 
                        max_delta_step=5, 
                        subsample=0.7, 
                        colsample_bytree=0.7, 
                        colsample_bylevel=0.7, 
                        reg_alpha=0, 
                        reg_lambda=10, 
                        scale_pos_weight=1, 
                        random_state=0, 
                        missing=None)

xlf.fit(train, train_label, eval_metric='rmse', eval_set = [(test,test_label )],verbose = True, early_stopping_rounds=200)


# pred = lr.predict_proba(test)[:,1]
# print '验证得分',logloss(test_label,pred)


[0]	validation_0-rmse:1.49526
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:1.49061
[2]	validation_0-rmse:1.48599
[3]	validation_0-rmse:1.48146
[4]	validation_0-rmse:1.47694
[5]	validation_0-rmse:1.4724
[6]	validation_0-rmse:1.46797
[7]	validation_0-rmse:1.46354
[8]	validation_0-rmse:1.45916
[9]	validation_0-rmse:1.45483
[10]	validation_0-rmse:1.45062
[11]	validation_0-rmse:1.44642
[12]	validation_0-rmse:1.44227
[13]	validation_0-rmse:1.43838
[14]	validation_0-rmse:1.4343
[15]	validation_0-rmse:1.43024
[16]	validation_0-rmse:1.4262
[17]	validation_0-rmse:1.42223
[18]	validation_0-rmse:1.41832
[19]	validation_0-rmse:1.41444
[20]	validation_0-rmse:1.41057
[21]	validation_0-rmse:1.40688
[22]	validation_0-rmse:1.40307
[23]	validation_0-rmse:1.39935
[24]	validation_0-rmse:1.39565
[25]	validation_0-rmse:1.39196
[26]	validation_0-rmse:1.38836
[27]	validation_0-rmse:1.3848
[28]	validation_0-rmse:1.38124
[29]	validation_0-rmse:1.37792
[30]	validation_0-rmse:1.37443
[31]	validation_0-rmse:1.37095
[32]	validation_0-rmse:1.36769
[33]	validation_0-rmse:1.36431
[34]	validation_0-rmse:1.36094
[35]	validation_0-rmse:1.35764
[36]	validation_0-rmse:1.35443
[37]	validation_0-rmse:1.35118
[38]	validation_0-rmse:1.34795
[39]	validation_0-rmse:1.34473
[40]	validation_0-rmse:1.34156
[41]	validation_0-rmse:1.33843
[42]	validation_0-rmse:1.33533
[43]	validation_0-rmse:1.33228
[44]	validation_0-rmse:1.32922
[45]	validation_0-rmse:1.3262
[46]	validation_0-rmse:1.32324
[47]	validation_0-rmse:1.32032
[48]	validation_0-rmse:1.31745
[49]	validation_0-rmse:1.31454
[50]	validation_0-rmse:1.31172
[51]	validation_0-rmse:1.30888
[52]	validation_0-rmse:1.30609
[53]	validation_0-rmse:1.30333
[54]	validation_0-rmse:1.30058
[55]	validation_0-rmse:1.29788
[56]	validation_0-rmse:1.2953
[57]	validation_0-rmse:1.29262
[58]	validation_0-rmse:1.28999
[59]	validation_0-rmse:1.28741
[60]	validation_0-rmse:1.28489
[61]	validation_0-rmse:1.28239
[62]	validation_0-rmse:1.27985
[63]	validation_0-rmse:1.27734
[64]	validation_0-rmse:1.27487
[65]	validation_0-rmse:1.27239
[66]	validation_0-rmse:1.26994
[67]	validation_0-rmse:1.26752
[68]	validation_0-rmse:1.26513
[69]	validation_0-rmse:1.26276
[70]	validation_0-rmse:1.26042
[71]	validation_0-rmse:1.25815
[72]	validation_0-rmse:1.25588
[73]	validation_0-rmse:1.25368
[74]	validation_0-rmse:1.25148
[75]	validation_0-rmse:1.24925
[76]	validation_0-rmse:1.24707
[77]	validation_0-rmse:1.24494
[78]	validation_0-rmse:1.24277
[79]	validation_0-rmse:1.24063
[80]	validation_0-rmse:1.23857
[81]	validation_0-rmse:1.23646
[82]	validation_0-rmse:1.23439
[83]	validation_0-rmse:1.23233
[84]	validation_0-rmse:1.23031
[85]	validation_0-rmse:1.2283
[86]	validation_0-rmse:1.22632
[87]	validation_0-rmse:1.22433
[88]	validation_0-rmse:1.22239
[89]	validation_0-rmse:1.22045
[90]	validation_0-rmse:1.21859
[91]	validation_0-rmse:1.21672
[92]	validation_0-rmse:1.21484
[93]	validation_0-rmse:1.21308
[94]	validation_0-rmse:1.21126
[95]	validation_0-rmse:1.20942
[96]	validation_0-rmse:1.20764
[97]	validation_0-rmse:1.20586
[98]	validation_0-rmse:1.20411
[99]	validation_0-rmse:1.20234
[100]	validation_0-rmse:1.20066
[101]	validation_0-rmse:1.19894
[102]	validation_0-rmse:1.19723
[103]	validation_0-rmse:1.19573
[104]	validation_0-rmse:1.19411
[105]	validation_0-rmse:1.19251
[106]	validation_0-rmse:1.19092
[107]	validation_0-rmse:1.18937
[108]	validation_0-rmse:1.18779
[109]	validation_0-rmse:1.18626
[110]	validation_0-rmse:1.18478
[111]	validation_0-rmse:1.18326
[112]	validation_0-rmse:1.18175
[113]	validation_0-rmse:1.18027
[114]	validation_0-rmse:1.17886
[115]	validation_0-rmse:1.17739
[116]	validation_0-rmse:1.17593
[117]	validation_0-rmse:1.17448
[118]	validation_0-rmse:1.17307
[119]	validation_0-rmse:1.17182
[120]	validation_0-rmse:1.1704
[121]	validation_0-rmse:1.16904
[122]	validation_0-rmse:1.16774
[123]	validation_0-rmse:1.16639
[124]	validation_0-rmse:1.16503
[125]	validation_0-rmse:1.16374
[126]	validation_0-rmse:1.16241
[127]	validation_0-rmse:1.16109
[128]	validation_0-rmse:1.15983
[129]	validation_0-rmse:1.15854
[130]	validation_0-rmse:1.15732
[131]	validation_0-rmse:1.15609
[132]	validation_0-rmse:1.15495
[133]	validation_0-rmse:1.15375
[134]	validation_0-rmse:1.15254
[135]	validation_0-rmse:1.15132
[136]	validation_0-rmse:1.15011
[137]	validation_0-rmse:1.14892
[138]	validation_0-rmse:1.14775
[139]	validation_0-rmse:1.1466
[140]	validation_0-rmse:1.14544
[141]	validation_0-rmse:1.14433
[142]	validation_0-rmse:1.14325
[143]	validation_0-rmse:1.14217
[144]	validation_0-rmse:1.14105
[145]	validation_0-rmse:1.13996
[146]	validation_0-rmse:1.13903
[147]	validation_0-rmse:1.13795
[148]	validation_0-rmse:1.13688
[149]	validation_0-rmse:1.13582
[150]	validation_0-rmse:1.13478
[151]	validation_0-rmse:1.13386
[152]	validation_0-rmse:1.13284
[153]	validation_0-rmse:1.13184
[154]	validation_0-rmse:1.13084
[155]	validation_0-rmse:1.12996
[156]	validation_0-rmse:1.12901
[157]	validation_0-rmse:1.12807
[158]	validation_0-rmse:1.12709
[159]	validation_0-rmse:1.12622
[160]	validation_0-rmse:1.12529
[161]	validation_0-rmse:1.12436
[162]	validation_0-rmse:1.12343
[163]	validation_0-rmse:1.12257
[164]	validation_0-rmse:1.12171
[165]	validation_0-rmse:1.12084
[166]	validation_0-rmse:1.11995
[167]	validation_0-rmse:1.11907
[168]	validation_0-rmse:1.11837
[169]	validation_0-rmse:1.1175
[170]	validation_0-rmse:1.11665
[171]	validation_0-rmse:1.11586
[172]	validation_0-rmse:1.11504
[173]	validation_0-rmse:1.11424
[174]	validation_0-rmse:1.11341
[175]	validation_0-rmse:1.1126
[176]	validation_0-rmse:1.11181
[177]	validation_0-rmse:1.11102
[178]	validation_0-rmse:1.11025
[179]	validation_0-rmse:1.10948
[180]	validation_0-rmse:1.10871
[181]	validation_0-rmse:1.10796
[182]	validation_0-rmse:1.10722
[183]	validation_0-rmse:1.10651
[184]	validation_0-rmse:1.10578
[185]	validation_0-rmse:1.10518
[186]	validation_0-rmse:1.10454
[187]	validation_0-rmse:1.10394
[188]	validation_0-rmse:1.10323
[189]	validation_0-rmse:1.10253
[190]	validation_0-rmse:1.10181
[191]	validation_0-rmse:1.10112
[192]	validation_0-rmse:1.1007
[193]	validation_0-rmse:1.10006
[194]	validation_0-rmse:1.09939
[195]	validation_0-rmse:1.09876
[196]	validation_0-rmse:1.09809
[197]	validation_0-rmse:1.09772
[198]	validation_0-rmse:1.0972
[199]	validation_0-rmse:1.09661
[200]	validation_0-rmse:1.09598
[201]	validation_0-rmse:1.09536
[202]	validation_0-rmse:1.09474
[203]	validation_0-rmse:1.09413
[204]	validation_0-rmse:1.09361
[205]	validation_0-rmse:1.093
[206]	validation_0-rmse:1.09253
[207]	validation_0-rmse:1.09204
[208]	validation_0-rmse:1.09146
[209]	validation_0-rmse:1.09088
[210]	validation_0-rmse:1.09031
[211]	validation_0-rmse:1.08974
[212]	validation_0-rmse:1.08922
[213]	validation_0-rmse:1.08869
[214]	validation_0-rmse:1.08815
[215]	validation_0-rmse:1.08761
[216]	validation_0-rmse:1.08705
[217]	validation_0-rmse:1.08654
[218]	validation_0-rmse:1.08601
[219]	validation_0-rmse:1.08549
[220]	validation_0-rmse:1.08498
[221]	validation_0-rmse:1.08446
[222]	validation_0-rmse:1.08394
[223]	validation_0-rmse:1.08346
[224]	validation_0-rmse:1.08296
[225]	validation_0-rmse:1.08247
[226]	validation_0-rmse:1.08199
[227]	validation_0-rmse:1.08152
[228]	validation_0-rmse:1.08105
[229]	validation_0-rmse:1.08063
[230]	validation_0-rmse:1.08019
[231]	validation_0-rmse:1.07979
[232]	validation_0-rmse:1.07938
[233]	validation_0-rmse:1.07892
[234]	validation_0-rmse:1.07848
[235]	validation_0-rmse:1.07816
[236]	validation_0-rmse:1.07775
[237]	validation_0-rmse:1.07733
[238]	validation_0-rmse:1.07689
[239]	validation_0-rmse:1.07648
[240]	validation_0-rmse:1.07605
[241]	validation_0-rmse:1.07582
[242]	validation_0-rmse:1.07542
[243]	validation_0-rmse:1.075
[244]	validation_0-rmse:1.07464
[245]	validation_0-rmse:1.07425
[246]	validation_0-rmse:1.0739
[247]	validation_0-rmse:1.07368
[248]	validation_0-rmse:1.07336
[249]	validation_0-rmse:1.07298
[250]	validation_0-rmse:1.07259
[251]	validation_0-rmse:1.07222
[252]	validation_0-rmse:1.07187
[253]	validation_0-rmse:1.07154
[254]	validation_0-rmse:1.07118
[255]	validation_0-rmse:1.07082
[256]	validation_0-rmse:1.07046
[257]	validation_0-rmse:1.07011
[258]	validation_0-rmse:1.06978
[259]	validation_0-rmse:1.06943
[260]	validation_0-rmse:1.06908
[261]	validation_0-rmse:1.06883
[262]	validation_0-rmse:1.06851
[263]	validation_0-rmse:1.06831
[264]	validation_0-rmse:1.06798
[265]	validation_0-rmse:1.06774
[266]	validation_0-rmse:1.06742
[267]	validation_0-rmse:1.06709
[268]	validation_0-rmse:1.06678
[269]	validation_0-rmse:1.06647
[270]	validation_0-rmse:1.06615
[271]	validation_0-rmse:1.06585
[272]	validation_0-rmse:1.06559
[273]	validation_0-rmse:1.06528
[274]	validation_0-rmse:1.06498
[275]	validation_0-rmse:1.06468
[276]	validation_0-rmse:1.06439
[277]	validation_0-rmse:1.06409
[278]	validation_0-rmse:1.06381
[279]	validation_0-rmse:1.06353
[280]	validation_0-rmse:1.06324
[281]	validation_0-rmse:1.06299
[282]	validation_0-rmse:1.0627
[283]	validation_0-rmse:1.06288
[284]	validation_0-rmse:1.06266
[285]	validation_0-rmse:1.06243
[286]	validation_0-rmse:1.06216
[287]	validation_0-rmse:1.0619
[288]	validation_0-rmse:1.06166
[289]	validation_0-rmse:1.06141
[290]	validation_0-rmse:1.06116
[291]	validation_0-rmse:1.06094
[292]	validation_0-rmse:1.0607
[293]	validation_0-rmse:1.06046
[294]	validation_0-rmse:1.06023
[295]	validation_0-rmse:1.05998
[296]	validation_0-rmse:1.05974
[297]	validation_0-rmse:1.05952
[298]	validation_0-rmse:1.05928
[299]	validation_0-rmse:1.05905
[300]	validation_0-rmse:1.05886
[301]	validation_0-rmse:1.05863
[302]	validation_0-rmse:1.0584
[303]	validation_0-rmse:1.05825
[304]	validation_0-rmse:1.05807
[305]	validation_0-rmse:1.05785
[306]	validation_0-rmse:1.05766
[307]	validation_0-rmse:1.05751
[308]	validation_0-rmse:1.05738
[309]	validation_0-rmse:1.05724
[310]	validation_0-rmse:1.05703
[311]	validation_0-rmse:1.05685
[312]	validation_0-rmse:1.05664
[313]	validation_0-rmse:1.05647
[314]	validation_0-rmse:1.05637
[315]	validation_0-rmse:1.05616
[316]	validation_0-rmse:1.05609
[317]	validation_0-rmse:1.05598
[318]	validation_0-rmse:1.05579
[319]	validation_0-rmse:1.05561
[320]	validation_0-rmse:1.05545
[321]	validation_0-rmse:1.05526
[322]	validation_0-rmse:1.05507
[323]	validation_0-rmse:1.05492
[324]	validation_0-rmse:1.05475
[325]	validation_0-rmse:1.05459
[326]	validation_0-rmse:1.05443
[327]	validation_0-rmse:1.05426
[328]	validation_0-rmse:1.05412
[329]	validation_0-rmse:1.05397
[330]	validation_0-rmse:1.05383
[331]	validation_0-rmse:1.05364
[332]	validation_0-rmse:1.05364
[333]	validation_0-rmse:1.05347
[334]	validation_0-rmse:1.05334
[335]	validation_0-rmse:1.05318
[336]	validation_0-rmse:1.05315
[337]	validation_0-rmse:1.05303
[338]	validation_0-rmse:1.05288
[339]	validation_0-rmse:1.05272
[340]	validation_0-rmse:1.05257
[341]	validation_0-rmse:1.05246
[342]	validation_0-rmse:1.05234
[343]	validation_0-rmse:1.05219
[344]	validation_0-rmse:1.05203
[345]	validation_0-rmse:1.05189
[346]	validation_0-rmse:1.05174
[347]	validation_0-rmse:1.0516
[348]	validation_0-rmse:1.0515
[349]	validation_0-rmse:1.05145
[350]	validation_0-rmse:1.05136
[351]	validation_0-rmse:1.05124
[352]	validation_0-rmse:1.05111
[353]	validation_0-rmse:1.051
[354]	validation_0-rmse:1.05088
[355]	validation_0-rmse:1.05074
[356]	validation_0-rmse:1.0506
[357]	validation_0-rmse:1.05048
[358]	validation_0-rmse:1.05034
[359]	validation_0-rmse:1.05022
[360]	validation_0-rmse:1.05012
[361]	validation_0-rmse:1.04999
[362]	validation_0-rmse:1.04989
[363]	validation_0-rmse:1.04975
[364]	validation_0-rmse:1.04963
[365]	validation_0-rmse:1.04951
[366]	validation_0-rmse:1.04942
[367]	validation_0-rmse:1.0493
[368]	validation_0-rmse:1.04928
[369]	validation_0-rmse:1.04917
[370]	validation_0-rmse:1.04911
[371]	validation_0-rmse:1.04905
[372]	validation_0-rmse:1.04894
[373]	validation_0-rmse:1.04883
[374]	validation_0-rmse:1.04875
[375]	validation_0-rmse:1.04866
[376]	validation_0-rmse:1.04855
[377]	validation_0-rmse:1.04845
[378]	validation_0-rmse:1.04835
[379]	validation_0-rmse:1.04824
[380]	validation_0-rmse:1.04822
[381]	validation_0-rmse:1.04836
[382]	validation_0-rmse:1.04825
[383]	validation_0-rmse:1.04815
[384]	validation_0-rmse:1.04804
[385]	validation_0-rmse:1.04794
[386]	validation_0-rmse:1.04784
[387]	validation_0-rmse:1.04776
[388]	validation_0-rmse:1.04767
[389]	validation_0-rmse:1.04761
[390]	validation_0-rmse:1.04751
[391]	validation_0-rmse:1.04737
[392]	validation_0-rmse:1.04728
[393]	validation_0-rmse:1.04722
[394]	validation_0-rmse:1.04715
[395]	validation_0-rmse:1.04705
[396]	validation_0-rmse:1.04702
[397]	validation_0-rmse:1.04696
[398]	validation_0-rmse:1.04688
[399]	validation_0-rmse:1.04676
[400]	validation_0-rmse:1.04674
[401]	validation_0-rmse:1.04664
[402]	validation_0-rmse:1.04656
[403]	validation_0-rmse:1.04657
[404]	validation_0-rmse:1.04649
[405]	validation_0-rmse:1.04646
[406]	validation_0-rmse:1.04639
[407]	validation_0-rmse:1.04631
[408]	validation_0-rmse:1.04627
[409]	validation_0-rmse:1.04622
[410]	validation_0-rmse:1.04614
[411]	validation_0-rmse:1.04608
[412]	validation_0-rmse:1.04599
[413]	validation_0-rmse:1.04592
[414]	validation_0-rmse:1.04581
[415]	validation_0-rmse:1.04573
[416]	validation_0-rmse:1.04568
[417]	validation_0-rmse:1.04561
[418]	validation_0-rmse:1.04557
[419]	validation_0-rmse:1.0455
[420]	validation_0-rmse:1.04543
[421]	validation_0-rmse:1.04536
[422]	validation_0-rmse:1.04531
[423]	validation_0-rmse:1.04526
[424]	validation_0-rmse:1.04519
[425]	validation_0-rmse:1.04515
[426]	validation_0-rmse:1.04511
[427]	validation_0-rmse:1.04505
[428]	validation_0-rmse:1.045
[429]	validation_0-rmse:1.04498
[430]	validation_0-rmse:1.04492
[431]	validation_0-rmse:1.04486
[432]	validation_0-rmse:1.0448
[433]	validation_0-rmse:1.04474
[434]	validation_0-rmse:1.04468
[435]	validation_0-rmse:1.04462
[436]	validation_0-rmse:1.04459
[437]	validation_0-rmse:1.04461
[438]	validation_0-rmse:1.04462
[439]	validation_0-rmse:1.04456
[440]	validation_0-rmse:1.0445
[441]	validation_0-rmse:1.04445
[442]	validation_0-rmse:1.0444
[443]	validation_0-rmse:1.04431
[444]	validation_0-rmse:1.04422
[445]	validation_0-rmse:1.04418
[446]	validation_0-rmse:1.04421
[447]	validation_0-rmse:1.04415
[448]	validation_0-rmse:1.04411
[449]	validation_0-rmse:1.04406
[450]	validation_0-rmse:1.044
[451]	validation_0-rmse:1.04393
[452]	validation_0-rmse:1.04389
[453]	validation_0-rmse:1.04385
[454]	validation_0-rmse:1.0438
[455]	validation_0-rmse:1.04379
[456]	validation_0-rmse:1.04374
[457]	validation_0-rmse:1.04369
[458]	validation_0-rmse:1.04374
[459]	validation_0-rmse:1.04369
[460]	validation_0-rmse:1.04367
[461]	validation_0-rmse:1.04362
[462]	validation_0-rmse:1.04358
[463]	validation_0-rmse:1.04364
[464]	validation_0-rmse:1.0436
[465]	validation_0-rmse:1.0436
[466]	validation_0-rmse:1.04373
[467]	validation_0-rmse:1.0437
[468]	validation_0-rmse:1.0437
[469]	validation_0-rmse:1.0437
[470]	validation_0-rmse:1.04367
[471]	validation_0-rmse:1.04363
[472]	validation_0-rmse:1.04361
[473]	validation_0-rmse:1.04355
[474]	validation_0-rmse:1.04356
[475]	validation_0-rmse:1.04353
[476]	validation_0-rmse:1.04346
[477]	validation_0-rmse:1.04342
[478]	validation_0-rmse:1.04341
[479]	validation_0-rmse:1.0434
[480]	validation_0-rmse:1.04336
[481]	validation_0-rmse:1.04332
[482]	validation_0-rmse:1.04329
[483]	validation_0-rmse:1.04325
[484]	validation_0-rmse:1.04324
[485]	validation_0-rmse:1.04323
[486]	validation_0-rmse:1.0432
[487]	validation_0-rmse:1.04313
[488]	validation_0-rmse:1.04309
[489]	validation_0-rmse:1.04303
[490]	validation_0-rmse:1.043
[491]	validation_0-rmse:1.04298
[492]	validation_0-rmse:1.04296
[493]	validation_0-rmse:1.04293
[494]	validation_0-rmse:1.0429
[495]	validation_0-rmse:1.04289
[496]	validation_0-rmse:1.04286
[497]	validation_0-rmse:1.04282
[498]	validation_0-rmse:1.04279
[499]	validation_0-rmse:1.04277
Out[53]:
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=-1, nthread=-1, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=0.85)

In [54]:
pred = xlf.predict(train)
print '训练得分',logloss(train_label,pred)


训练得分 0.100489348173

In [55]:
results = xlf.predict(test)
test['prob'] = results

In [101]:
# results = lr.predict_proba(test)[:,1]
# print '验证得分',logloss(test_label,pred)


验证得分
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-101-a15274632c27> in <module>()
      1 results = lr.predict_proba(test)[:,1]
----> 2 print '验证得分',logloss(test_label,pred)

<ipython-input-2-8353cd4f7e12> in logloss(act, pred)
      5   pred = sp.maximum(epsilon, pred)
      6   pred = sp.minimum(1-epsilon, pred)
----> 7   ll = -sp.mean(act*sp.log(pred) + sp.subtract(1,act)*sp.log(1-pred))
      8   return ll

C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\ops.py in wrapper(left, right, name, na_op)
    713                 lvalues = lvalues.values
    714 
--> 715         result = wrap_results(safe_na_op(lvalues, rvalues))
    716         return construct_result(
    717             left,

C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\ops.py in safe_na_op(lvalues, rvalues)
    674         try:
    675             with np.errstate(all='ignore'):
--> 676                 return na_op(lvalues, rvalues)
    677         except Exception:
    678             if isinstance(rvalues, ABCSeries):

C:\ProgramData\Anaconda2\lib\site-packages\pandas\core\ops.py in na_op(x, y)
    650         try:
    651             result = expressions.evaluate(op, str_rep, x, y,
--> 652                                           raise_on_error=True, **eval_kwargs)
    653         except TypeError:
    654             if isinstance(y, (np.ndarray, ABCSeries, pd.Index)):

C:\ProgramData\Anaconda2\lib\site-packages\pandas\computation\expressions.py in evaluate(op, op_str, a, b, raise_on_error, use_numexpr, **eval_kwargs)
    208     if use_numexpr:
    209         return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error,
--> 210                          **eval_kwargs)
    211     return _evaluate_standard(op, op_str, a, b, raise_on_error=raise_on_error)
    212 

C:\ProgramData\Anaconda2\lib\site-packages\pandas\computation\expressions.py in _evaluate_numexpr(op, op_str, a, b, raise_on_error, truediv, reversed, **eval_kwargs)
    119 
    120     if result is None:
--> 121         result = _evaluate_standard(op, op_str, a, b, raise_on_error)
    122 
    123     return result

C:\ProgramData\Anaconda2\lib\site-packages\pandas\computation\expressions.py in _evaluate_standard(op, op_str, a, b, raise_on_error, **eval_kwargs)
     61         _store_test_result(False)
     62     with np.errstate(all='ignore'):
---> 63         return op(a, b)
     64 
     65 

ValueError: operands could not be broadcast together with shapes (338489,) (861372,) 

In [56]:
#输出结果
# a = pd.DataFrame({'instanceID':pd.read_csv('./pre/test.csv')['instanceID'],'prob':pred})
#输出
test1 = pd.read_csv('./pre/test.csv')
test1['prob'] = results
test1= test1[['instanceID','prob']]
test1.to_csv('./pre/submission.csv',index=None)
submission =  pd.read_csv('./pre/submission.csv')

In [ ]:
def get_duplicated_feature():
    #重复数据是本题的一个特点
    #部分label为1的特征是重复的而且位置为第一个
    
    train = pd.read_csv('../input/train.csv')
    train.drop('conversionTime',axis=1,inplace=True)
    train.drop('label',axis=1,inplace=True)
    test = pd.read_csv('../input/test.csv')
    test.drop('instanceID',axis=1,inplace=True)
    test.drop('label',axis=1,inplace=True)
    
    is_duplicated = train.duplicated(keep=False).astype('int')
    is_duplicated_first = train.duplicated(keep='first').astype('int')
    is_duplicated_last = train.duplicated(keep='last').astype('int')
    train['is_duplicated'] = is_duplicated
    train['is_duplicated_first'] = is_duplicated_first
    train['is_duplicated_last'] = is_duplicated_last
    
    is_duplicated = test.duplicated(keep=False).astype('int')
    is_duplicated_first = test.duplicated(keep='first').astype('int')
    is_duplicated_last = test.duplicated(keep='last').astype('int')
    test['is_duplicated'] = is_duplicated
    test['is_duplicated_first'] = is_duplicated_first
    test['is_duplicated_last'] = is_duplicated_last
    
    return train[['is_duplicated','is_duplicated_first','is_duplicated_last']],test[['is_duplicated','is_duplicated_first','is_duplicated_last']]

In [57]:
submission


Out[57]:
instanceID prob
0 1 0.006102
1 2 0.008251
2 3 0.030742
3 4 0.034045
4 5 0.021748
5 6 0.038723
6 7 0.039754
7 8 0.074027
8 9 0.008836
9 10 0.012768
10 11 0.033577
11 12 0.040656
12 13 0.024345
13 14 0.028347
14 15 0.047148
15 16 0.050785
16 17 0.047488
17 18 0.029121
18 19 0.062852
19 20 0.044021
20 21 0.058608
21 22 0.061507
22 23 0.032527
23 24 0.058068
24 25 0.052819
25 26 0.056776
26 27 0.034613
27 28 0.018351
28 29 0.036128
29 30 0.008615
... ... ...
338459 338460 0.025382
338460 338461 0.025632
338461 338462 0.089135
338462 338463 0.031939
338463 338464 0.030574
338464 338465 0.062271
338465 338466 0.041421
338466 338467 0.024870
338467 338468 0.019778
338468 338469 0.026059
338469 338470 0.059455
338470 338471 0.049819
338471 338472 0.017630
338472 338473 0.030372
338473 338474 0.027363
338474 338475 0.068157
338475 338476 0.034019
338476 338477 0.011059
338477 338478 0.033410
338478 338479 0.206111
338479 338480 0.092348
338480 338481 0.046143
338481 338482 0.048854
338482 338483 0.031805
338483 338484 0.050104
338484 338485 0.011184
338485 338486 0.067838
338486 338487 0.024302
338487 338488 0.012091
338488 338489 0.003336

338489 rows × 2 columns