In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import math 
from xgboost.sklearn import XGBClassifier
from sklearn.cross_validation import cross_val_score
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot


/home/analyst/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

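The deprecation warning above refers to scikit-learn 0.18+, where the same helpers live in sklearn.model_selection; a drop-in sketch of the newer imports, assuming such a version is installed:

In [ ]:
# Newer scikit-learn versions (0.18+) expose the same helpers from
# model_selection instead of the deprecated cross_validation module.
from sklearn.model_selection import cross_val_score, train_test_split
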
In [2]:
train = pd.read_csv("xtrain.csv")
target = pd.read_csv("ytrain.csv")
test = pd.read_csv("xtest.csv")

In [3]:
train.head()


Out[3]:
1 2 3 4 5 6 7 8 9 10 ... 49 50 51 52 53 54 55 56 57 58
0 -42.822536 NaN 12.0 NaN 1.0 2.0 24.0 -45.025510 NaN 1.0 ... 17.871210 4.0 15.0 21.863365 -13.349394 -2.215086 -5.137377 20.904186 53.939262 -17.328346
1 -13.478816 13.0 12.0 75.132502 0.0 2.0 24.0 -49.213545 7.0 0.0 ... 21.511019 2.0 13.0 -2.880103 21.739125 5.464161 -30.347612 23.304507 47.746225 -47.305489
2 51.702721 13.0 12.0 63.459270 0.0 3.0 24.0 -58.777043 8.0 0.0 ... NaN 3.0 16.0 21.851623 NaN 7.471764 -12.348314 34.406243 34.479515 -33.326172
3 NaN 12.0 13.0 -15.492561 1.0 1.0 23.0 0.624258 9.0 0.0 ... 16.964848 2.0 15.0 -25.128119 -26.858262 54.203501 10.341217 29.080753 40.235855 -35.835956
4 7.633273 NaN 13.0 59.862681 0.0 3.0 NaN -61.395319 NaN 0.0 ... 8.467076 4.0 14.0 30.610006 -15.663721 38.323843 -65.561821 33.965883 50.245794 -44.018245

5 rows × 58 columns


In [4]:
train.describe()


Out[4]:
1 2 3 4 5 6 7 8 9 10 ... 49 50 51 52 53 54 55 56 57 58
count 820529.000000 808921.000000 831550.000000 828051.000000 819375.000000 796632.000000 791582.000000 796405.000000 840044.000000 794046.000000 ... 803958.000000 836761.000000 798606.000000 843544.000000 814017.000000 812818.000000 800856.000000 812926.000000 835366.000000 786115.000000
mean 11.012513 12.329729 11.344055 25.406132 0.313465 2.180648 23.222268 -46.304667 7.901558 0.269151 ... 20.596154 2.982181 14.680950 24.042950 -19.295371 16.461962 -25.783931 26.158484 38.320750 -37.013222
std 99.986889 0.796650 1.662548 20.636810 0.536960 0.771303 0.867651 19.875120 1.286734 0.443519 ... 6.409154 1.097207 2.301128 29.146244 47.647342 45.131263 23.565330 8.508795 5.995944 15.246010
min -480.088690 7.000000 3.000000 -72.310070 0.000000 0.000000 17.000000 -138.181159 1.000000 0.000000 ... -11.358178 0.000000 4.000000 -115.421307 -246.217734 -197.419313 -136.146509 -14.899675 9.648201 -112.352665
25% -56.357813 12.000000 10.000000 11.492726 0.000000 2.000000 23.000000 -59.736287 7.000000 0.000000 ... 16.278806 2.000000 13.000000 4.395491 -51.462848 -13.974363 -41.718505 20.427519 34.264541 -47.312376
50% 11.029669 13.000000 11.000000 25.421574 0.000000 2.000000 23.000000 -46.305721 8.000000 0.000000 ... 20.605346 3.000000 15.000000 24.050412 -19.318562 16.460499 -25.779677 26.149082 38.316862 -36.990170
75% 78.379658 13.000000 13.000000 39.346755 1.000000 3.000000 24.000000 -32.910540 9.000000 1.000000 ... 24.915043 4.000000 16.000000 43.716637 12.907192 46.941529 -9.868830 31.893980 42.376050 -26.696559
max 545.896248 13.000000 15.000000 128.900592 4.000000 3.000000 24.000000 55.334255 10.000000 1.000000 ... 51.729840 5.000000 23.000000 164.653264 209.925962 226.741140 81.935811 73.609990 69.557056 41.368872

8 rows × 58 columns


In [5]:
target.head()


Out[5]:
x
0 1
1 0
2 0
3 1
4 0

In [6]:
for column in train:
    print column, ": ", len(train[column].unique())


1 :  820530
2 :  8
3 :  14
4 :  828052
5 :  6
6 :  5
7 :  9
8 :  796406
9 :  11
10 :  3
11 :  792090
12 :  822005
13 :  801825
14 :  822892
15 :  14
16 :  822113
17 :  15
18 :  14
19 :  857587
20 :  829600
21 :  20
22 :  14
23 :  5
24 :  17
25 :  809401
26 :  19
27 :  23
28 :  812096
29 :  14
30 :  7
31 :  7
32 :  11
33 :  815736
34 :  815944
35 :  800967
36 :  15
37 :  831797
38 :  816784
39 :  812254
40 :  786254
41 :  832849
42 :  844695
43 :  820060
44 :  5
45 :  10
46 :  820502
47 :  3
48 :  15
49 :  803959
50 :  7
51 :  21
52 :  843545
53 :  814018
54 :  812819
55 :  800857
56 :  812927
57 :  835366
58 :  786116

In [3]:
cat_features = []
real_features = []

for column in train:
    if len(train[column].unique()) > 21:
        real_features.append(column)
    else:
        cat_features.append(column)
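An equivalent split can be written with nunique(); note that unique() counts NaN as an extra value while nunique() does not, so the threshold shifts by one for columns containing NaNs. A sketch, not used below:

In [ ]:
# Same cat/real split via per-column nunique(); unique() above counts NaN
# as a value, nunique() does not, hence the threshold of 20 instead of 21.
n_unique = train.apply(lambda s: s.nunique())
cat_features_alt = list(n_unique[n_unique <= 20].index)
real_features_alt = list(n_unique[n_unique > 20].index)
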

In [8]:
# plot histograms of the first 50k rows for the categorical features
train[cat_features].head(50000).plot.hist(bins = 100, figsize=(20, 20))
test[cat_features].head(50000).plot.hist(bins = 100, figsize=(20, 20))



Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe0125ff210>

In [9]:
# plot histograms of the first 50k rows for the remaining (real-valued) features
train[real_features].head(50000).plot.hist(bins = 100, figsize=(20, 20))
test[real_features].head(50000).plot.hist(bins = 100, figsize=(20, 20))

# the histograms for the training and test sets match


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe015bbf910>

In [17]:
import seaborn
seaborn.heatmap(train[real_features].corr(), square=True)
# the real-valued features are not correlated with one another


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe021256990>

In [11]:
# every column in the data contains NaN values
train.isnull().sum()


Out[11]:
1      79471
2      91079
3      68450
4      71949
5      80625
6     103368
7     108418
8     103595
9      59956
10    105954
11    107911
12     77918
13     98176
14     77109
15    114703
16     77888
17     95691
18     92278
19     42414
20     70401
21     96029
22     86128
23    131388
24     76619
25     90600
26     76446
27    107409
28     87905
29    113008
30     78357
31     72053
32     79065
33     84265
34     84057
35     99034
36     84897
37     68204
38     83217
39     87747
40    113747
41     67152
42     55306
43     79941
44    105538
45     93753
46     79499
47     86376
48     92006
49     96042
50     63239
51    101394
52     56456
53     85983
54     87182
55     99144
56     87074
57     64634
58    113885
dtype: int64

In [5]:
# for categorical features, replace NaN values with -1
# for real-valued features, replace NaN values with the column mean
train[cat_features] = train[cat_features].fillna(-1)

In [6]:
for column in train[real_features]:
    mean_val = train[column].mean()
    train[column] = train[column].fillna(mean_val)
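The test set is left with its NaNs here (XGBoost handles them via missing=np.nan), but the same imputation could be applied to it, reusing the training-set statistics. A sketch, assuming test has the same columns as train:

In [ ]:
# Optional: impute test the same way as train, reusing training-set means
# so both sets are filled consistently (not done in the original run).
test[cat_features] = test[cat_features].fillna(-1)
for column in real_features:
    test[column] = test[column].fillna(train[column].mean())
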

In [7]:
target.mean() # class 0 is more frequent than class 1


Out[7]:
x    0.306733
dtype: float64
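Since positives make up only ~31% of the target, one common way to account for the imbalance (not used in the models below) is XGBoost's scale_pos_weight parameter; a minimal sketch:

In [ ]:
# scale_pos_weight is usually set to (# negatives) / (# positives);
# it weights the positive examples by this factor inside XGBoost.
pos = (target.x == 1).sum()
neg = (target.x == 0).sum()
print 'scale_pos_weight ~ ', float(neg) / pos  # roughly 2.26 here
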

In [ ]:
import xgboost as xgb
from sklearn.cross_validation import train_test_split

X_fit, X_eval, y_fit, y_eval= train_test_split(
    train, target, test_size=0.20, random_state=1
)

clf = xgb.XGBClassifier(missing=np.nan, max_depth=3, 
                        n_estimators=550, learning_rate=0.05, gamma =0.3, min_child_weight = 3,
                        subsample=0.9, colsample_bytree=0.8, seed=2000,objective= 'binary:logistic')

clf.fit(X_fit, y_fit, early_stopping_rounds=40,  eval_metric="auc", eval_set=[(X_eval, y_eval)])

In [23]:
auc_train = roc_auc_score(y_fit.x, clf.predict(X_fit))
auc_val = roc_auc_score(y_eval.x, clf.predict(X_eval))

print 'auc_train: ', auc_train
print 'auc_val: ', auc_val

# train and validation AUC are nearly identical, so there is no sign of overfitting


auc_train:  0.553770758732
auc_val:  0.552502316614
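Note that roc_auc_score above is fed hard labels from predict(); AUC is a ranking metric, so it is normally computed on the predicted probability of the positive class, as in this sketch:

In [ ]:
# AUC on predicted probabilities rather than hard class labels.
print 'auc_train (proba): ', roc_auc_score(y_fit.x, clf.predict_proba(X_fit)[:, 1])
print 'auc_val (proba): ', roc_auc_score(y_eval.x, clf.predict_proba(X_eval)[:, 1])
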

In [40]:
eps = 1e-5
dropped_columns = set()
C = train.columns
# find (near-)constant features
for c in C:
    if train[c].var() < eps:
        print '.. %-30s: too low variance ... column ignored'%(c)
        dropped_columns.add(c)
# none found

In [41]:
for i, c1 in enumerate(C):
    f1 = train[c1].values
    for j, c2 in enumerate(C[i+1:]):
        f2 = train[c2].values
        if np.all(f1 == f2):
            dropped_columns.add(c2)
            print c2
# there are no duplicate columns either
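The same check can be written as a pandas one-liner, although for a frame of this size the explicit loop above may actually be cheaper; a sketch:

In [ ]:
# duplicated() on the transposed frame flags columns identical to an
# earlier one (NaNs must already be filled for this to match np.all(f1 == f2)).
print list(train.columns[train.T.duplicated()])  # expected: []
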

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
forest = ExtraTreesClassifier(n_estimators=150,
                              random_state=0)

forest.fit(train.head(100000), target.head(100000).x)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Use a tree ensemble to see which features are important
print("Feature ranking:")

for f in range(train.head(100000).shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the importances
plt.figure()
plt.title("Feature importances")
plt.bar(range(train.head(100000).shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(train.head(100000).shape[1]), indices)
plt.xlim([-1, train.head(100000).shape[1]])
plt.show()


Feature ranking:
1. feature 11 (0.018281)
2. feature 33 (0.018268)
3. feature 13 (0.018228)
4. feature 40 (0.018156)
5. feature 10 (0.018108)
6. feature 52 (0.018095)
7. feature 45 (0.018085)
8. feature 41 (0.018078)
9. feature 19 (0.018045)
10. feature 51 (0.018021)
11. feature 55 (0.018017)
12. feature 32 (0.018017)
13. feature 53 (0.018003)
14. feature 54 (0.017998)
15. feature 37 (0.017997)
16. feature 34 (0.017996)
17. feature 7 (0.017995)
18. feature 27 (0.017994)
19. feature 42 (0.017992)
20. feature 18 (0.017987)
21. feature 56 (0.017968)
22. feature 38 (0.017966)
23. feature 57 (0.017961)
24. feature 3 (0.017956)
25. feature 48 (0.017951)
26. feature 24 (0.017942)
27. feature 15 (0.017901)
28. feature 0 (0.017894)
29. feature 12 (0.017852)
30. feature 39 (0.017843)
31. feature 26 (0.017689)
32. feature 21 (0.017592)
33. feature 20 (0.017566)
34. feature 35 (0.017560)
35. feature 23 (0.017536)
36. feature 28 (0.017486)
37. feature 47 (0.017442)
38. feature 31 (0.017387)
39. feature 17 (0.017322)
40. feature 50 (0.017236)
41. feature 25 (0.017199)
42. feature 2 (0.017032)
43. feature 49 (0.016939)
44. feature 36 (0.016865)
45. feature 8 (0.016755)
46. feature 14 (0.016686)
47. feature 16 (0.016525)
48. feature 22 (0.016472)
49. feature 30 (0.016253)
50. feature 44 (0.016195)
51. feature 43 (0.016066)
52. feature 1 (0.015586)
53. feature 29 (0.015509)
54. feature 5 (0.015104)
55. feature 6 (0.015079)
56. feature 4 (0.014163)
57. feature 9 (0.013654)
58. feature 46 (0.012509)
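As a cross-check, the XGBoost classifier fitted earlier exposes its own importances through the scikit-learn wrapper; a sketch:

In [ ]:
# Importances from the fitted XGBoost classifier, for comparison with the
# ExtraTrees ranking above.
xgb_imp = pd.Series(clf.feature_importances_, index=train.columns)
print xgb_imp.sort_values(ascending=False).head(10)
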

In [ ]:
# No clear leaders or clear outsiders stand out among the features, and the features are anonymized,
# so train the model once more with computationally heavier hyperparameters (more estimators)
from sklearn.cross_validation import train_test_split
import xgboost as xgb

X_fit, X_eval, y_fit, y_eval= train_test_split(
    train, target, test_size=0.20, random_state=1
)

clf = xgb.XGBClassifier(missing=np.nan, max_depth=3, 
                        n_estimators=1200, learning_rate=0.05, gamma =0.3, min_child_weight = 3,
                        subsample=0.9, colsample_bytree=0.8, seed=2000,objective= 'binary:logistic')

clf.fit(X_fit, y_fit, early_stopping_rounds=40,  eval_metric="auc", eval_set=[(X_eval, y_eval)])

In [13]:
# build the submission file
test_target = clf.predict(test)
submission = pd.DataFrame(test_target)
submission.to_csv("test_target.csv", index=False)
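predict() writes hard 0/1 labels; if the evaluation metric is AUC, submitting the class-1 probability usually scores better. A sketch (the output column name 'x' and the file name are assumptions):

In [ ]:
# Probability-based submission instead of hard labels (hypothetical file name).
test_proba = clf.predict_proba(test)[:, 1]
pd.DataFrame({'x': test_proba}).to_csv("test_target_proba.csv", index=False)
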

In [ ]: