In [1]:
import gbdt
from sklearn import metrics
import pandas as pd

In [2]:
def ComputeAUC(forest, data, targets):
    """Return the area under the ROC curve for ``forest`` scored on ``data``.

    Args:
        forest: object exposing ``predict(data)``; its return value is used
            as the ranking score for each example.
        data: passed through unchanged to ``forest.predict``.
        targets: ground-truth labels, where the value ``1`` marks the
            positive class (``pos_label=1``).

    Returns:
        The value of ``metrics.auc`` computed over the ROC curve points.
    """
    scores = forest.predict(data)
    # roc_curve also returns the thresholds, which are not needed here.
    false_pos_rate, true_pos_rate, _thresholds = metrics.roc_curve(
        targets, scores, pos_label=1)
    return metrics.auc(false_pos_rate, true_pos_rate)

In [3]:
# Read the tab-separated training file into a DataFrame for inspection.
df = pd.read_csv('train.tsv', sep='\t')

In [4]:
# Show the loaded frame as this cell's output; the saved output is abbreviated
# with a "..." row and ends with the frame's row/column counts.
df


Out[4]:
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country target
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
5 37 Private 284582 Masters 14 Married-civ-spouse Exec-managerial Wife White Female 0 0 40 United-States <=50K
6 49 Private 160187 9th 5 Married-spouse-absent Other-service Not-in-family Black Female 0 0 16 Jamaica <=50K
7 52 Self-emp-not-inc 209642 HS-grad 9 Married-civ-spouse Exec-managerial Husband White Male 0 0 45 United-States >50K
8 31 Private 45781 Masters 14 Never-married Prof-specialty Not-in-family White Female 14084 0 50 United-States >50K
9 42 Private 159449 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 5178 0 40 United-States >50K
10 37 Private 280464 Some-college 10 Married-civ-spouse Exec-managerial Husband Black Male 0 0 80 United-States >50K
11 30 State-gov 141297 Bachelors 13 Married-civ-spouse Prof-specialty Husband Asian-Pac-Islander Male 0 0 40 India >50K
12 23 Private 122272 Bachelors 13 Never-married Adm-clerical Own-child White Female 0 0 30 United-States <=50K
13 32 Private 205019 Assoc-acdm 12 Never-married Sales Not-in-family Black Male 0 0 50 United-States <=50K
14 40 Private 121772 Assoc-voc 11 Married-civ-spouse Craft-repair Husband Asian-Pac-Islander Male 0 0 40 ? >50K
15 34 Private 245487 7th-8th 4 Married-civ-spouse Transport-moving Husband Amer-Indian-Eskimo Male 0 0 45 Mexico <=50K
16 25 Self-emp-not-inc 176756 HS-grad 9 Never-married Farming-fishing Own-child White Male 0 0 35 United-States <=50K
17 32 Private 186824 HS-grad 9 Never-married Machine-op-inspct Unmarried White Male 0 0 40 United-States <=50K
18 38 Private 28887 11th 7 Married-civ-spouse Sales Husband White Male 0 0 50 United-States <=50K
19 43 Self-emp-not-inc 292175 Masters 14 Divorced Exec-managerial Unmarried White Female 0 0 45 United-States >50K
20 40 Private 193524 Doctorate 16 Married-civ-spouse Prof-specialty Husband White Male 0 0 60 United-States >50K
21 54 Private 302146 HS-grad 9 Separated Other-service Unmarried Black Female 0 0 20 United-States <=50K
22 35 Federal-gov 76845 9th 5 Married-civ-spouse Farming-fishing Husband Black Male 0 0 40 United-States <=50K
23 43 Private 117037 11th 7 Married-civ-spouse Transport-moving Husband White Male 0 2042 40 United-States <=50K
24 59 Private 109015 HS-grad 9 Divorced Tech-support Unmarried White Female 0 0 40 United-States <=50K
25 56 Local-gov 216851 Bachelors 13 Married-civ-spouse Tech-support Husband White Male 0 0 40 United-States >50K
26 19 Private 168294 HS-grad 9 Never-married Craft-repair Own-child White Male 0 0 40 United-States <=50K
27 54 ? 180211 Some-college 10 Married-civ-spouse ? Husband Asian-Pac-Islander Male 0 0 60 South >50K
28 39 Private 367260 HS-grad 9 Divorced Exec-managerial Not-in-family White Male 0 0 80 United-States <=50K
29 49 Private 193366 HS-grad 9 Married-civ-spouse Craft-repair Husband White Male 0 0 40 United-States <=50K
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
32531 30 ? 33811 Bachelors 13 Never-married ? Not-in-family Asian-Pac-Islander Female 0 0 99 United-States <=50K
32532 34 Private 204461 Doctorate 16 Married-civ-spouse Prof-specialty Husband White Male 0 0 60 United-States >50K
32533 54 Private 337992 Bachelors 13 Married-civ-spouse Exec-managerial Husband Asian-Pac-Islander Male 0 0 50 Japan >50K
32534 37 Private 179137 Some-college 10 Divorced Adm-clerical Unmarried White Female 0 0 39 United-States <=50K
32535 22 Private 325033 12th 8 Never-married Protective-serv Own-child Black Male 0 0 35 United-States <=50K
32536 34 Private 160216 Bachelors 13 Never-married Exec-managerial Not-in-family White Female 0 0 55 United-States >50K
32537 30 Private 345898 HS-grad 9 Never-married Craft-repair Not-in-family Black Male 0 0 46 United-States <=50K
32538 38 Private 139180 Bachelors 13 Divorced Prof-specialty Unmarried Black Female 15020 0 45 United-States >50K
32539 71 ? 287372 Doctorate 16 Married-civ-spouse ? Husband White Male 0 0 10 United-States >50K
32540 45 State-gov 252208 HS-grad 9 Separated Adm-clerical Own-child White Female 0 0 40 United-States <=50K
32541 41 ? 202822 HS-grad 9 Separated ? Not-in-family Black Female 0 0 32 United-States <=50K
32542 72 ? 129912 HS-grad 9 Married-civ-spouse ? Husband White Male 0 0 25 United-States <=50K
32543 45 Local-gov 119199 Assoc-acdm 12 Divorced Prof-specialty Unmarried White Female 0 0 48 United-States <=50K
32544 31 Private 199655 Masters 14 Divorced Other-service Not-in-family Other Female 0 0 30 United-States <=50K
32545 39 Local-gov 111499 Assoc-acdm 12 Married-civ-spouse Adm-clerical Wife White Female 0 0 20 United-States >50K
32546 37 Private 198216 Assoc-acdm 12 Divorced Tech-support Not-in-family White Female 0 0 40 United-States <=50K
32547 43 Private 260761 HS-grad 9 Married-civ-spouse Machine-op-inspct Husband White Male 0 0 40 Mexico <=50K
32548 65 Self-emp-not-inc 99359 Prof-school 15 Never-married Prof-specialty Not-in-family White Male 1086 0 60 United-States <=50K
32549 43 State-gov 255835 Some-college 10 Divorced Adm-clerical Other-relative White Female 0 0 40 United-States <=50K
32550 43 Self-emp-not-inc 27242 Some-college 10 Married-civ-spouse Craft-repair Husband White Male 0 0 50 United-States <=50K
32551 32 Private 34066 10th 6 Married-civ-spouse Handlers-cleaners Husband Amer-Indian-Eskimo Male 0 0 40 United-States <=50K
32552 43 Private 84661 Assoc-voc 11 Married-civ-spouse Sales Husband White Male 0 0 45 United-States <=50K
32553 32 Private 116138 Masters 14 Never-married Tech-support Not-in-family Asian-Pac-Islander Male 0 0 11 Taiwan <=50K
32554 53 Private 321865 Masters 14 Married-civ-spouse Exec-managerial Husband White Male 0 0 40 United-States >50K
32555 22 Private 310152 Some-college 10 Never-married Protective-serv Not-in-family White Male 0 0 40 United-States <=50K
32556 27 Private 257302 Assoc-acdm 12 Married-civ-spouse Tech-support Wife White Female 0 0 38 United-States <=50K
32557 40 Private 154374 HS-grad 9 Married-civ-spouse Machine-op-inspct Husband White Male 0 0 40 United-States >50K
32558 58 Private 151910 HS-grad 9 Widowed Adm-clerical Unmarried White Female 0 0 40 United-States <=50K
32559 22 Private 201490 HS-grad 9 Never-married Adm-clerical Own-child White Male 0 0 20 United-States <=50K
32560 52 Self-emp-inc 287927 HS-grad 9 Married-civ-spouse Exec-managerial Wife White Female 15024 0 40 United-States >50K

32561 rows × 15 columns


In [5]:
# Training options; this dict is later passed as the `config` argument of gbdt.train.
config = {
    'loss_func': 'logloss',
    'num_trees': 100,
    'num_leaves': 12,
    'example_sampling_rate': 0.5,
    'feature_sampling_rate': 0.8,
    'pair_sampling_rate': 20,
    'min_hessian': 50,
    'shrinkage': 0.05,
}

In [6]:
# Numeric columns; later passed to the loader as `bucketized_float_cols`.
float_features = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

In [7]:
# Categorical columns; later passed to the loader (with the label) as `string_cols`.
cat_features = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

In [8]:
# Name of the label column; it is loaded as a string column and compared
# against '>50K' when the +1/-1 targets are built.
target_column = 'target'

In [9]:
# Load the training split: numeric columns go in as `bucketized_float_cols`,
# categorical columns plus the label column as `string_cols`.
training_data = gbdt.DataLoader.from_tsvs(
    tsvs=['train.tsv'],
    bucketized_float_cols=float_features,
    string_cols=cat_features + [target_column],
)
# Encode the label as +1 for '>50K' and -1 for every other value.
training_targets = [
    1 if label == '>50K' else -1
    for label in training_data[target_column]
]

In [10]:
# Load the held-out split with the same column layout as the training split.
testing_data = gbdt.DataLoader.from_tsvs(
    tsvs=['test.tsv'],
    bucketized_float_cols=float_features,
    string_cols=cat_features + [target_column],
)
# Encode the label as +1 for '>50K' and -1 for every other value.
testing_targets = [
    1 if label == '>50K' else -1
    for label in testing_data[target_column]
]

In [11]:
# Fit the forest on the training split using every numeric and categorical
# feature and the options collected in `config`.
forest = gbdt.train(
    training_data,
    y=training_targets,
    features=float_features + cat_features,
    config=config,
)

In [12]:
# Report ROC AUC on the split the forest was fit on, then on the held-out split.
print("Training AUC =",
      ComputeAUC(forest, training_data, training_targets))
print("Testing AUC =",
      ComputeAUC(forest, testing_data, testing_targets))


Training AUC = 0.9228652193948579
Testing AUC = 0.9161781774849602

In [13]:
# Show the forest's per-feature importance. In the saved output this is a list
# of (feature name, score) pairs in descending score order, topping out at 1.0.
forest.feature_importance()


Out[13]:
[('relationship', 1.0),
 ('capital-gain', 0.6120898863234928),
 ('marital-status', 0.5235976651686585),
 ('occupation', 0.4139503497840851),
 ('age', 0.3096430700074806),
 ('education-num', 0.30917146559227743),
 ('education', 0.16735061310389038),
 ('hours-per-week', 0.1422546436452378),
 ('capital-loss', 0.11825127361653291),
 ('workclass', 0.04094403680448024),
 ('native-country', 0.034161381549460884),
 ('fnlwgt', 0.022186954904719502),
 ('sex', 0.017457985853170644),
 ('race', 0.0029956572158538514)]

In [14]:
# Wrap the trained forest in gbdt's ForestVisualizer; it is used afterwards to
# render a single tree via visualize_tree.
visualizer = gbdt.ForestVisualizer(forest)

In [15]:
# Render the tree selected by index 1 (node ids in the saved output start with "tree1").
# NOTE(review): whether tree indices are 0- or 1-based is not visible here; confirm.
visualizer.visualize_tree(1)


Out[15]:
%3 tree1 relationship in {Husband, Wife} 0.000153 tree1l education-num *<= 12.500 0.057462 tree1->tree1l tree1r capital-gain *<= 3299.000 -0.047594 tree1->tree1r tree1ll capital-gain *<= 4225.000 0.024595 tree1l->tree1ll tree1lr capital-gain *<= 5095.500 0.132855 tree1l->tree1lr tree1lll occupation in {Adm-clerical, Sales, Protective-serv, Prof-specialty, Exec-managerial, Tech-support} 0.014936 tree1ll->tree1lll tree1llr 0.172148 tree1ll->tree1llr tree1llll age *<= 31.500 0.050658 tree1lll->tree1llll tree1lllr education in {Assoc-acdm, HS-grad, Some-college, Assoc-voc} -0.007438 tree1lll->tree1lllr tree1lllll 0.001270 tree1llll->tree1lllll tree1llllr 0.061317 tree1llll->tree1llllr tree1lllrl 0.002693 tree1lllr->tree1lllrl tree1lllrr -0.041657 tree1lllr->tree1lllrr tree1lrl occupation in {Tech-support, Protective-serv, Prof-specialty, Exec-managerial} 0.119632 tree1lr->tree1lrl tree1lrr 0.206828 tree1lr->tree1lrr tree1lrll 0.137001 tree1lrl->tree1lrll tree1lrlr 0.079043 tree1lrl->tree1lrlr tree1rl education-num *<= 12.500 -0.052166 tree1r->tree1rl tree1rr 0.094668 tree1r->tree1rr tree1rll -0.058430 tree1rl->tree1rll tree1rlr hours-per-week *<= 49.000 -0.026180 tree1rl->tree1rlr tree1rlrl -0.037538 tree1rlr->tree1rlrl tree1rlrr 0.013117 tree1rlr->tree1rlrr

In [18]:
# Partial-dependency plot for 'hours-per-week' over the integer grid 10..49
# (the upper bound of range(10, 50) is exclusive).
gbdt.plot_partial_dependency(
    forest,
    training_data,
    'hours-per-week',
    list(range(10, 50)),
)



In [ ]:


In [ ]: