In [1]:
import gbdt
from sklearn import metrics
import pandas as pd
In [2]:
def ComputeAUC(forest, data, targets):
    """Score `data` with the trained forest and return the ROC AUC against `targets`."""
    predictions = forest.predict(data)
    fpr, tpr, _ = metrics.roc_curve(targets, predictions, pos_label=1)
    return metrics.auc(fpr, tpr)
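Equivalently, sklearn can compute the AUC in a single call. A small sketch (the helper name ComputeAUC_direct is not from the original notebook); it skips the explicit ROC curve:

def ComputeAUC_direct(forest, data, targets):
    # roc_auc_score treats the larger label (+1) as the positive class,
    # matching the pos_label=1 setting used above.
    return metrics.roc_auc_score(targets, forest.predict(data))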
In [3]:
df = pd.read_csv('train.tsv', delimiter='\t')
In [4]:
df
Out[4]: [DataFrame display not shown]
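Before training, the raw DataFrame can be inspected with plain pandas. A quick sketch (not part of the original run; 'target' is the label column defined below):

df.shape                      # number of rows and columns in the training set
df.dtypes                     # confirms which columns are numeric vs. string
df['target'].value_counts()   # class balance of the binary income label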
In [5]:
config = {'loss_func': 'logloss',
          'num_trees': 100,
          'num_leaves': 12,
          'example_sampling_rate': 0.5,
          'feature_sampling_rate': 0.8,
          'pair_sampling_rate': 20,
          'min_hessian': 50,
          'shrinkage': 0.05}
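These settings are one reasonable starting point. In gradient boosting, shrinkage (the learning rate) and num_trees generally trade off against each other, so a smaller shrinkage is usually paired with more trees. A hypothetical slower-learning variant, for illustration only:

# Hypothetical alternative: halve the learning rate and double the tree count.
slow_config = dict(config, shrinkage=0.025, num_trees=200)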
In [6]:
float_features = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
In [7]:
cat_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
In [8]:
target_column = 'target'
In [9]:
training_data = gbdt.DataLoader.from_tsvs(tsvs=['train.tsv'],
                                          bucketized_float_cols=float_features,
                                          string_cols=cat_features + [target_column])
training_targets = [1 if x == '>50K' else -1 for x in training_data[target_column]]
In [10]:
testing_data = gbdt.DataLoader.from_tsvs(tsvs=['test.tsv'],
                                         bucketized_float_cols=float_features,
                                         string_cols=cat_features + [target_column])
testing_targets = [1 if x == '>50K' else -1 for x in testing_data[target_column]]
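As a sanity check (not in the original run), the encoded targets can be tallied to confirm that both classes are present in the train and test splits:

from collections import Counter

# Counts of +1 (income '>50K') vs. -1 labels after the mapping above.
print('train label counts:', Counter(training_targets))
print('test label counts:', Counter(testing_targets))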
In [11]:
forest = gbdt.train(training_data,
                    y=training_targets,
                    features=float_features + cat_features,
                    config=config)
In [12]:
print ("Training AUC =", ComputeAUC(forest, training_data, training_targets))
print ("Testing AUC =", ComputeAUC(forest, testing_data, testing_targets))
In [13]:
forest.feature_importance()
Out[13]: [feature importance output not shown]
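If feature_importance() returns (feature, importance) pairs, they can be turned into a quick bar chart. A hedged sketch, treating that return format as an assumption:

import matplotlib.pyplot as plt

# Assumption: forest.feature_importance() yields (feature_name, importance) pairs.
names, scores = zip(*forest.feature_importance())
plt.barh(range(len(names)), scores)
plt.yticks(range(len(names)), names)
plt.xlabel('importance')
plt.tight_layout()
plt.show()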
In [14]:
visualizer = gbdt.ForestVisualizer(forest)
In [15]:
visualizer.visualize_tree(1)
Out[15]: [tree visualization not shown]
In [18]:
gbdt.plot_partial_dependency(forest, training_data, 'hours-per-week', list(range(10, 50)))
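The same call works for any of the bucketized float features; for example, scanning 'age' over a plausible range (the endpoints here are illustrative, not from the original notebook):

gbdt.plot_partial_dependency(forest, training_data, 'age', list(range(17, 91)))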