In [1]:
import sklearn
import mglearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Load the survey data; drop the two identifier columns and the unused SUICATT column
opioids = pd.read_csv('project-data.csv')
opioids.drop(opioids.columns[[0, 1]], axis=1, inplace=True)
del opioids['SUICATT']
opioids.shape
Out[2]:
In [3]:
print(opioids.keys())
In [4]:
opioids['HEROINEVR'].value_counts()
Out[4]:
In [5]:
features = ['AGECAT', 'SEX', 'MARRIED', 'EDUCAT', 'EMPLOY18',
            'CTYMETRO', 'HEALTH', 'MENTHLTH', 'PRLMISEVR', 'PRLMISAB',
            'PRLANY', 'TRQLZRS', 'SEDATVS', 'COCAINE', 'AMPHETMN',
            'TRTMENT', 'MHTRTMT']
# Predictors and target, mirroring scikit-learn's data/target convention
opioids_data = opioids[features]
opioids_target = opioids['HEROINEVR']
In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    opioids_data, opioids_target, stratify=opioids_target, random_state=42)
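Since stratify=opioids_target was passed, both splits should preserve the class balance; a quick check (a minimal sketch, not part of the original notebook):
# Compare class proportions across the full data and both splits
for name, part in [('full', opioids_target), ('train', y_train), ('test', y_test)]:
    print(name, part.value_counts(normalize=True).round(3).to_dict())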
In [7]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression().fit(X_train, y_train)
In [8]:
print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))
In [9]:
logreg100 = LogisticRegression(C=100).fit(X_train, y_train)
print("Training set score: {:.3f}".format(logreg100.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg100.score(X_test, y_test)))
In [10]:
logreg001 = LogisticRegression(C=0.01).fit(X_train, y_train)
print("Training set score: {:.3f}".format(logreg001.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg001.score(X_test, y_test)))
LogisticRegression applies L2 (Ridge) regularization by default; the parameter C controls its strength, and smaller values of C mean stronger regularization.
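For clarity, those defaults can be spelled out explicitly (a sketch; the default solver varies across scikit-learn versions):
# Equivalent to LogisticRegression(): L2 penalty, C=1.0
# Smaller C = stronger regularization; larger C = weaker
logreg_default = LogisticRegression(penalty='l2', C=1.0).fit(X_train, y_train)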
In [11]:
for C, marker in zip([0.01, 1, 100], ['v', 'o', '^']):
    # The liblinear solver supports the L1 penalty
    lr_l1 = LogisticRegression(C=C, penalty="l1", solver="liblinear").fit(X_train, y_train)
    print("Training accuracy of L1 logreg with C={:.3f}: {:.3f}".format(
        C, lr_l1.score(X_train, y_train)))
    print("Test accuracy of L1 logreg with C={:.3f}: {:.3f}".format(
        C, lr_l1.score(X_test, y_test)))
    plt.plot(lr_l1.coef_.T, marker, label="C={:.3f}".format(C))
plt.xticks(range(opioids_data.shape[1]), features, rotation=90)
plt.hlines(0, 0, opioids_data.shape[1])
plt.xlabel("Feature")
plt.ylabel("Coefficient magnitude")
plt.ylim(-2, 2)
plt.legend()
Out[11]:
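One way to quantify the sparsity visible in the plot is to count the surviving coefficients (a sketch, again assuming the liblinear solver for L1 support):
# L1 drives some coefficients exactly to zero
lr_l1 = LogisticRegression(C=1, penalty='l1', solver='liblinear').fit(X_train, y_train)
print("Nonzero coefficients:", np.sum(lr_l1.coef_ != 0))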
Next we fit a DecisionTreeClassifier, fixing random_state, which is used in the tree for breaking ties internally.
In [12]:
from IPython.display import Image, display
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)
Out[12]:
In [13]:
print("Accuracy on the training: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on the test set: {:.3f}".format(tree.score(X_test, y_test)))
In [14]:
tree = DecisionTreeClassifier(max_depth=4, random_state=0)
tree.fit(X_train, y_train)
print("Accuracy on the training: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on the test set: {:.3f}".format(tree.score(X_test, y_test)))
In [15]:
from sklearn.tree import export_graphviz
# class_names must be given in sorted order of the target values;
# here HEROINEVR is assumed to be coded 0 = "No", 1 = "Yes"
export_graphviz(tree, out_file="tree.dot", class_names=["No", "Yes"],
                feature_names=features, impurity=False, filled=True)
In [16]:
import graphviz
with open('tree.dot') as f:
dot_graph = f.read()
display(graphviz.Source(dot_graph))
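If graphviz is unavailable, scikit-learn 0.21+ can draw the tree directly with matplotlib; a minimal alternative sketch (same class-coding assumption as above):
from sklearn.tree import plot_tree
plt.figure(figsize=(20, 10))
plot_tree(tree, feature_names=features, class_names=['No', 'Yes'],
          impurity=False, filled=True)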
In [17]:
print("Feature importances:\n{}".format(tree.feature_importances_))
In [18]:
def plot_feature_importances_heroin(model):
    n_features = opioids_data.shape[1]
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), features)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
plot_feature_importances_heroin(tree)
A random forest builds n_estimators trees, each on a bootstrap sample of n_samples, and the max_features parameter controls how many features each split may consider. With max_features set to n_features, each split can look at all features in the dataset, and there is no randomness in the feature selection. A high max_features means the trees in a random forest will be quite similar; a low max_features means they will be quite different. Here we set n_estimators to 100 trees and build the model on the training set (a short sketch varying max_features follows the results below).
In [19]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest.fit(X_train, y_train)
Out[19]:
In [20]:
print("Accuracy on training set: {:.3f}".format(forest.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(forest.score(X_test, y_test)))
In [21]:
plot_feature_importances_heroin(forest)
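To see the max_features effect described earlier, one could vary it directly (a sketch, not part of the original analysis):
# max_features=None means max_features = n_features: no randomness in feature selection
for mf in [1, 'sqrt', None]:
    rf = RandomForestClassifier(n_estimators=100, max_features=mf,
                                random_state=0).fit(X_train, y_train)
    print(mf, round(rf.score(X_test, y_test), 3))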
With 100 trees, a maximum depth of 3, and a learning rate of 0.01:
In [22]:
from sklearn.ensemble import GradientBoostingClassifier
gbrt = GradientBoostingClassifier(random_state=0, n_estimators=100, max_depth=3, learning_rate=0.01)
gbrt.fit(X_train, y_train)
Out[22]:
In [23]:
# Stronger pre-pruning: limit each tree to decision stumps (depth 1)
gbrt = GradientBoostingClassifier(random_state=0, max_depth=1)
gbrt.fit(X_train, y_train)
plot_feature_importances_heroin(gbrt)
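The learning rate and the number of trees trade off against each other; a closing sketch comparing a few rates at the default n_estimators=100:
# Lower learning_rate usually needs more trees to reach the same training fit
for lr in [1.0, 0.1, 0.01]:
    g = GradientBoostingClassifier(random_state=0, learning_rate=lr).fit(X_train, y_train)
    print(lr, round(g.score(X_train, y_train), 3), round(g.score(X_test, y_test), 3))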