In [147]:
# We discussed simple linear regression, multiple
# linear regression, and polynomial regression. These models are special cases of the
# generalized linear model, a flexible framework that requires fewer assumptions
# than ordinary linear regression. In this chapter, we will discuss some of these
# assumptions as they relate to another special case of the generalized linear model
# called logistic regression.
# Topics Covered:
# 1. Logistic Regression
# 2. Calculating various cross_val scorings e.g. precision, recall, f1, auc etc
# 3. Plotting AUC curve
# 4. Hyperparameter tuning using Grid Search, Pipeline etc
# 5. Multiclass classification
# 6. Multi-label classification
In [148]:
%matplotlib inline
import pandas as pd
# NOTE(review): hard-coded absolute path -- consider a configurable DATA_DIR.
# header=None, so the two columns are integer-named: 0 = ham/spam label,
# 1 = raw SMS text.
df = pd.read_csv("/home/topo/myRepos/github/mml_sklearn/datasets/smsspam/SMSSpamCollection", delimiter = '\t', header = None)
In [149]:
# Peek at the first and last rows to sanity-check the load.
df.head()
Out[149]:
In [150]:
df.tail()
Out[150]:
In [151]:
# Class balance: how many messages are labelled 'spam' ...
print "spam count=", df[df[0]=='spam'][0].count()
In [152]:
# ... and how many are labelled 'ham'.
print "ham count=", df[df[0]=='ham'][0].count()
In [153]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# BUG FIX: import LogisticRegression from the public package path.  The
# original `sklearn.linear_model.logistic` is a private module (removed in
# later sklearn releases); this also matches the import used later in this
# notebook.
from sklearn.linear_model import LogisticRegression
# NOTE(review): sklearn.cross_validation was deprecated in 0.18 and removed
# in 0.20 in favour of sklearn.model_selection -- left as-is to match the
# rest of the notebook / installed sklearn version.
from sklearn.cross_validation import train_test_split, cross_val_score
In [154]:
# Random train/test split (default test fraction); note the returned Series
# keep their original DataFrame index.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(df[1], df[0])
In [155]:
df[0][:5]
Out[155]:
In [156]:
df[1][:5]
Out[156]:
In [157]:
# TfidfVectorizer combines CountVectorizer and TfidfTransformer.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
#X_test is to be just transformed from X_test_raw and not fit_transformed()
X_test = vectorizer.transform(X_test_raw)
print X_train.shape
print X_test.shape
print y_train.shape
print y_test.shape
In [158]:
# Fit a logistic-regression classifier on the TF-IDF features.
clf_logreg = LogisticRegression()
clf_logreg.fit(X_train, y_train)
Out[158]:
In [159]:
preds = clf_logreg.predict(X_test)
print preds[:5]
In [160]:
for i,prediction in enumerate(preds[:10]):
print 'Prediction:%s and Message:%s' % (prediction, X_test_raw[i])
In [161]:
# Sanity-check the shapes of the raw columns and the full frame.
print df[1].shape
print df[0].shape
In [162]:
print df.shape
In [163]:
df[:2]
Out[163]:
In [164]:
# Toy demo of enumerate(); unrelated to the classification task.
for i,j in enumerate(range(5,10)):
#print i, j
print "%d-num=%d" % (i, j*j)
In [165]:
# NOTE(review): label-based lookup -- index 1000 must exist in the shuffled
# training Series for this to work; .iloc would be positional.
print X_train_raw[1000]
print X_train_raw.shape
In [166]:
X_test_raw.shape
Out[166]:
In [167]:
# Measuring Accuracy
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
In [168]:
# Toy ground-truth / prediction vectors for demonstrating a confusion
# matrix: the first five samples are negatives, the last five positives.
y_test = [0] * 5 + [1] * 5
y_pred = [0, 1, 0, 0, 0, 0, 0, 1, 1, 1]
In [169]:
# Rows = true labels, columns = predicted labels (as plotted below).
conf_matrix = confusion_matrix(y_test, y_pred)
In [170]:
conf_matrix
Out[170]:
In [171]:
#print plot
plt.matshow(conf_matrix)
plt.title("Confusion Matrix")
plt.colorbar()
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.show()
In [172]:
from sklearn.metrics import accuracy_score
# 3 of 4 predictions match, so the expected accuracy is 0.75.
y_pred, y_true= [0,1,1,1], [1,1,1,1]
print 'Accuracy: ', accuracy_score(y_true, y_pred)
In [173]:
# Fresh imports / reload for the cross-validation section (duplicates the
# imports above so the section can run standalone).
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split, cross_val_score
In [174]:
df = pd.read_csv("/home/topo/myRepos/github/mml_sklearn/datasets/smsspam/SMSSpamCollection", delimiter = '\t', header = None)
In [175]:
X_train_raw, X_test_raw, y_train, y_test = train_test_split(df[1], df[0])
In [176]:
df.head()
Out[176]:
In [177]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)
In [178]:
clf = LogisticRegression()
clf.fit(X_train, y_train)
Out[178]:
In [179]:
# NOTE(review): cross_val_score clones and re-fits the estimator on folds
# of the *test* split, so the fit on X_train above is not what is scored
# here -- confirm this is intentional.
scores = cross_val_score(clf, X_test, y_test, cv=10)
In [180]:
print scores
In [181]:
print "Average accuracy=%.2f" % np.mean(scores)
In [182]:
# NOTE(review): with string labels ('ham'/'spam') the 'precision' scorer's
# default pos_label=1 may not apply -- the binarization below works around
# this by mapping labels to 1/0.
precision = cross_val_score(clf, X_test, y_test, cv=10, scoring='precision')
In [183]:
# For P and R calculations target must be 1 / 0
df_binary = df
df_binary.is_copy = False
In [184]:
df[df_binary[0]=='spam'][0].count()
Out[184]:
In [185]:
df_binary.loc[df_binary[0]=='spam'][0] = 1
In [186]:
df_binary.loc[df_binary[0]=='ham'][0] = 0
In [187]:
# Inspect the frame after the attempted label conversion.
df_binary.head()
Out[187]:
In [188]:
#pd.get_dummies(df[0])
In [195]:
# BUG FIX: `if df_binary[0] is'spam'` compares a whole Series to a string
# literal with *identity* semantics -- it is always False, so neither
# branch ever ran and column 2 was never created.  Build the numeric label
# column with an element-wise mapping instead; values that are already
# numeric (e.g. if the labels were binarized earlier) pass through
# unchanged.
df_binary[2] = df_binary[0].map(lambda label: {'spam': 1, 'ham': 0}.get(label, label))
In [196]:
df_binary.head()
Out[196]:
In [197]:
df_binary.index
Out[197]:
In [198]:
# manually changed: spam=1, ham=0 , stored in new file: smsspam
# TODO : do it via pandas
# Provenance: 'smsspam' is a hand-edited copy of SMSSpamCollection with
# the labels replaced by 1 (spam) / 0 (ham).
df_2 = pd.read_csv("/home/topo/myRepos/github/mml_sklearn/datasets/smsspam/smsspam", delimiter = '\t', header = None)
In [199]:
df_2.head()
Out[199]:
In [200]:
X_train_raw, X_test_raw, y_train, y_test = train_test_split(df_2[1], df_2[0])
In [201]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)
In [202]:
X_train.shape
Out[202]:
In [203]:
X_test.shape
Out[203]:
In [204]:
clf = LogisticRegression()
clf.fit(X_train, y_train)
Out[204]:
In [205]:
preds = clf.predict(X_test)
In [206]:
preds.shape
Out[206]:
In [207]:
preds[:5]
Out[207]:
In [212]:
# NOTE(review): cross-validating on the held-out test split re-fits clf per
# fold; confirm this is what is intended to be measured.
scores = cross_val_score(clf, X_test, y_test, cv=10)
In [215]:
print scores
print 'mean score = %.2f' % np.mean(scores)
In [216]:
# NOTE(review): the variable names precision_score / recall_score / f1_score
# shadow the sklearn.metrics functions of the same names (imported in a
# later cell) -- rename to avoid confusion.
precision_score = cross_val_score(clf, X_test, y_test, cv=10, scoring='precision')
In [218]:
print precision_score
print np.mean(precision_score)
In [219]:
recall_score = cross_val_score(clf, X_test, y_test, cv=10, scoring='recall')
In [221]:
print recall_score
print np.mean(recall_score)
In [223]:
# 'f1-score' is not a valid scoring value. Valid options are
# ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', '
# f1_weighted', 'log_loss', 'mean_absolute_error', 'mean_squared_error', 'median_absolute_error', 'precision',
# 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall',
# 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc']
f1_score = cross_val_score(clf, X_test, y_test, cv=10, scoring='f1')
In [224]:
print f1_score
In [232]:
# Removed 3 items, from full list as they are not meaningful for binary classification
scors = ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro', 'f1_micro',
'f1_weighted', 'log_loss', 'mean_absolute_error', 'mean_squared_error', 'median_absolute_error', 'precision',
'precision_macro', 'precision_micro', 'precision_weighted', 'r2', 'recall',
'recall_macro', 'recall_micro', 'recall_weighted', 'roc_auc']
In [233]:
# Report the mean 10-fold score for every scorer in the list above.
for i, sc in enumerate(scors):
print"%d. Mean %s = %.2f" % (i, sc, np.mean(cross_val_score(clf, X_test, y_test, cv=10, scoring=sc)))
In [234]:
# Plotting AUC curve
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc
In [236]:
# Reload the hand-binarized file (labels 1=spam, 0=ham) and re-split.
df_3 = pd.read_csv("/home/topo/myRepos/github/mml_sklearn/datasets/smsspam/smsspam", delimiter = '\t', header = None)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(df_3[1], df_3[0])
In [237]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)
In [238]:
clf = LogisticRegression()
clf.fit(X_train, y_train)
Out[238]:
In [239]:
# Per-class probabilities; column 0 = P(ham), column 1 = P(spam), as the
# header printed below indicates.
pred_proba = clf.predict_proba(X_test)
In [240]:
pred_proba.shape
Out[240]:
In [251]:
print 'Ham Proba Spam Proba'
print pred_proba[:10]
print X_test_raw[:10]
print y_test[:10] # 1=spam, 0=ham
In [243]:
# roc_curve returns (false positive rate, true positive rate, thresholds);
# the variable named `recall` here is the TPR.  Column 1 of pred_proba is
# the probability of the positive (spam) class.
fp_rate, recall, thresholds = roc_curve(y_test, pred_proba[:, 1])
In [260]:
print thresholds.shape
print thresholds
In [253]:
# Area under the ROC curve.
roc_auc = auc(fp_rate, recall)
In [254]:
roc_auc
Out[254]:
In [270]:
# BUG FIX: this plots a ROC curve (AUC is the area under it), and the x
# axis is the false positive rate (1 - specificity), not "Specivity";
# the y-axis spelling ("Sensivity") is also corrected.
plt.title("ROC Curve")
plt.xlabel("False Positive Rate (1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity / Recall)")
plt.plot(fp_rate, recall, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
# Diagonal reference line = performance of a random classifier.
plt.plot([0,1],[0,1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.show()
In [271]:
# Hyperparameter tuning using Grid Search
In [273]:
# Grid search is a common method to select the hyperparameter values
# that produce the best model. Grid search takes a set of possible values for each
# hyperparameter that should be tuned, and evaluates a model trained on each
# element of the Cartesian product of the sets. That is, grid search is an exhaustive
# search that trains and evaluates a model for each possible combination of the
# hyperparameter values supplied by the developer. A disadvantage of grid search
# is that it is computationally costly for even small sets of hyperparameter values.
# Fortunately, it is an embarrassingly parallel problem; many models can easily be
# trained and evaluated concurrently since no synchronization is required between
# the processes.
In [274]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score
In [275]:
# Two-step pipeline: TF-IDF features feeding a logistic-regression
# classifier; the step names ('vectorizer', 'clf') are referenced by the
# parameter grid below.
pipeline = Pipeline ( [
('vectorizer', TfidfVectorizer(stop_words='english')),
('clf', LogisticRegression())
]
)
In [276]:
pipeline
Out[276]:
In [277]:
# Cartesian product: 3*2*4*2*2*2*2*4 = 1536 parameter combinations, each
# fit once per CV fold -- this search is expensive.
parameters = {
'vectorizer__max_df': (0.25, 0.5, 0.75),
'vectorizer__stop_words': ('english', None),
'vectorizer__max_features': (2500, 5000, 10000, None),
'vectorizer__ngram_range':((1,1),(1,2)),
'vectorizer__use_idf': (True, False),
'vectorizer__norm': ('l1', 'l2'),
'clf__penalty' : ('l1','l2'),
'clf__C' : (0.01, 0.1, 1, 10),
}
In [278]:
# GridSearchCV() takes an estimator, a parameter space, and performance measure.
# The argument n_jobs specifies the maximum number of concurrent jobs; set n_jobs
# to -1 to use all CPU cores. Note that fit() must be called in a Python main block in
# order to fork additional processes
In [281]:
if __name__ == "__main__":
grid_search = GridSearchCV(pipeline, parameters, n_jobs = -1, verbose=1, scoring='accuracy', cv=3)
df_3 = pd.read_csv("/home/topo/myRepos/github/mml_sklearn/datasets/smsspam/smsspam", delimiter = '\t', header = None)
X_train, X_test, y_train, y_test = train_test_split(df_3[1], df_3[0])
grid_search.fit(X_train, y_train)
print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(best_parameters.keys()):
print '\t%s: %r', (param_name, best_parameters[param_name])
predictions = grid_search.predict(X_test)
print 'Accuracy:', accuracy_score(y_test, predictions)
print 'Precision:', precision_score(y_test, predictions)
print 'Recall:', recall_score(y_test, predictions)
In [1]:
# Multiclass classification
%matplotlib inline
import pandas as pd
In [2]:
# header=0: the TSV has named columns; 'Phrase' and 'Sentiment' are used
# below.
df = pd.read_csv("/home/topo/myRepos/github/mml_sklearn/datasets/sentiment_kaggle/train.tsv", delimiter='\t', header=0)
In [3]:
df.head()
Out[3]:
In [4]:
df.count()
Out[4]:
In [36]:
# as_matrix() converts the Series to a numpy array (deprecated in later
# pandas in favour of .values).
print type(df['Sentiment'])
print type(df['Sentiment'].as_matrix())
In [21]:
df[df['Sentiment']==1].count()
Out[21]:
In [14]:
# total sentiment types = 5
set(df['Sentiment'])
Out[14]:
In [23]:
# Per-class row counts for each of the five sentiment labels (0..4).
for i in xrange(5):
print 'Sentiment type: %d' % i, "\n", df[df['Sentiment']==i].count()
In [25]:
print df['Phrase'].head(10)
In [26]:
df['Sentiment'].describe()
Out[26]:
In [27]:
#Items from each type
print df['Sentiment'].value_counts()
In [28]:
# % for each sentiment type
print df['Sentiment'].value_counts()/df['Sentiment'].count()
In [29]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
In [44]:
def main():
pipeline = Pipeline([
('vect', TfidfVectorizer(stop_words='english')),
('clf' , LogisticRegression())
])
parameters = {
'vect__max_df':(0.25, 0.5),
'vect__ngram_range':((1,1),(1,2)),
'vect__use_idf':(True, False),
'clf__C': (0.1, 1.0, 10),
}
df = pd.read_csv("/home/topo/myRepos/github/mml_sklearn/datasets/sentiment_kaggle/train.tsv", delimiter='\t', header=0)
# Convert y from Panda.Series to np.ndarray() else scikit will spew tons of error messages
X, y = df['Phrase'], df['Sentiment'].as_matrix()
# 50% train and 50% test data ==> train_size = .5
# play with different values
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = .7)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
grid_search.fit(X_train, y_train)
print 'Best score: %.2f' % grid_search.best_score_
print 'Best parameter sets: '
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % ( param_name, best_parameters[param_name])
predictions = grid_search.predict(X_test)
print 'Accuracy:', accuracy_score(y_test, predictions)
print 'confusion_matrix', "\n", confusion_matrix(y_test, predictions)
print 'classification report',"\n", classification_report(y_test, predictions)
if __name__ == '__main__':
main()
In [45]:
# Multi-label classification
# The final type of classification
# problem that we will discuss is multi-label classification, in which each instance can
# be assigned a subset of the set of classes. Examples of multi-label classification include
# assigning tags to messages posted on a forum, and classifying the objects present in an
# image
# 1. The first problem
# transformation method that we will review converts each set of labels encountered
# in the training data to a single label.
# 2. A second problem transformation is to train one binary classifier for each of the
# labels in the training set. Each classifier predicts whether or not the instance belongs
# to one label. Our example would require five binary classifiers; the first classifier
# would predict whether or not an instance should be classified as Local, the second
# classifier would predict whether or not an instance should be classified as US, and
# so on. The final prediction is the union of the predictions from all of the binary
# classifiers.
In [ ]: