In [26]:
%reset
import os
import pprint
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
In [40]:
def learn_curve(str_id, clf, X_train, X_test, y_train, y_test):
"""Get learning curve and confusion matrix
Params
-------
str_id : Ideally name of classifier {str}
clf : classifier {sklearn-obj}
X_train : {numpy.array}
X_test : {numpy.array}
y_train : {numpy.array}
y_test : {numpy.array}
Creates two png files.
"""
dest = os.path.join('figures')
if not os.path.exists(dest):
os.makedirs(dest)
# get validation curve
train_sizes, train_scores, test_scores =\
learning_curve(estimator=clf,
X=X_train,
y=y_train,
train_sizes=np.linspace(0.1, 1.0, 10),
cv=10,
n_jobs=-1)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(train_sizes, train_mean,
color='blue', marker='o',
markersize=5, label='training accuracy')
plt.fill_between(train_sizes,
train_mean + train_std,
train_mean - train_std,
alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean,
color='green', linestyle='--',
marker='s', markersize=5,
label='validation accuracy')
plt.fill_between(train_sizes,
test_mean + test_std,
test_mean - test_std,
alpha=0.15, color='green')
plt.tight_layout()
plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
lc_name = str_id + 'learning_curve.png'
plt.savefig('./figures/'+lc_name, dpi=300)
plt.show()
def conf_mat(str_id, clf, X_train, X_test, y_train, y_test):
"""Get learning curve and confusion matrix
Params
-------
str_id : Ideally name of classifier {str}
clf : classifier {sklearn-obj}
X_train : {numpy.array}
X_test : {numpy.array}
y_train : {numpy.array}
y_test : {numpy.array}
Creates two png files.
"""
dest = os.path.join('figures')
if not os.path.exists(dest):
os.makedirs(dest)
# confusion matrix
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(confmat.shape[0]):
for j in range(confmat.shape[1]):
ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
plt.xlabel('predicted label', fontsize=15)
plt.ylabel('true label', fontsize=15)
#plt.tight_layout()
cm_name = str_id + 'confusion_matrix.png'
plt.savefig('./figures/'+cm_name, dpi=300)
plt.show()
In [20]:
# These were obtained from previous Gridsearches from the Notebook_gs* scripts
# the results were stored in a subdir named `pkl_objects`
dest = os.path.join('pkl_objects')
gs_clf1 = pickle.load(open(os.path.join(dest, '2gs_logit.pkl'), 'rb'))
gs_clf2 = pickle.load(open(os.path.join(dest, '2gs_svc.pkl'), 'rb'))
gs_clf3 = pickle.load(open(os.path.join(dest, '2gs_nb.pkl'), 'rb'))
clf1 = gs_clf1.best_estimator_
clf2 = gs_clf2.best_estimator_
clf3 = gs_clf3.best_estimator_
print(gs_clf1.best_params_,'\n')
print(gs_clf2.best_params_,'\n')
print(gs_clf3.best_params_,'\n')
In [28]:
# Out dataset is contained in the following CSV
f_authorship = 'users/authorship.csv'
df = pd.read_csv(f_authorship)
df.drop_duplicates()
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
X = df.loc[:, 'text'].values
y = df.loc[:, 'user_id'].values
le = LabelEncoder()
y = le.fit_transform(y)
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=99)
In [36]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
tweet_token = TweetTokenizer()
def tokenizer_twitter(text):
return tweet_token.tokenize(text)
tfidf = TfidfVectorizer(strip_accents=None,
lowercase=False,
preprocessor=None,
ngram_range=(1,2),
stop_words=None,
tokenizer=tokenizer_twitter)
tf = TfidfVectorizer(strip_accents=None,
lowercase=False,
preprocessor=None,
ngram_range=(1,2),
stop_words=None,
tokenizer=tokenizer_twitter,
use_idf=False)
lr_tfidf = Pipeline([('vect', tfidf),
('clf', LogisticRegression(C=100.0, penalty='l1', random_state=1))])
svc_tfidf = Pipeline([('vect', tfidf),
('clf', SVC(C=10.0, gamma=0.1, kernel='rbf', random_state=1))])
nb_tf = Pipeline([('vect', tf),
('clf', MultinomialNB(alpha=0.25))])
rf_tfidf = Pipeline([('vect', tfidf),
('clf', RandomForestClassifier(criterion='entropy', n_estimators=600, random_state=1))])
ens1 = VotingClassifier(estimators=[('lr', lr_tfidf), ('svc', svc_tfidf), ('nb', nb_tf)], voting='hard')
ens2 = VotingClassifier(estimators=[('lr', lr_tfidf), ('rf', rf_tfidf), ('nb', nb_tf)], voting='soft')
In [30]:
# Hard voting
ens1.fit(X_train, y_train)
print('Test Accuracy for hard voting: {:.3f}'.format(ens1.score(X_test, y_test)))
# Soft voting
ens2.fit(X_train, y_train)
print('Test Accuracy for soft voting: {:.3f}'.format(ens2.score(X_test, y_test)))
In [31]:
%reset_selective ens1
In [32]:
learn_curve('ens1', ens1, X_train, X_test, y_train, y_test)
conf_mat('ens1', ens1, X_train, X_test, y_train, y_test)
In [ ]:
%reset_selective ens2
In [33]:
learn_curve('ens2', ens2, X_train, X_test, y_train, y_test)
conf_mat('ens2', ens2, X_train, X_test, y_train, y_test)
In [41]:
learn_curve('lr', lr_tfidf, X_train, X_test, y_train, y_test)
conf_mat('lr', lr_tfidf, X_train, X_test, y_train, y_test)
In [ ]: