In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, accuracy_score, roc_curve, confusion_matrix
from sklearn import preprocessing
import itertools
from sklearn import metrics
%matplotlib inline
plt.style.use('ggplot')
In [2]:
# Load the labelled topic windows from disk and preview the first rows.
topic_csv_path = 'result_all_windows_labels.csv'
topic_df = pd.read_csv(filepath_or_buffer=topic_csv_path)
topic_df.head()
Out[2]:
In [9]:
# Data cleanup
# Labels that are deliberately left out of the classification task.
excluded_labels = ['environmental', 'religious', 'economical']

# Integer class id for every label that is kept.
label_cat = {'violence/terrorism' : 1, 'misc': 2, 'political': 3,
             # 'religious': 4, 'economical': 5, 'environmental': 6
             }
print(label_cat.keys())


def to_category(x):
    """Return the integer class id for label `x`.

    Raises KeyError when `x` is not a key of `label_cat`, so an unexpected
    label in the data fails loudly instead of yielding a missing target.
    """
    return label_cat[x]


# Drop unlabelled rows and the excluded labels in one pass. The explicit
# .copy() makes the result an independent frame, so adding the 'target'
# column below cannot trigger pandas' SettingWithCopyWarning.
keep_rows = topic_df['label'].notnull() & ~topic_df['label'].isin(excluded_labels)
topic_df = topic_df[keep_rows].copy()

# Element-wise lookup on the label column only (no row-wise apply over the
# whole frame).
topic_df['target'] = topic_df['label'].map(to_category)

plt.figure()
topic_df['target'].plot.hist(alpha=0.5)
# Okay for now?

# One space-joined document per row, built from the columns at positions 2..31.
# NOTE(review): presumably those columns hold the topic words - confirm
# against the CSV layout.
# Missing cells (NaN) are skipped so they cannot break the string join; any
# other non-string value still raises TypeError, as before.
topic_corpus = [
    u' '.join(word for word in words if pd.notnull(word))
    for words in topic_df.iloc[:, 2:32].itertuples(index=False, name=None)
]
# topic_df
In [11]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix.

    Parameters
    ----------
    cm : 2-D array-like
        The confusion matrix. Rows are drawn along the 'True label' axis and
        columns along the 'Predicted label' axis.
    classes : iterable
        Tick labels, one per row/column. Any iterable is accepted (for
        example ``dict.keys()``); it is materialised into a list once.
    normalize : bool
        When True every row is divided by its row sum before printing and
        plotting. Rows whose sum is zero are left as all zeros instead of
        producing NaN and a divide-by-zero warning.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap handed to ``plt.imshow``.
    """
    classes = list(classes)
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis].astype('float')
        # Only divide where the row total is non-zero; other cells keep the
        # zero pre-filled through `out`.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Ratios are shown with two decimals, raw counts as integers.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
In [12]:
# Documents are the joined topic strings; targets are the integer class ids.
X, y = topic_corpus, topic_df['target']

# Hold out 20% of the documents for testing, stratified on the target,
# with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)
In [13]:
# Sanity check: documents and targets must have equal length within each split.
for split_docs, split_targets in ((X_train, y_train), (X_test, y_test)):
    print(len(split_docs), len(split_targets))
# print(y)
In [14]:
# TF-IDF features with L2-normalised rows. (A previous
# `vect = CountVectorizer(min_df=2)` assignment was overwritten on the very
# next line and never used, so it has been removed to avoid suggesting that
# min_df=2 is in effect.)
vect = TfidfVectorizer(norm='l2')

# Fit the vocabulary and IDF weights on the training documents only, then
# project the test documents onto that same vocabulary.
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

# No separate preprocessing.normalize(..., norm='l2') pass is applied:
# norm='l2' on the vectoriser already normalises every row.
In [15]:
def kNeighours(X_train_dtm, X_test_dtm, y_train, y_test):
    """Fit a 3-nearest-neighbours classifier and report test-set results.

    Prints the accuracy and the per-class classification report, then shows
    the raw-count confusion matrix.
    """
    from sklearn.neighbors import KNeighborsClassifier

    model = KNeighborsClassifier(n_neighbors=3)
    model.fit(X_train_dtm, y_train)
    predicted = model.predict(X_test_dtm)

    print('accuracy: ', accuracy_score(y_test, predicted))
    print(metrics.classification_report(
        y_test, predicted, target_names=list(label_cat.keys())))

    # Confusion matrix of raw counts (normalize is left at its default).
    counts = confusion_matrix(y_test, predicted)
    plt.figure(figsize=(12, 8))
    plot_confusion_matrix(counts, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()


kNeighours(X_train_dtm, X_test_dtm, y_train, y_test)
In [16]:
def nearestCentroid(X_train_dtm, X_test_dtm, y_train, y_test):
    """Fit a nearest-centroid classifier and report test-set results.

    Prints the accuracy and the per-class classification report, then shows
    the raw-count confusion matrix.
    """
    # Import from the public package path: the private submodule
    # `sklearn.neighbors.nearest_centroid` is not importable in newer
    # scikit-learn releases.
    from sklearn.neighbors import NearestCentroid

    clf = NearestCentroid()
    clf.fit(X_train_dtm, y_train)
    y_pred_test = clf.predict(X_test_dtm)

    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))

    # Plot the confusion matrix of raw counts (normalize is left at its default).
    plt.figure(figsize=(12, 8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()


nearestCentroid(X_train_dtm, X_test_dtm, y_train, y_test)
In [17]:
def multinomial(X_train_dtm, X_test_dtm, y_train, y_test):
    """Fit a multinomial naive Bayes classifier and report test-set results.

    Prints the accuracy and the per-class classification report, then shows
    the raw-count confusion matrix.
    """
    from sklearn.naive_bayes import MultinomialNB

    # Build and train the estimator with its default settings.
    model = MultinomialNB()
    model.fit(X_train_dtm, y_train)

    # Score the held-out documents.
    predicted = model.predict(X_test_dtm)
    print('accuracy: ', accuracy_score(y_test, predicted))
    print(metrics.classification_report(
        y_test, predicted, target_names=list(label_cat.keys())))

    # Confusion matrix of raw counts (normalize is left at its default).
    counts = confusion_matrix(y_test, predicted)
    plt.figure(figsize=(12, 8))
    plot_confusion_matrix(counts, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()


multinomial(X_train_dtm, X_test_dtm, y_train, y_test)
In [21]:
def logisticRegression(X_train_dtm, X_test_dtm, y_train, y_test):
    """Fit a logistic-regression classifier (C=15) and report test-set results.

    Prints the accuracy and the per-class classification report, then shows
    the raw-count confusion matrix.
    """
    from sklearn.linear_model import LogisticRegression

    # Estimator settings kept together so they are easy to tune.
    settings = dict(C=15, n_jobs=-1, random_state=15)
    model = LogisticRegression(**settings)
    model.fit(X_train_dtm, y_train)

    # Score the held-out documents.
    predicted = model.predict(X_test_dtm)
    print('accuracy: ', accuracy_score(y_test, predicted))
    print(metrics.classification_report(
        y_test, predicted, target_names=list(label_cat.keys())))

    # Confusion matrix of raw counts (normalize is left at its default).
    counts = confusion_matrix(y_test, predicted)
    plt.figure(figsize=(12, 8))
    plot_confusion_matrix(counts, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()


logisticRegression(X_train_dtm, X_test_dtm, y_train, y_test)
In [22]:
def tree(X_train_dtm, X_test_dtm, y_train, y_test):
    """Fit a depth-limited decision tree and report test-set results.

    Prints the accuracy and the per-class classification report, then shows
    the raw-count confusion matrix.
    """
    from sklearn.tree import DecisionTreeClassifier

    model = DecisionTreeClassifier(max_depth=10, random_state=10)
    model.fit(X_train_dtm, y_train)

    # Score the held-out documents.
    predicted = model.predict(X_test_dtm)
    print('accuracy: ', accuracy_score(y_test, predicted))
    print(metrics.classification_report(
        y_test, predicted, target_names=list(label_cat.keys())))

    # Confusion matrix of raw counts (normalize is left at its default).
    counts = confusion_matrix(y_test, predicted)
    plt.figure(figsize=(12, 8))
    plot_confusion_matrix(counts, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()


tree(X_train_dtm, X_test_dtm, y_train, y_test)
In [23]:
def gaussianNB(X_train_dtm, X_test_dtm, y_train, y_test):
    """Fit a Gaussian naive Bayes classifier and report test-set results.

    Both document-term matrices are converted with ``.toarray()`` before
    being handed to the estimator. Prints the accuracy and the per-class
    classification report, then shows the raw-count confusion matrix.
    """
    from sklearn.naive_bayes import GaussianNB

    # Densify once up front, then train and predict on the dense arrays.
    dense_train = X_train_dtm.toarray()
    dense_test = X_test_dtm.toarray()

    model = GaussianNB()
    model.fit(dense_train, y_train)
    predicted = model.predict(dense_test)

    print('accuracy: ', accuracy_score(y_test, predicted))
    print(metrics.classification_report(
        y_test, predicted, target_names=list(label_cat.keys())))

    # Confusion matrix of raw counts (normalize is left at its default).
    counts = confusion_matrix(y_test, predicted)
    plt.figure(figsize=(12, 8))
    plot_confusion_matrix(counts, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()


gaussianNB(X_train_dtm, X_test_dtm, y_train, y_test)
In [24]:
def svm(X_train_dtm, X_test_dtm, y_train, y_test):
    """Fit a linear-kernel support vector classifier and report test-set results.

    Prints the accuracy and the per-class classification report, then shows
    the raw-count confusion matrix.
    """
    from sklearn.svm import SVC

    model = SVC(kernel='linear')
    model.fit(X_train_dtm, y_train)

    # Score the held-out documents.
    predicted = model.predict(X_test_dtm)
    print('accuracy: ', accuracy_score(y_test, predicted))
    print(metrics.classification_report(
        y_test, predicted, target_names=list(label_cat.keys())))

    # Confusion matrix of raw counts (normalize is left at its default).
    counts = confusion_matrix(y_test, predicted)
    plt.figure(figsize=(12, 8))
    plot_confusion_matrix(counts, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()


svm(X_train_dtm, X_test_dtm, y_train, y_test)
In [25]:
def randomForest(X_train_dtm, X_test_dtm, y_train, y_test):
    """Fit a random forest (max depth 20, seed 0) and report test-set results.

    Prints the accuracy and the per-class classification report, then shows
    the raw-count confusion matrix.
    """
    from sklearn.ensemble import RandomForestClassifier

    model = RandomForestClassifier(max_depth=20, random_state=0)
    model.fit(X_train_dtm, y_train)

    # Score the held-out documents.
    predicted = model.predict(X_test_dtm)
    print('accuracy: ', accuracy_score(y_test, predicted))
    print(metrics.classification_report(
        y_test, predicted, target_names=list(label_cat.keys())))

    # Confusion matrix of raw counts (normalize is left at its default).
    counts = confusion_matrix(y_test, predicted)
    plt.figure(figsize=(12, 8))
    plot_confusion_matrix(counts, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()


randomForest(X_train_dtm, X_test_dtm, y_train, y_test)
In [27]:
def adaBoostClassifier(X_train_dtm, X_test_dtm, y_train, y_test):
    """Fit an AdaBoost ensemble (100 estimators) and report test-set results.

    Prints the accuracy and the per-class classification report, then shows
    the raw-count confusion matrix.
    """
    from sklearn.ensemble import AdaBoostClassifier

    # A fixed random_state makes the boosted ensemble reproducible across
    # runs, in line with the other seeded models in this notebook.
    clf = AdaBoostClassifier(n_estimators=100, random_state=0)
    clf.fit(X_train_dtm, y_train)

    # Evaluate
    y_pred_test = clf.predict(X_test_dtm)
    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))

    # Plot the confusion matrix of raw counts (normalize is left at its default).
    plt.figure(figsize=(12, 8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()


adaBoostClassifier(X_train_dtm, X_test_dtm, y_train, y_test)
In [30]:
def SGDClassifier(X_train_dtm, X_test_dtm, y_train, y_test):
    """Fit a hinge-loss SGD linear classifier and report test-set results.

    Prints the accuracy and the per-class classification report, then shows
    the raw-count confusion matrix.
    """
    # Aliased so the estimator class is not confused with this wrapper
    # function, which carries the same name.
    from sklearn.linear_model import SGDClassifier as SGDEstimator

    model = SGDEstimator(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)
    model.fit(X_train_dtm, y_train)

    # Score the held-out documents.
    predicted = model.predict(X_test_dtm)
    print('accuracy: ', accuracy_score(y_test, predicted))
    print(metrics.classification_report(
        y_test, predicted, target_names=list(label_cat.keys())))

    # Confusion matrix of raw counts (normalize is left at its default).
    counts = confusion_matrix(y_test, predicted)
    plt.figure(figsize=(12, 8))
    plot_confusion_matrix(counts, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()


SGDClassifier(X_train_dtm, X_test_dtm, y_train, y_test)
In [33]:
# https://www.kdnuggets.com/2016/10/beginners-guide-neural-networks-python-scikit-learn.html/2
def MLPClassifier(X_train_dtm, X_test_dtm, y_train, y_test):
    """Fit a multi-layer perceptron (hidden layers 30-20-20) and report test-set results.

    Prints the accuracy and the per-class classification report, then shows
    the raw-count confusion matrix.
    """
    # Aliased so the estimator class is not confused with this wrapper
    # function, which carries the same name.
    from sklearn.neural_network import MLPClassifier as MLPEstimator

    # A fixed random_state seeds weight initialisation and shuffling, making
    # the reported scores reproducible across runs.
    clf = MLPEstimator(hidden_layer_sizes=(30, 20, 20), random_state=0)
    clf.fit(X_train_dtm, y_train)

    # Evaluate
    y_pred_test = clf.predict(X_test_dtm)
    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))

    # Plot the confusion matrix of raw counts (normalize is left at its default).
    plt.figure(figsize=(12, 8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()


MLPClassifier(X_train_dtm, X_test_dtm, y_train, y_test)
In [22]:
# def predict_topic(model, doc):
# prediction = model.predict(vect.transform(doc))[0]
# print(doc)
# print(prediction)
# predict_topic(class_tree, ['hagel iraqi baghdad iraq forces abadi troop terry coalition kuwait secretary_chuck commander us army islamic_state defense train prime_minister advise james_terry obama security is_militant military lieutenant_general'])
In [20]:
# store the predicted probabilities for class 1
# y_pred_prob = logistic_regression.predict_proba(X_train_dtm)[:, 0]
# # histogram of predicted probabilities
# # 8 bins
# plt.hist(y_pred_prob, bins=8)
# # x-axis limit from 0 to 1
# plt.xlim(0,1)
# plt.title('Histogram of predicted probabilities')
# plt.xlabel('Predicted probability of topics')
# plt.ylabel('Frequency')
In [ ]:
In [ ]: