Classification and prediction of topics


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, accuracy_score, roc_curve, confusion_matrix
from sklearn import preprocessing
import itertools
from sklearn import metrics


%matplotlib inline
plt.style.use('ggplot')

In [2]:
topic_df = pd.read_csv('result_all_windows_labels.csv')
topic_df.head()


Out[2]:
0 label 1 2 3 4 5 6 7 8 ... 28 29 30 label_w2v_no label_w2v_scale label_w2v_normalize label_wmd_distance label_wmd_normalize label_wmd_scale label_tfidf_normalize
0 2012_01_01 violence/terrorism syrian assad say syria killed damascus people regime ... civilian least journalist 20 3 20 34 6 60 15
1 2012_01_02 misc use osc copyrighted_material dissemination usage reproduction original authorize ... al agency location 36 6 25 49 2 60 46
2 2012_01_03 misc will year can people one country party make ... change political just 15 15 19 10 3 60 8
3 2012_01_04 misc quot apos say the we it reuters terrorists ... but don protest 38 25 10 20 23 60 6
4 2012_01_05 violence/terrorism baghdad iraq sunni killed bomb iraqi attacks wound ... sadr basra near 26 5 11 18 14 60 29

5 rows × 39 columns


In [9]:
# Data cleanup
topic_df = topic_df[topic_df['label'].notnull()]
topic_df = topic_df[topic_df.label != 'environmental']
topic_df = topic_df[topic_df.label != 'religious']
topic_df = topic_df[topic_df.label != 'economical']

label_cat = {'violence/terrorism' : 1, 'misc': 2, 'political': 3,
#              'religious': 4, 'economical': 5, 'environmental': 6
            }
print(label_cat.keys()) 


def to_category(x):
    return label_cat[x]

topic_df['target'] = topic_df.apply(lambda row: to_category(row['label']), axis=1)

plt.figure()
topic_df['target'].plot.hist(alpha=0.5)
# Okay for now?

topic_corpus = []

for index, row in topic_df.iloc[ :, 2:32].iterrows():
    topic_corpus.append(u' '.join(row.tolist()))


# topic_df


dict_keys(['political', 'misc', 'violence/terrorism'])

In [11]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

Text Vector-representation (Bag of Words)


In [12]:
X = topic_corpus
y = topic_df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)

In [13]:
print(len(X_train), len(y_train))
print(len(X_test), len(y_test))
# print(y)


1159 1159
290 290

In [14]:
vect = CountVectorizer(min_df=2)
vect = TfidfVectorizer(norm='l2')
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)


# # normalized
# X_train_dtm = preprocessing.normalize(X_train_dtm, norm='l2')
# X_test_dtm = preprocessing.normalize(X_test_dtm, norm='l2')


/usr/local/lib/python3.5/dist-packages/sklearn/feature_extraction/text.py:1059: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):

KNeighborsClassifier


In [15]:
def kNeighours(X_train_dtm, X_test_dtm, y_train, y_test):
    from sklearn.neighbors import KNeighborsClassifier
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(X_train_dtm, y_train) 

    y_pred_test = neigh.predict(X_test_dtm)
    
    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))
    
    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()

    
    
kNeighours(X_train_dtm, X_test_dtm, y_train, y_test)


accuracy:  0.8275862068965517
                    precision    recall  f1-score   support

         political       0.83      0.82      0.82       115
              misc       0.85      0.90      0.88        71
violence/terrorism       0.80      0.79      0.80       104

       avg / total       0.83      0.83      0.83       290

Confusion matrix, without normalization
[[94  5 16]
 [ 3 64  4]
 [16  6 82]]

NearestCentroid


In [16]:
def nearestCentroid(X_train_dtm, X_test_dtm, y_train, y_test):
    from sklearn.neighbors.nearest_centroid import NearestCentroid

    clf = NearestCentroid()
    clf.fit(X_train_dtm, y_train)

    y_pred_test = clf.predict(X_test_dtm)
    
    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))

    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()
    
nearestCentroid(X_train_dtm, X_test_dtm, y_train, y_test)


accuracy:  0.8241379310344827
                    precision    recall  f1-score   support

         political       0.83      0.76      0.79       115
              misc       0.89      0.90      0.90        71
violence/terrorism       0.78      0.85      0.81       104

       avg / total       0.83      0.82      0.82       290

Confusion matrix, without normalization
[[87  5 23]
 [ 5 64  2]
 [13  3 88]]

Multinomial classifier


In [17]:
def multinomial(X_train_dtm, X_test_dtm, y_train, y_test):

    from sklearn.naive_bayes import MultinomialNB

    # Create instance of estimator
    nb = MultinomialNB()
    # Train the model
    nb.fit(X_train_dtm, y_train)
    # Evaluate
    y_pred_test = nb.predict(X_test_dtm)


    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))

    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()
    
multinomial(X_train_dtm, X_test_dtm, y_train, y_test)


accuracy:  0.8172413793103448
                    precision    recall  f1-score   support

         political       0.79      0.82      0.80       115
              misc       0.94      0.82      0.87        71
violence/terrorism       0.78      0.82      0.80       104

       avg / total       0.82      0.82      0.82       290

Confusion matrix, without normalization
[[94  2 19]
 [ 8 58  5]
 [17  2 85]]

Logistic Regression


In [21]:
def logisticRegression(X_train_dtm, X_test_dtm, y_train, y_test):
    from sklearn.linear_model import LogisticRegression

    # Create instance of estimator
    logistic_regression = LogisticRegression(C=15, n_jobs=-1, random_state=15)
    # Train
    logistic_regression.fit(X_train_dtm, y_train)
    # Evaluate
    y_pred_test = logistic_regression.predict(X_test_dtm)

    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))
    
    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()
    
logisticRegression(X_train_dtm, X_test_dtm, y_train, y_test)


accuracy:  0.8413793103448276
                    precision    recall  f1-score   support

         political       0.85      0.83      0.84       115
              misc       0.91      0.89      0.90        71
violence/terrorism       0.79      0.83      0.81       104

       avg / total       0.84      0.84      0.84       290

Confusion matrix, without normalization
[[95  2 18]
 [ 3 63  5]
 [14  4 86]]

Classification Trees


In [22]:
def tree(X_train_dtm, X_test_dtm, y_train, y_test):
    from sklearn.tree import DecisionTreeClassifier

    class_tree = DecisionTreeClassifier(max_depth=10, random_state=10)

    class_tree.fit(X_train_dtm, y_train)

    # Evaluate
    y_pred_test = class_tree.predict(X_test_dtm)

    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))

    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()
    
tree(X_train_dtm, X_test_dtm, y_train, y_test)


accuracy:  0.696551724137931
                    precision    recall  f1-score   support

         political       0.82      0.59      0.69       115
              misc       0.66      0.77      0.71        71
violence/terrorism       0.64      0.76      0.69       104

       avg / total       0.72      0.70      0.70       290

Confusion matrix, without normalization
[[68 15 32]
 [ 3 55 13]
 [12 13 79]]

Naive Bayes Classifier - GaussianNB


In [23]:
def gaussianNB(X_train_dtm, X_test_dtm, y_train, y_test):
    from sklearn.naive_bayes import GaussianNB

    # Create instance of estimator
    gnb = GaussianNB()
    # Train the model
    gnb.fit(X_train_dtm.toarray(), y_train)
    # Evaluate
    y_pred_test = gnb.predict(X_test_dtm.toarray())

    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))

    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()
    
gaussianNB(X_train_dtm, X_test_dtm, y_train, y_test)


accuracy:  0.7379310344827587
                    precision    recall  f1-score   support

         political       0.71      0.73      0.72       115
              misc       0.86      0.80      0.83        71
violence/terrorism       0.70      0.70      0.70       104

       avg / total       0.74      0.74      0.74       290

Confusion matrix, without normalization
[[84  5 26]
 [ 8 57  6]
 [27  4 73]]

SVM


In [24]:
def svm (X_train_dtm, X_test_dtm, y_train, y_test):
    from sklearn.svm import SVC
    clf = SVC(kernel='linear')
    clf.fit(X_train_dtm, y_train)
    # Evaluate
    y_pred_test = clf.predict(X_test_dtm)
    
    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))

    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()
    
svm(X_train_dtm, X_test_dtm, y_train, y_test)


accuracy:  0.8413793103448276
                    precision    recall  f1-score   support

         political       0.83      0.83      0.83       115
              misc       0.90      0.90      0.90        71
violence/terrorism       0.82      0.81      0.81       104

       avg / total       0.84      0.84      0.84       290

Confusion matrix, without normalization
[[96  2 17]
 [ 5 64  2]
 [15  5 84]]

In [25]:
def randomForest (X_train_dtm, X_test_dtm, y_train, y_test):
    from sklearn.ensemble import RandomForestClassifier
    
    clf = RandomForestClassifier(max_depth=20, random_state=0)
    clf.fit(X_train_dtm, y_train)
    # Evaluate
    y_pred_test = clf.predict(X_test_dtm)
  
    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))
    
    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()
    
randomForest(X_train_dtm, X_test_dtm, y_train, y_test)


accuracy:  0.8
                    precision    recall  f1-score   support

         political       0.75      0.82      0.78       115
              misc       0.94      0.83      0.88        71
violence/terrorism       0.77      0.76      0.77       104

       avg / total       0.81      0.80      0.80       290

Confusion matrix, without normalization
[[94  2 19]
 [ 8 59  4]
 [23  2 79]]

In [27]:
def adaBoostClassifier (X_train_dtm, X_test_dtm, y_train, y_test):
    from sklearn.ensemble import AdaBoostClassifier
    
    clf = AdaBoostClassifier(n_estimators=100)
    clf.fit(X_train_dtm, y_train)
    # Evaluate
    y_pred_test = clf.predict(X_test_dtm)

    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))

    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()
    
adaBoostClassifier(X_train_dtm, X_test_dtm, y_train, y_test)


accuracy:  0.6931034482758621
                    precision    recall  f1-score   support

         political       0.72      0.70      0.71       115
              misc       0.85      0.55      0.67        71
violence/terrorism       0.61      0.78      0.69       104

       avg / total       0.71      0.69      0.69       290

Confusion matrix, without normalization
[[81  2 32]
 [13 39 19]
 [18  5 81]]

In [30]:
def SGDClassifier (X_train_dtm, X_test_dtm, y_train, y_test):
    from sklearn.linear_model import SGDClassifier
    
    clf =  SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)
    clf.fit(X_train_dtm, y_train)
    # Evaluate
    y_pred_test = clf.predict(X_test_dtm)

    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))

    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()
    
SGDClassifier(X_train_dtm, X_test_dtm, y_train, y_test)


accuracy:  0.8448275862068966
                    precision    recall  f1-score   support

         political       0.83      0.83      0.83       115
              misc       0.90      0.92      0.91        71
violence/terrorism       0.82      0.81      0.81       104

       avg / total       0.84      0.84      0.84       290

Confusion matrix, without normalization
[[96  2 17]
 [ 4 65  2]
 [15  5 84]]

In [33]:
# https://www.kdnuggets.com/2016/10/beginners-guide-neural-networks-python-scikit-learn.html/2
def MLPClassifier (X_train_dtm, X_test_dtm, y_train, y_test):
    from sklearn.neural_network import MLPClassifier
    
    clf = MLPClassifier(hidden_layer_sizes=(30,20,20))
    clf.fit(X_train_dtm, y_train)
    # Evaluate
    y_pred_test = clf.predict(X_test_dtm)

    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))

    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()
    
MLPClassifier(X_train_dtm, X_test_dtm, y_train, y_test)


accuracy:  0.8068965517241379
                    precision    recall  f1-score   support

         political       0.80      0.81      0.81       115
              misc       0.90      0.85      0.87        71
violence/terrorism       0.76      0.78      0.77       104

       avg / total       0.81      0.81      0.81       290

Confusion matrix, without normalization
[[93  3 19]
 [ 4 60  7]
 [19  4 81]]

In [22]:
# def predict_topic(model, doc):
#     prediction = model.predict(vect.transform(doc))[0]
#     print(doc)
#     print(prediction)
    
# predict_topic(class_tree, ['hagel iraqi baghdad iraq forces abadi troop terry coalition kuwait secretary_chuck commander us army islamic_state defense train prime_minister advise james_terry obama security is_militant military lieutenant_general'])

In [20]:
# store the predicted probabilities for class 1
# y_pred_prob = logistic_regression.predict_proba(X_train_dtm)[:, 0]
# # histogram of predicted probabilities

# # 8 bins
# plt.hist(y_pred_prob, bins=8)

# # x-axis limit from 0 to 1
# plt.xlim(0,1)
# plt.title('Histogram of predicted probabilities')
# plt.xlabel('Predicted probability of topics')
# plt.ylabel('Frequency')

In [ ]:


In [ ]: