Classification and prediction of topics



In [1]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, accuracy_score, roc_curve, confusion_matrix
from sklearn import preprocessing
import itertools
from sklearn import metrics


%matplotlib inline
plt.style.use('ggplot')



In [2]:

    
topic_df = pd.read_csv('result_all_windows_labels.csv')
topic_df.head()









    Out[2]:







  
    
      
      0
      label
      1
      2
      3
      4
      5
      6
      7
      8
      ...
      28
      29
      30
      label_w2v_no
      label_w2v_scale
      label_w2v_normalize
      label_wmd_distance
      label_wmd_normalize
      label_wmd_scale
      label_tfidf_normalize
    
  
  
    
      0
      2012_01_01
      violence/terrorism
      syrian
      assad
      say
      syria
      killed
      damascus
      people
      regime
      ...
      civilian
      least
      journalist
      20
      3
      20
      34
      6
      60
      15
    
    
      1
      2012_01_02
      misc
      use
      osc
      copyrighted_material
      dissemination
      usage
      reproduction
      original
      authorize
      ...
      al
      agency
      location
      36
      6
      25
      49
      2
      60
      46
    
    
      2
      2012_01_03
      misc
      will
      year
      can
      people
      one
      country
      party
      make
      ...
      change
      political
      just
      15
      15
      19
      10
      3
      60
      8
    
    
      3
      2012_01_04
      misc
      quot
      apos
      say
      the
      we
      it
      reuters
      terrorists
      ...
      but
      don
      protest
      38
      25
      10
      20
      23
      60
      6
    
    
      4
      2012_01_05
      violence/terrorism
      baghdad
      iraq
      sunni
      killed
      bomb
      iraqi
      attacks
      wound
      ...
      sadr
      basra
      near
      26
      5
      11
      18
      14
      60
      29
    
  

5 rows × 39 columns



In [9]:

    
# Data cleanup
topic_df = topic_df[topic_df['label'].notnull()]
topic_df = topic_df[topic_df.label != 'environmental']
topic_df = topic_df[topic_df.label != 'religious']
topic_df = topic_df[topic_df.label != 'economical']

label_cat = {'violence/terrorism' : 1, 'misc': 2, 'political': 3,
#              'religious': 4, 'economical': 5, 'environmental': 6
            }
print(label_cat.keys()) 


def to_category(x):
    return label_cat[x]

topic_df['target'] = topic_df.apply(lambda row: to_category(row['label']), axis=1)

plt.figure()
topic_df['target'].plot.hist(alpha=0.5)
# Okay for now?

topic_corpus = []

for index, row in topic_df.iloc[ :, 2:32].iterrows():
    topic_corpus.append(u' '.join(row.tolist()))


# topic_df









    



dict_keys(['political', 'misc', 'violence/terrorism'])



In [11]:

    
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

Text Vector-representation (Bag of Words)



In [12]:

    
X = topic_corpus
y = topic_df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)



In [13]:

    
print(len(X_train), len(y_train))
print(len(X_test), len(y_test))
# print(y)



In [14]:

    
vect = CountVectorizer(min_df=2)
vect = TfidfVectorizer(norm='l2')
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)


# # normalized
# X_train_dtm = preprocessing.normalize(X_train_dtm, norm='l2')
# X_test_dtm = preprocessing.normalize(X_test_dtm, norm='l2')









    



/usr/local/lib/python3.5/dist-packages/sklearn/feature_extraction/text.py:1059: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):

KNeighborsClassifier



In [15]:

    
def kNeighours(X_train_dtm, X_test_dtm, y_train, y_test):
    from sklearn.neighbors import KNeighborsClassifier
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(X_train_dtm, y_train) 

    y_pred_test = neigh.predict(X_test_dtm)
    
    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))
    
    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()

    
    
kNeighours(X_train_dtm, X_test_dtm, y_train, y_test)









    



accuracy:  0.8275862068965517
                    precision    recall  f1-score   support

         political       0.83      0.82      0.82       115
              misc       0.85      0.90      0.88        71
violence/terrorism       0.80      0.79      0.80       104

       avg / total       0.83      0.83      0.83       290

Confusion matrix, without normalization
[[94  5 16]
 [ 3 64  4]
 [16  6 82]]

NearestCentroid



In [16]:

    
def nearestCentroid(X_train_dtm, X_test_dtm, y_train, y_test):
    from sklearn.neighbors.nearest_centroid import NearestCentroid

    clf = NearestCentroid()
    clf.fit(X_train_dtm, y_train)

    y_pred_test = clf.predict(X_test_dtm)
    
    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))

    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()
    
nearestCentroid(X_train_dtm, X_test_dtm, y_train, y_test)









    



accuracy:  0.8241379310344827
                    precision    recall  f1-score   support

         political       0.83      0.76      0.79       115
              misc       0.89      0.90      0.90        71
violence/terrorism       0.78      0.85      0.81       104

       avg / total       0.83      0.82      0.82       290

Confusion matrix, without normalization
[[87  5 23]
 [ 5 64  2]
 [13  3 88]]

Multinomial classifier



In [17]:

    
def multinomial(X_train_dtm, X_test_dtm, y_train, y_test):

    from sklearn.naive_bayes import MultinomialNB

    # Create instance of estimator
    nb = MultinomialNB()
    # Train the model
    nb.fit(X_train_dtm, y_train)
    # Evaluate
    y_pred_test = nb.predict(X_test_dtm)


    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))

    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()
    
multinomial(X_train_dtm, X_test_dtm, y_train, y_test)









    



accuracy:  0.8172413793103448
                    precision    recall  f1-score   support

         political       0.79      0.82      0.80       115
              misc       0.94      0.82      0.87        71
violence/terrorism       0.78      0.82      0.80       104

       avg / total       0.82      0.82      0.82       290

Confusion matrix, without normalization
[[94  2 19]
 [ 8 58  5]
 [17  2 85]]

Logistic Regression



In [21]:

    
def logisticRegression(X_train_dtm, X_test_dtm, y_train, y_test):
    from sklearn.linear_model import LogisticRegression

    # Create instance of estimator
    logistic_regression = LogisticRegression(C=15, n_jobs=-1, random_state=15)
    # Train
    logistic_regression.fit(X_train_dtm, y_train)
    # Evaluate
    y_pred_test = logistic_regression.predict(X_test_dtm)

    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))
    
    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()
    
logisticRegression(X_train_dtm, X_test_dtm, y_train, y_test)









    



accuracy:  0.8413793103448276
                    precision    recall  f1-score   support

         political       0.85      0.83      0.84       115
              misc       0.91      0.89      0.90        71
violence/terrorism       0.79      0.83      0.81       104

       avg / total       0.84      0.84      0.84       290

Confusion matrix, without normalization
[[95  2 18]
 [ 3 63  5]
 [14  4 86]]

Classification Trees



In [22]:

    
def tree(X_train_dtm, X_test_dtm, y_train, y_test):
    from sklearn.tree import DecisionTreeClassifier

    class_tree = DecisionTreeClassifier(max_depth=10, random_state=10)

    class_tree.fit(X_train_dtm, y_train)

    # Evaluate
    y_pred_test = class_tree.predict(X_test_dtm)

    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))

    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()
    
tree(X_train_dtm, X_test_dtm, y_train, y_test)









    



accuracy:  0.696551724137931
                    precision    recall  f1-score   support

         political       0.82      0.59      0.69       115
              misc       0.66      0.77      0.71        71
violence/terrorism       0.64      0.76      0.69       104

       avg / total       0.72      0.70      0.70       290

Confusion matrix, without normalization
[[68 15 32]
 [ 3 55 13]
 [12 13 79]]

Naive Bayes Classifier - GaussianNB



In [23]:

    
def gaussianNB(X_train_dtm, X_test_dtm, y_train, y_test):
    from sklearn.naive_bayes import GaussianNB

    # Create instance of estimator
    gnb = GaussianNB()
    # Train the model
    gnb.fit(X_train_dtm.toarray(), y_train)
    # Evaluate
    y_pred_test = gnb.predict(X_test_dtm.toarray())

    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))

    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()
    
gaussianNB(X_train_dtm, X_test_dtm, y_train, y_test)









    



accuracy:  0.7379310344827587
                    precision    recall  f1-score   support

         political       0.71      0.73      0.72       115
              misc       0.86      0.80      0.83        71
violence/terrorism       0.70      0.70      0.70       104

       avg / total       0.74      0.74      0.74       290

Confusion matrix, without normalization
[[84  5 26]
 [ 8 57  6]
 [27  4 73]]

SVM



In [24]:

    
def svm (X_train_dtm, X_test_dtm, y_train, y_test):
    from sklearn.svm import SVC
    clf = SVC(kernel='linear')
    clf.fit(X_train_dtm, y_train)
    # Evaluate
    y_pred_test = clf.predict(X_test_dtm)
    
    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))

    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()
    
svm(X_train_dtm, X_test_dtm, y_train, y_test)









    



accuracy:  0.8413793103448276
                    precision    recall  f1-score   support

         political       0.83      0.83      0.83       115
              misc       0.90      0.90      0.90        71
violence/terrorism       0.82      0.81      0.81       104

       avg / total       0.84      0.84      0.84       290

Confusion matrix, without normalization
[[96  2 17]
 [ 5 64  2]
 [15  5 84]]



In [25]:

    
def randomForest (X_train_dtm, X_test_dtm, y_train, y_test):
    from sklearn.ensemble import RandomForestClassifier
    
    clf = RandomForestClassifier(max_depth=20, random_state=0)
    clf.fit(X_train_dtm, y_train)
    # Evaluate
    y_pred_test = clf.predict(X_test_dtm)
  
    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))
    
    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()
    
randomForest(X_train_dtm, X_test_dtm, y_train, y_test)









    



accuracy:  0.8
                    precision    recall  f1-score   support

         political       0.75      0.82      0.78       115
              misc       0.94      0.83      0.88        71
violence/terrorism       0.77      0.76      0.77       104

       avg / total       0.81      0.80      0.80       290

Confusion matrix, without normalization
[[94  2 19]
 [ 8 59  4]
 [23  2 79]]



In [27]:

    
def adaBoostClassifier (X_train_dtm, X_test_dtm, y_train, y_test):
    from sklearn.ensemble import AdaBoostClassifier
    
    clf = AdaBoostClassifier(n_estimators=100)
    clf.fit(X_train_dtm, y_train)
    # Evaluate
    y_pred_test = clf.predict(X_test_dtm)

    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))

    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()
    
adaBoostClassifier(X_train_dtm, X_test_dtm, y_train, y_test)









    



accuracy:  0.6931034482758621
                    precision    recall  f1-score   support

         political       0.72      0.70      0.71       115
              misc       0.85      0.55      0.67        71
violence/terrorism       0.61      0.78      0.69       104

       avg / total       0.71      0.69      0.69       290

Confusion matrix, without normalization
[[81  2 32]
 [13 39 19]
 [18  5 81]]



In [30]:

    
def SGDClassifier (X_train_dtm, X_test_dtm, y_train, y_test):
    from sklearn.linear_model import SGDClassifier
    
    clf =  SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)
    clf.fit(X_train_dtm, y_train)
    # Evaluate
    y_pred_test = clf.predict(X_test_dtm)

    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))

    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()
    
SGDClassifier(X_train_dtm, X_test_dtm, y_train, y_test)









    



accuracy:  0.8448275862068966
                    precision    recall  f1-score   support

         political       0.83      0.83      0.83       115
              misc       0.90      0.92      0.91        71
violence/terrorism       0.82      0.81      0.81       104

       avg / total       0.84      0.84      0.84       290

Confusion matrix, without normalization
[[96  2 17]
 [ 4 65  2]
 [15  5 84]]



In [33]:

    
# https://www.kdnuggets.com/2016/10/beginners-guide-neural-networks-python-scikit-learn.html/2
def MLPClassifier (X_train_dtm, X_test_dtm, y_train, y_test):
    from sklearn.neural_network import MLPClassifier
    
    clf = MLPClassifier(hidden_layer_sizes=(30,20,20))
    clf.fit(X_train_dtm, y_train)
    # Evaluate
    y_pred_test = clf.predict(X_test_dtm)

    accuracy = accuracy_score(y_pred=y_pred_test, y_true=y_test)
    CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
    print('accuracy: ', accuracy)
    print(metrics.classification_report(y_test, y_pred_test, target_names=list(label_cat.keys())))

    # Plot normalized confusion matrix
    plt.figure(figsize=(12,8))
    plot_confusion_matrix(CM, classes=label_cat.keys(), title='Confusion matrix')
    plt.show()
    
MLPClassifier(X_train_dtm, X_test_dtm, y_train, y_test)









    



accuracy:  0.8068965517241379
                    precision    recall  f1-score   support

         political       0.80      0.81      0.81       115
              misc       0.90      0.85      0.87        71
violence/terrorism       0.76      0.78      0.77       104

       avg / total       0.81      0.81      0.81       290

Confusion matrix, without normalization
[[93  3 19]
 [ 4 60  7]
 [19  4 81]]



In [22]:

    
# def predict_topic(model, doc):
#     prediction = model.predict(vect.transform(doc))[0]
#     print(doc)
#     print(prediction)
    
# predict_topic(class_tree, ['hagel iraqi baghdad iraq forces abadi troop terry coalition kuwait secretary_chuck commander us army islamic_state defense train prime_minister advise james_terry obama security is_militant military lieutenant_general'])



In [20]:

    
# store the predicted probabilities for class 1
# y_pred_prob = logistic_regression.predict_proba(X_train_dtm)[:, 0]
# # histogram of predicted probabilities

# # 8 bins
# plt.hist(y_pred_prob, bins=8)

# # x-axis limit from 0 to 1
# plt.xlim(0,1)
# plt.title('Histogram of predicted probabilities')
# plt.xlabel('Predicted probability of topics')
# plt.ylabel('Frequency')



In [ ]:



In [ ]:

	0	label	1	2	3	4	5	6	7	8	...	28	29	30	label_w2v_no	label_w2v_scale	label_w2v_normalize	label_wmd_distance	label_wmd_normalize	label_wmd_scale	label_tfidf_normalize
0	2012_01_01	violence/terrorism	syrian	assad	say	syria	killed	damascus	people	regime	...	civilian	least	journalist	20	3	20	34	6	60	15
1	2012_01_02	misc	use	osc	copyrighted_material	dissemination	usage	reproduction	original	authorize	...	al	agency	location	36	6	25	49	2	60	46
2	2012_01_03	misc	will	year	can	people	one	country	party	make	...	change	political	just	15	15	19	10	3	60	8
3	2012_01_04	misc	quot	apos	say	the	we	it	reuters	terrorists	...	but	don	protest	38	25	10	20	23	60	6
4	2012_01_05	violence/terrorism	baghdad	iraq	sunni	killed	bomb	iraqi	attacks	wound	...	sadr	basra	near	26	5	11	18	14	60	29