In [23]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
# from sklearn.cross_validation import cross_val_score
import pandas as pd
import numpy as np
from sklearn import metrics

clf = MultinomialNB()
use_jrnl_metrics=False

raw_data=pd.read_csv(r"D:\NIU\Data Mining 2\Project\CombinedRandomDataCleaned4.csv")

In [24]:
raw_data = raw_data.sample(frac=1).reset_index(drop=True)
train = raw_data.sample(frac=0.8, random_state=1)
test = raw_data.loc[~raw_data.index.isin(train.index)]
# , "q&a", "pinterest"
data_columns=["mendeley", "citeulike", "connotea", "twitter", "reddit", "facebook", "googleplus", "blogs", "news", "video", "wikipedia", "weibo", "peer_reviews"]
if use_jrnl_metrics:
    data_columns.append("h_index")
train_data_array=train.as_matrix(columns=data_columns)
train_class_array= train['policy'].values
test_data_array=test.as_matrix(columns=data_columns)
test_class_array= test['policy'].values

In [25]:
kf=KFold(n_splits=10, random_state=None, shuffle=False)
validation_score=0
for train_index, test_index in kf.split(train_data_array):
    X_train, X_test = train_data_array[train_index], train_data_array[test_index]
    y_train, y_test = train_class_array[train_index], train_class_array[test_index]
    clf.fit(X_train, y_train)
    validation_pred=clf.predict(X_test)
    validation_score+=metrics.accuracy_score(y_test,validation_pred)

print("Validation Accuracy:   %0.3f" % (validation_score/kf.get_n_splits()))

pred = clf.predict(test_data_array)
score = metrics.accuracy_score(test_class_array, pred)
print("accuracy:   %0.3f" % score)

score = metrics.precision_score(test_class_array, pred)
print("Precision:   %0.3f" % score)

score = metrics.recall_score(test_class_array, pred)
print("Recall:   %0.3f" % score)

score = metrics.f1_score(test_class_array, pred)
print("F-measure:   %0.3f" % score)


Validation Accuracy:   0.848
accuracy:   0.848
Precision:   0.813
Recall:   0.904
F-measure:   0.856

In [ ]: