In [23]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
# from sklearn.cross_validation import cross_val_score
import pandas as pd
import numpy as np
from sklearn import metrics
# Switch read by the feature-selection cell: when True, the "h_index"
# column is appended to the list of feature columns.
use_jrnl_metrics = False

# Multinomial naive Bayes classifier with default hyper-parameters; it is
# fitted and scored in the cells that follow.
clf = MultinomialNB()

# Read the project CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path -- the
# notebook only runs where this exact file exists.
csv_path = r"D:\NIU\Data Mining 2\Project\CombinedRandomDataCleaned4.csv"
raw_data = pd.read_csv(csv_path)
In [24]:
# --- Train / test split -----------------------------------------------------
# Shuffle every row, then draw 80% of the rows for training; the remaining
# rows (those whose index was not drawn) form the hold-out test set.
# The shuffle is seeded as well as the 80% draw: previously only the draw was
# seeded, so the split still changed on every run.
raw_data = raw_data.sample(frac=1, random_state=0).reset_index(drop=True)
train = raw_data.sample(frac=0.8, random_state=1)
test = raw_data.loc[~raw_data.index.isin(train.index)]

# --- Feature columns --------------------------------------------------------
# Columns fed to the classifier. "q&a" and "pinterest" are currently left out.
data_columns = [
    "mendeley", "citeulike", "connotea", "twitter", "reddit", "facebook",
    "googleplus", "blogs", "news", "video", "wikipedia", "weibo",
    "peer_reviews",
]
if use_jrnl_metrics:
    data_columns.append("h_index")

# --- NumPy arrays for scikit-learn -------------------------------------------
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0.
# Selecting the columns and taking .values returns the same 2-D array and
# works on both old and current pandas.
train_data_array = train[data_columns].values
train_class_array = train['policy'].values
test_data_array = test[data_columns].values
test_class_array = test['policy'].values
In [25]:
# --- 10-fold cross-validation on the training split --------------------------
# Folds are contiguous and unshuffled (shuffle=False). Each fold fits the
# classifier on nine tenths of the training rows and scores accuracy on the
# remaining tenth; the mean over folds is reported once the loop finishes.
kf = KFold(n_splits=10, random_state=None, shuffle=False)
validation_score = 0
for train_index, test_index in kf.split(train_data_array):
    X_train, X_test = train_data_array[train_index], train_data_array[test_index]
    y_train, y_test = train_class_array[train_index], train_class_array[test_index]
    clf.fit(X_train, y_train)
    validation_pred = clf.predict(X_test)
    validation_score += metrics.accuracy_score(y_test, validation_pred)
print("Validation Accuracy: %0.3f" % (validation_score / kf.get_n_splits()))

# --- Hold-out evaluation ------------------------------------------------------
# Refit on the whole training split first. Without this, `clf` is whatever
# model the last CV fold left behind, i.e. one trained on only nine tenths of
# the training rows.
clf.fit(train_data_array, train_class_array)
pred = clf.predict(test_data_array)

# Precision, recall and F1 are called with scikit-learn's default arguments.
# NOTE(review): those defaults assume a binary target -- confirm that 'policy'
# is a two-class label.
for label, metric_fn in (
    ("accuracy", metrics.accuracy_score),
    ("Precision", metrics.precision_score),
    ("Recall", metrics.recall_score),
    ("F-measure", metrics.f1_score),
):
    score = metric_fn(test_class_array, pred)
    print("%s: %0.3f" % (label, score))
In [ ]: