In [1]:
from __future__ import print_function
%matplotlib inline
In [2]:
import json
import numpy as np
import pandas as pd
In [3]:
# Read the raw likes dump from disk; `data` is the input handed to the
# df_utils dataset builders further down.
with open('likers.json') as likers_file:
    data = json.loads(likers_file.read())
print('Json length: %s' % len(data))
In [4]:
# Identifiers (as strings) of the pages labelled as hoax sources.
# Kept as a set because later cells only use it for membership tests.
hoax_pages = set([
    '188189217954979',
    '253520844711659',
    '199277020680',
    '233426770069342',
    '68091825232',
    '194120424046954',
    '126393880733870',
    '109383485816534',
    '203737476337348',
    '159590407439801',
    '124489267724876',
    '123944574364433',
    '130541730433071',
    '278440415537619',
    '101748583911',
    '352513104826417',
    '195235103879949',
])
In [5]:
%%time
import df_utils
# Build the two dataset variants that every experiment below is run on.
# Both helpers live in the project-local df_utils module; each returns a
# (matrix, page, hoax) triple as unpacked here.

# "Full" variant: both minimum-like thresholds are set to 0.
print('Full dataset:')
like_matrix, page, hoax = df_utils.cut_dataset(
    data, hoax_pages,
    min_post_like=0,
    min_user_like=0,
    print_results=True)

# "Intersection" variant.
# NOTE(review): the exact filtering rule is defined in df_utils.filter_intersection
# and is not visible from this notebook.
print('\nIntersection dataset:')
like_matrix_i, page_i, hoax_i = df_utils.filter_intersection(
    data, hoax_pages, print_results=True)
print()
In [6]:
%%time
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
def test_1(like_matrix, hoax):
    """Score a logistic-regression classifier with 5-fold cross-validation.

    Args:
        like_matrix: feature matrix passed to scikit-learn as ``X``.
        hoax: target labels aligned with the rows of ``like_matrix``.

    Returns:
        The per-fold scores produced by ``cross_val_score`` (``cv=5``).
    """
    classifier = linear_model.LogisticRegression(C=1)
    fold_scores = cross_val_score(classifier, like_matrix, hoax, cv=5)
    return fold_scores
# Experiment 1: cross-validated score on each dataset variant.
score_test_1 = test_1(like_matrix=like_matrix, hoax=hoax)
print('Full dataset - mean: %.3f, std: %.3f'
      % (np.mean(score_test_1), np.std(score_test_1)))

score_test_1_i = test_1(like_matrix=like_matrix_i, hoax=hoax_i)
print('Intersection dataset - mean: %.3f, std: %.3f\n'
      % (np.mean(score_test_1_i), np.std(score_test_1_i)))
In [7]:
%%time
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
def test_2(like_matrix, hoax, test_size, iterations=50):
    """Measure hold-out accuracy over repeated random train/test splits.

    Args:
        like_matrix: feature matrix passed to scikit-learn as ``X``.
        hoax: target labels aligned with the rows of ``like_matrix``.
        test_size: forwarded unchanged to ``train_test_split``.
        iterations: number of splits; split ``k`` uses ``random_state=k``
            for ``k`` in ``0 .. iterations - 1``.

    Returns:
        A list with one accuracy score per split, in seed order.
    """
    def _accuracy_for_seed(seed):
        # One split -> fit on the training part -> accuracy on the held-out part.
        X_train, X_test, Y_train, Y_test = train_test_split(
            like_matrix, hoax, test_size=test_size, random_state=seed)
        model = linear_model.LogisticRegression(C=1)
        model.fit(X_train, Y_train)
        return metrics.accuracy_score(Y_test, model.predict(X_test))

    return [_accuracy_for_seed(seed) for seed in range(iterations)]
# Experiment 2: a single configuration with test_size=0.9, repeated over the
# default number of seeds, on each dataset variant.
score_test_2 = test_2(like_matrix, hoax, test_size=0.9)
print('Full dataset - mean: %.2f, std: %.2f'
      % (np.mean(score_test_2), np.std(score_test_2)))

score_test_2_i = test_2(like_matrix_i, hoax_i, test_size=0.9)
print('Intersection dataset - mean: %.2f, std: %.2f'
      % (np.mean(score_test_2_i), np.std(score_test_2_i)))
print()
In [8]:
%%time
# Sweep over training-set sizes: for a denominator d the split uses
# test_size = 1 - 1/d, i.e. 1/d of the items are left for training.
res_all_x = [10, 20, 50, 100, 200, 400, 1000]

# NOTE(review): these two names were bound to flat lists of accuracies by the
# previous cell; from here on they are lists of lists (one inner list per
# denominator in res_all_x).
score_test_2 = [
    test_2(like_matrix, hoax, 1 - (1 / float(denominator)))
    for denominator in res_all_x
]
score_test_2_i = [
    test_2(like_matrix_i, hoax_i, 1 - (1 / float(denominator)))
    for denominator in res_all_x
]
print('Done\n')
In [9]:
# Graph points: collapse each sweep entry (a list of per-seed accuracies) into
# a mean (y value) and a standard deviation (error bar).

# Complete dataset.
res_all_y = [np.mean(scores) for scores in score_test_2]
print('res_all_y = %s' % str(res_all_y))
res_all_err = [np.std(scores) for scores in score_test_2]
print('res_all_err = %s' % str(res_all_err))

# Intersection dataset.  Its scores were computed over the very same
# denominators as the complete dataset (res_all_x), so derive the x values
# from that list instead of repeating the literal: a duplicated literal would
# silently go out of sync with the scores if the sweep were ever changed.
res_int_x = list(res_all_x)
res_int_y = [np.mean(scores) for scores in score_test_2_i]
print('res_int_y = %s' % str(res_int_y))
res_int_err = [np.std(scores) for scores in score_test_2_i]
print('res_int_err = %s' % str(res_int_err))
In [16]:
import matplotlib.pyplot as plt
# x positions: a denominator d in the sweep means 1/d of the items were used
# for training (test_size was 1 - 1/d), so plot against 1/d.
res_a_x = [1.0 / x for x in res_all_x]
res_i_x = [1.0 / x for x in res_int_x]

# Create the figure exactly once, at the intended size.  Calling plt.figure()
# and then plt.subplots() opens two figures: the first stays empty and its
# figsize never reaches the axes that are actually drawn on.
fig, ax = plt.subplots(1, figsize=(10, 6))

ax.errorbar(res_a_x, res_all_y, yerr=res_all_err, marker='o', label='complete dataset')
# Each series is plotted against its own x values.
ax.errorbar(res_i_x, res_int_y, yerr=res_int_err, marker='+', label='intersection dataset')

# Log scale is set before the limits so the limits are applied to the log axis.
ax.set_xscale('log')
ax.set_xlim(1 / 1200.0, 1 / 9.0)
ax.set_ylim(0.45, 1.05)
ax.set_xlabel('fraction of items in learning set')
ax.set_ylabel('accuracy')
ax.legend(loc='lower right')
ax.grid()
fig.suptitle('Accuracy vs. number of items in learning set')

# Render the figure; a bare plt.plot() draws nothing and only echoes [].
plt.show()
Out[16]:
In [11]:
%%time
from sklearn import linear_model
from sklearn import metrics
import df_utils
def test_3(like_matrix, hoax, page):
    """Evaluate the classifier once per distinct page, holding that page out.

    For every distinct value in ``page`` the data is split with
    ``df_utils.split_pages`` using a one-element page list.  The first
    (matrix, labels) pair it returns is used as the test split and the second
    pair as the training split.

    Args:
        like_matrix: feature matrix forwarded to ``df_utils.split_pages``.
        hoax: labels forwarded to ``df_utils.split_pages``.
        page: per-item page identifiers; its distinct values define the runs.

    Returns:
        A list with one accuracy score per distinct page (in set-iteration
        order, which is not fixed).
    """
    scores = []
    for held_out in set(page):
        matrix_test, hoax_test, matrix_train, hoax_train = df_utils.split_pages(
            like_matrix, page, hoax, [str(held_out)])
        model = linear_model.LogisticRegression(C=1)
        model.fit(matrix_train, hoax_train)
        predicted = model.predict(matrix_test)
        scores.append(metrics.accuracy_score(hoax_test, predicted))
    return scores
# Experiment 3: one-page-held-out accuracy on each dataset variant.
score_test_3 = test_3(like_matrix, hoax, page=page)
print('Full dataset - mean: %.3f, std: %.3f'
      % (np.mean(score_test_3), np.std(score_test_3)))

score_test_3_i = test_3(like_matrix_i, hoax_i, page=page_i)
print('Intersection dataset - mean: %.3f, std: %.3f\n'
      % (np.mean(score_test_3_i), np.std(score_test_3_i)))
In [12]:
from sklearn import linear_model
from sklearn import metrics
import df_utils
import random
# Experiment 4 (full dataset): repeatedly pick half of the hoax pages and half
# of the non-hoax pages, split the data by page, and score the classifier.

# Pages that actually occur in the dataset, partitioned by label.
# * The result is stored under a new name so the global `hoax_pages`
#   collection is not overwritten; re-running this cell (or running later
#   cells) then always filters against the original page IDs.
# * Both lists are sorted: iterating a set of strings can yield a different
#   order in every interpreter session, which would make random.seed(seed)
#   below select different pages from run to run.
pages_list = set(page)
hoax_pages_full = sorted(p for p in pages_list if p in hoax_pages)  # drop pages with no posts
not_hoax_pages = sorted(p for p in pages_list if p not in hoax_pages)

accuracy_list = []
for seed in range(50):
    random.seed(seed)
    # Half of each class (rounded down) goes into the page list given to split_pages.
    pages_tosplit = random.sample(hoax_pages_full, len(hoax_pages_full) // 2)
    pages_tosplit.extend(random.sample(not_hoax_pages, len(not_hoax_pages) // 2))
    # NOTE(review): test_3 unpacks split_pages as (test, test labels, train,
    # train labels) for the listed pages, while here the listed pages are
    # named "train".  With a half/half split either reading gives a valid
    # experiment, but confirm the return order in df_utils.
    matrix_train, hoax_train, matrix_test, hoax_test = df_utils.split_pages(
        like_matrix, page, hoax, pages_tosplit)
    logreg = linear_model.LogisticRegression(C=1)
    logreg.fit(matrix_train, hoax_train)
    Y_pred = logreg.predict(matrix_test)
    acc = metrics.accuracy_score(hoax_test, Y_pred)
    accuracy_list.append(acc)
print('Full dataset - mean: %.3f, std: %.3f\n' % (np.mean(accuracy_list), np.std(accuracy_list)))
In [13]:
from sklearn import linear_model
from sklearn import metrics
import df_utils
import random
# Experiment 4 (intersection dataset): repeatedly pick half of the hoax pages
# and half of the non-hoax pages, split the data by page, and score the
# classifier.

# Pages that actually occur in the intersection dataset, partitioned by label.
# * The result is stored under a new name so the global `hoax_pages`
#   collection is not overwritten by this cell; the cell can be re-run without
#   filtering an already-filtered list.
# * Both lists are sorted: iterating a set of strings can yield a different
#   order in every interpreter session, which would make random.seed(seed)
#   below select different pages from run to run.
pages_list = set(page_i)
hoax_pages_i = sorted(p for p in pages_list if p in hoax_pages)  # drop pages with no posts
not_hoax_pages = sorted(p for p in pages_list if p not in hoax_pages)

accuracy_list_i = []
for seed in range(50):
    random.seed(seed)
    # Half of each class (rounded down) goes into the page list given to split_pages.
    pages_tosplit = random.sample(hoax_pages_i, len(hoax_pages_i) // 2)
    pages_tosplit.extend(random.sample(not_hoax_pages, len(not_hoax_pages) // 2))
    # NOTE(review): test_3 unpacks split_pages as (test, test labels, train,
    # train labels) for the listed pages, while here the listed pages are
    # named "train".  With a half/half split either reading gives a valid
    # experiment, but confirm the return order in df_utils.
    matrix_train, hoax_train, matrix_test, hoax_test = df_utils.split_pages(
        like_matrix_i, page_i, hoax_i, pages_tosplit)
    logreg = linear_model.LogisticRegression(C=1)
    logreg.fit(matrix_train, hoax_train)
    Y_pred = logreg.predict(matrix_test)
    acc = metrics.accuracy_score(hoax_test, Y_pred)
    accuracy_list_i.append(acc)
print('Intersection dataset - mean: %.3f, std: %.3f\n' % (np.mean(accuracy_list_i), np.std(accuracy_list_i)))