In [1]:
# Analysis of Facebook likes using "harmonic" crowdsourcing algorithm.
# This is an analysis performed entirely on the users who have liked both good and bad items;
# this intersection forms the hardest "core" of the problem.
In [2]:
%matplotlib inline
import json_plus
import json
import numpy as np
import matplotlib.pyplot as plt
import random
import voting
import matplotlib
In [55]:
# Global plot styling: small default figures, visible error-bar caps,
# large fonts, and LaTeX-rendered serif text.
matplotlib.rcParams.update({
    'figure.figsize': (4.0, 2.0),
    'errorbar.capsize': 5,
    'legend.fontsize': 'x-large',
    'axes.labelsize': 'x-large',
    'axes.titlesize': 'x-large',
    'xtick.labelsize': 'x-large',
    'ytick.labelsize': 'x-large',
})
plt.rc('text', usetex=True)
plt.rc('font', family='serif')
In [44]:
# Load the likers dataset: a dict mapping post id -> list of liker user ids.
with open('likers.json') as f:
    data = json.load(f)
print("Loaded.")
In [45]:
# Counts: basic statistics on posts, likes, and unique users.
post_list = list(data.keys())
user_list = []
for likers in data.values():
    user_list.extend(likers)
user_set = set(user_list)
nlikes = len(user_list)   # total number of likes (with repetition)
nuser = len(user_set)     # number of distinct users
nposts = len(data)        # number of posts
print('N. of posts: %d' % nposts)
print('N. of likes: %d' % nlikes)
print('N. of unique users: %d' % nuser)
# BUG FIX: under Python 2, nlikes/nuser is integer (floor) division, so the
# "%.2f" averages were silently truncated to whole numbers.  Force float
# division explicitly.
print('Avg. likes per user: %.2f' % (float(nlikes) / nuser))
print('Avg. likes per post: %.2f' % (float(nlikes) / nposts))
In [46]:
# Hoax pages.
# A post id has the form "<page_id>_<post_id>", so the page of a post is the
# prefix before the underscore.
pages = set([i.split('_')[0] for i in post_list])
# Manually curated list of page ids known to publish hoaxes.
# WARNING: commented out pages DO NOT EXIST (they never occur in the data).
hoax_pages = {'188189217954979',
              '253520844711659',
              '199277020680',
              '233426770069342',
              # '68091825232',
              # '194120424046954',
              '126393880733870',
              '109383485816534',
              '203737476337348',
              '159590407439801',
              '124489267724876',
              '123944574364433',
              # '130541730433071',
              '278440415537619',
              '101748583911',
              '352513104826417',
              '195235103879949'
              }
nonhoax_pages = pages - hoax_pages
# Sanity check: every listed hoax page should actually occur in the dataset.
error_pages = hoax_pages - pages
if len(error_pages) > 0:
    print "Error: some hoax pages never appear:", error_pages
print("N. of pages: %d hoax vs. %d non-hoax" % (len(hoax_pages), len(nonhoax_pages)))
print "Hoax pages:", hoax_pages
print "Nonhoax pages:", nonhoax_pages
In [47]:
# Hoax post categorization: a post is a hoax iff it was published on a hoax page.
hoax_posts = set(p for p in data if p.split('_')[0] in hoax_pages)
print("N. of posts: %d hoax vs. %d non-hoax" % (len(hoax_posts), nposts-len(hoax_posts)))
In [48]:
# Computes the number of posts per each page.
posts_per_page = {}
for p, ul in data.items():
page = p.split('_')[0]
posts_per_page[page] = posts_per_page.get(page, 0.0) + 1.0
print posts_per_page
In [49]:
def graph_correct(d):
"""Makes a graph of the correctness as a function of number of likes."""
print "Accuracy overall:", d['ratio_correct']
# Now for the graph.
ys = d['ratio_correct_likes']
xs = range(1, 1 + len(ys))
plt.plot(xs, ys, 'bo')
plt.xlabel('Number of likes')
plt.ylabel('Fraction correct')
plt.ylim(ymax=1.01)
plt.show()
In [50]:
# Now let's compute the set of users who have voted for BOTH good and bad items.
hoax_users = set()
nonhoax_users = set()
for p, ul in data.items():
page = p.split('_')[0]
# The truth value depends on the page.
tv = not (page in hoax_pages)
if tv:
nonhoax_users.update(ul)
else:
hoax_users.update(ul)
users_liked_both = hoax_users & nonhoax_users
print "Number of user who liked both type of pages:", len(users_liked_both)
In [51]:
# I want to know, how many of the users above have voted for ONLY two pages.
user_cardinality = {}
for p, ul in data.items():
for u in ul:
if u in users_liked_both:
user_cardinality[u] = 1 + user_cardinality.get(u, 0)
num_with_card_2 = 0
for u, c in user_cardinality.items():
if c == 2:
num_with_card_2 += 1
print "Number of users in intersection dataset with cardinality 2:", num_with_card_2
In [52]:
# Let's compute the set of users who have voted for more than one page.
pages_user = {}
for p, ul in data.items():
page = p.split('_')[0]
for u in ul:
pages_user[u] = pages_user.get(u, set()) | {p}
multi_page_users = set([u for u, ps in pages_user.items() if len(ps) > 1])
print "Numner of users who liked more than one page:", len(multi_page_users)
In [53]:
# Run this code AFTER computing the set of hoax users above.
def add_data(g, use_all=False, use_atleast_two=False):
    """Adds the data to a graph g.
    If use_all is True, it uses all data; otherwise it uses data only from the intersection dataset."""
    for post_id, likers in data.items():
        # The ground truth of an item: True (non-hoax) unless its page is a hoax page.
        truth = post_id.split('_')[0] not in hoax_pages
        for user in likers:
            keep = (use_all
                    or user in users_liked_both
                    or (use_atleast_two and user in multi_page_users))
            if keep:
                # The polarity is always 1 (pos).
                g.add_edge(user, post_id, item_true_value=truth)
    print("The graph has been built!")
In [39]:
# Let's get some stats.
# Builds the voting graph on the intersection dataset only
# (add_data defaults to use_all=False) and prints its statistics.
# reload() is the Python 2 builtin, picking up edits to voting.py.
reload(voting)
g = voting.VotingGraph()
add_data(g)
g.print_stats()
In [40]:
# Let's get some stats.
# Same as the previous cell, but on the COMPLETE dataset (use_all=True).
reload(voting)
g = voting.VotingGraph()
add_data(g, use_all=True)
g.print_stats()
In [46]:
reload(voting)
TESTS = [10, 20, 40, 100, 200, 400, 1000]
results_all_x = []
results_all_y = []
results_all_err = []
g = voting.VotingGraph()
add_data(g, use_all=True)
for f in TESTS[:3]:
d = g.evaluate_inference(fraction=f)
print f, d['ratio_correct']
results_all_x.append(f)
results_all_y.append(d['ratio_correct'])
results_all_err.append(d['stdev'])
In [47]:
# Dump the results accumulated so far, so they can be copy-pasted into the
# plotting cell below without re-running the (slow) evaluation.
print results_all_x
print results_all_y
print results_all_err
In [48]:
reload(voting)
TESTS = [10, 20, 40, 100, 200, 400, 1000]
g = voting.VotingGraph()
add_data(g, use_all=True)
for f in TESTS[3:5]:
d = g.evaluate_inference(fraction=f)
print f, d['ratio_correct']
results_all_x.append(f)
results_all_y.append(d['ratio_correct'])
results_all_err.append(d['stdev'])
In [49]:
# Dump the accumulated results after the second batch of runs.
print results_all_x
print results_all_y
print results_all_err
In [50]:
reload(voting)
TESTS = [10, 20, 40, 100, 200, 400, 1000]
g = voting.VotingGraph()
add_data(g, use_all=True)
for f in TESTS[5:]:
d = g.evaluate_inference(fraction=f)
print f, d['ratio_correct']
results_all_x.append(f)
results_all_y.append(d['ratio_correct'])
results_all_err.append(d['stdev'])
In [51]:
# Dump the complete set of results for the complete dataset.
print results_all_x
print results_all_y
print results_all_err
In [54]:
# Now for the intersection dataset.
reload(voting)
results_intersection_x = []
results_intersection_y = []
results_intersection_err = []
TESTS = [10, 20, 40, 100, 200, 400, 1000]
g = voting.VotingGraph()
add_data(g, use_all=False)
for f in TESTS[:4]:
d = g.evaluate_inference(fraction=f)
print f, d['ratio_correct']
results_intersection_x.append(f)
results_intersection_y.append(d['ratio_correct'])
results_intersection_err.append(d['stdev'])
In [55]:
# Now for the intersection dataset.
reload(voting)
TESTS = [10, 20, 40, 100, 200, 400, 1000]
g = voting.VotingGraph()
add_data(g, use_all=False)
for f in TESTS[4:]:
d = g.evaluate_inference(fraction=f)
print f, d['ratio_correct']
results_intersection_x.append(f)
results_intersection_y.append(d['ratio_correct'])
results_intersection_err.append(d['stdev'])
In [56]:
# Dump the intersection-dataset results for copy-pasting into the plot cell.
print results_intersection_x
print results_intersection_y
print results_intersection_err
In [4]:
# Let's generate the graphs, without having to re-run everything.
# The numeric results are hardcoded (copy-pasted from the dump cells above)
# so the figure can be regenerated without re-running the slow inference.
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 6.0)
# res_all_x = results_all_x
# res_all_y = results_all_y
# res_all_err = results_all_err
# res_int_x = results_intersection_x
# res_int_y = results_intersection_y
# res_int_err = results_intersection_err
res_all_x = [10, 20, 40, 100, 200, 400, 1000]
res_all_y = [0.9947928315412187, 0.9945765704584041, 0.9944259908687885, 0.9942717497556207, 0.9941477014847955, 0.9369124304747122, 0.7925489183080401]
res_all_err = [0.00018177945073084267, 0.0001663545936217604, 0.0001440948332039384, 9.8616786904011806e-05, 8.3849643572967626e-05, 0.081584186225229241, 0.15831802229801958]
res_int_x = [10, 20, 40, 100, 200, 400, 1000]
res_int_y = [0.8981516687790452, 0.8505363217930758, 0.8258925611777322, 0.744059529524724, 0.6773977837218189, 0.5783037926434152, 0.519162702188392]
res_int_err = [0.015822423827105362, 0.038183501343605319, 0.06444397351136516, 0.10609695794657316, 0.13381654475491597, 0.12521980663556254, 0.1100818925571475]
# res_gt2_x = [10, 20, 50, 100, 200, 400, 1000]
# res_gt2_y = [0.9973278594065111, 0.9970522006141249, 0.9968679719539622, 0.9968190151911995, 0.9925245944361196, 0.9633779568494931, 0.8052144572058919]
# res_gt2_err = [0.00014918118105160575, 0.00017208488861027103, 9.1833862792139789e-05, 8.3946414408233029e-05, 0.020896853639959776, 0.065442264421548352, 0.16442557191302057]
# We need the fractions (x axis: fraction of items in the learning set).
res_a_x = [1.0 / x for x in res_all_x]
res_i_x = [1.0 / x for x in res_int_x]
# res_a_x = [x * nposts for x in res_a_x]
# Builds the plots.
fig, ax = plt.subplots(1)
ax.errorbar(res_a_x, res_all_y, yerr=res_all_err, marker='o', label='complete dataset')
# ax.errorbar(res_a_x, res_gt2_y, yerr=res_gt2_err, marker='x', label='multipage dataset')
# CONSISTENCY FIX: plot the intersection series against its own fractions
# res_i_x (previously res_a_x; the values coincide today, but the plot would
# silently misplace points if res_int_x ever diverged from res_all_x).
ax.errorbar(res_i_x, res_int_y, yerr=res_int_err, marker='+', label='intersection dataset')
plt.ylim(0.45, 1.05)
plt.xscale('log')
plt.xlim(1/1200.0, 1/9.0)
ax.legend()
ax.grid()
plt.title('Accuracy vs. number of items in learning set')
plt.plot()
plt.savefig('accuracy_vs_perc.pdf')
In [5]:
# Let's define the results from the other notebook on logistic regression.
# See ../logistic_regression/logreg_tests_final.ipynb
# Hardcoded so the comparison plots below can be produced without re-running
# that notebook; the values are paired index-by-index with res_a_x when
# plotted (see the errorbar calls in the following cells).
logi_res_all_y = [0.96669677419354827, 0.95533989813242781, 0.93099144173798531, 0.90329749103942636, 0.84348699993516163, 0.74553227266847766, 0.63488149822408779]
logi_res_all_err = [0.0055568473928726322, 0.0096648261910718368, 0.019235373404428768, 0.038534806597708672, 0.059711620024988706, 0.086215312070645644, 0.11551090232666143]
logi_res_int_y = [0.91654203633291076, 0.89193315989593758, 0.84144907856450035, 0.77512049927988469, 0.70986052732136029, 0.63810367829235748, 0.56102949571836336]
logi_res_int_err = [0.0077756103084875598, 0.013865324498876704, 0.029058334540871818, 0.047490066556372029, 0.05720552897645028, 0.060875349450386827, 0.054692571945068366]
In [38]:
# Let us now plot the results, on the same dataset, of logistic and harmonic.
# Small (6x4 inch) version of the complete-dataset comparison; the next cell
# produces a larger version of the same figure.
fig, ax = plt.subplots(figsize=(6, 4))
ax.errorbar(res_a_x, logi_res_all_y, yerr=logi_res_all_err, marker='s', label='logistic regression')
ax.errorbar(res_a_x, res_all_y, yerr=res_all_err, marker='o', label='harmonic algorithm')
plt.ylim(0.45, 1.05)
plt.xscale('log')
# The x axis is the fraction of items in the training set, on a log scale.
plt.xlim(1/1200.0, 1/9.0)
ax.legend()
ax.grid()
plt.xlabel('Fraction of items in training set')
plt.ylabel('Accuracy')
plt.title('Accuracy vs. fraction of items in learning set,\ncomplete dataset')
plt.plot()
plt.savefig('accuracy_vs_fraction_all_s.pdf', bbox_inches='tight')
In [42]:
# Let us now plot the results, on the same dataset, of logistic and harmonic.
# Large (10.5x6 inch) version of the complete-dataset comparison figure.
fig, ax = plt.subplots(figsize=(10.5, 6))
ax.errorbar(res_a_x, logi_res_all_y, yerr=logi_res_all_err, marker='s', label='logistic regression')
ax.errorbar(res_a_x, res_all_y, yerr=res_all_err, marker='o', label='harmonic algorithm')
plt.ylim(0.45, 1.05)
plt.xscale('log')
plt.xlim(1/1200.0, 1/9.0)
ax.legend()
ax.grid()
plt.xlabel('Fraction of items in training set')
plt.ylabel('Accuracy')
plt.title('Accuracy vs. fraction of items in learning set, complete dataset')
plt.plot()
plt.savefig('accuracy_vs_fraction_all_b.pdf', bbox_inches='tight')
In [37]:
# Now the same, but for the intersection dataset.
# Small (6x4 inch) version; res_a_x is reused as the x axis (res_int_x has
# the same chunk counts as res_all_x in the hardcoded data above).
fig, ax = plt.subplots(figsize=(6,4))
ax.errorbar(res_a_x, logi_res_int_y, yerr=logi_res_int_err, marker='s', label='logistic regression')
ax.errorbar(res_a_x, res_int_y, yerr=res_int_err, marker='o', label='harmonic algorithm')
plt.ylim(0.45, 1.05)
plt.xscale('log')
plt.xlim(1/1200.0, 1/9.0)
ax.legend()
plt.xlabel('Fraction of items in training set')
plt.ylabel('Accuracy')
ax.grid()
plt.title('Accuracy vs. fraction of items in learning set,\n intersection dataset')
plt.plot()
plt.savefig('accuracy_vs_fraction_int_s.pdf', bbox_inches='tight')
In [43]:
# Now the same, but for the intersection dataset.
# Large (10.5x6 inch) version of the intersection-dataset comparison figure.
fig, ax = plt.subplots(figsize=(10.5, 6))
ax.errorbar(res_a_x, logi_res_int_y, yerr=logi_res_int_err, marker='s', label='logistic regression')
ax.errorbar(res_a_x, res_int_y, yerr=res_int_err, marker='o', label='harmonic algorithm')
plt.ylim(0.45, 1.05)
plt.xscale('log')
plt.xlim(1/1200.0, 1/9.0)
ax.legend()
plt.xlabel('Fraction of items in training set')
plt.ylabel('Accuracy')
ax.grid()
plt.title('Accuracy vs. fraction of items in learning set, intersection dataset')
plt.plot()
plt.savefig('accuracy_vs_fraction_int_b.pdf', bbox_inches='tight')
In [65]:
# We can also produce all four in the same graph, in case we lack space.
# Distinct markers and line styles keep the four series readable in one axes.
fig, ax = plt.subplots(figsize=(9,4.5))
ax.errorbar(res_a_x, res_all_y, yerr=res_all_err, marker='o', label='Harmonic algorithm, complete dataset')
ax.errorbar(res_a_x, logi_res_all_y, yerr=logi_res_all_err, marker='s', ls='--', label='Logistic regression, complete dataset')
ax.errorbar(res_a_x, res_int_y, yerr=res_int_err, marker='^', ls='-.', label='Harmonic algorithm, intersection dataset')
ax.errorbar(res_a_x, logi_res_int_y, yerr=logi_res_int_err, marker='v', ls=':', label='Logistic regression, intersection dataset')
plt.ylim(0.35, 1.05)
plt.xscale('log')
plt.xlim(1/1200.0, 1/9.0)
plt.xlabel('Fraction of items in training set')
plt.ylabel('Accuracy')
ax.legend()
ax.grid()
plt.plot()
plt.savefig('accuracy_vs_fraction_joint.pdf', bbox_inches='tight')
In [71]:
# Now we do the same as above, BUT resampling so that we sample each item
# with probability proportional to the number of likes.
reload(voting)
TESTS = [(10, 50), (20, 50), (50, 50), (100, 50), (200, 50), (400, 50), (1000, 50)]
results_all_edges_x = []
results_all_edges_y = []
results_all_edges_err = []
g = voting.VotingGraph()
add_data(g, use_all=True)
for num_chunks, num_eval_chunks in TESTS:
cs = []
for i in range(num_eval_chunks):
cs.append(g.evaluate_inference_selecting_prop_likes(1.0 / num_chunks))
results_all_edges_x.append(num_chunks)
results_all_edges_y.append(np.average(cs))
results_all_edges_err.append(np.std(cs))
print "Results:", results_all_edges_x, results_all_edges_y, results_all_edges_err
In [29]:
# Now for a comparison graph: item sampling vs. likes-proportional sampling
# on the intersection dataset.  Results hardcoded from the runs above.
res_int_x = [10, 20, 50, 100, 200, 400, 1000]
res_int_y = [0.897950992817913, 0.8580698419051431, 0.8208923375363725, 0.7617321171387422, 0.6787648070309514, 0.6216285496474175, 0.5144928639391056]
res_int_err = [0.016882041525877665, 0.039889350640687743, 0.063391675604248701, 0.10959444258799378, 0.14373167394339534, 0.13662163819741741, 0.10275693241854568]
res_edg_x = [10, 20, 50, 100, 200, 400, 1000]
res_edg_y = [0.88999788762146181, 0.87536521913147891, 0.82294277400581961, 0.72839750360057598, 0.67358937613451797, 0.63884981894415849, 0.59327719097916076]
res_edg_err = [0.0068357969684385966, 0.0084777131391844795, 0.037350044990327166, 0.085970631359878633, 0.09539911360266122, 0.10455997072894992, 0.10284262914317449]
# We need the fractions (x axis: fraction of items in the learning set).
res_e_x = [1.0 / x for x in res_edg_x]
res_i_x = [1.0 / x for x in res_int_x]
# Builds the plots.
fig, ax = plt.subplots(1)
# BUG FIX: the likes-sampling series must be plotted against its own
# fractions res_e_x (derived from res_edg_x, which contains 50 where
# res_all_x contains 40), not against res_a_x from an earlier cell —
# the third data point was previously drawn at the wrong x position.
ax.errorbar(res_e_x, res_edg_y, yerr=res_edg_err, marker='o', label='likes sampling')
ax.errorbar(res_i_x, res_int_y, yerr=res_int_err, marker='o', label='item sampling')
plt.ylim(0.45, 1.05)
plt.xscale('log')
plt.xlim(1/1200.0, 1/9.0)
ax.legend()
ax.grid()
fig.suptitle('Accuracy vs. fraction of items in learning set, intersection dataset')
plt.plot()
plt.savefig('accuracy_like_sampling.pdf')
In [54]:
# First, we do the analysis of leave-one-page-out.
# For each page, learn from all the other pages and test on the held-out
# page, using the COMPLETE dataset.
reload(voting)
frac_correct_all = [] # On complete graph
# NOTE(review): these weights (posts in the held-out page) are collected but
# the averages printed below are unweighted — confirm whether a weighted
# average (as in the intersection-dataset cell) was intended here.
frac_correct_w = []
# First, for all.
g = voting.VotingGraph()
add_data(g, use_all=True)
for pg in pages:
    print "page:", pg
    # Creates the function that classifies items: an item is in the learning
    # set iff it does not belong to the held-out page pg.
    def is_learning(itid):
        return itid.split('_')[0] != pg
    fc = g.evaluate_inference_given_learning(is_learning)
    print "For all, correctness:", fc
    frac_correct_all.append(fc)
    frac_correct_w.append(posts_per_page[pg])
print "Final average correctness for leave-one-page-out all:", np.average(frac_correct_all)
print "Standard deviation:", np.std(frac_correct_all)
In [51]:
# We do the analysis of leave-one-page-out, also for the intersection dataset.
reload(voting)
frac_correct_all = [] # On complete graph
frac_correct_w = []
# First, for all.
g = voting.VotingGraph()
add_data(g, use_all=False)
for pg in pages:
print "page:", pg
# Creates the function that classifies items.
def is_learning(itid):
return itid.split('_')[0] != pg
fc = g.evaluate_inference_given_learning(is_learning)
print "For all, correctness:", fc
frac_correct_int.append(fc)
frac_correct_w.append(posts_per_page[pg])
print "Final average correctness for leave-one-page-out intersection:", np.average(frac_correct_int, weights=frac_correct_w)
In [66]:
# Now, let's try to keep HALF of the pages out.
# Now, we do the analysis in which we randomly select two
# pages hoax and two non-hoax, and we learn from those alone.
reload(voting)
frac_correct_all_half = [] # On complete graph
weights_correct_all_half = []
fraction_pages = 0.5
# First, for all.
g = voting.VotingGraph()
add_data(g, use_all=True)
num_hoax_in = max(1, int(0.5 + len(hoax_pages) * fraction_pages))
num_nonhoax_in = max(1, int(0.5 + len(nonhoax_pages) * fraction_pages))
hoax_pages_l = list(hoax_pages)
nonhoax_pages_l = list(nonhoax_pages)
for _ in range(50):
# Picks pages in and out.
random.shuffle(hoax_pages_l)
random.shuffle(nonhoax_pages_l)
learn_pages = hoax_pages_l[:num_hoax_in] + nonhoax_pages_l[:num_nonhoax_in]
test_pages = hoax_pages_l[num_hoax_in:] + nonhoax_pages_l[num_nonhoax_in:]
# Computes the weight of the run.
num_learn_posts = np.sum([posts_per_page[p] for p in learn_pages])
num_test_posts = np.sum([posts_per_page[p] for p in test_pages])
w = num_learn_posts * num_test_posts
print "N learn", num_learn_posts, "N test", num_test_posts, "w", w
# Defines the function.
def is_learning(itid):
return itid.split('_')[0] in learn_pages
fc = g.evaluate_inference_given_learning(is_learning)
print "Learning from 2 of each kind, all:", fc
frac_correct_all_half.append(fc)
weights_correct_all_half.append(w)
print "Final average correctness for learning from half of each kind, all:", np.average(frac_correct_all_half, weights=weights_correct_all_half)
print "avg", np.average(frac_correct_all_half)
print "stdev", np.std(frac_correct_all_half)
In [69]:
# Now, let's try to keep HALF of the pages out.
# Now, we do the analysis in which we randomly select two
# pages hoax and two non-hoax, and we learn from those alone.
reload(voting)
frac_correct_int_half = [] # On complete graph
weights_correct_int_half = []
fraction_pages = 0.5
# First, for all.
g = voting.VotingGraph()
add_data(g, use_all=False)
num_hoax_in = max(1, int(0.5 + len(hoax_pages) * fraction_pages))
num_nonhoax_in = max(1, int(0.5 + len(nonhoax_pages) * fraction_pages))
hoax_pages_l = list(hoax_pages)
nonhoax_pages_l = list(nonhoax_pages)
for _ in range(50):
# Picks pages in and out.
random.shuffle(hoax_pages_l)
random.shuffle(nonhoax_pages_l)
learn_pages = hoax_pages_l[:num_hoax_in] + nonhoax_pages_l[:num_nonhoax_in]
test_pages = hoax_pages_l[num_hoax_in:] + nonhoax_pages_l[num_nonhoax_in:]
# Computes the weight of the run.
num_learn_posts = np.sum([posts_per_page[p] for p in learn_pages])
num_test_posts = np.sum([posts_per_page[p] for p in test_pages])
w = num_learn_posts * num_test_posts
print "N learn", num_learn_posts, "N test", num_test_posts, "w", w
# Defines the function.
def is_learning(itid):
return itid.split('_')[0] in learn_pages
fc = g.evaluate_inference_given_learning(is_learning)
print "Learning from half of each kind, all:", fc
frac_correct_int_half.append(fc)
weights_correct_int_half.append(w)
print "Final average correctness for learning from half of each kind, intersection:", np.average(frac_correct_int_half, weights=weights_correct_int_half)
print "avg", np.average(frac_correct_int_half)
print "stdev", np.std(frac_correct_int_half)
In [ ]: