We collect data from the following Wikipedia event logging schemas:
In [36]:
import scipy.stats as stats
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import MySQLdb
from tabulate import tabulate
from _mysql_exceptions import OperationalError
from IPython.display import display
from eventlogging import ELEvaluation
In [27]:
# Create the evaluation helper, passing the database settings it should use.
# NOTE(review): the credentials are hard-coded; they look like local
# development defaults -- confirm before running against another database.
el = ELEvaluation(
    db_host='mysql',
    db_user='mediawiki',
    db_password='mediawiki',
    db_name='mediawiki',
)
In [28]:
# Events over time: plot the event counts against their timestamps.
datetimes, counts = el.get_event_time_series()
time_axis = np.array(datetimes)
count_axis = np.array(counts)
plt.plot(time_axis, count_axis)
plt.show()
In [29]:
# Render each summary report in turn, in the same order as before.
for report in (
    el.get_most_recommended_items,
    el.get_most_clicked_items,
    el.get_stats_per_source,
    el.get_metric_stats,
):
    display(report())
In [30]:
# Render the per-source "long" and per-session reports.
for report in (
    el.get_long_stats_per_source,
    el.get_session_stats_per_source,
):
    display(report())
In [ ]:
In [64]:
# Significance test: chi-square test on a 2x2 contingency table whose rows
# are the recommendation sources (MLT, Citolytics) and whose columns are
# the click / no-click counts.
df = el.get_stats_per_source().set_index('readMoreSource')

# NOTE(review): the earlier comments described "no clicks" as views - clicks,
# yet views is scaled by 100 before subtracting. Presumably views are sampled
# or stored in hundreds -- confirm against get_stats_per_source().
df['noClicks'] = df.apply(
    lambda row: row['views'] * 100 - row['clicks'],
    axis=1,
)

# Observation matrix of shape (2, 2): one [clicks, noClicks] row per source.
obs = np.array([
    [df.loc[source, 'clicks'], df.loc[source, 'noClicks']]
    for source in ('MLT', 'Citolytics')
])

chi2, p, dof, expected = stats.chi2_contingency(obs)

# Report the outcome at the 5% significance level.
message = (
    'Significant at p < 0.05 (p = %f)'
    if p < 0.05
    else 'Not significant at p < 0.05 (p = %f)'
)
print(message % p)
In [ ]:
In [ ]:
In [ ]: