In [2]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
#from tabulate import tabulate
import pandas as pd
In [6]:
# Notebook-wide matplotlib defaults (figure size and font sizes).
# rcParams is imported from matplotlib itself rather than through pylab:
# the pylab interface is discouraged by the matplotlib documentation, and
# both names refer to the same settings object, so later cells that use
# `rcParams` keep working.
from matplotlib import rcParams

rcParams['figure.figsize'] = 12, 6   # (width, height) of every new figure
rcParams['axes.labelsize'] = 12      # font size of x/y axis labels
rcParams['legend.fontsize'] = 12     # font size of legend entries
In [3]:
# Article statistics for simplewiki (output of the flink job).
# One '|'-separated record per line, fields in this order:
#   Name | Words | Headlines | OutLinks | AvgLinkDistance | InLinks
stats_cols = ('name', 'words', 'headlines', 'outlinks', 'avg_link_distance', 'inlinks')

# Field types, positionally matched to stats_cols.
# NOTE(review): 'unicode' gives the name field no explicit length -- verify
# that article names are not truncated after loading.
simplewiki_dtype = {
    'names': stats_cols,
    'formats': ('unicode', 'int', 'int', 'int', 'float', 'int'),
}

with open('simplewiki_stats', 'rb') as simplewiki_file:
    stats = np.loadtxt(simplewiki_file, delimiter='|', dtype=simplewiki_dtype)
In [4]:
# Article statistics for dewiki: '|'-separated records whose fields are named
# by stats_cols and typed positionally by the 'formats' tuple below.
# NOTE(review): 'unicode' gives the name field no explicit length -- verify
# that article names are not truncated after loading.
dewiki_dtype = {
    'names': stats_cols,
    'formats': ('unicode', 'int', 'int', 'int', 'float', 'int'),
}

with open('dewiki_stats', 'rb') as dewiki_file:
    stats_de = np.loadtxt(dewiki_file, delimiter='|', dtype=dewiki_dtype)
In [10]:
# Print mean / min / max / count for every simplewiki column except the
# article name, which is not numeric.
summary_template = 'Mean: %f\nMin: %f\nMax: %f\nCount: %i'
for col in stats_cols:
    if col != 'name':
        values = stats[col]
        print('\n### %s' % col)
        print(summary_template % (np.mean(values), np.min(values), np.max(values), len(values)))
In [47]:
# simplewiki: distribution of article length (words) and of incoming links.
# 50 bins each; the article-count axis is logarithmic.
for column, axis_label in (('words', '# words'), ('inlinks', '# inlinks')):
    plt.hist(stats[column], bins=50, log=True)
    plt.xlabel(axis_label)
    plt.ylabel('# articles')
    plt.show()
In [24]:
# dewiki: distribution of article length (words) and of incoming links.
# 50 bins each; the article-count axis is logarithmic.
for column, axis_label in (('words', '# words'), ('inlinks', '# inlinks')):
    plt.hist(stats_de[column], bins=50, log=True)
    plt.xlabel(axis_label)
    plt.ylabel('# articles')
    plt.show()
In [33]:
# simple + dewiki stats in single plot

# The style sheet formerly called 'seaborn-deep' was renamed to
# 'seaborn-v0_8-deep' in matplotlib 3.6 and the old name was removed in later
# releases. Apply whichever of the two this installation provides, so the cell
# runs on both old and current matplotlib. If neither exists the current
# style is left unchanged.
for style_name in ('seaborn-v0_8-deep', 'seaborn-deep'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# One figure per column, with both wikis drawn over the same bin edges so the
# bars are directly comparable.
for column, axis_label in (('words', '# words'), ('inlinks', '# inlinks')):
    dewiki_values = stats_de[column]
    simplewiki_values = stats[column]
    # Bin edges computed over the union of both data sets (50 bins).
    shared_bins = np.histogram(np.hstack((dewiki_values, simplewiki_values)), bins=50)[1]
    plt.hist(dewiki_values, shared_bins, log=True, label='dewiki')
    plt.hist(simplewiki_values, shared_bins, log=True, label='simplewiki')
    plt.xlabel(axis_label)
    plt.ylabel('# articles')
    plt.legend(loc='upper right')
    plt.show()
In [52]:
# Compare several IDF weighting schemes computed from simplewiki inlink counts.
inlinks = stats['inlinks'][stats['inlinks'] > 0]  # all articles with at least one inlink
N = len(inlinks)  # article count (an earlier version hard-coded 87958, CS articles)

# Float copy of N so the ratios below always use true division, even on a
# kernel where `/` between integers floors (Python 2 without
# `from __future__ import division`). Results are unchanged on Python 3.
n_docs = float(N)

# Different idf schemas
classic = np.log(n_docs / (1 + inlinks))
smooth = np.log(1 + n_docs / (1 + inlinks))
# The variable keeps its original spelling so other cells that reference it
# keep working; only the legend text is corrected.
# NOTE(review): this is NaN / -inf for any article with inlinks >= N --
# confirm that cannot occur in the data.
probalistic = np.log((n_docs - inlinks) / inlinks)
bm25 = np.log((n_docs - inlinks + 0.5) / (inlinks + 0.5))

# All four schemes side by side; one column per scheme.
data = np.vstack([classic, smooth, probalistic, bm25]).T
bins = np.linspace(4, 12, 40)  # values outside [4, 12] are not drawn
plt.hist(data, bins, label=['classic', 'smooth', 'probabilistic', 'bm25'])
plt.xlabel('# idf-inlinks')
plt.ylabel('# articles')
plt.legend(loc='upper left')
plt.show()

# Smooth idf on its own, over its full value range.
plt.hist(smooth, 50, label=['smooth'])
plt.xlabel('# idf-inlinks')
plt.ylabel('# articles')
plt.legend(loc='upper left')
plt.show()

# Probabilistic idf on its own, over its full value range.
plt.hist(probalistic, 50, label=['probabilistic'])
plt.xlabel('# idf-inlinks')
plt.ylabel('# articles')
plt.legend(loc='upper left')
plt.show()

# TODO Compare with dewiki
# Conclusion: Not much difference?
In [5]:
# Link distance measures (0.5 sample), one file per distance measure.
# Each file holds '|'-separated rows in the format: distance | count
ld_abs, ld_rel, ld_str = [
    np.genfromtxt(ld_path, delimiter='|')
    for ld_path in (
        'simplewiki_link_distance_0_5.abs',
        'simplewiki_link_distance_0_5.rel',
        'simplewiki_link_distance_0_5.str',
    )
]
In [54]:
def plot_link_distance(ld, xlabel):
    """Plot one link-distance distribution and print its summary statistics.

    Parameters
    ----------
    ld : 2-D array
        Rows of (distance, count): column 0 is the distance value and
        column 1 is how often that distance was observed.
    xlabel : str
        Label for the x axis of the histogram.

    The histogram weights every distance by its count. The printed mean is
    weighted by the same counts, so it describes the plotted distribution;
    an unweighted mean of column 0 would only average the distinct distance
    values. Min and max are taken over the listed distance values.
    """
    distances = ld[:, 0]
    counts = ld[:, 1]

    plt.hist(distances, weights=counts)
    plt.xlabel(xlabel)
    # NOTE(review): the weights are the per-distance counts from the input
    # files -- confirm that these counts are articles rather than links.
    plt.ylabel('# articles')
    plt.show()

    weighted_mean = np.average(distances, weights=counts)
    print('Mean: %f;\tMin: %f;\tMax: %f' % (weighted_mean, np.min(distances), np.max(distances)))


# Same plot and summary for each of the three distance measures.
for ld, xlabel in (
    (ld_abs, 'link distance (words)'),
    (ld_rel, 'link distance (relative words)'),
    (ld_str, 'link distance (structure)'),
):
    plot_link_distance(ld, xlabel)
In [ ]: