In [1]:
import statsmodels.formula.api as sm
from lxml import etree
from datetime import datetime
import pandas as pd
import pandas.io.data as web
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
# Use the fully-qualified option key: the abbreviated 'max_columns' is resolved
# by pattern matching and becomes ambiguous (OptionError) on pandas versions
# that also define other '*max_columns' options.
pd.set_option('display.max_columns', 30)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
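Data source: the Stack Exchange data dump at https://archive.org/details/stackexchange. The code below reads one `Users-<site>.xml` file per site from the working directory.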
In [2]:
# Notebook-level switches.
# run_console: when True, the guarded cell that checks this flag launches
# an IPython Qt console via the %qtconsole magic.
run_console = True
# update_from_web / plot_on are not referenced in the visible cells;
# presumably they toggle data refresh and plotting -- TODO confirm or remove.
update_from_web = False
plot_on = True
In [61]:
def plot(s):
    """Plot the rank-ordered reputation curve for one Stack Exchange site.

    Reads ``Users-<s>.xml`` relative to the current working directory,
    collects the ``Reputation`` attribute of every ``row`` element, sorts the
    values in descending order and draws reputation against rank (rank 1 is
    the highest reputation) on a logarithmic x axis.

    Parameters
    ----------
    s : str
        Site name; used to build the input file name and the figure title,
        e.g. ``'superuser.com'``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Read dataset
    root = etree.parse('Users-%s.xml' % s)
    # ``iter`` replaces the deprecated ``getiterator``. Rows lacking a
    # Reputation attribute are skipped rather than crashing on float(None).
    reputation = [
        float(value)
        for value in (row.get('Reputation') for row in root.iter('row'))
        if value is not None
    ]

    # Frequency analysis by ranking of data: sort descending, then rank 1..N.
    a = np.sort(np.array(reputation, dtype=float))[::-1]
    rank = np.arange(1, len(a) + 1)

    # Explicit Axes interface throughout (no implicit "current axes" state).
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.set_xscale('log')
    ax.plot(rank, a, 'r-.', label='Reputation')
    ax.set_xlabel('Rank')
    ax.set_ylabel('Reputation')
    ax.legend()
    ax.set_title('Reputation distribution on %s' % s)
    plt.show()
In [62]:
# Stack Exchange sites to chart; each needs a matching Users-<site>.xml file.
sites = [
    'superuser.com',
    'askubuntu.com',
    'math.stackexchange.com',
    'serverfault.com',
]
# Draw one reputation-vs-rank figure per site.
for s in sites:
    plot(s)
In [3]:
if run_console:
%qtconsole