In [1]:
import statsmodels.formula.api as sm
from lxml import etree
from datetime import datetime
import pandas as pd
import pandas.io.data as web
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
# Use the fully qualified option key: the short form 'max_columns' is resolved
# by pattern matching and becomes ambiguous when more than one registered
# option ends in "max_columns".
pd.set_option('display.max_columns', 30)
%matplotlib inline

In [2]:
# Notebook-wide switches, set in one place.
#   run_console     - when true, the final cell launches %qtconsole
#   update_from_web - not referenced in the visible cells (presumably gates a data refresh; verify)
#   plot_on         - not referenced in the visible cells (presumably gates plotting; verify)
run_console, update_from_web, plot_on = True, False, True

In [61]:
def plot(s):
    """Plot the rank-ordered ``Reputation`` values found in ``Users-<s>.xml``.

    The file ``Users-<s>.xml`` is read relative to the current working
    directory. Every ``<row>`` element carrying a ``Reputation`` attribute
    contributes one value; the values are sorted in descending order and
    drawn against their 1-based rank on a logarithmic x axis.

    Parameters
    ----------
    s : str
        Name interpolated into the input file name and the figure title
        (the caller passes site names such as ``'superuser.com'``).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Raises
    ------
    OSError
        If the XML file cannot be read (raised by ``etree.parse``).
    ValueError
        If a ``Reputation`` attribute is present but not numeric.
    """
    # Read dataset
    root = etree.parse('Users-%s.xml' % s)
    # `iter` is the supported replacement for the deprecated `getiterator`.
    # Rows without a Reputation attribute are skipped rather than crashing
    # on float(None).
    reputation = [
        float(value)
        for value in (row.get('Reputation') for row in root.iter('row'))
        if value is not None
    ]

    # Frequency analysis by ranking: sort descending, then rank from 1.
    a = np.sort(np.array(reputation, dtype=float))[::-1]
    rank = np.arange(1, len(a) + 1)

    # Use the explicit Axes interface throughout instead of mixing it with
    # the pyplot state machine.
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.set_xscale('log')
    ax.plot(rank, a, 'r-.', label='Reputation')
    ax.set_xlabel('Rank')
    ax.set_ylabel('Reputation')
    ax.legend()
    ax.set_title('Reputation distribution on %s' % s)
    plt.show()

In [62]:
# Names handed to plot() one at a time; each selects the file Users-<name>.xml.
sites = [
    'superuser.com',
    'askubuntu.com',
    'math.stackexchange.com',
    'serverfault.com',
]
for s in sites:
    plot(s)



In [3]:
# Launch the IPython %qtconsole magic only when the run_console flag
# (set in the configuration cell) is true.
if run_console:
    %qtconsole