notebook.community

Edit and run



In [1]:

    
import statsmodels.formula.api as sm
from lxml import etree
from datetime import datetime
import pandas as pd
import pandas.io.data as web
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
# Use the fully qualified option name: pandas treats the key as a pattern, and
# the short form 'max_columns' can match more than one option (raising
# OptionError) on pandas versions that also define a Styler max_columns option.
pd.set_option('display.max_columns', 30)
%matplotlib inline

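Data source: the Stack Exchange data dump, https://archive.org/details/stackexchange. The plotting code below reads each site's user table from a local file named `Users-<site>.xml` (for example `Users-superuser.com.xml`).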



In [2]:

    
# Notebook-level switches. In the cells visible here only `run_console` is
# read (it gates the Qt console in the last cell); the other two flags are
# not referenced by any visible cell.
plot_on = True
update_from_web = False
run_console = True



In [61]:

    
def plot(s):
    """Plot the rank-ordered reputation curve for one site.

    Reads ``Users-<s>.xml`` (path relative to the working directory), collects
    the ``Reputation`` attribute of every ``row`` element, sorts the values in
    descending order and draws them against their rank (1 = highest
    reputation) on a logarithmic x axis.

    Parameters
    ----------
    s : str
        Site name, used to build the input file name and the figure title,
        e.g. ``'superuser.com'``.

    Raises
    ------
    ValueError
        If a ``Reputation`` attribute cannot be converted to ``float``.

    Any error raised by ``etree.parse`` (missing or malformed file) propagates
    unchanged.
    """
    tree = etree.parse('Users-%s.xml' % s)

    # `iter` is the supported replacement for the deprecated `getiterator`.
    # Rows lacking a Reputation attribute are skipped instead of failing on
    # float(None).
    raw_values = (row.get('Reputation') for row in tree.iter('row'))
    reputation = np.array(
        [float(value) for value in raw_values if value is not None],
        dtype=float,
    )

    # Rank/frequency view: largest reputation first, ranks start at 1.
    reputation = np.sort(reputation)[::-1]
    rank = np.arange(1, len(reputation) + 1)

    # Explicit Axes interface throughout, rather than mixing in plt.plot.
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.set_xscale('log')
    ax.plot(rank, reputation, 'r-.', label='Reputation')
    ax.set_xlabel('Rank (log scale)')
    ax.set_ylabel('Reputation')
    ax.set_title('Reputation distribution on %s' % s)
    ax.legend()
    plt.show()



In [62]:

    
# Sites to chart; each name is handed to plot(), which draws one figure.
sites = [
    'superuser.com',
    'askubuntu.com',
    'math.stackexchange.com',
    'serverfault.com',
]
for s in sites:
    plot(s)



In [3]:

    
# Optionally attach a Qt console to this kernel for interactive inspection.
# This is the plain-Python spelling of the `%qtconsole` line magic.
if run_console:
    get_ipython().run_line_magic('qtconsole', '')