In [3]:
from __future__ import print_function, division

import pandas as pd
import thinkstats2
import thinkplot

import math
import gzip

%matplotlib inline
# File formats passed to thinkplot.Save each time a figure is written out.
formats = ['png', 'pdf']

In [4]:
def ReadFile(filename='soc-Slashdot0902.txt.gz', n=None):
    """Reads an edge list from a plain or gzip-compressed text file.

    Each data line holds two whitespace-separated tokens, a source and a
    destination.  Lines that start with '#' and blank lines are skipped.

    Args:
        filename: string name of the file to read; names ending in 'gz'
            are opened with gzip, anything else with the builtin open
        n: maximum number of lines to read, counting skipped lines too;
            None (the default) reads the whole file

    Returns:
        dict that maps each source string to the list of its destination
        strings, in the order they appear in the file

    Raises:
        ValueError: if a data line does not split into exactly two tokens
    """
    opener = gzip.open if filename.endswith('gz') else open

    srcs = {}
    # The with-block closes the file even when a line fails to parse.
    with opener(filename) as fp:
        for i, line in enumerate(fp):
            if i == n:
                break

            # gzip.open yields bytes on Python 3; decode so the string
            # operations below behave the same on Python 2 and 3.
            if not isinstance(line, str):
                line = line.decode('utf-8')

            if line.startswith('#') or not line.strip():
                continue

            src, dest = line.split()
            srcs.setdefault(src, []).append(dest)

    return srcs

# Load the full edge list from the default file (no line limit).
srcs = ReadFile()

In [5]:
def Summarize(srcs):
    """Computes the number of edges for each source.

    Also prints the first value returned by thinkstats2.MeanVar and the
    square root of the second (mean and standard deviation, assuming
    MeanVar returns mean and variance).

    Args:
        srcs: dict that maps each source to a sequence of its destinations

    Returns:
        list of int, one edge count per source
    """
    # values() exists on both Python 2 and 3; itervalues() is Python 2 only.
    lens = [len(t) for t in srcs.values()]
    mu, sigma2 = thinkstats2.MeanVar(lens)
    print(mu, math.sqrt(sigma2))
    return lens

# Edge count per source; Summarize also prints the summary statistics.
lens = Summarize(srcs)


12.0914317767 37.6969124125

In [6]:
# Distribution of edge counts per source, and its CDF.
pmf = thinkstats2.Pmf(lens, 'actual')
cdf = pmf.MakeCdf()
# NOTE(review): PrePlot(2) is called although only one line is drawn here --
# presumably to keep this line's color consistent with the two-line figure
# plotted later; confirm against thinkplot.PrePlot.
thinkplot.PrePlot(2)
thinkplot.Plot(cdf.xs, cdf.ps, label=cdf.label)
# Log-scaled x axis, limited to the range 1..3000.
thinkplot.Config(xlabel='number of friends/foes', ylabel='CDF', xscale='log', 
                 xlim=[1, 3000],
                 loc='lower right')
# Writes the figure as 'social1' in each of the configured formats.
thinkplot.Save('social1', formats=formats)


Writing social1.png
Writing social1.pdf
<matplotlib.figure.Figure at 0x7f96849628d0>

In [7]:
def BiasPmf(pmf, label, invert=False):
    """Reweights a Pmf by value to model, or undo, size-biased sampling.

    When pmf holds the distribution of true values, the result is the
    distribution an observer sees if each value is oversampled in
    proportion to its size; for example, asking students how big their
    classes are oversamples large classes in proportion to class size.

    With invert=True the inverse operation is computed instead; for
    example, unbiasing a sample that was collected from students.

    Args:
      pmf: Pmf object; the result is built from pmf.Copy()
      label: label assigned to the returned Pmf
      invert: boolean; if True each probability is multiplied by
        1.0/value rather than by value

    Returns:
      Pmf object, normalized
    """
    biased = pmf.Copy()
    biased.label = label

    for value, _ in pmf.Items():
        weight = 1.0 / value if invert else value
        biased.Mult(value, weight)

    biased.Normalize()
    return biased


# Reweight the actual distribution in proportion to each value.
biased_pmf = BiasPmf(pmf, 'biased')
cdf2 = biased_pmf.MakeCdf()

# Cell output: mean of the actual distribution next to the biased one.
pmf.Mean(), biased_pmf.Mean()


Out[7]:
(12.091431776749411, 129.61739823546282)

In [8]:
# Plot the actual and biased CDFs together on one figure.
thinkplot.PrePlot(2)
thinkplot.Plot(cdf.xs, cdf.ps, label=cdf.label)
thinkplot.Plot(cdf2.xs, cdf2.ps, label=cdf2.label)
# Log-scaled x axis, limited to the range 1..3000.
thinkplot.Config(xlabel='number of friends/foes', ylabel='CDF', xscale='log', 
                 xlim=[1, 3000],
                 loc='lower right')
# Writes the figure as 'social2' in each of the configured formats.
thinkplot.Save('social2', formats=formats)


Writing social2.png
Writing social2.pdf
<matplotlib.figure.Figure at 0x7f96813506d0>

In [8]: