In [3]:
from __future__ import print_function, division
import pandas as pd
import thinkstats2
import thinkplot
import math
import gzip
%matplotlib inline
# File formats handed to thinkplot.Save (formats=formats) when figures are written.
formats = ['png', 'pdf']
In [4]:
def ReadFile(filename='soc-Slashdot0902.txt.gz', n=None):
    """Reads an edge-list file and groups destinations by source.

    Each data line is expected to hold two whitespace-separated fields,
    a source and a destination.  Lines starting with '#' and blank lines
    are skipped.

    Args:
        filename: string name of the file to read; names ending in 'gz'
            are opened with gzip, anything else as a plain file
        n: if not None, reading stops when the 0-based line index reaches
            n, so at most the first n physical lines (comment and blank
            lines included) are examined

    Returns:
        dict that maps each source string to the list of its destination
        strings, in the order they appear in the file

    Raises:
        ValueError: if a data line does not split into exactly two fields
    """
    opener = gzip.open if filename.endswith('gz') else open
    srcs = {}

    # The with-block closes the file even when a malformed line raises.
    with opener(filename) as fp:
        for i, line in enumerate(fp):
            if i == n:
                break

            # gzip.open defaults to binary mode, which yields bytes on
            # Python 3; decode so the str operations below work.
            if not isinstance(line, str):
                line = line.decode('utf-8')

            if line.startswith('#') or not line.strip():
                continue

            src, dest = line.split()
            srcs.setdefault(src, []).append(dest)

    return srcs
# Read the default data file with no line limit; srcs maps each source to its list of destinations.
srcs = ReadFile()
In [5]:
def Summarize(srcs):
    """Computes the number of edges for each source.

    Also prints the mean of those counts and the square root of the
    variance reported by thinkstats2.MeanVar.

    Args:
        srcs: dict that maps each source to a sequence of destinations

    Returns:
        list of int, one length per entry in srcs
    """
    # dict.itervalues() exists only on Python 2; values() works on 2 and 3.
    lens = [len(t) for t in srcs.values()]
    mu, sigma2 = thinkstats2.MeanVar(lens)
    print(mu, math.sqrt(sigma2))
    return lens
# Print the summary statistics and keep the per-source edge counts for the Pmf built below.
lens = Summarize(srcs)
In [6]:
# Distribution of the per-source edge counts, and its cumulative form.
pmf = thinkstats2.Pmf(lens, 'actual')
cdf = pmf.MakeCdf()

# Axis settings for the CDF figure (log-scaled x axis).
config_options = dict(
    xlabel='number of friends/foes',
    ylabel='CDF',
    xscale='log',
    xlim=[1, 3000],
    loc='lower right',
)

# Draw the CDF and write it out as 'social1' in every requested format.
thinkplot.PrePlot(2)
thinkplot.Plot(cdf.xs, cdf.ps, label=cdf.label)
thinkplot.Config(**config_options)
thinkplot.Save('social1', formats=formats)
In [7]:
def BiasPmf(pmf, label, invert=False):
    """Builds a reweighted, renormalized copy of a Pmf.

    With invert=False every probability is multiplied by its own value,
    which gives the distribution seen when values are oversampled in
    proportion to their size (for example, asking students how big
    their classes are oversamples large classes).

    With invert=True every probability is multiplied by the reciprocal
    of its value instead, which is the inverse operation (for example,
    unbiasing a sample that was collected from students).

    Args:
        pmf: Pmf object; the work is done on the object returned by
            pmf.Copy()
        label: label assigned to the returned Pmf
        invert: boolean, False to apply the bias, True to undo it

    Returns:
        Pmf object
    """
    result = pmf.Copy()
    result.label = label

    for value, _ in pmf.Items():
        # Weight by the value itself, or by its reciprocal to undo the bias.
        factor = 1.0 / value if invert else value
        result.Mult(value, factor)

    result.Normalize()
    return result
# Reweight the observed distribution in proportion to value, and build its CDF for plotting.
biased_pmf = BiasPmf(pmf, 'biased')
cdf2 = biased_pmf.MakeCdf()
# Final expression of the cell: the means of the original and the biased distributions.
pmf.Mean(), biased_pmf.Mean()
Out[7]:
In [8]:
# Axis settings for the comparison figure (log-scaled x axis).
config_options = dict(
    xlabel='number of friends/foes',
    ylabel='CDF',
    xscale='log',
    xlim=[1, 3000],
    loc='lower right',
)

# Overlay the original and the biased CDFs, then save the figure as 'social2'.
thinkplot.PrePlot(2)
for curve in (cdf, cdf2):
    thinkplot.Plot(curve.xs, curve.ps, label=curve.label)
thinkplot.Config(**config_options)
thinkplot.Save('social2', formats=formats)
In [8]: