In [28]:
%matplotlib inline
from IPython.core.pylabtools import figsize
import numpy as np
from matplotlib import pyplot as plt
figsize(11, 9)

import scipy.stats as stats

dist = stats.beta
n_trials = [0, 1, 2, 3, 4, 5, 8, 15, 50, 500, 5000, 50000]
data = stats.bernoulli.rvs(0.5, size=n_trials[-1])
x = np.linspace(0, 1, 100)

# For the already prepared, I'm using Binomial's conj. prior.
for k, N in enumerate(n_trials):
    sx = plt.subplot(len(n_trials) / 2, 2, k + 1)
    plt.xlabel("$p$, probabilita' testa") \
        if k in [0, len(n_trials) - 1] else None
    plt.setp(sx.get_yticklabels(), visible=False)
    heads = data[:N].sum()
    y = dist.pdf(x, 1 + heads, 1 + N - heads)
    plt.plot(x, y, label="%d lanci,\n %d teste" % (N, heads))
    plt.fill_between(x, 0, y, color="#348A3D", alpha=0.4)
    plt.vlines(0.5, 0, 4, color="k", linestyles="--", lw=1)

    leg = plt.legend()
    leg.get_frame().set_alpha(0.4)
    plt.autoscale(tight=True)


plt.suptitle("probabilita' a posteriori bayesiane",
             y=1.02,
             fontsize=14)

plt.tight_layout()
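
The cell above relies on standard Beta–Binomial conjugacy: with a flat Beta(1, 1) (i.e. uniform) prior on $p$ and $h$ heads observed in $N$ tosses, the posterior is again a Beta distribution, which is exactly what dist.pdf(x, 1 + heads, 1 + N - heads) evaluates:

\begin{align} p & \sim \text{Beta}(1, 1) = \text{Uniform}(0, 1) \\\\ h \mid p & \sim \text{Binomial}(N, p) \\\\ p \mid h & \sim \text{Beta}(1 + h,\; 1 + N - h) \end{align}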


example: how wrong are you?

We are good, disciplined programmers and we write regression tests for our programs. In our latest project we have several hundred tests, and at delivery time they all pass.

The client asks: what is the probability that the program has bugs?

How would you answer?

  • All the tests pass, so the program has no bugs
  • Not everything can be tested and every program has bugs, so there are certainly bugs
  • E.g. 42% (i.e. some specific percentage, and you are confident that it is the right answer).

Only the last answer wins, of course. But do you know how to get there?

\begin{align} P( A | X ) = & \frac{ P(X | A) P(A) } {P(X) } \\\\[5pt] & \propto P(X | A) P(A)\;\; (\propto \text{is proportional to } ) \end{align}

Let $A$ be the event that the code has no bugs and $X$ the event that all tests pass; the prior is $P(A) = p$.

\begin{align} P(X ) & = P(X \text{ and } A) + P(X \text{ and } \sim A) \\\\[5pt] & = P(X|A)P(A) + P(X | \sim A)P(\sim A)\\\\[5pt] & = P(X|A)p + P(X | \sim A)(1-p) \end{align}

If the code has no bugs the tests certainly pass, so $P(X|A) = 1$; if it does have bugs, assume the tests still all pass with probability $P(X|\sim A) = 0.5$. Then

\begin{align} P(A | X) & = \frac{1\cdot p}{ 1\cdot p +0.5 (1-p) } \\\\ & = \frac{ 2 p}{1+p} \end{align}
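
As a quick check, with a fairly skeptical prior of $p = 0.2$ (the highlighted point in the plot below):

$$ P(A | X) = \frac{2 \cdot 0.2}{1 + 0.2} = \frac{0.4}{1.2} \approx 0.33 $$

so observing that all tests pass raises a 20% prior belief in bug-free code to about 33%.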

In [29]:
%matplotlib inline
from IPython.core.pylabtools import figsize
import numpy as np
from matplotlib import pyplot as plt
figsize(12.5, 4)
p = np.linspace(0, 1, 50)
plt.plot(p, 2 * p / (1 + p), color="#348A3D", lw=2)
plt.fill_between(p, 2*p/(1+p), alpha=.5, facecolor=["#A60608"])
plt.scatter(0.2, 2 * (0.2) / 1.2, s=200, c="#348AFD")
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.xlabel("Prior, $P(A) = p$")
plt.ylabel("Posterior, $P(A|X)$, su $P(A) = p$")
plt.title("Ci sono bug nel mio progetto?")


Out[29]:
<matplotlib.text.Text at 0x11297d950>

example: a change in habits

I have an application in the cloud and a measure of the number of bugs fixed over time (for example, the number of commits per day). We are talking only about CORRECTIVE maintenance.

My programmers complain that their workload has increased and ask to hire another programmer. This is probably because some new customers are using parts of the program that were not sufficiently tested before. Still, from the information I have I cannot tell whether these claims are true. I want to find out whether the programmers' workload has really increased, and how significant the increase is, in order to verify whether a new programmer is actually needed; and, more importantly, from what point in time the change happened, in order to understand whether sales should be steered differently.


In [30]:
figsize(12.5, 3.5)
count_data = np.loadtxt("data/txtdata.csv")
n_count_data = len(count_data)
plt.bar(np.arange(n_count_data), count_data, color="#348ABD")
plt.xlabel("Tempo (settimane)")
plt.ylabel("numero di eventi")
#plt.title("?")
plt.xlim(0, n_count_data);



In [31]:
figsize(12.5, 4)

import scipy.stats as stats
a = np.arange(16)
poi = stats.poisson
lambda_ = [1.5, 4.25]
colours = ["#348ABD", "#A60628"]

plt.bar(a, poi.pmf(a, lambda_[0]), color=colours[0],
        label="$\lambda = %.1f$" % lambda_[0], alpha=0.60,
        edgecolor=colours[0], lw="3")


plt.xticks(a + 0.4, a)
plt.legend()
plt.ylabel("probability of $k$")
plt.xlabel("$k$")
plt.title("Probability mass function of a Poisson random variable; differing \
$\lambda$ values")


Out[31]:
<matplotlib.text.Text at 0x1134514d0>

In [32]:
figsize(12.5, 4)

import scipy.stats as stats
a = np.arange(16)
poi = stats.poisson
lambda_ = [1.5, 4.25]
colours = ["#348ABD", "#A60628"]

plt.bar(a, poi.pmf(a, lambda_[0]), color=colours[0],
        label="$\lambda = %.1f$" % lambda_[0], alpha=0.60,
        edgecolor=colours[0], lw="3")

plt.bar(a, poi.pmf(a, lambda_[1]), color=colours[1],
        label="$\lambda = %.1f$" % lambda_[1], alpha=0.60,
        edgecolor=colours[1], lw="3")

plt.xticks(a + 0.4, a)
plt.legend()
plt.ylabel("probability of $k$")
plt.xlabel("$k$")
plt.title("Probability mass function of a Poisson random variable; differing \
$\lambda$ values")


Out[32]:
<matplotlib.text.Text at 0x113436990>

In [33]:
a = np.linspace(0, 4, 100)
expo = stats.expon
lambda_ = [0.5]

for l, c in zip(lambda_, colours):
    plt.plot(a, expo.pdf(a, scale=1. / l), lw=3,
             color=c, label="$\lambda = %.1f$" % l)
    plt.fill_between(a, expo.pdf(a, scale=1. / l), color=c, alpha=.33)

plt.legend()
plt.ylabel("PDF at $z$")
plt.xlabel("$z$")
plt.ylim(0, 1.2)
plt.title("Probability density function of an Exponential random variable;\
 differing $\lambda$");



In [34]:
a = np.linspace(0, 4, 100)
expo = stats.expon
lambda_ = [0.5, 1]

for l, c in zip(lambda_, colours):
    plt.plot(a, expo.pdf(a, scale=1. / l), lw=3,
             color=c, label="$\lambda = %.1f$" % l)
    plt.fill_between(a, expo.pdf(a, scale=1. / l), color=c, alpha=.33)

plt.legend()
plt.ylabel("PDF at $z$")
plt.xlabel("$z$")
plt.ylim(0, 1.2)
plt.title("Probability density function of an Exponential random variable;\
 differing $\lambda$");


$$ C_i \sim \text{Poisson}(\lambda) $$

$$ \lambda = \begin{cases} \lambda_1 & \text{if } t \lt \tau \\ \lambda_2 & \text{if } t \ge \tau \end{cases} $$

\begin{align} & \lambda_1 \sim \text{Exp}( \alpha ) \\\\ & \lambda_2 \sim \text{Exp}( \alpha ) \end{align}

$$ \frac{1}{N}\sum_{i=0}^N C_i \approx E[\, \lambda \mid \alpha \,] = \frac{1}{\alpha} $$

\begin{align} & \tau \sim \text{Uniform}_d(1, 70) \\\\ & \Rightarrow P( \tau = k ) = \frac{1}{70} \end{align}

In [35]:
import pymc as pm
import numpy as np

count_data = np.loadtxt("data/txtdata.csv")
n_count_data = len(count_data)

alpha = 1.0 / count_data.mean()  # E[lambda | alpha] = 1/alpha, so match the empirical mean of the data
lambda_1 = pm.Exponential("lambda_1", alpha)
lambda_2 = pm.Exponential("lambda_2", alpha)

tau = pm.DiscreteUniform("tau", lower=0, upper=n_count_data)

In [36]:
print "Tau output:", ; 
for i in range(10):
    print tau.random(), 
print
print "Lambda_1 output:", 
for i in range(10):
    print lambda_1.random(), 
print
print "Lambda_2 output:", 
for i in range(10):
    print lambda_2.random(), 
print


Tau output: 37 39 9 55 52 42 9 9 43 0
Lambda_1 output: 33.9333651621 0.0304026762859 2.91839700249 0.00530491760976 7.46135517947 6.99727501873 40.3920292579 8.27566932988 26.7496953193 11.0944141868
Lambda_2 output: 113.234539185 5.7992742366 48.2209158967 4.24366983958 43.7459788235 19.6914108017 42.8229530334 19.9483509207 40.6294826201 26.2987561003

In [37]:
@pm.deterministic
def lambda_(tau=tau, lambda_1=lambda_1, lambda_2=lambda_2):
    out = np.zeros(n_count_data)
    out[:tau] = lambda_1  # rate before the changepoint
    out[tau:] = lambda_2  # rate after the changepoint
    return out

In [38]:
observation = pm.Poisson("obs", lambda_, value=count_data, observed=True)
model = pm.Model([observation, lambda_1, lambda_2, tau])

In [39]:
mcmc = pm.MCMC(model)
mcmc.sample(40000, 10000, 1)  # 40000 iterations, 10000 burn-in samples, thinning of 1


 [-----------------100%-----------------] 40000 of 40000 complete in 10.5 sec

In [40]:
lambda_1_samples = mcmc.trace('lambda_1')[:]
lambda_2_samples = mcmc.trace('lambda_2')[:]
tau_samples = mcmc.trace('tau')[:]

In [42]:
from IPython.core.pylabtools import figsize
import numpy as np
from matplotlib import pyplot as plt
figsize(12.5, 10)

ax = plt.subplot(311)
ax.set_autoscaley_on(False)

plt.hist(lambda_1_samples, histtype='stepfilled', bins=30, alpha=0.85,
         label="posterior  $\lambda_1$", color="#A60628", normed=True)
plt.legend(loc="upper left")
plt.title(r"""Distribuzione delle variabili    $\lambda_1,\;\lambda_2,\;\tau$""")
plt.xlim([15, 30])
plt.xlabel("$\lambda_1$")

ax = plt.subplot(312)
ax.set_autoscaley_on(False)
plt.hist(lambda_2_samples, histtype='stepfilled', bins=30, alpha=0.85,
         label="posterior $\lambda_2$", color="#7A68A6", normed=True)
plt.legend(loc="upper left")
plt.xlim([15, 30])
plt.xlabel("$\lambda_2$")

plt.subplot(313)
w = 1.0 / tau_samples.shape[0] * np.ones_like(tau_samples)
plt.hist(tau_samples, bins=n_count_data, alpha=1,
         label=r"posterior $\tau$",
         color="#467821", weights=w, rwidth=2.)
plt.xticks(np.arange(n_count_data))

plt.legend(loc="upper left")
plt.ylim([0, .75])
plt.xlim([35, len(count_data) - 20])
plt.xlabel(r"$\tau$ (in settimane)")
plt.ylabel("probability");



In [44]:
figsize(12.5, 5)
# tau_samples, lambda_1_samples, lambda_2_samples contain
# N samples from the corresponding posterior distribution
N = tau_samples.shape[0]
expected_events_per_day = np.zeros(n_count_data)
for day in range(0, n_count_data):
    ix = day < tau_samples  # boolean mask: posterior samples in which the changepoint comes after this day
    expected_events_per_day[day] = (lambda_1_samples[ix].sum()
                                   + lambda_2_samples[~ix].sum()) / N


plt.plot(range(n_count_data), expected_events_per_day, lw=4, color="#E24A33",
         label="numero atteso di eventi")
plt.xlim(0, n_count_data)
plt.xlabel("Settimana")
plt.ylabel("Numero atteso di eventi")
plt.title("Numero atteso di eventi")
plt.ylim(0, 60)
plt.bar(np.arange(len(count_data)), count_data, color="#348ABD", alpha=0.65,
        label="eventi osservati per settimana")

plt.legend(loc="upper left");



In [45]:
print expected_events_per_day


[ 17.7545366   17.7545366   17.7545366   17.7545366   17.7545366
  17.7545366   17.7545366   17.7545366   17.7545366   17.7545366
  17.7545366   17.7545366   17.7545366   17.7545366   17.7545366
  17.7545366   17.7545366   17.7545366   17.7545366   17.7545366
  17.7545366   17.7545366   17.7545366   17.7545366   17.7545366
  17.7545366   17.7545366   17.7545366   17.7545366   17.7545366
  17.7545366   17.7545366   17.7545366   17.7545366   17.7545366
  17.7545366   17.7545366   17.7545366   17.7545366   17.7545366
  17.75463495  17.75787777  17.90660514  18.40165235  20.21958701
  22.71789065  22.71789065  22.71789065  22.71789065  22.71789065
  22.71789065  22.71789065  22.71789065  22.71789065  22.71789065
  22.71789065  22.71789065  22.71789065  22.71789065  22.71789065
  22.71789065  22.71789065  22.71789065  22.71789065  22.71789065
  22.71789065  22.71789065  22.71789065  22.71789065  22.71789065
  22.71789065  22.71789065  22.71789065  22.71789065]
$$ \lambda \approx \begin{cases} \lambda_1 = 17.7545366 & \text{if } t \lt \tau \\ \lambda_2 = 22.71789065 & \text{if } t \ge \tau \end{cases} $$

In [50]:
print lambda_1_samples.mean()
print lambda_2_samples.mean()


17.7545366024
22.717890649

In [53]:
l1= lambda_1_samples/lambda_2_samples
print l1.mean()


0.782725732203

In [54]:
print lambda_1_samples.mean()/lambda_2_samples.mean()


0.781522231828
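
The ratio above says that the pre-change rate is roughly 78% of the post-change rate. A complementary summary, sketched below with the trace arrays already extracted, is the posterior probability that the rate actually increased, and the size of the increase:

In [ ]:
# minimal sketch: how often do the posterior samples say the rate went up?
p_increase = (lambda_2_samples > lambda_1_samples).mean()
print "P(lambda_2 > lambda_1 | data) =", p_increase

# posterior mean of the relative increase
rel_increase = (lambda_2_samples - lambda_1_samples) / lambda_1_samples
print "mean relative increase =", rel_increase.mean()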

In [56]:
import pymc as pm
parameter = pm.Exponential("poisson_param", 1)
data_generator = pm.Poisson("data_generator", parameter)
data_plus_one = data_generator + 1

In [57]:
print "Children of `parameter`: "
print parameter.children
print "\nParents of `data_generator`: "
print data_generator.parents
print "\nChildren of `data_generator`: "
print data_generator.children


Children of `parameter`: 
set([<pymc.distributions.Poisson 'data_generator' at 0x10cb8e510>])

Parents of `data_generator`: 
{'mu': <pymc.distributions.Exponential 'poisson_param' at 0x10cb8e490>}

Children of `data_generator`: 
set([<pymc.PyMCObjects.Deterministic '(data_generator_add_1)' at 0x10cb8e6d0>])

In [61]:
print "parameter.value =", parameter.value
print "data_generator.value =", data_generator.value
print "data_plus_one.value =", data_plus_one.value


parameter.value = 1.13818543137
data_generator.value = 0
data_plus_one.value = 1

In [62]:
lambda_1 = pm.Exponential("lambda_1", 1)  # prior on first behaviour
lambda_2 = pm.Exponential("lambda_2", 1)  # prior on second behaviour
tau = pm.DiscreteUniform("tau", lower=0, upper=10)  # prior on behaviour change

print "lambda_1.value = %.3f" % lambda_1.value
print "lambda_2.value = %.3f" % lambda_2.value
print "tau.value = %.3f" % tau.value
print

lambda_1.random(), lambda_2.random(), tau.random()

print "After calling random() on the variables..."
print "lambda_1.value = %.3f" % lambda_1.value
print "lambda_2.value = %.3f" % lambda_2.value
print "tau.value = %.3f" % tau.value


lambda_1.value = 1.722
lambda_2.value = 0.141
tau.value = 9.000

After calling random() on the variables...
lambda_1.value = 0.519
lambda_2.value = 0.245
tau.value = 5.000

In [67]:
# A quadratic fit
# ----------------------------------------------------------
import numpy, pymc

# create some test data
x = numpy.arange(100) * 0.3
f = 0.1 * x**2 - 2.6 * x - 1.5
numpy.random.seed(76523654)
noise = numpy.random.normal(size=100) * .1     # create some Gaussian noise
f = f + noise                                # add noise to the data

z = numpy.polyfit(x, f, 2)   # the traditional chi-square fit
print "The chi-square result: ",  z

#priors
sig = pymc.Uniform('sig', 0.0, 100.0, value=1.)

a = pymc.Uniform('a', -10.0, 10.0, value= 0.0)
b = pymc.Uniform('b', -10.0, 10.0, value= 0.0)
c = pymc.Uniform('c', -10.0, 10.0, value= 0.0)

#model
@pymc.deterministic(plot=False)
def mod_quadratic(x=x, a=a, b=b, c=c):
    return a*x**2 + b*x + c

#likelihood
y = pymc.Normal('y', mu=mod_quadratic, tau=1.0/sig**2, value=f, observed=True)
# ----------------------------------------------------------

# Now, go to command line and run the following (or alternatively put them in a file):

test = [sig,a,b,c,y]
R = pymc.MCMC(test)    #  build the model
R.sample(10000)              # populate and run it
dir(R)
print 'a    ', R.a.value        # print outputs
print 'b    ', R.b.value
print 'c    ', R.c.value


 The chi-square result:  [ 0.0999244  -2.59879643 -1.49278601]
 [-----------------100%-----------------] 10000 of 10000 complete in 3.3 sec
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-67-86098a1a78f4> in <module>()
     35 R.sample(10000)              # populate and run it
     36 dir(R)
---> 37 print 'a    ', R.a.value        # print outputs
     38 print 'b    ', R.b.value
     39 print 'c    ', R.c.value

AttributeError: 'MCMC' object has no attribute 'a'
a    

In [68]:
dir(R)


Out[68]:
['OCValue',
 '_Sampler__status',
 '__class__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__format__',
 '__getattribute__',
 '__hash__',
 '__init__',
 '__module__',
 '__name__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_assign_database_backend',
 '_burn',
 '_burn_till_tuned',
 '_calc_dic',
 '_csv_str',
 '_cur_trace_index',
 '_current_iter',
 '_db_args',
 '_dict_container',
 '_finalize',
 '_funs_to_tally',
 '_generations',
 '_get_deviance',
 '_get_dic',
 '_get_generations',
 '_get_logp',
 '_get_value',
 '_halt',
 '_iter',
 '_loop',
 '_n_tally',
 '_save_interval',
 '_sm_assigned',
 '_state',
 '_sum_deviance',
 '_thin',
 '_tune_interval',
 '_tune_throughout',
 '_tuned_count',
 '_tuning',
 '_value',
 '_variables_to_tally',
 'assign_step_methods',
 'assimilate',
 'change_methods',
 'commit',
 'containers',
 'containing_classes',
 'coparents',
 'db',
 'deterministics',
 'deviance',
 'dic',
 'draw',
 'draw_from_prior',
 'generations',
 'get_node',
 'get_state',
 'halt',
 'icontinue',
 'iprompt',
 'isample',
 'logp',
 'markov_blanket',
 'max_trace_length',
 'moral_neighbors',
 'nodes',
 'observed_stochastics',
 'pause',
 'pbar',
 'potentials',
 'register',
 'remember',
 'remove_step_method',
 'replace',
 'restore_sampler_state',
 'restore_sm_state',
 'sample',
 'save_state',
 'seed',
 'stats',
 'status',
 'step_method_dict',
 'step_methods',
 'stochastics',
 'summary',
 'tally',
 'trace',
 'tune',
 'use_step_method',
 'value',
 'variables',
 'verbose',
 'write_csv']


In [69]:
a=R.trace('a')

In [70]:
a


Out[70]:
<pymc.database.ram.Trace at 0x1134cd290>
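
The object above is only a handle to the trace; the actual posterior samples come out by slicing it, exactly as with mcmc.trace('lambda_1')[:] earlier. A sketch of how one might summarize the fitted coefficients this way (assuming the sampler R from In [67] is still in scope):

In [ ]:
# slice the traces to obtain the posterior samples as numpy arrays
a_samples = R.trace('a')[:]
b_samples = R.trace('b')[:]
c_samples = R.trace('c')[:]
print "posterior means:", a_samples.mean(), b_samples.mean(), c_samples.mean()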

In [72]:
import pymc
import numpy as np

### Data generation

# Means and standard deviations of the Gaussian mixture model. The inference
# engine doesn't know these.
means = [0, 4.0]
stdevs = [1, 2.0]

# Rather than randomizing between the mixands, just specify how many
# to draw from each. This makes it really easy to know which draws
# came from which mixand (the first N1 from the first, the rest from
# the second). The inference engine doesn't know about N1 and N2, only Ndata.
N1 = 10
N2 = 40
Ndata = N1+N2

# Seed both the data generator RNG  as well as the global seed (for PyMC)
RNGseed = 123
np.random.seed(RNGseed)

def generate_data(draws_per_mixand):
    """Draw samples from a two-element Gaussian mixture reproducibly.

    Input sequence indicates the number of draws from each mixand. Resulting
    draws are concatenated together.

    """
    RNG = np.random.RandomState(RNGseed)
    values = np.hstack([RNG.normal(means[i], stdevs[i], ndraws)
                        for (i,ndraws) in enumerate(draws_per_mixand)])
    return values

observed_data = generate_data([N1, N2])


### PyMC model setup, step 1: the Dirichlet process and stick-breaking

# Truncation level of the Dirichlet process
Ndp = 50

# "alpha", or the concentration of the stick-breaking construction. There exists
# some interplay between choice of Ndp and concentration: a high concentration
# value implies many clusters, in turn implying low values for the leading
# elements of the probability mass function built by stick-breaking. Since we
# enforce the resulting PMF to sum to one, the probability of the last cluster
# might then be set artificially high. This may interfere with the Dirichlet
# process' clustering ability.
#
# An example: if Ndp == 4, and the concentration is high enough, stick-breaking might
# yield p == [.1, .1, .1, .7], which isn't desirable. You want to initialize the
# concentration so that the last element of the PMF is less than, or not much
# larger than, the preceding ones. So you'd want to initialize at a
# smaller concentration to get something more like, say, p == [.35, .3, .25, .1].
#
# A thought: maybe we can avoid this interdependency by, rather than setting the
# final value of the PMF vector, rescaling the entire PMF vector to sum to 1? FIXME,
# TODO.
concinit = 5.0
conclo = 0.3
conchi = 100.0
concentration = pymc.Uniform('concentration', lower=conclo, upper=conchi,
                             value=concinit)

# The stick-breaking construction: requires Ndp beta draws dependent on the
# concentration, before the probability mass function is actually constructed.
betas = pymc.Beta('betas', alpha=1, beta=concentration, size=Ndp)

@pymc.deterministic
def pmf(betas=betas):
    "Construct a probability mass function for the truncated Dirichlet process"
    # prod = lambda x: np.exp(np.sum(np.log(x))) # Slow but more accurate(?)
    prod = np.prod
    value = map(lambda (i,u): u * prod(1.0 - betas[:i]), enumerate(betas))
    value[-1] = 1.0 - sum(value[:-1]) # force value to sum to 1
    return value

# The cluster assignments: each data point's estimated cluster ID.
# Remove idinit to allow clusterid to be randomly initialized:
idinit = np.zeros(Ndata, dtype=np.int64)
clusterid = pymc.Categorical('clusterid', p=pmf, size=Ndata, value=idinit)

### PyMC model setup, step 2: clusters' means and stdevs

# An individual data sample is drawn from a Gaussian, whose mean and stdev is
# what we're seeking.

# Hyperprior on clusters' means
mu0_mean = 0.0
mu0_std = 50.0
mu0_prec = 1.0/mu0_std**2
mu0_init = np.zeros(Ndp)
clustermean = pymc.Normal('clustermean', mu=mu0_mean, tau=mu0_prec,
                          size=Ndp, value=mu0_init)

# The cluster's stdev
clustersig_lo = 0.0
clustersig_hi = 100.0
clustersig_init = 50*np.ones(Ndp) # Again, don't really care?
clustersig = pymc.Uniform('clustersig', lower=clustersig_lo,
                          upper=clustersig_hi, size=Ndp, value=clustersig_init)
clusterprec = clustersig ** -2

### PyMC model setup, step 3: data

# So now we have means and stdevs for each of the Ndp clusters. We also have a
# probability mass function over all clusters, and a cluster ID indicating which
# cluster a particular data sample belongs to.

@pymc.deterministic
def data_cluster_mean(clusterid=clusterid, clustermean=clustermean):
    "Converts Ndata cluster IDs and Ndp cluster means to Ndata means."
    return clustermean[clusterid]

@pymc.deterministic
def data_cluster_prec(clusterid=clusterid, clusterprec=clusterprec):
    "Converts Ndata cluster IDs and Ndp cluster precs to Ndata precs."
    return clusterprec[clusterid]

data = pymc.Normal('data', mu=data_cluster_mean, tau=data_cluster_prec,
                   observed=True, value=observed_data)
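
The cell above only defines the mixture model; it is never sampled in this notebook. A minimal sketch of how one might fit it, following the same pattern as the changepoint examples (the variable name dpm and the iteration counts are illustrative):

In [ ]:
# collect the named variables and sample, as in the earlier examples
dpm = pymc.MCMC([concentration, betas, pmf, clusterid,
                 clustermean, clustersig, data])
dpm.sample(iter=10000, burn=1000, thin=10)

# posterior samples of the cluster assignments and cluster means
clusterid_samples = dpm.trace('clusterid')[:]
clustermean_samples = dpm.trace('clustermean')[:]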

example: productivity bonus

My programmers are very good and introduce many new features into the system. Below is a representation of the number of features (y-axis) introduced over time (x-axis).

Although the number of people on the team has always stayed constant, over time several programmers have come and gone. How is it going? Are the new programmers more productive than the old ones? Was there someone in particular whose hiring decisively changed the output? Does it make sense to give them a productivity bonus?


In [120]:
from pymc import DiscreteUniform, Exponential, deterministic, Poisson, Uniform
import numpy as np

figsize(12.5, 4)
productivity_array =[ 4, 5, 4, 0, 1, 4, 3, 4, 0, 6, 3, 3, 4, 0, 2, 6,
                   3, 3, 5, 4, 5, 3, 1, 4, 4, 1, 5, 5, 3, 4, 2, 5,
                   2, 2, 3, 4, 2, 1, 3, 2, 2, 1, 1, 1, 1, 3, 0, 0,
                   1, 0, 1, 1, 0, 0, 3, 1, 0, 3, 2, 2, 0, 1, 1, 1,
                   0, 1, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 1, 1, 0, 2,
                   3, 3, 1, 1, 2, 1, 1, 1, 1, 2, 4, 2, 0, 0, 1, 4,
                   0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1]
a = np.arange(len(productivity_array))
n_count_data = len(productivity_array)
plt.bar(a, productivity_array, color="#348AFD",
        label="ticket osservati")
plt.ylim(0,7)
plt.title('numero di ticket aperti')
x=plt.xlim(0,len(productivity_array))
plt.xlabel("Giorni")
plt.ylabel("Ticket")


Out[120]:
<matplotlib.text.Text at 0x110b6f350>

In [123]:
switchpoint = DiscreteUniform('switchpoint', lower=0, upper=110,
                              doc='Switchpoint[day]')

In [124]:
early_mean = Exponential('early_mean', beta=1.)
late_mean = Exponential('late_mean', beta=1.)

In [125]:
@deterministic(plot=False)
def rate(s=switchpoint, e=early_mean, l=late_mean):
    ''' Concatenate Poisson means '''
    out = np.empty(len(productivity_array))
    out[:s] = e
    out[s:] = l
    return out

In [126]:
disasters = Poisson('disasters', mu=rate, 
                    value=productivity_array, 
                    observed=True)

In [127]:
mcmc = pymc.MCMC([disasters, rate, switchpoint, early_mean, late_mean])
mcmc.sample(iter=10000, burn=1000, thin=10)


 [-----------------100%-----------------] 10000 of 10000 complete in 2.4 sec

In [144]:
#disasters_samples = mcmc.trace('disasters')[:]
early_samples = mcmc.trace('early_mean')[:]
late_samples = mcmc.trace('late_mean')[:]
switch_samples = mcmc.trace('switchpoint')[:]

In [143]:
from pymc.Matplot import plot
plot(mcmc)


Plotting early_mean
Plotting late_mean
Plotting switchpoint
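
To answer the original questions numerically, the same kind of summary used in the maintenance example can be applied here (a sketch using the sample arrays extracted in In [144]):

In [ ]:
# posterior summaries for the productivity changepoint
print "posterior mean switchpoint (day):", switch_samples.mean()
print "mean rate before the switch:", early_samples.mean()
print "mean rate after the switch: ", late_samples.mean()
print "P(late_mean < early_mean):", (late_samples < early_samples).mean()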

In [ ]: