# falsepos

Exploration of a problem interpreting binary test results

``````

In [34]:

from __future__ import print_function, division

import thinkbayes2

from sympy import symbols

``````

`p` is the prevalence of a condition; `s` is the sensitivity of the test.

The false positive rate is known to be either `t1` (with probability `q`) or `t2` (with probability `1-q`)

``````

In [35]:

# p: prevalence of the condition
# q: probability that the false positive rate is t1 (1-q for t2)
# s: sensitivity of the test
# t1, t2: the two candidate false positive rates
p, q, s, t1, t2 = symbols('p q s t1 t2')

``````

I'll use `a` through `h` for each of the 8 possible conditions.

``````

In [36]:

# Symbols for the 8 joint outcomes; immediately rebound to expressions below,
# so these symbols only serve as documentation of the names.
a, b, c, d, e, f, g, h = symbols('a b c d e f g h')

``````

And here are the probabilities of the conditions.

``````

In [37]:

# Joint probability of each (t-value, health status, test result) combination:
# P(t-value) * P(status | p) * P(test result | status, t-value).
a =   q   *   p   *   s       # t=t1, sick, positive (true positive)
b =   q   *   p   * (1-s)     # t=t1, sick, negative (false negative)
c =   q   * (1-p) *   t1      # t=t1, not sick, positive (false positive)
d =   q   * (1-p) * (1-t1)    # t=t1, not sick, negative (true negative)
e = (1-q) *   p   *   s       # t=t2, sick, positive (true positive)
f = (1-q) *   p   * (1-s)     # t=t2, sick, negative (false negative)
g = (1-q) * (1-p) *   t2      # t=t2, not sick, positive (false positive)
h = (1-q) * (1-p) * (1-t2)    # t=t2, not sick, negative (true negative)

``````
``````

In [40]:

# Unnormalized posterior over the patient's status after one positive test,
# assuming the false positive rate is t1.
pmf1 = thinkbayes2.Pmf()
pmf1['sick'] = p*s
pmf1['notsick'] = (1-p)*t1
pmf1

``````
``````

Out[40]:

Pmf({'notsick': t1*(-p + 1), 'sick': p*s})

``````
``````

In [41]:

# The normalizing constant is the total probability of a positive test given t1.
nc1 = pmf1.Normalize()
nc1.simplify()

``````
``````

Out[41]:

p*s - t1*(p - 1)

``````
``````

In [42]:

# Same as pmf1, but assuming the false positive rate is t2.
pmf2 = thinkbayes2.Pmf()
pmf2['sick'] = p*s
pmf2['notsick'] = (1-p)*t2
pmf2

``````
``````

Out[42]:

Pmf({'notsick': t2*(-p + 1), 'sick': p*s})

``````
``````

In [44]:

# Total probability of a positive test given t2.
nc2 = pmf2.Normalize()
nc2.simplify()

``````
``````

Out[44]:

p*s - t2*(p - 1)

``````
``````

In [47]:

# Posterior over t: prior {t1: q, t2: 1-q} updated with the probability of
# a positive test under each hypothesis (nc1 and nc2).
pmf_t = thinkbayes2.Pmf({t1:q, t2:1-q})
pmf_t[t1] *= nc1
pmf_t[t2] *= nc2
pmf_t.Normalize()

``````
``````

Out[47]:

1.0*q*(p*s + t1*(-p + 1)) + (-1.0*q + 1.0)*(p*s + t2*(-p + 1))

``````
``````

In [48]:

# Posterior mean of the false positive rate t.
pmf_t.Mean().simplify()

``````
``````

Out[48]:

1.0*(q*t1*(p*s - t1*(p - 1)) - t2*(q - 1)*(p*s - t2*(p - 1)))/(q*(p*s - t1*(p - 1)) - (q - 1)*(p*s - t2*(p - 1)))

``````
``````

In [49]:

# First parameter set: t1 and t2 equally likely (q=0.5).
d1 = dict(q=0.5, p=0.1, s=0.5, t1=0.2, t2=0.8)
pmf_t.Mean().evalf(subs=d1)

``````
``````

Out[49]:

0.662000000000000

``````
``````

In [50]:

# Second parameter set: different mixture of t1 and t2.
d2 = dict(q=0.75, p=0.1, s=0.5, t1=0.4, t2=0.8)
pmf_t.Mean().evalf(subs=d2)

``````
``````

Out[50]:

0.554000000000000

``````
``````

In [53]:

# Posterior probability that t = t1 under the d2 parameters.
pmf_t[t1].evalf(subs=d2)

``````
``````

Out[53]:

0.615000000000000

``````
``````

In [56]:

# Total probability the patient is sick: the conditional probabilities
# weighted by the posterior over t.
x = pmf_t[t1] * pmf1['sick'] + pmf_t[t2] * pmf2['sick']
x.simplify()

``````
``````

Out[56]:

1.0*p*s/(q*(p*s - t1*(p - 1)) - (q - 1)*(p*s - t2*(p - 1)))

``````
``````

In [57]:

x.evalf(subs=d1)

``````
``````

Out[57]:

0.100000000000000

``````
``````

In [58]:

x.evalf(subs=d2)

``````
``````

Out[58]:

0.100000000000000

``````
``````

In [ ]:

``````
``````

In [59]:

# Collapse the uncertainty about t to its mean and redo the single-test analysis.
t = q * t1 + (1-q) * t2

pmf = thinkbayes2.Pmf()
pmf['sick'] = p*s
pmf['notsick'] = (1-p)*t
pmf

``````
``````

Out[59]:

Pmf({'notsick': (-p + 1)*(q*t1 + t2*(-q + 1)), 'sick': p*s})

``````
``````

In [60]:

pmf.Normalize()

``````
``````

Out[60]:

p*s + (-p + 1)*(q*t1 + t2*(-q + 1))

``````
``````

In [61]:

pmf['sick'].simplify()

``````
``````

Out[61]:

1.0*p*s/(p*s - (p - 1)*(q*t1 - t2*(q - 1)))

``````
``````

In [62]:

pmf['sick'].evalf(subs=d1)

``````
``````

Out[62]:

0.100000000000000

``````
``````

In [64]:

pmf['sick'].evalf(subs=d2)

``````
``````

Out[64]:

0.100000000000000

``````
``````

In [ ]:

``````
``````

In [111]:

# "Gold standard" joint distribution for two patients who both tested positive:
# number of sick patients (0, 1, or 2) jointly with the value of t.
# Each term is P(t-value) * binomial factor * likelihood of the two positives.
gold = thinkbayes2.Pmf()
gold['0 sick t1'] = q * (1-p)**2 * t1**2
gold['1 sick t1'] = q * 2*p*(1-p) * s * t1
gold['2 sick t1'] = q * p**2 * s**2
gold['0 sick t2'] = (1-q) * (1-p)**2 * t2**2
gold['1 sick t2'] = (1-q) * 2*p*(1-p) * s * t2
gold['2 sick t2'] = (1-q) * p**2 * s**2
gold.Normalize()

``````
``````

Out[111]:

p**2*q*s**2 + p**2*s**2*(-q + 1) + 2*p*q*s*t1*(-p + 1) + p*s*t2*(-p + 1)*(-2*q + 2) + q*t1**2*(-p + 1)**2 + t2**2*(-p + 1)**2*(-q + 1)

``````
``````

In [112]:

# Marginal probability that neither patient is sick (summed over t).
p0 = gold['0 sick t1'] + gold['0 sick t2']
p0.evalf(subs=d1)

``````
``````

Out[112]:

0.852895633323010

``````
``````

In [113]:

p0.evalf(subs=d2)

``````
``````

Out[113]:

0.826831935836675

``````
``````

In [ ]:

``````
``````

In [114]:

# Mean of the distribution of t.
t = q * t1 + (1-q) * t2

``````
``````

In [115]:

# Two-patient analysis using the collapsed (mean) value of t instead of
# the full distribution.
collapsed = thinkbayes2.Pmf()
collapsed['0 sick'] = (1-p)**2 * t**2
collapsed['1 sick'] = 2*p*(1-p) * s * t
collapsed['2 sick'] = p**2 * s**2

``````
``````

In [116]:

collapsed.Normalize()

``````
``````

Out[116]:

p**2*s**2 + 2*p*s*(-p + 1)*(q*t1 + t2*(-q + 1)) + (-p + 1)**2*(q*t1 + t2*(-q + 1))**2

``````
``````

In [117]:

collapsed['0 sick'].evalf(subs=d1)

``````
``````

Out[117]:

0.810000000000000

``````
``````

In [118]:

collapsed['0 sick'].evalf(subs=d2)

``````
``````

Out[118]:

0.810000000000000

``````
``````

In [ ]:

``````
``````

In [119]:

# Two-patient posterior over the number of sick patients, assuming t = t1.
pmf1 = thinkbayes2.Pmf()
pmf1['0 sick'] = (1-p)**2 * t1**2
pmf1['1 sick'] = 2*p*(1-p) * s * t1
pmf1['2 sick'] = p**2 * s**2
nc1 = pmf1.Normalize()

``````
``````

In [120]:

# Two-patient posterior over the number of sick patients, assuming t = t2.
pmf2 = thinkbayes2.Pmf()
pmf2['0 sick'] = (1-p)**2 * t2**2
pmf2['1 sick'] = 2*p*(1-p) * s * t2
pmf2['2 sick'] = p**2 * s**2
nc2 = pmf2.Normalize()

``````
``````

In [121]:

# Posterior over t after observing two positive tests; nc1 and nc2 are
# the likelihoods of the data under t1 and t2 respectively.
pmf_t = thinkbayes2.Pmf({t1:q, t2:1-q})
pmf_t[t1] *= nc1
pmf_t[t2] *= nc2
pmf_t.Normalize()

``````
``````

Out[121]:

1.0*q*(p**2*s**2 + 2*p*s*t1*(-p + 1) + t1**2*(-p + 1)**2) + (-1.0*q + 1.0)*(p**2*s**2 + 2*p*s*t2*(-p + 1) + t2**2*(-p + 1)**2)

``````
``````

In [122]:

# Probability that neither patient is sick, averaged over the posterior of t.
x = pmf_t[t1] * pmf1['0 sick'] + pmf_t[t2] * pmf2['0 sick']
x.simplify()

``````
``````

Out[122]:

1.0*(p - 1)**2*(q*t1**2 + t2**2*(-q + 1))/(q*(p**2*s**2 - 2*p*s*t1*(p - 1) + t1**2*(p - 1)**2) - (q - 1)*(p**2*s**2 - 2*p*s*t2*(p - 1) + t2**2*(p - 1)**2))

``````
``````

In [123]:

# Agrees with the gold-standard result for d1.
x.evalf(subs=d1), p0.evalf(subs=d1)

``````
``````

Out[123]:

(0.852895633323010, 0.852895633323010)

``````
``````

In [124]:

# Agrees with the gold-standard result for d2.
x.evalf(subs=d2), p0.evalf(subs=d2)

``````
``````

Out[124]:

(0.826831935836675, 0.826831935836675)

``````

`pmf_t` represents the distribution of `t`

``````

In [5]:

# Prior distribution of t: t1 with probability q, t2 with probability 1-q.
pmf_t = thinkbayes2.Pmf({t1:q, t2:1-q})
pmf_t.Mean().simplify()

``````
``````

Out[5]:

1.0*q*t1 - 1.0*t2*(q - 1)

``````

I'll consider two sets of parameters, `d1` and `d2`, which have the same mean value of `t`.

``````

In [6]:

# First parameter set; mean t is 0.5.
d1 = dict(q=0.5, p=0.1, s=0.5, t1=0.2, t2=0.8)
pmf_t.Mean().evalf(subs=d1)

``````
``````

Out[6]:

0.500000000000000

``````
``````

In [7]:

# Second parameter set; same mean t (0.5) via a different mixture.
d2 = dict(q=0.75, p=0.1, s=0.5, t1=0.4, t2=0.8)
pmf_t.Mean().evalf(subs=d2)

``````
``````

Out[7]:

0.500000000000000

``````

`prob` takes two numbers that represent odds in favor and returns the corresponding probability.

``````

In [8]:

def prob(yes, no):
    """Convert odds in favor to the corresponding probability.

    yes: number representing favorable odds
    no: number representing unfavorable odds

    returns: yes / (yes + no)
    """
    # NOTE: the body was flush-left in the exported source (invalid Python);
    # re-indented here without changing the computation.
    return yes / (yes + no)

``````

Scenario A

In the first scenario, there are two kinds of people in the world, or two kinds of tests, so there are four outcomes that yield positive tests: two true positives (a and e) and two false positives (c and g).

We can compute the probability of a true positive given a positive test:

``````

In [9]:

# P(sick | positive), marginalizing over both worlds: true positives (a, e)
# against false positives (c, g).
res = prob(a+e, c+g)
res.simplify()

``````
``````

Out[9]:

p*s/(-p*q*t1 + p*q*t2 + p*s - p*t2 + q*t1 - q*t2 + t2)

``````

In this scenario, the two parameter sets yield the same answer:

``````

In [10]:

res.evalf(subs=d1)

``````
``````

Out[10]:

0.100000000000000

``````
``````

In [11]:

res.evalf(subs=d2)

``````
``````

Out[11]:

0.100000000000000

``````

Scenario B

Now suppose instead of two kinds of people, or two kinds of tests, the distribution of `t` represents our uncertainty about `t`. That is, we are only considering one test, and we think the false positive rate is the same for everyone, but we don't know what it is.

In this scenario, we need to think about the sampling process that brings patients to see doctors. There are three possibilities:

B1. Only patients who test positive see a doctor.

B2. All patients see a doctor with equal probability, regardless of test results and regardless of whether they are sick or not.

B3. Patients are more or less likely to see a doctor, depending on the test results and whether they are sick or not.

Scenario B1

If patients only see a doctor after testing positive, the doctor doesn't learn anything about `t` just because a patient tests positive. In that case, the doctor should compute the conditional probabilities:

• `p1` is the probability the patient is sick given a positive test and `t1`
• `p2` is the probability the patient is sick given a positive test and `t2`

We can compute `p1` and `p2`, form `pmf_p`, and compute its mean:

``````

In [12]:

# p1: probability the patient is sick given a positive test and t = t1.
p1 = prob(a, c)
p1.simplify()

``````
``````

Out[12]:

p*s/(p*s - t1*(p - 1))

``````
``````

In [13]:

p1.evalf(subs=d1)

``````
``````

Out[13]:

0.217391304347826

``````
``````

In [14]:

# p2: probability the patient is sick given a positive test and t = t2.
p2 = prob(e, g)
p2.simplify()

``````
``````

Out[14]:

p*s/(p*s - t2*(p - 1))

``````
``````

In [15]:

p2.evalf(subs=d1)

``````
``````

Out[15]:

0.0649350649350649

``````
``````

In [16]:

# Equal prior weight on the two conditional probabilities.
pmf_p = thinkbayes2.Pmf([p1, p2])

``````
``````

In [17]:

pmf_p.Mean().simplify()

``````
``````

Out[17]:

0.5*p*s*(2*p*s - t1*(p - 1) - t2*(p - 1))/((p*s - t1*(p - 1))*(p*s - t2*(p - 1)))

``````
``````

In [18]:

pmf_p.Mean().evalf(subs=d1)

``````
``````

Out[18]:

0.141163184641446

``````
``````

In [19]:

p1.evalf(subs=d2), p2.evalf(subs=d2), pmf_p.Mean().evalf(subs=d2)

``````
``````

Out[19]:

(0.121951219512195, 0.0649350649350649, 0.0934431422236300)

``````

Scenario B2

If all patients see a doctor, the doctor can learn about `t` based on the number of positive and negative tests.

• The likelihood of a positive test given `t1` is `(a+c)/q`

• The likelihood of a positive test given `t2` is `(e+g)/(1-q)`

`update` takes a `pmf` and updates it with these likelihoods

``````

In [20]:

def update(pmf):
    """Update a Pmf over [p1, p2] with the likelihood of one positive test.

    pmf: thinkbayes2.Pmf mapping p1 and p2 to their probabilities

    returns: a new, normalized Pmf; the input is not modified
    """
    # NOTE: the body was flush-left in the exported source (invalid Python);
    # re-indented here without changing the computation.
    post = pmf.Copy()
    # likelihood of a positive test given t1 is (a+c)/q; given t2 it is (e+g)/(1-q)
    post[p1] *= (a + c) / q
    post[p2] *= (e + g) / (1-q)
    post.Normalize()
    return post

``````

`post` is what we should believe about `p` after seeing one patient with a positive test:

``````

In [21]:

# Posterior over p after seeing one patient with a positive test.
post = update(pmf_p)
post[p1].simplify()

``````
``````

Out[21]:

(-0.5*p*s + 0.5*t1*(p - 1))/(-1.0*p*s + 0.5*t1*(p - 1) + 0.5*t2*(p - 1))

``````
``````

In [22]:

post.Mean().simplify()

``````
``````

Out[22]:

-1.0*p*s/(-1.0*p*s + 0.5*t1*(p - 1) + 0.5*t2*(p - 1))

``````

When `q` is 0.5, the posterior mean is `p`:

``````

In [23]:

post.Mean().evalf(subs=d1)

``````
``````

Out[23]:

0.100000000000000

``````

But other distributions of `t` yield different values.

``````

In [24]:

post.Mean().evalf(subs=d2)

``````
``````

Out[24]:

0.0847457627118644

``````

Let's see what we get after seeing two patients

``````

In [25]:

# Update again for a second patient with a positive test.
post2 = update(post)

``````
``````

In [26]:

post2.Mean().simplify()

``````
``````

Out[26]:

1.0*p*s*(2*p*s - t1*(p - 1) - t2*(p - 1))/((p*s - t1*(p - 1))**2 + (p*s - t2*(p - 1))**2)

``````

Positive tests are more likely under `t2` than `t1`, so each positive test makes it more likely that `t=t2`. So the expected value of `p` converges on `p2`.

``````

In [27]:

post2.Mean().evalf(subs=d1)

``````
``````

Out[27]:

0.0774233508826262

``````
``````

In [28]:

post2.Mean().evalf(subs=d2)

``````
``````

Out[28]:

0.0775295663600526

``````
``````

In [29]:

# A third positive test shifts the posterior further toward t2.
post3 = update(post2)
post3.Mean().evalf(subs=d1)

``````
``````

Out[29]:

0.0688926818860678

``````
``````

In [30]:

post3.Mean().evalf(subs=d2)

``````
``````

Out[30]:

0.0724135699794844

``````
``````

In [ ]:

``````