In [12]:
# Setup ####

# Make plots show in the notebook
%matplotlib inline

# Libraries (stdlib first, then third-party)
import functools
import math
import random

import numpy as np
import pandas as pd
import scipy as sp
import scipy.optimize  # `import scipy` alone does not guarantee sp.optimize...
import scipy.stats     # ...or sp.stats; import the submodules explicitly
import sklearn.metrics

# NOTE(review): wildcard import kept because ggplot()/aes()/geom_density()/
# facet_wrap() are used unqualified below; avoid `import *` in new code.
from ggplot import *

# Overall simulation settings
n_data_sets = 100    # number of mock data sets to generate (one per answer-key entry)
n_data_points = 100  # observations drawn per mock data set

# Error family settings
def normal_location():
    """Draw the location hyper-parameter for a normal data set: one N(0, 1) sample (length-1 array)."""
    return np.random.normal(loc=0, scale=1, size=1)

def normal_scale():
    """Draw the scale hyper-parameter for a normal data set: one LogNormal(0, 1) sample, always positive."""
    return np.random.lognormal(mean=0, sigma=1, size=1)

def lognormal_location():
    """Draw the location (log-mean) hyper-parameter for a lognormal data set: one N(0, 1) sample."""
    return np.random.normal(loc=0, scale=1, size=1)

def lognormal_scale():
    """Draw the scale (log-sigma) hyper-parameter for a lognormal data set: one LogNormal(0, 1) sample."""
    return np.random.lognormal(mean=0, sigma=1, size=1)

def exponential_location():
    """Draw the location (support shift) hyper-parameter for an exponential data set: one N(0, 1) sample."""
    return np.random.normal(loc=0, scale=1, size=1)

def exponential_scale():
    """Draw the scale hyper-parameter for an exponential data set: one LogNormal(0, 1) sample, always positive."""
    return np.random.lognormal(mean=0, sigma=1, size=1)

In [7]:
# Generating mock data for classification ####

# List of possible families to classify between
family_choices = ["normal", "lognormal", "exponential"]

# Generating answer key: one true family label per mock data set.
# NOTE(review): random.choice is unseeded, so the key (and everything
# downstream) changes on every run — consider random.seed(...) in the
# settings cell for a reproducible analysis.
answer_key = [random.choice(family_choices) for i in range(n_data_sets)]

print(answer_key)

# Generating mock data sets
def error_family_generator(answer_key):
    """Generate one mock data set per entry of `answer_key`.

    Parameters
    ----------
    answer_key : sequence of str
        Family labels, each one of "normal", "lognormal" or "exponential".

    Returns
    -------
    dict with two entries:
        "metadata" : DataFrame with columns id, family, location, scale, method
                     (location/scale are the length-1 arrays drawn by the
                     hyper-parameter generators).
        "data"     : DataFrame with one column per data set, n_data_points rows.

    Raises
    ------
    ValueError if a family label is not recognized.
    """
    def error_family_machine(family):
        # Draw (location, scale) hyper-parameters for this family, then
        # sample n_data_points observations from the corresponding distribution.
        if family == "normal":
            location = normal_location()
            scale = normal_scale()
            data = np.random.normal(location, scale, n_data_points)
        elif family == "lognormal":
            location = lognormal_location()
            scale = lognormal_scale()
            data = np.random.lognormal(location, scale, n_data_points)
        elif family == "exponential":
            location = exponential_location()
            scale = exponential_scale()
            # expon.rvs positional arguments are (loc, scale, size)
            data = sp.stats.expon.rvs(location, scale, n_data_points)
        else:
            raise ValueError("Family type '%s' invalid." % (family,))

        return (location, scale, data)

    data_info = [error_family_machine(family) for family in answer_key]

    meta_data = pd.DataFrame({"id": list(range(len(answer_key))),
                              "family": answer_key,
                              "location": [info[0] for info in data_info],
                              "scale": [info[1] for info in data_info],
                              "method": "generative"
                             })

    # Data sets are arranged by column
    data_sets = pd.DataFrame([info[2] for info in data_info]).transpose()

    return {"metadata": meta_data, "data": data_sets}

# Draw the mock data sets matching the answer key.
mock_data = error_family_generator(answer_key)

# Metadata: true family plus the generative location/scale of each data set.
print(mock_data["metadata"])
#print(mock_data["data"])

# Density of every mock data set, one facet per column
# ("variable"/"value" are the column names produced by pd.melt).
ggplot(pd.melt(mock_data["data"]), aes(x="value")) + geom_density() + facet_wrap("variable")


['normal', 'normal', 'normal', 'lognormal', 'normal', 'exponential', 'exponential', 'lognormal', 'exponential', 'lognormal', 'normal', 'normal', 'lognormal', 'exponential', 'normal', 'lognormal', 'lognormal', 'lognormal', 'lognormal', 'lognormal', 'exponential', 'normal', 'exponential', 'lognormal', 'lognormal', 'lognormal', 'lognormal', 'lognormal', 'lognormal', 'exponential', 'exponential', 'normal', 'normal', 'normal', 'exponential', 'normal', 'normal', 'exponential', 'exponential', 'normal', 'normal', 'exponential', 'exponential', 'exponential', 'exponential', 'exponential', 'lognormal', 'exponential', 'exponential', 'lognormal', 'lognormal', 'lognormal', 'exponential', 'exponential', 'exponential', 'exponential', 'normal', 'exponential', 'lognormal', 'lognormal', 'normal', 'lognormal', 'normal', 'normal', 'lognormal', 'normal', 'normal', 'exponential', 'exponential', 'lognormal', 'exponential', 'exponential', 'exponential', 'exponential', 'exponential', 'exponential', 'lognormal', 'exponential', 'lognormal', 'normal', 'normal', 'normal', 'lognormal', 'lognormal', 'lognormal', 'lognormal', 'normal', 'exponential', 'normal', 'lognormal', 'exponential', 'normal', 'normal', 'lognormal', 'lognormal', 'normal', 'exponential', 'normal', 'exponential', 'lognormal']
         family  id            location      method             scale
0        normal   0   [-0.660943715855]  generative   [1.74236284348]
1        normal   1     [1.55560807212]  generative  [0.266176655609]
2        normal   2    [-1.93984862927]  generative   [1.05487902084]
3     lognormal   3    [0.577431496207]  generative  [0.552864887969]
4        normal   4   [-0.162647304169]  generative   [3.04475745712]
5   exponential   5   [-0.933116750496]  generative  [0.329501294119]
6   exponential   6   [-0.697883186664]  generative   [1.57073155816]
7     lognormal   7    [0.531308492789]  generative   [2.46540426987]
8   exponential   8   [-0.232392735436]  generative  [0.828558916577]
9     lognormal   9   [-0.381941974007]  generative   [1.02560212389]
10       normal  10    [0.532330060031]  generative  [0.757774511769]
11       normal  11   [-0.883939581888]  generative   [5.53458848392]
12    lognormal  12   [-0.231716920967]  generative  [0.329961918117]
13  exponential  13    [0.229144455791]  generative   [0.37628707902]
14       normal  14    [-1.73750277064]  generative   [1.00253185515]
15    lognormal  15    [0.513464168387]  generative   [1.42044936496]
16    lognormal  16    [-1.02060875955]  generative  [0.941538655899]
17    lognormal  17     [0.19796266771]  generative   [1.41181053614]
18    lognormal  18    [0.585623286825]  generative   [0.60755438733]
19    lognormal  19     [-2.8037957623]  generative    [1.4518031997]
20  exponential  20    [-1.64767329239]  generative   [1.17775257792]
21       normal  21    [0.325309099209]  generative  [0.745354373622]
22  exponential  22    [0.401658100712]  generative  [0.408748118312]
23    lognormal  23   [-0.464085178446]  generative    [0.4215005818]
24    lognormal  24    [0.567306967605]  generative  [0.186976259282]
25    lognormal  25     [0.19448684821]  generative  [0.460690503788]
26    lognormal  26    [0.694395962432]  generative   [3.04839822781]
27    lognormal  27     [1.53011092256]  generative   [1.45320752546]
28    lognormal  28   [-0.905226586226]  generative   [1.01922673199]
29  exponential  29    [-0.34286942839]  generative   [2.38131326537]
..          ...  ..                 ...         ...               ...
70  exponential  70   [-0.735209296774]  generative    [4.1984594568]
71  exponential  71     [1.97808285286]  generative   [1.13152643975]
72  exponential  72    [0.996709156736]  generative  [0.964559662811]
73  exponential  73     [1.15464983986]  generative   [1.32685164119]
74  exponential  74    [0.709271128556]  generative   [0.90079792127]
75  exponential  75   [-0.671998854716]  generative  [0.397046857816]
76    lognormal  76    [0.683041783296]  generative  [0.473283141874]
77  exponential  77   [-0.765617006158]  generative  [0.978799512964]
78    lognormal  78      [3.0087749541]  generative   [1.00285337528]
79       normal  79    [-1.24763148321]  generative  [0.468488324653]
80       normal  80    [-2.71670731917]  generative   [2.49955873924]
81       normal  81   [0.0882842706406]  generative   [7.48805300643]
82    lognormal  82      [1.8284498181]  generative  [0.788971854489]
83    lognormal  83    [-1.39517441485]  generative   [2.81828503162]
84    lognormal  84    [-1.43790301703]  generative   [1.89467416164]
85    lognormal  85    [-1.28145441346]  generative  [0.439513889363]
86       normal  86  [-0.0981422653932]  generative  [0.960050233276]
87  exponential  87    [-1.25386217769]  generative   [4.37449437074]
88       normal  88      [1.4864172307]  generative    [1.8439622792]
89    lognormal  89    [0.354027178203]  generative  [0.904040916274]
90  exponential  90    [0.485759723643]  generative   [1.29918480564]
91       normal  91     [1.29254800535]  generative   [1.37203420068]
92       normal  92     [2.22922948642]  generative  [0.474670992351]
93    lognormal  93   [-0.380158350229]  generative   [5.05807586407]
94    lognormal  94   [0.0417908442741]  generative   [1.32236664267]
95       normal  95   [-0.998123806669]  generative  [0.964948002263]
96  exponential  96    [0.566851118469]  generative  [0.438331874682]
97       normal  97     [1.08759892321]  generative  [0.858908786454]
98  exponential  98     [-1.2796046021]  generative   [1.34152937065]
99    lognormal  99     [1.51580799589]  generative  [0.666779311459]

[100 rows x 5 columns]
/usr/local/lib/python3.4/dist-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
Out[7]:
<ggplot: (-891030896)>

In [8]:
# Computing likelihoods ####

def normal_llh(data, loc=0, scale=1):
    """Accurately summed log-likelihood of `data` under Normal(loc, scale)."""
    pointwise = sp.stats.norm.logpdf(data, loc=loc, scale=scale)
    return math.fsum(pointwise)

def lognormal_llh(data, loc=0, scale=1):
    """Accurately summed log-likelihood of `data` under scipy's lognorm.

    Parameter mapping (deliberate, mirrors the optimizer's 2-vector): `loc`
    is scipy's support shift, while this function's `scale` is passed as the
    lognormal *shape* parameter `s` — not scipy's `scale` argument.
    """
    pointwise = sp.stats.lognorm.logpdf(data, loc=loc, s=scale)
    return math.fsum(pointwise)

def exponential_llh(data, loc=0, scale=1):
    """Accurately summed log-likelihood of `data` under Exponential(loc shift, scale)."""
    pointwise = sp.stats.expon.logpdf(data, loc=loc, scale=scale)
    return math.fsum(pointwise)


def llh_maximizer(data):
    """Fit every candidate family to `data` by maximum likelihood.

    For each family in the module-level `family_choices`, minimizes the
    negative log-likelihood over (location, scale) with Nelder-Mead and
    records the best parameters.

    Parameters
    ----------
    data : 1-D array-like of observations.

    Returns
    -------
    pandas.DataFrame with one row per family and columns
    family, llh, location, scale, convergence.

    Raises
    ------
    ValueError if an unrecognized family label is encountered.
    """
    def opt_llh(family):
        # Lognormal support is (loc, inf): any non-positive observation makes
        # the likelihood identically zero, so short-circuit with -inf.
        if family == "lognormal" and any(data <= 0):
            return {"family": family,
                    "llh": -float("inf"),
                    "location": float("NaN"),
                    "scale": float("NaN"),
                    "convergence": float("NaN")}

        # Objective: negative log-likelihood as a function of x = (location, scale).
        # (The original wrapped functools.partial in a lambda — a direct call
        # is equivalent and clearer.)
        if family == "normal":
            neg_llh = lambda x: -normal_llh(data, x[0], x[1])
            x0 = (0, 1)
        elif family == "lognormal":
            neg_llh = lambda x: -lognormal_llh(data, x[0], x[1])
            x0 = (0, 1)
        elif family == "exponential":
            # Start loc strictly below the data so the initial likelihood is finite.
            neg_llh = lambda x: -exponential_llh(data, x[0], x[1])
            x0 = (min(data) - 1, 1)
        else:
            raise ValueError("Family type '%s' invalid." % (family,))

        # Find maximum likelihood estimates of the parameters.
        optim_res = sp.optimize.minimize(neg_llh, x0, method="Nelder-Mead")

        return {"family": family,
                "llh": -optim_res.fun,
                "location": optim_res.x[0],
                "scale": optim_res.x[1],
                "convergence": optim_res.success}

    return pd.DataFrame([opt_llh(family) for family in family_choices])

# Fit all three families to every mock data set (one column per set).
llh_raw = [llh_maximizer(mock_data["data"][col]) for col in mock_data["data"].columns]

# Tag each per-data-set result frame with its data-set id.
for idx, frame in enumerate(llh_raw):
    frame["id"] = idx

# DataFrame.append was deprecated and removed in pandas 2.0, and reducing
# pairwise is quadratic — a single concat is equivalent and linear.
lh_results = pd.concat(llh_raw)
print(lh_results)


   convergence       family         llh    location        scale  id
0         True       normal -200.271654   -0.528271     1.792781   0
1          NaN    lognormal        -inf         NaN          NaN   0
2         True  exponential -250.587357   -5.036346     4.508122   0
0         True       normal   -7.340872    1.620930     0.260401   1
1         True    lognormal   -7.779550    0.583866     0.260588   1
2         True  exponential  -46.538563    1.035161     0.570819   1
0        False       normal -150.827713   -2.009043     1.103498   2
1          NaN    lognormal        -inf         NaN          NaN   2
2         True  exponential -180.224612   -4.247130     2.229051   2
0         True       normal -161.325724    2.099177     1.214500   3
1         True    lognormal -148.181234    0.411234     0.811636   3
2         True  exponential -159.634794    0.470376     2.698919   3
0         True       normal -247.241627    0.156680     2.867609   4
1          NaN    lognormal        -inf         NaN          NaN   4
2         True  exponential -303.947318   -7.529911     7.686592   4
0         True       normal  -13.745847   -0.615550     0.277649   5
1          NaN    lognormal        -inf         NaN          NaN   5
2         True  exponential   15.153501   -0.931728     0.316181   5
0         True       normal -180.178100    0.624066     1.466419   6
1          NaN    lognormal        -inf         NaN          NaN   6
2         True  exponential -129.316269   -0.697409     1.573773   6
0         True       normal -680.373986   34.183234   218.066744   7
1         True    lognormal -238.606071   -0.000314     2.572291   7
2         True  exponential -453.173077    0.000124    34.183018   7
0         True       normal -124.179677    0.601135     0.837648   8
1          NaN    lognormal        -inf         NaN          NaN   8
2         True  exponential  -99.873224   -0.203519     1.683121   8
0         True       normal -175.965749    1.209793     1.405972   9
1         True    lognormal -119.267366   -0.066512     1.028886   9
2         True  exponential -115.153668    0.046151     1.163605   9
..         ...          ...         ...         ...          ...  ..
0         True       normal -143.855508    1.687162     1.019825  90
1         True    lognormal -121.701738    0.325454     0.801097  90
2         True  exponential -117.815815    0.492149     1.195007  90
0         True       normal -161.917907    1.385110     1.221709  91
1          NaN    lognormal        -inf         NaN          NaN  91
2         True  exponential -215.130013   -1.777174     3.162279  91
0         True       normal  -69.574769    2.213043     0.485169  92
1         True    lognormal  -81.271217    0.996741     0.494755  92
2         True  exponential -108.093874    1.128746     1.080676  92
0         True       normal -951.827699  987.960066  3292.289321  93
1         True    lognormal -227.680253    0.000004     5.347747  93
2         True  exponential -789.564225    0.000004   987.965950  93
0         True       normal -302.468628    2.498733     4.981575  94
1         True    lognormal -171.025753   -0.008279     1.277867  94
2         True  exponential -190.839395    0.018430     2.478415  94
0         True       normal -147.892938   -0.924836     1.061858  95
1          NaN    lognormal        -inf         NaN          NaN  95
2         True  exponential -201.161922   -3.674889     2.747220  95
0         True       normal  -73.641013    1.030482     0.505294  96
1         True    lognormal  -45.599090   -0.029167     0.392781  96
2         True  exponential  -22.954783    0.567669     0.462908  96
0         True       normal -111.892670    1.061055     0.740790  97
1          NaN    lognormal        -inf         NaN          NaN  97
2         True  exponential -170.869383   -0.968033     1.936997  97
0         True       normal -209.948174    0.492762     1.974980  98
1          NaN    lognormal        -inf         NaN          NaN  98
2         True  exponential -153.643064   -1.217131     1.709949  98
0         True       normal -262.680026    5.040160     3.346335  99
1         True    lognormal -278.274135    1.008463     1.481208  99
2         True  exponential -239.004856    1.025474     4.066888  99

[300 rows x 6 columns]

In [29]:
# Likelihood based model selection ####

# Compute likelihood weights in analog to AIC / BIC weights
# (equivalent when the number of parameters is equal across models).
lh_results["lh_weights"] = float("NaN")

for data_set_id in set(lh_results["id"]):
    row_mask = lh_results["id"] == data_set_id
    llh_scores = lh_results["llh"][row_mask]

    # Delta log-likelihood relative to the best family for this data set
    delta_llh = llh_scores - max(llh_scores)

    # Back-transform to the likelihood scale (math.exp(-inf) == 0.0)
    delta_lh = [math.exp(d) for d in delta_llh]

    # Simple linear weights that sum to one within each data set
    total = sum(delta_lh)
    lh_results.loc[row_mask, "lh_weights"] = [d / total for d in delta_lh]

#print(lh_results [["family", "id", "lh_weights"]])

# Extract winner's information
def extract_lh_winner(i, results=None):
    """Return the best-weighted family and its fitted parameters for data set `i`.

    Parameters
    ----------
    i : data-set id to look up in the "id" column.
    results : pandas.DataFrame, optional
        Frame with columns "id", "family", "location", "scale", "lh_weights".
        Defaults to the module-level `lh_results` (backward compatible).

    Returns
    -------
    dict with keys id, family, location, scale, method.
    """
    if results is None:
        results = lh_results
    lh_subset = results.loc[results["id"] == i, :]
    # idxmax returns the row *label* of the largest weight. np.argmax on a
    # Series changed from label to position across pandas versions and only
    # worked here because labels coincided with positions — avoid it.
    winner = lh_subset["lh_weights"].idxmax()
    return {"id": i,
            "family": lh_subset["family"][winner],
            "location": lh_subset["location"][winner],
            "scale": lh_subset["scale"][winner],
            "method": "likelihood"
           }

# Align predictions with the answer key: metadata rows are ordered by id, so
# iterate ids in sorted order instead of relying on set iteration order.
lh_winners = pd.DataFrame([extract_lh_winner(i) for i in sorted(set(lh_results["id"]))])

lh_confusion = pd.DataFrame(sklearn.metrics.confusion_matrix(y_true=mock_data["metadata"]["family"],
                                                             y_pred=lh_winners["family"]))
# sklearn orders classes by sorted label, which matches these sorted names.
lh_confusion.columns = sorted(set(mock_data["metadata"]["family"] + ".Predicted"))
lh_confusion.index = sorted(set(mock_data["metadata"]["family"] + ".True"))


print(lh_confusion)


                  exponential.Predicted  lognormal.Predicted  normal.Predicted
exponential.True                     35                    0                 1
lognormal.True                       12                   20                 2
normal.True                           0                    0                30