This notebook contains code necessary to create the figures for the recently submitted paper.


In [1]:
%matplotlib inline
from cualid import create_ids
import numpy as np
from difflib import get_close_matches
import pandas as pd
import seaborn as sns
import uuid
import itertools

In [170]:
def mutate_id(id_, n_mutations=3):
    cols = np.random.choice(range(len(id_)), size=(n_mutations), replace=False)
    mutated_id = list(id_)
    for col in cols:
        mutation = np.random.choice(list(' 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!'))
        while mutated_id[col] == mutation:
            mutation = np.random.choice(list(' 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!'))
        mutated_id[col] = mutation
    return ''.join(mutated_id)

def mutate_ids(ids_, n_mutations):
    mutated_ids = {}
    for id_ in ids_:
        mutated_ids[id_] = mutate_id(id_, n_mutations)
    return mutated_ids
        

def get_errors(n_ids=10, id_length=8, n_mutations=2, threshold=.6):
    
    hrids = [e[1] for e in create_ids(n_ids, id_length)]
    mutated_ids = mutate_ids(hrids, n_mutations)
    false_negative = 0
    false_positive = 0
    false = 0
    for correct_id, mutated_id in mutated_ids.items():
        fixed_id = get_close_matches(mutated_id, mutated_ids.keys(), n=1, cutoff=threshold)
        if len(fixed_id) < 1:
            false_negative += 1
            false += 1
        else:
            if fixed_id[0] != correct_id:
                false_positive += 1
                false += 1
                print(fixed_id, correct_id, mutated_id)
        
    return false_positive/n_ids, false_negative/n_ids, false/n_ids

In [193]:
#function for parallel stuff
def create_df(n_id, id_length, n_mutation, threshold):
    false_positive, false_negative, false = get_errors(n_id, id_length, n_mutation, threshold)
    return pd.DataFrame({"#CualIDs With Transcription Errors":[n_id], "CualID Length":[id_length], 
                                 "#Transcription Errors":[n_mutation], "Threshold":[threshold], 
                                 "Fraction Innacurate Correction":[false_positive], 
                                 "Fraction Uncorrectable":[false_negative],
                                 "Fraction Innacurate Correction or Uncorrectable":[false]})

In [194]:
n_ids = [1, 10, 100, 1000]
id_lengths = [7, 8, 9, 10]
n_mutations = [1, 2, 3]
thresholds = [.7]
iterations = 20

a = [n_ids, id_lengths, n_mutations, thresholds]
arguments = np.array(list(itertools.product(*a))*iterations).T

n_ids = list(map(int, arguments[0]))
id_lengths = list(map(int, arguments[1]))
n_mutations = list(map(int, arguments[2]))
thresholds = arguments[3]

In [195]:
from IPython.parallel import Client
clients = Client(profile='cual-id-conda')
clients[:].execute("import pandas as pd")
clients[:].execute("import numpy as np")
clients[:].execute("from cualid import create_ids")
clients[:].execute("from difflib import get_close_matches")
clients[:]['get_errors'] = get_errors
clients[:]['mutate_ids'] = mutate_ids
clients[:]['mutate_id'] = mutate_id

In [196]:
df = pd.DataFrame(columns=["#CualIDs With Transcription Errors", "CualID Length", "#Transcription Errors", "Threshold", "Fraction Innacurate Correction", "Fraction Uncorrectable", "Fraction Innacurate Correction or Uncorrectable"])
h = clients[:].map(create_df, n_ids, id_lengths, n_mutations, thresholds)
for i,r in enumerate(h):
    df = df.append(r)

In [229]:
fig = sns.factorplot(x="#CualIDs With Transcription Errors", 
                     y="Fraction Innacurate Correction", 
                     hue="#Transcription Errors",
                     col="CualID Length", 
                     data=df, 
                     kind="bar");
fig.savefig('figure_2_false.pdf', dpi=300)


(a) The y-axis is the fraction of identifiers that were incorrectly identified by cual-id. This is the rate of false positives


In [208]:
fig = sns.factorplot(x="#CualIDs With Transcription Errors", 
                     y="Fraction Uncorrectable", 
                     hue="#Transcription Errors",
                     col="CualID Length", 
                     data=df, 
                     kind="bar");
fig.savefig('figure_2_false.pdf', dpi=300)


(b) The fraction of Identifiers that were rejected as being to dissimilar to any of the identifiers in the reference set to be resolvable


In [192]:
df_6 = df[df["Threshold"] == .7]
fig = sns.factorplot(x="NumberIDs", y="FalseNegative", hue="NumberMutations",
col="IDLength", data=df_6, kind="bar");
fig.savefig('figure_2_falseNegative.pdf', dpi=300)


(c) The fraction of incorrect identifiers that were either incorrectly associated with an identifier, or were unresolvable


In [ ]: