In [1]:
import numpy as np
import pandas as pd
print pd.__version__


0.19.1

In [2]:
data_dir = "data/pdaid/"
!ls $data_dir


bacterial.csv          neural_predictions.csv viral.csv
fungal.csv             parasitic.csv

In [3]:
amino_acids = {"A": "Alanine",
            "C": "Cysteine",
            "D": "Aspartic Acid",
            "E": "Glutamic Acid",
            "F": "Phenylalanine",
            "G": "Glycine",
            "H": "Histidine",
            "I": "Isoleucine",
            "K": "Lysine",
            "L": "Leucine",
            "M": "Methionine",
            "N": "Asparagine",
            "P": "Proline",
            "Q": "Glutamine",
            "R": "Arginine",
            "S": "Serine",
            "T": "Threonine",
            "U": "Selenocysteine",
            "V": "Valine",
            "W": "Tryptophan",
            "Y": "Tyrosine"}

In [4]:
seq_len = 9

def data_cleanup(group, group_name):
    group["disease"] = group_name
    group.columns = [
        'type', 'epitope_sequence', 'start', 'end', 'chebi', 'syn', 'protein',
        'protein_id', 'organism', 'oid', 'comments', 'disease'
    ]
    # Drop entries using '+' modification notation, e.g. "PLNISLGDVVLY + DEAM(N3)"
    # (these still need looking into)
    group = group[group.epitope_sequence.str.find('+') == -1]
    # Drop epitopes shorter than 'seq_len' AAs
    group = group[group.epitope_sequence.str.len() >= seq_len]
    # Keep the last 'seq_len' AAs (the C-terminus)
    group['epitope_sequence'] = group.epitope_sequence.str[-seq_len:]
    group = group[["epitope_sequence", "disease"]]
    # Drop sequences containing the ambiguous residue code 'Z'
    group = group[group.epitope_sequence.str.find("Z") == -1]
    # Drop every copy of any duplicated sequence (keep=False)
    group = group[~group.duplicated(subset=["epitope_sequence"], keep=False)]
    # Shuffle and cap each group at 3400 samples
    group = group.sample(frac=1).head(3400)
    return group

disease_groups = ["bacterial", "parasitic", "viral", "fungal"]
disease_col = "disease"
bacterial = data_cleanup(pd.DataFrame.from_csv(data_dir + "bacterial.csv"), "bacterial")
fungal = data_cleanup(pd.DataFrame.from_csv(data_dir + "fungal.csv"), "fungal")
parasitic = data_cleanup(pd.DataFrame.from_csv(data_dir + "parasitic.csv"), "parasitic")
viral = data_cleanup(pd.DataFrame.from_csv(data_dir + "viral.csv"), "viral")

print "Input -- Bacterial: %d, Fungal: %d, Parasitic: %d, Viral: %d" % \
    (bacterial.shape[0], fungal.shape[0], parasitic.shape[0], viral.shape[0])

# Consider putting fungal back in? (only 39 samples)
all_samples = bacterial\
    .append(parasitic, ignore_index=True)\
    .append(viral, ignore_index=True)

print "All filtered samples: %d" % (all_samples.shape[0])


Input -- Bacterial: 2687, Fungal: 21, Parasitic: 2318, Viral: 3400
All filtered samples: 8405
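
The cleanup above keeps only the nine C-terminal residues of each epitope and drops every copy of any duplicated sequence. A minimal sketch of the length filter and slicing on made-up sequences (illustration only, not data from the corpus):

    import pandas as pd
    toy = pd.Series(["MKTAYIAKQRQISFVK", "GDVVLYAAA", "SHORT"])
    toy = toy[toy.str.len() >= 9]  # drops "SHORT"
    print toy.str[-9:].tolist()    # ['KQRQISFVK', 'GDVVLYAAA']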

In [5]:
from aa_props import seq_to_aa_props
from sklearn.preprocessing import MinMaxScaler
import random

y = []
X = []

for (rowid, seq, disease) in all_samples.itertuples():
    features = seq_to_aa_props(seq.upper())
    X.append(features)
    y.append(disease)

# Scale the data
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X)

ll_training_filename = "/tmp/pdaid_training.ll"
ll_testing_filename = "/tmp/pdaid_testing.ll"

with open(ll_training_filename, 'w') as ll_train:
    with open(ll_testing_filename, 'w') as ll_test:
        for (features, disease) in zip(scaled_X, y):
            # hold out ~25% of samples for testing
            if random.random() >= 0.75:
                ll_file = ll_test
            else:
                ll_file = ll_train
            ll_file.write("%d" % (disease_groups.index(disease)))
            for (i, feature) in enumerate(features):
                ll_file.write(" %d:%.12f" % (i+1, feature))
            ll_file.write("\n")

In [6]:
# Find the best C parameter for the LR model
LR_findC = !train -C -s 0 "$ll_training_filename"
print "\n".join(LR_findC)


Doing parameter search with 5-fold cross validation.
log2c= -21.00	rate=40.2723
log2c= -20.00	rate=40.2723
log2c= -19.00	rate=40.2723
log2c= -18.00	rate=40.2723
log2c= -17.00	rate=40.2723
log2c= -16.00	rate=40.2723
log2c= -15.00	rate=40.2723
log2c= -14.00	rate=41.6337
log2c= -13.00	rate=44.3565
log2c= -12.00	rate=47.2693
log2c= -11.00	rate=49.6438
log2c= -10.00	rate=51.7651
log2c=  -9.00	rate=52.4616
log2c=  -8.00	rate=53.5856
log2c=  -7.00	rate=54.2504
log2c=  -6.00	rate=54.6145
log2c=  -5.00	rate=55.0103
log2c=  -4.00	rate=55.1528
log2c=  -3.00	rate=55.0261
log2c=  -2.00	rate=54.9786
log2c=  -1.00	rate=55.0894
log2c=   0.00	rate=55.0894
log2c=   1.00	rate=55.0894
log2c=   2.00	rate=55.0894
Best C = 0.0625  CV accuracy = 55.1528%

In [7]:
# Find the best C parameter for the SVC model
SVC_findC = !train -C -s 2 "$ll_training_filename"
print "\n".join(SVC_findC)


Doing parameter search with 5-fold cross validation.
log2c= -22.00	rate=40.2723
log2c= -21.00	rate=40.2723
log2c= -20.00	rate=40.2723
log2c= -19.00	rate=40.2723
log2c= -18.00	rate=40.2723
log2c= -17.00	rate=41.9503
log2c= -16.00	rate=44.8314
log2c= -15.00	rate=47.76
log2c= -14.00	rate=50.277
log2c= -13.00	rate=52.05
log2c= -12.00	rate=52.6991
log2c= -11.00	rate=53.6489
log2c= -10.00	rate=54.5037
log2c=  -9.00	rate=54.7253
log2c=  -8.00	rate=54.9628
log2c=  -7.00	rate=55.0578
log2c=  -6.00	rate=55.042
log2c=  -5.00	rate=54.9786
log2c=  -4.00	rate=54.9786
log2c=  -3.00	rate=55.0736
log2c=  -2.00	rate=55.0736
log2c=  -1.00	rate=54.9628
log2c=   0.00	rate=54.947
log2c=   1.00	rate=54.947
log2c=   2.00	rate=54.947
log2c=   3.00	rate=54.947
Best C = 0.125  CV accuracy = 55.0736%
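
Since the search sweeps C over a log2 grid, the two "Best C" values are simply powers of two: 2^-4 for logistic regression and 2^-3 for the SVC. A quick check:

    print 2.0 ** -4, 2.0 ** -3  # 0.0625 0.125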

In [8]:
def create_model_and_predictions(search_output, solver, model_file, predictions_file,
                                 probability_estimates, train_file, test_file):
    # Grab the best C from the last line of the parameter-search output
    bestC = float(search_output[-1].split(" ")[3])
    cmd = "train -c %f -s %d '%s' '%s'" % (bestC, solver, train_file, model_file)
    print "> " + cmd
    createModel = !{cmd}
    print "\n".join(createModel)
    cmd = "predict -b %d '%s' '%s' '%s'" % (probability_estimates, test_file, model_file, predictions_file)
    print "> " + cmd
    predict = !{cmd}
    print "\n".join(predict)
    print "\n"

LR_model_file="/tmp/pdaid_lr.model"
SVC_model_file="/tmp/pdaid_svc.model"
LR_predictions_file = "/tmp/lr.predictions"
SVC_predictions_file = "/tmp/svc.predictions"

create_model_and_predictions(LR_findC, 0, LR_model_file, LR_predictions_file, 1, ll_training_filename, ll_testing_filename)
create_model_and_predictions(SVC_findC, 2, SVC_model_file, SVC_predictions_file, 0, ll_training_filename, ll_testing_filename)


> train -c 0.062500 -s 0 '/tmp/pdaid_training.ll' '/tmp/pdaid_lr.model'
iter  1 act 2.481e+01 pre 2.432e+01 delta 7.189e-02 f 2.737e+02 |g| 6.906e+02 CG   1
cg reaches trust region boundary
iter  2 act 2.669e+00 pre 2.647e+00 delta 1.537e-01 f 2.488e+02 |g| 5.157e+01 CG   2
cg reaches trust region boundary
iter  3 act 4.112e+00 pre 4.120e+00 delta 2.515e-01 f 2.462e+02 |g| 7.126e+01 CG   2
cg reaches trust region boundary
iter  4 act 2.848e+00 pre 2.767e+00 delta 3.275e-01 f 2.421e+02 |g| 2.357e+01 CG   2
cg reaches trust region boundary
iter  5 act 2.550e+00 pre 2.594e+00 delta 3.871e-01 f 2.392e+02 |g| 1.395e+01 CG   3
cg reaches trust region boundary
iter  6 act 2.590e+00 pre 2.549e+00 delta 5.449e-01 f 2.367e+02 |g| 1.566e+01 CG   3
cg reaches trust region boundary
iter  7 act 1.849e+00 pre 1.842e+00 delta 7.357e-01 f 2.341e+02 |g| 8.446e+00 CG   3
cg reaches trust region boundary
iter  8 act 1.048e+00 pre 1.035e+00 delta 7.491e-01 f 2.322e+02 |g| 6.118e+00 CG   5
iter  9 act 3.213e-01 pre 3.205e-01 delta 7.491e-01 f 2.312e+02 |g| 3.602e+00 CG  13
iter  1 act 7.576e+01 pre 6.946e+01 delta 5.850e-01 f 2.737e+02 |g| 9.251e+02 CG   2
cg reaches trust region boundary
iter  2 act 8.996e+00 pre 8.188e+00 delta 6.534e-01 f 1.979e+02 |g| 1.312e+02 CG   4
cg reaches trust region boundary
iter  3 act 3.386e+00 pre 3.334e+00 delta 7.383e-01 f 1.889e+02 |g| 3.396e+01 CG   3
cg reaches trust region boundary
iter  4 act 1.719e+00 pre 1.688e+00 delta 8.010e-01 f 1.855e+02 |g| 9.804e+00 CG   4
iter  5 act 6.915e-01 pre 6.850e-01 delta 8.010e-01 f 1.838e+02 |g| 5.376e+00 CG  11
iter  1 act 2.664e+01 pre 2.480e+01 delta 5.124e-01 f 2.737e+02 |g| 3.677e+02 CG   2
cg reaches trust region boundary
iter  2 act 7.489e+00 pre 7.253e+00 delta 6.109e-01 f 2.470e+02 |g| 4.473e+01 CG   3
cg reaches trust region boundary
iter  3 act 4.145e+00 pre 4.206e+00 delta 6.879e-01 f 2.395e+02 |g| 1.523e+01 CG   3
cg reaches trust region boundary
iter  4 act 2.793e+00 pre 2.722e+00 delta 8.318e-01 f 2.354e+02 |g| 1.233e+01 CG   4
cg reaches trust region boundary
iter  5 act 1.009e+00 pre 9.996e-01 delta 8.431e-01 f 2.326e+02 |g| 5.582e+00 CG  11
> predict -b 1 '/tmp/pdaid_testing.ll' '/tmp/pdaid_lr.model' '/tmp/lr.predictions'
Accuracy = 53.9272% (1126/2088)


> train -c 0.125000 -s 2 '/tmp/pdaid_training.ll' '/tmp/pdaid_svc.model'
iter  1 act 9.731e+01 pre 9.731e+01 delta 3.522e-02 f 7.896e+02 |g| 5.525e+03 CG   1
cg reaches trust region boundary
iter  2 act 1.073e+01 pre 1.073e+01 delta 1.072e-01 f 6.923e+02 |g| 3.645e+02 CG   1
cg reaches trust region boundary
iter  3 act 1.569e+01 pre 1.569e+01 delta 1.131e-01 f 6.816e+02 |g| 4.405e+02 CG   2
cg reaches trust region boundary
iter  4 act 7.712e+00 pre 7.822e+00 delta 1.213e-01 f 6.659e+02 |g| 1.280e+02 CG   2
cg reaches trust region boundary
iter  5 act 9.930e+00 pre 9.912e+00 delta 1.742e-01 f 6.582e+02 |g| 2.131e+02 CG   3
cg reaches trust region boundary
iter  6 act 8.322e+00 pre 8.335e+00 delta 2.789e-01 f 6.483e+02 |g| 7.612e+01 CG   3
cg reaches trust region boundary
iter  7 act 7.138e+00 pre 7.129e+00 delta 3.223e-01 f 6.399e+02 |g| 7.065e+01 CG   3
cg reaches trust region boundary
iter  8 act 5.803e+00 pre 5.744e+00 delta 3.583e-01 f 6.328e+02 |g| 7.602e+01 CG   5
cg reaches trust region boundary
iter  9 act 3.524e+00 pre 3.507e+00 delta 3.676e-01 f 6.270e+02 |g| 4.802e+01 CG   6
cg reaches trust region boundary
iter 10 act 2.883e+00 pre 2.883e+00 delta 4.147e-01 f 6.235e+02 |g| 3.039e+01 CG   8
cg reaches trust region boundary
iter 11 act 1.386e+00 pre 1.384e+00 delta 4.220e-01 f 6.206e+02 |g| 5.599e+01 CG  11
iter  1 act 2.795e+02 pre 2.784e+02 delta 2.683e-01 f 7.896e+02 |g| 7.401e+03 CG   2
cg reaches trust region boundary
iter  2 act 1.934e+01 pre 1.843e+01 delta 2.925e-01 f 5.101e+02 |g| 1.859e+02 CG   3
cg reaches trust region boundary
iter  3 act 1.066e+01 pre 1.093e+01 delta 3.472e-01 f 4.908e+02 |g| 1.213e+02 CG   4
cg reaches trust region boundary
iter  4 act 5.809e+00 pre 5.703e+00 delta 3.867e-01 f 4.801e+02 |g| 5.920e+01 CG   4
cg reaches trust region boundary
iter  5 act 3.984e+00 pre 3.934e+00 delta 4.022e-01 f 4.743e+02 |g| 4.702e+01 CG   7
cg reaches trust region boundary
iter  6 act 2.579e+00 pre 2.572e+00 delta 4.165e-01 f 4.703e+02 |g| 3.355e+01 CG   8
cg reaches trust region boundary
iter  7 act 1.329e+00 pre 1.334e+00 delta 4.165e-01 f 4.677e+02 |g| 2.973e+01 CG  12
iter  1 act 1.003e+02 pre 9.964e+01 delta 2.401e-01 f 7.896e+02 |g| 2.942e+03 CG   2
cg reaches trust region boundary
iter  2 act 2.771e+01 pre 2.774e+01 delta 2.923e-01 f 6.894e+02 |g| 2.191e+02 CG   3
cg reaches trust region boundary
iter  3 act 1.522e+01 pre 1.541e+01 delta 3.166e-01 f 6.616e+02 |g| 1.098e+02 CG   3
cg reaches trust region boundary
iter  4 act 1.172e+01 pre 1.161e+01 delta 4.018e-01 f 6.464e+02 |g| 1.093e+02 CG   4
cg reaches trust region boundary
iter  5 act 5.638e+00 pre 5.598e+00 delta 4.338e-01 f 6.347e+02 |g| 4.325e+01 CG   5
cg reaches trust region boundary
iter  6 act 3.281e+00 pre 3.299e+00 delta 4.515e-01 f 6.291e+02 |g| 3.776e+01 CG   8
cg reaches trust region boundary
iter  7 act 1.359e+00 pre 1.359e+00 delta 4.538e-01 f 6.258e+02 |g| 1.799e+01 CG  18
> predict -b 0 '/tmp/pdaid_testing.ll' '/tmp/pdaid_svc.model' '/tmp/svc.predictions'
Accuracy = 53.1609% (1110/2088)
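
With -b 1 (used for the logistic regression model above), liblinear's predict writes probability estimates: the first line of the output file lists the label order, and each following line holds the predicted label plus one probability per class. A quick peek, assuming the paths above:

    with open(LR_predictions_file) as f:
        print f.readline().strip()  # label order, e.g. "labels 0 1 2"
        print f.readline().strip()  # predicted label + per-class probabilities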



In [9]:
# Convert a file from liblinear (LIBSVM) text format to FANN training format
def convert_liblinear_file(ll_filepath, fann_filepath):
    with open(ll_filepath, 'r') as ll_file:
        data = []
        for line in ll_file:
            fields = line.strip().split(" ")
            row = [int(fields[0])]
            for (i, field) in enumerate(fields[1:]):
                (colnum, value) = field.split(":")
                assert int(colnum) == i + 1  # features must be dense and 1-based
                row.append(float(value))
            data.append(row)
    with open(fann_filepath, 'w') as fann_file:
        rows = len(data)
        cols = len(data[0]) - 1  # first field is the class label
        # FANN header: <num samples> <num inputs> <num outputs>
        fann_file.write("%s %s %s\n" % (rows, cols, 3))
        for row in data:
            disease = row[0]
            # one-hot encode the disease label as the target vector
            disease_vector = [0, 0, 0]
            disease_vector[disease] = 1
            for feature in row[1:]:
                fann_file.write("%.12f " % (feature))
            fann_file.write("\n%s\n" % (" ".join([str(i) for i in disease_vector])))

fann_training_file = "/tmp/pdaid_training.fann"
fann_testing_file = "/tmp/pdaid_testing.fann"
convert_liblinear_file(ll_testing_filename, fann_testing_file)
convert_liblinear_file(ll_training_filename, fann_training_file)
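
FANN training files start with a header line "<num samples> <num inputs> <num outputs>" and then alternate one line of input values with one line of target values per sample, which is what convert_liblinear_file writes. A quick look at the first sample:

    with open(fann_training_file) as f:
        for _ in range(3):
            print f.readline().strip()  # header, then first input/target pair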

In [10]:
from fann2 import libfann

train_data = libfann.training_data()
train_data.read_train_from_file(fann_training_file)

num_input = train_data.num_input_train_data()
num_output = train_data.num_output_train_data()

alpha = 5.0  # rule-of-thumb range 2 to 10; values below ~5 risk overfitting
num_hidden = train_data.length_train_data() / (alpha * (num_input + num_output))
ann = libfann.neural_net()
# NOTE: as written, the topology has no hidden layer; num_hidden is never used
ann.create_sparse_array(connection_rate=1, layers=(num_input, num_output))
#ann.set_scaling_params(data=train_data, new_input_min=0, new_input_max=1, new_output_min=0, new_output_max=1)
#ann.scale_train(data=train_data)
ann.set_learning_rate(0.7)
#ann.set_train_error_function(libfann.ERRORFUNC_LINEAR)
ann.set_activation_function_hidden(libfann.ELLIOT)
# Steepness= 0.5 ELLIOT AUC=0.88
ann.set_activation_steepness_output(0.5)
ann.set_activation_function_output(libfann.ELLIOT)

neural_network_save_file = "/tmp/pdaid.net"

ann.train_on_data(data=train_data, desired_error=0.1, epochs_between_reports=5, max_epochs=500)
ann.save(neural_network_save_file)
#ann.print_connections()
#ann.print_parameters()


Out[10]:
True
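
Note that create_sparse_array above is given only (num_input, num_output), so the rule-of-thumb num_hidden never makes it into the topology. If a hidden layer was intended, a minimal sketch (my assumption, not the configuration that produced the results below):

    # hypothetical: include the computed hidden layer in the topology
    ann = libfann.neural_net()
    ann.create_sparse_array(connection_rate=1,
                            layers=(num_input, int(num_hidden), num_output))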

In [11]:
from fann2 import libfann

ann = libfann.neural_net()
ann.create_from_file(neural_network_save_file)

test_data = libfann.training_data()
test_data.read_train_from_file(fann_testing_file)
#ann.scale_train(data=test_data)

ann.reset_MSE()
inputs = test_data.get_input()
outputs = test_data.get_output()

prediction_file = data_dir + "neural_predictions.csv"
with open(prediction_file, 'w') as predict_file:
    predict_file.write(",actual,predicted,p0,p1,p2\n")
    for i in range(len(inputs)):
        predict = ann.test(inputs[i], outputs[i])
        # the predicted class is the output unit with the highest activation
        predicted_disease = predict.index(max(predict))
        actual_disease = int(outputs[i].index(1))
        predict_file.write("%d,%d,%d,%s\n" % (i, actual_disease, predicted_disease,
                                              ",".join([str(p) for p in predict])))
print "MSE error on test data: %f" % ann.get_MSE()


MSE error on test data: 0.192090

In [12]:
import pandas as pd
from sklearn.metrics import classification_report

df = pd.DataFrame.from_csv(prediction_file)
real = df.actual.astype("int")
predicted = df.predicted

print list(enumerate(disease_groups))  # numeric label -> disease group
print classification_report(real, predicted)


[(0, 'bacterial'), (1, 'parasitic'), (2, 'viral'), (3, 'fungal')]
             precision    recall  f1-score   support

          0       0.44      0.37      0.40       642
          1       0.62      0.54      0.58       590
          2       0.54      0.65      0.59       856

avg / total       0.53      0.53      0.53      2088
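
To see where the remaining errors concentrate, a confusion matrix over the same predictions is a useful follow-up (sketch; rows are actual classes, columns predicted):

    from sklearn.metrics import confusion_matrix
    print confusion_matrix(real, predicted)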


In [13]:
import numpy as np
import pandas as pd
from math import log

def alphabet_dict():
    return {letter: 0 for letter in amino_acids}

def init_counters():
    return [alphabet_dict() for i in range(seq_len)]

counters = {"bacterial": init_counters(),
            "viral": init_counters(),
            "parasitic": init_counters(),
            "fungal": init_counters(),
            "total": init_counters()}
for (rowid, seq, disease) in all_samples.itertuples():
    for (i, aa) in enumerate(seq):
        counters[disease][i][aa.upper()] += 1

for i in range(seq_len):
    for aa in amino_acids.keys():
        # TODO: re-add fungal values if fungal is brought back in
        counters["total"][i][aa] += (counters["bacterial"][i][aa] \
                                    + counters["viral"][i][aa]\
                                    + counters["parasitic"][i][aa])
    
def prob(disease, pos, aa):
    counter = counters[disease][pos]
    total = sum(counter.values())
    if counter[aa] == 0:
        return 0
    return float(counter[aa]) / total
    
def safe_log(x):
    if x <= 0:
        return float("-inf")
    return log(x, 2.0)

def generate_kl_heatmap(disease):
    result = [alphabet_dict() for i in range(seq_len)]
    for i in range(seq_len):
        for letter in amino_acids.keys():
            p = prob(disease, i, letter)
            q = max(0.00000001, prob("total", i, letter))
            # 0 * log(0) is taken as 0 by convention (avoids 0 * -inf = nan)
            result[i][letter] = p * safe_log(p / q) if p > 0 else 0.0

    r = pd.DataFrame(result).transpose()
    vmin = r.min().min()
    vmax = r.max().max()
    center = (vmin + vmax) / 2.0  # midpoint of the colour scale

    import seaborn as sns
    sns.set()
    ax = sns.heatmap(r, linewidths=.5,
            yticklabels=["%s (%s)" % (amino_acids[code], code) for code in r.index],
            xticklabels=["%d" % (i) for i in range(seq_len)],
            vmin=vmin, vmax=vmax, center=center, cmap="YlGnBu", alpha=0.8)
    ax.set_title(r'%s Kullback-Leibler information, $I(i) = \sum_{L=0}^{N} p_i^L \log_2(p_i^L / q_i^L)$' % disease)
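
Each heatmap cell is one term of the positional sum, so adding a column over all letters gives the Kullback-Leibler divergence (in bits) of that position's residue distribution against the pooled background. A quick sanity check for position 0 of the viral group:

    kl = 0.0
    for aa in amino_acids:
        p = prob("viral", 0, aa)
        if p > 0:  # 0 * log(0) taken as 0 by convention
            kl += p * safe_log(p / max(0.00000001, prob("total", 0, aa)))
    print "KL(viral || total) at position 0: %.4f bits" % kl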

In [14]:
%matplotlib inline
generate_kl_heatmap("bacterial")



In [15]:
%matplotlib inline
generate_kl_heatmap("viral")



In [16]:
%matplotlib inline
generate_kl_heatmap("parasitic")