In [1]:
import pandas as pd
import numpy as np
In [2]:
import re
with open('../CMakeLists.txt', 'r') as cmake_file:
    content = cmake_file.read()
major_version = re.search(r'CHIPPER_VERSION_MAJOR (\d+)', content).group(1)
minor_version = re.search(r'CHIPPER_VERSION_MINOR (\d+)', content).group(1)
patch_version = re.search(r'CHIPPER_VERSION_PATCH (\d+)', content).group(1)
version = "%s.%s.%s" % (major_version, minor_version, patch_version)
print version
In [3]:
import os
# Note: This notebook uses blast. To install on MacOS, run `brew install blast`.
# Set this to the location of your Swiss-Prot database
swiss_db_path = "/workspace/blast/swiss/swissprot"
# Maximum length of the IEDB epitopes used
max_epitope_len = 18
# Minimum length of the IEDB epitopes used
min_epitope_len = 9
# Total length of the samples used to create the model. The cleave location is at offset = generated_sample_len/2.
generated_sample_len = 20
data_dir = "/workspace/chipper_data/chipper-%s-data/" % (version)
src_dir = "../src/"
models_dir = "../models/"
# Path to IEDB (http://iedb.org/) epitope export file
epitope_csv = data_dir + "epitope_table_export_1478815050.csv"
# Paths for models and predictions
LR_model_file = models_dir + "lr.model"
LR_predictions_file = data_dir + "LR_predictions.txt"
SVC_model_file = models_dir + "svc.model"
SVC_predictions_file = data_dir + "SVC_predictions.txt"
# File to save raw training data set to
raw_data_csv = data_dir + "training_set.csv"
print "data_dir=%s" % (data_dir)
In [4]:
import os, sys, argparse
import subprocess
# Reads protein IDs from file and returns a DataFrame with two columns: protein_id, protein_sequence
def __create_swiss_prot_dataframe(output_path, swiss_db_path, protein_id_file):
    blast_output = output_path + "/" + "blast_output.txt"
    delimiter = "|"
    cmd = [
        "blastdbcmd", "-db", swiss_db_path, "-entry_batch", protein_id_file,
        "-outfmt", "%%a%s%%s" % (delimiter), "-out", blast_output
    ]
    print " ".join(cmd)
    with open(os.devnull, "w+") as devnull:
        subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
    with open(blast_output, 'r') as blast:
        data = []
        for line in blast.readlines():
            (protein_id, protein_sequence) = line.split(delimiter)
            data.append((protein_id.strip(), protein_sequence.strip()))
    return pd.DataFrame(data, columns=['protein_id', 'protein_sequence'])
# Converts the merged IEDB/Swiss-Prot table into training data
def __create_training_set(output_dir, iedb_swiss_merged, generated_sample_len):
    data = []
    for (protein_id,
         row_ids) in iedb_swiss_merged.groupby("protein_id").groups.items():
        epitope_rows = iedb_swiss_merged.loc[row_ids]
        # Grab the protein sequence from the first row
        protein_sequence = epitope_rows['protein_sequence'].iloc[0]
        # Sort by the C-terminus ('end')
        sorted_epitopes = epitope_rows.sort_values(by="end")
        protein_sequence_len = len(protein_sequence)
        current_loc = 0
        for (i, epitope_sequence, start, end) in \
                sorted_epitopes[["epitope_sequence", "start", "end"]].itertuples():
            epitope_start = int(start) - 1
            epitope_end = int(end) - 1
            half_sample_len = generated_sample_len / 2
            sample_start_end = lambda pos: (pos - half_sample_len + 1, pos + half_sample_len + 1)
            epitope_sequence_len = len(epitope_sequence)
            epitope_midpoint = epitope_end - epitope_sequence_len / 2
            cleaved_sample_pos = sample_start_end(epitope_end)
            uncleaved_sample_pos = sample_start_end(epitope_midpoint)
            # Check if our samples are off the protein sequence or overlap (TODO: handle these)
            if (uncleaved_sample_pos[0] < current_loc or
                    cleaved_sample_pos[1] > protein_sequence_len):
                continue
            current_loc = cleaved_sample_pos[1]
            # Double check that the start and end positions are correct
            assert protein_sequence[epitope_start:epitope_end + 1] == epitope_sequence,\
                "Epitope failed to align to protein"
            fetch_seq = lambda pos: protein_sequence[pos[0]:pos[1]]
            data.append((fetch_seq(cleaved_sample_pos), 1))
            data.append((fetch_seq(uncleaved_sample_pos), 0))
    cleavage_data = pd.DataFrame(data)
    cleavage_data.columns = ['sequence', 'is_cleaved']
    return cleavage_data
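# Worked example of the window arithmetic above (illustrative numbers only):
# with generated_sample_len = 20, half_sample_len = 10 and sample_start_end(pos)
# returns (pos - 9, pos + 11), a 20-residue window with `pos` at offset 9.
# For an epitope whose C-terminus sits at 0-based position 100, the cleaved sample is
# protein_sequence[91:111] with the cleavage site at the middle of the window,
# while the uncleaved sample is centred on the epitope midpoint, where no
# C-terminal cleavage is expected.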
def create_iedb_swiss_dataset(epitope_csv, swiss_db_path, output_dir,
                              max_epitope_len, min_epitope_len,
                              generated_sample_len):
    # Load the CSV from IEDB, skipping the extra header rows ([2:])
    epi = pd.DataFrame.from_csv(epitope_csv)[2:]
    # Rename the columns to remove whitespace
    epi.columns = [
        'type', 'epitope_sequence', 'start', 'end', 'chebi', 'syn', 'protein',
        'protein_id', 'organism', 'oid', 'comments'
    ]
    # Remove GI entries that start with prefix "SRC"
    epi = epi[epi.protein_id.str.startswith("SRC") == False]
    # Remove entries with '+' notation (note: looking into this, e.g. "PLNISLGDVVLY + DEAM(N3)")
    epi = epi[epi.epitope_sequence.str.find('+') == -1]
    # Remove the "GI:" prefix from the GIs provided by IEDB
    epi["protein_id"] = epi.protein_id.str.replace("GI:", "")
    # Drop any epitopes that are not the desired length
    iedb_epitopes = epi[(epi.epitope_sequence.str.len() >= min_epitope_len)
                        & (epi.epitope_sequence.str.len() <= max_epitope_len)]\
        .loc[:, ["epitope_sequence", "protein_id", "start", "end"]]
    # Create a file with a list of unique protein (antigen) IDs for use with BLAST
    antigen_ids = iedb_epitopes["protein_id"].unique()
    protein_id_file = output_dir + "/" + "protein_ids.txt"
    np.savetxt(protein_id_file, antigen_ids, fmt="%s")
    num_iedb_epitopes = iedb_epitopes.shape[0]
    num_iedb_proteins = antigen_ids.shape[0]
    print "There are %d IEDB MHC-1 epitopes from %d unique proteins (antigens)." % (
        num_iedb_epitopes, num_iedb_proteins)
    swiss_prot_db = __create_swiss_prot_dataframe(output_dir, swiss_db_path,
                                                  protein_id_file)
    print "Found %d IEDB proteins in the Swiss-Prot sequence database" % (
        swiss_prot_db.shape[0])
    # Merge the IEDB epitopes with the Swiss-Prot protein data and drop any NaN values (proteins not in Swiss-Prot)
    iedb_swiss_merged = pd.merge(
        iedb_epitopes, swiss_prot_db, on="protein_id", how="left").dropna()
    # Since we're going to sort on position, convert to numeric columns
    iedb_swiss_merged["start"] = pd.to_numeric(iedb_swiss_merged["start"]).astype(int)
    iedb_swiss_merged["end"] = pd.to_numeric(iedb_swiss_merged["end"]).astype(int)
    # Save the raw merged data for reference
    iedb_swiss_merged.to_csv(output_dir + "/" + "merged_iedb_swiss.csv")
    return __create_training_set(output_dir, iedb_swiss_merged, generated_sample_len)
raw_training_data = create_iedb_swiss_dataset(epitope_csv, swiss_db_path, data_dir,
max_epitope_len, min_epitope_len,
generated_sample_len)
raw_training_data = raw_training_data.drop_duplicates(subset=["sequence"], keep=False)
raw_training_data.to_csv(raw_data_csv)
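# Optional sanity check (not part of the original pipeline): each retained epitope
# contributes one cleaved and one uncleaved sample, so the two classes should be
# roughly balanced and every sequence should be generated_sample_len residues long.
print raw_training_data.is_cleaved.value_counts()
print raw_training_data.sequence.str.len().value_counts()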
In [5]:
import numpy as np
import pandas as pd
from math import log
aa_names = {"A": "Alanine",
"C": "Cysteine",
"D": "Aspartic Acid",
"E": "Glutamic Acid",
"F": "Phenylalanine",
"G": "Glycine",
"H": "Histidine",
"I": "Isoleucine",
"K": "Lysine",
"L": "Leucine",
"M": "Methionine",
"N": "Asparagine",
"P": "Proline",
"Q": "Glutamine",
"R": "Arginine",
"S": "Serine",
"T": "Threonine",
"U": "Selenocysteine",
"V": "Valine",
"W": "Tryptophan",
"Y": "Tyrosine"}
aa_alphabet = "ACDEFGHIKLMNPQRSTUVWY"
def alphabet_dict():
    d = {}
    for letter in aa_alphabet:
        d[letter] = 0
    return d
def init_counters():
    return [alphabet_dict() for i in range(20)]
counters = [init_counters(), init_counters()]
for (rowid, seq, is_cleaved) in raw_training_data.itertuples():
    for (i, aa) in enumerate(seq):
        counters[is_cleaved][i][aa] += 1
def prob(is_cleaved, pos, aa):
    counter = counters[is_cleaved][pos]
    total = 0
    for count in counter.values():
        total += count
    if counter[aa] == 0:
        return 0
    else:
        return float(counter[aa]) / total
def safe_log(x):
    if x <= 0:
        return float("-inf")
    return log(x, 2.0)
result = [alphabet_dict() for i in range(20)]
for i in range(20):
    for letter in aa_alphabet:
        p = prob(1, i, letter)
        q = prob(0, i, letter)
        # By convention 0 * log(0/q) = 0; guard against 0 * -inf producing NaN
        result[i][letter] = 0.0 if p == 0 else p * safe_log(p / max(0.00000001, q))
r = pd.DataFrame(result).transpose()
vmin = r.min().min()
vmax = r.max().max()
center = (vmax - vmin)/2
%matplotlib inline
import seaborn as sns
sns.set()
ax = sns.heatmap(r, linewidths=.5,
yticklabels=["%s (%s)" % (aa_names[code], code) for code in r.index],
xticklabels=["%d" %(i) for i in range(10,0,-1)] + ["%d'" % (i) for i in range(1, 11)],
vmin=vmin, vmax=vmax, center=center, cmap="YlGnBu", alpha=0.8)
ax.set_title('Training Data Kullback-Leibler Information, $I(i) = \sum_{L=0}^{N}p_{i}^{L}\log_2(p_{i}^{L}/q_{i}^{L})$')
Out[5]:
In [6]:
# Read and return the Saxova in vivo MHC-I ligand test set
def fetch_testing_data():
    df = pd.DataFrame.from_csv(data_dir + "saxova_in_vivo_mhc_1_ligands_dataset.csv")
    df = df[["Sequences", "Activity"]]
    df.columns = ["sequence", "is_cleaved"]
    # Map the -1 (uncleaved) labels to 0 (use .loc to avoid chained assignment)
    df.loc[df.is_cleaved == -1, "is_cleaved"] = 0
    return df
raw_testing_data = fetch_testing_data()
if generated_sample_len != 28:
    # Testing data is 28 residues in length; cut off the excess residues from both ends
    trim_residues = (28 - generated_sample_len) / 2
    raw_testing_data["sequence"] = raw_testing_data.sequence.str[trim_residues:-trim_residues]
In [7]:
from aa_props import seq_to_aa_props
from sklearn.preprocessing import MinMaxScaler
# We need to remove the Saxova samples from our training set
print "There are %d training samples before removing the Saxova sequences." % (
raw_training_data.shape[0])
raw_training_data = raw_training_data[raw_training_data.sequence.isin(
raw_testing_data.sequence) == False]
print "There are %d training samples after filtering out Saxova." % (
raw_training_data.shape[0])
# Filter out sequences containing selenocysteine (TODO: support 'U') or an invalid "'" character
train_seq_validator = lambda seq: seq.find("U") == -1 and seq.find("'") == -1
# The testing set has two samples that are not the correct length; filter them out too
test_seq_validator = lambda seq: train_seq_validator(seq) and len(seq) == generated_sample_len
process_raw_data = lambda data, is_valid: [(seq_to_aa_props(seq), is_cleaved)
for (i, seq, is_cleaved) in data.itertuples()
if is_valid(seq)]
# Filter AA seqs and expand to AA features
training_X_y = process_raw_data(raw_training_data, train_seq_validator)
testing_X_y = process_raw_data(raw_testing_data, test_seq_validator)
(training_X, training_y) = zip(*training_X_y)
(testing_X, testing_y) = zip(*testing_X_y)
# Scale the data
scaler = MinMaxScaler()
training_X = scaler.fit_transform(training_X)
testing_X = scaler.transform(testing_X)
print "Final training set has %d samples of %d raw samples." % (
training_X.shape[0], raw_training_data.shape[0])
print "Final testing set has %d samples of %d raw samples." % (
testing_X.shape[0], raw_testing_data.shape[0])
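# Sanity check (sketch): seq_to_aa_props expands each residue into 50 physicochemical
# properties (18 hydrophobic + 17 steric + 15 electronic, see the model-weight analysis
# below), so each feature vector should have generated_sample_len * 50 entries.
print training_X.shape, testing_X.shape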
In [8]:
train_file = data_dir + "training_data.ll"
test_file = data_dir + "testing_data.ll"
def create_liblinear_files():
    for (outpath, rows) in [(train_file, zip(training_y, training_X)),
                            (test_file, zip(testing_y, testing_X))]:
        with open(outpath, 'w') as out:
            for (is_cleaved, features) in rows:
                out.write("%+d " % (is_cleaved))
                for (feature_id, feature_value) in enumerate(features):
                    out.write("%d:%s " % (feature_id + 1, feature_value))
                out.write("\n")
create_liblinear_files()
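# The files written above follow the LIBSVM/LIBLINEAR sparse text format:
# "<label> <index>:<value> <index>:<value> ...", with 1-based feature indices.
# Peek at the start of the first training line to confirm the layout.
with open(train_file) as f:
    print f.readline()[:80]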
In [9]:
# Find the best C parameter for the LR model
LR_findC = !train -C -s 0 "$train_file"
print "\n".join(LR_findC)
In [10]:
# Find the best C parameter for the SVC model
SVC_findC = !train -C -s 2 "$train_file"
print "\n".join(SVC_findC)
In [11]:
def create_model_and_predictions(search_output, solver, model_file, predictions_file, probability_estimates):
    # Grab C from the last line...
    bestC = float(search_output[-1].split(" ")[3])
    cmd = "train -c %f -s %d '%s' '%s'" % (bestC, solver, train_file, model_file)
    print "> " + cmd
    createModel = !{cmd}
    print "\n".join(createModel)
    cmd = "predict -b %d '%s' '%s' '%s'" % (probability_estimates, test_file, model_file, predictions_file)
    print "> " + cmd
    predict = !{cmd}
    print "\n".join(predict)
    print "\n"
In [12]:
create_model_and_predictions(LR_findC, 0, LR_model_file, LR_predictions_file, 1)
create_model_and_predictions(SVC_findC, 2, SVC_model_file, SVC_predictions_file, 0)
In [13]:
from sklearn.metrics import classification_report, matthews_corrcoef, confusion_matrix
def fetch_LR_probabilities(filepath):
    predictions = []
    with open(filepath, "r") as input:
        # Skip the "labels" header line written by `predict -b 1`
        for line in input.readlines()[1:]:
            predictions.append(float(line.split(" ")[1]))
    return predictions
def fetch_SVC_predictions(filepath):
    predictions = []
    with open(filepath, "r") as input:
        for line in input.readlines():
            predictions.append(int(line))
    return predictions
SVC_predictions = fetch_SVC_predictions(SVC_predictions_file)
LR_probabilities = fetch_LR_probabilities(LR_predictions_file)
LR_classification_vector = lambda cutoff: [1 if pred >= cutoff else 0 for pred in LR_probabilities]
def find_best_mcc():
    best_mcc = 0.0
    best_cutoff = 0.0
    for i in range(1, 100):
        pred_cutoff = i / 100.0
        mcc = matthews_corrcoef(testing_y, LR_classification_vector(pred_cutoff))
        if mcc > best_mcc:
            best_mcc = mcc
            best_cutoff = pred_cutoff
    return (best_cutoff, best_mcc)
(best_cutoff, best_mcc) = find_best_mcc()
print "** Logistic Regression Report, cutoff= %.2f (MCC=%.3f) **" % (best_cutoff, best_mcc)
print classification_report(testing_y, LR_classification_vector(best_cutoff))
print "** Support Vector Classification Report (MCC=%.3f) ***" % (matthews_corrcoef(testing_y, SVC_predictions))
print classification_report(testing_y, SVC_predictions)
def print_metrics(name, actual, predicted):
    ((tn, fp), (fn, tp)) = confusion_matrix(actual, predicted)
    sensitivity = 100.0 * tp / (tp + fn)
    specificity = 100.0 * tn / (tn + fp)
    precision = 100.0 * tp / (tp + fp)
    print "%s: sensitivity(recall)=%.1f, specificity=%.1f, precision=%.1f" % (name, sensitivity, specificity, precision)
print_metrics("LR", testing_y, LR_classification_vector(best_cutoff))
print_metrics("SVC", testing_y, SVC_predictions)
In [18]:
%matplotlib notebook
import matplotlib.pyplot as plt
from sklearn import svm, metrics
fpr, tpr, thresholds = metrics.roc_curve(testing_y, LR_probabilities, pos_label=1)
roc_auc = metrics.roc_auc_score(testing_y, LR_probabilities, average='macro', sample_weight=None)
plt.title('Logistic Regression ROC Curve')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f'% (roc_auc))
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
Out[18]:
In [15]:
def read_LR_model_weights():
    weights = []
    with open(LR_model_file, "r") as model:
        # Skip the six header lines of the liblinear model file; the rest is one weight per feature
        for line in model.readlines()[6:]:
            weights.append(float(line))
    return weights
num_hydrophobic_metrics = 18
num_steric_metrics = 17
num_electronic_metrics = 15
num_aa_properties = num_hydrophobic_metrics + num_steric_metrics + num_electronic_metrics
w = read_LR_model_weights()
h_mean = []
h_std = []
s_mean = []
s_std = []
e_mean = []
e_std = []
for chunk in [w[i:i + num_aa_properties] for i in range(0, generated_sample_len * num_aa_properties, num_aa_properties)]:
    start = 0
    end = num_hydrophobic_metrics
    h = chunk[start:end]
    start = end
    end = start + num_steric_metrics
    s = chunk[start:end]
    start = end
    e = chunk[start:]
    h_mean.append(np.mean(h))
    h_std.append(np.std(h))
    s_mean.append(np.mean(s))
    s_std.append(np.std(s))
    e_mean.append(np.mean(e))
    e_std.append(np.std(e))
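# Sanity check (assumption: the model was trained without a bias term, so there is
# exactly one weight per feature): the weight vector length should match the
# generated_sample_len * num_aa_properties features used for training.
print "Read %d model weights (expected %d)" % (len(w), generated_sample_len * num_aa_properties)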
In [16]:
%matplotlib notebook
n_groups = generated_sample_len
fig, ax = plt.subplots(figsize=(12,9))
index = np.arange(n_groups)
bar_width = 0.15
ax1 = ax.errorbar(index + bar_width, h_mean, h_std, bar_width, label="Hydrophobic", color='b')
ax2 = ax.errorbar(index, s_mean, s_std, bar_width, label="Steric", color='r')
ax3 = ax.errorbar(index - bar_width, e_mean, e_std, bar_width, label="Electronic", color='g')
ax.set_xlim(-bar_width,len(index)+bar_width)
plt.xlabel('Amino Acid Position')
plt.ylabel('LR Model Coefficient Value')
plt.title('Hydrophobic, Steric, and Electronic Effect by Amino Acid Position')
plt.xticks(index, range(n_groups/2, 0, -1) + [str(i) + "'" for i in range(1, n_groups/2 + 1)])
plt.legend()
plt.tight_layout()
plt.show()
In [17]:
from sys import stdout as out
from aa_props import all_aa_props
scale = np.array(scaler.scale_[0:50])
scaled_aa_props = {}
for (key, values) in all_aa_props.viewitems():
    scaled_aa_props[key] = np.multiply(scale, values)
def write_aa_props(srcfile, hdrfile):
    generated_warning = """
/*** THIS IS AUTO-GENERATED SOURCE! DO NOT EDIT! ***/
"""
    header_content = """
#ifndef CHIPPER_H_DEFINED
#define CHIPPER_H_DEFINED
#include "linear.h"
#include "output.h"
#define NUM_AA_PROPERTIES 50
#define TRAINING_SET_SIZE %s
#define BEST_CUTOFF_VALUE %.2f
#define BEST_CUTOFF_MCC %.3f
#define GENERATED_SAMPLE_LEN %d
#define NUM_FEATURES (GENERATED_SAMPLE_LEN * NUM_AA_PROPERTIES)
int get_aa_properties(char aa, struct feature_node *features);
int predict_cleavage(const char *fasta_input, const char *output, chipper_output_format format, const char *model_file, int output_probabilities, int cutoff_provided, double cutoff);
#endif
""" % (training_X.shape[0], best_cutoff, best_mcc, generated_sample_len)
    with open(hdrfile, 'w') as out:
        out.write(generated_warning + header_content)
    aa_props_src_header = generated_warning + """
#include "chipper.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Amino acid properties from https://www.ncbi.nlm.nih.gov/pubmed/15895431
   Scaled to match the current model's input data. */
"""
    aa_props_src_footer = """
int get_aa_properties(char aa, struct feature_node *features) {
    int i;
    int target_aa = aa;
    /* If lowercase, make upper case */
    if ('a' <= target_aa && target_aa <= 'z') {
        target_aa -= ('a' - 'A');
    }
    /* Shift to make 'A' equal to zero offset */
    target_aa -= 'A';
    if (0 <= target_aa && target_aa <= 25) {
        for (i = 0; i < NUM_AA_PROPERTIES; i++) {
            if (features[i].index == -1) {
                fprintf(stderr, "End of features reached before AA properties for '%c' could be set", aa);
                exit(EXIT_FAILURE);
            }
            features[i].value = aa_props[target_aa][i];
        }
        return aa_props[target_aa][0] == -1 ? 0 : NUM_AA_PROPERTIES;
    } else {
        return 0;
    }
}
"""
    with open(srcfile, "w") as out:
        out.write(aa_props_src_header)
        out.write("static const float aa_props[26][NUM_AA_PROPERTIES] = {\n")
        for key in range(ord('A'), ord('Z') + 1):
            out.write(" {")
            try:
                values = scaled_aa_props[chr(key)]
                out.write(",".join([str(i) for i in values]))
            except KeyError:
                # We scaled our values from zero to one, so mark letters with no properties with -1
                out.write(",".join([str(-1) for i in range(0, 50)]))
            out.write("}")
            if key != ord('Z'):
                out.write(",")
            out.write("\n")
        out.write("};")
        out.write(aa_props_src_footer)
aa_props_src = src_dir + "aa_props.c"
aa_props_header = src_dir + "chipper.h"
write_aa_props(aa_props_src, aa_props_header)
index_src = !gindent -linux {aa_props_src} {aa_props_header}