In [1]:
import pandas as pd
import re

data_dir = "/workspace/chipper_data/chipper-svm-rfe/"

iedb_swiss_df = pd.read_csv(data_dir + "merged_iedb_swiss.csv", index_col=0)

saxova_df = pd.read_csv(data_dir + "saxova_in_vivo_mhc_1_ligands_dataset.csv", index_col=0)
saxova_df = saxova_df[["Sequences", "Activity"]]
saxova_df.columns = ["sequence", "is_cleaved"]
# Recode the -1 (non-cleaved) labels as 0, using .loc to avoid chained assignment
saxova_df.loc[saxova_df.is_cleaved == -1, "is_cleaved"] = 0

print "There are %d training samples before removing the Saxova sequences." % (
    iedb_swiss_df.shape[0])
# Drop any training protein whose sequence contains one of the Saxova
# test ligands verbatim, to avoid train/test leakage
regex = re.compile("|".join(saxova_df.sequence))
iedb_swiss_df = iedb_swiss_df[iedb_swiss_df.protein_sequence.str.contains(regex) == False]
print "There are %d training samples after filtering out Saxova." % (
    iedb_swiss_df.shape[0])


There are 35848 training samples before removing the Saxova sequences.
There are 35181 training samples after filtering out Saxova.
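The regex-union filter above drops any training protein whose sequence contains one of the Saxova test ligands verbatim, so no test peptide leaks into the training pool. A toy illustration of the same pattern (the sequences here are made up):

# Hypothetical mini-example of the leakage filter above
toy_train = pd.DataFrame({"protein_sequence": ["MKTAYIAKQR", "GVLKEYGV"]})
toy_ligands = pd.Series(["KQR"])
toy_regex = re.compile("|".join(toy_ligands))
toy_train = toy_train[toy_train.protein_sequence.str.contains(toy_regex) == False]
# Only "GVLKEYGV" survives; "MKTAYIAKQR" contains the ligand "KQR"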

In [2]:
# Convert the merged IEDB/Swiss-Prot table into training data: for each
# epitope, cut a positive ("cleaved") sample centered on its C-terminus
# and a negative ("uncleaved") sample centered on its midpoint
def create_training_set(iedb_swiss_merged, generated_sample_len):
    data = []
    for (protein_id,
         row_ids) in iedb_swiss_merged.groupby("protein_id").groups.items():
        epitope_rows = iedb_swiss_merged.loc[row_ids]
        # Grab the protein sequence from the first row
        protein_sequence = epitope_rows['protein_sequence'].iloc[0]
        # Sort by the C-terminus ('end')
        sorted_epitopes = epitope_rows.sort_values(by="end")
        protein_sequence_len = len(protein_sequence)
        current_loc = 0
        for (i, epitope_sequence, start, end) in \
                sorted_epitopes[["epitope_sequence", "start", "end"]].itertuples():
            epitope_start = int(start) - 1
            epitope_end = int(end) - 1

            half_sample_len = generated_sample_len // 2
            sample_start_end = lambda pos: (pos - half_sample_len + 1, pos + half_sample_len + 1)
            
            epitope_sequence_len = len(epitope_sequence)
            epitope_midpoint = epitope_end - epitope_sequence_len // 2
            
            cleaved_sample_pos = sample_start_end(epitope_end)
            uncleaved_sample_pos = sample_start_end(epitope_midpoint)
            
            # Check if our samples are off the protein sequence or overlap (TODO: handle these)
            if (uncleaved_sample_pos[0] < current_loc or
                    cleaved_sample_pos[1] > protein_sequence_len):
                continue

            current_loc = cleaved_sample_pos[1]

            # Double check that the start and end positions are correct
            assert protein_sequence[epitope_start:epitope_end + 1] == epitope_sequence,\
                     "Epitope failed to align to protein"

            fetch_seq = lambda pos: protein_sequence[pos[0]:pos[1]]
            data.append((fetch_seq(cleaved_sample_pos), 1))
            data.append((fetch_seq(uncleaved_sample_pos), 0))
                        
    cleavage_data = pd.DataFrame(data)
    cleavage_data.columns = ['sequence', 'is_cleaved']
    return cleavage_data

raw_training_data = create_training_set(iedb_swiss_df, 28)
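A quick worked example of the windowing arithmetic: with generated_sample_len = 28, half_sample_len is 14, so sample_start_end(pos) returns (pos - 13, pos + 15) and the resulting slice is 28 residues long, with pos landing at index 13 inside the window. The cleavage site therefore sits between window positions 13 and 14, which matters when reading the per-position feature analysis in cell [70] below. The position here is hypothetical:

# Sanity check of the window math with a hypothetical C-terminus at
# 0-based position 100: the window covers indices 87..114 (28 residues)
# and position 100 falls at index 13 within the window
half_sample_len = 28 // 2
sample_start_end = lambda pos: (pos - half_sample_len + 1, pos + half_sample_len + 1)
(start, end) = sample_start_end(100)
assert (start, end) == (87, 115)
assert end - start == 28
assert 100 - start == 13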

In [3]:
raw_testing_data = saxova_df

In [4]:
from aa_props import seq_to_aa_props
from sklearn.preprocessing import MinMaxScaler

generated_sample_len = 28

# Filter out sequences containing selenocysteine, "U" (TODO: support it),
# or a stray "'" character
train_seq_validator = lambda seq: seq.find("U") == -1 and seq.find("'") == -1
# The testing set also has two samples that are not the correct length;
# filter those out as well
test_seq_validator = lambda seq: train_seq_validator(seq) and len(seq) == generated_sample_len

def process_raw_data(data, is_valid):
    return [(seq_to_aa_props(seq), is_cleaved)
            for (i, seq, is_cleaved) in data.itertuples()
            if is_valid(seq)]

# Filter AA seqs and expand to AA features
training_X_y = process_raw_data(raw_training_data, train_seq_validator)
testing_X_y = process_raw_data(raw_testing_data, test_seq_validator)

(training_X, training_y) = zip(*training_X_y)
(testing_X, testing_y) = zip(*testing_X_y)

# Scale the data
scaler = MinMaxScaler()
training_X = scaler.fit_transform(training_X)
testing_X = scaler.transform(testing_X)

print "Final training set has %d samples of %d raw samples." % (
    training_X.shape[0], raw_training_data.shape[0])
print "Final testing set has %d samples of %d raw samples." % (
    testing_X.shape[0], raw_testing_data.shape[0])


Final training set has 48679 samples of 48682 raw samples.
Final testing set has 416 samples of 419 raw samples.
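seq_to_aa_props comes from the local aa_props module. Judging from the property tables imported in cell [70] below (18 hydrophobic, 17 steric, and 15 electronic values per residue, 50 in total) and the i % 50 / i // 50 index decomposition used there, it flattens a sequence into position-major blocks of 50 properties, so a 28-mer becomes the 1400-dimensional vector seen in the RFE log below. A sketch of that assumed layout, not the actual implementation:

# Assumed sketch of seq_to_aa_props (the real code lives in aa_props):
# concatenate per-residue property vectors, position by position, in the
# order hydrophobic (18), steric (17), electronic (15) -- 50 per residue,
# 28 * 50 = 1400 features for a 28-mer
from aa_props import hydrophobic_props, steric_props, electronic_props

def seq_to_aa_props_sketch(seq):
    features = []
    for aa in seq:
        features.extend(hydrophobic_props[aa])
        features.extend(steric_props[aa])
        features.extend(electronic_props[aa])
    return features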

In [5]:
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.feature_selection import RFECV

svc = LinearSVC(C=0.0625)
rfe = RFECV(estimator=svc, step=.05, cv=5, scoring='accuracy', n_jobs=8, verbose=1)
rfe.fit(training_X, training_y)

print("Optimal number of features : %d" % rfe.n_features_)
print("Recursive Feature Elimination (RFE) eliminated %d features" % (training_X.shape[1] - rfe.n_features_))

%matplotlib inline
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of feature subsets selected")
plt.ylabel("Cross validation score")
plt.plot(range(1, len(rfe.grid_scores_) + 1), rfe.grid_scores_)
plt.show()


Fitting estimator with 1400 features.
Fitting estimator with 1330 features.
Fitting estimator with 1260 features.
[... ~100 "Fitting estimator with N features." lines from the 5 CV folds elided; N decreases by 70 per elimination round ...]
Fitting estimator with 140 features.
Fitting estimator with 70 features.
Optimal number of features : 560
Recursive Feature Elimination (RFE) eliminated 840 features
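With step=.05, each elimination round removes 5% of the original 1400 features (70 at a time, as the log above shows; scikit-learn computes the step once from the initial feature count). The x-axis of the plot is therefore a subset index rather than a raw feature count:

# Mapping from grid_scores_ index to feature-subset size: subsets run
# 70, 140, ..., 1400, so grid_scores_[k] scores a subset of 70 * (k + 1) features
step_size = int(0.05 * 1400)                          # = 70 features per round
subset_sizes = range(step_size, 1400 + 1, step_size)  # [70, 140, ..., 1400]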

In [6]:
%matplotlib inline
from learning_curves import plot_learning_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import ShuffleSplit

#def create_learning_curve(title, model):
#    cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=7)
#    plot_learning_curve(model, title, training_X, training_y, (0.2, 1.01), cv=cv, n_jobs=1)

plot_learning_curve(svc, "Learning Curves Full Feature Set", training_X, training_y)
plot_learning_curve(rfe, "Learning Curves RFE Feature Set", training_X, training_y)

plt.show()


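plot_learning_curve is a local helper (the learning_curves module) that appears to follow the scikit-learn example utility of the same name. If only the raw numbers are needed, the public API gives the same curves; a sketch, not the original helper:

# Minimal equivalent using the public API (assumption: the local helper
# wraps sklearn.model_selection.learning_curve and plots its output)
from sklearn.model_selection import learning_curve
train_sizes, train_scores, valid_scores = learning_curve(
    svc, training_X, training_y, cv=5, n_jobs=1)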

In [70]:
from aa_props import hydrophobic_props, steric_props, electronic_props
from collections import Counter

min_pos = -1
max_pos = -1
prop_positions = {}
for (i, is_support) in enumerate(rfe.support_):
    if is_support:
        prop_id = i % 50   # which of the 50 per-residue properties
        pos = i // 50      # which window position (0..27)
        if pos in prop_positions:
            prop_positions[pos].append(prop_id)
        else:
            prop_positions[pos] = [prop_id]
        max_pos = pos
        if min_pos == -1:
            min_pos = pos

num_hydrophobic_props = len(hydrophobic_props['A'])
num_steric_props = len(steric_props['A'])
num_electronic_props = len(electronic_props['A'])
aa_types = (["hydrophobic"] * num_hydrophobic_props +
            ["steric"] * num_steric_props +
            ["electronic"] * num_electronic_props)

print "%d hydro, %d steric, %d electro props" % (num_hydrophobic_props, num_steric_props, num_electronic_props)

total_len = 0
for key in sorted(prop_positions.keys()):
    total_len += len(prop_positions[key])
    print "%02d: %02d %s" % (key, len(prop_positions[key]), Counter([aa_types[i] for i in prop_positions[key]]))
print "#features: %d" % (total_len)


18 hydro, 17 steric, 15 electro props
00: 10 Counter({'electronic': 5, 'hydrophobic': 3, 'steric': 2})
01: 07 Counter({'steric': 4, 'electronic': 3})
02: 06 Counter({'electronic': 4, 'hydrophobic': 2})
03: 11 Counter({'steric': 5, 'electronic': 3, 'hydrophobic': 3})
04: 25 Counter({'electronic': 9, 'steric': 8, 'hydrophobic': 8})
05: 38 Counter({'hydrophobic': 15, 'steric': 13, 'electronic': 10})
06: 36 Counter({'steric': 14, 'electronic': 11, 'hydrophobic': 11})
07: 33 Counter({'hydrophobic': 12, 'electronic': 11, 'steric': 10})
08: 31 Counter({'hydrophobic': 12, 'electronic': 11, 'steric': 8})
09: 34 Counter({'electronic': 14, 'steric': 10, 'hydrophobic': 10})
10: 37 Counter({'hydrophobic': 15, 'steric': 12, 'electronic': 10})
11: 26 Counter({'hydrophobic': 11, 'electronic': 8, 'steric': 7})
12: 35 Counter({'steric': 12, 'hydrophobic': 12, 'electronic': 11})
13: 38 Counter({'steric': 15, 'hydrophobic': 13, 'electronic': 10})
14: 30 Counter({'hydrophobic': 12, 'steric': 9, 'electronic': 9})
15: 32 Counter({'steric': 11, 'hydrophobic': 11, 'electronic': 10})
16: 20 Counter({'electronic': 7, 'hydrophobic': 7, 'steric': 6})
17: 32 Counter({'steric': 11, 'hydrophobic': 11, 'electronic': 10})
18: 25 Counter({'electronic': 11, 'steric': 7, 'hydrophobic': 7})
19: 10 Counter({'hydrophobic': 4, 'steric': 3, 'electronic': 3})
20: 08 Counter({'hydrophobic': 4, 'steric': 2, 'electronic': 2})
21: 05 Counter({'steric': 2, 'electronic': 2, 'hydrophobic': 1})
22: 08 Counter({'electronic': 5, 'steric': 2, 'hydrophobic': 1})
23: 11 Counter({'hydrophobic': 5, 'steric': 4, 'electronic': 2})
24: 01 Counter({'steric': 1})
25: 02 Counter({'hydrophobic': 2})
26: 05 Counter({'steric': 2, 'electronic': 2, 'hydrophobic': 1})
27: 04 Counter({'steric': 4})
#features: 560
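The selected features cluster around window positions ~4-18, which brackets the cleavage site between positions 13 and 14 (see the window arithmetic after cell [2]), while the window edges contribute very few features. A quick bar chart makes this easier to see; this visualization is an addition here, not part of the original analysis:

# Bar chart of selected-feature counts per window position
import matplotlib.pyplot as plt
positions = sorted(prop_positions.keys())
counts = [len(prop_positions[p]) for p in positions]
plt.bar(positions, counts)
plt.xlabel("Window position (cleavage site between 13 and 14)")
plt.ylabel("# selected features")
plt.show()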

In [53]:
from classification_metrics import classification_metrics

predicted = rfe.predict(testing_X)

classification_metrics(testing_y, predicted, ["non-cleaved", "cleaved"])


Accuracy was 76.92%

             precision    recall  f1-score   support

non-cleaved       0.77      0.76      0.77       208
    cleaved       0.77      0.77      0.77       208

avg / total       0.77      0.77      0.77       416

Confusion Matrix: cols = predictions, rows = actual

                   non-cleaved        cleaved
    non-cleaved            159             49
        cleaved             47            161

In [54]:
from sklearn.metrics import confusion_matrix

((tn, fp), (fn, tp)) = confusion_matrix(testing_y, predicted)
sensitivity = 100.0 * tp / (tp + fn)
specificity = 100.0 * tn / (tn + fp)
precision = 100.0 * tp / (tp + fp)
print "sensitivity(recall)=%.1f, specificity=%.1f, precision=%.1f" % (sensitivity, specificity, precision)

#SVC: sensitivity(recall)=79.3, specificity=77.9, precision=78.2


sensitivity(recall)=77.4, specificity=76.4, precision=76.7
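The same numbers fall out of scikit-learn's metric functions directly; specificity is just recall of the negative class. A cross-check, not part of the original pipeline:

# Cross-check against sklearn's built-in metrics
from sklearn.metrics import recall_score, precision_score
print "sensitivity=%.1f" % (100.0 * recall_score(testing_y, predicted))
print "specificity=%.1f" % (100.0 * recall_score(testing_y, predicted, pos_label=0))
print "precision=%.1f" % (100.0 * precision_score(testing_y, predicted))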

In [8]:
#from sklearn.svm import LinearSVC
#from sklearn.feature_selection import RFECV
#from sklearn.model_selection import GridSearchCV

#parameters={'estimator__C': [pow(2, i) for i in xrange(-25, 4, 1)]}
#svc = LinearSVC()
#rfe = RFECV(estimator=svc, step=.1, cv=5, scoring='accuracy', n_jobs=1)
#clf = GridSearchCV(rfe, parameters, scoring='accuracy', n_jobs=6, cv=5, verbose=1)
#clf.fit(training_X, training_y)

# summarize results
#print("Best: %f using %s" % (clf.best_score_, clf.best_params_))
#means = clf.cv_results_['mean_test_score']
#stds = clf.cv_results_['std_test_score']
#params = clf.cv_results_['params']
#for mean, stdev, param in zip(means, stds, params):
#    print("%f (%f) with: %r" % (mean, stdev, param))