In [1]:
from pepnet import SequenceInput, Output, Predictor


Using Theano backend.

In [18]:
mhc = SequenceInput(
    length=34, name="mhc", encoding="index", variable_length=True,
    dense_layer_sizes=[32])
peptide = SequenceInput(
    length=50, name="peptide", encoding="index", variable_length=True,
    conv_filter_sizes=[9], 
    conv_output_dim=8,
    n_conv_layers=2,
    global_pooling=True,
    dense_layer_sizes=[32])

In [25]:
df = pd.read_csv("class2_data.csv")
print("Loaded %d samples" % len(df))
bad_mhc = ~df.mhc.str.contains("\*")
print("Dropping %d rows without full alleles" % bad_mhc.sum())
df = df[~bad_mhc]


Loaded 134555 samples
Dropping 14847 rows without full alleles

In [37]:
mhc_pseudosequences_dict = {}
with open("pseudosequences.dat") as f:
    for line in f:
        mhc, seq = line.split()
        mhc_pseudosequences_dict[mhc] = seq

In [38]:
valid_mhc_set = set(mhc_pseudosequences_dict.keys())
df.mhc = df.mhc.str.replace("/", "-").str.replace("HLA-DRB", "DRB")
valid_mhc_mask = df.mhc.isin(valid_mhc_set)
print("Dropping %d rows without pseudosequences" % (len(df) - valid_mhc_mask.sum()))
df = df[valid_mhc_mask]


Dropping 18102 rows without pseudosequences

In [39]:
df["output_name"] = df["assay_group"] + ":" + df["assay_method"]

In [40]:
output_counts = df.output_name.value_counts()

In [41]:
sufficiently_large_output_counts = output_counts[output_counts>=200]

In [42]:
sufficiently_large_output_counts


Out[42]:
half maximal inhibitory concentration (IC50):purified MHC/competitive/radioactivity    29908
ligand presentation:cellular MHC/mass spectrometry                                     24396
dissociation constant KD (~IC50):purified MHC/competitive/radioactivity                20906
dissociation constant KD (~EC50):purified MHC/direct/fluorescence                       6869
half maximal inhibitory concentration (IC50):purified MHC/competitive/fluorescence      5497
qualitative binding:purified MHC/competitive/fluorescence                               5333
qualitative binding:purified MHC/direct/fluorescence                                    2668
qualitative binding:cellular MHC/competitive/fluorescence                               1647
qualitative binding:purified MHC/competitive/radioactivity                              1261
qualitative binding:purified MHC                                                         704
qualitative binding:cellular MHC/direct/fluorescence                                     672
half life:purified MHC/direct/fluorescence                                               305
ligand presentation:Edman degradation                                                    284
qualitative binding:purified MHC/direct/phage display                                    264
ligand presentation:secreted MHC/mass spectrometry                                       228
Name: output_name, dtype: int64

In [43]:
len(sufficiently_large_output_counts)


Out[43]:
15

In [44]:
sufficiently_large_output_names = set(sufficiently_large_output_counts.index)
df_subset = df[df.output_name.isin(sufficiently_large_output_names)]
len(df_subset)


Out[44]:
100942

In [45]:
outputs = []

def from_ic50(ic50):
    return np.minimum(1.0, np.maximum(0, (1.0 - np.log(ic50) / np.log(50000))))

def to_ic50(x):
    return 50000.0 ** (1.0 - x)

assert np.allclose(to_ic50(from_ic50(40)), 40)

for output_name in sufficiently_large_output_counts.index:
    if "IC50" in output_name or "EC50" in output_name:
        transform = from_ic50
        inverse = to_ic50
        activation = "sigmoid"
    elif "half life" in output_name:
        transform = np.log
        inverse = np.exp
        activation = "linear"
    else:
        transform = None
        inverse = None
        activation = "sigmoid"
    output = Output(name=output_name, transform=transform, inverse_transform=inverse, activation=activation)
    outputs.append(output)

In [46]:
predictor = Predictor(
    inputs=[mhc, peptide], 
    outputs=outputs,
    merge_mode="multiply")


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-46-909f773c1656> in <module>()
      2     inputs=[mhc, peptide],
      3     outputs=outputs,
----> 4     merge_mode="multiply")

/Users/iskander/code/pepnet/pepnet/predictor.py in __init__(self, inputs, outputs, merge_mode, hidden_layer_sizes, hidden_activation, hidden_dropout, batch_normalization, optimizer)
     41             outputs = [outputs]
     42 
---> 43         self.inputs = {i.name: i for i in inputs}
     44         self.input_order = [i.name for i in inputs]
     45 

/Users/iskander/code/pepnet/pepnet/predictor.py in <dictcomp>(.0)
     41             outputs = [outputs]
     42 
---> 43         self.inputs = {i.name: i for i in inputs}
     44         self.input_order = [i.name for i in inputs]
     45 

AttributeError: 'str' object has no attribute 'name'

In [47]:
from collections import defaultdict
unique_pmhcs = set(zip(df_subset["peptide"], df_subset["mhc"]))
pmhc_list = sorted(unique_pmhcs)

peptides = [p for (p, _) in pmhc_list]
mhc_names = [m for (_, m) in pmhc_list]

n_unique_pmhc = len(pmhc_list)
pmhc_index_dict = {key: i for (i, key) in enumerate(pmhc_list)}

output_name_list = [o.name for o in outputs]
output_name_index_dict = {output_name: i for i, output_name in enumerate(output_name_list)}
n_outputs = len(output_name_list)

output_is_qual_dict = {
    output_name: any([(substr in output_name) for substr in ("IC50", "EC50", "half life")])
    for output_name in output_name_list
}

print(output_is_qual_dict)

sums = np.zeros((n_unique_pmhc, n_outputs), dtype="float32")
counts = np.zeros_like(sums, dtype="int32")

for (output_name, peptide, mhc, qual, meas) in zip(
        df_subset.output_name, df_subset.peptide, df_subset.mhc,
        df_subset.qual, df_subset.meas):
    row_idx = pmhc_index_dict[(peptide, mhc)]
    col_idx = output_name_index_dict[output_name]
    counts[row_idx, col_idx] += 1
    if output_is_qual_dict[output_name]:
        sums[row_idx, col_idx] += np.log(1 + meas)
    else:
        sums[row_idx, col_idx] += qual.startswith("Positive")

averages = sums / counts
for name, col_idx in output_name_index_dict.items():
    if output_is_qual_dict[name]:
        averages[:, col_idx] = averages[:, col_idx] > 0.5
    else:
        averages[:, col_idx] = np.exp(averages[:, col_idx]) - 1

averages[counts == 0] = np.nan


{'dissociation constant KD (~EC50):purified MHC/direct/fluorescence': True, 'ligand presentation:secreted MHC/mass spectrometry': False, 'ligand presentation:Edman degradation': False, 'ligand presentation:cellular MHC/mass spectrometry': False, 'qualitative binding:purified MHC': False, 'qualitative binding:cellular MHC/direct/fluorescence': False, 'qualitative binding:purified MHC/competitive/radioactivity': False, 'half maximal inhibitory concentration (IC50):purified MHC/competitive/fluorescence': True, 'half life:purified MHC/direct/fluorescence': True, 'half maximal inhibitory concentration (IC50):purified MHC/competitive/radioactivity': True, 'qualitative binding:purified MHC/direct/phage display': False, 'qualitative binding:purified MHC/competitive/fluorescence': False, 'qualitative binding:purified MHC/direct/fluorescence': False, 'dissociation constant KD (~IC50):purified MHC/competitive/radioactivity': True, 'qualitative binding:cellular MHC/competitive/fluorescence': False}

In [48]:
normalized_mhc_names = [mhc_name.replace("/", "-").replace("HLA-DRB", "DRB") for mhc_name in mhc_names]
mhc_inputs = [mhc_pseudosequences[mhc_name] for mhc_name in normalized_mhc_names]

In [ ]:
predictor.fit({"peptide": peptides, "mhc": mhc_inputs}, averages, epochs=2)


Epoch 1/100
62784/94809 [==================>...........] - ETA: 17s - loss: nan - ligand presentation:cellular MHC/mass spectrometry_loss: nan - half maximal inhibitory concentration (IC50):purified MHC/competitive/radioactivity_loss: nan - dissociation constant KD (~IC50):purified MHC/competitive/radioactivity_loss: nan - dissociation constant KD (~EC50):purified MHC/direct/fluorescence_loss: nan - qualitative binding:purified MHC/competitive/fluorescence_loss: nan - half maximal inhibitory concentration (IC50):purified MHC/competitive/fluorescence_loss: nan - qualitative binding:purified MHC/direct/fluorescence_loss: nan - qualitative binding:cellular MHC/competitive/fluorescence_loss: nan - qualitative binding:purified MHC/competitive/radioactivity_loss: nan - qualitative binding:cellular MHC/direct/fluorescence_loss: nan - qualitative binding:purified MHC_loss: nan - ligand presentation:Edman degradation_loss: nan - half life:purified MHC/direct/fluorescence_loss: nan - qualitative binding:purified MHC/direct/phage display_loss: nan - ligand presentation:secreted MHC/mass spectrometry_loss: nan  - ETA: 42s - loss: nan - ligand presentation:cellular MHC/mass spectrometry_loss: nan - half maximal inhibitory concentration (IC50):purified MHC/competitive/radioactivity_loss: nan - dissociation constant KD (~IC50):purified MHC/competitive/radioactivity_loss: nan - dissociation constant KD (~EC50):purified MHC/direct/fluorescence_loss: nan - qualitative binding:purified MHC/competitive/fluorescence_loss: nan - half maximal inhibitory concentration (IC50):purified MHC/competitive/fluorescence_loss: nan - qualitative binding:purified MHC/direct/fluorescence_loss: nan - qualitative binding:cellular MHC/competitive/fluorescence_loss: nan - qualitative binding:purified MHC/competitive/radioactivity_loss: nan - qualitative binding:cellular MHC/direct/fluorescence_loss: nan - qualitative binding:purified MHC_loss: nan - ligand presentation:Edman degradation_loss: nan - half life:purified MHC/direct/fluorescence_loss: nan - qualitative binding:purified MHC/direct/phage display_loss: nan - ligand presentation:secreted MHC/mass spectrometry_loss: nan

In [ ]:


In [22]:
!head pseudosequences.dat | grep DRB


DRB1*01:01	QEFFIASGAAVDAIMWLFLECYDLQRATYHVGFT
DRB1*01:02	QEFFIASGAAVDAIMWLFLECYDLQRATYHAVFT
DRB1*01:03	QEFFIASGAAVDAIMWLFLECYDIDEATYHVGFT
DRB1*01:04	QEFFIASGAAVDAIMWLFLECYDLQRANYHVVFT
DRB1*01:05	QEFFIASGAAVDAIMWLFLECYDLQRATYHVGFT
DRB1*01:06	QEFFIASGAAVDAIMWLFLECYDLQAATYHVVFT
DRB1*01:07	QEFFIASGAAVDAIMWLFLECYDLQRATYHVGFT
DRB1*01:08	QEFFIASGAAVDAIMWLFLECYDLQRATYHVGFT
DRB1*01:09	QEFFIASGAAVDAIMWLFLECYDLQAATYHVGFT
DRB1*01:10	QEFFIASGAAVDAIMWLFLECYDLQKATYHVGFT

In [ ]: