In [1]:
from pepnet import SequenceInput, Output, Predictor
In [18]:
mhc = SequenceInput(
length=34, name="mhc", encoding="index", variable_length=True,
dense_layer_sizes=[32])
peptide = SequenceInput(
length=50, name="peptide", encoding="index", variable_length=True,
conv_filter_sizes=[9],
conv_output_dim=8,
n_conv_layers=2,
global_pooling=True,
dense_layer_sizes=[32])
In [25]:
df = pd.read_csv("class2_data.csv")
print("Loaded %d samples" % len(df))
bad_mhc = ~df.mhc.str.contains("\*")
print("Dropping %d rows without full alleles" % bad_mhc.sum())
df = df[~bad_mhc]
In [37]:
mhc_pseudosequences_dict = {}
with open("pseudosequences.dat") as f:
for line in f:
mhc, seq = line.split()
mhc_pseudosequences_dict[mhc] = seq
In [38]:
valid_mhc_set = set(mhc_pseudosequences_dict.keys())
df.mhc = df.mhc.str.replace("/", "-").str.replace("HLA-DRB", "DRB")
valid_mhc_mask = df.mhc.isin(valid_mhc_set)
print("Dropping %d rows without pseudosequences" % (len(df) - valid_mhc_mask.sum()))
df = df[valid_mhc_mask]
In [39]:
df["output_name"] = df["assay_group"] + ":" + df["assay_method"]
In [40]:
output_counts = df.output_name.value_counts()
In [41]:
sufficiently_large_output_counts = output_counts[output_counts>=200]
In [42]:
sufficiently_large_output_counts
Out[42]:
In [43]:
len(sufficiently_large_output_counts)
Out[43]:
In [44]:
sufficiently_large_output_names = set(sufficiently_large_output_counts.index)
df_subset = df[df.output_name.isin(sufficiently_large_output_names)]
len(df_subset)
Out[44]:
In [45]:
outputs = []
def from_ic50(ic50):
return np.minimum(1.0, np.maximum(0, (1.0 - np.log(ic50) / np.log(50000))))
def to_ic50(x):
return 50000.0 ** (1.0 - x)
assert np.allclose(to_ic50(from_ic50(40)), 40)
for output_name in sufficiently_large_output_counts.index:
if "IC50" in output_name or "EC50" in output_name:
transform = from_ic50
inverse = to_ic50
activation = "sigmoid"
elif "half life" in output_name:
transform = np.log
inverse = np.exp
activation = "linear"
else:
transform = None
inverse = None
activation = "sigmoid"
output = Output(name=output_name, transform=transform, inverse_transform=inverse, activation=activation)
outputs.append(output)
In [46]:
predictor = Predictor(
inputs=[mhc, peptide],
outputs=outputs,
merge_mode="multiply")
In [47]:
from collections import defaultdict
unique_pmhcs = set(zip(df_subset["peptide"], df_subset["mhc"]))
pmhc_list = sorted(unique_pmhcs)
peptides = [p for (p, _) in pmhc_list]
mhc_names = [m for (_, m) in pmhc_list]
n_unique_pmhc = len(pmhc_list)
pmhc_index_dict = {key: i for (i, key) in enumerate(pmhc_list)}
output_name_list = [o.name for o in outputs]
output_name_index_dict = {output_name: i for i, output_name in enumerate(output_name_list)}
n_outputs = len(output_name_list)
output_is_qual_dict = {
output_name: any([(substr in output_name) for substr in ("IC50", "EC50", "half life")])
for output_name in output_name_list
}
print(output_is_qual_dict)
sums = np.zeros((n_unique_pmhc, n_outputs), dtype="float32")
counts = np.zeros_like(sums, dtype="int32")
for (output_name, peptide, mhc, qual, meas) in zip(
df_subset.output_name, df_subset.peptide, df_subset.mhc,
df_subset.qual, df_subset.meas):
row_idx = pmhc_index_dict[(peptide, mhc)]
col_idx = output_name_index_dict[output_name]
counts[row_idx, col_idx] += 1
if output_is_qual_dict[output_name]:
sums[row_idx, col_idx] += np.log(1 + meas)
else:
sums[row_idx, col_idx] += qual.startswith("Positive")
averages = sums / counts
for name, col_idx in output_name_index_dict.items():
if output_is_qual_dict[name]:
averages[:, col_idx] = averages[:, col_idx] > 0.5
else:
averages[:, col_idx] = np.exp(averages[:, col_idx]) - 1
averages[counts == 0] = np.nan
In [48]:
normalized_mhc_names = [mhc_name.replace("/", "-").replace("HLA-DRB", "DRB") for mhc_name in mhc_names]
mhc_inputs = [mhc_pseudosequences[mhc_name] for mhc_name in normalized_mhc_names]
In [ ]:
predictor.fit({"peptide": peptides, "mhc": mhc_inputs}, averages, epochs=2)
In [ ]:
In [22]:
!head pseudosequences.dat | grep DRB
In [ ]: