This notebook is intended to keep track of continual results of the various V3 tropism analyses that we plan to publish. This includes (but is not limited too): differences in clinical parameters, LTR SNPs, cytokine profiles, etc. This script auto-pulls from the Google-spreadsheet that Greg/Kyle/etc are putting data into so it should be able to auto-update as they complete sequencing.
In [7]:
from __future__ import division
import numpy as np
from scipy.stats import linregress
import matplotlib.pyplot as plt
from matplotlib import dates
import pandas as pd
import gspread
from StringIO import StringIO
import csv
import sys
sys.path.append('/home/will/PatientPicker/')
sys.path.append('/home/will/PySeqUtils/')
import LoadingTools
from GeneralSeqTools import fasta_reader, fasta_writer, seq_align_to_ref
base_path = '/home/will/Dropbox/Wigdahl HIV Lab/V3Progress/'
In [8]:
def decide_tropism(inval):
if inval < -6.95:
return 'R5'
elif inval > -2.88:
return 'X4'
elif inval < -4.92:
return 'R5-P'
elif inval >= -4.92:
return 'X4-P'
return np.nan
with open('/home/will/IpythonNotebook/secret.txt') as handle:
line = handle.next()
login, pword = line.split('\t')
gc = gspread.login(login, pword)
spread = gc.open("V3 Project")
worksheet = spread.worksheet('PBMC Progress Report')
handle = StringIO()
writer = csv.writer(handle, delimiter = '\t')
rows = worksheet.get_all_values()
writer.writerow(rows[0])
for row in rows[1:]:
if row[0].startswith('A'):
try:
writer.writerow(row)
except UnicodeEncodeError:
print row
handle.seek(0)
df = pd.read_csv(handle, sep = '\t', parse_dates = [5])
df['HasSeq'] = df['V3 Amino Acid Sequence'].notnull()
df['Date'] = df['Date'].map(pd.to_datetime)
df['Date'][df['Date'] == 'nan'] = np.nan
df['Prediction'] = df['PSSM Score'].map(decide_tropism)
df['Prediction'][df['PCR'].notnull()] = df['Prediction'][df['PCR'].notnull()].fillna('Attempted')
In [3]:
df
Out[3]:
In [4]:
from datetime import timedelta
tdf = df.groupby(['Patient', 'Visit'], as_index=False).first()
date_based = pd.pivot_table(tdf[tdf['Date'].notnull()], rows = 'Date',
cols = 'Prediction',
values = 'Patient',
aggfunc = 'count')
date_cum = pd.expanding_sum(date_based)['2013-10':]
date_cum['Total'] = date_cum.sum(axis=1)
td = date_cum[['Total']].reset_index()
td['dDate'] = (td['Date'] - pd.to_datetime('2013-7-1')).apply(lambda x: x / timedelta64(1, 'D'))
m, b, r, p, e = linregress(td['dDate'], td['Total'])
num_days = (len(tdf)-b)/m
nd = pd.DataFrame({
'Date':pd.date_range(start = '2013-7-1',
freq = 'M',
periods = np.ceil(num_days/30))
})
nd['dDate'] = (nd['Date'] - pd.to_datetime('2013-7-1')).apply(lambda x: x / timedelta64(1, 'D'))
nd['GuessNum'] = m*nd['dDate'] + b
nd = nd.set_index('Date')
pdata = nd['GuessNum'][nd['GuessNum']<len(tdf)]
In [5]:
from operator import itemgetter
pssm_bins = [-15.0, -13.0, -11.0, -9.0, -6.96, -4.92, -2.88, 1]
pssm_names = ['R5-4', 'R5-3', 'R5-2', 'R5-1', 'R5-P', 'X4-P', 'X4-R']
tdf['PSSMCluster'] = pd.Series(np.digitize(tdf['PSSM Score'].values.astype(float), pssm_bins).astype(float),
index=tdf.index)
tdf['PSSMCluster'][tdf['PSSM Score'].isnull()] = np.nan
rdict = dict(enumerate(pssm_names,1))
tdf['PSSMCluster'] = tdf['PSSMCluster'].map(lambda x: rdict.get(x, np.nan))
pd.crosstab(tdf['PSSMCluster'], tdf['Prediction'])
Out[5]:
In [6]:
bar_cols = ['X4', 'X4-P', 'R5', 'R5-P', 'Attempted']
colors = 'rmbcg'
fig, axs = plt.subplots(2, 1, figsize = (8,8))
for ax in axs.flatten():
bottoms = np.zeros_like(date_cum['X4'].values)
for col, c in zip(bar_cols, list(colors)):
ax.bar(list(date_cum.index), date_cum[col].values, color = c, width=5, bottom = bottoms)
bottoms += date_cum[col].values
if ax.is_first_row():
ax.legend(bar_cols, 'lower right')
if ax.is_first_row():
ldate = date_cum.index[-1]+timedelta(days=60)
tmp = pdata[pd.to_datetime('2013-9'):ldate]
ax.plot(tmp.index, tmp.values)
else:
ax.plot(pdata.index, pdata.values)
plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)
ax.set_ylabel('Samples')
fig.tight_layout()
plt.savefig(base_path+'SeqProgress.png')
This figure shows the cumulative number of sequences that we've typed so far. The blue trendline shows the projected rate of completion.
In [6]:
pat_data = LoadingTools.load_redcap_data().groupby(['Patient ID', 'VisitNum']).first()
tdf.set_index(['Patient', 'Visit'], inplace=True)
In [7]:
pat_data['Tropism'] = np.nan
_, trops = pat_data['Tropism'].align(tdf['Prediction'], join = 'left')
pat_data['Tropism'] = trops.replace('Attempted', np.nan)
pat_data['PSSMCluster'] = np.nan
_, pat_data['PSSMCluster'] = pat_data['PSSMCluster'].align(tdf['PSSMCluster'], join = 'left')
In [8]:
def safe_time_min(inser):
if inser.isnull().all():
return pd.NaT
else:
return inser.dropna().min()
qc_cols = [('Race-Asian','min'),
('Race-Indian','min'),
('Race-Black','min'),
('Race-Hawaiian','min'),
('Race-White','min'),
('Race-Multiple','min'),
('Race-Unknown','min'),
('HIV Seropositive Date', 'min'),
('Nadir CD4 count (cells/uL)', 'min'),
('Nadir CD8 count (cells/uL)', 'min'),
('Peak viral load (copies/mL)', 'max')]
date_cols = ['HIV Seropositive Date',
'Date Of Visit',
'Date of initial CD4 count',
'Date of nadir CD4 count',
'Date of latest CD4 count',
'Date of initial CD8 count',
'Date of nadir CD8 count',
'Date of latest CD8 count',
'Date of initial viral load',
'Date of peak viral load',
'Date of latest viral load']
for col in date_cols:
pat_data[col] = pd.to_datetime(pat_data[col], coerce = True)
for col, func in qc_cols:
pat_data[col] = pat_data[col].groupby(level = 'Patient ID').transform(func)
In [9]:
type(pat_data['Date Of Visit'].values[0])
Out[9]:
In [10]:
def to_days(ival):
return ival/np.timedelta64(1, 'D')
date_check_cols = [('Latest viral load', 'Date of latest viral load'),
('Latest CD4 count (cells/uL)', 'Date of latest CD4 count'),
('Latest CD8 count (cells/uL)', 'Date of latest CD8 count')]
cutoff = 90
for col, dcol in date_check_cols:
date_deltas = (pat_data['Date Of Visit'] - pat_data[dcol]).map(to_days).abs()
mask = date_deltas<cutoff
pat_data['Close ' + col] = np.nan
pat_data['Close ' + col][mask] = pat_data[col][mask]
In [11]:
def to_years(ival):
return ival/np.timedelta64(1, 'D')/365
pat_data['YearsSero'] = (pat_data['Date Of Visit'] - pat_data['HIV Seropositive Date']).apply(to_years)
log_cols = [('Latest viral load', 'Log-Latest-VL'),
('Peak viral load (copies/mL)', 'Log-Peak-VL'),
('Close Latest viral load', 'Close-Log-Latest-VL'),
]
for orig, new in log_cols:
pat_data[new] = pat_data[orig].map(np.log10)
In [12]:
pat_data['Tropism'].value_counts()
Out[12]:
In [13]:
from statsmodels.graphics.api import violinplot
from scipy.stats import ttest_ind, kruskal
from statsmodels.stats.power import tt_ind_solve_power
def generate_violion_plots(plot_col, group_col, group_order, ax):
boxes = []
mus = []
stds = []
g_order = []
for group in group_order:
mask = group_col == group
tmp = plot_col[mask].dropna()
if len(tmp) > 2:
g_order.append(group)
boxes.append(tmp.copy().values)
mus.append(plot_col[mask].mean())
stds.append(plot_col[mask].std())
if len(boxes) == 2:
ef = abs(np.diff(mus))/(np.sum(stds))
ratio = len(boxes[1])/len(boxes[0])
n0 = tt_ind_solve_power(effect_size=ef, alpha = alpha, power = power, ratio = ratio)
sizes = [str(int(n0)), str(int(n0*ratio))]
_, pval = ttest_ind(*boxes)
else:
sizes = ['']*len(boxes)
_, pval = kruskal(*boxes)
labels = ['%s n=%i/%s' % (t, len(b), n) for t, b, n in zip(g_order, boxes, sizes)]
violinplot(boxes, ax = ax, labels = labels)
return pval, ax
In [14]:
checks = [('VL', 'Log-Latest-VL'),
('Close-VL', 'Close-Log-Latest-VL'),
('Peak-VL', 'Log-Peak-VL'),
('CD4', 'Latest CD4 count (cells/uL)'),
('Close-CD4', 'Close Latest CD4 count (cells/uL)'),
('Nadir-CD4', 'Nadir CD4 count (cells/uL)'),
('CD8', 'Latest CD8 count (cells/uL)'),
('Close-CD8', 'Close Latest CD8 count (cells/uL)'),
('Nadir-CD8', 'Nadir CD8 count (cells/uL)'),
('TMHDS', 'TMHDS'),
('Years-Sero', 'YearsSero')]
alpha = 0.05
power = 0.8
trops = ['X4', 'R5']
fig, axs = plt.subplots(4,3, figsize = (10,10))
for (name, col), ax in zip(checks, axs.flatten()):
pval, ax = generate_violion_plots(pat_data[col],
pat_data['Tropism'],
trops, ax)
ax.set_title(name + ' pval:%f' % pval)
ax.set_ylim([0, ax.get_ylim()[1]])
plt.tight_layout()
plt.savefig(base_path + 'uncorrected_clinical_params.png')
This figure shows the difference in the X4 vs R5 population for the entire sequenced cohort. This was done at the 'sample level'. We can see a clear difference in Nadir-CD4, Close-CD4 (cd4 within 90 days of visit) and Latest CD4. However, there are numerous confounders in this analysis so I choose a subset of R5 patients such that the two groups are matched for Age, gender, race, etc.
In [15]:
fig, axs = plt.subplots(4,3, figsize = (15,15))
for (name, col), ax in zip(checks, axs.flatten()):
pval, ax = generate_violion_plots(pat_data[col],
pat_data['PSSMCluster'],
pssm_names, ax)
ax.set_title(name + ' pval:%f' % pval)
ax.set_ylim([0, ax.get_ylim()[1]])
plt.setp(ax.xaxis.get_majorticklabels(), rotation=70 )
plt.tight_layout()
plt.savefig(base_path + 'uncorrected_clinical_params_MULTI_R5.png')
In [16]:
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import normalize
pat_data['IsMale'] = pat_data['Gender'] == 'Male'
control_cols = ['Age',
'IsMale',
'Race-Asian',
'Race-Indian',
'Race-Black',
'Race-Hawaiian',
'Race-White',
'Race-Multiple',
'HAART-Naive',
'HAART-Non-Adherent',
'HAART-Off',
'HAART-On',
'YearsSero']
r5_mask = pat_data['Tropism'] == 'R5'
x4_mask = pat_data['Tropism'] == 'X4'
r5_data = pat_data[r5_mask][control_cols].dropna()
x4_data = pat_data[x4_mask][control_cols].dropna()
dists = euclidean_distances(normalize(r5_data.values.astype(float)), normalize(x4_data.values.astype(float)))
In [17]:
def assign_best(dists):
valid_dists = dists.copy()
out = []
for col in range(valid_dists.shape[1]):
pos = valid_dists[:, col].argmin()
out.append(pos)
valid_dists[pos, :] = np.inf
return np.array(out)
def match_items(small_set, large_set):
small_data = normalize(small_set.values.astype(float))
large_data = normalize(large_set.values.astype(float))
dists = euclidean_distances(large_data, small_data)
large_locs = set(assign_best(dists))
mask = pd.Series([i in large_locs for i in range(len(large_set.index))],
index = large_set.index)
return small_set.index, large_set[mask].index
x4_inds, r5_inds = match_items(x4_data, r5_data)
In [18]:
fig, axs = plt.subplots(4,3, figsize = (10,10))
pat_data['Keep'] = False
pat_data['Keep'][x4_inds] = True
pat_data['Keep'][r5_inds] = True
keep_mask = pat_data['Keep']
for (name, col), ax in zip(checks, axs.flatten()):
pval, ax = generate_violion_plots(pat_data[col][keep_mask],
pat_data['Tropism'][keep_mask],
trops, ax)
ax.set_title(name + ' pval:%f' % pval)
ax.set_ylim([0, ax.get_ylim()[1]])
plt.tight_layout()
plt.savefig(base_path + 'matched_clinical_params.png')
Here is the data for the matched cohorts. We still see a strong effect on the CD4.
In [19]:
fig, axs = plt.subplots(4,3, figsize = (15,15))
for (name, col), ax in zip(checks, axs.flatten()):
pval, ax = generate_violion_plots(pat_data[col][keep_mask],
pat_data['PSSMCluster'][keep_mask],
pssm_names, ax)
ax.set_title(name + ' pval:%f' % pval)
ax.set_ylim([0, ax.get_ylim()[1]])
plt.setp(ax.xaxis.get_majorticklabels(), rotation=70 )
plt.tight_layout()
plt.savefig(base_path + 'matched_clinical_params_MULTI_R5.png')
In [20]:
from sklearn.covariance import EllipticEnvelope
cytos = sorted(['IL.8','VEGF','IL.1beta',
'G.CSF','EGF','IL.10','HGF',
'FGF.basic','IFN.alpha','IL.6',
'IL.12','Rantes','Eotaxin',
'GM.CSF','MIP.1beta',
'MCP.1','IL.5','IL.13', 'IFN.gamma','TNF.alpha',
'IL.RA','IL.2','IL.7','IP.10',
'IL.2R','MIG','IL.4','IL.15',
'IL.17','MIP.1alpha']) + ['Th1', 'Th2']
cyto_data_raw = pd.read_csv('/home/will/HIVSystemsBio/NewCytokineAnalysis/CytoRawData.csv', sep = '\t')
cyto_data_raw['Th1'] = cyto_data_raw['IFN.gamma'] + \
cyto_data_raw['IL.2']+cyto_data_raw['TNF.alpha']
cyto_data_raw['Th2'] = cyto_data_raw['IL.4'] + \
cyto_data_raw['IL.5']+cyto_data_raw['IL.10']
In [21]:
cyto_data = cyto_data_raw.groupby(['Patient ID', 'VisitNum']).mean()
tranfer_cols = ['Log-Latest-VL',
'Tropism',
'Keep',
'IsMale',
'Race-Black',
'Age',
'HAART-Naive',
'HAART-Non-Adherent',
'HAART-Off',
'HAART-On',
'Hepatitis C status (HCV)']
for col in tranfer_cols:
_, cyto_data[col] = cyto_data.align(pat_data[col], join='left', axis = 0)
cyto_data['HCV'] = cyto_data['Hepatitis C status (HCV)']
In [22]:
for col in cytos:
env = EllipticEnvelope(contamination=0.05)
env.fit(cyto_data[col].dropna().values.reshape(-1, 1))
mask = env.predict(cyto_data[col].values.reshape(-1,1))
cyto_data[col][mask==-1] = np.nan
In [23]:
fig, axs = plt.subplots(11,3, figsize = (10,20))
for ax, col in zip(axs.flatten(), cytos):
boxes = []
mus = []
stds = []
for trop in trops:
mask = cyto_data['Tropism'] == trop
#mask &= cyto_data['Keep']
boxes.append(cyto_data[col][mask].dropna().values)
mus.append(cyto_data[col][mask].mean())
stds.append(cyto_data[col][mask].std())
ef = abs(np.diff(mus))/(np.sum(stds))
ratio = len(boxes[1])/len(boxes[0])
n0 = tt_ind_solve_power(effect_size=ef, alpha = alpha, power = power, ratio = ratio)
sizes = [n0, n0*ratio]
_, pval = ttest_ind(*boxes)
labels = ['%s n=%i/%i' % (t, len(b), n) for t, b, n in zip(trops, boxes, sizes)]
violinplot(boxes, ax = ax, labels = labels)
#
ax.set_title(col + ' pval:%f' % pval)
ax.set_ylim([0, ax.get_ylim()[1]])
plt.tight_layout()
plt.savefig(base_path+'uncorrected_cyto_data.png')
In [24]:
fig, axs = plt.subplots(11,3, figsize = (10,20))
for ax, col in zip(axs.flatten(), cytos):
boxes = []
mus = []
stds = []
for trop in trops:
mask = cyto_data['Tropism'] == trop
mask &= cyto_data['Keep']
boxes.append(cyto_data[col][mask].dropna().values)
mus.append(cyto_data[col][mask].mean())
stds.append(cyto_data[col][mask].std())
ef = abs(np.diff(mus))/(np.sum(stds))
ratio = len(boxes[1])/len(boxes[0])
n0 = tt_ind_solve_power(effect_size=ef, alpha = alpha, power = power, ratio = ratio)
sizes = [n0, n0*ratio]
_, pval = ttest_ind(*boxes)
labels = ['%s n=%i/%i' % (t, len(b), n) for t, b, n in zip(trops, boxes, sizes)]
violinplot(boxes, ax = ax, labels = labels)
#
ax.set_title(col + ' pval:%f' % pval)
ax.set_ylim([0, ax.get_ylim()[1]])
plt.tight_layout()
plt.savefig(base_path+'matched_cyto_data.png')
In [25]:
import statsmodels.api as sm
con_cols = ['Log-Latest-VL',
'IsR5',
'IsMale',
'Age',
'HAART-Naive',
'HAART-Off',
'HAART-On',
'HCV']
cyto_data['IsR5'] = 1.0
cyto_data['IsR5'][cyto_data['Tropism'].isnull()] = np.nan
cyto_data['IsR5'][cyto_data['Tropism']=='X4'] = 0.0
aa_mask = cyto_data['Race-Black'] == True
fig, axs = plt.subplots(11,3, figsize = (10,20))
for ax, col in zip(axs.flatten(), cytos):
tdata = cyto_data[con_cols + [col]][aa_mask].dropna()
res = sm.GLM(tdata[col],tdata[con_cols].astype(float)).fit()
pval = res.pvalues['IsR5']
boxes = []
mus = []
stds = []
for trop in trops:
mask = cyto_data['Tropism'] == trop
mask &= aa_mask
boxes.append(cyto_data[col][mask].dropna().values)
mus.append(cyto_data[col][mask].mean())
stds.append(cyto_data[col][mask].std())
ef = abs(np.diff(mus))/(np.sum(stds))
ratio = len(boxes[1])/len(boxes[0])
n0 = tt_ind_solve_power(effect_size=ef, alpha = alpha, power = power, ratio = ratio)
sizes = [n0, n0*ratio]
labels = ['%s n=%i/%i' % (t, len(b), n) for t, b, n in zip(trops, boxes, sizes)]
violinplot(boxes, ax = ax, labels = labels)
#
ax.set_title(col + ' pval:%f' % pval)
ax.set_ylim([0, ax.get_ylim()[1]])
plt.tight_layout()
plt.savefig(base_path + 'corrected_cyto_data.png')
In [26]:
import glob
ltr_files = sorted(glob.glob('/home/will/HIVReportGen/Data/PatientFasta/*LTR.fasta'))
ltr_seqs = []
for f in ltr_files:
with open(f) as handle:
ltr_seqs += list(fasta_reader(handle))
print len(ltr_seqs)
In [27]:
conb_ltr = """TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAA
GGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTACAAGC
TAGTACCAGTTGAGCCAGAGAAGTTAGAAGAAGCCAACAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCA
TGGAATGGATGACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACATGGCCCGAGAG
CTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGC
GTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATCCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTC
TCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCC
TTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTG
TGGAAAATCTCT""".replace('\n', '')
ltr_align = list(seq_align_to_ref(ltr_seqs, conb_ltr, max_workers = 20))
In [28]:
ltr_bin_align = []
align_inds = []
for key, seq in ltr_align:
align_inds.append(tuple(key.split('-')))
hit_first = False
bins = []
for g, cb in zip(seq, conb_ltr):
if ~hit_first and (g == '-'):
bins.append(np.nan)
continue
hit_first = True
bins.append(1.0 if g==cb else 0.0)
ltr_bin_align.append(np.array(bins))
ltr_bin_array = np.array(ltr_bin_align)
In [29]:
columns = ['ConB-%03i' % i for i in range(1, len(conb_ltr)+1)]
seq_df = pd.DataFrame(ltr_bin_array,
index = pd.MultiIndex.from_tuples(align_inds,
names = ['Patient', 'Visit']),
columns = columns)
In [30]:
from scipy.stats import fisher_exact
_, seq_df['Tropism'] = seq_df.align(tdf['Prediction'],
join = 'left',
axis = 0)
r5_mask = seq_df['Tropism'] == 'R5'
x4_mask = seq_df['Tropism'] == 'X4'
In [31]:
pvals = []
for col in columns:
r5_col = seq_df[col][r5_mask]
x4_col = seq_df[col][x4_mask]
if (r5_col.isnull().sum()>5) and (x4_col.isnull().sum()>5):
f_table = [[(r5_col == 1).sum(), (x4_col == 1).sum()],
[(r5_col == 0).sum(), (x4_col == 0).sum()]]
_, pval = fisher_exact(f_table)
pvals.append(pval)
else:
pvals.append(np.nan)
In [32]:
ltr_features = pd.read_excel(base_path+'LTR_features.xlsx', 'Sheet1')
ltr_features
Out[32]:
In [33]:
bottoms = 3.0+(ltr_features['Column']*0.5)
heights = 0.5*np.ones_like(ltr_features['EndPos'].values)
widths = ltr_features['EndPos'] - ltr_features['StartPos']
lefts = ltr_features['StartPos']
colors= ltr_features['Color'].values
In [34]:
from statsmodels.sandbox.stats.multicomp import multipletests
apvals = np.array(pvals)
tmask = ~np.isnan(apvals) & (apvals < 1)
reject, adj_pvals, _, _ = multipletests(apvals[tmask], 0.1, 'fdr_bh')
areject = np.zeros_like(apvals)
areject[tmask] = reject
fig, ax = plt.subplots(figsize = (10,5))
ax.plot(-np.log10(pvals))
for num, (col, pval, mc) in enumerate(zip(columns, pvals, areject.flatten())):
if pval < (0.05):
label = '%s p=%f' % (col, pval)
if mc:
label += '*'
ax.annotate(label, (num, -np.log10(pval)))
rects = ax.bar(lefts.values, heights, width=widths.values, bottom=bottoms, color = list(colors))
for left, bottom, label in zip(lefts, bottoms, ltr_features['Name']):
ax.annotate(label, (left+5, bottom+0.25),
rotation=70,
xytext = (left, 7.5),
ha = 'left',
arrowprops = {'width':1, 'headwidth':1})
plt.savefig(base_path + 'snp_figure.png')
In [35]:
from Bio.Seq import Seq
from Bio import Motif
from StringIO import StringIO
from itertools import groupby
from operator import methodcaller
def yield_motifs():
with open('/home/will/LTRtfAnalysis/Jaspar_PWMs.txt') as handle:
for key, lines in groupby(handle, methodcaller('startswith', '>')):
if key:
name = lines.next().strip().split()[-1].lower()
else:
tmp = ''.join(lines)
mot = Motif.read(StringIO(tmp), 'jaspar-pfm')
yield name, mot
yield name+'-R', mot.reverse_complement()
pwm_dict = {}
for num, (name, mot) in enumerate(yield_motifs()):
if num % 100 == 0:
print num
pwm_dict[name] = mot
tmp = u"""A 0 0 6 1 0 0 0 4 2 2 0 0 3
C 1 1 1 0 5 6 4 1 0 0 0 3 5 5 4 0
G 0 6 0 1 1 0 0 0 0 7 1 1 0 0 1 0
T 6 0 0 0 1 1 3 5 7 0 0 0 0 2 2 4"""
pwm_dict['coup2'] = Motif.read(StringIO(tmp), 'jaspar-pfm')
pwm_dict['coup2-R'] = Motif.read(StringIO(tmp), 'jaspar-pfm').reverse_complement()
In [36]:
from Bio.Alphabet import IUPAC
def score_seq(seq, mot):
bseq = Seq(seq, alphabet=IUPAC.unambiguous_dna)
scores = mot.scanPWM(bseq)
for pos, score in enumerate(scores.flatten(),1):
if ~np.isnan(score):
tseq = seq[pos:pos+len(mot)]
yield pos, tseq, score
wanted_mots = ['ap1', 'ap1-R',
'cebpa', 'cebpa-R',
'creb1', 'creb1-R',
'coup2', 'coup2-R',
'ets1','ets1-R',
#'fev', 'fev-R',
'foxc1', 'foxc1-R',
#'gata2', 'gata2-R',
#'gata3', 'gata3-R',
#'hnf4a', 'hnf4a-R',
#'hoxa5', 'hoxa5-R',
'nf-kappab','nf-kappab-R',
'nfatc2', 'nfatc2-R',
'nr2f1','nr2f1-R',
#'tfap2a', 'tfap2a-R',
#'znf354c','znf354c-R',
'nf-kappab', 'nf-kappab-R',
'sp1', 'sp1-R']
big_res = []
for mot_name in wanted_mots:
print mot_name
mot = pwm_dict[mot_name]
thresh = Motif.Thresholds.ScoreDistribution(mot, precision = 50).threshold_fpr(0.01)
for name, seq in ltr_align:
pid, vnum = name.split('-')
for pos, tf_seq, score in score_seq(seq, mot):
big_res.append({
'Patient':pid,
'Visit':vnum,
'TF':mot_name,
'PosSeq':tf_seq,
'Score':score,
'ConBPos':pos
})
tf_df = pd.DataFrame(big_res)
In [37]:
def process_vals(inser):
return pd.rolling_max(inser, 8, min_periods=1)
gkey = ['Patient', 'Visit', 'TF']
tf_df['MaxScore'] = tf_df.groupby(gkey)['Score'].transform(process_vals)
In [38]:
tf_pivot = pd.pivot_table(tf_df,
rows = ['Patient', 'Visit'],
cols = ['TF', 'ConBPos'],
values = ['MaxScore', 'PosSeq'],
aggfunc = 'first')
In [39]:
def calc_adjust(col, bind_data):
confounder_cols = ['Age',
'IsMale',
'Race-Black',
'Race-White',
'HAART-Naive',
'HAART-Non-Adherent',
'HAART-Off',
'HAART-On',
'YearsSero']
condata = pat_data[sorted(set(confounder_cols+[col]))].dropna()
condata, bondata = condata.align(bind_data, join = 'inner', axis = 0)
sumres = sm.GLM(bondata, condata.astype(float)).fit()
return sumres.pvalues[col]
pos_scores = tf_pivot['MaxScore']
results = []
for mot_name in wanted_mots:
print mot_name
score_data = pos_scores[mot_name]
for pat_col_name, pat_col in checks:
clin_data = pat_data[pat_col].dropna()
for col in score_data.columns:
bind_data = score_data[col].dropna().astype(float)
if len(bind_data) < 10:
continue
bdata, cdata = bind_data.align(clin_data, join = 'inner')
if (len(bdata) > 100):
m, b, r, p, _ = linregress(bdata.values, cdata.values)
results.append({
'Motif':mot_name,
'ClinicalVal':pat_col_name,
'Slope':m,
'Intercept':b,
'R2': r**2,
'pval':p,
'ConBCol':col,
'N':len(bdata),
'AdjPval':calc_adjust(pat_col, bind_data)
})
In [40]:
cor_results = pd.DataFrame(results)
nres = pd.pivot_table(cor_results,
rows = ['Motif', 'ConBCol'],
cols = 'ClinicalVal',
values = ['pval', 'Slope', 'R2', 'N', 'AdjPval'],
aggfunc = 'first')
In [41]:
all_pvals = np.array([d['pval'] for d in results if d['N'] > 200])
reject, adj_pvals, _, _ = multipletests(all_pvals, 0.2, 'fdr_bh')
cutoff = all_pvals[reject].max()
print cutoff
lcutoff = -np.log10(cutoff)
In [42]:
cols = [col for _, col in checks]
ncols = pd.Index([col for col, _ in checks])
small_pat_data = pat_data[cols]
small_pat_data.columns = ncols
tf_pat_data = pd.merge(tf_df,
small_pat_data,
left_on = ['Patient', 'Visit'],
right_index = True,
how = 'inner')
In [43]:
from types import TupleType
crazy_big = pd.merge(tf_pat_data,
nres,
left_on = ['TF', 'ConBPos'],
right_index = True,
how = 'inner')
ncols = []
vcols = set(c for c, _ in checks)
for col in crazy_big.columns:
if (type(col) is TupleType) and (col[1] in vcols):
ncols.append((col[-1], col[0]))
elif (type(col) is TupleType):
print col
else:
ncols.append(('APatient', col))
crazy_big.columns = pd.MultiIndex.from_tuples(ncols, names = ['Value', 'Analysis'])
crazy_big.sortlevel(axis=1, inplace=True)
In [44]:
print crazy_big.head().T.to_string()
In [45]:
pmask = crazy_big.xs('pval', level = 'Analysis', axis=1) <= cutoff*10
pmask &= crazy_big.xs('AdjPval', level = 'Analysis', axis=1) <= cutoff*10
rmask = (crazy_big.xs('R2', level = 'Analysis', axis=1) > 0.05)
nmask = (crazy_big.xs('N', level = 'Analysis', axis=1) >= 200)
wanted_mask = (rmask & pmask & nmask).any(axis = 1)
wdata = crazy_big[wanted_mask]
wdata
Out[45]:
In [46]:
def coerce_cols(tup):
try:
return '--'.join(tup)
except TypeError:
return coerce_cols(tup[0])
gkey = [('APatient', 'TF'), ('APatient', 'ConBPos'), ('APatient', 'PosSeq')]
grouped_data = wdata.groupby(gkey).agg(['mean', 'count'])
In [47]:
drop_cols = [tup for tup in grouped_data.columns if (tup[-1] == 'count') and (tup[0] != 'APatient')]
out_data = grouped_data.drop(drop_cols, axis=1)
for cname, _ in checks:
mask = (out_data[cname]['pval']['mean'] > 0.1) & (out_data[cname]['AdjPval']['mean'] > 0.1)
tmask = np.tile(mask.values, (len(out_data[cname].columns),1))
out_data[cname] = out_data[cname].where(np.transpose(tmask))
out_data.reset_index(inplace=True)
out_data.columns = [coerce_cols(tup) for tup in out_data.columns]
out_data.to_excel(base_path+'TFbinding.xlsx', index=False)
In [48]:
from itertools import product
clin_vals = sorted(cor_results['ClinicalVal'].unique())
motifs = sorted(cor_results['Motif'].unique())
pos = range(1, 630)
nindex = pd.MultiIndex.from_tuples(list(product(clin_vals, motifs, pos)), names = ['ClinicalVal', 'Motif', 'ConBCol'])
In [49]:
cor_results.head()
Out[49]:
In [50]:
cor_results['LogP-signed'] = (-np.log10(cor_results['pval']))*np.sign(cor_results['Slope'])
plot_vals = cor_results.groupby(['ClinicalVal', 'Motif', 'ConBCol'])['LogP-signed'].first()
In [51]:
plot_vals.head()
Out[51]:
In [52]:
def make_annotation(ax, tf, positions, color):
if len(positions) == 1:
label = '%s-%i' % (tf, min(positions))
else:
label = '%s-[%i-%i]' % (tf, min(positions), max(positions))
ax.annotate(label,
(max(lpos), val),
textcoords = 'offset points',
xytext = (0, np.random.normal()*4),
color = color)
def simple_color(val):
if val > 0:
return 'b'
return 'r'
fig, axs = plt.subplots(4,3, figsize = (15, 10), sharex=True, sharey=True)
for ax, (col, _) in zip(axs.flatten(), checks):
vals = -nres['pval'][col].applymap(np.log10).fillna(0)
xpos = np.array([int(v.split('-')[1]) for v in vals.columns])
yvals = vals[(vals > 1).any(axis = 1)]
direct = nres['Slope'][col][(vals > 1).any(axis = 1)]
corr = nres['R2'][col][(vals > 1).any(axis = 1)]
ax.bar(left = xpos, height = direct.T.values, c = 'g', width = 1/len(direct.index))
raise KeyboardInterrupt
ax.set_ylim([0, 10])
for tf, row in yvals.iterrows():
lpos = []
for c, val in sorted(row[row>lcutoff].to_dict().items()):
if corr[c][tf] < 0.5:
continue
pos = int(c.split('-')[1])
color = 'b' if direct[c][tf] > 0 else 'r'
#print col, tf, c, val, corr[c][tf]
if (len(lpos)==0) or (lpos[-1]+5>pos):
lpos.append(pos)
else:
make_annotation(ax, tf, lpos, color)
lpos = []
if len(lpos)>0:
make_annotation(ax, tf, lpos, color)
ax.set_title(col)
if ax.is_first_col():
ax.set_ylabel('Slope')
if ax.is_last_row():
ax.set_ylabel('HXB2 Pos')
plt.tight_layout()
#plt.savefig(base_path+'tf_figure.png', dpi=200)
In [ ]:
In [ ]: