Continual V3 Progress Report

This notebook is intended to keep track of continual results of the various V3 tropism analyses that we plan to publish. This includes (but is not limited too): differences in clinical parameters, LTR SNPs, cytokine profiles, etc. This script auto-pulls from the Google-spreadsheet that Greg/Kyle/etc are putting data into so it should be able to auto-update as they complete sequencing.



In [7]:

    
from __future__ import division
import numpy as np
from scipy.stats import linregress
import matplotlib.pyplot as plt
from matplotlib import dates
import pandas as pd
import gspread
from StringIO import StringIO
import csv
import sys
sys.path.append('/home/will/PatientPicker/')
sys.path.append('/home/will/PySeqUtils/')
import LoadingTools
from GeneralSeqTools import fasta_reader, fasta_writer, seq_align_to_ref
base_path = '/home/will/Dropbox/Wigdahl HIV Lab/V3Progress/'

Pull out Google-Docs Data



In [8]:

    
def decide_tropism(inval):
    if inval < -6.95:
        return 'R5'
    elif inval > -2.88:
        return 'X4'
    elif inval < -4.92:
        return 'R5-P'
    elif inval >= -4.92:
        return 'X4-P'
    
    return np.nan

with open('/home/will/IpythonNotebook/secret.txt') as handle:
    line = handle.next()
    login, pword = line.split('\t')
gc = gspread.login(login, pword)
spread = gc.open("V3 Project")
worksheet = spread.worksheet('PBMC Progress Report')
handle = StringIO()
writer = csv.writer(handle, delimiter = '\t')
rows = worksheet.get_all_values()
writer.writerow(rows[0])
for row in rows[1:]:
    if row[0].startswith('A'):
        try:
            writer.writerow(row)
        except UnicodeEncodeError:
            print row
handle.seek(0)

df = pd.read_csv(handle, sep = '\t', parse_dates = [5])
df['HasSeq'] = df['V3 Amino Acid Sequence'].notnull()
df['Date'] = df['Date'].map(pd.to_datetime)
df['Date'][df['Date'] == 'nan'] = np.nan
df['Prediction'] = df['PSSM Score'].map(decide_tropism)
df['Prediction'][df['PCR'].notnull()] = df['Prediction'][df['PCR'].notnull()].fillna('Attempted')



In [3]:

    
df









    Out[3]:




<class 'pandas.core.frame.DataFrame'>
Int64Index: 1461 entries, 0 to 1460
Data columns (total 13 columns):
Patient                   1461  non-null values
Visit                     1461  non-null values
Priority                  117  non-null values
PCR                       441  non-null values
Tracking Number           117  non-null values
Date                      227  non-null values
Prediction                441  non-null values
PSSM Score                197  non-null values
Percentile                197  non-null values
V3 Amino Acid Sequence    197  non-null values
Length                    1459  non-null values
Notes                     151  non-null values
HasSeq                    1461  non-null values
dtypes: bool(1), datetime64[ns](1), float64(4), object(7)

Generate a Linear Regression of the data



In [4]:

    
from datetime import timedelta
tdf = df.groupby(['Patient', 'Visit'], as_index=False).first()

date_based = pd.pivot_table(tdf[tdf['Date'].notnull()], rows = 'Date', 
                            cols = 'Prediction',
                            values = 'Patient',
                            aggfunc = 'count')
date_cum = pd.expanding_sum(date_based)['2013-10':]

date_cum['Total'] = date_cum.sum(axis=1)
td = date_cum[['Total']].reset_index()
td['dDate'] = (td['Date'] - pd.to_datetime('2013-7-1')).apply(lambda x: x / timedelta64(1, 'D'))

m, b, r, p, e = linregress(td['dDate'], td['Total'])
num_days = (len(tdf)-b)/m
nd = pd.DataFrame({
                       'Date':pd.date_range(start = '2013-7-1', 
                                            freq = 'M',
                                            periods = np.ceil(num_days/30))
                       })
nd['dDate'] = (nd['Date'] - pd.to_datetime('2013-7-1')).apply(lambda x: x / timedelta64(1, 'D'))
nd['GuessNum'] = m*nd['dDate'] + b
nd = nd.set_index('Date')
pdata = nd['GuessNum'][nd['GuessNum']<len(tdf)]



In [5]:

    
from operator import itemgetter
pssm_bins = [-15.0, -13.0, -11.0,   -9.0,    -6.96, -4.92, -2.88, 1]
pssm_names = ['R5-4', 'R5-3', 'R5-2', 'R5-1', 'R5-P', 'X4-P', 'X4-R']
tdf['PSSMCluster'] = pd.Series(np.digitize(tdf['PSSM Score'].values.astype(float), pssm_bins).astype(float),
                               index=tdf.index)
tdf['PSSMCluster'][tdf['PSSM Score'].isnull()] = np.nan
rdict = dict(enumerate(pssm_names,1))
tdf['PSSMCluster'] = tdf['PSSMCluster'].map(lambda x: rdict.get(x, np.nan))
pd.crosstab(tdf['PSSMCluster'], tdf['Prediction'])









    Out[5]:






  
    
      Prediction
      Attempted
      R5
      R5-P
      X4
      X4-P
    
    
      PSSMCluster
      
      
      
      
      
    
  
  
    
      R5-1
       0
       12
        0
        0
        0
    
    
      R5-2
       0
       43
        0
        0
        0
    
    
      R5-3
       1
       49
        0
        0
        0
    
    
      R5-4
       0
        9
        0
        0
        0
    
    
      R5-P
       0
        0
       13
        0
        0
    
    
      X4-P
       0
        0
        0
        0
       14
    
    
      X4-R
       0
        0
        0
       13
        0



In [6]:

    
bar_cols = ['X4', 'X4-P', 'R5', 'R5-P', 'Attempted']
colors = 'rmbcg'
fig, axs = plt.subplots(2, 1, figsize = (8,8))
for ax in axs.flatten():

    bottoms = np.zeros_like(date_cum['X4'].values)
    for col, c in zip(bar_cols, list(colors)):
        ax.bar(list(date_cum.index), date_cum[col].values, color = c, width=5, bottom = bottoms)
        bottoms += date_cum[col].values
    
    if ax.is_first_row():
        ax.legend(bar_cols, 'lower right')
    
    if ax.is_first_row():
        ldate = date_cum.index[-1]+timedelta(days=60)
        tmp = pdata[pd.to_datetime('2013-9'):ldate]
        ax.plot(tmp.index, tmp.values)
    else:
        ax.plot(pdata.index, pdata.values)
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)
    ax.set_ylabel('Samples')
fig.tight_layout()
plt.savefig(base_path+'SeqProgress.png')

This figure shows the cumulative number of sequences that we've typed so far. The blue trendline shows the projected rate of completion.

Pull out Redcap Data



In [6]:

    
pat_data = LoadingTools.load_redcap_data().groupby(['Patient ID', 'VisitNum']).first()
tdf.set_index(['Patient', 'Visit'], inplace=True)



In [7]:

    
pat_data['Tropism'] = np.nan
_, trops = pat_data['Tropism'].align(tdf['Prediction'], join = 'left')
pat_data['Tropism'] = trops.replace('Attempted', np.nan)
pat_data['PSSMCluster'] = np.nan
_, pat_data['PSSMCluster'] = pat_data['PSSMCluster'].align(tdf['PSSMCluster'], join = 'left')



In [8]:

    
def safe_time_min(inser):
    
    if inser.isnull().all():
        return pd.NaT
    else:
        return inser.dropna().min()
    
qc_cols = [('Race-Asian','min'), 
           ('Race-Indian','min'),
           ('Race-Black','min'),
           ('Race-Hawaiian','min'),
           ('Race-White','min'),
           ('Race-Multiple','min'),
           ('Race-Unknown','min'),
           ('HIV Seropositive Date', 'min'),
           ('Nadir CD4 count (cells/uL)', 'min'),
           ('Nadir CD8 count (cells/uL)', 'min'),
           ('Peak viral load (copies/mL)', 'max')]

date_cols = ['HIV Seropositive Date',
             'Date Of Visit',
             'Date of initial CD4 count',
             'Date of nadir CD4 count',
             'Date of latest CD4 count',
             'Date of initial CD8 count',
             'Date of nadir CD8 count',
             'Date of latest CD8 count',
             'Date of initial viral load',
             'Date of peak viral load',
             'Date of latest viral load']

for col in date_cols:
    pat_data[col] = pd.to_datetime(pat_data[col], coerce = True)

for col, func in qc_cols:
    pat_data[col] = pat_data[col].groupby(level = 'Patient ID').transform(func)



In [9]:

    
type(pat_data['Date Of Visit'].values[0])









    Out[9]:





numpy.datetime64



In [10]:

    
def to_days(ival):
    return ival/np.timedelta64(1, 'D')

date_check_cols = [('Latest viral load', 'Date of latest viral load'),
                   ('Latest CD4 count (cells/uL)', 'Date of latest CD4 count'),
                   ('Latest CD8 count (cells/uL)', 'Date of latest CD8 count')]
cutoff = 90
for col, dcol in date_check_cols:
    date_deltas = (pat_data['Date Of Visit'] - pat_data[dcol]).map(to_days).abs()
    mask = date_deltas<cutoff
    pat_data['Close ' + col] = np.nan
    pat_data['Close ' + col][mask] = pat_data[col][mask]



In [11]:

    
def to_years(ival):
    return ival/np.timedelta64(1, 'D')/365

pat_data['YearsSero'] = (pat_data['Date Of Visit'] - pat_data['HIV Seropositive Date']).apply(to_years)
log_cols = [('Latest viral load', 'Log-Latest-VL'),
            ('Peak viral load (copies/mL)', 'Log-Peak-VL'),
            ('Close Latest viral load', 'Close-Log-Latest-VL'),
            ]
for orig, new in log_cols:
    pat_data[new] = pat_data[orig].map(np.log10)



In [12]:

    
pat_data['Tropism'].value_counts()









    Out[12]:





R5      113
X4       37
X4-P     14
R5-P     13
dtype: int64



In [13]:

    
from statsmodels.graphics.api import violinplot
from scipy.stats import ttest_ind, kruskal
from statsmodels.stats.power import tt_ind_solve_power

def generate_violion_plots(plot_col, group_col, group_order, ax):
    
    boxes = []
    mus = []
    stds = []
    g_order = []
    for group in group_order:
        mask = group_col == group
        tmp = plot_col[mask].dropna()
        if len(tmp) > 2:
            g_order.append(group)
            boxes.append(tmp.copy().values)
            mus.append(plot_col[mask].mean())
            stds.append(plot_col[mask].std())
        
    if len(boxes) == 2:
        ef = abs(np.diff(mus))/(np.sum(stds))
        ratio = len(boxes[1])/len(boxes[0])
        n0 = tt_ind_solve_power(effect_size=ef, alpha = alpha, power = power, ratio = ratio)
        sizes = [str(int(n0)), str(int(n0*ratio))]
        _, pval = ttest_ind(*boxes)
    else:
        sizes = ['']*len(boxes)
        _, pval = kruskal(*boxes)
    
    labels = ['%s n=%i/%s' % (t, len(b), n) for t, b, n in zip(g_order, boxes, sizes)]
    violinplot(boxes, ax = ax, labels = labels)
    return pval, ax



In [14]:

    
checks = [('VL', 'Log-Latest-VL'),
          ('Close-VL', 'Close-Log-Latest-VL'),
          ('Peak-VL', 'Log-Peak-VL'),
          ('CD4', 'Latest CD4 count (cells/uL)'),
          ('Close-CD4', 'Close Latest CD4 count (cells/uL)'),
          ('Nadir-CD4', 'Nadir CD4 count (cells/uL)'),
          ('CD8', 'Latest CD8 count (cells/uL)'),
          ('Close-CD8', 'Close Latest CD8 count (cells/uL)'),
          ('Nadir-CD8', 'Nadir CD8 count (cells/uL)'),
          ('TMHDS', 'TMHDS'),
          ('Years-Sero', 'YearsSero')]
alpha = 0.05
power = 0.8

trops = ['X4', 'R5']
fig, axs = plt.subplots(4,3, figsize = (10,10))
for (name, col), ax in zip(checks, axs.flatten()):
    
    pval, ax = generate_violion_plots(pat_data[col], 
                                      pat_data['Tropism'], 
                                      trops, ax)
    ax.set_title(name + ' pval:%f' % pval)
    ax.set_ylim([0, ax.get_ylim()[1]])

plt.tight_layout()
plt.savefig(base_path + 'uncorrected_clinical_params.png')

This figure shows the difference in the X4 vs R5 population for the entire sequenced cohort. This was done at the 'sample level'. We can see a clear difference in Nadir-CD4, Close-CD4 (cd4 within 90 days of visit) and Latest CD4. However, there are numerous confounders in this analysis so I choose a subset of R5 patients such that the two groups are matched for Age, gender, race, etc.



In [15]:

    
fig, axs = plt.subplots(4,3, figsize = (15,15))
for (name, col), ax in zip(checks, axs.flatten()):
    
    pval, ax = generate_violion_plots(pat_data[col], 
                                      pat_data['PSSMCluster'], 
                                      pssm_names, ax)
    ax.set_title(name + ' pval:%f' % pval)
    ax.set_ylim([0, ax.get_ylim()[1]])
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=70 )
plt.tight_layout()
plt.savefig(base_path + 'uncorrected_clinical_params_MULTI_R5.png')



In [16]:

    
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import normalize
pat_data['IsMale'] = pat_data['Gender'] == 'Male'

control_cols = ['Age', 
                'IsMale', 
                'Race-Asian',
                'Race-Indian',
                'Race-Black',
                'Race-Hawaiian',
                'Race-White',
                'Race-Multiple',
                'HAART-Naive',
                'HAART-Non-Adherent',
                'HAART-Off',
                'HAART-On',
                'YearsSero']

r5_mask = pat_data['Tropism'] == 'R5'
x4_mask = pat_data['Tropism'] == 'X4'

r5_data = pat_data[r5_mask][control_cols].dropna()
x4_data = pat_data[x4_mask][control_cols].dropna()

dists = euclidean_distances(normalize(r5_data.values.astype(float)), normalize(x4_data.values.astype(float)))



In [17]:

    
def assign_best(dists):
    
    valid_dists = dists.copy()
    out = []
    for col in range(valid_dists.shape[1]):
        pos = valid_dists[:, col].argmin()
        out.append(pos)
        valid_dists[pos, :] = np.inf
    return np.array(out)
    
def match_items(small_set, large_set):
    
    small_data = normalize(small_set.values.astype(float))
    large_data = normalize(large_set.values.astype(float))
    
    dists = euclidean_distances(large_data, small_data)
    large_locs = set(assign_best(dists))
    
    
    mask = pd.Series([i in large_locs for i in range(len(large_set.index))],
                     index = large_set.index)
    
    return small_set.index, large_set[mask].index
        
x4_inds, r5_inds = match_items(x4_data, r5_data)



In [18]:

    
fig, axs = plt.subplots(4,3, figsize = (10,10))
pat_data['Keep'] = False
pat_data['Keep'][x4_inds] = True
pat_data['Keep'][r5_inds] = True
    
keep_mask = pat_data['Keep']
for (name, col), ax in zip(checks, axs.flatten()):
    
    pval, ax = generate_violion_plots(pat_data[col][keep_mask], 
                                      pat_data['Tropism'][keep_mask], 
                                      trops, ax)
    ax.set_title(name + ' pval:%f' % pval)
    ax.set_ylim([0, ax.get_ylim()[1]])

plt.tight_layout()
plt.savefig(base_path + 'matched_clinical_params.png')

Here is the data for the matched cohorts. We still see a strong effect on the CD4.



In [19]:

    
fig, axs = plt.subplots(4,3, figsize = (15,15))
for (name, col), ax in zip(checks, axs.flatten()):
    
    pval, ax = generate_violion_plots(pat_data[col][keep_mask], 
                                      pat_data['PSSMCluster'][keep_mask], 
                                      pssm_names, ax)
    ax.set_title(name + ' pval:%f' % pval)
    ax.set_ylim([0, ax.get_ylim()[1]])
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=70 )
plt.tight_layout()
plt.savefig(base_path + 'matched_clinical_params_MULTI_R5.png')

Pull out the cytokine data



In [20]:

    
from sklearn.covariance import EllipticEnvelope
cytos = sorted(['IL.8','VEGF','IL.1beta',
        'G.CSF','EGF','IL.10','HGF',
        'FGF.basic','IFN.alpha','IL.6',
        'IL.12','Rantes','Eotaxin',
        'GM.CSF','MIP.1beta',
        'MCP.1','IL.5','IL.13', 'IFN.gamma','TNF.alpha',
        'IL.RA','IL.2','IL.7','IP.10',
        'IL.2R','MIG','IL.4','IL.15',
        'IL.17','MIP.1alpha']) + ['Th1', 'Th2']

cyto_data_raw = pd.read_csv('/home/will/HIVSystemsBio/NewCytokineAnalysis/CytoRawData.csv', sep = '\t')
cyto_data_raw['Th1'] = cyto_data_raw['IFN.gamma'] + \
                            cyto_data_raw['IL.2']+cyto_data_raw['TNF.alpha']
cyto_data_raw['Th2'] = cyto_data_raw['IL.4'] + \
                            cyto_data_raw['IL.5']+cyto_data_raw['IL.10']



In [21]:

    
cyto_data = cyto_data_raw.groupby(['Patient ID', 'VisitNum']).mean()
tranfer_cols = ['Log-Latest-VL', 
                'Tropism',
                'Keep',
                'IsMale',
                'Race-Black',
                'Age',
                'HAART-Naive',
                'HAART-Non-Adherent',
                'HAART-Off',
                'HAART-On',
                'Hepatitis C status (HCV)']
for col in tranfer_cols:
    _, cyto_data[col] = cyto_data.align(pat_data[col], join='left', axis = 0)
cyto_data['HCV'] = cyto_data['Hepatitis C status (HCV)']



In [22]:

    
for col in cytos:
    env = EllipticEnvelope(contamination=0.05)
    env.fit(cyto_data[col].dropna().values.reshape(-1, 1))
    mask = env.predict(cyto_data[col].values.reshape(-1,1))
    cyto_data[col][mask==-1] = np.nan



In [23]:

    
fig, axs = plt.subplots(11,3, figsize = (10,20))

for ax, col in zip(axs.flatten(), cytos):
    
    boxes = []
    mus = []
    stds = []
    for trop in trops:
        mask = cyto_data['Tropism'] == trop
        #mask &= cyto_data['Keep']
        boxes.append(cyto_data[col][mask].dropna().values)
        mus.append(cyto_data[col][mask].mean())
        stds.append(cyto_data[col][mask].std())
    
    ef = abs(np.diff(mus))/(np.sum(stds))
    ratio = len(boxes[1])/len(boxes[0])
    n0 = tt_ind_solve_power(effect_size=ef, alpha = alpha, power = power, ratio = ratio)
    sizes = [n0, n0*ratio]
    _, pval = ttest_ind(*boxes)
    labels = ['%s n=%i/%i' % (t, len(b), n) for t, b, n in zip(trops, boxes, sizes)]
    violinplot(boxes, ax = ax, labels = labels)
    # 
    ax.set_title(col + ' pval:%f' % pval)
    ax.set_ylim([0, ax.get_ylim()[1]])
plt.tight_layout()
plt.savefig(base_path+'uncorrected_cyto_data.png')



In [24]:

    
fig, axs = plt.subplots(11,3, figsize = (10,20))

for ax, col in zip(axs.flatten(), cytos):
    
    boxes = []
    mus = []
    stds = []
    for trop in trops:
        mask = cyto_data['Tropism'] == trop
        mask &= cyto_data['Keep']
        boxes.append(cyto_data[col][mask].dropna().values)
        mus.append(cyto_data[col][mask].mean())
        stds.append(cyto_data[col][mask].std())
    
    ef = abs(np.diff(mus))/(np.sum(stds))
    ratio = len(boxes[1])/len(boxes[0])
    n0 = tt_ind_solve_power(effect_size=ef, alpha = alpha, power = power, ratio = ratio)
    sizes = [n0, n0*ratio]
    _, pval = ttest_ind(*boxes)
    labels = ['%s n=%i/%i' % (t, len(b), n) for t, b, n in zip(trops, boxes, sizes)]
    violinplot(boxes, ax = ax, labels = labels)
    # 
    ax.set_title(col + ' pval:%f' % pval)
    ax.set_ylim([0, ax.get_ylim()[1]])
plt.tight_layout()
plt.savefig(base_path+'matched_cyto_data.png')



In [25]:

    
import statsmodels.api as sm

con_cols = ['Log-Latest-VL', 
            'IsR5',
            'IsMale',
            'Age',
            'HAART-Naive',
            'HAART-Off',
            'HAART-On',
            'HCV']
cyto_data['IsR5'] = 1.0
cyto_data['IsR5'][cyto_data['Tropism'].isnull()] = np.nan
cyto_data['IsR5'][cyto_data['Tropism']=='X4'] = 0.0
aa_mask = cyto_data['Race-Black'] == True

fig, axs = plt.subplots(11,3, figsize = (10,20))
for ax, col in zip(axs.flatten(), cytos):
    
    tdata = cyto_data[con_cols + [col]][aa_mask].dropna()
    res = sm.GLM(tdata[col],tdata[con_cols].astype(float)).fit()
    pval = res.pvalues['IsR5']

    boxes = []
    mus = []
    stds = []
    for trop in trops:
        mask = cyto_data['Tropism'] == trop
        mask &= aa_mask
        boxes.append(cyto_data[col][mask].dropna().values)
        mus.append(cyto_data[col][mask].mean())
        stds.append(cyto_data[col][mask].std())
    
    ef = abs(np.diff(mus))/(np.sum(stds))
    ratio = len(boxes[1])/len(boxes[0])
    n0 = tt_ind_solve_power(effect_size=ef, alpha = alpha, power = power, ratio = ratio)
    sizes = [n0, n0*ratio]
    
    labels = ['%s n=%i/%i' % (t, len(b), n) for t, b, n in zip(trops, boxes, sizes)]
    violinplot(boxes, ax = ax, labels = labels)
    # 
    ax.set_title(col + ' pval:%f' % pval)
    ax.set_ylim([0, ax.get_ylim()[1]])
plt.tight_layout()
plt.savefig(base_path + 'corrected_cyto_data.png')



In [26]:

    
import glob

ltr_files = sorted(glob.glob('/home/will/HIVReportGen/Data/PatientFasta/*LTR.fasta'))
ltr_seqs = []
for f in ltr_files:
    with open(f) as handle:
        ltr_seqs += list(fasta_reader(handle))
print len(ltr_seqs)



In [27]:

    
conb_ltr = """TGGAAGGGCTAATTCACTCCCAACGAAGACAAGATATCCTTGATCTGTGGATCTACCACACACAA
GGCTACTTCCCTGATTAGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTACAAGC
TAGTACCAGTTGAGCCAGAGAAGTTAGAAGAAGCCAACAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCA
TGGAATGGATGACCCGGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACATGGCCCGAGAG
CTGCATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAGGGAGGC
GTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATCCTGCATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTC
TCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCC
TTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTG
TGGAAAATCTCT""".replace('\n', '')

ltr_align = list(seq_align_to_ref(ltr_seqs, conb_ltr, max_workers = 20))



In [28]:

    
ltr_bin_align = []
align_inds = []
for key, seq in ltr_align:
    align_inds.append(tuple(key.split('-')))
    hit_first = False
    bins = []
    for g, cb in zip(seq, conb_ltr):
        if ~hit_first and (g == '-'):
            bins.append(np.nan)
            continue
        hit_first = True
        bins.append(1.0 if g==cb else 0.0)
    ltr_bin_align.append(np.array(bins))
ltr_bin_array = np.array(ltr_bin_align)



In [29]:

    
columns = ['ConB-%03i' % i for i in range(1, len(conb_ltr)+1)]
seq_df = pd.DataFrame(ltr_bin_array, 
                      index = pd.MultiIndex.from_tuples(align_inds, 
                                                        names = ['Patient', 'Visit']), 
                      columns = columns)



In [30]:

    
from scipy.stats import fisher_exact

_, seq_df['Tropism'] = seq_df.align(tdf['Prediction'], 
                                    join = 'left', 
                                    axis = 0)
r5_mask = seq_df['Tropism'] == 'R5'
x4_mask = seq_df['Tropism'] == 'X4'



In [31]:

    
pvals = []
for col in columns:
    r5_col = seq_df[col][r5_mask]
    x4_col = seq_df[col][x4_mask]
    if (r5_col.isnull().sum()>5) and (x4_col.isnull().sum()>5):
        f_table = [[(r5_col == 1).sum(), (x4_col == 1).sum()],
                   [(r5_col == 0).sum(), (x4_col == 0).sum()]]
        _, pval = fisher_exact(f_table)
        pvals.append(pval)
    else:
        pvals.append(np.nan)



In [32]:

    
ltr_features = pd.read_excel(base_path+'LTR_features.xlsx', 'Sheet1')
ltr_features









    Out[32]:






  
    
      
      Name
      StartPos
      EndPos
      Column
      Color
    
  
  
    
      0 
                           5' LTR U3
         1
       455
       0
       m
    
    
      1 
            TCF-1 alpha binding site
       315
       329
       1
       g
    
    
      2 
                       NF-kappa-B-II
       350
       359
       1
       b
    
    
      3 
                        NF-kappa-B-I
       364
       373
       2
       b
    
    
      4 
                             Sp1-III
       375
       386
       1
       r
    
    
      5 
                              Sp1-II
       388
       397
       2
       r
    
    
      6 
                               Sp1-I
       398
       408
       1
       r
    
    
      7 
                            TATA Box
       427
       431
       2
       r
    
    
      8 
                         TAR element
       453
       513
       2
       r
    
    
      9 
                  +1 mRNA start site
       456
       456
       0
       g
    
    
      10
                           TAR bulge
       476
       478
       0
       g
    
    
      11
                            TAR loop
       483
       488
       0
       g
    
    
      12
              Poly-A signal sequence
       527
       532
       1
       c
    
    
      13
                           5' LTR U5
       552
       634
       0
       c
    
    
      14
                   U5 binding region
       560
       570
       0
       y
    
    
      15
       Extensive secondary structure
       568
       605
       1
       y



In [33]:

    
bottoms = 3.0+(ltr_features['Column']*0.5)
heights = 0.5*np.ones_like(ltr_features['EndPos'].values)
widths = ltr_features['EndPos'] - ltr_features['StartPos']
lefts = ltr_features['StartPos']
colors= ltr_features['Color'].values



In [34]:

    
from statsmodels.sandbox.stats.multicomp import multipletests
apvals = np.array(pvals)
tmask = ~np.isnan(apvals) & (apvals < 1)
reject, adj_pvals, _, _ = multipletests(apvals[tmask], 0.1, 'fdr_bh')
areject = np.zeros_like(apvals)
areject[tmask] = reject
fig, ax = plt.subplots(figsize = (10,5))
ax.plot(-np.log10(pvals))
for num, (col, pval, mc) in enumerate(zip(columns, pvals, areject.flatten())):
    if pval < (0.05):
        label = '%s p=%f' % (col, pval)
        if mc:
            label += '*'
        ax.annotate(label, (num, -np.log10(pval)))
rects = ax.bar(lefts.values, heights, width=widths.values, bottom=bottoms, color = list(colors))
for left, bottom, label in zip(lefts, bottoms, ltr_features['Name']):
    ax.annotate(label, (left+5, bottom+0.25), 
                rotation=70, 
                xytext = (left, 7.5), 
                ha = 'left',
                arrowprops = {'width':1, 'headwidth':1})
    
plt.savefig(base_path + 'snp_figure.png')



In [35]:

    
from Bio.Seq import Seq
from Bio import Motif
from StringIO import StringIO
from itertools import groupby
from operator import methodcaller

def yield_motifs():
    with open('/home/will/LTRtfAnalysis/Jaspar_PWMs.txt') as handle:
        for key, lines in groupby(handle, methodcaller('startswith', '>')):
            if key:
                name = lines.next().strip().split()[-1].lower()
            else:
                tmp = ''.join(lines)
                mot = Motif.read(StringIO(tmp), 'jaspar-pfm')
                yield name, mot
                yield name+'-R', mot.reverse_complement()
pwm_dict = {}
for num, (name, mot) in enumerate(yield_motifs()):
    if num % 100 == 0:
        print num
    pwm_dict[name] = mot
    
    
tmp = u"""A 0  0 6 1 0 0 0 4 2 2 0 0 3 
C  1 1 1 0 5 6 4 1 0 0 0 3 5 5 4 0  
G  0 6 0 1 1 0 0 0 0 7 1 1 0 0 1 0 
T  6 0 0 0 1 1 3 5 7 0 0 0 0 2 2 4"""

pwm_dict['coup2'] = Motif.read(StringIO(tmp), 'jaspar-pfm')
pwm_dict['coup2-R'] = Motif.read(StringIO(tmp), 'jaspar-pfm').reverse_complement()



In [36]:

    
from Bio.Alphabet import IUPAC

def score_seq(seq, mot):
    bseq = Seq(seq, alphabet=IUPAC.unambiguous_dna)
    scores = mot.scanPWM(bseq)
    for pos, score in enumerate(scores.flatten(),1):
        if ~np.isnan(score):
            tseq = seq[pos:pos+len(mot)]
            yield pos, tseq, score
    

wanted_mots = ['ap1', 'ap1-R',
               'cebpa', 'cebpa-R',
               'creb1', 'creb1-R',
               'coup2', 'coup2-R',
               'ets1','ets1-R',
               #'fev', 'fev-R',
               'foxc1',	'foxc1-R',
               #'gata2',	'gata2-R',
               #'gata3',	'gata3-R',
               #'hnf4a',	'hnf4a-R',
               #'hoxa5',	'hoxa5-R',
               'nf-kappab','nf-kappab-R',
               'nfatc2', 'nfatc2-R',
               'nr2f1','nr2f1-R',
               #'tfap2a', 'tfap2a-R',    
               #'znf354c','znf354c-R',
               'nf-kappab', 'nf-kappab-R',
               'sp1', 'sp1-R']



big_res = []
for mot_name in wanted_mots:
    print mot_name
    mot = pwm_dict[mot_name]
    thresh = Motif.Thresholds.ScoreDistribution(mot, precision = 50).threshold_fpr(0.01)
    for name, seq in ltr_align:
        pid, vnum = name.split('-')
        for pos, tf_seq, score in score_seq(seq, mot):
            big_res.append({
                            'Patient':pid,
                            'Visit':vnum,
                            'TF':mot_name,
                            'PosSeq':tf_seq,
                            'Score':score,
                            'ConBPos':pos
                            })
tf_df = pd.DataFrame(big_res)









    



ap1
ap1-R
cebpa
cebpa-R
creb1
creb1-R
coup2
coup2-R
ets1
ets1-R
foxc1
foxc1-R
nf-kappab
nf-kappab-R
nfatc2
nfatc2-R
nr2f1
nr2f1-R
nf-kappab
nf-kappab-R
sp1
sp1-R



In [37]:

    
def process_vals(inser):
    return pd.rolling_max(inser, 8, min_periods=1)

gkey = ['Patient', 'Visit', 'TF']

tf_df['MaxScore'] = tf_df.groupby(gkey)['Score'].transform(process_vals)



In [38]:

    
tf_pivot = pd.pivot_table(tf_df, 
                          rows = ['Patient', 'Visit'],
                          cols = ['TF', 'ConBPos'],
                          values = ['MaxScore', 'PosSeq'],
                          aggfunc = 'first')



In [39]:

    
def calc_adjust(col, bind_data):
    confounder_cols = ['Age', 
                       'IsMale',
                       'Race-Black',
                       'Race-White',
                       'HAART-Naive',
                       'HAART-Non-Adherent',
                       'HAART-Off',
                       'HAART-On',
                       'YearsSero']
    condata = pat_data[sorted(set(confounder_cols+[col]))].dropna()
    condata, bondata = condata.align(bind_data, join = 'inner', axis = 0)
    sumres = sm.GLM(bondata, condata.astype(float)).fit()
    return sumres.pvalues[col]
    

pos_scores = tf_pivot['MaxScore']

results = []
for mot_name in wanted_mots:
    print mot_name
    score_data = pos_scores[mot_name]
    for pat_col_name, pat_col in checks:
        clin_data = pat_data[pat_col].dropna()
        for col in score_data.columns:
            bind_data = score_data[col].dropna().astype(float)
            if len(bind_data) < 10:
                continue
            bdata, cdata = bind_data.align(clin_data, join = 'inner')
            if (len(bdata) > 100):
                m, b, r, p, _ = linregress(bdata.values, cdata.values)
                results.append({
                                'Motif':mot_name,
                                'ClinicalVal':pat_col_name,
                                'Slope':m,
                                'Intercept':b,
                                'R2': r**2,
                                'pval':p,
                                'ConBCol':col,
                                'N':len(bdata),
                                'AdjPval':calc_adjust(pat_col, bind_data)
                                })









    



ap1
ap1-R
cebpa
cebpa-R
creb1
creb1-R
coup2
coup2-R
ets1
ets1-R
foxc1
foxc1-R
nf-kappab
nf-kappab-R
nfatc2
nfatc2-R
nr2f1
nr2f1-R
nf-kappab
nf-kappab-R
sp1
sp1-R



In [40]:

    
cor_results = pd.DataFrame(results)

nres = pd.pivot_table(cor_results,
                     rows = ['Motif', 'ConBCol'],
                     cols = 'ClinicalVal',
                     values = ['pval', 'Slope', 'R2', 'N', 'AdjPval'],
                     aggfunc = 'first')



In [41]:

    
all_pvals = np.array([d['pval'] for d in results if d['N'] > 200])
reject, adj_pvals, _, _ = multipletests(all_pvals, 0.2, 'fdr_bh')
cutoff = all_pvals[reject].max()
print cutoff
lcutoff = -np.log10(cutoff)









    



0.00835553279278



In [42]:

    
cols = [col for _, col in checks]
ncols = pd.Index([col for col, _ in checks])
small_pat_data = pat_data[cols]
small_pat_data.columns = ncols
tf_pat_data = pd.merge(tf_df,
                       small_pat_data,
                       left_on = ['Patient', 'Visit'],
                       right_index = True, 
                       how = 'inner')



In [43]:

    
from types import TupleType

crazy_big = pd.merge(tf_pat_data,
                     nres,
                     left_on = ['TF', 'ConBPos'],
                     right_index = True,
                     how = 'inner')
ncols = []
vcols = set(c for c, _ in checks)
for col in crazy_big.columns:
    if (type(col) is TupleType) and (col[1] in vcols):
        ncols.append((col[-1], col[0]))
    elif (type(col) is TupleType):
        print col
    else:
        ncols.append(('APatient', col))
        
crazy_big.columns = pd.MultiIndex.from_tuples(ncols, names = ['Value', 'Analysis'])
crazy_big.sortlevel(axis=1, inplace=True)



In [44]:

    
print crazy_big.head().T.to_string()









    



                               0             2004          5022          5548          6089
Value      Analysis                                                                        
APatient   CD4                  724           814           470           473           427
           CD8                 1423           912          1253          1334          1135
           Close-CD4            724           814           470           473           427
           Close-CD8           1423           912          1253          1334          1135
           Close-VL         1.90309           NaN      2.447158      2.653213      1.681241
           ConBPos               90            90            90            90            90
           MaxScore       -16.06473      3.577498      3.577498      3.577498      3.577498
           Nadir-CD4            301           343             2             2             2
           Nadir-CD8           1002           556           100           100           100
           Patient            A0001         A0002         A0004         A0004         A0004
           Peak-VL         4.812913      4.109545      5.194675      5.194675      5.194675
           PosSeq           ACACACC       ACACACC       ACACACC       ACACACC       ACACACC
           Score          -16.06473     -16.06473     -16.06473     -16.06473     -16.06473
           TF                   ap1           ap1           ap1           ap1           ap1
           TMHDS                NaN            11           1.5             5           6.5
           VL               1.90309      1.681241      2.447158      2.653213      1.681241
           Visit                R01           R04           R02           R03           R04
           Years-Sero      11.69863      18.92329      12.53973      13.09589      13.63288
CD4        AdjPval         0.203823      0.203823      0.203823      0.203823      0.203823
           N                    342           342           342           342           342
           R2           0.005424489   0.005424489   0.005424489   0.005424489   0.005424489
           Slope          -6.237929     -6.237929     -6.237929     -6.237929     -6.237929
           pval           0.1741758     0.1741758     0.1741758     0.1741758     0.1741758
CD8        AdjPval        0.5880559     0.5880559     0.5880559     0.5880559     0.5880559
           N                    331           331           331           331           331
           R2          0.0002717202  0.0002717202  0.0002717202  0.0002717202  0.0002717202
           Slope           3.685726      3.685726      3.685726      3.685726      3.685726
           pval           0.7651043     0.7651043     0.7651043     0.7651043     0.7651043
Close-CD4  AdjPval        0.3003322     0.3003322     0.3003322     0.3003322     0.3003322
           N                    249           249           249           249           249
           R2           0.006214849   0.006214849   0.006214849   0.006214849   0.006214849
           Slope          -5.934701     -5.934701     -5.934701     -5.934701     -5.934701
           pval           0.2151029     0.2151029     0.2151029     0.2151029     0.2151029
Close-CD8  AdjPval        0.2467266     0.2467266     0.2467266     0.2467266     0.2467266
           N                    226           226           226           226           226
           R2           0.003532582   0.003532582   0.003532582   0.003532582   0.003532582
           Slope           11.85242      11.85242      11.85242      11.85242      11.85242
           pval           0.3738184     0.3738184     0.3738184     0.3738184     0.3738184
Close-VL   AdjPval        0.7323179     0.7323179     0.7323179     0.7323179     0.7323179
           N                    254           254           254           254           254
           R2            0.00365441    0.00365441    0.00365441    0.00365441    0.00365441
           Slope         0.02030806    0.02030806    0.02030806    0.02030806    0.02030806
           pval           0.3372729     0.3372729     0.3372729     0.3372729     0.3372729
Nadir-CD4  AdjPval       0.05834792    0.05834792    0.05834792    0.05834792    0.05834792
           N                    341           341           341           341           341
           R2           0.005219817   0.005219817   0.005219817   0.005219817   0.005219817
           Slope           -3.89602      -3.89602      -3.89602      -3.89602      -3.89602
           pval           0.1831918     0.1831918     0.1831918     0.1831918     0.1831918
Nadir-CD8  AdjPval        0.5799357     0.5799357     0.5799357     0.5799357     0.5799357
           N                    325           325           325           325           325
           R2          0.0006658844  0.0006658844  0.0006658844  0.0006658844  0.0006658844
           Slope          -2.781292     -2.781292     -2.781292     -2.781292     -2.781292
           pval           0.6430158     0.6430158     0.6430158     0.6430158     0.6430158
Peak-VL    AdjPval        0.6317067     0.6317067     0.6317067     0.6317067     0.6317067
           N                    334           334           334           334           334
           R2          8.127265e-06  8.127265e-06  8.127265e-06  8.127265e-06  8.127265e-06
           Slope      -0.0009368423 -0.0009368423 -0.0009368423 -0.0009368423 -0.0009368423
           pval           0.9586038     0.9586038     0.9586038     0.9586038     0.9586038
TMHDS      AdjPval        0.3075887     0.3075887     0.3075887     0.3075887     0.3075887
           N                    272           272           272           272           272
           R2           0.002763554   0.002763554   0.002763554   0.002763554   0.002763554
           Slope         0.03983661    0.03983661    0.03983661    0.03983661    0.03983661
           pval           0.3878059     0.3878059     0.3878059     0.3878059     0.3878059
VL         AdjPval        0.5598803     0.5598803     0.5598803     0.5598803     0.5598803
           N                    344           344           344           344           344
           R2           0.004279419   0.004279419   0.004279419   0.004279419   0.004279419
           Slope         0.02403557    0.02403557    0.02403557    0.02403557    0.02403557
           pval            0.226206      0.226206      0.226206      0.226206      0.226206
Years-Sero AdjPval       0.03749909    0.03749909    0.03749909    0.03749909    0.03749909
           N                    356           356           356           356           356
           R2           0.002608017   0.002608017   0.002608017   0.002608017   0.002608017
           Slope          0.1124116     0.1124116     0.1124116     0.1124116     0.1124116
           pval           0.3366519     0.3366519     0.3366519     0.3366519     0.3366519



In [45]:

    
pmask = crazy_big.xs('pval', level = 'Analysis', axis=1) <= cutoff*10
pmask &= crazy_big.xs('AdjPval', level = 'Analysis', axis=1) <= cutoff*10
rmask = (crazy_big.xs('R2', level = 'Analysis', axis=1) > 0.05)
nmask = (crazy_big.xs('N', level = 'Analysis', axis=1) >= 200)
wanted_mask = (rmask & pmask & nmask).any(axis = 1)
wdata = crazy_big[wanted_mask]
wdata









    



/usr/local/lib/python2.7/dist-packages/numexpr/necompiler.py:746: DeprecationWarning: using `oa_ndim == 0` when `op_axes` is NULL is deprecated. Use `oa_ndim == -1` or the MultiNew iterator for NumPy <1.8 compatibility
  return compiled_ex(*arguments, **kwargs)
/usr/local/lib/python2.7/dist-packages/numexpr/necompiler.py:746: DeprecationWarning: using `oa_ndim == 0` when `op_axes` is NULL is deprecated. Use `oa_ndim == -1` or the MultiNew iterator for NumPy <1.8 compatibility
  return compiled_ex(*arguments, **kwargs)
/usr/local/lib/python2.7/dist-packages/numexpr/necompiler.py:746: DeprecationWarning: using `oa_ndim == 0` when `op_axes` is NULL is deprecated. Use `oa_ndim == -1` or the MultiNew iterator for NumPy <1.8 compatibility
  return compiled_ex(*arguments, **kwargs)






    Out[45]:




<class 'pandas.core.frame.DataFrame'>
Int64Index: 15085 entries, 873924 to 9519590
Data columns (total 73 columns):
(APatient, CD4)           14356  non-null values
(APatient, CD8)           13721  non-null values
(APatient, Close-CD4)     10378  non-null values
(APatient, Close-CD8)     9216  non-null values
(APatient, Close-VL)      10422  non-null values
(APatient, ConBPos)       15085  non-null values
(APatient, MaxScore)      15085  non-null values
(APatient, Nadir-CD4)     14191  non-null values
(APatient, Nadir-CD8)     13524  non-null values
(APatient, Patient)       15085  non-null values
(APatient, Peak-VL)       13884  non-null values
(APatient, PosSeq)        15085  non-null values
(APatient, Score)         15085  non-null values
(APatient, TF)            15085  non-null values
(APatient, TMHDS)         11155  non-null values
(APatient, VL)            14383  non-null values
(APatient, Visit)         15085  non-null values
(APatient, Years-Sero)    14866  non-null values
(CD4, AdjPval)            15085  non-null values
(CD4, N)                  15085  non-null values
(CD4, R2)                 15085  non-null values
(CD4, Slope)              15085  non-null values
(CD4, pval)               15085  non-null values
(CD8, AdjPval)            15085  non-null values
(CD8, N)                  15085  non-null values
(CD8, R2)                 15085  non-null values
(CD8, Slope)              15085  non-null values
(CD8, pval)               15085  non-null values
(Close-CD4, AdjPval)      15085  non-null values
(Close-CD4, N)            15085  non-null values
(Close-CD4, R2)           15085  non-null values
(Close-CD4, Slope)        15085  non-null values
(Close-CD4, pval)         15085  non-null values
(Close-CD8, AdjPval)      15085  non-null values
(Close-CD8, N)            15085  non-null values
(Close-CD8, R2)           15085  non-null values
(Close-CD8, Slope)        15085  non-null values
(Close-CD8, pval)         15085  non-null values
(Close-VL, AdjPval)       15085  non-null values
(Close-VL, N)             15085  non-null values
(Close-VL, R2)            15085  non-null values
(Close-VL, Slope)         15085  non-null values
(Close-VL, pval)          15085  non-null values
(Nadir-CD4, AdjPval)      15085  non-null values
(Nadir-CD4, N)            15085  non-null values
(Nadir-CD4, R2)           15085  non-null values
(Nadir-CD4, Slope)        15085  non-null values
(Nadir-CD4, pval)         15085  non-null values
(Nadir-CD8, AdjPval)      15085  non-null values
(Nadir-CD8, N)            15085  non-null values
(Nadir-CD8, R2)           15085  non-null values
(Nadir-CD8, Slope)        15085  non-null values
(Nadir-CD8, pval)         15085  non-null values
(Peak-VL, AdjPval)        15085  non-null values
(Peak-VL, N)              15085  non-null values
(Peak-VL, R2)             15085  non-null values
(Peak-VL, Slope)          15085  non-null values
(Peak-VL, pval)           15085  non-null values
(TMHDS, AdjPval)          15085  non-null values
(TMHDS, N)                15085  non-null values
(TMHDS, R2)               15085  non-null values
(TMHDS, Slope)            15085  non-null values
(TMHDS, pval)             15085  non-null values
(VL, AdjPval)             15085  non-null values
(VL, N)                   15085  non-null values
(VL, R2)                  15085  non-null values
(VL, Slope)               15085  non-null values
(VL, pval)                15085  non-null values
(Years-Sero, AdjPval)     15085  non-null values
(Years-Sero, N)           15085  non-null values
(Years-Sero, R2)          15085  non-null values
(Years-Sero, Slope)       15085  non-null values
(Years-Sero, pval)        15085  non-null values
dtypes: float64(68), int64(1), object(4)



In [46]:

    
def coerce_cols(tup):
    try:
        return '--'.join(tup)
    except TypeError:
        return coerce_cols(tup[0])

gkey = [('APatient', 'TF'), ('APatient', 'ConBPos'), ('APatient', 'PosSeq')]
grouped_data = wdata.groupby(gkey).agg(['mean', 'count'])



In [47]:

    
drop_cols = [tup for tup in grouped_data.columns if (tup[-1] == 'count') and (tup[0] != 'APatient')]
out_data = grouped_data.drop(drop_cols, axis=1)
for cname, _ in checks:
    mask = (out_data[cname]['pval']['mean'] > 0.1) & (out_data[cname]['AdjPval']['mean'] > 0.1)
    tmask = np.tile(mask.values, (len(out_data[cname].columns),1))
    out_data[cname] = out_data[cname].where(np.transpose(tmask))
out_data.reset_index(inplace=True)
out_data.columns = [coerce_cols(tup) for tup in out_data.columns]
out_data.to_excel(base_path+'TFbinding.xlsx', index=False)



In [48]:

    
from itertools import product
clin_vals = sorted(cor_results['ClinicalVal'].unique())
motifs = sorted(cor_results['Motif'].unique())
pos = range(1, 630)
nindex = pd.MultiIndex.from_tuples(list(product(clin_vals, motifs, pos)), names = ['ClinicalVal', 'Motif', 'ConBCol'])



In [49]:

    
cor_results.head()









    Out[49]:






  
    
      
      AdjPval
      ClinicalVal
      ConBCol
      Intercept
      Motif
      N
      R2
      Slope
      pval
    
  
  
    
      0
       0.318952
       VL
       60
       2.462736
       ap1
       103
       0.011367
      -0.030164
       0.283773
    
    
      1
       0.260309
       VL
       61
       2.416500
       ap1
       110
       0.014451
      -0.043264
       0.210961
    
    
      2
       0.321392
       VL
       62
       2.460093
       ap1
       121
       0.010198
      -0.036661
       0.270399
    
    
      3
       0.404810
       VL
       63
       2.460184
       ap1
       124
       0.009534
      -0.044369
       0.280646
    
    
      4
       0.193335
       VL
       64
       2.414537
       ap1
       126
       0.013435
      -0.049318
       0.196201



In [50]:

    
cor_results['LogP-signed'] = (-np.log10(cor_results['pval']))*np.sign(cor_results['Slope'])
plot_vals = cor_results.groupby(['ClinicalVal', 'Motif', 'ConBCol'])['LogP-signed'].first()



In [51]:

    
plot_vals.head()









    Out[51]:





ClinicalVal  Motif  ConBCol
CD4          ap1    60         0.343823
                    61         0.207362
                    62         0.030376
                    63         0.128009
                    64         0.557196
Name: LogP-signed, dtype: float64



In [52]:

    
def make_annotation(ax, tf, positions, color):
    if len(positions) == 1:
        label = '%s-%i' % (tf, min(positions))
    else:
        label = '%s-[%i-%i]' % (tf, min(positions), max(positions))
    ax.annotate(label,
                (max(lpos), val), 
                textcoords = 'offset points',
                xytext = (0, np.random.normal()*4),
                color = color)
    
def simple_color(val):
    if val > 0:
        return 'b'
    return 'r'
    

fig, axs = plt.subplots(4,3, figsize = (15, 10), sharex=True, sharey=True)
for ax, (col, _) in zip(axs.flatten(), checks):
    vals = -nres['pval'][col].applymap(np.log10).fillna(0)
    
    xpos = np.array([int(v.split('-')[1]) for v in vals.columns])
    yvals = vals[(vals > 1).any(axis = 1)]
    
    direct = nres['Slope'][col][(vals > 1).any(axis = 1)]
    corr = nres['R2'][col][(vals > 1).any(axis = 1)]
    
    ax.bar(left = xpos, height = direct.T.values, c = 'g', width = 1/len(direct.index))
    raise KeyboardInterrupt
    ax.set_ylim([0, 10])
    
    for tf, row in yvals.iterrows():
        lpos = []
        for c, val in sorted(row[row>lcutoff].to_dict().items()):
            if corr[c][tf] < 0.5:
                continue
            pos = int(c.split('-')[1])
            color = 'b' if direct[c][tf] > 0 else 'r'
            #print col, tf, c, val, corr[c][tf]
            if (len(lpos)==0) or (lpos[-1]+5>pos):
                lpos.append(pos)
            else:
                make_annotation(ax, tf, lpos, color)
                lpos = []
        if len(lpos)>0:
            make_annotation(ax, tf, lpos, color)
            
    
    ax.set_title(col)
    if ax.is_first_col():
        ax.set_ylabel('Slope')
    if ax.is_last_row():
        ax.set_ylabel('HXB2 Pos')
plt.tight_layout()
#plt.savefig(base_path+'tf_figure.png', dpi=200)









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-52-5234f69cf5ce> in <module>()
     18 fig, axs = plt.subplots(4,3, figsize = (15, 10), sharex=True, sharey=True)
     19 for ax, (col, _) in zip(axs.flatten(), checks):
---> 20     vals = -nres['pval'][col].applymap(np.log10).fillna(0)
     21 
     22     xpos = np.array([int(v.split('-')[1]) for v in vals.columns])

AttributeError: 'Series' object has no attribute 'applymap'



In [ ]:



In [ ]:

Prediction	Attempted	R5	R5-P	X4	X4-P
PSSMCluster
R5-1	0	12	0	0	0
R5-2	0	43	0	0	0
R5-3	1	49	0	0	0
R5-4	0	9	0	0	0
R5-P	0	0	13	0	0
X4-P	0	0	0	0	14
X4-R	0	0	0	13	0

	Name	StartPos	EndPos	Column	Color
0	5' LTR U3	1	455	0	m
1	TCF-1 alpha binding site	315	329	1	g
2	NF-kappa-B-II	350	359	1	b
3	NF-kappa-B-I	364	373	2	b
4	Sp1-III	375	386	1	r
5	Sp1-II	388	397	2	r
6	Sp1-I	398	408	1	r
7	TATA Box	427	431	2	r
8	TAR element	453	513	2	r
9	+1 mRNA start site	456	456	0	g
10	TAR bulge	476	478	0	g
11	TAR loop	483	488	0	g
12	Poly-A signal sequence	527	532	1	c
13	5' LTR U5	552	634	0	c
14	U5 binding region	560	570	0	y
15	Extensive secondary structure	568	605	1	y

	AdjPval	ClinicalVal	ConBCol	Intercept	Motif	N	R2	Slope	pval
0	0.318952	VL	60	2.462736	ap1	103	0.011367	-0.030164	0.283773
1	0.260309	VL	61	2.416500	ap1	110	0.014451	-0.043264	0.210961
2	0.321392	VL	62	2.460093	ap1	121	0.010198	-0.036661	0.270399
3	0.404810	VL	63	2.460184	ap1	124	0.009534	-0.044369	0.280646
4	0.193335	VL	64	2.414537	ap1	126	0.013435	-0.049318	0.196201