In [1]:
from __future__ import division
from pandas import *
import os, os.path
import sys
import numpy as np

sys.path.append('/home/will/HIVReportGen/AnalysisCode/')
sys.path.append('/home/will/PySeqUtils/')
os.chdir('/home/will/HIVVariation/')
from GeneralSeqTools import call_muscle

In [2]:
from GeneralSeqTools import fasta_reader

seq_data = []
with open('PBMC_analyzed.clean.fasta') as handle:
    for name, seq in fasta_reader(handle):
        try:
            pid, vn = name.split('-')[0:2]
        except ValueError:
            print name
            raise ValueError
        seq_data.append((pid, vn, seq))
        
seq_df = DataFrame(seq_data, columns=['Patient ID', 'VisitNum', 'Seq'])

In [3]:
wanted_seq_cols = [340, 381] #1-based!!
hxb2_ltr = """TGGAAGGGCTAATTTACTCCCAAAAAAGACAAGATATCCTTGATCTGTGGGTC
TACCACACACAAGGCTACTTCCCTGATTGGCAGAACTACACACCAGGGCCAGG
GATCAGATATCCACTGACCTTTGGATGGTGCTTCAAGCTAGTACCAGTTGAGC
CAGAGAAGGTAGAAGAGGCCAATGAAGGAGAGAACAACAGCTTGTTACACCCT
ATGAGCCTGCATGGGATGGAGGACCCGGAGAAAGAAGTGTTAGTGTGGAAGTT
TGACAGCCGCCTAGCATTTCATCACATGGCCCGAGAGCTGCATCCGGAGTACT
ACAAGGACTGCTGACATCGAGCTTTCTACAAGGGACTTTCCGCTGGGGACTTT
CCAGGGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATGCTG
CATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATC
TGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATA
AAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTG
GTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCA""".replace('\n', '')

sp3_pos = hxb2_ltr.find('GAGGCGTGGC')
sp2_pos = hxb2_ltr.find('TGGGCGGGAC')
sp1_pos = hxb2_ltr.find('GGGGAGTGGC')

sp3_locs = [sp3_pos+1+i for i in range(10)]
sp2_locs = [sp2_pos+1+i for i in range(10)]
sp1_locs = [sp1_pos+1+i for i in range(10)]

wanted_seq_cols += sp3_locs+sp2_locs+sp1_locs


def get_wanted_seq_cols(seq_series):
    
    nseqs = [('test', seq_series.values[0]),
             ('hxb2', hxb2_ltr)]
    daln = dict(call_muscle(nseqs))
    out = dict([(str(x), None) for x in wanted_seq_cols])
    scols = set(wanted_seq_cols)
    hxb2pos = 0
    for hxb2_l, test_l in zip(daln['hxb2'], daln['test']):
        if hxb2_l != '-':
            hxb2pos += 1 #1-based!
            
        if hxb2pos in scols:
            out[str(hxb2pos)] = test_l
    
    out_ser = Series(out)
    return out_ser
out = seq_df[['Seq']].apply(get_wanted_seq_cols, axis = 1)

In [4]:
col_groups = [('SP1', [str(x) for x in sp1_locs]),
              ('SP2', [str(x) for x in sp2_locs]),
              ('SP3', [str(x) for x in sp3_locs])]
keep_cols = set(['340', '381'])
for tf, group in col_groups:
    
    tmp = out[group].apply(lambda x: ''.join(x.values), axis=1)
    out[tf] = tmp
    wdrop = [col for col in group if col not in keep_cols]
    out = out.drop(wdrop, axis=1)

In [5]:
nseq_df = concat([seq_df, out], axis=1)
nseq_df['340-381'] = nseq_df[['340', '381']].apply(lambda x: ''.join(x.values), axis = 1)

In [6]:
from itertools import product
counts = []
for col in ['340', '381']:
    hxb2let = hxb2_ltr[int(col)-1]
    ncol = str(col) + '-' + hxb2let
    for let in 'ACGT':
        count = (nseq_df[col] == let).sum()
        counts.append((ncol, let, count))

single_done = set()
for l1, l2 in product('ACGT', 'ACGT'):
    if (l1 == l2):
        if l1 in single_done:
            continue
        single_done.add(l1)
        
    tmp = l1+l2
    count = nseq_df['340-381'] == tmp
    counts.append(('340-381', tmp, count.sum()))

        
df = DataFrame(counts, columns=['Col', 'Let', 'Count'])

In [7]:
pos_counts = pivot_table(df, rows = ['Col', 'Let'], values='Count', aggfunc = 'sum')
print pos_counts
DataFrame({'Counts':pos_counts}).to_excel('CountsForSonya.xlsx')


Col      Let
340-381  AA       0
         AC      67
         AG       2
         AT       2
         CA      15
         CC     536
         CG       1
         CT     282
         GA       0
         GC       4
         GG       0
         GT       0
         TA       3
         TC      48
         TG       1
         TT      11
340-C    A       76
         C      880
         G        5
         T       65
381-C    A       18
         C      671
         G        5
         T      296
Name: Count, dtype: int64

In [8]:
pos_counts.groupby(level='Col').sum()


Out[8]:
Col
340-381     972
340-C      1026
381-C       990
dtype: int64

In [9]:
wanted_pats = set(nseq_df['Patient ID'][nseq_df['340-381'] == 'TT'])
wanted_seqs = nseq_df[nseq_df['Patient ID'].map(lambda x: x in wanted_pats)]
print wanted_seqs.to_string()


     Patient ID VisitNum                                                Seq 340 381         SP1         SP2         SP3 340-381
117       A0333      R01  GATACCCACTGTGTTTTGGATGGTGCTTCAAACTATTACCAGTTGA...   T   T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGC      TT
158       A0107      R05  TTGGATGGTGCTTCAAGTTAGTACCAGTTGATCAAGATAAGGTAGA...   T   T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGC      TT
431       A0107      R00  GTAGAAGAGGCCAATGAAGGAGAGAACAACTGCCTGTTACACCCTA...   T   T  GGGGAGTGGC  TGGGCGGGAA  GAGGTGTGGC      TT
432       A0107      R01  ACACACAAGGCTACTTCCCTGATTGGCAGAACTACACGCCAGGGCC...   T   T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGC      TT
433       A0107      R02  ACTTCCCTGATTGGCAGAACTACACGCCAGGGCCAGGGGTCAGATA...   T   T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGC      TT
437       A0110      R00  GGCTACTTCCCTGATTGGCAGAACTACACACCAGGGCCAGGGGTCA...   C   T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGC      CT
545       A0213      R00  CTACTTCCCTGATTGGCAGAACTACACCCCAGGGCCAGGGATCAGA...   T   T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGT      TT
622       A0333      R00  AATGGTGCTTCAAGCTAGTACCAGTTGAGCCAGAGAAAATAGAAGA...   C   C  AGGGAGTGGC  TGGGCGGGAC  GAGGCGTGGC      CC
683       A0107      R03  CAGGGGTCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGT...   T   C  GGGGAGTGGC  TGGGCGGGAC  GAGGCGTGGC      TC
684       A0110      R01  AGTACCAGTTGAGCCAGAGAAGGTAGAAGAGGCCAATAAAGGAGAG...   T   T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGC      TT
809       A0107      R04  AAGAAGTGTTGCAGTGGAAGTTTGACAGCCGCCTAGCATTTCATCA...   -   -  ----------  ----------  ----------      --
884       A0110      R08  GATTTCCACTGACCTTTGGGTGGTGCTTCAAGCTAGTACCAGTTGA...   T   T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGC      TT
948       A0333      R03  CTCCCAGGCTCAGATCTGGTCTAACAAGAGAGACCCAGTACAGGCA...   C   -  ----------  CATGCAGG--  --------TC      C-
1029      A0107      R08  GATATCCACTGACCTTTGGATGGTGCTTCAAGTTAGTACCAGTCGA...   T   T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGC      TT
1030      A0110      R05  AGGTAGAAGAGGCCAATGAAGGAGAGAACAACTGCCTGTTACACCC...   T   T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGC      TT
1046      A0213      R01  TAGTACCAGTTGCGCCAGACAAAGTAGAGGAGGCCAATGAAGGAGA...   A   C  GGGGAGTGGC  TGGGCGGGGC  GAGGCGTGGC      AC
1096      A0479      R00  AGATATCCACTGACCTTTGGATGGTGCTTCAAGCTAGTACCAGTTG...   T   T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGC      TT

In [10]:
store_loc = os.path.join('home', 'will', 'HIVReportGen', 'Data',
                        'SplitRedcap', '2013-01-16',
                        'EntireCohort.hdf')
store = HDFStore('/'+store_loc)
redcap_data = store['redcap']

#fix the Visit Number issue
t = redcap_data['Event Name'].dropna().apply(lambda x: x.split(' - ')[0])
vnum_field = 'Patient visit number'
redcap_data[vnum_field] = redcap_data[vnum_field].combine_first(t)

#Fix the Other-drug issue
ofield = "Drugs used (choice='Other')"
redcap_data[ofield] = redcap_data[ofield].dropna() == 'Checked'

In [11]:
wanted_cols = {'Patient ID':'Patient ID', 
                'Patient visit number':'VisitNum', 
                'Date of visit':'VisitDate',
                'HIV seropositive date':'HIV seropositive date',
                'Total Modified Hopkins Dementia Score':'HIVD'
                }
wanted_redcap = redcap_data[wanted_cols.keys()].rename(columns = wanted_cols)
has_date_mask = wanted_redcap['HIV seropositive date'].notnull()

wanted_redcap['YearsSeropositive'] = np.nan
years_sero_func = lambda x: x['VisitDate'].year - x['HIV seropositive date'].year
years_sero_data = wanted_redcap[['VisitDate', 'HIV seropositive date']].reset_index().dropna().apply(years_sero_func, axis = 1)
years_sero_data.index = has_date_mask[has_date_mask].index
wanted_redcap['YearsSeropositive'][has_date_mask] = years_sero_data

wanted_redcap


Out[11]:
&ltclass 'pandas.core.frame.DataFrame'>
Int64Index: 1419 entries, 0 to 1418
Data columns (total 6 columns):
Patient ID               1419  non-null values
VisitNum                 1419  non-null values
VisitDate                1419  non-null values
HIV seropositive date    1410  non-null values
HIVD                     1223  non-null values
YearsSeropositive        1410  non-null values
dtypes: float64(2), object(4)

In [12]:
wanted_red_pats = wanted_redcap[wanted_redcap['Patient ID'].map(lambda x: x in wanted_pats)]
out_data = merge(wanted_red_pats, wanted_seqs.drop(['Seq'], axis = 1),
                 left_on = ['Patient ID', 'VisitNum'],
                 right_on = ['Patient ID', 'VisitNum'],
                 how = 'outer')
out_data.sort(['Patient ID', 'VisitNum'])
print out_data.to_string()
out_data.to_excel('ThreeFiveTPatInfo.xlsx')


   Patient ID VisitNum            VisitDate HIV seropositive date  HIVD  YearsSeropositive  340  381         SP1         SP2         SP3 340-381
0       A0107      R00  2007-07-17 00:00:00   2004-02-19 00:00:00   NaN                  3    T    T  GGGGAGTGGC  TGGGCGGGAA  GAGGTGTGGC      TT
1       A0107      R01  2008-09-03 00:00:00   2005-02-19 00:00:00  12.0                  3    T    T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGC      TT
2       A0107      R02  2008-12-16 00:00:00   2005-02-19 00:00:00   8.0                  3    T    T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGC      TT
3       A0107      R03  2009-12-01 00:00:00   2004-02-19 00:00:00   9.5                  5    T    C  GGGGAGTGGC  TGGGCGGGAC  GAGGCGTGGC      TC
4       A0107      R04  2010-06-09 00:00:00   2003-02-19 00:00:00  10.0                  7    -    -  ----------  ----------  ----------      --
5       A0107      R05  2010-09-28 00:00:00   2006-02-19 00:00:00  11.5                  4    T    T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGC      TT
6       A0107      R06  2011-03-02 00:00:00   2005-02-19 00:00:00  10.5                  6  NaN  NaN         NaN         NaN         NaN     NaN
7       A0107      R07  2011-09-14 00:00:00   2005-02-19 00:00:00  11.0                  6  NaN  NaN         NaN         NaN         NaN     NaN
8       A0107      R08  2012-03-06 00:00:00   2005-02-19 00:00:00   9.0                  7    T    T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGC      TT
9       A0107      R09  2012-09-12 00:00:00   2005-02-19 00:00:00  12.0                  7  NaN  NaN         NaN         NaN         NaN     NaN
10      A0110      R00  2007-07-31 00:00:00   2000-02-19 00:00:00   NaN                  7    C    T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGC      CT
11      A0110      R01  2008-07-08 00:00:00   2000-02-19 00:00:00   1.5                  8    T    T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGC      TT
12      A0110      R02  2008-10-08 00:00:00   2000-02-19 00:00:00   4.0                  8  NaN  NaN         NaN         NaN         NaN     NaN
13      A0110      R03  2009-10-14 00:00:00   2000-02-19 00:00:00   8.0                  9  NaN  NaN         NaN         NaN         NaN     NaN
14      A0110      R04  2010-04-28 00:00:00   2000-02-19 00:00:00  12.0                 10  NaN  NaN         NaN         NaN         NaN     NaN
15      A0110      R05  2010-10-06 00:00:00   2000-02-19 00:00:00   7.5                 10    T    T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGC      TT
16      A0110      R06  2011-04-06 00:00:00   2000-02-19 00:00:00  11.5                 11  NaN  NaN         NaN         NaN         NaN     NaN
17      A0110      R07  2011-10-12 00:00:00   2000-02-19 00:00:00   6.0                 11  NaN  NaN         NaN         NaN         NaN     NaN
18      A0110      R08  2012-04-03 00:00:00   2000-02-19 00:00:00  11.0                 12    T    T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGC      TT
19      A0110      R09  2012-10-10 00:00:00   2000-02-19 00:00:00  12.0                 12  NaN  NaN         NaN         NaN         NaN     NaN
20      A0213      R00  2008-11-12 00:00:00   1995-02-19 00:00:00  11.0                 13    T    T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGT      TT
21      A0213      R01  2011-11-09 00:00:00   1996-02-19 00:00:00  11.0                 15    A    C  GGGGAGTGGC  TGGGCGGGGC  GAGGCGTGGC      AC
22      A0213      R02  2012-05-09 00:00:00   1996-02-19 00:00:00  12.0                 16  NaN  NaN         NaN         NaN         NaN     NaN
23      A0333      R00  2009-09-09 00:00:00   1993-09-19 00:00:00  11.0                 16    C    C  AGGGAGTGGC  TGGGCGGGAC  GAGGCGTGGC      CC
24      A0333      R01  2010-03-16 00:00:00   1993-02-19 00:00:00  11.0                 17    T    T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGC      TT
25      A0333      R02  2011-03-23 00:00:00   1993-09-19 00:00:00  12.0                 18  NaN  NaN         NaN         NaN         NaN     NaN
26      A0333      R03  2012-03-06 00:00:00   1993-02-19 00:00:00  11.0                 19    C    -  ----------  CATGCAGG--  --------TC      C-
27      A0333      R04  2012-11-28 00:00:00   1993-09-19 00:00:00  12.0                 19  NaN  NaN         NaN         NaN         NaN     NaN
28      A0333      R05  2013-02-19 00:00:00   1993-09-19 00:00:00   NaN                 20  NaN  NaN         NaN         NaN         NaN     NaN
29      A0479      R00  2011-11-30 00:00:00   1999-02-19 00:00:00  10.0                 12    T    T  GGGGAGTGGC  TGGGCGGGAC  GAGGTGTGGC      TT
30      A0479      R01  2012-11-15 00:00:00   1999-02-19 00:00:00  10.5                 13  NaN  NaN         NaN         NaN         NaN     NaN

In [13]:
from Bio import Motif
from Bio.Seq import Seq, Alphabet
from StringIO import StringIO

def make_seq(seq, comp = False):
    if comp:
        return Seq(seq,Alphabet.IUPAC.unambiguous_dna).reverse_complement()
    else:
        return Seq(seq,Alphabet.IUPAC.unambiguous_dna) 

def score_seq(mot, seq, comp = False):
    return mot.scanPWM(make_seq(seq, comp = comp))[0]


tmp = u"""A 1 2 0 0 0 2 0 0 1 2 
C 1 1 0 0 5 0 1 0 1 0
G 4 4 8 8 2 4 5 6 6 0
T 2 1 0 0 1 2 2 2 0 6"""

sp1_mot = Motif.read(StringIO(tmp), 'jaspar-pfm')

In [28]:
test_seqs = [('sp3', 'GAGGCGTGGC'),
             ('sp2', 'TGGGCGGGAC'),
             ('sp1', 'GGGGAGTGGC')]
res = []
for name, base_seq in test_seqs:
    bs = list(base_seq)
    
    mat = np.zeros((6, len(bs)+2))
    for n in range(len(bs)):
        olet = bs[n]
        for ln, let in enumerate('ACTG'):
            bs[n] = let
            mat[ln+1, n+1] = score_seq(sp1_mot, ''.join(bs))
        bs[n] = olet
    res.append((name, base_seq, mat.copy(), score_seq(sp1_mot, base_seq)))

In [29]:
def add_letters(lets):
    yvals = {'A':1, 'C':2, 'T':3,'G':4}
    for n, l in enumerate(lets):
        yval = yvals[l]
        plt.annotate(l, xy=(n+1, yval), fontsize=60)

def add_sig(sig_mask):
    nrows, ncols = sig_mask.shape
    rows, cols = np.where(sig_mask)
    for row, col in zip(rows, cols):
        plt.annotate('*', xy=(col, row), fontsize=60)

In [31]:
from pylab import get_cmap

#plt.figure(figsize = (10, 90))
#fig, subs = plt.subplots(3,1, figsize = (5,15))
width_per_let = 0.75
for name, seq, r, conB_score in res:
    if True:
        plt.figure(figsize = (width_per_let*len(seq),3.4))
        plt.title(name)
        plt.pcolor(-(r-conB_score), cmap = get_cmap('RdBu'))
        plt.yticks([1.5,2.5,3.5,4.5], ('A', 'C', 'T', 'G'))
        plt.xticks(np.arange(1.5, len(seq)+1), range(1,len(seq)+1))
        plt.ylim([1,5])
        plt.xlim([1,len(seq)+1])
        plt.clim([-10, 10])
        add_letters(seq)
        add_sig(abs(r[0:-1, 0:-1]-conB_score)>3.84)
        plt.savefig('vSNP-%s.png' % name)
        plt.close()

In [14]:
from functools import partial

calc_sp_val = partial(score_seq, sp1_mot)
for col in ['SP1', 'SP2', 'SP3']:
    wmask = nseq_df[col].map(lambda x: '-' not in x)
    nseq_df[col+'-bind'] = np.nan
    nseq_df[col+'-bind'][wmask] = nseq_df[col][wmask].map(calc_sp_val)

In [40]:
cbseq = 'GAGGCGTGGC'
cb_bind = calc_sp_val(cbseq)

variants = []
for key, df in nseq_df.groupby('SP3'):
    if '-' not in key:
        d = ''
        for num, (c, k) in enumerate(zip(cbseq, key),1):
            if c != k:
                d += str(num)+k
        variants.append((key, d, cb_bind, df['SP3-bind'].mean(), len(df)))
var_df = DataFrame(variants, columns = ['SP3-Seq', 
                                        'SP3-Variant',
                                        'ConB-Binding',
                                        'Variant-Binding',
                                        'NumSamples'])
var_df.to_excel('sp3_variants.xlsx', index = False)

In [15]:
cols = ['SP1-bind', 'SP2-bind', 'SP3-bind']
lets = 'ACGT'
bins = np.linspace(-5,10, 20)
tit = '%s-%s %s N:%i'
for pos in ['340', '381']:
    fig, axs = plt.subplots(4,3, sharex=True, sharey=True, figsize=(10,10))
    for ax, (let, col) in zip(axs.flatten(), product(lets, cols)):

        mask = nseq_df[pos] == let
        counts, _ = np.histogram(nseq_df[col][mask].dropna().values, bins=bins)
        fracs = counts / counts.sum()
        ax.bar(bins[1:], fracs, bins[1]-bins[0])
        ntit = tit % (pos, let, col, len(nseq_df[col][mask].dropna()))
        ax.set_title(ntit)
        if col == 'SP1-bind':
            ax.set_ylabel('Frequency')
        if let == 'T':
            ax.set_xlabel('Binding score')
    fig.tight_layout()
    plt.savefig('binding-' + pos + '-fig.png')



In [15]: