In [1]:
import os
import sys
sys.path.append('/home/will/PySeqUtils/')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
os.chdir('/home/will/NewCoEvo/')
import GeneralSeqTools
import TFSeqTools

In [3]:
import glob
from itertools import islice, imap
import os, os.path
import csv

def get_gi_acc(fname):
    gb = fname.split('/')[-1].split('.')[0]
    with open(fname) as handle:
        for line in handle:
            if line.startswith('ACCESSION'):
                acc = line.strip().split()[-1]
                return gb, acc
    raise AssertionError



gi_to_acc_dict = {}

fname = '/home/will/WLAHDB_data/gi_to_acc.csv'
if os.path.exists(fname):
    with open(fname) as handle:
        for row in csv.reader(handle):
            gi_to_acc_dict[row[0]] = row[1]
else:
    gb_files = glob.glob('/home/will/WLAHDB_data/GenbankDL/*.gb')
    with open(fname, 'w') as handle:
        writer = csv.writer(handle)
        for num, (gbm, acc) in enumerate(imap(get_gi_acc, gb_files)):
            if (num == 100) or (num % 50000 == 0):
                print num
            gi_to_acc_dict[gbm] = acc
            writer.writerow((gbm, acc))

In [4]:
files = [('B', sorted(glob.glob('/home/will/WLAHDB_data/SeqDump/B_*')))]
seqs = []
for sub, sfiles in files:
    for f in sfiles:
        with open(f) as handle:
            base_name = f.rsplit(os.sep,1)[1].rsplit('.',1)[0]
            prot = base_name.split('_')[1]
            for name, seq in GeneralSeqTools.fasta_reader(handle):
                seqs.append({
                             'Seq':seq,
                             'ID':gi_to_acc_dict[name],
                             'Prot':prot,
                             })
            
seqdf = pd.DataFrame(seqs)

In [5]:
pseqdf = pd.pivot_table(seqdf, 
                        rows = 'ID', 
                        cols = 'Prot', 
                        values = 'Seq', 
                        aggfunc = 'first')

In [65]:
tmpenv = pseqdf[['gp120', 'gp41']].fillna('-').apply(lambda x: ''.join(x), axis=1)
tmpenv[tmpenv == '--'] = np.nan
pseqdf['env'] = pseqdf['env'].combine_first(tmpenv)

In [18]:
ld = []
for f in glob.glob('/home/will/SubCData/LANLRes/*.xlsx'):
    tmp = pd.read_excel(f, 'Sheet1')
    print f
    if 'Coreceptor' in tmp:
        print tmp['Coreceptor'].unique()
    else:
        print 'not there!'
    
    ld.append(tmp.copy())
    
lanl_data = pd.concat(ld, axis = 0, ignore_index=True)


/home/will/SubCData/LANLRes/LANLResults_M.xlsx
[nan u'CXCR4' u'CCR3 CCR5' u'CCR5' u'GPR15 APJ FPRL1'
 u'CCR5 CXCR4 CCR1 CCR2B CCR3' u'CCR5 CXCR4' u'CCR3 CCR5 CXCR4' u'UNK'
 u'CCR3 CCR5 CCR8' u'CCR3 CXCR4' u'CCR3 CXCR4 CCR5'
 u'CCR5 CCR1 CCR2b CCR3 CCR4 CXCR4' u'CCR5 GPR15' u'CCR5 CXCR4 CCR3 CCR1'
 u'CCR5 CCR3 CCR8' u'CCR5 CCR3' u'CXCR4 CCR3' u'CCR5 CXCR4 CCR3'
 u'CCR5 CXCR4 CCR4' u'CCR5 CXCR4 CCR3 CCR2b CCR8 V28' u'CXCR4 CCR3 STRL33'
 u'CCR5 CCR3 GPR15 STRL33 CCR8' u'CXCR4 V28' u'CCR5 CXCR4 CCR1 CCR2b CCR3'
 u'CCR5 CCR3 GPR15' u'CCR5 STRL33' u'CXCR4 GPR15' u'CCR5 CCR3 STRL33'
 u'CCR5 GPR15 STRL33']
/home/will/SubCData/LANLRes/LANLResults_F.xlsx
[nan u'CCR5' u'CCR5 CXCR6 CCR8' u'CCR5 CXCR4' u'CXCR4' u'UNK' u'CCR5 CCR8'
 u'CXCR4 CCR8 V28 CCR3 GPR15 STRL33' u'CCR5 GPR15'
 u'CCR5 CXCR4 CCR1 CCR2b CCR3 CCR4']
/home/will/SubCData/LANLRes/LANLResults_O.xlsx
[nan u'CCR5' u'CXCR4 CCR5' u'CXCR4' u'CCR5 CXCR4 CCR3' u'CCR5 CXCR4'
 u'CCR3 CXCR4' u'CCR5 CSCR4' u'CCR3 CCR5' u'CCR5 CCR2b CCR3 CXCR4'
 u'CCR3 CCR5 CXCR4' u'CXCR4 CCR3 CCR8' u'CCR2b CCR3 CCR5 CCR8 CXCR4'
 u'CCR1 CCR5' u'CCR5 CCR8']

In [21]:
from collections import Counter
uniq_items = Counter()
for key, val in lanl_data['Coreceptor'].dropna().to_dict().items():
    uniq_items += Counter(val.split())
print uniq_items


Counter({u'CCR5': 7277, u'CXCR4': 1730, u'CCR3': 164, u'GPR15': 51, u'FPRL1': 44, u'APJ': 44, u'CCR8': 26, u'UNK': 14, u'CCR1': 8, u'CCR2b': 7, u'STRL33': 6, u'CXCR6': 5, u'V28': 3, u'CCR4': 3, u'CSCR4': 2, u'CCR2B': 1})

In [26]:
from functools import partial

def is_trop(receptor, val):
    if val == val:
        return 1.0 if receptor in val else 0.0
    else:
        return np.nan
        
    
    
lanl_data['IsR5'] = lanl_data['Coreceptor'].map(partial(is_trop, 'CCR5'))
lanl_data['IsX4'] = lanl_data['Coreceptor'].map(partial(is_trop, 'CXCR4'))
lanl_data['HasTrop'] = lanl_data['IsR5'].notnull() | lanl_data['IsX4'].notnull()

In [35]:
agg_lanl_data = lanl_data.groupby('Accession').first()

In [66]:
tmp_lanl = agg_lanl_data[agg_lanl_data['HasTrop']]
wlanl, wseqs = tmp_lanl.align(pseqdf, join='inner', axis=0)

In [67]:
wseqs.apply(lambda x: x.notnull(), axis=0).astype(float).sum()


Out[67]:
Prot
env      1488
gag       132
gp120    1443
gp41     1448
ltr       305
nef       494
pol       133
rev         4
tat         1
v3       3988
vif       330
vpr       222
vpu       131
dtype: float64

In [171]:
ref_func = TFSeqTools.align_to_ref

In [109]:
import ConSeqs
from functools import partial

def get_ref_align(ref_func, conbseq, tseq):
    if tseq == tseq:
        query, ref = ref_func(tseq, conbseq)
        return ''.join(q for q,r in zip(query, ref) if r != '-')
    return np.nan



align_funcs = [('env', partial(get_ref_align, ref_func, ConSeqs.GetConSeq('env', alphabet='pro'))),
               ('ltr', partial(get_ref_align, ref_func, ConSeqs.GetConSeq('ltr', alphabet='dna'))),
               ('nef', partial(get_ref_align, ref_func, ConSeqs.GetConSeq('nef', alphabet='pro'))),
               ('pol', partial(get_ref_align, ref_func, ConSeqs.GetConSeq('pol', alphabet='pro'))),
               ('vif', partial(get_ref_align, ref_func, ConSeqs.GetConSeq('vif', alphabet='pro'))),
               ('vpr', partial(get_ref_align, ref_func, ConSeqs.GetConSeq('vpr', alphabet='pro'))),
               ('vpu', partial(get_ref_align, ref_func, ConSeqs.GetConSeq('vpu', alphabet='pro'))),
               ('gag', partial(get_ref_align, ref_func, ConSeqs.GetConSeq('gag', alphabet='pro'))),
               ('v3', partial(get_ref_align, ref_func, 'CTRPNNNTRKSIHIGPGRAFYTTGEIIGDIRQAHC')),
               ]

aseqs = {}
for col, afunc in align_funcs:
    
    aseqs[col] = wseqs[col].map(afunc)


aseqs_df = pd.DataFrame(aseqs)

In [111]:
def decide_trop(ser):
    if (ser['IsX4'] == 1) and (ser['IsR5'] == 0):
        return 1.0
    elif (ser['IsX4'] == 0) and (ser['IsR5'] == 1):
        return 0.0
    return np.nan

naseqs_df, alanl = aseqs_df.align(wlanl, axis=0, join='inner')
wmask = alanl[['IsX4', 'IsR5']].any(axis=1) & ~alanl[['IsX4', 'IsR5']].all(axis=1)
wtrop = alanl[['IsX4', 'IsR5']][wmask].apply(decide_trop, axis=1)

In [142]:
hwseqs = naseqs_df.apply(lambda x: x.notnull(), axis=0).astype(float)
hwseqs['Trop'] = wtrop
pd.pivot_table(hwseqs, 
               rows = 'Trop',
               aggfunc = 'sum')


Out[142]:
env gag ltr nef pol v3 vif vpr vpu
Trop
0 1215 94 197 380 95 3183 254 186 65
1 118 16 53 52 17 240 18 13 2

In [137]:
from operator import itemgetter
from sklearn.metrics import adjusted_mutual_info_score, normalized_mutual_info_score

results = []
for col in naseqs_df.columns:
    
    wanted_seqs, wanted_trops = naseqs_df[col].dropna().align(wtrop, join='inner')
    
    for pos in range(len(wanted_seqs.values[0])):
        getter = itemgetter(pos)
        wanted_col = wanted_seqs.map(getter)
        adj_score = adjusted_mutual_info_score(wanted_col.values, wanted_trops.values)
        norm_score = normalized_mutual_info_score(wanted_col.values, wanted_trops.values)
        results.append({
                        'Prot':col,
                        'Pos':pos+1,
                        'adj_score':adj_score,
                        'norm_score':norm_score
                        })

In [138]:
results_df = pd.DataFrame(results).groupby(['Prot', 'Pos']).first()

In [139]:
fig, ax = plt.subplots(1,1, figsize = (15, 5))
results_df.ix['env'].plot(ax = ax, alpha=0.5)
ax.set_xlim(0, results_df.ix['env'].index[-1])
ax.set_xlabel('Env Pos')
ax.set_ylabel('Score')


Out[139]:
<matplotlib.text.Text at 0x1a9f09d0>

In [140]:
prots = list(aseqs_df.columns)
fig, axs = plt.subplots(len(prots), 1, figsize = (10,15))
for ax, prot in zip(axs.flatten(), prots):
    
    results_df.ix[prot].plot(ax = ax, alpha=0.8, legend = False, linewidth=2)
    ax.set_xlim(0, results_df.ix[prot].index[-1])
    if ax.is_last_row():
        ax.set_xlabel('ConB Pos')
    ax.set_title(prot)
    ax.set_ylabel('Score')
    ax.set_ylim(0, ax.get_ylim()[1])
fig.tight_layout()



In [146]:
genome_dict = {}
with open('HIV1_ALL_2012_genome_DNA.fasta') as handle:
    for name, seq in GeneralSeqTools.fasta_reader(handle):
        genome_dict[name.rsplit('.', 1)[-1]] = seq.replace('-', '')
genome_ser = pd.Series(genome_dict)

In [155]:
trops = agg_lanl_data[['IsX4', 'IsR5']].dropna().apply(decide_trop, axis=1).dropna()
trop_omes, wgenomes = trops.align(genome_ser, join='inner')

In [157]:
with open('/home/will/PySeqUtils/HIVDBFiles/HXB2Sequence.fasta') as handle:
    _, conb_genome = GeneralSeqTools.fasta_reader(handle).next()

In [177]:
from Bio.Seq import Seq

def rev_trans(align_func, offset, seq):
    tseq = Seq(seq[:-(rframe+1)]).reverse_complement().translate(stop_symbol='X')
    return align_func(tseq.tostring())

rgenomes = {}
for rframe in range(3):
    
    cb_seq = Seq(conb_genome[:-(rframe+1)]).reverse_complement().translate(stop_symbol='X')
    afunc = partial(get_ref_align, ref_func, cb_seq)
    rev_ref_func = partial(rev_trans, afunc, rframe)
    rgenomes[rframe] = wgenomes.map(rev_ref_func)
    
rgenomes = pd.DataFrame(rgenomes)

In [178]:
null_results = []
for col in rgenomes.columns:
    
    wanted_seqs, wanted_trops = rgenomes[col].dropna().align(trop_omes, join='inner')
    
    for pos in range(len(wanted_seqs.values[0])):
        getter = itemgetter(pos)
        wanted_col = wanted_seqs.map(getter)
        adj_score = adjusted_mutual_info_score(wanted_col.values, wanted_trops.values)
        norm_score = normalized_mutual_info_score(wanted_col.values, wanted_trops.values)
        null_results.append({
                             'Frame':col+1,
                             'Pos':pos+1,
                             'adj_score':adj_score,
                             'norm_score':norm_score
                             })

In [184]:
null_res_df = pd.DataFrame(null_results)
null_res_df['adj_score'] = null_res_df['adj_score'].clip(0)
null_res_df['norm_score'] = null_res_df['norm_score'].clip(0)

In [196]:
fig, axs = plt.subplots(2,1, sharex=True, figsize = (10, 10))
plot_cols = ['adj_score', 'norm_score']
edges = np.linspace(0, 0.2, 50)
for ax, col in zip(axs.flatten(), plot_cols):
    null_res_df[col].hist(bins = edges, ax = ax)
    ax.set_ylabel('#sites')
    ax.set_xlabel(col)



In [198]:
all_rgenomes = {}
for rframe in range(3):
    print 'frame', rframe
    cb_seq = Seq(conb_genome[:-(rframe+1)]).reverse_complement().translate(stop_symbol='X')
    afunc = partial(get_ref_align, ref_func, cb_seq)
    rev_ref_func = partial(rev_trans, afunc, rframe)
    all_rgenomes[rframe] = genome_ser.map(rev_ref_func)
    
all_rgenomes = pd.DataFrame(all_rgenomes)


frame 0
frame 1
frame 2

In [199]:
from itertools import combinations
con_len = len(conb_genome)


pairwise_null = []
for p1, p2 in combinations(range(con_len), 2):
    
    for frame in all_rgenomes.columns:
        col1 = all_rgenomes[frame].map(itemgetter(p1))
        col2 = all_rgenomes[frame].map(itemgetter(p2))
        pairwise_null.append({
                              'Frame':frame+1,
                              'Pos1': p1,
                              'Pos2': p2,
                              'adj_score': adjusted_mutual_info_score(col1.values, col2.values),
                              'norm_score': normalized_mutual_info_score(col1.values, col2.values),
                          
                              })


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-199-c519fb8ae91b> in <module>()
      8     for frame in all_rgenomes.columns:
      9         col1 = all_rgenomes[frame].map(itemgetter(p1))
---> 10         col2 = all_rgenomes[frame].map(itemgetter(p2))
     11         pairwise_null.append({
     12                               'Frame':frame+1,

/usr/local/lib/python2.7/dist-packages/pandas/core/series.pyc in map(self, arg, na_action)
   2495             return Series(new_values, index=self.index, name=self.name)
   2496         else:
-> 2497             mapped = map_f(values, arg)
   2498             return Series(mapped, index=self.index, name=self.name)
   2499 

/usr/local/lib/python2.7/dist-packages/pandas/lib.so in pandas.lib.map_infer (pandas/lib.c:42840)()

IndexError: string index out of range

In [ ]: