In [1]:
# Notebook setup: run out of the SadiVariation directory and make the
# PySeqUtils helpers (fasta_reader/fasta_writer/WebPSSM) importable.
import os, os.path
import sys
import pandas as pd
import numpy as np

# NOTE(review): hardcoded absolute paths tie this notebook to one machine.
os.chdir('/home/will/SadiVariation/')
sys.path.append('/home/will/PySeqUtils/')

In [2]:
from GeneralSeqTools import fasta_reader, fasta_writer, WebPSSM_V3_series
import glob

In [3]:
# Strip one residue from each end of every sequence in the .old fasta
# files and rewrite them under the original (non-.old) filenames.
files = [('x4_seqs.fasta.old', 'x4_seqs.fasta'),
         ('r5_seqs.fasta.old', 'r5_seqs.fasta')]
for ifile, ofile in files:
    with open(ifile) as handle:
        with open(ofile, 'w') as ohandle:
            for name, seq in fasta_reader(handle):
                # seq[1:-1] drops the first and last character of the sequence
                fasta_writer(ohandle, [(name, seq[1:-1])])

In [4]:
# Load per-sequence subtype calls produced by the SubtypeGuess pipeline.
# Each *.gb file is named <GI>.<ext> and holds the subtype call on its
# first line.
subtype_files = glob.glob('/home/will/WLAHDB_data/SubtypeGuess/*.gb')
subtypes = []
for f in subtype_files:
    # GI accession is the file's basename without its extension.
    gb = f.rsplit(os.sep, 1)[-1].split('.')[0]
    with open(f) as handle:
        # next(handle) (builtin) instead of handle.next(): works on both
        # Python 2.6+ and Python 3; only the first line is needed.
        subtype = next(handle).strip()
        if subtype != 'Unk':  # skip sequences with no confident subtype call
            subtypes.append((int(gb), subtype))
subtype_df = pd.DataFrame(subtypes, columns = ['GI', 'Subtype'])

# Collapse duplicate GIs, keeping the first subtype call per GI.
subtype_ser = subtype_df.groupby('GI')['Subtype'].first()


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-4-5d88017929c9> in <module>()
      4     gb = f.rsplit(os.sep, 1)[-1].split('.')[0]
      5     with open(f) as handle:
----> 6         subtype = handle.next().strip()
      7         if subtype != 'Unk':
      8             subtypes.append((int(gb), subtype))

KeyboardInterrupt: 

In [ ]:
# Load the HXB2-anchored pairwise alignments (needle output converted to
# fasta) as a list of (name, sequence) tuples.
with open('hxb2.needle') as handle:
    aligned_seqs = list(fasta_reader(handle))

In [ ]:
from scipy.stats import linregress

# Anchor points pairing HXB2 positions with our LTR-relative positions.
hxb2_inds = [350, 364, 375, 388, 398, 456]
our_inds = [-103, -99, -85, -74, -63, 0]

# Least-squares line mapping HXB2 coordinates onto the LTR-relative scale.
fit = linregress(hxb2_inds, our_inds)
m, b = fit[0], fit[1]

# Project every alignment column (0..633) through the fitted line.
# NOTE(review): despite the name, new_hxb2_inds holds values on the
# LTR-relative scale (the fit's output axis) -- confirm intended naming.
new_our_inds = np.arange(0, 634)
new_hxb2_inds = np.ceil(m * new_our_inds + b)

In [ ]:
# The needle fasta alternates (HXB2, subject) records; walk it in pairs.
starts = range(0, len(aligned_seqs), 2)
aligned = []
for s in starts:
    _, hxb2_seq = aligned_seqs[s]
    gi, gi_seq = aligned_seqs[s+1]
    # Keep only columns where HXB2 has a residue (drop HXB2-gap columns),
    # putting every subject sequence on a common HXB2-indexed frame.
    aseq = ''.join(q for q, r in zip(gi_seq, hxb2_seq) if r.isalpha())
    aligned.append((int(gi), np.array(list(aseq))))

In [ ]:
# One aligned character-array per GI (first wins on duplicates).
aligned_ser = pd.DataFrame(aligned, columns=['GI', 'alignment']).groupby('GI').first()['alignment']
# Restrict subtype calls to exactly the GIs that have an alignment.
subs, _ = subtype_ser.align(aligned_ser, join='right')

In [ ]:
# Distribution of subtype calls among the aligned sequences.
subs.value_counts()

In [ ]:
# Collect the V3-loop sequence for every GI that has an LTR alignment.
aset = set(gi for gi, _ in aligned)
with open('/home/will/WLAHDB_data/SeqDump/B_v3.fasta') as handle:
    v3_seqs = []
    for gi, seq in fasta_reader(handle):
        if int(gi) in aset:
            v3_seqs.append((int(gi), seq))
    print len(v3_seqs)

In [ ]:
# Call coreceptor tropism from V3 via the WebPSSM service: score > 0.5 is
# labelled X4, otherwise R5.
# NOTE(review): the 0.5 cut-off semantics are assumed from usage here --
# confirm against the WebPSSM_V3_series implementation.
trop_dict = dict((int(gi), 'X4' if trop > 0.5 else 'R5') for gi, trop in WebPSSM_V3_series(v3_seqs))
v3_ser = pd.Series(trop_dict)
# Align tropism calls to the GIs that have LTR alignments.
trop_data, _ = v3_ser.align(aligned_ser, join='right')

In [ ]:
# Distribution of predicted coreceptor tropism.
trop_data.value_counts()

In [ ]:
# Clinical/annotation metadata for these sequences pulled from LANL.
lanl_data = pd.read_csv('/home/will/HIVTropism/R5Cluster/LANLResults.tsv', sep = '\t')

In [ ]:
# Align LANL tissue and coreceptor annotations onto the aligned-GI index.
# NOTE(review): alignment joins on lanl_data's index -- confirm that index
# is GI-keyed rather than the default RangeIndex from read_csv.
wtissue, _ = lanl_data['NewSimpleTissue'].dropna().align(aligned_ser, join = 'right')
wcor_receptor, _ = lanl_data['Coreceptor'].dropna().align(aligned_ser, join = 'right')

In [ ]:
# Known LTR transcription-factor sites: (name, reference sequence,
# LTR-relative start, LTR-relative stop).
# NOTE(review): DS3-B and DS3-D carry identical sequences, and several
# spans do not match their sequence lengths (e.g. AP1-II is 11 nt over an
# 18-nt span) -- confirm against the source annotation.
check_seqs = [('CEBP-US2', 'ATTTCATCA', -170, -162),
              ('ATF-CREB', 'CTGACATCG', -123, -115),
              ('CEBP-US1', 'AGCTTTCTACAA', -114, -103),
              ('NFKB-II', 'AGGGACTTTCC', -103, -93),
              ('NFKB-I', 'GGGGACTTTCC', -99, -89),
              ('SP-III', 'GAGGCGTGG', -85, -77),
              ('SP-II', 'TGGGCGGGA', -74, -66),
              ('SP-I', 'GGGGAGTGG', -63, -55),
              ('AP1-I', 'TTGAGTGCT', 85, 93),
              ('AP1-II', 'TGTTGTGTGAC', 121, 138),
              ('AP1-III', 'TTTAGTCAG', 153, 161),
              ('DS3-B', 'TCAGTGTGGAAAATC', 158, 175),
              ('DS3-C', 'GTAGTGTGGAAAATC', 158, 175),
              ('DS3-D', 'TCAGTGTGGAAAATC', 158, 175),
              ('DS3-A', 'ACTGTGTAAAAATC', 158, 175)
              ]
slop = 20  # flanking context (bases) allowed on each side of a site
check_dict = {'Subs':subs, 'tissue': wtissue, 'Coreceptor': trop_data}
for name, seq, start, stop in check_seqs:
    # Alignment columns falling within the site +/- slop.
    mask = (new_hxb2_inds>(start-slop)) & (new_hxb2_inds<(stop+slop))
    # Ungapped region from every aligned sequence.
    extracted = aligned_ser.map(lambda x: ''.join(x[mask]).replace('-', ''))
    
    # 1.0 when the exact site sequence occurs in the region, else 0.0 ...
    check_dict[name] = extracted.map(lambda x: seq in x).map(float)
    # ... and NaN when nothing aligned in this window at all.
    # NOTE(review): chained indexed assignment; .loc would be safer on
    # newer pandas.
    check_dict[name][extracted.map(len)==0] = np.nan
df = pd.DataFrame(check_dict)

In [ ]:


In [ ]:
# Site-presence rate (%) per subtype.  (pivot_table(rows=...) is the
# pre-0.14 pandas spelling of index=...)
print pd.pivot_table(df, rows = 'Subs', aggfunc='mean').T*100

In [ ]:
# Quick manual rate check: 9 hits out of 290 sequences, as a fraction.
print 9.0/290

In [ ]:
# Site-presence rate (%) per predicted coreceptor tropism.
print pd.pivot_table(df, rows = 'Coreceptor', aggfunc='mean').T*100

In [ ]:
def find_best(tf_seq, ltr_seq):
    """Best fractional identity of tf_seq against any window of ltr_seq.

    Slides a window of len(tf_seq) along ltr_seq, scores each window by
    its count of position-wise matches, and returns the best count divided
    by the motif width (a value in [0, 1]).  Returns NaN when ltr_seq is
    shorter than tf_seq, i.e. no full-width window exists.
    """
    width = len(tf_seq)
    scores = []
    # +1 so the final, flush-right window is scored too; the original
    # range(0, len(ltr_seq) - width) silently skipped it (off-by-one),
    # and wrongly returned NaN when len(ltr_seq) == width.
    for start in range(0, len(ltr_seq) - width + 1):
        scores.append(sum(s == t for s, t in zip(tf_seq, ltr_seq[start:(start + width)])))
    if scores:
        return max(scores) / float(width)
    else:
        return np.nan

# Best fractional-identity score of each site against each sequence region.
best_dict = {'Subs':subs, 'tissue': wtissue, 'Coreceptor': trop_data}
for name, seq, start, stop in check_seqs:
    print name
    # Same windowing as the presence/absence pass above.
    mask = (new_hxb2_inds>(start-slop)) & (new_hxb2_inds<(stop+slop))
    extracted = aligned_ser.map(lambda x: ''.join(x[mask]).replace('-', ''))
    best_dict[name] = extracted.map(lambda x: find_best(seq, x))
bdf = pd.DataFrame(best_dict)

In [ ]:
# Mean divergence (%) from each site sequence, per subtype.
print (1-pd.pivot_table(bdf, rows = 'Subs')).T*100

In [ ]:
# Mean divergence (%) from each site sequence, per tropism call.
print (1-pd.pivot_table(bdf, rows = 'Coreceptor')).T*100

In [109]:
# Extract the DS3 region (LTR-relative ~159-174, +/- 5) from every aligned
# sequence and dump it to fasta, split by tropism call and by subtype C.
# NOTE(review): this In[109] cell regenerates the r5/x4 fasta files that
# earlier cells read -- execution order of this notebook is non-linear.
slop=5
mask = (new_hxb2_inds>(159-slop)) & (new_hxb2_inds<(174+slop))
ds3_ser = aligned_ser.map(lambda x: ''.join(x[mask]).replace('-', ''))
# Sequences with nothing aligned in this window become missing.
ds3_ser[ds3_ser.map(len)==0] = np.nan
with open('r5_seqs.fasta', 'w') as handle:
    fasta_writer(handle, ds3_ser[trop_data == 'R5'].dropna().to_dict().items())

with open('x4_seqs.fasta', 'w') as handle:
    fasta_writer(handle, ds3_ser[trop_data == 'X4'].dropna().to_dict().items())
    
with open('subC_seqs.fasta', 'w') as handle:
    fasta_writer(handle, ds3_ser[subs == 'C'].dropna().to_dict().items())
          
#print ds3_ser[trop_data == 'X4'].dropna()

In [5]:
# Reload the tropism-labelled DS3 sequences into a list of records (one
# dict per sequence) ready for DataFrame construction.  A single loop over
# (filename, label) pairs replaces the original copy-pasted R5/X4 blocks.
quick_seqs = []
for fname, trop in [('r5_seqs.fasta', 'R5'), ('x4_seqs.fasta', 'X4')]:
    with open(fname) as handle:
        for name, seq in fasta_reader(handle):
            quick_seqs.append({
                               'Name': name,
                               'Trop': trop,
                               'Seq': seq,
                               })

In [9]:
from Bio.Seq import Seq
from Bio import Motif
from StringIO import StringIO
from itertools import groupby
from operator import methodcaller
from Bio.Alphabet import IUPAC

def yield_motifs():
    """Yield (name, motif) pairs from the JASPAR PWM flat file.

    The file alternates '>name' header lines with PFM matrix lines;
    groupby on startswith('>') pairs each header group with the matrix
    block that follows it.  Each motif is yielded twice: once as-is and
    once reverse-complemented with a '-R' suffix on the name.
    """
    with open('/home/will/LTRtfAnalysis/Jaspar_PWMs.txt') as handle:
        for key, lines in groupby(handle, methodcaller('startswith', '>')):
            if key:
                # next(lines) (builtin) instead of lines.next(): works on
                # both Python 2.6+ and Python 3.
                name = next(lines).strip().split()[-1].lower()
            else:
                tmp = ''.join(lines)
                mot = Motif.read(StringIO(tmp), 'jaspar-pfm')
                yield name, mot
                yield name+'-R', mot.reverse_complement()
# Load every JASPAR motif (and its reverse complement) into a
# name -> motif dictionary.
pwm_dict = {}
for num, (name, mot) in enumerate(yield_motifs()):
    if num % 100 == 0:
        print num  # progress
    pwm_dict[name] = mot


0
100
200

In [53]:
from functools import partial
from scipy.stats import ttest_ind, gaussian_kde, chi2_contingency

def score_seq(mot, seq):
    """Return the best PWM hit score of motif `mot` over sequence `seq`."""
    dna_seq = Seq(seq, alphabet=IUPAC.unambiguous_dna)
    return np.max(mot.scanPWM(dna_seq))

def make_cdfs(kde, points):
    """Survival function (1 - CDF) of `kde` evaluated at each of `points`.

    Parameters
    ----------
    kde : object exposing integrate_box_1d (e.g. scipy gaussian_kde)
    points : iterable of evaluation positions

    Returns
    -------
    numpy array of 1 - CDF(point), same length as `points`.
    """
    cdf = np.array([kde.integrate_box_1d(-np.inf, p) for p in points])
    return 1 - cdf
    
    
# Compare R5 vs X4 score distributions for selected motifs (survival
# curves), mark a 0.5% FPR threshold, and chi-square test the counts
# above/below that threshold.
# NOTE(review): `plt` is never imported in any visible cell -- presumably
# the kernel ran under %pylab inline; confirm before re-running.
wanted_mots = ['cebpa-R',
               #'nfatc2',
               'nfatc2-R']
fig, axs = plt.subplots(2,1, sharex=True, figsize = (10, 5))
quick_seqs_df = pd.DataFrame(quick_seqs)

r5_mask = quick_seqs_df['Trop'] == 'R5'
x4_mask = quick_seqs_df['Trop'] == 'X4'
for ax, mot in zip(axs.flatten(), wanted_mots):
    # Best PWM score of this motif for every DS3 sequence.
    quick_seqs_df[mot] = quick_seqs_df['Seq'].map(partial(score_seq, pwm_dict[mot]))
    
    r5_vals = quick_seqs_df[mot][r5_mask].dropna().values
    x4_vals = quick_seqs_df[mot][x4_mask].dropna().values
    
    # Smooth each score distribution with a KDE for plotting.
    r5_kde = gaussian_kde(r5_vals)
    x4_kde = gaussian_kde(x4_vals)
    
    points = np.linspace(0, 15)
    # Survival curves: fraction of sequences scoring above each value.
    ax.plot(points, make_cdfs(r5_kde, points), 'b', label = 'R5')
    ax.plot(points, make_cdfs(x4_kde, points), 'g', label = 'X4')
    ax.set_title(mot)
    if ax.is_last_row():
        ax.set_xlabel('TF Score')
    else:
        ax.legend()
    
    ax.set_ylabel('Frac Sequences')
    # Score threshold giving a 0.5% false-positive rate for this PWM.
    thresh = Motif.Thresholds.ScoreDistribution(pwm_dict[mot], precision = 100).threshold_fpr(0.005)
    ax.vlines(thresh, 0, 1)
    
    # 2x2 table of (above, at-or-below) threshold counts for R5 vs X4.
    ch2table = [[(r5_vals>thresh).sum(), (r5_vals<=thresh).sum()],
                [(x4_vals>thresh).sum(), (x4_vals<=thresh).sum()],]
    
    _, pval, _, _ = chi2_contingency(ch2table)
    print mot, np.mean(r5_vals), np.mean(x4_vals), pval
plt.tight_layout()
#plt.savefig('/home/will/Dropbox/Wigdahl HIV Lab/SadiTFFigure/TFscores.png', dpi = 300)
#plt.savefig('/home/will/Dropbox/Wigdahl HIV Lab/SadiTFFigure/TFscores.png', dpi = 300)


cebpa-R 3.25429779065 2.42559581026 0.000490673533494
nfatc2-R 9.56427351272 11.4786407471 0.243616994612

In [30]:
ax.vlines?

In [119]:
from sklearn.svm import OneClassSVM

# Quick OneClassSVM demo on a toy 1-D sample: keep only the points the
# model labels as inliers (predict > 0).
data = np.array([2,5,30,4,8,3,5,4,2,5,3,4,5,4]).reshape((-1, 1))
tmp = OneClassSVM().fit(data).predict(data)
data[tmp>0]


Out[119]:
array([[5],
       [5],
       [5],
       [5]])

In [121]:
# Count how many LTR-region fasta files are non-trivially populated
# (strictly more than 10 bytes of content).
count = 0
for num, f in enumerate(glob.glob('/home/will/WLAHDB_data/RegionSplit/ltr/*.fasta')):
    if num % 5000 == 0:
        # Progress report; %-format keeps the original "num count" output
        # identical under both Python 2 and Python 3.
        print('%d %d' % (num, count))
    with open(f) as handle:
        # Only need to know whether the file exceeds 10 bytes, so read at
        # most 11 bytes instead of slurping the whole file into memory.
        if len(handle.read(11)) > 10:
            count += 1


0 0
5000 98
10000 175
15000 250
20000 346
25000 444
30000 536
35000 623
40000 706
45000 793
50000 887
55000 970
60000 1055
65000 1151
70000 1247
75000 1343
80000 1445
85000 1541
90000 1636
95000 1729
100000 1837
105000 1933
110000 2030
115000 2128
120000 2227
125000 2297
130000 2395
135000 2494
140000 2588
145000 2678
150000 2767
155000 2870
160000 2962
165000 3050
170000 3130
175000 3224
180000 3327
185000 3416
190000 3511
195000 3602
200000 3704
205000 3804
210000 3905
215000 3997
220000 4077
225000 4166
230000 4256
235000 4350
240000 4442
245000 4524
250000 4605
255000 4706
260000 4812
265000 4917
270000 5013
275000 5119
280000 5222
285000 5304
290000 5404
295000 5506
300000 5607
305000 5693
310000 5793
315000 5881
320000 5988
325000 6090
330000 6198
335000 6300
340000 6398
345000 6507
350000 6584
355000 6659
360000 6763
365000 6863
370000 6959
375000 7053
380000 7141
385000 7223
390000 7328
395000 7421
400000 7532
405000 7631
410000 7755
415000 7846
420000 7958
425000 8072
430000 8173
435000 8271
440000 8349
445000 8454
450000 8544
455000 8631
460000 8744
465000 8834
470000 8935
475000 9032
480000 9132
485000 9220
490000 9330
495000 9406
500000 9509
505000 9614
510000 9712

In [ ]: