In [1]:
from pandas import *
from subprocess import check_call
from tempfile import NamedTemporaryFile as NTF
import os, os.path
import numpy as np
from scipy.stats import ttest_ind
from itertools import groupby,combinations, islice
from operator import itemgetter
import sys

from random import shuffle
import csv, shlex, shutil

os.chdir('/home/will/HIVTropism/')
sys.path.append('/home/will/HIVReportGen/AnalysisCode/')
sys.path.append('/home/will/PySeqUtils/')

In [2]:
from GeneralSeqTools import fasta_reader,

In [19]:
tat_ex1_pos = (5830, 6044) #0 based

pos_data = read_csv('simple_results.txt', sep = '\t')

with open('Tat1-AB1_passed-cleaned.fasta') as handle:
    seq_data = list(fasta_reader(handle))

In [43]:
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna, generic_protein

def translate_to_tat(inseq, start_pos, end_pos, rep = 0):
    
    if rep == 2:
        print 'ESCAPED!!!'
        
        return 
    nstart = tat_ex1_pos[0] - start_pos
    nend = tat_ex1_pos[1] - end_pos
    
    if (nstart >= 0):
        nseq = inseq[nstart:nend]
        tseq = Seq(nseq, generic_dna)
        aa = tseq.translate()
        return aa.tostring()
    else:
        #print 'starts late'
        nseq = 'N'*abs(nstart)+inseq
        #print 'fixing'
        #print inseq
        #print nseq
        #print len(inseq), len(nseq), tat_ex1_pos[1]-tat_ex1_pos[0]
        return translate_to_tat(nseq, tat_ex1_pos[0], end_pos, rep = rep+1)

In [48]:
prots = []
names = []
for (name, seq), (idx, row) in zip(seq_data, pos_data.iterrows()):
    aa = translate_to_tat(seq, row['GenomeStart']-1, row['GenomeEnd']-1)
    if sum(1 for l in aa if l == 'X') < 10:
        
        prots.append(aa)
        names.append(name)

In [49]:
seq_df = DataFrame({
                        'Tat':Series(prots, index = names)
                    })

In [51]:
from SeqProcessTools import align_seq_data_frame

out_align = align_seq_data_frame(seq_df, '/home/will/HIVReportGen/Data/BlastDB/ConBseqs.txt')


/usr/local/lib/python2.7/dist-packages/futures/__init__.py:24: DeprecationWarning: The futures package has been deprecated. Use the concurrent.futures package instead.
  DeprecationWarning)

In [61]:
cohort_data = read_csv('Texas_Cohort_Data.txt', sep = '\t')
nout_align = out_align.reset_index()
nout_align['short_ind'] = nout_align['index'].map(lambda x: x.split('-')[0])


nout = merge(nout_align, cohort_data, 
            left_index = 'short_ind', right_index = 'Patient ID')

nout = nout.drop(nout['NeuroCog'] == 'Not Tested', axis = 0)
print nout


<class 'pandas.core.frame.DataFrame'>
Int64Index: 26 entries, 2 to 27
Data columns:
index            26  non-null values
Tat              26  non-null values
Tat-bin-align    26  non-null values
Tat-seq-align    26  non-null values
short_ind        26  non-null values
Patient ID       26  non-null values
NeuroCog         26  non-null values
NeuroPath        26  non-null values
dtypes: object(8)

In [63]:
from SeqAnalysisTools import calculate_fisher_exact

impaired = nout['NeuroCog'] != 'Normal'

res = calculate_fisher_exact(nout['Tat-bin-align'][impaired], nout['Tat-bin-align'][~impaired])

In [68]:
#import numpy as np
plt.figure(figsize = (10,10))
res.plot()
plt.ylabel('p-value')
plt.xlabel('Tat EX-1')
plt.ylim([0,1.1])


Out[68]:
(0, 1.1)

In [69]:
res.to_csv('p_value_res.csv')

In [74]:
nout.dropna()['NeuroCog'].value_counts()


Out[74]:
MCMD            16
HAD              6
Subsyndromic     3
Normal           1

In [76]:
nout['NeuroCog']


Out[76]:
2           Normal
3     Subsyndromic
4     Subsyndromic
5     Subsyndromic
6             MCMD
7             MCMD
8             MCMD
9             MCMD
10            MCMD
11            MCMD
12            MCMD
13            MCMD
14            MCMD
15            MCMD
16            MCMD
17            MCMD
18            MCMD
19            MCMD
20             HAD
21             HAD
22             HAD
23            MCMD
24            MCMD
25             HAD
26             HAD
27             HAD
Name: NeuroCog

In [ ]: