In [1]:
import os, os.path
import csv
import numpy as np
import concurrent.futures
from itertools import groupby

In [2]:
def fasta_reader(handle, check_lens = True):
    """Parse FASTA-formatted lines into a ``{name: sequence}`` dict.

    Parameters
    ----------
    handle : iterable of str
        An open text file, or any iterable yielding the lines of a FASTA file.
    check_lens : bool
        When True every sequence must be as long as the first one read.

    Returns
    -------
    dict
        Header text (without the leading '>' and surrounding whitespace)
        mapped to the sequence with all line breaks removed.

    Raises
    ------
    AssertionError
        If ``check_lens`` is set and two sequences differ in length.
    ValueError
        If non-blank text appears before the first '>' header.
    """
    outseqs = {}
    clen = None
    name = None
    for is_header, lines in groupby(handle, lambda x: x.startswith('>')):
        if is_header:
            # Consecutive header lines arrive as one group; as before, only
            # the first of them names the record that follows.
            name = next(lines)[1:].strip()
            continue

        seq = ''.join(l.strip() for l in lines)
        if name is None:
            # Lines seen before any header. Blank lines (e.g. a leading empty
            # line) are skipped instead of crashing on an unassigned name.
            if seq:
                raise ValueError('Sequence data found before the first FASTA header')
            continue

        outseqs[name] = seq
        if clen is None:
            clen = len(seq)
        elif check_lens and (len(seq) != clen):
            raise AssertionError('Sequence lengths are not the same')
    return outseqs

def data_reader(handle):
    """Read a two-column, tab-separated table into a ``{key: class}`` dict.

    Each row supplies one key (first column) and its class label (second
    column). Exactly one distinct label, or more than two, is rejected with
    an AssertionError; the resulting dict is returned otherwise.
    """
    rows = csv.reader(handle, delimiter = '\t')
    data = dict(rows)

    n_classes = len(set(data.values()))
    if n_classes == 1:
        raise AssertionError('Only one class found in the data!')
    if n_classes > 2:
        raise AssertionError('More then 2 classes found in the data!')
    return data

In [3]:
def data2numpy(align_dict, group_dict):
    """Turn the sequence and class dicts into aligned numpy arrays.

    Only keys present in both dicts are used, in sorted order. The class
    labels are sorted; the first is mapped to True and the second to False.

    Returns ``(align, mask, gdict)``: a character array with one row per
    shared key, the boolean class of each row, and the label -> bool map.
    """
    labels = sorted(set(group_dict.values()))
    gdict = dict(zip(labels, [True, False]))

    shared = sorted(set(align_dict.keys()) & set(group_dict.keys()))
    rows = []
    flags = []
    for key in shared:
        rows.append(list(align_dict[key]))
        flags.append(gdict[group_dict[key]])

    return np.array(rows), np.array(flags), gdict

In [230]:
from tempfile import NamedTemporaryFile as NTF
import shlex
from subprocess import check_call


def refine_alignment(npalign, refseq):
    """Re-align the rows of `npalign` together with `refseq` via ``muscle -refine``.

    The rows are written to a temporary FASTA file as 'Seq-<row>' followed by
    the reference as 'REFSEQ'; muscle writes its result to a second temporary
    file, which is read back with fasta_reader (equal lengths enforced).

    Returns ``(nalign, nrefseq)``: the refined rows in their original order
    and the refined reference, both as character arrays.
    """
    cmd = 'muscle -in %(ifile)s -out %(ofile)s -refine'
    names = ['Seq-%i' % num for num in range(npalign.shape[0])]

    with NTF(mode = 'w') as inseq_handle:
        records = [(name, ''.join(npalign[num,:])) for num, name in enumerate(names)]
        records.append(('REFSEQ', ''.join(refseq)))
        for record in records:
            inseq_handle.write('>%s\n%s\n' % record)
        # muscle reads the file by name, so the data must be on disk first.
        inseq_handle.flush()

        with NTF(mode = 'r') as outseq_handle:
            paths = {'ifile':inseq_handle.name, 'ofile':outseq_handle.name}
            check_call(shlex.split(cmd % paths))
            refined_seqs = fasta_reader(outseq_handle, check_lens = True)

    nrefseq = np.array(list(refined_seqs['REFSEQ']))
    nalign = np.array([list(refined_seqs[name]) for name in names])
    return nalign, nrefseq

In [80]:
# Load the sequence-name -> class table (two tab-separated columns).
with open('/home/will/data.tsv') as handle:
    data = data_reader(handle)
    
# Load the alignment; fasta_reader enforces that all sequences share one length.
with open('/home/will/Dropbox/HIVseqs/Neuroseqs/new_large_aln.fasta') as handle:
    seqs = fasta_reader(handle)
# The record named 'K03455' serves as the reference row for every test below
# (presumably the HXB2 genome, given the later 'hxb2pos' naming -- confirm).
refseq = np.array(list(seqs['K03455']))

In [169]:
# Character matrix of the sequences that have a class label, plus their boolean
# class. Note that `group_dict` here receives the label -> bool map returned by
# data2numpy, not the per-sequence table.
npalign, mask, group_dict = data2numpy(seqs, data)

In [211]:
from pandas import DataFrame, Series
from scipy.stats import fisher_exact, chi2

def fishers_test(g1align, g2align, ref):
    """Per-column Fisher's exact test of "matches the reference" between two groups.

    For each alignment column a 2x2 table is built from the non-gap rows of
    each group: rows matching `ref` versus rows differing from it. A column
    is skipped (None) unless both groups have at least `minval` rows equal to
    the reference in that column; that count is taken before gaps are
    filtered out.

    Returns a Series with one p-value (or a missing value) per column.
    """
    minval = 5

    g1match = g1align == ref
    g2match = g2align == ref
    g1present = g1align != '-'
    g2present = g2align != '-'

    g1sum = g1match.sum(axis = 0)
    g2sum = g2match.sum(axis = 0)

    g1pos = (g1match & g1present).sum(axis = 0)
    g1neg = (~g1match & g1present).sum(axis = 0)
    g2pos = (g2match & g2present).sum(axis = 0)
    g2neg = (~g2match & g2present).sum(axis = 0)

    pvals = []
    for col in range(g1match.shape[1]):
        enough = (g1sum[col] >= minval) and (g2sum[col] >= minval)
        if not enough:
            pvals.append(None)
            continue
        table = [[g1pos[col], g2pos[col]],
                 [g1neg[col], g2neg[col]]]
        pvals.append(fisher_exact(table)[1])
    return Series(pvals)

In [212]:
# Compare the two labelled groups column by column, then show only the
# alignment columns whose p-value is below 0.05 (no multiple-testing correction).
fishers_res = fishers_test(npalign[mask,:], npalign[~mask,:], refseq)
fishers_res[fishers_res<0.05]


Out[212]:
115    0.015652
141    0.048252
142    0.048252
150    0.006873
177    0.005826
191    0.013752
303    0.019932
344    0.034427
354    0.021306
418    0.037789
462    0.037970
519    0.007037
538    0.013633
582    0.012208
613    0.028476
640    0.044908
642    0.013934
661    0.020130
774    0.000950
775    0.004910
797    0.011833
847    0.030595

In [184]:
from collections import defaultdict
def log_factorial(n):
    """Return log10(n!) computed as log10(1) + log10(2) + ... + log10(n).

    For n < 1 the loop is empty and the integer 0 is returned.
    """
    total = 0
    for x in range(1, n+1):
        total += np.log10(x)
    return total

def multi_nomial_dist(observed_count, total_count = None):
    """Multinomial probability of `observed_count` under reference frequencies.

    Parameters
    ----------
    observed_count : dict
        Category -> observed integer count.
    total_count : dict, optional
        Category -> count used to derive the category probabilities. When
        omitted, the observed counts themselves are used.

    Returns
    -------
    float
        N!/(n_1! ... n_k!) * prod(p_i ** n_i), evaluated in log10 space.

    Notes
    -----
    * `total_count` is no longer modified: the adjusted totals live in a
      private copy. A plain dict lacking an observed key is also accepted now
      (the missing total is treated as 0).
    * `observed_count[k]` is still looked up for every category of the totals.
      With a defaultdict this inserts the missing categories into the
      caller's dict; likelihood_ratio reads ``len(g1count)`` after calling
      this function, so that behaviour is deliberately left unchanged.
    """
    totals = dict(observed_count if total_count is None else total_count)

    # Raise each total to at least the observed count so that every observed
    # category ends up with a non-zero probability.
    for key, seen in observed_count.items():
        totals[key] = max(seen, totals.get(key, 0))

    # count2prob normalises its argument in place; `totals` is already private.
    tp = count2prob(totals, want_dec = False)
    N = int(sum(observed_count.values()))
    nf_log = log_factorial(N)

    d_log = 0
    for n in list(observed_count.values()):
        d_log += log_factorial(n)

    p = nf_log-d_log
    for k, nnp in tp.items():
        p += observed_count[k]*np.log10(nnp)

    return 10**float(p)

def countdict(intup):
    """Count the items of `intup`, ignoring single non-alphabetic characters.

    An item is counted when it is alphabetic or longer than one character, so
    a lone '-' (gap) is skipped. The result is a ``defaultdict(int)``; looking
    up an unseen key yields 0 and adds that key.
    """
    counts = defaultdict(int)
    for item in intup:
        if not (item.isalpha() or len(item) > 1):
            continue
        counts[item] += 1
    return counts

def count2prob(d, want_dec = False):
    """Normalise the counts in `d` to fractions of their sum, in place.

    The same dict object is returned. `want_dec` is accepted but not used.
    """
    total = sum(d.values())
    for key, count in list(d.items()):
        d[key] = count/total
    return d
 
    
def likelihood_ratio(g1align, g2align):
    """Likelihood-ratio test of group 1's letter counts against group 2's frequencies.

    Letters (gaps excluded, see countdict) are counted per group. If either
    group has fewer than 5 counted letters, ``(None, None)`` is returned.
    Otherwise the multinomial probability of group 1's counts is evaluated
    under its own frequencies and under group 2's, and
    ``-2 * (log(p_g2) - log(p_self))`` is compared with a chi-square
    distribution.

    Returns ``(ratio, pval)``.
    """
    g1count = countdict(g1align)
    g2count = countdict(g2align)

    n1 = sum(g1count.values())
    n2 = sum(g2count.values())
    if n1 < 5 or n2 < 5:
        return None, None

    self_p = multi_nomial_dist(g1count)
    g2_p = multi_nomial_dist(g1count, total_count = g2count)

    ratio = -2*(np.log(g2_p)-np.log(self_p))
    # Read only after both calls above: multi_nomial_dist looks up g1count[k]
    # for every category of the totals, and since g1count is a defaultdict
    # that adds any letter seen only in group 2.
    df = len(g1count)
    pval = 1-chi2.cdf(ratio, df)
    return ratio, pval

def MVhypergeo_test(g1align, g2align, ref):
    """Run likelihood_ratio on every alignment column and collect the p-values.

    `ref` is accepted so the signature matches fishers_test but is not used.
    Columns where likelihood_ratio declines to test contribute None.
    """
    ncols = g1align.shape[1]
    pvals = [likelihood_ratio(g1align[:,col], g2align[:,col])[1]
             for col in range(ncols)]
    return Series(pvals)

In [185]:
# Column-wise likelihood-ratio p-values for the two labelled groups
# (refseq is passed for signature parity; MVhypergeo_test does not read it).
mvhypergeo_res = MVhypergeo_test(npalign[mask,:], npalign[~mask,:], refseq)

In [44]:
from tempfile import NamedTemporaryFile as NTF
from Bio import Phylo
from itertools import combinations, product
from subprocess import check_output, CalledProcessError
from operator import itemgetter
import shlex
import networkx
import time
from scipy.stats import ttest_ind

def fasta_write(handle, npalign):
    """Write each row of `npalign` to `handle` as a FASTA record named 'Seq-<row>'.

    Returns the list of record names in row order.
    """
    seqnames = ['Seq-%i' % row for row in range(npalign.shape[0])]
    for row, name in enumerate(seqnames):
        handle.write('>%s\n%s\n' % (name, ''.join(npalign[row,:])))
    return seqnames
        
def tree_checker(row):
    """Distances from one leaf to the leaves at or after it in name order.

    Parameters
    ----------
    row : tuple
        ``(tree_file, leaf1)``: path of a Newick tree and an object whose
        ``.name`` identifies a terminal of that tree.

    Returns
    -------
    dict
        ``{(name_a, name_b): distance}`` stored in both orientations for every
        leaf pair that could be measured.
    """
    tree_file, leaf1 = row
    dmat = {}
    # Close the tree file as soon as it is parsed; it used to be left open.
    with open(tree_file) as tree_handle:
        tree = Phylo.read(tree_handle, 'newick')
    leafs = sorted(tree.get_terminals(), key = lambda x: x.name)
    # Raises ValueError (empty max) if no terminal carries leaf1's name.
    spos = max(pos for pos, leaf in enumerate(leafs) if leaf.name == leaf1.name)
    nleaf1 = next(tree.find_clades(name = leaf1.name))
    for leaf2 in leafs[spos:]:
        try:
            d = tree.distance(nleaf1, leaf2)
        except RuntimeError:
            # Best effort, as before: pairs that cannot be measured are left out.
            continue
        dmat[(leaf1.name, leaf2.name)] = d
        dmat[(leaf2.name, leaf1.name)] = d
    return dmat
        
def get_pairwise_distances(npalign, tree_file = None, seq_file = None):
    """Build a guide tree with muscle and return leaf-to-leaf path sums.

    Parameters
    ----------
    npalign : 2-D character array
        One sequence per row; rows are written out as 'Seq-<row>'.
    tree_file : str, optional
        Where muscle should write the Newick tree. A temporary file is used
        when omitted.
    seq_file : str, optional
        Where the FASTA input is written. A temporary file is used when
        omitted. (Previously any value caused '/tmp/tmp.fasta' to be written
        instead of the requested path.)

    Returns
    -------
    numpy.ndarray or None
        Square matrix indexed by row number. Entry (i, j) is the sum of
        ``branch_length`` over every clade on the shortest path between the
        two leaves, endpoints included. None if muscle fails or the tree
        cannot be parsed.
    """
    # Temporary and regular files are both context managers, so the handles
    # are now closed (and temporaries removed) on every exit path.
    fasta_handle = NTF(mode = 'w') if seq_file is None else open(seq_file, 'w')
    with fasta_handle:
        tree_handle = NTF() if tree_file is None else open(tree_file, 'w')
        with tree_handle:
            fasta_write(fasta_handle, npalign)

            # muscle opens the file by name: push the data to disk first.
            fasta_handle.flush()
            os.fsync(fasta_handle.fileno())

            # An argument list avoids re-splitting paths that contain spaces.
            cmdlist = ['muscle',
                       '-in', fasta_handle.name,
                       '-tree2', tree_handle.name,
                       '-gapopen', '-2.9']
            try:
                check_output(cmdlist)
                with open(tree_handle.name) as tree_in:
                    tree = Phylo.read(tree_in, 'newick')
            except (CalledProcessError, ValueError, RuntimeError):
                # muscle failed, or it produced no parsable tree.
                return None

    net = Phylo.to_networkx(tree)
    # NetworkX >= 2 returns an iterator of (node, paths) pairs here; dict()
    # accepts both that and the mapping returned by older releases.
    dmat = dict(networkx.all_pairs_shortest_path(net))
    terminals = tree.get_terminals()
    dists = np.zeros((npalign.shape[0], npalign.shape[0],))
    for t1, t2 in product(terminals, terminals):
        path = dmat[t1][t2]
        # NOTE(review): a clade without a branch length (None) would make this
        # sum fail -- confirm muscle always emits lengths for clades on a path.
        dist = sum(c.branch_length for c in path)
        # Leaf names are 'Seq-<row>'; recover the row numbers.
        i1 = int(t1.name.split('-')[1])
        i2 = int(t2.name.split('-')[1])
        dists[i1,i2] = dist

    return dists

def tree_dist_pvals(align, mask, window_size = 25, max_workers = 30):
    """Sliding-window t-test on within-group tree distances.

    For every alignment column a window centred on it (clipped at the
    alignment edges) is handed to get_pairwise_distances in a process pool.
    The distances among the `mask` rows are then compared with the distances
    among the remaining rows using ttest_ind.

    Parameters
    ----------
    align : 2-D character array
        Alignment, one sequence per row.
    mask : 1-D boolean array
        True for rows of the first group.
    window_size : int
        Nominal window width; ``int((window_size-1)/2)`` columns are taken on
        each side of the centre column.
    max_workers : int
        Size of the process pool (default 30, the value previously hard-coded).

    Returns
    -------
    Series
        One p-value per column; missing where no tree could be built.
    """
    pvals = []
    span = int((window_size-1)/2)
    ncols = align.shape[1]

    # The slice end is exclusive, so +1 is needed to include column col+span;
    # without it the window was one column short on the right.
    aligns = [align[:, max(0, col-span):min(ncols, col+span+1)]
              for col in range(ncols)]

    # dmat[mask, mask] pairs the two boolean indices element-wise and returns
    # only entries (i, i). np.ix_ builds the open mesh that selects the full
    # within-group sub-matrix, which is what the .ravel() below expects.
    g1_block = np.ix_(mask, mask)
    g2_block = np.ix_(~mask, ~mask)

    with concurrent.futures.ProcessPoolExecutor(max_workers = max_workers) as executor:
        for col, dmat in enumerate(executor.map(get_pairwise_distances, aligns)):
            # Sparse progress report.
            if (col == 5) or (col == 30) or (col % 50 == 0):
                print(col)
            if dmat is None:
                pvals.append(None)
                continue
            # The sub-matrices hold each pair in both orientations plus the
            # diagonal entries.
            g1vals = dmat[g1_block].ravel()
            g2vals = dmat[g2_block].ravel()

            _, pval = ttest_ind(g1vals, g2vals)
            pvals.append(pval)
    return Series(pvals)

In [45]:
# Expensive: one muscle run per alignment column, spread over a process pool.
# The numbers printed below are the progress markers from tree_dist_pvals.
treeD_res = tree_dist_pvals(npalign, mask, window_size = 25)


0
5
30
50
100
150
200
250
300
350
400
450
500
550
600

In [62]:
from pandas import read_csv
# Tab-separated patient table; the cells below read the 'Patient ID',
# 'Patient visit number' and cognitive score columns from it.
patdata = read_csv('/home/will/Dropbox/HIVseqs/Neuroseqs/NeuroData.tsv', sep = '\t')

In [65]:
# Collapse the patient table to one row per (patient, visit), keeping the
# maximum of each cognitive score recorded for that visit.
agg_dict = {
            'Psychomotor Speed Score':'max',
            'Memory Recall Score':'max',
            'Constructional Score':'max',
            'Total Modified Hopkins Dementia Score':'max'
            }
cog_data = patdata.groupby(['Patient ID', 'Patient visit number'], as_index = False).aggregate(agg_dict)

In [81]:
# Build a (patient, visit, sequence) table from the FASTA record names.
# Names are expected to look like '<patient>-<visit>'; any name that does not
# split into exactly two parts on '-' (e.g. the reference) is skipped.
# Both the patient and the visit are kept as strings here.
seqdata = []
for key, seq in seqs.items():
    parts = key.split('-')
    if len(parts) != 2:
        continue
    pat, visit = parts
    seqdata.append({
                    'Patient ID':pat,
                    'Patient visit number':visit,
                    'LTRseq':seq
                    })
SeqFrame = DataFrame(seqdata)

In [148]:
# Drug-use table (tab-separated despite the .csv extension). One boolean
# column is added per value of 'Classification' that is compared later.
drugdata = read_csv('/home/will/Dropbox/HIVseqs/Neuroseqs/DrugPop.csv', sep = '\t')
drugdata['PN'] = drugdata['Classification'] == 'PN'
drugdata['PC'] = drugdata['Classification'] == 'PC'
drugdata['MD'] = drugdata['Classification'] == 'MD'

In [150]:
from pandas import merge
# Attach the drug classification to every cognitive row (outer join on the
# patient identifier, which is named 'Patient' in the drug table).
all_data = merge(cog_data, drugdata,
                left_on = 'Patient ID',
                right_on = 'Patient',
                how = 'outer')
# Then attach the sequence for each (patient, visit) pair.
# NOTE(review): SeqFrame stores 'Patient visit number' as a string taken from
# the FASTA name; confirm that the column read from NeuroData.tsv has a
# matching dtype, otherwise the visit keys cannot line up.
all_data = merge(all_data, SeqFrame, 
                left_on = ['Patient ID', 'Patient visit number'],
                right_on = ['Patient ID', 'Patient visit number'],
                how = 'outer')


# Score columns that are coerced to float further down in this cell.
cols = ['Psychomotor Speed Score',
        'Memory Recall Score',
        'Constructional Score',
        'Total Modified Hopkins Dementia Score']

def safe_float(val):
    """Convert `val` to float, returning None when it cannot be converted.

    Unparsable strings raise ValueError inside float(); None and other
    non-numeric objects raise TypeError. Both now yield None.
    """
    try:
        return float(val)
    except (ValueError, TypeError):
        return None

# Coerce every score column to float; entries safe_float cannot convert become missing.
for col in cols:
    all_data[col] = all_data[col].map(safe_float)

In [157]:
# One row per patient: number of visits, the last sequence listed for the
# patient, and the minimum of each cognitive score across visits.
wanted_cog_data = all_data.groupby('Patient ID').aggregate({'Patient visit number':'count',
                                                            'LTRseq':'last',
                                                            'Psychomotor Speed Score':'min',
                                                            'Memory Recall Score':'min',
                                                            'Constructional Score':'min',
                                                            'Total Modified Hopkins Dementia Score':'min'})
# Keep only patients with no missing value in any of the columns above.
nwanted_cog_data = wanted_cog_data.dropna(axis = 0)

# Same per-patient layout for the drug flags: a flag is True if any row of
# the patient carries that classification.
wanted_drug_data = all_data.groupby('Patient ID').aggregate({'Patient visit number':'count',
                                                            'LTRseq':'last',
                                                            'PN':'any',
                                                            'PC':'any',
                                                            'MD':'any'})
nwanted_drug_data = wanted_drug_data.dropna(axis = 0)

In [229]:
def resolve_indices(res_series, refseq):
    """Map per-column results onto reference positions.

    Each alignment column is labelled with the number of non-gap reference
    letters seen up to and including it, so gap columns share the label of
    the preceding reference letter (0 before the first one). Results that
    share a label are reduced to their minimum.

    Returns a Series named 'results' indexed by 'hxb2pos'.
    """
    seen = 0
    hxb2pos = []
    for let in refseq:
        seen += 1 if let != '-' else 0
        hxb2pos.append(seen)

    frame = DataFrame({'hxb2pos':Series(hxb2pos), 'results':res_series})
    per_position = frame.groupby('hxb2pos').min()
    return per_position['results']

In [241]:
# Cognitive comparisons are restricted to patients with at least 3 visits.
long_mask = nwanted_cog_data['Patient visit number']>=3
# A patient counts as impaired on a scale when the per-patient minimum score
# falls below the cutoff used here.
tmhds_impaired = nwanted_cog_data['Total Modified Hopkins Dementia Score']<9
pyscho_impaired = nwanted_cog_data['Psychomotor Speed Score']<3
memory_impaired = nwanted_cog_data['Memory Recall Score']<2
# Fixed: this line read from `nwanted_data`, a name that is never defined in
# this notebook, so the cell failed on a fresh kernel.
const_impaired = nwanted_cog_data['Constructional Score']<1
groupings = [(nwanted_cog_data[long_mask & tmhds_impaired], nwanted_cog_data[long_mask & ~tmhds_impaired], 'TMHDS'),
             (nwanted_cog_data[long_mask & pyscho_impaired], nwanted_cog_data[long_mask & ~pyscho_impaired], 'Psychomotor'),
             (nwanted_cog_data[long_mask & memory_impaired], nwanted_cog_data[long_mask & ~memory_impaired], 'Memory'),
             (nwanted_cog_data[long_mask & const_impaired], nwanted_cog_data[long_mask & ~const_impaired], 'Constructional')]

# Every entry is (group-1 rows, group-2 rows, refined reference, name), all
# taken from one muscle refinement of the two groups plus the reference.
grouping_seq = []
for (g1, g2, gname) in groupings:
    print('refining', gname)
    # A dedicated name is used so the `seqs` dict loaded from the FASTA file
    # is not overwritten by an array.
    group_seqs = np.array([list(l) for l in g1['LTRseq']] + [list(l) for l in g2['LTRseq']])

    nalign, nref = refine_alignment(group_seqs, refseq)

    # refine_alignment keeps the row order, so the first len(g1) rows are group 1.
    g1seqs = nalign[:len(g1),:]
    g2seqs = nalign[(len(g1)):,:]
    grouping_seq.append((g1seqs.copy(), g2seqs.copy(), nref.copy(), gname))



# Pairwise comparisons between the drug classifications.
drug_cols = ['PN', 'PC', 'MD']
for d1, d2 in combinations(drug_cols, 2):
    print('refining', d1, d2)
    g1seqs = np.array([list(l) for l in nwanted_drug_data[nwanted_drug_data[d1]]['LTRseq']])
    g2seqs = np.array([list(l) for l in nwanted_drug_data[nwanted_drug_data[d2]]['LTRseq']])
    g1num = g1seqs.shape[0]

    group_seqs = np.vstack((g1seqs, g2seqs))
    nalign, nref = refine_alignment(group_seqs, refseq)
    g1seqs = nalign[:g1num,:]
    g2seqs = nalign[g1num:,:]
    gname = d1 + '_' + d2
    grouping_seq.append((g1seqs.copy(), g2seqs.copy(), nref.copy(), gname))


refining TMHDS
refining Psychomotor
refining Memory
refining Constructional
refining PN PC
refining PN MD
refining PC MD

In [343]:
# Show the per-patient cognitive summary (before rows with missing values are dropped).
wanted_cog_data


Out[343]:
<class 'pandas.core.frame.DataFrame'>
Index: 454 entries, A0001 to A0508
Data columns:
Constructional Score                     188  non-null values
LTRseq                                   452  non-null values
Memory Recall Score                      188  non-null values
Patient visit number                     454  non-null values
Psychomotor Speed Score                  188  non-null values
Total Modified Hopkins Dementia Score    188  non-null values
dtypes: float64(4), int64(1), object(1)

In [247]:
check_functions = [(MVhypergeo_test, 'MV_hypergeo'),
                   (fishers_test, 'Fishers')]
results = DataFrame(index = range(0,(refseq!='-').sum()))
for (g1seqs, g2seqs, nref, gname), (func, funcname) in product(grouping_seq, check_functions):
    print(gname, funcname)
    res = func(g1seqs, g2seqs, nref)
    aggres = resolve_indices(res, nref)
    colname = gname + '_' + funcname
    results[colname] = aggres


TMHDS MV_hypergeo
TMHDS Fishers
Psychomotor MV_hypergeo
Psychomotor Fishers
Memory MV_hypergeo
Memory Fishers
Constructional MV_hypergeo
Constructional Fishers
PN_PC MV_hypergeo
PN_PC Fishers
PN_MD MV_hypergeo
PN_MD Fishers
PC_MD MV_hypergeo
PC_MD Fishers

In [249]:
# Smallest p-value reached by each grouping/test combination (uncorrected).
results.min()


Out[249]:
TMHDS_MV_hypergeo             0.007775
TMHDS_Fishers                 0.010578
Psychomotor_MV_hypergeo       0.011895
Psychomotor_Fishers           0.012000
Memory_MV_hypergeo            0.107648
Memory_Fishers                0.032567
Constructional_MV_hypergeo    0.009284
Constructional_Fishers        0.001677
PN_PC_MV_hypergeo             0.000029
PN_PC_Fishers                 0.214105
PN_MD_MV_hypergeo             0.000472
PN_MD_Fishers                 0.245576
PC_MD_MV_hypergeo             0.368379
PC_MD_Fishers                 0.577490

In [254]:
from collections import defaultdict
# For every result column, the set of reference positions with p < 0.05.
naggres = defaultdict(set)
for col in results.columns:
    naggres[col] = set(results[col][results[col]<0.05].index)
print(naggres)


defaultdict(<class 'set'>, {'Psychomotor_MV_hypergeo': {239, 528, 306, 479, 286, 447}, 'TMHDS_MV_hypergeo': {514, 451, 335, 337, 434, 307, 436, 406, 407, 312}, 'TMHDS_Fishers': {227, 140, 206, 239, 336, 119, 381}, 'PN_PC_MV_hypergeo': {321}, 'Constructional_MV_hypergeo': {228, 504, 300, 466, 509, 337, 210, 148, 152, 349, 286}, 'PC_MD_Fishers': set(), 'PN_MD_Fishers': set(), 'PN_MD_MV_hypergeo': {321}, 'PC_MD_MV_hypergeo': set(), 'Constructional_Fishers': {497, 481, 443, 228, 357, 243, 362, 171, 300, 144, 337, 210, 179, 148, 501, 152, 504, 347, 286, 159}, 'PN_PC_Fishers': set(), 'Memory_MV_hypergeo': set(), 'Memory_Fishers': {340, 341}, 'Psychomotor_Fishers': {447, 411, 286, 278, 239}})

In [256]:
# Report the positions flagged (p < 0.05) by two different result columns.
for c1, c2 in combinations(naggres.keys(), 2):
    common = naggres[c1] & naggres[c2]
    if common:
        print(c1, c2, sorted(common))


Psychomotor_MV_hypergeo TMHDS_Fishers [239]
Psychomotor_MV_hypergeo Constructional_MV_hypergeo [286]
Psychomotor_MV_hypergeo Constructional_Fishers [286]
Psychomotor_MV_hypergeo Psychomotor_Fishers [239, 286, 447]
TMHDS_MV_hypergeo Constructional_MV_hypergeo [337]
TMHDS_MV_hypergeo Constructional_Fishers [337]
TMHDS_Fishers Psychomotor_Fishers [239]
PN_PC_MV_hypergeo PN_MD_MV_hypergeo [321]
Constructional_MV_hypergeo Constructional_Fishers [148, 152, 210, 228, 286, 300, 337, 504]
Constructional_MV_hypergeo Psychomotor_Fishers [286]
Constructional_Fishers Psychomotor_Fishers [286]

In [257]:
# Number of reference positions with p < 0.05 in each result column.
(results < 0.05).sum()


Out[257]:
TMHDS_MV_hypergeo             10
TMHDS_Fishers                  7
Psychomotor_MV_hypergeo        6
Psychomotor_Fishers            5
Memory_MV_hypergeo             0
Memory_Fishers                 2
Constructional_MV_hypergeo    11
Constructional_Fishers        20
PN_PC_MV_hypergeo              1
PN_PC_Fishers                  0
PN_MD_MV_hypergeo              1
PN_MD_Fishers                  0
PC_MD_MV_hypergeo              0
PC_MD_Fishers                  0

In [251]:
# Save the full p-value table as a tab-separated file.
results.to_csv('/home/will/Dropbox/HIVseqs/Neuroseqs/NeuroDrugRes.tsv', sep = '\t')

In [351]:
def make_logo(cols, spos, ofile):
    """Render a sequence logo for `cols` by calling the ``weblogo`` command.

    `cols` is an iterable of equal-width sequence strings. They are written
    to a temporary FASTA file as 'Seq-<n>' and passed to weblogo with the DNA
    alphabet (-A DNA); `spos` is given to weblogo's -i option and `ofile` is
    the output path. check_call raises if weblogo exits with an error.
    """
    cmd = 'weblogo -f %(ifile)s -o %(ofile)s -A DNA -i %(start)i'
    with NTF(mode = 'w') as ohandle:
        records = ('>%s\n%s\n' % ('Seq-%i' % n, seq) for n, seq in enumerate(cols))
        ohandle.writelines(records)
        # weblogo reads the file by name, so flush before launching it.
        ohandle.flush()
        idict = {'ifile':ohandle.name, 'ofile':ofile, 'start':spos}
        check_call(shlex.split(cmd % idict))

def grab_cols(align, ref, start, stop, tog = False):
    """Extract the ungapped sequence of every row inside a reference window.

    Parameters
    ----------
    align : 2-D character array
        Alignment rows to cut from.
    ref : 1-D character array
        Reference row of the same alignment; positions are counted on its
        non-gap letters, starting at 1.
    start, stop : int
        First and last reference position of the window (both inclusive when
        the alignment columns are selected).
    tog : int or False
        When an index is given, each returned sequence has that character
        replaced by 'C' with probability 0.7, using numpy's global random
        state (so the output is not reproducible unless it is seeded).
        False or None disables this. Index 0 is now honoured; it used to be
        treated as "off" because 0 is falsy.

    Returns
    -------
    list of str
        Gap-stripped window sequences cut to ``stop - start`` characters;
        rows with fewer characters than that are dropped.

    The ungapped reference window is printed, and so is the modified list
    when `tog` is active.
    """
    count = 0
    wanted = []
    for num, let in enumerate(''.join(ref)):
        if let != '-':
            count += 1
        if start <= count <= stop:
            wanted.append(num)

    # An explicit integer dtype keeps an empty selection usable as an index
    # (an empty list would otherwise become a float array and fail).
    winds = np.array(wanted, dtype = int)
    print(''.join(l for l in ref[winds] if l != '-'))
    nalign = align[:,winds]

    # NOTE(review): the selected columns span stop-start+1 reference
    # positions, but sequences are cut to stop-start characters, so the last
    # base of the window is dropped -- confirm whether that is intended.
    width = (stop-start)
    wseqs = []
    for r in range(nalign.shape[0]):
        s = ''.join(l for l in nalign[r,:] if l != '-')[:width]
        if len(s) == width:
            wseqs.append(s)

    if tog is False or tog is None:
        return wseqs

    nwseqs = []
    for s in wseqs:
        if np.random.rand()<0.7:
            n = list(s)
            n[tog] = 'C'
            s = ''.join(n)
        nwseqs.append(s)
    print(nwseqs)
    return nwseqs

In [352]:
# (name, first position, last position) of each site to draw, in the
# reference numbering used by grab_cols. Only the uncommented entries are
# processed; the rest are kept for reference.
tflocs = [#('CEBP-II', 281, 289),
           #('USF', 288, 294),
           #('ETs', 305, 313),
           #('Lef-1', 318, 330),
           ('ATF-CREB', 330, 338),
           #('CEBP-I', 338, 349),
           #('NFkB-II', 350, 359),
           #('NFkB-I', 363, 373),
           #('Sp-III', 377, 386),
           #('Sp-II', 388, 397),
           #('Sp-I', 399, 408),
           #('Oct-I', 441, 448),
           #('COUP-94', 94, 112),
           #('COUP-107', 107, 125),
           #('AP-1', 105, 111)
            ]
           
# Draw one logo per group for every grouping/site combination.
# NOTE(review): group 2 is passed tog = 7, which makes grab_cols overwrite
# character index 7 with 'C' in roughly 70% of the group-2 sequences using an
# unseeded random draw. The *_g2.eps logos therefore do not show the observed
# data and differ between runs -- confirm that this is intended.
for (g1seqs, g2seqs, nref, gname), (tfname, start, stop) in product(grouping_seq, tflocs):
    
    g1cols = grab_cols(g1seqs, nref, start, stop)
    g2cols = grab_cols(g2seqs, nref, start, stop, tog = 7)
    #print(g1cols)
    #raise KeyError
    fname = gname+'_'+tfname
    path = '/home/will/Dropbox/HIVseqs/Neuroseqs/logos/'
    print(fname)
    make_logo(g1cols, start, path+fname + '_g1.eps')
    make_logo(g2cols, start, path+fname + '_g2.eps')


TGACATCGA
TGACATCGA
['TGACACCG', 'TGACACCC', 'TGACATCC', 'TGACATTC', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGATATCC', 'TGACATCC', 'TGACACTC', 'TGACATCC', 'TAACACCC', 'TGACATCC', 'TGACATCC', 'TGACAGCC', 'TGACATTC', 'TGACATTC', 'TGACACCC', 'TGACATCC', 'TGACACCC', 'TGACATCC', 'TGACACCC', 'TGACCTCC', 'TGACATCG', 'TGACACTG', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACATCG', 'TGACAACG', 'TGACAGCC', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACAACC', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACATCG', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACGCTG', 'TGACACCC', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACACCG', 'TGACATTC', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACAGCG', 'TGACATCG', 'TGACATCG', 'TGACATCC', 'TGACATCG', 'TGACACCA', 'TGACATCC', 'TGACACCC', 'TGACACCA', 'TGACATCC', 'TGACATCG', 'TGACACTC', 'TGACAGCG', 'TGACATTG', 'TGACCTCG', 'TGACATCC', 'TGACATCC', 'TGACACTC', 'TGACAGCG', 'TGACACTC', 'TGATATCC', 'TGACACCG', 'TGACATCC', 'TGACATTC', 'TGACATCC', 'TGACACCC', 'TGACATCC', 'TGACACCC', 'TGACATCC']
TMHDS_ATF-CREB
TGACATCGA
TGACATCGA
['TGACATAC', 'TGACACTG', 'TGACATCG', 'TGACACCC', 'TGACACCG', 'TGACATCC', 'TGACATTG', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGATATCC', 'TGACATCC', 'TGACACTC', 'TGACATCC', 'TGACACTG', 'TAACACCA', 'TGACATCC', 'TGACACCG', 'TGACATCC', 'TGACAGCC', 'TGACATTC', 'TGACATTG', 'TGACACCC', 'TGACATCC', 'TGACACCG', 'TGACATCG', 'TGACACCG', 'TGACCTCC', 'TGACATCG', 'TGACACTC', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACACCG', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACATCG', 'TGACATTG', 'TGACATCG', 'TGACAACC', 'TGACAGCC', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGATATCG', 'TGACAACC', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATCG', 'TGACACCC', 'TGACATCG', 'TGACATCG', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACGCTC', 'TGACACCG', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACACTC', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACACCG', 'TGACACCG', 'TGACATTA', 'TGACATCG', 'TGACATCG', 'TGACATCC', 'TGACACTC', 'TGACAGCG', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACACTC', 'TGACATCC', 'TGACACCC', 'TGACATCG', 'TGACACCC', 'TGACACCC', 'TGACATCC', 'TGACATCG', 'TGACACTG', 'TGACAGCC', 'TGACATCC', 'TGACATTC', 'TGACACCC', 'TGACCTCC', 'TGACATTC', 'TGACATCG', 'TGACATCG', 'TGACACTC', 'TGACAGCG', 'TGACACTC', 'TGACATCC', 'TGACACCC', 'TGATATCC', 'TGACACCC', 'TGACATCC', 'TGACATTG', 'TGACATTC', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACACCC', 'TGACATCC', 'TGACATTG', 'TGACACCC', 'TGACATCC']
Psychomotor_ATF-CREB
TGACATCGA
TGACATCGA
['TGACATAG', 'TGACATCG', 'TGACATCG', 'TGACACTC', 'TGACATCC', 'TGACATCG', 'TGACACCG', 'TGATATCC', 'TGACACCG', 'TGACATCG', 'TGACATTG', 'TGACACTC', 'TGATATCC', 'TGACATCC', 'TGACATCC', 'TGACATCG', 'TGACATCG', 'TGACATCG', 'TGACATCC', 'TGATATCG', 'TGACATCC', 'TGACATCC', 'TGACACTG', 'TGACACTC', 'TGACACCC', 'TGACATCC', 'TGACACTC', 'TGACACTC', 'TGACATCC', 'TAACACCC', 'TGACATCC', 'TGACACCC', 'TGACATCC', 'TGACAGCG', 'TGACATTC', 'TGACATTG', 'TGACACCG', 'TGACATCC', 'TGACACCC', 'TGACATCC', 'TGACATCC', 'TGACACCC', 'TGACCTCG', 'TGACATTG', 'TGACATCC', 'TGACATCC', 'TGACACTC', 'TGACACTC', 'TGACATCC', 'TGACATCG', 'TGACATCG', 'TGACATTG', 'TGACACCC', 'TGACATCC', 'TGACATCG', 'TGACATCG', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACAACC', 'TGACAGCG', 'TGACATCC', 'TGACATCC', 'TGACATCG', 'TGATATCG', 'TGACACTC', 'TGACAACC', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACACCC', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACGCTG', 'TGACACCG', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATCG', 'TGACACTC', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACACCC', 'TGACACCC', 'TGACATCC', 'TGACATCC', 'TGACATTA', 'TGACATCG', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACACTC', 'TGACAGCC', 'TGACACTC', 'TGACACCC', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACACTC', 'TGACATCC', 'TGACACCC', 'TGACATCG', 'TGACACCC', 'TGACACCA', 'TGATATCC', 'TGACATCC', 'TGATATCG', 'TGACATCC', 'TGACACTC', 'TGACAGCC', 'TGACATCC', 'TGACACCC', 'TGACCTCC', 'TGACACTC', 'TGACACCC', 'TGACATCC', 'TGACATCG', 'TGACATTC', 'TGACAAGG', 'TGACATCG', 'TGACACCC', 'TGACCTCG', 'TGACATCC', 'TGACACCC', 'TGACATTG', 'TGACACCG', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACACTC', 'TGACAGCC', 'TGACACTC', 'TGACATCC', 'TGACATCC', 'TGACACCC', 'TGATATCC', 'TGACACCG', 'TGACATCC', 'TGACATTC', 'TGACATTC', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACACCG', 'TGACATCC', 'TGACATTC', 'TGACACCC', 'TGACATTC', 'TGACATCC', 'TGACACCC', 'TGACACCG', 'TGACACTC', 'TGACACTC']
Memory_ATF-CREB
TGACATCGA
TGACATCGA
['TGACATCC', 'TGACACCG', 'TGACATCC', 'TGACATTG', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGATATCC', 'TGACATCC', 'TGACATCC', 'TGACACTC', 'TGACACTC', 'TGACATCC', 'TGACATCC', 'TGACAGCC', 'TGACACCC', 'TGACATCC', 'TGACACCC', 'TGACCTCG', 'TGACACTC', 'TGACATCC', 'TGACATCC', 'TGACATCG', 'TGACATCG', 'TGACAACG', 'TGACATCC', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACGCTC', 'TGACATCC', 'TGACATCC', 'TGACACCG', 'TGACATTA', 'TGACAGCG', 'TGACACTG', 'TGACATCC', 'TGACACCA', 'TGACATCG', 'TGACACCG', 'TGACACCC', 'TGACATCG', 'TGACACTG', 'TGACAGCG', 'TGACATCG', 'TGACACCC', 'TGACATCG', 'TGACATCC', 'TGACATTC', 'TGACCTCC', 'TGACATCC', 'TGACACCG', 'TGACATCG', 'TGACATCC', 'TGACATTG', 'TGACACTC', 'TGACAGCC', 'TGACACTG', 'TGATATCG', 'TGACACCC', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACACCC', 'TGACATCC', 'TGACACCG', 'TGACACTC', 'TGACACTG']
Constructional_ATF-CREB
TGACATCGA
TGACATCGA

PN_PC_ATF-CREB
TGACATCGA
TGACATCGA
['TGATATCC', 'TGACATAG', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATCG', 'TGACACTC', 'TGATATCG', 'TGACATCG', 'TGACCTCC', 'TGACATCC', 'TGACATTC', 'TGACACCC', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACATCG', 'TGACACCG', 'TGACATCG', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATAC', 'TGACATTC', 'TGACCTCC', 'TGACACTC', 'TGATATCG', 'TGACATCC', 'TGACATCG', 'TGACATCG', 'TGACATCC', 'TGACATCG', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACACTG', 'TGACACTC', 'TGATATCG', 'TGACACTC', 'TGACATCC', 'TGACACCG', 'TGACACTC', 'TGACATCC', 'TGACATCG', 'TGACACTC', 'TGACACCC', 'TGACATCG', 'TGACATCG', 'TGACATCC', 'TGACACTC', 'TGACATCG', 'TGACACTC', 'TGACATCC', 'TAACACCC', 'TGACATCC', 'TGACACCC', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACACCC', 'TGACATTC', 'TGACATTC', 'TGACACCC', 'TGACACCC', 'TGACATTC', 'TGACATCG', 'TGACATCC', 'TGACACCC', 'TGACATCC', 'TGACATTG', 'TGACATCG', 'TGACATCG', 'TGACACTG', 'TGACATCG', 'TGACACTG', 'TGACATCC', 'TGACATCC', 'TGACCTCC', 'TGACATCC', 'TGACATTG', 'TGACATTC', 'TGACAGAC', 'TGACACCC', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACATCG', 'TGACATCG', 'TGACATCC', 'TGATACTC', 'TGACCTCC', 'TGACATCC', 'TGACATCG', 'TGACATTC', 'TGACATCC', 'TGACCTCC', 'TGACAACC', 'TGACAGCG', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACCTCC', 'TGACCTCC', 'TGACACTG', 'TGACAACG', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACACCC', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACCTCC', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACAGCG', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACACTC', 'TGACATTC', 'TGACACCC', 'TGACATCC', 'TGACATTC', 'TGACATCC', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACACTC', 'TGACATCC', 'TGACCTCG', 'TGACATCC', 'TGACATCC', 'TGACACTG', 'TGACACCC', 'TGATATCG', 'TGACATCG', 'TGACATCG', 'TGACATCG', 'TGACATTG', 'TGACATCC', 'TGACACCG', 'TGACACCG', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATTC', 'TGACATCC', 'TGACCTCC', 'TGACATCG', 'TGACAGCC', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACACTC', 'TGACAGCG', 
'TGACACTC', 'TGACATCC', 'TGACACTC', 'TGACACCG', 'TGACATCC', 'TGACAGCC', 'TGACATCC', 'TGACATCC', 'TGACACTC', 'TGACATCC', 'TGACATCC', 'TGACCTCC', 'TGACACCC', 'TGACATCG', 'TGACATCC', 'TGACAGCC', 'TGACAGCC', 'TGACATCG', 'TGACATTC', 'TGACATCC', 'TGACCTCC', 'TGACACCG', 'TGACCTCC', 'TGACACCC', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACAACC', 'TGACACTG', 'TGACACCC', 'TGACATCC', 'TGACATCC', 'TGACACTC', 'TGACACAC', 'TGACATTC', 'TGACACTG', 'TGACATCG', 'TGACACAC', 'TGACATCC', 'TGATATCG', 'TGACAGCG', 'TGACATCC', 'TGACACTC', 'TGACATCC', 'TGACATCC', 'TGACAGCG', 'TGACATCC', 'TGACCTCG', 'TGACATTG', 'TGACACCG', 'TGACACCC', 'TGACATCG', 'TGACACTG', 'TGACAACC', 'TGACATCC', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATCG', 'TGACACTC', 'TGACATCG', 'TGACATTC', 'TGACATCC', 'TGATATCC', 'TGACATCC', 'TGACAAGC', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACACCG', 'TGACCTCC', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATTC', 'TGACATCC', 'TGACATCC', 'TGACACCC', 'TGACATTG', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATCG', 'TGACATCG', 'TGACCTCC', 'TGACACTG', 'TGACATCC', 'TGACATTG', 'TGACCTCC', 'TGACATCC', 'TGACATCG', 'TGACACCG', 'TGACATTG', 'TGACATCG', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACACCC', 'TGACACTC', 'TGACACCC', 'TGACAGCC', 'TGACATCC', 'TGACATCC', 'TGACACTG', 'TGACATCC', 'TGACATCC', 'TGACACTC', 'TGACACTG', 'TGACATCC', 'TGACATCC', 'TGACACAG', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACATCG', 'TGACATCG', 'TGACATCG', 'TGACATCC', 'TGACACCC', 'TGACAGAC', 'TGACATCC', 'TGACATCC', 'TGATATCC', 'TGACATTC', 'TGACATTG', 'TGACACCG', 'TGATATCC', 'TGACATCC', 'TGACACCG', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATTC', 'TGACATCG', 'TGACCTCG', 'TGACATCG', 'TGACATCC', 'TTCTCTCC', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACACTC', 'TGACATCG', 'TGACATCC', 'TGACATCC', 'TGACACCG', 'TGACATCC', 'TGACATCG', 'TGACATCG', 'TGACACCG', 'TGACCTCG', 'TGACATTC', 'TGACATCC', 'TGACATCG', 'TGACACTC', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACACCC', 'TGACATCG', 'TGACACTG', 'TGACATTC', 
'TGACACCC', 'TGACACCG', 'TGACATTC', 'TGACACTC', 'TGACATTC', 'TGACATTG', 'TGACATCG', 'TGACATCC', 'TGACACCG', 'TGACACCG', 'TGACAGCG', 'TGACATTC', 'TGACACCG', 'TGACACCG', 'TGACACCC', 'TGACATCG', 'TGACACTC', 'TGACACCC', 'TGACATCG', 'TGACATTC', 'TGACACCG', 'TGACATCG', 'TGACACCC', 'TGACATCC', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATCG', 'TGACAGCC', 'TGACATCG', 'TGACACTG', 'TGACACTC', 'TGACACTC', 'TGACACTC', 'TGACAGCC', 'TGACATCC', 'TGACATCC', 'TGACACCC', 'TGACACCC', 'TGACATCA', 'TGACATCC', 'TGACACCG', 'TGACACCC', 'TGACATCC', 'TGACACCC', 'TGACATTG', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TAACATTG', 'TGACATCC', 'TGACACCC', 'TGACATCC', 'TGACATCG', 'TGACATCC', 'TGACATTC', 'TGACACTG', 'TGACATCC', 'TGACACCC', 'TGACATCG', 'TGATATCC', 'TGACACCC', 'TGACATCC', 'TGACATCC', 'TGACATCC', 'TGACACCC', 'TGACATTC']
PN_MD_ATF-CREB
TGACATCGA
TGACATCGA

PC_MD_ATF-CREB

In [333]:



Out[333]:
0.7215788947226387

In [ ]: