In [1]:
from __future__ import division
import pandas as pd
import numpy as np
import os
import sys
import gspread
from StringIO import StringIO
import csv
sys.path.append('/home/will/PySeqUtils/')
sys.path.append('/home/will/PatientPicker/')
os.chdir('/home/will/LTRGraphs/')

In [2]:
import LoadingTools
import GeneralSeqTools
import TFSeqTools
import ConSeqs

In [3]:
redcap_data = LoadingTools.load_redcap_data()
redcap_data['SingleID'] = redcap_data[['Patient ID', 'VisitNum']].apply(lambda x: '-'.join(x), axis = 1)

In [4]:
def decide_tropism(inval):
    if inval < -6.95:
        return 'R5'
    elif inval > -2.88:
        return 'X4'
    elif inval < -4.92:
        return 'R5-P'
    elif inval >= -4.92:
        return 'X4-P'
    
    return 'Unk'

gc = gspread.login('judowill@gmail.com', 'gfzasfxjagrxdmqq')
spread = gc.open("V3 Project")
worksheet = spread.worksheet('PBMC Progress Report')
handle = StringIO()
writer = csv.writer(handle, delimiter = '\t')
rows = worksheet.get_all_values()
writer.writerow(rows[0])
for row in rows[1:]:
    if row[0].startswith('A'):
        try:
            writer.writerow(row)
        except UnicodeEncodeError:
            print row
handle.seek(0)

df = pd.read_csv(handle, sep = '\t', parse_dates = [5])
df['HasSeq'] = df['V3 Amino Acid Sequence'].notnull()
df['Date'] = df['Date'].map(pd.to_datetime)
df['Date'][df['Date'] == 'nan'] = np.nan
df['TropismPrediction'] = df['PSSM Score'].map(decide_tropism)
df['SingleID'] = df[['Patient', 'Visit']].apply(lambda x: '-'.join(x), axis = 1)

In [5]:
pat_data = pd.merge(redcap_data, df,
                    left_on ='SingleID',
                    right_on = 'SingleID',
                    how = 'outer').groupby('SingleID').first()

In [6]:
import glob
ltr_files = sorted(glob.glob('/home/will/HIVReportGen/Data/PatientFasta/*LTR.fasta'))
ltr_seqs = {}
for f in ltr_files:
    with open(f) as handle:
        _, seq = GeneralSeqTools.fasta_reader(handle).next()
        fname = os.path.basename(f).rsplit('-', 1)[0]
        ltr_seqs[fname] = seq

In [7]:
ltr_df = pd.DataFrame({
                       'LTR':pd.Series(ltr_seqs)
                       })
ltr_df.head()


Out[7]:
LTR
A0001-R01 TACACACCAGGGCCAGGAGTCAGATATCCACTGACCTTTGGATGGT...
A0001-R04 CTAGTACCAGTTGAGCCAGAGAAGTTAGAAGAAGCCAACAAAGGAG...
A0002-R00 GGTCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCA...
A0002-R01 CAGGGCCAGGGGTCAGATATCCACTGACCTTTGGATGGTGCTACAA...
A0002-R03 TTGTTACACCCTGTGAGCCTGCATG

In [8]:
conb_ltr = ConSeqs.GetConSeq('ltr')
conb_ltr


Out[8]:
'tGGAaGGGcTaaTttacTCcCaaaaaAgacAagAtATcCTTGAtcTGTGGgtctAccAcAcaCAaGGCTacTTCCCtGAtTgGCAgAAcTACACAccAGGgccaGGgatcAGaTatCCacTgaccTTTGGaTGgTGcTtcAAgcTAGTACCAgTtgAgCcAGagaaggTagAagagGccAatgaaggagagaacaacagctTGtTaCAcCCtatgagCCtgCATGGgatgGAgGAcccgGAgaaagAAGtgtTagtgTGGAagTttGACAgccgccTaGcatttcatCAcatggCccgaGAgctgcATCCggAgTactacaaggActGcTGACatcgagctttctacaaGGGACTTTCCgCtgGGGACTTTccagggagGcGtggcctGGgcgggaCtggGGAgtggCgagCCCtcAGAtgcTgCATATAAGCAGCtGCttTttGccTGTACtGGgTCTCTCTggttaGaCCAGATCtGAGCctGGGAGcTCtctggctaactagggaacccactgcttaagcctcaataaagcttgccttgagtgcttcaagtagtgtgtgcccgtctgttgtgtgactctggtaactagagatccctcagacccttttagtcagtgtggaaaatctctagca'

In [9]:
from functools import partial
def scan_seq(mot, name, seq):
    if len(seq) < len(mot):
        return pd.Series([np.nan, np.nan], index = [name+'-Score', name+'-Seq'])
    score, _, seq = TFSeqTools.simple_score_pwm(seq, mot)
    return pd.Series([score, seq], index = [name+'-Score', name+'-Seq'])

def region_extractor(conb_ltr, start, stop, seq):
    oseq = TFSeqTools.slice_to_ref(seq, conb_ltr, start, stop)
    nseq = oseq.replace('-', '')
    if len(nseq):
        return nseq
    else:
        return np.nan

pwm_dict = TFSeqTools.Load_PWMS()
             
regions = [('AP1-IV', 104-5, 104+15, pwm_dict['ap1']),
           ('AP1-III', 119-5, 119+15, pwm_dict['ap1']),
           ('AP1-II', 154-5, 154+15, pwm_dict['ap1']),
           ('AP1-I', 213-5, 213+15, pwm_dict['ap1']),
           ('CEBP-II', 280-5, 280+15, pwm_dict['cebpa']),
           ('ETS', 304-5, 304+15, pwm_dict['ets1']),
           ('ATF/Creb', 329-5, 329+15, pwm_dict['creb1']),
           ('CEBP-I', 329-5, 329+15, pwm_dict['cebpa']),
           ('NFKB-II', 349-5, 349+15, pwm_dict['nf-kappab']),
           ('NFKB-I', 362-5, 362+15, pwm_dict['nf-kappab']),
           ('SpIII', 376-5, 376+15, pwm_dict['sp1']),
           ('SPII', 387-5, 387+15, pwm_dict['sp1']),
           ('SPI', 398-5, 398+15, pwm_dict['sp1'])]

ltr_df_cp = ltr_df.copy()
for tfname, start, stop, mot in regions:
    r_ex = partial(region_extractor, conb_ltr, start, stop)
    tf_checker = partial(scan_seq, mot, tfname)
    ltr_df_cp[tfname+'-Region'] = ltr_df_cp['LTR'].map(r_ex)
    out = ltr_df_cp[tfname+'-Region'].dropna().apply(tf_checker)
    ltr_df_cp = pd.merge(ltr_df_cp, out,
                         left_index=True,
                         right_index=True,
                         how='left')

In [10]:
cols = dict([('Patient ID', 'PatientID'),
        ('VisitNum', 'VisitNum'),
        ('Age', 'Age'),
        ('Gender', 'Gender'),
        ('Latest viral load', 'VL'),
        ('Latest CD4 count (cells/uL)', 'CD4'),
        ('Latest CD8 count (cells/uL)', 'CD8'),
        ('Current ART status', 'ART'),
        ('Hepatitis C status (HCV)', 'HCV'),
        ('TropismPrediction', 'Tropism')
        ])

tfnames = [tf for tf, _, _, _ in regions]
score_cols = [tf+'-Score' for tf, _, _, _ in regions]
seq_cols = [tf+'-Seq' for tf, _, _, _ in regions]

wanted_pat = pat_data[cols.keys()].dropna()
wanted_scores = ltr_df_cp[score_cols+seq_cols].dropna()
wanted_scores['TFJoin'] = wanted_scores[seq_cols].apply(lambda x: ''.join(x), axis=1)
#wanted_scores = wanted_scores.drop(seq_cols, axis=1)

check_data = pd.concat(wanted_pat.align(wanted_scores, axis=0, join='inner'), axis=1).rename(columns = cols)
check_data = check_data.fillna(check_data[score_cols].min())

ncols = dict((col, col.replace('-', '_').replace('/', '_')) for col in check_data.columns)
check_data = check_data.rename(columns = ncols)

In [11]:
import TreeingTools

tree = TreeingTools.run_FastTree(check_data['TFJoin'].to_dict().items(),
                                 alphabet=TreeingTools.generic_dna)

In [23]:
import networkx as nx
from itertools import combinations
import csv
with open('ltr_tree.nwk', 'w') as handle:
    tree.write_to_stream(handle, schema = 'phylip', exclude_chars=True)

In [32]:


In [37]:
def get_node_names(node):
    if node.taxon is None:
        return node.oid
    else:
        return node.taxon.label
        
names = [get_node_names(node) for node in tree.nodes()]

tree_graph = nx.DiGraph()
tree_graph.add_nodes_from(names)

for parent_tree_node in tree.nodes():
    parent_graph_name = get_node_names(parent_tree_node)
    for child_node in parent_tree_node.child_nodes():
        edge_len = child_node.edge_length
        child_graph_name = get_node_names(child_node)
        tree_graph.add_edge(parent_graph_name, child_graph_name, weight=edge_len)

In [41]:
check_data['LogVL'] = check_data['VL'].map(np.log10)
check_data.to_csv('ltr_node_attr.csv')
nx.write_gml(tree_graph, 'tree_graph.gml')

In [44]:
check_data.std()


Out[44]:
CD8                 717.468733
Age                   8.063528
CD4                 274.584899
VL                92871.898151
AP1_IV_Score          2.496936
AP1_III_Score         0.840058
AP1_II_Score          2.382229
AP1_I_Score           2.069136
CEBP_II_Score         1.818473
ETS_Score             1.085574
ATF_Creb_Score        1.630819
CEBP_I_Score          1.782758
NFKB_II_Score         1.751523
NFKB_I_Score          1.251658
SpIII_Score           1.203220
SPII_Score            1.011706
SPI_Score             0.761576
LogVL                 1.157234
dtype: float64

In [45]:
check_data['AP1_I_Score'].describe()


Out[45]:
count    454.000000
mean       2.711878
std        2.069136
min       -4.104490
25%        1.721269
50%        1.721269
75%        5.704474
max        8.215880
dtype: float64

In [107]:
nscore_cols = [col for col in check_data.columns if 'Score' in col]
pat_graph = nx.Graph()
with open('node_attributes.csv', 'w') as handle:
    writer = csv.DictWriter(handle, check_data.columns)
    for key, row in check_data.iterrows():
        pat_graph.add_node(key, **row.dropna().to_dict())
        writer.writerow(row.dropna().to_dict())
    
with open('edge_table.csv', 'w') as handle:
    writer = csv.DictWriter(handle, ['Source', 'Target']+nscore_cols)
    for (key1, row1), (key2, row2) in combinations(check_data.iterrows(), 2):
    
        tdists = (row1[nscore_cols]-row2[nscore_cols])
        tdict = tdists.dropna().to_dict()
        tdict['Source'], tdict['Target'] = (key1, key2)
        writer.writerow(tdict)
        mean_dists = tdists.mean()
        if mean_dists == 0:
            pat_graph.add_edge(key1, key2)

In [108]:
nx.write_gml(pat_graph, 'ltr_tf_graph.gml')

In [91]:
dists = []
for (key1, row1), (key2, row2) in combinations(check_data.iterrows(), 2):
    
    dists.append(row1[score_cols]-row2[score_cols])
dists_df = pd.DataFrame(dists)

In [92]:
print diffs_df.std()


AP1-I       2.162032
AP1-II      2.667938
AP1-III     0.985831
AP1-IV      2.541642
ATF/Creb    2.102272
CEBP-I      2.114661
CEBP-II     2.361735
ETS         1.787472
NFKB-I      4.248597
NFKB-II     5.451791
SPI         1.692703
SPII        2.695668
SpIII       2.695507
dtype: float64

In [101]:
from scipy.stats import gaussian_kde
#fig, ax = plt.subplots(1,1, figsize=(20,5))
vals = diffs_df.fillna(diffs_df.min()).mean(axis=1)
#kde = gaussian_kde(vals)
rank_vals = vals.rank()



#ax.set_xticks(np.arange(0, 10, 0.5))

In [105]:
print vals[rank_vals.idxmax()], vals[rank_vals.idxmin()]


9.93954424102 0.0

In [ ]:
vals