In [1]:
import sys
sys.path.append('/home/will/PySeqUtils/')
import os, os.path
os.chdir('/home/will/HIVTropism/R5Cluster/')
from itertools import islice
import pandas as pd
from GeneralSeqTools import fasta_reader, call_muscle

In [2]:
import autoreload
import SeqSklearn
%load_ext autoreload
%autoreload 2

In [3]:
from sklearn.preprocessing import label_binarize
def decide_tropism(inval):
    if inval < -6.95:
        return 'R5'
    elif inval > -2.88:
        return 'X4'
    elif inval < -4.92:
        return 'R5-P'
    elif inval >= -4.92:
        return 'X4-P'
    
    return np.nan

In [56]:
lanl_data = pd.read_csv('LANLResults.tsv', sep='\t')
def safe_float(num):
    
    try:
        return float(num)
    except ValueError:
        return np.nan
    
def convert_group_series(inser, cluster_only=False):
    
    groups = inser.dropna().unique()
    c_col = inser.name+'-cluster'
    if cluster_only:
        odf = pd.DataFrame(np.nan, 
                           columns=[c_col],
                           index=inser.index)
        for cnum, ocol in enumerate(groups,1):
            odf[c_col][inser==ocol] = cnum
        
    else:
        new_labels = [inser.name+'-'+col for col in groups]
        odf = pd.DataFrame(np.nan, 
                           columns=new_labels+[c_col],
                           index=inser.index)
    
        for cnum, (ocol, ncol) in enumerate(zip(groups, new_labels),1):
            odf[ncol] = (inser==ocol).astype(float)
            odf[ncol][inser.isnull()] = np.nan
            odf[c_col][inser==ocol] = cnum
        
    return odf
    
    
    

treated_lanl_dict = {'CD8':lanl_data['CD8 count'],
                     'CD4':lanl_data['CD4 count'],
                     'Log-VL':lanl_data['Viral load'].map(np.log10),
                     'Years-Infected':lanl_data['Days from Infection']/365,
                     'Years-Seropositve':lanl_data['Days from Seroconversion'].map(safe_float)/365,
                     'Accession':lanl_data['Accession'],
                     'Gender':lanl_data['Patient Sex'],
                     'Tissue':lanl_data['Sample Tissue'],
                     'STissue':lanl_data['NewSimpleTissue'],
                     'DrugNaive':lanl_data['Drug Naive'],
                     'GeoRegion':lanl_data['Georegion'],
                     
                }

treated_lanl = pd.DataFrame(treated_lanl_dict)
treated_lanl = pd.concat([treated_lanl, 
                          convert_group_series(treated_lanl['Gender']),
                          convert_group_series(treated_lanl['Tissue'], cluster_only=True),
                          convert_group_series(treated_lanl['STissue'], cluster_only=True),
                          convert_group_series(treated_lanl['DrugNaive']),
                          convert_group_series(treated_lanl['GeoRegion'], cluster_only=True),
                          ], 
                         axis=1)
treated_lanl


Out[56]:
&ltclass 'pandas.core.frame.DataFrame'>
Int64Index: 199770 entries, 0 to 199769
Data columns (total 20 columns):
Accession            199770  non-null values
CD4                  29856  non-null values
CD8                  3489  non-null values
DrugNaive            97267  non-null values
Gender               75088  non-null values
GeoRegion            189856  non-null values
Log-VL               44978  non-null values
STissue              134220  non-null values
Tissue               134898  non-null values
Years-Infected       4467  non-null values
Years-Seropositve    30263  non-null values
Gender-M             75088  non-null values
Gender-F             75088  non-null values
Gender-cluster       75088  non-null values
Tissue-cluster       134898  non-null values
STissue-cluster      134220  non-null values
DrugNaive-no         97267  non-null values
DrugNaive-yes        97267  non-null values
DrugNaive-cluster    97267  non-null values
GeoRegion-cluster    189856  non-null values
dtypes: float64(14), object(6)

In [57]:
def safe_mean(inser):
    return inser.mean()

def most_common(inser):
    try:
        return inser.value_counts().index[0]
    except IndexError:
        return np.nan

V3_tropism = pd.read_csv('LANLTropism.tsv', sep='\t')
V3_tropism = V3_tropism[V3_tropism['percentile']<0.95]
trop_data = V3_tropism.groupby('name')['score'].mean()
pssm_bins = [-15.0, -13.0, -11.0, -9.0, -6.96, -4.92, -2.88, 1]
trop_data = pd.DataFrame({
                           'PSSMScore':trop_data,
                           'PSSMTrop':trop_data.map(decide_tropism),
                           'PSSMSubs':pd.Series(np.digitize(trop_data.values, pssm_bins), 
                                                index=trop_data.index),
                           })
ntrop_data = pd.concat([trop_data, convert_group_series(trop_data['PSSMTrop'])], axis=1)
print ntrop_data.head()


          PSSMScore  PSSMSubs PSSMTrop  PSSMTrop-R5  PSSMTrop-R5-P  \
name                                                                 
AB001137 -10.016667         3       R5            1              0   
AB001138 -10.016667         3       R5            1              0   
AB001139  -8.840000         4       R5            1              0   
AB001140  -6.263333         5     R5-P            0              1   
AB001141  -6.263333         5     R5-P            0              1   

          PSSMTrop-X4-P  PSSMTrop-X4  PSSMTrop-cluster  
name                                                    
AB001137              0            0                 1  
AB001138              0            0                 1  
AB001139              0            0                 1  
AB001140              0            0                 2  
AB001141              0            0                 2  

In [58]:
nlanl_data = pd.merge(treated_lanl, ntrop_data,
                      left_on='Accession',
                      right_index=True, 
                      how='outer').groupby('Accession').first()
nlanl_data


Out[58]:
&ltclass 'pandas.core.frame.DataFrame'>
Index: 223154 entries, A04321 to Z95460
Data columns (total 27 columns):
CD4                  29856  non-null values
CD8                  3489  non-null values
DrugNaive            97267  non-null values
Gender               75088  non-null values
GeoRegion            189856  non-null values
Log-VL               44978  non-null values
STissue              134220  non-null values
Tissue               134898  non-null values
Years-Infected       4467  non-null values
Years-Seropositve    30263  non-null values
Gender-M             75088  non-null values
Gender-F             75088  non-null values
Gender-cluster       75088  non-null values
Tissue-cluster       134898  non-null values
STissue-cluster      134220  non-null values
DrugNaive-no         97267  non-null values
DrugNaive-yes        97267  non-null values
DrugNaive-cluster    97267  non-null values
GeoRegion-cluster    189856  non-null values
PSSMScore            83916  non-null values
PSSMSubs             83916  non-null values
PSSMTrop             83916  non-null values
PSSMTrop-R5          83916  non-null values
PSSMTrop-R5-P        83916  non-null values
PSSMTrop-X4-P        83916  non-null values
PSSMTrop-X4          83916  non-null values
PSSMTrop-cluster     83916  non-null values
dtypes: float64(21), object(6)

In [7]:
tmp_data = pd.read_csv('cluster_dump.tsv', sep=',')

In [8]:
pivot_data = pd.pivot_table(tmp_data, rows='Accession', 
                            cols=['Prot','Start'], 
                            values='Cluster')

In [10]:
from sklearn.metrics import adjusted_rand_score
from scipy.stats import norm
from scipy.stats import mode

ns_score = adjusted_rand_score
pvals = []
nreps = 250
nclusters = []
prots = ['LTR', 'gp120', 'Int', 'gp41']

for prot in prots:
    for col_start in pivot_data[prot].columns:
        if (col_start % 50 == 0):
            print prot, col_start
    
        col, ps_sub = nlanl_data['PSSMSubs'].dropna().align(pivot_data[prot][col_start].dropna(), join='inner')
        ts = ns_score(col.values, ps_sub.values)
        runs = [ns_score(col.values, np.random.permutation(ps_sub.values)) for _ in range(nreps)]
        args = norm.fit(runs)
        fit_model = norm(*args)
        pval = fit_model.sf(ts)
        pvals.append({
                      'Start':col_start,
                      'Prot':prot,
                      'TrueScore':ts,
                      'MeanScore':args[0],
                      'Pval':pval
                      })
        
        tmp = pd.DataFrame({'V3':col, 'other':ps_sub})
        out_vals = tmp.groupby('other')['V3'].transform(lambda x: mode(x)[0])
        nclusters += [(prot, col_start+17, acc, ncl) for acc, ncl in  out_vals.to_dict().items()]


LTR 0
LTR 50
LTR 100
LTR 150
LTR 200
LTR 250
LTR 300
LTR 350
LTR 400
LTR 450
LTR 500
LTR 550
gp120 0
gp120 50
gp120 100
gp120 150
gp120 200
gp120 250
gp120 300
gp120 350
gp120 400
Int 0
Int 50
Int 100
Int 150
Int 200
gp41 0
gp41 50
gp41 100
gp41 150
gp41 200
gp41 250

In [11]:
from statsmodels.stats.multitest import multipletests

pval_df = pd.DataFrame(pvals)



_, bh_pvals, _, _ = multipletests(pval_df['Pval'], alpha = 0.01, method = 'fdr_bh')
pval_df['BHPval'] = bh_pvals

In [12]:
mask = (pval_df['Prot']=='gp120') & (pval_df['Start']>325)& (pval_df['Start']<375)
pval_df[mask]['BHPval'].head(n=50).map(np.log10)


Out[12]:
921   -196.034027
922   -108.898466
923   -113.798820
924    -79.058561
925   -240.529846
926   -247.137454
927   -289.478251
928   -296.348724
929   -247.353157
930   -306.064058
931   -263.719587
932          -inf
933          -inf
934   -202.367140
935   -235.510340
936   -250.156140
937   -249.401348
938   -181.962664
939          -inf
940   -294.279317
941   -237.842913
942   -164.047538
943   -163.345702
944   -167.673721
945   -185.683847
946   -221.338858
947   -235.939103
948   -256.304430
949   -304.964248
950    -92.815505
951    -56.241550
952    -97.451519
953    -77.084299
954    -36.922108
955    -57.140097
956    -28.958409
957    -25.539462
958    -19.937658
959    -18.947138
960    -15.508214
961    -19.714102
962    -31.236106
963    -33.935118
964    -35.242547
965    -36.074070
966    -32.479590
967    -31.050153
968    -32.482380
969    -45.834508
Name: BHPval, dtype: float64

In [33]:
wprots = set(['LTR', 'gp120', 'Int', 'gp41'])
sig_dict = {}
for prot, rows in pval_df.groupby('Prot'):
    if prot in wprots:
        tot = pd.Series(np.nan, index=range(rows['Start'].max()+17))
        lpvals = -np.log10(rows['BHPval']).replace({-np.inf:-500})
        lpvals.index = rows['Start']+17
        smoothed_p = pd.rolling_mean(lpvals, 5)
        sig_dict[prot], _ = smoothed_p.align(tot)
        plt.figure(figsize=(8,4))
        plt.plot(smoothed_p.index, smoothed_p)
        plt.ylabel('-log10(pval)')
        plt.xlabel('ConB Position')
        plt.title(prot)
        plt.savefig('final_figures/%s_clustering.png' % prot, dpi=1000)



In [34]:
cnames = ['Prot', 'FStart', 'Accession', 'NewCluster']
new_clust_df = pd.pivot_table(pd.DataFrame(nclusters, columns = cnames),
                              rows='Accession',
                              cols=['Prot', 'FStart'],
                              values='NewCluster',
                              aggfunc='first')

In [95]:
from PlottingTools import make_heatmap
from matplotlib import colors
from sklearn.cluster import KMeans

cmap = colors.ListedColormap(['white', 'red', 'blue', 'green', 'cyan', 'purple', 'magenta'])

for prot, sig_vals in sig_dict.items():

    tdf, tmask = new_clust_df[prot].dropna(axis=0).align(sig_vals, axis=1, join='outer')
    tdf, scores = tdf.align(nlanl_data['PSSMScore'].dropna(), axis=0, join='inner')
    tvals = tdf.values

    order = np.argsort(scores.values)
    tvals[:, tmask.values<1.5] = np.nan
    tvals = tvals[order,:]
    
    xpos = np.array(tdf.columns)
    if prot == 'gp120':
        xpos += 30
        pass
    
    xx, yy = np.meshgrid(xpos, range(tvals.shape[0]))

    #print xx
    fig, (ax1, ax2) = plt.subplots(2,1, figsize=(10,5), sharex=True)

    #plt.figure(figsize=(10,5))
    ax1.set_title(prot)
    ax1.plot(xpos, sig_vals)
    ax1.set_ylabel('-log10(p-value)')
    ax1.set_xlim([0, max(xpos)+17])

    #plt.figure()
    ax2.pcolormesh(xx,yy, tvals, vmin=0, vmax=8, cmap=cmap)
    #ax2.imshow(tvals+1, aspect='auto', cmap=cmap)
    ax2.set_yticks([])
    ax2.set_ylabel('ordered by PSSM with R5 at top')
    ax1.set_xlim([0, max(xpos)+17])
    plt.savefig('final_figures/%s_scanning_group.png' % prot, dpi=1000)



In [67]:
10**(-sig_dict['Int'].max())


Out[67]:
7.910398333408357e-186

In [65]:
best_pos = sig_dict['Int'].idxmax()
print best_pos
int_clusts = pivot_data['Int'][best_pos-17]
wanted_lanl, int_pos_clusts = nlanl_data.align(int_clusts.dropna(), axis=0, join='inner')
wanted_lanl['IntClusts'] = int_pos_clusts

med_pssm = wanted_lanl.groupby('IntClusts')['PSSMScore'].median()
ranks = med_pssm.rank(method='first')
rep_dict = ranks.to_dict()
wanted_lanl['IntClusts'] = wanted_lanl['IntClusts'].replace(rep_dict)


keys = [('Log-VL', 'boxplot'),
        ('CD4', 'boxplot'),
        ('Years-Seropositve', 'boxplot'),
        ('CD8', 'boxplot'),
        ('PSSMScore', 'boxplot'),
        ('Count', 'count'),]

fig, axs = plt.subplots(3,2, figsize=(10,10), sharex=True)

for ax, (col,typ) in zip(axs.flatten(), keys):
    
    if col == 'Count':
        boxes = [len(vals) for _, vals in wanted_lanl.groupby('IntClusts')]
    else:
        boxes = [vals.dropna() for _, vals in wanted_lanl.groupby('IntClusts')[col]]
        nums = [len(vals.dropna()) for _, vals in wanted_lanl.groupby('IntClusts')[col]]
        print col, nums
    if typ == 'boxplot':
        ax.boxplot(boxes)
    elif typ == 'frac':
        ax.bar(np.arange(0.5,len(boxes)), [b.mean() for b in boxes])
        ax.set_ylim([0, 1])
    elif typ == 'count':
        ax.bar(np.arange(0.5,len(boxes)), boxes)
        
        
    ax.set_title('IntClusts-' + col)
    #if ax.is_last_row():
    #    num_clust = int(wanted_lanl['IntClusts'].max())
    #    ax.set_xticklabels(['CL-%i' % i for i in range(1, len(boxes)+1)])


109
Log-VL [54, 12, 83, 2, 32, 35, 86, 30, 15, 17, 37, 7, 44, 11, 14, 9, 80, 4, 0, 6, 19, 123, 67, 57, 6, 2, 26, 99, 0]
CD4 [5, 6, 1, 2, 17, 17, 2, 9, 5, 7, 5, 3, 31, 8, 11, 0, 22, 2, 0, 3, 8, 24, 30, 13, 5, 2, 2, 32, 0]
Years-Seropositve [76, 10, 133, 0, 28, 16, 93, 18, 15, 11, 36, 6, 12, 7, 7, 10, 57, 28, 0, 4, 10, 112, 36, 39, 2, 0, 26, 81, 0]
CD8 [0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 1, 1, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0]
PSSMScore [88, 23, 150, 19, 34, 52, 96, 49, 40, 42, 50, 20, 101, 30, 35, 18, 141, 32, 15, 15, 47, 212, 126, 74, 17, 9, 45, 146, 8]

In [ ]: