In [1]:
import sys
sys.path.append('/home/will/PySeqUtils/')
import os, os.path
os.chdir('/home/will/HIVTropism/R5Cluster/')
from itertools import islice
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from GeneralSeqTools import fasta_reader, call_muscle

In [3]:
%load_ext autoreload
%autoreload 2
import SeqSklearn

Pulling out LANL Data


In [4]:
def decide_tropism(inval):
    # PSSM score cutpoints: below -6.95 is R5, above -2.88 is X4,
    # and the region in between is split into R5-P/X4-P at -4.92
    if inval < -6.95:
        return 'R5'
    elif inval > -2.88:
        return 'X4'
    elif inval < -4.92:
        return 'R5-P'
    elif inval >= -4.92:
        return 'X4-P'
    # only reachable for NaN inputs, where every comparison is False
    return np.nan
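
As a quick sanity check of those cutpoints (the scores here are made up for illustration):

for s in [-8.0, -6.0, -4.0, -1.0]:
    print s, decide_tropism(s)
# -8.0 R5
# -6.0 R5-P
# -4.0 X4-P
# -1.0 X4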

In [5]:
lanl_data = pd.read_csv('LANLResults.tsv', sep='\t')

In [6]:
from sklearn.preprocessing import label_binarize
def safe_float(num):
    # 'Days from Seroconversion' contains non-numeric entries
    try:
        return float(num)
    except ValueError:
        return np.nan
    
def convert_group_series(inser, cluster_only=False):
    """Expand a categorical Series into 0/1 indicator columns plus a
    numeric '<name>-cluster' column (group numbers start at 1)."""
    groups = inser.dropna().unique()
    c_col = inser.name+'-cluster'
    if cluster_only:
        odf = pd.DataFrame(np.nan,
                           columns=[c_col],
                           index=inser.index)
        for cnum, ocol in enumerate(groups, 1):
            odf[c_col][inser == ocol] = cnum
    else:
        new_labels = [inser.name+'-'+col for col in groups]
        odf = pd.DataFrame(np.nan,
                           columns=new_labels+[c_col],
                           index=inser.index)
        for cnum, (ocol, ncol) in enumerate(zip(groups, new_labels), 1):
            odf[ncol] = (inser == ocol).astype(float)
            odf[ncol][inser.isnull()] = np.nan
            odf[c_col][inser == ocol] = cnum
    return odf

treated_lanl_dict = {'Log-CD8':lanl_data['CD8 count'].map(np.log10),
                     'Log-CD4':lanl_data['CD4 count'].map(np.log10),
                     'Log-VL':lanl_data['Viral load'].map(np.log10),
                     'Years-Infected':lanl_data['Days from Infection']/365,
                     'Years-Seropositive':lanl_data['Days from Seroconversion'].map(safe_float)/365,
                     'Accession':lanl_data['Accession'],
                     'Gender':lanl_data['Patient Sex'],
                     'Tissue':lanl_data['Sample Tissue'],
                     'STissue':lanl_data['NewSimpleTissue'],
                     'DrugNaive':lanl_data['Drug Naive'],
                     'GeoRegion':lanl_data['Georegion'],
                     }

treated_lanl = pd.DataFrame(treated_lanl_dict)
treated_lanl = pd.concat([treated_lanl, 
                          convert_group_series(treated_lanl['Gender']),
                          convert_group_series(treated_lanl['Tissue'], cluster_only=True),
                          convert_group_series(treated_lanl['STissue'], cluster_only=True),
                          convert_group_series(treated_lanl['DrugNaive']),
                          convert_group_series(treated_lanl['GeoRegion'], cluster_only=True),
                          ], 
                         axis=1)
treated_lanl


Out[6]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 199770 entries, 0 to 199769
Data columns (total 20 columns):
Accession            199770  non-null values
DrugNaive            97267  non-null values
Gender               75088  non-null values
GeoRegion            189856  non-null values
Log-CD4              29856  non-null values
Log-CD8              3489  non-null values
Log-VL               44978  non-null values
STissue              134220  non-null values
Tissue               134898  non-null values
Years-Infected       4467  non-null values
Years-Seropositive   30263  non-null values
Gender-M             75088  non-null values
Gender-F             75088  non-null values
Gender-cluster       75088  non-null values
Tissue-cluster       134898  non-null values
STissue-cluster      134220  non-null values
DrugNaive-no         97267  non-null values
DrugNaive-yes        97267  non-null values
DrugNaive-cluster    97267  non-null values
GeoRegion-cluster    189856  non-null values
dtypes: float64(14), object(6)
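
The shape of convert_group_series is easier to see on a toy Series (made-up input, not the LANL table):

toy = pd.Series(['M', 'F', None, 'M'], name='Gender')
print convert_group_series(toy)
# Gender-M / Gender-F hold 0/1 indicators (NaN where the input is null)
# and Gender-cluster numbers the groups 1, 2, ... in order of appearance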

In [7]:
def safe_mean(inser):
    return inser.mean()

def most_common(inser):
    # mode of a Series; value_counts() on an empty Series raises IndexError
    try:
        return inser.value_counts().index[0]
    except IndexError:
        return np.nan

V3_tropism = pd.read_csv('LANLTropism.tsv', sep='\t')
V3_tropism = V3_tropism[V3_tropism['percentile']<0.95]
trop_data = V3_tropism.groupby('name')['score'].mean()
pssm_bins = [-15.0, -13.0, -11.0, -9.0, -6.96, -4.92, -2.88, 1]
trop_data = pd.DataFrame({
                           'PSSMScore':trop_data,
                           'PSSMTrop':trop_data.map(decide_tropism),
                           'PSSMSubs':pd.Series(np.digitize(trop_data.values, pssm_bins), 
                                                index=trop_data.index),
                           })
ntrop_data = pd.concat([trop_data, convert_group_series(trop_data['PSSMTrop'])], axis=1)
print ntrop_data.head()


          PSSMScore  PSSMSubs PSSMTrop  PSSMTrop-R5  PSSMTrop-R5-P  \
name                                                                 
AB001137 -10.016667         3       R5            1              0   
AB001138 -10.016667         3       R5            1              0   
AB001139  -8.840000         4       R5            1              0   
AB001140  -6.263333         5     R5-P            0              1   
AB001141  -6.263333         5     R5-P            0              1   

          PSSMTrop-X4-P  PSSMTrop-X4  PSSMTrop-cluster  
name                                                    
AB001137              0            0                 1  
AB001138              0            0                 1  
AB001139              0            0                 1  
AB001140              0            0                 2  
AB001141              0            0                 2  
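
The PSSMSubs column comes from np.digitize, which returns the index of the bin each score lands in (bins[i-1] <= score < bins[i]), so PSSMSubs == 3 means -11 <= score < -9, matching the head() above:

print np.digitize([-10.0, -8.8, -6.3], pssm_bins)
# [3 4 5]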

In [8]:
nlanl_data = pd.merge(treated_lanl, ntrop_data,
                      left_on='Accession',
                      right_index=True, 
                      how='outer').groupby('Accession').first()
nlanl_data


Out[8]:
<class 'pandas.core.frame.DataFrame'>
Index: 223154 entries, A04321 to Z95460
Data columns (total 27 columns):
DrugNaive            97267  non-null values
Gender               75088  non-null values
GeoRegion            189856  non-null values
Log-CD4              29856  non-null values
Log-CD8              3489  non-null values
Log-VL               44978  non-null values
STissue              134220  non-null values
Tissue               134898  non-null values
Years-Infected       4467  non-null values
Years-Seropositive   30263  non-null values
Gender-M             75088  non-null values
Gender-F             75088  non-null values
Gender-cluster       75088  non-null values
Tissue-cluster       134898  non-null values
STissue-cluster      134220  non-null values
DrugNaive-no         97267  non-null values
DrugNaive-yes        97267  non-null values
DrugNaive-cluster    97267  non-null values
GeoRegion-cluster    189856  non-null values
PSSMScore            83916  non-null values
PSSMSubs             83916  non-null values
PSSMTrop             83916  non-null values
PSSMTrop-R5          83916  non-null values
PSSMTrop-R5-P        83916  non-null values
PSSMTrop-X4-P        83916  non-null values
PSSMTrop-X4          83916  non-null values
PSSMTrop-cluster     83916  non-null values
dtypes: float64(21), object(6)

Bring in Seqs


In [9]:
import glob
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
from GeneralSeqTools import fasta_writer, seq_align_to_ref, WebPSSM_V3_fasta
from concurrent.futures import ThreadPoolExecutor
import csv
from itertools import imap
import pickle



def trans_seq(handle, wanted_seqs, trans=True):
    # stream (name, seq) pairs, keeping only the accessions we care about
    for name, seq in fasta_reader(handle):
        if name not in wanted_seqs:
            continue
        # drop gap/alignment characters before translating
        tseq = ''.join(l for l in seq if l.isalpha())
        if trans:
            rseq = Seq(tseq, generic_dna).translate()
            yield name, ''.join(l for l in rseq.tostring() if l.isalpha())
        else:
            yield name, tseq

with open('/home/will/HIVReportGen/Data/BlastDB/ConBseqs.txt') as handle:
    ref_seqs = dict()
    for line in handle:
        parts = line.strip().split('\t')
        ref_seqs[parts[0]] = parts[1]
            

files = glob.glob('/home/will/HIVTropism/LANLdata/SubB-*.fasta')
oseqs = []
if os.path.exists('aligned_seq.pkl'):
    with open('aligned_seq.pkl') as handle:
        oseqs = pickle.load(handle)
else:
    for f in files:
        fname = f.rsplit(os.sep,1)[-1].split('.')[0]
        parts = fname.split('-')
        prot_name = parts[1].replace('_','-')
        if prot_name == 'V3':
            continue
        with open(f) as handle:
            prot_seqs = list(trans_seq(handle, set(trop_data.index), trans=prot_name != 'LTR'))
            print prot_name, len(prot_seqs)
            aligned_seqs = seq_align_to_ref(prot_seqs, ref_seqs[prot_name], max_workers=20)
            for name, seq in aligned_seqs:
                oseqs.append({
                              'Accession':name,
                              'Seq':seq,
                              'Prot':prot_name
                              })
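
For reference, trans_seq above streams only the wanted accessions, strips gap characters, and translates unless trans=False. A toy run (illustrative input; assumes fasta_reader accepts any file-like handle):

from StringIO import StringIO
toy = StringIO('>A1\nATGGCC\n>A2\nATGTTT\n')
print list(trans_seq(toy, set(['A1'])))
# [('A1', 'MA')]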

In [10]:
# reshape to one row per Accession, one column per protein region
# ('first' collapses any duplicate Accession/Prot pairs)
aligned_seqs = pd.pivot_table(pd.DataFrame(oseqs),
                              rows='Accession',
                              cols='Prot',
                              values='Seq',
                              aggfunc='first')

Scanning Co-Linear


In [11]:
def var_func(X, y):
    # score features by raw variance; SelectKBest ignores the p-values,
    # so return a placeholder of ones
    scores = np.squeeze(np.asarray(np.var(X, axis=0)))
    pvals = np.ones_like(scores)
    return scores, pvals
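
SelectKBest only looks at the scores, so returning flat p-values turns it into an unsupervised variance filter; on a toy matrix (illustrative):

from sklearn.feature_selection import SelectKBest
X = np.array([[0., 1., 5.],
              [0., 2., 0.],
              [0., 3., 5.]])
print SelectKBest(var_func, k=2).fit_transform(X, np.ones(3))
# keeps the two highest-variance columns (drops the constant first column)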

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectPercentile, SelectKBest



param_grid = {'n_clusters':range(2,30)}

region_transform = Pipeline(steps = [('SeqTransformed', SeqSklearn.BioTransformer(typ='aa')),
                                     ('VarFilter', SelectKBest(var_func, k=20))])

region_predictor = GridSearchCV(KMeans(), 
                                param_grid, n_jobs=5, pre_dispatch=10)

region_scorer = KMeans()

In [13]:
from operator import itemgetter
import warnings
import csv

#handle = open('cluster_dump.tsv', 'w')  # left commented out, so 'handle' is still bound to a closed file
writer = csv.DictWriter(handle, ['Prot', 'Start', 'Accession', 'Cluster'])
writer.writeheader()  # raises the ValueError shown below because the handle is closed
scan_width = 35
out_data = []
for col in ['Int', 'LTR', 'Nef', 'Tat-2', 'Tat-1', 'PR', 'RT', 'Vif', 'Vpr', 'gp120', 'gp41']:
    break  # scan disabled here; the loop body below is kept for reference
    seq_df, known_subs = aligned_seqs[col].dropna().align(nlanl_data['PSSMSubs'].dropna(), join='inner')
    seq_list_seqs = [list(l) for l in seq_df.values]
    seq_len = len(seq_list_seqs[0])
    for start in range(0, seq_len-scan_width):
        # crude progress reporting near the start and end of each protein
        if start < 20:
            print col, start
        elif start > (seq_len-50):
            print col, start-seq_len
        # pull a scan_width-wide column window out of every aligned sequence
        getter = itemgetter(*range(start, start+scan_width))
        seq_region = np.array(map(getter, seq_list_seqs))
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            region_transform.fit(seq_region, np.ones((seq_region.shape[0], 1)))
            seq_data = region_transform.transform(seq_region)
        region_predictor.fit(seq_data)
        clusters = region_predictor.predict(seq_data)
        for c, acc in zip(clusters, seq_df.index):
            row = {'Prot': col,
                   'Start': start,
                   'Accession': acc,
                   'Cluster': c}
            out_data.append(row)
            writer.writerow(row)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-13-a05f09bb5b69> in <module>()
      5 #handle = open('cluster_dump.tsv', 'w')
      6 writer = csv.DictWriter(handle, ['Prot', 'Start', 'Accession', 'Cluster'])
----> 7 writer.writeheader()
      8 scan_width = 35
      9 out_data = []

/usr/lib/python2.7/csv.pyc in writeheader(self)
    135     def writeheader(self):
    136         header = dict(zip(self.fieldnames, self.fieldnames))
--> 137         self.writerow(header)
    138 
    139     def _dict_to_list(self, rowdict):

/usr/lib/python2.7/csv.pyc in writerow(self, rowdict)
    146 
    147     def writerow(self, rowdict):
--> 148         return self.writer.writerow(self._dict_to_list(rowdict))
    149 
    150     def writerows(self, rowdicts):

ValueError: I/O operation on closed file
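
The windowing trick in the loop above, itemgetter(*range(start, start+scan_width)), pulls a fixed-width slice out of every aligned sequence at once. A small illustrative example:

from operator import itemgetter
getter = itemgetter(*range(2, 5))
print getter(list('ACDEFGH'))
# ('D', 'E', 'F')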

In [ ]:
handle.close()

In [ ]:
tmp = [len(aligned_seqs[col].dropna().values[0]) for col in aligned_seqs.columns]  # aligned length of each protein region

In [ ]:
aligned_seqs.columns

In [ ]:
#from sklearn.cross_validation import permutation_test_score, Bootstrap
##
#
#tregion = region_scorer.set_params(n_clusters=4)
#ts, scores, pval = permutation_test_score(tregion, seq_data, y=known_subs.values,
#                                          scoring = SeqSklearn.normalized_mutual_info_score_linker,                       
#                                          n_permutations=100,
#                                          cv=Bootstrap(seq_region.shape[0], train_size=0.7, n_iter=1))
#print ts, pval

In [14]:
seq_df, known_subs = aligned_seqs['Int'].dropna().align(nlanl_data['PSSMScore'].dropna(), join='inner')

In [15]:
from SeqSklearn import BinBasedCluster

clust = BinBasedCluster(bins=pssm_bins)
# 34-column window centered on Int position 109
getter = itemgetter(*range(109-17, 109+17))
seq_list_seqs = [list(l) for l in seq_df.values]
seq_region = np.array(map(getter, seq_list_seqs))
region_transform.fit(seq_region, np.ones((seq_region.shape[0], 1)))
seq_data = region_transform.transform(seq_region)

pca_trans, biny, xx, yy, Z = clust.make_vern_points(seq_data, known_subs.values)


(1734, 20) (1734, 10)
/usr/local/lib/python2.7/dist-packages/sklearn/feature_selection/univariate_selection.py:319: UserWarning: Duplicate scores. Result may depend on feature ordering.There are probably duplicate features, or you used a classification score for a regression task.
  warn("Duplicate scores. Result may depend on feature ordering."

In [16]:
from pylab import get_cmap
plt.figure(figsize=(10,10))
jitter = 0.01*np.random.randn(*pca_trans.shape)+pca_trans
plt.scatter(jitter[:,0], jitter[:,1], vmax = 0, c=known_subs, cmap=get_cmap('copper_r'), alpha=0.5)
cbar = plt.colorbar()
cbar.set_label('PSSMScore')
plt.xlabel('PC-1')
plt.ylabel('PC-2')


Out[16]:
<matplotlib.text.Text at 0x85351d0>

In [20]:
from sklearn.cross_validation import permutation_test_score, Bootstrap
from SeqSklearn import silhouette_score_linker

def check_num_clusts(X_data, n_clusts=range(2, 30), n_iter=10):
    # bootstrap-resample the rows and grid-search KMeans over cluster
    # counts, scoring each fit by silhouette
    cv = Bootstrap(X_data.shape[0], n_iter=n_iter, train_size=0.7)

    pred = GridSearchCV(KMeans(), {'n_clusters':n_clusts},
                        cv = cv,
                        scoring=silhouette_score_linker,
                        verbose=1,
                        refit=True)
    pred.fit(X_data)
    # flatten grid_scores_ into one (n_clusters, score) row per CV split
    res = []
    for d in pred.grid_scores_:
        for r in d.cv_validation_scores:
            res.append((d.parameters['n_clusters'], r))
    return pred, pd.DataFrame(res, columns=['n_clusters','score'])


# NOTE: this rebinds trop_data; the PSSM frame of that name is not used again
_, trop_data = check_num_clusts(seq_data,
                                n_iter=5,
                                n_clusts=range(2, 60))
t = trop_data.groupby('n_clusters')['score'].mean()
e = trop_data.groupby('n_clusters')['score'].std()
plt.errorbar(t.index, t.values, yerr=e.values)
#plt.title('Clustering of North American V3 sequences')
plt.xlabel('Cluster Size')
plt.xlim([1.5, 60])
plt.ylabel('Silhouette Score')


[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:   12.2s
[Parallel(n_jobs=1)]: Done 200 jobs       | elapsed:  1.9min
[Parallel(n_jobs=1)]: Done 290 out of 290 | elapsed:  3.6min finished
Fitting 5 folds for each of 58 candidates, totalling 290 fits
Out[20]:
<matplotlib.text.Text at 0x9971c50>

In [21]:
t = trop_data.groupby('n_clusters')['score'].mean()
e = trop_data.groupby('n_clusters')['score'].std()
plt.errorbar(t.index, t.values, yerr=e.values)
#plt.title('Clustering of North American V3 sequences')
plt.xlabel('Cluster Size')
plt.xlim([1.5, 60])
plt.ylabel('Silhouette Score')


Out[21]:
<matplotlib.text.Text at 0x5a1a550>

In [26]:
pd.DataFrame({
              'val':t,
              'err':e
              }).to_csv('make_out.csv')

In [33]:
t = pd.read_csv('make_out.csv', names=['nc', 'err', 'val'], header=0)  # header=0 so the written header row is skipped
plt.errorbar(t['nc'], t['val'].values, yerr=t['err'].values)
plt.xlabel('Cluster Size')
plt.xlim([1.5, 60])
plt.ylim([0,1])
plt.ylabel('Silhouette Score')
plt.savefig('final_figures/tall_int_109_clustering.png', dpi=1000)



In [ ]: