In [1]:
import sys
sys.path.append('/home/will/PySeqUtils/')
import os, os.path
os.chdir('/home/will/HIVTropism/R5Cluster/')
from itertools import islice
import pandas as pd
from GeneralSeqTools import fasta_reader, call_muscle

In [2]:
lanl_data = pd.read_csv('LANLResults.tsv', sep='\t')

In [178]:
lanl_data


Out[178]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 199770 entries, 0 to 199769
Data columns (total 39 columns):
#                            199770  non-null values
SE id(SA)                    199770  non-null values
Accession                    199770  non-null values
GI number                    199770  non-null values
Version                      199770  non-null values
Patient Id                   135652  non-null values
Patient Sex                  75088  non-null values
Risk Factor                  63381  non-null values
Infection Country            7731  non-null values
Infection City               4974  non-null values
Infection Year               21503  non-null values
HLA type                     12440  non-null values
Project                      40812  non-null values
Patient ethnicity            371  non-null values
Progression                  11359  non-null values
Clone Name                   39792  non-null values
Georegion                    189856  non-null values
Sampling Year                199770  non-null values
Sampling Year Upper          199770  non-null values
Patient Age                  26638  non-null values
Patient Health               25802  non-null values
Phenotype                    1298  non-null values
Coreceptor                   3451  non-null values
Sample Tissue                134898  non-null values
NewSimpleTissue              134220  non-null values
Simple Sample Tissue         134895  non-null values
Culture Method               60217  non-null values
Molecule type                199770  non-null values
Drug Naive                   97267  non-null values
Problematic Sequence         199770  non-null values
Viral load                   44978  non-null values
CD4 count                    29856  non-null values
CD8 count                    3489  non-null values
Days from Infection          4467  non-null values
Days from Seroconversion     60207  non-null values
Days from first Sample       40572  non-null values
Fiebig Stage                 6310  non-null values
Days from treatment start    2542  non-null values
Days from treatment end      854  non-null values
dtypes: float64(11), int64(5), object(23)

In [186]:
import numpy as np
from sklearn.preprocessing import label_binarize

def safe_float(num):
    # coerce to float, returning NaN for unparsable values
    try:
        return float(num)
    except ValueError:
        return np.nan
    
def convert_group_series(inser, cluster_only=False):
    # expand a categorical Series into 0/1 indicator columns plus an
    # integer '<name>-cluster' column (1-based); missing values stay NaN
    groups = inser.dropna().unique()
    c_col = inser.name+'-cluster'
    if cluster_only:
        odf = pd.DataFrame(np.nan, 
                           columns=[c_col],
                           index=inser.index)
        for cnum, ocol in enumerate(groups,1):
            odf[c_col][inser==ocol] = cnum
        
    else:
        new_labels = [inser.name+'-'+col for col in groups]
        odf = pd.DataFrame(np.nan, 
                           columns=new_labels+[c_col],
                           index=inser.index)
    
        for cnum, (ocol, ncol) in enumerate(zip(groups, new_labels),1):
            odf[ncol] = (inser==ocol).astype(float)
            odf[ncol][inser.isnull()] = np.nan
            odf[c_col][inser==ocol] = cnum
        
    return odf
    
    
    

treated_lanl_dict = {'CD8':lanl_data['CD8 count'].clip(0, 3000),
                     'CD4':lanl_data['CD4 count'].clip(0, 2000),
                     'Log-VL':lanl_data['Viral load'].map(np.log10),
                     'Years-Infected':lanl_data['Days from Infection']/365,
                     'Years-Seropositve':lanl_data['Days from Seroconversion'].map(safe_float)/365,
                     'Accession':lanl_data['Accession'],
                     'Gender':lanl_data['Patient Sex'],
                     'Tissue':lanl_data['Sample Tissue'],
                     'STissue':lanl_data['NewSimpleTissue'],
                     'DrugNaive':lanl_data['Drug Naive'],
                     'GeoRegion':lanl_data['Georegion'],
                     'PatientID':lanl_data['Patient Id']
                     
                }

treated_lanl = pd.DataFrame(treated_lanl_dict)
treated_lanl = pd.concat([treated_lanl, 
                          convert_group_series(treated_lanl['Gender']),
                          convert_group_series(treated_lanl['Tissue'], cluster_only=True),
                          convert_group_series(treated_lanl['STissue'], cluster_only=True),
                          convert_group_series(treated_lanl['DrugNaive']),
                          convert_group_series(treated_lanl['GeoRegion'], cluster_only=True),
                          ], 
                         axis=1)
treated_lanl


Out[186]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 199770 entries, 0 to 199769
Data columns (total 21 columns):
Accession            199770  non-null values
CD4                  29856  non-null values
CD8                  3489  non-null values
DrugNaive            97267  non-null values
Gender               75088  non-null values
GeoRegion            189856  non-null values
Log-VL               44978  non-null values
PatientID            135652  non-null values
STissue              134220  non-null values
Tissue               134898  non-null values
Years-Infected       4467  non-null values
Years-Seropositve    30263  non-null values
Gender-M             75088  non-null values
Gender-F             75088  non-null values
Gender-cluster       75088  non-null values
Tissue-cluster       134898  non-null values
STissue-cluster      134220  non-null values
DrugNaive-no         97267  non-null values
DrugNaive-yes        97267  non-null values
DrugNaive-cluster    97267  non-null values
GeoRegion-cluster    189856  non-null values
dtypes: float64(15), object(6)

In [4]:
def decide_tropism(inval):
    # PSSM score cutoffs: R5 below -6.95, X4 above -2.88,
    # R5-P and X4-P in between split at -4.92; NaN input falls through
    if inval < -6.95:
        return 'R5'
    elif inval > -2.88:
        return 'X4'
    elif inval < -4.92:
        return 'R5-P'
    elif inval >= -4.92:
        return 'X4-P'
    
    return np.nan

In [38]:
def safe_mean(inser):
    return inser.mean()

def most_common(inser):
    try:
        return inser.value_counts().index[0]
    except IndexError:
        return np.nan

V3_tropism = pd.read_csv('LANLTropism.tsv', sep='\t')
V3_tropism = V3_tropism[V3_tropism['percentile']<0.95]
trop_data = V3_tropism.groupby('name')['score'].mean()
pssm_bins = [-13.0, -11.0, -9.0, -6.96, -4.92, -2.88]

trop_data = pd.DataFrame({
                           'PSSMScore':trop_data,
                           'PSSMTrop':trop_data.map(decide_tropism),
                           'PSSMSubs':pd.Series(np.digitize(trop_data.values, pssm_bins), 
                                                index=trop_data.index),
                           })
ntrop_data = pd.concat([trop_data, convert_group_series(trop_data['PSSMTrop'])], axis=1)
print ntrop_data.head()


          PSSMScore  PSSMSubs PSSMTrop  PSSMTrop-R5  PSSMTrop-R5-P  \
name                                                                 
AB001137 -10.016667         2       R5            1              0   
AB001138 -10.016667         2       R5            1              0   
AB001139  -8.840000         3       R5            1              0   
AB001140  -6.263333         4     R5-P            0              1   
AB001141  -6.263333         4     R5-P            0              1   

          PSSMTrop-X4-P  PSSMTrop-X4  PSSMTrop-cluster  
name                                                    
AB001137              0            0                 1  
AB001138              0            0                 1  
AB001139              0            0                 1  
AB001140              0            0                 2  
AB001141              0            0                 2  

In [39]:
nlanl_data = pd.merge(treated_lanl, ntrop_data,
                      left_on='Accession',
                      right_index=True, 
                      how='outer')
nlanl_data


Out[39]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 223154 entries, 0 to 199769
Data columns (total 28 columns):
Accession            223154  non-null values
CD4                  29856  non-null values
CD8                  3489  non-null values
DrugNaive            97267  non-null values
Gender               75088  non-null values
GeoRegion            189856  non-null values
Log-VL               44978  non-null values
STissue              134220  non-null values
Tissue               134898  non-null values
Years-Infected       4467  non-null values
Years-Seropositve    30263  non-null values
Gender-M             75088  non-null values
Gender-F             75088  non-null values
Gender-cluster       75088  non-null values
Tissue-cluster       134898  non-null values
STissue-cluster      134220  non-null values
DrugNaive-no         97267  non-null values
DrugNaive-yes        97267  non-null values
DrugNaive-cluster    97267  non-null values
GeoRegion-cluster    189856  non-null values
PSSMScore            83916  non-null values
PSSMSubs             83916  non-null values
PSSMTrop             83916  non-null values
PSSMTrop-R5          83916  non-null values
PSSMTrop-R5-P        83916  non-null values
PSSMTrop-X4-P        83916  non-null values
PSSMTrop-X4          83916  non-null values
PSSMTrop-cluster     83916  non-null values
dtypes: float64(21), object(7)

Just as a sanity check: Log-VL should be higher in X4 sequences than in R5 sequences.


In [7]:
nlanl_data[['Log-VL', 'PSSMTrop']].dropna().boxplot(column=['Log-VL'], 
                                                            by='PSSMTrop')


Out[7]:
<matplotlib.axes.AxesSubplot at 0x50a6c10>
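
The boxplot gives the visual check; a rank-sum test makes it quantitative. The cell below is a sketch that is not part of the original run: it assumes the nlanl_data frame built above and uses scipy's mannwhitneyu to compare the R5 and X4 groups.

In [ ]:
from scipy.stats import mannwhitneyu

# compare Log-VL between PSSM-called R5 and X4 sequences (illustrative sketch)
vl_trop = nlanl_data[['Log-VL', 'PSSMTrop']].dropna()
r5_vl = vl_trop['Log-VL'][vl_trop['PSSMTrop'] == 'R5']
x4_vl = vl_trop['Log-VL'][vl_trop['PSSMTrop'] == 'X4']
u_stat, pval = mannwhitneyu(r5_vl, x4_vl)
print r5_vl.median(), x4_vl.median(), pval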

In [8]:
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
from collections import Counter, defaultdict
from GeneralSeqTools import fasta_writer

def filter_seq(handle, trans):
    # keep only full-length (105 nt) ungapped V3 sequences,
    # optionally translating to amino acids
    for name, seq in fasta_reader(handle):
        tseq = ''.join(l for l in seq if l.isalpha())
        l = len(tseq)
        if (l == 105):
            if trans:
                rseq = Seq(tseq, generic_dna).translate()
                yield name, rseq.tostring()
            else:
                yield name, tseq



with open('V3filter.nt.fasta.raln') as handle:
    seq_list = list(fasta_reader(handle))
                

with open('V3filter.aa.fasta.raln') as handle:
    aa_seq_list = list(fasta_reader(handle))

In [9]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from itertools import product
from scipy.sparse import csr_matrix, eye

class BioTransformer(BaseEstimator, TransformerMixin):
    # one-hot encodes an (nseqs, npositions) array of aligned residues,
    # with one indicator column per (position, letter) pair

    def __init__(self, typ='nuc'):
        self.typ = typ
        
    def fit(self, *args):
        return self
    
    def transform(self, X):
        if self.typ == 'nuc':
            letters = 'ACGT-'
        else:
            letters = 'ARNDCEQGHILKMFPSTWYV-'
        
        nrows, ncols = X.shape
        #out = eye(nrows, ncols*len(letters), format='csr')
        data = []
        rows = []
        cols = []
        for row in range(nrows):
            for num, (col,l) in enumerate(product(range(ncols), letters)):
                if X[row, col].upper()==l:
                    data.append(1)
                    rows.append(row)
                    cols.append(num)
                
        return csr_matrix((np.array(data), (np.array(rows), np.array(cols))), 
                          shape=(nrows, ncols*len(letters)), dtype=float).todense()

In [10]:
tmp_seqs = np.array([list(seq) for name, seq in seq_list])
names = [name for name, _ in seq_list]
nuc_df = pd.DataFrame(BioTransformer(typ='nuc').transform(tmp_seqs), index=names)

tmp_aa_seqs = np.array([list(seq) for name, seq in aa_seq_list])
aa_names = [name for name, _ in aa_seq_list]
aa_df = pd.DataFrame(BioTransformer(typ='aa').transform(tmp_aa_seqs), index=aa_names)

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans, MiniBatchKMeans, AffinityPropagation
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cross_validation import Bootstrap


def sil_linker(predictor, X):
    clusters = predictor.predict(X)
    # silhouette_score is undefined when everything lands in one cluster,
    # so force a second label onto the last sample in that degenerate case
    if len(set(clusters)) == 1:
        clusters[-1] += 1
    return silhouette_score(X, clusters)

def rand_linker(predictor, X, y):
    # despite the name, this scores predicted clusters against known labels
    # with normalized mutual information rather than the Rand index
    clusters = predictor.predict(X)
    return normalized_mutual_info_score(y, clusters)


#nuc_dist = euclidean_distances(tmp_bin)
#aa_dist = euclidean_distances(tmp_aa_bin)

Start with a simple analysis using only sequences sampled in North America from PBMC or plasma/serum.


In [50]:
pat_lanl_data = nlanl_data.groupby('Accession').first()

In [40]:
wanted_mask = (pat_lanl_data['GeoRegion'] == 'North America') & \
((pat_lanl_data['STissue'] == 'PBMC') | (pat_lanl_data['STissue'] == 'plasma/serum/blood'))
NA_blood_df, NA_wanted_lanl = aa_df.align(pat_lanl_data[wanted_mask], axis=0, join='inner')

In [14]:
from sklearn.cross_validation import permutation_test_score

def check_num_clusts(X_data, n_clusts=range(2, 30), n_iter=10):
    # grid-search the number of KMeans clusters, scoring each candidate
    # with the silhouette score over bootstrapped splits
    cv = Bootstrap(X_data.shape[0], n_iter=n_iter, train_size=0.7)
    
    pred = GridSearchCV(KMeans(), {'n_clusters':n_clusts}, 
                        n_jobs=-1, pre_dispatch=30, cv = cv,
                        scoring=sil_linker,
                        refit=True)
    pred.fit(X_data)
    res = []
    for d in pred.grid_scores_:
        for r in d.cv_validation_scores:
            res.append((d.parameters['n_clusters'], r))
    return pred, pd.DataFrame(res, columns=['n_clusters','score'])

def check_trop_score(X_data, trop_clusters):
    # permutation test of how well KMeans clusters recover the known
    # tropism labels, relative to label-shuffled data
    cv = Bootstrap(X_data.shape[0], n_iter=3, train_size=0.7)
    pred = KMeans(n_clusters=len(set(trop_clusters)))
    t_score, scores, pval = permutation_test_score(pred, X_data, 
                                                   n_permutations=100,
                                                   y = trop_clusters,
                                                   n_jobs=20,
                                                   scoring=rand_linker,
                                                   cv=cv)
    return t_score, scores, pval

In [144]:
_, trop_data = check_num_clusts(NA_blood_df.values, 
                                n_iter=10, 
                                n_clusts=range(2, 60))
t = trop_data.groupby('n_clusters')['score'].mean()
e = trop_data.groupby('n_clusters')['score'].std()
plt.errorbar(t.index, t.values, yerr=e.values)
plt.title('Clustering of North American V3 sequences')
plt.xlabel('Number of Clusters')
plt.xlim([1.5, 60])
plt.ylim([0, 1])
plt.ylabel('Silhouette Score')
plt.savefig('final_figures/long_NA_v3_clustering.png', dpi = 1000)



In [88]:
from SeqSklearn import BinBasedCluster

bin_clust = BinBasedCluster(bins = pssm_bins)

pca_trans, biny, xx, yy, Z = bin_clust.make_vern_points(NA_blood_df.values, NA_wanted_lanl['PSSMScore'])


(24799, 672) (24799, 10)

In [105]:
from pylab import get_cmap
plt.figure(figsize=(10,10))
jitter = 0.1*np.random.randn(*pca_trans.shape)+pca_trans
plt.scatter(jitter[:,0], jitter[:,1], vmax = 0, c=NA_wanted_lanl['PSSMScore'], cmap=get_cmap('copper_r'), alpha=0.5)
cbar = plt.colorbar()
cbar.set_label('PSSMScore')
plt.ylabel('PC-1')
plt.xlabel('PC-2')


Out[105]:
<matplotlib.text.Text at 0x1b4e1f50>

In [41]:
v3_clust_pred = KMeans(n_clusters=7, n_jobs=-1)
v3_cluster_vals = pd.Series(v3_clust_pred.fit_predict(NA_blood_df.values), NA_blood_df.index)

NA_wanted_lanl['V3Clusters'] = v3_cluster_vals

In [135]:
vals = NA_wanted_lanl['PSSMScore'].dropna().values
counts, edges = np.histogram(vals, bins = 500, normed=True)

In [133]:
print edges[x4][:-1].shape, counts[x4[:-1]].shape


(215,) (215,)

In [138]:
plt.figure(figsize=(10,10))
r5 = edges<-6.96
r5p = (edges>-6.96) & (edges<-4.92)
x4p = (edges>-4.92) & (edges<-2.88)
x4 = (edges>-2.88)
ax = plt.gca()
ax.bar(edges[r5], counts[r5[:-1]], width=0.001, edgecolor = 'green')
ax.bar(edges[r5p], counts[r5p[:-1]], width=0.001, edgecolor = 'orange')
ax.bar(edges[x4p], counts[x4p[:-1]], width=0.001, edgecolor = 'blue')
ax.bar(edges[x4][:-1], counts[x4[:-1]], width=0.001, edgecolor = 'red')
ax.set_ylabel('Frac Sequences')
ax.set_xlabel('PSSM Score')
plt.savefig('final_figures/pssm_scores.png', dpi=1000)



In [ ]:


In [42]:
# relabel V3 clusters 1..k in order of increasing median PSSM score
pssm_ranks = NA_wanted_lanl.groupby('V3Clusters')['PSSMScore'].median().rank(method='first').to_dict()
NA_wanted_lanl['V3Clusters'] = NA_wanted_lanl['V3Clusters'].replace(pssm_ranks)

In [29]:
keys = [('Log-VL', 'boxplot'),
        ('CD4', 'boxplot'),
        ('Years-Seropositve', 'boxplot'),
        ('CD8', 'boxplot'),
        ('PSSMScore', 'boxplot'),
        ('Count', 'count'),]

fig, axs = plt.subplots(3,2, figsize=(10,10), sharex=True)

for ax, (col,typ) in zip(axs.flatten(), keys):
    
    if col == 'Count':
        boxes = [len(vals) for _, vals in NA_wanted_lanl.groupby('V3Clusters')]
    else:
        boxes = [vals.dropna() for _, vals in NA_wanted_lanl.groupby('V3Clusters')[col]]
        nums = [len(vals.dropna()) for _, vals in NA_wanted_lanl.groupby('V3Clusters')[col]]
        print col, nums
    if typ == 'boxplot':
        ax.boxplot(boxes)
    elif typ == 'frac':
        ax.bar(np.arange(0.5,len(boxes)), [b.mean() for b in boxes])
        ax.set_ylim([0, 1])
    elif typ == 'count':
        ax.bar(np.arange(0.5,len(boxes)), boxes)
        
        
    ax.set_title('clustering-' + col)
    if ax.is_last_row():
        num_clust = int(NA_wanted_lanl['V3Clusters'].max())
        ax.set_xticklabels(['CL-%i' % i for i in range(1, len(boxes)+1)])
        
plt.savefig('final_figures/clustering_pat_results.png', dpi=1000)


Log-VL [5474, 3879, 1674, 2416, 1619, 876, 353]
CD4 [3598, 2589, 1164, 1478, 993, 579, 195]
Years-Seropositve [2192, 1370, 713, 877, 1063, 112, 163]
CD8 [681, 476, 198, 212, 154, 12, 33]
PSSMScore [7725, 6126, 2318, 3354, 1988, 1085, 394]

In [147]:
def num_not_null(ser):
    return ser.notnull().sum()

agg_dict = {'Log-VL': num_not_null,
            'CD4':num_not_null,
            'Years-Seropositve':num_not_null,
            'CD8':num_not_null,
            'PSSMScore':num_not_null}
print NA_wanted_lanl.groupby('PSSMSubs').agg(agg_dict).sum()
print len(NA_wanted_lanl.index)


CD4                  10596
PSSMScore            22990
Log-VL               16291
CD8                   1766
Years-Seropositve     6490
dtype: float64
24799

In [48]:
keys = [('Log-VL', 'boxplot'),
        ('CD4', 'boxplot'),
        ('Years-Seropositve', 'boxplot'),
        ('CD8', 'boxplot'),
        ('PSSMScore', 'boxplot'),
        ('Count', 'count'),]

fig, axs = plt.subplots(3,2, figsize=(10,10), sharex=True)

for ax, (col,typ) in zip(axs.flatten(), keys):
    
    if col == 'Count':
        boxes = [len(vals) for _, vals in NA_wanted_lanl.groupby('PSSMSubs')]
    else:
        boxes = [vals.dropna() for _, vals in NA_wanted_lanl.groupby('PSSMSubs')[col]]
        nums = [len(vals.dropna()) for _, vals in NA_wanted_lanl.groupby('PSSMSubs')[col]]
        print col, nums
    if typ == 'boxplot':
        ax.boxplot(boxes)
    elif typ == 'frac':
        ax.bar(np.arange(0.5,len(boxes)), [b.mean() for b in boxes])
        ax.set_ylim([0, 1])
    elif typ == 'count':
        ax.bar(np.arange(0.5,len(boxes)), boxes)
        
        
    ax.set_title('binning-' + col)
    if ax.is_last_row():
        num_clust = int(NA_wanted_lanl['PSSMSubs'].max())
        ax.set_xticklabels(['CL-%i' % i for i in range(1, len(boxes)+1)])
        
plt.savefig('final_figures/binning_pat_results.png', dpi=1000)


Log-VL [1437, 5220, 3903, 2732, 1142, 729, 1128]
CD4 [821, 3329, 2680, 1982, 584, 280, 920]
Years-Seropositve [671, 2380, 1098, 961, 296, 392, 692]
CD8 [141, 669, 288, 262, 108, 60, 238]
PSSMScore [2038, 8258, 5996, 3643, 1437, 982, 636]

If we run the clustering analysis on this sequence data we would expect to see two main clusters (X4 and R5). So when scanning across cluster numbers with the Silhouette Score we expect a peak at 2 that then trails off. To confirm that these two clusters correspond to X4/R5, we expect a positive agreement score (the Adjusted Rand Index, or the normalized mutual information used in rand_linker above) when comparing them to the known X4/R5 tropism calls.
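
As a minimal sketch of that check (not part of the original run; it assumes the NA_blood_df and NA_wanted_lanl frames aligned above), a two-cluster KMeans fit can be compared directly to the PSSM tropism calls with the Adjusted Rand Index:

In [ ]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# restrict to sequences that have a PSSM-based tropism call
has_call = NA_wanted_lanl['PSSMTrop'].notnull().values
true_trop = NA_wanted_lanl['PSSMTrop'].values[has_call]

# two clusters ~ the expected R5/X4 split
pred_clusters = KMeans(n_clusters=2).fit_predict(NA_blood_df.values[has_call])
print adjusted_rand_score(true_trop, pred_clusters)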


In [51]:


In [53]:
from scipy.stats import hypergeom

def fisher_pval():
    # unused placeholder; the enrichment p-values below come from hypergeom
    pass

def fisher_odds(val):
    # unused placeholder
    pass

#order=['X4', 'X4-P', 'R5-P']+['R5-%i' %i for i in range(1,7)]
larger_wanted_mask = (pat_lanl_data['GeoRegion'] == 'North America') & pat_lanl_data['PSSMSubs'].notnull()
NA_lanl = pat_lanl_data[larger_wanted_mask]
tissue_num = NA_lanl['STissue'].value_counts()
cluster_num = NA_lanl['PSSMSubs'].value_counts()
#cluster_num.index = pd.Index(order)

counts = pd.crosstab(NA_lanl['STissue'], NA_lanl['PSSMSubs'])
#counts.columns = pd.Index(order)

pvals = []
for tissue, row in counts.iterrows():
    for cluster, val in row.to_dict().items():
        # hypergeometric enrichment test: population = all NA sequences with
        # a PSSM bin, successes = sequences from this tissue, draws = this bin
        rv = hypergeom(larger_wanted_mask.sum(), tissue_num[tissue], cluster_num[cluster])
        pvals.append({
                      'Tissue':tissue,
                      'Cluster':cluster,
                      'Pval': rv.sf(val),
                      'Expected': rv.median(),
                      'Observed': val
                      })


pval_df = pd.DataFrame(pvals)



#(counts.T/counts.sum(axis=1)).T.to_excel('seq_counts.xlsx')

In [54]:
pdata = pd.pivot_table(pval_df, rows='Tissue', cols='Cluster', 
                       values=['Pval', 'Observed', 'Expected'])
pdata['Pval'] = -pdata['Pval'].applymap(np.log10)

#pdata['Pval'][order].to_excel('new_seq_counts.xlsx', sheet_name='Pvals')
#pdata['Observed'][order].to_excel(writer, sheet_name='Observed')
#pdata['Expected'][order].to_excel(writer, sheet_name='Expected')

In [139]:
tmp_data.index


Out[139]:
Index([BAL, CSF, GALT, Intestinal, PBMC, brain, cervix/vagina, colon, feces, gastric aspirate, kidney, liver, lung, lymph node, meninges, monocyte, plasma/serum/blood, rectum, resting CD4+ T cells, saliva, semen/testis, spleen, sputum, urethra, urine], dtype=object)

In [143]:
from PlottingTools import make_heatmap_df
order = ['R5-1', 'R5-2', 'R5-3', 'R5-4', 'R5-P', 'X4-P', 'X4']
tmp_data = pdata['Pval'].copy()
tmp_data.columns= pd.Index(order)
tmp_data[tmp_data<2] = np.nan
order = ['CSF', 'brain', 'meninges', 
         'PBMC', 'monocyte', 'plasma/serum/blood', 'resting CD4+ T cells',
         'lymph node', 'spleen',
         'GALT', 'colon',
         'BAL', 'lung', 'liver', 'cervix/vagina',
         'semen/testis', 'urethra']
fig = make_heatmap_df(tmp_data.ix[order], colormap='copper_r', figsize=(10,10))
cbar = plt.colorbar()
cbar.set_clim([0, 40])
cbar.set_ticks(range(0, 40, 5))
plt.savefig('final_figures/bining_tissue_figure_shortened.png', dpi=1000)



In [213]:
# brain-derived sequences scoring in the X4 range (cutoff -2.88)
mask = ((trim_lanl['PSSMScore'] > -2.88) & (trim_lanl['STissue'] == 'brain'))
wanted_acc = set(trim_lanl['Accession'][mask])
tdata = trim_lanl[['Accession','PSSMScore']][mask].groupby('Accession').first()
out_seqs = []
for name, seq in aa_seq_list:
    if name in wanted_acc:
        out_seqs.append({
                         'Accession':name,
                         'PSSM':tdata.ix[name]['PSSMScore'],
                         'Seq':seq
                         })
out_df = pd.DataFrame(out_seqs)

In [229]:
with open('extra_brain.fasta', 'w') as handle:
    found = set()
    for name, seq in aa_seq_list:
        if name in wanted_acc:
            if seq not in found:
                fasta_writer(handle, [(name+'NEWSEQS!!!!!!', seq)])
                found.add(seq)

In [214]:
out_df.to_csv('brain_x4.tsv', sep='\t')

In [201]:
ax = trim_lanl.boxplot(column='PSSMScore', by = 'STissue', vert=False, figsize=(10,10))
ax.set_ylim([-1, ax.get_ylim()[1]])
order = ['R5-E', 'R5-1', 'R5-2', 'R5-3', 'R5-P', 'X4-P', 'X4']
for line, name in zip(pssm_bins, order):
    ax.annotate(name, (line-1.5, -0.5), fontsize=10)
ax.annotate('X4', (-1.5, -0.5), fontsize=10)


Out[201]:
<matplotlib.text.Annotation at 0xdc5cf50>

In [197]:
print pssm_bins
trim_lanl = nlanl_data[['Accession', 'STissue', 'PSSMScore', 'DrugNaive-yes']]
trim_lanl['PSSMScore'] = trim_lanl['PSSMScore'].clip(-20, 0)
fig, axs = plt.subplots(1,2, figsize=(10,10), sharey=True)
tissues = trim_lanl['STissue'].unique()
for (is_naive, tl), ax in zip(trim_lanl.groupby('DrugNaive-yes'), axs.flatten()):
    
    boxes = dict([(tiss, tgroup['PSSMScore'].dropna()) for tiss, tgroup in tl.groupby('STissue')])
    ax.boxplot([boxes.get(tiss, []) for tiss in tissues], vert=False)
    #tl.boxplot(by=['STissue'], column='PSSMScore', vert=False, ax=ax)
    if is_naive:
        ax.set_title('Drug Naive')
    else:
        ax.set_title('Seen Therapy')
    ax.set_ylim([-1, ax.get_ylim()[1]])
    ax.set_yticklabels(tissues)
    ax.vlines(pssm_bins, *ax.get_ylim())
    order = ['R5-E', 'R5-1', 'R5-2', 'R5-3', 'R5-P', 'X4-P', 'X4']
    for line, name in zip(pssm_bins, order):
        ax.annotate(name, (line-1.5, -0.5), fontsize=10)
    ax.annotate('X4', (-1.5, -0.5), fontsize=10)


[-13.0, -11.0, -9.0, -6.96, -4.92, -2.88]

Other Seq Stuff


In [215]:
NC_data = pd.read_csv('HIVSeqDB-MetaData.tsv', sep = '\t')

In [218]:
NC_wanted = pd.DataFrame(
                         {
                          'Accession':NC_data['Genbank Locus'],
                          'HAART':NC_data['Antiretroviral Treatment'],
                          'Neuropath':NC_data['Neuropathological diagnosis'],
                          'Neurocog':NC_data['Neurocognitive diagnosis'],
                          }
                         )
NC_merge = pd.merge(NC_wanted, nlanl_data,
                    left_on = 'Accession',
                    right_on = 'Accession')

In [242]:
NC_merge['Accession'][NC_merge['PSSMScore']>=-2.88]


Out[242]:
100    DQ288553
101    DQ288552
102    DQ288551
117    DQ288568
118    DQ288567
120    DQ288565
121    DQ288564
122    DQ288563
123    DQ288562
124    DQ288561
125    DQ288560
126    DQ288559
127    DQ288558
128    DQ288557
129    DQ288556
130    DQ288555
265    DQ288416
296    AY124970
300    AY159654
301    AY159653
302    AY159652
303    AY159651
419    AF174778
618    AF409567
621    AF409564
622    AF409563
627    AF409558
637    AF409548
642    AF409543
649    AF409536
650    AF409535
651    AF409534
Name: Accession, dtype: object

In [226]:
NC_merge.boxplot(column='PSSMScore', by='Neurocog', vert=False, figsize=(10,4))


Out[226]:
<matplotlib.axes.AxesSubplot at 0x37bce8d0>

In [227]:
NC_merge.boxplot(column='PSSMScore', by='Neuropath', vert=False, figsize=(10,6))


Out[227]:
<matplotlib.axes.AxesSubplot at 0x31c05490>

In [234]:
pd.crosstab(NC_merge['PSSMSubs'], NC_merge['Neurocog']).T


PSSMSubs                      0    1    2   3   4  5   6
Neurocog                                                
HAD: mild                     2    6   15  12   2  0   0
HAD: severe                  20   67    7   1   9  0   0
HAD: severity not specified  69  273   70  50   8  1   0
NPI: unknown                  0   10    0   0   0  0   0
None                         39  196  147  62  31  4  32
Unknown diagnosis             0   80   15   0   0  0   0

In [235]:
orer


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-235-288c6059e23b> in <module>()
----> 1 orer

NameError: name 'orer' is not defined

In [236]:
order


Out[236]:
['R5-E', 'R5-1', 'R5-2', 'R5-3', 'R5-P', 'X4-P', 'X4']

In [ ]: