author: sgibbons@mit.edu
date: 22 Dec 2016
language: Python 3.5
license: BSD3

rrna_copy_number_analysis.ipynb



In [1]:

    
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import h5py as h5
#import empcolors
%run ../../code/colors-and-styles/empcolors.py
%matplotlib inline









    



/Users/jonsanders/Development/anaconda/envs/py3/lib/python3.5/site-packages/IPython/html.py:14: ShimWarning: The `IPython.html` package has been deprecated. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.
  "`IPython.html.widgets` has moved to `ipywidgets`.", ShimWarning)



In [2]:

    
#filter tables by metadata presence/absence filter_samples_from_otu_table.py -i emp_cr_gg_13_8.biom -m emp_qiime_mapping_all_emp.tsv -s 'Description:*' -o emp_cr_gg_13_8_filt.biom --output_mapping_fp emp_qiime_mapping_all_emp_filt.tsv

emp_gg_otus_sampsum = pd.read_csv('../../data/predicted-rrna-copy-number/emp_cr_gg_13_8.qc_filtered_filt_summary_samplesum.txt',sep='\t',index_col=0,header=None).sort_index()
emp_gg_otus_norm_sampsum = pd.read_csv('../../data/predicted-rrna-copy-number/emp_cr_gg_13_8.normalized_qcfilt_summary_samplesum.txt',sep='\t',index_col=0,header=None).sort_index()

emp_gg_otus_meta = pd.read_csv('../../data/mapping-files/emp_qiime_mapping_qc_filtered.tsv',sep='\t',index_col=0,header=0).sort_index()



In [4]:

    
emp_gg_otus_meta['sampsum'] = emp_gg_otus_sampsum[1]
emp_gg_otus_meta['normsampsum'] = emp_gg_otus_norm_sampsum[1]
emp_gg_otus_meta['copynumberdepletion'] = np.divide(emp_gg_otus_norm_sampsum[1],emp_gg_otus_sampsum[1])
emp_gg_otus_meta['averagecopy'] = np.divide(1,np.divide(emp_gg_otus_norm_sampsum[1],emp_gg_otus_sampsum[1]))



In [5]:

    
emp_gg_otus_meta.head()









    Out[5]:







  
    
      
      BarcodeSequence
      LinkerPrimerSequence
      Description
      host_subject_id
      study_id
      title
      principal_investigator
      doi
      ebi_accession
      target_gene
      ...
      salinity_psu
      oxygen_mg_per_l
      phosphate_umol_per_l
      ammonium_umol_per_l
      nitrate_umol_per_l
      sulfate_umol_per_l
      sampsum
      normsampsum
      copynumberdepletion
      averagecopy
    
    
      #SampleID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1001.SKB1
      GTCCGCAAGTTA
      GTGCCAGCMGCCGCGGTAA
      Burmese bulk
      SKB1
      1001
      Understanding cultivar-specificity and soil de...
      Suzanne Kennedy
      10.1371/journal.pone.0099641
      ERP016540
      16S rRNA
      ...
      7.15
      NaN
      NaN
      NaN
      NaN
      NaN
      11875.0
      4417.225397
      0.371977
      2.688339
    
    
      1001.SKB2
      CGTAGAGCTCTC
      GTGCCAGCMGCCGCGGTAA
      Burmese bulk
      SKB2
      1001
      Understanding cultivar-specificity and soil de...
      Suzanne Kennedy
      10.1371/journal.pone.0099641
      ERP016540
      16S rRNA
      ...
      7.15
      NaN
      NaN
      NaN
      NaN
      NaN
      12706.0
      4667.695238
      0.367362
      2.722114
    
    
      1001.SKB3
      CCTCTGAGAGCT
      GTGCCAGCMGCCGCGGTAA
      Burmese bulk
      SKB3
      1001
      Understanding cultivar-specificity and soil de...
      Suzanne Kennedy
      10.1371/journal.pone.0099641
      ERP016540
      16S rRNA
      ...
      7.15
      NaN
      NaN
      NaN
      NaN
      NaN
      15287.0
      5505.959127
      0.360173
      2.776446
    
    
      1001.SKB4
      CCTCGATGCAGT
      GTGCCAGCMGCCGCGGTAA
      Burmese Rhizo
      SKB4
      1001
      Understanding cultivar-specificity and soil de...
      Suzanne Kennedy
      10.1371/journal.pone.0099641
      ERP016540
      16S rRNA
      ...
      7.15
      NaN
      NaN
      NaN
      NaN
      NaN
      13091.0
      4398.515079
      0.335995
      2.976232
    
    
      1001.SKB5
      GCGGACTATTCA
      GTGCCAGCMGCCGCGGTAA
      Burmese Rhizo
      SKB5
      1001
      Understanding cultivar-specificity and soil de...
      Suzanne Kennedy
      10.1371/journal.pone.0099641
      ERP016540
      16S rRNA
      ...
      7.15
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
  

5 rows × 79 columns



In [16]:

    
# save data file
emp_gg_otus_meta.loc[:,['#SampleID','empo_0','empo_1','empo_2','empo_3','averagecopy']].to_csv('../../data/predicted-rrna-copy-number/emp_rrna_averagecopy_empo.csv')



In [7]:

    
def format_figure():
    plt.xlabel('Average community 16S copy number', fontsize=14)
    plt.ylabel('Number of samples', fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.xlim([0,8])
    plt.tight_layout()



In [9]:

    
#emp_gg_otus_meta['averagecopy'].plot(kind='hist',legend=False,bins=200,linewidth=0)

plt.hist(emp_gg_otus_meta['averagecopy'].dropna(), bins=200, linewidth=0, color=get_empo_cat_color('EMP sample'))
format_figure()
plt.savefig('copy_number_16s_emp_empo0.pdf')



In [10]:

    
#biom_grouped = emp_gg_otus_meta.groupby('empo_1')
#biom_grouped['averagecopy'].plot(kind='hist',alpha=0.3,legend=True,bins=200,linewidth=0)

for i in emp_gg_otus_meta.empo_1.unique():
    plt.hist(emp_gg_otus_meta[emp_gg_otus_meta.empo_1 == i]['averagecopy'].dropna(), label=i,
             bins=200, linewidth=0, color=get_empo_cat_color(i), alpha=0.7)

#plt.legend(loc=1,prop={'size':8})
format_figure()
plt.savefig('copy_number_16s_emp_empo1.pdf')



In [11]:

    
# biom_grouped = emp_gg_otus_meta.groupby('empo_2')
# biom_grouped['averagecopy'].plot(kind='hist',alpha=0.5,legend=True,bins=200,linewidth=0)

for i in ['Animal', 'Non-saline', 'Plant', 'Saline']:
    plt.hist(emp_gg_otus_meta[emp_gg_otus_meta.empo_2 == i]['averagecopy'].dropna(), label=i,
             bins=200, linewidth=0, color=get_empo_cat_color(i), alpha=0.8)

#plt.legend(loc=1,prop={'size':8})
format_figure()
plt.savefig('copy_number_16s_emp_empo2.pdf')



In [12]:

    
# biom_grouped = emp_gg_otus_meta.groupby('empo_3')
# biom_grouped['averagecopy'].plot(kind='hist',alpha=0.5,legend=True,bins=200,linewidth=0)

for i in emp_gg_otus_meta.empo_3.unique():
    plt.hist(emp_gg_otus_meta[emp_gg_otus_meta.empo_3 == i]['averagecopy'].dropna(), label=i,
             bins=200, linewidth=0, color=get_empo_cat_color(i), alpha=0.5)

#plt.legend(loc=1,prop={'size':8})
format_figure()
plt.savefig('copy_number_16s_emp_empo3.pdf')



In [13]:

    
bp = emp_gg_otus_meta.boxplot('averagecopy',by='empo_2',grid=False,rot=90)
plt.xlabel('')
plt.title('')
plt.ylim([0,8])









    Out[13]:





(0, 8)



In [14]:

    
bp = emp_gg_otus_meta.boxplot('averagecopy',by='empo_3',grid=False,rot=90)
plt.xlabel('')
plt.title('')
plt.ylim([0,8])









    Out[14]:





(0, 8)



In [15]:

    
biom_grouped = emp_gg_otus_meta.groupby('env_material')

biom_grouped['averagecopy'].plot(kind='hist',alpha=0.5,legend=True,bins=200,linewidth=0)
plt.legend(loc=1,prop={'size':3})
plt.xlabel('average community 16S copy number')
plt.xlim([0,8])
#plt.savefig('copy_number_16s_emp_env_material.pdf', dpi=300, format='pdf')









    Out[15]:





(0, 8)



In [ ]:

	BarcodeSequence	LinkerPrimerSequence	Description	host_subject_id	study_id	title	principal_investigator	doi	ebi_accession	target_gene	...	salinity_psu	oxygen_mg_per_l	phosphate_umol_per_l	ammonium_umol_per_l	nitrate_umol_per_l	sulfate_umol_per_l	sampsum	normsampsum	copynumberdepletion	averagecopy
#SampleID
1001.SKB1	GTCCGCAAGTTA	GTGCCAGCMGCCGCGGTAA	Burmese bulk	SKB1	1001	Understanding cultivar-specificity and soil de...	Suzanne Kennedy	10.1371/journal.pone.0099641	ERP016540	16S rRNA	...	7.15	NaN	NaN	NaN	NaN	NaN	11875.0	4417.225397	0.371977	2.688339
1001.SKB2	CGTAGAGCTCTC	GTGCCAGCMGCCGCGGTAA	Burmese bulk	SKB2	1001	Understanding cultivar-specificity and soil de...	Suzanne Kennedy	10.1371/journal.pone.0099641	ERP016540	16S rRNA	...	7.15	NaN	NaN	NaN	NaN	NaN	12706.0	4667.695238	0.367362	2.722114
1001.SKB3	CCTCTGAGAGCT	GTGCCAGCMGCCGCGGTAA	Burmese bulk	SKB3	1001	Understanding cultivar-specificity and soil de...	Suzanne Kennedy	10.1371/journal.pone.0099641	ERP016540	16S rRNA	...	7.15	NaN	NaN	NaN	NaN	NaN	15287.0	5505.959127	0.360173	2.776446
1001.SKB4	CCTCGATGCAGT	GTGCCAGCMGCCGCGGTAA	Burmese Rhizo	SKB4	1001	Understanding cultivar-specificity and soil de...	Suzanne Kennedy	10.1371/journal.pone.0099641	ERP016540	16S rRNA	...	7.15	NaN	NaN	NaN	NaN	NaN	13091.0	4398.515079	0.335995	2.976232
1001.SKB5	GCGGACTATTCA	GTGCCAGCMGCCGCGGTAA	Burmese Rhizo	SKB5	1001	Understanding cultivar-specificity and soil de...	Suzanne Kennedy	10.1371/journal.pone.0099641	ERP016540	16S rRNA	...	7.15	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN