notebook.community

Edit and run



In [1]:

    
# Execute GLOBALS.py in this notebook's namespace. MAIN_DIR is used by later
# cells but is never assigned in the notebook itself, so it is presumably
# defined there -- TODO confirm.
%run GLOBALS.py









    



3.5.2 |Continuum Analytics, Inc.| (default, Jul  2 2016, 17:52:12) 
[GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]



In [2]:

    
# Enable IPython's autoreload extension. Mode 2 reloads all modules before
# each cell executes, so edits to the local helper modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2



In [3]:

    
import matplotlib

# Select the Tk backend before pyplot is imported.
# NOTE(review): a later cell runs `%matplotlib inline`, which switches the
# backend again -- confirm this call is still needed.
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt



In [4]:

    
#import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
#import matplotlib.pyplot as plt
%matplotlib inline



In [5]:

    
# Run-mode switches for the whole notebook.
# import_original_data: when True, the raw elviz CSVs are re-read and reduced
#   and the reduced tables are re-written to CSV; when False, the previously
#   saved reduced CSV is loaded instead.
# write_excel: when True, the reduced tables are also passed to
#   write_excel_files.
import_original_data = True
write_excel = True



In [6]:

    
# Show the working directory; relative paths used in this notebook
# (e.g. '../') are resolved against it.
print(os.getcwd())









    



/Users/janet/elvizAnalysis/ipython_notebooks



In [7]:

    
# Apply seaborn's "whitegrid" style to subsequent plots.
sns.set(style="whitegrid")



In [8]:

    
# Import the csv that translates the 127_HOW14 type labels to weeks and replicates.

from elviz_utils import IMPORT_METAINFO_TYPES, read_sample_info



In [9]:

    
# Load the per-sample metadata table from MAIN_DIR.
# Presumably this maps each sample ID (e.g. 127_HOW14) to its oxygen level,
# replicate, week and project number -- verify against read_sample_info.
sample_info = read_sample_info(MAIN_DIR)

# Preview the first rows.
sample_info.head()



In [10]:

    
from abundance_utils import read_and_reduce_elviz_csv

#read_and_reduce_elviz_csv(
#    filename = 'elviz-contigs-1056169.csv', 
#    filepath=filepath,
#    sample_info = sample_info)



In [11]:

    
# Shell escape: list the contents of the notebook's working directory.
! ls









    



Bacteriovoracaceae_and_Myxococcaceae--Families.ipynb
GLOBALS.py
calculate_abundances-Copy1.ipynb
calculate_abundances.ipynb
demo_elviz_pca_module.ipynb
depreciated
elviz_abundance_plotting.ipynb
heatmap_all_below--burkholderiales_at_differing_depths.ipynb
plot_bar_charts.ipynb
plots



In [12]:

    
from abundance_utils import get_elviz_filenames

# Get the list of elviz CSV file names; the helper is given the parent
# directory as its root.
# NOTE(review): the root is hard-coded as '../' here while other cells use
# MAIN_DIR -- confirm the two agree.
elviz_files = get_elviz_filenames(main_dir='../')
# Preview the first four names.
elviz_files[0:4]









    Out[12]:





['elviz-contigs-1056013.csv',
 'elviz-contigs-1056016.csv',
 'elviz-contigs-1056019.csv',
 'elviz-contigs-1056022.csv']



In [13]:

    
from elviz_utils import make_directory

# Create the output directories used by later cells.
# Paths are built with os.path.join so they are correct whether or not
# MAIN_DIR ends with a path separator (plain concatenation with 'results'
# would yield e.g. '..results' when it does not).
make_directory(dirpath=os.path.join(MAIN_DIR, 'plots'))
make_directory(dirpath=os.path.join(MAIN_DIR, 'results'))



In [14]:

    
from abundance_utils import read_and_reduce_all



In [15]:

    
from abundance_utils import project_number_from_filename



In [16]:

    
# Display the raw-data directory path that is passed to read_and_reduce_all
# in the next cell.
os.path.join(MAIN_DIR, 'raw_data')









    Out[16]:





'../raw_data'



In [17]:

    
# Build the reduced abundance table, either from the raw elviz exports or
# from the CSV saved by a previous run.
if import_original_data:
    # Re-read and reduce every raw elviz CSV listed in elviz_files.
    data_reduced = read_and_reduce_all(filename_list=elviz_files,
                                       filepath=os.path.join(MAIN_DIR, 'raw_data'),
                                       sample_info=sample_info)
    # Total number of cells (rows x columns), as a quick size check.
    print(data_reduced.size)
else:
    # NOTE(review): pd.read_csv parses empty fields as NaN by default, so
    # blank taxonomy levels may differ from the freshly built table --
    # confirm downstream code tolerates both.
    data_reduced = pd.read_csv(
        os.path.join(MAIN_DIR, 'results',
                     'reduced_data--all_taxonomy_remains.csv'))

data_reduced.head()









    



elviz-contigs-1056016.csv
elviz-contigs-1056019.csv
elviz-contigs-1056022.csv
elviz-contigs-1056025.csv
elviz-contigs-1056028.csv
elviz-contigs-1056031.csv
elviz-contigs-1056034.csv
elviz-contigs-1056037.csv
elviz-contigs-1056040.csv
elviz-contigs-1056043.csv
elviz-contigs-1056046.csv
elviz-contigs-1056049.csv
elviz-contigs-1056052.csv
elviz-contigs-1056055.csv
elviz-contigs-1056058.csv
elviz-contigs-1056061.csv
elviz-contigs-1056064.csv
elviz-contigs-1056067.csv
elviz-contigs-1056070.csv
elviz-contigs-1056073.csv
elviz-contigs-1056076.csv
elviz-contigs-1056079.csv
elviz-contigs-1056082.csv
elviz-contigs-1056085.csv
elviz-contigs-1056088.csv
elviz-contigs-1056091.csv
elviz-contigs-1056094.csv
elviz-contigs-1056097.csv
elviz-contigs-1056100.csv
elviz-contigs-1056103.csv
elviz-contigs-1056106.csv
elviz-contigs-1056109.csv
elviz-contigs-1056112.csv
elviz-contigs-1056115.csv
elviz-contigs-1056118.csv
elviz-contigs-1056121.csv
elviz-contigs-1056124.csv
elviz-contigs-1056127.csv
elviz-contigs-1056130.csv
elviz-contigs-1056133.csv
elviz-contigs-1056136.csv
elviz-contigs-1056139.csv
elviz-contigs-1056142.csv
elviz-contigs-1056145.csv
elviz-contigs-1056148.csv
elviz-contigs-1056151.csv
elviz-contigs-1056154.csv
elviz-contigs-1056157.csv
elviz-contigs-1056160.csv
elviz-contigs-1056163.csv
elviz-contigs-1056166.csv
elviz-contigs-1056169.csv
elviz-contigs-1056172.csv
elviz-contigs-1056175.csv
elviz-contigs-1056178.csv
elviz-contigs-1056181.csv
elviz-contigs-1056184.csv
elviz-contigs-1056187.csv
elviz-contigs-1056190.csv
elviz-contigs-1056193.csv
elviz-contigs-1056196.csv
elviz-contigs-1056199.csv
elviz-contigs-1056202.csv
elviz-contigs-1056205.csv
elviz-contigs-1056208.csv
elviz-contigs-1056211.csv
elviz-contigs-1056214.csv
elviz-contigs-1056217.csv
elviz-contigs-1056220.csv
elviz-contigs-1056223.csv
elviz-contigs-1056226.csv
elviz-contigs-1056229.csv
elviz-contigs-1056232.csv
elviz-contigs-1056235.csv
elviz-contigs-1056238.csv
elviz-contigs-1056241.csv
elviz-contigs-1056244.csv
elviz-contigs-1056247.csv
elviz-contigs-1056250.csv
elviz-contigs-1056253.csv
elviz-contigs-1056256.csv
elviz-contigs-1056259.csv
elviz-contigs-1056262.csv
elviz-contigs-1056265.csv
elviz-contigs-1056268.csv
elviz-contigs-1056271.csv
elviz-contigs-1056274.csv
841815






    Out[17]:






  
    
      
      Kingdom
      Phylum
      Class
      Order
      Family
      Genus
      Length
      fraction of reads
      project
      ID
      oxy
      rep
      week
    
  
  
    
      0
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      Methylobacter
      9948861
      0.205558
      1056013
      1_LOW4
      Low
      1
      4
    
    
      1
      Bacteria
      Proteobacteria
      Betaproteobacteria
      Methylophilales
      Methylophilaceae
      Methylotenera
      5066955
      0.185898
      1056013
      1_LOW4
      Low
      1
      4
    
    
      2
      Bacteria
      Proteobacteria
      
      
      
      other
      3930509
      0.075027
      1056013
      1_LOW4
      Low
      1
      4
    
    
      3
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      
      
      other
      5620690
      0.073601
      1056013
      1_LOW4
      Low
      1
      4
    
    
      4
      Bacteria
      Bacteroidetes
      Flavobacteriia
      Flavobacteriales
      Flavobacteriaceae
      Flavobacterium
      4654774
      0.065548
      1056013
      1_LOW4
      Low
      1
      4



In [18]:

    
# Save the freshly reduced table so a later run can set
# import_original_data = False and load it instead of re-reading raw data.
# The path is built with os.path.join so it does not depend on whether
# MAIN_DIR ends with a separator.
if import_original_data:
    data_reduced.to_csv(
        os.path.join(MAIN_DIR, 'results',
                     'reduced_data--all_taxonomy_remains.csv'),
        index=False)



In [19]:

    
# Preview the first rows of the reduced table again.
data_reduced.head()









    Out[19]:






  
    
      
      Kingdom
      Phylum
      Class
      Order
      Family
      Genus
      Length
      fraction of reads
      project
      ID
      oxy
      rep
      week
    
  
  
    
      0
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      Methylococcales
      Methylococcaceae
      Methylobacter
      9948861
      0.205558
      1056013
      1_LOW4
      Low
      1
      4
    
    
      1
      Bacteria
      Proteobacteria
      Betaproteobacteria
      Methylophilales
      Methylophilaceae
      Methylotenera
      5066955
      0.185898
      1056013
      1_LOW4
      Low
      1
      4
    
    
      2
      Bacteria
      Proteobacteria
      
      
      
      other
      3930509
      0.075027
      1056013
      1_LOW4
      Low
      1
      4
    
    
      3
      Bacteria
      Proteobacteria
      Gammaproteobacteria
      
      
      other
      5620690
      0.073601
      1056013
      1_LOW4
      Low
      1
      4
    
    
      4
      Bacteria
      Bacteroidetes
      Flavobacteriia
      Flavobacteriales
      Flavobacteriaceae
      Flavobacterium
      4654774
      0.065548
      1056013
      1_LOW4
      Low
      1
      4



In [20]:

    
# Sanity check: after reduction there should be only one row per sample ID
# for a given genus. Methylophilus is used as the spot check.
methylophilus_only = data_reduced[data_reduced['Genus'] == 'Methylophilus']

# Enforce the invariant explicitly; the printed table below is truncated by
# pandas, so duplicates cannot be ruled out by eye.
assert methylophilus_only['ID'].is_unique, \
    "expected exactly one Methylophilus row per sample ID"

print(methylophilus_only[['ID', 'Length', 'fraction of reads']].sort_values('Length'))









    



            ID   Length  fraction of reads
219    63_LOW9    15882           0.000028
49    73_LOW10    20437           0.000760
99     51_LOW8    29583           0.000217
100   74_LOW10    33515           0.000278
26   122_LOW14    39064           0.001171
59    92_HOW11    42204           0.000607
103   75_LOW10    45610           0.000124
27   124_LOW14    46654           0.001526
22   121_LOW14    47061           0.001651
29     61_LOW9    52709           0.001694
76      4_LOW4    52969           0.000506
28   112_LOW13    55629           0.001615
74     13_LOW5    58884           0.000652
67      9_HOW4    60376           0.000780
33      8_HOW4    61368           0.001438
116  123_LOW14    68043           0.000175
41     49_LOW8    71714           0.000932
48      3_LOW4    72753           0.001035
52      2_LOW4    73660           0.000926
48      1_LOW4    74310           0.001062
88     39_LOW7    77028           0.000347
56     14_LOW5    88088           0.000793
57     27_LOW6    88622           0.000875
50     15_LOW5    93077           0.000953
57     37_LOW7    96499           0.000638
57     57_HOW8    96525           0.000506
36     10_HOW4   101934           0.001388
39      7_HOW4   105754           0.001432
34   100_LOW12   108658           0.001117
61     25_LOW6   108785           0.000540
..         ...      ...                ...
6     97_LOW12  2984166           0.015834
2     80_HOW10  3001616           0.087030
11    91_HOW11  3011404           0.016662
1      32_HOW6  3017162           0.347133
8     94_HOW11  3083901           0.028545
1    128_HOW14  3096023           0.205689
7     99_LOW12  3812409           0.022149
0      70_HOW9  3885996           0.371372
0      58_HOW8  4742705           0.363455
1      33_HOW6  4901475           0.188410
1      45_HOW7  5125910           0.198458
0      31_HOW6  5194359           0.383851
1      19_HOW5  5334592           0.100792
0    115_HOW13  5336219           0.531892
1     88_LOW11  5341441           0.068999
1    106_HOW12  5385211           0.115445
0    127_HOW14  5404569           0.360175
3    118_HOW13  5426623           0.021599
0    104_HOW12  5430800           0.644610
1     93_HOW11  5430835           0.071349
1    130_HOW14  5449081           0.082110
1    103_HOW12  5450134           0.214184
1    117_HOW13  5577034           0.158158
7     87_LOW11  5643812           0.025104
2     81_HOW10  5647252           0.045446
2     82_HOW10  5690423           0.045236
1      44_HOW7  5795352           0.252276
0      43_HOW7  5805202           0.407512
1    129_HOW14  5873847           0.121108
1    105_HOW12  6111634           0.301190

[88 rows x 3 columns]



In [21]:

    
# Imported unconditionally so the name is bound regardless of the flag;
# a later cell also calls write_excel_files.
from abundance_utils import write_excel_files

# Hand the full-taxonomy table to write_excel_files when requested.
# The results directory is built with os.path.join so it does not depend on
# whether MAIN_DIR ends with a separator.
if write_excel:
    write_excel_files(dataframe=data_reduced,
                      filepath=os.path.join(MAIN_DIR, 'results'))



In [22]:

    
from abundance_utils import reduce_to_genus_only
# Reduce the table to genus-level rows. The recorded output has columns
# ID, rep, week, oxy, Genus, Length, fraction of reads, project.
# NOTE(review): in the recorded output the 'other' row for 16_LOW5 shows
# project 137285980, unlike the 7-digit project ids elsewhere -- it looks like
# 'project' is being summed during aggregation; verify in
# reduce_to_genus_only.
data_reduced_genus = reduce_to_genus_only(data_reduced)
# Preview the first rows.
data_reduced_genus.head()









    Out[22]:






  
    
      
      ID
      rep
      week
      oxy
      Genus
      Length
      fraction of reads
      project
    
  
  
    
      7268
      118_HOW13
      4
      13
      High
      Methylobacter
      4326741
      0.820344
      1056250
    
    
      11786
      130_HOW14
      4
      14
      High
      Methylobacter
      4428101
      0.776829
      1056274
    
    
      2432
      106_HOW12
      4
      12
      High
      Methylobacter
      4325001
      0.623003
      1056226
    
    
      45856
      82_HOW10
      4
      10
      High
      Methylobacter
      5488758
      0.589218
      1056178
    
    
      14497
      16_LOW5
      4
      5
      Low
      other
      40011267
      0.548044
      137285980



In [23]:

    
# Write a copy of the genus-level table to CSV, but only when it was rebuilt
# from the raw data in this run. The path is built with os.path.join so it
# does not depend on whether MAIN_DIR ends with a separator.
if import_original_data:
    data_reduced_genus.to_csv(
        os.path.join(MAIN_DIR, 'results', 'reduced_data--genus_only.csv'),
        index=False)



In [24]:

    
# Show the first three genus-level rows.
data_reduced_genus.head(3)









    Out[24]:






  
    
      
      ID
      rep
      week
      oxy
      Genus
      Length
      fraction of reads
      project
    
  
  
    
      7268
      118_HOW13
      4
      13
      High
      Methylobacter
      4326741
      0.820344
      1056250
    
    
      11786
      130_HOW14
      4
      14
      High
      Methylobacter
      4428101
      0.776829
      1056274
    
    
      2432
      106_HOW12
      4
      12
      High
      Methylobacter
      4325001
      0.623003
      1056226



In [25]:

    
# Group the genus-level table by replicate, week and oxygen condition.
# NOTE(review): this groupby object is not used by any later cell visible
# here -- confirm it is still needed.
by_repl_and_week_Genus = data_reduced_genus.groupby(['rep','week','oxy'])



In [26]:

    
# Hand the genus-level table to write_excel_files when requested.
if write_excel:
    # Import locally so this cell does not rely on an earlier cell having
    # been executed to bind the name.
    from abundance_utils import write_excel_files

    # os.path.join keeps the results path correct whether or not MAIN_DIR
    # ends with a separator.
    write_excel_files(dataframe=data_reduced_genus,
                      filepath=os.path.join(MAIN_DIR, 'results'),
                      by_genus=True)



In [ ]:

	ID	oxy	rep	week	project
0	1_LOW4	Low	1	4	1056013
1	13_LOW5	Low	1	5	1056037
2	25_LOW6	Low	1	6	1056061
3	37_LOW7	Low	1	7	1056085
4	49_LOW8	Low	1	8	1056109

	Kingdom	Phylum	Class	Order	Family	Genus	Length	fraction of reads	project	ID	oxy	rep	week
0	Bacteria	Proteobacteria	Gammaproteobacteria	Methylococcales	Methylococcaceae	Methylobacter	9948861	0.205558	1056013	1_LOW4	Low	1	4
1	Bacteria	Proteobacteria	Betaproteobacteria	Methylophilales	Methylophilaceae	Methylotenera	5066955	0.185898	1056013	1_LOW4	Low	1	4
2	Bacteria	Proteobacteria				other	3930509	0.075027	1056013	1_LOW4	Low	1	4
3	Bacteria	Proteobacteria	Gammaproteobacteria			other	5620690	0.073601	1056013	1_LOW4	Low	1	4
4	Bacteria	Bacteroidetes	Flavobacteriia	Flavobacteriales	Flavobacteriaceae	Flavobacterium	4654774	0.065548	1056013	1_LOW4	Low	1	4

	ID	rep	week	oxy	Genus	Length	fraction of reads	project
7268	118_HOW13	4	13	High	Methylobacter	4326741	0.820344	1056250
11786	130_HOW14	4	14	High	Methylobacter	4428101	0.776829	1056274
2432	106_HOW12	4	12	High	Methylobacter	4325001	0.623003	1056226
45856	82_HOW10	4	10	High	Methylobacter	5488758	0.589218	1056178
14497	16_LOW5	4	5	Low	other	40011267	0.548044	137285980