In [ ]:
import numpy as np
import os
import re
import matplotlib as mpl
import matplotlib.pyplot as plt
# %matplotlib inline selects the notebook backend, so a separate
# matplotlib.use('Agg') call (after pyplot is imported) has no effect here.
%matplotlib inline
import pandas as pd
import seaborn as sns
In [ ]:
import sys
print(sys.version)
In [ ]:
%matplotlib inline
from pylab import rcParams
rcParams['figure.figsize'] = 4, 2.5
The network directory in this share (which is still uploading, btw) contains a pickle (data.pkl) and the code used to generate it (network.py). The lfdr_pcor object in the pickle has the partial correlations after pruning, while pcor has all of them (the full network). network.txt has a text version of the pruned network that can be imported into Cytoscape.
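For reference, a minimal sketch of loading that pickle and poking at the two objects (this assumes data.pkl unpickles to a dict-like container holding lfdr_pcor and pcor; the exact structure network.py writes may differ):
In [ ]:
# Sketch, not run here: inspect the pickled network objects.
# Assumes data.pkl holds a dict-like object with 'lfdr_pcor' (pruned)
# and 'pcor' (full) -- adjust to whatever network.py actually writes.
import pickle

with open('../waffle_network_dir/data.pkl', 'rb') as fh:
    net_data = pickle.load(fh)

print(type(net_data))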
The network data was calculated from mapping to genome bins:
Full graph:
In [ ]:
! ls -lh ../waffle_network_dir/*.tsv
In [ ]:
! wc -l ../waffle_network_dir/network.py.tsv
In [ ]:
! head -n 5 ../waffle_network_dir/network.py.tsv | csvlook -t
In [ ]:
! ls -lh ../waffle_network_dir/network.py.tsv
Make a 10k row version of the file for development.
In [ ]:
network = pd.read_csv('../waffle_network_dir/network.py.tsv', skiprows=1,
                      # skipfooter = 49995001 - 1*10**4,
                      # skipfooter = 1000,  # can't have skipfooter with dtype. :(
                      sep='\t', names=['source', 'target', 'pcor'],
                      dtype={'source': str, 'target': str, 'pcor': float})
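As an aside, nrows (unlike skipfooter) does work together with dtype, so a true 10k-row development slice could be read like this (a sketch, not used below):
In [ ]:
# Sketch: read only the first 10k edges for development;
# nrows can be combined with dtype, which skipfooter cannot.
network_dev = pd.read_csv('../waffle_network_dir/network.py.tsv', skiprows=1,
                          nrows=10**4, sep='\t',
                          names=['source', 'target', 'pcor'],
                          dtype={'source': str, 'target': str, 'pcor': float})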
In [ ]:
network.shape
In [ ]:
network.head()
In [ ]:
def label_associations(row):
    if row['pcor'] > 0:
        val = 'positive'
    elif row['pcor'] < 0:
        val = 'negative'
    elif row['pcor'] == 0:
        val = 'drop me'
    return val
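Row-wise apply() is slow on a frame this size; a vectorized sketch of the same labeling ('positive'/'negative'/'drop me') with np.select would be:
In [ ]:
# Sketch: vectorized version of label_associations -- maps the sign of
# pcor straight to the labels instead of calling a Python function per row.
network['association'] = np.select(
    [network['pcor'] > 0, network['pcor'] < 0],
    ['positive', 'negative'],
    default='drop me')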
In [ ]:
network['association'] = network.apply(label_associations, axis=1)
In [ ]:
network['association'].unique()
In [ ]:
print("shape before dropping rows with pcor == 0: {}".format(network.shape))
network = network[network['association'] != 'drop me']
print("shape after dropping rows with pcor == 0: {}".format(network.shape))
In [ ]:
network.head(3)
I am only using 3.7% of Waffle's memory at the beginning :)
In [ ]:
! top -b -n 1 -o %MEM | head
In [ ]:
network['target_organism'] = network['target'].str.extract('([A-Za-z0-9]+)_[0-9]+')
network['target_gene'] = network['target'].str.extract('[A-Za-z0-9]+_([0-9]+)')
network['source_organism'] = network['source'].str.extract('([A-Za-z0-9]+)_[0-9]+')
network['source_gene'] = network['source'].str.extract('[A-Za-z0-9]+_([0-9]+)')
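A quick sanity check that every locus tag matched the ORGANISM_GENE pattern (rows that failed to match come back as NaN):
In [ ]:
# Any non-zero count here means a locus tag didn't fit the expected pattern.
network[['source_organism', 'source_gene',
         'target_organism', 'target_gene']].isnull().sum()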
In [ ]:
network.head()
In [ ]:
network = network.rename(columns=lambda x: re.sub('source$', 'source_locus_tag', x))
network = network.rename(columns=lambda x: re.sub('target$', 'target_locus_tag', x))
In [ ]:
network.head(2)
In [ ]:
network['target_organism'].unique()
In [ ]:
len(network['target_organism'].unique())
In [ ]:
network['cross_species'] = network['source_organism'] != network['target_organism']
In [ ]:
network.cross_species.describe()
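describe() on a boolean gives counts; value_counts(normalize=True) gives the fraction of edges that span two organisms directly:
In [ ]:
# Fraction of edges whose source and target come from different organisms.
network['cross_species'].value_counts(normalize=True)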
In [ ]:
network.cross_species.plot.hist()
In [ ]:
network.pcor.plot.hist()
In [ ]:
fig, ax = plt.subplots(1, 1, figsize=(4, 3))
plt.hist(network.pcor)
plt.yscale('log', nonposy='clip')
plt.xlabel('partial correlation value')
plt.ylabel('# edges')
plt.tight_layout()
plt.savefig('161209_hist_of_pcor_values.pdf')
plt.savefig('161209_hist_of_pcor_values.png', dpi=600)
In [ ]:
fig, ax = plt.subplots(1, 1, figsize=(5, 2.5))
plt.hist(network.pcor, 50)
plt.yscale('log', nonposy='clip')
plt.xlabel('partial correlation value')
plt.ylabel('# edges')
plt.tight_layout()
plt.savefig('161209_hist_of_pcor_values_50_bins.pdf')
plt.savefig('161209_hist_of_pcor_values_50_bins.png', dpi=600)
In [ ]:
locus_to_organism = pd.read_csv('/dacb/meta4_bins/data/genome_bins.locus_to_organism.tsv', sep='\t',
                                names=['locus', 'organism'])
In [ ]:
locus_to_organism.head()
In [ ]:
# Found a problem:
# Expected exactly 2 organism names here, but we have 3 because of trailing whitespace:
# {'Methylobacter-123 (UID203) ', 'Methylobacter-123 (UID203)', 'Methylotenera mobilis-49 (UID203)'}
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.str.strip.html
# str.strip() removes both leading and trailing whitespace :)
locus_to_organism['organism'] = locus_to_organism['organism'].str.strip()
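A quick check that stripping the whitespace collapsed the duplicated names (the number of unique organism names should drop back to the expected count):
In [ ]:
# After str.strip(), each organism should appear under exactly one name.
locus_to_organism['organism'].nunique()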
In [ ]:
locus_to_organism['organism ID'] = locus_to_organism['locus'].str.extract('([A-Za-z]+[0-9]+)_[0-9]+')
In [ ]:
source_organism_names = locus_to_organism[['organism ID', 'organism']].drop_duplicates()
target_organism_names = locus_to_organism[['organism ID', 'organism']].drop_duplicates()
In [ ]:
source_organism_names = source_organism_names.rename(
    columns={'organism ID': 'source_organism', 'organism': 'source_organism_name'})
target_organism_names = target_organism_names.rename(
    columns={'organism ID': 'target_organism', 'organism': 'target_organism_name'})
In [ ]:
source_organism_names
In [ ]:
merged = pd.merge(network, source_organism_names)
In [ ]:
len(merged.source_organism_name.unique())
In [ ]:
merged.head(2)
In [ ]:
merged = pd.merge(merged, target_organism_names)
print(merged.shape)
print(network.shape)
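pd.merge defaults to an inner join on the shared column, so edges whose organism ID is missing from the lookup table would silently drop; a sketch of a check using indicator=True:
In [ ]:
# Sketch: look for edges that fail to match an organism name
# ('left_only' rows would be dropped by the inner-join merges above).
check = pd.merge(network, source_organism_names, how='left', indicator=True)
check['_merge'].value_counts()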
In [ ]:
merged.head()
In [ ]:
len(merged.target_organism_name.unique())
In [ ]:
print(merged.shape)
print(network.shape)
merged.tail(3)
Use summary_counts, not summary_rpkm, for the gene names.
jmatsen@waffle:/dacb/meta4_bins/analysis/assemble_summaries$ ag Ga0081607_11219 summary_rpkm.xls | head -n 10
jmatsen@waffle:/dacb/meta4_bins/analysis/assemble_summaries$ ag Ga0081607_11219 summary_counts.xls | head -n 10
2015:Methylobacter-123 (UID203) Ga0081607_11219 hypothetical protein 243652 6660 160 285587 448 89 94 4893 13
66994 47733 163 301 3 146 1851 26 53125 249288 21 14249 28 12 42296 23538 2778 1918 2061 217 173983 164307 398 450 1170 10410 30 344 2224 2164 1452 810 338 656 70 222 3475 1143 2672 1313 1246 930 54 23 9942 9603 2381 8196 29 49 23721 7808 33195 17291 5825 6609 36 83 40661 28629 17949 12227 15478 15054 125 1010 10214 66875 40225 944 11993 9572 56 9375
In [ ]:
genes = pd.read_csv('/dacb/meta4_bins/analysis/assemble_summaries/summary_counts.xls',
                    sep='\t', usecols=[1, 2])
In [ ]:
genes.tail(3)
In [ ]:
genes.tail()
In [ ]:
genes[genes['locus_tag'] == 'Ga0081607_11219']
In [ ]:
merged.head(2)
In [ ]:
source_genes = genes[['locus_tag', 'product']].rename(
    columns={'locus_tag': 'source_locus_tag', 'product': 'source_gene_product'})
target_genes = genes[['locus_tag', 'product']].rename(
    columns={'locus_tag': 'target_locus_tag', 'product': 'target_gene_product'})
In [ ]:
source_genes.head(2)
In [ ]:
network.shape
In [ ]:
merged.shape
In [ ]:
merged = pd.merge(merged, source_genes)
In [ ]:
merged.shape
In [ ]:
merged = pd.merge(merged, target_genes)
In [ ]:
merged.shape
In [ ]:
merged.head(2)
In [ ]:
merged.head(3)
In [ ]:
merged['sort'] = merged.pcor.abs()
merged = merged.sort_values(by='sort', ascending=False).drop('sort', axis=1)
In [ ]:
merged['pcor'].describe()
In [ ]:
merged.head(2)
In [ ]:
filename = '50M_network'
In [ ]:
! ls ../data
In [ ]:
dirname = '../data/50M_network/'
In [ ]:
if not os.path.exists(dirname):
    print("make dir {}".format(dirname))
    os.mkdir(dirname)
else:
    print("dir {} already exists.".format(dirname))
In [ ]:
path = dirname + filename + '.tsv'
print('save to : {}'.format(path))
merged.to_csv(path, sep='\t', index=False)
In [ ]:
# A plain CSV isn't a good idea here because of the gene product names (they can contain commas).
#merged.to_csv(dirname + filename + '.csv')
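If a CSV were ever needed anyway, quoting the text fields keeps comma-containing product names from breaking the columns (a sketch, not run here):
In [ ]:
# Sketch: quote non-numeric fields so commas inside gene product names are safe.
import csv
merged.to_csv(dirname + filename + '.csv', index=False,
              quoting=csv.QUOTE_NONNUMERIC)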
In [ ]:
merged.head(100).to_csv(dirname + filename + '--100' + '.tsv', sep='\t', index=False)
In [ ]:
os.listdir(dirname)
In [ ]:
merged.shape