In [ ]:
import numpy as np
import os
import re
import matplotlib as mpl
import matplotlib.pyplot as plt
# %matplotlib inline selects the notebook backend, so a separate
# matplotlib.use('Agg') call (after pyplot is imported) has no effect here.
%matplotlib inline
import pandas as pd
import seaborn as sns
In [ ]:
import sys
print(sys.version)
In [ ]:
%matplotlib inline
from pylab import rcParams
rcParams['figure.figsize'] = 4, 2.5
The network directory in this share (which is still uploading, btw) contains a pickle (data.pkl) and the code used to generate it (network.py). The lfdr_pcor object in the pickle has the partial correlations after pruning, while pcor has all of them (the full network). network.txt has a text version of the pruned network that can be imported into Cytoscape.
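For reference, a minimal sketch of loading that pickle and poking at the two objects (this assumes data.pkl unpickles to a dict-like container holding lfdr_pcor and pcor; the exact structure network.py writes may differ):
In [ ]:
# Sketch, not run here: inspect the pickled network objects.
# Assumes data.pkl holds a dict-like object with 'lfdr_pcor' (pruned)
# and 'pcor' (full) -- adjust to whatever network.py actually writes.
import pickle

with open('../waffle_network_dir/data.pkl', 'rb') as fh:
    net_data = pickle.load(fh)

print(type(net_data))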
The network data was calculated from mapping to genome bins:
Full graph:
In [ ]:
! ls -lh ../waffle_network_dir/*.tsv
In [ ]:
! wc -l ../waffle_network_dir/network.py.tsv
In [ ]:
! head -n 5 ../waffle_network_dir/network.py.tsv | csvlook -t
In [ ]:
! ls -lh ../waffle_network_dir/network.py.tsv
Make a 10k row version of the file for development.
In [ ]:
network = pd.read_csv('../waffle_network_dir/network.py.tsv', skiprows=1,
                      # skipfooter = 49995001 - 1*10**4,
                      # skipfooter = 1000,  # can't have skipfooter with dtype. :(
                      sep='\t', names=['source', 'target', 'pcor'],
                      dtype={'source': str, 'target': str, 'pcor': float})
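As an aside, nrows (unlike skipfooter) does work together with dtype, so a true 10k-row development slice could be read like this (a sketch, not used below):
In [ ]:
# Sketch: read only the first 10k edges for development;
# nrows can be combined with dtype, which skipfooter cannot.
network_dev = pd.read_csv('../waffle_network_dir/network.py.tsv', skiprows=1,
                          nrows=10**4, sep='\t',
                          names=['source', 'target', 'pcor'],
                          dtype={'source': str, 'target': str, 'pcor': float})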
In [ ]:
network.shape
In [ ]:
network.head()
In [ ]:
def label_associations(row):
    if row['pcor'] > 0:
        val = 'positive'
    elif row['pcor'] < 0:
        val = 'negative'
    elif row['pcor'] == 0:
        val = 'drop me'
    return val
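Row-wise apply() is slow on a frame this size; a vectorized sketch of the same labeling ('positive'/'negative'/'drop me') with np.select would be:
In [ ]:
# Sketch: vectorized version of label_associations -- maps the sign of
# pcor straight to the labels instead of calling a Python function per row.
network['association'] = np.select(
    [network['pcor'] > 0, network['pcor'] < 0],
    ['positive', 'negative'],
    default='drop me')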
In [ ]:
network['association'] = network.apply(label_associations, axis=1)
In [ ]:
network['association'].unique()
In [ ]:
print("shape before dropping rows with pcor == 0: {}".format(network.shape))
network = network[network['association'] != 'drop me']
print("shape after dropping rows with pcor == 0: {}".format(network.shape))
In [ ]:
network.head(3)
I am only using 3.7% of Waffle's memory at the beginning :)
In [ ]:
! top -b -n 1 -o %MEM | head
In [ ]:
network['target_organism'] = network['target'].str.extract('([A-Za-z0-9]+)_[0-9]+')
network['target_gene'] = network['target'].str.extract('[A-Za-z0-9]+_([0-9]+)')
network['source_organism'] = network['source'].str.extract('([A-Za-z0-9]+)_[0-9]+')
network['source_gene'] = network['source'].str.extract('[A-Za-z0-9]+_([0-9]+)')
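A quick sanity check that every locus tag matched the ORGANISM_GENE pattern (rows that failed to match come back as NaN):
In [ ]:
# Any non-zero count here means a locus tag didn't fit the expected pattern.
network[['source_organism', 'source_gene',
         'target_organism', 'target_gene']].isnull().sum()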
In [ ]:
network.head()
In [ ]:
network = network.rename(columns=lambda x: re.sub('source$', 'source_locus_tag', x))
network = network.rename(columns=lambda x: re.sub('target$', 'target_locus_tag', x))
In [ ]:
network.head(2)
In [ ]:
network['target_organism'].unique()
In [ ]:
len(network['target_organism'].unique())
In [ ]:
network['cross_species'] = network['source_organism'] != network['target_organism']
In [ ]:
network.cross_species.describe()
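describe() on a boolean gives counts; value_counts(normalize=True) gives the fraction of edges that span two organisms directly:
In [ ]:
# Fraction of edges whose source and target come from different organisms.
network['cross_species'].value_counts(normalize=True)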
In [ ]:
network.cross_species.plot.hist()
In [ ]:
network.pcor.plot.hist()
In [ ]:
fig, ax = plt.subplots(1, 1, figsize=(4, 3))
plt.hist(network.pcor)
plt.yscale('log', nonposy='clip')
plt.xlabel('partial correlation value')
plt.ylabel('# edges')
plt.tight_layout()
plt.savefig('161209_hist_of_pcor_values.pdf')
plt.savefig('161209_hist_of_pcor_values.png', dpi=600)
In [ ]:
fig, ax = plt.subplots(1, 1, figsize=(5, 2.5))
plt.hist(network.pcor, 50)
plt.yscale('log', nonposy='clip')
plt.xlabel('partial correlation value')
plt.ylabel('# edges')
plt.tight_layout()
plt.savefig('161209_hist_of_pcor_values_50_bins.pdf')
plt.savefig('161209_hist_of_pcor_values_50_bins.png', dpi=600)
In [ ]:
locus_to_organism = pd.read_csv('/dacb/meta4_bins/data/genome_bins.locus_to_organism.tsv', sep='\t',
                                names=['locus', 'organism'])
In [ ]:
locus_to_organism.head()
In [ ]:
# Found a problem:
# Expected exactly 2 organism names here, but we have 3 because of trailing whitespace:
# {'Methylobacter-123 (UID203) ', 'Methylobacter-123 (UID203)', 'Methylotenera mobilis-49 (UID203)'}
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.str.strip.html
# str.strip() removes both leading and trailing whitespace :)
locus_to_organism['organism'] = locus_to_organism['organism'].str.strip()
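A quick check that stripping the whitespace collapsed the duplicated names (the number of unique organism names should drop back to the expected count):
In [ ]:
# After str.strip(), each organism should appear under exactly one name.
locus_to_organism['organism'].nunique()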
In [ ]:
locus_to_organism['organism ID'] = locus_to_organism['locus'].str.extract('([A-Za-z]+[0-9]+)_[0-9]+')
In [ ]:
source_organism_names = locus_to_organism[['organism ID', 'organism']].drop_duplicates()
target_organism_names = locus_to_organism[['organism ID', 'organism']].drop_duplicates()
In [ ]:
source_organism_names = source_organism_names.rename(
    columns={'organism ID': 'source_organism', 'organism': 'source_organism_name'})
target_organism_names = target_organism_names.rename(
    columns={'organism ID': 'target_organism', 'organism': 'target_organism_name'})
In [ ]:
source_organism_names
In [ ]:
merged = pd.merge(network, source_organism_names)
In [ ]:
len(merged.source_organism_name.unique())
In [ ]:
merged.head(2)
In [ ]:
merged = pd.merge(merged, target_organism_names)
print(merged.shape)
print(network.shape)
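pd.merge defaults to an inner join on the shared column, so edges whose organism ID is missing from the lookup table would silently drop; a sketch of a check using indicator=True:
In [ ]:
# Sketch: look for edges that fail to match an organism name
# ('left_only' rows would be dropped by the inner-join merges above).
check = pd.merge(network, source_organism_names, how='left', indicator=True)
check['_merge'].value_counts()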
In [ ]:
merged.head()
In [ ]:
len(merged.target_organism_name.unique())
In [ ]:
print(merged.shape)
print(network.shape)
merged.tail(3)
Use summary_counts, not summary_rpkm, for the gene names.
jmatsen@waffle:/dacb/meta4_bins/analysis/assemble_summaries$ ag Ga0081607_11219 summary_rpkm.xls | head -n 10
jmatsen@waffle:/dacb/meta4_bins/analysis/assemble_summaries$ ag Ga0081607_11219 summary_counts.xls | head -n 10
2015:Methylobacter-123 (UID203) Ga0081607_11219 hypothetical protein 243652 6660 160 285587 448 89 94 4893 13
66994 47733 163 301 3 146 1851 26 53125 249288 21 14249 28 12 42296 23538 2778 1918 2061 217 173983 164307 398 450 1170 10410 30 344 2224 2164 1452 810 338 656 70 222 3475 1143 2672 1313 1246 930 54 23 9942 9603 2381 8196 29 49 23721 7808 33195 17291 5825 6609 36 83 40661 28629 17949 12227 15478 15054 125 1010 10214 66875 40225 944 11993 9572 56 9375
In [ ]:
genes = pd.read_csv('/dacb/meta4_bins/analysis/assemble_summaries/summary_counts.xls',
                    sep='\t', usecols=[1, 2])
In [ ]:
genes.tail(3)
In [ ]:
genes.tail()
In [ ]:
genes[genes['locus_tag'] == 'Ga0081607_11219']
In [ ]:
merged.head(2)
In [ ]:
source_genes = genes[['locus_tag', 'product']].rename(
    columns={'locus_tag': 'source_locus_tag', 'product': 'source_gene_product'})
target_genes = genes[['locus_tag', 'product']].rename(
    columns={'locus_tag': 'target_locus_tag', 'product': 'target_gene_product'})
In [ ]:
source_genes.head(2)
In [ ]:
network.shape
In [ ]:
merged.shape
In [ ]:
merged = pd.merge(merged, source_genes)
In [ ]:
merged.shape
In [ ]:
merged = pd.merge(merged, target_genes)
In [ ]:
merged.shape
In [ ]:
merged.head(2)
In [ ]:
merged.head(3)
In [ ]:
merged['sort'] = merged.pcor.abs()
merged = merged.sort_values(by='sort', ascending=False).drop('sort', axis=1)
In [ ]:
merged['pcor'].describe()
In [ ]:
merged.head(2)
In [ ]:
filename = '50M_network'
In [ ]:
! ls ../data
In [ ]:
dirname = '../data/50M_network/'
In [ ]:
if not os.path.exists(dirname):
    print("make dir {}".format(dirname))
    os.mkdir(dirname)
else:
    print("dir {} already exists.".format(dirname))
In [ ]:
path = dirname + filename + '.tsv'
print('save to : {}'.format(path))
merged.to_csv(path, sep='\t', index=False)
In [ ]:
# A plain CSV isn't a good idea here because of the gene product names (they can contain commas).
#merged.to_csv(dirname + filename + '.csv')
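If a CSV were ever needed anyway, quoting the text fields keeps comma-containing product names from breaking the columns (a sketch, not run here):
In [ ]:
# Sketch: quote non-numeric fields so commas inside gene product names are safe.
import csv
merged.to_csv(dirname + filename + '.csv', index=False,
              quoting=csv.QUOTE_NONNUMERIC)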
In [ ]:
merged.head(100).to_csv(dirname + filename + '--100' + '.tsv', sep='\t', index=False)
In [ ]:
os.listdir(dirname)
In [ ]:
merged.shape