In [1]:
import numpy as np
import re
import pandas as pd
import seaborn as sns
In [2]:
%matplotlib inline
# Set the default matplotlib figure size (width, height) used by the plots in this notebook.
from pylab import rcParams
rcParams['figure.figsize'] = 4, 2.5
The network directory in this share (which is still uploading, by the way) contains a pickle (data.pkl) and the code used to generate it (network.py). The lfdr_pcor object in the pickle holds the partial correlations after pruning, while pcor holds all of them (the full network). network.txt is a text version of the pruned network that can be imported into Cytoscape.
The network data was calculated from mapping to genome bins:
In [3]:
# Load the tab-separated text export of the network.
network = pd.read_csv(
    '/dacb/meta4_bins/analysis/network/network.txt',
    sep='\t',
)
In [4]:
network.head()
Out[4]:
In [5]:
# Split each locus tag (e.g. Ga0081607_11219) into the organism prefix before the
# underscore and the numeric gene suffix after it.
# The character class is spelled A-Za-z0-9 because the range A-z also covers the
# ASCII punctuation between 'Z' and 'a' ([ \ ] ^ _ `), which let the organism
# group swallow underscores.
# A pattern with two capture groups yields a two-column DataFrame (columns 0
# and 1), so the result does not depend on the pandas default for `expand`.
locus_tag_pattern = r'([A-Za-z0-9]+)_([0-9]+)'
for role in ('target', 'source'):
    tag_parts = network[role].str.extract(locus_tag_pattern)
    network[role + '_organism'] = tag_parts[0]
    network[role + '_gene'] = tag_parts[1]
In [6]:
network.head()
Out[6]:
In [7]:
def _add_locus_tag_suffix(column_name):
    """Turn a column name ending in 'source' or 'target' into '..._locus_tag'."""
    column_name = re.sub('source$', 'source_locus_tag', column_name)
    return re.sub('target$', 'target_locus_tag', column_name)


# The raw 'source' / 'target' columns hold locus tags; name them accordingly.
network = network.rename(columns=_add_locus_tag_suffix)
In [8]:
network.head(2)
Out[8]:
In [9]:
network['target_organism'].unique()
Out[9]:
In [10]:
# Flag edges whose two endpoints belong to different organisms.
network['cross_species'] = network['source_organism'].ne(network['target_organism'])
In [11]:
network.cross_species.describe()
Out[11]:
In [12]:
network.cross_species.plot.hist()
Out[12]:
In [13]:
network.weight.plot.hist()
Out[13]:
In [14]:
# Load the locus -> organism table; the file has no header row, so name the two columns here.
locus_to_organism = pd.read_csv(
    '/dacb/meta4_bins/data/genome_bins.locus_to_organism.tsv',
    sep='\t',
    names=['locus', 'organism'],
)
In [15]:
locus_to_organism.head()
Out[15]:
In [16]:
# Found a problem:
# Expected exactly 2 organism names, but we have 3:
# {'Methylobacter-123 (UID203) ', 'Methylobacter-123 (UID203)', 'Methylotenera mobilis-49 (UID203)'}
# One name appears both with and without a trailing space, so strip the whitespace.
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.str.strip.html
# str.strip removes both leading and trailing whitespace.
locus_to_organism['organism'] = locus_to_organism['organism'].str.strip()
In [17]:
# Derive the organism ID (letters followed by digits, i.e. the part of the locus
# tag before the underscore) for every locus.
# A-Za-z replaces the A-z range, which also matches the punctuation [ \ ] ^ _ `.
# expand=False makes the single-group extract return a Series rather than a
# one-column DataFrame.
locus_to_organism['organism ID'] = locus_to_organism['locus'].str.extract(
    r'([A-Za-z]+[0-9]+)_[0-9]+', expand=False)
In [18]:
# One row per (organism ID, organism name) pair; the same lookup is needed for
# both ends of an edge, so keep an independent copy for each.
organism_names = locus_to_organism[['organism ID', 'organism']].drop_duplicates()
source_organism_names = organism_names.copy()
target_organism_names = organism_names.copy()
In [19]:
# Prefix the lookup columns so each table lines up with the matching side of the network.
source_organism_names = source_organism_names.rename(
    columns={
        'organism ID': 'source_organism',
        'organism': 'source_organism_name',
    }
)
target_organism_names = target_organism_names.rename(
    columns={
        'organism ID': 'target_organism',
        'organism': 'target_organism_name',
    }
)
In [20]:
source_organism_names
Out[20]:
In [21]:
# Attach the source organism's name to every edge.
# The join key and join type are spelled out instead of relying on pd.merge's
# defaults (all shared column names, inner join), so an unrelated shared column
# can never silently become part of the key.
merged = pd.merge(network, source_organism_names, on='source_organism', how='inner')
In [22]:
merged.source_organism_name.unique()
Out[22]:
In [23]:
merged.head(2)
Out[23]:
In [24]:
# Attach the target organism's name to every edge, joining explicitly on the
# target organism ID rather than on whatever columns happen to be shared.
merged = pd.merge(merged, target_organism_names, on='target_organism', how='inner')
# An inner join drops edges with no matching organism; compare row counts to check.
print(merged.shape)
print(network.shape)
In [25]:
merged.head()
Out[25]:
In [26]:
merged.source_organism_name.unique()
Out[26]:
In [27]:
print(merged.shape)
print(network.shape)
In [28]:
merged.tail()
Out[28]:
In [29]:
# Peek at edges that run from Methylotenera mobilis to Methylobacter.
merged[
    merged['source_organism_name'].eq('Methylotenera mobilis-49 (UID203)')
    & merged['target_organism_name'].eq('Methylobacter-123 (UID203)')
].head(2)
Out[29]:
In [30]:
# Peek at edges that run from Methylobacter to Methylotenera mobilis.
merged[
    merged['source_organism_name'].eq('Methylobacter-123 (UID203)')
    & merged['target_organism_name'].eq('Methylotenera mobilis-49 (UID203)')
].head(2)
Out[30]:
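Use summary_counts, not summary_rpkm, for gene names: searching for a locus tag returns nothing from summary_rpkm.xls but finds the gene in summary_counts.xls.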
jmatsen@waffle:/dacb/meta4_bins/analysis/assemble_summaries$ ag Ga0081607_11219 summary_rpkm.xls | head -n 10
jmatsen@waffle:/dacb/meta4_bins/analysis/assemble_summaries$ ag Ga0081607_11219 summary_counts.xls | head -n 10
2015:Methylobacter-123 (UID203) Ga0081607_11219 hypothetical protein 243652 6660 160 285587 448 89 94 4893 13
66994 47733 163 301 3 146 1851 26 53125 249288 21 14249 28 12 42296 23538 2778 1918 2061 217 173983 164307 398 450 1170 10410 30 344 2224 2164 1452 810 338 656 70 222 3475 1143 2672 1313 1246 930 54 23 9942 9603 2381 8196 29 49 23721 7808 33195 17291 5825 6609 36 83 40661 28629 17949 12227 15478 15054 125 1010 10214 66875 40225 944 11993 9572 56 9375
In [31]:
# Read only the columns at positions 1 and 2 of the tab-separated counts summary.
genes = pd.read_csv(
    '/dacb/meta4_bins/analysis/assemble_summaries/summary_counts.xls',
    sep='\t',
    usecols=[1, 2],
)
In [32]:
genes.tail(3)
Out[32]:
In [33]:
genes.tail()
Out[33]:
In [34]:
genes[genes['locus_tag'] == 'Ga0081607_11219']
Out[34]:
In [35]:
merged.head(2)
Out[35]:
In [36]:
# Build one locus-tag -> product lookup per edge end, with column names that
# match the corresponding side of the merged network.
gene_products = genes[['locus_tag', 'product']]
source_genes = gene_products.rename(
    columns={'locus_tag': 'source_locus_tag', 'product': 'source_gene_product'})
target_genes = gene_products.rename(
    columns={'locus_tag': 'target_locus_tag', 'product': 'target_gene_product'})
In [37]:
source_genes.head(2)
Out[37]:
In [38]:
network.shape
Out[38]:
In [39]:
merged.shape
Out[39]:
In [40]:
# Attach the source gene's product, joining explicitly on the source locus tag
# instead of relying on pd.merge's default of every shared column name.
merged = pd.merge(merged, source_genes, on='source_locus_tag', how='inner')
In [41]:
merged.shape
Out[41]:
In [42]:
# Attach the target gene's product, joining explicitly on the target locus tag
# instead of relying on pd.merge's default of every shared column name.
merged = pd.merge(merged, target_genes, on='target_locus_tag', how='inner')
In [43]:
merged.shape
Out[43]:
In [44]:
merged.head(2)
Out[44]:
In [45]:
merged.head(3)
Out[45]:
In [46]:
# Order edges from largest to smallest absolute weight.
# DataFrame.sort(columns=...) has been removed from pandas; sort_values(by=...)
# is its replacement. The helper column exists only for sorting and is dropped again.
merged['sort'] = merged.weight.abs()
merged = merged.sort_values(by='sort', ascending=False).drop('sort', axis=1)
In [47]:
merged['weight'].describe()
Out[47]:
In [48]:
merged.head(2)
Out[48]:
In [49]:
merged['source_gene_product'].unique()[0:4]
Out[49]:
In [50]:
filename = 'Methylobacter-123--Methylotenera_mobilis-49_network'
In [51]:
! ls ../data
In [52]:
# -p keeps this cell from erroring when the notebook is re-run and the directory already exists.
! mkdir -p ../data/Methylobacter--Methylotenera
In [53]:
dirname = '../data/Methylobacter--Methylotenera/'
In [54]:
# Write the full annotated network as a tab-separated file, without the index column.
merged.to_csv(''.join([dirname, filename, '.tsv']), sep='\t', index=False)
In [55]:
# The CSV isn't a good idea because of the gene names.
#merged.to_csv(dirname + filename + '.csv')
In [56]:
# Also write just the first 100 rows of the sorted table.
first_100_edges = merged.head(100)
first_100_edges.to_csv(''.join([dirname, filename, '--100', '.tsv']), sep='\t', index=False)
In [57]:
merged.shape
Out[57]:
In [58]:
# Take the first three distinct gene products seen on each side of the sorted edge table.
top_source_products = merged['source_gene_product'].unique()[:3]
top_target_products = merged['target_gene_product'].unique()[:3]
top_genes = list(top_source_products) + list(top_target_products)
In [59]:
# Keep every edge that has one of the top gene products at either end.
source_is_top = merged['source_gene_product'].isin(top_genes)
target_is_top = merged['target_gene_product'].isin(top_genes)
has_top_genes = merged[source_is_top | target_is_top]
In [60]:
has_top_genes.shape
Out[60]:
In [61]:
# Embed the actual row count in the file name instead of a hard-coded 1268, so
# the name stays correct if the inputs or the top-gene selection change.
has_top_genes.to_csv(
    dirname + filename
    + '--some_top_genes--{}_rows'.format(len(has_top_genes)) + '.tsv',
    sep='\t', index=False)
In [62]:
merged.head()
Out[62]:
In [63]:
network.head()
Out[63]:
In [64]:
def extract_nodes(df, source=True):
    """Return the node-attribute columns for one end of each edge.

    Keeps the columns named ``source`` / ``source_*`` (or ``target`` /
    ``target_*`` when ``source`` is False) and removes that prefix, so both
    ends of an edge share one set of column names.  A bare ``source`` /
    ``target`` column becomes ``locus_tag``.

    Matching is anchored to the start of the column name, so a column that
    merely contains the word (e.g. ``resource_id``) is neither selected nor
    renamed.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge table with string column labels.
    source : bool, default True
        True selects the source columns, False the target columns.

    Returns
    -------
    pandas.DataFrame
        New frame with one row per input row; ``df`` itself is not modified.
    """
    role = 'source' if source else 'target'
    prefix = role + '_'

    def strip_role(name):
        # 'source' -> 'locus_tag'; 'source_organism' -> 'organism'
        return 'locus_tag' if name == role else name[len(prefix):]

    # Boolean mask (one entry per column) keeps the original column order.
    keep = [name == role or name.startswith(prefix) for name in df.columns]
    return df.loc[:, keep].rename(columns=strip_role)
In [65]:
extract_nodes(merged, True).head(3)
Out[65]:
In [66]:
extract_nodes(merged, False).head(3)
Out[66]:
In [67]:
# Stack the distinct source-side and target-side node rows into one table.
nodes = pd.concat(
    [
        extract_nodes(merged, source=True).drop_duplicates(),
        extract_nodes(merged, source=False).drop_duplicates(),
    ],
    axis=0,
)
In [68]:
nodes.shape
Out[68]:
In [69]:
# A gene can appear as both a source and a target, so de-duplicate the stacked table.
nodes = nodes.drop_duplicates()
In [70]:
nodes.shape
Out[70]:
In [71]:
# Write the node table next to the edge files; reuse `dirname` instead of
# repeating the output directory literal so the two cannot drift apart.
nodes.to_csv(dirname + 'nodes.tsv', sep='\t', index=False)
In [ ]: