In [1]:
import numpy as np
import re

import pandas as pd
import seaborn as sns


/home/jmatsen/miniconda2/envs/neo4j/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [2]:
%matplotlib inline

from pylab import rcParams
rcParams['figure.figsize'] = 4, 2.5

The network directory in this share (which is still uploading, btw) contains a pickle (data.pkl) and the code used to generate it (network.py). The lfdr_pcor object in the pickle has the partial correlations after pruning, but poor has all of them (the full network). network.txt has a text version of the network (after pruning) that can be sucked into cytoscape.

The network data was calculated from mapping to genome bins:

jmatsen@waffle:/dacb/meta4_bins/analysis/network$ column -t network.txt | head -n 5 source target weight association Ga0081607_11219 Ga0081607_115212 0.01928 positive Ga0081607_11219 Ga0081607_116221 0.01995 positive Ga0081607_11219 Ga0081607_107914 0.02173 positive Ga0081607_11219 Ga0081607_115213 0.02291 positive

In [3]:
network = pd.read_csv('/dacb/meta4_bins/analysis/network/network.txt', 
                      sep='\t')

In [4]:
network.head()


Out[4]:
source target weight association
0 Ga0081607_11219 Ga0081607_115212 0.01928 positive
1 Ga0081607_11219 Ga0081607_116221 0.01995 positive
2 Ga0081607_11219 Ga0081607_107914 0.02173 positive
3 Ga0081607_11219 Ga0081607_115213 0.02291 positive
4 Ga0081607_108214 Ga0081607_11235 0.01928 positive

In [5]:
network['target_organism'] = network['target'].str.extract('([A-z0-9]+)_[0-9]+')
network['target_gene'] = network['target'].str.extract('[A-z0-9]+_([0-9]+)')
network['source_organism'] = network['source'].str.extract('([A-z0-9]+)_[0-9]+')
network['source_gene'] = network['source'].str.extract('[A-z0-9]+_([0-9]+)')


/home/jmatsen/miniconda2/envs/neo4j/lib/python2.7/site-packages/ipykernel/__main__.py:1: FutureWarning: currently extract(expand=None) means expand=False (return Index/Series/DataFrame) but in a future version of pandas this will be changed to expand=True (return DataFrame)
  if __name__ == '__main__':
/home/jmatsen/miniconda2/envs/neo4j/lib/python2.7/site-packages/ipykernel/__main__.py:2: FutureWarning: currently extract(expand=None) means expand=False (return Index/Series/DataFrame) but in a future version of pandas this will be changed to expand=True (return DataFrame)
  from ipykernel import kernelapp as app
/home/jmatsen/miniconda2/envs/neo4j/lib/python2.7/site-packages/ipykernel/__main__.py:3: FutureWarning: currently extract(expand=None) means expand=False (return Index/Series/DataFrame) but in a future version of pandas this will be changed to expand=True (return DataFrame)
  app.launch_new_instance()
/home/jmatsen/miniconda2/envs/neo4j/lib/python2.7/site-packages/ipykernel/__main__.py:4: FutureWarning: currently extract(expand=None) means expand=False (return Index/Series/DataFrame) but in a future version of pandas this will be changed to expand=True (return DataFrame)

In [6]:
network.head()


Out[6]:
source target weight association target_organism target_gene source_organism source_gene
0 Ga0081607_11219 Ga0081607_115212 0.01928 positive Ga0081607 115212 Ga0081607 11219
1 Ga0081607_11219 Ga0081607_116221 0.01995 positive Ga0081607 116221 Ga0081607 11219
2 Ga0081607_11219 Ga0081607_107914 0.02173 positive Ga0081607 107914 Ga0081607 11219
3 Ga0081607_11219 Ga0081607_115213 0.02291 positive Ga0081607 115213 Ga0081607 11219
4 Ga0081607_108214 Ga0081607_11235 0.01928 positive Ga0081607 11235 Ga0081607 108214

In [7]:
network = network.rename(columns=lambda x: re.sub('source$', 'source_locus_tag', x))
network = network.rename(columns=lambda x: re.sub('target$', 'target_locus_tag', x))

In [8]:
network.head(2)


Out[8]:
source_locus_tag target_locus_tag weight association target_organism target_gene source_organism source_gene
0 Ga0081607_11219 Ga0081607_115212 0.01928 positive Ga0081607 115212 Ga0081607 11219
1 Ga0081607_11219 Ga0081607_116221 0.01995 positive Ga0081607 116221 Ga0081607 11219

In [9]:
network['target_organism'].unique()


Out[9]:
array(['Ga0081607', 'Ga0081629'], dtype=object)

In [10]:
network['cross_species'] = network['source_organism'] != network['target_organism']

In [11]:
network.cross_species.describe()


Out[11]:
count      2486
unique        2
top       False
freq       2252
Name: cross_species, dtype: object

In [12]:
network.cross_species.plot.hist()


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f024d930190>

In [13]:
network.weight.plot.hist()


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f024b991190>
jmatsen@waffle:/dacb/meta4_bins/data$ head -n 4 genome_bins.locus_to_organism.tsv Ga0081607_1001 Methylobacter-123 (UID203) Ga0081607_1002 Methylobacter-123 (UID203) Ga0081607_1003 Methylobacter-123 (UID203) Ga0081607_1004 Methylobacter-123 (UID203)

In [14]:
locus_to_organism = pd.read_csv('/dacb/meta4_bins/data/genome_bins.locus_to_organism.tsv', sep='\t',
                               names=['locus', 'organism'])

In [15]:
locus_to_organism.head()


Out[15]:
locus organism
0 Ga0081607_1001 Methylobacter-123 (UID203)
1 Ga0081607_1002 Methylobacter-123 (UID203)
2 Ga0081607_1003 Methylobacter-123 (UID203)
3 Ga0081607_1004 Methylobacter-123 (UID203)
4 Ga0081607_1005 Methylobacter-123 (UID203)

In [16]:
# Found a problem: 
# Expected exactly 2 organsm names, but we have 3
#   {'Methylobacter-123 (UID203) ', 'Methylobacter-123 (UID203)', 'Methylotenera mobilis-49 (UID203)'}
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.str.strip.html
#   strips both left and right whitespace  :) 
locus_to_organism['organism'] = locus_to_organism['organism'].str.strip()

In [17]:
locus_to_organism['organism ID'] = locus_to_organism['locus'].str.extract('([A-z]+[0-9]+)_[0-9]+')


/home/jmatsen/miniconda2/envs/neo4j/lib/python2.7/site-packages/ipykernel/__main__.py:1: FutureWarning: currently extract(expand=None) means expand=False (return Index/Series/DataFrame) but in a future version of pandas this will be changed to expand=True (return DataFrame)
  if __name__ == '__main__':

In [18]:
source_organism_names = locus_to_organism[['organism ID', 'organism']].drop_duplicates()
target_organism_names = locus_to_organism[['organism ID', 'organism']].drop_duplicates()

In [19]:
source_organism_names = source_organism_names.rename(columns={'organism ID':'source_organism', 'organism':'source_organism_name'})
target_organism_names = target_organism_names.rename(columns={'organism ID':'target_organism', 'organism':'target_organism_name'})

In [20]:
source_organism_names


Out[20]:
source_organism source_organism_name
0 Ga0081607 Methylobacter-123 (UID203)
260 Ga0081608 Methylotenera mobilis-123 (UID3888)
560 Ga0081609 Acidovorax-123 (UID4000)
578 Ga0081610 Rhodocyclaceae-127 (UID3972)
938 Ga0081611 Methylophilus methylotrophus-127-1 (UID203)
1080 Ga0081612 Methylophilus methylotrophus-127-2 (UID203)
1307 Ga0081613 Acidovorax-127 (UID4105)
1367 Ga0081614 Methylobacter tundripaludum-129 (UID203)
1454 Ga0081615 Opitutae-129 (UID2982)
1647 Ga0081616 Burkholderiales-129 (UID4000)
2031 Ga0081617 Methylophilus methylotrophus-129-1 (UID203)
2146 Ga0081618 Methylophilus methylotrophus-129-2 (UID203)
2233 Ga0081619 Bacteriovorax-21 (UID3187)
2285 Ga0081620 Bacteria-21 (UID203)
2539 Ga0081621 Acidovorax-21 (UID4105)
2985 Ga0081622 Methylosarcina-21 (UID203)
3394 Ga0081623 Methylococcaceae-21 (UID203)
3948 Ga0081624 Methylophilaceae-40 (UID3888)
4188 Ga0081625 Opititae-40 (UID2982)
4212 Ga0081626 Methylophilaceae-49 (UID203)
4413 Ga0081627 Methylotenera mobilis-49 (UID3888)
4639 Ga0081628 Bacteroidetes-49 (UID2591)
4709 Ga0081629 Methylotenera mobilis-49 (UID203)
4931 Ga0081630 Bacteriovorax-49 (UID3187)
5308 Ga0081631 Rhodocyclaceae-49 (UID3972)
5498 Ga0081632 Flavobacteriaceae-49 (UID2817)
5721 Ga0081633 Methylophilus methylotrophus-55 (UID3888)
5738 Ga0081634 Methylosarcina lacus-55 (UID4274)
6181 Ga0081635 Burkholderiales-55 (UID4000)
6234 Ga0081636 Methylococcaceae-55 (UID203)
6288 Ga0081637 Methylophilaceae-55 (UID3888)
6312 Ga0081638 Methylovulum miyakonense-55 (UID4274)
6645 Ga0081639 Bacteriovorax-63 (UID3187)
6654 Ga0081640 Methylotenera mobilis-63 (UID3888)
6945 Ga0081641 Methylobacter-69 (UID4274)
7125 Ga0081642 Methylophilus methylotrophus-69 (UID3888)
7156 Ga0081643 Methylosarcina lacus-69 (UID4274)
7373 Ga0081644 Acidovora-69x (UID4105)
7818 Ga0081645 Acidovorax-75 (UID4105)
8086 Ga0081646 Bacteroidetes-76 (UID2591)
8337 Ga0081647 Methylotenera mobilis-76-1 (UID203)
8545 Ga0081648 Burkholderiales-76 (UID4002)
8652 Ga0081649 Methylotenera mobilis-76-2 (UID203)
9061 Ga0081650 Methylophilus methylotrophus-79 (UID3888)
9078 Ga0081651 Acidovorax-79 (UID4214)
9176 Ga0081652 Flavobacteriaceae-79-1 (UID2817)
9383 Ga0081653 Flavobacteriaceae-79-2 (UID2817)
9644 Ga0081654 Bacteroidetes-8 (UID2591)
9987 Ga0081656 Bacteriovora-8x (UID3187)
10277 Ga0081657 Methylobacte-98r (UID4274)
10403 Ga0081658 Methylophilus methylotrophus-98 (UID3888)
10434 Ga0081659 Acidovorax-98 (UID4105)
10793 Ga0081655 Methylophilaceae-8 (UID203)

In [21]:
merged = pd.merge(network, source_organism_names)

In [22]:
merged.source_organism_name.unique()


Out[22]:
array(['Methylobacter-123 (UID203)', 'Methylotenera mobilis-49 (UID203)'], dtype=object)

In [23]:
merged.head(2)


Out[23]:
source_locus_tag target_locus_tag weight association target_organism target_gene source_organism source_gene cross_species source_organism_name
0 Ga0081607_11219 Ga0081607_115212 0.01928 positive Ga0081607 115212 Ga0081607 11219 False Methylobacter-123 (UID203)
1 Ga0081607_11219 Ga0081607_116221 0.01995 positive Ga0081607 116221 Ga0081607 11219 False Methylobacter-123 (UID203)

In [24]:
merged = pd.merge(merged, target_organism_names)
print(merged.shape)
print(network.shape)


(2486, 11)
(2486, 9)

In [25]:
merged.head()


Out[25]:
source_locus_tag target_locus_tag weight association target_organism target_gene source_organism source_gene cross_species source_organism_name target_organism_name
0 Ga0081607_11219 Ga0081607_115212 0.01928 positive Ga0081607 115212 Ga0081607 11219 False Methylobacter-123 (UID203) Methylobacter-123 (UID203)
1 Ga0081607_11219 Ga0081607_116221 0.01995 positive Ga0081607 116221 Ga0081607 11219 False Methylobacter-123 (UID203) Methylobacter-123 (UID203)
2 Ga0081607_11219 Ga0081607_107914 0.02173 positive Ga0081607 107914 Ga0081607 11219 False Methylobacter-123 (UID203) Methylobacter-123 (UID203)
3 Ga0081607_11219 Ga0081607_115213 0.02291 positive Ga0081607 115213 Ga0081607 11219 False Methylobacter-123 (UID203) Methylobacter-123 (UID203)
4 Ga0081607_108214 Ga0081607_11235 0.01928 positive Ga0081607 11235 Ga0081607 108214 False Methylobacter-123 (UID203) Methylobacter-123 (UID203)

In [26]:
merged.source_organism_name.unique()


Out[26]:
array(['Methylobacter-123 (UID203)', 'Methylotenera mobilis-49 (UID203)'], dtype=object)

In [27]:
print(merged.shape)
print(network.shape)


(2486, 11)
(2486, 9)

In [28]:
merged.tail()


Out[28]:
source_locus_tag target_locus_tag weight association target_organism target_gene source_organism source_gene cross_species source_organism_name target_organism_name
2481 Ga0081629_11451 Ga0081629_11135 0.03896 positive Ga0081629 11135 Ga0081629 11451 False Methylotenera mobilis-49 (UID203) Methylotenera mobilis-49 (UID203)
2482 Ga0081629_11451 Ga0081629_12205 0.04051 positive Ga0081629 12205 Ga0081629 11451 False Methylotenera mobilis-49 (UID203) Methylotenera mobilis-49 (UID203)
2483 Ga0081629_11451 Ga0081629_10266 0.04218 positive Ga0081629 10266 Ga0081629 11451 False Methylotenera mobilis-49 (UID203) Methylotenera mobilis-49 (UID203)
2484 Ga0081629_11451 Ga0081629_11844 0.04379 positive Ga0081629 11844 Ga0081629 11451 False Methylotenera mobilis-49 (UID203) Methylotenera mobilis-49 (UID203)
2485 Ga0081629_11451 Ga0081629_10632 0.04406 positive Ga0081629 10632 Ga0081629 11451 False Methylotenera mobilis-49 (UID203) Methylotenera mobilis-49 (UID203)

In [29]:
# # d[(d['x']>2) & (d['y']>7)]
merged[(merged['source_organism_name'] == 'Methylotenera mobilis-49 (UID203)') &
       (merged['target_organism_name'] == 'Methylobacter-123 (UID203)')].head(2)


Out[29]:
source_locus_tag target_locus_tag weight association target_organism target_gene source_organism source_gene cross_species source_organism_name target_organism_name
1949 Ga0081629_12073 Ga0081607_11876 -0.01917 negative Ga0081607 11876 Ga0081629 12073 True Methylotenera mobilis-49 (UID203) Methylobacter-123 (UID203)
1950 Ga0081629_12073 Ga0081607_111619 0.02039 positive Ga0081607 111619 Ga0081629 12073 True Methylotenera mobilis-49 (UID203) Methylobacter-123 (UID203)

In [30]:
merged[(merged['source_organism_name'] == 'Methylobacter-123 (UID203)') &
       (merged['target_organism_name'] == 'Methylotenera mobilis-49 (UID203)')].head(2)


Out[30]:
source_locus_tag target_locus_tag weight association target_organism target_gene source_organism source_gene cross_species source_organism_name target_organism_name
2086 Ga0081607_108214 Ga0081629_10591 -0.02555 negative Ga0081629 10591 Ga0081607 108214 True Methylobacter-123 (UID203) Methylotenera mobilis-49 (UID203)
2087 Ga0081607_10701 Ga0081629_12205 0.02161 positive Ga0081629 12205 Ga0081607 10701 True Methylobacter-123 (UID203) Methylotenera mobilis-49 (UID203)

Use summary_counts, not summary_rpkm for gene names.

jmatsen@waffle:/dacb/meta4_bins/analysis/assemble_summaries$ ag Ga0081607_11219 summary_rpkm.xls | head -n 10 jmatsen@waffle:/dacb/meta4_bins/analysis/assemble_summaries$ ag Ga0081607_11219 summary_counts.xls | head -n 10 2015:Methylobacter-123 (UID203) Ga0081607_11219 hypothetical protein 243652 6660 160 285587 448 89 94 4893 13 66994 47733 163 301 3 146 1851 26 53125 249288 21 14249 28 12 42296 23538 2778 1918 2061 217 173983 164307 398 450 1170 10410 30 344 2224 2164 1452 810 338 656 70 222 3475 1143 2672 1313 1246 930 54 23 9942 9603 2381 8196 29 49 23721 7808 33195 17291 5825 6609 36 83 40661 28629 17949 12227 15478 15054 125 1010 10214 66875 40225 944 11993 9572 56 9375


In [31]:
genes = pd.read_csv('/dacb/meta4_bins/analysis/assemble_summaries/summary_counts.xls', sep = '\t', usecols=[1, 2])

In [32]:
genes.tail(3)


Out[32]:
locus_tag product
169834 Ga0081655_13359 sulfur compound chelating protein SoxZ
169835 Ga0081655_133510 thiosulfate-binding protein SoxY
169836 Ga0081655_133511 predicted Zn-dependent protease
genes['locus'] = genes['locus_tag'].str.extract('([A-z0-9]+)_[0-9]+') genes['target_gene'] = genes['locus_tag'].str.extract('[A-z0-9]+_([0-9]+)')

In [33]:
genes.tail()


Out[33]:
locus_tag product
169832 Ga0081655_13357 alkyl hydroperoxide reductase subunit F
169833 Ga0081655_13358 peroxiredoxin (alkyl hydroperoxide reductase s...
169834 Ga0081655_13359 sulfur compound chelating protein SoxZ
169835 Ga0081655_133510 thiosulfate-binding protein SoxY
169836 Ga0081655_133511 predicted Zn-dependent protease

In [34]:
genes[genes['locus_tag'] == 'Ga0081607_11219']


Out[34]:
locus_tag product
2013 Ga0081607_11219 hypothetical protein

In [35]:
merged.head(2)


Out[35]:
source_locus_tag target_locus_tag weight association target_organism target_gene source_organism source_gene cross_species source_organism_name target_organism_name
0 Ga0081607_11219 Ga0081607_115212 0.01928 positive Ga0081607 115212 Ga0081607 11219 False Methylobacter-123 (UID203) Methylobacter-123 (UID203)
1 Ga0081607_11219 Ga0081607_116221 0.01995 positive Ga0081607 116221 Ga0081607 11219 False Methylobacter-123 (UID203) Methylobacter-123 (UID203)

In [36]:
source_genes = genes[['locus_tag', 'product']].rename(columns={'locus_tag':'source_locus_tag', 'product':'source_gene_product'})
target_genes = genes[['locus_tag', 'product']].rename(columns={'locus_tag':'target_locus_tag', 'product':'target_gene_product'})

In [37]:
source_genes.head(2)


Out[37]:
source_locus_tag source_gene_product
0 Ga0081607_10011 DNA gyrase B subunit, carboxyl terminus
1 Ga0081607_10012 Putative addiction module component

In [38]:
network.shape


Out[38]:
(2486, 9)

In [39]:
merged.shape


Out[39]:
(2486, 11)

In [40]:
merged = pd.merge(merged, source_genes)

In [41]:
merged.shape


Out[41]:
(2486, 12)

In [42]:
merged = pd.merge(merged, target_genes)

In [43]:
merged.shape


Out[43]:
(2486, 13)

In [44]:
merged.head(2)


Out[44]:
source_locus_tag target_locus_tag weight association target_organism target_gene source_organism source_gene cross_species source_organism_name target_organism_name source_gene_product target_gene_product
0 Ga0081607_11219 Ga0081607_115212 0.01928 positive Ga0081607 115212 Ga0081607 11219 False Methylobacter-123 (UID203) Methylobacter-123 (UID203) hypothetical protein flagellar protein FlaG
1 Ga0081607_115213 Ga0081607_115212 0.02512 positive Ga0081607 115212 Ga0081607 115213 False Methylobacter-123 (UID203) Methylobacter-123 (UID203) flagellin flagellar protein FlaG

In [45]:
merged.head(3)


Out[45]:
source_locus_tag target_locus_tag weight association target_organism target_gene source_organism source_gene cross_species source_organism_name target_organism_name source_gene_product target_gene_product
0 Ga0081607_11219 Ga0081607_115212 0.01928 positive Ga0081607 115212 Ga0081607 11219 False Methylobacter-123 (UID203) Methylobacter-123 (UID203) hypothetical protein flagellar protein FlaG
1 Ga0081607_115213 Ga0081607_115212 0.02512 positive Ga0081607 115212 Ga0081607 115213 False Methylobacter-123 (UID203) Methylobacter-123 (UID203) flagellin flagellar protein FlaG
2 Ga0081607_109111 Ga0081607_115212 0.01954 positive Ga0081607 115212 Ga0081607 109111 False Methylobacter-123 (UID203) Methylobacter-123 (UID203) bacterial nucleoid protein HU beta subunit flagellar protein FlaG

In [46]:
merged['sort'] = merged.weight.abs()
merged = merged.sort(columns='sort', ascending=False).drop('sort', axis=1)


/home/jmatsen/miniconda2/envs/neo4j/lib/python2.7/site-packages/ipykernel/__main__.py:2: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  from ipykernel import kernelapp as app

In [47]:
merged['weight'].describe()


Out[47]:
count    2486.000000
mean        0.020346
std         0.016258
min        -0.041380
25%         0.020020
50%         0.022290
75%         0.026930
max         0.074740
Name: weight, dtype: float64

In [48]:
merged.head(2)


Out[48]:
source_locus_tag target_locus_tag weight association target_organism target_gene source_organism source_gene cross_species source_organism_name target_organism_name source_gene_product target_gene_product
417 Ga0081607_113410 Ga0081607_10311 0.07474 positive Ga0081607 10311 Ga0081607 113410 False Methylobacter-123 (UID203) Methylobacter-123 (UID203) hypothetical protein hypothetical protein
1808 Ga0081607_10311 Ga0081607_113410 0.07474 positive Ga0081607 113410 Ga0081607 10311 False Methylobacter-123 (UID203) Methylobacter-123 (UID203) hypothetical protein hypothetical protein

In [49]:
merged['source_gene_product'].unique()[0:4]


Out[49]:
array(['hypothetical protein', 'type IV pilus assembly protein PilA',
       'LSU ribosomal protein L35P',
       'nitric oxide reductase, NorC subunit apoprotein'], dtype=object)

In [50]:
filename = 'Methylobacter-123--Methylotenera_mobilis-49_network'

In [51]:
! ls ../data


gene_names.tsv		      network_broken.csv  network_with_info.txt
Methylobacter--Methylotenera  network_broken.tsv
network_broken_20.csv	      network.txt

In [52]:
! mkdir ../data/Methylobacter--Methylotenera


mkdir: cannot create directory ‘../data/Methylobacter--Methylotenera’: File exists

In [53]:
dirname = '../data/Methylobacter--Methylotenera/'

In [54]:
merged.to_csv(dirname + filename + '.tsv', sep='\t', index=False)

In [55]:
# The CSV isn't a good idea because of the gene names. 
#merged.to_csv(dirname + filename + '.csv')

In [56]:
merged.head(100).to_csv(dirname + filename + '--100' + '.tsv', sep='\t', index=False)

In [57]:
merged.shape


Out[57]:
(2486, 13)

In [58]:
top_genes = list(merged['source_gene_product'].unique()[0:3]) + list(merged['target_gene_product'].unique()[0:3])

In [59]:
has_top_genes = merged[(merged['source_gene_product'].isin(top_genes)) | (merged['target_gene_product'].isin(top_genes))]

In [60]:
has_top_genes.shape


Out[60]:
(1268, 13)

In [61]:
has_top_genes.to_csv(dirname + filename + '--some_top_genes--1268_rows' + '.tsv', sep='\t', index=False)

Make a file with just nodes.

Start with the merged stuff since I have to take the source and target nodes separately no matter which DF I start with.


In [62]:
merged.head()


Out[62]:
source_locus_tag target_locus_tag weight association target_organism target_gene source_organism source_gene cross_species source_organism_name target_organism_name source_gene_product target_gene_product
417 Ga0081607_113410 Ga0081607_10311 0.07474 positive Ga0081607 10311 Ga0081607 113410 False Methylobacter-123 (UID203) Methylobacter-123 (UID203) hypothetical protein hypothetical protein
1808 Ga0081607_10311 Ga0081607_113410 0.07474 positive Ga0081607 113410 Ga0081607 10311 False Methylobacter-123 (UID203) Methylobacter-123 (UID203) hypothetical protein hypothetical protein
1641 Ga0081629_11135 Ga0081629_10266 0.07207 positive Ga0081629 10266 Ga0081629 11135 False Methylotenera mobilis-49 (UID203) Methylotenera mobilis-49 (UID203) hypothetical protein type IV pilus assembly protein PilA
1360 Ga0081629_10266 Ga0081629_11135 0.07207 positive Ga0081629 11135 Ga0081629 10266 False Methylotenera mobilis-49 (UID203) Methylotenera mobilis-49 (UID203) type IV pilus assembly protein PilA hypothetical protein
1388 Ga0081629_11135 Ga0081629_12085 0.06670 positive Ga0081629 12085 Ga0081629 11135 False Methylotenera mobilis-49 (UID203) Methylotenera mobilis-49 (UID203) hypothetical protein hypothetical protein

In [63]:
network.head()


Out[63]:
source_locus_tag target_locus_tag weight association target_organism target_gene source_organism source_gene cross_species
0 Ga0081607_11219 Ga0081607_115212 0.01928 positive Ga0081607 115212 Ga0081607 11219 False
1 Ga0081607_11219 Ga0081607_116221 0.01995 positive Ga0081607 116221 Ga0081607 11219 False
2 Ga0081607_11219 Ga0081607_107914 0.02173 positive Ga0081607 107914 Ga0081607 11219 False
3 Ga0081607_11219 Ga0081607_115213 0.02291 positive Ga0081607 115213 Ga0081607 11219 False
4 Ga0081607_108214 Ga0081607_11235 0.01928 positive Ga0081607 11235 Ga0081607 108214 False

In [64]:
def extract_nodes(df, source=True):
    if source:
        string = 'source'
    else:
        string = 'target'
    cols = df.columns.str.contains(string)
    df_subset = df.iloc[:, cols]
    df_subset = df_subset.rename(columns=lambda x: x.replace(string + "_", ''))
    df_subset = df_subset.rename(columns=lambda x: x.replace(string, 'locus_tag'))
    return df_subset

In [65]:
extract_nodes(merged, True).head(3)


Out[65]:
locus_tag organism gene organism_name gene_product
417 Ga0081607_113410 Ga0081607 113410 Methylobacter-123 (UID203) hypothetical protein
1808 Ga0081607_10311 Ga0081607 10311 Methylobacter-123 (UID203) hypothetical protein
1641 Ga0081629_11135 Ga0081629 11135 Methylotenera mobilis-49 (UID203) hypothetical protein

In [66]:
extract_nodes(merged, False).head(3)


Out[66]:
locus_tag organism gene organism_name gene_product
417 Ga0081607_10311 Ga0081607 10311 Methylobacter-123 (UID203) hypothetical protein
1808 Ga0081607_113410 Ga0081607 113410 Methylobacter-123 (UID203) hypothetical protein
1641 Ga0081629_10266 Ga0081629 10266 Methylotenera mobilis-49 (UID203) type IV pilus assembly protein PilA

In [67]:
nodes = pd.concat([extract_nodes(merged, True).drop_duplicates(), 
                   extract_nodes(merged, False).drop_duplicates()], axis = 0)

In [68]:
nodes.shape


Out[68]:
(510, 5)

In [69]:
nodes.drop_duplicates(inplace=True)

In [70]:
nodes.shape


Out[70]:
(255, 5)

In [71]:
nodes.to_csv('../data/Methylobacter--Methylotenera/nodes.tsv', sep='\t', index=False)

In [ ]: