In [1]:
import pandas as pd

In [2]:
gene_info = pd.read_csv('/home/jmatsen/Neo4j_meta4/data/gene_info.tsv', sep='\t')

In [3]:
network = pd.read_csv('/home/jmatsen/Neo4j_meta4/data/network.txt', sep='\t')

In [4]:
gene_info.shape


Out[4]:
(212710, 3)

In [5]:
gene_info.head(2)


Out[5]:
genome locus_tag product
0 Methylotenera mobilis JLW8 Mmol_0001 chromosomal replication initiator protein DnaA
1 Methylotenera mobilis JLW8 Mmol_0002 DNA polymerase III, beta subunit (EC 2.7.7.7)

In [6]:
gene_info[gene_info['locus_tag'].str.contains('Mmol_')]


Out[6]:
genome locus_tag product
0 Methylotenera mobilis JLW8 Mmol_0001 chromosomal replication initiator protein DnaA
1 Methylotenera mobilis JLW8 Mmol_0002 DNA polymerase III, beta subunit (EC 2.7.7.7)
2 Methylotenera mobilis JLW8 Mmol_0003 DNA gyrase subunit B (EC 5.99.1.3)
3 Methylotenera mobilis JLW8 Mmol_0004 transcriptional regulator, TetR family
4 Methylotenera mobilis JLW8 Mmol_0005 efflux transporter, RND family, MFP subunit
5 Methylotenera mobilis JLW8 Mmol_0006 transporter, hydrophobe/amphiphile efflux-1 (...
6 Methylotenera mobilis JLW8 Mmol_0007 RND efflux system, outer membrane lipoprotein,...
7 Methylotenera mobilis JLW8 Mmol_0008 transcriptional regulator, LysR family
8 Methylotenera mobilis JLW8 Mmol_0009 short-chain dehydrogenase/reductase SDR
9 Methylotenera mobilis JLW8 Mmol_0010 NmrA family protein
10 Methylotenera mobilis JLW8 Mmol_0011 transcriptional regulator, HxlR family
11 Methylotenera mobilis JLW8 Mmol_0012 hypothetical protein
12 Methylotenera mobilis JLW8 Mmol_0013 hypothetical protein
13 Methylotenera mobilis JLW8 Mmol_0014 transcriptional regulator, LysR family
14 Methylotenera mobilis JLW8 Mmol_0015 Heat shock protein. Metallo peptidase. MEROPS ...
15 Methylotenera mobilis JLW8 Mmol_0016 Integral membrane protein TerC
16 Methylotenera mobilis JLW8 Mmol_0017 cytochrome B561
17 Methylotenera mobilis JLW8 Mmol_0018 beta-lactamase domain protein
18 Methylotenera mobilis JLW8 Mmol_0019 DoxX family protein
19 Methylotenera mobilis JLW8 Mmol_0020 GCN5-related N-acetyltransferase
20 Methylotenera mobilis JLW8 Mmol_0021 coenzyme PQQ biosynthesis protein A
21 Methylotenera mobilis JLW8 Mmol_0022 diguanylate cyclase
22 Methylotenera mobilis JLW8 Mmol_0023 CBS domain containing membrane protein
23 Methylotenera mobilis JLW8 Mmol_0024 formaldehyde-activating enzyme
24 Methylotenera mobilis JLW8 Mmol_0025 hypothetical protein
25 Methylotenera mobilis JLW8 Mmol_0026 putative transmembrane protein
26 Methylotenera mobilis JLW8 Mmol_0027 protein of unknown function DUF883 ElaB
27 Methylotenera mobilis JLW8 Mmol_0028 A/G-specific DNA-adenine glycosylase (EC 3.2....
28 Methylotenera mobilis JLW8 Mmol_0029 AsmA family protein
29 Methylotenera mobilis JLW8 Mmol_0030 Undecaprenyl-diphosphatase (EC 3.6.1.27)
... ... ... ...
2308 Methylotenera mobilis JLW8 Mmol_2320 replication restart DNA helicase PriA
2309 Methylotenera mobilis JLW8 Mmol_2321 arginyl-tRNA synthetase (EC 6.1.1.19)
2310 Methylotenera mobilis JLW8 Mmol_2322 Sporulation domain protein
2311 Methylotenera mobilis JLW8 Mmol_2323 DSBA oxidoreductase
2312 Methylotenera mobilis JLW8 Mmol_2324 short-chain dehydrogenase/reductase SDR
2313 Methylotenera mobilis JLW8 Mmol_2325 ABC transporter related
2314 Methylotenera mobilis JLW8 Mmol_2326 YaeQ family protein
2315 Methylotenera mobilis JLW8 Mmol_2327 putative adenylate/guanylate cyclase
2316 Methylotenera mobilis JLW8 Mmol_2328 glutamine--fructose-6-phosphate transaminase
2317 Methylotenera mobilis JLW8 Mmol_2329 UDP-N-acetylglucosamine pyrophosphorylase (EC ...
2318 Methylotenera mobilis JLW8 Mmol_2330 ATP synthase F1 subcomplex epsilon subunit
2319 Methylotenera mobilis JLW8 Mmol_2331 ATP synthase F1, beta subunit
2320 Methylotenera mobilis JLW8 Mmol_2332 ATP synthase F1 subcomplex gamma subunit
2321 Methylotenera mobilis JLW8 Mmol_2333 ATP synthase F1 subcomplex alpha subunit
2322 Methylotenera mobilis JLW8 Mmol_2334 ATP synthase F1 subcomplex delta subunit
2323 Methylotenera mobilis JLW8 Mmol_2335 ATP synthase F0, B subunit
2324 Methylotenera mobilis JLW8 Mmol_2336 ATP synthase F0, C subunit
2325 Methylotenera mobilis JLW8 Mmol_2337 ATP synthase F0, A subunit
2326 Methylotenera mobilis JLW8 Mmol_2338 ATP synthase I chain
2327 Methylotenera mobilis JLW8 Mmol_2339 chromosome segregation DNA-binding protein
2328 Methylotenera mobilis JLW8 Mmol_2340 chromosome segregation ATPase
2329 Methylotenera mobilis JLW8 Mmol_2341 16S rRNA m(7)G-527 methyltransferase (EC 2.1....
2330 Methylotenera mobilis JLW8 Mmol_2342 glucose inhibited division protein A
2331 Methylotenera mobilis JLW8 Mmol_2343 Domain of unknown function DUF1924
2332 Methylotenera mobilis JLW8 Mmol_2344 protein of unknown function DUF1508
2333 Methylotenera mobilis JLW8 Mmol_2345 tRNA modification GTPase trmE
2334 Methylotenera mobilis JLW8 Mmol_2346 protein translocase subunit yidC
2335 Methylotenera mobilis JLW8 Mmol_2347 protein of unknown function DUF37
2336 Methylotenera mobilis JLW8 Mmol_2348 ribonuclease P protein component (EC 3.1.26.5)
2337 Methylotenera mobilis JLW8 Mmol_2349 LSU ribosomal protein L34P

2338 rows × 3 columns


In [7]:
gene_info[gene_info['locus_tag'].str.contains('Ga0081629')]


Out[7]:
genome locus_tag product

In [8]:
gene_info[gene_info['locus_tag'].str.contains('Ga0081607')]


Out[8]:
genome locus_tag product

In [9]:
gene_info[gene_info['locus_tag'].str.contains('Ga*')]


Out[9]:
genome locus_tag product
115803 Methylomonas sp. MK1 G006DRAFT_0002 ATP-dependent Clp endopeptidase, proteolytic ...
115804 Methylomonas sp. MK1 G006DRAFT_0003 phage major capsid protein, HK97 family
115805 Methylomonas sp. MK1 G006DRAFT_0004 hypothetical protein
115806 Methylomonas sp. MK1 G006DRAFT_0005 hypothetical protein
115807 Methylomonas sp. MK1 G006DRAFT_0006 phage tail tape measure protein, TP901 family,...
115808 Methylomonas sp. MK1 G006DRAFT_0007 hypothetical protein
115809 Methylomonas sp. MK1 G006DRAFT_0008 hypothetical protein
115810 Methylomonas sp. MK1 G006DRAFT_0009 hypothetical protein
115811 Methylomonas sp. MK1 G006DRAFT_0010 glucosylceramidase
115812 Methylomonas sp. MK1 G006DRAFT_0011 hypothetical protein
115813 Methylomonas sp. MK1 G006DRAFT_0012 chromosome partitioning protein
115814 Methylomonas sp. MK1 G006DRAFT_0013 hypothetical protein
115815 Methylomonas sp. MK1 G006DRAFT_0014 outer membrane protein, multidrug efflux system
115816 Methylomonas sp. MK1 G006DRAFT_0015 multidrug efflux pump
115817 Methylomonas sp. MK1 G006DRAFT_0016 membrane fusion protein, multidrug efflux system
115818 Methylomonas sp. MK1 G006DRAFT_0017 transcriptional regulator, TetR family
115819 Methylomonas sp. MK1 G006DRAFT_0019 Hemolysin activation/secretion protein
115820 Methylomonas sp. MK1 G006DRAFT_0020 SapC protein
115821 Methylomonas sp. MK1 G006DRAFT_0021 filamentous hemagglutinin family N-terminal d...
115822 Methylomonas sp. MK1 G006DRAFT_0022 mxaD protein
115823 Methylomonas sp. MK1 G006DRAFT_0023 hypothetical protein
115824 Methylomonas sp. MK1 G006DRAFT_0024 two component transcriptional regulator, LuxR ...
115825 Methylomonas sp. MK1 G006DRAFT_0025 methanol dehydrogenase (cytochrome) large sub...
115826 Methylomonas sp. MK1 G006DRAFT_0026 mxaJ protein
115827 Methylomonas sp. MK1 G006DRAFT_0027 cytochrome cL apoprotein
115828 Methylomonas sp. MK1 G006DRAFT_0028 methanol dehydrogenase (cytochrome) small sub...
115829 Methylomonas sp. MK1 G006DRAFT_0029 MoxR-like ATPase
115830 Methylomonas sp. MK1 G006DRAFT_0030 hypothetical protein
115831 Methylomonas sp. MK1 G006DRAFT_0031 hypothetical protein
115832 Methylomonas sp. MK1 G006DRAFT_0032 mxaA protein
... ... ... ...
212680 Methylotenera sp. N17 FG11DRAFT_2642 RNA polymerase, sigma 54 subunit, RpoN/SigL
212681 Methylotenera sp. N17 FG11DRAFT_2643 lipopolysaccharide export system ATP-binding ...
212682 Methylotenera sp. N17 FG11DRAFT_2644 lipopolysaccharide export system protein LptA
212683 Methylotenera sp. N17 FG11DRAFT_2645 lipopolysaccharide export system protein LptC
212684 Methylotenera sp. N17 FG11DRAFT_2646 3-deoxy-D-manno-octulosonate 8-phosphate phos...
212685 Methylotenera sp. N17 FG11DRAFT_2647 arabinose-5-phosphate isomerase
212686 Methylotenera sp. N17 FG11DRAFT_2648 monovalent cation:H+ antiporter-2, CPA2 family
212687 Methylotenera sp. N17 FG11DRAFT_2649 glycine oxidase
212688 Methylotenera sp. N17 FG11DRAFT_2650 Uncharacterized conserved protein YdhG, YjbR/...
212689 Methylotenera sp. N17 FG11DRAFT_2651 hypothetical protein
212690 Methylotenera sp. N17 FG11DRAFT_2652 hypothetical protein
212691 Methylotenera sp. N17 FG11DRAFT_2653 cyclic pyranopterin phosphate synthase
212692 Methylotenera sp. N17 FG11DRAFT_2654 Putative Zn-dependent protease, contains TPR ...
212693 Methylotenera sp. N17 FG11DRAFT_2655 thiosulfate oxidation carrier complex protein ...
212694 Methylotenera sp. N17 FG11DRAFT_2656 peroxiredoxin (alkyl hydroperoxide reductase ...
212695 Methylotenera sp. N17 FG11DRAFT_2657 alkyl hydroperoxide reductase subunit F
212696 Methylotenera sp. N17 FG11DRAFT_2658 hypothetical protein
212697 Methylotenera sp. N17 FG11DRAFT_2659 DnaA family protein
212698 Methylotenera sp. N17 FG11DRAFT_2660 Predicted PurR-regulated permease PerM
212699 Methylotenera sp. N17 FG11DRAFT_2661 phosphoribosylformylglycinamidine cyclo-ligase
212700 Methylotenera sp. N17 FG11DRAFT_2662 Protein of unknown function (DUF3108)
212701 Methylotenera sp. N17 FG11DRAFT_2663 Protein of unknown function (DUF3108)
212702 Methylotenera sp. N17 FG11DRAFT_2664 16S rRNA (cytosine967-C5)-methyltransferase
212703 Methylotenera sp. N17 FG11DRAFT_2665 Na+/H+-dicarboxylate symporter
212704 Methylotenera sp. N17 FG11DRAFT_2666 Predicted N-acetyltransferase YhbS
212705 Methylotenera sp. N17 FG11DRAFT_2667 hydrophobic/amphiphilic exporter-1, HAE1 family
212706 Methylotenera sp. N17 FG11DRAFT_2668 RND family efflux transporter, MFP subunit
212707 Methylotenera sp. N17 FG11DRAFT_2669 AmpD protein
212708 Methylotenera sp. N17 FG11DRAFT_2670 cation:H+ antiporter
212709 Methylotenera sp. N17 FG11DRAFT_2671 two-component system, NtrC family, response r...

34769 rows × 3 columns


In [10]:
'Ga0081607_104311


  File "<ipython-input-10-2a14c53b90a0>", line 1
    'Ga0081607_104311
                    ^
SyntaxError: EOL while scanning string literal

In [ ]:
#gene_info.genome.str #.contains("ethylo").unique()
gene_info[gene_info.genome.str.contains('ethylo')]['genome'] #['genome'].unique()

In [ ]:
len(gene_info.genome.unique())

In [ ]:
network.shape

In [ ]:
network.head(2)

In [ ]:
# df['date'] = df['raw'].str.extract('(....-..-..)', expand=True)
network['locus tag'] = network['target'].str.extract('([A-z]+[0-9]+)_[0-9]+')

In [ ]:
network['source'].unique()

In [ ]:
network['locus tag'].unique()

In [ ]:
network.head(2) #genome.unique()

In [ ]:
network.head()

In [ ]:
joined = network.join(gene_info, how='left')

In [ ]:
print(network.shape)
joined.shape

In [ ]:
joined.head()

In [ ]:
joined.to_csv('/home/jmatsen/Neo4j_meta4/data/network_with_info.txt')

Trim out just genomes with "ethylo" in them.


In [ ]:
joined.genome.unique()

In [ ]:
#methylo_subset = 
# df[df['A'].str.contains("hello")]
#methylo = 
joined[joined.genome.str.contains('ethylo')]

In [ ]:
print(joined.shape)
print(methylo.shape)

In [ ]: