In [1]:
    
import pandas as pd
    
In [2]:
    
gene_info = pd.read_csv('/home/jmatsen/Neo4j_meta4/data/gene_info.tsv', sep='\t')
    
In [3]:
    
network = pd.read_csv('/home/jmatsen/Neo4j_meta4/data/network.txt', sep='\t')
    
In [4]:
    
gene_info.shape
    
    Out[4]:
(212710, 3)
In [5]:
    
gene_info.head(2)
    
    Out[5]:
  
    
       
      genome 
      locus_tag 
      product 
     
  
  
    
      0 
      Methylotenera mobilis JLW8 
      Mmol_0001 
      chromosomal replication initiator protein DnaA 
     
    
      1 
      Methylotenera mobilis JLW8 
      Mmol_0002 
      DNA polymerase III, beta subunit (EC 2.7.7.7) 
     
  
In [6]:
    
gene_info[gene_info['locus_tag'].str.contains('Mmol_')]
    
    Out[6]:
  
    
       
      genome 
      locus_tag 
      product 
     
  
  
    
      0 
      Methylotenera mobilis JLW8 
      Mmol_0001 
      chromosomal replication initiator protein DnaA 
     
    
      1 
      Methylotenera mobilis JLW8 
      Mmol_0002 
      DNA polymerase III, beta subunit (EC 2.7.7.7) 
     
    
      2 
      Methylotenera mobilis JLW8 
      Mmol_0003 
      DNA gyrase subunit B (EC 5.99.1.3) 
     
    
      3 
      Methylotenera mobilis JLW8 
      Mmol_0004 
      transcriptional regulator, TetR family 
     
    
      4 
      Methylotenera mobilis JLW8 
      Mmol_0005 
      efflux transporter, RND family, MFP subunit 
     
    
      5 
      Methylotenera mobilis JLW8 
      Mmol_0006 
      transporter, hydrophobe/amphiphile efflux-1  (... 
     
    
      6 
      Methylotenera mobilis JLW8 
      Mmol_0007 
      RND efflux system, outer membrane lipoprotein,... 
     
    
      7 
      Methylotenera mobilis JLW8 
      Mmol_0008 
      transcriptional regulator, LysR family 
     
    
      8 
      Methylotenera mobilis JLW8 
      Mmol_0009 
      short-chain dehydrogenase/reductase SDR 
     
    
      9 
      Methylotenera mobilis JLW8 
      Mmol_0010 
      NmrA family protein 
     
    
      10 
      Methylotenera mobilis JLW8 
      Mmol_0011 
      transcriptional regulator, HxlR family 
     
    
      11 
      Methylotenera mobilis JLW8 
      Mmol_0012 
      hypothetical protein 
     
    
      12 
      Methylotenera mobilis JLW8 
      Mmol_0013 
      hypothetical protein 
     
    
      13 
      Methylotenera mobilis JLW8 
      Mmol_0014 
      transcriptional regulator, LysR family 
     
    
      14 
      Methylotenera mobilis JLW8 
      Mmol_0015 
      Heat shock protein. Metallo peptidase. MEROPS ... 
     
    
      15 
      Methylotenera mobilis JLW8 
      Mmol_0016 
      Integral membrane protein TerC 
     
    
      16 
      Methylotenera mobilis JLW8 
      Mmol_0017 
      cytochrome B561 
     
    
      17 
      Methylotenera mobilis JLW8 
      Mmol_0018 
      beta-lactamase domain protein 
     
    
      18 
      Methylotenera mobilis JLW8 
      Mmol_0019 
      DoxX family protein 
     
    
      19 
      Methylotenera mobilis JLW8 
      Mmol_0020 
      GCN5-related N-acetyltransferase 
     
    
      20 
      Methylotenera mobilis JLW8 
      Mmol_0021 
      coenzyme PQQ biosynthesis protein A 
     
    
      21 
      Methylotenera mobilis JLW8 
      Mmol_0022 
      diguanylate cyclase 
     
    
      22 
      Methylotenera mobilis JLW8 
      Mmol_0023 
      CBS domain containing membrane protein 
     
    
      23 
      Methylotenera mobilis JLW8 
      Mmol_0024 
      formaldehyde-activating enzyme 
     
    
      24 
      Methylotenera mobilis JLW8 
      Mmol_0025 
      hypothetical protein 
     
    
      25 
      Methylotenera mobilis JLW8 
      Mmol_0026 
      putative transmembrane protein 
     
    
      26 
      Methylotenera mobilis JLW8 
      Mmol_0027 
      protein of unknown function DUF883 ElaB 
     
    
      27 
      Methylotenera mobilis JLW8 
      Mmol_0028 
      A/G-specific DNA-adenine glycosylase (EC  3.2.... 
     
    
      28 
      Methylotenera mobilis JLW8 
      Mmol_0029 
      AsmA family protein 
     
    
      29 
      Methylotenera mobilis JLW8 
      Mmol_0030 
      Undecaprenyl-diphosphatase (EC 3.6.1.27) 
     
    
      ... 
      ... 
      ... 
      ... 
     
    
      2308 
      Methylotenera mobilis JLW8 
      Mmol_2320 
      replication restart DNA helicase PriA 
     
    
      2309 
      Methylotenera mobilis JLW8 
      Mmol_2321 
      arginyl-tRNA synthetase (EC 6.1.1.19) 
     
    
      2310 
      Methylotenera mobilis JLW8 
      Mmol_2322 
      Sporulation domain protein 
     
    
      2311 
      Methylotenera mobilis JLW8 
      Mmol_2323 
      DSBA oxidoreductase 
     
    
      2312 
      Methylotenera mobilis JLW8 
      Mmol_2324 
      short-chain dehydrogenase/reductase SDR 
     
    
      2313 
      Methylotenera mobilis JLW8 
      Mmol_2325 
      ABC transporter related 
     
    
      2314 
      Methylotenera mobilis JLW8 
      Mmol_2326 
      YaeQ family protein 
     
    
      2315 
      Methylotenera mobilis JLW8 
      Mmol_2327 
      putative adenylate/guanylate cyclase 
     
    
      2316 
      Methylotenera mobilis JLW8 
      Mmol_2328 
      glutamine--fructose-6-phosphate transaminase 
     
    
      2317 
      Methylotenera mobilis JLW8 
      Mmol_2329 
      UDP-N-acetylglucosamine pyrophosphorylase (EC ... 
     
    
      2318 
      Methylotenera mobilis JLW8 
      Mmol_2330 
      ATP synthase F1 subcomplex epsilon subunit 
     
    
      2319 
      Methylotenera mobilis JLW8 
      Mmol_2331 
      ATP synthase F1, beta subunit 
     
    
      2320 
      Methylotenera mobilis JLW8 
      Mmol_2332 
      ATP synthase F1 subcomplex gamma subunit 
     
    
      2321 
      Methylotenera mobilis JLW8 
      Mmol_2333 
      ATP synthase F1 subcomplex alpha subunit 
     
    
      2322 
      Methylotenera mobilis JLW8 
      Mmol_2334 
      ATP synthase F1 subcomplex delta subunit 
     
    
      2323 
      Methylotenera mobilis JLW8 
      Mmol_2335 
      ATP synthase F0, B subunit 
     
    
      2324 
      Methylotenera mobilis JLW8 
      Mmol_2336 
      ATP synthase F0, C subunit 
     
    
      2325 
      Methylotenera mobilis JLW8 
      Mmol_2337 
      ATP synthase F0, A subunit 
     
    
      2326 
      Methylotenera mobilis JLW8 
      Mmol_2338 
      ATP synthase I chain 
     
    
      2327 
      Methylotenera mobilis JLW8 
      Mmol_2339 
      chromosome segregation DNA-binding protein 
     
    
      2328 
      Methylotenera mobilis JLW8 
      Mmol_2340 
      chromosome segregation ATPase 
     
    
      2329 
      Methylotenera mobilis JLW8 
      Mmol_2341 
      16S rRNA m(7)G-527 methyltransferase (EC  2.1.... 
     
    
      2330 
      Methylotenera mobilis JLW8 
      Mmol_2342 
      glucose inhibited division protein A 
     
    
      2331 
      Methylotenera mobilis JLW8 
      Mmol_2343 
      Domain of unknown function DUF1924 
     
    
      2332 
      Methylotenera mobilis JLW8 
      Mmol_2344 
      protein of unknown function DUF1508 
     
    
      2333 
      Methylotenera mobilis JLW8 
      Mmol_2345 
      tRNA modification GTPase trmE 
     
    
      2334 
      Methylotenera mobilis JLW8 
      Mmol_2346 
      protein translocase subunit yidC 
     
    
      2335 
      Methylotenera mobilis JLW8 
      Mmol_2347 
      protein of unknown function DUF37 
     
    
      2336 
      Methylotenera mobilis JLW8 
      Mmol_2348 
      ribonuclease P protein component (EC 3.1.26.5) 
     
    
      2337 
      Methylotenera mobilis JLW8 
      Mmol_2349 
      LSU ribosomal protein L34P 
     
  
2338 rows × 3 columns
In [7]:
    
gene_info[gene_info['locus_tag'].str.contains('Ga0081629')]
    
    Out[7]:
  
    
       
      genome 
      locus_tag 
      product 
     
  
  
  
In [8]:
    
gene_info[gene_info['locus_tag'].str.contains('Ga0081607')]
    
    Out[8]:
  
    
       
      genome 
      locus_tag 
      product 
     
  
  
  
In [9]:
    
gene_info[gene_info['locus_tag'].str.contains('Ga*')]
    
    Out[9]:
  
    
       
      genome 
      locus_tag 
      product 
     
  
  
    
      115803 
      Methylomonas sp. MK1 
      G006DRAFT_0002 
      ATP-dependent Clp endopeptidase, proteolytic  ... 
     
    
      115804 
      Methylomonas sp. MK1 
      G006DRAFT_0003 
      phage major capsid protein, HK97 family 
     
    
      115805 
      Methylomonas sp. MK1 
      G006DRAFT_0004 
      hypothetical protein 
     
    
      115806 
      Methylomonas sp. MK1 
      G006DRAFT_0005 
      hypothetical protein 
     
    
      115807 
      Methylomonas sp. MK1 
      G006DRAFT_0006 
      phage tail tape measure protein, TP901 family,... 
     
    
      115808 
      Methylomonas sp. MK1 
      G006DRAFT_0007 
      hypothetical protein 
     
    
      115809 
      Methylomonas sp. MK1 
      G006DRAFT_0008 
      hypothetical protein 
     
    
      115810 
      Methylomonas sp. MK1 
      G006DRAFT_0009 
      hypothetical protein 
     
    
      115811 
      Methylomonas sp. MK1 
      G006DRAFT_0010 
      glucosylceramidase 
     
    
      115812 
      Methylomonas sp. MK1 
      G006DRAFT_0011 
      hypothetical protein 
     
    
      115813 
      Methylomonas sp. MK1 
      G006DRAFT_0012 
      chromosome partitioning protein 
     
    
      115814 
      Methylomonas sp. MK1 
      G006DRAFT_0013 
      hypothetical protein 
     
    
      115815 
      Methylomonas sp. MK1 
      G006DRAFT_0014 
      outer membrane protein, multidrug efflux system 
     
    
      115816 
      Methylomonas sp. MK1 
      G006DRAFT_0015 
      multidrug efflux pump 
     
    
      115817 
      Methylomonas sp. MK1 
      G006DRAFT_0016 
      membrane fusion protein, multidrug efflux system 
     
    
      115818 
      Methylomonas sp. MK1 
      G006DRAFT_0017 
      transcriptional regulator, TetR family 
     
    
      115819 
      Methylomonas sp. MK1 
      G006DRAFT_0019 
      Hemolysin activation/secretion protein 
     
    
      115820 
      Methylomonas sp. MK1 
      G006DRAFT_0020 
      SapC protein 
     
    
      115821 
      Methylomonas sp. MK1 
      G006DRAFT_0021 
      filamentous hemagglutinin family N-terminal  d... 
     
    
      115822 
      Methylomonas sp. MK1 
      G006DRAFT_0022 
      mxaD protein 
     
    
      115823 
      Methylomonas sp. MK1 
      G006DRAFT_0023 
      hypothetical protein 
     
    
      115824 
      Methylomonas sp. MK1 
      G006DRAFT_0024 
      two component transcriptional regulator, LuxR ... 
     
    
      115825 
      Methylomonas sp. MK1 
      G006DRAFT_0025 
      methanol dehydrogenase (cytochrome) large  sub... 
     
    
      115826 
      Methylomonas sp. MK1 
      G006DRAFT_0026 
      mxaJ protein 
     
    
      115827 
      Methylomonas sp. MK1 
      G006DRAFT_0027 
      cytochrome cL apoprotein 
     
    
      115828 
      Methylomonas sp. MK1 
      G006DRAFT_0028 
      methanol dehydrogenase (cytochrome) small  sub... 
     
    
      115829 
      Methylomonas sp. MK1 
      G006DRAFT_0029 
      MoxR-like ATPase 
     
    
      115830 
      Methylomonas sp. MK1 
      G006DRAFT_0030 
      hypothetical protein 
     
    
      115831 
      Methylomonas sp. MK1 
      G006DRAFT_0031 
      hypothetical protein 
     
    
      115832 
      Methylomonas sp. MK1 
      G006DRAFT_0032 
      mxaA protein 
     
    
      ... 
      ... 
      ... 
      ... 
     
    
      212680 
      Methylotenera sp. N17 
      FG11DRAFT_2642 
      RNA polymerase, sigma 54 subunit, RpoN/SigL 
     
    
      212681 
      Methylotenera sp. N17 
      FG11DRAFT_2643 
      lipopolysaccharide export system ATP-binding  ... 
     
    
      212682 
      Methylotenera sp. N17 
      FG11DRAFT_2644 
      lipopolysaccharide export system protein LptA 
     
    
      212683 
      Methylotenera sp. N17 
      FG11DRAFT_2645 
      lipopolysaccharide export system protein LptC 
     
    
      212684 
      Methylotenera sp. N17 
      FG11DRAFT_2646 
      3-deoxy-D-manno-octulosonate 8-phosphate  phos... 
     
    
      212685 
      Methylotenera sp. N17 
      FG11DRAFT_2647 
      arabinose-5-phosphate isomerase 
     
    
      212686 
      Methylotenera sp. N17 
      FG11DRAFT_2648 
      monovalent cation:H+ antiporter-2, CPA2 family 
     
    
      212687 
      Methylotenera sp. N17 
      FG11DRAFT_2649 
      glycine oxidase 
     
    
      212688 
      Methylotenera sp. N17 
      FG11DRAFT_2650 
      Uncharacterized conserved protein YdhG,  YjbR/... 
     
    
      212689 
      Methylotenera sp. N17 
      FG11DRAFT_2651 
      hypothetical protein 
     
    
      212690 
      Methylotenera sp. N17 
      FG11DRAFT_2652 
      hypothetical protein 
     
    
      212691 
      Methylotenera sp. N17 
      FG11DRAFT_2653 
      cyclic pyranopterin phosphate synthase 
     
    
      212692 
      Methylotenera sp. N17 
      FG11DRAFT_2654 
      Putative Zn-dependent protease, contains TPR  ... 
     
    
      212693 
      Methylotenera sp. N17 
      FG11DRAFT_2655 
      thiosulfate oxidation carrier complex protein ... 
     
    
      212694 
      Methylotenera sp. N17 
      FG11DRAFT_2656 
      peroxiredoxin (alkyl hydroperoxide reductase  ... 
     
    
      212695 
      Methylotenera sp. N17 
      FG11DRAFT_2657 
      alkyl hydroperoxide reductase subunit F 
     
    
      212696 
      Methylotenera sp. N17 
      FG11DRAFT_2658 
      hypothetical protein 
     
    
      212697 
      Methylotenera sp. N17 
      FG11DRAFT_2659 
      DnaA family protein 
     
    
      212698 
      Methylotenera sp. N17 
      FG11DRAFT_2660 
      Predicted PurR-regulated permease PerM 
     
    
      212699 
      Methylotenera sp. N17 
      FG11DRAFT_2661 
      phosphoribosylformylglycinamidine cyclo-ligase 
     
    
      212700 
      Methylotenera sp. N17 
      FG11DRAFT_2662 
      Protein of unknown function (DUF3108) 
     
    
      212701 
      Methylotenera sp. N17 
      FG11DRAFT_2663 
      Protein of unknown function (DUF3108) 
     
    
      212702 
      Methylotenera sp. N17 
      FG11DRAFT_2664 
      16S rRNA (cytosine967-C5)-methyltransferase 
     
    
      212703 
      Methylotenera sp. N17 
      FG11DRAFT_2665 
      Na+/H+-dicarboxylate symporter 
     
    
      212704 
      Methylotenera sp. N17 
      FG11DRAFT_2666 
      Predicted N-acetyltransferase YhbS 
     
    
      212705 
      Methylotenera sp. N17 
      FG11DRAFT_2667 
      hydrophobic/amphiphilic exporter-1, HAE1 family 
     
    
      212706 
      Methylotenera sp. N17 
      FG11DRAFT_2668 
      RND family efflux transporter, MFP subunit 
     
    
      212707 
      Methylotenera sp. N17 
      FG11DRAFT_2669 
      AmpD protein 
     
    
      212708 
      Methylotenera sp. N17 
      FG11DRAFT_2670 
      cation:H+ antiporter 
     
    
      212709 
      Methylotenera sp. N17 
      FG11DRAFT_2671 
      two-component system, NtrC family, response  r... 
     
  
34769 rows × 3 columns
In [10]:
    
'Ga0081607_104311
    
    
  File "<ipython-input-10-2a14c53b90a0>", line 1
    'Ga0081607_104311
                    ^
SyntaxError: EOL while scanning string literal
In [ ]:
    
#gene_info.genome.str #.contains("ethylo").unique()
gene_info[gene_info.genome.str.contains('ethylo')]['genome'] #['genome'].unique()
    
In [ ]:
    
len(gene_info.genome.unique())
    
In [ ]:
    
network.shape
    
In [ ]:
    
network.head(2)
    
In [ ]:
    
# df['date'] = df['raw'].str.extract('(....-..-..)', expand=True)
network['locus tag'] = network['target'].str.extract('([A-z]+[0-9]+)_[0-9]+')
    
In [ ]:
    
network['source'].unique()
    
In [ ]:
    
network['locus tag'].unique()
    
In [ ]:
    
network.head(2) #genome.unique()
    
In [ ]:
    
network.head()
    
In [ ]:
    
joined = network.join(gene_info, how='left')
    
In [ ]:
    
print(network.shape)
joined.shape
    
In [ ]:
    
joined.head()
    
In [ ]:
    
joined.to_csv('/home/jmatsen/Neo4j_meta4/data/network_with_info.txt')
    
Trim out just genomes with "ethylo" in them.
In [ ]:
    
joined.genome.unique()
    
In [ ]:
    
#methylo_subset = 
# df[df['A'].str.contains("hello")]
#methylo = 
joined[joined.genome.str.contains('ethylo')]
    
In [ ]:
    
print(joined.shape)
print(methylo.shape)
    
In [ ]:
    
    
Content source: JanetMatsen/Neo4j_meta4
Similar notebooks: