In [1]:
import pandas as pd
In [2]:
gene_info = pd.read_csv('/home/jmatsen/Neo4j_meta4/data/gene_info.tsv', sep='\t')
In [3]:
network = pd.read_csv('/home/jmatsen/Neo4j_meta4/data/network.txt', sep='\t')
In [4]:
gene_info.shape
Out[4]:
(212710, 3)
In [5]:
gene_info.head(2)
Out[5]:
genome
locus_tag
product
0
Methylotenera mobilis JLW8
Mmol_0001
chromosomal replication initiator protein DnaA
1
Methylotenera mobilis JLW8
Mmol_0002
DNA polymerase III, beta subunit (EC 2.7.7.7)
In [6]:
gene_info[gene_info['locus_tag'].str.contains('Mmol_')]
Out[6]:
genome
locus_tag
product
0
Methylotenera mobilis JLW8
Mmol_0001
chromosomal replication initiator protein DnaA
1
Methylotenera mobilis JLW8
Mmol_0002
DNA polymerase III, beta subunit (EC 2.7.7.7)
2
Methylotenera mobilis JLW8
Mmol_0003
DNA gyrase subunit B (EC 5.99.1.3)
3
Methylotenera mobilis JLW8
Mmol_0004
transcriptional regulator, TetR family
4
Methylotenera mobilis JLW8
Mmol_0005
efflux transporter, RND family, MFP subunit
5
Methylotenera mobilis JLW8
Mmol_0006
transporter, hydrophobe/amphiphile efflux-1 (...
6
Methylotenera mobilis JLW8
Mmol_0007
RND efflux system, outer membrane lipoprotein,...
7
Methylotenera mobilis JLW8
Mmol_0008
transcriptional regulator, LysR family
8
Methylotenera mobilis JLW8
Mmol_0009
short-chain dehydrogenase/reductase SDR
9
Methylotenera mobilis JLW8
Mmol_0010
NmrA family protein
10
Methylotenera mobilis JLW8
Mmol_0011
transcriptional regulator, HxlR family
11
Methylotenera mobilis JLW8
Mmol_0012
hypothetical protein
12
Methylotenera mobilis JLW8
Mmol_0013
hypothetical protein
13
Methylotenera mobilis JLW8
Mmol_0014
transcriptional regulator, LysR family
14
Methylotenera mobilis JLW8
Mmol_0015
Heat shock protein. Metallo peptidase. MEROPS ...
15
Methylotenera mobilis JLW8
Mmol_0016
Integral membrane protein TerC
16
Methylotenera mobilis JLW8
Mmol_0017
cytochrome B561
17
Methylotenera mobilis JLW8
Mmol_0018
beta-lactamase domain protein
18
Methylotenera mobilis JLW8
Mmol_0019
DoxX family protein
19
Methylotenera mobilis JLW8
Mmol_0020
GCN5-related N-acetyltransferase
20
Methylotenera mobilis JLW8
Mmol_0021
coenzyme PQQ biosynthesis protein A
21
Methylotenera mobilis JLW8
Mmol_0022
diguanylate cyclase
22
Methylotenera mobilis JLW8
Mmol_0023
CBS domain containing membrane protein
23
Methylotenera mobilis JLW8
Mmol_0024
formaldehyde-activating enzyme
24
Methylotenera mobilis JLW8
Mmol_0025
hypothetical protein
25
Methylotenera mobilis JLW8
Mmol_0026
putative transmembrane protein
26
Methylotenera mobilis JLW8
Mmol_0027
protein of unknown function DUF883 ElaB
27
Methylotenera mobilis JLW8
Mmol_0028
A/G-specific DNA-adenine glycosylase (EC 3.2....
28
Methylotenera mobilis JLW8
Mmol_0029
AsmA family protein
29
Methylotenera mobilis JLW8
Mmol_0030
Undecaprenyl-diphosphatase (EC 3.6.1.27)
...
...
...
...
2308
Methylotenera mobilis JLW8
Mmol_2320
replication restart DNA helicase PriA
2309
Methylotenera mobilis JLW8
Mmol_2321
arginyl-tRNA synthetase (EC 6.1.1.19)
2310
Methylotenera mobilis JLW8
Mmol_2322
Sporulation domain protein
2311
Methylotenera mobilis JLW8
Mmol_2323
DSBA oxidoreductase
2312
Methylotenera mobilis JLW8
Mmol_2324
short-chain dehydrogenase/reductase SDR
2313
Methylotenera mobilis JLW8
Mmol_2325
ABC transporter related
2314
Methylotenera mobilis JLW8
Mmol_2326
YaeQ family protein
2315
Methylotenera mobilis JLW8
Mmol_2327
putative adenylate/guanylate cyclase
2316
Methylotenera mobilis JLW8
Mmol_2328
glutamine--fructose-6-phosphate transaminase
2317
Methylotenera mobilis JLW8
Mmol_2329
UDP-N-acetylglucosamine pyrophosphorylase (EC ...
2318
Methylotenera mobilis JLW8
Mmol_2330
ATP synthase F1 subcomplex epsilon subunit
2319
Methylotenera mobilis JLW8
Mmol_2331
ATP synthase F1, beta subunit
2320
Methylotenera mobilis JLW8
Mmol_2332
ATP synthase F1 subcomplex gamma subunit
2321
Methylotenera mobilis JLW8
Mmol_2333
ATP synthase F1 subcomplex alpha subunit
2322
Methylotenera mobilis JLW8
Mmol_2334
ATP synthase F1 subcomplex delta subunit
2323
Methylotenera mobilis JLW8
Mmol_2335
ATP synthase F0, B subunit
2324
Methylotenera mobilis JLW8
Mmol_2336
ATP synthase F0, C subunit
2325
Methylotenera mobilis JLW8
Mmol_2337
ATP synthase F0, A subunit
2326
Methylotenera mobilis JLW8
Mmol_2338
ATP synthase I chain
2327
Methylotenera mobilis JLW8
Mmol_2339
chromosome segregation DNA-binding protein
2328
Methylotenera mobilis JLW8
Mmol_2340
chromosome segregation ATPase
2329
Methylotenera mobilis JLW8
Mmol_2341
16S rRNA m(7)G-527 methyltransferase (EC 2.1....
2330
Methylotenera mobilis JLW8
Mmol_2342
glucose inhibited division protein A
2331
Methylotenera mobilis JLW8
Mmol_2343
Domain of unknown function DUF1924
2332
Methylotenera mobilis JLW8
Mmol_2344
protein of unknown function DUF1508
2333
Methylotenera mobilis JLW8
Mmol_2345
tRNA modification GTPase trmE
2334
Methylotenera mobilis JLW8
Mmol_2346
protein translocase subunit yidC
2335
Methylotenera mobilis JLW8
Mmol_2347
protein of unknown function DUF37
2336
Methylotenera mobilis JLW8
Mmol_2348
ribonuclease P protein component (EC 3.1.26.5)
2337
Methylotenera mobilis JLW8
Mmol_2349
LSU ribosomal protein L34P
2338 rows × 3 columns
In [7]:
gene_info[gene_info['locus_tag'].str.contains('Ga0081629')]
Out[7]:
genome
locus_tag
product
In [8]:
gene_info[gene_info['locus_tag'].str.contains('Ga0081607')]
Out[8]:
genome
locus_tag
product
In [9]:
gene_info[gene_info['locus_tag'].str.contains('Ga*')]
Out[9]:
genome
locus_tag
product
115803
Methylomonas sp. MK1
G006DRAFT_0002
ATP-dependent Clp endopeptidase, proteolytic ...
115804
Methylomonas sp. MK1
G006DRAFT_0003
phage major capsid protein, HK97 family
115805
Methylomonas sp. MK1
G006DRAFT_0004
hypothetical protein
115806
Methylomonas sp. MK1
G006DRAFT_0005
hypothetical protein
115807
Methylomonas sp. MK1
G006DRAFT_0006
phage tail tape measure protein, TP901 family,...
115808
Methylomonas sp. MK1
G006DRAFT_0007
hypothetical protein
115809
Methylomonas sp. MK1
G006DRAFT_0008
hypothetical protein
115810
Methylomonas sp. MK1
G006DRAFT_0009
hypothetical protein
115811
Methylomonas sp. MK1
G006DRAFT_0010
glucosylceramidase
115812
Methylomonas sp. MK1
G006DRAFT_0011
hypothetical protein
115813
Methylomonas sp. MK1
G006DRAFT_0012
chromosome partitioning protein
115814
Methylomonas sp. MK1
G006DRAFT_0013
hypothetical protein
115815
Methylomonas sp. MK1
G006DRAFT_0014
outer membrane protein, multidrug efflux system
115816
Methylomonas sp. MK1
G006DRAFT_0015
multidrug efflux pump
115817
Methylomonas sp. MK1
G006DRAFT_0016
membrane fusion protein, multidrug efflux system
115818
Methylomonas sp. MK1
G006DRAFT_0017
transcriptional regulator, TetR family
115819
Methylomonas sp. MK1
G006DRAFT_0019
Hemolysin activation/secretion protein
115820
Methylomonas sp. MK1
G006DRAFT_0020
SapC protein
115821
Methylomonas sp. MK1
G006DRAFT_0021
filamentous hemagglutinin family N-terminal d...
115822
Methylomonas sp. MK1
G006DRAFT_0022
mxaD protein
115823
Methylomonas sp. MK1
G006DRAFT_0023
hypothetical protein
115824
Methylomonas sp. MK1
G006DRAFT_0024
two component transcriptional regulator, LuxR ...
115825
Methylomonas sp. MK1
G006DRAFT_0025
methanol dehydrogenase (cytochrome) large sub...
115826
Methylomonas sp. MK1
G006DRAFT_0026
mxaJ protein
115827
Methylomonas sp. MK1
G006DRAFT_0027
cytochrome cL apoprotein
115828
Methylomonas sp. MK1
G006DRAFT_0028
methanol dehydrogenase (cytochrome) small sub...
115829
Methylomonas sp. MK1
G006DRAFT_0029
MoxR-like ATPase
115830
Methylomonas sp. MK1
G006DRAFT_0030
hypothetical protein
115831
Methylomonas sp. MK1
G006DRAFT_0031
hypothetical protein
115832
Methylomonas sp. MK1
G006DRAFT_0032
mxaA protein
...
...
...
...
212680
Methylotenera sp. N17
FG11DRAFT_2642
RNA polymerase, sigma 54 subunit, RpoN/SigL
212681
Methylotenera sp. N17
FG11DRAFT_2643
lipopolysaccharide export system ATP-binding ...
212682
Methylotenera sp. N17
FG11DRAFT_2644
lipopolysaccharide export system protein LptA
212683
Methylotenera sp. N17
FG11DRAFT_2645
lipopolysaccharide export system protein LptC
212684
Methylotenera sp. N17
FG11DRAFT_2646
3-deoxy-D-manno-octulosonate 8-phosphate phos...
212685
Methylotenera sp. N17
FG11DRAFT_2647
arabinose-5-phosphate isomerase
212686
Methylotenera sp. N17
FG11DRAFT_2648
monovalent cation:H+ antiporter-2, CPA2 family
212687
Methylotenera sp. N17
FG11DRAFT_2649
glycine oxidase
212688
Methylotenera sp. N17
FG11DRAFT_2650
Uncharacterized conserved protein YdhG, YjbR/...
212689
Methylotenera sp. N17
FG11DRAFT_2651
hypothetical protein
212690
Methylotenera sp. N17
FG11DRAFT_2652
hypothetical protein
212691
Methylotenera sp. N17
FG11DRAFT_2653
cyclic pyranopterin phosphate synthase
212692
Methylotenera sp. N17
FG11DRAFT_2654
Putative Zn-dependent protease, contains TPR ...
212693
Methylotenera sp. N17
FG11DRAFT_2655
thiosulfate oxidation carrier complex protein ...
212694
Methylotenera sp. N17
FG11DRAFT_2656
peroxiredoxin (alkyl hydroperoxide reductase ...
212695
Methylotenera sp. N17
FG11DRAFT_2657
alkyl hydroperoxide reductase subunit F
212696
Methylotenera sp. N17
FG11DRAFT_2658
hypothetical protein
212697
Methylotenera sp. N17
FG11DRAFT_2659
DnaA family protein
212698
Methylotenera sp. N17
FG11DRAFT_2660
Predicted PurR-regulated permease PerM
212699
Methylotenera sp. N17
FG11DRAFT_2661
phosphoribosylformylglycinamidine cyclo-ligase
212700
Methylotenera sp. N17
FG11DRAFT_2662
Protein of unknown function (DUF3108)
212701
Methylotenera sp. N17
FG11DRAFT_2663
Protein of unknown function (DUF3108)
212702
Methylotenera sp. N17
FG11DRAFT_2664
16S rRNA (cytosine967-C5)-methyltransferase
212703
Methylotenera sp. N17
FG11DRAFT_2665
Na+/H+-dicarboxylate symporter
212704
Methylotenera sp. N17
FG11DRAFT_2666
Predicted N-acetyltransferase YhbS
212705
Methylotenera sp. N17
FG11DRAFT_2667
hydrophobic/amphiphilic exporter-1, HAE1 family
212706
Methylotenera sp. N17
FG11DRAFT_2668
RND family efflux transporter, MFP subunit
212707
Methylotenera sp. N17
FG11DRAFT_2669
AmpD protein
212708
Methylotenera sp. N17
FG11DRAFT_2670
cation:H+ antiporter
212709
Methylotenera sp. N17
FG11DRAFT_2671
two-component system, NtrC family, response r...
34769 rows × 3 columns
In [10]:
'Ga0081607_104311
File "<ipython-input-10-2a14c53b90a0>", line 1
'Ga0081607_104311
^
SyntaxError: EOL while scanning string literal
In [ ]:
#gene_info.genome.str #.contains("ethylo").unique()
gene_info[gene_info.genome.str.contains('ethylo')]['genome'] #['genome'].unique()
In [ ]:
len(gene_info.genome.unique())
In [ ]:
network.shape
In [ ]:
network.head(2)
In [ ]:
# df['date'] = df['raw'].str.extract('(....-..-..)', expand=True)
network['locus tag'] = network['target'].str.extract('([A-z]+[0-9]+)_[0-9]+')
In [ ]:
network['source'].unique()
In [ ]:
network['locus tag'].unique()
In [ ]:
network.head(2) #genome.unique()
In [ ]:
network.head()
In [ ]:
joined = network.join(gene_info, how='left')
In [ ]:
print(network.shape)
joined.shape
In [ ]:
joined.head()
In [ ]:
joined.to_csv('/home/jmatsen/Neo4j_meta4/data/network_with_info.txt')
Trim out just genomes with "ethylo" in them.
In [ ]:
joined.genome.unique()
In [ ]:
#methylo_subset =
# df[df['A'].str.contains("hello")]
#methylo =
joined[joined.genome.str.contains('ethylo')]
In [ ]:
print(joined.shape)
print(methylo.shape)
In [ ]:
Content source: JanetMatsen/Neo4j_meta4
Similar notebooks: