In [1]:
from hmmerclust import hmmerclust
import settings
/Users/matt/.virtualenvs/hmmerclust_dev4/lib/python2.7/site-packages/Bio/SearchIO/__init__.py:211: BiopythonExperimentalWarning: Bio.SearchIO is an experimental submodule which may undergo significant changes prior to its future official release.
BiopythonExperimentalWarning)
In [2]:
ls genomes/
ls: genomes/: No such file or directory
In [3]:
genome_list = !ls ../data/genomes/
genome_dir = '../data/genomes/'
In [4]:
# Build the organism database from the listed GenBank files. The log below shows
# a combined FASTA being written and organism attributes being attached.
# NOTE(review): freshfasta=True presumably forces the combined FASTA to be
# regenerated rather than reused -- confirm against OrganismDB.
db = hmmerclust.OrganismDB(
    't3ss_database', genome_list, genome_dir, freshfasta=True)
making combined fasta for NC_002516.gb
5572 proteins were added
5572 unique proteins were added -- dropping redundant ones
making combined fasta for NC_002695.gb
10776 proteins were added
10776 unique proteins were added -- dropping redundant ones
making combined fasta for NC_003143.gb
14574 proteins were added
14574 unique proteins were added -- dropping redundant ones
making combined fasta for NC_003198.gb
18685 proteins were added
18685 unique proteins were added -- dropping redundant ones
making combined fasta for NC_007645.gb
24962 proteins were added
24945 unique proteins were added -- dropping redundant ones
making combined fasta for NC_007650.gb
27318 proteins were added
27301 unique proteins were added -- dropping redundant ones
making combined fasta for NC_007651.gb
30594 proteins were added
30577 unique proteins were added -- dropping redundant ones
making combined fasta for NC_007760.gb
35020 proteins were added
35002 unique proteins were added -- dropping redundant ones
making combined fasta for NC_010287.gb
35905 proteins were added
35887 unique proteins were added -- dropping redundant ones
making combined fasta for NC_013722.gb
39019 proteins were added
39001 unique proteins were added -- dropping redundant ones
making combined fasta for NC_013971.gb
42326 proteins were added
42306 unique proteins were added -- dropping redundant ones
making combined fasta for NC_015677.gb
46171 proteins were added
46150 unique proteins were added -- dropping redundant ones
making combined fasta for NC_018518.gb
49788 proteins were added
49563 unique proteins were added -- dropping redundant ones
making combined fasta for NC_022904.gb
54559 proteins were added
54319 unique proteins were added -- dropping redundant ones
Adding organism attributes for NC_002516.gb
Adding organism attributes for NC_002695.gb
Adding organism attributes for NC_003143.gb
Adding organism attributes for NC_003198.gb
Adding organism attributes for NC_007645.gb
Adding organism attributes for NC_007650.gb
Adding organism attributes for NC_007651.gb
Adding organism attributes for NC_007760.gb
Adding organism attributes for NC_010287.gb
Adding organism attributes for NC_013722.gb
Adding organism attributes for NC_013971.gb
Adding organism attributes for NC_015677.gb
Adding organism attributes for NC_018518.gb
Adding organism attributes for NC_022904.gb
/Users/matt/.virtualenvs/hmmerclust_dev4/lib/python2.7/site-packages/Bio/GenBank/Scanner.py:983: BiopythonParserWarning: Premature end of file in sequence data
BiopythonParserWarning)
The `organisms` attribute of the db is a list of `Organism` objects:
In [5]:
# One Organism object per genome file loaded above.
db.organisms
Out[5]:
[<hmmerclust.hmmerclust.Organism at 0x106b382d0>,
<hmmerclust.hmmerclust.Organism at 0x101fb23d0>,
<hmmerclust.hmmerclust.Organism at 0x104ede050>,
<hmmerclust.hmmerclust.Organism at 0x105087050>,
<hmmerclust.hmmerclust.Organism at 0x105087110>,
<hmmerclust.hmmerclust.Organism at 0x104ede550>,
<hmmerclust.hmmerclust.Organism at 0x106b381d0>,
<hmmerclust.hmmerclust.Organism at 0x101ba5cd0>,
<hmmerclust.hmmerclust.Organism at 0x105aa4050>,
<hmmerclust.hmmerclust.Organism at 0x101ba5fd0>,
<hmmerclust.hmmerclust.Organism at 0x101ba5950>,
<hmmerclust.hmmerclust.Organism at 0x1048962d0>,
<hmmerclust.hmmerclust.Organism at 0x1048968d0>,
<hmmerclust.hmmerclust.Organism at 0x109068e90>]
Each `Organism` object encapsulates the following data:
In [6]:
# Show every attribute stored on the second organism (proteins and loci are still empty here).
vars(db.organisms[1])
Out[6]:
{'accesion_version': 'NC_002695.1',
'accession': 'NC_002695',
'clazz': 'Gammaproteobacteria',
'description': 'Escherichia coli O157:H7 str. Sakai chromosome, complete genome.',
'family': 'Enterobacteriaceae',
'genome_length': 5498450,
'genome_path': '../data/genomes/NC_002695.gb',
'genus': 'Escherichia',
'kingdom': 'Bacteria',
'loci': [],
'name': 'Escherichia coli O157:H7 str. Sakai',
'order': 'Enterobacteriales',
'parent_db': <hmmerclust.hmmerclust.OrganismDB instance at 0x104627e18>,
'phylum': 'Proteobacteria',
'proteins': [],
'rRNA16S_sequence': None,
'species': 'Escherichia coli',
'taxonomy': ['Bacteria',
'Proteobacteria',
'Gammaproteobacteria',
'Enterobacteriales',
'Enterobacteriaceae',
'Escherichia'],
'tree_order': 0}
In [7]:
# Note that a combined_fasta file was created when the db was built.
!ls
16S_aligned.csv group_fastas locus_fastas
__init__.py hhsearch_results out.svg
alignments hmm settings.py
combined_fasta hmmerclust_demo.ipynb settings.pyc
In [8]:
# Seed alignments used to build the HMMs, one file per query protein.
!ls alignments
InvA_PF00771_seed.txt OrgB_PB004806.txt SipC_PF09599_seed.txt
InvC_PF00006_seed.txt PrgH_PF09480_seed.txt SipD_PF06511_seed.txt
InvE_PF07201_seed.txt PrgI_PF09392_seed.txt SpaO_PF01052_seed.txt
InvG_PF00263_seed.txt PrgJ_PB000379.txt SpaP_PF00813_seed.txt
InvH_PF04741_seed.txt PrgK_PF01514_seed.txt SpaQ_PF01313_seed.txt
InvJ_PF02510_seed.txt PscP_PF02120_seed.txt SpaR_PF01311_seed.txt
OrgA_PF09482_seed.txt SipB_PF04888_seed.txt SpaS_PF01312_seed.txt
In [9]:
# Path of the combined FASTA that was written when the OrganismDB was created.
combined_fasta = './combined_fasta'
# Build the HMMs, search the combined FASTA and attach the hits to the organisms
# in `db` (see the log below).
# NOTE(review): freshbuild / freshsearch presumably force the HMM build and the
# search to be redone instead of reusing earlier results -- confirm.
s = hmmerclust.HmmSearch(db, combined_fasta, freshbuild=True, freshsearch=True)
building Hmm for InvA_PF00771_seed.txt
building Hmm for InvC_PF00006_seed.txt
building Hmm for InvE_PF07201_seed.txt
building Hmm for InvG_PF00263_seed.txt
building Hmm for InvH_PF04741_seed.txt
building Hmm for InvJ_PF02510_seed.txt
building Hmm for OrgA_PF09482_seed.txt
building Hmm for OrgB_PB004806.txt
building Hmm for PrgH_PF09480_seed.txt
building Hmm for PrgI_PF09392_seed.txt
building Hmm for PrgJ_PB000379.txt
building Hmm for PrgK_PF01514_seed.txt
building Hmm for PscP_PF02120_seed.txt
building Hmm for SipB_PF04888_seed.txt
building Hmm for SipC_PF09599_seed.txt
building Hmm for SipD_PF06511_seed.txt
building Hmm for SpaO_PF01052_seed.txt
building Hmm for SpaP_PF00813_seed.txt
building Hmm for SpaQ_PF01313_seed.txt
building Hmm for SpaR_PF01311_seed.txt
building Hmm for SpaS_PF01312_seed.txt
hhbuild complete for ['InvA', 'InvC', 'InvE', 'InvG', 'InvH', 'InvJ', 'OrgA', 'OrgB', 'PrgH', 'PrgI', 'PrgJ', 'PrgK', 'PscP', 'SipB', 'SipC', 'SipD', 'SpaO', 'SpaP', 'SpaQ', 'SpaR', 'SpaS']
running HHsearch on InvA
running HHsearch on InvC
running HHsearch on InvE
running HHsearch on InvG
running HHsearch on InvH
running HHsearch on InvJ
running HHsearch on OrgA
running HHsearch on OrgB
running HHsearch on PrgH
running HHsearch on PrgI
running HHsearch on PrgJ
running HHsearch on PrgK
running HHsearch on PscP
running HHsearch on SipB
running HHsearch on SipC
running HHsearch on SipD
running HHsearch on SpaO
running HHsearch on SpaP
running HHsearch on SpaQ
running HHsearch on SpaR
running HHsearch on SpaS
extracted 39 hits for InvA.out
extracted 296 hits for InvC.out
extracted 19 hits for InvE.out
extracted 63 hits for InvG.out
extracted 1 hits for InvH.out
extracted 3 hits for InvJ.out
extracted 8 hits for OrgA.out
extracted 2 hits for OrgB.out
extracted 12 hits for PrgH.out
extracted 41 hits for PrgI.out
extracted 13 hits for PrgJ.out
extracted 41 hits for PrgK.out
extracted 47 hits for PscP.out
extracted 29 hits for SipB.out
extracted 4 hits for SipC.out
extracted 8 hits for SipD.out
extracted 59 hits for SpaO.out
extracted 39 hits for SpaP.out
extracted 38 hits for SpaQ.out
extracted 40 hits for SpaR.out
extracted 42 hits for SpaS.out
adding proteins to organism NC_002516
adding proteins to organism NC_002695
adding proteins to organism NC_003143
adding proteins to organism NC_003198
adding proteins to organism NC_007645
adding proteins to organism NC_007650
adding proteins to organism NC_007651
adding proteins to organism NC_007760
adding proteins to organism NC_010287
adding proteins to organism NC_013722
adding proteins to organism NC_013971
adding proteins to organism NC_015677
adding proteins to organism NC_018518
adding proteins to organism NC_022904
adding SearchIO hit objects for NC_002516
adding SearchIO hit objects for NC_002695
adding SearchIO hit objects for NC_003143
adding SearchIO hit objects for NC_003198
adding SearchIO hit objects for NC_007645
adding SearchIO hit objects for NC_007650
adding SearchIO hit objects for NC_007651
adding SearchIO hit objects for NC_007760
adding SearchIO hit objects for NC_010287
adding SearchIO hit objects for NC_013722
adding SearchIO hit objects for NC_013971
adding SearchIO hit objects for NC_015677
adding SearchIO hit objects for NC_018518
adding SearchIO hit objects for NC_022904
setting best hit values for Pseudomonas aeruginosa PAO1
setting best hit values for Escherichia coli O157:H7 str. Sakai
setting best hit values for Yersinia pestis CO92
setting best hit values for Salmonella enterica subsp. enterica serovar Typhi str. CT18
setting best hit values for Hahella chejuensis KCTC 2396
setting best hit values for Burkholderia thailandensis E264
setting best hit values for Burkholderia thailandensis E264
setting best hit values for Anaeromyxobacter dehalogenans 2CP-C
setting best hit values for Chlamydia trachomatis 434/Bu
setting best hit values for Xanthomonas albilineans GPE PC73
setting best hit values for Erwinia amylovora ATCC 49946
setting best hit values for Ramlibacter tataouinensis TTB310
setting best hit values for Bordetella pertussis 18323
setting best hit values for Pandoraea pnomenusa 3kgm
In [10]:
# After the search, the organism's `proteins` list holds the hit proteins.
vars(db.organisms[1])
Out[10]:
{'accesion_version': 'NC_002695.1',
'accession': 'NC_002695',
'clazz': 'Gammaproteobacteria',
'description': 'Escherichia coli O157:H7 str. Sakai chromosome, complete genome.',
'family': 'Enterobacteriaceae',
'genome_length': 5498450,
'genome_path': '../data/genomes/NC_002695.gb',
'genus': 'Escherichia',
'kingdom': 'Bacteria',
'loci': [],
'name': 'Escherichia coli O157:H7 str. Sakai',
'order': 'Enterobacteriales',
'parent_db': <hmmerclust.hmmerclust.OrganismDB instance at 0x104627e18>,
'phylum': 'Proteobacteria',
'proteins': [NP_311099.1 - ABC transporter ATP-binding protein,
NP_310807.1 - hypothetical protein,
NP_312703.1 - ATP synthase F0F1 subunit alpha,
NP_312230.1 - ABC transporter ATP-binding protein,
NP_311368.2 - glycine cleavage system transcriptional repressor,
NP_313267.1 - hypothetical protein,
NP_312611.1 - hypothetical protein,
NP_309773.1 - oligopeptide ABC transporter ATP-binding protein,
NP_309779.1 - transporter,
NP_312226.1 - hypothetical protein,
NP_311745.1 - EprI,
NP_311753.1 - surface presentation of antigens protein SpaO,
NP_312260.1 - outer membrane porin HofQ,
NP_311743.1 - EprK,
NP_311757.1 - ATP synthase SpaL,
NP_311751.1 - EpaQ,
NP_311759.1 - EivE,
NP_312472.2 - hypothetical protein,
NP_312579.1 - protein EscF,
NP_310061.1 - potassium-tellurite ethidium and proflavin transporter,
NP_308962.1 - putrescine transporter ATP-binding protein,
NP_312609.1 - EscS,
NP_310704.1 - flagellar MS-ring protein,
NP_312084.1 - ATP-dependent metalloprotease,
NP_310682.1 - amino-acid ABC transporter ATP-binding protein,
NP_308654.1 - iron-enterobactin transporter ATP-binding protein,
NP_309004.1 - recombination factor protein RarA,
NP_312595.1 - EscN,
NP_310714.1 - flagellar biosynthesis protein FliP,
NP_310716.1 - flagellar biosynthesis protein FliR,
NP_308868.1 - tail assembly protein,
NP_310712.1 - flagellar motor switch protein FliN,
NP_310616.1 - flagellar biosynthesis protein FlhA,
NP_308283.1 - FhiA protein,
NP_309894.1 - peptide ABC transporter ATP-binding protein,
NP_312608.1 - EscT,
NP_308440.2 - ferric transporter ATP-binding protein,
NP_312599.1 - hypothetical protein,
NP_310114.1 - ABC transporter ATP-binding protein,
NP_312885.1 - ATP-dependent protease ATP-binding protein HslU,
NP_312610.1 - type III secretion system protein,
NP_310715.1 - flagellar biosynthesis protein FliQ,
NP_309863.1 - anthranilate synthase component I,
NP_311754.1 - EivJ,
NP_311758.1 - EivA,
NP_312602.1 - EscC,
NP_309060.1 - ABC transporter ATPase,
NP_312701.1 - ATP synthase F0F1 subunit beta,
NP_308935.1 - glutathione transporter ATP-binding protein,
NP_311746.1 - EprH,
NP_312743.1 - transcription termination factor Rho,
NP_311640.1 - hypothetical protein,
NP_310711.1 - flagellar motor switch protein FliM,
NP_311744.1 - EprJ,
NP_311742.1 - hypothetical protein,
NP_312592.1 - SepQ,
NP_312607.1 - secretion system apparatus protein SsaU,
NP_310617.1 - flagellar biosynthesis protein FlhB,
NP_308520.1 - DNA-binding ATP-dependent protease La,
NP_308681.1 - citrate lyase subunit alpha,
NP_309724.1 - ferric enterobactin transport ATP-binding protein,
NP_312600.1 - EscJ,
NP_309774.1 - hypothetical protein,
NP_311760.1 - EivG,
NP_312584.1 - SepL,
NP_310707.1 - flagellum-specific ATP synthase,
NP_310391.1 - lipoprotein,
NP_313107.1 - phosphonate C-P lyase system protein PhnK,
NP_311440.1 - ABC transporter ATP-binding protein,
NP_312596.1 - hypothetical protein,
NP_312582.1 - protein EspD,
NP_311752.1 - surface presentation of antigens protein SpaP,
NP_312581.1 - protein EspB,
NP_310709.1 - flagellar hook-length control protein,
NP_311748.1 - surface presentation of antigens protein SpaS,
NP_313376.1 - ABC transporter ATP-binding protein],
'rRNA16S_sequence': None,
'species': 'Escherichia coli',
'taxonomy': ['Bacteria',
'Proteobacteria',
'Gammaproteobacteria',
'Enterobacteriales',
'Enterobacteriaceae',
'Escherichia'],
'tree_order': 0}
In [11]:
# Group the hits into loci for every organism; per-organism counts are printed below.
# NOTE(review): 5 and 15000 are presumably the minimum number of hits per locus and
# the maximum gap (bp) between neighbouring hits -- confirm against OrganismDB.find_loci.
db.find_loci(5, 15000, colordict=settings.COLOUR_DICT)
finding loci for Pseudomonas aeruginosa PAO1
total of 2 found for Pseudomonas aeruginosa PAO1
finding loci for Escherichia coli O157:H7 str. Sakai
total of 3 found for Escherichia coli O157:H7 str. Sakai
finding loci for Yersinia pestis CO92
total of 3 found for Yersinia pestis CO92
finding loci for Salmonella enterica subsp. enterica serovar Typhi str. CT18
total of 3 found for Salmonella enterica subsp. enterica serovar Typhi str. CT18
finding loci for Hahella chejuensis KCTC 2396
total of 4 found for Hahella chejuensis KCTC 2396
finding loci for Burkholderia thailandensis E264
total of 3 found for Burkholderia thailandensis E264
finding loci for Burkholderia thailandensis E264
total of 1 found for Burkholderia thailandensis E264
finding loci for Anaeromyxobacter dehalogenans 2CP-C
total of 2 found for Anaeromyxobacter dehalogenans 2CP-C
finding loci for Chlamydia trachomatis 434/Bu
total of 1 found for Chlamydia trachomatis 434/Bu
finding loci for Xanthomonas albilineans GPE PC73
total of 2 found for Xanthomonas albilineans GPE PC73
finding loci for Erwinia amylovora ATCC 49946
total of 5 found for Erwinia amylovora ATCC 49946
finding loci for Ramlibacter tataouinensis TTB310
total of 1 found for Ramlibacter tataouinensis TTB310
finding loci for Bordetella pertussis 18323
total of 2 found for Bordetella pertussis 18323
finding loci for Pandoraea pnomenusa 3kgm
total of 4 found for Pandoraea pnomenusa 3kgm
In [12]:
# The loci attribute is now populated (it was an empty list before find_loci ran).
db.organisms[1].loci
Out[12]:
[<hmmerclust.hmmerclust.Locus instance at 0x1061ca0e0>,
<hmmerclust.hmmerclust.Locus instance at 0x108d2a248>,
<hmmerclust.hmmerclust.Locus instance at 0x104dae368>]
In [13]:
# Collect the organism / protein / hit / locus data held in `db` into one table.
# Note: `df` is the FinalDataFrame wrapper; the pandas DataFrame itself is `df.df`.
df = hmmerclust.FinalDataFrame(db)
Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
...
826, 827, 828, 829, 830, 831, 832, 833, 834, 835],
dtype='int64', length=836)
In [14]:
# Preview the first rows of the underlying pandas DataFrame.
df.df.head()
Out[14]:
org_name
org_acc
org_phylum
org_class
org_order
org_family
org_genus
org_species
org_tree_order
org_genome_length
...
prot_acc
prot_gi
prot_product
prot_translation
prot_numb_of_res
hit_query
hit_evalue
hit_bitscore
hit_bias
locus_id
0
Pseudomonas aeruginosa PAO1
NC_002516
Proteobacteria
Gammaproteobacteria
Pseudomonadales
Pseudomonadaceae
Pseudomonas
Pseudomonas aeruginosa
0
6264404
...
NP_250132.1
15596638
hypothetical protein
MAVAPGVLLPPTPDVKPKAAAPKSQQKTPEPSNDKTSSFSDMYAKE...
427
PscP
1.100000e-21
78.1
1.1
<hmmerclust.hmmerclust.Locus instance at 0x104...
1
Pseudomonas aeruginosa PAO1
NC_002516
Proteobacteria
Gammaproteobacteria
Pseudomonadales
Pseudomonadaceae
Pseudomonas
Pseudomonas aeruginosa
0
6264404
...
NP_250414.1
15596920
type III export protein PscJ
MRRTVKGLSRMALLALVLALGGCKVELYTGISQKEGNEMLALLRSE...
248
PrgK
2.300000e-72
244.3
0.2
<hmmerclust.hmmerclust.Locus instance at 0x105...
2
Pseudomonas aeruginosa PAO1
NC_002516
Proteobacteria
Gammaproteobacteria
Pseudomonadales
Pseudomonadaceae
Pseudomonas
Pseudomonas aeruginosa
0
6264404
...
NP_250134.1
15596640
flagellar motor switch protein FliM
MAVQDLLSQDEIDALLHGVDDGLVETEVEATPGSVKSYDLTSQDRI...
323
SpaO
1.300000e-17
64.9
0.2
<hmmerclust.hmmerclust.Locus instance at 0x104...
3
Pseudomonas aeruginosa PAO1
NC_002516
Proteobacteria
Gammaproteobacteria
Pseudomonadales
Pseudomonadaceae
Pseudomonas
Pseudomonas aeruginosa
0
6264404
...
NP_249795.1
15596301
flagellum-specific ATP synthase
MRLERTSFARRLEGYTEAVSLPAQPVVEGRLLRMVGLTLEAEGLQA...
451
InvC
1.700000e-72
245.2
0.0
None
4
Pseudomonas aeruginosa PAO1
NC_002516
Proteobacteria
Gammaproteobacteria
Pseudomonadales
Pseudomonadaceae
Pseudomonas
Pseudomonas aeruginosa
0
6264404
...
NP_250138.1
15596644
flagellar biosynthesis protein FliQ
MTPEVALDLFREALWLTAMIVGVLVVPSLLVGLVVAMFQAATQINE...
89
SpaQ
1.800000e-29
103.0
8.8
<hmmerclust.hmmerclust.Locus instance at 0x104...
5 rows × 22 columns
In [15]:
# Show every column of the first row. `.ix` has been removed from pandas; the frame
# uses the default integer index, so label 0 via `.loc` selects the same row.
df.df.loc[0]
Out[15]:
org_name Pseudomonas aeruginosa PAO1
org_acc NC_002516
org_phylum Proteobacteria
org_class Gammaproteobacteria
org_order Pseudomonadales
org_family Pseudomonadaceae
org_genus Pseudomonas
org_species Pseudomonas aeruginosa
org_tree_order 0
org_genome_length 6264404
org_prot_count 82
org_numb_loci 2
prot_acc NP_250132.1
prot_gi 15596638
prot_product hypothetical protein
prot_translation MAVAPGVLLPPTPDVKPKAAAPKSQQKTPEPSNDKTSSFSDMYAKE...
prot_numb_of_res 427
hit_query PscP
hit_evalue 1.1e-21
hit_bitscore 78.1
hit_bias 1.1
locus_id <hmmerclust.hmmerclust.Locus instance at 0x104...
Name: 0, dtype: object
In [16]:
%pylab
%matplotlib inline
figsize(20,10)
Using matplotlib backend: MacOSX
Populating the interactive namespace from numpy and matplotlib
In [17]:
# Bar chart of the number of distinct loci identified in each taxonomic family.
figsize(5, 5)
df.df.groupby('org_family')['locus_id'].nunique().plot(kind='bar')
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x1062b2150>
In [18]:
figsize(15, 5)
# E-values of all InvG hits, sorted ascending and drawn on a log scale.
# Series.order() has been removed from pandas; sort_values() is its replacement
# (available from pandas 0.17 onwards).
df.df[df.df.hit_query == 'InvG'].hit_evalue.sort_values().plot(logy=True, kind='bar')
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x106063850>
In [19]:
# Heat map with one row per locus. The object is kept in a variable because later
# cells read its `unstacked_df` attribute.
# NOTE(review): subset=['InvG'] presumably restricts the rows to loci that contain
# an InvG hit -- confirm against HeatMap.
InvG_only = hmmerclust.HeatMap(
    df.df,
    by_locus=True,
    cols=settings.HEATMAP_COLUMNS,
    singleletters=settings.HEATMAP_ABBREVIATIONS,
    subset=['InvG'])
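Notice that if we set by_locus=False, the rows are no longer organised by locus; a hit can be anywhere in the genome. In this case we see a lot of InvC hits, which are probably homologous ATPases found elsewhere in the genome (e.g. flagellar and F0F1 ATP synthases, ABC-transporter ATP-binding proteins) rather than T3SS components.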
In [20]:
# Same heat map without grouping by locus: hits anywhere in the genome are counted.
hmmerclust.HeatMap(
    df.df,
    by_locus=False,
    cols=settings.HEATMAP_COLUMNS,
    singleletters=settings.HEATMAP_ABBREVIATIONS)
Out[20]:
<hmmerclust.hmmerclust.HeatMap instance at 0x10737a758>
In [21]:
# The heat map is built from this unstacked pandas DataFrame.
InvG_only.unstacked_df.head()
Out[21]:
hit_query
InvC
SpaO
SpaP
SpaQ
SpaR
SpaS
InvA
PscP
OrgB
OrgA
...
PrgK
PrgH
InvG
InvH
InvE
PrgI
InvJ
SipC
SipB
SipD
org_species
locus_id
org_tree_order
Yersinia pestis
<hmmerclust.hmmerclust.Locus instance at 0x1053dd248>
0
1
1
1
1
1
1
1
0
0
0
...
1
1
1
0
0
2
0
0
0
0
Xanthomonas albilineans
<hmmerclust.hmmerclust.Locus instance at 0x106979050>
0
1
1
1
1
1
1
1
0
0
1
...
1
1
1
0
1
2
0
1
1
1
Salmonella enterica
<hmmerclust.hmmerclust.Locus instance at 0x106600128>
0
1
1
1
1
1
1
1
0
0
1
...
1
1
1
1
1
1
1
1
1
1
<hmmerclust.hmmerclust.Locus instance at 0x10652e1b8>
0
1
1
1
1
1
1
1
0
0
0
...
1
0
1
0
1
2
0
0
2
0
Ramlibacter tataouinensis
<hmmerclust.hmmerclust.Locus instance at 0x105e29050>
0
2
1
1
1
1
1
1
1
0
0
...
1
0
1
0
0
0
0
0
0
0
5 rows × 21 columns
We can grab all the loci from this DataFrame:
In [22]:
# Move the index levels back into columns and select the column of Locus objects.
InvG_only.unstacked_df.reset_index()['locus_id']
Out[22]:
0 <hmmerclust.hmmerclust.Locus instance at 0x105...
1 <hmmerclust.hmmerclust.Locus instance at 0x106...
2 <hmmerclust.hmmerclust.Locus instance at 0x106...
3 <hmmerclust.hmmerclust.Locus instance at 0x106...
4 <hmmerclust.hmmerclust.Locus instance at 0x105...
5 <hmmerclust.hmmerclust.Locus instance at 0x105...
6 <hmmerclust.hmmerclust.Locus instance at 0x106...
7 <hmmerclust.hmmerclust.Locus instance at 0x106...
8 <hmmerclust.hmmerclust.Locus instance at 0x105...
9 <hmmerclust.hmmerclust.Locus instance at 0x106...
10 <hmmerclust.hmmerclust.Locus instance at 0x105...
11 <hmmerclust.hmmerclust.Locus instance at 0x108...
12 <hmmerclust.hmmerclust.Locus instance at 0x104...
13 <hmmerclust.hmmerclust.Locus instance at 0x106...
14 <hmmerclust.hmmerclust.Locus instance at 0x105...
15 <hmmerclust.hmmerclust.Locus instance at 0x104...
16 <hmmerclust.hmmerclust.Locus instance at 0x105...
17 <hmmerclust.hmmerclust.Locus instance at 0x104...
18 <hmmerclust.hmmerclust.Locus instance at 0x101...
19 <hmmerclust.hmmerclust.Locus instance at 0x106...
20 <hmmerclust.hmmerclust.Locus instance at 0x106...
Name: locus_id, dtype: object
In [23]:
# Plain Python list of the Locus objects shown in the heat map rows.
invg_loci = InvG_only.unstacked_df.reset_index()['locus_id'].tolist()
Then visualize them:
In [24]:
figsize(15, 5)
# Render each locus collected in invg_loci in turn; LocusView prints the
# per-protein hit table shown below for every locus.
for locus in invg_loci:
    hmmerclust.LocusView(locus)
----------------------------------------------------------------------
----------------------------------------------------------------------
Organism: Yersinia pestis CO92
Locus id: 4382904904
accession query evalue bitscore bias name
0 YP_002345337.1 InvG 3e-32 113.1 4.7 type III secretion pro
1 YP_002345338.1 PrgH 0.61 10.7 0.1 type-III secretion pro
2 YP_002345339.1 - - - - type III secretion app
3 YP_002345341.1 PrgI 6.9e-06 28.3 0.5 type III secretion app
4 YP_002345342.1 - - - - type III secretion app
5 YP_002345343.1 PrgI 1e-10 43.8 2.7 type III secretion app
6 YP_002345344.1 PrgK 9.8e-58 196.5 1.2 type III secretion sys
7 YP_002345345.1 - - - - hypothetical protein
8 YP_002345346.1 - - - - type III secretion sys
9 YP_002345347.1 InvA 1e-226 756.2 4.3 secretion system appar
10 YP_002345348.1 InvC 6.4e-71 240.1 0 type III secretion sys
11 YP_002345349.1 - - - - type III secretion sys
12 YP_002345351.1 SpaO 1e-19 71.6 0 type III secretion sys
13 YP_002345352.1 SpaP 8.1e-71 239.5 10.6 type III secretion sys
14 YP_002345353.1 SpaQ 2.2e-27 96.3 14.1 type III secretion app
15 YP_002345354.1 SpaR 6.4e-62 210.8 23.1 type III secretion app
16 YP_002345355.1 SpaS 2.7e-110 370.3 0.7 secretion system appar
----------------------------------------------------------------------
----------------------------------------------------------------------
Organism: Xanthomonas albilineans GPE PC73
Locus id: 4405563472
accession query evalue bitscore bias name
0 YP_003375972.1 SipD 4.9e-21 76.9 0.1 xsa-invasion protein
1 YP_003375973.1 - - - - xsa-associated protein
2 YP_003375974.1 - - - - xsa-associated protein
3 YP_003375975.1 - - - - xsa-associated protein
4 YP_003375976.1 - - - - xsa-associated protein
5 YP_003375977.1 - - - - hypothetical protein
6 YP_003375978.1 - - - - hypothetical protein
7 YP_003375979.1 - - - - oxygen-regulated invas
8 YP_003375980.1 OrgA 9.5e-20 72.8 0 oxygen-regulated invas
9 YP_003375981.1 PrgK 3.9e-39 135.8 0.1 xanthomonas secretion
10 YP_003375982.1 PrgI 2.7e-07 32.8 0.1 xanthomonas secretion
11 YP_003375983.1 PrgI 6.3e-08 34.8 0.5 xanthomonas secretion
12 YP_003375984.1 PrgH 2.6e-60 206.1 0 xanthomonas secretion
13 YP_003375985.1 - - - - xanthomonas secretion
14 YP_003375986.1 InvG 1.8e-25 91 0.9 xanthomonas secretion
15 YP_003375987.1 InvE 1.4e-20 75.8 6 xanthomonas secretion
16 YP_003375988.1 InvA 7.3e-203 677.3 5.4 xanthomonas secretion
17 YP_003375989.1 - - - - xanthomonas secretion
18 YP_003375990.1 InvC 2.3e-63 215.4 0.1 xanthomonas secretion
19 YP_003375991.1 - - - - xsa-associated protein
20 YP_003375992.1 - - - - xsa-associated protein
21 YP_003375993.1 SpaO 3.6e-19 69.9 0.1 xanthomonas secretion
22 YP_003375994.1 SpaP 4.3e-62 211.1 11.4 xanthomonas secretion
23 YP_003375995.1 SpaQ 9.8e-26 91 8.2 xanthomonas secretion
24 YP_003375996.1 SpaR 1.4e-40 140.9 20.1 xanthomonas secretion
25 YP_003375997.1 SpaS 1.6e-83 282.3 0.4 xanthomonas secretion
26 YP_003375998.1 - - - - xsa-invasion chaperone
27 YP_003375999.1 SipB 5.8e-26 93.4 4.2 xsa-invasion protein
28 YP_003376000.1 SipC 5.8e-17 63.5 26 xsa-invasion protein
----------------------------------------------------------------------
----------------------------------------------------------------------
Organism: Salmonella enterica subsp. enterica serovar Typhi str. CT18
Locus id: 4401922344
accession query evalue bitscore bias name
0 NP_457264.1 OrgA 5.9e-70 236.4 3.9 cell invasion protein
1 NP_457265.1 PrgK 8.7e-59 199.9 0 pathogenicity 1 island
2 NP_457266.1 PrgJ 1.1e-07 33.6 0 pathogenicity 1 island
3 NP_457267.1 PrgI 7.1e-10 41.1 0.1 pathogenicity 1 island
4 NP_457268.1 PrgH 1.1e-152 510.1 0 pathogenicity 1 island
5 NP_457269.1 - - - - AraC family transcript
6 NP_457270.1 - - - - invasion protein regul
7 NP_457271.1 - - - - cell invasion protein
8 NP_457272.1 - - - - tyrosine phosphatase
9 NP_457273.1 - - - - chaperone protein
10 NP_457274.1 - - - - hypothetical protein
11 NP_457275.1 - - - - acyl carrier protein
12 NP_457276.1 - - - - pathogenicity island 1
13 NP_457277.1 SipD 2.7e-175 584 11 pathogenicity island 1
14 NP_457278.1 SipC 2.3e-135 452.8 34.3 pathogenicity island 1
15 NP_457279.1 SipB 1.4e-70 239.9 24.3 pathogenicity island 1
16 NP_457280.1 - - - - chaperone protein SicA
17 NP_457281.1 SpaS 3.4e-117 393 4.4 secretory protein
18 NP_457282.1 SpaR 1.7e-78 265.2 16.6 secretory protein
19 NP_457283.1 SpaQ 2.2e-25 89.9 0.8 secretory protein
20 NP_457284.1 SpaP 3.3e-66 224.5 3.7 secretory protein
21 NP_457285.1 SpaO 1.8e-16 61.2 0 surface presentation o
22 NP_457286.1 InvJ 4.8e-213 708.1 7.2 surface presentation o
23 NP_457287.1 - - - - secretory protein
24 NP_457288.1 InvC 9.1e-70 236.3 0 secretory apparatus AT
25 NP_457289.1 - - - - secretory protein
26 NP_457290.1 InvA 2.1e-224 748.5 9.3 secretory protein
27 NP_457291.1 InvE 5.3e-42 145.5 0.4 cell invasion protein
28 NP_457292.1 InvG 1.6e-30 107.4 0.6 secretory protein
29 NP_457293.1 - - - - AraC family transcript
30 NP_457294.1 InvH 6.8e-96 319.8 6 cell adherance/invasio
----------------------------------------------------------------------
----------------------------------------------------------------------
Organism: Salmonella enterica subsp. enterica serovar Typhi str. CT18
Locus id: 4401062328
accession query evalue bitscore bias name
0 NP_456106.1 SpaS 1.5e-127 427 2.5 type III secretion pro
1 NP_456107.1 SpaR 1.6e-76 258.7 17.8 type III secretion pro
2 NP_456108.1 SpaQ 9.2e-26 91.1 12.9 type III secretion pro
3 NP_456109.1 SpaP 2.9e-67 227.9 7.6 type III secretion pro
4 NP_456110.1 SpaO 1.6e-18 67.8 0 type III secretion pro
5 NP_456111.1 - - - - type III secretion pro
6 NP_456112.1 - - - - type III secretion pro
7 NP_456113.1 InvC 9.7e-69 232.9 0 type III secretion ATP
8 NP_456114.1 InvA 2.6e-209 698.7 12.3 type III secretion pro
9 NP_456115.1 - - - - pathogenicity island p
10 NP_456116.1 InvE 1.7e-20 75.5 5.9 secretion system prote
11 NP_456117.1 - - - - pathogenicity island p
12 NP_456118.1 - - - - pathogenicity island p
13 NP_456119.1 PrgK 7e-64 216.6 0.3 pathogenicity island l
14 NP_456120.1 PrgI 2.6e-07 32.9 1.4 pathogenicity island p
15 NP_456121.1 - - - - pathogenicity island p
16 NP_456122.1 PrgI 3.2e-08 35.8 0.4 pathogenicity island p
17 NP_456123.1 - - - - pathogenicity island e
18 NP_456124.1 - - - - pathogenicity island e
19 NP_456125.1 - - - - pathogenicity island p
20 NP_456126.1 - - - - pathogenicity island e
21 NP_456127.1 SipB 0.53 11.4 7.5 pathogenicity island e
22 NP_456128.1 SipB 1.8e-65 223.1 19.4 pathogenicity island e
23 NP_456129.1 - - - - type III secretion sys
24 NP_456130.1 - - - - pathogenicity island e
25 NP_456131.1 - - - - pathogenicity island p
26 NP_456132.1 - - - - secretion system prote
27 NP_456133.1 - - - - pathogenicity island p
28 NP_456134.1 InvG 4e-29 102.9 3.6 outer membrane secreto
----------------------------------------------------------------------
----------------------------------------------------------------------
Organism: Ramlibacter tataouinensis TTB310
Locus id: 4393701456
accession query evalue bitscore bias name
0 WP_013902302.1 SpaQ 6.8e-21 75.5 7.5 aldolase
1 WP_013902303.1 SpaP 1.1e-64 219.5 7.8 flagellar biosynthesis
2 WP_013902304.1 SpaO 2.7e-13 51.1 0 translocation protein
3 WP_041676481.1 PscP 0.00084 20.7 0 hypothetical protein
4 WP_041676482.1 InvA 2.4e-218 728.5 0.1 hypersensitivity respo
5 WP_013902307.1 SpaS 1.5e-104 351.4 0.2 translocation protein
6 WP_041676483.1 - - - - hypothetical protein
7 WP_013902309.1 - - - - hypothetical protein
8 WP_013902310.1 - - - - hypothetical protein
9 WP_013902311.1 - - - - hypothetical protein
10 WP_041675640.1 - - - - hypothetical protein
11 WP_041676485.1 - - - - lytic transglycosylase
12 WP_041676486.1 - - - - hypothetical protein
13 WP_041676487.1 SpaR 1.7e-61 209.5 17.5 type III secretion sys
14 WP_013902316.1 - - - - hypothetical protein
15 WP_013902317.1 InvC 5.1e-68 230.6 0 ATP synthase
16 WP_041675641.1 - - - - hypothetical protein
17 WP_013902319.1 - - - - hypothetical protein
18 WP_041676488.1 PrgK 1.6e-61 208.9 0.1 YscJ/HrcJ family type
19 WP_013902321.1 - - - - hypothetical protein
20 WP_041675642.1 - - - - hypothetical protein
21 WP_013902323.1 - - - - hypothetical protein
22 WP_041675643.1 - - - - hypothetical protein
23 WP_013902326.1 - - - - hypothetical protein
24 WP_041676489.1 - - - - hypothetical protein
25 WP_041676490.1 InvG 1.9e-26 94.2 0 hypothetical protein
26 WP_013902329.1 - - - - hypothetical protein
27 WP_041675644.1 - - - - hypothetical protein
28 WP_041676491.1 - - - - hypothetical protein
29 WP_041675645.1 - - - - hypothetical protein
30 WP_013902335.1 - - - - N-acetylglucosamine-6-
31 WP_041676492.1 InvC 0.045 15 0 glycerol-3-phosphate A
----------------------------------------------------------------------
----------------------------------------------------------------------
Organism: Pseudomonas aeruginosa PAO1
Locus id: 4386698016
accession query evalue bitscore bias name
0 NP_250381.1 SpaS 4.7e-123 412.2 0.2 translocation protein
1 NP_250382.1 SpaR 1.2e-69 236.1 24.2 translocation protein
2 NP_250383.1 SpaQ 4.1e-27 95.4 7.6 translocation protein
3 NP_250384.1 SpaP 1.7e-73 248.3 6.1 type III secretion sys
4 NP_250385.1 SpaO 1.6e-23 83.8 0 type III secretion sys
5 NP_250386.1 PscP 1.6e-09 39.1 6.7 translocation protein
6 NP_250387.1 - - - - translocation protein
7 NP_250388.1 InvC 2.2e-75 254.7 0 type III secretion sys
8 NP_250389.1 InvE 2.6e-37 130.2 9.3 type III secretion out
9 NP_250390.1 - - - - hypothetical protein
10 NP_250391.1 - - - - hypothetical protein
11 NP_250392.1 - - - - hypothetical protein
12 NP_250393.1 - - - - hypothetical protein
13 NP_250394.1 InvA 1e-243 812.4 5.9 type III secretory app
14 NP_250395.1 - - - - transcriptional regula
15 NP_250396.1 - - - - regulator in type III
16 NP_250397.1 - - - - type III secretion pro
17 NP_250398.1 - - - - regulatory protein Pcr
18 NP_250399.1 SipB 4.6e-69 234.9 36.6 translocator protein P
19 NP_250400.1 - - - - translocator outer mem
20 NP_250401.1 - - - - exoenzyme S synthesis
21 NP_250402.1 - - - - ExsE protein
22 NP_250403.1 - - - - exoenzyme S synthesis
23 NP_250404.1 - - - - transcriptional regula
24 NP_250405.1 - - - - ExsD protein
25 NP_250406.1 - - - - type III export appara
26 NP_250407.1 InvG 9.6e-37 127.7 0.2 type III secretion out
27 NP_250408.1 - - - - type III export protei
28 NP_250409.1 - - - - type III export protei
29 NP_250410.1 PrgI 4.6e-12 48.1 0.4 type III export protei
30 NP_250411.1 - - - - type III export protei
31 NP_250412.1 - - - - type III export protei
32 NP_250413.1 PrgI 9e-16 60 0.3 type III export protei
33 NP_250414.1 PrgK 2.3e-72 244.3 0.2 type III export protei
----------------------------------------------------------------------
----------------------------------------------------------------------
Organism: Pandoraea pnomenusa 3kgm
Locus id: 4407716464
accession query evalue bitscore bias name
0 WP_038622073.1 SpaS 5.6e-95 319.9 0 secretion system appar
1 WP_023873175.1 SpaR 9.7e-52 177.5 25.5 type III secretion pro
2 WP_023597451.1 SpaQ 2.6e-24 86.4 12.4 type III secretion sys
3 WP_023597452.1 SpaP 7.5e-67 226.6 7.4 flagellar biosynthesis
4 WP_023597453.1 - - - - hypothetical protein
5 WP_023597454.1 - - - - hypothetical protein
6 WP_023597455.1 - - - - hypothetical protein
7 WP_023597456.1 InvC 2.6e-67 228.3 0 type III secretion sys
8 WP_031627404.1 InvA 3.6e-220 734.5 0 secretion system appar
9 WP_023597458.1 - - - - secretion protein
10 WP_023597459.1 - - - - transcriptional regula
11 WP_023597460.1 - - - - hypothetical protein
12 WP_031627405.1 InvG 2e-30 107.2 0.2 hypothetical protein
13 WP_031627406.1 - - - - hypothetical protein
14 WP_023597463.1 - - - - hypothetical protein
15 WP_023597464.1 InvE 1e-08 37.1 1 hypothetical protein
16 WP_031627407.1 - - - - hypothetical protein
17 WP_023597466.1 - - - - hypothetical protein
18 WP_036642781.1 PrgK 1.4e-49 169.9 0.2 type III secretion pro
19 WP_029754348.1 PrgI 3.7e-07 32.4 0 hypothetical protein
20 WP_029754349.1 - - - - hypothetical protein
21 WP_023597470.1 PrgI 4.3e-09 38.6 0.1 type III secretion sys
----------------------------------------------------------------------
----------------------------------------------------------------------
Organism: Pandoraea pnomenusa 3kgm
Locus id: 4402360976
accession query evalue bitscore bias name
0 WP_023598119.1 SipD 5.3e-65 221.4 0.2 cell division protein
1 WP_023598120.1 SipC 2.9e-16 61.2 4.4 hypothetical protein
2 WP_023598121.1 SipB 7.6e-31 109.4 8.9 hypothetical protein
3 WP_029754447.1 - - - - chaperone protein SicA
4 WP_041612875.1 SpaS 1.9e-95 321.5 0.1 type III secretion sys
5 WP_023598124.1 SpaR 1.1e-59 203.6 11.5 type III secretion sys
6 WP_023598125.1 SpaQ 2.4e-24 86.5 4.7 type III secretion sys
7 WP_023598126.1 SpaP 1.3e-64 219.3 5.3 type III secretion sys
8 WP_023598127.1 SpaO 2.5e-16 60.8 0 hypothetical protein
9 WP_023598128.1 - - - - hypothetical protein
10 WP_023598129.1 - - - - hypothetical protein
11 WP_023598130.1 InvC 4.6e-67 227.5 0 ATP synthase SpaL
12 WP_023872767.1 InvA 1.1e-215 719.8 4.1 type III secretion sys
13 WP_023598132.1 InvE 1.3e-14 56.3 0.2 hypothetical protein
14 WP_038621901.1 InvG 2.3e-33 116.7 0.1 type III secretion sys
15 WP_023872764.1 - - - - transcriptional regula
16 WP_023598135.1 - - - - hypothetical protein
17 WP_023598136.1 - - - - hypothetical protein
18 WP_029754450.1 OrgA 6.1e-21 76.7 3.3 hypothetical protein
19 WP_036642379.1 PrgK 6.1e-53 180.8 0.2 hypothetical protein
20 WP_023598139.1 PrgI 3.3e-07 32.5 0 type III secretion sys
21 WP_029754452.1 PrgI 0.0078 18.5 1.9 hypothetical protein
22 WP_031627463.1 PrgH 1.6e-80 272.6 0 hypothetical protein
----------------------------------------------------------------------
----------------------------------------------------------------------
Organism: Pandoraea pnomenusa 3kgm
Locus id: 4394214320
accession query evalue bitscore bias name
0 WP_031627391.1 SipB 1.1e-20 76.1 41.7 hypothetical protein
1 WP_023597369.1 - - - - hypothetical protein
2 WP_023597370.1 - - - - hypothetical protein
3 WP_038622135.1 - - - - hypothetical protein
4 WP_023597372.1 PrgI 3.3e-06 29.3 1.2 hypothetical protein
5 WP_031627392.1 PrgK 3.2e-62 211.2 0.1 hypothetical protein
6 WP_023597374.1 - - - - hypothetical protein
7 WP_038622133.1 - - - - type III secretion sys
8 WP_023597376.1 InvC 2.6e-71 241.3 0 ATP synthase
9 WP_023597377.1 - - - - hypothetical protein
10 WP_023597378.1 - - - - hypothetical protein
11 WP_023597379.1 SpaO 8.1e-18 65.6 0 hypothetical protein
12 WP_031627393.1 SpaP 1.6e-69 235.3 11.1 type III secretion sys
13 WP_023597381.1 SpaQ 9.2e-27 94.3 5.1 preprotein translocase
14 WP_031627394.1 SpaR 2.1e-56 192.8 18.3 hypothetical protein
15 WP_023597383.1 SpaS 3e-107 360.3 1.2 type III secretion pro
16 WP_023597384.1 - - - - hypothetical protein
17 WP_041624416.1 InvG 1.1e-35 124.3 2 hypothetical protein
18 WP_031627395.1 - - - - hypothetical protein
19 WP_031627396.1 - - - - hypothetical protein
20 WP_023597387.1 - - - - hypothetical protein
21 WP_031627397.1 - - - - hypothetical protein
22 WP_023597389.1 - - - - hypothetical protein
23 WP_031627398.1 - - - - hypothetical protein
24 WP_031627399.1 - - - - hypothetical protein
25 WP_041624417.1 - - - - hypothetical protein
26 WP_023597393.1 InvE 3.9e-11 45 0.3 hypothetical protein
27 WP_023597394.1 - - - - hypothetical protein
28 WP_023597395.1 - - - - hypothetical protein
29 WP_041624418.1 - - - - hypothetical protein
30 WP_023597397.1 InvA 1.1e-242 808.9 2.6 Low calcium response l
31 WP_041624419.1 - - - - hypothetical protein
32 WP_023597399.1 - - - - hypothetical protein
33 WP_023597401.1 PrgI 1.4e-05 27.3 0.5 type III secretion pro
----------------------------------------------------------------------
----------------------------------------------------------------------
Organism: Hahella chejuensis KCTC 2396
Locus id: 4406060528
accession query evalue bitscore bias name
0 WP_041598672.1 InvG 1e-33 117.8 0.1 hypothetical protein
1 WP_011397071.1 - - - - putative type III expo
2 WP_011397072.1 - - - - hypothetical protein
3 WP_011397073.1 PrgI 3e-09 39.1 0.4 hypothetical protein
4 WP_011397074.1 - - - - hypothetical protein
5 WP_011397075.1 PrgI 4e-22 80.4 4.5 hypothetical protein
6 WP_041599578.1 PrgK 3.4e-61 207.8 0 type III secretory pro
7 WP_011397077.1 - - - - hypothetical protein
8 WP_011397078.1 - - - - flagellar biosynthesis
9 WP_041599579.1 - - - - hypothetical protein
10 WP_041598673.1 - - - - hypothetical protein
11 WP_041599580.1 - - - - hypothetical protein
12 WP_041598674.1 - - - - hypothetical protein
13 WP_011397083.1 - - - - hypothetical protein
14 WP_011397084.1 - - - - hypothetical protein
15 WP_011397085.1 - - - - hypothetical protein
16 WP_041598675.1 - - - - hypothetical protein
17 WP_011397087.1 - - - - transposase
18 WP_041598676.1 - - - - hypothetical protein
19 WP_011397088.1 SpaS 2.5e-116 390.1 4.3 preprotein translocase
20 WP_011397089.1 SpaR 3.6e-66 224.8 26 preprotein translocase
21 WP_011397090.1 SpaQ 3.1e-26 92.6 11 type III secretory pat
22 WP_011397091.1 SpaP 6.5e-74 249.6 6.1 flagellar biosynthesis
23 WP_011397092.1 SpaO 2.4e-22 80.1 0 flagellar motor switch
24 WP_011397093.1 PscP 5.3e-11 43.8 0.1 hypothetical protein
25 WP_011397094.1 - - - - hypothetical protein
26 WP_041599581.1 InvC 2.9e-73 247.7 0 ATP synthase
27 WP_011397096.1 - - - - ATPase
28 WP_041598677.1 - - - - hypothetical protein
29 WP_041598678.1 - - - - hypothetical protein
30 WP_011397098.1 - - - - cupin
31 WP_041599582.1 InvA 3.4e-242 807.3 6.9 Low calcium response l
32 WP_011397100.1 - - - - hypothetical protein
33 WP_011397101.1 - - - - hypothetical protein
34 WP_011397102.1 - - - - YopN chaperone SycN-li
35 WP_011397103.1 InvE 9e-38 131.7 10 type II secretion targ
36 WP_011397104.1 PrgJ 4.2 8.6 7 alpha-ketoglutarate de
37 WP_011397105.1 SipB 2.2e-55 190 54.4 hypothetical protein
----------------------------------------------------------------------
----------------------------------------------------------------------
Organism: Hahella chejuensis KCTC 2396
Locus id: 4389355672
accession query evalue bitscore bias name
0 WP_011398842.1 SipB 0.00099 20.3 11.9 hypothetical protein
1 WP_041599871.1 SipB 6.1e-37 129.4 24.6 hypothetical protein
2 WP_011398844.1 - - - - hypothetical protein
3 WP_011398845.1 - - - - hypothetical protein
4 WP_011398846.1 - - - - hypothetical protein
5 WP_011398847.1 InvA 7.8e-248 826 1.6 type III secretory pat
6 WP_011398848.1 - - - - hypothetical protein
7 WP_011398849.1 - - - - hypothetical protein
8 WP_011398850.1 - - - - hypothetical protein
9 WP_011398851.1 - - - - hypothetical protein
10 WP_041598874.1 InvE 4.1e-37 129.5 8 hypothetical protein
11 WP_011398853.1 InvC 2.1e-73 248.2 0 ATP synthase
12 WP_011398854.1 - - - - putative type III secr
13 WP_011398855.1 - - - - hypothetical protein
14 WP_041598875.1 SpaO 1.5e-23 84 0 hypothetical protein
15 WP_011398857.1 SpaP 5.3e-74 249.9 9 flagellar biosynthesis
16 WP_011398858.1 SpaQ 1.5e-24 87.2 11 type III secretory pat
17 WP_011398859.1 SpaR 2.4e-66 225.3 23.6 preprotein translocase
18 WP_011398860.1 SpaS 7.2e-119 398.5 5.3 preprotein translocase
19 WP_011398861.1 - - - - amino acid ABC transpo
20 WP_011398862.1 - - - - deacetylase
21 WP_011398863.1 - - - - N-acetyltransferase
22 WP_011398865.1 - - - - type III secretion sys
23 WP_011398866.1 - - - - hypothetical protein
24 WP_041599872.1 PrgK 1.3e-59 202.7 0 type III secretory pro
25 WP_011398868.1 - - - - hypothetical protein
26 WP_011398870.1 - - - - hypothetical protein
27 WP_011398872.1 - - - - hypothetical protein
28 WP_011398874.1 - - - - hypothetical protein
29 WP_011398875.1 PrgH 0.035 14.8 0 hypothetical protein
30 WP_011398876.1 InvG 3.4e-33 116.1 1.7 type II secretory path
----------------------------------------------------------------------
----------------------------------------------------------------------
Organism: Escherichia coli O157:H7 str. Sakai
Locus id: 4442989128
accession query evalue bitscore bias name
0 NP_311742.1 OrgA 3.7e-70 237.1 0.6 hypothetical protein
1 NP_311743.1 PrgK 1.6e-52 179.5 0.2 EprK
2 NP_311744.1 PrgI 4.5e-06 28.9 1.1 EprJ
3 NP_311745.1 PrgI 7.8e-10 40.9 0.2 EprI
4 NP_311746.1 PrgH 4.3e-94 317.3 7.1 EprH
5 NP_311747.1 - - - - transcriptional regula
6 NP_311748.1 SpaS 7.5e-105 352.4 3.2 surface presentation o
7 NP_311751.1 SpaQ 1e-25 91 0.6 EpaQ
8 NP_311752.1 SpaP 2.1e-65 221.9 11.1 surface presentation o
9 NP_311753.1 SpaO 1.8e-16 61.2 0 surface presentation o
10 NP_311754.1 InvJ 5.3e-09 37.4 0.1 EivJ
11 NP_311755.1 - - - - hypothetical protein
12 NP_311756.1 - - - - EivI
13 NP_311757.1 InvC 7.3e-69 233.3 0 ATP synthase SpaL
14 NP_311758.1 InvA 6e-218 727.2 9.9 EivA
15 NP_311759.1 InvE 1.4e-36 127.8 1.7 EivE
16 NP_311760.1 InvG 1.5e-28 101 0.2 EivG
----------------------------------------------------------------------
----------------------------------------------------------------------
Organism: Escherichia coli O157:H7 str. Sakai
Locus id: 4376421224
accession query evalue bitscore bias name
0 NP_312579.1 PrgI 1.3e-11 46.6 0.4 protein EscF
1 NP_312580.1 - - - - hypothetical protein
2 NP_312581.1 SipB 2.6 9.1 37.9 protein EspB
3 NP_312582.1 SipB 7.8e-39 135.6 25 protein EspD
4 NP_312583.1 - - - - protein EspA
5 NP_312584.1 InvE 2.9e-24 87.7 1.1 SepL
6 NP_312585.1 - - - - EscD
7 NP_312586.1 - - - - gamma intimin
8 NP_312587.1 - - - - protein CesT
9 NP_312588.1 - - - - hypothetical protein
10 NP_312589.1 - - - - hypothetical protein
11 NP_312590.1 - - - - hypothetical protein
12 NP_312591.1 - - - - hypothetical protein
13 NP_944569.1 - - - - hypothetical protein
14 NP_312592.1 SpaO 0.41 12 0 SepQ
15 NP_312593.1 - - - - hypothetical protein
16 NP_312594.1 - - - - hypothetical protein
17 NP_312595.1 InvC 5.9e-70 236.9 0 EscN
18 NP_312596.1 InvA 3.8e-213 711.3 13.7 hypothetical protein
19 NP_312597.1 - - - - hypothetical protein
20 NP_312598.1 - - - - SepZ
21 NP_312599.1 PrgI 1.2e-11 46.7 0.9 hypothetical protein
22 NP_312600.1 PrgK 3.4e-58 198 5.3 EscJ
23 NP_312601.1 - - - - SepD
24 NP_312602.1 InvG 7e-32 111.9 0.1 EscC
25 NP_312603.1 - - - - CesD
26 NP_312604.1 - - - - hypothetical protein
27 NP_312605.1 - - - - negative regulator Grl
28 NP_312606.1 - - - - hypothetical protein
29 NP_312607.1 SpaS 8.6e-100 335.8 3.3 secretion system appar
30 NP_312608.1 SpaR 4.7e-50 172 23.6 EscT
31 NP_312609.1 SpaQ 1.9e-25 90.1 11.3 EscS
32 NP_312610.1 SpaP 4.6e-67 227.3 9.9 type III secretion sys
33 NP_312611.1 OrgB 1.1e-40 139.3 12.1 hypothetical protein
----------------------------------------------------------------------
----------------------------------------------------------------------
Organism: Erwinia amylovora ATCC 49946
Locus id: 4403024240
accession query evalue bitscore bias name
0 WP_004157269.1 OrgA 1e-18 69.4 1.4 oxidoreductase
1 WP_013034867.1 PrgK 1.3e-40 140.6 2.3 hypothetical protein
2 WP_004157271.1 PrgI 6.1e-07 31.7 0.1 type III secretion sys
3 WP_004157272.1 PrgH 3.4e-73 248.5 4.4 type III secretion sys
4 WP_004157273.1 - - - - type III secretion sys
5 WP_004157274.1 InvG 4.4e-27 96.3 0 type III secretion sys
6 WP_004157276.1 InvE 3.5e-21 77.7 1.7 invasion protein
7 WP_004162107.1 InvA 2e-209 699 7.2 type III secretion sys
8 WP_004157278.1 - - - - hypothetical protein
9 WP_004157280.1 InvC 1.3e-70 239.1 0.1 ATP synthase SpaL
10 WP_004157282.1 - - - - hypothetical protein
11 WP_004157283.1 InvJ 0.00041 21.3 0.2 type III secretion sys
12 WP_004157284.1 SpaO 6e-18 66 0 hypothetical protein
13 WP_013036007.1 SpaP 3.4e-67 227.7 8.2 type III secretion sys
14 WP_004157286.1 SpaQ 4.8e-22 79.2 8.9 type III secretion sys
15 WP_004157287.1 SpaR 4e-45 155.8 16.5 hypothetical protein
16 WP_004164209.1 SpaS 2.6e-81 275 4.3 type III secretion sys
17 WP_004157289.1 - - - - chaperone protein SicA
18 WP_004162114.1 SipB 2.7e-32 114.1 3 cell division protein
19 WP_004165414.1 - - - - type III secretion sys
20 WP_004157293.1 SipD 1.7e-22 81.7 0.6 cell division protein
----------------------------------------------------------------------
----------------------------------------------------------------------
Organism: Erwinia amylovora ATCC 49946
Locus id: 4390056088
accession query evalue bitscore bias name
0 WP_004155899.1 SipD 3.5e-24 87.2 1.3 type III effector
1 WP_004155898.1 - - - - type III secretion sys
2 WP_004155897.1 SipB 5.3e-34 119.7 72.1 type III secretion sys
3 WP_004155896.1 - - - - chaperone protein SicA
4 WP_004161836.1 SpaS 3.4e-94 317.3 2.2 type III secretion sys
5 WP_004155887.1 SpaR 3.9e-46 159.2 21.3 hypothetical protein
6 WP_004155885.1 SpaQ 3e-22 79.8 10 type III secretion sys
7 WP_004155882.1 SpaP 1.1e-64 219.5 11.3 type III secretion sys
8 WP_004155880.1 SpaO 1.8e-17 64.5 0 type III secretion sys
9 WP_004155879.1 - - - - type III secretion sys
10 WP_004155878.1 - - - - type III secretion sys
11 WP_004155876.1 InvC 2.4e-72 244.7 0 ATP synthase SpaL
12 WP_004155875.1 - - - - hypothetical protein
13 WP_004155873.1 InvA 6.7e-214 713.8 7.4 type III secretion sys
14 WP_004155871.1 InvE 3.9e-24 87.3 5.6 invasion protein
15 WP_004155864.1 InvG 1.3e-27 97.9 0 type III secretion sys
16 WP_013036191.1 - - - - AraC family transcript
17 WP_033477864.1 - - - - hypothetical protein
18 WP_004155859.1 PrgH 5.7e-78 264.2 0.1 type III secretion sys
19 WP_004155857.1 PrgI 5.7e-08 35 0.3 type III secretion sys
20 WP_004155856.1 PrgI 1.1e-06 30.9 0.1 type III secretion sys
21 WP_004155855.1 PrgK 2e-41 143.3 1.4 hypothetical protein
22 WP_004155852.1 OrgA 6.9e-27 96.1 0.5 type III secretion sys
----------------------------------------------------------------------
----------------------------------------------------------------------
Organism: Erwinia amylovora ATCC 49946
Locus id: 4375782392
accession query evalue bitscore bias name
0 WP_004155366.1 InvG 2.3e-32 113.4 0.1 secretin
1 WP_004155365.1 - - - - type III secretion sys
2 WP_004155364.1 - - - - HPr kinase
3 WP_004155363.1 - - - - type III secretion sys
4 WP_004155362.1 - - - - type III secretion sys
5 WP_004155361.1 PrgK 3.2e-69 234 0 type III secretion pro
6 WP_004155360.1 PrgI 1.3e-15 59.4 0.1 type III secretion sys
7 WP_033477547.1 - - - - Hrp pili protein HrpA
8 WP_004155357.1 - - - - hypothetical protein
9 WP_004155356.1 - - - - hypothetical protein
10 WP_004155355.1 - - - - ATPase AAA
11 WP_004155354.1 - - - - transcriptional regula
12 WP_004155353.1 - - - - sensor kinase
13 WP_004155352.1 - - - - RNA polymerase sigma f
14 WP_004155350.1 InvE 1.8e-45 156.7 10 hypersensitivity respo
15 WP_004155349.1 InvA 3.6e-222 741.2 3.8 harpin secretion prote
16 WP_004155348.1 - - - - type III secretion pro
17 WP_004155347.1 InvC 5.8e-71 240.2 0 ATP synthase
18 WP_004155346.1 - - - - type III secretion sys
19 WP_004155345.1 - - - - type III secretion sys
20 WP_004155343.1 SpaO 1.6e-19 71.1 0.1 type III secretion sys
21 WP_004155342.1 SpaP 2.3e-64 218.5 13.6 harpin secretion prote
22 WP_004155341.1 SpaQ 1.7e-26 93.4 9.5 Type III secretion pro
23 WP_004155340.1 SpaR 3.5e-80 270.7 18.2 type III secretion sys
24 WP_004155339.1 SpaS 7.2e-130 434.6 0.1 type III secretion sys
----------------------------------------------------------------------
----------------------------------------------------------------------
Organism: Chlamydia trachomatis 434/Bu
Locus id: 4384640752
accession query evalue bitscore bias name
0 YP_001654892.1 PrgK 3.8e-39 135.8 0 type III secretion sys
1 YP_001654893.1 - - - - hypothetical protein
2 YP_001654894.1 - - - - type III secretion sys
3 YP_001654895.1 SpaP 6.3e-65 220.3 13.8 type III secretion sys
4 YP_001654896.1 SpaQ 4.9e-25 88.8 9.8 type III secretion sys
5 YP_001654897.1 SpaR 6.4e-36 125.7 24.1 type III secretion sys
6 YP_001654898.1 - - - - hypothetical protein
7 YP_001654899.1 - - - - hypothetical protein
8 YP_001654900.1 - - - - hypothetical protein
9 YP_001654901.1 - - - - hypothetical protein
10 YP_001654902.1 - - - - general secretion path
11 YP_001654903.1 - - - - general secretion path
12 YP_001654904.1 - - - - general secretion path
13 YP_001654905.1 InvG 1.6e-35 123.7 0.2 general secretion path
14 YP_001654906.1 - - - - hypothetical protein
15 YP_001654907.1 - - - - proline dipeptidase
16 YP_001654908.1 - - - - DNA mismatch repair pr
17 YP_001654909.1 - - - - type III secretion cha
18 YP_001654910.1 - - - - hypothetical protein
19 YP_001654911.1 SipB 5.5e-18 67.1 59.4 type III secretion sys
20 YP_001654912.1 SipB 1.8e-39 137.7 52.8 type III secretion sys
----------------------------------------------------------------------
----------------------------------------------------------------------
Organism: Burkholderia thailandensis E264
Locus id: 4376129760
accession query evalue bitscore bias name
0 YP_439019.1 OrgA 1.1e-64 219.3 0.2 oxygen-regulated invas
1 YP_439020.1 PrgK 1.2e-47 163.6 1.6 type III secretion sys
2 YP_439021.1 PrgI 6.2e-09 38.1 0.5 type III secretion sys
3 YP_439022.1 PrgI 3.2e-09 39 0.3 type III secretion sys
4 YP_439023.1 PrgH 8.5e-131 438 0 type III secretion sys
5 YP_439024.1 - - - - type III secretion sys
6 YP_439025.1 InvG 2.1e-29 103.8 0.8 type III secretion sys
7 YP_439026.1 InvE 2.3e-39 136.8 5.1 type III secretion sys
8 YP_439027.1 InvA 1.2e-219 732.9 2.5 type III secretion sys
9 YP_439028.1 - - - - type III secretion sys
10 YP_439029.1 InvC 5.8e-69 233.7 0 ATP synthase SpaL
11 YP_439030.1 - - - - surface presentation o
12 YP_439031.1 - - - - BsaU protein
13 YP_439032.1 SpaO 1.4e-19 71.2 0 type III secretion sys
14 YP_439033.1 SpaP 1.3e-66 225.9 1.3 surface presentation o
15 YP_439034.1 SpaQ 1.5e-24 87.2 4.7 type III secretion sys
16 YP_439035.1 SpaR 4.2e-50 172.1 11.9 type III secretion sys
17 YP_439036.1 SpaS 3.1e-89 301.1 0.5 surface presentation o
18 YP_439037.1 - - - - type III secretion cha
19 YP_439038.1 SipB 1.2e-62 213.9 41.8 BipB protein
20 YP_439039.1 SipC 1e-122 411.2 31.9 type III secretion tar
21 YP_439040.1 - - - - DNA-binding protein Bp
22 YP_439041.1 SipD 2.2e-27 97.8 4.1 BprD protein
----------------------------------------------------------------------
----------------------------------------------------------------------
Organism: Burkholderia thailandensis E264
Locus id: 4316328160
accession query evalue bitscore bias name
0 YP_438939.1 SpaR 2.7e-51 176 17.2 type III secretion inn
1 YP_438940.1 - - - - type III secretory pat
2 YP_438941.1 InvC 4.1e-70 237.4 0 type III secretion sys
3 YP_438942.1 - - - - type III secretion sys
4 YP_438943.1 - - - - hypothetical protein
5 YP_438944.1 PrgK 6.2e-66 223.3 0 lipoprotein transmembr
6 YP_438945.1 - - - - HrpB2-like protein
7 YP_438946.1 - - - - hypothetical protein
8 YP_438947.1 SpaS 1.2e-101 341.9 1.2 type III secretion sys
9 YP_438948.1 InvA 2.5e-223 745 0 type III secretion inn
10 YP_438949.1 PscP 7.3e-07 30.5 0.1 type III secretion inn
11 YP_438950.1 SpaO 5.7e-19 69.3 0 surface presentation o
12 YP_438951.1 SpaP 1.7e-64 218.9 12.4 type III secretion sys
13 YP_438952.1 SpaQ 2.9e-24 86.3 4.6 type III secretion inn
14 YP_438953.1 - - - - hypothetical protein
15 YP_438954.1 - - - - hypothetical protein
16 YP_438955.1 PrgH 0.42 11.2 0 serine protease
17 YP_438956.1 - - - - hypothetical protein
18 YP_438957.1 - - - - hypothetical protein
19 YP_438958.1 - - - - hypothetical protein
20 YP_438959.1 - - - - regulatory protein Hrp
21 YP_438960.1 - - - - hrp protein
22 YP_438961.1 - - - - hypothetical protein
23 YP_438962.1 - - - - sensor histidine kinas
24 YP_438963.1 - - - - DNA-binding response r
25 YP_438964.1 - - - - type II/III secretion
26 YP_438965.1 - - - - twitching motility pro
27 YP_438966.1 InvG 9.2e-20 72.5 4.3 type IV pilus biogenes
28 YP_438967.1 - - - - pilO family protein
29 YP_438968.1 - - - - hypothetical protein
30 YP_438969.1 - - - - type II/IV secretion s
31 YP_438970.1 - - - - type IV pilus biogenes
32 YP_438971.1 - - - - type IV pilus biogenes
33 YP_438972.1 - - - - hypothetical protein
34 YP_438973.1 - - - - type IV prepilin
35 YP_438974.1 InvG 8.6e-35 121.3 1 type II/III secretion
----------------------------------------------------------------------
----------------------------------------------------------------------
Organism: Bordetella pertussis 18323
Locus id: 4400943760
accession query evalue bitscore bias name
0 WP_010930838.1 InvC 0.025 15.9 0 ABC transporter ATP-bi
1 WP_014905545.1 - - - - phosphonate ABC transp
2 WP_023853652.1 - - - - phosphonoacetaldehyde
3 WP_010930836.1 - - - - ABC transporter substr
4 WP_023853663.1 - - - - tRNA uridine 5-carboxy
5 WP_010930742.1 - - - - transposase
6 WP_003809615.1 - - - - transcription elongati
7 WP_019247142.1 - - - - methyl-accepting chemo
8 WP_003809610.1 - - - - pyrimidine permease
9 WP_004568205.1 - - - - hypothetical protein
10 WP_010930835.1 - - - - ABC transporter substr
11 WP_005013747.1 - - - - transposase
12 WP_010930833.1 - - - - hypothetical protein
13 WP_003809869.1 PrgI 6.8e-09 37.9 1.4 type III secretion pro
14 WP_003820040.1 - - - - hypothetical protein
15 WP_010930832.1 - - - - type III secretion pro
16 WP_014905547.1 InvA 7.6e-241 802.9 4.7 type III secretion por
17 WP_010930830.1 - - - - hypothetical protein
18 WP_010930829.1 - - - - hypothetical protein
19 WP_010930828.1 - - - - hypothetical protein
20 WP_014905548.1 InvE 7.6e-17 63.6 8.4 membrane protein
21 WP_003820049.1 - - - - hypothetical protein
22 WP_010930826.1 - - - - hypothetical protein
23 WP_004568102.1 - - - - hypothetical protein
24 WP_010930825.1 - - - - membrane protein
25 WP_041166135.1 SipB 2.8e-59 202.8 37.7 membrane protein
26 WP_003809889.1 - - - - hypothetical protein
27 WP_010930823.1 - - - - hypothetical protein
28 WP_010930822.1 PrgI 1e-18 69.4 0 type III secretion pro
29 WP_010930821.1 PrgK 1.3e-69 235.3 0 type III secretion sys
30 WP_003820059.1 - - - - hypothetical protein
31 WP_023995095.1 - - - - type III secretion app
32 WP_003820061.1 InvC 8.1e-72 243 0 ATP synthase
33 WP_003820063.1 - - - - type III secretion pro
34 WP_010930819.1 - - - - hypothetical protein
35 WP_010930818.1 SpaO 6e-22 78.8 0 type III secretion sys
36 WP_004568087.1 SpaP 2.4e-72 244.5 9.7 flagellar biosynthesis
37 WP_010930817.1 SpaQ 3.6e-25 89.2 10.9 type III secretion pro
38 WP_010930816.1 SpaR 3.7e-61 208.4 19.2 type III secretion sys
39 WP_014905550.1 SpaS 2.7e-104 350.5 0.1 type III secretion pro
40 WP_014905551.1 - - - - hypothetical protein
41 WP_014905552.1 InvG 1e-36 127.6 0 type III secretion pro
----------------------------------------------------------------------
----------------------------------------------------------------------
Organism: Anaeromyxobacter dehalogenans 2CP-C
Locus id: 4402803056
accession query evalue bitscore bias name
0 WP_011419745.1 InvG 4.1e-46 158.2 1.4 general secretion path
1 WP_011419746.1 - - - - hypothetical protein
2 WP_011419747.1 - - - - acetoacetate metabolis
3 WP_041453298.1 - - - - hypothetical protein
4 WP_011419748.1 - - - - 50S ribosomal protein
5 WP_011419749.1 InvC 1.8e-41 143.8 0 transcription terminat
6 WP_011419750.1 - - - - hypothetical protein
7 WP_011419751.1 - - - - hypothetical protein
8 WP_011419752.1 - - - - DNA ligase
9 WP_011419753.1 - - - - acylphosphatase
10 WP_011419754.1 - - - - radical SAM protein
11 WP_011419755.1 - - - - ribonuclease G
12 WP_041453299.1 - - - - ribonuclease BN
13 WP_011419757.1 - - - - DNA-binding protein
14 WP_011419758.1 InvA 4e-195 651.8 0.2 type III secretion pro
15 WP_011419759.1 SpaS 6.7e-74 250.6 0 type III secretion pro
16 WP_011419760.1 SpaR 5.4e-29 103 7.6 type III secretion pro
17 WP_011419761.1 SpaQ 4.7e-27 95.2 4.4 type III secretion pro
18 WP_011419762.1 SpaP 3.7e-65 221.1 3 flagellar biosynthesis
19 WP_011419763.1 - - - - hypothetical protein
20 WP_011419764.1 SpaO 4.3e-21 76 0 type III secretion pro
21 WP_041453300.1 - - - - hypothetical protein
22 WP_011419765.1 PrgK 3.7e-40 139.1 0 secretion protein
23 WP_011419766.1 PrgI 0.035 16.4 2.7 hypothetical protein
In [25]:
# Total number of proteins (rows) in the full search results: 836.
df.df.shape[0]
Out[25]:
836
In [26]:
# Show the list of invg loci that was built in the previous step.
invg_loci
Out[26]:
[<hmmerclust.hmmerclust.Locus instance at 0x1053dd248>,
<hmmerclust.hmmerclust.Locus instance at 0x106979050>,
<hmmerclust.hmmerclust.Locus instance at 0x106600128>,
<hmmerclust.hmmerclust.Locus instance at 0x10652e1b8>,
<hmmerclust.hmmerclust.Locus instance at 0x105e29050>,
<hmmerclust.hmmerclust.Locus instance at 0x10577b320>,
<hmmerclust.hmmerclust.Locus instance at 0x106b86a70>,
<hmmerclust.hmmerclust.Locus instance at 0x10666b290>,
<hmmerclust.hmmerclust.Locus instance at 0x105ea63b0>,
<hmmerclust.hmmerclust.Locus instance at 0x1069f25f0>,
<hmmerclust.hmmerclust.Locus instance at 0x105a04098>,
<hmmerclust.hmmerclust.Locus instance at 0x108d2a248>,
<hmmerclust.hmmerclust.Locus instance at 0x104dae368>,
<hmmerclust.hmmerclust.Locus instance at 0x10670d170>,
<hmmerclust.hmmerclust.Locus instance at 0x105aaf098>,
<hmmerclust.hmmerclust.Locus instance at 0x104d123f8>,
<hmmerclust.hmmerclust.Locus instance at 0x105584ef0>,
<hmmerclust.hmmerclust.Locus instance at 0x104d670e0>,
<hmmerclust.hmmerclust.Locus instance at 0x10145f0e0>,
<hmmerclust.hmmerclust.Locus instance at 0x106511290>,
<hmmerclust.hmmerclust.Locus instance at 0x1066d7170>]
In [27]:
# Build a new DataFrame holding only the proteins whose locus_id is one of
# the loci in invg_loci, then report how many rows remain.
invg_only_df = df.df.loc[
    df.df['locus_id'].isin(invg_loci)
]
len(invg_only_df.index)
Out[27]:
288
In [28]:
# Keep only the hits whose e-value is below 0.01.
evalue_cut = invg_only_df.loc[invg_only_df['hit_evalue'] < 0.01]
In [29]:
# The e-value filter leaves 278 proteins.
evalue_cut.shape[0]
Out[29]:
278
In [30]:
# Export all of the e-value-filtered proteins as FASTA files by building a
# RelatedProteinGroup from them.
# NOTE(review): the export happens as a side effect of the constructor; the
# returned instance is not kept — confirm nothing downstream needs it.
hmmerclust.RelatedProteinGroup(evalue_cut)
Out[30]:
<hmmerclust.hmmerclust.RelatedProteinGroup instance at 0x106db0290>
In [36]:
# The hits are grouped by name, one FASTA file per group, in the new
# group_fastas/ folder, ready for multiple sequence alignment (MSA).
!ls group_fastas/
InvA.fasta InvG.fasta OrgA.fasta PrgI.fasta PscP.fasta SipD.fasta SpaQ.fasta
InvC.fasta InvH.fasta OrgB.fasta PrgJ.fasta SipB.fasta SpaO.fasta SpaR.fasta
InvE.fasta InvJ.fasta PrgH.fasta PrgK.fasta SipC.fasta SpaP.fasta SpaS.fasta
In [ ]:
Content source: mattsolo1/hmmerclust
Similar notebooks: