In [1]:
# coding: utf-8
# python2.7
from __future__ import division, print_function
from parsers import CitationWindowParser
from context_parsing_functions import create_context_parsers
import subprocess
import re
# functions to make indexing param files
def index_collections(window_specs, index_dir, collection_dir):
    """Write Indri param files for every context parser, then build the indexes.

    Args:
        window_specs: dict with lists of tuples, e.g.
            {'Sentence': [(0, 0)], 'Word': [(0, 0)]}; handed unchanged to
            create_context_parsers.
        index_dir: forwarded to write_param_file.
        collection_dir: forwarded to write_param_file.
    """
    built_names = []
    for context_parser in create_context_parsers(window_specs):
        name = context_parser.get_stringy_name()
        write_param_file(context_parser.get_field_name(), name,
                         index_dir, collection_dir)
        # Remember the name so its index is built after all param files exist.
        built_names.append(name)
    build_indexes(built_names)
def write_param_file(citation_field, index_name, index_dir, collection_dir):
    """Write one Indri index param file per stop/stem code.

    For each of the codes 'ss', 'sn', 'ns' and 'nn', the text returned by
    param_text is written to
    "<index_dir>Param_Files/param-I.<index_name>.<code>.xml".

    Args:
        citation_field: passed through to param_text.
        index_name: used in the file name and passed through to param_text.
        index_dir: prefix of the output path; it is concatenated directly
            with "Param_Files/", so it must already end with a separator.
        collection_dir: passed through to param_text.
    """
    for code in ('ss', 'sn', 'ns', 'nn'):
        filename = "param-I.{}.{}.xml".format(index_name, code)
        print("...writing index param file: {}".format(filename))
        target = "{}Param_Files/{}".format(index_dir, filename)
        with open(target, 'w') as out:
            out.write(param_text(citation_field, index_name, code,
                                 index_dir, collection_dir))
def param_text(citation_field, index_name, stop_stem, index_dir, collection_dir):
    """Return the XML text of an Indri index-building parameter file.

    The text is assembled from a single field list so that the stemmed and
    unstemmed variants cannot drift apart; they differ only in the optional
    <stemmer> element.

    Args:
        citation_field: name of the extra <field> appended after the fixed
            field list.
        index_name: used in the <index> path and in the corpus <path>.
        stop_stem: code such as 'ss', 'sn', 'ns' or 'nn'. A krovetz
            <stemmer> element is emitted when the code contains 'ns' or
            'ss' (presumably the second letter selects stemming -- confirm
            against build_indri_index.bash).
        index_dir: prefix of the <index> path; concatenated directly with
            "I.<index_name>.<stop_stem>", so it must already end with a
            separator.
        collection_dir: prefix of the corpus <path>; concatenated directly
            with "PF+PN+C.<index_name>/".

    Returns:
        The parameter file content as a string ending with a newline.
    """
    fixed_fields = (
        'isearchdoc', 'author', 'category', 'description', 'docno',
        'documentlink', 'fulltext', 'subject', 'title', 'type', 'venue',
        'citations',
    )

    lines = [
        "<parameters>",
        "<index>{}I.{}.{}</index>".format(index_dir, index_name, stop_stem),
        "<memory>2G</memory>",
        "<storeDocs>false</storeDocs>",
    ]

    # Same condition as before: 'ns' or 'ss' anywhere in the code.
    if re.search(r'[ns]s', stop_stem):
        lines += ["<stemmer>", "<name>krovetz</name>", "</stemmer>"]

    # The citation-context field always comes last, after the fixed fields.
    for field_name in fixed_fields + (citation_field,):
        lines += ["<field>", "<name>{}</name>".format(field_name), "</field>"]

    lines += [
        "<corpus>",
        "<path>{}PF+PN+C.{}/</path>".format(collection_dir, index_name),
        "<class>xml</class>",
        "</corpus>",
        "</parameters>",
    ]
    return "\n".join(lines) + "\n"
# functions to build Indri indexes
def build_indexes(index_names):
    """Run ./build_indri_index.bash once for each index name.

    The script path is relative, so it is resolved against the current
    working directory. A failing build does not stop the remaining builds,
    but its non-zero exit code is reported instead of being silently dropped.

    Args:
        index_names: iterable of index names; each is converted with str()
            and passed to the script as its only argument.
    """
    for index_name in index_names:
        exit_code = subprocess.call(["./build_indri_index.bash", str(index_name)])
        if exit_code != 0:
            print("...index build failed for {} (exit code {})".format(
                index_name, exit_code))
In [4]:
# Scratch cell: build a one-element list of index names and display it.
index_name = 'a'
index_names = [index_name]
index_names
Out[4]:
In [ ]: