In the concept search sub-project, we'll be working with the Enron email dataset. To implement query expansion algorithms, it is probably easiest to work directly with the raw text files on your hard disk. However, because some query expansion techniques require using search results, it is also convenient to have a copy of the data in a search engine. This notebook contains scripts to extract the dataset and index it in Elasticsearch. It also contains examples of how to query Elasticsearch.
First, download the dataset from OneDrive (choose enron_mail_clean.tar.gz).
Next, update the variables `f` and `out_dir` below to match where you downloaded the archive and where you want to extract the data to.
In [16]:
import tarfile
import os
import sys
# Path to the downloaded archive -- edit to match your download location.
f = '/Users/wrvhage/Downloads/enron_mail_clean.tar.gz'
# Directory the dataset will be extracted into -- edit to match your setup.
out_dir = '/Users/wrvhage/Data'
In [17]:
# Extract text files
def extract_documents(members):
    """Filter tar members, yielding only files inside an 'all_documents' dir.

    Parameters
    ----------
    members : iterable of tarfile.TarInfo
        Typically the open TarFile object itself, which iterates its members.

    Yields
    ------
    tarfile.TarInfo
        Members whose parent directory path ends with 'all_documents'.
    """
    for tarinfo in members:
        # Only the directory part matters; the filename itself is irrelevant.
        parent = os.path.dirname(tarinfo.name)
        if parent.endswith('all_documents'):
            yield tarinfo
# Open the archive and extract only the 'all_documents' members.
# A context manager guarantees the tar file is closed even if extraction
# raises partway through (the original left it open in that case).
with tarfile.open(f) as tar:
    # NOTE(review): extractall on an untrusted archive can write outside
    # out_dir via '..' member paths; this dataset is trusted, so no
    # member-path filtering beyond extract_documents is done here.
    tar.extractall(path=out_dir, members=extract_documents(tar))
In [18]:
# convert data to json
# Writes one Elasticsearch bulk-format file per mailbox owner: alternating
# action lines ({"index": ...}) and source lines ({"text": ...}).
import json

data_dir = os.path.join(out_dir, 'enron_mail_clean')
dump_dir = os.path.join(out_dir, 'enron_email_clean_json')
if not os.path.exists(dump_dir):
    os.makedirs(dump_dir)
for person in os.listdir(data_dir):
    document_dir = os.path.join(data_dir, person, 'all_documents')
    # Skip stray non-mailbox entries (e.g. .DS_Store) and people without
    # an all_documents folder; the original crashed on these.
    if not os.path.isdir(document_dir):
        continue
    with open(os.path.join(dump_dir, person), 'w') as out_file:
        for doc in os.listdir(document_dir):
            # Named in_file so we don't shadow the global archive path `f`.
            with open(os.path.join(document_dir, doc), 'r') as in_file:
                text = in_file.read()
            a = { 'index' : { '_index' : 'enron', '_type' : 'email', '_id': '{}/{}'.format(person, doc)} }
            out_file.write(json.dumps(a))
            out_file.write('\n')
            d = {'text': text}
            out_file.write(json.dumps(d))
            out_file.write('\n')
Install Elasticsearch (instructions)
Start Elasticsearch by typing ./bin/elasticsearch
in the directory where you installed it.
More info on getting started with Elasticsearch (including links to useful plugins).
Install the Python Elasticsearch Client:
pip install elasticsearch
In [19]:
from elasticsearch import Elasticsearch
# Client for a local Elasticsearch node on the default host/port
# (localhost:9200); the server started above must be running.
es = Elasticsearch()
In [20]:
# Create the 'enron' index: the default analyzer is the standard analyzer
# with English stopwords removed, and term vectors (positions/offsets/
# payloads) are stored for the email text field, which some query-expansion
# techniques need.
config = {
    'settings': {
        'analysis': {
            'analyzer': {
                'default': {
                    'type': 'standard',
                    'stopwords': '_english_',
                },
            },
        },
    },
    'mappings': {
        'email': {
            'properties': {
                'text': {
                    'type': 'string',
                    'term_vector': 'with_positions_offsets_payloads',
                },
            },
        },
    },
}
es.indices.create(index='enron', body=config)
In [21]:
# index data
# Each per-person dump file is already in bulk format, so it can be sent
# to Elasticsearch as-is, one bulk request per file.
for dump_name in os.listdir(dump_dir):
    dump_path = os.path.join(dump_dir, dump_name)
    with open(dump_path, 'r') as dump_file:
        payload = dump_file.read()
    es.bulk(index='enron', doc_type='email', body=payload)
In [22]:
# match all
# Sanity check: count every indexed document.
match_all = {'query': {'match_all': {}}}
res = es.search(index='enron', doc_type='email', body=match_all)
hit_count = res['hits']['total']
print("Got %d Hits:" % hit_count)
#print json.dumps(res, indent=4, separators=(',', ': '))
In [26]:
# query string query (complex boolean queries possible. See:
# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html)
boolean_query = {
    "query": {
        "query_string": {"query": "(natural AND gas) OR industrial"},
    },
}
res = es.search(index='enron', doc_type='email', body=boolean_query)
total_hits = res['hits']['total']
print("Got %d Hits:" % total_hits)
#print json.dumps(res, indent=4, separators=(',', ': '))
In [23]:
# Term query
# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-term-query.html
# Exact (non-analyzed) match on a single term in the text field.
term_query = {
    "query": {
        "term" : { "text" : "natural" },
    },
}
res = es.search(index='enron', doc_type='email', body=term_query)
total_hits = res['hits']['total']
print("Got %d Hits:" % total_hits)
#print json.dumps(res, indent=4, separators=(',', ': '))
In [28]:
# significant terms aggregation
# https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-significantterms-aggregation.html
# Terms that are unusually frequent in the query's result set compared to
# the whole index -- a building block for query expansion.
sig_terms_query = {
    "query": {
        "query_string": {"query": "(natural AND gas) OR industrial"},
    },
    "aggregations": {
        "significantTerms": {
            "significant_terms": {"field": "text", "size": 15},
        },
    },
}
# size=0: only the aggregation is wanted, not the matching documents.
res = es.search(index='enron', doc_type='email', body=sig_terms_query, size=0)
print("Got %d Hits:" % res['hits']['total'])
#print json.dumps(res, indent=4, separators=(',', ': '))
In [29]:
# Delete enron index (uncomment if needed)
#es.indices.delete(index='enron', ignore=[400, 404])