In [ ]:
from IPython.core.display import display, HTML
from IPython.display import Audio
import pandas as pd
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser
from whoosh import sorting, query
# Load the sound metadata table. `DataFrame.from_csv` was deprecated in
# pandas 0.21 and removed in pandas 1.0; `read_csv` with these arguments
# reproduces its defaults (first column used as the index, dates parsed).
df = pd.read_csv('database.csv', index_col=0, parse_dates=True)
# Open the existing Whoosh index stored in the 'text_index' directory.
index = open_dir('text_index')
In [ ]:
# Utility functions
def search(input_query,
           matching_fields=('name', 'description', 'tags'),
           groupedby=None):
    """Run a text query against the module-level Whoosh ``index``.

    Args:
        input_query: Query string handed to the Whoosh query parser.
        matching_fields: Names of the index fields the parser is configured
            with. Any iterable of field names is accepted.
        groupedby: Optional facet specification forwarded unchanged to the
            searcher's ``search`` call.

    Returns:
        The results object returned by the Whoosh searcher.
    """
    print('Searching for "%s"...' % input_query)
    # The default is an immutable tuple so it cannot be mutated across calls;
    # the parser receives its own list copy of the field names.
    parser = MultifieldParser(list(matching_fields), index.schema)
    # Named `parsed_query` so the imported `whoosh.query` module is not
    # shadowed inside this function.
    parsed_query = parser.parse(input_query)
    # NOTE(review): the searcher is left open, as in the original code. The
    # returned results are read by callers afterwards and presumably still
    # need it -- confirm before adding a close() here.
    return index.searcher().search(parsed_query, groupedby=groupedby)
def display_results(results):
    """Print a numbered line for every hit and embed an audio player for it.

    Each hit's ``pandas_index`` stored value is converted to ``int`` and used
    as a row label into the module-level ``df`` to read the sound's ``name``
    and ``path``.

    Args:
        results: Iterable of search hits supporting ``hit['pandas_index']``.
    """
    for position, hit in enumerate(results, start=1):
        row_label = int(hit['pandas_index'])
        sound_name = df.loc[row_label, 'name']
        sound_path = df.loc[row_label, 'path']
        print(' %i: %s (%s)' % (position, sound_name, sound_path))
        # Render an inline player for the sound file at this path.
        display(Audio(sound_path))
In [ ]:
# Search with the query term 'dog' (matched against the default fields:
# name, description and tags), then list and play the returned hits
display_results(search('dog'))
In [ ]:
# Search for 'dog' sounds up to 6 seconds long, using a range query
# ([0 TO 6]) on the `duration` field
display_results(search('dog duration:[0 TO 6]'))
In [ ]:
# Same duration-limited 'dog' search, additionally keeping only sounds with
# the 'Creative Commons Zero' license (`license` field equal to cc_0)
display_results(search('dog duration:[0 TO 6] license:cc_0'))
In [ ]:
def display_facet_results(results, facet_field):
    """Print how many documents fall into each group of a facet.

    Groups are printed in ascending key order. A ``None`` key is printed
    last instead of being compared with the other keys, because comparing
    ``None`` with strings or tuples raises ``TypeError`` on Python 3.

    Args:
        results: Object whose ``groups(facet_field)`` returns a mapping of
            group key to the collection of documents in that group.
        facet_field: Name of the facet whose groups are printed.
    """
    print('Facet %s' % facet_field)
    groups = results.groups(facet_field)
    # Sort on (is-None, key): the boolean decides the order whenever one key
    # is None, so None itself is never compared with a real key.
    ordered = sorted(groups.items(),
                     key=lambda item: (item[0] is None, item[0]))
    for key, value in ordered:
        print('\t%s: %i' % (key, len(value)))
# Define some facets, run the 'dog' search once with them, and print the
# per-group hit counts for each facet
facets = sorting.Facets()
# Group hits by the value of their `license` field
facets.add_facet("license", sorting.FieldFacet("license"))
# Group hits into ranges of the `duration` field; the positional arguments
# are start=0, end=1000, gap=10 (presumably seconds -- TODO confirm units)
facets.add_facet("duration",
sorting.RangeFacet("duration", 0, 1000, 10, hardend=False))
results = search('dog', groupedby=facets)
display_facet_results(results, "license")
display_facet_results(results, "duration")
In [ ]:
# Calculate tags facet considering all sounds in the index
facets = sorting.Facets()
facets.add_facet("tags", sorting.FieldFacet("tags", allow_overlap=True))
results = index.searcher().search(query.Every(), groupedby=facets)
# Fetch the tag -> documents mapping once and reuse it below
tag_groups = results.groups("tags")
# Get max tag frequency; guarded because max() raises ValueError when the
# index yields no tag groups at all
max_len_docs = max(len(docs) for docs in tag_groups.values()) if tag_groups else 0
# Build the HTML for the tag cloud (font size grows with tag frequency) and
# render it inline in the notebook
# NOTE(review): tags are interpolated into the HTML unescaped; escape them if
# the index can contain untrusted tag text.
spans = []
for tag, docs in tag_groups.items():
    if len(docs) > 3:  # Only show tags used more than 3 times
        spans.append(
            '<span style="font-size:%ipx;margin-right:10px;">%s</span> '
            % ((10 + (50 * len(docs) / max_len_docs)) ** 1.2, tag))
content = '<br><br><br>' + ''.join(spans)
display(HTML(content))