b) Metadata-based retrieval



In [ ]:

    
import html

import pandas as pd
from IPython.core.display import display, HTML
from IPython.display import Audio
from whoosh import sorting, query
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser
# Load the sound metadata table. The first CSV column becomes the row index,
# which display_results() looks up by the integer stored in each hit's
# 'pandas_index' field.
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv(..., index_col=0) is the documented replacement. from_csv also
# defaulted to parse_dates=True (date-parsing of the index), which is left out
# here because the index is addressed with integer labels.
df = pd.read_csv('database.csv', index_col=0)

# Open the existing whoosh index stored in the 'text_index' directory.
index = open_dir('text_index')



In [ ]:

    
# Utils functions

def search(input_query,
           matching_fields=['name', 'description', 'tags'],
           groupedby=None):
    """Parse ``input_query`` and run it against the module-level ``index``.

    Args:
        input_query: Query string handed to the whoosh ``MultifieldParser``.
        matching_fields: Field names given to the parser as the fields to
            match against.
        groupedby: Forwarded unchanged to the searcher's ``search`` call
            (``None`` by default).

    Returns:
        The results object returned by the searcher.

    Note:
        The searcher opened here is not closed by this function.
    """
    print('Searching for "%s"...' % input_query)
    parser = MultifieldParser(matching_fields, index.schema)
    parsed_query = parser.parse(input_query)
    searcher = index.searcher()
    return searcher.search(parsed_query, groupedby=groupedby)

def display_results(results):
    """Print a numbered summary line and show an audio player for each hit.

    For every hit, the integer in its 'pandas_index' field selects a row of
    the module-level ``df``; that row's 'name' and 'path' values are printed
    and the path is passed to ``Audio`` for display.

    Args:
        results: Iterable of hits supporting ``hit['pandas_index']``.
    """
    for position, hit in enumerate(results, start=1):
        row_label = int(hit['pandas_index'])
        sound_name = df.loc[row_label, 'name']
        sound_path = df.loc[row_label, 'path']
        print('  %i: %s (%s)' % (position, sound_name, sound_path))
        display(Audio(sound_path))



In [ ]:

    
# Search for the single query term 'dog'. No field prefix is given, so
# search() matches it against its default fields (name, description, tags);
# display_results() then lists each hit with an audio player.
display_results(search('dog'))



In [ ]:

    
# Search for 'dog' sounds up to 6 seconds long: the `duration:[0 TO 6]`
# clause adds a range restriction on the 'duration' field.
# NOTE(review): assumes 'duration' is indexed as a number of seconds —
# confirm against the index schema.
display_results(search('dog duration:[0 TO 6]'))



In [ ]:

    
# Same 'dog' query with the duration range, further restricted by the
# `license:cc_0` clause to sounds with a 'Creative Commons Zero' license.
display_results(search('dog duration:[0 TO 6] license:cc_0'))



In [ ]:

    
def display_facet_results(results, facet_field):
    """Print how many documents fall into each group of one facet.

    Args:
        results: Object exposing ``groups(name)``, which returns a mapping of
            group key to a collection of documents.
        facet_field: Name of the facet whose groups are printed.

    Groups are printed in ascending key order. A ``None`` key, if present, is
    listed first; it is kept out of the comparison because ordering ``None``
    against strings or tuples raises ``TypeError`` on Python 3.
    """
    print('Facet %s' % facet_field)
    groups = results.groups(facet_field)
    # Sort on (is-not-None, key): the boolean decides None vs. non-None, so
    # the key itself is only ever compared with other non-None keys.
    ordered = sorted(groups.items(),
                     key=lambda item: (item[0] is not None, item[0]))
    for key, docs in ordered:
        print('\t%s: %i' % (key, len(docs)))

# Register a field facet on 'license' and a range facet on 'duration'
# (0 to 1000 in steps of 10, hardend=False), run the 'dog' search grouped by
# both facets, then print the per-group counts of each facet.
facets = sorting.Facets()
for facet_name, facet in (
        ("license", sorting.FieldFacet("license")),
        ("duration", sorting.RangeFacet("duration", 0, 1000, 10, hardend=False)),
):
    facets.add_facet(facet_name, facet)
results = search('dog', groupedby=facets)
for facet_name in ("license", "duration"):
    display_facet_results(results, facet_name)



In [ ]:

    
# Calculate the 'tags' facet over every document in the index
# (query.Every() matches all documents; allow_overlap=True is passed so the
# facet is built with overlapping groups enabled).
facets = sorting.Facets()
facets.add_facet("tags", sorting.FieldFacet("tags", allow_overlap=True))
results = index.searcher().search(query.Every(), groupedby=facets)

# Fetch the tag -> documents mapping once; it feeds both the maximum below
# and the rendering loop.
tag_groups = results.groups("tags")

# Highest tag frequency, used to scale the font sizes. default=0 keeps an
# empty index from raising ValueError; in that case the loop below never
# runs, so the division by max_len_docs is never reached.
max_len_docs = max((len(docs) for docs in tag_groups.values()), default=0)

# Build the HTML for the tag cloud: one <span> per tag, sized by frequency.
spans = []
for tag, docs in tag_groups.items():
    if len(docs) > 3:  # Only show tags used more than 3 times
        font_size = (10 + (50 * len(docs) / max_len_docs)) ** 1.2
        # Tags are free text, so escape them before embedding in HTML;
        # str() also covers a None group key.
        spans.append(
            '<span style="font-size:%ipx;margin-right:10px;">%s</span> '
            % (font_size, html.escape(str(tag))))
content = '<br><br><br>' + ''.join(spans)
display(HTML(content))

Chapter 10: Sound Sharing and Retrieval

b) Metadata-based retrieval