In [ ]:
For more information about the Elasticsearch Query and Aggregation API, check out [the Elasticsearch API documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/search.html).
For more information about the `elasticsearch-dsl-py` library, check out the [elasticsearch-dsl-py documentation on ReadTheDocs](http://elasticsearch-dsl.readthedocs.org/en/latest/).
[sharepa can be found on PyPI](https://pypi.python.org/pypi/sharepa/). It is essentially a special case of `elasticsearch-dsl-py`'s Search object, so the `elasticsearch-dsl-py` documentation should also apply to sharepa.
In [1]:
%matplotlib inline
from IPython.display import display
In [2]:
from sharepa import ShareSearch
from sharepa.analysis import bucket_to_dataframe
In [3]:
def tags(term=None, agg='significant_terms'):
    """Aggregate SHARE documents on the ``tags`` field, then display and plot the buckets.

    Parameters
    ----------
    term : str, optional
        When truthy, a ``match`` query on ``_all`` is added so that only
        documents matching this term are aggregated. When omitted (or falsy),
        no query is added to the search.
    agg : str, optional
        Name of the bucket aggregation applied to the ``tags`` field. This
        notebook uses ``'significant_terms'`` (the default) and ``'terms'``.

    Returns
    -------
    None
        Everything is produced as a side effect: the request body is printed,
        the bucket table is shown with ``display``, and a bar chart is drawn.
    """
    agg_name = 'top tags'

    # Create a search object
    search = ShareSearch()

    # If there is a term provided, only search for documents that match that term
    if term:
        search = search.query('match', _all=term)

    # Set up the aggregation to aggregate based on the tags field
    search.aggs.bucket(agg_name, agg, field='tags')
    # Show the request body that is about to be sent
    print(search.to_dict())

    # This pulls the results from osf.io/api/v1/share/search/
    results = search.execute()
    buckets = results.aggregations[agg_name]['buckets']

    # With no buckets there is no 'top tags' column to sort on and nothing to
    # plot, so stop here instead of failing further down.
    if len(buckets) == 0:
        print('No tag buckets were returned for this search.')
        return

    # Convert the results into a pandas dataframe, largest bucket first.
    # `sort_values` replaces `DataFrame.sort`, which no longer exists in pandas.
    df = bucket_to_dataframe(agg_name, buckets).sort_values(agg_name, ascending=False)
    display(df)

    # For significant_terms, plot the 'bg_count' column next to the bucket count
    if agg == 'significant_terms':
        y = ['bg_count', agg_name]
    else:
        y = agg_name
    df.plot(x='key', y=y, kind='bar')
In [4]:
# No search term; use a plain `terms` aggregation instead of the default `significant_terms`.
tags(agg='terms')
In [5]:
# No search term; default `significant_terms` aggregation.
tags()
In [6]:
# Only documents matching 'cancer'; default `significant_terms` aggregation.
tags('cancer')
In [7]:
# Only documents matching 'flu'; default `significant_terms` aggregation.
tags('flu')
In [8]:
# Only documents matching 'influenza'; default `significant_terms` aggregation.
tags('influenza')
In [9]:
# Only documents matching 'vaccine'; default `significant_terms` aggregation.
tags('vaccine')