In [1]:
import sys
sys.path.append('../')
sys.path.append('../src/')
import pandas
import scipy
import numpy as np
from pprint import pprint
import secure
sys.path.append('../searchbetter/')
import search
reload(search)
import rewriter
reload(rewriter)
import utils
reload(utils)
import analysis.plots as plots
reload(plots)
import analysis.stats as stats
reload(stats)
import analysis.experiment as experiment
reload(experiment)
import plotly
import plotly.graph_objs as go
import plotly.offline as py
import webcolors
py.init_notebook_mode()
import matplotlib.pyplot as plt
%matplotlib inline
# Shared color definitions.
# Hex codes of the colors plotly uses.
colors = [
    '#1f77b4',  # blue
    '#ff7f0e',  # orange
    '#2ca02c',  # green
    '#d62728',  # red
    '#9467bd',  # purple
]
# Each hex code (e.g. #FF0000) is also kept as an 'rgb(255,0,0)'-style string:
# first parse it into its three channel values, then format those channels.
rgb_colors = list(map(webcolors.hex_to_rgb, colors))
color_strings = ['rgb(%s,%s,%s)' % (r, g, b) for (r, g, b) in rgb_colors]
In [2]:
## MAKE REWRITERS
# The Word2Vec rewriter is pointed at a model file under secure.MODEL_PATH_BASE.
# NOTE(review): create=False presumably means "use the existing model instead of
# building a new one" -- confirm against rewriter.Word2VecRewriter.
model_path = secure.MODEL_PATH_BASE + 'word2vec/word2vec'
w2v_rewriter = rewriter.Word2VecRewriter(model_path, create=False)

# Every rewriter used by the experiments below, in a fixed order:
# control first, then Wikipedia, then Word2Vec.
rewriters = [rewriter.ControlRewriter(), rewriter.WikipediaRewriter()]
rewriters.append(w2v_rewriter)
In [3]:
## MAKE OUR SEARCH ENGINES
# Each engine is given a dataset file under secure.DATASET_PATH_BASE and/or an
# index location under secure.INDEX_PATH_BASE.
# NOTE(review): create=False presumably reuses an index that already exists on
# disk -- confirm against the search module.

# Fields passed to the prebuilt DART engine.
DART_SEARCH_FIELDS = ['title', 'query', 'description']

# edX search engine
dataset_path, index_path = (
    secure.DATASET_PATH_BASE + 'Master CourseListings - edX.csv',
    secure.INDEX_PATH_BASE + 'edx',
)
edx_engine = search.EdXSearchEngine(dataset_path, index_path, create=False)

# Udacity search engine
dataset_path, index_path = (
    secure.DATASET_PATH_BASE + 'udacity-api.json',
    secure.INDEX_PATH_BASE + 'udacity',
)
udacity_engine = search.UdacitySearchEngine(dataset_path, index_path, create=False)

# Dart search engine... it's prebuilt, so it only needs its index location.
index_path = secure.INDEX_PATH_BASE + 'dart'
dart_engine = search.PrebuiltSearchEngine(DART_SEARCH_FIELDS, index_path)
In [4]:
# Engines listed roughly from biggest to smallest.
engines = [dart_engine, edx_engine, udacity_engine]

# Same engines in the same order, each paired with a short slug and a
# human-readable display name.
engines_with_metadata = [
    dict(engine=dart_engine, slug='dart', name='DART'),
    dict(engine=edx_engine, slug='edx', name='edX'),
    dict(engine=udacity_engine, slug='udacity', name='Udacity'),
]
In [5]:
# GENERATE DART STATS
# DART is entry 0 of engines_with_metadata.
metadata = engines_with_metadata[0]
# File of search terms handed to generate_stats.
search_terms_file = '../test/test-search-terms/generic.txt'
df = experiment.generate_stats(
    metadata['engine'], metadata['slug'], rewriters,
    filename=search_terms_file, cached=True)
In [6]:
# GENERATE EDX STATS
# edX is entry 1 of engines_with_metadata.
metadata = engines_with_metadata[1]
# File of search terms handed to generate_stats.
search_terms_file = '../test/test-search-terms/generic.txt'
df = experiment.generate_stats(
    metadata['engine'], metadata['slug'], rewriters,
    filename=search_terms_file, cached=True)
In [7]:
# GENERATE UDACITY STATS
# Udacity is entry 2 of engines_with_metadata.
metadata = engines_with_metadata[2]
# File of search terms handed to generate_stats.
search_terms_file = '../test/test-search-terms/generic.txt'
df = experiment.generate_stats(
    metadata['engine'], metadata['slug'], rewriters,
    filename=search_terms_file, cached=True)
In [8]:
# load dataframes
# One generate_stats result per engine, in engines_with_metadata order.
# No search-term file is passed here (filename=None) and cached=True is set.
# NOTE(review): presumably this reads back previously cached results -- confirm
# against experiment.generate_stats.
all_dfs = []
for m in engines_with_metadata:
    all_dfs.append(
        experiment.generate_stats(
            m['engine'], m['slug'], rewriters, filename=None, cached=True))
In [9]:
# Pick up any edits made to the analysis modules since they were imported.
reload(experiment)
reload(plots)

# Display names of the engines, in the same order as all_dfs.
engine_names = []
for e in engines_with_metadata:
    engine_names.append(e['name'])

# Labels passed to the plot for the rewriters.
# NOTE(review): only two names are given although `rewriters` holds three
# entries; presumably the control rewriter is the baseline and is not labelled
# -- confirm against experiment.display_engine_plots.
rewriter_names = ['Wikipedia', 'Word2Vec']

fig = experiment.display_engine_plots(
    all_dfs, engine_names, rewriter_names, colors)
In [15]:
# Build one summary bar chart per engine, then render them all inline.
num_engines = len(engine_names)
figs = []
for i in range(num_engines):
    figs.append(plots.summary_bar_chart(all_dfs[i], engine_names[i]))
# Rendering is kept in a second pass so every chart is built before any is shown.
for fig in figs:
    py.iplot(fig)
In [24]:
## HOW MANY SEARCH TERMS GET ZERO HITS BY DEFAULT?
# For every engine, report the share of rows whose 'control' column equals 0.
# NOTE(review): 'control' is presumably the hit count for the unrewritten query
# -- confirm against experiment.generate_stats.
#
# Single-argument print(...) calls are used so the cell behaves the same under
# the Python 2 kernel this notebook was written for and under Python 3.
print("Percent of search terms that get zero hits:")
for i in range(num_engines):
    df = all_dfs[i]
    engine_name = engine_names[i]
    total_search_terms = len(df)
    if total_search_terms == 0:
        # Nothing to divide by: say so instead of raising ZeroDivisionError.
        print("{}: n/a (no search terms)".format(engine_name))
        continue
    # Count matching rows directly from the boolean mask.
    zero_search_terms = int((df['control'] == 0).sum())
    # Scale to an actual percentage so the number matches the heading above
    # (the previous version printed a 0-1 fraction under a "Percent" label).
    zero_percent = 100.0 * zero_search_terms / total_search_terms
    print("{}: {:.1f}%".format(engine_name, zero_percent))