In [1]:
import sys
sys.path.append('../')
sys.path.append('../src/')


import pandas
import scipy
import numpy as np
from pprint import pprint

import secure

sys.path.append('../searchbetter/')
import search
reload(search)
import rewriter
reload(rewriter)
import utils
reload(utils)

import analysis.plots as plots
reload(plots)
import analysis.stats as stats
reload(stats)
import analysis.experiment as experiment
reload(experiment)

import plotly
import plotly.graph_objs as go
import plotly.offline as py

import webcolors

py.init_notebook_mode()


import matplotlib.pyplot as plt
%matplotlib inline


# definitions

# the first few colors in Plotly's default color cycle

colors = [
    '#1f77b4', # blue
    '#ff7f0e', # orange
    '#2ca02c', # green
    '#d62728', # red
    '#9467bd'  # purple
]

# need to convert, e.g., #FF0000 to 'rgb(255,0,0)'
rgb_colors = [webcolors.hex_to_rgb(color) for color in colors]
color_strings = ['rgb(%s,%s,%s)' % (c[0], c[1], c[2]) for c in rgb_colors]
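
As a quick sanity check, the first color in the cycle, '#1f77b4', should come out as 'rgb(31,119,180)':

# '#1f77b4' is (31, 119, 180) in decimal
assert color_strings[0] == 'rgb(31,119,180)'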



In [2]:
## MAKE REWRITERS
model_path = secure.MODEL_PATH_BASE+'word2vec/word2vec'
w2v_rewriter = rewriter.Word2VecRewriter(model_path, create=False)

rewriters = [
    rewriter.ControlRewriter(),
    rewriter.WikipediaRewriter(),
    w2v_rewriter
]
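
For reference, a rewriter can be tried on a single term; the rewrite() call below is an assumption about the searchbetter rewriter API rather than something used elsewhere in this notebook:

# hypothetical usage: assumes each rewriter exposes rewrite(term) and
# returns a list of expanded query strings
pprint(w2v_rewriter.rewrite('machine learning'))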

In [3]:
## MAKE OUR SEARCH ENGINES

# edX search engine
dataset_path = secure.DATASET_PATH_BASE+'Master CourseListings - edX.csv'
index_path = secure.INDEX_PATH_BASE+'edx'
edx_engine = search.EdXSearchEngine(dataset_path, index_path, create=False)

# Udacity search engine
dataset_path = secure.DATASET_PATH_BASE+'udacity-api.json'
index_path = secure.INDEX_PATH_BASE+'udacity'
udacity_engine = search.UdacitySearchEngine(dataset_path, index_path, create=False)

# DART search engine... its index is prebuilt, so we can just use it!
index_path = secure.INDEX_PATH_BASE+'dart'
DART_SEARCH_FIELDS = [
    'title',
    'query',
    'description'
]
dart_engine = search.PrebuiltSearchEngine(DART_SEARCH_FIELDS, index_path)
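
Likewise, an engine can be queried directly; search() here is an assumption about the searchbetter engine API:

# hypothetical usage: assumes engines expose search(query) returning a list of hits
hits = udacity_engine.search('statistics')
print len(hits)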

In [4]:
# sorted in order of biggest to smallest, roughly
engines = [dart_engine, edx_engine, udacity_engine]

engines_with_metadata = [
    {
        'engine': dart_engine,
        'slug': 'dart',
        'name': 'DART'
    },
    {
        'engine': edx_engine,
        'slug': 'edx',
        'name': 'edX',
    },
    {
        'engine': udacity_engine,
        'slug': 'udacity',
        'name': 'Udacity'
    }
]

In [5]:
# GENERATE DART STATS
metadata = engines_with_metadata[0]
df = experiment.generate_stats(
    metadata['engine'],
    metadata['slug'],
    rewriters,
    filename='../test/test-search-terms/generic.txt',
    cached=True)

In [6]:
# GENERATE EDX STATS
metadata = engines_with_metadata[1]
df = experiment.generate_stats(
    metadata['engine'],
    metadata['slug'],
    rewriters,
    filename='../test/test-search-terms/generic.txt',
    cached=True)

In [7]:
# GENERATE UDACITY STATS
metadata = engines_with_metadata[2]
df = experiment.generate_stats(
    metadata['engine'],
    metadata['slug'],
    rewriters,
    filename='../test/test-search-terms/generic.txt',
    cached=True)

In [8]:
# load dataframes
all_dfs = [
    experiment.generate_stats(
        m['engine'], m['slug'], rewriters, filename=None, cached=True)
    for m in engines_with_metadata
]
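
To see what generate_stats produces, the dataframes can be inspected with plain pandas; judging by the cells below, each has one row per search term and one column of hit counts per rewriter ('control', 'wiki', 'word2vec'):

for m, stats_df in zip(engines_with_metadata, all_dfs):
    print m['name'], stats_df.shape
    print stats_df.head()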

In [9]:
reload(experiment)
reload(plots)

engine_names = [e['name'] for e in engines_with_metadata]
# the control rewriter is the baseline, so only the other two rewriters get labels
rewriter_names = [
    'Wikipedia',
    'Word2Vec'
]

fig = experiment.display_engine_plots(all_dfs, engine_names, rewriter_names, colors)


y = 1.01x + 92.75, r^2 = 0.53
y = 1.93x + 179.94, r^2 = 0.60
y = 1.10x + 1.50, r^2 = 0.37
y = 2.61x + 1.75, r^2 = 0.63
y = 1.00x + 1.77, r^2 = 0.69
y = 1.30x + 9.72, r^2 = 0.52
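
Each line above is (presumably) a least-squares fit of a rewriter's hit counts against the control hit counts, one line per rewriter per engine. A fit like the Word2Vec one for DART could be reproduced with scipy's linregress, assuming the column names used later in this notebook:

# sketch only: assumes x = control hits and y = Word2Vec-rewritten hits
from scipy.stats import linregress

dart_df = all_dfs[0]
slope, intercept, r, p, stderr = linregress(dart_df['control'], dart_df['word2vec'])
print "y = %.2fx + %.2f, r^2 = %.2f" % (slope, intercept, r ** 2)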

In [15]:
num_engines = len(engine_names)
figs = [plots.summary_bar_chart(all_dfs[i], engine_names[i]) for i in range(0, num_engines)]

for fig in figs:
    py.iplot(fig)


                mean         std
control    82.980515  299.837149
wiki      183.765556  413.860401
word2vec  354.375236  742.367256
                mean         std
control     0.000000    0.000000
wiki      105.513208  267.781091
word2vec   74.867925  140.853679
              mean        std
control   1.466967   7.326312
wiki      5.468468  12.730393
word2vec  8.339339  23.564894
              mean       std
control   0.000000  0.000000
wiki      4.386667  9.600464
word2vec  3.798095  7.395919
               mean        std
control   10.976744  14.063731
wiki      12.821705  16.808875
word2vec  24.666667  25.140025
          mean        std
control    0.0   0.000000
wiki       2.2   5.940445
word2vec  13.6  10.297788
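
The tables above report the mean and standard deviation of hit counts per rewriter, two tables per engine in the same order as the loop (DART, edX, Udacity). An equivalent summary can be computed directly with pandas, with the column names assumed from the tables:

for m, stats_df in zip(engines_with_metadata, all_dfs):
    print m['name']
    print stats_df[['control', 'wiki', 'word2vec']].describe().loc[['mean', 'std']]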

In [24]:
## HOW MANY SEARCH TERMS GET ZERO HITS BY DEFAULT?

print "Percent of search terms that get zero hits:"

for i in range(0, num_engines):
    df = all_dfs[i]
    total_search_terms = len(df)
    zero_search_terms = len(df[df['control'] == 0])
    # + 0.0 forces float division under Python 2
    zero_fraction = (zero_search_terms + 0.0) / total_search_terms
    engine_name = engine_names[i]

    print "{}: {}".format(engine_name, zero_fraction)


Fraction of search terms that get zero hits:
DART: 0.228172293364
edX: 0.91792782305
Udacity: 0.13768115942
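
The same fractions fall out of a one-line boolean mean per engine:

# vectorized equivalent of the loop above
for m, stats_df in zip(engines_with_metadata, all_dfs):
    print "{}: {}".format(m['name'], (stats_df['control'] == 0).mean())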