# Search Project for CST 495

CMU Movie Summary Corpus http://www.cs.cmu.edu/~ark/personas/

Dustin D'Avignon

Chris Ngo

# Let's go

We begin by normalising the text: removing unwanted characters and converting to lowercase.



In [6]:

import csv
import re

# Load the plot summaries into `docs`.
#   key   -> the digit runs found in column 0, joined by single spaces
#   value -> the alphabetic words found in column 1, lowercased and joined
#            by single spaces (everything that is not a letter is dropped)
tag = re.compile(r'\b[0-9]+\b')     # whole runs of digits
rgx = re.compile(r'\b[a-zA-Z]+\b')  # whole runs of ASCII letters
docs = {}
with open("data/MovieSummaries/plot_summaries.tsv") as f:
    # The loop below iterates `r`, which this cell never created. The file
    # is tab-separated, so build the row reader from the open handle here.
    r = csv.reader(f, delimiter='\t')
    for i, x in enumerate(r):
        # NOTE(review): `i > 1` skips the first two rows of the file --
        # confirm that both are really meant to be dropped.
        if i > 1:
            docs[' '.join(tag.findall(x[0])).lower()] = ' '.join(rgx.findall(x[1])).lower()


> Now we normalize the movie metadata so that the item titles can be matched to the index from above. **Just the basics for now to get the index; we tried to pull out the genre, but it was breaking the rest of the code due to potential parsing errors.**


In [7]:

import csv
import re

# Build `docs2`: the digit runs of column 0 (joined by spaces) map to a
# 2-tuple of the lowercased alphabetic words of column 2 and of column 8.
# NOTE(review): `r` is not created in this cell; presumably it is a row
# reader over the movie metadata file -- confirm before running.
tag = re.compile(r'\b[0-9]+\b')
rgx = re.compile(r'\b[a-zA-Z]+\b')
docs2 = {}
for i, x in enumerate(r):
    # Rows 0 and 1 are skipped, exactly as for the plot summaries.
    if i <= 1:
        continue
    docs2[' '.join(tag.findall(x[0])).lower()] = (
        ' '.join(rgx.findall(x[2])).lower(),
        ' '.join(rgx.findall(x[8])).lower(),
    )

#print(docs2)



now is the time to join the docs together



In [8]:

# Join the two tables on their shared key: keep every summary in `docs`
# whose key also has a (truthy) entry in `docs2`, as (metadata, summary)
# pairs.
doc = [(docs2[x], y) for x, y in docs.items() if docs2.get(x)]

# Show the first joined record as a sanity check. It is written as one
# formatted string so the output is the same on Python 2 and Python 3 (the
# bare `print a, b` statement only parses on Python 2), and guarded so an
# empty join does not raise IndexError.
if doc:
    print('%s %s' % (doc[0][0], doc[0][1]))

items_t = [d[0] for d in doc]  # item titles
items_d = [d[1] for d in doc]  # item description
items_i = range(0, len(items_t))  # item id






# term freq



In [10]:

# Work on the first 25 item descriptions while developing the counters.
corpus = items_d[0:25]
# Call form of print: same output as the print statement for a single
# value, and consistent with the print(...) calls in the other cells.
print(corpus)






start by computing the frequency of each term over the entire corpus



In [11]:

# Count how often each whitespace-separated word occurs across `corpus`.
tf = {}
# The loop variable is deliberately not named `doc`: at module level that
# would overwrite the joined record list of the same name.
for summary in corpus:
    for word in summary.split():
        tf[word] = tf.get(word, 0) + 1
print(tf)






now that we have normalised the data we can compute the term frequency



In [12]:

from collections import Counter

def get_tf(corpus):
    """Return the term frequencies of a whole corpus.

    corpus: iterable of document strings; each one is split on whitespace.

    Returns a Counter mapping every word to the total number of times it
    occurs across all documents. An empty corpus gives an empty Counter.
    """
    tf = Counter()
    for doc in corpus:
        # Counter.update adds one count per element of the iterable.
        tf.update(doc.split())
    return tf

# Recompute the corpus-wide term frequencies with the Counter-based helper
# and print them.
tf = get_tf(corpus)
print(tf)






# doc freq



In [16]:

import collections

def get_tf(document):
    """Return the term frequencies of a single document.

    document: one document string; it is split on whitespace.

    Returns a Counter mapping each word to its number of occurrences in
    the document. This definition takes one string, not a list of them.
    """
    # Counter counts the elements of any iterable directly.
    return Counter(document.split())

def get_dtf(corpus):
    """Return per-document term frequencies for a corpus.

    corpus: iterable of document strings.

    Returns a dict keyed by each document's position in the corpus
    (0, 1, 2, ...) whose value is the result of get_tf for that document.
    """
    return {position: get_tf(text) for position, text in enumerate(corpus)}

# Per-document term frequencies for every item description. The bare
# expression on the last line is what the notebook displays as the cell
# output: the counts for the document at index 342.
dtf = get_dtf(items_d)
dtf[342]




Out[16]:

Counter({'a': 7,
'again': 1,
'and': 26,
'angry': 1,
'are': 1,
'around': 1,
'as': 1,
'at': 1,
'attempt': 1,
'away': 2,
'back': 3,
'barking': 1,
'be': 3,
'been': 1,
'begins': 1,
'bone': 1,
'but': 7,
'by': 1,
'can': 2,
'catcher': 4,
'catches': 1,
'caught': 2,
'chases': 1,
'chasing': 1,
'city': 2,
'cover': 1,
'crawls': 1,
'cries': 1,
'day': 1,
'digs': 1,
'disguises': 1,
'doesn': 1,
'dog': 16,
'dogs': 1,
'drama': 1,
'driver': 1,
'drives': 1,
'driving': 1,
'enter': 1,
'escapes': 3,
'fools': 1,
'for': 2,
'from': 4,
'frowned': 1,
'gate': 2,
'get': 2,
'gets': 2,
'gives': 1,
'goes': 2,
'going': 1,
'grabs': 3,
'happily': 1,
'happy': 1,
'he': 22,
'hides': 2,
'him': 6,
'himself': 2,
'his': 8,
'hits': 1,
'hole': 2,
'horrified': 1,
'house': 1,
'humming': 1,
'hungry': 1,
'in': 4,
'inside': 1,
'is': 9,
'it': 6,
'jerry': 1,
'know': 1,
'lamp': 1,
'last': 1,
'lets': 1,
'locked': 1,
'looks': 2,
'main': 1,
'manhole': 1,
'napkin': 1,
'news': 1,
'newspaper': 3,
'no': 1,
'now': 1,
'of': 1,
'off': 1,
'order': 1,
'orders': 1,
'own': 1,
'panicking': 1,
'past': 1,
'pound': 3,
'protagonist': 1,
'pursues': 1,
'quiet': 1,
'realizes': 1,
'remain': 1,
'remove': 1,
'roll': 1,
'runs': 1,
's': 4,
'sacrifice': 1,
'says': 2,
'sees': 7,
'shows': 1,
'sits': 1,
'sleep': 2,
'sleeping': 1,
'some': 1,
'son': 1,
'song': 1,
'speeds': 1,
'spike': 16,
'stick': 1,
'stops': 1,
'street': 1,
't': 1,
'tags': 1,
'taken': 1,
'taking': 1,
'tells': 1,
'that': 2,
'the': 34,
'theme': 1,
'then': 3,
'they': 1,
'through': 1,
'throws': 1,
'tip': 1,
'to': 16,
'toes': 1,
'tom': 1,
'took': 1,
'trash': 2,
'tricked': 1,
'tries': 1,
'truck': 4,
'turns': 1,
'tyke': 7,
'under': 1,
'up': 2,
'use': 1,
'uses': 1,
'wakes': 1,
'walk': 1,
'was': 1,
'wear': 1,
'wears': 1,
'when': 1,
'where': 1,
'while': 1,
'who': 3,
'will': 1,
'with': 2,
'without': 1,
'yawns': 1,
'yet': 1})



compute dtf for item descriptions



In [17]:

# Rebuild the per-document term frequencies from the item descriptions and
# display the counts for the document at index 12 as the cell output.
dtf = get_dtf(items_d)
dtf[12]




Out[17]:

Counter({'a': 10,
'ability': 1,
'accept': 1,
'affection': 1,
'after': 1,
'against': 1,
'an': 1,
'and': 6,
'are': 1,
'aristocrat': 1,
'aristocratic': 1,
'aristocrats': 2,
'army': 1,
'as': 3,
'at': 3,
'aware': 1,
'bankrupt': 1,
'because': 1,
'becomes': 2,
'begins': 1,
'bulgaria': 1,
'but': 3,
'cinema': 1,
'cki': 1,
'com': 1,
'comes': 1,
'company': 1,
'condescended': 1,
'consents': 1,
'database': 1,
'daughter': 1,
'descendant': 1,
'devotion': 1,
'distressed': 1,
'dreaming': 1,
'during': 1,
'edu': 1,
'end': 1,
'enterprising': 1,
'eventual': 1,
'exile': 2,
'failed': 1,
'falling': 1,
'family': 2,
'father': 2,
'filmy': 1,
'financially': 1,
'forced': 1,
'fortune': 1,
'founds': 1,
'frequenting': 1,
'frustrates': 1,
'fuw': 1,
'girl': 1,
'go': 1,
'haberdashery': 1,
'he': 4,
'heart': 1,
'help': 2,
'her': 1,
'him': 1,
'his': 6,
'hopfer': 1,
'http': 2,
'imdb': 1,
'impecunious': 1,
'impoverished': 1,
'in': 8,
'indolence': 1,
'influential': 1,
'info': 1,
'into': 1,
'is': 4,
'it': 1,
'izabela': 3,
'lack': 1,
'late': 1,
'lazy': 1,
'life': 1,
'love': 2,
'make': 1,
'makes': 1,
'marrying': 1,
'merchant': 1,
'merchants': 1,
'met': 1,
'mincel': 1,
'money': 2,
'new': 1,
'noble': 1,
'now': 1,
'of': 8,
'on': 1,
'or': 1,
'owner': 1,
'part': 1,
'partnership': 1,
'pensions': 1,
'pl': 1,
'polish': 2,
'proves': 1,
'quest': 1,
'rank': 1,
'respected': 1,
'restaurant': 1,
'return': 1,
'risks': 1,
'romantic': 1,
'russia': 1,
'russian': 2,
'russo': 1,
's': 4,
'salesman': 1,
'salons': 1,
'science': 1,
'searchplotwriters': 1,
'secure': 1,
'sentenced': 1,
'set': 1,
'sets': 1,
'shareholders': 1,
'she': 1,
'siberia': 1,
'social': 1,
'supplying': 1,
'taking': 1,
'the': 11,
'theatres': 1,
'their': 1,
'these': 1,
'to': 12,
'tomasz': 1,
'too': 1,
'true': 1,
'tsarist': 1,
'turkish': 1,
'two': 1,
'undertake': 1,
'up': 2,
'uprising': 1,
'uses': 1,
'vacuous': 1,
'waiter': 1,
'war': 1,
'warsaw': 2,
'while': 2,
'who': 1,
'widow': 1,
'win': 1,
'with': 3,
'without': 1,
'wokulski': 5,
'work': 1,
'www': 1,
'young': 1})



# term freq matrix

with the lexicon we are able to compute the term freq matrix



In [18]:

def get_tfm(corpus):
    """Build the term-frequency matrix of a corpus.

    corpus: sequence of document strings. It is iterated twice (once for
        the lexicon, once for the rows); each document is split on
        whitespace.

    Returns a (tfm, lexicon) pair:
        lexicon -- list of the distinct words in the corpus, in the
                   iteration order of the underlying set
        tfm     -- one row per document, in corpus order; tfm[d][j] is the
                   number of times lexicon[j] occurs in document d
    """

    def get_lexicon(corpus):
        # Collect every distinct word that appears anywhere in the corpus.
        lexicon = set()
        for doc in corpus:
            lexicon.update(doc.split())
        return list(lexicon)

    lexicon = get_lexicon(corpus)

    # Map each word to its column once. Calling lexicon.index(term) for
    # every token would rescan the whole lexicon each time.
    column_of = {term: j for j, term in enumerate(lexicon)}

    tfm = []
    for doc in corpus:
        tfv = [0] * len(lexicon)
        for term in doc.split():
            tfv[column_of[term]] += 1
        tfm.append(tfv)

    return tfm, lexicon

#test_corpus = ['mountain bike', 'road bike carbon', 'bike helmet']
#tfm, lexicon = get_tfm(test_corpus)
#print lexicon
#print tfm



# sparsity of term frequency matrix

We took the approach of using Bokeh to display the sparsity of the term frequency matrix



In [64]:

#!pip install bokeh




In [19]:

import pandas as pd
from bokeh.plotting import figure, output_notebook, show, vplot

# sparsity as a function of document count
n = []
s = []
for i in range(100,1000,100):
corpus = items_d[0:i]
tfm, lexicon = get_tfm(corpus)
c = [ [x.count(0), x.count(1)] for x in tfm]
n_zero = sum([ y[0] for y in c])
n_one = sum( [y[1] for y in c])
s.append(1.0 - (float(n_one) / (n_one + n_zero)))
n.append(i)

output_notebook(hide_banner=True)
p = figure(x_axis_label='Documents', y_axis_label='Sparsity', plot_width=400, plot_height=400)
p.line(n, s, line_width=2)
p.circle(n, s, fill_color="white", size=8)
show(p)




var element = $('#b9b01796-cf0d-4d8c-a890-4c20da2134bc'); (function(global) { function now() { return new Date(); } if (typeof (window._bokeh_onload_callbacks) === "undefined") { window._bokeh_onload_callbacks = []; } function run_callbacks() { window._bokeh_onload_callbacks.forEach(function(callback) { callback() }); delete window._bokeh_onload_callbacks console.info("Bokeh: all callbacks have finished"); } function load_libs(js_urls, callback) { window._bokeh_onload_callbacks.push(callback); if (window._bokeh_is_loading > 0) { console.log("Bokeh: BokehJS is being loaded, scheduling callback at", now()); return null; } if (js_urls == null || js_urls.length === 0) { run_callbacks(); return null; } console.log("Bokeh: BokehJS not loaded, scheduling load and callback at", now()); window._bokeh_is_loading = js_urls.length; for (var i = 0; i < js_urls.length; i++) { var url = js_urls[i]; var s = document.createElement('script'); s.src = url; s.async = false; s.onreadystatechange = s.onload = function() { window._bokeh_is_loading--; if (window._bokeh_is_loading === 0) { console.log("Bokeh: all BokehJS libraries loaded"); run_callbacks() } }; s.onerror = function() { console.warn("failed to load library " + url); }; console.log("Bokeh: injecting script tag for BokehJS library: ", url); document.getElementsByTagName("head")[0].appendChild(s); } }; var js_urls = ['https://cdn.pydata.org/bokeh/release/bokeh-0.12.0.min.js', 'https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.0.min.js', 'https://cdn.pydata.org/bokeh/release/bokeh-compiler-0.12.0.min.js']; var inline_js = [ function(Bokeh) { Bokeh.set_log_level("info"); }, function(Bokeh) { Bokeh.$("#0b6f1159-0843-4dcf-9d48-66d60d3797cd").text("BokehJS successfully loaded");
},
function(Bokeh) {
console.log("Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-0.12.0.min.css");
Bokeh.embed.inject_css("https://cdn.pydata.org/bokeh/release/bokeh-0.12.0.min.css");
console.log("Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.0.min.css");
Bokeh.embed.inject_css("https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.0.min.css");
}
];

function run_inline_js() {
for (var i = 0; i < inline_js.length; i++) {
inline_js[i](window.Bokeh);
}
}

console.log("Bokeh: BokehJS loaded, going straight to plotting");
run_inline_js();
} else {
console.log("Bokeh: BokehJS plotting callback run at", now());
run_inline_js();
});
}
}(this));

(function(global) {
function now() {
return new Date();
}

if (typeof (window._bokeh_onload_callbacks) === "undefined") {
}

function run_callbacks() {
console.info("Bokeh: all callbacks have finished");
}

console.log("Bokeh: BokehJS is being loaded, scheduling callback at", now());
return null;
}
if (js_urls == null || js_urls.length === 0) {
run_callbacks();
return null;
}
for (var i = 0; i < js_urls.length; i++) {
var url = js_urls[i];
var s = document.createElement('script');
s.src = url;
s.async = false;
run_callbacks()
}
};
s.onerror = function() {
console.warn("failed to load library " + url);
};
console.log("Bokeh: injecting script tag for BokehJS library: ", url);
}
};var element = document.getElementById("e1bc1ff8-bdd8-4611-b991-417dd3af0818");
if (element == null) {
console.log("Bokeh: ERROR: autoload.js configured with elementid 'e1bc1ff8-bdd8-4611-b991-417dd3af0818' but no matching script tag was found. ")
return false;
}

var js_urls = [];

var inline_js = [
function(Bokeh) {
Bokeh.$(function() { var docs_json = {"36a44a77-9055-4624-ad2a-9debe0f312fd":{"roots":{"references":[{"attributes":{"plot":{"id":"96e86096-9d04-46d7-914b-ffc18c7d7dd7","subtype":"Figure","type":"Plot"}},"id":"096fc33c-700a-4984-8c69-4300cee4f0e2","type":"PanTool"},{"attributes":{"line_color":{"value":"#1f77b4"},"line_width":{"value":2},"x":{"field":"x"},"y":{"field":"y"}},"id":"1c8167c3-fe6e-4bfa-8893-19cae5fc1995","type":"Line"},{"attributes":{"plot":{"id":"96e86096-9d04-46d7-914b-ffc18c7d7dd7","subtype":"Figure","type":"Plot"}},"id":"dcf621d2-e629-4fd0-b8d5-498eb01a68c9","type":"ResetTool"},{"attributes":{"plot":{"id":"96e86096-9d04-46d7-914b-ffc18c7d7dd7","subtype":"Figure","type":"Plot"}},"id":"56df6dec-d92f-43a2-894f-da046393d29b","type":"WheelZoomTool"},{"attributes":{},"id":"4d823e6b-36d4-4795-9b0e-fda604f242c5","type":"BasicTickFormatter"},{"attributes":{"callback":null},"id":"a076b183-e64d-4af0-8e6b-1fcba8c10f16","type":"DataRange1d"},{"attributes":{"data_source":{"id":"b55f007d-bc4a-431e-b72b-4cfd1d167c41","type":"ColumnDataSource"},"glyph":{"id":"077d3922-ec82-4cf8-8d7a-121ced721ef9","type":"Circle"},"hover_glyph":null,"nonselection_glyph":{"id":"afeb5f44-2707-4679-95a3-ccb97a1d4149","type":"Circle"},"selection_glyph":null},"id":"d5924349-8070-4046-9d36-2ab267387c1a","type":"GlyphRenderer"},{"attributes":{"active_drag":"auto","active_scroll":"auto","active_tap":"auto","tools":[{"id":"096fc33c-700a-4984-8c69-4300cee4f0e2","type":"PanTool"},{"id":"56df6dec-d92f-43a2-894f-da046393d29b","type":"WheelZoomTool"},{"id":"1221667e-9b66-4dff-a60f-4c0e50a69241","type":"BoxZoomTool"},{"id":"3d3781fe-6b35-484c-a97d-dbf8d1339110","type":"SaveTool"},{"id":"dcf621d2-e629-4fd0-b8d5-498eb01a68c9","type":"ResetTool"},{"id":"1c8b20e4-4662-4eae-a3ef-e576a0295475","type":"HelpTool"}]},"id":"7af2970f-4ab8-48c9-ab8d-f71f6ad17d4a","type":"Toolbar"},{"attributes":{"fill_alpha":{"value":0.1},"fill_color":{"value":"#1f77b4"},"line_alpha":{"value":0.1},"line_color":{"value":"#1f77b4"
},"size":{"units":"screen","value":8},"x":{"field":"x"},"y":{"field":"y"}},"id":"afeb5f44-2707-4679-95a3-ccb97a1d4149","type":"Circle"},{"attributes":{"plot":null,"text":null},"id":"5c8546cb-5a9f-4181-b085-af240d5224ed","type":"Title"},{"attributes":{},"id":"f09bc96c-2260-423d-8b7d-83066d8ad4d0","type":"BasicTicker"},{"attributes":{"below":[{"id":"130f649e-6e12-40bd-9483-f8e8572168de","type":"LinearAxis"}],"left":[{"id":"d9419de8-d6d1-430b-84a1-e3c38ca03e6f","type":"LinearAxis"}],"plot_height":400,"plot_width":400,"renderers":[{"id":"130f649e-6e12-40bd-9483-f8e8572168de","type":"LinearAxis"},{"id":"b9e2d67c-3b3f-406f-88b5-cec77a6f31ec","type":"Grid"},{"id":"d9419de8-d6d1-430b-84a1-e3c38ca03e6f","type":"LinearAxis"},{"id":"92678c86-e79f-4796-8e56-cac0d4c8788c","type":"Grid"},{"id":"782610f5-9aab-4d51-9575-41e52fc52322","type":"BoxAnnotation"},{"id":"4ba86816-e1e3-4505-bcc6-960a11a4be82","type":"GlyphRenderer"},{"id":"d5924349-8070-4046-9d36-2ab267387c1a","type":"GlyphRenderer"}],"title":{"id":"5c8546cb-5a9f-4181-b085-af240d5224ed","type":"Title"},"tool_events":{"id":"ca4ccb11-20be-4563-889e-9d9a27926077","type":"ToolEvents"},"toolbar":{"id":"7af2970f-4ab8-48c9-ab8d-f71f6ad17d4a","type":"Toolbar"},"x_range":{"id":"a076b183-e64d-4af0-8e6b-1fcba8c10f16","type":"DataRange1d"},"y_range":{"id":"e2c33fcf-ffe4-4649-aad4-5b2896efce83","type":"DataRange1d"}},"id":"96e86096-9d04-46d7-914b-ffc18c7d7dd7","subtype":"Figure","type":"Plot"},{"attributes":{"axis_label":"Documents","formatter":{"id":"4d823e6b-36d4-4795-9b0e-fda604f242c5","type":"BasicTickFormatter"},"plot":{"id":"96e86096-9d04-46d7-914b-ffc18c7d7dd7","subtype":"Figure","type":"Plot"},"ticker":{"id":"f09bc96c-2260-423d-8b7d-83066d8ad4d0","type":"BasicTicker"}},"id":"130f649e-6e12-40bd-9483-f8e8572168de","type":"LinearAxis"},{"attributes":{"axis_label":"Sparsity","formatter":{"id":"2a90637b-f1ee-4275-a716-db41fd099295","type":"BasicTickFormatter"},"plot":{"id":"96e86096-9d04-46d7-914b-ffc18c7d7dd7","subtype":"Figure","t
ype":"Plot"},"ticker":{"id":"cc52df6e-6b2a-4630-8cf7-acf66f9e0435","type":"BasicTicker"}},"id":"d9419de8-d6d1-430b-84a1-e3c38ca03e6f","type":"LinearAxis"},{"attributes":{"plot":{"id":"96e86096-9d04-46d7-914b-ffc18c7d7dd7","subtype":"Figure","type":"Plot"},"ticker":{"id":"f09bc96c-2260-423d-8b7d-83066d8ad4d0","type":"BasicTicker"}},"id":"b9e2d67c-3b3f-406f-88b5-cec77a6f31ec","type":"Grid"},{"attributes":{"dimension":1,"plot":{"id":"96e86096-9d04-46d7-914b-ffc18c7d7dd7","subtype":"Figure","type":"Plot"},"ticker":{"id":"cc52df6e-6b2a-4630-8cf7-acf66f9e0435","type":"BasicTicker"}},"id":"92678c86-e79f-4796-8e56-cac0d4c8788c","type":"Grid"},{"attributes":{"callback":null},"id":"e2c33fcf-ffe4-4649-aad4-5b2896efce83","type":"DataRange1d"},{"attributes":{},"id":"2a90637b-f1ee-4275-a716-db41fd099295","type":"BasicTickFormatter"},{"attributes":{"line_alpha":{"value":0.1},"line_color":{"value":"#1f77b4"},"line_width":{"value":2},"x":{"field":"x"},"y":{"field":"y"}},"id":"00a35aea-9136-4ff0-ac39-912099d82ec7","type":"Line"},{"attributes":{"fill_color":{"value":"white"},"line_color":{"value":"#1f77b4"},"size":{"units":"screen","value":8},"x":{"field":"x"},"y":{"field":"y"}},"id":"077d3922-ec82-4cf8-8d7a-121ced721ef9","type":"Circle"},{"attributes":{},"id":"ca4ccb11-20be-4563-889e-9d9a27926077","type":"ToolEvents"},{"attributes":{"data_source":{"id":"424dbf43-1e91-44e0-ba05-374ca69039df","type":"ColumnDataSource"},"glyph":{"id":"1c8167c3-fe6e-4bfa-8893-19cae5fc1995","type":"Line"},"hover_glyph":null,"nonselection_glyph":{"id":"00a35aea-9136-4ff0-ac39-912099d82ec7","type":"Line"},"selection_glyph":null},"id":"4ba86816-e1e3-4505-bcc6-960a11a4be82","type":"GlyphRenderer"},{"attributes":{"plot":{"id":"96e86096-9d04-46d7-914b-ffc18c7d7dd7","subtype":"Figure","type":"Plot"}},"id":"1c8b20e4-4662-4eae-a3ef-e576a0295475","type":"HelpTool"},{"attributes":{},"id":"cc52df6e-6b2a-4630-8cf7-acf66f9e0435","type":"BasicTicker"},{"attributes":{"callback":null,"column_names":["y","x"],"data":{"x":[
100,200,300,400,500,600,700,800,900],"y":[0.9801531000329073,0.9871327115598528,0.9899146939326247,0.9913779443048117,0.9922816755064662,0.9929777061040556,0.9935487768766185,0.9940079050355346,0.9943778843925557]}},"id":"424dbf43-1e91-44e0-ba05-374ca69039df","type":"ColumnDataSource"},{"attributes":{"bottom_units":"screen","fill_alpha":{"value":0.5},"fill_color":{"value":"lightgrey"},"left_units":"screen","level":"overlay","line_alpha":{"value":1.0},"line_color":{"value":"black"},"line_dash":[4,4],"line_width":{"value":2},"plot":null,"render_mode":"css","right_units":"screen","top_units":"screen"},"id":"782610f5-9aab-4d51-9575-41e52fc52322","type":"BoxAnnotation"},{"attributes":{"callback":null,"column_names":["y","x"],"data":{"x":[100,200,300,400,500,600,700,800,900],"y":[0.9801531000329073,0.9871327115598528,0.9899146939326247,0.9913779443048117,0.9922816755064662,0.9929777061040556,0.9935487768766185,0.9940079050355346,0.9943778843925557]}},"id":"b55f007d-bc4a-431e-b72b-4cfd1d167c41","type":"ColumnDataSource"},{"attributes":{"plot":{"id":"96e86096-9d04-46d7-914b-ffc18c7d7dd7","subtype":"Figure","type":"Plot"}},"id":"3d3781fe-6b35-484c-a97d-dbf8d1339110","type":"SaveTool"},{"attributes":{"overlay":{"id":"782610f5-9aab-4d51-9575-41e52fc52322","type":"BoxAnnotation"},"plot":{"id":"96e86096-9d04-46d7-914b-ffc18c7d7dd7","subtype":"Figure","type":"Plot"}},"id":"1221667e-9b66-4dff-a60f-4c0e50a69241","type":"BoxZoomTool"}],"root_ids":["96e86096-9d04-46d7-914b-ffc18c7d7dd7"]},"title":"Bokeh Application","version":"0.12.0"}}; var render_items = [{"docid":"36a44a77-9055-4624-ad2a-9debe0f312fd","elementid":"e1bc1ff8-bdd8-4611-b991-417dd3af0818","modelid":"96e86096-9d04-46d7-914b-ffc18c7d7dd7","notebook_comms_target":"b57aae7f-28eb-4896-bb27-33df8a0c1b78"}]; Bokeh.embed.embed_items(docs_json, render_items); }); }, function(Bokeh) { } ]; function run_inline_js() { for (var i = 0; i < inline_js.length; i++) { inline_js[i](window.Bokeh); } } if (window._bokeh_is_loading === 0) 
{ console.log("Bokeh: BokehJS loaded, going straight to plotting"); run_inline_js(); } else { load_libs(js_urls, function() { console.log("Bokeh: BokehJS plotting callback run at", now()); run_inline_js(); }); } }(this)); Out[19]: <Bokeh Notebook handle for In[19]>  # boolean search After doing the term frequency matrix, we went into using our first ranking function. We are using a boolean search to find documents that contains the words that are included within a user specified query. This is how our boolean search algorithm works: • Compute the lexicon for the corpus • Compute the term frequency matrix for the corpus • Convert query to query vector using the same lexicon • Compare each documents term frequncy vector to the query vector - specifically for each document in the corpus: • Compute a ranking score for each document by taking the dot product of the document's term frequency vector and the query vector • Sort the documents by ranking score  In [20]: # compute term frequency matrix and lexicon tfm, lexicon = get_tfm(corpus) # define our query qry = 'red bike' # convert query to query vector using lexicon qrv = [0]*len(lexicon) for term in qry.split(): if term in lexicon: qrv[lexicon.index(term)] = 1 #print qrv # compare query vector to each term frequency vector # this is dot product between qrv and each row of tfm for i,tfv in enumerate(tfm): print i, sum([ xy[0] * xy[1] for xy in zip(qrv, tfv) ])   0 0 1 0 2 0 3 0 4 0 5 0 6 0 7 0 8 0 9 0 10 0 11 0 12 0 13 1 14 0 15 0 16 1 17 0 18 0 19 0 20 0 21 0 22 0 23 0 24 0 25 0 26 0 27 0 28 0 29 0 30 1 31 0 32 0 33 0 34 0 35 0 36 0 37 0 38 0 39 0 40 0 41 0 42 0 43 0 44 0 45 0 46 0 47 0 48 0 49 0 50 0 51 0 52 0 53 0 54 0 55 0 56 0 57 0 58 1 59 0 60 0 61 0 62 0 63 1 64 0 65 0 66 0 67 0 68 0 69 0 70 0 71 0 72 0 73 0 74 0 75 0 76 0 77 0 78 0 79 0 80 0 81 0 82 0 83 0 84 0 85 0 86 0 87 0 88 0 89 0 90 0 91 0 92 0 93 0 94 0 95 0 96 0 97 0 98 0 99 0 100 0 101 0 102 0 103 0 104 0 105 0 106 0 107 0 108 2 109 0 110 0 111 0 112 
0 113 0 114 0 115 0 116 0 117 0 118 0 119 0 120 0 121 0 122 0 123 0 124 0 125 0 126 0 127 0 128 0 129 0 130 0 131 0 132 0 133 1 134 0 135 0 136 0 137 2 138 0 139 0 140 0 141 0 142 0 143 0 144 0 145 0 146 0 147 0 148 0 149 0 150 0 151 0 152 0 153 0 154 0 155 0 156 0 157 0 158 0 159 0 160 0 161 0 162 0 163 0 164 0 165 0 166 0 167 0 168 0 169 0 170 0 171 0 172 0 173 0 174 0 175 0 176 0 177 0 178 0 179 0 180 0 181 0 182 0 183 0 184 0 185 0 186 0 187 0 188 0 189 0 190 0 191 0 192 0 193 0 194 1 195 0 196 0 197 0 198 0 199 0 200 0 201 0 202 0 203 0 204 0 205 0 206 0 207 0 208 0 209 0 210 0 211 0 212 0 213 0 214 0 215 0 216 0 217 0 218 0 219 0 220 3 221 0 222 0 223 0 224 0 225 0 226 0 227 0 228 0 229 0 230 0 231 0 232 0 233 0 234 0 235 0 236 0 237 0 238 0 239 0 240 0 241 0 242 0 243 0 244 0 245 0 246 0 247 1 248 0 249 0 250 0 251 0 252 0 253 0 254 0 255 0 256 0 257 0 258 0 259 0 260 0 261 0 262 0 263 0 264 0 265 0 266 0 267 0 268 0 269 0 270 0 271 0 272 0 273 0 274 0 275 0 276 0 277 0 278 0 279 1 280 0 281 0 282 0 283 0 284 0 285 0 286 0 287 0 288 0 289 0 290 0 291 0 292 0 293 0 294 1 295 0 296 0 297 0 298 0 299 0 300 0 301 0 302 0 303 0 304 0 305 0 306 0 307 0 308 0 309 0 310 0 311 1 312 0 313 1 314 0 315 0 316 0 317 1 318 0 319 0 320 0 321 0 322 0 323 0 324 0 325 0 326 0 327 0 328 0 329 0 330 0 331 0 332 0 333 0 334 0 335 0 336 0 337 0 338 0 339 0 340 0 341 0 342 0 343 0 344 0 345 0 346 0 347 0 348 2 349 0 350 0 351 0 352 0 353 0 354 0 355 0 356 0 357 0 358 0 359 0 360 0 361 0 362 0 363 0 364 0 365 0 366 0 367 0 368 0 369 0 370 0 371 0 372 0 373 0 374 0 375 0 376 0 377 0 378 3 379 0 380 0 381 0 382 0 383 0 384 1 385 0 386 0 387 0 388 0 389 0 390 0 391 0 392 0 393 0 394 0 395 0 396 0 397 0 398 0 399 0 400 1 401 0 402 0 403 0 404 0 405 1 406 0 407 0 408 0 409 0 410 0 411 1 412 0 413 0 414 0 415 0 416 0 417 0 418 0 419 0 420 0 421 0 422 0 423 0 424 0 425 0 426 2 427 0 428 0 429 0 430 0 431 0 432 0 433 0 434 0 435 0 436 0 437 0 438 0 439 0 440 0 441 0 442 0 443 0 444 0 445 0 
446 0 447 0 448 0 449 0 450 0 451 0 452 0 453 0 454 0 455 0 456 0 457 0 458 0 459 0 460 0 461 0 462 1 463 0 464 0 465 0 466 0 467 0 468 0 469 0 470 0 471 0 472 0 473 0 474 0 475 0 476 0 477 0 478 0 479 0 480 0 481 0 482 1 483 0 484 0 485 0 486 0 487 0 488 0 489 0 490 0 491 0 492 0 493 0 494 0 495 0 496 0 497 0 498 14 499 0 500 0 501 0 502 0 503 0 504 0 505 0 506 0 507 0 508 0 509 0 510 0 511 0 512 0 513 0 514 0 515 0 516 0 517 0 518 0 519 0 520 0 521 0 522 0 523 0 524 0 525 0 526 0 527 0 528 0 529 0 530 0 531 0 532 0 533 0 534 0 535 0 536 0 537 0 538 0 539 0 540 0 541 0 542 0 543 0 544 1 545 0 546 0 547 0 548 0 549 0 550 0 551 0 552 0 553 2 554 0 555 0 556 0 557 0 558 0 559 0 560 0 561 0 562 0 563 0 564 0 565 0 566 0 567 0 568 0 569 0 570 0 571 0 572 0 573 0 574 0 575 0 576 0 577 0 578 0 579 0 580 0 581 0 582 0 583 0 584 0 585 0 586 0 587 0 588 0 589 0 590 0 591 0 592 0 593 0 594 0 595 0 596 0 597 0 598 0 599 0 600 0 601 0 602 0 603 0 604 0 605 0 606 0 607 0 608 0 609 0 610 0 611 0 612 0 613 0 614 0 615 0 616 0 617 0 618 0 619 0 620 0 621 0 622 1 623 0 624 0 625 0 626 0 627 0 628 0 629 0 630 0 631 0 632 0 633 0 634 0 635 5 636 0 637 0 638 0 639 0 640 0 641 0 642 0 643 0 644 0 645 0 646 0 647 0 648 0 649 0 650 0 651 0 652 0 653 1 654 0 655 0 656 0 657 0 658 0 659 1 660 0 661 0 662 0 663 0 664 0 665 0 666 0 667 0 668 0 669 0 670 0 671 0 672 0 673 0 674 0 675 0 676 1 677 0 678 0 679 1 680 0 681 0 682 4 683 0 684 0 685 0 686 0 687 0 688 0 689 0 690 0 691 0 692 0 693 0 694 0 695 0 696 0 697 0 698 0 699 0 700 0 701 0 702 0 703 0 704 0 705 0 706 0 707 0 708 0 709 0 710 0 711 0 712 0 713 0 714 0 715 3 716 0 717 0 718 0 719 0 720 0 721 0 722 0 723 0 724 0 725 0 726 0 727 0 728 0 729 0 730 0 731 0 732 0 733 0 734 0 735 0 736 0 737 0 738 0 739 0 740 0 741 0 742 0 743 0 744 0 745 0 746 0 747 0 748 0 749 0 750 0 751 0 752 0 753 0 754 0 755 0 756 0 757 0 758 0 759 0 760 0 761 0 762 0 763 0 764 0 765 0 766 0 767 0 768 0 769 0 770 0 771 0 772 0 773 0 774 0 775 0 776 0 777 0 778 0 
779 0 780 0 781 0 782 0 783 0 784 0 785 0 786 0 787 0 788 0 789 0 790 0 791 0 792 1 793 0 794 0 795 0 796 0 797 0 798 0 799 0 800 0 801 0 802 0 803 0 804 0 805 0 806 0 807 0 808 0 809 0 810 0 811 0 812 0 813 0 814 0 815 0 816 0 817 0 818 0 819 0 820 0 821 0 822 0 823 0 824 0 825 0 826 0 827 0 828 0 829 0 830 0 831 0 832 0 833 0 834 0 835 0 836 0 837 1 838 0 839 0 840 0 841 0 842 0 843 0 844 0 845 0 846 0 847 0 848 0 849 0 850 0 851 0 852 0 853 0 854 0 855 0 856 0 857 0 858 0 859 0 860 0 861 0 862 0 863 0 864 0 865 0 866 0 867 0 868 0 869 0 870 0 871 0 872 0 873 0 874 0 875 0 876 0 877 0 878 0 879 0 880 0 881 0 882 0 883 0 884 0 885 0 886 0 887 0 888 0 889 1 890 0 891 0 892 0 893 0 894 0 895 0 896 0 897 0 898 0 899 0  To compute the document ranking score we used the function get_results_tf() with results from the term frequency matrix  In [21]: def get_results_tf(qry, tfm, lexicon): qrv =[0]*len(lexicon) for term in qry.split(): if term in lexicon: qrv[lexicon.index(term)] = 1 results = [] for i, tfv in enumerate(tfm): score = 0 score = sum([ xy[0] * xy[1] for xy in zip(qrv,tfv)]) results.append([score, i]) sorted_results = sorted(results, key=lambda t: t[0] * -1) return sorted_results def print_results(results,n, head=True): ''' Helper function to print results ''' if head: print('\nTop %d from recall set of %d items:' % (n,len(results))) for r in results[:n]: print('\t%0.2f - %s'%(r[0],items_t[r[1]])) else: print('\nBottom %d from recall set of %d items:' % (n,len(results))) for r in results[-n:]: print('\t%0.2f - %s'%(r[0],items_t[r[1]])) tfm, lexicon = get_tfm(items_d[:1000]) results = get_results_tf('fun times', tfm , lexicon) print_results(results,10)   Top 10 from recall set of 1000 items: 4.00 - ('the challenge', 'm family film m children s m adventure m teen m comedy') 3.00 - ('color me kubrick', 'm lgbt m drama m comedy m indie') 3.00 - ('halloween years later', 'm cult m drama m horror m slasher m teen') 3.00 - ('b b s kids', 'm family film m domestic 
comedy m comedy m animation') 2.00 - ('the last day of summer', 'm family film m fantasy m comedy') 2.00 - ('eti', 'm romance film') 2.00 - ('halloweentown', 'm children s fantasy m children s family') 2.00 - ('des pissenlits par la racine', 'm comedy') 2.00 - ('santouri', 'm drama m world cinema') 2.00 - ('banjo the woodpile cat', 'm short film m family film m children s family m animation')  # Inverted Index the inverted index maps terms to the document in which they can be found  In [22]: def create_inverted_index(corpus): idx={} for i, document in enumerate(corpus): for word in document.split(): if word in idx: idx[word].append(i) else: idx[word] = [i] ## HIDE return idx test_corpus = ['mountain bike red','road bike carbon','bike helmet'] idx = create_inverted_index(test_corpus) print(idx)   {'mountain': [0], 'helmet': [2], 'bike': [0, 1, 2], 'red': [0], 'carbon': [1], 'road': [1]}  inverted index for document titles  In [23]: idx = create_inverted_index(items_d) print(set(idx['good']).intersection(set(idx['times']))) print(items_d[2061])   set([32488, 13314, 25605, 7688, 29707, 27661, 40338, 16911, 33808, 529, 12306, 12307, 16302, 534, 37798, 14224, 40111, 35356, 13094, 542, 31, 30240, 23587, 29221, 10278, 18983, 8234, 10283, 44, 39282, 560, 16435, 25141, 5696, 28218, 3131, 35388, 34367, 37440, 26689, 2114, 3652, 9286, 8801, 21070, 3853, 5715, 24046, 14934, 29881, 32141, 31426, 18523, 13404, 12429, 8798, 27232, 25697, 4283, 7270, 23313, 6516, 3691, 108, 29373, 20036, 39955, 11892, 629, 15479, 12408, 23161, 22652, 11836, 42112, 33921, 2690, 22055, 27270, 24711, 14444, 6423, 17037, 39277, 4752, 15505, 3943, 5781, 2710, 2711, 20632, 4036, 34971, 668, 5278, 28204, 32928, 12450, 1190, 2728, 22556, 9386, 38571, 30893, 7854, 10927, 33456, 39880, 13679, 12318, 16054, 22200, 30580, 17595, 24765, 22206, 36981, 4801, 24770, 17611, 16068, 8176, 7879, 5068, 41163, 30924, 27938, 3279, 42194, 19667, 18132, 20693, 27350, 37310, 31960, 4825, 5339, 16092, 1760, 40161, 6750, 
35077, 4359, 34534, 1255, 3304, 9449, 41194, 35563, 31980, 36697, 22255, 17136, 22770, 39669, 34039, 21716, 15983, 13052, 5839, 33535, 7936, 11523, 33029, 14086, 2689, 40200, 26377, 19213, 25358, 783, 26895, 26385, 2322, 39700, 17685, 28208, 10008, 5403, 12080, 17697, 25890, 24356, 12166, 34598, 40569, 37672, 37057, 9351, 35629, 21806, 40825, 9008, 41779, 8500, 5941, 6281, 312, 1337, 11580, 32309, 26944, 7154, 41283, 29077, 11590, 31125, 5449, 37864, 19791, 7504, 6481, 19282, 29011, 32142, 11568, 13665, 5978, 17245, 18270, 13797, 34565, 4961, 21860, 8166, 18279, 28009, 22891, 29586, 1902, 11631, 4840, 14194, 30867, 10100, 13372, 22506, 24958, 3477, 19329, 30083, 33669, 13190, 39106, 26505, 39307, 15698, 22082, 13710, 31495, 4725, 23192, 1938, 7583, 41966, 35222, 39319, 9113, 31215, 20892, 10617, 41613, 12193, 7586, 23339, 21574, 36262, 32167, 34728, 27564, 25005, 942, 25519, 4019, 36790, 9144, 28603, 28606, 34751, 16832, 4768, 38852, 35937, 20779, 33738, 15819, 20940, 39586, 40398, 7585, 7635, 18901, 15609, 35289, 32219, 6108, 23518, 29663, 6112, 9698, 28645, 38018, 2536, 19945, 36842, 34285, 9710, 26607, 20026, 35314, 40303, 25598, 40952, 24175, 20990, 23125]) soldiers with the u n forces that entered korea during the korean war rape a village girl named eon rae the villagers ostracize eon rae and her son unable to make a living eon rae joins the brothel district that has been set up near the u n base on the other side of the river from the village the war and the introduction of u s culture break down the social order of the village after several village children have died the villagers put the blame on the prostitutes eventually the villagers unable to maintain the village leave their homes one by one eon rae and her son also leave synopsis from cite web  improve the ranking function  In [24]: def get_results_tf(qry, idx): score = Counter() for term in qry.split(): for doc in idx[term]: score[doc] += 1 results=[] for x in [[r[0],r[1]] for r in zip(score.keys(), 
score.values())]: if x[1] > 0: results.append([x[1],x[0]]) sorted_results = sorted(results, key=lambda t: t[0] * -1 ) return sorted_results; idx = create_inverted_index(items_d) results = get_results_tf('zombies', idx) print_results(results,20)   Top 20 from recall set of 190 items: 30.00 - ('burial ground the nights of terror', 'm thriller m zombie film m horror m world cinema') 19.00 - ('dance of the dead', 'm zombie film m horror m indie m teen m comedy') 19.00 - ('video dead', 'm zombie film m horror m b movie m indie') 16.00 - ('zombies zombies zombies', 'm zombie film m b movie m horror m comedy') 14.00 - ('big tits zombie', 'm zombie film m japanese movies m horror') 14.00 - ('flesheater', 'm horror m indie m creature film m zombie film m b movie m teen') 13.00 - ('shaun of the dead', 'm parody m romantic comedy m horror m doomsday film m cult m comedy m zombie film m black comedy m horror comedy') 12.00 - ('dawn of the dead', 'm horror m indie m doomsday film m cult m splatter film m zombie film') 12.00 - ('dead and deader', 'm science fiction m horror m television movie m sci fi horror m zombie film m action') 11.00 - ('route', 'm zombie film m horror m creature film') 11.00 - ('undead or alive', 'm action adventure m zombie film m western m horror') 11.00 - ('hide and creep', 'm science fiction m b movie m comedy m zombie film m horror m horror comedy') 10.00 - ('the stink of flesh', 'm cult m black comedy m horror m comedy m zombie film') 10.00 - ('abraham lincoln vs zombies', 'm action m horror') 9.00 - ('planet terror', 'm thriller m action adventure m science fiction m horror m indie m creature film m cult m zombie film m disaster m action thrillers m action') 9.00 - ('night of the living dead', 'm mystery m horror') 9.00 - ('when good ghouls go bad', 'm black comedy m fantasy m comedy m children s fantasy') 8.00 - ('zombi', 'm zombie film m horror m creature film m world cinema') 8.00 - ('day of the dead contagium', 'm zombie film m horror') 8.00 - 
('land of the dead', 'm thriller m science fiction m horror m indie m doomsday film m creature film m cult m splatter film m zombie film m action m dystopia')  enter different queries  In [25]: results = get_results_tf('ghouls and ghosts', idx) print_results(results, 10)   Top 10 from recall set of 39747 items: 181.00 - ('in the line of duty witness', 'm action thrillers m world cinema m action adventure m martial arts film m action m chinese movies') 165.00 - ('dragon head', 'm science fiction m horror m world cinema m anime m disaster m japanese movies m action') 165.00 - ('band of the hand', 'm crime fiction m thriller m action thrillers m action adventure m drama m crime thriller m action') 162.00 - ('underworld rise of the lycans', 'm thriller m horror m gothic film m action adventure m period piece m fantasy m action m costume horror') 145.00 - ('franklin and the green knight', 'm family film m children s m animation') 144.00 - ('devil s diary', 'm horror m teen m television movie') 140.00 - ('wishology', 'm fantasy') 139.00 - ('the runaways', 'm punk rock m biography m indie m musical m drama m music m biographical film') 134.00 - ('the guard post', 'm mystery m horror') 129.00 - ('the mists of avalon', 'm costume drama m fantasy adventure m fantasy m feminist film')   In [26]: import pandas as pd from bokeh.plotting import output_notebook, show from bokeh.charts import Bar from bokeh.charts.attributes import CatAttr #from bokeh.models import ColumnDataSource df = pd.DataFrame({'term':[x for x in idx.keys()],'freq':[len(x) for x in idx.values()]}) output_notebook(hide_banner=True) p = Bar(df.sort_values('freq', ascending=False)[:30], label=CatAttr(columns=['term'], sort=False), values='freq', plot_width=800, plot_height=400) show(p)   var element =$('#2093ea49-a561-4433-aa88-3a90325ecacb');

(function(global) {
function now() {
return new Date();
}

if (typeof (window._bokeh_onload_callbacks) === "undefined") {
}

function run_callbacks() {
console.info("Bokeh: all callbacks have finished");
}

console.log("Bokeh: BokehJS is being loaded, scheduling callback at", now());
return null;
}
if (js_urls == null || js_urls.length === 0) {
run_callbacks();
return null;
}
for (var i = 0; i < js_urls.length; i++) {
var url = js_urls[i];
var s = document.createElement('script');
s.src = url;
s.async = false;
run_callbacks()
}
};
s.onerror = function() {
console.warn("failed to load library " + url);
};
console.log("Bokeh: injecting script tag for BokehJS library: ", url);
}
};

var js_urls = ['https://cdn.pydata.org/bokeh/release/bokeh-0.12.0.min.js', 'https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.0.min.js', 'https://cdn.pydata.org/bokeh/release/bokeh-compiler-0.12.0.min.js'];

var inline_js = [
function(Bokeh) {
Bokeh.set_log_level("info");
},

function(Bokeh) {
Bokeh.$("#39b1e159-0ba8-4a4f-9f69-3e8ed0b65fea").text("BokehJS successfully loaded"); }, function(Bokeh) { console.log("Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-0.12.0.min.css"); Bokeh.embed.inject_css("https://cdn.pydata.org/bokeh/release/bokeh-0.12.0.min.css"); console.log("Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.0.min.css"); Bokeh.embed.inject_css("https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.0.min.css"); } ]; function run_inline_js() { for (var i = 0; i < inline_js.length; i++) { inline_js[i](window.Bokeh); } } if (window._bokeh_is_loading === 0) { console.log("Bokeh: BokehJS loaded, going straight to plotting"); run_inline_js(); } else { load_libs(js_urls, function() { console.log("Bokeh: BokehJS plotting callback run at", now()); run_inline_js(); }); } }(this)); (function(global) { function now() { return new Date(); } if (typeof (window._bokeh_onload_callbacks) === "undefined") { window._bokeh_onload_callbacks = []; } function run_callbacks() { window._bokeh_onload_callbacks.forEach(function(callback) { callback() }); delete window._bokeh_onload_callbacks console.info("Bokeh: all callbacks have finished"); } function load_libs(js_urls, callback) { window._bokeh_onload_callbacks.push(callback); if (window._bokeh_is_loading > 0) { console.log("Bokeh: BokehJS is being loaded, scheduling callback at", now()); return null; } if (js_urls == null || js_urls.length === 0) { run_callbacks(); return null; } console.log("Bokeh: BokehJS not loaded, scheduling load and callback at", now()); window._bokeh_is_loading = js_urls.length; for (var i = 0; i < js_urls.length; i++) { var url = js_urls[i]; var s = document.createElement('script'); s.src = url; s.async = false; s.onreadystatechange = s.onload = function() { window._bokeh_is_loading--; if (window._bokeh_is_loading === 0) { console.log("Bokeh: all BokehJS libraries loaded"); run_callbacks() } }; s.onerror = function() { console.warn("failed to 
load library " + url); }; console.log("Bokeh: injecting script tag for BokehJS library: ", url); document.getElementsByTagName("head")[0].appendChild(s); } };var element = document.getElementById("6a635638-0b90-43f7-b3ab-b8749ec1b624"); if (element == null) { console.log("Bokeh: ERROR: autoload.js configured with elementid '6a635638-0b90-43f7-b3ab-b8749ec1b624' but no matching script tag was found. ") return false; } var js_urls = []; var inline_js = [ function(Bokeh) { Bokeh.$(function() {
var render_items = [{"docid":"f2f141e0-0197-42ba-9f69-25fd7ccb13d4","elementid":"6a635638-0b90-43f7-b3ab-b8749ec1b624","modelid":"23dfe37d-8217-4430-bfb3-0776f838877e","notebook_comms_target":"1b8c2520-a670-4655-be10-91b52409a558"}];

Bokeh.embed.embed_items(docs_json, render_items);
});
},
function(Bokeh) {
}
];

function run_inline_js() {
for (var i = 0; i < inline_js.length; i++) {
inline_js[i](window.Bokeh);
}
}

console.log("Bokeh: BokehJS loaded, going straight to plotting");
run_inline_js();
} else {
console.log("Bokeh: BokehJS plotting callback run at", now());
run_inline_js();
});
}
}(this));

Out[26]:

<Bokeh Notebook handle for In[26]>



# TF-IDF

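To implement TF-IDF we weight each term by its smoothed inverse document frequency: $$IDF(t) = \log \left( \frac{N}{1 + n_t} \right)$$ where $N$ is the total number of documents and $n_t$ is the number of documents indexed under term $t$. The $1$ in the denominator avoids division by zero for a term that appears in no document.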



In [27]:

import math

def idf(term, idx, n):
    """Return the smoothed inverse document frequency of ``term``.

    Computes ``log(n / (1 + n_t))``, where ``n_t`` is the number of entries
    stored under ``term`` in ``idx``.

    Arguments:
    term -- the term to look up.
    idx  -- mapping from term to a sized collection of entries (the
            inverted index).
    n    -- total number of documents in the collection.

    A term that is missing from ``idx`` is treated as having zero entries,
    so the result is ``log(n)`` instead of a KeyError.

    Raises ValueError (from ``math.log``) when ``n`` is not positive.
    """
    # .get() rather than idx[term]: the "1 +" smoothing is there to cope with
    # a zero count, so an unseen term must not fail on the lookup itself (and
    # a defaultdict-style index must not gain an empty entry as a side effect).
    postings = idx.get(term, ())
    # float() forces true division under Python 2 integer semantics.
    return math.log(float(n) / (1 + len(postings)))

# Spot-check the IDF of three sample terms against the full set of
# item descriptions.
for _probe in ('zombie', 'survival', 'invasions'):
    print(idf(_probe, idx, len(items_d)))




4.35124994957
4.91040628425
8.45297461909



### TF-IDF Intuition



In [28]:

from bokeh.charts import vplot

# Rebuild the inverted index over all item descriptions so this cell does not
# depend on an index left over from an earlier cell.
idx = create_inverted_index(items_d)

# Walk the index keys once so the 'term', 'freq' and 'idf' columns are
# guaranteed to line up row by row. 'freq' is the number of entries stored
# under each term in the index.
terms = list(idx)
n_docs = len(items_d)  # size of the collection the index was built from
df = pd.DataFrame({'term': terms,
                   'freq': [len(idx[t]) for t in terms],
                   'idf': [idf(t, idx, n_docs) for t in terms]})

# Both charts show the same 30 terms (largest 'freq' first), so sort once and
# take a fresh slice for each chart.
ranked = df.sort_values('freq', ascending=False)

output_notebook(hide_banner=True)
# NOTE(review): sort=False presumably keeps the bars in the row order of the
# frame rather than re-sorting the labels; confirm against the CatAttr docs.
p1 = Bar(ranked[:30], label=CatAttr(columns=['term'], sort=False), values='freq',
         plot_width=800, plot_height=400)
p2 = Bar(ranked[:30], label=CatAttr(columns=['term'], sort=False), values='idf',
         plot_width=800, plot_height=400)

# bokeh.io.vplot is deprecated as of Bokeh 0.12.0; Column is the replacement
# named by the deprecation warning this cell used to emit.
from bokeh.models.layouts import Column
p = Column(p1, p2)
show(p)




var element = $('#52138dc2-9d79-49e6-95f4-42e8d39e7bad'); (function(global) { function now() { return new Date(); } if (typeof (window._bokeh_onload_callbacks) === "undefined") { window._bokeh_onload_callbacks = []; } function run_callbacks() { window._bokeh_onload_callbacks.forEach(function(callback) { callback() }); delete window._bokeh_onload_callbacks console.info("Bokeh: all callbacks have finished"); } function load_libs(js_urls, callback) { window._bokeh_onload_callbacks.push(callback); if (window._bokeh_is_loading > 0) { console.log("Bokeh: BokehJS is being loaded, scheduling callback at", now()); return null; } if (js_urls == null || js_urls.length === 0) { run_callbacks(); return null; } console.log("Bokeh: BokehJS not loaded, scheduling load and callback at", now()); window._bokeh_is_loading = js_urls.length; for (var i = 0; i < js_urls.length; i++) { var url = js_urls[i]; var s = document.createElement('script'); s.src = url; s.async = false; s.onreadystatechange = s.onload = function() { window._bokeh_is_loading--; if (window._bokeh_is_loading === 0) { console.log("Bokeh: all BokehJS libraries loaded"); run_callbacks() } }; s.onerror = function() { console.warn("failed to load library " + url); }; console.log("Bokeh: injecting script tag for BokehJS library: ", url); document.getElementsByTagName("head")[0].appendChild(s); } }; var js_urls = ['https://cdn.pydata.org/bokeh/release/bokeh-0.12.0.min.js', 'https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.0.min.js', 'https://cdn.pydata.org/bokeh/release/bokeh-compiler-0.12.0.min.js']; var inline_js = [ function(Bokeh) { Bokeh.set_log_level("info"); }, function(Bokeh) { Bokeh.$("#de1cfd1c-1059-4d1a-83d9-239f1925cae0").text("BokehJS successfully loaded");
},
function(Bokeh) {
console.log("Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-0.12.0.min.css");
Bokeh.embed.inject_css("https://cdn.pydata.org/bokeh/release/bokeh-0.12.0.min.css");
console.log("Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.0.min.css");
Bokeh.embed.inject_css("https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.0.min.css");
}
];

function run_inline_js() {
for (var i = 0; i < inline_js.length; i++) {
inline_js[i](window.Bokeh);
}
}

console.log("Bokeh: BokehJS loaded, going straight to plotting");
run_inline_js();
} else {
console.log("Bokeh: BokehJS plotting callback run at", now());
run_inline_js();
});
}
}(this));

/Users/dustin/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:13: BokehDeprecationWarning: bokeh.io.vplot was deprecated in Bokeh 0.12.0; please use bokeh.models.layouts.Column instead

(function(global) {
function now() {
return new Date();
}

if (typeof (window._bokeh_onload_callbacks) === "undefined") {
}

function run_callbacks() {
console.info("Bokeh: all callbacks have finished");
}

console.log("Bokeh: BokehJS is being loaded, scheduling callback at", now());
return null;
}
if (js_urls == null || js_urls.length === 0) {
run_callbacks();
return null;
}
for (var i = 0; i < js_urls.length; i++) {
var url = js_urls[i];
var s = document.createElement('script');
s.src = url;
s.async = false;
run_callbacks()
}
};
s.onerror = function() {
console.warn("failed to load library " + url);
};
console.log("Bokeh: injecting script tag for BokehJS library: ", url);
}
};var element = document.getElementById("6067b708-a0f7-429c-898e-b845b1969084");
if (element == null) {
console.log("Bokeh: ERROR: autoload.js configured with elementid '6067b708-a0f7-429c-898e-b845b1969084' but no matching script tag was found. ")
return false;
}

var js_urls = [];

var inline_js = [
function(Bokeh) {
Bokeh.$(function() { var docs_json = {"228434f8-5648-4d6e-9898-707adaf02248":{"roots":{"references":[{"attributes":{"data_source":{"id":"e7d71a6a-dcfa-494f-9efb-d09b3e01d693","type":"ColumnDataSource"},"glyph":{"id":"15bce605-0047-4d01-a937-94a5c12238b6","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"14b0c9c7-080c-4922-89ed-15d530da2af3","type":"GlyphRenderer"},{"attributes":{"data_source":{"id":"c0e6047b-ffd0-48f6-a8b1-bf656ce09229","type":"ColumnDataSource"},"glyph":{"id":"fb3db002-82b0-43e6-a4da-f9de5ed1f195","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"c583f87d-ef6e-4674-8c71-25f4bbc53776","type":"GlyphRenderer"},{"attributes":{"data_source":{"id":"d2e9edae-cc0f-4921-8b19-d09cf61b1ff2","type":"ColumnDataSource"},"glyph":{"id":"af7dd631-b6e7-4b5f-8afc-74b6d7068daf","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"573db6ce-97af-4cc6-a6a9-643a2395cae7","type":"GlyphRenderer"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"53e51508-4e95-4d86-b9ab-1777babc6334","type":"Rect"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"who"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-0.5241157474214996],"label":[{"term":"who"}],"line_alpha":[1.0],"line_color":["white"],"term":["who"],"width":[0.8],"x":["who"],"y":[-0.2620578737107498]}},"id":"d28f523d-d8e6-49e7-9b92-a6bb49b9bd34","type":"ColumnDataSource"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},
"id":"5a7d16cf-ab77-48ba-8718-aec26dcf6567","type":"Rect"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"52ddc914-d65b-45f8-ae52-c8754a4184ea","type":"Rect"},{"attributes":{"dimension":1,"plot":{"id":"f9ba29c1-b2c1-43a5-b32b-b3519a51adbc","subtype":"Chart","type":"Plot"},"ticker":{"id":"3c711e40-3f43-4107-866c-327bd9cbb0dc","type":"BasicTicker"}},"id":"66f0fb3f-73e8-4dfd-bdc0-bf32680dfc7c","type":"Grid"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"with"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-1.19290732857723],"label":[{"term":"with"}],"line_alpha":[1.0],"line_color":["white"],"term":["with"],"width":[0.8],"x":["with"],"y":[-0.596453664288615]}},"id":"ad72acb6-f7c5-4145-a75b-16d6ab673153","type":"ColumnDataSource"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"an"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-0.31825768035584007],"label":[{"term":"an"}],"line_alpha":[1.0],"line_color":["white"],"term":["an"],"width":[0.8],"x":["an"],"y":[-0.15912884017792003]}},"id":"ee1233b0-dec8-4d05-a1db-ab5c734e0884","type":"ColumnDataSource"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"for"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-0.8106326539861032],"label":[{"term":"for"}],"line_alpha":[1.0],"line_color":["white"],"term":["for"],"width":[0.8],"x":["for"],"y":[-0.4053163269930516]}},"id":"d0a9e417-3219-4610-b936-218e209eb996","type":"ColumnDataSource"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color
":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"5c4838ab-b947-40fa-9d8f-25ff7851107b","type":"Rect"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"to"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[480856.0],"label":[{"term":"to"}],"line_alpha":[1.0],"line_color":["white"],"term":["to"],"width":[0.8],"x":["to"],"y":[240428.0]}},"id":"2f577255-c195-48ba-99a0-ecc83d18705f","type":"ColumnDataSource"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"b170b8d7-24ec-4178-8c43-b9fdcaf4af25","type":"Rect"},{"attributes":{"callback":null,"end":864628.8},"id":"66404f37-b2a8-4b51-84c8-3f0067cb47e6","type":"Range1d"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"from"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-0.36259327431717525],"label":[{"term":"from"}],"line_alpha":[1.0],"line_color":["white"],"term":["from"],"width":[0.8],"x":["from"],"y":[-0.18129663715858763]}},"id":"b7c12cb6-a8dc-45b4-bef7-c67388ad863d","type":"ColumnDataSource"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"1036c666-e82e-493d-94ed-e279e05bef99","type":"Rect"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","unit
s":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"17e91aa8-63e5-4105-b9eb-bff155cea47e","type":"Rect"},{"attributes":{"data_source":{"id":"4a916ec0-b9e4-475b-a328-0689f5d1a8a7","type":"ColumnDataSource"},"glyph":{"id":"6f3a53bc-0d51-485c-9980-4e701f474dab","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"368d8fdc-c1b4-4eab-95e5-cdc04a33bd99","type":"GlyphRenderer"},{"attributes":{"data_source":{"id":"745746fe-c19a-4192-b659-e755718ba2f5","type":"ColumnDataSource"},"glyph":{"id":"b170b8d7-24ec-4178-8c43-b9fdcaf4af25","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"2a2a2e69-a9ca-43dd-a245-812fbb4ea5a4","type":"GlyphRenderer"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"f6216011-d772-49b8-a582-f35d57338d0f","type":"Rect"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"as"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-0.744100377785451],"label":[{"term":"as"}],"line_alpha":[1.0],"line_color":["white"],"term":["as"],"width":[0.8],"x":["as"],"y":[-0.3720501888927255]}},"id":"b74e7c4f-c80c-4239-beea-bbd3f6812bb1","type":"ColumnDataSource"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"he"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-1.4512959601344462],"label":[{"term":"he"}],"line_alpha":[1.0],"line_color":["white"],"term":["he"],"width":[0.8],"x":["he"],"y":[-0.7256479800672231]}},"id":"75fa5ceb-befd-491d-b24d-b901afe3cc9b","type":"ColumnDataSource"},{"attributes":{"data_source":{"id":"d0584287-8a08-416c-a720-573fbdc45c81","type":"ColumnDataSource"},"gl
yph":{"id":"3451cc85-b13b-4679-a612-5fd37a35bc8b","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"0bdfc78f-f6f3-4e6d-a756-e32ae7697ec6","type":"GlyphRenderer"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"him"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-0.829407305751567],"label":[{"term":"him"}],"line_alpha":[1.0],"line_color":["white"],"term":["him"],"width":[0.8],"x":["him"],"y":[-0.4147036528757835]}},"id":"34993822-5213-4048-a792-ef5711f2fe6f","type":"ColumnDataSource"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"from"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[60644.0],"label":[{"term":"from"}],"line_alpha":[1.0],"line_color":["white"],"term":["from"],"width":[0.8],"x":["from"],"y":[30322.0]}},"id":"07065e70-c22e-40ef-a622-a7676f5bc562","type":"ColumnDataSource"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"1429726b-42aa-4f5c-b13f-07d27d199630","type":"Rect"},{"attributes":{"data_source":{"id":"f060e950-65e4-4931-8658-84b77de23aad","type":"ColumnDataSource"},"glyph":{"id":"f873ecb2-1f4a-4ef6-a3be-bb6468ec4c29","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"e0641a18-fcb9-469b-97b1-3c0a00ae52c9","type":"GlyphRenderer"},{"attributes":{"data_source":{"id":"164d2856-0572-4f91-aea5-6a6d2cd065fe","type":"ColumnDataSource"},"glyph":{"id":"3b8d19c9-4d0f-4fcf-ba23-cd49d7347091","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"a7b18b75-36c9-483a-aeae-3bb7ff0f198b","type":"GlyphRenderer"},{"attributes":{"data_
source":{"id":"2f577255-c195-48ba-99a0-ecc83d18705f","type":"ColumnDataSource"},"glyph":{"id":"054cafa8-436d-4760-9ad3-fef7806bf304","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"d5370cda-9644-421e-b8d4-669d221e7103","type":"GlyphRenderer"},{"attributes":{"bottom_units":"screen","fill_alpha":{"value":0.5},"fill_color":{"value":"lightgrey"},"left_units":"screen","level":"overlay","line_alpha":{"value":1.0},"line_color":{"value":"black"},"line_dash":[4,4],"line_width":{"value":2},"plot":null,"render_mode":"css","right_units":"screen","top_units":"screen"},"id":"89af47fd-7684-42c6-862b-480efa0d9a57","type":"BoxAnnotation"},{"attributes":{},"id":"7279e6e6-20de-49f5-b726-dcb5b15d70d6","type":"BasicTickFormatter"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"f873ecb2-1f4a-4ef6-a3be-bb6468ec4c29","type":"Rect"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"0db1e70c-b798-4f76-96fa-ee7f7d88deb4","type":"Rect"},{"attributes":{"data_source":{"id":"37769868-c296-4e5e-bf0d-f5eccc1eb209","type":"ColumnDataSource"},"glyph":{"id":"b3eca825-1ea2-4135-9988-9d5b15bd96ca","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"42842623-de88-4282-bdf6-1ff91c5902ca","type":"GlyphRenderer"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"4b1d76fd-d65d-48fc-b61d-73004f58e1be","type":"Rect"},{"attributes":{"fill_alpha":{"field":"
fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"833d94a8-8243-4c17-aa37-21982b5ab65d","type":"Rect"},{"attributes":{"active_drag":"auto","active_scroll":"auto","active_tap":"auto","tools":[{"id":"285a53e0-9ba5-4adf-a93a-599a05a2b1d3","type":"PanTool"},{"id":"9d6c1ff6-3edc-47aa-a54b-a1a6fd6b9086","type":"WheelZoomTool"},{"id":"2638e969-df0b-4d97-9f7d-1de9a4dbc50a","type":"BoxZoomTool"},{"id":"704e3120-3b6a-449b-9d4b-202759c860ff","type":"SaveTool"},{"id":"2dc75eb2-a5fa-473b-9dae-54d055d378ea","type":"ResetTool"},{"id":"ddc9545f-569f-4206-8fde-00f0eaea8589","type":"HelpTool"}]},"id":"5badf80e-2ada-4753-b16a-dba8f50f758f","type":"Toolbar"},{"attributes":{"data_source":{"id":"155c0b63-3018-41a3-9a30-9db15a2e06d7","type":"ColumnDataSource"},"glyph":{"id":"39b5794c-8178-438b-9584-a417ee62010a","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"50cd347c-84a6-4306-b228-1403d7bc3862","type":"GlyphRenderer"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"6eba8405-cf73-4a28-a766-e7c15d19858f","type":"Rect"},{"attributes":{"data_source":{"id":"d10d227c-341f-44e1-89bd-83a30e510cbb","type":"ColumnDataSource"},"glyph":{"id":"8110f4f9-9a94-45ae-9d7d-01cd3d8dad0e","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"6f19ca39-a719-47ed-a2e0-dfe515c49100","type":"GlyphRenderer"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"28cf1daf-bcf1-48d1-8a53-a57342433954","ty
pe":"Rect"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"s"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[157928.0],"label":[{"term":"s"}],"line_alpha":[1.0],"line_color":["white"],"term":["s"],"width":[0.8],"x":["s"],"y":[78964.0]}},"id":"4a916ec0-b9e4-475b-a328-0689f5d1a8a7","type":"ColumnDataSource"},{"attributes":{"data_source":{"id":"15c8d403-ca1e-4c1d-b204-1a8c11ec2e06","type":"ColumnDataSource"},"glyph":{"id":"300a52ce-2159-4061-bef9-27e4ff6030e0","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"bff31e20-1d7b-401b-9773-af1438fd40cd","type":"GlyphRenderer"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"their"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[53503.0],"label":[{"term":"their"}],"line_alpha":[1.0],"line_color":["white"],"term":["their"],"width":[0.8],"x":["their"],"y":[26751.5]}},"id":"59d5fa0d-306f-4ebe-8360-4c174f419c56","type":"ColumnDataSource"},{"attributes":{"data_source":{"id":"29639765-7199-4a04-9910-60919bf7f82c","type":"ColumnDataSource"},"glyph":{"id":"28cf1daf-bcf1-48d1-8a53-a57342433954","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"b103d63b-8f5c-4cb8-a642-cf2ddccbada3","type":"GlyphRenderer"},{"attributes":{"below":[{"id":"2af85d4b-122f-4083-910c-6aee8ca9b295","type":"CategoricalAxis"}],"left":[{"id":"29cfe57f-4cda-46b4-8181-c9c19631c459","type":"LinearAxis"}],"plot_height":400,"plot_width":800,"renderers":[{"id":"89af47fd-7684-42c6-862b-480efa0d9a57","type":"BoxAnnotation"},{"id":"8c316225-196b-4c90-8e87-f28eae4a9310","type":"GlyphRenderer"},{"id":"d5370cda-9644-421e-b8d4-669d221e7103","type":"GlyphRenderer"},{"id":"c583f87d-ef6e-4674-8c71-25f4bbc53776","type":"GlyphRenderer"},{"id":"a7b18b75-36c9-483a-aeae-3bb7ff0f1
98b","type":"GlyphRenderer"},{"id":"deb4d911-d78f-4141-bb1f-54e86d087374","type":"GlyphRenderer"},{"id":"42842623-de88-4282-bdf6-1ff91c5902ca","type":"GlyphRenderer"},{"id":"14b0c9c7-080c-4922-89ed-15d530da2af3","type":"GlyphRenderer"},{"id":"c063900f-02cb-46f0-85e4-60418904bc1a","type":"GlyphRenderer"},{"id":"745cae12-85e2-4b8e-a8fc-da73cc133ef2","type":"GlyphRenderer"},{"id":"368d8fdc-c1b4-4eab-95e5-cdc04a33bd99","type":"GlyphRenderer"},{"id":"187ac1a0-74a8-41dc-bcf2-6f5825da9c44","type":"GlyphRenderer"},{"id":"19fdd1b0-c4b4-4271-a63b-7a25d4d4d69a","type":"GlyphRenderer"},{"id":"60cdbffa-722b-4078-bffc-ad98981d7304","type":"GlyphRenderer"},{"id":"0e27a912-246c-4d97-a1f9-b89e51a92b53","type":"GlyphRenderer"},{"id":"573db6ce-97af-4cc6-a6a9-643a2395cae7","type":"GlyphRenderer"},{"id":"26925648-4d41-4d01-a7f6-db4c46372205","type":"GlyphRenderer"},{"id":"3e98c68c-e3bd-4d0a-b198-65fd328233d7","type":"GlyphRenderer"},{"id":"2a2a2e69-a9ca-43dd-a245-812fbb4ea5a4","type":"GlyphRenderer"},{"id":"951d009e-a7ba-4629-82cb-eb57595e4c4a","type":"GlyphRenderer"},{"id":"f065d4a9-6b4e-47a7-a9c4-2c429da7455a","type":"GlyphRenderer"},{"id":"1851a8c6-1dd0-4b6d-9e95-e030700fff2a","type":"GlyphRenderer"},{"id":"a0df3022-0ceb-4fab-bbdf-214e83fb8f38","type":"GlyphRenderer"},{"id":"0734b97b-d68b-42c0-bd9a-09e641bd7ddd","type":"GlyphRenderer"},{"id":"bb8145b9-a7a3-4b32-940a-bbf7a4c5e4c7","type":"GlyphRenderer"},{"id":"c2216d76-a7d5-4758-9c95-77eaa9e1caad","type":"GlyphRenderer"},{"id":"d3aba626-4626-4632-b4dc-8c89c982f174","type":"GlyphRenderer"},{"id":"6f19ca39-a719-47ed-a2e0-dfe515c49100","type":"GlyphRenderer"},{"id":"79f3343c-80ee-4061-be08-db97ab97c45c","type":"GlyphRenderer"},{"id":"c713fd0e-6265-48f8-ad10-72a2624e583f","type":"GlyphRenderer"},{"id":"e5c6d8f3-87ee-418a-803c-b5fa27962362","type":"GlyphRenderer"},{"id":"94e01f01-411b-4464-8bcd-28a6c515db54","type":"Legend"},{"id":"2af85d4b-122f-4083-910c-6aee8ca9b295","type":"CategoricalAxis"},{"id":"29cfe57f-4cda-46b4-8181-c9c19631c459"
,"type":"LinearAxis"},{"id":"c7bb252e-cc61-4c65-b27f-6c6a3da4c777","type":"Grid"}],"title":{"id":"d2611002-0555-41b1-93df-d59bca0d625d","type":"Title"},"tool_events":{"id":"7d1bf0d3-6f82-4168-816b-918883e5a488","type":"ToolEvents"},"toolbar":{"id":"8e446a7b-d8c1-4768-a007-734e68d56aad","type":"Toolbar"},"x_mapper_type":"auto","x_range":{"id":"e0a81336-b35a-4f18-8585-931ae4358035","type":"FactorRange"},"y_mapper_type":"auto","y_range":{"id":"66404f37-b2a8-4b51-84c8-3f0067cb47e6","type":"Range1d"}},"id":"04efd8e2-c1e4-40ac-9e9b-6bb9bc49f084","subtype":"Chart","type":"Plot"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"who"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[71275.0],"label":[{"term":"who"}],"line_alpha":[1.0],"line_color":["white"],"term":["who"],"width":[0.8],"x":["who"],"y":[35637.5]}},"id":"bd365f63-2073-4be4-992d-b8ca14eb1e4a","type":"ColumnDataSource"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"2edbb47c-9c66-4984-8a54-868c5df7431a","type":"Rect"},{"attributes":{"data_source":{"id":"694dcede-905f-417b-ac07-2f3be1ef5c9a","type":"ColumnDataSource"},"glyph":{"id":"305c7bd3-65c6-450a-9380-cc8d66bcbf8f","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"deb4d911-d78f-4141-bb1f-54e86d087374","type":"GlyphRenderer"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"on"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-0.68264220669066],"label":[{"term":"on"}],"line_alpha":[1.0],"line_color":["white"],"term":["on"],"width":[0.8],"x":["on"],"y":[-0.34132110334533]}},"id":"5949eb2d-c4df-4325-a55f-0e
08cd3e9a26","type":"ColumnDataSource"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"fb3db002-82b0-43e6-a4da-f9de5ed1f195","type":"Rect"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"a"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-2.1903210288055357],"label":[{"term":"a"}],"line_alpha":[1.0],"line_color":["white"],"term":["a"],"width":[0.8],"x":["a"],"y":[-1.0951605144027678]}},"id":"5f7a68ff-ff7a-49f2-af42-a77170da541e","type":"ColumnDataSource"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"3b8d19c9-4d0f-4fcf-ba23-cd49d7347091","type":"Rect"},{"attributes":{"data_source":{"id":"aabb1342-d512-4cf5-b0ae-0b3b80b41893","type":"ColumnDataSource"},"glyph":{"id":"9afa1774-5e45-482d-8b7f-2164879240bc","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"69303976-48f4-4529-a78e-7a2e8db1b792","type":"GlyphRenderer"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"it"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-0.25998635905994466],"label":[{"term":"it"}],"line_alpha":[1.0],"line_color":["white"],"term":["it"],"width":[0.8],"x":["it"],"y":[-0.12999317952997233]}},"id":"5cad6bd6-fcbe-4ab6-bc1a-d24271e76266","type":"ColumnDataSource"},{"attributes":{"data_source":{"id":"c43049ac-f0f6-43de-9488-43753806052d","type":"ColumnDataSource"},"glyph":{"id":"1429726b-42aa-4f5c-b13f-07d27d199630","type":"Rect"},"hover_glyph
":null,"nonselection_glyph":null,"selection_glyph":null},"id":"c2216d76-a7d5-4758-9c95-77eaa9e1caad","type":"GlyphRenderer"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"her"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[150961.0],"label":[{"term":"her"}],"line_alpha":[1.0],"line_color":["white"],"term":["her"],"width":[0.8],"x":["her"],"y":[75480.5]}},"id":"484e8969-e839-4e31-bf09-9c80fe597b82","type":"ColumnDataSource"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"his"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-1.5369607257262137],"label":[{"term":"his"}],"line_alpha":[1.0],"line_color":["white"],"term":["his"],"width":[0.8],"x":["his"],"y":[-0.7684803628631068]}},"id":"9dbf952f-ce33-4d4f-860c-fad6672fae1a","type":"ColumnDataSource"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"5fc5269d-8bed-4677-a3dd-1b5163c644b3","type":"Rect"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"70b95ace-7577-4594-83ba-43eaa9753515","type":"Rect"},{"attributes":{"plot":{"id":"f9ba29c1-b2c1-43a5-b32b-b3519a51adbc","subtype":"Chart","type":"Plot"}},"id":"9d6c1ff6-3edc-47aa-a54b-a1a6fd6b9086","type":"WheelZoomTool"},{"attributes":{"data_source":{"id":"a7afb923-e359-4320-9b45-59aec7764d1b","type":"ColumnDataSource"},"glyph":{"id":"74b21637-5474-4799-aa8d-aeb0d8e57f21","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"d
a7c754e-64c2-4fd2-837d-b56508157dff","type":"GlyphRenderer"},{"attributes":{},"id":"366f535f-1055-46ac-8c13-daae06a29440","type":"CategoricalTicker"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"76d4569f-f858-4390-a682-3160d58c4078","type":"Rect"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"but"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-0.5316912235702537],"label":[{"term":"but"}],"line_alpha":[1.0],"line_color":["white"],"term":["but"],"width":[0.8],"x":["but"],"y":[-0.26584561178512683]}},"id":"4daee180-4430-4ff0-a6fb-87c5d5f533e5","type":"ColumnDataSource"},{"attributes":{"data_source":{"id":"34fb182e-2df4-438d-b3d3-dfffd48954d3","type":"ColumnDataSource"},"glyph":{"id":"2edbb47c-9c66-4984-8a54-868c5df7431a","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"e5c6d8f3-87ee-418a-803c-b5fa27962362","type":"GlyphRenderer"},{"attributes":{"data_source":{"id":"fc657dfc-52ba-4643-bd64-9c03ba37daa9","type":"ColumnDataSource"},"glyph":{"id":"b753510c-f57d-4f5f-91de-d7ff2bba1733","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"0e27a912-246c-4d97-a1f9-b89e51a92b53","type":"GlyphRenderer"},{"attributes":{"data_source":{"id":"07065e70-c22e-40ef-a622-a7676f5bc562","type":"ColumnDataSource"},"glyph":{"id":"6eba8405-cf73-4a28-a766-e7c15d19858f","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"bb8145b9-a7a3-4b32-940a-bbf7a4c5e4c7","type":"GlyphRenderer"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","un
its":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"51e69745-7e70-484f-9317-0728664d5f7d","type":"Rect"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"when"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-0.1677574297722078],"label":[{"term":"when"}],"line_alpha":[1.0],"line_color":["white"],"term":["when"],"width":[0.8],"x":["when"],"y":[-0.0838787148861039]}},"id":"0f5d7561-0160-492a-8df1-cb03df61e6ed","type":"ColumnDataSource"},{"attributes":{"plot":{"id":"f9ba29c1-b2c1-43a5-b32b-b3519a51adbc","subtype":"Chart","type":"Plot"}},"id":"2dc75eb2-a5fa-473b-9dae-54d055d378ea","type":"ResetTool"},{"attributes":{"dimension":1,"plot":{"id":"04efd8e2-c1e4-40ac-9e9b-6bb9bc49f084","subtype":"Chart","type":"Plot"},"ticker":{"id":"881f8b3e-8e54-479a-8e08-4cf1675dffb7","type":"BasicTicker"}},"id":"c7bb252e-cc61-4c65-b27f-6c6a3da4c777","type":"Grid"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"8110f4f9-9a94-45ae-9d7d-01cd3d8dad0e","type":"Rect"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"614597e2-b058-4170-ba0b-1b245aabbc9a","type":"Rect"},{"attributes":{"axis_label":"Sum( Freq 
)","formatter":{"id":"4fcf722c-0abb-4d3e-a57d-87d7639e7e8d","type":"BasicTickFormatter"},"plot":{"id":"04efd8e2-c1e4-40ac-9e9b-6bb9bc49f084","subtype":"Chart","type":"Plot"},"ticker":{"id":"881f8b3e-8e54-479a-8e08-4cf1675dffb7","type":"BasicTicker"}},"id":"29cfe57f-4cda-46b4-8181-c9c19631c459","type":"LinearAxis"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"in"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-1.6612419647301946],"label":[{"term":"in"}],"line_alpha":[1.0],"line_color":["white"],"term":["in"],"width":[0.8],"x":["in"],"y":[-0.8306209823650973]}},"id":"29639765-7199-4a04-9910-60919bf7f82c","type":"ColumnDataSource"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"is"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-1.6689833604821003],"label":[{"term":"is"}],"line_alpha":[1.0],"line_color":["white"],"term":["is"],"width":[0.8],"x":["is"],"y":[-0.8344916802410501]}},"id":"be0856cc-2fea-4ec8-a688-da957019dfec","type":"ColumnDataSource"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"to"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-2.4331260111631927],"label":[{"term":"to"}],"line_alpha":[1.0],"line_color":["white"],"term":["to"],"width":[0.8],"x":["to"],"y":[-1.2165630055815964]}},"id":"1d1dcfd5-1f28-416a-8a71-6150f2c81ce9","type":"ColumnDataSource"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"when"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[49908.0],"label":[{"term":"when"}],"line_alpha":[1.0],"line_color":["white"],"term":["when"],"width":[0.8],"x":["when"],"y":[24954.0]}},"id":"34fb1
82e-2df4-438d-b3d3-dfffd48954d3","type":"ColumnDataSource"},{"attributes":{"plot":{"id":"04efd8e2-c1e4-40ac-9e9b-6bb9bc49f084","subtype":"Chart","type":"Plot"}},"id":"a552f5cc-9e70-464f-8c99-f24cced298d2","type":"ResetTool"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"7cf44005-7f4b-4de1-977e-a2893549fb82","type":"Rect"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"48d6b071-5257-4a34-bcf9-7bc239a00653","type":"Rect"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"by"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[92465.0],"label":[{"term":"by"}],"line_alpha":[1.0],"line_color":["white"],"term":["by"],"width":[0.8],"x":["by"],"y":[46232.5]}},"id":"e8f8eb56-e983-41a8-8f24-8ac7258386ed","type":"ColumnDataSource"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"bb631ae2-b634-4ae5-b3af-bf9f0b1ffd74","type":"Rect"},{"attributes":{"data_source":{"id":"37084e66-8ef2-410c-8d18-a2914b2e8473","type":"ColumnDataSource"},"glyph":{"id":"f6216011-d772-49b8-a582-f35d57338d0f","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"55715da0-958e-490d-b7a4-6351a79c53fa","type":"GlyphRenderer"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"
},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"8cfa0ab2-67b0-48fa-960f-7abac04b4a2d","type":"Rect"},{"attributes":{"data_source":{"id":"15563b26-5d4d-4ba0-a5bb-e023d10d2d10","type":"ColumnDataSource"},"glyph":{"id":"833d94a8-8243-4c17-aa37-21982b5ab65d","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"1851a8c6-1dd0-4b6d-9e95-e030700fff2a","type":"GlyphRenderer"},{"attributes":{"data_source":{"id":"7a090935-0041-4536-b847-baa4a484092a","type":"ColumnDataSource"},"glyph":{"id":"10a35282-64ba-44f4-867d-fee153e47559","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"af738d58-6c14-4605-b4a4-fb07a26784bd","type":"GlyphRenderer"},{"attributes":{"data_source":{"id":"d0a9e417-3219-4610-b936-218e209eb996","type":"ColumnDataSource"},"glyph":{"id":"05262314-b795-4b8b-a554-f9344725c404","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"e02bb52c-f782-49be-bbfe-667d81ba996e","type":"GlyphRenderer"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"as"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[88813.0],"label":[{"term":"as"}],"line_alpha":[1.0],"line_color":["white"],"term":["as"],"width":[0.8],"x":["as"],"y":[44406.5]}},"id":"745746fe-c19a-4192-b659-e755718ba2f5","type":"ColumnDataSource"},{"attributes":{"data_source":{"id":"0f5d7561-0160-492a-8df1-cb03df61e6ed","type":"ColumnDataSource"},"glyph":{"id":"614597e2-b058-4170-ba0b-1b245aabbc9a","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"91345efc-a714-46ca-97f5-edaaa5b4bc86","type":"GlyphRenderer"},{"attributes":{"data_source":{"id":"9dbf952f-ce33-4d4f-860c-fad6672fae1a","type":"ColumnDataSource"},"glyph":{"id":"79107a53-b26a-4cda-83f2-358d83ac72dc","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"select
ion_glyph":null},"id":"3114f9a8-6beb-45b1-85f7-c459a0b412ed","type":"GlyphRenderer"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"3451cc85-b13b-4679-a612-5fd37a35bc8b","type":"Rect"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"6aaa69f2-628d-410f-8ad4-062a3bc56f33","type":"Rect"},{"attributes":{"data_source":{"id":"55d8769e-aa38-48a1-947e-93ace3278825","type":"ColumnDataSource"},"glyph":{"id":"4b1d76fd-d65d-48fc-b61d-73004f58e1be","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"21ea4b6d-a771-48a4-be6f-80ecae339e32","type":"GlyphRenderer"},{"attributes":{"data_source":{"id":"da3c0344-8a9f-490b-ac20-bc9756bc6de5","type":"ColumnDataSource"},"glyph":{"id":"b59f4205-1e08-4252-aa0f-cd5eea8e0b72","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"951d009e-a7ba-4629-82cb-eb57595e4c4a","type":"GlyphRenderer"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"by"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-0.784397091937627],"label":[{"term":"by"}],"line_alpha":[1.0],"line_color":["white"],"term":["by"],"width":[0.8],"x":["by"],"y":[-0.3921985459688135]}},"id":"15c8d403-ca1e-4c1d-b204-1a8c11ec2e06","type":"ColumnDataSource"},{"attributes":{},"id":"7d1bf0d3-6f82-4168-816b-918883e5a488","type":"ToolEvents"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"that"}],"color":["#f22c40"
],"fill_alpha":[0.8],"height":[139031.0],"label":[{"term":"that"}],"line_alpha":[1.0],"line_color":["white"],"term":["that"],"width":[0.8],"x":["that"],"y":[69515.5]}},"id":"145c52ac-f99e-4cd0-8dbd-a030faef06b6","type":"ColumnDataSource"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"are"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-0.18838121494707363],"label":[{"term":"are"}],"line_alpha":[1.0],"line_color":["white"],"term":["are"],"width":[0.8],"x":["are"],"y":[-0.09419060747353682]}},"id":"c85a58c0-69df-4a7b-8a21-b3fa2ad6d914","type":"ColumnDataSource"},{"attributes":{"data_source":{"id":"b7c12cb6-a8dc-45b4-bef7-c67388ad863d","type":"ColumnDataSource"},"glyph":{"id":"bd86702d-3a1d-4004-a4d4-c7797701056f","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"ba27f464-709f-4ec1-91f7-e288b5689346","type":"GlyphRenderer"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"she"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-0.7956351589899237],"label":[{"term":"she"}],"line_alpha":[1.0],"line_color":["white"],"term":["she"],"width":[0.8],"x":["she"],"y":[-0.3978175794949618]}},"id":"55d8769e-aa38-48a1-947e-93ace3278825","type":"ColumnDataSource"},{"attributes":{"data_source":{"id":"5cad6bd6-fcbe-4ab6-bc1a-d24271e76266","type":"ColumnDataSource"},"glyph":{"id":"b85d1257-2a0a-41e3-a89a-0a1e2f3fa1e4","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"53d8b755-1b32-4642-9199-7769fe5f2765","type":"GlyphRenderer"},{"attributes":{"data_source":{"id":"76d22f84-bcc8-48f6-ac26-659d3a48b22c","type":"ColumnDataSource"},"glyph":{"id":"1036c666-e82e-493d-94ed-e279e05bef99","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"4086b991-3b18-4
4c1-b013-6351f43b2dfe","type":"GlyphRenderer"},{"attributes":{"callback":null,"end":0.0,"start":-3.119620785393146},"id":"94aac789-7c2c-453e-83c3-38b60e6583d7","type":"Range1d"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"is"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[223950.0],"label":[{"term":"is"}],"line_alpha":[1.0],"line_color":["white"],"term":["is"],"width":[0.8],"x":["is"],"y":[111975.0]}},"id":"37769868-c296-4e5e-bf0d-f5eccc1eb209","type":"ColumnDataSource"},{"attributes":{"data_source":{"id":"67602b36-6c2f-48ec-83bc-625a300ea0d5","type":"ColumnDataSource"},"glyph":{"id":"ce9fc461-44f8-48d6-8e41-e672771ba6c7","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"0734b97b-d68b-42c0-bd9a-09e641bd7ddd","type":"GlyphRenderer"},{"attributes":{"plot":{"id":"04efd8e2-c1e4-40ac-9e9b-6bb9bc49f084","subtype":"Chart","type":"Plot"}},"id":"d121a3a1-a9a1-4066-abcd-8ba75c7d6dbc","type":"WheelZoomTool"},{"attributes":{"data_source":{"id":"14089801-16c1-4d82-8c7f-c878662e6349","type":"ColumnDataSource"},"glyph":{"id":"7633984f-3402-4cae-aa58-e282c5291d52","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"d880f7e2-ea22-4da3-8c53-1adc67f96a3e","type":"GlyphRenderer"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"766e7d02-ea66-4e01-b2d4-dceb27ac15c5","type":"Rect"},{"attributes":{},"id":"4fcf722c-0abb-4d3e-a57d-87d7639e7e8d","type":"BasicTickFormatter"},{"attributes":{},"id":"881f8b3e-8e54-479a-8e08-4cf1675dffb7","type":"BasicTicker"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{
"term":"they"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[73441.0],"label":[{"term":"they"}],"line_alpha":[1.0],"line_color":["white"],"term":["they"],"width":[0.8],"x":["they"],"y":[36720.5]}},"id":"5a6fb1d0-b0e6-4b9f-a65c-74e6bb21fd43","type":"ColumnDataSource"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"at"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[63856.0],"label":[{"term":"at"}],"line_alpha":[1.0],"line_color":["white"],"term":["at"],"width":[0.8],"x":["at"],"y":[31928.0]}},"id":"67602b36-6c2f-48ec-83bc-625a300ea0d5","type":"ColumnDataSource"},{"attributes":{"axis_label":"Term","formatter":{"id":"eb6a07e3-e16a-4a89-82ad-798359cf7a1a","type":"CategoricalTickFormatter"},"major_label_orientation":0.7853981633974483,"plot":{"id":"f9ba29c1-b2c1-43a5-b32b-b3519a51adbc","subtype":"Chart","type":"Plot"},"ticker":{"id":"160ca09b-35de-46bc-812e-9daa968c818c","type":"CategoricalTicker"}},"id":"310fe043-084c-4eaf-8706-17aa4578ca11","type":"CategoricalAxis"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"762536ca-0765-4e78-a0b1-6f20f40f70f6","type":"Rect"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"ce9fc461-44f8-48d6-8e41-e672771ba6c7","type":"Rect"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"9afa1774-5e45-482d-8b7f-2164879240bc","type":"Rect"},{"attrib
utes":{"plot":null,"text":null},"id":"c80c9e60-447d-4dc8-ad53-7a75280c9908","type":"Title"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"05262314-b795-4b8b-a554-f9344725c404","type":"Rect"},{"attributes":{"data_source":{"id":"8d097423-e168-4997-a1a3-5e3cd511b3d0","type":"ColumnDataSource"},"glyph":{"id":"a35753a5-bf8c-4db0-addb-9cce8c5082ac","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"26925648-4d41-4d01-a7f6-db4c46372205","type":"GlyphRenderer"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"15aa29cf-c7d2-40e5-ba14-5acf0e1900e7","type":"Rect"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"of"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-1.8225821091584873],"label":[{"term":"of"}],"line_alpha":[1.0],"line_color":["white"],"term":["of"],"width":[0.8],"x":["of"],"y":[-0.9112910545792436]}},"id":"7a090935-0041-4536-b847-baa4a484092a","type":"ColumnDataSource"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"acffc3a1-0f7b-4fd6-a528-9ccc061df5f7","type":"Rect"},{"attributes":{"data_source":{"id":"19ca9357-0c3a-4b73-b938-58c91024f77e","type":"ColumnDataSource"},"glyph":{"id":"7cf44005-7f4b-4de1-977e-a2893549fb82","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"d3aba626-4626-4632-b4dc-
8c89c982f174","type":"GlyphRenderer"},{"attributes":{"plot":{"id":"f9ba29c1-b2c1-43a5-b32b-b3519a51adbc","subtype":"Chart","type":"Plot"}},"id":"704e3120-3b6a-449b-9d4b-202759c860ff","type":"SaveTool"},{"attributes":{},"id":"49c760b7-1b0e-4b19-82a2-b49c7f187418","type":"ToolEvents"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"10a35282-64ba-44f4-867d-fee153e47559","type":"Rect"},{"attributes":{"data_source":{"id":"75fa5ceb-befd-491d-b24d-b901afe3cc9b","type":"ColumnDataSource"},"glyph":{"id":"a2ca33c0-1db9-435c-896c-23a9848e4daa","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"b35108c2-cbef-429c-969e-201959982ef6","type":"GlyphRenderer"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"79107a53-b26a-4cda-83f2-358d83ac72dc","type":"Rect"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"edf5d94e-0b3b-4223-8485-ec246ac400c4","type":"Rect"},{"attributes":{},"id":"cf0116d9-3bab-43c4-b630-e08f36790cd3","type":"CategoricalTickFormatter"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"a"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[377195.0],"label":[{"term":"a"}],"line_alpha":[1.0],"line_color":["white"],"term":["a"],"width":[0.8],"x":["a"],"y":[188597.5]}},"id":"164d2856-0572-4f91-aea5-6a6d2cd065fe","type":"ColumnDataSource"},{"attr
ibutes":{},"id":"eb6a07e3-e16a-4a89-82ad-798359cf7a1a","type":"CategoricalTickFormatter"},{"attributes":{"data_source":{"id":"c85a58c0-69df-4a7b-8a21-b3fa2ad6d914","type":"ColumnDataSource"},"glyph":{"id":"762536ca-0765-4e78-a0b1-6f20f40f70f6","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"141cc16c-c3b9-4c42-8ae7-f94ecd2357da","type":"GlyphRenderer"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"300a52ce-2159-4061-bef9-27e4ff6030e0","type":"Rect"},{"attributes":{},"id":"3c711e40-3f43-4107-866c-327bd9cbb0dc","type":"BasicTicker"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"b753510c-f57d-4f5f-91de-d7ff2bba1733","type":"Rect"},{"attributes":{"plot":{"id":"f9ba29c1-b2c1-43a5-b32b-b3519a51adbc","subtype":"Chart","type":"Plot"}},"id":"285a53e0-9ba5-4adf-a93a-599a05a2b1d3","type":"PanTool"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"his"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[196252.0],"label":[{"term":"his"}],"line_alpha":[1.0],"line_color":["white"],"term":["his"],"width":[0.8],"x":["his"],"y":[98126.0]}},"id":"51b1be87-dcc4-4a97-afe8-c531da6930be","type":"ColumnDataSource"},{"attributes":{"data_source":{"id":"b74e7c4f-c80c-4239-beea-bbd3f6812bb1","type":"ColumnDataSource"},"glyph":{"id":"6aaa69f2-628d-410f-8ad4-062a3bc56f33","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"31247e71-a099-42fb-a9f5-32e19bf209e2","type":"GlyphRenderer"},{"attributes":{"callback":null,"col
umn_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"at"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-0.4142022909756191],"label":[{"term":"at"}],"line_alpha":[1.0],"line_color":["white"],"term":["at"],"width":[0.8],"x":["at"],"y":[-0.20710114548780956]}},"id":"76d22f84-bcc8-48f6-ac26-659d3a48b22c","type":"ColumnDataSource"},{"attributes":{"plot":{"id":"04efd8e2-c1e4-40ac-9e9b-6bb9bc49f084","subtype":"Chart","type":"Plot"}},"id":"8ad77cc0-8c70-49ca-a49c-c5e0c0ca4080","type":"SaveTool"},{"attributes":{"data_source":{"id":"5949eb2d-c4df-4325-a55f-0e08cd3e9a26","type":"ColumnDataSource"},"glyph":{"id":"766e7d02-ea66-4e01-b2d4-dceb27ac15c5","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"eea7c897-061c-473b-ab1c-c806a638f762","type":"GlyphRenderer"},{"attributes":{"callback":null,"factors":["the","to","and","a","of","is","in","his","he","s","her","with","that","him","for","she","by","as","on","they","but","who","at","from","an","has","it","their","are","when"]},"id":"e0a81336-b35a-4f18-8585-931ae4358035","type":"FactorRange"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"b59f4205-1e08-4252-aa0f-cd5eea8e0b72","type":"Rect"},{"attributes":{"plot":null,"text":null},"id":"d2611002-0555-41b1-93df-d59bca0d625d","type":"Title"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"a2ca33c0-1db9-435c-896c-23a9848e4daa","type":"Rect"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_ind
ex":[{"term":"s"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-1.319701647499043],"label":[{"term":"s"}],"line_alpha":[1.0],"line_color":["white"],"term":["s"],"width":[0.8],"x":["s"],"y":[-0.6598508237495215]}},"id":"d0584287-8a08-416c-a720-573fbdc45c81","type":"ColumnDataSource"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"ecbafdd0-6b59-482d-a545-929bfffe315c","type":"Rect"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"b85d1257-2a0a-41e3-a89a-0a1e2f3fa1e4","type":"Rect"},{"attributes":{"callback":null,"factors":["the","to","and","a","of","is","in","his","he","s","her","with","that","him","for","she","by","as","on","they","but","who","at","from","an","has","it","their","are","when"]},"id":"ba17365d-c3ce-4e75-8a5c-f298d09e9b9d","type":"FactorRange"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"and"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-2.380726294161053],"label":[{"term":"and"}],"line_alpha":[1.0],"line_color":["white"],"term":["and"],"width":[0.8],"x":["and"],"y":[-1.1903631470805265]}},"id":"37084e66-8ef2-410c-8d18-a2914b2e8473","type":"ColumnDataSource"},{"attributes":{"data_source":{"id":"be0856cc-2fea-4ec8-a688-da957019dfec","type":"ColumnDataSource"},"glyph":{"id":"d3ff6fb3-7a77-4113-812f-27970755cda8","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"6e96654f-2434-4685-91a7-099dac8260cf","type":"GlyphRenderer"},{"attributes":{"overlay":{"id":"89af47fd-7684-42c6-862b-480efa0d9a57","type":"BoxAnn
otation"},"plot":{"id":"04efd8e2-c1e4-40ac-9e9b-6bb9bc49f084","subtype":"Chart","type":"Plot"}},"id":"5cce698b-6d0d-45fa-bacf-b8e8940213f2","type":"BoxZoomTool"},{"attributes":{"data_source":{"id":"d28f523d-d8e6-49e7-9b92-a6bb49b9bd34","type":"ColumnDataSource"},"glyph":{"id":"5c4838ab-b947-40fa-9d8f-25ff7851107b","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"9db32142-9140-4a63-ad69-344748a7669f","type":"GlyphRenderer"},{"attributes":{"data_source":{"id":"1d1dcfd5-1f28-416a-8a71-6150f2c81ce9","type":"ColumnDataSource"},"glyph":{"id":"53e51508-4e95-4d86-b9ab-1777babc6334","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"60f66e62-1aaa-4ea5-809c-f59df297c5c6","type":"GlyphRenderer"},{"attributes":{"data_source":{"id":"5f7a68ff-ff7a-49f2-af42-a77170da541e","type":"ColumnDataSource"},"glyph":{"id":"aa5fe5fb-c0ee-4413-b817-fd37e0582151","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"043aab3f-9521-4d36-9cf2-5163d2764ad3","type":"GlyphRenderer"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"aa5fe5fb-c0ee-4413-b817-fd37e0582151","type":"Rect"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"but"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[71817.0],"label":[{"term":"but"}],"line_alpha":[1.0],"line_color":["white"],"term":["but"],"width":[0.8],"x":["but"],"y":[35908.5]}},"id":"15563b26-5d4d-4ba0-a5bb-e023d10d2d10","type":"ColumnDataSource"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"in"}],"color":["#f22c40"],"fill_alpha":[0.
8],"height":[222223.0],"label":[{"term":"in"}],"line_alpha":[1.0],"line_color":["white"],"term":["in"],"width":[0.8],"x":["in"],"y":[111111.5]}},"id":"e7d71a6a-dcfa-494f-9efb-d09b3e01d693","type":"ColumnDataSource"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"054cafa8-436d-4760-9ad3-fef7806bf304","type":"Rect"},{"attributes":{},"id":"160ca09b-35de-46bc-812e-9daa968c818c","type":"CategoricalTicker"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"a35753a5-bf8c-4db0-addb-9cce8c5082ac","type":"Rect"},{"attributes":{"data_source":{"id":"26501d04-bbc5-45c8-99c5-621b8dcdc091","type":"ColumnDataSource"},"glyph":{"id":"70b95ace-7577-4594-83ba-43eaa9753515","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"745cae12-85e2-4b8e-a8fc-da73cc133ef2","type":"GlyphRenderer"},{"attributes":{"data_source":{"id":"412d07b1-650f-4a95-bd78-02dcc26a5901","type":"ColumnDataSource"},"glyph":{"id":"ecbafdd0-6b59-482d-a545-929bfffe315c","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"c713fd0e-6265-48f8-ad10-72a2624e583f","type":"GlyphRenderer"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"15bce605-0047-4d01-a937-94a5c12238b6","type":"Rect"},{"attributes":{"children":[{"id":"04efd8e2-c1e4-40ac-9e9b-6bb9bc49f084","subtype":"Chart","type":"Plot"},{"id":"f9ba29c1-b2c1-43a5-b32b-b3519a51adbc","subtype":"Chart","type":"Plot"}]},"id"
:"80044f24-9906-4463-86d2-5c29055ea3a5","type":"Column"},{"attributes":{"active_drag":"auto","active_scroll":"auto","active_tap":"auto","tools":[{"id":"65761574-eddc-479e-b632-0da47fd77a25","type":"PanTool"},{"id":"d121a3a1-a9a1-4066-abcd-8ba75c7d6dbc","type":"WheelZoomTool"},{"id":"5cce698b-6d0d-45fa-bacf-b8e8940213f2","type":"BoxZoomTool"},{"id":"8ad77cc0-8c70-49ca-a49c-c5e0c0ca4080","type":"SaveTool"},{"id":"a552f5cc-9e70-464f-8c99-f24cced298d2","type":"ResetTool"},{"id":"e4c3cd70-c790-4b59-af82-1543e8d70108","type":"HelpTool"}]},"id":"8e446a7b-d8c1-4768-a007-734e68d56aad","type":"Toolbar"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"the"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-2.971067414660139],"label":[{"term":"the"}],"line_alpha":[1.0],"line_color":["white"],"term":["the"],"width":[0.8],"x":["the"],"y":[-1.4855337073300694]}},"id":"f060e950-65e4-4931-8658-84b77de23aad","type":"ColumnDataSource"},{"attributes":{"data_source":{"id":"ee1233b0-dec8-4d05-a1db-ab5c734e0884","type":"ColumnDataSource"},"glyph":{"id":"edf5d94e-0b3b-4223-8485-ec246ac400c4","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"798cd31e-b711-4806-8ca4-9f93dbee56b5","type":"GlyphRenderer"},{"attributes":{"data_source":{"id":"bd365f63-2073-4be4-992d-b8ca14eb1e4a","type":"ColumnDataSource"},"glyph":{"id":"15aa29cf-c7d2-40e5-ba14-5acf0e1900e7","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"a0df3022-0ceb-4fab-bbdf-214e83fb8f38","type":"GlyphRenderer"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"are"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[50948.0],"label":[{"term":"are"}],"line_alpha":[1.0],"line_color":["white"],"term":["are"],"width":[0.8],"x":["are"],"y":[25474.
0]}},"id":"412d07b1-650f-4a95-bd78-02dcc26a5901","type":"ColumnDataSource"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"of"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[261131.0],"label":[{"term":"of"}],"line_alpha":[1.0],"line_color":["white"],"term":["of"],"width":[0.8],"x":["of"],"y":[130565.5]}},"id":"694dcede-905f-417b-ac07-2f3be1ef5c9a","type":"ColumnDataSource"},{"attributes":{"data_source":{"id":"ad72acb6-f7c5-4145-a75b-16d6ab673153","type":"ColumnDataSource"},"glyph":{"id":"51e69745-7e70-484f-9317-0728664d5f7d","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"d30a8bae-c937-44d9-aad4-347935c1ff06","type":"GlyphRenderer"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"she"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[93510.0],"label":[{"term":"she"}],"line_alpha":[1.0],"line_color":["white"],"term":["she"],"width":[0.8],"x":["she"],"y":[46755.0]}},"id":"8d097423-e168-4997-a1a3-5e3cd511b3d0","type":"ColumnDataSource"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"with"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[139121.0],"label":[{"term":"with"}],"line_alpha":[1.0],"line_color":["white"],"term":["with"],"width":[0.8],"x":["with"],"y":[69560.5]}},"id":"b19e4e80-8f06-460d-aab5-d95566197ec8","type":"ColumnDataSource"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"7633984f-3402-4cae-aa58-e282c5291d52","type":"Rect"},{"attributes":{"callback":null,"column_names":["line_colo
r","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"for"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[94923.0],"label":[{"term":"for"}],"line_alpha":[1.0],"line_color":["white"],"term":["for"],"width":[0.8],"x":["for"],"y":[47461.5]}},"id":"d2e9edae-cc0f-4921-8b19-d09cf61b1ff2","type":"ColumnDataSource"},{"attributes":{"data_source":{"id":"b6f18d71-0ff7-4871-899f-a09febe8f0b5","type":"ColumnDataSource"},"glyph":{"id":"17e91aa8-63e5-4105-b9eb-bff155cea47e","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"8609c997-a419-487b-bd39-85ccc44b8cb9","type":"GlyphRenderer"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"bd86702d-3a1d-4004-a4d4-c7797701056f","type":"Rect"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"39b5794c-8178-438b-9584-a417ee62010a","type":"Rect"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"74b21637-5474-4799-aa8d-aeb0d8e57f21","type":"Rect"},{"attributes":{"data_source":{"id":"34993822-5213-4048-a792-ef5711f2fe6f","type":"ColumnDataSource"},"glyph":{"id":"eaec330d-290c-4a8a-aabc-ed8e8f949bdc","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"b5f8639f-318d-4870-a73a-7d019a2604b5","type":"GlyphRenderer"},{"attributes":{"plot":{"id":"04efd8e2-c1e4-40ac-9e9b-6bb9bc49f084","subtype":"Chart","type":"Plot"}},"id":"e4c3cd70-c790-4b59-af82-
1543e8d70108","type":"HelpTool"},{"attributes":{"overlay":{"id":"ee42f00b-60e6-4f1e-a34c-e2a300bbbb58","type":"BoxAnnotation"},"plot":{"id":"f9ba29c1-b2c1-43a5-b32b-b3519a51adbc","subtype":"Chart","type":"Plot"}},"id":"2638e969-df0b-4d97-9f7d-1de9a4dbc50a","type":"BoxZoomTool"},{"attributes":{"legends":[["the",[{"id":"e0641a18-fcb9-469b-97b1-3c0a00ae52c9","type":"GlyphRenderer"}]],["to",[{"id":"60f66e62-1aaa-4ea5-809c-f59df297c5c6","type":"GlyphRenderer"}]],["and",[{"id":"55715da0-958e-490d-b7a4-6351a79c53fa","type":"GlyphRenderer"}]],["a",[{"id":"043aab3f-9521-4d36-9cf2-5163d2764ad3","type":"GlyphRenderer"}]],["of",[{"id":"af738d58-6c14-4605-b4a4-fb07a26784bd","type":"GlyphRenderer"}]],["is",[{"id":"6e96654f-2434-4685-91a7-099dac8260cf","type":"GlyphRenderer"}]],["in",[{"id":"b103d63b-8f5c-4cb8-a642-cf2ddccbada3","type":"GlyphRenderer"}]],["his",[{"id":"3114f9a8-6beb-45b1-85f7-c459a0b412ed","type":"GlyphRenderer"}]],["he",[{"id":"b35108c2-cbef-429c-969e-201959982ef6","type":"GlyphRenderer"}]],["s",[{"id":"0bdfc78f-f6f3-4e6d-a756-e32ae7697ec6","type":"GlyphRenderer"}]],["her",[{"id":"50cd347c-84a6-4306-b228-1403d7bc3862","type":"GlyphRenderer"}]],["with",[{"id":"d30a8bae-c937-44d9-aad4-347935c1ff06","type":"GlyphRenderer"}]],["that",[{"id":"d880f7e2-ea22-4da3-8c53-1adc67f96a3e","type":"GlyphRenderer"}]],["him",[{"id":"b5f8639f-318d-4870-a73a-7d019a2604b5","type":"GlyphRenderer"}]],["for",[{"id":"e02bb52c-f782-49be-bbfe-667d81ba996e","type":"GlyphRenderer"}]],["she",[{"id":"21ea4b6d-a771-48a4-be6f-80ecae339e32","type":"GlyphRenderer"}]],["by",[{"id":"bff31e20-1d7b-401b-9773-af1438fd40cd","type":"GlyphRenderer"}]],["as",[{"id":"31247e71-a099-42fb-a9f5-32e19bf209e2","type":"GlyphRenderer"}]],["on",[{"id":"eea7c897-061c-473b-ab1c-c806a638f762","type":"GlyphRenderer"}]],["they",[{"id":"69303976-48f4-4529-a78e-7a2e8db1b792","type":"GlyphRenderer"}]],["but",[{"id":"fc1a6880-3414-4556-9670-c0b0adf034d1","type":"GlyphRenderer"}]],["who",[{"id":"9db32142-9140-4a63-ad69-344748
a7669f","type":"GlyphRenderer"}]],["at",[{"id":"4086b991-3b18-44c1-b013-6351f43b2dfe","type":"GlyphRenderer"}]],["from",[{"id":"ba27f464-709f-4ec1-91f7-e288b5689346","type":"GlyphRenderer"}]],["an",[{"id":"798cd31e-b711-4806-8ca4-9f93dbee56b5","type":"GlyphRenderer"}]],["has",[{"id":"da7c754e-64c2-4fd2-837d-b56508157dff","type":"GlyphRenderer"}]],["it",[{"id":"53d8b755-1b32-4642-9199-7769fe5f2765","type":"GlyphRenderer"}]],["their",[{"id":"8609c997-a419-487b-bd39-85ccc44b8cb9","type":"GlyphRenderer"}]],["are",[{"id":"141cc16c-c3b9-4c42-8ae7-f94ecd2357da","type":"GlyphRenderer"}]],["when",[{"id":"91345efc-a714-46ca-97f5-edaaa5b4bc86","type":"GlyphRenderer"}]]],"location":"top_left","plot":{"id":"f9ba29c1-b2c1-43a5-b32b-b3519a51adbc","subtype":"Chart","type":"Plot"}},"id":"64836395-67ef-4fe9-84ae-cdfa623be721","type":"Legend"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"he"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[180140.0],"label":[{"term":"he"}],"line_alpha":[1.0],"line_color":["white"],"term":["he"],"width":[0.8],"x":["he"],"y":[90070.0]}},"id":"26501d04-bbc5-45c8-99c5-621b8dcdc091","type":"ColumnDataSource"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"d3ff6fb3-7a77-4113-812f-27970755cda8","type":"Rect"},{"attributes":{"data_source":{"id":"e8f8eb56-e983-41a8-8f24-8ac7258386ed","type":"ColumnDataSource"},"glyph":{"id":"5fc5269d-8bed-4677-a3dd-1b5163c644b3","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"3e98c68c-e3bd-4d0a-b198-65fd328233d7","type":"GlyphRenderer"},{"attributes":{"axis_label":"Sum( Idf 
)","formatter":{"id":"7279e6e6-20de-49f5-b726-dcb5b15d70d6","type":"BasicTickFormatter"},"plot":{"id":"f9ba29c1-b2c1-43a5-b32b-b3519a51adbc","subtype":"Chart","type":"Plot"},"ticker":{"id":"3c711e40-3f43-4107-866c-327bd9cbb0dc","type":"BasicTicker"}},"id":"abee3360-f0b7-44b6-860d-7ef9be7f5ae9","type":"LinearAxis"},{"attributes":{"plot":{"id":"f9ba29c1-b2c1-43a5-b32b-b3519a51adbc","subtype":"Chart","type":"Plot"}},"id":"ddc9545f-569f-4206-8fde-00f0eaea8589","type":"HelpTool"},{"attributes":{"data_source":{"id":"484e8969-e839-4e31-bf09-9c80fe597b82","type":"ColumnDataSource"},"glyph":{"id":"76d4569f-f858-4390-a682-3160d58c4078","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"187ac1a0-74a8-41dc-bcf2-6f5825da9c44","type":"GlyphRenderer"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"their"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-0.2373125000185678],"label":[{"term":"their"}],"line_alpha":[1.0],"line_color":["white"],"term":["their"],"width":[0.8],"x":["their"],"y":[-0.1186562500092839]}},"id":"b6f18d71-0ff7-4871-899f-a09febe8f0b5","type":"ColumnDataSource"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"b3eca825-1ea2-4135-9988-9d5b15bd96ca","type":"Rect"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"and"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[456308.0],"label":[{"term":"and"}],"line_alpha":[1.0],"line_color":["white"],"term":["and"],"width":[0.8],"x":["and"],"y":[228154.0]}},"id":"c0e6047b-ffd0-48f6-a8b1-bf656ce09229","type":"ColumnDataSource"},{"attributes":{"bottom_units":"screen","fi
ll_alpha":{"value":0.5},"fill_color":{"value":"lightgrey"},"left_units":"screen","level":"overlay","line_alpha":{"value":1.0},"line_color":{"value":"black"},"line_dash":[4,4],"line_width":{"value":2},"plot":null,"render_mode":"css","right_units":"screen","top_units":"screen"},"id":"ee42f00b-60e6-4f1e-a34c-e2a300bbbb58","type":"BoxAnnotation"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"an"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[58014.0],"label":[{"term":"an"}],"line_alpha":[1.0],"line_color":["white"],"term":["an"],"width":[0.8],"x":["an"],"y":[29007.0]}},"id":"c43049ac-f0f6-43de-9488-43753806052d","type":"ColumnDataSource"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"af7dd631-b6e7-4b5f-8afc-74b6d7068daf","type":"Rect"},{"attributes":{"axis_label":"Term","formatter":{"id":"cf0116d9-3bab-43c4-b630-e08f36790cd3","type":"CategoricalTickFormatter"},"major_label_orientation":0.7853981633974483,"plot":{"id":"04efd8e2-c1e4-40ac-9e9b-6bb9bc49f084","subtype":"Chart","type":"Plot"},"ticker":{"id":"366f535f-1055-46ac-8c13-daae06a29440","type":"CategoricalTicker"}},"id":"2af85d4b-122f-4083-910c-6aee8ca9b295","type":"CategoricalAxis"},{"attributes":{"legends":[["the",[{"id":"8c316225-196b-4c90-8e87-f28eae4a9310","type":"GlyphRenderer"}]],["to",[{"id":"d5370cda-9644-421e-b8d4-669d221e7103","type":"GlyphRenderer"}]],["and",[{"id":"c583f87d-ef6e-4674-8c71-25f4bbc53776","type":"GlyphRenderer"}]],["a",[{"id":"a7b18b75-36c9-483a-aeae-3bb7ff0f198b","type":"GlyphRenderer"}]],["of",[{"id":"deb4d911-d78f-4141-bb1f-54e86d087374","type":"GlyphRenderer"}]],["is",[{"id":"42842623-de88-4282-bdf6-1ff91c5902ca","type":"GlyphRenderer"}]],["in",[{"id":"14b0c9c7-080c-4922-89e
d-15d530da2af3","type":"GlyphRenderer"}]],["his",[{"id":"c063900f-02cb-46f0-85e4-60418904bc1a","type":"GlyphRenderer"}]],["he",[{"id":"745cae12-85e2-4b8e-a8fc-da73cc133ef2","type":"GlyphRenderer"}]],["s",[{"id":"368d8fdc-c1b4-4eab-95e5-cdc04a33bd99","type":"GlyphRenderer"}]],["her",[{"id":"187ac1a0-74a8-41dc-bcf2-6f5825da9c44","type":"GlyphRenderer"}]],["with",[{"id":"19fdd1b0-c4b4-4271-a63b-7a25d4d4d69a","type":"GlyphRenderer"}]],["that",[{"id":"60cdbffa-722b-4078-bffc-ad98981d7304","type":"GlyphRenderer"}]],["him",[{"id":"0e27a912-246c-4d97-a1f9-b89e51a92b53","type":"GlyphRenderer"}]],["for",[{"id":"573db6ce-97af-4cc6-a6a9-643a2395cae7","type":"GlyphRenderer"}]],["she",[{"id":"26925648-4d41-4d01-a7f6-db4c46372205","type":"GlyphRenderer"}]],["by",[{"id":"3e98c68c-e3bd-4d0a-b198-65fd328233d7","type":"GlyphRenderer"}]],["as",[{"id":"2a2a2e69-a9ca-43dd-a245-812fbb4ea5a4","type":"GlyphRenderer"}]],["on",[{"id":"951d009e-a7ba-4629-82cb-eb57595e4c4a","type":"GlyphRenderer"}]],["they",[{"id":"f065d4a9-6b4e-47a7-a9c4-2c429da7455a","type":"GlyphRenderer"}]],["but",[{"id":"1851a8c6-1dd0-4b6d-9e95-e030700fff2a","type":"GlyphRenderer"}]],["who",[{"id":"a0df3022-0ceb-4fab-bbdf-214e83fb8f38","type":"GlyphRenderer"}]],["at",[{"id":"0734b97b-d68b-42c0-bd9a-09e641bd7ddd","type":"GlyphRenderer"}]],["from",[{"id":"bb8145b9-a7a3-4b32-940a-bbf7a4c5e4c7","type":"GlyphRenderer"}]],["an",[{"id":"c2216d76-a7d5-4758-9c95-77eaa9e1caad","type":"GlyphRenderer"}]],["has",[{"id":"d3aba626-4626-4632-b4dc-8c89c982f174","type":"GlyphRenderer"}]],["it",[{"id":"6f19ca39-a719-47ed-a2e0-dfe515c49100","type":"GlyphRenderer"}]],["their",[{"id":"79f3343c-80ee-4061-be08-db97ab97c45c","type":"GlyphRenderer"}]],["are",[{"id":"c713fd0e-6265-48f8-ad10-72a2624e583f","type":"GlyphRenderer"}]],["when",[{"id":"e5c6d8f3-87ee-418a-803c-b5fa27962362","type":"GlyphRenderer"}]]],"location":"top_left","plot":{"id":"04efd8e2-c1e4-40ac-9e9b-6bb9bc49f084","subtype":"Chart","type":"Plot"}},"id":"94e01f01-411b-4464-8bcd-28a6
c515db54","type":"Legend"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"on"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[83519.0],"label":[{"term":"on"}],"line_alpha":[1.0],"line_color":["white"],"term":["on"],"width":[0.8],"x":["on"],"y":[41759.5]}},"id":"da3c0344-8a9f-490b-ac20-bc9756bc6de5","type":"ColumnDataSource"},{"attributes":{"data_source":{"id":"51b1be87-dcc4-4a97-afe8-c531da6930be","type":"ColumnDataSource"},"glyph":{"id":"acffc3a1-0f7b-4fd6-a528-9ccc061df5f7","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"c063900f-02cb-46f0-85e4-60418904bc1a","type":"GlyphRenderer"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"they"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-0.5540520616132202],"label":[{"term":"they"}],"line_alpha":[1.0],"line_color":["white"],"term":["they"],"width":[0.8],"x":["they"],"y":[-0.2770260308066101]}},"id":"aabb1342-d512-4cf5-b0ae-0b3b80b41893","type":"ColumnDataSource"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"6f3a53bc-0d51-485c-9980-4e701f474dab","type":"Rect"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"her"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-1.2745842320715253],"label":[{"term":"her"}],"line_alpha":[1.0],"line_color":["white"],"term":["her"],"width":[0.8],"x":["her"],"y":[-0.6372921160357626]}},"id":"155c0b63-3018-41a3-9a30-9db15a2e06d7","type":"ColumnDataSource"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_colo
r":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"eaec330d-290c-4a8a-aabc-ed8e8f949bdc","type":"Rect"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"has"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[56551.0],"label":[{"term":"has"}],"line_alpha":[1.0],"line_color":["white"],"term":["has"],"width":[0.8],"x":["has"],"y":[28275.5]}},"id":"19ca9357-0c3a-4b73-b938-58c91024f77e","type":"ColumnDataSource"},{"attributes":{"plot":{"id":"04efd8e2-c1e4-40ac-9e9b-6bb9bc49f084","subtype":"Chart","type":"Plot"}},"id":"65761574-eddc-479e-b632-0da47fd77a25","type":"PanTool"},{"attributes":{"below":[{"id":"310fe043-084c-4eaf-8706-17aa4578ca11","type":"CategoricalAxis"}],"left":[{"id":"abee3360-f0b7-44b6-860d-7ef9be7f5ae9","type":"LinearAxis"}],"plot_height":400,"plot_width":800,"renderers":[{"id":"ee42f00b-60e6-4f1e-a34c-e2a300bbbb58","type":"BoxAnnotation"},{"id":"e0641a18-fcb9-469b-97b1-3c0a00ae52c9","type":"GlyphRenderer"},{"id":"60f66e62-1aaa-4ea5-809c-f59df297c5c6","type":"GlyphRenderer"},{"id":"55715da0-958e-490d-b7a4-6351a79c53fa","type":"GlyphRenderer"},{"id":"043aab3f-9521-4d36-9cf2-5163d2764ad3","type":"GlyphRenderer"},{"id":"af738d58-6c14-4605-b4a4-fb07a26784bd","type":"GlyphRenderer"},{"id":"6e96654f-2434-4685-91a7-099dac8260cf","type":"GlyphRenderer"},{"id":"b103d63b-8f5c-4cb8-a642-cf2ddccbada3","type":"GlyphRenderer"},{"id":"3114f9a8-6beb-45b1-85f7-c459a0b412ed","type":"GlyphRenderer"},{"id":"b35108c2-cbef-429c-969e-201959982ef6","type":"GlyphRenderer"},{"id":"0bdfc78f-f6f3-4e6d-a756-e32ae7697ec6","type":"GlyphRenderer"},{"id":"50cd347c-84a6-4306-b228-1403d7bc3862","type":"GlyphRenderer"},{"id":"d30a8bae-c937-44d9-aad4-347935c1ff06","type":"GlyphRenderer"},{"id":"d880f7e2-ea22-4da3-8c53-1adc67f96a3e","type":"GlyphRender
er"},{"id":"b5f8639f-318d-4870-a73a-7d019a2604b5","type":"GlyphRenderer"},{"id":"e02bb52c-f782-49be-bbfe-667d81ba996e","type":"GlyphRenderer"},{"id":"21ea4b6d-a771-48a4-be6f-80ecae339e32","type":"GlyphRenderer"},{"id":"bff31e20-1d7b-401b-9773-af1438fd40cd","type":"GlyphRenderer"},{"id":"31247e71-a099-42fb-a9f5-32e19bf209e2","type":"GlyphRenderer"},{"id":"eea7c897-061c-473b-ab1c-c806a638f762","type":"GlyphRenderer"},{"id":"69303976-48f4-4529-a78e-7a2e8db1b792","type":"GlyphRenderer"},{"id":"fc1a6880-3414-4556-9670-c0b0adf034d1","type":"GlyphRenderer"},{"id":"9db32142-9140-4a63-ad69-344748a7669f","type":"GlyphRenderer"},{"id":"4086b991-3b18-44c1-b013-6351f43b2dfe","type":"GlyphRenderer"},{"id":"ba27f464-709f-4ec1-91f7-e288b5689346","type":"GlyphRenderer"},{"id":"798cd31e-b711-4806-8ca4-9f93dbee56b5","type":"GlyphRenderer"},{"id":"da7c754e-64c2-4fd2-837d-b56508157dff","type":"GlyphRenderer"},{"id":"53d8b755-1b32-4642-9199-7769fe5f2765","type":"GlyphRenderer"},{"id":"8609c997-a419-487b-bd39-85ccc44b8cb9","type":"GlyphRenderer"},{"id":"141cc16c-c3b9-4c42-8ae7-f94ecd2357da","type":"GlyphRenderer"},{"id":"91345efc-a714-46ca-97f5-edaaa5b4bc86","type":"GlyphRenderer"},{"id":"64836395-67ef-4fe9-84ae-cdfa623be721","type":"Legend"},{"id":"310fe043-084c-4eaf-8706-17aa4578ca11","type":"CategoricalAxis"},{"id":"abee3360-f0b7-44b6-860d-7ef9be7f5ae9","type":"LinearAxis"},{"id":"66f0fb3f-73e8-4dfd-bdc0-bf32680dfc7c","type":"Grid"}],"title":{"id":"c80c9e60-447d-4dc8-ad53-7a75280c9908","type":"Title"},"tool_events":{"id":"49c760b7-1b0e-4b19-82a2-b49c7f187418","type":"ToolEvents"},"toolbar":{"id":"5badf80e-2ada-4753-b16a-dba8f50f758f","type":"Toolbar"},"x_mapper_type":"auto","x_range":{"id":"ba17365d-c3ce-4e75-8a5c-f298d09e9b9d","type":"FactorRange"},"y_mapper_type":"auto","y_range":{"id":"94aac789-7c2c-453e-83c3-38b60e6583d7","type":"Range1d"}},"id":"f9ba29c1-b2c1-43a5-b32b-b3519a51adbc","subtype":"Chart","type":"Plot"},{"attributes":{"data_source":{"id":"5a6fb1d0-b0e6-4b9f-a65c-74e6bb
21fd43","type":"ColumnDataSource"},"glyph":{"id":"48d6b071-5257-4a34-bcf9-7bc239a00653","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"f065d4a9-6b4e-47a7-a9c4-2c429da7455a","type":"GlyphRenderer"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"that"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-1.192260205018764],"label":[{"term":"that"}],"line_alpha":[1.0],"line_color":["white"],"term":["that"],"width":[0.8],"x":["that"],"y":[-0.596130102509382]}},"id":"14089801-16c1-4d82-8c7f-c878662e6349","type":"ColumnDataSource"},{"attributes":{"data_source":{"id":"b19e4e80-8f06-460d-aab5-d95566197ec8","type":"ColumnDataSource"},"glyph":{"id":"bb631ae2-b634-4ae5-b3af-bf9f0b1ffd74","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"19fdd1b0-c4b4-4271-a63b-7a25d4d4d69a","type":"GlyphRenderer"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"the"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[823456.0],"label":[{"term":"the"}],"line_alpha":[1.0],"line_color":["white"],"term":["the"],"width":[0.8],"x":["the"],"y":[411728.0]}},"id":"da88fe6c-5156-4e6b-8634-b99a09e9d024","type":"ColumnDataSource"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"it"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[54730.0],"label":[{"term":"it"}],"line_alpha":[1.0],"line_color":["white"],"term":["it"],"width":[0.8],"x":["it"],"y":[27365.0]}},"id":"d10d227c-341f-44e1-89bd-83a30e510cbb","type":"ColumnDataSource"},{"attributes":{"data_source":{"id":"59d5fa0d-306f-4ebe-8360-4c174f419c56","type":"ColumnDataSource"},"glyph":{"id":"8cfa0ab2-67b0-48fa-960f-7abac04b4a2d","type":"Rect"},"
hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"79f3343c-80ee-4061-be08-db97ab97c45c","type":"GlyphRenderer"},{"attributes":{"data_source":{"id":"da88fe6c-5156-4e6b-8634-b99a09e9d024","type":"ColumnDataSource"},"glyph":{"id":"5a7d16cf-ab77-48ba-8718-aec26dcf6567","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"8c316225-196b-4c90-8e87-f28eae4a9310","type":"GlyphRenderer"},{"attributes":{"fill_alpha":{"field":"fill_alpha"},"fill_color":{"field":"color"},"height":{"field":"height","units":"data"},"line_color":{"field":"line_color"},"width":{"field":"width","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"305c7bd3-65c6-450a-9380-cc8d66bcbf8f","type":"Rect"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"him"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[96722.0],"label":[{"term":"him"}],"line_alpha":[1.0],"line_color":["white"],"term":["him"],"width":[0.8],"x":["him"],"y":[48361.0]}},"id":"fc657dfc-52ba-4643-bd64-9c03ba37daa9","type":"ColumnDataSource"},{"attributes":{"data_source":{"id":"145c52ac-f99e-4cd0-8dbd-a030faef06b6","type":"ColumnDataSource"},"glyph":{"id":"52ddc914-d65b-45f8-ae52-c8754a4184ea","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"60cdbffa-722b-4078-bffc-ad98981d7304","type":"GlyphRenderer"},{"attributes":{"callback":null,"column_names":["line_color","line_alpha","color","fill_alpha","height","width","y","x","label"],"data":{"chart_index":[{"term":"has"}],"color":["#f22c40"],"fill_alpha":[0.8],"height":[-0.2927166514249843],"label":[{"term":"has"}],"line_alpha":[1.0],"line_color":["white"],"term":["has"],"width":[0.8],"x":["has"],"y":[-0.14635832571249216]}},"id":"a7afb923-e359-4320-9b45-59aec7764d1b","type":"ColumnDataSource"},{"attributes":{"data_source":{"id":"4daee180-4430-4ff0-a6fb-87c5d5f533e5","type":"ColumnD
ataSource"},"glyph":{"id":"0db1e70c-b798-4f76-96fa-ee7f7d88deb4","type":"Rect"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"fc1a6880-3414-4556-9670-c0b0adf034d1","type":"GlyphRenderer"}],"root_ids":["80044f24-9906-4463-86d2-5c29055ea3a5"]},"title":"Bokeh Application","version":"0.12.0"}}; var render_items = [{"docid":"228434f8-5648-4d6e-9898-707adaf02248","elementid":"6067b708-a0f7-429c-898e-b845b1969084","modelid":"80044f24-9906-4463-86d2-5c29055ea3a5","notebook_comms_target":"a96de24a-e021-45b8-875d-15c4dee8feb4"}]; Bokeh.embed.embed_items(docs_json, render_items); }); }, function(Bokeh) { } ]; function run_inline_js() { for (var i = 0; i < inline_js.length; i++) { inline_js[i](window.Bokeh); } } if (window._bokeh_is_loading === 0) { console.log("Bokeh: BokehJS loaded, going straight to plotting"); run_inline_js(); } else { load_libs(js_urls, function() { console.log("Bokeh: BokehJS plotting callback run at", now()); run_inline_js(); }); } }(this)); Out[28]: <Bokeh Notebook handle for In[28]>  ### TF-IDF Ranking We then created an inverted index for the TD-IDF ranking  In [29]: def create_inverted_index(corpus): idx={} for i, doc in enumerate(corpus): for word in doc.split(): if word in idx: if i in idx[word]: # Update document's frequency idx[word][i] += 1 else: # Add document idx[word][i] = 1 else: # Add term idx[word] = {i:1} return idx def get_results_tfidf(qry, idx, n): score = Counter() for term in qry.split(): if term in idx: i = idf(term, idx, n) for doc in idx[term]: score[doc] += idx[term][doc] * i results=[] for x in [[r[0],r[1]] for r in zip(score.keys(), score.values())]: if x[1] > 0: results.append([x[1],x[0]]) sorted_results = sorted(results, key=lambda t: t[0] * -1 ) return sorted_results idx = create_inverted_index(items_d) results = get_results_tfidf('lookout action bike zombie', idx, len(items_d)) print_results(results,10)   Top 10 from recall set of 1874 items: 115.77 - ('i bought a vampire motorcycle', 'm parody 
m horror m slasher m horror comedy') 104.68 - ('burial ground the nights of terror', 'm thriller m zombie film m horror m world cinema') 90.60 - ('polladhavan', 'm romance film m action m drama') 78.51 - ('hatchet ii', 'm thriller m horror m cult m comedy m black comedy m action m slasher') 70.47 - ('the dirt bike kid', 'm family film m children s family m fantasy m adventure m comedy') 60.40 - ('tuff turf', 'm romantic drama m romance film m action m drama m teen') 57.58 - ('hide and creep', 'm science fiction m b movie m comedy m zombie film m horror m horror comedy') 57.58 - ('day of the dead', 'm cult m zombie film m horror m indie') 57.37 - ('amityville dollhouse', 'm horror') 52.34 - ('fido', 'm parody m horror m period piece m drama m comedy m zombie film m romance film m horror comedy')  Ideally we do not want scores to be the same for lots of documents. High TF-IDF scores in shorter documents should be more relevant - so we could try by boosting the score for documents that are shorter than average.  
In [30]: def get_results_tfidf_boost(qry, corpus): idx = create_inverted_index(corpus) n = len(corpus) d = [len(x.split()) for x in corpus] d_avg = float(sum(d)) / len(d) score = Counter() for term in qry.split(): if term in idx: i = idf(term, idx, n) for doc in idx[term]: f = float(idx[term][doc]) score[doc] += i * ( f / (float(d[doc]) / d_avg) ) results=[] for x in [[r[0],r[1]] for r in zip(score.keys(), score.values())]: if x[1] > 0: # output [0] score, [1] doc_id results.append([x[1],x[0]]) sorted_results = sorted(results, key=lambda t: t[0] * -1 ) return sorted_results   In [31]: from bokeh.charts import Scatter results = get_results_tfidf_boost('zombie invasion', items_d) print_results(results, 10) # Plot score vs item length df = pd.DataFrame({'score':[float(x[0]) for x in results], 'length':[len(items_d[x[1]].split()) for x in results]}) output_notebook() p = Scatter(df, x='score', y='length') show(p)   Top 10 from recall set of 566 items: 104.51 - ('reel zombies', 'm horror m horror comedy') 91.58 - ('zombie girl the movie', 'm documentary') 86.76 - ('caustic zombies', 'm horror') 80.42 - ('zombie bloodbath', 'm zombie film m horror m comedy') 75.51 - ('mathrubhoomi', '') 75.51 - ('gladiatress', 'm parody m sword and sandal m action m comedy') 71.67 - ('first platoon', 'm comedy film m horror') 68.65 - ('feeders', 'm drama m science fiction m horror') 61.06 - ('dead roses', 'm zombie film m horror m indie') 59.23 - ('time runner', 'm thriller m science fiction m action') Loading BokehJS ... var element =$('#5a216238-1e1b-4527-a45f-92144c42de28');

(function(global) {
function now() {
return new Date();
}

if (typeof (window._bokeh_onload_callbacks) === "undefined") {
}

function run_callbacks() {
console.info("Bokeh: all callbacks have finished");
}

console.log("Bokeh: BokehJS is being loaded, scheduling callback at", now());
return null;
}
if (js_urls == null || js_urls.length === 0) {
run_callbacks();
return null;
}
for (var i = 0; i < js_urls.length; i++) {
var url = js_urls[i];
var s = document.createElement('script');
s.src = url;
s.async = false;
run_callbacks()
}
};
s.onerror = function() {
console.warn("failed to load library " + url);
};
console.log("Bokeh: injecting script tag for BokehJS library: ", url);
}
};

var js_urls = ['https://cdn.pydata.org/bokeh/release/bokeh-0.12.0.min.js', 'https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.0.min.js', 'https://cdn.pydata.org/bokeh/release/bokeh-compiler-0.12.0.min.js'];

var inline_js = [
function(Bokeh) {
Bokeh.set_log_level("info");
},

function(Bokeh) {
Bokeh.$("#d8f18236-2dd9-45fc-b90d-91588983af11").text("BokehJS successfully loaded"); }, function(Bokeh) { console.log("Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-0.12.0.min.css"); Bokeh.embed.inject_css("https://cdn.pydata.org/bokeh/release/bokeh-0.12.0.min.css"); console.log("Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.0.min.css"); Bokeh.embed.inject_css("https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.0.min.css"); } ]; function run_inline_js() { for (var i = 0; i < inline_js.length; i++) { inline_js[i](window.Bokeh); } } if (window._bokeh_is_loading === 0) { console.log("Bokeh: BokehJS loaded, going straight to plotting"); run_inline_js(); } else { load_libs(js_urls, function() { console.log("Bokeh: BokehJS plotting callback run at", now()); run_inline_js(); }); } }(this)); (function(global) { function now() { return new Date(); } if (typeof (window._bokeh_onload_callbacks) === "undefined") { window._bokeh_onload_callbacks = []; } function run_callbacks() { window._bokeh_onload_callbacks.forEach(function(callback) { callback() }); delete window._bokeh_onload_callbacks console.info("Bokeh: all callbacks have finished"); } function load_libs(js_urls, callback) { window._bokeh_onload_callbacks.push(callback); if (window._bokeh_is_loading > 0) { console.log("Bokeh: BokehJS is being loaded, scheduling callback at", now()); return null; } if (js_urls == null || js_urls.length === 0) { run_callbacks(); return null; } console.log("Bokeh: BokehJS not loaded, scheduling load and callback at", now()); window._bokeh_is_loading = js_urls.length; for (var i = 0; i < js_urls.length; i++) { var url = js_urls[i]; var s = document.createElement('script'); s.src = url; s.async = false; s.onreadystatechange = s.onload = function() { window._bokeh_is_loading--; if (window._bokeh_is_loading === 0) { console.log("Bokeh: all BokehJS libraries loaded"); run_callbacks() } }; s.onerror = function() { console.warn("failed to 
load library " + url); }; console.log("Bokeh: injecting script tag for BokehJS library: ", url); document.getElementsByTagName("head")[0].appendChild(s); } };var element = document.getElementById("e92f1326-35b0-4ef7-bc55-6a230c9c2034"); if (element == null) { console.log("Bokeh: ERROR: autoload.js configured with elementid 'e92f1326-35b0-4ef7-bc55-6a230c9c2034' but no matching script tag was found. ") return false; } var js_urls = []; var inline_js = [ function(Bokeh) { Bokeh.$(function() {
var render_items = [{"docid":"2d8c634d-9b34-4e20-94a2-c7dd0d59a7c8","elementid":"e92f1326-35b0-4ef7-bc55-6a230c9c2034","modelid":"b40a2039-af69-4a04-ba16-1dfcd1db261e","notebook_comms_target":"b43ba019-a818-4755-bea8-82886d2751be"}];

Bokeh.embed.embed_items(docs_json, render_items);
});
},
function(Bokeh) {
}
];

function run_inline_js() {
for (var i = 0; i < inline_js.length; i++) {
inline_js[i](window.Bokeh);
}
}

console.log("Bokeh: BokehJS loaded, going straight to plotting");
run_inline_js();
} else {
console.log("Bokeh: BokehJS plotting callback run at", now());
run_inline_js();
});
}
}(this));

Out[31]:

<Bokeh Notebook handle for In[31]>



## Implementing BM25

To implement BM25, we wrote the function `get_results_bm25`, which takes the query, the corpus, and the two BM25 tuning parameters `k1` and `b`. We then printed the top results and plotted score against document length using a Bokeh chart.



In [32]:

def get_results_bm25(qry, corpus, k1=1.5, b=0.75):
    """Rank the documents in ``corpus`` against ``qry`` with a BM25 score.

    Args:
        qry: Query string; it is split on whitespace into terms. A term that
            occurs several times in the query contributes once per occurrence.
        corpus: Sized sequence of document strings. A document's length is its
            number of whitespace-separated tokens.
        k1: Multiplier applied to the length-normalisation factor in the
            denominator (and ``k1 + 1`` scales the numerator).
        b: Weight of the document-length ratio inside the normalisation
            factor ``1 - b + b * (doc_len / avg_len)``.

    Returns:
        A list of ``[score, doc_id]`` pairs sorted by descending score, where
        ``doc_id`` is the key used by the inverted index (it is also used to
        index the per-document length list, so it is a position in
        ``corpus``). Only documents with a strictly positive score are
        included. An empty corpus yields an empty list.
    """
    # 1. n is the number of documents in the corpus.
    n = len(corpus)
    if n == 0:
        # No documents means no average length to normalise by; previously
        # this case fell through to a division by zero.
        return []

    idx = create_inverted_index(corpus)
    # 2. d[k] is the number of terms in document k.
    d = [len(x.split()) for x in corpus]
    # 3. d_avg is the average document length over the corpus.
    d_avg = float(sum(d)) / n

    score = Counter()
    for term in qry.split():
        if term not in idx:
            continue
        # idf() is defined elsewhere in the notebook; it is called once per
        # query term and its value weights every posting of that term.
        i = idf(term, idx, n)
        for doc, freq in idx[term].items():
            # 4. f is the number of times the term appears in doc.
            f = float(freq)
            # 5. BM25 contribution of this (term, document) pair.
            length_norm = 1 - b + (b * (float(d[doc]) / d_avg))
            score[doc] += i * ((f * (k1 + 1)) / (f + k1 * length_norm))

    # Emit [score, doc_id] pairs, dropping non-positive scores.
    results = [[s, doc] for doc, s in score.items() if s > 0]

    # sorted() is stable with reverse=True, so ties keep their original order,
    # exactly as with the previous negated-key sort.
    return sorted(results, key=lambda t: t[0], reverse=True)




In [33]:

# Score every item description in items_d against a two-term query using the
# default BM25 parameters (k1=1.5, b=0.75), then hand the ranked list to
# print_results (defined elsewhere in the notebook) with a limit of 10.
# NOTE(review): 'apacolypse' looks like a misspelling of 'apocalypse'; if it
# matches no indexed term, only 'zombie' contributes to the scores — confirm.
results = get_results_bm25('zombie apacolypse', items_d)
print_results(results, 10)




Top 10 from recall set of 224 items:
11.21 - ('zombie bloodbath', 'm zombie film m horror m comedy')
11.19 - ('day of the dead', 'm cult m zombie film m horror m indie')
10.68 - ('fido', 'm parody m horror m period piece m drama m comedy m zombie film m romance film m horror comedy')
10.67 - ('zombie vs mardi gras', 'm horror m comedy m indie')
10.64 - ('hatchet ii', 'm thriller m horror m cult m comedy m black comedy m action m slasher')
10.64 - ('super', 'm thriller m science fiction m action adventure m mystery m drama m action')
10.62 - ('colin', 'm b movie m creature film m psychological thriller m drama m zombie film m horror m action')
10.48 - ('burial ground the nights of terror', 'm thriller m zombie film m horror m world cinema')
10.31 - ('first platoon', 'm comedy film m horror')
10.31 - ('reel zombies', 'm horror m horror comedy')




In [34]:

# IPython shell escape: runs `pip install bokeh` from inside the notebook.
# This line is only valid under IPython/Jupyter, not in a plain Python script.
!pip install bokeh
# NOTE(review): bokeh.charts only exists in old Bokeh releases (the recorded
# output loads BokehJS 0.12.0); an unpinned install may fetch a version
# without it — confirm and pin if needed.
from bokeh.charts import Scatter

# Same query as the previous cell, with k1 and b spelled out explicitly
# (the values equal get_results_bm25's defaults).
results = get_results_bm25('zombie apacolypse', items_d, k1=1.5, b=0.75)

# Plot score vs item length
# Each result is [score, doc_id]; doc_id indexes items_d, and the length is the
# number of whitespace-separated tokens in that description.
df = pd.DataFrame({'score':[float(x[0]) for x in results],
'length':[len(items_d[x[1]].split()) for x in results]})
# pd, output_notebook and show are not imported in this cell — presumably they
# come from an earlier cell; verify before running this cell on its own.
output_notebook()
p = Scatter(df, x='score', y='length')
show(p)




You are using pip version 8.1.2, however version 9.0.1 is available.

var element = $('#10e237cc-05bb-489b-9767-0dc2f2735b6e'); (function(global) { function now() { return new Date(); } if (typeof (window._bokeh_onload_callbacks) === "undefined") { window._bokeh_onload_callbacks = []; } function run_callbacks() { window._bokeh_onload_callbacks.forEach(function(callback) { callback() }); delete window._bokeh_onload_callbacks console.info("Bokeh: all callbacks have finished"); } function load_libs(js_urls, callback) { window._bokeh_onload_callbacks.push(callback); if (window._bokeh_is_loading > 0) { console.log("Bokeh: BokehJS is being loaded, scheduling callback at", now()); return null; } if (js_urls == null || js_urls.length === 0) { run_callbacks(); return null; } console.log("Bokeh: BokehJS not loaded, scheduling load and callback at", now()); window._bokeh_is_loading = js_urls.length; for (var i = 0; i < js_urls.length; i++) { var url = js_urls[i]; var s = document.createElement('script'); s.src = url; s.async = false; s.onreadystatechange = s.onload = function() { window._bokeh_is_loading--; if (window._bokeh_is_loading === 0) { console.log("Bokeh: all BokehJS libraries loaded"); run_callbacks() } }; s.onerror = function() { console.warn("failed to load library " + url); }; console.log("Bokeh: injecting script tag for BokehJS library: ", url); document.getElementsByTagName("head")[0].appendChild(s); } }; var js_urls = ['https://cdn.pydata.org/bokeh/release/bokeh-0.12.0.min.js', 'https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.0.min.js', 'https://cdn.pydata.org/bokeh/release/bokeh-compiler-0.12.0.min.js']; var inline_js = [ function(Bokeh) { Bokeh.set_log_level("info"); }, function(Bokeh) { Bokeh.$("#53061ba4-fed1-4a42-9878-4a58dfdeaaa9").text("BokehJS successfully loaded");
},
function(Bokeh) {
console.log("Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-0.12.0.min.css");
Bokeh.embed.inject_css("https://cdn.pydata.org/bokeh/release/bokeh-0.12.0.min.css");
console.log("Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.0.min.css");
Bokeh.embed.inject_css("https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.12.0.min.css");
}
];

function run_inline_js() {
for (var i = 0; i < inline_js.length; i++) {
inline_js[i](window.Bokeh);
}
}

console.log("Bokeh: BokehJS loaded, going straight to plotting");
run_inline_js();
} else {
console.log("Bokeh: BokehJS plotting callback run at", now());
run_inline_js();
});
}
}(this));

(function(global) {
function now() {
return new Date();
}

if (typeof (window._bokeh_onload_callbacks) === "undefined") {
}

function run_callbacks() {
console.info("Bokeh: all callbacks have finished");
}

console.log("Bokeh: BokehJS is being loaded, scheduling callback at", now());
return null;
}
if (js_urls == null || js_urls.length === 0) {
run_callbacks();
return null;
}
for (var i = 0; i < js_urls.length; i++) {
var url = js_urls[i];
var s = document.createElement('script');
s.src = url;
s.async = false;
run_callbacks()
}
};
s.onerror = function() {
console.warn("failed to load library " + url);
};
console.log("Bokeh: injecting script tag for BokehJS library: ", url);
}
};var element = document.getElementById("108ca026-ef2e-40ce-991e-68043c37119e");
if (element == null) {
console.log("Bokeh: ERROR: autoload.js configured with elementid '108ca026-ef2e-40ce-991e-68043c37119e' but no matching script tag was found. ")
return false;
}

var js_urls = [];

var inline_js = [
function(Bokeh) {
Bokeh.$(function() { var docs_json = {"16080bd9-f1dc-497c-ad7d-5e4d469ca7bd":{"roots":{"references":[{"attributes":{"dimension":1,"plot":{"id":"5c01d2c5-18a4-4fbe-b5ec-f9938ce1546a","subtype":"Chart","type":"Plot"},"ticker":{"id":"cb38ce18-86e1-4af8-9f2c-c64d9e969842","type":"BasicTicker"}},"id":"7472899c-4a31-4993-887a-e6f0a020467f","type":"Grid"},{"attributes":{"axis_label":"length","formatter":{"id":"efc53fc3-bcc8-4772-b612-72ba4723ee3b","type":"BasicTickFormatter"},"plot":{"id":"5c01d2c5-18a4-4fbe-b5ec-f9938ce1546a","subtype":"Chart","type":"Plot"},"ticker":{"id":"cb38ce18-86e1-4af8-9f2c-c64d9e969842","type":"BasicTicker"}},"id":"f2fcf00a-1b53-4c47-b5db-d437a395abbd","type":"LinearAxis"},{"attributes":{"location":"top_left","plot":{"id":"5c01d2c5-18a4-4fbe-b5ec-f9938ce1546a","subtype":"Chart","type":"Plot"}},"id":"5266ff66-8ec6-4c2f-908e-1d2794b3964d","type":"Legend"},{"attributes":{"plot":{"id":"5c01d2c5-18a4-4fbe-b5ec-f9938ce1546a","subtype":"Chart","type":"Plot"}},"id":"d4cba06d-a664-496f-83f8-1d8ebb7c03d3","type":"WheelZoomTool"},{"attributes":{"plot":{"id":"5c01d2c5-18a4-4fbe-b5ec-f9938ce1546a","subtype":"Chart","type":"Plot"}},"id":"95e2ce23-ebfe-4d59-86e0-12a408cb8a61","type":"PanTool"},{"attributes":{"callback":null,"column_names":["x_values","y_values"],"data":{"chart_index":[null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,
null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null],"x_values":[11.212967541874537,11.188759288611855,10.678492957983497,10.673894119509853,10.64022517066827,10.638963988706255,10.619696260239227,10.481405519985309,10.306149493716852,10.306149493716852,10.08378910113629,9.87551538383189,9.770163546116537,9.754440408363765,9.718179490338814,9.692613067038643,9.628921919975353,9.591046246020625,9.578568520981204,9.504018178270666,9.467037385902831,9.44264019319575,9.364238488658621,9.334499768953103,9.322622415809615,9.260630562100758,9.22192181126315,9.169163170211283,9.148021287597583,9.104057744723569,9.09143018925676,9.068923662695045,9.046364985609424,8.952289263291041,8.943249599595768,8.928717990063166,8.89280514144363,8.89055630499102,8.817606885476202,8.786163072783806,8.785885798070046,8.744218980405616,8.737064091473702,8.723396753669299,8.682048388427164,8.682048388427164,8.661520849799551,8.651069414205246,8.608364151543674,8.607005946778777,8.600516547217309,8.560322183070156,8.523564346172522,8.490734865487727,8.456363748538074,8.40299789073154,8.397487721716713,8.384001011524532,8.384001011524532,8.345800442278078,8.345800442278078,8.326830402736935,8.326692044452216,8.307808674130564,8.23768469677742,8.178119207626198,8.178119207626198,8.141767680588481,8.114584007361236,8.105737886638789,8.07002557329585,8.07002557329585,8.058018530177991,8.052287163398544,8.034626562692095,8.017043260334145,8.017043260334145,8.016915005290317,7.982106529555825,7.955976842749431,7.930268653926644,7.828403452676492,7.803451063698113,7.795269959645379,7.795269959645379,7.778717608675626,7.77853649
5475045,7.7622354027531415,7.7622354027531415,7.745822896932911,7.729320691703559,7.713205224555757,7.683894791164338,7.680861106380587,7.656604994412016,7.652606411099706,7.648600328103177,7.616979881840671,7.561835062958083,7.5540006558025725,7.538607246345739,7.507595444267678,7.507595444267678,7.503694765495524,7.461830969167229,7.411363577448149,7.401417496863124,7.327503609539178,7.317603094489845,7.298200960557928,7.1838119640637705,7.1838119640637705,7.155747260088201,7.155747260088201,7.118532158848673,7.1140589559547465,7.100270587485675,7.100270587485675,7.100270587485675,7.018649899574672,6.991858418561486,6.978539232960635,6.9520525187678786,6.9256384823143495,6.89967772063095,6.882268356038764,6.873690859505423,6.848086175529787,6.848086175529787,6.784671674723388,6.763671974856196,6.750155386392811,6.742922839800933,6.661301983060942,6.652097420725768,6.652097420725768,6.634027856807053,6.631025802542374,6.577579178505417,6.441553716429274,6.38275463095998,6.343987365819055,6.262726023272648,6.244828799653751,6.2253988331631005,6.136830778151078,6.045681663674384,5.9962036591552685,5.952290119992328,5.909084787940495,5.899638098239375,5.861877175518894,5.8431774038790225,5.781608487506485,5.715459714296744,5.688919196703884,5.680098433812744,5.6191107674086265,5.589043098621285,5.550994801142414,5.492733801301002,5.484450462107821,5.364050581186291,5.309628448492776,5.266300876006088,5.24876843387456,5.229980195795404,5.181866986161409,4.842806611615554,4.795308975853502,4.776506039224642,4.7517687722775825,4.727241817512471,4.709012122500152,4.678939743511186,4.573787992682984,4.429983643289134,4.3953959056329515,4.387537961137403,4.379631509241804,4.259985301906862,4.215815765479119,4.153720706294497,4.144322578203584,4.139639444153744,4.137267686566722,4.107151531891289,4.102551981939468,4.066123159796391,3.9649042649144626,3.9182556577732353,3.776867484955339,3.6819662039085923,3.519258069496709,3.398461200931552,3.3796495997620246,3.3610451089749
89,3.3518194640526335,3.3365554243414253,3.33351928404357,3.2192720285914898,3.120550490818706,2.930812883734435,2.886929670727938,2.8733435480955043,2.8051373410318994,2.7880581824955675,2.7421452024491537,2.7380461664639757,2.6878232416375014,2.645126923801677,2.64321838899826,2.6280486900476423,2.4054418518071703,2.399140538636686,2.243404319479077,2.1562534324184153,2.1127290936581984,2.0018941566698936,1.9050759835811348,1.7757897167137882,1.6866709981848798,1.527706791585083,1.5057294319963885],"y_values":[82,417,526,148,860,410,350,1286,46,46,395,77,85,373,768,91,96,201,100,106,858,759,340,120,121,820,716,732,136,752,18,19,145,929,284,286,27,1348,1250,32,1265,34,313,35,37,37,38,469,623,769,41,43,794,198,508,831,989,52,52,54,54,55,215,217,554,63,63,65,238,67,69,69,419,70,71,72,72,249,74,256,77,647,274,85,85,86,659,87,87,88,477,90,879,92,689,690,890,96,304,510,101,311,311,728,106,538,325,115,557,339,125,125,127,127,599,130,131,131,131,137,139,140,142,642,146,652,401,150,150,155,680,946,685,165,978,978,984,985,172,1050,189,490,200,815,512,212,221,226,566,575,236,240,242,956,978,259,260,267,646,275,282,671,298,305,1142,313,736,1603,848,379,869,386,390,393,398,416,442,1002,450,1008,475,1662,497,499,500,1106,507,508,516,539,550,585,610,656,693,699,705,708,713,714,753,789,865,884,890,921,929,951,953,978,1000,1001,1009,1138,1142,1248,1314,1349,1445,1538,1678,1787,2013,2048]}},"id":"c698b058-6e73-4cd3-b9b6-579265ddae08","type":"ColumnDataSource"},{"attributes":{"plot":{"id":"5c01d2c5-18a4-4fbe-b5ec-f9938ce1546a","subtype":"Chart","type":"Plot"}},"id":"9490f802-392f-4cd2-a2af-209b07381010","type":"SaveTool"},{"attributes":{"fill_alpha":{"value":0.7},"fill_color":{"value":"#f22c40"},"line_color":{"value":"#f22c40"},"size":{"units":"screen","value":8},"x":{"field":"x_values"},"y":{"field":"y_values"}},"id":"be1f6e70-d758-4512-b6b9-d88325928798","type":"Circle"},{"attributes":{"plot":null,"text":null},"id":"bb364acf-0d62-4270-abca-778e2431fd09","type":"Title"},{"attribute
s":{"callback":null,"end":12.183691352862352,"start":0.5350056210085737},"id":"a70b152e-d21c-47c9-961e-fbe79cc696ba","type":"Range1d"},{"attributes":{},"id":"efc53fc3-bcc8-4772-b612-72ba4723ee3b","type":"BasicTickFormatter"},{"attributes":{"plot":{"id":"5c01d2c5-18a4-4fbe-b5ec-f9938ce1546a","subtype":"Chart","type":"Plot"}},"id":"89292342-d575-405b-b125-d9ff21cda3b7","type":"ResetTool"},{"attributes":{"plot":{"id":"5c01d2c5-18a4-4fbe-b5ec-f9938ce1546a","subtype":"Chart","type":"Plot"}},"id":"15ad7b0c-b30f-498c-b65d-f9bdbc6b9fce","type":"HelpTool"},{"attributes":{"plot":{"id":"5c01d2c5-18a4-4fbe-b5ec-f9938ce1546a","subtype":"Chart","type":"Plot"},"ticker":{"id":"2b7caf17-6518-4292-afdc-7aea9e37ee7e","type":"BasicTicker"}},"id":"4c92b760-0e6b-488f-ad28-58da67847098","type":"Grid"},{"attributes":{"below":[{"id":"9553a1cc-72d7-4256-bdb1-91076ca0311c","type":"LinearAxis"}],"left":[{"id":"f2fcf00a-1b53-4c47-b5db-d437a395abbd","type":"LinearAxis"}],"renderers":[{"id":"3a0c0822-822f-4094-9b3a-ecb6c32dd3c5","type":"BoxAnnotation"},{"id":"a0fcc391-147e-476c-b2b5-bf523727d03b","type":"GlyphRenderer"},{"id":"5266ff66-8ec6-4c2f-908e-1d2794b3964d","type":"Legend"},{"id":"9553a1cc-72d7-4256-bdb1-91076ca0311c","type":"LinearAxis"},{"id":"f2fcf00a-1b53-4c47-b5db-d437a395abbd","type":"LinearAxis"},{"id":"4c92b760-0e6b-488f-ad28-58da67847098","type":"Grid"},{"id":"7472899c-4a31-4993-887a-e6f0a020467f","type":"Grid"}],"title":{"id":"bb364acf-0d62-4270-abca-778e2431fd09","type":"Title"},"tool_events":{"id":"89cdbc96-e959-4c3f-b9b0-776b2b30856e","type":"ToolEvents"},"toolbar":{"id":"8f087af3-b199-4782-89d7-8a6882eb91a0","type":"Toolbar"},"x_mapper_type":"auto","x_range":{"id":"a70b152e-d21c-47c9-961e-fbe79cc696ba","type":"Range1d"},"y_mapper_type":"auto","y_range":{"id":"3fc2a468-4486-4ab6-8340-2131ffd6597a","type":"Range1d"}},"id":"5c01d2c5-18a4-4fbe-b5ec-f9938ce1546a","subtype":"Chart","type":"Plot"},{"attributes":{},"id":"89cdbc96-e959-4c3f-b9b0-776b2b30856e","type":"ToolEvents"},{"at
tributes":{"active_drag":"auto","active_scroll":"auto","active_tap":"auto","tools":[{"id":"95e2ce23-ebfe-4d59-86e0-12a408cb8a61","type":"PanTool"},{"id":"d4cba06d-a664-496f-83f8-1d8ebb7c03d3","type":"WheelZoomTool"},{"id":"54958788-a979-46e7-9f62-5379511918ad","type":"BoxZoomTool"},{"id":"9490f802-392f-4cd2-a2af-209b07381010","type":"SaveTool"},{"id":"89292342-d575-405b-b125-d9ff21cda3b7","type":"ResetTool"},{"id":"15ad7b0c-b30f-498c-b65d-f9bdbc6b9fce","type":"HelpTool"}]},"id":"8f087af3-b199-4782-89d7-8a6882eb91a0","type":"Toolbar"},{"attributes":{},"id":"2b7caf17-6518-4292-afdc-7aea9e37ee7e","type":"BasicTicker"},{"attributes":{"bottom_units":"screen","fill_alpha":{"value":0.5},"fill_color":{"value":"lightgrey"},"left_units":"screen","level":"overlay","line_alpha":{"value":1.0},"line_color":{"value":"black"},"line_dash":[4,4],"line_width":{"value":2},"plot":null,"render_mode":"css","right_units":"screen","top_units":"screen"},"id":"3a0c0822-822f-4094-9b3a-ecb6c32dd3c5","type":"BoxAnnotation"},{"attributes":{"data_source":{"id":"c698b058-6e73-4cd3-b9b6-579265ddae08","type":"ColumnDataSource"},"glyph":{"id":"be1f6e70-d758-4512-b6b9-d88325928798","type":"Circle"},"hover_glyph":null,"nonselection_glyph":null,"selection_glyph":null},"id":"a0fcc391-147e-476c-b2b5-bf523727d03b","type":"GlyphRenderer"},{"attributes":{"axis_label":"score","formatter":{"id":"1522cd0b-06cc-44a7-b343-c4f2dbb1f6a4","type":"BasicTickFormatter"},"plot":{"id":"5c01d2c5-18a4-4fbe-b5ec-f9938ce1546a","subtype":"Chart","type":"Plot"},"ticker":{"id":"2b7caf17-6518-4292-afdc-7aea9e37ee7e","type":"BasicTicker"}},"id":"9553a1cc-72d7-4256-bdb1-91076ca0311c","type":"LinearAxis"},{"attributes":{"callback":null,"end":2251.0,"start":-185.0},"id":"3fc2a468-4486-4ab6-8340-2131ffd6597a","type":"Range1d"},{"attributes":{},"id":"cb38ce18-86e1-4af8-9f2c-c64d9e969842","type":"BasicTicker"},{"attributes":{},"id":"1522cd0b-06cc-44a7-b343-c4f2dbb1f6a4","type":"BasicTickFormatter"},{"attributes":{"overlay":{"id":"3a0c08
22-822f-4094-9b3a-ecb6c32dd3c5","type":"BoxAnnotation"},"plot":{"id":"5c01d2c5-18a4-4fbe-b5ec-f9938ce1546a","subtype":"Chart","type":"Plot"}},"id":"54958788-a979-46e7-9f62-5379511918ad","type":"BoxZoomTool"}],"root_ids":["5c01d2c5-18a4-4fbe-b5ec-f9938ce1546a"]},"title":"Bokeh Application","version":"0.12.0"}}; var render_items = [{"docid":"16080bd9-f1dc-497c-ad7d-5e4d469ca7bd","elementid":"108ca026-ef2e-40ce-991e-68043c37119e","modelid":"5c01d2c5-18a4-4fbe-b5ec-f9938ce1546a","notebook_comms_target":"6af8fa6e-4088-49eb-a1c4-4140baf7d67e"}]; Bokeh.embed.embed_items(docs_json, render_items); }); }, function(Bokeh) { } ]; function run_inline_js() { for (var i = 0; i < inline_js.length; i++) { inline_js[i](window.Bokeh); } } if (window._bokeh_is_loading === 0) { console.log("Bokeh: BokehJS loaded, going straight to plotting"); run_inline_js(); } else { load_libs(js_urls, function() { console.log("Bokeh: BokehJS plotting callback run at", now()); run_inline_js(); }); } }(this)); Out[34]: <Bokeh Notebook handle for In[34]>  ## Implementing Random Forest Machine Learning Using the example from class to implement random forest ranking algorithm.  
In [35]: import findspark import os findspark.init(os.getenv('HOME') + '/spark-1.6.0-bin-hadoop2.6') os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-csv_2.10:1.3.0 pyspark-shell'   In [36]: import pyspark try: print(sc) except NameError: sc = pyspark.SparkContext() print(sc)   <pyspark.context.SparkContext object at 0x186955290>   In [37]: from pyspark.sql import SQLContext import os sqlContext = SQLContext(sc) df = sqlContext.read.format('data/MovieSummaries/plot_summaries.tsv').options().options(header='true', inferSchema='true', delimiter=',') \ .load(os.getcwd() + 'data/MovieSummaries/plot_summaries.tsv') df.schema df.dropna()   --------------------------------------------------------------------------- Py4JJavaError Traceback (most recent call last) <ipython-input-37-96bf1b27a5d7> in <module>() 3 4 sqlContext = SQLContext(sc) ----> 5 df = sqlContext.read.format('data/MovieSummaries/plot_summaries.tsv').options() .options(header='true', inferSchema='true', delimiter=',') .load(os.getcwd() + 'data/MovieSummaries/plot_summaries.tsv') 6 7 df.schema /Users/dustin/spark-1.6.0-bin-hadoop2.6/python/pyspark/sql/readwriter.pyc in load(self, path, format, schema, **options) 135 self._jreader.load(self._sqlContext._sc._jvm.PythonUtils.toSeq(path))) 136 else: --> 137 return self._df(self._jreader.load(path)) 138 else: 139 return self._df(self._jreader.load()) /Users/dustin/spark-1.6.0-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args) 811 answer = self.gateway_client.send_command(command) 812 return_value = get_return_value( --> 813 answer, self.gateway_client, self.target_id, self.name) 814 815 for temp_arg in temp_args: /Users/dustin/spark-1.6.0-bin-hadoop2.6/python/pyspark/sql/utils.pyc in deco(*a, **kw) 43 def deco(*a, **kw): 44 try: ---> 45 return f(*a, **kw) 46 except py4j.protocol.Py4JJavaError as e: 47 s = e.java_exception.toString() 
/Users/dustin/spark-1.6.0-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name) 306 raise Py4JJavaError( 307 "An error occurred while calling {0}{1}{2}.\n". --> 308 format(target_id, ".", name), value) 309 else: 310 raise Py4JError( Py4JJavaError: An error occurred while calling o25.load. : java.lang.ClassNotFoundException: Failed to find data source: data/MovieSummaries/plot_summaries.tsv. Please find packages at http://spark-packages.org at org.apache.spark.sql.execution.datasources.ResolvedDataSource$.lookupDataSource(ResolvedDataSource.scala:77)
at org.apache.spark.sql.execution.datasources.ResolvedDataSource$.apply(ResolvedDataSource.scala:102) at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:119) at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:109) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381) at py4j.Gateway.invoke(Gateway.java:259) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:209) at java.lang.Thread.run(Thread.java:745) Caused by: java.lang.ClassNotFoundException: data/MovieSummaries/plot_summaries.tsv.DefaultSource at java.net.URLClassLoader.findClass(URLClassLoader.java:381) at java.lang.ClassLoader.loadClass(ClassLoader.java:424) at java.lang.ClassLoader.loadClass(ClassLoader.java:357) at org.apache.spark.sql.execution.datasources.ResolvedDataSource$$anonfun4$$anonfun$apply$1.apply(ResolvedDataSource.scala:62) at org.apache.spark.sql.execution.datasources.ResolvedDataSource$$anonfun4$$anonfun$apply$1.apply(ResolvedDataSource.scala:62) at scala.util.Try$.apply(Try.scala:161)
at org.apache.spark.sql.execution.datasources.ResolvedDataSource$$anonfun4.apply(ResolvedDataSource.scala:62) at org.apache.spark.sql.execution.datasources.ResolvedDataSource$$anonfun$4.apply(ResolvedDataSource.scala:62) at scala.util.Try.orElse(Try.scala:82) at org.apache.spark.sql.execution.datasources.ResolvedDataSource$.lookupDataSource(ResolvedDataSource.scala:62)
... 14 more




In [ ]:

# Expose the DataFrame `df` to Spark SQL under the table name 'dataset'.
sqlContext.registerDataFrameAsTable(df, 'dataset')
sqlContext.tableNames()

# Pull the label column followed by the ten feature columns, in that order,
# and drop down to the underlying RDD of rows.
#
# Every selected column must be comma-separated: without the comma after
# feature_4, SQL reads "feature_4 feature_5" as "feature_4 AS feature_5" and
# the real feature_5 column is silently lost.
#
# NOTE(review): these column names assume `df` carries label_relevanceBinary
# and feature_1..feature_10 -- confirm against the schema of the file that
# was actually loaded into `df`.
data_full = sqlContext.sql(
    "select label_relevanceBinary, "
    "feature_1, feature_2, feature_3, feature_4, feature_5, "
    "feature_6, feature_7, feature_8, feature_9, feature_10 "
    "from dataset"
).rdd




In [ ]:

from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import StandardScaler

# Split each row into its label (first column) and its features (the rest).
label = data_full.map(lambda row: row[0])
features = data_full.map(lambda row: row[1:])

# Fit a StandardScaler (default settings) on the features, then apply it.
model = StandardScaler().fit(features)
features_transform = model.transform(features)

# Now combine and convert back to labelled points.
# zip() pairs each label with its scaled feature vector, so row[0] is the
# label and row[1] is the feature vector itself. It is handed to LabeledPoint
# directly; wrapping it in another list would nest the vector one level deep.
transformedData = label.zip(features_transform)
transformedData = transformedData.map(lambda row: LabeledPoint(row[0], row[1]))

# Peek at the first few labelled points as a sanity check.
transformedData.take(5)




In [ ]:

# Randomly split the labelled points into roughly 75% training / 25% test,
# with a fixed seed passed to randomSplit.
data_train, data_test = transformedData.randomSplit([.75, .25], seed=1973)

# Report the size of each split (the second line counts the *test* set).
print('Training data records = ' + str(data_train.count()))
print('Test data records = ' + str(data_test.count()))




In [ ]:

from pyspark.mllib.tree import RandomForest

# Train a two-class random forest on the training split.
# Note that this rebinds `model`, which an earlier cell used for the scaler.
model = RandomForest.trainClassifier(
    data_train,
    numClasses=2,
    categoricalFeaturesInfo={},    # empty: no feature is declared categorical
    numTrees=400,
    featureSubsetStrategy="auto",
    impurity='gini',
    maxDepth=10,
    maxBins=32
)