# Search Project for CST 495

CMU Movie Summary Corpus http://www.cs.cmu.edu/~ark/personas/

Dustin D'Avignon

Chris Ngo

# Let's go

We begin with normalise the text by removing unwanted characters and converting to lowercase



In [6]:

import csv
import re

with open("data/MovieSummaries/plot_summaries.tsv") as f:
tag = re.compile(r'\b[0-9]+\b')
rgx = re.compile(r'\b[a-zA-Z]+\b')
#docs = [ (' '.join(re.findall(tag, x[0])).lower(), ' '.join(re.findall(rgx, x[1])).lower()) for i,x in enumerate(r) if r>1 ]
docs= {}
for i,x in enumerate(r):
if i >1:
docs[' '.join(re.findall(tag, x[0])).lower()] = ' '.join(re.findall(rgx, x[1])).lower()


> now to normalize the movie meta data to swap the item titles with index from above ** just the basics for now to get index, tried to pull out genre, but it was screwing up the rest of the code due to potential parsing errors **


In [7]:

import csv
import re

tag = re.compile(r'\b[0-9]+\b')
rgx = re.compile(r'\b[a-zA-Z]+\b')
docs2= {}
for i,x in enumerate(r):
if i >1:
docs2[' '.join(re.findall(tag, x[0])).lower()] = ' '.join(re.findall(rgx, x[2])).lower(), ' '.join(re.findall(rgx, x[8])).lower()

#print(docs2)



now is the time to join the docs together



In [8]:

doc = [(docs2.get(x), y) for x, y in docs.items() if docs2.get(x)]

# for testing
# import random
#print doc[random.randint(0, len(doc)-1)]
print doc[0][0], doc[0][1]

items_t = [ d[0] for d in doc ] # item titles
items_d = [ d[1] for d in doc ] # item description
items_i = range(0 , len(items_t)) # item id






# term freq



In [10]:

corpus = items_d[0:25]
print corpus






start by computing frequncy of entire corpus



In [11]:

tf = {}
for doc in corpus:
for word in doc.split():
if word in tf:
tf[word] += 1
else:
tf[word] = 1
print(tf)






now that we have normailised the data we can compute the term frequency



In [12]:

from collections import Counter

def get_tf(corpus):
tf = Counter()
for doc in corpus:
for word in doc.split():
tf[word] += 1
return tf

tf = get_tf(corpus)
print(tf)






# doc freq



In [16]:

import collections

def get_tf(document):
tf = Counter()
for word in document.split():
tf[word] += 1
return tf

def get_dtf(corpus):
dtf = {}
for i,doc in enumerate(corpus):
dtf[i]= get_tf(doc)
return dtf

dtf = get_dtf(items_d)
dtf[342]




Out[16]:

Counter({'a': 7,
'again': 1,
'and': 26,
'angry': 1,
'are': 1,
'around': 1,
'as': 1,
'at': 1,
'attempt': 1,
'away': 2,
'back': 3,
'barking': 1,
'be': 3,
'been': 1,
'begins': 1,
'bone': 1,
'but': 7,
'by': 1,
'can': 2,
'catcher': 4,
'catches': 1,
'caught': 2,
'chases': 1,
'chasing': 1,
'city': 2,
'cover': 1,
'crawls': 1,
'cries': 1,
'day': 1,
'digs': 1,
'disguises': 1,
'doesn': 1,
'dog': 16,
'dogs': 1,
'drama': 1,
'driver': 1,
'drives': 1,
'driving': 1,
'enter': 1,
'escapes': 3,
'fools': 1,
'for': 2,
'from': 4,
'frowned': 1,
'gate': 2,
'get': 2,
'gets': 2,
'gives': 1,
'goes': 2,
'going': 1,
'grabs': 3,
'happily': 1,
'happy': 1,
'he': 22,
'hides': 2,
'him': 6,
'himself': 2,
'his': 8,
'hits': 1,
'hole': 2,
'horrified': 1,
'house': 1,
'humming': 1,
'hungry': 1,
'in': 4,
'inside': 1,
'is': 9,
'it': 6,
'jerry': 1,
'know': 1,
'lamp': 1,
'last': 1,
'lets': 1,
'locked': 1,
'looks': 2,
'main': 1,
'manhole': 1,
'napkin': 1,
'news': 1,
'newspaper': 3,
'no': 1,
'now': 1,
'of': 1,
'off': 1,
'order': 1,
'orders': 1,
'own': 1,
'panicking': 1,
'past': 1,
'pound': 3,
'protagonist': 1,
'pursues': 1,
'quiet': 1,
'realizes': 1,
'remain': 1,
'remove': 1,
'roll': 1,
'runs': 1,
's': 4,
'sacrifice': 1,
'says': 2,
'sees': 7,
'shows': 1,
'sits': 1,
'sleep': 2,
'sleeping': 1,
'some': 1,
'son': 1,
'song': 1,
'speeds': 1,
'spike': 16,
'stick': 1,
'stops': 1,
'street': 1,
't': 1,
'tags': 1,
'taken': 1,
'taking': 1,
'tells': 1,
'that': 2,
'the': 34,
'theme': 1,
'then': 3,
'they': 1,
'through': 1,
'throws': 1,
'tip': 1,
'to': 16,
'toes': 1,
'tom': 1,
'took': 1,
'trash': 2,
'tricked': 1,
'tries': 1,
'truck': 4,
'turns': 1,
'tyke': 7,
'under': 1,
'up': 2,
'use': 1,
'uses': 1,
'wakes': 1,
'walk': 1,
'was': 1,
'wear': 1,
'wears': 1,
'when': 1,
'where': 1,
'while': 1,
'who': 3,
'will': 1,
'with': 2,
'without': 1,
'yawns': 1,
'yet': 1})



compute dtf for item descriptions



In [17]:

dtf = get_dtf(items_d)
dtf[12]




Out[17]:

Counter({'a': 10,
'ability': 1,
'accept': 1,
'affection': 1,
'after': 1,
'against': 1,
'an': 1,
'and': 6,
'are': 1,
'aristocrat': 1,
'aristocratic': 1,
'aristocrats': 2,
'army': 1,
'as': 3,
'at': 3,
'aware': 1,
'bankrupt': 1,
'because': 1,
'becomes': 2,
'begins': 1,
'bulgaria': 1,
'but': 3,
'cinema': 1,
'cki': 1,
'com': 1,
'comes': 1,
'company': 1,
'condescended': 1,
'consents': 1,
'database': 1,
'daughter': 1,
'descendant': 1,
'devotion': 1,
'distressed': 1,
'dreaming': 1,
'during': 1,
'edu': 1,
'end': 1,
'enterprising': 1,
'eventual': 1,
'exile': 2,
'failed': 1,
'falling': 1,
'family': 2,
'father': 2,
'filmy': 1,
'financially': 1,
'forced': 1,
'fortune': 1,
'founds': 1,
'frequenting': 1,
'frustrates': 1,
'fuw': 1,
'girl': 1,
'go': 1,
'haberdashery': 1,
'he': 4,
'heart': 1,
'help': 2,
'her': 1,
'him': 1,
'his': 6,
'hopfer': 1,
'http': 2,
'imdb': 1,
'impecunious': 1,
'impoverished': 1,
'in': 8,
'indolence': 1,
'influential': 1,
'info': 1,
'into': 1,
'is': 4,
'it': 1,
'izabela': 3,
'lack': 1,
'late': 1,
'lazy': 1,
'life': 1,
'love': 2,
'make': 1,
'makes': 1,
'marrying': 1,
'merchant': 1,
'merchants': 1,
'met': 1,
'mincel': 1,
'money': 2,
'new': 1,
'noble': 1,
'now': 1,
'of': 8,
'on': 1,
'or': 1,
'owner': 1,
'part': 1,
'partnership': 1,
'pensions': 1,
'pl': 1,
'polish': 2,
'proves': 1,
'quest': 1,
'rank': 1,
'respected': 1,
'restaurant': 1,
'return': 1,
'risks': 1,
'romantic': 1,
'russia': 1,
'russian': 2,
'russo': 1,
's': 4,
'salesman': 1,
'salons': 1,
'science': 1,
'searchplotwriters': 1,
'secure': 1,
'sentenced': 1,
'set': 1,
'sets': 1,
'shareholders': 1,
'she': 1,
'siberia': 1,
'social': 1,
'supplying': 1,
'taking': 1,
'the': 11,
'theatres': 1,
'their': 1,
'these': 1,
'to': 12,
'tomasz': 1,
'too': 1,
'true': 1,
'tsarist': 1,
'turkish': 1,
'two': 1,
'undertake': 1,
'up': 2,
'uprising': 1,
'uses': 1,
'vacuous': 1,
'waiter': 1,
'war': 1,
'warsaw': 2,
'while': 2,
'who': 1,
'widow': 1,
'win': 1,
'with': 3,
'without': 1,
'wokulski': 5,
'work': 1,
'www': 1,
'young': 1})



# term freq matrix

with the lexicon we are able to compute the term freq matrix



In [18]:

def get_tfm(corpus):

def get_lexicon(corpus):
lexicon = set()
for doc in corpus:
lexicon.update([word for word in doc.split()])
return list(lexicon)

lexicon = get_lexicon(corpus)

tfm =[]
for doc in corpus:
tfv = [0]*len(lexicon)
for term in doc.split():
tfv[lexicon.index(term)] += 1

tfm.append(tfv)

return tfm, lexicon

#test_corpus = ['mountain bike', 'road bike carbon', 'bike helmet']
#tfm, lexicon = get_tfm(test_corpus)
#print lexicon
#print tfm



# sparsity of term frequency matrix

We took the approach of using Bokeh for displaying the sparsity of term frequency matrix



In [64]:

#!pip install bokeh




In [19]:

import pandas as pd
from bokeh.plotting import figure, output_notebook, show, vplot

# sparsity as a function of document count
n = []
s = []
for i in range(100,1000,100):
corpus = items_d[0:i]
tfm, lexicon = get_tfm(corpus)
c = [ [x.count(0), x.count(1)] for x in tfm]
n_zero = sum([ y[0] for y in c])
n_one = sum( [y[1] for y in c])
s.append(1.0 - (float(n_one) / (n_one + n_zero)))
n.append(i)

output_notebook(hide_banner=True)
p = figure(x_axis_label='Documents', y_axis_label='Sparsity', plot_width=400, plot_height=400)
p.line(n, s, line_width=2)
p.circle(n, s, fill_color="white", size=8)
show(p)




We are using a boolean search to find documents that contains the words that are included within a user specified query. This is how our boolean search algorithm works:
• Compute the lexicon for the corpus
• Compute the term frequency matrix for the corpus
• Convert query to query vector using the same lexicon
• Compare each documents term frequncy vector to the query vector - specifically for each document in the corpus:
  • Compute a ranking score for each document by taking the dot product of the document's term frequency vector and the query vector
  • Sort the documents by ranking score 1 14 0 15 0 16 1 17 0 18 0 19 0 20 0 21 0 22 0 23 0 24 0 25 0 26 0 27 0 28 0 29 0 30 1 31 0 32 0 33 0 34 0 35 0 36 0 37 0 38 0 39 0 40 0 41 0 42 0 43 0 44 0 45 0 46 0 47 0 48 0 49 0 50 0 51 0 52 0 53 0 54 0 55 0 56 0 57 0 58 1 59 0 60 0 61 0 62 0 63 1 64 0 65 0 66 0 67 0 68 0 69 0 70 0 71 0 72 0 73 0 74 0 75 0 76 0 77 0 78 0 79 0 80 0 81 0 82 0 83 0 84 0 85 0 86 0 87 0 88 0 89 0 90 0 91 0 92 0 93 0 94 0 95 0 96 0 97 0 98 0 99 0 100 0 101 0 102 0 103 0 104 0 105 0 106 0 107 0 108 2 109 0 110 0 111 0 112 0 113 0 114 0 115 0 116 0 117 0 118 0 119 0 120 0 121 0 122 0 123 0 124 0 125 0 126 0 127 0 128 0 129 0 130 0 131 0 132 0 133 1 134 0 135 0 136 0 137 2 138 0 139 0 140 0 141 0 142 0 143 0 144 0 145 0 146 0 147 0 148 0 149 0 150 0 151 0 152 0 153 0 154 0 155 0 156 0 157 0 158 0 159 0 160 0 161 0 162 0 163 0 164 0 165 0 166 0 167 0 168 0 169 0 170 0 171 0 172 0 173 0 174 0 175 0 176 0 177 0 178 0 179 0 180 0 181 0 182 0 183 0 184 0 185 0 186 0 187 0 188 0 189 0 190 0 191 0 192 0 193 0 194 1 195 0 196 0 197 0 198 0 199 0 200 0 201 0 202 0 203 0 204 0 205 0 206 0 207 0 208 0 209 0 210 0 211 0 212 0 213 0 214 0 215 0 216 0 217 0 218 0 219 0 220 3 221 0 222 0 223 0 224 0 225 0 226 0 227 0 228 0 229 0 230 0 231 0 232 0 233 0 234 0 235 0 236 0 237 0 238 0 239 0 240 0 241 0 242 0 243 0 244 0 245 0 246 0 247 1 248 0 249 0 250 0 251 0 252 0 253 0 254 0 255 0 256 0 257 0 258 0 259 0 260 0 261 0 262 0 263 0 264 0 265 0 266 0 267 0 268 0 269 0 270 0 271 0 272 0 273 0 274 0 275 0 276 0 277 0 278 0 279 1 280 0 281 0 282 0 283 0 284 0 285 0 286 0 287 0 288 0 289 0 290 0 291 0 292 0 293 0 294 1 295 0 296 0 297 0 298 0 299 0 300 0 301 0 302 0 303 0 304 0 305 0 306 0 307 0 308 0 309 0 310 0 311 1 312 0 313 1 314 0 315 0 316 0 317 1 318 0 319 0 320 0 321 0 322 0 323 0 324 0 325 0 326 0 327 0 328 0 329 0 330 0 331 0 332 0 333 0 334 0 335 0 336 0 337 0 338 0 339 0 340 0 341 0 342 0 343 0 344 0 345 0 346 0 347 0 348 2 349 0 350 0 351 0 352 0 353 0 354 0 355 0 356 0 357 0 358 0 359 0 360 0 361 0 362 0 363 0 364 0 365 0 366 0 367 0 368 0 369 0 370 0 371 0 372 0 373 0 374 0 375 0 376 0 377 0 378 3 379 0 380 0 381 0 382 0 383 0 384 1 385 0 386 0 387 0 388 0 389 0 390 0 391 0 392 0 393 0 394 0 395 0 396 0 397 0 398 0 399 0 400 1 401 0 402 0 403 0 404 0 405 1 406 0 407 0 408 0 409 0 410 0 411 1 412 0 413 0 414 0 415 0 416 0 417 0 418 0 419 0 420 0 421 0 422 0 423 0 424 0 425 0 426 2 427 0 428 0 429 0 430 0 431 0 432 0 433 0 434 0 435 0 436 0 437 0 438 0 439 0 440 0 441 0 442 0 443 0 444 0 445 0 446 0 447 0 448 0 449 0 450 0 451 0 452 0 453 0 454 0 455 0 456 0 457 0 458 0 459 0 460 0 461 0 462 1 463 0 464 0 465 0 466 0 467 0 468 0 469 0 470 0 471 0 472 0 473 0 474 0 475 0 476 0 477 0 478 0 479 0 480 0 481 0 482 1 483 0 484 0 485 0 486 0 487 0 488 0 489 0 490 0 491 0 492 0 493 0 494 0 495 0 496 0 497 0 498 14 499 0 500 0 501 0 502 0 503 0 504 0 505 0 506 0 507 0 508 0 509 0 510 0 511 0 512 0 513 0 514 0 515 0 516 0 517 0 518 0 519 0 520 0 521 0 522 0 523 0 524 0 525 0 526 0 527 0 528 0 529 0 530 0 531 0 532 0 533 0 534 0 535 0 536 0 537 0 538 0 539 0 540 0 541 0 542 0 543 0 544 1 545 0 546 0 547 0 548 0 549 0 550 0 551 0 552 0 553 2 554 0 555 0 556 0 557 0 558 0 559 0 560 0 561 0 562 0 563 0 564 0 565 0 566 0 567 0 568 0 569 0 570 0 571 0 572 0 573 0 574 0 575 0 576 0 577 0 578 0 579 0 580 0 581 0 582 0 583 0 584 0 585 0 586 0 587 0 588 0 589 0 590 0 591 0 592 0 593 0 594 0 595 0 596 0 597 0 598 0 599 0 600 0 601 0 602 0 603 0 604 0 605 0 606 0 607 0 608 0 609 0 610 0 611 0 612 0 613 0 614 0 615 0 616 0 617 0 618 0 619 0 620 0 621 0 622 1 623 0 624 0 625 0 626 0 627 0 628 0 629 0 630 0 631 0 632 0 633 0 634 0 635 5 636 0 637 0 638 0 639 0 640 0 641 0 642 0 643 0 644 0 645 0 646 0 647 0 648 0 649 0 650 0 651 0 652 0 653 1 654 0 655 0 656 0 657 0 658 0 659 1 660 0 661 0 662 0 663 0 664 0 665 0 666 0 667 0 668 0 669 0 670 0 671 0 672 0 673 0 674 0 675 0 676 1 677 0 678 0 679 1 680 0 681 0 682 4 683 0 684 0 685 0 686 0 687 0 688 0 689 0 690 0 691 0 692 0 693 0 694 0 695 0 696 0 697 0 698 0 699 0 700 0 701 0 702 0 703 0 704 0 705 0 706 0 707 0 708 0 709 0 710 0 711 0 712 0 713 0 714 0 715 3 716 0 717 0 718 0 719 0 720 0 721 0 722 0 723 0 724 0 725 0 726 0 727 0 728 0 729 0 730 0 731 0 732 0 733 0 734 0 735 0 736 0 737 0 738 0 739 0 740 0 741 0 742 0 743 0 744 0 745 0 746 0 747 0 748 0 749 0 750 0 751 0 752 0 753 0 754 0 755 0 756 0 757 0 758 0 759 0 760 0 761 0 762 0 763 0 764 0 765 0 766 0 767 0 768 0 769 0 770 0 771 0 772 0 773 0 774 0 775 0 776 0 777 0 778 0 779 0 780 0 781 0 782 0 783 0 784 0 785 0 786 0 787 0 788 0 789 0 790 0 791 0 792 1 793 0 794 0 795 0 796 0 797 0 798 0 799 0 800 0 801 0 802 0 803 0 804 0 805 0 806 0 807 0 808 0 809 0 810 0 811 0 812 0 813 0 814 0 815 0 816 0 817 0 818 0 819 0 820 0 821 0 822 0 823 0 824 0 825 0 826 0 827 0 828 0 829 0 830 0 831 0 832 0 833 0 834 0 835 0 836 0 837 1 838 0 839 0 840 0 841 0 842 0 843 0 844 0 845 0 846 0 847 0 848 0 849 0 850 0 851 0 852 0 853 0 854 0 855 0 856 0 857 0 858 0 859 0 860 0 861 0 862 0 863 0 864 0 865 0 866 0 867 0 868 0 869 0 870 0 871 0 872 0 873 0 874 0 875 0 876 0 877 0 878 0 879 0 880 0 881 0 882 0 883 0 884 0 885 0 886 0 887 0 888 0 889 1 890 0 891 0 892 0 893 0 894 0 895 0 896 0 897 0 898 0 899 0  To compute the document ranking score we used the function get_results_tf() with results from the term frequency matrix  In [21]: def get_results_tf(qry, tfm, lexicon): qrv =[0]*len(lexicon) for term in qry.split(): if term in lexicon: qrv[lexicon.index(term)] = 1 results = [] for i, tfv in enumerate(tfm): score = 0 score = sum([ xy[0] * xy[1] for xy in zip(qrv,tfv)]) results.append([score, i]) sorted_results = sorted(results, key=lambda t: t[0] * -1) return sorted_results def print_results(results,n, head=True): ''' Helper function to print results ''' if head: print('\nTop %d from recall set of %d items:' % (n,len(results))) for r in results[:n]: print('\t%0.2f - %s'%(r[0],items_t[r[1]])) else: print('\nBottom %d from recall set of %d items:' % (n,len(results))) for r in results[-n:]: print('\t%0.2f - %s'%(r[0],items_t[r[1]])) tfm, lexicon = get_tfm(items_d[:1000]) results = get_results_tf('fun times', tfm , lexicon) print_results(results,10)   Top 10 from recall set of 1000 items: 4.00 - ('the challenge', 'm family film m children s m adventure m teen m comedy') 3.00 - ('color me kubrick', 'm lgbt m drama m comedy m indie') 3.00 - ('halloween years later', 'm cult m drama m horror m slasher m teen') 3.00 - ('b b s kids', 'm family film m domestic comedy m comedy m animation') 2.00 - ('the last day of summer', 'm family film m fantasy m comedy') 2.00 - ('eti', 'm romance film') 2.00 - ('halloweentown', 'm children s fantasy m children s family') 2.00 - ('des pissenlits par la racine', 'm comedy') 2.00 - ('santouri', 'm drama m world cinema') 2.00 - ('banjo the woodpile cat', 'm short film m family film m children s family m animation')  # Inverted Index the inverted index maps terms to the document in which they can be found  In [22]: def create_inverted_index(corpus): idx={} for i, document in enumerate(corpus): for word in document.split(): if word in idx: idx[word].append(i) else: idx[word] = [i] ## HIDE return idx test_corpus = ['mountain bike red','road bike carbon','bike helmet'] idx = create_inverted_index(test_corpus) print(idx)   {'mountain': [0], 'helmet': [2], 'bike': [0, 1, 2], 'red': [0], 'carbon': [1], 'road': [1]}  inverted index for document titles  In [23]: idx = create_inverted_index(items_d) print(set(idx['good']).intersection(set(idx['times']))) print(items_d[2061])   set([32488, 13314, 25605, 7688, 29707, 27661, 40338, 16911, 33808, 529, 12306, 12307, 16302, 534, 37798, 14224, 40111, 35356, 13094, 542, 31, 30240, 23587, 29221, 10278, 18983, 8234, 10283, 44, 39282, 560, 16435, 25141, 5696, 28218, 3131, 35388, 34367, 37440, 26689, 2114, 3652, 9286, 8801, 21070, 3853, 5715, 24046, 14934, 29881, 32141, 31426, 18523, 13404, 12429, 8798, 27232, 25697, 4283, 7270, 23313, 6516, 3691, 108, 29373, 20036, 39955, 11892, 629, 15479, 12408, 23161, 22652, 11836, 42112, 33921, 2690, 22055, 27270, 24711, 14444, 6423, 17037, 39277, 4752, 15505, 3943, 5781, 2710, 2711, 20632, 4036, 34971, 668, 5278, 28204, 32928, 12450, 1190, 2728, 22556, 9386, 38571, 30893, 7854, 10927, 33456, 39880, 13679, 12318, 16054, 22200, 30580, 17595, 24765, 22206, 36981, 4801, 24770, 17611, 16068, 8176, 7879, 5068, 41163, 30924, 27938, 3279, 42194, 19667, 18132, 20693, 27350, 37310, 31960, 4825, 5339, 16092, 1760, 40161, 6750, 35077, 4359, 34534, 1255, 3304, 9449, 41194, 35563, 31980, 36697, 22255, 17136, 22770, 39669, 34039, 21716, 15983, 13052, 5839, 33535, 7936, 11523, 33029, 14086, 2689, 40200, 26377, 19213, 25358, 783, 26895, 26385, 2322, 39700, 17685, 28208, 10008, 5403, 12080, 17697, 25890, 24356, 12166, 34598, 40569, 37672, 37057, 9351, 35629, 21806, 40825, 9008, 41779, 8500, 5941, 6281, 312, 1337, 11580, 32309, 26944, 7154, 41283, 29077, 11590, 31125, 5449, 37864, 19791, 7504, 6481, 19282, 29011, 32142, 11568, 13665, 5978, 17245, 18270, 13797, 34565, 4961, 21860, 8166, 18279, 28009, 22891, 29586, 1902, 11631, 4840, 14194, 30867, 10100, 13372, 22506, 24958, 3477, 19329, 30083, 33669, 13190, 39106, 26505, 39307, 15698, 22082, 13710, 31495, 4725, 23192, 1938, 7583, 41966, 35222, 39319, 9113, 31215, 20892, 10617, 41613, 12193, 7586, 23339, 21574, 36262, 32167, 34728, 27564, 25005, 942, 25519, 4019, 36790, 9144, 28603, 28606, 34751, 16832, 4768, 38852, 35937, 20779, 33738, 15819, 20940, 39586, 40398, 7585, 7635, 18901, 15609, 35289, 32219, 6108, 23518, 29663, 6112, 9698, 28645, 38018, 2536, 19945, 36842, 34285, 9710, 26607, 20026, 35314, 40303, 25598, 40952, 24175, 20990, 23125]) soldiers with the u n forces that entered korea during the korean war rape a village girl named eon rae the villagers ostracize eon rae and her son unable to make a living eon rae joins the brothel district that has been set up near the u n base on the other side of the river from the village the war and the introduction of u s culture break down the social order of the village after several village children have died the villagers put the blame on the prostitutes eventually the villagers unable to maintain the village leave their homes one by one eon rae and her son also leave synopsis from cite web  improve the ranking function  In [24]: def get_results_tf(qry, idx): score = Counter() for term in qry.split(): for doc in idx[term]: score[doc] += 1 results=[] for x in [[r[0],r[1]] for r in zip(score.keys(), score.values())]: if x[1] > 0: results.append([x[1],x[0]]) sorted_results = sorted(results, key=lambda t: t[0] * -1 ) return sorted_results; idx = create_inverted_index(items_d) results = get_results_tf('zombies', idx) print_results(results,20)   Top 20 from recall set of 190 items: 30.00 - ('burial ground the nights of terror', 'm thriller m zombie film m horror m world cinema') 19.00 - ('dance of the dead', 'm zombie film m horror m indie m teen m comedy') 19.00 - ('video dead', 'm zombie film m horror m b movie m indie') 16.00 - ('zombies zombies zombies', 'm zombie film m b movie m horror m comedy') 14.00 - ('big tits zombie', 'm zombie film m japanese movies m horror') 14.00 - ('flesheater', 'm horror m indie m creature film m zombie film m b movie m teen') 13.00 - ('shaun of the dead', 'm parody m romantic comedy m horror m doomsday film m cult m comedy m zombie film m black comedy m horror comedy') 12.00 - ('dawn of the dead', 'm horror m indie m doomsday film m cult m splatter film m zombie film') 12.00 - ('dead and deader', 'm science fiction m horror m television movie m sci fi horror m zombie film m action') 11.00 - ('route', 'm zombie film m horror m creature film') 11.00 - ('undead or alive', 'm action adventure m zombie film m western m horror') 11.00 - ('hide and creep', 'm science fiction m b movie m comedy m zombie film m horror m horror comedy') 10.00 - ('the stink of flesh', 'm cult m black comedy m horror m comedy m zombie film') 10.00 - ('abraham lincoln vs zombies', 'm action m horror') 9.00 - ('planet terror', 'm thriller m action adventure m science fiction m horror m indie m creature film m cult m zombie film m disaster m action thrillers m action') 9.00 - ('night of the living dead', 'm mystery m horror') 9.00 - ('when good ghouls go bad', 'm black comedy m fantasy m comedy m children s fantasy') 8.00 - ('zombi', 'm zombie film m horror m creature film m world cinema') 8.00 - ('day of the dead contagium', 'm zombie film m horror') 8.00 - ('land of the dead', 'm thriller m science fiction m horror m indie m doomsday film m creature film m cult m splatter film m zombie film m action m dystopia')  enter different queries  In [25]: results = get_results_tf('ghouls and ghosts', idx) print_results(results, 10)   Top 10 from recall set of 39747 items: 181.00 - ('in the line of duty witness', 'm action thrillers m world cinema m action adventure m martial arts film m action m chinese movies') 165.00 - ('dragon head', 'm science fiction m horror m world cinema m anime m disaster m japanese movies m action') 165.00 - ('band of the hand', 'm crime fiction m thriller m action thrillers m action adventure m drama m crime thriller m action') 162.00 - ('underworld rise of the lycans', 'm thriller m horror m gothic film m action adventure m period piece m fantasy m action m costume horror') 145.00 - ('franklin and the green knight', 'm family film m children s m animation') 144.00 - ('devil s diary', 'm horror m teen m television movie') 140.00 - ('wishology', 'm fantasy') 139.00 - ('the runaways', 'm punk rock m biography m indie m musical m drama m music m biographical film') 134.00 - ('the guard post', 'm mystery m horror') 129.00 - ('the mists of avalon', 'm costume drama m fantasy adventure m fantasy m feminist film')   In [26]: import pandas as pd from bokeh.plotting import output_notebook, show from bokeh.charts import Bar from bokeh.charts.attributes import CatAttr #from bokeh.models import Out[26]:

<Bokeh Notebook handle for In[26]>



# TF-IDF

To implement TF-IDF we used the function: $$IDF = log ( 1 + \frac{N}{n_t} )$$



In [27]:

import math

def idf(term, idx, n):
return math.log( float(n) / (1 + len(idx[term])))

print(idf('zombie',idx,len(items_d)))
print(idf('survival',idx,len(items_d)))
print(idf('invasions',idx,len(items_d)))




4.35124994957
4.91040628425
8.45297461909



### TF-IDF Intuition



In [28]:

from bokeh.charts import vplot

idx = create_inverted_index(items_d)

df = pd.DataFrame({'term':[x for x in idx.keys()],'freq':[len(x) for x in idx.values()],
'idf':[idf(x, idx, len(items_t)) for x in idx.keys()]})

output_notebook(hide_banner=True)
p1 = Bar(df.sort_values('freq', ascending=False)[:30], label=CatAttr(columns=['term'], sort=False), values='freq',
plot_width=800, plot_height=400)
p2 = Bar(df.sort_values('freq', ascending=False)[:30], label=CatAttr(columns=['term'], sort=False), values='idf',
plot_width=800, plot_height=400)
p = vplot(p1, p2)
show(p)




/Users/dustin/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:13: BokehDeprecationWarning: bokeh.io.vplot was deprecated in Bokeh 0.12.0; please use bokeh.models.layouts.Column instead

Freq Idf frequency idx[word][i] += 1 else: # Add document idx[word][i] = 1 else: # Add term idx[word] = {i:1} return idx def get_results_tfidf(qry, idx, n): score = Counter() for term in qry.split(): if term in idx: i = idf(term, idx, n) for doc in idx[term]: score[doc] += idx[term][doc] * i results=[] for x in [[r[0],r[1]] for r in zip(score.keys(), score.values())]: if x[1] > 0: results.append([x[1],x[0]]) sorted_results = sorted(results, key=lambda t: t[0] * -1 ) return sorted_results idx = create_inverted_index(items_d) results = get_results_tfidf('lookout action bike zombie', idx, len(items_d)) print_results(results,10)   Top 10 from recall set of 1874 items: 115.77 - ('i bought a vampire motorcycle', 'm parody m horror m slasher m horror comedy') 104.68 - ('burial ground the nights of terror', 'm thriller m zombie film m horror m world cinema') 90.60 - ('polladhavan', 'm romance film m action m drama') 78.51 - ('hatchet ii', 'm thriller m horror m cult m comedy m black comedy m action m slasher') 70.47 - ('the dirt bike kid', 'm family film m children s family m fantasy m adventure m comedy') 60.40 - ('tuff turf', 'm romantic drama m romance film m action m drama m teen') 57.58 - ('hide and creep', 'm science fiction m b movie m comedy m zombie film m horror m horror comedy') 57.58 - ('day of the dead', 'm cult m zombie film m horror m indie') 57.37 - ('amityville dollhouse', 'm horror') 52.34 - ('fido', 'm parody m horror m period piece m drama m comedy m zombie film m romance film m horror comedy')  Ideally we do not want scores to be the same for lots of documents. High TF-IDF scores in shorter documents should be more relevant - so we could try by boosting the score for documents that are shorter than average.  ## Implementing BM25

To implement BM25, we used the function get_results_bm25 that used arguments "query, corpus, and the index sizes. We then printed out the results using a Bokeh chart.



In [32]:

def get_results_bm25(qry, corpus, k1=1.5, b=0.75):
idx = create_inverted_index(corpus)
# 1.Assign (integer) n to be the number of documents in the corpus
n = len(corpus)
# 2.Assign (list) d with elements corresponding to the number of terms in each document in the corpus
d = [len(x.split()) for x in corpus]
# 3.Assign (float) d_avg as the average document length of the documents in the corpus
d_avg = float(sum(d)) / len(d)
score = Counter()
for term in qry.split():
if term in idx:
i = idf(term, idx, n)
for doc in idx[term]:
# 4.Assign (float) f equal to the number of times the term appears in doc
f = float(idx[term][doc])
# 5.Assign (float) s the BM25 score for this (term, document) pair
s = i * (( f * (k1 + 1) ) / (f + k1 * (1 - b + (b * (float(d[doc]) / d_avg)))))
score[doc] += s

results=[]
for x in [[r[0],r[1]] for r in zip(score.keys(), score.values())]:
if x[1] > 0:
results.append([x[1],x[0]])

sorted_results = sorted(results, key=lambda t: t[0] * -1 )
return sorted_results




In [33]:

results = get_results_bm25('zombie apacolypse', items_d)
print_results(results, 10)




Top 10 from recall set of 224 items:
11.21 - ('zombie bloodbath', 'm zombie film m horror m comedy')
11.19 - ('day of the dead', 'm cult m zombie film m horror m indie')
10.68 - ('fido', 'm parody m horror m period piece m drama m comedy m zombie film m romance film m horror comedy')
10.67 - ('zombie vs mardi gras', 'm horror m comedy m indie')
10.64 - ('hatchet ii', 'm thriller m horror m cult m comedy m black comedy m action m slasher')
10.64 - ('super', 'm thriller m science fiction m action adventure m mystery m drama m action')
10.62 - ('colin', 'm b movie m creature film m psychological thriller m drama m zombie film m horror m action')
10.48 - ('burial ground the nights of terror', 'm thriller m zombie film m horror m world cinema')
10.31 - ('first platoon', 'm comedy film m horror')
10.31 - ('reel zombies', 'm horror m horror comedy')




In [34]:

!pip install bokeh
from bokeh.charts import Scatter

results = get_results_bm25('zombie apacolypse', items_d, k1=1.5, b=0.75)

# Plot score vs item length
df = pd.DataFrame({'score':[float(x[0]) for x in results],
'length':[len(items_d[x[1]].split()) for x in results]})
output_notebook()
p = Scatter(df, x='score', y='length')
show(p)




You are using pip version 8.1.2, however version 9.0.1 is available.

In [35]: import findspark import os findspark.init(os.getenv('HOME') + '/spark-1.6.0-bin-hadoop2.6') os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-csv_2.10:1.3.0 pyspark-shell'   In [36]: import pyspark try: print(sc) except NameError: sc = pyspark.SparkContext() print(sc)   <pyspark.context.SparkContext object at 0x186955290>   In [37]: from pyspark.sql import SQLContext import os sqlContext = SQLContext(sc) df = sqlContext.read.format('data/MovieSummaries/plot_summaries.tsv').options().options(header='true', inferSchema='true', delimiter=',') \ .load(os.getcwd() + 'data/MovieSummaries/plot_summaries.tsv') df.schema df.dropna()   --------------------------------------------------------------------------- Py4JJavaError Traceback (most recent call last) <ipython-input-37-96bf1b27a5d7> in <module>() 3 4 sqlContext = SQLContext(sc) ----> 5 df = sqlContext.read.format('data/MovieSummaries/plot_summaries.tsv').options() .options(header='true', inferSchema='true', delimiter=',') .load(os.getcwd() + 'data/MovieSummaries/plot_summaries.tsv') 6 7 df.schema /Users/dustin/spark-1.6.0-bin-hadoop2.6/python/pyspark/sql/readwriter.pyc in load(self, path, format, schema, **options) 135 self._jreader.load(self._sqlContext._sc._jvm.PythonUtils.toSeq(path))) 136 else: --> 137 return self._df(self._jreader.load(path)) 138 else: 139 return self._df(self._jreader.load()) /Users/dustin/spark-1.6.0-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args) 811 answer = self.gateway_client.send_command(command) 812 return_value = get_return_value( --> 813 answer, self.gateway_client, self.target_id, self.name) 814 815 for temp_arg in temp_args: /Users/dustin/spark-1.6.0-bin-hadoop2.6/python/pyspark/sql/utils.pyc in deco(*a, **kw) 43 def deco(*a, **kw): 44 try: ---> 45 return f(*a, **kw) 46 except py4j.protocol.Py4JJavaError as e: 47 s = e.java_exception.toString() /Users/dustin/spark-1.6.0-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name) 306 raise Py4JJavaError( 307 "An error occurred while calling {0}{1}{2}.\n". --> 308 format(target_id, ".", name), value) 309 else: 310 raise Py4JError( Py4JJavaError: An error occurred while calling o25.load. : java.lang.ClassNotFoundException: Failed to find data source: data/MovieSummaries/plot_summaries.tsv. Please find packages at http://spark-packages.org at org.apache.spark.sql.execution.datasources.ResolvedDataSource$.lookupDataSource(ResolvedDataSource.scala:77)
at org.apache.spark.sql.execution.datasources.ResolvedDataSource$.apply(ResolvedDataSource.scala:102) at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:119) at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:109) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381) at py4j.Gateway.invoke(Gateway.java:259) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:209) at java.lang.Thread.run(Thread.java:745) Caused by: java.lang.ClassNotFoundException: data/MovieSummaries/plot_summaries.tsv.DefaultSource at java.net.URLClassLoader.findClass(URLClassLoader.java:381) at java.lang.ClassLoader.loadClass(ClassLoader.java:424) at java.lang.ClassLoader.loadClass(ClassLoader.java:357) at org.apache.spark.sql.execution.datasources.ResolvedDataSource$$anonfun4$$anonfun$apply$1.apply(ResolvedDataSource.scala:62) at org.apache.spark.sql.execution.datasources.ResolvedDataSource$$anonfun4$$anonfun$apply$1.apply(ResolvedDataSource.scala:62) at scala.util.Try$.apply(Try.scala:161)
at org.apache.spark.sql.execution.datasources.ResolvedDataSource$$anonfun4.apply(ResolvedDataSource.scala:62) at org.apache.spark.sql.execution.datasources.ResolvedDataSource$$anonfun$4.apply(ResolvedDataSource.scala:62) at scala.util.Try.orElse(Try.scala:82) at org.apache.spark.sql.execution.datasources.ResolvedDataSource$.lookupDataSource(ResolvedDataSource.scala:62)
... 14 more




In [ ]:

sqlContext.registerDataFrameAsTable(df,'dataset')
sqlContext.tableNames()

data_full = sqlContext.sql("select label_relevanceBinary, feature_1, feature_2, feature_3, feature_4 \
feature_5, feature_6, feature_7, feature_8, feature_9, feature_10 \
from dataset").rdd




In [ ]:

from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import StandardScaler

label = data_full.map(lambda row: row[0])
features = data_full.map(lambda row: row[1:])

model = StandardScaler().fit(features)
features_transform = model.transform(features)

# Now combine and convert back to labelled points:
transformedData = label.zip(features_transform)
transformedData = transformedData.map(lambda row: LabeledPoint(row[0],[row[1]]))

transformedData.take(5)




In [ ]:

data_train, data_test = transformedData.randomSplit([.75,.25],seed=1973)

print('Training data records = ' + str(data_train.count()))
print('Training data records = ' + str(data_test.count()))




In [ ]:

from pyspark.mllib.tree import RandomForest

model = RandomForest.trainClassifier(data_train, numClasses=2, categoricalFeaturesInfo={},
numTrees=400, featureSubsetStrategy="auto",
impurity='gini', maxDepth=10, maxBins=32)