In [1]:
import pandas as pd
import pickle
import numpy as np
# Load the bar review dataset
review = pd.read_pickle('../output/bar_reviews_cleaned_and_tokenized_SF.pickle')
print review.head(2)
print review.tail(2)
business_id date review_id stars \
10 UsFtqoBl7naz8AVUBZMjQQ 2013-11-08 Di3exaUCFNw1V4kSNW5pgA 5
11 UsFtqoBl7naz8AVUBZMjQQ 2014-03-29 0Lua2-PbqEQMjD9r89-asw 3
text type \
10 All the food is great here. But the best thing... review
11 We checked this place out this past Monday for... review
user_id votes_cool votes_funny votes_useful \
10 uK8tzraOp4M5u3uYrqIBXg 0.0 0.0 0.0
11 I_47G-R2_egp7ME5u_ltew 0.0 0.0 0.0
cleaned_tokenized
10 [[food, great], [best, thing, wing], [wing, si...
11 [[checked, place, past, monday, wing, night], ...
business_id date \
84215 8781c06a4e2407f5e027cd503f4aab675e76615b NaN
84216 8781c06a4e2407f5e027cd503f4aab675e76615b NaN
review_id stars \
84215 0e446098-6893-4315-9ed8-243c1926dae6 4.0
84216 a5eb8ce2-2f30-4f4b-885b-5d163e606629 5.0
text type \
84215 Buffalo wings w/ hotter sauce - just the right... NaN
84216 It's thinly sliced steak covered with cheese o... NaN
user_id votes_cool votes_funny \
84215 d1d2fa20-3413-41ee-adc2-b58bc9b160e8 NaN NaN
84216 4585e5c9-f4b7-4bdb-94d4-39dc8e124db6 NaN NaN
votes_useful cleaned_tokenized
84215 NaN [[buffalo, wing, w, hotter, sauce, -, right, a...
84216 NaN [[thinly, sliced, steak, covered, cheese, warm...
In [2]:
# Load the training users.
# The file is opened in a `with` block so the handle is closed as soon as the
# pickle has been read (a bare open() passed straight to pickle.load is never
# closed explicitly).
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
with open('../output/training_users.pickle', 'rb') as training_users_file:
    user_set_training = pickle.load(training_users_file)

# Make the active review set training only: keep the rows whose user_id is in
# the loaded training users.
review = review[review.user_id.isin(user_set_training)]
In [3]:
from collections import OrderedDict
from itertools import chain
# n_reviews = 100 # all of them...
# Flatten the reviews, so each review is just a single list of words.
reviews_merged_bus = OrderedDict()
business_set = list(set(review.business_id.values[:]))
for i_bus, bus_id in enumerate(business_set):
if ((i_bus%5)==0):
print '\r Fraction Processed',float(i_bus+1)/len(business_set),
# This horrible line first collapses each review of a corresponding business into a list
# of lists, and then collapses the list of sentences to a long list of words
reviews_merged_bus[bus_id] = list(chain.from_iterable(chain.from_iterable(
review.cleaned_tokenized[review.business_id==bus_id] )) )
docs_bus = reviews_merged_bus.values()
print
print len(docs_bus)
Fraction Processed 0.99928762244
5615
In [4]:
import gensim
from itertools import chain
import sys
sys.path.append('../vectorsearch/')
import nltk_helper
import doc2vec
from gensim.models.doc2vec import TaggedDocument
import pandas as pd
n_epochs = 10
n_docs = 10 # -1 for almost all of them...
# Generate the tagged document list.
docs = [TaggedDocument(words, [business_set[index],])
for index, words in enumerate(docs_bus[:])]
print '\nFirst Doc: \n-----------------\n', docs[0]
path /data/insight_yelp/input/
First Doc:
-----------------
TaggedDocument(['crooning', 'gay', 'men', u'friend', 'tear', 'joint', 'regularly', 'recognized', u'medium', '-', 'straight', 'gay', 'best', 'karaoke', 'bar', 'valley', 'coming', 'ever', 'since', 'used', 'hop', 'fence', 'old', 'apartment', 'complex', u'block', 'away', 'tell', 'place', 'cant', 'beat', 'fun', 'value', 'dept', 'mean', 'else', 'get', '4', 'absolut', u'tonic', '-', 'plus', 'free', 'oftentimes', 'campy', 'klassy', 'entertainment', 'crowd', 'singing', u'skill', 'singing', u'taste', 'run', 'gamut', 'thats', 'part', 'appeal', 'crowd', 'mostly', 'gay', 'men', u'30', u'40', u'go', 'older', 'younger', u'lesbian', 'also', 'hold', 'court', 'well', 'mean', 'else', u'honor', 'sing', 'pat', 'benatar', 'melissa', 'many', 'gay', 'men', 'bring', 'straight', u'girlfriend', 'most', 'likely', 'someone', 'sing', 'grease', 'duet', 'believe', 'not', 'handful', 'straight', 'men', 'dragged', 'not', 'fear', 'crowd', 'very', 'friendly', 'welcoming', u'bartender', u'owner', 'take', 'care', 'people', 'kind', 'music', 'expect', 'people', 'completely', 'hammered', 'singing', 'public', 'very', 'first', 'time', 'others', 'serious', 'seasoned', '-', 'classical', 'training', 'via', 'phoenix', u'men', 'choir', u'guy', 'would', 'make', 'american', 'idol', 'drool', 'desire', 'song', 'list', 'large', 'respectable', 'think', 'george', 'dragon', 'better', 'list', '-', 'see', 'review', u'song', 'run', 'spectrum', 'liza', 'cabaret', 'tim', 'mcgraw', 'hip-hop', 'rap', u'80', 'alt', 'rock', 'u2', 'acdc', 'elton', 'john', 'stevie', 'wonder', 'frank', 'sinatra', 'place', 'totally', 'small', 'divey', '-and', 'bit', 'weathered', 'looking', 'speed', 'past', '7th', 'st', 'useless', 'trivia', 'alert', 'phoenix', 'architectural', 'history', u'buff', 'bar', 'identical', 'chez', 'nous', 'without', u'booth', 'not', 'dark', u'building', 'built', 'man', 'karaoke', u'run', 'thursday', 'night', 'saturday', 'night', 'other', 'themed', u'evening', 'throughout', 'week', 'incl', 'not', 'making', 'greek', 'god', 
'revue', 'stripper', 'show', 'hosted', 'catty', 'drag', u'queen', 'sunday', 'night', u'gay-men', u'their-friends', u'the-media', u'the-valley', u'the-fence', u'this-place', u'the-fun', u'the-crowd', u'the-gamut', u'the-appeal', u'the-crowd', u'their-30s', u'the-honors', u'straight-men', u'the-crowd', u'the-bartenders', u'the-people', u'what-kind', u'some-people', u'the-very-first-time', u'the-phoenix', u'these-guys', u'the-song-list', u'my-review', u'the-songs', u'the-spectrum', u'hip-hop-rap', u'this-place', u'this-bar', u'the-booths', u'the-same-man', u'thursday-night', u'saturday-night', u'the-week', u'sunday-night', 'went', 'last', 'night', 'karaoke', 'flamin', 'steve', 'jenni', 'work', 'blast', 'gay', 'bar', 'bunch', u'u', 'great', 'selection', u'song', 'guy', u'run', 'running', 'karaoke', '15', u'year', 'nicely', 'decorated', 'guess', 'used', 'real', 'dive', u'goddess', 'room', 'clean', 'no', 'water', 'floor', 'gotta', 'say', 'though', 'gay', 'men', 'sing', u'song', 'werent', 'danzig', 'acdc', 'right', 'said', 'fred', u'rendition', 'would', 'fallen', 'asleep', 'staff', 'awesome', 'everyone', 'crowd', 'really', 'friendly', 'definitely', 'go', 'back', u'a-bunch', u'15-years', u'a-real-dive', u'the-floor', u'gay-men', u'the-staff', u'the-crowd', u'here', 'place', 'go', 'everybody', u'know', 'name', 'most', u'patron', 'happen', 'gay', 'love', 'coming', u'gay', 'hanging', u'drink', 'neighborhood', 'bar', 'crowd', 'supa', 'mixed', 'mean', 'else', 'go', 'hardened', 'biker', 'boy', 'compliment', 'nail', 'polish', 'choice', 'awesome', 'ive', 'also', 'threatened', 'stealing', u'shoe', 'well', 'asked', 'one', 'buy', 'lipstick', 'drag', 'queen', u'bartender', 'courteous', 'super', 'funny', 'fave', 'bartender', 'michael', u'he', 'cutie', 'great', 'pour', 'whether', 'youre', u'short', 't-shirt', 'dressed', u'nine', 'stop', 'youll', 'find', 'corner', 'fit', u'a-place', u'your-name', u'the-patrons', u'my-gays', u'some-drinks', u'a-neighborhood-bar', u'the-crowd', 
u'my-shoes', u'the-bartenders', u'my-fave-bartender', u'a-cutie', u'a-great-pour', u'a-t-shirt', u'the-nines', u'a-corner', 'went', 'drag', 'king', 'contest', u'woman', 'very', 'over', 'weight', 'not', 'very', 'entertaining', 'bar', 'size', 'shack', 'definetly', 'older', 'male', 'crowd', 'lesbian', 'between', u'age', '21-35', 'not', 'spot', u'all-the-women', u'the-bar', u'the-size', u'a-shack', u'the-ages', u'your-spot', 'im', 'giving', u'apollo', 'four', u'star', 'lot', u'thing', 'look', 'bar', 'clean', 'well-maintained', 'including', u'goddess', 'restroom', 'also', 'friendly', 'staff', 'diverse', 'group', u'patron', 'great', 'outdoor', 'patio', 'featuring', 'roman', u'column', 'perhaps', 'most', 'importantly', u'feature', 'karaoke', 'thursday', '-', 'saturday', u'night', 'husband', 'went', 'friday', 'night', 'karaoke', 'stevey', 'p', 'kristin', u'so', 'great', 'time', u'singer', 'talent', 'definitely', 'better', 'average', 'karaoke', 'dive', 'bar', 'song', 'choice', 'good', 'avid', 'karaoke', 'singer', 'didnt', 'like', 'fact', 'no', 'separate', 'stage', 'area', 'carved', 'current', 'singer', 'stand', 'stand', u'chair', 'between', u'table', 'strain', u'neck', 'look', u'lyric', 'karaoke', 'also', 'wasnt', 'central', 'focus', 'place', 'much', 'very', 'loud', 'talking', 'socializing', 'going', 'spite', 'singing', 'im', 'not', 'going', 'penalize', u'apollo', 'much', 'though', 'know', 'serious', 'karaoke', u'freak', 'like', 'want', 'focus', 'one', 'other', 'complaint', '-', u'u', 'group', 'also', 'little', 'disappointed', 'weak', u'drink', '-', 'dont', 'know', 'crowd', 'control', 'final', 'tab', '70', 'expected', 'feeling', 'little', 'loopy', 'most', 'night', 'speaking', 'feeling', 'loopy', 'appreciative', 'helpful', u'bartender', 'wanted', 'call', 'cab', 'one', 'came', 'fairly', 'quickly', 'after', 'called', 'one', u'u', 'apparently', 'company', 'karaoke', 'one', 'kobalt', 'another', 'gay', 'bar', 'park', 'central', 'complex', 'social', 'dynamic', 'bar', 'scene', 
'pretty', 'diverse', 'crowd', 'karaoke', 'part', 'experience', 'really', 'like', 'apollo', 'however', 'ever', 'want', 'go', 'somewhere', 'little', 'low-key', 'really', 'focus', 'karaoke', 'part', 'might', 'want', 'check', 'kobalt', u'four-stars', u'a-lot', u'a-bar', u'a-diverse-group', u'my-husband', u'a-friday-night', u'kristin-s', u'a-great-time', u'the-fact', u'our-chairs', u'our-necks', u'the-lyrics', u'the-place', u'the-focus', u'the-group', u'the-night', u'the-bartenders', u'a-cab', u'the-company', u'the-one', u'another-gay-bar', u'a-pretty-diverse-crowd', u'the-experience', u'the-karaoke-part', 'spilled', 'drink', 'over', 'significant', 'other', 'table', 'others', 'party', 'christine', 'kristin', 'etc', 'spilled', 'kamikaze', u'shot', 'over', u'table', 'karaoke', u'book', 'still', 'not', 'get', 'kicked', 'dumped', 'said', 'score', 'reason', 'updating', 'bar', u'owner', 'ron', 'lee', 'asked', 'update', 'since', 'not', 'divey', 'anymore', 'arent', 'classroom', 'carpet', 'gave', 'way', 'faux-finish', 'wood', u'floor', 'new', 'plasma', u'screen', 'replaced', 'old', u'tv', 'old', u'furnishing', 'gave', 'way', 'new', u'table', 'barstools', 'wrought', 'iron', u'back', 'og', 'swivel', 'naugahyde', u'chair', 'still', 'lurk', 'bar', 'thanks', u'guy', 'original', 'review', 'almost', 'year', 'half', 'ago', 'interior', 'remodel', 'new', 'outdoor', 'patio', 'response', '2006', 'smoking', u'bar', 'ban', u'my-drink', u'the-tables', u'the-reason', u'the-bar-owners', u'new-tables', u'the-bar', u'my-original-review', u'a-response', 'went', u'friend', 'birthday', 'gathering', 'co-worker', 'hadnt', u'apollo', u'year', 'wasnt', 'really', 'looking', 'forward', 'since', 'last', 'time', 'bunch', 'old', u'guy', 'say', 'wasnt', 'bad', 'experience', 'place', 'definitely', 'changed', 'way', 'bright', 'crowd', 'mixed', 'old', u'v', 'younger', 'although', 'not', 'young', u'bartender', 'pleasant', 'one', 'bartender', 'little', 'overboard', 'talking', 'bad', 'other', u'bar', 'kind', 
u'turn', 'still', 'friendly', 'overall', 'wont', 'hangout', 'fun', 'night', 'long', 'youre', 'group', 'know', u'a-few-friends', u'a-birthday-gathering', u'a-co-worker', u'the-last-time', u'a-bunch', u'old-guys', u'a-bad-experience', u'the-place', u'its-way', u'the-crowd', u'the-bartenders', u'one-bartender', u'other-bars', u'which-kind', u'my-hangout', u'the-night', u'a-group', 'went', u'apollo', 'saturday', 'night', 'group', 'included', 'kristin', 'christine', 'significant', 'others', 'apparently', u'weekend', 'karaoke', 'let', 'tell', 'not', 'average', 'karaoke', 'experience', 'mean', 'karaoke', '34', 'people', 'sing', 'actually', 'decent', 'really', 'good', u'voice', 'geez', 'horrible', 'screeching', u'rendition', 'total', 'eclipse', 'heart', 'oh', 'right', 'husband', 'sang', 'awful', 'funny', 'way', 'go', 'participate', 'karaoke', 'plenty', 'tv', u'screen', u'word', 'really', 'liked', u'tv', 'reserved', 'show', 'football', 'sportscenter', u'apollo', 'moderately', 'sized', 'establishment', 'outdoor', 'patio', 'saw', u'lady', 'bathroom', 'cleverly', 'marked', u'goddess', 'sign', 'tiny', 'however', 'never', 'line', 'most', 'crowd', 'male', 'persuasion', 'husband', 'beer', 'cant', 'comment', u'price', 'mixed', u'drink', u'shot', 'beer', 'cheaper', 'most', u'place', 'valley', 'bud', 'light', u'draft', '275', 'fat', 'tire', u'draft', '3', 'since', 'designated', 'driver', 'evening', 'switched', 'soda', 'halfway', 'night', '2', 'included', 'free', u'refill', u'bartender', 'extremely', 'friendly', 'very', 'prompt', 'service', 'also', 'waiter', u'float', 'throughout', 'space', 'dont', 'want', 'go', 'bar', 'crowd', 'flowed', 'throughout', 'evening', 'never', 'got', 'full', 'people', 'getting', 'knocked', 'impossible', 'move', 'very', 'laid', 'back', 'place', 'hang', 'chat', 'enjoy', 'singing', 'along', 'sweet', 'caroline', u'a-group', u'the-weekends', u'the-people', u'total-eclipse', u'the-heart', u'my-husband', u'a-funny-way', u'tv-screens', u'the-words', u'a-few-tvs', 
u'an-outdoor-patio', u'the-ladies-bathroom', u'a-line', u'the-crowd', u'the-male-persuasion', u'my-husband', u'the-prices', u'mixed-drinks', u'most-places', u'the-valley', u'light-drafts', u'the-designated-driver', u'the-night', u'free-refills', u'the-bartenders', u'a-waiter', u'the-space', u'the-bar', u'the-crowd', u'the-evening', u'sweet-caroline', 'crowd', u'apollo', 'always', 'seems', 'pretty', 'diverse', 'stopped', 'second', 'third', 'time', 'friend', 'saturday', 'night', 'virtually', 'every', 'group', 'seemed', 'represented', u'apollo', 'charming', 'sense', 'completely', u'lack', 'pretentiousness', 'hip', u'bar', 'scottsdale', 'phoenix', 'karaoke', 'night', 'always', 'great', 'idea', 'dude', 'singing', 'creed', 'weird', 'right', u'drink', 'nice', 'stiff', 'even', u'apollo', 'isnt', 'weekly', 'hangout', 'list', 'definitely', 'like', 'overall', 'vibe', 'think', 'also', 'one', 'oldest', 'gay', u'bar', 'phoenix', u'the-crowd', u'a-friend', u'a-saturday-night', u'the-sense', u'the-pretentiousness', u'karaoke-night', u'a-great-idea', u'some-dude', u'the-overall-vibe', 'ahhh', u'apollo', 'little', 'gay', 'bay', 'street', u'apollo', 'evolved', 'over', u'year', 'ive', 'good', u'time', 'makeover', 'almost', 'complete', u'bathroom', 'renovated', 'soon', 'hope', 'might', 'going', 'back', 'often', 'surly', u'bartender', 'long', 'replaced', 'huge', 'patio', 'constructed', 'although', 'no', 'longer', 'smoke', 'enjoyable', 'area', 'although', 'not', 'first', 'choice', u'bar', 'stop', 'time', 'time', 'enjoy', 'well', 'made', 'read', 'strong', 'cocktail', 'catch', u'bud', 'best', u'night', 'opinion', 'karaoke', 'nites', 'place', 'normally', u'pack', 'very', 'diverse', 'crowd', 'aside', u'issue', u'men', 'bathroom', u'padlock', 'worse', 'overly', 'bright', 'lighting', 'bar', 'getting', 'better', 'after', 'chatting', 'owner', 'great', 'know', 'touch', u'patron', 'quickly', 'address', u'issue', 'im', 'really', 'looking', 'forward', 'completion', u'renovation', u'the-years', 
u'some-good-times', u'the-best-nights', u'my-opinion', u'the-place', u'a-very-diverse-crowd', u'my-issues', u'the-mens-bathroom', u'this-bar', u'the-owner', u'their-patrons', u'any-issues', u'the-completion', u'the-renovations', 'went', 'last', 'night', 'cheap', u'drink', 'friendly', 'staff', 'return', 'headed', 'over', u'apollo', 'last', 'friday', 'night', 'possibly', 'drink', 'enough', 'sing', 'karaoke', 'started', 'night', 'saying', 'absolutely', 'would', 'not', 'singing', 'going', 'make', 'fun', 'forward', u'hour', 'meredith', u'brook', 'bitch', 'coming', 'mouth', 'blame', 'strong', u'drink', 'great', 'service', u'apollo', 'crowd', 'great', 'would', 'definitely', 'go', 'back', 'patio', 'also', 'great', 'addition', 'since', 'many', u'companion', u'smoker', 'could', 'take', u'drink', 'outside', u'some-karaoke', u'my-mouth', u'the-strong-drinks', u'great-service', u'the-crowd', u'the-patio', u'a-great-addition', u'my-companions', u'their-drinks', 'im', 'not', 'really', 'fan', 'place', 'seems', 'awfully', 'crowded', 'hard', 'navigate', 'worse', 'though', 'charge', '6', 'red', 'bull', 'no', 'liquor', 'bunkhouse', 'usually', u'charge', '2', 'back', 'patio', 'small', 'skinny', 'whereas', 'usual', 'clientele', 'not', u'a-fan', u'the-place', u'a-red-bull', u'no-liquor', u'the-back-patio', u'the-usual-clientele', 'one', 'time', u'apollo', 'gay', u'dude', 'asked', 'friend', 'real', u'woman', 'knew', 'waxed', 'mustache', 'going', 'night', 'never', 'mind', 'always', 'feel', 'like', 'diva', u'girl', u'apollo', 'handful', u'time', 'karaoke', 'im', 'always', 'drunken', 'mess', 'every', 'time', 'leave', 'well', 'certainly', 'cant', 'handle', 'liquor', u'drink', 'poured', 'well', 'cheap', 'cheap', u'drink', 'first', 'time', 'think', 'pink', 'panther', u'martini', '4', 'great', 'place', 'gather', 'bunch', u'friend', 'karaoke', 'yes', 'gay', 'bar', 'coolest', 'one', 'ive', 'always', 'met', 'great', u'guy', 'never', 'interested', 'always', 'nice', 'willing', 'sing', 'duet', 
'karaoke', 'host', 'always', 'fun', 'not', 'mic', 'hog', 'like', u'place', 'complaint', 'gay', u'friend', 'like', 'sing', u'lot', 'show', u'tune', 'boring', 'definitely', 'great', u'performer', 'most', 'good', 'time', 'dont', 'feel', 'like', 'need', 'belt', 'like', 'whitney', 'houston', 'come', 'favorite', u'thing', u'apollo', '1', 'free', 'fresh', 'popcorn', 'best', 'thing', 'munch', 'after', 'youve', 'lot', 'drink', '2', u'lady', 'room', 'never', 'line', 'mainly', 'gay', u'dude', 'frequent', 'place', 'love', u'apollo', u'my-friend', u'real-women', u'my-mustache', u'a-diva', u'the-girls', u'a-handful', u'a-drunken-mess', u'my-liquor', u'the-drinks', u'a-bunch', u'great-guys', u'a-duet', u'the-karaoke-host', u'some-places', u'my-only-complaint', u'my-gay-friends', u'show-tunes', u'a-good-time', u'whitney-houston', u'my-favorite-things', u'best-thing', u'a-lot', u'the-ladies-room', u'a-line', 'much', 'better', u'experience', 'experienced', 'initial', 'review', u'price', 'line', 'other', u'bar', 'like', 'phoenix', u'owner', 'very', 'supportive', 'community', 'count', 'something', 'may', 'cranky', 'mood', 'posted', 'initial', 'review', 'nice', 'patio', 'good', 'seating', 'shade', u'my-initial-review', u'the-prices', u'other-bars', u'the-owners', u'the-community', u'my-initial-review', u'good-seating', 'youre', 'going', 'go', u'apollo', 'karaoke', 'probably', 'start', 'practicing', 'fake', 'im', u'writ', 'routine', 'youre', 'going', 'need', 'think', u'le', 'hip-hop', 'along', u'line', 'show', u'tune', 'old', u'ballad', 'shit', 'around', 'born', 'dont', 'get', 'wrong', 'though', 'dig', 'free', 'popcorn', 'free', u'condom', 'pours', 'strong', 'enough', 'get', 'date', 'put', 'im', 'guessing', 'lot', 'putting', 'gone', u'men', 'bathroom', 'door', 'permanently', 'propped', 'open', u'the-lines', u'show-tunes', u'your-date', u'a-lot', u'the-mens-bathroom-door', 'breeder', 'favorite', 'gay', 'bar', u'bartender', 'hilarious', u'drink', 'cheap', u'wall', 'salmon', 'scene', 
'usually', 'mix', 'gay', 'men', 'pretty', u'girl', 'win', 'like', 'sing', 'karaoke', 'place', 'sing', 'prince', 'mostly', 'awful', 'falsetto', 'still', 'get', 'cheered', u'my-favorite-gay-bar', u'the-bartenders', u'the-drinks', u'the-walls', u'the-scene', u'a-mix', u'gay-men', u'pretty-girls', u'a-place', 'never', u'year', 'back', 'treated', 'horrible', 'service', 'approached', 'owner', 'treated', u'u', 'like', 'crap', 'boycotted', 'bar', '2', u'year', 'tried', 'prove', 'a-holes', 'plenty', 'friendly', u'bar', 'appreciate', 'business', u'horrible-service', u'the-owner', u'this-bar', u'2-years', u'the-business', 'im', 'feeling', 'low', 'looking', 'great', 'atmosphere', 'cheer', 'go', 'great', u'drink', 'staff', 'very', 'nice', 'throw', 'little', 'karaoke', 'fun', 'most', 'definetly', 'place', u'a-great-atmosphere', u'the-staff', u'the-place', 'very', 'upset', 'place', 'im', 'outside', 'beer', 'garden', 'cigarette', 'owner', 'came', 'said', 'smell', 'pot', 'person', 'told', 'medical', 'patient', 'medicated', 'got', 'bar', 'told', 'not', 'smoke', 'parking', 'lot', 'told', 'havent', 'know', u'law', 'accused', 'smoking', 'parking', 'lot', 'really', 'feel', 'discriminated', 'not', 'come', 'back', 'even', 'u', 'respect', u'rule', 'feel', 'discriminate', 'u', 'call', 'u', 'front', 'everyone', 'definitely', 'not', 'coming', 'back', u'this-place', u'the-beer-garden', u'a-cigarette', u'the-owner', u'my-person', u'the-bar', u'the-parking-lot', u'the-laws', u'the-parking-lot', u'their-rules', u'apollo', 'fun', 'experience', 'checked', 'bank', 'account', 'find', 'erroneously', 'overcharged', 'over', '3500', 'called', 'tried', 'tell', 'debit', 'hold', 'tip', 'not', 'week', 'later', 'charge', 'went', 'said', 'would', 'someone', 'call', 'back', 'not', 'got', 'run', 'around', 'gave', 'very', 'generous', 'tip', 'expense', 'fun', 'check', 'bank', 'account', 'dont', 'play', u'rule', u'a-fun-experience', u'my-bank-account', u'the-charge', u'the-run', u'a-very-generous-tip', 
u'my-expense', u'your-bank-account', u'the-rules', 'fun'], [u'lLI8ObL8aCVbkrrtAW0EHw'])
In [10]:
from gensim.models import doc2vec
# Hyper-parameters for the model, collected in one place.
# `doc2vec` here is gensim.models.doc2vec: the import directly above rebinds
# the name that the earlier plain `import doc2vec` had introduced.
doc2vec_params = dict(
    min_count=4,
    window=5,
    size=200,
    sample=1e-4,
    negative=10,
    workers=12,
)
model = doc2vec.Doc2Vec(**doc2vec_params)

# Build the vocabulary from the tagged documents before any training.
model.build_vocab(docs)
In [11]:
from random import shuffle
for epoch in range(10):
print '\rTraining Epoch %i, alpha %1.4f'%(epoch+1, model.alpha),
#model.train(np.random.permutation(docs))
shuffle(docs)
model.train(docs)
model.alpha -= 0.001 # decrease the learning rate
model.min_alpha = model.alpha # fix the learning rate, no decay
#model.init_sims(replace=True)
# # Normalize the word vectors.
# vec_norms = np.sqrt(np.sum(model.syn0**2, axis=1))
# model.syn0 = (model.syn0/vec_norms[:, numpy.newaxis])
# # Normalize the doc vectors.
# vec_norms = np.sqrt(np.sum(model.docvecs.doctag_syn0**2, axis=1))
# model.docvecs.doctag_syn0 = (model.docvecs.doctag_syn0/vec_norms[:, numpy.newaxis])
model.save('../output/doc2vec_bars_200_neg_10_win_5.model')
Training Epoch 10, alpha 0.0160
In [13]:
# Can find similar documents..
print model.docvecs.most_similar(positive=['KUinHkKyGhznElgIzx0yIw']), '\n'
# Can find similar words...Re: Dream Companies and contact from recruiters
print model.most_similar(positive=['beer']), '\n'
# Can find documents that are most similar to keywords....
print model.docvecs.most_similar(positive=[model['beer'], model['music']]), '\n'
# Can find words that are most common in documents
print review.text[review.review_id=='KUinHkKyGhznElgIzx0yIw'].values
print model.most_similar(positive=[model.docvecs['KUinHkKyGhznElgIzx0yIw']]), '\n'
[(u'yyUJKvG-C4VipITrAS0nIQ', 0.6710823774337769), (u'g3fipTPN2LBe_U42niTDcw', 0.6688743829727173), (u'ANaGwB8tVAc1qM1QAJecsQ', 0.6611914038658142), (u'LVjRN5pMJ8hhDmX0lbclpQ', 0.6471917629241943), (u'YP-sxa8i95v_scvXN2o4_w', 0.6342615485191345), (u'XkyZAQAaGO9i3on-b3fswg', 0.6326159834861755), (u'qdTtkZVgcdu3SEA6tzBPdw', 0.6320397853851318), (u'L7eGNKkuy_XdQ_35Y1Kacg', 0.629540741443634), (u'cd46siFt_-08j9-kSbVEgA', 0.6246991157531738), (u'3mp5jXdxC2yqSK6sgRQfEg', 0.6225901246070862)]
[('draft', 0.7733069062232971), (u'wine', 0.7592223882675171), ('tap', 0.7554447054862976), ('micro-brews', 0.7395221590995789), ('brew', 0.7344582080841064), (u'cocktail', 0.7317838072776794), ('draught', 0.7313833236694336), ('selection', 0.723392128944397), ('bottled', 0.71184903383255), ('whisky', 0.6925665140151978)]
[(u'W3SROyBvrFKT5C2ySdx1qw', 0.44849711656570435), (u'6w6gMZ3iBLGcUM4RBIuifQ', 0.43213915824890137), (u'KCP4tSmVRD6Gk3xbPEAf3w', 0.3971535861492157), (u'3WsATGkAIXV-56eWjdzecw', 0.3879733681678772), (u'3jzEz2q9HZYF2XU1Gm41nA', 0.3710145354270935), (u'JS0gYaJR5HZDhZG0TJRRGg', 0.3570679724216461), (u'QSmI5Y9bhCLIw9YYKOiQkg', 0.34995037317276), (u'Bxn0LTYR9BxEeXReFbXDJA', 0.34519267082214355), (u'Ejw0lND0g8WBQj4pCllUnQ', 0.3383505046367645), (u'EaAo1G89msEiSQLi1jX_Hw', 0.33163875341415405)]
[ u'My brother and I make the trek from N Scottsdale to The Drummer almost every weekend. Jesse makes the HOTTEST suicide grilled wings on the planet - we love \'em! Service is great and the "regulars" are pretty friendly too. Drink prices are good and there are plenty of TV\'s. One of the better "dive bars" in the area.']
[('rory', 0.5823439359664917), ('carin', 0.5686616897583008), ('biotch', 0.5440913438796997), ('seit', 0.5418999195098877), ('uwe', 0.5400148034095764), ('reaffirms', 0.5130881071090698), ('accoustic', 0.512511134147644), ('gaurd', 0.5113624930381775), ('wrist', 0.5096732974052429), ('self-guided', 0.4946335256099701)]
In [ ]:
In [ ]:
Content source: erccarls/vectorsearch
Similar notebooks: