In [1]:
import pandas as pd
import pickle
import numpy as np

# Load the bar review dataset 
review = pd.read_pickle('../output/bar_reviews_cleaned_and_tokenized_SF.pickle')
print review.head(2)
print review.tail(2)


               business_id        date               review_id stars  \
10  UsFtqoBl7naz8AVUBZMjQQ  2013-11-08  Di3exaUCFNw1V4kSNW5pgA     5   
11  UsFtqoBl7naz8AVUBZMjQQ  2014-03-29  0Lua2-PbqEQMjD9r89-asw     3   

                                                 text    type  \
10  All the food is great here. But the best thing...  review   
11  We checked this place out this past Monday for...  review   

                   user_id  votes_cool  votes_funny  votes_useful  \
10  uK8tzraOp4M5u3uYrqIBXg         0.0          0.0           0.0   
11  I_47G-R2_egp7ME5u_ltew         0.0          0.0           0.0   

                                    cleaned_tokenized  
10  [[food, great], [best, thing, wing], [wing, si...  
11  [[checked, place, past, monday, wing, night], ...  
                                    business_id date  \
84215  8781c06a4e2407f5e027cd503f4aab675e76615b  NaN   
84216  8781c06a4e2407f5e027cd503f4aab675e76615b  NaN   

                                  review_id stars  \
84215  0e446098-6893-4315-9ed8-243c1926dae6   4.0   
84216  a5eb8ce2-2f30-4f4b-885b-5d163e606629   5.0   

                                                    text type  \
84215  Buffalo wings w/ hotter sauce - just the right...  NaN   
84216  It's thinly sliced steak covered with cheese o...  NaN   

                                    user_id  votes_cool  votes_funny  \
84215  d1d2fa20-3413-41ee-adc2-b58bc9b160e8         NaN          NaN   
84216  4585e5c9-f4b7-4bdb-94d4-39dc8e124db6         NaN          NaN   

       votes_useful                                  cleaned_tokenized  
84215           NaN  [[buffalo, wing, w, hotter, sauce, -, right, a...  
84216           NaN  [[thinly, sliced, steak, covered, cheese, warm...  

In [2]:
# Load the training users
user_set_training = pickle.load(open('../output/training_users.pickle', 'rb'))
# Make the active review set training only 
review = review[review.user_id.isin(user_set_training)]

In [3]:
from collections import OrderedDict
from itertools import chain


# n_reviews = 100 # all of them... 
# Flatten the reviews, so each review is just a single list of words.
reviews_merged_bus = OrderedDict()
business_set = list(set(review.business_id.values[:]))
for i_bus, bus_id in enumerate(business_set):
    if ((i_bus%5)==0):
        print '\r Fraction Processed',float(i_bus+1)/len(business_set),
    # This horrible line first collapses each review of a corresponding business into a list
    # of lists, and then collapses the list of sentences to a long list of words
    reviews_merged_bus[bus_id] = list(chain.from_iterable(chain.from_iterable( 
                                     review.cleaned_tokenized[review.business_id==bus_id] )) )
docs_bus = reviews_merged_bus.values()
print 
print len(docs_bus)


 Fraction Processed 0.99928762244
5615

In [4]:
import gensim
from itertools import chain
import sys
sys.path.append('../vectorsearch/')
import nltk_helper
import doc2vec
from gensim.models.doc2vec import TaggedDocument
import pandas as pd
n_epochs = 10
n_docs = 10 # -1 for almost all of them...

# Generate the tagged document list. 

docs = [TaggedDocument(words, [business_set[index],])
                             for index, words in enumerate(docs_bus[:])]

print '\nFirst Doc: \n-----------------\n', docs[0]


path /data/insight_yelp/input/

First Doc: 
-----------------
TaggedDocument(['crooning', 'gay', 'men', u'friend', 'tear', 'joint', 'regularly', 'recognized', u'medium', '-', 'straight', 'gay', 'best', 'karaoke', 'bar', 'valley', 'coming', 'ever', 'since', 'used', 'hop', 'fence', 'old', 'apartment', 'complex', u'block', 'away', 'tell', 'place', 'cant', 'beat', 'fun', 'value', 'dept', 'mean', 'else', 'get', '4', 'absolut', u'tonic', '-', 'plus', 'free', 'oftentimes', 'campy', 'klassy', 'entertainment', 'crowd', 'singing', u'skill', 'singing', u'taste', 'run', 'gamut', 'thats', 'part', 'appeal', 'crowd', 'mostly', 'gay', 'men', u'30', u'40', u'go', 'older', 'younger', u'lesbian', 'also', 'hold', 'court', 'well', 'mean', 'else', u'honor', 'sing', 'pat', 'benatar', 'melissa', 'many', 'gay', 'men', 'bring', 'straight', u'girlfriend', 'most', 'likely', 'someone', 'sing', 'grease', 'duet', 'believe', 'not', 'handful', 'straight', 'men', 'dragged', 'not', 'fear', 'crowd', 'very', 'friendly', 'welcoming', u'bartender', u'owner', 'take', 'care', 'people', 'kind', 'music', 'expect', 'people', 'completely', 'hammered', 'singing', 'public', 'very', 'first', 'time', 'others', 'serious', 'seasoned', '-', 'classical', 'training', 'via', 'phoenix', u'men', 'choir', u'guy', 'would', 'make', 'american', 'idol', 'drool', 'desire', 'song', 'list', 'large', 'respectable', 'think', 'george', 'dragon', 'better', 'list', '-', 'see', 'review', u'song', 'run', 'spectrum', 'liza', 'cabaret', 'tim', 'mcgraw', 'hip-hop', 'rap', u'80', 'alt', 'rock', 'u2', 'acdc', 'elton', 'john', 'stevie', 'wonder', 'frank', 'sinatra', 'place', 'totally', 'small', 'divey', '-and', 'bit', 'weathered', 'looking', 'speed', 'past', '7th', 'st', 'useless', 'trivia', 'alert', 'phoenix', 'architectural', 'history', u'buff', 'bar', 'identical', 'chez', 'nous', 'without', u'booth', 'not', 'dark', u'building', 'built', 'man', 'karaoke', u'run', 'thursday', 'night', 'saturday', 'night', 'other', 'themed', u'evening', 'throughout', 'week', 'incl', 'not', 'making', 'greek', 'god', 'revue', 'stripper', 'show', 'hosted', 'catty', 'drag', u'queen', 'sunday', 'night', u'gay-men', u'their-friends', u'the-media', u'the-valley', u'the-fence', u'this-place', u'the-fun', u'the-crowd', u'the-gamut', u'the-appeal', u'the-crowd', u'their-30s', u'the-honors', u'straight-men', u'the-crowd', u'the-bartenders', u'the-people', u'what-kind', u'some-people', u'the-very-first-time', u'the-phoenix', u'these-guys', u'the-song-list', u'my-review', u'the-songs', u'the-spectrum', u'hip-hop-rap', u'this-place', u'this-bar', u'the-booths', u'the-same-man', u'thursday-night', u'saturday-night', u'the-week', u'sunday-night', 'went', 'last', 'night', 'karaoke', 'flamin', 'steve', 'jenni', 'work', 'blast', 'gay', 'bar', 'bunch', u'u', 'great', 'selection', u'song', 'guy', u'run', 'running', 'karaoke', '15', u'year', 'nicely', 'decorated', 'guess', 'used', 'real', 'dive', u'goddess', 'room', 'clean', 'no', 'water', 'floor', 'gotta', 'say', 'though', 'gay', 'men', 'sing', u'song', 'werent', 'danzig', 'acdc', 'right', 'said', 'fred', u'rendition', 'would', 'fallen', 'asleep', 'staff', 'awesome', 'everyone', 'crowd', 'really', 'friendly', 'definitely', 'go', 'back', u'a-bunch', u'15-years', u'a-real-dive', u'the-floor', u'gay-men', u'the-staff', u'the-crowd', u'here', 'place', 'go', 'everybody', u'know', 'name', 'most', u'patron', 'happen', 'gay', 'love', 'coming', u'gay', 'hanging', u'drink', 'neighborhood', 'bar', 'crowd', 'supa', 'mixed', 'mean', 'else', 'go', 'hardened', 'biker', 'boy', 'compliment', 'nail', 'polish', 'choice', 'awesome', 'ive', 'also', 'threatened', 'stealing', u'shoe', 'well', 'asked', 'one', 'buy', 'lipstick', 'drag', 'queen', u'bartender', 'courteous', 'super', 'funny', 'fave', 'bartender', 'michael', u'he', 'cutie', 'great', 'pour', 'whether', 'youre', u'short', 't-shirt', 'dressed', u'nine', 'stop', 'youll', 'find', 'corner', 'fit', u'a-place', u'your-name', u'the-patrons', u'my-gays', u'some-drinks', u'a-neighborhood-bar', u'the-crowd', u'my-shoes', u'the-bartenders', u'my-fave-bartender', u'a-cutie', u'a-great-pour', u'a-t-shirt', u'the-nines', u'a-corner', 'went', 'drag', 'king', 'contest', u'woman', 'very', 'over', 'weight', 'not', 'very', 'entertaining', 'bar', 'size', 'shack', 'definetly', 'older', 'male', 'crowd', 'lesbian', 'between', u'age', '21-35', 'not', 'spot', u'all-the-women', u'the-bar', u'the-size', u'a-shack', u'the-ages', u'your-spot', 'im', 'giving', u'apollo', 'four', u'star', 'lot', u'thing', 'look', 'bar', 'clean', 'well-maintained', 'including', u'goddess', 'restroom', 'also', 'friendly', 'staff', 'diverse', 'group', u'patron', 'great', 'outdoor', 'patio', 'featuring', 'roman', u'column', 'perhaps', 'most', 'importantly', u'feature', 'karaoke', 'thursday', '-', 'saturday', u'night', 'husband', 'went', 'friday', 'night', 'karaoke', 'stevey', 'p', 'kristin', u'so', 'great', 'time', u'singer', 'talent', 'definitely', 'better', 'average', 'karaoke', 'dive', 'bar', 'song', 'choice', 'good', 'avid', 'karaoke', 'singer', 'didnt', 'like', 'fact', 'no', 'separate', 'stage', 'area', 'carved', 'current', 'singer', 'stand', 'stand', u'chair', 'between', u'table', 'strain', u'neck', 'look', u'lyric', 'karaoke', 'also', 'wasnt', 'central', 'focus', 'place', 'much', 'very', 'loud', 'talking', 'socializing', 'going', 'spite', 'singing', 'im', 'not', 'going', 'penalize', u'apollo', 'much', 'though', 'know', 'serious', 'karaoke', u'freak', 'like', 'want', 'focus', 'one', 'other', 'complaint', '-', u'u', 'group', 'also', 'little', 'disappointed', 'weak', u'drink', '-', 'dont', 'know', 'crowd', 'control', 'final', 'tab', '70', 'expected', 'feeling', 'little', 'loopy', 'most', 'night', 'speaking', 'feeling', 'loopy', 'appreciative', 'helpful', u'bartender', 'wanted', 'call', 'cab', 'one', 'came', 'fairly', 'quickly', 'after', 'called', 'one', u'u', 'apparently', 'company', 'karaoke', 'one', 'kobalt', 'another', 'gay', 'bar', 'park', 'central', 'complex', 'social', 'dynamic', 'bar', 'scene', 'pretty', 'diverse', 'crowd', 'karaoke', 'part', 'experience', 'really', 'like', 'apollo', 'however', 'ever', 'want', 'go', 'somewhere', 'little', 'low-key', 'really', 'focus', 'karaoke', 'part', 'might', 'want', 'check', 'kobalt', u'four-stars', u'a-lot', u'a-bar', u'a-diverse-group', u'my-husband', u'a-friday-night', u'kristin-s', u'a-great-time', u'the-fact', u'our-chairs', u'our-necks', u'the-lyrics', u'the-place', u'the-focus', u'the-group', u'the-night', u'the-bartenders', u'a-cab', u'the-company', u'the-one', u'another-gay-bar', u'a-pretty-diverse-crowd', u'the-experience', u'the-karaoke-part', 'spilled', 'drink', 'over', 'significant', 'other', 'table', 'others', 'party', 'christine', 'kristin', 'etc', 'spilled', 'kamikaze', u'shot', 'over', u'table', 'karaoke', u'book', 'still', 'not', 'get', 'kicked', 'dumped', 'said', 'score', 'reason', 'updating', 'bar', u'owner', 'ron', 'lee', 'asked', 'update', 'since', 'not', 'divey', 'anymore', 'arent', 'classroom', 'carpet', 'gave', 'way', 'faux-finish', 'wood', u'floor', 'new', 'plasma', u'screen', 'replaced', 'old', u'tv', 'old', u'furnishing', 'gave', 'way', 'new', u'table', 'barstools', 'wrought', 'iron', u'back', 'og', 'swivel', 'naugahyde', u'chair', 'still', 'lurk', 'bar', 'thanks', u'guy', 'original', 'review', 'almost', 'year', 'half', 'ago', 'interior', 'remodel', 'new', 'outdoor', 'patio', 'response', '2006', 'smoking', u'bar', 'ban', u'my-drink', u'the-tables', u'the-reason', u'the-bar-owners', u'new-tables', u'the-bar', u'my-original-review', u'a-response', 'went', u'friend', 'birthday', 'gathering', 'co-worker', 'hadnt', u'apollo', u'year', 'wasnt', 'really', 'looking', 'forward', 'since', 'last', 'time', 'bunch', 'old', u'guy', 'say', 'wasnt', 'bad', 'experience', 'place', 'definitely', 'changed', 'way', 'bright', 'crowd', 'mixed', 'old', u'v', 'younger', 'although', 'not', 'young', u'bartender', 'pleasant', 'one', 'bartender', 'little', 'overboard', 'talking', 'bad', 'other', u'bar', 'kind', u'turn', 'still', 'friendly', 'overall', 'wont', 'hangout', 'fun', 'night', 'long', 'youre', 'group', 'know', u'a-few-friends', u'a-birthday-gathering', u'a-co-worker', u'the-last-time', u'a-bunch', u'old-guys', u'a-bad-experience', u'the-place', u'its-way', u'the-crowd', u'the-bartenders', u'one-bartender', u'other-bars', u'which-kind', u'my-hangout', u'the-night', u'a-group', 'went', u'apollo', 'saturday', 'night', 'group', 'included', 'kristin', 'christine', 'significant', 'others', 'apparently', u'weekend', 'karaoke', 'let', 'tell', 'not', 'average', 'karaoke', 'experience', 'mean', 'karaoke', '34', 'people', 'sing', 'actually', 'decent', 'really', 'good', u'voice', 'geez', 'horrible', 'screeching', u'rendition', 'total', 'eclipse', 'heart', 'oh', 'right', 'husband', 'sang', 'awful', 'funny', 'way', 'go', 'participate', 'karaoke', 'plenty', 'tv', u'screen', u'word', 'really', 'liked', u'tv', 'reserved', 'show', 'football', 'sportscenter', u'apollo', 'moderately', 'sized', 'establishment', 'outdoor', 'patio', 'saw', u'lady', 'bathroom', 'cleverly', 'marked', u'goddess', 'sign', 'tiny', 'however', 'never', 'line', 'most', 'crowd', 'male', 'persuasion', 'husband', 'beer', 'cant', 'comment', u'price', 'mixed', u'drink', u'shot', 'beer', 'cheaper', 'most', u'place', 'valley', 'bud', 'light', u'draft', '275', 'fat', 'tire', u'draft', '3', 'since', 'designated', 'driver', 'evening', 'switched', 'soda', 'halfway', 'night', '2', 'included', 'free', u'refill', u'bartender', 'extremely', 'friendly', 'very', 'prompt', 'service', 'also', 'waiter', u'float', 'throughout', 'space', 'dont', 'want', 'go', 'bar', 'crowd', 'flowed', 'throughout', 'evening', 'never', 'got', 'full', 'people', 'getting', 'knocked', 'impossible', 'move', 'very', 'laid', 'back', 'place', 'hang', 'chat', 'enjoy', 'singing', 'along', 'sweet', 'caroline', u'a-group', u'the-weekends', u'the-people', u'total-eclipse', u'the-heart', u'my-husband', u'a-funny-way', u'tv-screens', u'the-words', u'a-few-tvs', u'an-outdoor-patio', u'the-ladies-bathroom', u'a-line', u'the-crowd', u'the-male-persuasion', u'my-husband', u'the-prices', u'mixed-drinks', u'most-places', u'the-valley', u'light-drafts', u'the-designated-driver', u'the-night', u'free-refills', u'the-bartenders', u'a-waiter', u'the-space', u'the-bar', u'the-crowd', u'the-evening', u'sweet-caroline', 'crowd', u'apollo', 'always', 'seems', 'pretty', 'diverse', 'stopped', 'second', 'third', 'time', 'friend', 'saturday', 'night', 'virtually', 'every', 'group', 'seemed', 'represented', u'apollo', 'charming', 'sense', 'completely', u'lack', 'pretentiousness', 'hip', u'bar', 'scottsdale', 'phoenix', 'karaoke', 'night', 'always', 'great', 'idea', 'dude', 'singing', 'creed', 'weird', 'right', u'drink', 'nice', 'stiff', 'even', u'apollo', 'isnt', 'weekly', 'hangout', 'list', 'definitely', 'like', 'overall', 'vibe', 'think', 'also', 'one', 'oldest', 'gay', u'bar', 'phoenix', u'the-crowd', u'a-friend', u'a-saturday-night', u'the-sense', u'the-pretentiousness', u'karaoke-night', u'a-great-idea', u'some-dude', u'the-overall-vibe', 'ahhh', u'apollo', 'little', 'gay', 'bay', 'street', u'apollo', 'evolved', 'over', u'year', 'ive', 'good', u'time', 'makeover', 'almost', 'complete', u'bathroom', 'renovated', 'soon', 'hope', 'might', 'going', 'back', 'often', 'surly', u'bartender', 'long', 'replaced', 'huge', 'patio', 'constructed', 'although', 'no', 'longer', 'smoke', 'enjoyable', 'area', 'although', 'not', 'first', 'choice', u'bar', 'stop', 'time', 'time', 'enjoy', 'well', 'made', 'read', 'strong', 'cocktail', 'catch', u'bud', 'best', u'night', 'opinion', 'karaoke', 'nites', 'place', 'normally', u'pack', 'very', 'diverse', 'crowd', 'aside', u'issue', u'men', 'bathroom', u'padlock', 'worse', 'overly', 'bright', 'lighting', 'bar', 'getting', 'better', 'after', 'chatting', 'owner', 'great', 'know', 'touch', u'patron', 'quickly', 'address', u'issue', 'im', 'really', 'looking', 'forward', 'completion', u'renovation', u'the-years', u'some-good-times', u'the-best-nights', u'my-opinion', u'the-place', u'a-very-diverse-crowd', u'my-issues', u'the-mens-bathroom', u'this-bar', u'the-owner', u'their-patrons', u'any-issues', u'the-completion', u'the-renovations', 'went', 'last', 'night', 'cheap', u'drink', 'friendly', 'staff', 'return', 'headed', 'over', u'apollo', 'last', 'friday', 'night', 'possibly', 'drink', 'enough', 'sing', 'karaoke', 'started', 'night', 'saying', 'absolutely', 'would', 'not', 'singing', 'going', 'make', 'fun', 'forward', u'hour', 'meredith', u'brook', 'bitch', 'coming', 'mouth', 'blame', 'strong', u'drink', 'great', 'service', u'apollo', 'crowd', 'great', 'would', 'definitely', 'go', 'back', 'patio', 'also', 'great', 'addition', 'since', 'many', u'companion', u'smoker', 'could', 'take', u'drink', 'outside', u'some-karaoke', u'my-mouth', u'the-strong-drinks', u'great-service', u'the-crowd', u'the-patio', u'a-great-addition', u'my-companions', u'their-drinks', 'im', 'not', 'really', 'fan', 'place', 'seems', 'awfully', 'crowded', 'hard', 'navigate', 'worse', 'though', 'charge', '6', 'red', 'bull', 'no', 'liquor', 'bunkhouse', 'usually', u'charge', '2', 'back', 'patio', 'small', 'skinny', 'whereas', 'usual', 'clientele', 'not', u'a-fan', u'the-place', u'a-red-bull', u'no-liquor', u'the-back-patio', u'the-usual-clientele', 'one', 'time', u'apollo', 'gay', u'dude', 'asked', 'friend', 'real', u'woman', 'knew', 'waxed', 'mustache', 'going', 'night', 'never', 'mind', 'always', 'feel', 'like', 'diva', u'girl', u'apollo', 'handful', u'time', 'karaoke', 'im', 'always', 'drunken', 'mess', 'every', 'time', 'leave', 'well', 'certainly', 'cant', 'handle', 'liquor', u'drink', 'poured', 'well', 'cheap', 'cheap', u'drink', 'first', 'time', 'think', 'pink', 'panther', u'martini', '4', 'great', 'place', 'gather', 'bunch', u'friend', 'karaoke', 'yes', 'gay', 'bar', 'coolest', 'one', 'ive', 'always', 'met', 'great', u'guy', 'never', 'interested', 'always', 'nice', 'willing', 'sing', 'duet', 'karaoke', 'host', 'always', 'fun', 'not', 'mic', 'hog', 'like', u'place', 'complaint', 'gay', u'friend', 'like', 'sing', u'lot', 'show', u'tune', 'boring', 'definitely', 'great', u'performer', 'most', 'good', 'time', 'dont', 'feel', 'like', 'need', 'belt', 'like', 'whitney', 'houston', 'come', 'favorite', u'thing', u'apollo', '1', 'free', 'fresh', 'popcorn', 'best', 'thing', 'munch', 'after', 'youve', 'lot', 'drink', '2', u'lady', 'room', 'never', 'line', 'mainly', 'gay', u'dude', 'frequent', 'place', 'love', u'apollo', u'my-friend', u'real-women', u'my-mustache', u'a-diva', u'the-girls', u'a-handful', u'a-drunken-mess', u'my-liquor', u'the-drinks', u'a-bunch', u'great-guys', u'a-duet', u'the-karaoke-host', u'some-places', u'my-only-complaint', u'my-gay-friends', u'show-tunes', u'a-good-time', u'whitney-houston', u'my-favorite-things', u'best-thing', u'a-lot', u'the-ladies-room', u'a-line', 'much', 'better', u'experience', 'experienced', 'initial', 'review', u'price', 'line', 'other', u'bar', 'like', 'phoenix', u'owner', 'very', 'supportive', 'community', 'count', 'something', 'may', 'cranky', 'mood', 'posted', 'initial', 'review', 'nice', 'patio', 'good', 'seating', 'shade', u'my-initial-review', u'the-prices', u'other-bars', u'the-owners', u'the-community', u'my-initial-review', u'good-seating', 'youre', 'going', 'go', u'apollo', 'karaoke', 'probably', 'start', 'practicing', 'fake', 'im', u'writ', 'routine', 'youre', 'going', 'need', 'think', u'le', 'hip-hop', 'along', u'line', 'show', u'tune', 'old', u'ballad', 'shit', 'around', 'born', 'dont', 'get', 'wrong', 'though', 'dig', 'free', 'popcorn', 'free', u'condom', 'pours', 'strong', 'enough', 'get', 'date', 'put', 'im', 'guessing', 'lot', 'putting', 'gone', u'men', 'bathroom', 'door', 'permanently', 'propped', 'open', u'the-lines', u'show-tunes', u'your-date', u'a-lot', u'the-mens-bathroom-door', 'breeder', 'favorite', 'gay', 'bar', u'bartender', 'hilarious', u'drink', 'cheap', u'wall', 'salmon', 'scene', 'usually', 'mix', 'gay', 'men', 'pretty', u'girl', 'win', 'like', 'sing', 'karaoke', 'place', 'sing', 'prince', 'mostly', 'awful', 'falsetto', 'still', 'get', 'cheered', u'my-favorite-gay-bar', u'the-bartenders', u'the-drinks', u'the-walls', u'the-scene', u'a-mix', u'gay-men', u'pretty-girls', u'a-place', 'never', u'year', 'back', 'treated', 'horrible', 'service', 'approached', 'owner', 'treated', u'u', 'like', 'crap', 'boycotted', 'bar', '2', u'year', 'tried', 'prove', 'a-holes', 'plenty', 'friendly', u'bar', 'appreciate', 'business', u'horrible-service', u'the-owner', u'this-bar', u'2-years', u'the-business', 'im', 'feeling', 'low', 'looking', 'great', 'atmosphere', 'cheer', 'go', 'great', u'drink', 'staff', 'very', 'nice', 'throw', 'little', 'karaoke', 'fun', 'most', 'definetly', 'place', u'a-great-atmosphere', u'the-staff', u'the-place', 'very', 'upset', 'place', 'im', 'outside', 'beer', 'garden', 'cigarette', 'owner', 'came', 'said', 'smell', 'pot', 'person', 'told', 'medical', 'patient', 'medicated', 'got', 'bar', 'told', 'not', 'smoke', 'parking', 'lot', 'told', 'havent', 'know', u'law', 'accused', 'smoking', 'parking', 'lot', 'really', 'feel', 'discriminated', 'not', 'come', 'back', 'even', 'u', 'respect', u'rule', 'feel', 'discriminate', 'u', 'call', 'u', 'front', 'everyone', 'definitely', 'not', 'coming', 'back', u'this-place', u'the-beer-garden', u'a-cigarette', u'the-owner', u'my-person', u'the-bar', u'the-parking-lot', u'the-laws', u'the-parking-lot', u'their-rules', u'apollo', 'fun', 'experience', 'checked', 'bank', 'account', 'find', 'erroneously', 'overcharged', 'over', '3500', 'called', 'tried', 'tell', 'debit', 'hold', 'tip', 'not', 'week', 'later', 'charge', 'went', 'said', 'would', 'someone', 'call', 'back', 'not', 'got', 'run', 'around', 'gave', 'very', 'generous', 'tip', 'expense', 'fun', 'check', 'bank', 'account', 'dont', 'play', u'rule', u'a-fun-experience', u'my-bank-account', u'the-charge', u'the-run', u'a-very-generous-tip', u'my-expense', u'your-bank-account', u'the-rules', 'fun'], [u'lLI8ObL8aCVbkrrtAW0EHw'])

In [10]:
from gensim.models import doc2vec

model = doc2vec.Doc2Vec(min_count=4, window=5, size=200, sample=1e-4, negative=10, workers=12)
# Build the vocab from list of sentences.
model.build_vocab(docs)

In [11]:
from random import shuffle

for epoch in range(10):
    print '\rTraining Epoch %i, alpha %1.4f'%(epoch+1, model.alpha),
    #model.train(np.random.permutation(docs))
    shuffle(docs)
    model.train(docs)
    model.alpha -= 0.001 # decrease the learning rate
    model.min_alpha = model.alpha # fix the learning rate, no decay

#model.init_sims(replace=True)    
# # Normalize the word vectors.
# vec_norms = np.sqrt(np.sum(model.syn0**2, axis=1))
# model.syn0 = (model.syn0/vec_norms[:, numpy.newaxis])
# # Normalize the doc vectors.
# vec_norms = np.sqrt(np.sum(model.docvecs.doctag_syn0**2, axis=1))
# model.docvecs.doctag_syn0 = (model.docvecs.doctag_syn0/vec_norms[:, numpy.newaxis])

model.save('../output/doc2vec_bars_200_neg_10_win_5.model')


Training Epoch 10, alpha 0.0160

In [13]:
# Can find similar documents..
print model.docvecs.most_similar(positive=['KUinHkKyGhznElgIzx0yIw']), '\n'

# Can find similar words...Re: Dream Companies and contact from recruiters
print model.most_similar(positive=['beer']), '\n'

# Can find documents that are most similar to keywords.... 
print model.docvecs.most_similar(positive=[model['beer'], model['music']]), '\n'

# Can find words that are most common in documents
print review.text[review.review_id=='KUinHkKyGhznElgIzx0yIw'].values
print model.most_similar(positive=[model.docvecs['KUinHkKyGhznElgIzx0yIw']]), '\n'


[(u'yyUJKvG-C4VipITrAS0nIQ', 0.6710823774337769), (u'g3fipTPN2LBe_U42niTDcw', 0.6688743829727173), (u'ANaGwB8tVAc1qM1QAJecsQ', 0.6611914038658142), (u'LVjRN5pMJ8hhDmX0lbclpQ', 0.6471917629241943), (u'YP-sxa8i95v_scvXN2o4_w', 0.6342615485191345), (u'XkyZAQAaGO9i3on-b3fswg', 0.6326159834861755), (u'qdTtkZVgcdu3SEA6tzBPdw', 0.6320397853851318), (u'L7eGNKkuy_XdQ_35Y1Kacg', 0.629540741443634), (u'cd46siFt_-08j9-kSbVEgA', 0.6246991157531738), (u'3mp5jXdxC2yqSK6sgRQfEg', 0.6225901246070862)] 

[('draft', 0.7733069062232971), (u'wine', 0.7592223882675171), ('tap', 0.7554447054862976), ('micro-brews', 0.7395221590995789), ('brew', 0.7344582080841064), (u'cocktail', 0.7317838072776794), ('draught', 0.7313833236694336), ('selection', 0.723392128944397), ('bottled', 0.71184903383255), ('whisky', 0.6925665140151978)] 

[(u'W3SROyBvrFKT5C2ySdx1qw', 0.44849711656570435), (u'6w6gMZ3iBLGcUM4RBIuifQ', 0.43213915824890137), (u'KCP4tSmVRD6Gk3xbPEAf3w', 0.3971535861492157), (u'3WsATGkAIXV-56eWjdzecw', 0.3879733681678772), (u'3jzEz2q9HZYF2XU1Gm41nA', 0.3710145354270935), (u'JS0gYaJR5HZDhZG0TJRRGg', 0.3570679724216461), (u'QSmI5Y9bhCLIw9YYKOiQkg', 0.34995037317276), (u'Bxn0LTYR9BxEeXReFbXDJA', 0.34519267082214355), (u'Ejw0lND0g8WBQj4pCllUnQ', 0.3383505046367645), (u'EaAo1G89msEiSQLi1jX_Hw', 0.33163875341415405)] 

[ u'My brother and I make the trek from N Scottsdale to The Drummer almost every weekend.   Jesse makes the HOTTEST suicide grilled wings on the planet - we love \'em!  Service is great and the "regulars" are pretty friendly too.  Drink prices are good and there are plenty of TV\'s.  One of the better "dive bars" in the area.']
[('rory', 0.5823439359664917), ('carin', 0.5686616897583008), ('biotch', 0.5440913438796997), ('seit', 0.5418999195098877), ('uwe', 0.5400148034095764), ('reaffirms', 0.5130881071090698), ('accoustic', 0.512511134147644), ('gaurd', 0.5113624930381775), ('wrist', 0.5096732974052429), ('self-guided', 0.4946335256099701)] 


In [ ]:


In [ ]: