In [ ]:
import nltk
nltk.download()
In [ ]:
def get_words(url):
    import requests
    words = requests.get(url).content.decode('latin-1')
    word_list = words.split('\n')
    index = 0
    while index < len(word_list):
        word = word_list[index]
        if ';' in word or not word:
            word_list.pop(index)
        else:
            index += 1
    return word_list
#Get lists of positive and negative words
p_url = 'http://ptrckprry.com/course/ssd/data/positive-words.txt'
n_url = 'http://ptrckprry.com/course/ssd/data/negative-words.txt'
positive_words = get_words(p_url)
negative_words = get_words(n_url)
In [ ]:
with open('data/community.txt','r') as f:
    community = f.read()
with open('data/le_monde.txt','r') as f:
    le_monde = f.read()
In [ ]:
from nltk import word_tokenize
cpos = cneg = lpos = lneg = 0
community_tokens = word_tokenize(community)
le_monde_tokens = word_tokenize(le_monde)
for word in community_tokens:
    if word in positive_words:
        cpos += 1
    if word in negative_words:
        cneg += 1
for word in le_monde_tokens:
    if word in positive_words:
        lpos += 1
    if word in negative_words:
        lneg += 1
print("community {0:1.2f}%\t {1:1.2f}%\t {2:1.2f}%".format(cpos/len(community_tokens)*100,
                                                           cneg/len(community_tokens)*100,
                                                           (cpos-cneg)/len(community_tokens)*100))
print("le monde {0:1.2f}%\t {1:1.2f}%\t {2:1.2f}%".format(lpos/len(le_monde_tokens)*100,
                                                          lneg/len(le_monde_tokens)*100,
                                                          (lpos-lneg)/len(le_monde_tokens)*100))
In [ ]:
nrc = "data/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt"
count=0
emotion_dict=dict()
with open(nrc,'r') as f:
all_lines = list()
for line in f:
if count < 46:
count+=1
continue
line = line.strip().split('\t')
if int(line[2]) == 1:
if emotion_dict.get(line[0]):
emotion_dict[line[0]].append(line[1])
else:
emotion_dict[line[0]] = [line[1]]
In [ ]:
def get_nrc_data():
    nrc = "data/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt"
    count = 0
    emotion_dict = dict()
    with open(nrc,'r') as f:
        for line in f:
            if count < 46: #skip the header lines of the lexicon file
                count += 1
                continue
            line = line.strip().split('\t')
            if int(line[2]) == 1:
                if emotion_dict.get(line[0]):
                    emotion_dict[line[0]].append(line[1])
                else:
                    emotion_dict[line[0]] = [line[1]]
    return emotion_dict
In [ ]:
emotion_dict = get_nrc_data()
emotion_dict['abandoned']
In [ ]:
CONSUMER_KEY = ""
CONSUMER_SECRET = ""
TOKEN = ""
TOKEN_SECRET = ""
In [ ]:
with open('yelp_keys.txt','r') as f:
    count = 0
    for line in f:
        if count == 0:
            CONSUMER_KEY = line.strip()
        if count == 1:
            CONSUMER_SECRET = line.strip()
        if count == 2:
            TOKEN = line.strip()
        if count == 3:
            TOKEN_SECRET = line.strip()
        count += 1
In [ ]:
#We'll use the get_lat_lng function we wrote way back in week 3
def get_lat_lng(address):
    import requests
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address='
    url += address
    response = requests.get(url)
    if response.status_code != 200:
        return None
    data = response.json()
    if not data['status'] == 'OK':
        return None
    main_result = data['results'][0]
    geometry = main_result['geometry']
    latitude = geometry['location']['lat']
    longitude = geometry['location']['lng']
    return latitude,longitude
In [ ]:
lat,long = get_lat_lng("Columbia University")
In [ ]:
#Now set up our search parameters
def set_search_parameters(lat,long,radius):
    #See the Yelp API for more details
    params = {}
    params["term"] = "restaurant"
    params["ll"] = "{},{}".format(str(lat),str(long))
    params["radius_filter"] = str(radius) #The distance around our point in metres
    params["limit"] = "10" #Limit ourselves to 10 results
    return params
In [ ]:
set_search_parameters(lat,long,200)
In [ ]:
def get_results(params):
    import rauth
    consumer_key = CONSUMER_KEY
    consumer_secret = CONSUMER_SECRET
    token = TOKEN
    token_secret = TOKEN_SECRET
    session = rauth.OAuth1Session(
        consumer_key = consumer_key,
        consumer_secret = consumer_secret,
        access_token = token,
        access_token_secret = token_secret)
    request = session.get("http://api.yelp.com/v2/search",params=params)
    #Transforms the JSON API response into a Python dictionary
    data = request.json()
    session.close()
    return data
In [ ]:
#Get the results
lat,long = get_lat_lng("Community Food and Juice")
response = get_results(set_search_parameters(lat,long,200))
In [ ]:
all_snippets = list()
for business in response['businesses']:
    name = business['name']
    snippet = business['snippet_text']
    id = business['id']
    all_snippets.append((id,name,snippet))
all_snippets
In [ ]:
def get_snippets(response):
    all_snippets = list()
    for business in response['businesses']:
        name = business['name']
        snippet = business['snippet_text']
        id = business['id']
        all_snippets.append((id,name,snippet))
    return all_snippets
In [ ]:
def emotion_analyzer(text,emotion_dict=emotion_dict):
    #Set up the result dictionary
    emotions = {x for y in emotion_dict.values() for x in y}
    emotion_count = dict()
    for emotion in emotions:
        emotion_count[emotion] = 0
    #Analyze the text and normalize by the total number of words
    total_words = len(text.split())
    for word in text.split():
        if emotion_dict.get(word):
            for emotion in emotion_dict.get(word):
                emotion_count[emotion] += 1/total_words
    return emotion_count
In [ ]:
print("%-12s %1s\t%1s %1s %1s %1s %1s %1s %1s %1s"%(
"restaurant","fear","trust","negative","positive","joy","disgust","anticip",
"sadness","surprise"))
for snippet in all_snippets:
text = snippet[2]
result = emotion_analyzer(text)
print("%-12s %1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f"%(
snippet[1][0:10],result['fear'],result['trust'],
result['negative'],result['positive'],result['joy'],result['disgust'],
result['anticipation'],result['sadness'],result['surprise']))
In [ ]:
def comparative_emotion_analyzer(text_tuples):
    print("%-20s %1s\t%1s %1s %1s %1s %1s %1s %1s %1s"%(
        "restaurant","fear","trust","negative","positive","joy","disgust","anticip",
        "sadness","surprise"))
    for text_tuple in text_tuples:
        text = text_tuple[2]
        result = emotion_analyzer(text)
        print("%-20s %1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f"%(
            text_tuple[1][0:20],result['fear'],result['trust'],
            result['negative'],result['positive'],result['joy'],result['disgust'],
            result['anticipation'],result['sadness'],result['surprise']))
#And test it
comparative_emotion_analyzer(all_snippets)
In [ ]:
def analyze_nearby_restaurants(address,radius):
    lat,long = get_lat_lng(address)
    params = set_search_parameters(lat,long,radius)
    response = get_results(params)
    snippets = get_snippets(response)
    comparative_emotion_analyzer(snippets)
#And test it
analyze_nearby_restaurants("Community Food and Juice",200)
In [ ]:
#Test it on some other place
analyze_nearby_restaurants("221 Baker Street",200)
In [ ]:
all_snippets
In [ ]:
text = ''
for snippet in all_snippets:
    text += snippet[2]
text
In [ ]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline
wordcloud = WordCloud(stopwords=STOPWORDS,background_color='white',width=3000,height=3000).generate(text)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
In [ ]:
import nltk
from nltk.corpus import PlaintextCorpusReader
community_root = "data/community"
le_monde_root = "data/le_monde"
community_files = "community.*"
le_monde_files = "le_monde.*"
heights_root = "data/heights"
heights_files = "heights.*"
amigos_root = "data/amigos"
amigos_files = "amigos.*"
community_data = PlaintextCorpusReader(community_root,community_files)
le_monde_data = PlaintextCorpusReader(le_monde_root,le_monde_files)
heights_data = PlaintextCorpusReader(heights_root,heights_files)
amigos_data = PlaintextCorpusReader(amigos_root,amigos_files)
In [ ]:
amigos_data.fileids()
In [ ]:
amigos_data.raw()
In [ ]:
def comparative_emotion_analyzer(text_tuples,name_location=1,text_location=2):
    print("%-20s %1s\t%1s %1s %1s %1s %1s %1s %1s %1s"%(
        "restaurant","fear","trust","negative","positive","joy","disgust","anticip",
        "sadness","surprise"))
    for text_tuple in text_tuples:
        text = text_tuple[text_location]
        result = emotion_analyzer(text)
        print("%-20s %1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f"%(
            text_tuple[name_location][0:20],result['fear'],result['trust'],
            result['negative'],result['positive'],result['joy'],result['disgust'],
            result['anticipation'],result['sadness'],result['surprise']))
#And test it
comparative_emotion_analyzer(all_snippets)
In [ ]:
restaurant_data = [('community',community_data.raw()),('le monde',le_monde_data.raw()),
                   ('heights',heights_data.raw()),('amigos',amigos_data.raw())]
comparative_emotion_analyzer(restaurant_data,0,1)
token: a sequence (or group) of characters of interest; e.g., in the analysis below, a token is a word.
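As a quick illustration (on a made-up sentence, not one taken from the review data), NLTK's word_tokenize splits a string into word and punctuation tokens:
In [ ]:
from nltk import word_tokenize
#Tokenize a made-up example sentence into word and punctuation tokens
word_tokenize("The mushroom omelette was great, but the service was slow.")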
In [ ]:
#Construct tokens (words/sentences) from the text
text = le_monde_data.raw()
import nltk
from nltk import sent_tokenize,word_tokenize
sentences = nltk.Text(sent_tokenize(text))
print(len(sentences))
words = nltk.Text(word_tokenize(text))
print(len(words))
In [ ]:
num_chars=len(text)
num_words=len(word_tokenize(text))
num_sentences=len(sent_tokenize(text))
vocab = {x.lower() for x in word_tokenize(text)}
print(num_chars,int(num_chars/num_words),int(num_words/num_sentences),(len(vocab)/num_words))
In [ ]:
def get_complexity(text):
    num_chars = len(text)
    num_words = len(word_tokenize(text))
    num_sentences = len(sent_tokenize(text))
    vocab = {x.lower() for x in word_tokenize(text)}
    return len(vocab),int(num_chars/num_words),int(num_words/num_sentences),len(vocab)/num_words
In [ ]:
get_complexity(le_monde_data.raw())
In [ ]:
for text in restaurant_data:
    (vocab,word_size,sent_size,vocab_to_text) = get_complexity(text[1])
    print("{0:15s}\t{1:1.2f}\t{2:1.2f}\t{3:1.2f}\t{4:1.2f}".format(text[0],vocab,word_size,sent_size,vocab_to_text))
In [ ]:
texts = restaurant_data
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline
#Remove unwanted words
#As we look at the cloud, we can get rid of words that don't make sense by adding them to this variable
DELETE_WORDS = []
def remove_words(text_string,DELETE_WORDS=DELETE_WORDS):
    for word in DELETE_WORDS:
        text_string = text_string.replace(word,' ')
    return text_string
#Remove short words
MIN_LENGTH = 0
def remove_short_words(text_string,min_length = MIN_LENGTH):
    word_list = text_string.split()
    for word in word_list:
        if len(word) < min_length:
            text_string = text_string.replace(' '+word+' ',' ',1)
    return text_string
#Set up side by side clouds
COL_NUM = 2
ROW_NUM = 2
fig, axes = plt.subplots(ROW_NUM, COL_NUM, figsize=(12,12))
for i in range(0,len(texts)):
    text_string = remove_words(texts[i][1])
    text_string = remove_short_words(text_string)
    #ax = axes[i%2] #Use this if ROW_NUM == 1 (axes is then a 1-D array)
    ax = axes[i//2, i%2] #Use this when ROW_NUM >= 2 (axes is a 2-D array)
    ax.set_title(texts[i][0])
    wordcloud = WordCloud(stopwords=STOPWORDS,background_color='white',width=1200,height=1000,max_words=20).generate(text_string)
    ax.imshow(wordcloud)
    ax.axis('off')
plt.show()
In [ ]:
nltk.download()
In [ ]:
from nltk.book import *
In [ ]:
inaugural.fileids()
In [ ]:
inaugural.raw('1861-Lincoln.txt')
In [ ]:
texts = [('trump',inaugural.raw('2017-Trump.txt')),
('obama',inaugural.raw('2009-Obama.txt')+inaugural.raw('2013-Obama.txt')),
('jackson',inaugural.raw('1829-Jackson.txt')+inaugural.raw('1833-Jackson.txt')),
('washington',inaugural.raw('1789-Washington.txt')+inaugural.raw('1793-Washington.txt'))]
for text in texts:
    (vocab,word_size,sent_size,vocab_to_text) = get_complexity(text[1])
    print("{0:15s}\t{1:1.2f}\t{2:1.2f}\t{3:1.2f}\t{4:1.2f}".format(text[0],vocab,word_size,sent_size,vocab_to_text))
In [ ]:
from nltk.corpus import inaugural
sentence_lengths = list()
for fileid in inaugural.fileids():
    sentence_lengths.append(get_complexity(' '.join(inaugural.words(fileid)))[2])
plt.plot(sentence_lengths)
In [ ]:
text4.dispersion_plot(["government", "citizen", "freedom", "duties", "America",'independence','God','patriotism'])
In [ ]:
from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()
text = inaugural.raw()
striptext = text.replace('\n\n', ' ')
striptext = striptext.replace('\n', ' ')
sentences = sent_tokenize(striptext)
words = word_tokenize(striptext)
text = nltk.Text([p_stemmer.stem(i).lower() for i in words])
text.dispersion_plot(["govern", "citizen", "free", "america",'independ','god','patriot'])
In [ ]:
!pip install vaderSentiment
In [ ]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
In [ ]:
headers = ['pos','neg','neu','compound']
texts = restaurant_data
analyzer = SentimentIntensityAnalyzer()
for i in range(len(texts)):
    name = texts[i][0]
    sentences = sent_tokenize(texts[i][1])
    pos = compound = neu = neg = 0
    for sentence in sentences:
        vs = analyzer.polarity_scores(sentence)
        pos += vs['pos']/len(sentences)
        compound += vs['compound']/len(sentences)
        neu += vs['neu']/len(sentences)
        neg += vs['neg']/len(sentences)
    print(name,pos,neg,neu,compound)
In [ ]:
def vader_comparison(texts):
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    headers = ['pos','neg','neu','compound']
    print("Name\t",' pos\t','neg\t','neu\t','compound')
    analyzer = SentimentIntensityAnalyzer()
    for i in range(len(texts)):
        name = texts[i][0]
        sentences = sent_tokenize(texts[i][1])
        pos = compound = neu = neg = 0
        for sentence in sentences:
            vs = analyzer.polarity_scores(sentence)
            pos += vs['pos']/len(sentences)
            compound += vs['compound']/len(sentences)
            neu += vs['neu']/len(sentences)
            neg += vs['neg']/len(sentences)
        print('%-10s'%name,'%1.2f\t'%pos,'%1.2f\t'%neg,'%1.2f\t'%neu,'%1.2f\t'%compound)
In [ ]:
vader_comparison(restaurant_data)
In [ ]:
en = {}
try:
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = sent_detector.tokenize(community_data.raw().strip())
    for sentence in sentences:
        tokenized = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(tokenized)
        chunked = nltk.ne_chunk(tagged)
        for tree in chunked:
            if hasattr(tree, 'label'):
                ne = ' '.join(c[0] for c in tree.leaves())
                en[ne] = [tree.label(), ' '.join(c[1] for c in tree.leaves())]
except Exception as e:
    print(str(e))
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(en)
In [ ]:
meaningful_sents = list()
i = 0
for sentence in sentences:
    if 'service' in sentence:
        i += 1
        meaningful_sents.append((i,sentence))
vader_comparison(meaningful_sents)
In [ ]:
def get_affect(text,word,lower=False):
    import nltk
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    analyzer = SentimentIntensityAnalyzer()
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = sent_detector.tokenize(text.strip())
    sentence_count = 0
    running_total = 0
    for sentence in sentences:
        if lower: sentence = sentence.lower()
        if word in sentence:
            vs = analyzer.polarity_scores(sentence)
            running_total += vs['compound']
            sentence_count += 1
    if sentence_count == 0: return 0
    return running_total/sentence_count
In [ ]:
get_affect(community_data.raw(),'service',True)
In [ ]:
nltk.Text(community_data.words()).concordance('service',100)
A naive form of summarization is to identify the most frequent words in a piece of text and then score each sentence by how many of those frequent words it contains; the highest-scoring sentences become the summary.
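Before applying this step by step to the review data, here is a compact sketch of the scoring idea on a made-up three-sentence text:
In [ ]:
#A compact sketch of naive frequency-based sentence scoring on a made-up toy text;
#the step-by-step version on the real review data follows below
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
toy_text = "The pancakes were great. The coffee was great too. Parking was hard to find."
toy_frequencies = FreqDist(w.lower() for w in word_tokenize(toy_text) if w.isalpha()).most_common(5)
for sentence in sent_tokenize(toy_text):
    score = sum(freq for word,freq in toy_frequencies if word in sentence.lower())
    print(score, sentence) #sentences containing more of the frequent words score higher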
In [ ]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from collections import OrderedDict
import pprint
In [ ]:
text = community_data.raw()
summary_sentences = []
candidate_sentences = {}
candidate_sentence_counts = {}
striptext = text.replace('\n\n', ' ')
striptext = striptext.replace('\n', ' ')
In [ ]:
words = word_tokenize(striptext)
lowercase_words = [word.lower() for word in words
if word not in stopwords.words() and word.isalpha()]
In [ ]:
word_frequencies = FreqDist(lowercase_words)
most_frequent_words = FreqDist(lowercase_words).most_common(20)
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(most_frequent_words)
In [ ]:
sentences = sent_tokenize(striptext)
for sentence in sentences:
    candidate_sentences[sentence] = sentence.lower()
candidate_sentences
In [ ]:
for long, short in candidate_sentences.items():
    count = 0
    for freq_word, frequency_score in most_frequent_words:
        if freq_word in short:
            count += frequency_score
    candidate_sentence_counts[long] = count
In [ ]:
sorted_sentences = OrderedDict(sorted(
    candidate_sentence_counts.items(),
    key = lambda x: x[1], #rank sentences by their frequency score, not by their text
    reverse = True)[:4])
pp.pprint(sorted_sentences)
In [ ]:
def build_naive_summary(text):
    from nltk.tokenize import word_tokenize
    from nltk.tokenize import sent_tokenize
    from nltk.probability import FreqDist
    from nltk.corpus import stopwords
    from collections import OrderedDict
    summary_sentences = []
    candidate_sentences = {}
    candidate_sentence_counts = {}
    striptext = text.replace('\n\n', ' ')
    striptext = striptext.replace('\n', ' ')
    words = word_tokenize(striptext)
    lowercase_words = [word.lower() for word in words
                       if word not in stopwords.words() and word.isalpha()]
    word_frequencies = FreqDist(lowercase_words)
    most_frequent_words = FreqDist(lowercase_words).most_common(20)
    sentences = sent_tokenize(striptext)
    for sentence in sentences:
        candidate_sentences[sentence] = sentence.lower()
    for long, short in candidate_sentences.items():
        count = 0
        for freq_word, frequency_score in most_frequent_words:
            if freq_word in short:
                count += frequency_score
        candidate_sentence_counts[long] = count
    sorted_sentences = OrderedDict(sorted(
        candidate_sentence_counts.items(),
        key = lambda x: x[1],
        reverse = True)[:4])
    return sorted_sentences
In [ ]:
summary = '\n'.join(build_naive_summary(community_data.raw()))
print(summary)
In [ ]:
summary = '\n'.join(build_naive_summary(le_monde_data.raw()))
print(summary)
In [ ]:
build_naive_summary(inaugural.raw('1789-Washington.txt'))
In [ ]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk import sent_tokenize,word_tokenize
from nltk.book import *
In [ ]:
import nltk
from nltk.corpus import PlaintextCorpusReader
community_root = "data/community"
le_monde_root = "data/le_monde"
community_files = "community.*"
le_monde_files = "le_monde.*"
heights_root = "data/heights"
heights_files = "heights.*"
amigos_root = "data/amigos"
amigos_files = "amigos.*"
community_data = PlaintextCorpusReader(community_root,community_files)
le_monde_data = PlaintextCorpusReader(le_monde_root,le_monde_files)
heights_data = PlaintextCorpusReader(heights_root,heights_files)
amigos_data = PlaintextCorpusReader(amigos_root,amigos_files)
In [ ]:
type(community_data)
In [ ]:
text = community_data.raw()
summary_sentences = []
candidate_sentences = {}
candidate_sentence_counts = {}
striptext = text.replace('\n\n', ' ')
striptext = striptext.replace('\n', ' ')
In [ ]:
#!pip install gensim
In [ ]:
import gensim.summarization
In [ ]:
summary = gensim.summarization.summarize(striptext, word_count=100)
print(summary)
In [ ]:
print(gensim.summarization.keywords(striptext,words=10))
In [ ]:
summary = '\n'.join(build_naive_summary(community_data.raw()))
print(summary)
In [ ]:
text = le_monde_data.raw()
summary_sentences = []
candidate_sentences = {}
candidate_sentence_counts = {}
striptext = text.replace('\n\n', ' ')
striptext = striptext.replace('\n', ' ')
summary = gensim.summarization.summarize(striptext, word_count=100)
print(summary)
#print(gensim.summarization.keywords(striptext,words=10))
In [ ]:
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
import pprint
In [ ]:
text = PlaintextCorpusReader("data/","Nikon_coolpix_4300.txt").raw()
striptext = text.replace('\n\n', ' ')
striptext = striptext.replace('\n', ' ')
sentences = sent_tokenize(striptext)
#words = word_tokenize(striptext)
#tokenize each sentence into word tokens
texts = [[word for word in sentence.lower().split()
if word not in STOPWORDS and word.isalnum()]
for sentence in sentences]
len(texts)
In [ ]:
print(text)
In [ ]:
text
In [ ]:
dictionary = corpora.Dictionary(texts) #(word_id,frequency) pairs
corpus = [dictionary.doc2bow(text) for text in texts] #(word_id,freq) pairs by sentence
#print(dictionary.token2id)
#print(dictionary.keys())
#print(corpus[9])
#print(texts[9])
#print(dictionary[73])
#dictionary[4]
In [ ]:
#Set parameters
num_topics = 5 #The number of topics that should be generated
passes = 10
In [ ]:
lda = LdaModel(corpus,
               id2word=dictionary,
               num_topics=num_topics,
               passes=passes)
In [ ]:
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(lda.print_topics(num_words=3))
In [ ]:
from operator import itemgetter
lda.get_document_topics(corpus[0],minimum_probability=0.05,per_word_topics=False)
sorted(lda.get_document_topics(corpus[0],minimum_probability=0,per_word_topics=False),key=itemgetter(1),reverse=True)
In [ ]:
def draw_wordcloud(lda,topicnum,min_size=0,STOPWORDS=[]):
    word_list = []
    prob_total = 0
    for word,prob in lda.show_topic(topicnum,topn=50):
        prob_total += prob
    for word,prob in lda.show_topic(topicnum,topn=50):
        if word in STOPWORDS or len(word) < min_size:
            continue
        freq = int(prob/prob_total*1000) #scale each word's topic probability to an integer count
        word_list.extend([word]*freq)
    #Note: this import rebinds STOPWORDS to the wordcloud package's stopword list
    from wordcloud import WordCloud, STOPWORDS
    import matplotlib.pyplot as plt
    text = ' '.join(word_list)
    wordcloud = WordCloud(stopwords=STOPWORDS,background_color='white',width=3000,height=3000).generate(text)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
In [ ]:
draw_wordcloud(lda,2)
In [ ]:
REMOVE_WORDS = {'shall','generally','spirit','country','people','nation','nations','great','better'}
#Create a corpus of documents, one per inaugural address
text_list = list()
for fileid in inaugural.fileids():
    text = inaugural.words(fileid)
    doc = list()
    for word in text:
        if word in STOPWORDS or word in REMOVE_WORDS or not word.isalpha() or len(word) < 5:
            continue
        doc.append(word)
    text_list.append(doc)
#Create a word dictionary (id, word) from the inaugural documents
dictionary = corpora.Dictionary(text_list)
by_address_corpus = [dictionary.doc2bow(text) for text in text_list]
In [ ]:
lda = LdaModel(by_address_corpus,
id2word=dictionary,
num_topics=20,
passes=10)
In [ ]:
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(lda.print_topics(num_words=10))
In [ ]:
len(by_address_corpus)
In [ ]:
from operator import itemgetter
sorted(lda.get_document_topics(by_address_corpus[0],minimum_probability=0,per_word_topics=False),key=itemgetter(1),reverse=True)
In [ ]:
draw_wordcloud(lda,18)
In [ ]:
print(lda.show_topic(12,topn=5))
print(lda.show_topic(18,topn=5))
In [ ]:
doc_list = [community_data,le_monde_data,amigos_data,heights_data]
all_text = community_data.raw() + le_monde_data.raw() + amigos_data.raw() + heights_data.raw()
documents = [doc.raw() for doc in doc_list]
texts = [[word for word in document.lower().split()
if word not in STOPWORDS and word.isalnum()]
for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
In [ ]:
from gensim.similarities.docsim import Similarity
from gensim import corpora, models, similarities
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
doc = """
Many, many years ago, I used to frequent this place for their amazing french toast.
It's been a while since then and I've been hesitant to review a place I haven't been to in 7-8 years...
but I passed by French Roast and, feeling nostalgic, decided to go back.
It was a great decision.
Their Bloody Mary is fantastic and includes bacon (which was perfectly cooked!!), olives,
cucumber, and celery. The Irish coffee is also excellent, even without the cream which is what I ordered.
Great food, great drinks, a great ambiance that is casual yet familiar like a tiny little French cafe.
I highly recommend coming here, and will be back whenever I'm in the area next.
Juan, the bartender, is great!! One of the best in any brunch spot in the city, by far.
"""
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
index = similarities.MatrixSimilarity(lsi[corpus])
sims = index[vec_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
In [ ]:
sims
In [ ]:
doc="""
I went to Mexican Festival Restaurant for Cinco De Mayo because I had been there years
prior and had such a good experience. This time wasn't so good. The food was just
mediocre and it wasn't hot when it was brought to our table. They brought my friends food out
10 minutes before everyone else and it took forever to get drinks. We let it slide because the place was
packed with people and it was Cinco De Mayo. Also, the margaritas we had were slamming! Pure tequila.
But then things took a turn for the worst. As I went to get something out of my purse which was on
the back of my chair, I looked down and saw a huge water bug. I had to warn the lady next to me because
it was so close to her chair. We called the waitress over and someone came with a broom and a dustpan and
swept it away like it was an everyday experience. No one seemed fazed.
Even though our waitress was very nice, I do not think we will be returning to Mexican Festival again.
It seems the restaurant is a shadow of its former self.
"""
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
index = similarities.MatrixSimilarity(lsi[corpus])
sims = index[vec_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
sims
In [ ]: