notebook.community



In [1]:

    
import nltk
import numpy as np
import pandas as pd
import connect_aws_db as cadb
from textblob import TextBlob as tb



In [2]:

    
import string
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer



In [3]:

    
engine = cadb.connect_aws_db(write_unicode=True)



In [10]:

    
nltk.download()









    



showing info http://www.nltk.org/nltk_data/






    Out[10]:





True

Read in BF reviews



In [4]:

    
cmd = "SELECT review_rating, review_text FROM bf_reviews"



In [5]:

    
bfdf = pd.read_sql_query(cmd, engine)



In [6]:

    
bfdf









    Out[6]:






  
    
      
      review_rating
      review_text
    
  
  
    
      0
      5
      Really nice property. Great walking areas for ...
    
    
      1
      5
      We've stayed at this La Quinta several times w...
    
    
      2
      5
      This place was awesome!! The entire staff was ...
    
    
      3
      5
      We've stayed at this hotel two times with our ...
    
    
      4
      4
      My room while a bit small was very clean and t...
    
    
      5
      3
      The big gentleman who manages the front desk i...
    
    
      6
      1
      I was driving from So Cal to Houston, TX for t...
    
    
      7
      2
      the hotel was fine but it is all hard scape ar...
    
    
      8
      4
      Hotel has a grass area and dog potty station. ...
    
    
      9
      3
      My room in general was in need of a lot of gen...
    
    
      10
      5
      Very dog friendly. Aimee at the front desk was...
    
    
      11
      3
      Hotel and room were nice and the staff were ni...
    
    
      12
      5
      Flawless
    
    
      13
      5
      For once, the pictures posted for a hotel aren...
    
    
      14
      5
      $50 a nite, but well worth it. Quiet, dog frie...
    
    
      15
      1
      I can only hope the public does a little resea...



In [7]:

    
bfreviews = ('').join(bfdf['review_text'].values)



In [8]:

    
bfreviews[:500]









    Out[8]:





u"Really nice property. Great walking areas for the pets. Centrally located. A lot of swimming pools to use. A water park for the kids. Pets are not allowed in the water park of course. The casita room was nice, but there was no table, making it hard to eat anything with the dogs there. Casita rooms are small. Overall, very nice!We've stayed at this La Quinta several times with our two mini schnauzers. The staff is all very friendly. Rooms were clean but not overly fancy - we felt like we didn't n"

Tokenize BF Data



In [ ]:

    
bftokens = nltk.word_tokenize(bfreviews)



In [29]:

    
len(bftokens)









    Out[29]:





1042



In [30]:

    
stemmer = PorterStemmer()



In [31]:

    
bfstemmed = []



In [ ]:



In [32]:

    
for item in bftokens:
    bfstemmed.append(stemmer.stem(item))



In [33]:

    
len(bfstemmed)









    Out[33]:





1042



In [34]:

    
bftext = nltk.Text(bfstemmed)



In [35]:

    
bfwords = [w.lower() for w in bftext]



In [36]:

    
bfvocab = sorted(set(bfwords))



In [37]:

    
len(bfvocab)









    Out[37]:





388

Read in Yelp reviews



In [14]:

    
cmd = "SELECT review_rating, review_text FROM yelp_reviews"



In [15]:

    
yelpdf = pd.read_sql_query(cmd, engine)



In [16]:

    
len(yelpdf)









    Out[16]:





6263



In [17]:

    
yelpreviews = ('').join(yelpdf['review_text'].values)
yelptokens = nltk.wordpunct_tokenize(yelpreviews)
yelptext = nltk.Text(yelptokens)
yelpwords = [w.lower() for w in yelptext]

Word occurrence with scikit-learn



In [27]:

    
#bfreviews



In [38]:

    
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(bfwords)
X_train_counts.shape









    Out[38]:





(1042, 372)



In [39]:

    
count_vect.vocabulary_.get(u'algorithm')



In [ ]:

TFIDF on BF Reviews Using TextBlobs



In [20]:

    
def tf(word, blob):
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob)

def idf(word, bloblist):
    return np.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)



In [22]:

    
document1 = tb(bfreviews)



In [23]:

    
document2 = tb(yelpreviews)



In [24]:

    
bloblist = [document1, document2]



In [ ]:



In [21]:

    
for i, blob in enumerate(bloblist):
    print("Top words in document {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:3]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))









    



Top words in document 1
	Word: all, TF-IDF: nan
	Word: Continental, TF-IDF: nan
	Word: manages, TF-IDF: nan
Top words in document 2






    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-21-aa629df9729f> in <module>()
      6 for i, blob in enumerate(bloblist):
      7     print("Top words in document {}".format(i + 1))
----> 8     scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
      9     sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
     10     for word, score in sorted_words[:3]:

<ipython-input-21-aa629df9729f> in <dictcomp>((word,))
      6 for i, blob in enumerate(bloblist):
      7     print("Top words in document {}".format(i + 1))
----> 8     scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
      9     sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
     10     for word, score in sorted_words[:3]:

<ipython-input-20-8de784bf7f05> in tfidf(word, blob, bloblist)
      9 
     10 def tfidf(word, blob, bloblist):
---> 11     return tf(word, blob) * idf(word, bloblist)

<ipython-input-20-8de784bf7f05> in tf(word, blob)
      1 def tf(word, blob):
----> 2     return blob.words.count(word) / len(blob.words)
      3 
      4 def n_containing(word, bloblist):
      5     return sum(1 for blob in bloblist if word in blob)

/Applications/anaconda/lib/python2.7/site-packages/textblob/blob.pyc in count(self, strg, case_sensitive, *args, **kwargs)
    234         """
    235         if not case_sensitive:
--> 236             return [word.lower() for word in self].count(strg.lower(), *args,
    237                     **kwargs)
    238         return self._collection.count(strg, *args, **kwargs)

KeyboardInterrupt:



In [ ]:

TFIDF on BF Reviews Using SciKitLearn

The number of words in the bringfido sample corpus:



In [37]:

    
len(bfwords)









    Out[37]:





983



In [54]:

    
count_vect = CountVectorizer()
count_vect.fit_transform(bfwords)









    Out[54]:





<983x379 sparse matrix of type '<type 'numpy.int64'>'
	with 898 stored elements in Compressed Sparse Row format>



In [55]:

    
bf_train_counts[0]









    Out[55]:





<1x379 sparse matrix of type '<type 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>



In [56]:

    
print('vocabulary: {}'.format(count_vect.vocabulary))









    



vocabulary: None



In [ ]:



In [40]:

    
tfidf_transformer = TfidfTransformer()
bf_train_tfidf = tfidf_transformer.fit_transform(bf_train_counts)
bf_train_tfidf.shape









    Out[40]:





(983, 379)



In [42]:

    
tf = TfidfVectorizer(analyzer='word', stop_words='english')



In [43]:

    
bftfidf_matrix = tf.fit_transform(bfwords)



In [44]:

    
bf_feature_names = tf.get_feature_names()



In [48]:

    
#bf_feature_names[0:50]



In [50]:

    
bfdense = bftfidf_matrix.todense()



In [ ]:

TFIDF on Yelp Reviews

First I want to import all of the Yelp review text, and perform a TFIDF on those reviews:



In [18]:



In [21]:

    
yelp_count_vect = CountVectorizer()
yelp_train_counts = yelp_count_vect.fit_transform(yelpwords)
yelp_train_counts.shape









    Out[21]:





(1244957, 22909)



In [22]:

    
yelp_tfidf_transformer = TfidfTransformer()
yelp_train_tfidf = yelp_tfidf_transformer.fit_transform(yelp_train_counts)
yelp_train_tfidf.shape









    Out[22]:





(1244957, 22909)



In [23]:

    
#from sklearn.metrics import jaccard_similarity_score



In [66]:

    
#jaccard_similarity_score(bf_train_tfidf, yelp_train_tfidf)

Examples



In [22]:

    
path = './tf-idf'
token_dict = {}


def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(PorterStemmer().stem(item))
    return stems



In [24]:

    
categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']

from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train',
    categories=categories, shuffle=True, random_state=42)









    



WARNING:sklearn.datasets.twenty_newsgroups:Downloading dataset from http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz (14 MB)



In [ ]:

    
for dirpath, dirs, files in os.walk(path):
    for f in files:
        fname = os.path.join(dirpath, f)
        print "fname=", fname
        with open(fname) as pearl:
            text = pearl.read()
            token_dict[f] = text.lower().translate(None, string.punctuation)

tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(token_dict.values())

str = 'all great and precious things are lonely.'
response = tfidf.transform([str])
print response

feature_names = tfidf.get_feature_names()
for col in response.nonzero()[1]:
    print feature_names[col], ' - ', response[0, col]



In [43]:

    
#twenty_train.data



In [ ]:

    
yelp_count_vect = CountVectorizer()
yelp_train_counts = yelp_count_vect.fit_transform(twenty_train.data)
X_train_counts.shape



In [26]:

    
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape









    Out[26]:





(2257, 35788)



In [28]:

    
X_train_counts[0:3, 0:3]









    Out[28]:





<3x3 sparse matrix of type '<type 'numpy.int64'>'
	with 0 stored elements in Compressed Sparse Row format>

	review_rating	review_text
0	5	Really nice property. Great walking areas for ...
1	5	We've stayed at this La Quinta several times w...
2	5	This place was awesome!! The entire staff was ...
3	5	We've stayed at this hotel two times with our ...
4	4	My room while a bit small was very clean and t...
5	3	The big gentleman who manages the front desk i...
6	1	I was driving from So Cal to Houston, TX for t...
7	2	the hotel was fine but it is all hard scape ar...
8	4	Hotel has a grass area and dog potty station. ...
9	3	My room in general was in need of a lot of gen...
10	5	Very dog friendly. Aimee at the front desk was...
11	3	Hotel and room were nice and the staff were ni...
12	5	Flawless
13	5	For once, the pictures posted for a hotel aren...
14	5	$50 a nite, but well worth it. Quiet, dog frie...
15	1	I can only hope the public does a little resea...