In [1]:
import nltk
import numpy as np
import pandas as pd
import connect_aws_db as cadb
from textblob import TextBlob as tb

In [2]:
import string
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
engine = cadb.connect_aws_db(write_unicode=True)

In [10]:
nltk.download()


showing info http://www.nltk.org/nltk_data/
Out[10]:
True

Read in BF reviews


In [4]:
cmd = "SELECT review_rating, review_text FROM bf_reviews"

In [5]:
bfdf = pd.read_sql_query(cmd, engine)

In [6]:
bfdf


Out[6]:
review_rating review_text
0 5 Really nice property. Great walking areas for ...
1 5 We've stayed at this La Quinta several times w...
2 5 This place was awesome!! The entire staff was ...
3 5 We've stayed at this hotel two times with our ...
4 4 My room while a bit small was very clean and t...
5 3 The big gentleman who manages the front desk i...
6 1 I was driving from So Cal to Houston, TX for t...
7 2 the hotel was fine but it is all hard scape ar...
8 4 Hotel has a grass area and dog potty station. ...
9 3 My room in general was in need of a lot of gen...
10 5 Very dog friendly. Aimee at the front desk was...
11 3 Hotel and room were nice and the staff were ni...
12 5 Flawless
13 5 For once, the pictures posted for a hotel aren...
14 5 $50 a nite, but well worth it. Quiet, dog frie...
15 1 I can only hope the public does a little resea...

In [7]:
bfreviews = ('').join(bfdf['review_text'].values)

In [8]:
bfreviews[:500]


Out[8]:
u"Really nice property. Great walking areas for the pets. Centrally located. A lot of swimming pools to use. A water park for the kids. Pets are not allowed in the water park of course. The casita room was nice, but there was no table, making it hard to eat anything with the dogs there. Casita rooms are small. Overall, very nice!We've stayed at this La Quinta several times with our two mini schnauzers. The staff is all very friendly. Rooms were clean but not overly fancy - we felt like we didn't n"

Tokenize BF Data


In [ ]:
bftokens = nltk.word_tokenize(bfreviews)

In [29]:
len(bftokens)


Out[29]:
1042

In [30]:
stemmer = PorterStemmer()

In [31]:
bfstemmed = []

In [ ]:


In [32]:
for item in bftokens:
    bfstemmed.append(stemmer.stem(item))

In [33]:
len(bfstemmed)


Out[33]:
1042

In [34]:
bftext = nltk.Text(bfstemmed)

In [35]:
bfwords = [w.lower() for w in bftext]

In [36]:
bfvocab = sorted(set(bfwords))

In [37]:
len(bfvocab)


Out[37]:
388

Read in Yelp reviews


In [14]:
cmd = "SELECT review_rating, review_text FROM yelp_reviews"

In [15]:
yelpdf = pd.read_sql_query(cmd, engine)

In [16]:
len(yelpdf)


Out[16]:
6263

In [17]:
yelpreviews = ('').join(yelpdf['review_text'].values)
yelptokens = nltk.wordpunct_tokenize(yelpreviews)
yelptext = nltk.Text(yelptokens)
yelpwords = [w.lower() for w in yelptext]

Word occurrence with scikit-learn


In [27]:
#bfreviews

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(bfwords)
X_train_counts.shape


Out[38]:
(1042, 372)

In [39]:
count_vect.vocabulary_.get(u'algorithm')

In [ ]:

TFIDF on BF Reviews Using TextBlobs


In [20]:
def tf(word, blob):
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob)

def idf(word, bloblist):
    return np.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)

In [22]:
document1 = tb(bfreviews)

In [23]:
document2 = tb(yelpreviews)

In [24]:
bloblist = [document1, document2]

In [ ]:


In [21]:
for i, blob in enumerate(bloblist):
    print("Top words in document {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:3]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))


Top words in document 1
	Word: all, TF-IDF: nan
	Word: Continental, TF-IDF: nan
	Word: manages, TF-IDF: nan
Top words in document 2
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-21-aa629df9729f> in <module>()
      6 for i, blob in enumerate(bloblist):
      7     print("Top words in document {}".format(i + 1))
----> 8     scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
      9     sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
     10     for word, score in sorted_words[:3]:

<ipython-input-21-aa629df9729f> in <dictcomp>((word,))
      6 for i, blob in enumerate(bloblist):
      7     print("Top words in document {}".format(i + 1))
----> 8     scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
      9     sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
     10     for word, score in sorted_words[:3]:

<ipython-input-20-8de784bf7f05> in tfidf(word, blob, bloblist)
      9 
     10 def tfidf(word, blob, bloblist):
---> 11     return tf(word, blob) * idf(word, bloblist)

<ipython-input-20-8de784bf7f05> in tf(word, blob)
      1 def tf(word, blob):
----> 2     return blob.words.count(word) / len(blob.words)
      3 
      4 def n_containing(word, bloblist):
      5     return sum(1 for blob in bloblist if word in blob)

/Applications/anaconda/lib/python2.7/site-packages/textblob/blob.pyc in count(self, strg, case_sensitive, *args, **kwargs)
    234         """
    235         if not case_sensitive:
--> 236             return [word.lower() for word in self].count(strg.lower(), *args,
    237                     **kwargs)
    238         return self._collection.count(strg, *args, **kwargs)

KeyboardInterrupt: 

In [ ]:

TFIDF on BF Reviews Using SciKitLearn

The number of words in the bringfido sample corpus:


In [37]:
len(bfwords)


Out[37]:
983

In [54]:
count_vect = CountVectorizer()
count_vect.fit_transform(bfwords)


Out[54]:
<983x379 sparse matrix of type '<type 'numpy.int64'>'
	with 898 stored elements in Compressed Sparse Row format>

In [55]:
bf_train_counts[0]


Out[55]:
<1x379 sparse matrix of type '<type 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [56]:
print('vocabulary: {}'.format(count_vect.vocabulary))


vocabulary: None

In [ ]:


In [40]:
tfidf_transformer = TfidfTransformer()
bf_train_tfidf = tfidf_transformer.fit_transform(bf_train_counts)
bf_train_tfidf.shape


Out[40]:
(983, 379)

In [42]:
tf = TfidfVectorizer(analyzer='word', stop_words='english')

In [43]:
bftfidf_matrix = tf.fit_transform(bfwords)

In [44]:
bf_feature_names = tf.get_feature_names()

In [48]:
#bf_feature_names[0:50]

In [50]:
bfdense = bftfidf_matrix.todense()

In [ ]:

TFIDF on Yelp Reviews

First I want to import all of the Yelp review text, and perform a TFIDF on those reviews:


In [18]:


In [21]:
yelp_count_vect = CountVectorizer()
yelp_train_counts = yelp_count_vect.fit_transform(yelpwords)
yelp_train_counts.shape


Out[21]:
(1244957, 22909)

In [22]:
yelp_tfidf_transformer = TfidfTransformer()
yelp_train_tfidf = yelp_tfidf_transformer.fit_transform(yelp_train_counts)
yelp_train_tfidf.shape


Out[22]:
(1244957, 22909)

In [23]:
#from sklearn.metrics import jaccard_similarity_score

In [66]:
#jaccard_similarity_score(bf_train_tfidf, yelp_train_tfidf)

Examples


In [22]:
path = './tf-idf'
token_dict = {}


def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(PorterStemmer().stem(item))
    return stems

In [24]:
categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']

from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train',
    categories=categories, shuffle=True, random_state=42)


WARNING:sklearn.datasets.twenty_newsgroups:Downloading dataset from http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz (14 MB)

In [ ]:
for dirpath, dirs, files in os.walk(path):
    for f in files:
        fname = os.path.join(dirpath, f)
        print "fname=", fname
        with open(fname) as pearl:
            text = pearl.read()
            token_dict[f] = text.lower().translate(None, string.punctuation)

tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(token_dict.values())

str = 'all great and precious things are lonely.'
response = tfidf.transform([str])
print response

feature_names = tfidf.get_feature_names()
for col in response.nonzero()[1]:
    print feature_names[col], ' - ', response[0, col]

In [43]:
#twenty_train.data

In [ ]:
yelp_count_vect = CountVectorizer()
yelp_train_counts = yelp_count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape


Out[26]:
(2257, 35788)

In [28]:
X_train_counts[0:3, 0:3]


Out[28]:
<3x3 sparse matrix of type '<type 'numpy.int64'>'
	with 0 stored elements in Compressed Sparse Row format>