In [1]:
import nltk
import numpy as np
import pandas as pd
import connect_aws_db as cadb
from textblob import TextBlob as tb
In [2]:
import string
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
In [3]:
engine = cadb.connect_aws_db(write_unicode=True)
In [10]:
nltk.download('punkt')  # fetch just the 'punkt' models that word_tokenize below needs
Out[10]:
In [4]:
cmd = "SELECT review_rating, review_text FROM bf_reviews"
In [5]:
bfdf = pd.read_sql_query(cmd, engine)
In [6]:
bfdf
Out[6]:
In [7]:
bfreviews = ' '.join(bfdf['review_text'].values)  # space-join so tokens don't merge across review boundaries
In [8]:
bfreviews[:500]
Out[8]:
In [ ]:
bftokens = nltk.word_tokenize(bfreviews)
In [29]:
len(bftokens)
Out[29]:
In [30]:
stemmer = PorterStemmer()
In [31]:
bfstemmed = []
In [32]:
for item in bftokens:
    bfstemmed.append(stemmer.stem(item))
In [33]:
len(bfstemmed)
Out[33]:
In [34]:
bftext = nltk.Text(bfstemmed)
In [35]:
bfwords = [w.lower() for w in bftext]
In [36]:
bfvocab = sorted(set(bfwords))
In [37]:
len(bfvocab)
Out[37]:
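An optional frequency check (an added illustrative cell, not part of the original run) shows the most common stemmed tokens:
In [ ]:
# Illustrative: nltk.FreqDist over the lowercased, stemmed tokens built above.
fdist = nltk.FreqDist(bfwords)
fdist.most_common(10)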
In [14]:
cmd = "SELECT review_rating, review_text FROM yelp_reviews"
In [15]:
yelpdf = pd.read_sql_query(cmd, engine)
In [16]:
len(yelpdf)
Out[16]:
In [17]:
yelpreviews = ' '.join(yelpdf['review_text'].values)  # space-join, as with the bf reviews above
yelptokens = nltk.wordpunct_tokenize(yelpreviews)
yelptext = nltk.Text(yelptokens)
yelpwords = [w.lower() for w in yelptext]
In [27]:
#bfreviews
In [38]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
# note: bfwords is a list of single tokens, so each token becomes its own "document" here
X_train_counts = count_vect.fit_transform(bfwords)
X_train_counts.shape
Out[38]:
In [39]:
count_vect.vocabulary_.get(u'algorithm')
In [20]:
def tf(word, blob):
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    # test membership against blob.words (the parsed word list)
    return sum(1 for blob in bloblist if word in blob.words)

def idf(word, bloblist):
    return np.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)
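As a quick sanity check, these helpers can be run on toy blobs (an added illustrative cell; the toy sentences are invented for demonstration):
In [ ]:
# Illustrative only: three tiny TextBlob documents to exercise tf/idf above.
toy1 = tb("the dog boarded at the kennel")
toy2 = tb("the cat stayed home")
toy3 = tb("the cat slept")
toyblobs = [toy1, toy2, toy3]
print(tfidf('dog', toy1, toyblobs))  # positive: 'dog' appears in only one blob
print(tfidf('the', toy1, toyblobs))  # negative: 'the' is in every blob, so the smoothed idf dips below zero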
In [22]:
document1 = tb(bfreviews)
In [23]:
document2 = tb(yelpreviews)
In [24]:
bloblist = [document1, document2]
In [21]:
for i, blob in enumerate(bloblist):
    print("Top words in document {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:3]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
In [37]:
len(bfwords)
Out[37]:
In [54]:
count_vect = CountVectorizer()
bf_train_counts = count_vect.fit_transform(bfwords)  # assign so the next cells can use it
bf_train_counts
Out[54]:
In [55]:
bf_train_counts[0]
Out[55]:
In [56]:
print('vocabulary: {}'.format(count_vect.vocabulary_))  # fitted attribute has a trailing underscore
In [40]:
tfidf_transformer = TfidfTransformer()
bf_train_tfidf = tfidf_transformer.fit_transform(bf_train_counts)
bf_train_tfidf.shape
Out[40]:
In [42]:
tf_vect = TfidfVectorizer(analyzer='word', stop_words='english')  # renamed from "tf" to avoid clobbering the tf() helper above
In [43]:
bftfidf_matrix = tf_vect.fit_transform(bfwords)
In [44]:
bf_feature_names = tf_vect.get_feature_names()
In [48]:
#bf_feature_names[0:50]
In [50]:
bfdense = bftfidf_matrix.todense()
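To see which terms carry the most weight, the per-row scores can be aggregated (an added sketch; it assumes summing the single-token rows into corpus-level weights is acceptable):
In [ ]:
# Illustrative: total tf-idf weight per feature, then the ten top-weighted terms.
weights = np.asarray(bftfidf_matrix.sum(axis=0)).ravel()
top_idx = weights.argsort()[::-1][:10]
[(bf_feature_names[i], round(weights[i], 3)) for i in top_idx]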
In [21]:
yelp_count_vect = CountVectorizer()
yelp_train_counts = yelp_count_vect.fit_transform(yelpwords)
yelp_train_counts.shape
Out[21]:
In [22]:
yelp_tfidf_transformer = TfidfTransformer()
yelp_train_tfidf = yelp_tfidf_transformer.fit_transform(yelp_train_counts)
yelp_train_tfidf.shape
Out[22]:
In [23]:
#from sklearn.metrics import jaccard_similarity_score
In [66]:
#jaccard_similarity_score(bf_train_tfidf, yelp_train_tfidf)
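jaccard_similarity_score expects discrete label vectors, which is presumably why the comparison above is commented out; cosine similarity over a shared vocabulary is one alternative (an added sketch, not the original analysis):
In [ ]:
# Illustrative: fit a single vectorizer on both corpora so the columns align,
# then compare the two corpus-level tf-idf vectors.
from sklearn.metrics.pairwise import cosine_similarity
shared_tf = TfidfVectorizer(analyzer='word', stop_words='english')
pair_matrix = shared_tf.fit_transform([bfreviews, yelpreviews])
cosine_similarity(pair_matrix[0], pair_matrix[1])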
In [22]:
path = './tf-idf'
token_dict = {}
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stemmer = PorterStemmer()  # instantiate once instead of per token
    stems = []
    for item in tokens:
        stems.append(stemmer.stem(item))
    return stems
In [24]:
categories = ['alt.atheism', 'soc.religion.christian',
'comp.graphics', 'sci.med']
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train',
categories=categories, shuffle=True, random_state=42)
In [ ]:
for dirpath, dirs, files in os.walk(path):
    for f in files:
        fname = os.path.join(dirpath, f)
        print("fname=", fname)
        with open(fname) as pearl:
            text = pearl.read()
            # str.maketrans replaces the Python 2 translate(None, ...) idiom
            token_dict[f] = text.lower().translate(str.maketrans('', '', string.punctuation))
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(token_dict.values())
query = 'all great and precious things are lonely.'  # renamed from "str" so the builtin isn't shadowed
response = tfidf.transform([query])
print(response)
feature_names = tfidf.get_feature_names()
for col in response.nonzero()[1]:
    print(feature_names[col], ' - ', response[0, col])
In [43]:
#twenty_train.data
In [ ]:
yelp_count_vect = CountVectorizer()
# note: despite the yelp_ names, this cell vectorizes the newsgroups data
yelp_train_counts = yelp_count_vect.fit_transform(twenty_train.data)
yelp_train_counts.shape
In [26]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape
Out[26]:
In [28]:
X_train_counts[0:3, 0:3]
Out[28]:
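The same tf-idf step applied earlier to the review counts could also finish the newsgroups pipeline (an added sketch):
In [ ]:
# Illustrative: reuse the TfidfTransformer step on the newsgroups count matrix.
newsgroups_tfidf = TfidfTransformer().fit_transform(X_train_counts)
newsgroups_tfidf.shape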