In [22]:
# Import Python libraries
import sqlite3 as sqlite  # work with SQLite databases
import os                 # used to set the working directory
import pandas as pd       # process data with pandas DataFrames
import numpy as np        # numerical operations on arrays
In [2]:
# Setup pandas display options
pd.options.display.max_colwidth = 500
In [3]:
# Constants
small_sqlite = "example_db.sqlite"
In [4]:
# Set working directory
os.chdir('../Data/')
In [68]:
# Read sqlite query results into a pandas DataFrame
con = sqlite.connect(small_sqlite)
df = pd.read_sql_query("SELECT * from Documents", con)
con.close()
df.head()
Out[68]:
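If the layout of the SQLite file is unfamiliar, a quick sanity check (a sketch, not part of the original flow) is to list its tables before querying:
In [ ]:
# List the tables defined in the SQLite file
con = sqlite.connect(small_sqlite)
tables = pd.read_sql_query(
    "SELECT name FROM sqlite_master WHERE type='table'", con)
con.close()
tables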
In [56]:
from sklearn.feature_extraction.text import CountVectorizer
In [57]:
vectorizer = CountVectorizer()  # default settings: lowercased unigram counts
In [58]:
X = vectorizer.fit_transform(df['NOTE_TEXT'].tolist())
X
Out[58]:
In [59]:
X.toarray()  # dense view of the counts; fine here, memory-heavy on large corpora
Out[59]:
In [60]:
vectorizer.get_feature_names()
Out[60]:
In [61]:
# Drop English stop words and count both unigrams and bigrams
vectorizer2 = CountVectorizer(stop_words='english', ngram_range=(1, 2))
X2 = vectorizer2.fit_transform(df['NOTE_TEXT'].tolist())
vectorizer2.get_feature_names()
Out[61]:
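As a quick illustration of what stop_words='english' combined with ngram_range=(1, 2) produces, here is a sketch on a made-up two-document corpus (invented text, not data from this notebook):
In [ ]:
# Stop words are removed first, then unigrams and bigrams are built
demo = CountVectorizer(stop_words='english', ngram_range=(1, 2))
demo.fit(["the patient reports chest pain", "chest pain resolved"])
demo.get_feature_names()
This should return unigrams and bigrams such as 'chest', 'chest pain', 'patient reports'; note that the stop word 'the' is dropped before the bigrams are formed, so no bigram contains it.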
$W_{t,d} = \log_{10}(1+\mathrm{tf}_{t,d}) \cdot \log_{10}\left(\frac{N}{\mathrm{df}_{t}}\right)$
tf-idf is the best-known weighting scheme in information retrieval.
(Note: the “-” in tf-idf is a hyphen, not a minus sign!)
The weight $W_{t,d}$:
- increases with the number of occurrences of the term within a document
- increases with the rarity of the term across the collection
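To make the formula concrete, here is a minimal hand computation of $W_{t,d}$ on a made-up three-document corpus (a sketch for illustration only; the documents are invented and none of this comes from the Documents table):
In [ ]:
import math

# Toy corpus (invented for illustration)
docs = [
    "patient reports chest pain",
    "patient denies chest pain",
    "follow up in two weeks",
]
N = len(docs)  # number of documents in the collection

tokenized = [doc.split() for doc in docs]
vocab = sorted({t for toks in tokenized for t in toks})

# df_t: number of documents that contain term t
df = {t: sum(t in toks for toks in tokenized) for t in vocab}

# W_{t,d} = log10(1 + tf_{t,d}) * log10(N / df_t)
for d, toks in enumerate(tokenized):
    for t in sorted(set(toks)):
        tf = toks.count(t)
        w = math.log10(1 + tf) * math.log10(N / df[t])
        print("doc {} term {:8} W = {:.4f}".format(d, t, w))
A term that occurred in every document would get $\log_{10}(N/N) = 0$ as its idf factor and contribute nothing, which is exactly the rarity property above.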
In [65]:
from sklearn.feature_extraction.text import TfidfTransformer
# Note: scikit-learn's idf uses a natural log with smoothing by default,
# a slight variant of the formula above, but the weighting idea is the same
transformer = TfidfTransformer(use_idf=True)
tfidf_result = transformer.fit_transform(X2)
In [66]:
def display_scores(vectorizer, tfidf_result):
    # Sum each term's tf-idf scores across all documents
    scores = zip(vectorizer.get_feature_names(),
                 np.asarray(tfidf_result.sum(axis=0)).ravel())
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    for item in sorted_scores:
        print("{0:20} Score: {1}".format(item[0], item[1]))
In [67]:
display_scores(vectorizer2, tfidf_result)
In [15]:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import string

stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    # Reduce each token to its Porter stem (e.g. "running" -> "run")
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # Split into word tokens, drop bare punctuation, then stem
    tokens = word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation]
    stems = stem_tokens(tokens, stemmer)
    return stems
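A quick check of the custom tokenizer on a made-up sentence (the input below is illustrative, not a row from the Documents table); punctuation should be dropped and Porter stems returned, e.g. "running" comes back as "run":
In [ ]:
tokenize("The patient was running a fever, but denies any chest pains.")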
In [16]:
# Same stop words and n-grams as vectorizer2, but using the custom stem tokenizer
vectorizer3 = CountVectorizer(tokenizer=tokenize, stop_words='english', ngram_range=(1, 2))
X3 = vectorizer3.fit_transform(df['NOTE_TEXT'].tolist())
vectorizer3.get_feature_names()
Out[16]:
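One caveat worth noting with the custom tokenizer: scikit-learn removes stop words after the tokenizer runs, so stemmed tokens that no longer match the unstemmed English stop list (e.g. 'veri' from 'very') can slip through; recent scikit-learn versions emit a warning when this inconsistency is detected.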