Text Vectorization


In [22]:
# Import python libs

import sqlite3 as sqlite       # work with sqlite databases
import os                      # used to set working directory
import pandas as pd            # process data with pandas dataframe
import numpy as np

In [2]:
# Let pandas render up to 500 characters per column when displaying frames

pd.set_option("display.max_colwidth", 500)

In [3]:
# Constants

# File name of the SQLite database; it is opened by relative name, so it is
# resolved against the current working directory.
small_sqlite = "example_db.sqlite"

In [4]:
# Set working directory
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel resolves '../Data/' from the new location, not the original one.

os.chdir('../Data/')

In [68]:
# Read sqlite query results into a pandas DataFrame
con = sqlite.connect(small_sqlite)
try:
    df = pd.read_sql_query("SELECT * from Documents", con)
finally:
    # Always release the database handle, even when the query fails
    con.close()

# Preview the first rows of the loaded table
df.head()


Out[68]:
DOCID NOTE_TEXT CATEGORY
0 1 The Cologne Carnival is a carnival that takes place every year in Cologne. Traditionally, the carnival season is declared open at 11 minutes past 11 on the 11th of the 11th month November. A
1 2 Every Cologne Carnival there are 3 people who are granted the titles of Virgin, Prince, and Farmer. By tradition, the prince is deemed to be the highest representative of the festivities. B

In [ ]:


In [56]:
from sklearn.feature_extraction.text import CountVectorizer

In [57]:
# Count vectorizer created with no arguments, i.e. the library defaults
vectorizer = CountVectorizer()

In [58]:
# Fit the vectorizer on the NOTE_TEXT column and build the count matrix
X = vectorizer.fit_transform(df['NOTE_TEXT'].tolist())

# Display the matrix summary (shape / storage format)
X


Out[58]:
<2x41 sparse matrix of type '<type 'numpy.int64'>'
	with 47 stored elements in Compressed Sparse Row format>

In [59]:
# Dense view of the counts; fine for display because the matrix is tiny here
X.toarray()


Out[59]:
array([[2, 2, 0, 0, 1, 0, 0, 3, 2, 1, 0, 1, 0, 0, 0, 0, 1, 2, 1, 1, 1, 1,
        1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 4, 0, 0, 0, 0, 1, 0, 0, 1],
       [0, 0, 1, 2, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 2,
        0, 0, 0, 1, 0, 4, 1, 0, 0, 0, 4, 1, 1, 1, 1, 0, 1, 1, 0]], dtype=int64)

In [60]:
# List the vocabulary learned by the vectorizer
vectorizer.get_feature_names()


Out[60]:
[u'11',
 u'11th',
 u'and',
 u'are',
 u'at',
 u'be',
 u'by',
 u'carnival',
 u'cologne',
 u'declared',
 u'deemed',
 u'every',
 u'farmer',
 u'festivities',
 u'granted',
 u'highest',
 u'in',
 u'is',
 u'minutes',
 u'month',
 u'november',
 u'of',
 u'on',
 u'open',
 u'past',
 u'people',
 u'place',
 u'prince',
 u'representative',
 u'season',
 u'takes',
 u'that',
 u'the',
 u'there',
 u'titles',
 u'to',
 u'tradition',
 u'traditionally',
 u'virgin',
 u'who',
 u'year']
Let's remove stop words and include bigrams...

In [61]:
# Drop English stop words and count both unigrams and bigrams
vectorizer2 = CountVectorizer(stop_words='english', ngram_range=(1, 2))
X2 = vectorizer2.fit_transform(df['NOTE_TEXT'].tolist())
# Show the resulting vocabulary
vectorizer2.get_feature_names()


Out[61]:
[u'11',
 u'11 11th',
 u'11 minutes',
 u'11th',
 u'11th 11th',
 u'11th month',
 u'carnival',
 u'carnival carnival',
 u'carnival people',
 u'carnival season',
 u'carnival takes',
 u'cologne',
 u'cologne carnival',
 u'cologne traditionally',
 u'declared',
 u'declared open',
 u'deemed',
 u'deemed highest',
 u'farmer',
 u'farmer tradition',
 u'festivities',
 u'festivities prince',
 u'granted',
 u'granted titles',
 u'highest',
 u'highest representative',
 u'minutes',
 u'minutes past',
 u'month',
 u'month november',
 u'november',
 u'open',
 u'open 11',
 u'past',
 u'past 11',
 u'people',
 u'people granted',
 u'place',
 u'place year',
 u'prince',
 u'prince deemed',
 u'prince farmer',
 u'prince prince',
 u'representative',
 u'representative festivities',
 u'season',
 u'season declared',
 u'takes',
 u'takes place',
 u'titles',
 u'titles virgin',
 u'tradition',
 u'tradition prince',
 u'traditionally',
 u'traditionally carnival',
 u'virgin',
 u'virgin prince',
 u'year',
 u'year cologne']

tf-idf weighting

  • tf - Term Frequency
  • idf - Inverse Document Frequency
  • The tf-idf weight of a term is the product of its tf weight and its idf weight.

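$W_{t,d} = \log_{10}(1 + tf_{t,d}) \cdot \log_{10}\left(\frac{N}{df_{t}}\right)$

where $tf_{t,d}$ is the number of times term $t$ occurs in document $d$, $N$ is the number of documents in the collection, and $df_{t}$ is the number of documents that contain $t$.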

  • Best known weighting scheme in information retrieval

  • Note: the “-” in tf-idf is a hyphen, not a minus sign!

  • Increases with the number of occurrences within a document

  • Increases with the rarity of the term in the collection


In [65]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the unigram/bigram counts in X2 with tf-idf
# NOTE(review): scikit-learn applies its own smoothed idf and, by default,
# row normalisation, so these weights presumably will not match the textbook
# formula above exactly — confirm against the TfidfTransformer documentation.
transformer = TfidfTransformer(use_idf=True)
tfidf_result = transformer.fit_transform(X2)

In [66]:
def display_scores(vectorizer, tfidf_result):
    """Print every feature with its tf-idf score summed over all documents.

    Args:
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``
            (newer scikit-learn) or ``get_feature_names()`` (older releases).
        tfidf_result: Matrix with one column per feature; it must support
            ``.sum(axis=0)``.

    Prints one line per feature, highest summed score first. Returns nothing.
    """
    # Prefer the newer accessor when present; fall back to the legacy one
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    # Collapse the per-document rows into a flat array of per-feature totals
    column_totals = np.asarray(tfidf_result.sum(axis=0)).ravel()
    scores = zip(get_names(), column_totals)
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    for item in sorted_scores:
        # Single-argument print(...) behaves the same on Python 2 and 3
        print("{0:20} Score: {1}".format(item[0], item[1]))

In [67]:
display_scores(vectorizer2, tfidf_result)


prince               Score: 0.620780335148
carnival             Score: 0.431895108609
cologne              Score: 0.324737599179
11                   Score: 0.30121228115
11th                 Score: 0.30121228115
cologne carnival     Score: 0.21758008975
carnival people      Score: 0.155195083787
deemed               Score: 0.155195083787
deemed highest       Score: 0.155195083787
farmer               Score: 0.155195083787
farmer tradition     Score: 0.155195083787
festivities          Score: 0.155195083787
festivities prince   Score: 0.155195083787
granted              Score: 0.155195083787
granted titles       Score: 0.155195083787
highest              Score: 0.155195083787
highest representative Score: 0.155195083787
people               Score: 0.155195083787
people granted       Score: 0.155195083787
prince deemed        Score: 0.155195083787
prince farmer        Score: 0.155195083787
prince prince        Score: 0.155195083787
representative       Score: 0.155195083787
representative festivities Score: 0.155195083787
titles               Score: 0.155195083787
titles virgin        Score: 0.155195083787
tradition            Score: 0.155195083787
tradition prince     Score: 0.155195083787
virgin               Score: 0.155195083787
virgin prince        Score: 0.155195083787
11 11th              Score: 0.150606140575
11 minutes           Score: 0.150606140575
11th 11th            Score: 0.150606140575
11th month           Score: 0.150606140575
carnival carnival    Score: 0.150606140575
carnival season      Score: 0.150606140575
carnival takes       Score: 0.150606140575
cologne traditionally Score: 0.150606140575
declared             Score: 0.150606140575
declared open        Score: 0.150606140575
minutes              Score: 0.150606140575
minutes past         Score: 0.150606140575
month                Score: 0.150606140575
month november       Score: 0.150606140575
november             Score: 0.150606140575
open                 Score: 0.150606140575
open 11              Score: 0.150606140575
past                 Score: 0.150606140575
past 11              Score: 0.150606140575
place                Score: 0.150606140575
place year           Score: 0.150606140575
season               Score: 0.150606140575
season declared      Score: 0.150606140575
takes                Score: 0.150606140575
takes place          Score: 0.150606140575
traditionally        Score: 0.150606140575
traditionally carnival Score: 0.150606140575
year                 Score: 0.150606140575
year cologne         Score: 0.150606140575

In [15]:
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize
import string

# Single Porter stemmer instance; tokenize() below passes it to stem_tokens()
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    """Return the stem of every token, preserving order.

    ``stemmer`` is any object with a ``stem(token)`` method.
    """
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Split ``text`` into word tokens, drop punctuation tokens, stem the rest.

    Args:
        text: Raw document string.

    Returns:
        List of stemmed tokens in their original order, produced by
        ``stem_tokens`` with the module-level ``stemmer``.
    """
    punctuation = set(string.punctuation)
    tokens = word_tokenize(text)
    # Drop tokens made up solely of punctuation characters. A plain
    # ``token not in string.punctuation`` is a substring test, so it lets
    # multi-character tokens such as `` or '' or ... or -- slip through.
    tokens = [tok for tok in tokens
              if not all(ch in punctuation for ch in tok)]
    stems = stem_tokens(tokens, stemmer)
    return stems

In [16]:
# Same settings as vectorizer2, but tokens come from the stemming tokenize() helper
# NOTE(review): stems such as 'everi' appear in the output even though
# stop_words='english' is set — presumably the stop list is matched against
# the already-stemmed tokens, so 'every' -> 'everi' no longer matches.
# Confirm, and consider stemming the stop list as well.
vectorizer3 = CountVectorizer(tokenizer=tokenize, stop_words='english', ngram_range=(1, 2))
X3 = vectorizer3.fit_transform(df['NOTE_TEXT'].tolist())
# Show the resulting (stemmed) vocabulary
vectorizer3.get_feature_names()


Out[16]:
[u'11',
 u'11 11th',
 u'11 minut',
 u'11th',
 u'11th 11th',
 u'11th month',
 u'3',
 u'3 peopl',
 u'carniv',
 u'carniv 3',
 u'carniv carniv',
 u'carniv place',
 u'carniv season',
 u'cologn',
 u'cologn carniv',
 u'cologn tradit',
 u'declar',
 u'declar open',
 u'deem',
 u'deem highest',
 u'everi',
 u'everi cologn',
 u'everi year',
 u'farmer',
 u'farmer tradit',
 u'festiv',
 u'grant',
 u'grant titl',
 u'highest',
 u'highest repres',
 u'minut',
 u'minut past',
 u'month',
 u'month novemb',
 u'novemb',
 u'open',
 u'open 11',
 u'past',
 u'past 11',
 u'peopl',
 u'peopl grant',
 u'place',
 u'place everi',
 u'princ',
 u'princ deem',
 u'princ farmer',
 u'repres',
 u'repres festiv',
 u'season',
 u'season declar',
 u'titl',
 u'titl virgin',
 u'tradit',
 u'tradit carniv',
 u'tradit princ',
 u'virgin',
 u'virgin princ',
 u'year',
 u'year cologn']