DictVectorizer implements what is called one-of-K or "one-hot" coding for categorical (also known as nominal or discrete) features, while passing numerical features through unchanged.


In [1]:
measurements = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Francisco', 'temperature': 18.},
]

In [2]:
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()

In [3]:
vec.fit_transform(measurements).toarray()


Out[3]:
array([[  1.,   0.,   0.,  33.],
       [  0.,   1.,   0.,  12.],
       [  0.,   0.,   1.,  18.]])

In [4]:
vec.get_feature_names()


Out[4]:
['city=Dubai', 'city=London', 'city=San Francisco', 'temperature']
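
Once fitted, the same vectorizer can map new dicts onto the learned columns. A minimal sketch (the 'Paris' measurement is hypothetical): a city value never seen during fit is silently dropped, leaving zeros in all three city columns while the numerical column is filled as usual.

In [5]:
vec.transform([{'city': 'Paris', 'temperature': 20.}]).toarray()
# expected: array([[ 0.,  0.,  0., 20.]]) -- unseen categories are ignored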

As a special case, DictVectorizer is also a useful representation transformation for training sequence classifiers in NLP, which typically work by extracting feature windows around a particular word of interest.


In [6]:
pos_window = [
    {
        'word-2': 'the',
        'pos-2': 'DT',
        'word-1': 'cat',
        'pos-1': 'NN',
        'word+1': 'on',
        'pos+1': 'PP',
    },
    # in a real application one would extract many such dictionaries
]

In [7]:
vec = DictVectorizer()
pos_vec = vec.fit_transform(pos_window)

In [10]:
pos_vec.toarray()


Out[10]:
array([[ 1.,  1.,  1.,  1.,  1.,  1.]])

In [16]:
vec.get_feature_names()


Out[16]:
['pos+1=PP', 'pos-1=NN', 'pos-2=DT', 'word+1=on', 'word-1=cat', 'word-2=the']
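
For inspecting such window features, inverse_transform recovers which (feature, value) pairs are active in each row. A minimal sketch; the key ordering of the returned dict is not guaranteed:

In [ ]:
vec.inverse_transform(pos_vec)
# expected: [{'pos+1=PP': 1.0, 'pos-1=NN': 1.0, 'pos-2=DT': 1.0,
#             'word+1=on': 1.0, 'word-1=cat': 1.0, 'word-2=the': 1.0}]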

Feature Hashing

FeatureHasher applies a hash function to the features to determine their column index directly, instead of building a vocabulary in memory. Because hash collisions can occur, a signed hash function is used so that colliding values tend to cancel out rather than accumulate. For large hash table sizes this sign alternation can be disabled (alternate_sign=False in recent scikit-learn versions), to allow the output to be passed to estimators like sklearn.naive_bayes.MultinomialNB or sklearn.feature_selection.chi2 feature selectors that expect non-negative inputs.
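
A minimal sketch, assuming scikit-learn >= 0.19 (where the flag is named alternate_sign), reusing the measurements list from above:

In [ ]:
from sklearn.feature_extraction import FeatureHasher

# no vocabulary is stored: string values are hashed as 'key=value' pairs
# straight into a fixed-width sparse matrix
hasher = FeatureHasher(n_features=2**10, input_type='dict',
                       alternate_sign=False)  # keep outputs non-negative
X = hasher.transform(measurements)
X.shape  # expected: (3, 1024)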

CountVectorizer


In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [37]:
vectorizer = CountVectorizer()

In [38]:
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]
vectorizer.fit(corpus)


Out[38]:
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [39]:
analyze = vectorizer.build_analyzer()

In [40]:
analyze('hi there dear vanaja how are you')


Out[40]:
['hi', 'there', 'dear', 'vanaja', 'how', 'are', 'you']

In [43]:
# unigrams only (the default ngram_range is (1, 1))
vectorizer.get_feature_names()


Out[43]:
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

In [44]:
print(vectorizer.transform(corpus).toarray())


[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]
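
Words not seen in the training corpus get no column at all, so transforming a completely new document yields an all-zero row. A minimal sketch with a hypothetical sentence:

In [ ]:
vectorizer.transform(['Something completely new.']).toarray()
# expected: array([[0, 0, 0, 0, 0, 0, 0, 0, 0]]) -- no vocabulary overlap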

In [45]:
vectorizer = CountVectorizer(ngram_range=(1, 2))

In [46]:
vectorizer.fit(corpus)


Out[46]:
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [86]:
vectorizer.get_feature_names()


Out[86]:
['and',
 'and the',
 'document',
 'first',
 'first document',
 'is',
 'is the',
 'is this',
 'one',
 'second',
 'second document',
 'second second',
 'the',
 'the first',
 'the second',
 'the third',
 'third',
 'third one',
 'this',
 'this is',
 'this the']
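
Bigrams preserve some local ordering information that a pure bag of words loses. For example, the interrogative form "is this" occurs only in the last document, and its bigram column keeps that distinction:

In [ ]:
X_2 = vectorizer.transform(corpus).toarray()
# look up the column for the bigram 'is this'
X_2[:, vectorizer.vocabulary_.get('is this')]
# expected: array([0, 0, 0, 1]) -- only the question contains 'is this'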

In [87]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_vec = TfidfVectorizer(lowercase=True)

tf_vec.fit(corpus)

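TfidfVectorizer combines CountVectorizer with tf-idf weighting: with the default smooth_idf=True, each term weight is tf(t, d) * idf(t), where idf(t) = ln((1 + n) / (1 + df(t))) + 1, and each row is then L2-normalized. A minimal sketch of inspecting the fitted model:

In [ ]:
# per-term inverse document frequencies learned from the corpus
dict(zip(tf_vec.get_feature_names(), tf_vec.idf_))

In [ ]:
# rows are L2-normalized tf-idf vectors
tf_vec.transform(corpus).toarray()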