In [1]:
measurements = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Francisco', 'temperature': 18.},
]
In [2]:
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
In [3]:
vec.fit_transform(measurements).toarray()
Out[3]:
array([[ 1.,  0.,  0., 33.],
       [ 0.,  1.,  0., 12.],
       [ 0.,  0.,  1., 18.]])
In [4]:
vec.get_feature_names()  # in scikit-learn >= 1.0, prefer get_feature_names_out()
Out[4]:
['city=Dubai', 'city=London', 'city=San Francisco', 'temperature']
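The fitted vectorizer can also transform new samples. As a quick sketch (not run in this session): a city unseen during fit, such as a hypothetical 'Tokyo', simply maps to zeros in all city columns.
In [ ]:
# transform a new dict with the already-fitted vectorizer;
# 'Tokyo' was never seen during fit, so every city=... column stays 0
vec.transform([{'city': 'Tokyo', 'temperature': 21.}]).toarray()
# expected: array([[ 0.,  0.,  0., 21.]])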
In [6]:
pos_window = [
    {
        'word-2': 'the',
        'pos-2': 'DT',
        'word-1': 'cat',
        'pos-1': 'NN',
        'word+1': 'on',
        'pos+1': 'PP',
    },
    # in a real application one would extract many such dictionaries
]
In [7]:
vec = DictVectorizer()
pos_vec = vec.fit_transform(pos_window)
In [10]:
pos_vec.toarray()
Out[10]:
array([[1., 1., 1., 1., 1., 1.]])
In [16]:
vec.get_feature_names()
Out[16]:
['pos+1=PP', 'pos-1=NN', 'pos-2=DT', 'word+1=on', 'word-1=cat', 'word-2=the']
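The encoding is invertible as well. A sketch using DictVectorizer.inverse_transform to recover the feature dicts from the matrix:
In [ ]:
# map each encoded row back to its active features
vec.inverse_transform(pos_vec)
# expected: [{'pos+1=PP': 1.0, 'pos-1=NN': 1.0, 'pos-2=DT': 1.0,
#             'word+1=on': 1.0, 'word-1=cat': 1.0, 'word-2=the': 1.0}]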
In [18]:
from sklearn.feature_extraction.text import CountVectorizer
In [37]:
vectorizer = CountVectorizer()
In [38]:
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]
vectorizer.fit(corpus)
Out[38]:
CountVectorizer()
In [39]:
analyze = vectorizer.build_analyzer()
In [40]:
analyze('hi there dear vanaja how are you')
Out[40]:
['hi', 'there', 'dear', 'vanaja', 'how', 'are', 'you']
In [43]:
# unigrams only (the default ngram_range=(1, 1))
vectorizer.get_feature_names()
Out[43]:
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
In [44]:
print(vectorizer.transform(corpus).toarray())
[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]
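Each column corresponds to one vocabulary entry, and words absent from the fitted vocabulary are silently ignored at transform time. A sketch:
In [ ]:
# column index of a given word in the fitted vocabulary
vectorizer.vocabulary_.get('document')
# expected: 1
# a document of entirely unseen words produces an all-zero row
vectorizer.transform(['Something completely new.']).toarray()
# expected: [[0 0 0 0 0 0 0 0 0]]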
In [45]:
vectorizer = CountVectorizer(ngram_range=(1, 2))
In [46]:
vectorizer.fit(corpus)
Out[46]:
CountVectorizer(ngram_range=(1, 2))
In [86]:
vectorizer.get_feature_names()
Out[86]:
['and', 'and the', 'document', 'first', 'first document', 'is', 'is the',
 'is this', 'one', 'second', 'second document', 'second second', 'the',
 'the first', 'the second', 'the third', 'third', 'third one', 'this',
 'this is', 'this the']
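With ngram_range=(1, 2) the analyzer now emits both unigrams and bigrams for any input text:
In [ ]:
analyze = vectorizer.build_analyzer()
analyze('Bi-grams are cool!')
# expected: ['bi', 'grams', 'are', 'cool', 'bi grams', 'grams are', 'are cool']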
In [87]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_vec = TfidfVectorizer(lowercase=True)
tf_vec.fit(corpus)
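A sketch of what the fitted TfidfVectorizer produces on the same corpus: transform yields the tf-idf-weighted document-term matrix, and the learned inverse document frequencies are exposed as idf_.
In [ ]:
tfidf = tf_vec.transform(corpus)
tfidf.shape  # expected: (4, 9), same vocabulary as the unigram CountVectorizer
tf_vec.idf_  # per-term inverse document frequency learned during fit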