In [1]:
%matplotlib inline

In [6]:
from sklearn.feature_extraction import DictVectorizer
# one-hot encode the categorical 'city' feature
onehot_encoder = DictVectorizer()

instances = [
    {'city': 'NY'},
    {'city': 'SF'},
    {'city': 'AZ'}]
print onehot_encoder.fit_transform(instances).toarray()


[[ 0.  1.  0.]
 [ 0.  0.  1.]
 [ 1.  0.  0.]]

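The column order of the one-hot matrix follows the vectorizer's sorted feature names, which is why 'NY' maps to the second column. A quick check of that mapping, reusing the fitted onehot_encoder from the cell above (get_feature_names is assumed available on DictVectorizer here):

In [ ]:
# feature names are sorted after fitting, so the columns are city=AZ, city=NY, city=SF
print onehot_encoder.get_feature_names()
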
In [14]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['UNC played Duke in basketball',
          'Duke lost the basketball game',
          'I ate a sandwich']

In [15]:
# bag-of-words: one column per vocabulary term, values are raw counts
vectorizer = CountVectorizer()
print vectorizer.fit_transform(corpus).todense()


[[0 1 1 0 1 0 1 0 0 1]
 [0 1 1 1 0 1 0 0 1 0]
 [1 0 0 0 0 0 0 1 0 0]]

In [16]:
print vectorizer.vocabulary_


{u'duke': 2, u'basketball': 1, u'lost': 5, u'played': 6, u'in': 4, u'game': 3, u'sandwich': 7, u'unc': 9, u'ate': 0, u'the': 8}

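The dictionary maps each term to its column index in the count matrix above. A small sketch that lists the same vocabulary in column order, using the fitted vectorizer:

In [ ]:
# terms in column order: index 0 is 'ate', index 9 is 'unc'
print vectorizer.get_feature_names()
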
In [17]:
from sklearn.metrics.pairwise import euclidean_distances

In [18]:
counts = vectorizer.fit_transform(corpus).todense()

In [19]:
counts


Out[19]:
matrix([[0, 1, 1, 0, 1, 0, 1, 0, 0, 1],
        [0, 1, 1, 1, 0, 1, 0, 0, 1, 0],
        [1, 0, 0, 0, 0, 0, 0, 1, 0, 0]])

In [21]:
print 'distance between 1st and 2nd doc', euclidean_distances(counts[0], counts[1])
print 'distance between 1st and 3rd doc', euclidean_distances(counts[0], counts[2])
print 'distance between 2nd and 3rd doc', euclidean_distances(counts[1], counts[2])


distance between 1st and 2nd doc [[ 2.44948974]]
distance between 1st and 3rd doc [[ 2.64575131]]
distance between 2nd and 3rd doc [[ 2.64575131]]

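The first distance can be re-derived directly from the count vectors to make the calculation concrete; the first two documents share only 'duke' and 'basketball' and differ in six positions, so the distance is sqrt(6). A minimal sketch, assuming numpy is available:

In [ ]:
import numpy as np
# Euclidean distance is the L2 norm of the difference between the two count vectors
diff = np.asarray(counts[0]) - np.asarray(counts[1])
print np.sqrt((diff ** 2).sum())
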
In [22]:
#stopword removal
vectorizer = CountVectorizer(stop_words='english')
print vectorizer.fit_transform(corpus).todense()
print vectorizer.vocabulary_


[[0 1 1 0 0 1 0 1]
 [0 1 1 1 1 0 0 0]
 [1 0 0 0 0 0 1 0]]
{u'duke': 2, u'basketball': 1, u'lost': 4, u'played': 5, u'game': 3, u'sandwich': 6, u'unc': 7, u'ate': 0}

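'the' and 'in' disappear from the vocabulary because they are on scikit-learn's built-in English stop word list. A quick check against the list used by the fitted vectorizer:

In [ ]:
# get_stop_words() returns the frozenset of English stop words in effect
stop_words = vectorizer.get_stop_words()
print 'the' in stop_words, 'in' in stop_words, 'basketball' in stop_words
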
In [76]:
#stemming and lemmatization: without them, 'sandwich' and 'sandwiches' become separate features
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'He ate the sandwiches',
    'Every sandwich was eaten by him' ]

In [77]:
vectorizer = CountVectorizer(binary=True, stop_words='english')
print vectorizer.fit_transform(corpus).todense()
print vectorizer.vocabulary_


[[1 0 0 1]
 [0 1 1 0]]
{u'sandwich': 2, u'ate': 0, u'sandwiches': 3, u'eaten': 1}

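binary=True records only whether a term occurs, not how many times. A small sketch with a made-up sentence containing a repeated word to contrast the two settings:

In [ ]:
# 'sandwich' occurs twice: the binary vectorizer still records 1, the counting one records 2
repeated = ['The sandwich was a good sandwich']
binary_vectorizer = CountVectorizer(binary=True, stop_words='english')
counting_vectorizer = CountVectorizer(binary=False, stop_words='english')
print binary_vectorizer.fit_transform(repeated).todense()
print counting_vectorizer.fit_transform(repeated).todense()
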
In [25]:
#lemmatization using NLTK
corpus = [
    'I am gathering ingredients for the sandwich.',
    'There were many wizards at the gathering.' ]

In [27]:
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print lemmatizer.lemmatize('gathering', 'v')
print lemmatizer.lemmatize('gathering', 'n')


gather
gathering

In [28]:
#stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print stemmer.stem('gathering')


gather

In [29]:
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag

In [30]:
wordnet_tags = ['v', 'n']

In [36]:
corpus = [
    'He ate the sandwiches',
    'Every sandwich was eaten by him'
]

In [38]:
stemmer = PorterStemmer()
print 'Stemmed: ', [[stemmer.stem(token) for token in word_tokenize(document)] for document in corpus]


Stemmed:  [[u'He', u'ate', u'the', u'sandwich'], [u'Everi', u'sandwich', u'wa', u'eaten', u'by', u'him']]

In [66]:
def lemmatize(token, tag):
    # lemmatize nouns and verbs using the WordNet POS derived from the Penn Treebank tag
    if tag[0].lower() in ['n', 'v']:
        return lemmatizer.lemmatize(token, tag[0].lower())
    return token

In [55]:
lemmatizer = WordNetLemmatizer()
tagged_corpus = [pos_tag(word_tokenize(document)) for document in corpus]

In [56]:
print tagged_corpus


[[('He', 'PRP'), ('ate', 'VBD'), ('the', 'DT'), ('sandwiches', 'NNS')], [('Every', 'DT'), ('sandwich', 'NN'), ('was', 'VBD'), ('eaten', 'VBN'), ('by', 'IN'), ('him', 'PRP')]]

In [57]:
print [word_tokenize(document) for document in corpus]


[['He', 'ate', 'the', 'sandwiches'], ['Every', 'sandwich', 'was', 'eaten', 'by', 'him']]

In [48]:
pos_tag(['He'])


Out[48]:
[('He', 'PRP')]

In [49]:
pos_tag(['Sandwiches'])


Out[49]:
[('Sandwiches', 'NNS')]

In [67]:
print "lemmatized:", [[lemmatize(token,tag) for token, tag in document] for document in tagged_corpus]


lemmatized: [['He', u'eat', 'the', u'sandwich'], ['Every', 'sandwich', u'be', u'eat', 'by', 'him']]

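The tokenize, tag, and lemmatize steps can be folded into a single helper for reuse; a minimal sketch (the name lemmatize_document is just illustrative), assuming the lemmatize function and NLTK imports defined above:

In [ ]:
def lemmatize_document(document):
    # tokenize, POS-tag, then lemmatize nouns and verbs in one pass
    return [lemmatize(token, tag) for token, tag in pos_tag(word_tokenize(document))]

print [lemmatize_document(document) for document in corpus]
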
In [69]:
lemmatizer.lemmatize('ate',"VBD"[0].lower())


Out[69]:
u'eat'

In [70]:
"VBD"[0].lower()


Out[70]:
'v'

In [71]:
document


Out[71]:
[('Every', 'DT'),
 ('sandwich', 'NN'),
 ('was', 'VBD'),
 ('eaten', 'VBN'),
 ('by', 'IN'),
 ('him', 'PRP')]

In [80]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['The dog ate a sandwich, the wizard transfigured a sandwich, and I ate a sandwich']
vectorizer = CountVectorizer(stop_words='english')
print vectorizer.fit_transform(corpus).todense()
print vectorizer.vocabulary_


[[2 1 3 1 1]]
{u'sandwich': 2, u'wizard': 4, u'dog': 1, u'transfigured': 3, u'ate': 0}

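Raw counts grow with document length, so a count alone says little about how important a term is within the document. A small sketch of plain term frequency (counts divided by the document's total kept terms), assuming numpy, before the tf-idf cell below handles weighting and normalization together:

In [ ]:
import numpy as np
# term frequency: each count divided by the total number of terms kept in the document
term_counts = np.asarray(vectorizer.fit_transform(corpus).todense(), dtype=float)
print term_counts / term_counts.sum(axis=1)[:, np.newaxis]
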
In [83]:
#tf-idf weighting
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'The dog ate a sandwich and I ate a sandwich',
    'The wizard transfigured a sandwich'
]
vectorizer = TfidfVectorizer(stop_words='english')
print vectorizer.fit_transform(corpus).todense()


[[ 0.75458397  0.37729199  0.53689271  0.          0.        ]
 [ 0.          0.          0.44943642  0.6316672   0.6316672 ]]

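'sandwich' appears in both documents, so its inverse document frequency is lower than that of the terms unique to one document, which is why its column gets a smaller weight per occurrence. A quick look at the learned idf values of the fitted TfidfVectorizer:

In [ ]:
# idf_ holds the learned inverse document frequency for each vocabulary term
print vectorizer.get_feature_names()
print vectorizer.idf_
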
In [84]:
#hashing trick
from sklearn.feature_extraction.text import HashingVectorizer
corpus = ['the', 'ate', 'bacon', 'cat']
vectorizer = HashingVectorizer(n_features=6)
print vectorizer.transform(corpus).todense()


[[-1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0.  0.]]

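Because the hashing vectorizer keeps no fitted vocabulary, transform() can be applied directly to unseen documents and the result always has the same n_features columns. A small sketch with a made-up new sentence:

In [ ]:
# no fit() needed; unseen tokens are hashed into the same 6 columns
print vectorizer.transform(['an unseen document about bacon']).todense()
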
In [85]:
#Feature Extraction from Images
from sklearn import datasets
digits = datasets.load_digits()

In [86]:
print 'Digit:', digits.target[0]


Digit: 0

In [93]:
print digits.images[0]
print digits.images[0].shape


[[  0.   0.   5.  13.   9.   1.   0.   0.]
 [  0.   0.  13.  15.  10.  15.   5.   0.]
 [  0.   3.  15.   2.   0.  11.   8.   0.]
 [  0.   4.  12.   0.   0.   8.   8.   0.]
 [  0.   5.   8.   0.   0.   9.   8.   0.]
 [  0.   4.  11.   0.   1.  12.   7.   0.]
 [  0.   2.  14.   5.  10.  12.   0.   0.]
 [  0.   0.   6.  13.  10.   0.   0.   0.]]
(8, 8)

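The 8x8 matrix is just a grayscale image, and with the %matplotlib inline magic from the first cell it can be rendered directly. A minimal sketch:

In [ ]:
import matplotlib.pyplot as plt
# render the 8x8 pixel intensities as a grayscale image
plt.imshow(digits.images[0], cmap=plt.cm.gray_r, interpolation='nearest')
plt.title('Digit: %s' % digits.target[0])
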
In [99]:
digits.images[0].reshape(-1,64).shape


Out[99]:
(1, 64)

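Reshaping flattens the 8x8 grid into a 64-dimensional feature vector suitable for an estimator. scikit-learn already exposes the same flattened representation as digits.data; a quick sketch comparing the two, assuming numpy:

In [ ]:
import numpy as np
# digits.data stores each image already flattened to 64 features
print digits.data[0].shape
print np.array_equal(digits.images[0].reshape(-1, 64)[0], digits.data[0])
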
In [ ]: