In [1]:
%matplotlib inline

In [6]:
from sklearn.feature_extraction import DictVectorizer
# one-hot encode the categorical 'city' feature
onehot_encoder = DictVectorizer()

instances = [
    {'city': 'NY'},
    {'city': 'SF'},
    {'city': 'AZ'}]
print onehot_encoder.fit_transform(instances).toarray()


[[ 0.  1.  0.]
 [ 0.  0.  1.]
 [ 1.  0.  0.]]

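The column order of the one-hot matrix follows the vectorizer's sorted feature names, which is why 'NY' maps to the second column. A quick check of that mapping, reusing the fitted onehot_encoder from the cell above (get_feature_names is assumed available on DictVectorizer here):

In [ ]:
# feature names are sorted after fitting, so the columns are city=AZ, city=NY, city=SF
print onehot_encoder.get_feature_names()
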
In [14]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['UNC played Duke in basketball',
          'Duke lost the basketball game',
          'I ate a sandwich']

In [15]:
# bag-of-words: one column per vocabulary term, values are raw counts
vectorizer = CountVectorizer()
print vectorizer.fit_transform(corpus).todense()


[[0 1 1 0 1 0 1 0 0 1]
 [0 1 1 1 0 1 0 0 1 0]
 [1 0 0 0 0 0 0 1 0 0]]

In [16]:
print vectorizer.vocabulary_


{u'duke': 2, u'basketball': 1, u'lost': 5, u'played': 6, u'in': 4, u'game': 3, u'sandwich': 7, u'unc': 9, u'ate': 0, u'the': 8}

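The dictionary maps each term to its column index in the count matrix above. A small sketch that lists the same vocabulary in column order, using the fitted vectorizer:

In [ ]:
# terms in column order: index 0 is 'ate', index 9 is 'unc'
print vectorizer.get_feature_names()
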
In [17]:
from sklearn.metrics.pairwise import euclidean_distances

In [18]:
counts = vectorizer.fit_transform(corpus).todense()

In [19]:
counts


Out[19]:
matrix([[0, 1, 1, 0, 1, 0, 1, 0, 0, 1],
        [0, 1, 1, 1, 0, 1, 0, 0, 1, 0],
        [1, 0, 0, 0, 0, 0, 0, 1, 0, 0]])

In [21]:
print 'distance between 1st and 2nd doc', euclidean_distances(counts[0], counts[1])
print 'distance between 1st and 3rd doc', euclidean_distances(counts[0], counts[2])
print 'distance between 2nd and 3rd doc', euclidean_distances(counts[1], counts[2])


distance between 1st and 2nd doc [[ 2.44948974]]
distance between 1st and 3rd doc [[ 2.64575131]]
distance between 2nd and 3rd doc [[ 2.64575131]]

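The first distance can be re-derived directly from the count vectors to make the calculation concrete; the first two documents share only 'duke' and 'basketball' and differ in six positions, so the distance is sqrt(6). A minimal sketch, assuming numpy is available:

In [ ]:
import numpy as np
# Euclidean distance is the L2 norm of the difference between the two count vectors
diff = np.asarray(counts[0]) - np.asarray(counts[1])
print np.sqrt((diff ** 2).sum())
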
In [22]:
#stopword removal
vectorizer = CountVectorizer(stop_words='english')
print vectorizer.fit_transform(corpus).todense()
print vectorizer.vocabulary_


[[0 1 1 0 0 1 0 1]
 [0 1 1 1 1 0 0 0]
 [1 0 0 0 0 0 1 0]]
{u'duke': 2, u'basketball': 1, u'lost': 4, u'played': 5, u'game': 3, u'sandwich': 6, u'unc': 7, u'ate': 0}

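'the' and 'in' disappear from the vocabulary because they are on scikit-learn's built-in English stop word list. A quick check against the list used by the fitted vectorizer:

In [ ]:
# get_stop_words() returns the frozenset of English stop words in effect
stop_words = vectorizer.get_stop_words()
print 'the' in stop_words, 'in' in stop_words, 'basketball' in stop_words
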
In [76]:
#stemming and lemmatization: without them, 'sandwich' and 'sandwiches' become separate features
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'He ate the sandwiches',
    'Every sandwich was eaten by him' ]

In [77]:
vectorizer = CountVectorizer(binary=True, stop_words='english')
print vectorizer.fit_transform(corpus).todense()
print vectorizer.vocabulary_


[[1 0 0 1]
 [0 1 1 0]]
{u'sandwich': 2, u'ate': 0, u'sandwiches': 3, u'eaten': 1}

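binary=True records only whether a term occurs, not how many times. A small sketch with a made-up sentence containing a repeated word to contrast the two settings:

In [ ]:
# 'sandwich' occurs twice: the binary vectorizer still records 1, the counting one records 2
repeated = ['The sandwich was a good sandwich']
binary_vectorizer = CountVectorizer(binary=True, stop_words='english')
counting_vectorizer = CountVectorizer(binary=False, stop_words='english')
print binary_vectorizer.fit_transform(repeated).todense()
print counting_vectorizer.fit_transform(repeated).todense()
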
In [25]:
#lemmatization using NLTK
corpus = [
    'I am gathering ingredients for the sandwich.',
    'There were many wizards at the gathering.' ]

In [27]:
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print lemmatizer.lemmatize('gathering', 'v')
print lemmatizer.lemmatize('gathering', 'n')


gather
gathering

In [28]:
#stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print stemmer.stem('gathering')


gather

In [29]:
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag

In [30]:
wordnet_tags = ['v', 'n']

In [36]:
corpus = [
    'He ate the sandwiches',
    'Every sandwich was eaten by him'
]

In [38]:
stemmer = PorterStemmer()
print 'Stemmed: ', [[stemmer.stem(token) for token in word_tokenize(document)] for document in corpus]


Stemmed:  [[u'He', u'ate', u'the', u'sandwich'], [u'Everi', u'sandwich', u'wa', u'eaten', u'by', u'him']]

In [66]:
def lemmatize(token, tag):
    # lemmatize nouns and verbs using the WordNet POS derived from the Penn Treebank tag
    if tag[0].lower() in ['n', 'v']:
        return lemmatizer.lemmatize(token, tag[0].lower())
    return token

In [55]:
lemmatizer = WordNetLemmatizer()
tagged_corpus = [pos_tag(word_tokenize(document)) for document in corpus]

In [56]:
print tagged_corpus


[[('He', 'PRP'), ('ate', 'VBD'), ('the', 'DT'), ('sandwiches', 'NNS')], [('Every', 'DT'), ('sandwich', 'NN'), ('was', 'VBD'), ('eaten', 'VBN'), ('by', 'IN'), ('him', 'PRP')]]

In [57]:
print [word_tokenize(document) for document in corpus]


[['He', 'ate', 'the', 'sandwiches'], ['Every', 'sandwich', 'was', 'eaten', 'by', 'him']]

In [48]:
pos_tag(['He'])


Out[48]:
[('He', 'PRP')]

In [49]:
pos_tag(['Sandwiches'])


Out[49]:
[('Sandwiches', 'NNS')]

In [67]:
print "lemmatized:", [[lemmatize(token,tag) for token, tag in document] for document in tagged_corpus]


lemmatized: [['He', u'eat', 'the', u'sandwich'], ['Every', 'sandwich', u'be', u'eat', 'by', 'him']]

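The tokenize, tag, and lemmatize steps can be folded into a single helper for reuse; a minimal sketch (the name lemmatize_document is just illustrative), assuming the lemmatize function and NLTK imports defined above:

In [ ]:
def lemmatize_document(document):
    # tokenize, POS-tag, then lemmatize nouns and verbs in one pass
    return [lemmatize(token, tag) for token, tag in pos_tag(word_tokenize(document))]

print [lemmatize_document(document) for document in corpus]
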
In [69]:
lemmatizer.lemmatize('ate',"VBD"[0].lower())


Out[69]:
u'eat'

In [70]:
"VBD"[0].lower()


Out[70]:
'v'

In [71]:
document


Out[71]:
[('Every', 'DT'),
 ('sandwich', 'NN'),
 ('was', 'VBD'),
 ('eaten', 'VBN'),
 ('by', 'IN'),
 ('him', 'PRP')]

In [80]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['The dog ate a sandwich, the wizard transfigured a sandwich, and I ate a sandwich']
vectorizer = CountVectorizer(stop_words='english')
print vectorizer.fit_transform(corpus).todense()
print vectorizer.vocabulary_


[[2 1 3 1 1]]
{u'sandwich': 2, u'wizard': 4, u'dog': 1, u'transfigured': 3, u'ate': 0}

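Raw counts grow with document length, so a count alone says little about how important a term is within the document. A small sketch of plain term frequency (counts divided by the document's total kept terms), assuming numpy, before the tf-idf cell below handles weighting and normalization together:

In [ ]:
import numpy as np
# term frequency: each count divided by the total number of terms kept in the document
term_counts = np.asarray(vectorizer.fit_transform(corpus).todense(), dtype=float)
print term_counts / term_counts.sum(axis=1)[:, np.newaxis]
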
In [83]:
#tf-idf weighting
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'The dog ate a sandwich and I ate a sandwich',
    'The wizard transfigured a sandwich'
]
vectorizer = TfidfVectorizer(stop_words='english')
print vectorizer.fit_transform(corpus).todense()


[[ 0.75458397  0.37729199  0.53689271  0.          0.        ]
 [ 0.          0.          0.44943642  0.6316672   0.6316672 ]]

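'sandwich' appears in both documents, so its inverse document frequency is lower than that of the terms unique to one document, which is why its column gets a smaller weight per occurrence. A quick look at the learned idf values of the fitted TfidfVectorizer:

In [ ]:
# idf_ holds the learned inverse document frequency for each vocabulary term
print vectorizer.get_feature_names()
print vectorizer.idf_
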
In [84]:
#hashing trick
from sklearn.feature_extraction.text import HashingVectorizer
corpus = ['the', 'ate', 'bacon', 'cat']
vectorizer = HashingVectorizer(n_features=6)
print vectorizer.transform(corpus).todense()


[[-1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0.  0.]]

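Because the hashing vectorizer keeps no fitted vocabulary, transform() can be applied directly to unseen documents and the result always has the same n_features columns. A small sketch with a made-up new sentence:

In [ ]:
# no fit() needed; unseen tokens are hashed into the same 6 columns
print vectorizer.transform(['an unseen document about bacon']).todense()
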
In [85]:
#Feature Extraction from Images
from sklearn import datasets
digits = datasets.load_digits()

In [86]:
print 'Digit:', digits.target[0]


Digit: 0

In [93]:
print digits.images[0]
print digits.images[0].shape


[[  0.   0.   5.  13.   9.   1.   0.   0.]
 [  0.   0.  13.  15.  10.  15.   5.   0.]
 [  0.   3.  15.   2.   0.  11.   8.   0.]
 [  0.   4.  12.   0.   0.   8.   8.   0.]
 [  0.   5.   8.   0.   0.   9.   8.   0.]
 [  0.   4.  11.   0.   1.  12.   7.   0.]
 [  0.   2.  14.   5.  10.  12.   0.   0.]
 [  0.   0.   6.  13.  10.   0.   0.   0.]]
(8, 8)

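The 8x8 matrix is just a grayscale image, and with the %matplotlib inline magic from the first cell it can be rendered directly. A minimal sketch:

In [ ]:
import matplotlib.pyplot as plt
# render the 8x8 pixel intensities as a grayscale image
plt.imshow(digits.images[0], cmap=plt.cm.gray_r, interpolation='nearest')
plt.title('Digit: %s' % digits.target[0])
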
In [99]:
digits.images[0].reshape(-1,64).shape


Out[99]:
(1, 64)

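Reshaping flattens the 8x8 grid into a 64-dimensional feature vector suitable for an estimator. scikit-learn already exposes the same flattened representation as digits.data; a quick sketch comparing the two, assuming numpy:

In [ ]:
import numpy as np
# digits.data stores each image already flattened to 64 features
print digits.data[0].shape
print np.array_equal(digits.images[0].reshape(-1, 64)[0], digits.data[0])
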
In [ ]: