In [1]:
%matplotlib inline
In [6]:
from sklearn.feature_extraction import DictVectorizer
onehot_encoder = DictVectorizer()
instances = [
    {'city': 'NY'},
    {'city': 'SF'},
    {'city': 'AZ'}
]
print onehot_encoder.fit_transform(instances).toarray()
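The columns of the one-hot encoding follow the encoder's sorted feature_names_. A minimal sketch of how to inspect that mapping and reuse the fitted encoder on a new instance (a city not seen during fitting simply maps to an all-zero row):

print onehot_encoder.feature_names_
# encode a new instance with the already-fitted encoder
print onehot_encoder.transform([{'city': 'SF'}]).toarray()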
In [14]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game',
    'I ate a sandwich'
]
In [15]:
vectorizer = CountVectorizer()
print vectorizer.fit_transform(corpus).todense()
In [16]:
print vectorizer.vocabulary_
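A fitted CountVectorizer can transform new documents using the vocabulary it learned above; words outside that vocabulary are silently ignored. A quick sketch with a made-up sentence:

print vectorizer.transform(['Duke played basketball with a wizard']).todense()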
In [17]:
from sklearn.metrics.pairwise import euclidean_distances
In [18]:
counts = vectorizer.fit_transform(corpus).todense()
In [19]:
counts
Out[19]:
In [21]:
print 'distance between 1st and 2nd doc', euclidean_distances(counts[0], counts[1])
print 'distance between 1st and 3rd doc', euclidean_distances(counts[0], counts[2])
print 'distance between 2nd and 3rd doc', euclidean_distances(counts[1], counts[2])
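euclidean_distances also accepts the whole count matrix at once and returns the full pairwise distance matrix, which avoids the three separate calls above. A minimal sketch using the same counts:

# entry (i, j) is the distance between document i and document j
print euclidean_distances(counts)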
In [22]:
#stopword removal
vectorizer = CountVectorizer(stop_words='english')
print vectorizer.fit_transform(corpus).todense()
print vectorizer.vocabulary_
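The string 'english' selects scikit-learn's built-in English stop word list; a custom list can be supplied instead. A small sketch with a hand-picked list (custom_vectorizer is just an illustrative name):

custom_vectorizer = CountVectorizer(stop_words=['the', 'in', 'a', 'i'])
print custom_vectorizer.fit_transform(corpus).todense()
print custom_vectorizer.vocabulary_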
In [76]:
#stemming
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'He ate the sandwiches',
    'Every sandwich was eaten by him'
]
In [77]:
vectorizer = CountVectorizer(binary=True, stop_words='english')
print vectorizer.fit_transform(corpus).todense()
print vectorizer.vocabulary_
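binary=True records only whether a term occurs, not how many times. A quick sketch contrasting the two settings on an invented document with a repeated word:

repeated = ['He ate the sandwiches and the sandwiches were good']
print CountVectorizer(stop_words='english').fit_transform(repeated).todense()
print CountVectorizer(binary=True, stop_words='english').fit_transform(repeated).todense()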
In [25]:
#lemmatization using NLTK
corpus = [
    'I am gathering ingredients for the sandwich.',
    'There were many wizards at the gathering.'
]
In [27]:
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print lemmatizer.lemmatize('gathering', 'v')
print lemmatizer.lemmatize('gathering', 'n')
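The second argument is a part-of-speech hint; without it the lemmatizer assumes a noun, so verb forms are left unchanged. A quick sketch:

print lemmatizer.lemmatize('gathering')   # defaults to noun
print lemmatizer.lemmatize('ate')         # treated as a noun, unchanged
print lemmatizer.lemmatize('ate', 'v')    # recognized as a form of the verb 'eat'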
In [28]:
#stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print stemmer.stem('gathering')
In [29]:
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
In [30]:
wordnet_tags = ['v', 'n']
In [36]:
corpus = [
    'He ate the sandwiches',
    'Every sandwich was eaten by him'
]
In [38]:
stemmer = PorterStemmer()
print 'Stemmed: ', [[stemmer.stem(token) for token in word_tokenize(document)] for document in corpus]
In [66]:
def lemmatize(token, tag):
    # lemmatize only nouns and verbs; return other tokens unchanged
    if tag[0].lower() in ['n', 'v']:
        return lemmatizer.lemmatize(token, tag[0].lower())
    return token
In [55]:
lemmatizer = WordNetLemmatizer()
tagged_corpus = [pos_tag(word_tokenize(document)) for document in corpus]
In [56]:
print tagged_corpus
In [57]:
print [word_tokenize(document) for document in corpus]
In [48]:
pos_tag(['He'])
Out[48]:
In [49]:
pos_tag(['Sandwiches'])
Out[49]:
In [67]:
print "lemmatized:", [[lemmatize(token,tag) for token, tag in document] for document in tagged_corpus]
In [69]:
lemmatizer.lemmatize('ate',"VBD"[0].lower())
Out[69]:
In [70]:
"VBD"[0].lower()
Out[70]:
In [71]:
document
Out[71]:
In [80]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['The dog ate a sandwich, the wizard transfigured a sandwich, and I ate a sandwich']
vectorizer = CountVectorizer(stop_words='english')
print vectorizer.fit_transform(corpus).todense()
print vectorizer.vocabulary_
In [83]:
# tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'The dog ate a sandwich and I ate a sandwich',
    'The wizard transfigured a sandwich'
]
vectorizer = TfidfVectorizer(stop_words='english')
print vectorizer.fit_transform(corpus).todense()
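vocabulary_ maps the tf-idf columns back to terms, and TfidfVectorizer L2-normalizes each row by default; passing norm=None shows the unnormalized weights. A short sketch (raw_vectorizer is just an illustrative name):

print vectorizer.vocabulary_
raw_vectorizer = TfidfVectorizer(stop_words='english', norm=None)
print raw_vectorizer.fit_transform(corpus).todense()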
In [84]:
# the hashing trick
from sklearn.feature_extraction.text import HashingVectorizer
corpus = ['the', 'ate', 'bacon', 'cat']
vectorizer = HashingVectorizer(n_features=6)
print vectorizer.transform(corpus).todense()
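Because the hashing trick stores no vocabulary, the vectorizer needs no fitting and its memory use does not grow with the number of distinct words; the trade-off is that columns cannot be mapped back to terms, collisions are possible, and signed hashing can produce negative entries. A small sketch with made-up documents:

print vectorizer.transform(['the cat ate bacon', 'a completely unseen word']).todense()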
In [85]:
#Feature Extraction from Images
from sklearn import datasets
digits = datasets.load_digits()
In [86]:
print 'Digit:', digits.target[0]
In [93]:
print digits.images[0]
print digits.images[0].shape
In [99]:
digits.images[0].reshape(-1,64).shape
Out[99]:
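Reshaping every 8x8 image into a 64-dimensional row vector yields a feature matrix that scikit-learn estimators can use directly; load_digits also ships this flattened view as digits.data. A quick sketch to confirm the two representations agree:

X = digits.images.reshape((len(digits.images), -1))
print X.shape
print (X == digits.data).all()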