In [1]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tnrange, tqdm_notebook
from math import *
import csv
%matplotlib inline
In [2]:
from nltk.book import *
In [18]:
text = "Machine learning is the science of getting computers to act without being explicitly programmed. In the past decade, machine learning has given us self-driving cars, practical speech recognition, effective web search, and a vastly improved understanding of the human genome. Machine learning is so pervasive today that you probably use it dozens of times a day without knowing it. Many researchers also think it is the best way to make progress towards human-level AI. In this class, you will learn about the most effective machine learning techniques, and gain practice implementing them and getting them to work for yourself. More importantly, you'll learn about not only the theoretical underpinnings of learning, but also gain the practical know-how needed to quickly and powerfully apply these techniques to new problems. Finally, you'll learn about some of Silicon Valley's best practices in innovation as it pertains to machine learning and AI."
In [19]:
# extract sentences
sents = nltk.sent_tokenize(text)
#extract words
tokens = nltk.word_tokenize(text)
# tagged tokens
tagged_tokens = nltk.pos_tag(tokens)
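As a quick check (a sketch, not part of the original run), the tag distribution can be inspected with nltk.FreqDist:
In [ ]:
# count how often each POS tag occurs in the tagged tokens
tag_fd = nltk.FreqDist(tag for (word, tag) in tagged_tokens)
tag_fd.most_common(5)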
In [24]:
# Sentences in another language:
tokenizer = nltk.data.load('tokenizers/punkt/italian.pickle')
italiantext = 'Come ormai tutti sanno, ogni anno, in gran parte del mondo, si celebra il Giorno della Memoria (27 gennaio), in memoria dei terribili crimini contro l’umanità perpetrati dai Nazisti prima e durante la Seconda Guerra Mondiale. Milioni di ebrei furono deportati nei campi di concentramento e sterminio; i più fortunati riuscirono a nascondersi o fuggire prima.'
tokenizer.tokenize(italiantext)
Out[24]:
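Equivalently (a sketch, assuming the punkt data for Italian is installed), the high-level API accepts a language argument, so the pickle does not have to be loaded by hand:
In [ ]:
# same result via the high-level sentence tokenizer
nltk.sent_tokenize(italiantext, language='italian')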
In [28]:
# POS tagging
nltk.help.upenn_tagset('RB')
nltk.help.upenn_tagset('NN.*')
In [ ]:
# There is more than one POS tagger: the default is the maxent treebank POS tagger,
# but NLTK also provides CRF, HMM, Brill, and TnT taggers.
# WHAT ARE THE DIFFERENCES??
# Training a tagger:
from nltk.corpus import treebank
train_data = treebank.tagged_sents()[:3000]
test_data = treebank.tagged_sents()[3000:]
from nltk.tag import tnt
tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data)
tnt_pos_tagger.evaluate(test_data)
import pickle
f = open('tnt_treebank_pos_tagger.pickle', 'wb')  # pickle needs a binary-mode file
pickle.dump(tnt_pos_tagger, f)
f.close()
tnt_pos_tagger.tag(nltk.word_tokenize("this is a tnt treebank tnt tagger"))
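A minimal sketch of reloading the pickled tagger from disk (file name as above):
In [ ]:
# restore the trained TnT tagger and reuse it
with open('tnt_treebank_pos_tagger.pickle', 'rb') as f:
    tnt_pos_tagger = pickle.load(f)
tnt_pos_tagger.tag(nltk.word_tokenize('this is a tnt treebank tnt tagger'))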
In [33]:
from nltk.stem.porter import PorterStemmer
# https://tartarus.org/martin/PorterStemmer/
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer
# http://snowball.tartarus.org/
In [34]:
porter_stemmer = PorterStemmer()
porter_stemmer.stem('saying')
lancaster_stemmer = LancasterStemmer()
lancaster_stemmer.stem('saying')
snowball_stemmer = SnowballStemmer('english')
snowball_stemmer.stem('saying')
Out[34]:
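To see how aggressively the three stemmers differ, a small side-by-side comparison (a sketch; the words are chosen arbitrarily):
In [ ]:
# compare Porter, Lancaster and Snowball on a few words
for w in ['saying', 'maximum', 'presumably', 'multiply']:
    print(w, porter_stemmer.stem(w), lancaster_stemmer.stem(w), snowball_stemmer.stem(w))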
In [41]:
snowball_stemmer_it = SnowballStemmer('italian')
snowball_stemmer_it.stem('parlando')
Out[41]:
In [ ]:
# http://wordnet.princeton.edu/
In [42]:
from nltk.stem import WordNetLemmatizer
In [43]:
wordnet_lemmatizer = WordNetLemmatizer()
In [44]:
wordnet_lemmatizer.lemmatize('are')
Out[44]:
In [47]:
wordnet_lemmatizer.lemmatize('is', pos='v')
# so it is important to run POS tagging before lemmatization
Out[47]:
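A minimal sketch of chaining the two steps; the helper penn_to_wn below is my own assumption (not an NLTK function) mapping Penn Treebank tags onto WordNet POS codes:
In [ ]:
from nltk.corpus import wordnet

def penn_to_wn(tag):
    # map Penn Treebank tags onto WordNet POS codes; fall back to noun
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

[(w, wordnet_lemmatizer.lemmatize(w, pos=penn_to_wn(t)))
 for (w, t) in nltk.pos_tag(nltk.word_tokenize('the cats are running'))]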
In [49]:
from nltk.tag.stanford import StanfordPOSTagger
In [50]:
english_postagger = StanfordPOSTagger('C:\\stanford-postagger-full-2014-08-27\\models\\english-bidirectional-distsim.tagger',
'C:\\stanford-postagger-full-2014-08-27\\stanford-postagger.jar')
In [52]:
english_postagger.tag('this is stanford postagger in nltk for python users'.split())
Out[52]:
In [1]:
from nltk.tag.stanford import StanfordNERTagger
In [2]:
english_nertagger = StanfordNERTagger('C:\\stanford-ner-2014-08-27\\classifiers\\english.all.3class.distsim.crf.ser.gz',
'C:\\stanford-ner-2014-08-27\\stanford-ner.jar')
In [ ]:
english_nertagger.tag('Pincopallino is working at StarWars in Montevarchi'.split())
In [78]:
from nltk.parse.stanford import StanfordParser
In [88]:
english_parser = StanfordParser('C:\\stanford-parser-full-2014-08-27\\stanford-parser.jar',
'C:\\stanford-parser-full-2014-08-27\\stanford-parser-3.4.1-models.jar')
In [89]:
# parse_sents expects a list of tokenized sentences; raw_parse handles a single raw string
analisi = english_parser.raw_parse('Francesco is a good guy')
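raw_parse returns an iterator over candidate parse trees; a sketch of inspecting the result:
In [ ]:
# print each parse tree returned for the sentence
for tree in analisi:
    print(tree)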
In [19]:
import nltk
from nltk.corpus import names
import random
import collections
names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)
In [216]:
def gender_features(word):
    word = word.lower()
    most_freq_char = collections.Counter(word).most_common(1)[0]
    repeated_char = [word[i] for i in range(0, len(word) - 1) if word[i + 1] == word[i]]
    VOWELS = 'aeiou'
    return {
        'last_letter': word[-1],
        # 'last_two_letters': word[-2:],
        # 'first_letters': word[0],
        # 'number_of_k': len([i for i in word if i == 'k']),
        # 'number_of_e': len([i for i in word if i == 'e']),
        # 'most_freq_char': most_freq_char[0] if most_freq_char[1] > 1 else '',
        'repeated_char': '' if len(repeated_char) == 0 else repeated_char[-1],
        # 'start_vowels': word[0] in VOWELS,
        # 'end_vowels': word[-1] in VOWELS,
        # 'num_vowels': len([i for i in word if i in VOWELS]),
        # 'num_non_vowels': len([i for i in word if i not in VOWELS])
    }
featuresets = [(gender_features(n), g) for (n, g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
In [217]:
# Naive Bayes Classifier
nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(nb_classifier, test_set))
nb_classifier.show_most_informative_features(20)
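As in the NLTK book, the trained classifier can label unseen names (a quick sketch; the example names are arbitrary):
In [ ]:
# classify names the model has never seen
print(nb_classifier.classify(gender_features('Neo')))
print(nb_classifier.classify(gender_features('Trinity')))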
In [218]:
# Maxent Classifier
me_classifier = nltk.MaxentClassifier.train(train_set)
print(nltk.classify.accuracy(me_classifier, test_set))
me_classifier.show_most_informative_features(5)
In [99]:
def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features
In [100]:
featuresets = [(gender_features2(n), g) for (n, g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
nb2_classifier = nltk.NaiveBayesClassifier.train(train_set)
me2_classifier = nltk.MaxentClassifier.train(train_set)
In [103]:
print(nltk.classify.accuracy(nb2_classifier, test_set))
print(nltk.classify.accuracy(me2_classifier, test_set))
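A sketch of a simple error analysis on the held-out data (the first 500 shuffled names correspond to test_set), useful for guiding feature engineering:
In [ ]:
# list the test names the Naive Bayes model gets wrong
errors = []
for (name, gender) in names[:500]:
    guess = nb2_classifier.classify(gender_features2(name))
    if guess != gender:
        errors.append((gender, guess, name))
print(len(errors))
errors[:10]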