In [1]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tnrange, tqdm_notebook
from math import *
import csv
%matplotlib inline

In [2]:
from nltk.book import *


*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908

In [18]:
text = "Machine learning is the science of getting computers to act without being explicitly programmed. In the past decade, machine learning has given us self-driving cars, practical speech recognition, effective web search, and a vastly improved understanding of the human genome. Machine learning is so pervasive today that you probably use it dozens of times a day without knowing it. Many researchers also think it is the best way to make progress towards human-level AI. In this class, you will learn about the most effective machine learning techniques, and gain practice implementing them and getting them to work for yourself. More importantly, you'll learn about not only the theoretical underpinnings of learning, but also gain the practical know-how needed to quickly and powerfully apply these techniques to new problems. Finally, you'll learn about some of Silicon Valley's best practices in innovation as it pertains to machine learning and AI."

Tokenization: sentences and words


In [19]:
# extract sentences
sents = nltk.sent_tokenize(text)

# extract words
tokens = nltk.word_tokenize(text)

# POS-tag the tokens
tagged_tokens = nltk.pos_tag(tokens)
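
A quick look at what these calls produce (a minimal check; the exact counts depend on the tokenizer version):

In [ ]:
# inspect the tokenization and tagging results
print(len(sents), 'sentences,', len(tokens), 'tokens')
print(sents[0])
print(tagged_tokens[:5])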

In [24]:
# Sentences in another language:
tokenizer = nltk.data.load('tokenizers/punkt/italian.pickle')

italiantext = 'Come ormai tutti sanno, ogni anno, in gran parte del mondo, si celebra il Giorno della Memoria (27 gennaio), in memoria dei terribili crimini contro l’umanità perpetrati dai Nazisti prima e durante la Seconda Guerra Mondiale. Milioni di ebrei furono deportati nei campi di concentramento e sterminio; i più fortunati riuscirono a nascondersi o fuggire prima.'

tokenizer.tokenize(italiantext)


Out[24]:
['Come ormai tutti sanno, ogni anno, in gran parte del mondo, si celebra il Giorno della Memoria (27 gennaio), in memoria dei terribili crimini contro l’umanità perpetrati dai Nazisti prima e durante la Seconda Guerra Mondiale.',
 'Milioni di ebrei furono deportati nei campi di concentramento e sterminio; i più fortunati riuscirono a nascondersi o fuggire prima.']
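
Alternatively, sent_tokenize accepts a language argument and loads the same punkt model under the hood; a minimal equivalent of the explicit load above:

In [ ]:
# equivalent to loading the punkt pickle explicitly
nltk.sent_tokenize(italiantext, language='italian')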

POS Tagging


In [28]:
# POS tagging
nltk.help.upenn_tagset('RB')
nltk.help.upenn_tagset('NN.*')


RB: adverb
    occasionally unabatingly maddeningly adventurously professedly
    stirringly prominently technologically magisterially predominately
    swiftly fiscally pitilessly ...
NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...
NNPS: noun, proper, plural
    Americans Americas Amharas Amityvilles Amusements Anarcho-Syndicalists
    Andalusians Andes Andruses Angels Animals Anthony Antilles Antiques
    Apache Apaches Apocrypha ...
NNS: noun, common, plural
    undergraduates scotches bric-a-brac products bodyguards facets coasts
    divestitures storehouses designs clubs fragrances averages
    subjectivists apprehensions muses factory-jobs ...
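
With the tagset documented, it is worth checking which tags dominate the sample text tagged earlier; a small sketch using the tagged_tokens computed above:

In [ ]:
from collections import Counter

# distribution of POS tags in the sample text
Counter(tag for (word, tag) in tagged_tokens).most_common(5)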

In [ ]:
# There is more than one POS tagger: the default is the maxent treebank
# POS tagger, but NLTK also ships CRF, HMM, Brill, and TnT taggers.

# What are the differences between them?

# Training a tagger:
from nltk.corpus import treebank
train_data = treebank.tagged_sents()[:3000]
test_data = treebank.tagged_sents()[3000:]

from nltk.tag import tnt
tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data)
tnt_pos_tagger.evaluate(test_data)

# persist the trained tagger to disk
import pickle
f = open('tnt_treebank_pos_tagger.pickle', 'wb')
pickle.dump(tnt_pos_tagger, f)
f.close()

tnt_pos_tagger.tag(nltk.word_tokenize('this is a tnt treebank tnt tagger'))
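
Once pickled, the trained tagger can be restored in a later session without retraining; a minimal sketch, assuming the file written above exists:

In [ ]:
import pickle

# reload the TnT tagger trained and saved above
with open('tnt_treebank_pos_tagger.pickle', 'rb') as f:
    reloaded_tagger = pickle.load(f)

reloaded_tagger.tag(nltk.word_tokenize('the tagger survives a restart'))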

Stemming:

reduction of a word to its root or base form


In [33]:
from nltk.stem.porter import PorterStemmer
# https://tartarus.org/martin/PorterStemmer/
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer
# http://snowball.tartarus.org/

In [34]:
porter_stemmer = PorterStemmer()
porter_stemmer.stem('saying')

lancaster_stemmer = LancasterStemmer()
lancaster_stemmer.stem('saying')

snowball_stemmer = SnowballStemmer('english')
snowball_stemmer.stem('saying')


Out[34]:
'say'
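
The three stemmers do not always agree: Lancaster is the most aggressive, Porter the most conservative, with Snowball in between. A quick side-by-side comparison (word list chosen here purely for illustration):

In [ ]:
# compare the three stemmers on a few words
for word in ['saying', 'maximum', 'presumably', 'crying']:
    print(word,
          porter_stemmer.stem(word),
          lancaster_stemmer.stem(word),
          snowball_stemmer.stem(word))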

In [41]:
snowball_stemmer_it = SnowballStemmer('italian')
snowball_stemmer_it.stem('parlando')


Out[41]:
'parl'

Lemmatization:

reduction to the lemma: unlike stemming, it also takes the word's context (its part of speech) into account


In [ ]:
# http://wordnet.princeton.edu/

In [42]:
from nltk.stem import WordNetLemmatizer

In [43]:
wordnet_lemmatizer = WordNetLemmatizer()

In [44]:
wordnet_lemmatizer.lemmatize('are')


Out[44]:
'are'

In [47]:
wordnet_lemmatizer.lemmatize('is', pos='v')
# it is therefore important to run POS tagging before lemmatization


Out[47]:
'be'
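
Putting the two steps together: a minimal sketch that POS-tags a sentence and feeds the lemmatizer the WordNet POS constant for each token. The penn_to_wordnet helper is our own mapping, not part of NLTK:

In [ ]:
from nltk.corpus import wordnet

def penn_to_wordnet(tag):
    # map the first letter of a Penn Treebank tag to a WordNet POS constant
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN  # the lemmatizer's own default

for token, tag in nltk.pos_tag(nltk.word_tokenize('The cats were running')):
    print(token, wordnet_lemmatizer.lemmatize(token, pos=penn_to_wordnet(tag)))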

STANFORD

POS Tagger


In [49]:
from nltk.tag.stanford import StanfordPOSTagger

In [50]:
english_postagger = StanfordPOSTagger('C:\\stanford-postagger-full-2014-08-27\\models\\english-bidirectional-distsim.tagger',
                                      'C:\\stanford-postagger-full-2014-08-27\\stanford-postagger.jar')

In [52]:
english_postagger.tag('this is stanford postagger in nltk for python users'.split())


Out[52]:
[('this', 'DT'),
 ('is', 'VBZ'),
 ('stanford', 'JJ'),
 ('postagger', 'NN'),
 ('in', 'IN'),
 ('nltk', 'NN'),
 ('for', 'IN'),
 ('python', 'NN'),
 ('users', 'NNS')]
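
For comparison, NLTK's built-in pos_tag on the same sentence; the tag choices may differ from the Stanford model's:

In [ ]:
nltk.pos_tag('this is stanford postagger in nltk for python users'.split())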

Named Entity Recognizer (NER)


In [1]:
from nltk.tag.stanford import StanfordNERTagger

In [2]:
english_nertagger = StanfordNERTagger('C:\\stanford-ner-2014-08-27\\classifiers\\english.all.3class.distsim.crf.ser.gz',
                              'C:\\stanford-ner-2014-08-27\\stanford-ner.jar')


---------------------------------------------------------------------------
LookupError                               Traceback (most recent call last)
<ipython-input-2-bd33fac66288> in <module>()
      1 english_nertagger = StanfordNERTagger('C:\\stanford-ner-2014-08-27\\classifiers\\english.all.3class.distsim.crf.ser.gz',
----> 2                               'C:\\stanford-ner-2014-08-27\\stanford-ner.jar')

/home/ale/anaconda3/lib/python3.6/site-packages/nltk/tag/stanford.py in __init__(self, *args, **kwargs)
    166 
    167     def __init__(self, *args, **kwargs):
--> 168         super(StanfordNERTagger, self).__init__(*args, **kwargs)
    169 
    170     @property

/home/ale/anaconda3/lib/python3.6/site-packages/nltk/tag/stanford.py in __init__(self, model_filename, path_to_jar, encoding, verbose, java_options)
     51                 self._JAR, path_to_jar,
     52                 searchpath=(), url=_stanford_url,
---> 53                 verbose=verbose)
     54 
     55         self._stanford_model = find_file(model_filename,

/home/ale/anaconda3/lib/python3.6/site-packages/nltk/__init__.py in find_jar(name_pattern, path_to_jar, env_vars, searchpath, url, verbose, is_regex)
    717         searchpath=(), url=None, verbose=True, is_regex=False):
    718     return next(find_jar_iter(name_pattern, path_to_jar, env_vars,
--> 719                          searchpath, url, verbose, is_regex))
    720 
    721 

/home/ale/anaconda3/lib/python3.6/site-packages/nltk/__init__.py in find_jar_iter(name_pattern, path_to_jar, env_vars, searchpath, url, verbose, is_regex)
    633         else:
    634             raise LookupError('Could not find %s jar file at %s' %
--> 635                             (name_pattern, path_to_jar))
    636 
    637     # Check environment variables

LookupError: Could not find stanford-ner.jar jar file at C:\stanford-ner-2014-08-27\stanford-ner.jar

In [ ]:
# this call works only once the model/jar paths above point at an existing
# local Stanford NER install (the Windows paths were not found on this
# Linux machine, hence the LookupError above)
english_nertagger.tag('Pincopallino is working at StarWars in Montevarchi'.split())

Parser: syntactic analysis


In [78]:
from nltk.parse.stanford import StanfordParser

In [88]:
english_parser = StanfordParser('C:\\stanford-parser-full-2014-08-27\\stanford-parser.jar',
                                'C:\\stanford-parser-full-2014-08-27\\stanford-parser-3.4.1-models.jar')

In [89]:
# raw_parse takes a raw sentence string; parse_sents expects a list of
# already-tokenized sentences
analisi = english_parser.raw_parse('Francesco is a good guy')
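
raw_parse returns an iterator over nltk.Tree objects; a minimal way to inspect the result:

In [ ]:
# print the first parse tree in bracketed form
print(next(analisi))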

Classification algorithms:

Assigning classes or categories to text

Classic models:

- Naive Bayes Model
- Maximum Entropy Classifier

In [19]:
import nltk
from nltk.corpus import names
import random
import collections


# note: this rebinds `names`, shadowing the imported names corpus reader
names = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])

random.shuffle(names)

In [216]:
def gender_features(word):
    word = word.lower()
    most_freq_char = collections.Counter(word).most_common(1)[0]
    repeated_char = [word[i] for i in range(0,len(word)-1) if word[i+1] == word[i]]
    VOWELS = 'aeiou'
    return {
        'last_letter': word[-1],
#         'last_two_letters': word[-2:],
#         'first_letters': word[0],
#         'number_of_k': len([i for i in word if i == 'k']),
#         'number_of_e': len([i for i in word if i == 'e']),
#         'most_freq_char': most_freq_char[0] if most_freq_char[1] > 1 else '',
        'repeated_char': '' if len(repeated_char) == 0 else repeated_char[-1],
#         'start_vowels': word[0] in VOWELS,
#          'end_vowels': word[-1] in VOWELS,        
#          'num_vowels': len([i for i in word if i in VOWELS]),
#          'num_non_vowels': len([i for i in word if i not in VOWELS])
    }


featuresets = [(gender_features(n), g) for (n, g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]

In [217]:
# Naive Bayes Classifier

nb_classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(nb_classifier, test_set))

nb_classifier.show_most_informative_features(20)


0.802
Most Informative Features
             last_letter = 'a'            female : male   =     35.5 : 1.0
             last_letter = 'k'              male : female =     30.3 : 1.0
             last_letter = 'f'              male : female =     15.9 : 1.0
             last_letter = 'p'              male : female =     11.9 : 1.0
             last_letter = 'v'              male : female =     11.2 : 1.0
             last_letter = 'd'              male : female =     10.3 : 1.0
             last_letter = 'm'              male : female =      8.7 : 1.0
             last_letter = 'o'              male : female =      8.1 : 1.0
             last_letter = 'r'              male : female =      6.5 : 1.0
             last_letter = 'g'              male : female =      5.4 : 1.0
             last_letter = 'w'              male : female =      5.4 : 1.0
           repeated_char = 'o'              male : female =      5.1 : 1.0
           repeated_char = 'a'              male : female =      5.1 : 1.0
             last_letter = 't'              male : female =      4.4 : 1.0
             last_letter = 's'              male : female =      4.3 : 1.0
             last_letter = 'z'              male : female =      4.0 : 1.0
             last_letter = 'j'              male : female =      4.0 : 1.0
             last_letter = 'b'              male : female =      3.8 : 1.0
             last_letter = 'i'            female : male   =      3.8 : 1.0
           repeated_char = 'e'            female : male   =      3.6 : 1.0
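
The trained classifier can also be applied to names it has never seen; a quick sanity check (the actual predictions depend on the random shuffle above):

In [ ]:
for name in ['Neo', 'Trinity']:
    print(name, nb_classifier.classify(gender_features(name)))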

In [218]:
# Maxent Classifier

me_classifier = nltk.MaxentClassifier.train(train_set)
print(nltk.classify.accuracy(me_classifier, test_set))

me_classifier.show_most_informative_features(5)


  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.371
             2          -0.43404        0.749
             3          -0.38942        0.772
             4          -0.37293        0.772
             5          -0.36564        0.773
           ...               ...          ...
            (iterations 6-99 omitted: log likelihood creeps from -0.36210
             to -0.35735 while accuracy holds at 0.773)
         Final          -0.35735        0.773
0.802
   5.661 last_letter=='c' and label is 'male'
   5.556 last_letter==' ' and label is 'female'
   5.170 repeated_char=='v' and label is 'female'
  -5.141 last_letter=='a' and label is 'male'
  -3.698 last_letter=='k' and label is 'female'

In [99]:
def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features
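
This extractor produces a far larger feature set than gender_features: 2 positional features plus a count and a boolean per letter, 54 in total, which makes overfitting more likely on a corpus of this size. A quick check:

In [ ]:
len(gender_features2('John'))  # 2 + 26 + 26 = 54 features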

In [100]:
featuresets = [(gender_features2(n), g) for (n, g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]

nb2_classifier = nltk.NaiveBayesClassifier.train(train_set)
me2_classifier = nltk.MaxentClassifier.train(train_set)


  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.373
             2          -0.61404        0.627
             3          -0.59980        0.627
             4          -0.58638        0.628
             5          -0.57376        0.636
             6          -0.56189        0.653
             7          -0.55076        0.672
      Training stopped: keyboard interrupt
         Final          -0.54031        0.688

In [103]:
# note: me2_classifier's training was interrupted at iteration 7, so its
# accuracy here understates what a full 100-iteration run would reach
print(nltk.classify.accuracy(nb2_classifier, test_set))
print(nltk.classify.accuracy(me2_classifier, test_set))


0.766
0.714
