In [1]:
import nltk
In [2]:
from nltk.corpus import brown
brown.words()[0:10]
Out[2]:
In [3]:
brown.tagged_words()[0:10]
Out[3]:
In [4]:
len(brown.words())
Out[4]:
In [5]:
dir(brown)
Out[5]:
In [6]:
from nltk.book import *
In [7]:
dir(text1)
Out[7]:
In [8]:
len(text1)
Out[8]:
In [9]:
from nltk import sent_tokenize, word_tokenize, pos_tag
text = "Machine learning is the science of getting computers to act without being explicitly programmed. In the past decade, machine learning has given us self-driving cars, practical speech recognition, effective web search, and a vastly improved understanding of the human genome. Machine learning is so pervasive today that you probably use it dozens of times a day without knowing it. Many researchers also think it is the best way to make progress towards human-level AI. In this class, you will learn about the most effective machine learning techniques, and gain practice implementing them and getting them to work for yourself. More importantly, you'll learn about not only the theoretical underpinnings of learning, but also gain the practical know-how needed to quickly and powerfully apply these techniques to new problems. Finally, you'll learn about some of Silicon Valley's best practices in innovation as it pertains to machine learning and AI."
sents = sent_tokenize(text)
sents
Out[9]:
In [10]:
len(sents)
Out[10]:
In [11]:
tokens = word_tokenize(text)
tokens
Out[11]:
In [12]:
len(tokens)
Out[12]:
In [13]:
tagged_tokens = pos_tag(tokens)
tagged_tokens
Out[13]:
Sentence boundary disambiguation (SBD), also known as sentence breaking, is the problem in natural language processing of deciding where sentences begin and end. Natural language processing tools often require their input to be divided into sentences. However, sentence boundary identification is challenging because punctuation marks are often ambiguous. For example, a period may denote an abbreviation, a decimal point, an ellipsis, or part of an email address rather than the end of a sentence. About 47% of the periods in the Wall Street Journal corpus denote abbreviations. Question marks and exclamation marks may also appear in embedded quotations, emoticons, computer code, and slang. Languages like Japanese and Chinese, by contrast, have unambiguous sentence-ending markers.
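A quick hedged example of why periods are ambiguous: the pre-trained Punkt tokenizer behind sent_tokenize has to decide that the periods in "Dr." and "p.m." do not end sentences. The sample text is invented, and the two-sentence split shown in the comment is the expected result, assuming the punkt models are installed.
In [ ]:
from nltk.tokenize import sent_tokenize
sample = "Dr. Smith arrived at 3 p.m. on Monday. He left the next day."
sent_tokenize(sample)
# Expected: ['Dr. Smith arrived at 3 p.m. on Monday.', 'He left the next day.']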
In [26]:
text = "this’s a sent tokenize test. this is sent two. is this sent three? sent 4 is cool! Now it’s your turn."
from nltk.tokenize import sent_tokenize
sent_tokenize_list = sent_tokenize(text)
len(sent_tokenize_list)
Out[26]:
In [27]:
sent_tokenize_list
Out[27]:
In [28]:
import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
tokenizer.tokenize(text)
Out[28]:
In [29]:
spanish_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
spanish_tokenizer.tokenize('Hola amigo. Estoy bien.')
Out[29]:
In [37]:
from nltk.tokenize import word_tokenize
word_tokenize('Hello World.')
Out[37]:
In [38]:
word_tokenize("this's a test")
Out[38]:
In [41]:
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize("this’s a test")
Out[41]:
In [43]:
# Standard word tokenizer.
_word_tokenize = TreebankWordTokenizer().tokenize
def word_tokenize(text):
    """
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently :class:`.TreebankWordTokenizer`).
    This tokenizer is designed to work on a sentence at a time.
    """
    return _word_tokenize(text)
word_tokenize("this's a test")
Out[43]:
In [46]:
from nltk.tokenize import WordPunctTokenizer
word_punct_tokenizer = WordPunctTokenizer()
word_punct_tokenizer.tokenize("This's a test")
Out[46]:
In corpus linguistics, part-of-speech tagging (POS tagging or POST), also called grammatical tagging or word-category disambiguation, is the process of marking up a word in a text (corpus) as corresponding to a particular part of speech, based on both its definition and its context, i.e. its relationship with adjacent and related words in a phrase, sentence, or paragraph. A simplified form of this is commonly taught to school-age children in the identification of words as nouns, verbs, adjectives, adverbs, etc.
Once performed by hand, POS tagging is now done in the context of computational linguistics, using algorithms which associate discrete terms, as well as hidden parts of speech, in accordance with a set of descriptive tags. POS-tagging algorithms fall into two distinctive groups: rule-based and stochastic. E. Brill’s tagger, one of the first and most widely used English POS-taggers, employs rule-based algorithms.
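The "word-category disambiguation" part is easy to see with NLTK's default tagger: the same token receives different tags in different contexts. The sentences below are made up, and the tags noted in the comments are typical rather than guaranteed.
In [ ]:
import nltk
print(nltk.pos_tag(nltk.word_tokenize("I want to book a flight")))  # 'book' typically tagged VB
print(nltk.pos_tag(nltk.word_tokenize("I read a good book")))       # 'book' typically tagged NN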
In [14]:
text = nltk.word_tokenize("Dive into NLTK: Part-of-speech tagging and POS Tagger")
text
Out[14]:
In [15]:
nltk.pos_tag(text)
Out[15]:
In [16]:
nltk.help.upenn_tagset('NN.*')
nltk.help.upenn_tagset('VB.*')
nltk.help.upenn_tagset('JJ.*')
nltk.help.upenn_tagset('CC.*')
nltk.help.upenn_tagset('IN.*')
nltk.help.upenn_tagset('PRP.*')
nltk.help.upenn_tagset('DT.*')
In [18]:
# Natural Language Toolkit: TnT Tagger
#
# Copyright (C) 2001-2013 NLTK Project
# Author: Sam Huston <sjh900@gmail.com>
#
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
'''
Implementation of 'TnT - A Statistical Part of Speech Tagger'
by Thorsten Brants
http://acl.ldc.upenn.edu/A/A00/A00-1031.pdf
'''
from __future__ import print_function
from math import log
from operator import itemgetter
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.tag.api import TaggerI
class TnT(TaggerI):
    '''
    TnT - Statistical POS tagger

    IMPORTANT NOTES:

    * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS

      - It is possible to provide an untrained POS tagger to
        create tags for unknown words, see __init__ function

    * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT

      - Due to the nature of this tagger, it works best when
        trained over sentence delimited input.

      - However it still produces good results if the training
        data and testing data are separated on all punctuation eg: [,.?!]

      - Input for training is expected to be a list of sentences
        where each sentence is a list of (word, tag) tuples

      - Input for tag function is a single sentence
        Input for tagdata function is a list of sentences
        Output is of a similar form

    * Function provided to process text that is unsegmented

      - Please see basic_sent_chop()

    TnT uses a second order Markov model to produce tags for
    a sequence of input, specifically:

      argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T)

    i.e. the maximum projection of a set of probabilities

    The set of possible tags for a given word is derived
    from the training data. It is the set of all tags
    that exact word has been assigned.

    To speed up and get more precision, we can use log addition
    instead of multiplication, specifically:

      argmax [Sigma(log(P(t_i|t_i-1,t_i-2))+log(P(w_i|t_i)))] +
             log(P(t_T+1|t_T))

    The probability of a tag for a given word is the linear
    interpolation of 3 markov models; a zero-order, first-order,
    and a second order model.

      P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) +
                             l3*P(t_i| t_i-1, t_i-2)

    A beam search is used to limit the memory usage of the algorithm.
    The degree of the beam can be changed using N in the initialization.
    N represents the maximum number of possible solutions to maintain
    while tagging.

    It is possible to differentiate the tags which are assigned to
    capitalized words. However this does not result in a significant
    gain in the accuracy of the results.
    '''
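To make the interpolation formula in the docstring concrete, here is a toy calculation with invented numbers; the weights l1, l2, l3 are hand-picked for the example, whereas TnT estimates them from the training data by deleted interpolation.
In [ ]:
# Toy illustration of P(t_i | t_i-1, t_i-2) as a linear interpolation of
# zero-, first- and second-order tag models. All numbers are made up.
l1, l2, l3 = 0.2, 0.3, 0.5              # assumed interpolation weights
p_uni, p_bi, p_tri = 0.10, 0.25, 0.40   # assumed component probabilities
l1 * p_uni + l2 * p_bi + l3 * p_tri     # 0.295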
In [19]:
from nltk.corpus import treebank
len(treebank.tagged_sents())
Out[19]:
In [20]:
train_data = treebank.tagged_sents()[:3000]
test_data = treebank.tagged_sents()[3000:]
In [21]:
train_data[0]
Out[21]:
In [ ]:
test_data[0]
In [57]:
from nltk.tag import tnt
tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data)
tnt_pos_tagger.evaluate(test_data)
Out[57]:
In [58]:
import pickle
f = open('tnt_pos_tagger.pickle', 'wb')  # binary mode for pickle
pickle.dump(tnt_pos_tagger, f)
f.close()
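The pickled tagger can later be read back from disk and used directly; a minimal sketch, assuming the file written above is present.
In [ ]:
# Load the trained TnT tagger back from the pickle file (binary mode).
with open('tnt_pos_tagger.pickle', 'rb') as f:
    loaded_tagger = pickle.load(f)
loaded_tagger.tag(nltk.word_tokenize("this is a tnt treebank tnt tagger"))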
In [ ]:
tnt_pos_tagger.tag(nltk.word_tokenize("this is a tnt treebank tnt tagger"))
In linguistic morphology and information retrieval, stemming is the process of reducing inflected (or sometimes derived) words to their stem, base or root form—generally a written word form. The stem need not be identical to the morphological root of the word; it is usually sufficient that related words map to the same stem, even if this stem is not in itself a valid root. Algorithms for stemming have been studied in computer science since the 1960s. Many search engines treat words with the same stem as synonyms as a kind of query expansion, a process called conflation.
Stemming programs are commonly referred to as stemming algorithms or stemmers.
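As a quick illustration of conflation, several inflected forms typically reduce to one stem under the Porter stemmer; the word list here is made up for the example.
In [ ]:
# Conflation: different inflected forms map to the same stem.
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
[stemmer.stem(w) for w in ['connect', 'connected', 'connecting', 'connection', 'connections']]
# All five forms are expected to reduce to 'connect'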
In [ ]:
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
from nltk.stem.lancaster import LancasterStemmer
lancaster_stemmer = LancasterStemmer()
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer("english")
#from nltk.stem.api import StemmerI
#api_stemmer = StemmerI()
from nltk.stem.regexp import RegexpStemmer
# RegexpStemmer strips suffixes matching a regular expression;
# the pattern below is the example used in the NLTK documentation.
regexp_stemmer = RegexpStemmer('ing$|s$|e$|able$', min=4)
from nltk.stem.isri import ISRIStemmer
isri_stemmer = ISRIStemmer()  # ISRI stemmer is designed for Arabic
from nltk.stem.rslp import RSLPStemmer
rslp_stemmer = RSLPStemmer()  # RSLP stemmer is designed for Portuguese
In [ ]:
words = ['maximum','presumably','multiply','provision','owed','ear','saying','crying','string','meant','cement']
porter_words = []
for word in words:
    porter_words.append(porter_stemmer.stem(word))
porter_words
In [ ]:
lancaster_words = []
for word in words:
    lancaster_words.append(lancaster_stemmer.stem(word))
lancaster_words
In [ ]:
snowball_words = []
for word in words:
    snowball_words.append(snowball_stemmer.stem(word))
snowball_words
In [ ]:
isri_words = []
for word in words:
    isri_words.append(isri_stemmer.stem(word))
isri_words
In [ ]:
rslp_words = []
for word in words:
    rslp_words.append(rslp_stemmer.stem(word))
rslp_words
In [ ]:
regexp_words = []
for word in words:
    regexp_words.append(regexp_stemmer.stem(word))
regexp_words
In [ ]:
Lemmatisation (or lemmatization), in linguistics, is the process of grouping together the different inflected forms of a word so they can be analysed as a single item.
In computational linguistics, lemmatisation is the algorithmic process of determining the lemma for a given word. Since the process may involve complex tasks such as understanding context and determining the part of speech of a word in a sentence (requiring, for example, knowledge of the grammar of a language), it can be hard to implement a lemmatiser for a new language.
In many languages, words appear in several inflected forms. For example, in English, the verb ‘to walk’ may appear as ‘walk’, ‘walked’, ‘walks’, ‘walking’. The base form, ‘walk’, that one might look up in a dictionary, is called the lemma for the word. The combination of the base form with the part of speech is often called the lexeme of the word.
Lemmatisation is closely related to stemming. The difference is that a stemmer operates on a single word without knowledge of the context, and therefore cannot discriminate between words which have different meanings depending on part of speech. However, stemmers are typically easier to implement and run faster, and the reduced accuracy may not matter for some applications.
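Before looking at NLTK's lemmatizer, a small hedged comparison illustrates the difference: a stemmer reduces "meeting" the same way in every context, while the WordNet lemmatizer can use a part-of-speech hint to keep the noun reading intact. The outputs noted in the comments are typical, not guaranteed.
In [ ]:
# Stemmer vs. lemmatizer on an ambiguous form.
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
[PorterStemmer().stem('meeting'),                    # 'meet' regardless of context
 WordNetLemmatizer().lemmatize('meeting', pos='n'),  # 'meeting' as a noun
 WordNetLemmatizer().lemmatize('meeting', pos='v')]  # 'meet' as a verb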
In [ ]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
In [ ]:
words_lem = ['dogs','churches','aardwolves','abaci','hardrock','attractive','are','is']
#words_lem_pos = pos_tag(words_lem)
wordnet_words = []
for word in words_lem:
    if word == 'is' or word == 'are':
        # pass the verb POS explicitly for verb forms
        wordnet_words.append(wordnet_lemmatizer.lemmatize(word, pos='v'))
    else:
        # default POS is noun
        wordnet_words.append(wordnet_lemmatizer.lemmatize(word))
wordnet_words
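The commented-out pos_tag line above hints at a more general approach: instead of hard-coding which words are verbs, the Penn Treebank tags produced by pos_tag can be mapped onto WordNet POS codes. A hedged sketch follows; penn_to_wordnet is a hypothetical helper written here for illustration, not part of NLTK.
In [ ]:
# Map Penn Treebank tags from pos_tag to WordNet POS codes so the
# lemmatizer gets a part-of-speech hint for every word.
from nltk import pos_tag
from nltk.corpus import wordnet

def penn_to_wordnet(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN  # default to noun for everything else

[wordnet_lemmatizer.lemmatize(word, pos=penn_to_wordnet(tag))
 for word, tag in pos_tag(words_lem)]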
In [ ]:
In [53]:
import nltk
sentence = "At eight o'clock on Thursday morning Arthur didn't feel very good."
tokens = nltk.word_tokenize(sentence)
tokens
Out[53]:
In [54]:
tagged = nltk.pos_tag(tokens)
tagged[0:6]
Out[54]:
In [55]:
entities = nltk.chunk.ne_chunk(tagged)
entities
Out[55]:
In [56]:
from nltk.corpus import treebank
t = treebank.parsed_sents('wsj_0001.mrg')[0]
t.draw()
In [ ]: