In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import nltk
import re
import pprint
from nltk import word_tokenize
downloading the raw text of Crime and Punishment from Project Gutenberg
In [2]:
from urllib import request
url = 'http://www.gutenberg.org/files/2554/2554.txt'
response = request.urlopen(url)
raw = response.read().decode('utf8')
type(raw)
Out[2]:
number of characters:
In [3]:
len(raw)
Out[3]:
In [4]:
raw[:75]
Out[4]:
In [5]:
tokens = word_tokenize(raw)
type(tokens)
Out[5]:
In [6]:
len(tokens)
Out[6]:
In [7]:
tokens[:10]
Out[7]:
Create a Text object from tokens
In [8]:
text = nltk.Text(tokens)
type(text)
Out[8]:
In [9]:
text[1024:1062]
Out[9]:
find collocations (words that frequently appear together)
In [10]:
text.collocations()
'Project Gutenberg' shows up as a collocation because the raw file includes the Project Gutenberg header (and footer), which are not part of the novel itself
find the start of the novel manually using find()
In [11]:
raw.find('PART I')
Out[11]:
reverse find using rfind
In [12]:
raw.rfind("End of Project Gutenberg's Crime")
Out[12]:
In [13]:
raw = raw[5338:1157746] # slightly different from NLTK Book value
In [14]:
raw.find("PART I")
Out[14]:
In [15]:
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = request.urlopen(url).read().decode('utf8')
html[:60]
Out[15]:
In [16]:
type(html)
Out[16]:
In [17]:
from bs4 import BeautifulSoup
raw = BeautifulSoup(html, 'html.parser').get_text()
tokens = word_tokenize(raw)
tokens[:50]
Out[17]:
find the start and end indices of the article content (manually) and create a Text object
In [18]:
tokens = tokens[110:390]
text = nltk.Text(tokens)
text
Out[18]:
get a concordance for 'gene' -- shows each occurrence of the word in context
In [19]:
text.concordance('gene')
In [20]:
path = nltk.data.find('corpora/unicode_samples/polish-lat2.txt')
path
Out[20]:
In [21]:
with open(path, encoding='latin2') as f:
    for line in f:
        line_strip = line.strip()
        print(line_strip)
In [22]:
with open(path, encoding='latin2') as f:
    for line in f:
        line_strip = line.strip()
        print(line_strip.encode('unicode_escape'))
In [23]:
import unicodedata
with open(path, encoding='latin2') as f:
    lines = f.readlines()
line = lines[2]
print(line.encode('unicode_escape'))
In [24]:
for c in line:
    if ord(c) > 127:
        print('{} U+{:04x} {}'.format(c.encode('utf8'), ord(c), unicodedata.name(c)))
In [25]:
for c in line:
    if ord(c) > 127:
        print('{} U+{:04x} {}'.format(c, ord(c), unicodedata.name(c)))
Using Python string methods and re with Unicode characters
In [26]:
line
Out[26]:
In [27]:
line.find('zosta\u0142y')
Out[27]:
In [28]:
line = line.lower()
line
Out[28]:
In [29]:
line.encode('unicode_escape')
Out[29]:
In [30]:
import re
m = re.search('\u015b\w*', line)
m.group()
Out[30]:
In [31]:
m.group().encode('unicode_escape')
Out[31]:
Can use Unicode strings with NLTK tokenizers
In [32]:
word_tokenize(line)
Out[32]:
skipping this section of the book; regular expression cheatsheet (a few of these operators are demonstrated in the short example after the table):
Operator Behavior
. Wildcard, matches any character
^abc Matches some pattern abc at the start of a string
abc$ Matches some pattern abc at the end of a string
[abc] Matches one of a set of characters
[A-Z0-9] Matches one of a range of characters
ed|ing|s Matches one of the specified strings (disjunction)
* Zero or more of previous item, e.g. a*, [a-z]* (also known as Kleene Closure)
+ One or more of previous item, e.g. a+, [a-z]+
? Zero or one of the previous item (i.e. optional), e.g. a?, [a-z]?
{n} Exactly n repeats where n is a non-negative integer
{n,} At least n repeats
{,n} No more than n repeats
{m,n} At least m and no more than n repeats
a(b|c)+ Parentheses that indicate the scope of the operators
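As a quick illustration (a minimal sketch, not taken from the book), here are a few of these operators applied with re.search and re.findall; the expected results are noted in the comments:
import re

re.search(r'^abc', 'abcdef')             # '^abc' anchors the match at the start of the string
re.search(r'ing$', 'walking')            # 'ing$' anchors the match at the end of the string
re.findall(r'[aeiou]+', 'sequence')      # '+' = one or more: runs of vowels -> ['e', 'ue', 'e']
re.findall(r'colou?r', 'color colour')   # '?' makes the 'u' optional -> ['color', 'colour']
re.findall(r'\d{2,4}', 'year 2024')      # {m,n} = between 2 and 4 repeats -> ['2024']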
find all vowels in a word and count them
In [33]:
import re
word = 'supercalifragilisticexpialidocious'
vowel_matches = re.findall(r'[aeiou]', word)
vowel_matches
Out[33]:
In [34]:
len(vowel_matches)
Out[34]:
Frequencies for sequences of 2+ vowels in the text
In [35]:
wsj = sorted(set(nltk.corpus.treebank.words()))
len(wsj)
Out[35]:
In [36]:
fd = nltk.FreqDist(vowels for word in wsj
                   for vowels in re.findall(r'[aeiou]{2,}', word))
len(fd)
Out[36]:
In [37]:
fd.most_common(12)
Out[37]:
In [38]:
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
def compress(word):
    pieces = re.findall(regexp, word)
    return ''.join(pieces)
In [39]:
re.findall(regexp, 'Universal')
Out[39]:
In [40]:
english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))
Extract consonant-vowel sequences from text
In [41]:
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cvs = [consonant_vowel for w in rotokas_words
       for consonant_vowel in re.findall(r'[ptksvr][aeiou]', w)]
cvs[:25]
Out[41]:
In [42]:
cfd = nltk.ConditionalFreqDist(cvs)
cfd.tabulate()
create an index, using nltk.Index(), such that cv_index['su'] returns all words containing 'su'
In [43]:
cv_word_pairs = [(cv, w) for w in rotokas_words
                 for cv in re.findall(r'[ptksvr][aeiou]', w)]
cv_index = nltk.Index(cv_word_pairs)
type(cv_index)
Out[43]:
In [44]:
cv_index['su']
Out[44]:
In [45]:
cv_index['po']
Out[45]:
In [46]:
def stem(word):
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word
In [47]:
stem('walking')
Out[47]:
alternative using the re module...
In [48]:
def stem_regexp(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem
In [49]:
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government. Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = word_tokenize(raw)
tokens
Out[49]:
In [50]:
[stem_regexp(t) for t in tokens]
Out[50]:
In [51]:
from nltk.corpus import gutenberg, nps_chat
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
moby.findall(r"<a> (<.*>) <man>")
In [52]:
chat = nltk.Text(nps_chat.words())
chat.findall(r"<.*> <.*> <bro>")
In [53]:
chat.findall(r"<l.*>{3,}")
In [54]:
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government. Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = word_tokenize(raw)
tokens
Out[54]:
In [55]:
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
[porter.stem(t) for t in tokens]
Out[55]:
In [56]:
[lancaster.stem(t) for t in tokens]
Out[56]:
The Porter stemmer correctly handled lying -> lie, while the Lancaster stemmer did not.
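Checking that single word directly (reusing the porter and lancaster objects defined above):
porter.stem('lying')     # 'lie'
lancaster.stem('lying')  # 'lying' -- left unchanged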
Defining a custom Text class that uses the Porter stemmer and can generate a concordance for a text using word stems
In [57]:
class IndexedText(object):

    def __init__(self, stemmer, text):
        self._text = text
        self._stemmer = stemmer
        self._index = nltk.Index((self._stem(word), i)
                                 for (i, word) in enumerate(text))

    def concordance(self, word, width=40):
        key = self._stem(word)
        wc = int(width/4)  # words of context
        for i in self._index[key]:
            lcontext = ' '.join(self._text[i-wc:i])
            rcontext = ' '.join(self._text[i:i+wc])
            ldisplay = '{:>{width}}'.format(lcontext[-width:], width=width)
            rdisplay = '{:{width}}'.format(rcontext[:width], width=width)
            print(ldisplay, rdisplay)

    def _stem(self, word):
        return self._stemmer.stem(word).lower()
In [58]:
porter = nltk.PorterStemmer()
grail = nltk.corpus.webtext.words('grail.txt')
text = IndexedText(porter, grail)
text.concordance('lie')
In [59]:
wnl = nltk.WordNetLemmatizer()
[wnl.lemmatize(t) for t in tokens]
Out[59]:
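A note on the lemmatizer: it only removes affixes when the resulting word is in WordNet, and it assumes the noun part of speech unless told otherwise. A small sketch (not from the book):
wnl.lemmatize('women')       # 'woman' -- nouns are the default
wnl.lemmatize('lying')       # 'lying' -- treated as a noun, so left alone
wnl.lemmatize('lying', 'v')  # 'lie'   -- with the verb POS the affix is removed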
In [60]:
raw = """'When I'M a Duchess,' she said to herself, (not in a very hopeful tone
though), 'I won't have any pepper in my kitchen AT ALL. Soup does very
well without--Maybe it's always pepper that makes people hot-tempered,'..."""
split on whitespace
In [61]:
re.split(r' ', raw)
Out[61]:
In [62]:
re.split(r'[ \t\n]+', raw)
Out[62]:
re offers \w (word characters) and \W (all characters except letters, digits, and underscore); split on non-word characters:
In [63]:
re.split(r'\W+', raw)
Out[63]:
to exclude the empty strings produced by re.split(), use re.findall() instead...
In [64]:
re.findall(r'\w+|\S\w*', raw)
Out[64]:
allow internal hyphens and apostrophes in words
In [65]:
re.findall(r"\w+(?:[-']\w+)*|'|[-.(\)]+|\S\w*", raw)
Out[65]:
In [66]:
text = 'That U.S.A. poster-print costs $12.40...'
pattern = r'''(?x)         # set flag to allow verbose regexps
      (?:[A-Z]\.)+         # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*         # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?   # currency and percentages, e.g. $12.40, 82%
    | \.\.\.               # ellipsis
    | [][.,;"'?():_`-]     # these are separate tokens; includes ], [
'''
nltk.regexp_tokenize(text, pattern)
Out[66]:
(?x) is the verbose flag -- it tells re to strip embedded whitespace and comments out of the pattern before matching
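The same behaviour can be requested by passing re.VERBOSE when compiling a pattern instead of embedding (?x) -- a minimal sketch, separate from the book's example:
import re

verbose_pattern = re.compile(r'''
      \$?\d+(?:\.\d+)?   # currency-like numbers, e.g. $12.40
    | \.\.\.             # ellipsis
''', re.VERBOSE)
verbose_pattern.findall('That costs $12.40...')   # ['$12.40', '...']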
It is important to have a "gold standard" for tokenization so that the performance of a custom tokenizer can be compared against it.
The NLTK data collection includes the Penn Treebank corpus in both raw and tokenized form for this purpose:
nltk.corpus.treebank_raw.raw()
and nltk.corpus.treebank.words()
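One rough way to use that gold standard is to check which tokens a custom tokenizer produces that never occur in the Treebank tokenization; this is only a naive sanity check (a sketch under that assumption), not a proper evaluation:
import nltk

gold_vocab = set(nltk.corpus.treebank.words())          # gold-standard tokens
sample = nltk.corpus.treebank_raw.raw()[:2000]          # a slice of the raw text
my_tokens = nltk.regexp_tokenize(sample, r'\w+|\S\w*')  # a simple custom tokenizer
suspect = [t for t in my_tokens if t not in gold_vocab] # tokens the gold standard never produces
suspect[:20]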
Tokenization is a specific case of the more general problem of segmentation.
Average number of words per sentence:
In [67]:
len(nltk.corpus.brown.words()) / len(nltk.corpus.brown.sents())
Out[67]:
Segmenting a stream of characters into sentences: sent_tokenize
In [68]:
import pprint
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = nltk.sent_tokenize(text)
pprint.pprint(sents[79:89])