Author: A. Sean Pue
A lesson for AL340 Digital Humanities Seminar (Spring 2015)
--> Double-click here to start <--
This is an IPython Notebook. It contains numerous cells that can be of different types (Markdown, Code, Headings). This is lesson two. You may need to install the Natural Language Toolkit to begin.
Select the cell above this one by double-clicking it with your mouse.
You can see that it contains text in a format called Markdown. To execute the cell,
press shift+enter.
To complete this tutorial (which is meant for a classroom), execute the cells and follow the instructions.
In [ ]:
#The next line imports the nltk
import nltk
# next line makes graphics appear in the browser rather than separate window
%matplotlib inline
Here we will work with this file: http://www.gutenberg.org/cache/epub/11/pg11.txt
In [ ]:
# Fetch the plain-text e-book from Project Gutenberg.
# urllib2 exists only under Python 2; fall back to urllib.request so
# this cell also runs under Python 3.
try:
    from urllib2 import urlopen          # Python 2
except ImportError:
    from urllib.request import urlopen   # Python 3

url = 'http://www.gutenberg.org/cache/epub/11/pg11.txt'
# Download the raw bytes and decode them as UTF-8 (Gutenberg serves UTF-8).
response = urlopen(url)
try:
    t = response.read().decode('utf8')
finally:
    response.close()  # always release the network connection
assert(t)  # fail loudly if the download came back empty
len(t)
In [ ]:
# Strip the Project Gutenberg header and footer: keep only the story,
# from the first chapter heading up to (but not including) 'THE END'.
start = t.index('CHAPTER I')
stop = t.index('THE END')
t = t[start:stop]
len(t)
In [ ]:
import re  # regular expressions: pattern-based text searching

# Split the book into chapters: each match runs from one 'CHAPTER '
# heading up to the next heading (or the end of the text).
# re.DOTALL lets '.' match newlines, so a chapter spans many lines.
chapter_pattern = re.compile(r'CHAPTER .+?(?=CHAPTER|$)', re.DOTALL)
chapters = chapter_pattern.findall(t)
In [ ]:
# Peek at the first 500 characters of chapter 2 (chapters is 0-indexed).
chapters[1][:500]
In [ ]:
# Write each chapter to its own UTF-8 text file under data/alice/.
import io
import os

output_dir = 'data/alice'
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)  # the original crashes here if the directory is missing

for chapter_number, chapter_text in enumerate(chapters):
    # File names number chapters from 1.
    file_name = output_dir + '/chapter-' + str(chapter_number + 1) + '.txt'
    # io.open writes unicode with an explicit encoding on both Python 2
    # and Python 3; a plain open(..., 'w') under Python 2 raises
    # UnicodeEncodeError on any non-ASCII character in the text.
    with io.open(file_name, 'w', encoding='utf8') as f:
        f.write(chapter_text)
In [ ]:
# Load the chapter files written above as an NLTK plain-text corpus, then
# wrap the word stream in nltk.Text to enable the analysis methods below
# (concordance, similar, collocations, ...).
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
corpusdir = 'data/alice/' # Directory of corpus.
corpus0 = PlaintextCorpusReader(corpusdir, '.*')  # read every file in the directory
corpus = nltk.Text(corpus0.words())
In [ ]:
# Total number of word tokens in the corpus.
len(corpus)
In [ ]:
# Vocabulary size: the number of distinct word types.
len(set(corpus))
In [ ]:
# The first ten tokens of the corpus.
corpus[0:10]
In [ ]:
# Show each occurrence of 'Alice' with its surrounding context.
corpus.concordance('Alice') # you can also try corpus.concordance('Alice',lines=all) or lines = 100, etc.
In [ ]:
# Words that appear in contexts similar to 'caterpillar'.
corpus.similar('caterpillar')
In [ ]:
# Contexts shared by 'hatter' and 'queen'.
corpus.common_contexts(['hatter','queen'])
In [ ]:
# Count how often each token occurs in the corpus.
fd = nltk.FreqDist(corpus)
In [ ]:
# How many times does the token 'Alice' occur?
fd['Alice']
In [ ]:
# The distinct tokens recorded in the frequency distribution.
fd.keys()
In [ ]:
# Hapax legomena: tokens that occur exactly once.
fd.hapaxes()
In [ ]:
# Plot the 20 most frequent tokens (cumulative=False plots raw counts).
fd.plot(20,cumulative=False)
In [ ]:
# Normalize the corpus: drop punctuation/numbers (keep purely alphabetic
# tokens) and lowercase everything.
new_corpus = [word.lower() for word in corpus if word.isalpha()]
In [ ]:
# NLTK's built-in list of common English function words ('the', 'of', ...).
stopwords = nltk.corpus.stopwords.words('english')
# print as a function call works in both Python 2 and Python 3;
# the original 'print stopwords' statement is a syntax error under Python 3.
print(stopwords)
In [ ]:
# Remove stopwords from the normalized token list. Membership tests against
# a set are O(1) per token, versus an O(len(stopwords)) list scan each time.
stopword_set = set(stopwords)
new_corpus = [w for w in new_corpus if w not in stopword_set]  # 'not in' is the idiomatic form
corpus = nltk.Text(new_corpus)  # rebind corpus to the cleaned text
In [ ]:
# Token frequencies over the cleaned corpus (lowercased, stopwords removed).
fd2 = nltk.FreqDist(corpus) # create a FreqDist (Frequency Distribution)
In [ ]:
# Top 20 content words, now that stopwords are gone.
fd2.plot(20,cumulative=False)
In [ ]:
# Where each word appears across the text, by token offset.
# Note the lowercase forms: the corpus was lowercased above.
corpus.dispersion_plot(['alice','rabbit','queen'])
In [ ]:
# Word pairs that co-occur more often than chance (collocations).
corpus.collocations(num=1000)
In [ ]:
# Print every adjacent pair of tokens. print(x) is valid syntax in both
# Python 2 and Python 3; the original 'print x' is Python-2-only.
for x in nltk.bigrams(corpus):
    print(x)
In [ ]:
# Print every run of three consecutive tokens. print(x) is valid in both
# Python 2 and Python 3; the original 'print x' is Python-2-only.
for x in nltk.trigrams(corpus):
    print(x)
In [ ]:
# Generalize to n-grams: print every run of 7 consecutive tokens.
# print(x) is valid in both Python 2 and Python 3.
for x in nltk.ngrams(corpus, 7):
    print(x)
In [ ]:
# Part-of-speech tag the first 1000 tokens (requires the NLTK tagger data).
nltk.pos_tag(corpus[0:1000])
In [ ]: