NLTK Built In Corpora


In [ ]:
# Inspect the texts available in NLTK's built-in Gutenberg corpus.
# NOTE: the corpus data must already be installed locally
# (presumably via nltk.download('gutenberg') -- confirm for your setup).
from nltk.corpus import gutenberg

fileids = gutenberg.fileids()

# Single-argument print(...) calls produce identical output on Python 2 and
# Python 3; the original `print x, y` statements are a SyntaxError on Python 3.
print('%d files' % len(fileids))
print(fileids)

Alice in Wonderland


In [ ]:
# Inspect the raw text of the Alice in Wonderland text.
# Relies on `gutenberg` imported from nltk.corpus in an earlier cell.
alice_raw = gutenberg.raw(fileids=['carroll-alice.txt'])

# Inspecting the type.
# Single-argument print(...) works on both Python 2 and Python 3; the
# original print statements are a SyntaxError on Python 3.
print('Type:  %s' % (type(alice_raw),))

# Blank separator line. A bare `print` is a silent no-op on Python 3, and
# `print()` would show "()" on Python 2, so print an empty string instead.
print('')

# Looking at the first 250 characters
print(alice_raw[:250])

You try it.

  1. Open the "Genesis" corpus. It contains 8 texts.
  2. Output the name and the first 100 characters of each text.