In [3]:
from nltk.corpus import gutenberg
In [8]:
# List the file identifiers available in the Gutenberg corpus and report
# how many there are.
fileids = gutenberg.fileids()
# Single-argument print(...) calls emit the same text under Python 2's print
# statement and Python 3's print function.
print("%d files" % len(fileids))
print(fileids)
In [10]:
# Read the raw contents of the Gutenberg corpus file 'carroll-alice.txt'.
alice_raw = gutenberg.raw(fileids=['carroll-alice.txt'])
In [11]:
# Show which type the corpus reader returned for the raw text.
# The two spaces after "type:" reproduce the output of the original
# comma-separated print statement ('type: ' plus the separator space).
print('type:  %s' % (type(alice_raw),))
In [13]:
# Preview the first 250 characters of the raw text.
print(alice_raw[:250])
In [14]:
from nltk.corpus import genesis
In [16]:
# List the file identifiers available in the Genesis corpus and report
# how many there are. Note: this rebinds `fileids` from the Gutenberg list.
fileids = genesis.fileids()
# Single-argument print(...) calls emit the same text under Python 2's print
# statement and Python 3's print function.
print("%d files" % len(fileids))
print(fileids)
In [23]:
# Print the first 100 characters of every Genesis file. The appended "\n"
# plus print's own newline leaves a blank line between previews.
for fileid in fileids:
    print(genesis.raw(fileids=[fileid])[:100] + "\n")
In [26]:
import nltk
# Bigrams over a plain string: a str iterates character by character, so the
# resulting pairs are adjacent characters of 'Hello'.
text = nltk.bigrams('Hello')
In [27]:
# Print each bigram (a tuple) on its own line.
for b in text:
    print(b)
In [28]:
# Bigrams over a list of tokens: each pair is two adjacent list items.
words = nltk.bigrams(['This', 'is', 'gonna', 'be', 'great!'])
In [29]:
# Print each word bigram (a tuple) on its own line.
for b in words:
    print(b)
In [30]:
from langdetect import detect
In [32]:
# Print the language detected for two sample sentences.
# NOTE(review): langdetect is documented as non-deterministic on short or
# ambiguous input, so the result for the second sample may vary between
# runs -- confirm whether a fixed DetectorFactory.seed is wanted here.
print(detect("War doesn't show who's right, just who's left."))
print(detect("Ein, zwei, drei, vier"))
In [34]:
import unicodecsv
In [ ]:
!cat data/language