In [1]:
from nltk.corpus import gutenberg
In [2]:
fileids = gutenberg.fileids()
print len(fileids), "files"
print fileids
In [3]:
alice_raw = gutenberg.raw(fileids=['carroll-alice.txt'])
In [4]:
print 'type:', type(alice_raw)
In [5]:
print alice_raw[:250]
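Besides raw strings, the corpus readers also expose pre-tokenized views; a minimal sketch (assuming the gutenberg corpus data is downloaded):
In [ ]:
# words() and sents() return tokenized views of the same file
alice_words = gutenberg.words(fileids=['carroll-alice.txt'])
alice_sents = gutenberg.sents(fileids=['carroll-alice.txt'])
print len(alice_words), "words,", len(alice_sents), "sentences"
print alice_sents[1]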
In [6]:
from nltk.corpus import genesis
In [7]:
fileids = genesis.fileids()
print len(fileids), "files"
print fileids
In [8]:
for fileid in fileids:
    print genesis.raw(fileids=[fileid])[:100] + "\n"
In [9]:
import nltk
text = nltk.bigrams('Hello')  # over a string, bigrams pairs consecutive characters
In [10]:
for b in text:
    print b
In [11]:
words = nltk.bigrams(['This', 'is', 'gonna', 'be', 'great!'])
In [12]:
for b in words:
    print b
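nltk.ngrams generalizes bigrams to any order n; a quick sketch:
In [ ]:
# ngrams(sequence, n) yields tuples of n consecutive items
for t in nltk.ngrams(['This', 'is', 'gonna', 'be', 'great!'], 3):
    print t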
In [13]:
from langdetect import detect
In [14]:
print detect("War doesn't show who's right, just who's left.")
print detect("Ein, zwei, drei, vier")
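langdetect is probabilistic, so detection on short strings can flip between runs; detect_langs returns the ranked candidates, and seeding DetectorFactory pins the result (both are part of langdetect's documented API):
In [ ]:
from langdetect import detect_langs, DetectorFactory
DetectorFactory.seed = 0  # fix the RNG so detection on short text is repeatable
print detect_langs("Ein, zwei, drei, vier")  # list of lang:probability candidates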
In [15]:
import unicodecsv
In [17]:
!cat data/7languages.txt
In [34]:
with open('data/7languages.txt', 'rb') as input_file:
    row_reader = unicodecsv.reader(input_file)
    result = []
    for row in row_reader:
        lang = detect(row[0])
        result += [lang.encode('ascii', 'ignore')]
        print row[0], "|", lang
# reference labels, one per row of the input file
truth = ['ko', 'de', 'zh', 'ar', 'es', 'ja']
print "\n", truth
print result
print "\n", nltk.ConfusionMatrix(truth, result)
In [37]:
result = []
for fileid in fileids:
    lang = detect(genesis.raw(fileids=[fileid])[:100])
    result += [lang.encode('ascii', 'ignore')]
    print genesis.raw(fileids=[fileid])[:100], "|", lang, "\n"
In [38]:
some_text = "This is some #@*!$ text! This can't be right!"
print nltk.word_tokenize(some_text)       # Treebank-style word tokenizer
print nltk.wordpunct_tokenize(some_text)  # splits on every run of punctuation
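NLTK also tokenizes at the sentence level; a minimal sketch using the default Punkt model (assumes the 'punkt' data is downloaded):
In [ ]:
print nltk.sent_tokenize(some_text)  # Punkt sentence splitter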
In [41]:
with open('data/7languages.txt', 'rb') as input_file:
    row_reader = unicodecsv.reader(input_file)
    for row in row_reader:
        tokens = nltk.word_tokenize(row[0])
        for t in tokens:
            print t, "|||",
        print
In [44]:
from rosette.api import API, RosetteParameters
In [49]:
api = API(service_url="https://api.rosette.com/rest/v1", user_key="<your_rosette_api_key>")
params = RosetteParameters()
op = api.morphology()
In [51]:
with open('data/7languages.txt', 'rb') as input_file:
    row_reader = unicodecsv.reader(input_file)
    for row in row_reader:
        params["content"] = row[0]  # send each line to the morphology endpoint
        result = op.operate(params)
        tokens = result['lemmas']
        for t in tokens:
            print t['text'], "|||",
        print
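For comparison, a purely local lemmatization sketch with NLTK's WordNet interface (assumes the wordnet corpus is downloaded); unlike the Rosette endpoint, it is English-only and needs a part-of-speech hint:
In [ ]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
print wnl.lemmatize('ran', pos='v'), wnl.lemmatize('geese', pos='n')  # -> run goose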