In [4]:
import enchant
In [5]:
# The underlying programming model provided by the Enchant library is based on the notion of Providers.
# A provider is a piece of code that provides spell-checking services which Enchant can use to perform its work.
# Different providers exist for performing spellchecking using different frameworks -
# for example there is an aspell provider and a MySpell provider.
## no need to work with brokers directly when using enchant; this is just a quick check that everything is installed
b = enchant.Broker()
print(b.describe())
b.list_dicts()
Out[5]:
In [6]:
enchant.list_languages()
Out[6]:
In [7]:
d = enchant.Dict("it_IT")
In [8]:
d.check('Giulia'), d.check('pappapero')
Out[8]:
In [9]:
print( d.suggest("potreima") )
print( d.suggest("marema") )
print( d.suggest("se metto troppe parole lo impallo") )
print( d.suggest("van no") )
print( d.suggest("due parole") )
In [10]:
# Dict objects can also be used to check words against a custom list of correctly-spelled words
# known as a Personal Word List. This is simply a file listing the words to be considered, one word per line.
# The following example creates a Dict object for the personal word list stored in “mywords.txt”:
pwl = enchant.request_pwl_dict("../../Data_nlp/mywords.txt")
In [11]:
pwl.check('pappapero'), pwl.suggest('cittin'), pwl.check('altro')
Out[11]:
In [12]:
# PyEnchant also provides the class DictWithPWL which can be used to combine a language dictionary
# and a personal word list file:
d2 = enchant.DictWithPWL("it_IT", "../../Data_nlp/mywords.txt")
In [13]:
d2.check('altro') and d2.check('pappapero'), d2.suggest('cittin')
Out[13]:
In [14]:
%%timeit
d2.suggest('poliza')
In [15]:
from enchant.checker import SpellChecker
chkr = SpellChecker("it_IT")
In [16]:
chkr.set_text("questo è un picclo esmpio per dire cm funziona")
for err in chkr:
    print(err.word)
    print(chkr.suggest(err.word))
In [17]:
print(chkr.word, chkr.wordpos)
In [18]:
chkr.replace('pippo')
chkr.get_text()
Out[18]:
As explained above, the module enchant.tokenize provides the ability to split text into its component words. The current implementation is based only on the rules for the English language, and so might not be completely suitable for your language of choice. Fortunately, it is straightforward to extend the functionality of this module.
To implement a new tokenization routine for the language TAG, simply create a class/function “tokenize” within the module “enchant.tokenize.TAG”. This function will automatically be detected by the module’s get_tokenizer function and used when appropriate. The easiest way to accomplish this is to copy the module “enchant.tokenize.en” and modify it to suit your needs.
In [19]:
from enchant.tokenize import get_tokenizer
tknzr = get_tokenizer("en_US") # not tak for it_IT up to now
[w for w in tknzr("this is some simple text")]
Out[19]:
In [20]:
from enchant.tokenize import get_tokenizer, HTMLChunker
tknzr = get_tokenizer("en_US")
[w for w in tknzr("this is <span class='important'>really important</span> text")]
Out[20]:
In [28]:
tknzr = get_tokenizer("en_US",chunkers=(HTMLChunker,))
[w for w in tknzr("this is <span class='important'>really important</span> text")]
Out[28]:
In [21]:
from enchant.tokenize import get_tokenizer, EmailFilter
tknzr = get_tokenizer("en_US")
[w for w in tknzr("send an email to fake@example.com please")]
Out[21]:
In [22]:
tknzr = get_tokenizer("en_US", filters = [EmailFilter])
[w for w in tknzr("send an email to fake@example.com please")]
Out[22]:
Other modules:
The module enchant.checker.CmdLineChecker provides the class CmdLineChecker, which can be used to interactively check the spelling of some text. It uses standard input and standard output to interact with the user through a command-line interface. The code below shows how to create and use this class from within a Python application:
The module enchant.checker.wxSpellCheckerDialog provides the class wxSpellCheckerDialog which can be used to interactively check the spelling of some text. The code below shows how to create and use such a dialog from within a wxPython application.
In [23]:
import gensim, logging
from gensim.models import Word2Vec
In [32]:
model = gensim.models.KeyedVectors.load_word2vec_format(
'../../Data_nlp/GoogleNews-vectors-negative300.bin.gz', binary=True)
In [33]:
model.doesnt_match("breakfast brian dinner lunch".split())
Out[33]:
In [35]:
# pass a file of word pairs ("w1 w2 expected_similarity" per line) to check whether the
# model's similarities correlate with the human-assigned ones;
# a pairs file is required -- the path below is a placeholder
model.evaluate_word_pairs('../../Data_nlp/wordpairs.tsv')
In [36]:
len(model.index2word)
Out[36]:
In [37]:
# check analogy accuracy against premade groups of test questions
questions_words = model.accuracy('../../Data_nlp/word2vec/trunk/questions-words.txt')
phrases_words = model.accuracy('../../Data_nlp/word2vec/trunk/questions-phrases.txt')
In [38]:
questions_words[4]['incorrect']
Out[38]:
In [39]:
print( model.n_similarity(['pasta'], ['spaghetti']) )
print( model.n_similarity(['pasta'], ['tomato']) )
print( model.n_similarity(['pasta'], ['car']) )
print( model.n_similarity(['cat'], ['dog']) )
In [40]:
model.similar_by_vector( model.word_vec('welcome') )
Out[40]:
In [41]:
model.similar_by_word('welcome')
Out[41]:
In [42]:
model.syn0[4,]
Out[42]:
In [43]:
model.index2word[4]
Out[43]:
In [44]:
model.word_vec('is')
Out[44]:
In [45]:
model.syn0norm[4,]
Out[45]:
In [46]:
model.vector_size
Out[46]:
In [47]:
import numpy as np
model.similar_by_vector( (model.word_vec('Goofy') + model.word_vec('Minni'))/2 )
Out[47]:
In [48]:
import pyemd
# This method only works if `pyemd` is installed (can be installed via pip, but requires a C compiler).
sentence_obama = 'Obama speaks to the media in Illinois'.lower().split()
sentence_president = 'The president greets the press in Chicago'.lower().split()
# Remove their stopwords.
import nltk
stopwords = nltk.corpus.stopwords.words('english')
sentence_obama = [w for w in sentence_obama if w not in stopwords]
sentence_president = [w for w in sentence_president if w not in stopwords]
# Compute WMD.
distance = model.wmdistance(sentence_obama, sentence_president)
print(distance)
In [49]:
import nltk
stopwords = nltk.corpus.stopwords.words('english')
def sentence_distance(s1, s2):
    s1_words = [w for w in s1.split() if w not in stopwords]
    s2_words = [w for w in s2.split() if w not in stopwords]
    print(s1_words, s2_words, sep='\t')
    print(model.wmdistance(s1_words, s2_words), end='\n\n')
In [50]:
sentence_distance('I run every day in the morning', 'I like football')
sentence_distance('I run every day in the morning', 'I run since I was born')
sentence_distance('I run every day in the morning', 'you are idiot')
sentence_distance('I run every day in the morning', 'Are you idiot?')
sentence_distance('I run every day in the morning', 'Is it possible to die?')
sentence_distance('I run every day in the morning', 'Is it possible to die')
sentence_distance('I run every day in the morning', 'I run every day')
sentence_distance('I run every day in the morning', 'I eat every day')
sentence_distance('I run every day in the morning', 'I have breakfast in the morning')
sentence_distance('I run every day in the morning', 'I have breakfast every day in the morning')
sentence_distance('I run every day in the morning', 'Each day I run')
sentence_distance('I run every day in the morning', 'I run every day in the morning')
In [51]:
sentence_distance('I run every day in the morning', 'Each day I run')
sentence_distance('I run every day in the morning', 'Each I run')
sentence_distance('I run every day in the morning', 'Each day run')
sentence_distance('I run every day in the morning', 'Each day I')
sentence_distance('I every day in the morning', 'Each day I run')
sentence_distance('I run day in the morning', 'Each day I run')
sentence_distance('I run every in morning', 'Each day I run')
sentence_distance('I run every in', 'Each day I run')
In [52]:
def get_vect(w):
    # fall back to a zero vector for out-of-vocabulary words
    try:
        return model.word_vec(w)
    except KeyError:
        return np.zeros(model.vector_size)

def calc_avg(s):
    # average the word vectors of the non-stopword tokens
    ws = [get_vect(w) for w in s.split() if w not in stopwords]
    return sum(ws) / len(ws)

from scipy.spatial import distance

def get_euclidean(s1, s2):
    return distance.euclidean(calc_avg(s1), calc_avg(s2))
In [53]:
# same questions
s1 = 'Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?'
s2 = "I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?"
sentence_distance(s1, s2)
print(get_euclidean(s1, s2))
In [54]:
# same questions as above without punctuations
s1 = 'Astrology I am a Capricorn Sun Cap moon and cap rising what does that say about me'
s2 = "I am a triple Capricorn Sun Moon and ascendant in Capricorn What does this say about me"
sentence_distance(s1, s2)
print(get_euclidean(s1, s2))
In [55]:
# same questions
s1 = 'What is best way to make money online'
s2 = 'What is best way to ask for money online?'
sentence_distance(s1,s2)
print(get_euclidean(s1, s2))
In [56]:
# different questions
s1 = 'How did Darth Vader fought Darth Maul in Star Wars Legends?'
s2 = 'Does Quora have a character limit for profile descriptions?'
sentence_distance(s1,s2)
print(get_euclidean(s1, s2))
In [57]:
# the order of the words doesn't change the distance between the two phrases
s1ws = [w for w in s1.split() if w not in stopwords]
s2ws = [w for w in s2.split() if w not in stopwords]
print(model.wmdistance(s1ws, s2ws) )
print(model.wmdistance(s1ws[::-1], s2ws) )
print(model.wmdistance(s1ws, s2ws[::-1]) )
print(model.wmdistance(s1ws[3:]+s1ws[0:3], s2ws[::-1]) )
conclusion: WMD treats sentences as bags of words, so reordering the words leaves the distance unchanged.
In [60]:
from googletrans import Translator
In [61]:
with open("../../AliceNelPaeseDelleMeraviglie.txt") as f:
    text = f.read()
In [62]:
translator = Translator()
In [63]:
for i in range(42, 43):
    print(text[i * 1000:i * 1000 + 1000], end='\n\n')
    print(translator.translate(text[i * 1000:i * 1000 + 1000], dest='en').text)
In [64]:
## if the source language is not passed it is auto-detected, so the translator can also be used to detect a language
frase = "Ciao Giulia, ti va un gelato?"
det = translator.detect(frase)
print("Languge:", det.lang, " with confidence:", det.confidence)
In [65]:
# command-line usage, though it doesn't seem to work for me
!translate "veritas lux mea" -s la -d en
In [66]:
translations = translator.translate(
['The quick brown fox', 'jumps over', 'the lazy dog'], dest='ko')
for translation in translations:
    print(translation.origin, ' -> ', translation.text)
In [67]:
phrase = translator.translate(frase, 'en')
phrase.origin, phrase.text, phrase.src, phrase.pronunciation, phrase.dest
Out[67]:
How to install:
Info:
In [70]:
from treetagger import TreeTagger
tt = TreeTagger(language='english')
tt.tag('What is the airspeed of an unladen swallow?')
Out[70]:
In [71]:
tt = TreeTagger(language='italian')
tt.tag('Proviamo a vedere un pò se funziona bene questo tagger')
Out[71]: