In [ ]:
import numpy as np
import codecs
# Open the labelled tweet file ('Diretório' is the path placeholder kept from the original).
open_file = codecs.open('Diretório')
header = next(open_file)  # skip the header line
data = []
for row in open_file:
    # Each row is assumed to hold four '{'-separated fields; the last one is the tweet text.
    klass, _, _, twit = row.split('{')
    data.append(twit)
data = np.array(data)
len(data)
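For reference, each row is expected to look roughly like the hypothetical line below, with the class label in the first '{'-separated field and the tweet text in the last (the middle fields and exact layout are assumptions, not shown in this excerpt):
In [ ]:
# Hypothetical row, only to illustrate the split on '{' used above.
exemplo = u'positivo{field1{field2{Adorei o episódio\n'
exemplo.split('{')  # -> [u'positivo', u'field1', u'field2', u'Adorei o episódio\n']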
In [ ]:
from tweetokenize import Tokenizer
gettokens = Tokenizer()
# Tokenize one tweet (`twit` still holds the last row from the loop above),
# dropping any non-ASCII characters first.
palavras = gettokens.tokenize(twit.encode('ASCII', 'ignore'))
In [ ]:
import unicodedata
# normalize() takes one string at a time, so apply it to each token rather than to the list.
words = [unicodedata.normalize('NFKD', i) for i in palavras]
In [ ]:
def tokenize(objeto):
    """Tokenize every text in `objeto` with tweetokenize; returns one token list per text."""
    from tweetokenize import Tokenizer
    gettokens = Tokenizer()
    text = [gettokens.tokenize(words) for words in objeto]
    return text
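The cells below use a variable `termos` that is never defined in this excerpt; judging from the unicodecsv import that follows, it was presumably read from an intermediate CSV file. A minimal sketch under that assumption (the file name is hypothetical):
In [ ]:
# Assumption: `termos` holds the rows of an intermediate CSV file produced
# earlier; the real file name and column layout are not shown in this excerpt.
import unicodecsv
with open('tweets.csv', 'rb') as f:  # hypothetical file name
    termos = [row for row in unicodecsv.reader(f, encoding='utf-8')]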
In [ ]:
import unicodecsv as un
len(termos)
In [ ]:
import unicodedata
normal = []
# Strip accents from every entry of every row.
for i in range(len(termos)):
    normal.append([unicodedata.normalize('NFKD', words).encode('ASCII', 'ignore')
                   for words in termos[i]])
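For illustration, it is the NFKD decomposition followed by the ASCII 'ignore' encoding that strips the Portuguese diacritics (the sample word below is made up for this example):
In [ ]:
# NFKD splits an accented character into base letter + combining mark, and
# encoding to ASCII with 'ignore' then drops the combining marks.
unicodedata.normalize('NFKD', u'coração').encode('ASCII', 'ignore')  # -> 'coracao'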
In [ ]:
from tweetokenize import Tokenizer
gettokens = Tokenizer()
# NFKD-normalize each string before encoding and tokenizing it
# (normalize() takes a single string, not a list).
words = [gettokens.tokenize(unicodedata.normalize('NFKD', teste).encode('ASCII', 'ignore'))
         for teste in termos[40247]]
In [ ]:
# Same row tokenized after dropping non-ASCII characters, without the NFKD step.
[gettokens.tokenize(teste.encode('ASCII', 'ignore')) for teste in termos[40247]]