In [ ]:
import numpy as np
import codecs

# 'Diretório' is the placeholder path to the tweet file; utf-8 encoding is assumed here.
open_file = codecs.open('Diretório', encoding='utf-8')
header = next(open_file)  # skip the header line
data = []

for row in open_file:
    # Each row is assumed to carry four '{'-separated fields; the last one is the tweet text.
    klass, _, _, twit = row.split('{')
    data.append(twit)
data = np.array(data)
len(data)

In [ ]:
from tweetokenize import Tokenizer
gettokens = Tokenizer()
# Note: `twit` still holds only the last tweet from the loop above, so this tokenizes a single tweet.
palavras = gettokens.tokenize(twit.encode('ASCII', 'ignore'))

In [ ]:
import unicodedata
# unicodedata.normalize() expects a single string, not a list, so normalize each token individually.
words = [unicodedata.normalize('NFKD', palavra) for palavra in palavras]

In [ ]:
def tokenize(objeto):
    """Tokenize every tweet in `objeto` with tweetokenize's Tokenizer."""
    from tweetokenize import Tokenizer
    gettokens = Tokenizer()

    text = [gettokens.tokenize(words) for words in objeto]
    
    return text
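
In [ ]:
# A minimal usage sketch (assumption): the later cells rely on a variable `termos` that
# is never defined in this notebook; presumably it holds one list of strings per tweet,
# e.g. the result of applying tokenize() to the `data` array loaded above.
termos = tokenize(data)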

In [ ]:
import unicodecsv as un  # unicode-aware CSV I/O, presumably for saving the processed tokens
len(termos)  # number of tokenized tweets

In [ ]:
import unicodedata
normal = []
for tokens in termos:
    # NFKD-normalize each token and drop any characters that cannot be encoded as ASCII.
    normal.append([unicodedata.normalize('NFKD', words).encode('ASCII', 'ignore') for words in tokens])
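
In [ ]:
# A hedged sketch (assumption): the `unicodecsv` import above suggests the normalized
# tokens were meant to be written back to disk; 'tokens_normalizados.csv' is a
# hypothetical file name.
with open('tokens_normalizados.csv', 'wb') as saida:
    writer = un.writer(saida, encoding='utf-8')
    writer.writerows(normal)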

In [ ]:
from tweetokenize import Tokenizer
import unicodedata
gettokens = Tokenizer()
# normalize() takes a single string, so apply it to each entry before encoding and tokenizing.
words = [gettokens.tokenize(unicodedata.normalize('NFKD', teste).encode('ASCII', 'ignore')) for teste in termos[40247]]

In [ ]:
# Same tokenization as above, but without the NFKD normalization step.
[gettokens.tokenize(teste.encode('ASCII', 'ignore')) for teste in termos[40247]]