In [5]:
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import nltk
import csv as csv
import unicodecsv as un
import codecs
import numpy as np
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize

#Open up the csv file into a Python object
csv_file_object = codecs.open('SMSSpamCollection', 'rb', 'utf-8', 'ignore')
header = csv_file_object.next() #next() just skips the first line, which is a header
data = []                       #Create a variable called 'data'
for row in csv_file_object:     #run through each row in the csv file,
    data.append(row.split('\t'))            #Adding each row to the data variable
  
data = np.array(data)           #Then convert from a list to an array
print len(data)
#Be aware that each item is currently a string, in the format shown below

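A quick sanity check on the load (a sketch: it assumes, as the cells below do, that column 0 of `data` holds the ham/spam label):

In [ ]:
# Count how many ham vs. spam messages were loaded (sketch)
from collections import Counter
print Counter(data[:, 0])
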
In [6]:
data[0]


Out[6]:
array([u'ham',
       u'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n'], 
      dtype='<U911')

In [7]:
csv_slang_object = csv.reader(open('slang.txt', 'rb'), delimiter=';')
headerSlang = csv_slang_object.next()
dataSlang = {}  # switched to a dict (hash table), since it makes lookups easier
for line in csv_slang_object:
    dataSlang[line[0]] = word_tokenize(line[1].lower())
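
A lookup sketch against the dict just built ('gonna' is only an assumed entry of slang.txt; any key behaves the same way):

In [ ]:
# Expansions are stored as token lists, e.g. possibly [u'going', u'to']
print dataSlang.get('gonna', 'not in the slang list')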

In [8]:
# unicodecsv encodes to utf-8 itself, so the output file is opened in plain
# binary mode (writing through codecs.open would encode the bytes a second time)
des_file = open('SMSSpamAnalytics-teste', 'wb')
writer = un.writer(des_file, encoding='utf-8', delimiter='{')
#writer = csv.writer(des_file, delimiter=';', quotechar='"')
writer.writerow(["Classe","Tokens_expandido", "Tokens_com_girias", "Tokens_em_dicionario"])#write the column headers

# using the dictionary...
for j in range(len(data)):       # iterate over the real row count instead of a hard-coded 5573
    tks_girias = []              # slang tokens found in this message
    tks_dict = []                # tokens that have a WordNet synset
    sent = word_tokenize(data[j, 1].lower())

    i = 0
    while i < len(sent):         # while loop, because sent grows when a slang token is expanded
        if sent[i] in dataSlang:
            tks_girias.append(sent[i])
            sent[i:i+1] = dataSlang[sent[i]]  # splice the expansion in place of the slang token
        if wn.synsets(sent[i]):
            tks_dict.append(sent[i])
        i += 1

    no_tk = ' '.join(sent)       # rebuild the expanded message once per row
    writer.writerow([data[j, 0], no_tk, str(tks_girias), str(tks_dict)])

des_file.close()
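
To spot-check the output, read the first rows back with the same '{' delimiter and utf-8 encoding the writer used (a minimal sketch):

In [ ]:
with open('SMSSpamAnalytics-teste', 'rb') as f:
    for k, row in enumerate(un.reader(f, encoding='utf-8', delimiter='{')):
        print row
        if k >= 2:
            break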

In [ ]:
'''# without using the dictionary...

sent = word_tokenize(data[0][1].lower())
print sent
for i in range(len(sent)):
    if sent[i] in dataSlang.keys():
        sent[i:i+1] = dataSlang[sent[i]]
            
print sent'''

In [ ]:
data[35]

In [ ]:
tokens = word_tokenize("Hello tokenized world gonna")  # renamed: 'string' shadowed the stdlib module
tokens

In [ ]:
nova = ' '.join(tokens)

In [ ]:
nova
