In [5]:
import sys
reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2 hack so implicit str/unicode conversions use UTF-8
import nltk
import csv
import unicodecsv as un
import codecs
import numpy as np
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
# Open up the csv file into a Python object
csv_file_object = codecs.open('SMSSpamCollection', 'rb', 'utf-8', 'ignore')
header = csv_file_object.next()  # next() just skips the first line, which is a header
data = []  # Create a variable called 'data'
for row in csv_file_object:  # run through each row in the csv file,
    data.append(row.rstrip('\n').split('\t'))  # strip the trailing newline, then add each row to data
data = np.array(data)  # Then convert from a list to an array
print len(data)
# Be aware that each item is currently a string in this format
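Each row of data should now be a [label, message] pair, since the loop above splits on the tab character. A quick sanity check (illustrative only; assumes the usual ham/spam labels of this corpus):
In [ ]:
# Illustrative check: data[j,0] is the class label, data[j,1] the raw SMS text
print data[0][0]  # expected: 'ham' or 'spam'
print data[0][1]  # the message body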
In [6]:
data[0]
Out[6]:
In [7]:
csv_slang_object = csv.reader(open('slang.txt', 'rb'), delimiter=';')
headerSlang = csv_slang_object.next()
dataSlang = {}  # switched to a dict (hash table), since lookups are much easier
for line in csv_slang_object:
    dataSlang[line[0]] = word_tokenize(line[1].lower())
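To confirm the table loaded as intended, one can probe a known entry. A minimal sketch, assuming a term such as 'gonna' (used in a later cell) actually appears in slang.txt:
In [ ]:
# Hypothetical lookup: only prints if 'gonna' is a key in slang.txt
if 'gonna' in dataSlang:
    print dataSlang['gonna']  # e.g. ['going', 'to']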
In [8]:
des_file = codecs.open('SMSSpamAnalytics-teste', 'wb', 'utf-8')
writer = un.writer(des_file, encoding='utf-8', delimiter='{')
#writer = csv.writer(des_file, delimiter=';', quotechar='"')
writer.writerow(["Classe", "Tokens_expandido", "Tokens_com_girias", "Tokens_em_dicionario"])  # write the column headers
# using the slang dictionary...
j = 0
while j < len(data):  # iterate over all messages instead of hard-coding the last index
    tks_girias = []  # slang tokens found in this message
    tks_dict = []    # tokens found in WordNet
    sent = word_tokenize(data[j, 1].lower())
    i = 0
    while i < len(sent):  # sent grows as slang expands, so re-check its length each pass
        if sent[i] in dataSlang:
            tks_girias.append(sent[i])
            sent[i:i+1] = dataSlang[sent[i]]  # replace the slang token with its expansion, in place
        if wn.synsets(sent[i]):  # non-empty synsets means the token is a dictionary word
            tks_dict.append(sent[i])
        i += 1
    no_tk = ' '.join(sent)  # rebuild the sentence once per message, so rows with no matches don't reuse the previous value
    writer.writerow([data[j, 0], no_tk, str(tks_girias), str(tks_dict)])
    # print sent
    j = j + 1
des_file.close()
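Two details of the loop above are worth isolating. First, the slice assignment sent[i:i+1] = [...] splices several tokens in place of one, growing the list; second, wn.synsets() returns an empty list for out-of-vocabulary tokens, which is why it doubles as a dictionary-membership test. A toy illustration of both:
In [ ]:
# Slice assignment replaces one token with many, in place
toy = ['u', 'gonna', 'win']
toy[1:2] = ['going', 'to']  # expand 'gonna'
print toy  # ['u', 'going', 'to', 'win']
# synsets() is empty for non-words, non-empty for dictionary words
print bool(wn.synsets('win'))     # True
print bool(wn.synsets('xyzzyq'))  # False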
In [ ]:
'''# sem usar o dicionário...
sent = word_tokenize(data[0][1].lower())
print sent
for i in range(len(sent)):
if sent[i] in dataSlang.keys():
sent[i:i+1] = dataSlang[sent[i]]
print sent'''
In [ ]:
data[35]
In [ ]:
tokens = word_tokenize("Ola mundo tokenizado gonna")  # renamed from 'string' to avoid shadowing the stdlib module
tokens
In [ ]:
nova = ' '.join(tokens)  # join returns a new string, so the 'nova = []' initializer was dead code
In [ ]:
nova
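One caveat when rebuilding text this way: ' '.join is lossy around punctuation and clitics, because word_tokenize splits them into separate tokens. For example:
In [ ]:
# Treebank-style tokenization splits "don't" into 'do' + "n't",
# so joining with spaces does not reproduce the original string
print ' '.join(word_tokenize("don't stop!"))  # do n't stop !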