In [ ]:
import sklearn
print sklearn.__version__
In [1]:
import gensim
print gensim.__version__
In [4]:
import string
import csv
import codecs
import numpy as np

# Load the balanced sarcastic-tweet corpus; fields are separated by '{'
csv_file_object = codecs.open('Baseline/en-balanced/Tweets-sarcastic-balanced', 'rb', 'utf-8', 'ignore')
header = csv_file_object.next()  # skip the header line
data = []
for row in csv_file_object:
    data.append(row.split('{'))
data = np.array(data)
print len(data)
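A quick sanity check may help before the processing cell: that cell indexes each parsed row as data[j][0] = id, data[j][1] = usr, data[j][2] = tweet, so printing one row (a sketch; the field order is inferred from that indexing, not from the file itself) confirms the '{'-split produced the expected layout.
In [ ]:
# Sketch: inspect the first parsed row. The assumed field order
# (id, usr, tweet, ...) comes from how rows are indexed in the next cell.
print data[0][:3]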
In [6]:
import sys
reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2 workaround so implicit str/unicode conversions use utf-8
import unicodecsv as un
from tweetokenize import Tokenizer
from nltk.tokenize import word_tokenize

# Output: the same corpus with hashtags moved out of the tweet text into their own column
des_file_object = codecs.open('Baseline/en-balanced/baseline_nohashtag/Sarcasmo/Tweets-sarcasm-balanced-nohashtag', 'wb', 'utf-8')
writer = un.writer(des_file_object, encoding='utf-8', delimiter='{')
writer.writerow(["classe", "id", "usr", "tweet", "hashtag"])

### SLANG DICTIONARY ###
csv_slang_object = csv.reader(open('Baseline/en-balanced/slang.txt', 'rb'), delimiter=';')
headerSlang = csv_slang_object.next()
dataSlang = {}  # switched to a dict (hash table), since lookups are easier
for line in csv_slang_object:
    dataSlang[line[0]] = word_tokenize(line[1].lower())
### ###

gettokens = Tokenizer()  # one tokenizer instance is enough; no need to rebuild it per tweet
for j in range(len(data)):
    result = gettokens.tokenize(data[j][2].lower())
    hashtags = []
    toke_text = []
    for i in result:
        if i.startswith('#'):
            hashtags.append(i)
        elif i in dataSlang:
            toke_text.append(string.join(dataSlang[i]))  # expand slang to its long form
        else:
            toke_text.append(i)
    union_text = string.join(toke_text)
    writer.writerow(['sarcasmo', data[j][0], data[j][1], union_text, string.join(hashtags)])
des_file_object.close()
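To verify the output, a minimal read-back sketch (assuming the same '{' delimiter and utf-8 encoding the writer used above):
In [ ]:
# Read the freshly written file and show the header plus the first rewritten tweet.
check_file = open('Baseline/en-balanced/baseline_nohashtag/Sarcasmo/Tweets-sarcasm-balanced-nohashtag', 'rb')
check_reader = un.reader(check_file, encoding='utf-8', delimiter='{')
print check_reader.next()  # header row
print check_reader.next()  # first data row: hashtags should be in the last column only
check_file.close()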
In [ ]:
dataSlang['tks']  # spot-check one slang entry
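Indexing a term that is not in the dictionary raises a KeyError; a minimal alternative sketch uses dict.get with the raw token as the fallback, mirroring the else-branch of the processing loop above.
In [ ]:
# Safe lookup sketch: fall back to the token itself when it is not slang.
token = 'tks'
print string.join(dataSlang.get(token, [token]))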
In [ ]: