In [ ]:
# Environment check: report the installed scikit-learn version
# (Python 2 print statement -- this notebook targets Python 2).
import sklearn
print sklearn.__version__

In [1]:
# Environment check: report the installed gensim version
# (the saved output below shows 0.12.1 was used).
import gensim
print gensim.__version__


0.12.1

In [4]:
import string
import csv as csv
import codecs
import numpy as np

csv_file_object = codecs.open('Baseline/en-balanced/Tweets-sarcastic-balanced', 'rb', 'utf-8', 'ignore')
header = csv_file_object.next()
data = []
for row in csv_file_object:
    data.append(row.split('{'))
data = np.array(data)
print len(data)

In [6]:
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import unicodecsv as un
from tweetokenize import Tokenizer
from nltk.tokenize import word_tokenize

# Output: one row per tweet from `data`, with hashtags stripped from the
# text and collected into their own column.
# Columns: classe { id { usr { tweet-without-hashtags { hashtags
des_file_object = codecs.open('Baseline/en-balanced/baseline_nohashtag/Sarcasmo/Tweets-sarcasm-balanced-nohashtag', 'wb', 'utf-8')
writer = un.writer(des_file_object, encoding='utf-8', delimiter='{')
writer.writerow(["classe", "id", "usr", "tweet", "hashtag"])

### SLANG DICTIONARY ###
# Maps a slang token to its tokenized, lower-cased expansion
# (a dict / hash table makes the per-token lookup cheap).
csv_slang_object = csv.reader(open('Baseline/en-balanced/slang.txt', 'rb'), delimiter=';')
headerSlang = csv_slang_object.next()
dataSlang = {}
for line in csv_slang_object:
    dataSlang[line[0]] = word_tokenize(line[1].lower())
###       ###

# Fixes vs. the original cell:
#  * Tokenizer is built once, not once per tweet.
#  * union_text is rebuilt from scratch for every tweet. Previously it was
#    never reset, so a tweet made up only of hashtags reused the previous
#    tweet's text (or raised NameError on the first tweet), and the join was
#    needlessly recomputed once per token (accidental O(n^2)).
#  * Membership test uses the dict directly instead of scanning .keys()
#    (which is an O(n) list scan in Python 2).
gettokens = Tokenizer()
for j in range(len(data)):
    result = gettokens.tokenize(data[j][2].lower())
    hashtags = []
    toke_text = []
    for token in result:
        if token.startswith('#'):
            hashtags.append(token)
        elif token in dataSlang:
            # Known slang: substitute its expanded form.
            toke_text.append(string.join(dataSlang[token]))
        else:
            toke_text.append(token)
    union_text = string.join(toke_text)
    writer.writerow(['sarcasmo', data[j][0], data[j][1], union_text, string.join(hashtags)])

des_file_object.close()

In [ ]:
dataSlang['tks']

In [ ]: