In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression
np.random.seed(241)  # for reproducibility
from gensim.models import Word2Vec
import sklearn.cross_validation as valid
import sklearn.metrics as metrics
from keras.constraints import unitnorm
from keras.layers.core import Dense,Dropout,Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.models import Sequential
import re, string
import io, csv
from sklearn.cross_validation import KFold
import sklearn.cross_validation as valid
from keras.optimizers import RMSprop


Using Theano backend.

In [2]:
np.random.seed(241)
punc = r'[\!#$%&()*+,-./:;<=>?\[\]^_`{|}~\"]'
punc_regex = re.compile(punc)
space_regex = re.compile('\s+')
article_regex = re.compile(r'^.*\nTEXT\:', re.M | re.DOTALL)
digits_regex = re.compile('\d+')
someOneRegex = re.compile(r"@\S+\s")
urlfinder = re.compile("https?:\/\/\S+")

In [3]:
def process_text(text,soup=True):
    if soup:
        soup = BeautifulSoup(text, 'html.parser')
        text = soup.get_text()
    text = digits_regex.sub(u'0', text.lower())
    text = urlfinder.sub(u' ReplacedUrl ', text)
    text = punc_regex.sub(u' ', text)
    text = space_regex.sub(u' ', text)
    return text

In [4]:
def pad_sentences(data, maxlen=None, value=0.):
    lengths = [len(s) for s in data]
    nb_samples = len(data)
    if maxlen is None:
        maxlen = np.max(lengths)
    x = (np.ones((nb_samples, maxlen)) * value).astype(np.int)
    for idx, s in enumerate(data):
        if len(s) == 0:
            continue  # empty list was found
        trunc = s[-maxlen:]
        x[idx, -len(trunc):] = trunc
    return x, maxlen

In [5]:
# w2v_model = Word2Vec.load_word2vec_format('data\\bilingual.bin', binary=True)
w2v_model = Word2Vec.load('data\\bilingual_mixed')

In [6]:
def twitter_parse_rus(file_name,sentiment, X, y):
    with open(file_name, 'r') as csv_file:
        reader = csv.reader(csv_file, delimiter=';')
        for row in reader:
            rowText = row[3].decode('utf-8')
            if len(rowText) > 10:
                rowText = someOneRegex.sub('@someone', rowText)
                rowText = process_text(rowText, soup=False)
                sent = []
                words = rowText.split()
                for word in words:
                    if word in w2v_model.vocab:
                        sent.append(w2v_model.vocab[word].index)
                X.append(sent)
                y.append(sentiment)
                
def twitter_parse_en(file_name, X, y):
    with open(file_name, 'r') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        for row in reader:
            try:
                rowText = row[5].decode('utf-8')
                sentiment = int(row[0])
                if(sentiment == 2):
                    continue
                elif(sentiment == 4):
                    sentiment = 1                    
                if len(rowText) > 10:
                    rowText = someOneRegex.sub('@someone', rowText)
                    rowText = process_text(rowText, soup=False)
                    sent = []
                    words = rowText.split()
                    for word in words:
                        if word in w2v_model.vocab:
                            sent.append(w2v_model.vocab[word].index)
                    X.append(sent)
                    y.append(sentiment)
            except:
                v = 0

In [7]:
X_eng = []
y_eng = []
# twitter_parse('data\\negative.csv',0,X,y)
# twitter_parse('data\\positive.csv',1,X,y)
twitter_parse_en('data\\en_twitter.csv',X_eng,y_eng)
X_eng, maxlen = pad_sentences(X_eng)
y_eng = np.array(y_eng)
ss = np.random.choice(X_eng.shape[0], 20000)
X_eng = X_eng[ss]
y_eng = y_eng[ss]
print X_eng.shape
X_eng_train, X_eng_test, y_eng_train, y_eng_test = valid.train_test_split(X_eng, y_eng, test_size=0.5)


(20000L, 52L)

In [8]:
X_rus = []
y_rus = []
twitter_parse_rus('data\\rus_negative_twitter.csv',0,X_rus,y_rus)
twitter_parse_rus('data\\rus_positive_twitter.csv',1,X_rus,y_rus)
X_rus, maxlen = pad_sentences(X_rus, maxlen)
y_rus = np.array(y_rus)
ss = np.random.choice(X_rus.shape[0], 20000)
X_rus = X_rus[ss]
y_rus = y_rus[ss]
print X_rus.shape
X_rus_train, X_rus_test, y_rus_train, y_rus_test = valid.train_test_split(X_rus, y_rus, test_size=0.5)


(20000L, 52L)

In [9]:
X_train = np.vstack((X_eng_train,X_rus_train))
y_train = np.hstack((y_eng_train,y_rus_train))

In [10]:
# set parameters:
word_vocub_len = w2v_model.syn0.shape[0]
word_maxlen= maxlen
word_embedding_dims = w2v_model.syn0.shape[1]
hidden_dims = 250
nb_epoch = 2

In [11]:
model = Sequential()
model.add(Embedding(input_dim=word_vocub_len,output_dim=word_embedding_dims, input_length=word_maxlen,weights=[w2v_model.syn0], W_constraint=unitnorm()))#, weights=[wordvectors.W], W_constraint=unitnorm()
model.add(Dropout(0.2))
# model.add(LSTM(64,return_sequences=True))
model.add(GRU(64, dropout_W=0.2, dropout_U=0.2))
# model.add(Dense(128,activation='relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
print('compile start')
rmsprop = RMSprop(lr=0.001)
model.compile(loss='binary_crossentropy', optimizer=rmsprop, metrics=["accuracy"])
print('model compilled')


compile start
model compilled

In [12]:
batch_size = 128
('fit start')
# ss_eng = np.random.choice(X.shape[0], 15000)
# ss_rus = np.random.choice(X_rus.shape[0], 500)
# X_train, X_test, y_train, y_test = valid.train_test_split(np.vstack((X[ss_eng],X_rus[ss_rus])), np.hstack((y[ss_eng],y_rus[ss_rus])), test_size=0.3)
print(X_train.shape)
# for i in xrange(N_epoch):
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=1, verbose=1)
print('fitting ended')


(20000L, 52L)
Epoch 1/1
20000/20000 [==============================] - 574s - loss: 0.6123 - acc: 0.6666   
fitting ended

In [13]:
y_predscore = model.predict_proba(X_eng_test, batch_size=batch_size, verbose=1)
y_pred = np.round(y_predscore)
auc = metrics.roc_auc_score(y_eng_test, y_predscore)
acc = metrics.accuracy_score(y_eng_test,y_pred)
print 'validation on english dataset: accuracy = {:.5}, roc_auc ={:.5}'.format( acc, auc)


10000/10000 [==============================] - 5s     
validation on english dataset: accuracy = 0.736, roc_auc =0.80574

In [14]:
ss = np.random.choice(X_rus_test.shape[0], X_eng_test.shape[0])
y_predscore = model.predict_proba(X_rus_test[ss], batch_size=batch_size, verbose=1)
y_pred = np.round(y_predscore)
auc = metrics.roc_auc_score(y_rus_test[ss], y_predscore)
acc = metrics.accuracy_score(y_rus_test[ss], y_pred)
print 'validation on russian dataset: accuracy = {:.5}, roc_auc ={:.5}'.format(acc, auc)


10000/10000 [==============================] - 5s     
validation on russian dataset: accuracy = 0.6968, roc_auc =0.78053

In [ ]: