In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression
np.random.seed(241)  # for reproducibility
from gensim.models import Word2Vec
import sklearn.cross_validation as valid
import sklearn.metrics as metrics
from keras.constraints import unitnorm
from keras.layers.core import Dense,Dropout,Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.models import Sequential
import re, string
import io, csv
from bs4 import BeautifulSoup  # needed by process_text when soup=True
from sklearn.cross_validation import KFold
from keras.optimizers import RMSprop


Using Theano backend.

In [2]:
np.random.seed(241)
punc = r'[\!#$%&()*+,-./:;<=>?\[\]^_`{|}~\"]'
punc_regex = re.compile(punc)
space_regex = re.compile(r'\s+')
article_regex = re.compile(r'^.*\nTEXT\:', re.M | re.DOTALL)
digits_regex = re.compile(r'\d+')
someOneRegex = re.compile(r"@\S+\s")
urlfinder = re.compile(r"https?://\S+")

In [3]:
def process_text(text, soup=True):
    if soup:
        # strip HTML markup and keep only the visible text
        soup = BeautifulSoup(text, 'html.parser')
        text = soup.get_text()
    text = digits_regex.sub(u'0', text.lower())   # lowercase and collapse every digit run to a single 0
    text = urlfinder.sub(u' ReplacedUrl ', text)  # replace URLs with a placeholder token
    text = punc_regex.sub(u' ', text)             # strip punctuation
    text = space_regex.sub(u' ', text)            # squeeze repeated whitespace
    return text

In [4]:
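# Left-pad (or truncate) each list of word indices to a common length, since the Embedding layer expects fixed-size input.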
def pad_sentences(data, maxlen=None, value=0.):
    lengths = [len(s) for s in data]
    nb_samples = len(data)
    if maxlen is None:
        maxlen = np.max(lengths)
    x = (np.ones((nb_samples, maxlen)) * value).astype(np.int)
    for idx, s in enumerate(data):
        if len(s) == 0:
            continue  # empty list was found
        trunc = s[-maxlen:]
        x[idx, -len(trunc):] = trunc
    return x, maxlen

In [5]:
w2v_model = Word2Vec.load_word2vec_format('data\\bilingual.bin', binary=True)
# w2v_model = Word2Vec.load('data\\bilingual_mixed')
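For reference, the two helpers above combine as follows. This is a minimal sketch added for illustration (not an original cell); the example sentence is made up, and it assumes the w2v_model just loaded:

# Illustration only: map one raw sentence to a fixed-length row of word-vector indices.
sample = process_text(u'Check https://example.com, it scored 95 points!', soup=False)
sample_idx = [w2v_model.vocab[w].index for w in sample.split() if w in w2v_model.vocab]
sample_padded, _ = pad_sentences([sample_idx], maxlen=60)
print sample_padded.shape  # a (1, 60) array; indices are right-aligned, zeros pad the left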

In [12]:
def parse_kaz(file_name, X, y):
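    # Kazakh news comments: tab-separated rows of (label, text); a label of -1 is mapped to 0 below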
    with open(file_name, 'r') as csv_file:
        reader = csv.reader(csv_file, delimiter='\t')
        for row in reader:
            try:
                rowText = row[1].decode('utf-8')
                sentiment = int(row[0])
                if sentiment == -1:   # map the -1/+1 label scheme onto 0/1
                    sentiment = 0
                if len(rowText) > 10:
                    rowText = someOneRegex.sub('@someone', rowText)
                    rowText = process_text(rowText, soup=False)
                    sent = []
                    words = rowText.split()
                    for word in words:
                        if word in w2v_model.vocab:
                            sent.append(w2v_model.vocab[word].index)
                    X.append(sent)
                    y.append(sentiment)
            except Exception:
                continue  # skip rows that fail to parse
def twitter_parse_rus(file_name, sentiment, X, y):
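    # Russian tweets: semicolon-separated rows with the text in column index 3; the label is passed in by the caller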
    with open(file_name, 'r') as csv_file:
        reader = csv.reader(csv_file, delimiter=';')
        for row in reader:
            rowText = row[3].decode('utf-8')
            if len(rowText) > 10:
                rowText = someOneRegex.sub('@someone', rowText)
                rowText = process_text(rowText, soup=False)
                sent = []
                words = rowText.split()
                for word in words:
                    if word in w2v_model.vocab:
                        sent.append(w2v_model.vocab[word].index)
                X.append(sent)
                y.append(sentiment)
                
def twitter_parse_en(file_name, X, y):
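    # English tweets: comma-separated rows with the label in column 0 and the text in column 5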
    with open(file_name, 'r') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        for row in reader:
            try:
                rowText = row[5].decode('utf-8')
                sentiment = int(row[0])
                if sentiment == 2:      # skip neutral tweets
                    continue
                elif sentiment == 4:    # positive label 4 becomes 1
                    sentiment = 1
                if len(rowText) > 10:
                    rowText = someOneRegex.sub('@someone', rowText)
                    rowText = process_text(rowText, soup=False)
                    sent = []
                    words = rowText.split()
                    for word in words:
                        if word in w2v_model.vocab:
                            sent.append(w2v_model.vocab[word].index)
                    X.append(sent)
                    y.append(sentiment)
            except Exception:
                continue  # skip rows that fail to parse

In [32]:
X_kaz = []
y_kaz = []
# twitter_parse('data\\negative.csv',0,X,y)
# twitter_parse('data\\positive.csv',1,X,y)
parse_kaz('data\\kaz_news_comments.csv',X_kaz,y_kaz)
print len(X_kaz)
X_kaz, maxlen = pad_sentences(X_kaz)
y_kaz = np.array(y_kaz)
print X_kaz.shape
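# keep only 20% of the small Kazakh set for training; the remaining 80% is held out for evaluation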
X_kaz_train, X_kaz_test, y_kaz_train, y_kaz_test = valid.train_test_split(X_kaz, y_kaz, test_size=0.80)


554
(554L, 60L)

In [33]:
X_rus = []
y_rus = []
twitter_parse_rus('data\\rus_negative_twitter.csv',0,X_rus,y_rus)
twitter_parse_rus('data\\rus_positive_twitter.csv',1,X_rus,y_rus)
X_rus, maxlen = pad_sentences(X_rus, maxlen)
y_rus = np.array(y_rus)
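# draw a 20,000-tweet subsample (np.random.choice samples with replacement by default)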
ss = np.random.choice(X_rus.shape[0], 20000)
X_rus = X_rus[ss]
y_rus = y_rus[ss]
print X_rus.shape
X_rus_train, X_rus_test, y_rus_train, y_rus_test = valid.train_test_split(X_rus, y_rus, test_size=0.25)


(20000L, 60L)

In [34]:
X_train = np.vstack((X_kaz_train,X_rus_train))
y_train = np.hstack((y_kaz_train,y_rus_train))

In [35]:
# set parameters:
word_vocab_len = w2v_model.syn0.shape[0]
word_maxlen = maxlen
word_embedding_dims = w2v_model.syn0.shape[1]
hidden_dims = 250
nb_epoch = 2
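# note: hidden_dims and nb_epoch are defined here but not used below (model.fit is called with nb_epoch=1)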

In [36]:
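# GRU classifier on top of the pretrained bilingual embeddings, with a single sigmoid output for binary sentiment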
model = Sequential()
model.add(Embedding(input_dim=word_vocab_len,
                    output_dim=word_embedding_dims,
                    input_length=word_maxlen,
                    weights=[w2v_model.syn0],   # initialise the embedding with the pretrained bilingual vectors
                    W_constraint=unitnorm()))
model.add(Dropout(0.2))
# model.add(LSTM(64,return_sequences=True))
model.add(GRU(64, dropout_W=0.2, dropout_U=0.2))
# model.add(Dense(128,activation='relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
print('compile start')
rmsprop = RMSprop(lr=0.001)
model.compile(loss='binary_crossentropy', optimizer=rmsprop, metrics=["accuracy"])
print('model compiled')


compile start
model compiled

In [37]:
batch_size = 128
print('fit start')
# ss_kaz = np.random.choice(X.shape[0], 15000)
# ss_rus = np.random.choice(X_rus.shape[0], 500)
# X_train, X_test, y_train, y_test = valid.train_test_split(np.vstack((X[ss_kaz],X_rus[ss_rus])), np.hstack((y[ss_kaz],y_rus[ss_rus])), test_size=0.3)
print(X_train.shape)
# for i in xrange(N_epoch):
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=1, verbose=1)
print('fitting ended')


(15110L, 60L)
Epoch 1/1
15110/15110 [==============================] - 252s - loss: 0.6114 - acc: 0.6661   
fitting ended

In [38]:
y_predscore = model.predict_proba(X_kaz_test, batch_size=batch_size, verbose=1)
y_pred = np.round(y_predscore)
auc = metrics.roc_auc_score(y_kaz_test, y_predscore)
acc = metrics.accuracy_score(y_kaz_test,y_pred)
print 'validation on Kazakh dataset: accuracy = {:.5}, roc_auc = {:.5}'.format(acc, auc)


444/444 [==============================] - 0s     
validation on Kazakh dataset: accuracy = 0.55405, roc_auc = 0.63825

In [39]:
y_predscore = model.predict_proba(X_rus_test, batch_size=batch_size, verbose=1)
y_pred = np.round(y_predscore)
auc = metrics.roc_auc_score(y_rus_test, y_predscore)
acc = metrics.accuracy_score(y_rus_test, y_pred)
print 'validation on Russian dataset: accuracy = {:.5}, roc_auc = {:.5}'.format(acc, auc)


5000/5000 [==============================] - 3s     
validation on Russian dataset: accuracy = 0.724, roc_auc = 0.7998
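
As an additional check (not part of the original run), the raw confusion matrix for the Russian test split could be inspected with scikit-learn:

print metrics.confusion_matrix(y_rus_test, y_pred.ravel())  # rows = true class, columns = predicted class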

In [ ]: