In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression
np.random.seed(241) # for reproducibility
from gensim.models import Word2Vec
import sklearn.cross_validation as valid
import sklearn.metrics as metrics
from keras.constraints import unitnorm
from keras.layers.core import Dense,Dropout,Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.models import Sequential
import re, string
import io, csv
from bs4 import BeautifulSoup  # used by process_text when soup=True
from sklearn.cross_validation import KFold
from keras.optimizers import RMSprop
In [2]:
np.random.seed(241)
punc = r'[\!#$%&()*+,-./:;<=>?\[\]^_`{|}~\"]'
punc_regex = re.compile(punc)
space_regex = re.compile(r'\s+')
article_regex = re.compile(r'^.*\nTEXT\:', re.M | re.DOTALL)
digits_regex = re.compile(r'\d+')
someOneRegex = re.compile(r"@\S+\s")
urlfinder = re.compile(r"https?://\S+")
In [3]:
def process_text(text, soup=True):
    # Normalise raw text: optionally strip HTML, lowercase, collapse digit runs to '0',
    # replace URLs with a placeholder token, and drop punctuation and extra whitespace.
    if soup:
        soup = BeautifulSoup(text, 'html.parser')
        text = soup.get_text()
    text = digits_regex.sub(u'0', text.lower())
    text = urlfinder.sub(u' ReplacedUrl ', text)
    text = punc_regex.sub(u' ', text)
    text = space_regex.sub(u' ', text)
    return text
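A quick sanity check of the normalisation; the sample string below is made up, and the call assumes the cell above has been run:

In [ ]:
# Illustrative only: digits collapse to 0, URLs become ' ReplacedUrl ', punctuation and
# repeated whitespace are stripped, everything is lowercased.
print process_text(u'Great game!!! 25 points, see https://example.com/stats', soup=False)
# roughly: great game 0 points see ReplacedUrl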
In [4]:
def pad_sentences(data, maxlen=None, value=0.):
    # Left-pad (and left-truncate) every index sequence to a common length,
    # filling with `value` so all samples share the shape (nb_samples, maxlen).
    lengths = [len(s) for s in data]
    nb_samples = len(data)
    if maxlen is None:
        maxlen = np.max(lengths)
    x = (np.ones((nb_samples, maxlen)) * value).astype(np.int)
    for idx, s in enumerate(data):
        if len(s) == 0:
            continue  # empty sequence, leave the row as pure padding
        trunc = s[-maxlen:]
        x[idx, -len(trunc):] = trunc
    return x, maxlen
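A toy illustration of the padding behaviour (the index lists are made up): shorter sequences are left-padded with the fill value, so all rows end up with the same length.

In [ ]:
padded, L = pad_sentences([[5, 7], [1, 2, 3, 4]])
print padded   # [[0 0 5 7]
               #  [1 2 3 4]]
print L        # 4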
In [5]:
w2v_model = Word2Vec.load_word2vec_format('data\\bilingual.bin', binary=True)
# w2v_model = Word2Vec.load('data\\bilingual_mixed')
In [12]:
def parse_kaz(file_name, X, y):
    # Kazakh news comments: tab-separated, label in column 0 (-1 is mapped to 0), text in column 1.
    with open(file_name, 'r') as csv_file:
        reader = csv.reader(csv_file, delimiter='\t')
        for row in reader:
            try:
                rowText = row[1].decode('utf-8')
                sentiment = int(row[0])
                if sentiment == -1:
                    sentiment = 0
                if len(rowText) > 10:
                    rowText = someOneRegex.sub('@someone', rowText)
                    rowText = process_text(rowText, soup=False)
                    sent = []
                    words = rowText.split()
                    for word in words:
                        if word in w2v_model.vocab:
                            sent.append(w2v_model.vocab[word].index)
                    X.append(sent)
                    y.append(sentiment)
            except (IndexError, ValueError):
                continue  # skip malformed rows
def twitter_parse_rus(file_name, sentiment, X, y):
    # Russian tweets: semicolon-separated, one file per class, text in column 3;
    # the sentiment label for the whole file is passed in as an argument.
    with open(file_name, 'r') as csv_file:
        reader = csv.reader(csv_file, delimiter=';')
        for row in reader:
            rowText = row[3].decode('utf-8')
            if len(rowText) > 10:
                rowText = someOneRegex.sub('@someone', rowText)
                rowText = process_text(rowText, soup=False)
                sent = []
                words = rowText.split()
                for word in words:
                    if word in w2v_model.vocab:
                        sent.append(w2v_model.vocab[word].index)
                X.append(sent)
                y.append(sentiment)
def twitter_parse_en(file_name, X, y):
    # English tweets: comma-separated, label in column 0 (0 = negative, 2 = neutral, 4 = positive),
    # text in column 5; neutral tweets are skipped and label 4 is mapped to 1.
    with open(file_name, 'r') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        for row in reader:
            try:
                rowText = row[5].decode('utf-8')
                sentiment = int(row[0])
                if sentiment == 2:
                    continue
                elif sentiment == 4:
                    sentiment = 1
                if len(rowText) > 10:
                    rowText = someOneRegex.sub('@someone', rowText)
                    rowText = process_text(rowText, soup=False)
                    sent = []
                    words = rowText.split()
                    for word in words:
                        if word in w2v_model.vocab:
                            sent.append(w2v_model.vocab[word].index)
                    X.append(sent)
                    y.append(sentiment)
            except (IndexError, ValueError):
                continue  # skip malformed rows
In [32]:
X_kaz = []
y_kaz = []
# twitter_parse('data\\negative.csv',0,X,y)
# twitter_parse('data\\positive.csv',1,X,y)
parse_kaz('data\\kaz_news_comments.csv',X_kaz,y_kaz)
print len(X_kaz)
X_kaz, maxlen = pad_sentences(X_kaz)
y_kaz = np.array(y_kaz)
print X_kaz.shape
X_kaz_train, X_kaz_test, y_kaz_train, y_kaz_test = valid.train_test_split(X_kaz, y_kaz, test_size=0.80)
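Note that test_size=0.80 keeps only 20% of the Kazakh comments for training; the large remainder is held out as the Kazakh test set.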
In [33]:
X_rus = []
y_rus = []
twitter_parse_rus('data\\rus_negative_twitter.csv',0,X_rus,y_rus)
twitter_parse_rus('data\\rus_positive_twitter.csv',1,X_rus,y_rus)
X_rus, maxlen = pad_sentences(X_rus, maxlen)
y_rus = np.array(y_rus)
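# Take a random 20 000-tweet subsample of the Russian corpus; np.random.choice samples
# with replacement by default, so duplicates are possible (pass replace=False to avoid that).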
ss = np.random.choice(X_rus.shape[0], 20000)
X_rus = X_rus[ss]
y_rus = y_rus[ss]
print X_rus.shape
X_rus_train, X_rus_test, y_rus_train, y_rus_test = valid.train_test_split(X_rus, y_rus, test_size=0.25)
In [34]:
X_train = np.vstack((X_kaz_train,X_rus_train))
y_train = np.hstack((y_kaz_train,y_rus_train))
In [35]:
# set parameters:
word_vocab_len = w2v_model.syn0.shape[0]       # vocabulary size of the pre-trained word2vec model
word_maxlen = maxlen                           # sequence length after padding
word_embedding_dims = w2v_model.syn0.shape[1]  # embedding dimensionality
hidden_dims = 250
nb_epoch = 2
In [36]:
model = Sequential()
# The embedding layer is initialised with the pre-trained bilingual word2vec matrix
# and constrained to unit-norm vectors.
model.add(Embedding(input_dim=word_vocab_len,
                    output_dim=word_embedding_dims,
                    input_length=word_maxlen,
                    weights=[w2v_model.syn0],
                    W_constraint=unitnorm()))
model.add(Dropout(0.2))
# model.add(LSTM(64, return_sequences=True))
model.add(GRU(64, dropout_W=0.2, dropout_U=0.2))
# model.add(Dense(128, activation='relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
print('compile start')
rmsprop = RMSprop(lr=0.001)
model.compile(loss='binary_crossentropy', optimizer=rmsprop, metrics=["accuracy"])
print('model compiled')
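Optionally, the layer shapes and parameter counts can be inspected before fitting:

In [ ]:
model.summary()  # Embedding -> Dropout -> GRU(64) -> Dense(1) + sigmoid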
In [37]:
batch_size = 128
print('fit start')
# ss_kaz = np.random.choice(X.shape[0], 15000)
# ss_rus = np.random.choice(X_rus.shape[0], 500)
# X_train, X_test, y_train, y_test = valid.train_test_split(np.vstack((X[ss_kaz],X_rus[ss_rus])), np.hstack((y[ss_kaz],y_rus[ss_rus])), test_size=0.3)
print(X_train.shape)
# for i in xrange(N_epoch):
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=1, verbose=1)
print('fitting ended')
In [38]:
y_predscore = model.predict_proba(X_kaz_test, batch_size=batch_size, verbose=1)
y_pred = np.round(y_predscore)
auc = metrics.roc_auc_score(y_kaz_test, y_predscore)
acc = metrics.accuracy_score(y_kaz_test,y_pred)
print 'validation on kazakh dataset: accuracy = {:.5}, roc_auc ={:.5}'.format( acc, auc)
In [39]:
y_predscore = model.predict_proba(X_rus_test, batch_size=batch_size, verbose=1)
y_pred = np.round(y_predscore)
auc = metrics.roc_auc_score(y_rus_test, y_predscore)
acc = metrics.accuracy_score(y_rus_test, y_pred)
print 'validation on russian dataset: accuracy = {:.5}, roc_auc ={:.5}'.format(acc, auc)
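twitter_parse_en is defined above but never called in this run; if an English test file were available (the path below is hypothetical, in the label/column layout that parser expects), the same evaluation could be repeated on it:

In [ ]:
X_en, y_en = [], []
twitter_parse_en('data\\en_twitter_test.csv', X_en, y_en)  # hypothetical file
X_en, _ = pad_sentences(X_en, maxlen)
y_en = np.array(y_en)
y_predscore = model.predict_proba(X_en, batch_size=batch_size, verbose=1)
y_pred = np.round(y_predscore)
print 'validation on english dataset: accuracy = {:.5}, roc_auc ={:.5}'.format(
    metrics.accuracy_score(y_en, y_pred), metrics.roc_auc_score(y_en, y_predscore))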
In [ ]: