In [ ]:
import sys
HOME_DIR=''
FNC_PATH='{}/fnc-1-baseline'.format(HOME_DIR)

# Add the local fnc-1-baseline utils directory to sys.path so we can import and reuse its helpers
sys.path.append(FNC_PATH + '/utils/')

In [ ]:
import gensim

# Google News vectors: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/view
# We could swap in fastText vectors instead, but that download is ~6 GB and very slow.

W2V_MODEL='{}/model/GoogleNews-vectors-negative300.bin'.format(HOME_DIR)
# Load Google's pre-trained Word2Vec model.
w2vmodel = gensim.models.KeyedVectors.load_word2vec_format(W2V_MODEL, binary=True)
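
In [ ]:
# Quick sanity check on the loaded embeddings (a minimal sketch; 'news' is just an
# example query and is assumed to be in the GoogleNews vocabulary): vectors are
# 300-dimensional, and nearest neighbours should look semantically related.
print(w2vmodel['news'].shape)              # expected: (300,)
print(w2vmodel.most_similar('news', topn=3))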

In [ ]:
import pandas as pd

def read_data(path=FNC_PATH + '/fnc-1'):
    stances = pd.read_csv(path + '/train_stances.csv')
    stances.set_index('Body ID', inplace=True)
    
    bodies = pd.read_csv(path + '/train_bodies.csv')
    bodies.set_index('Body ID', inplace=True)
    
    ds = pd.merge(bodies, stances, how='inner', right_index=True, left_index=True)
    
    return ds
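
In [ ]:
# Minimal sketch of what read_data() returns (assumes the FNC-1 CSVs are under
# FNC_PATH/fnc-1): both frames are indexed by Body ID, so the inner merge pairs
# every (Headline, Stance) row with the articleBody it refers to.
preview = read_data()
print(preview.columns.tolist())            # expected: ['articleBody', 'Headline', 'Stance']
print(len(preview))
del preview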

In [ ]:
from sklearn.model_selection import train_test_split

def get_data_split(ds, test_size = 0.2):
    train, validation = train_test_split(ds, test_size = test_size)
    return train, validation
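
In [ ]:
# Optional alternative (a sketch, not used below): the stance distribution is heavily
# skewed towards 'unrelated', so stratifying on the Stance column keeps the class
# ratios identical in both partitions; random_state makes the split reproducible.
def get_stratified_split(ds, test_size=0.2, seed=42):
    return train_test_split(ds, test_size=test_size, stratify=ds['Stance'], random_state=seed)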

In [ ]:
ds = read_data()

train, validation = get_data_split(ds)
print("Train examples: %d" % len(train))
print("Validation examples: %d" % len(validation))

print("")
print(train['Stance'].value_counts())

In [ ]:
import numpy as np
import nltk
import re

from sklearn import feature_extraction
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

def dense_to_one_hot(labels_dense, num_classes):
    """Convert class labels from scalars to one-hot vectors."""
    num_labels = labels_dense.shape[0]
    index_offset = np.arange(num_labels) * num_classes
    labels_one_hot = np.zeros((num_labels, num_classes))
    labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
    return labels_one_hot
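
# Example: dense_to_one_hot(np.array([0, 2, 1]), 4) yields
# [[1, 0, 0, 0],
#  [0, 0, 1, 0],
#  [0, 1, 0, 0]]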

def normalize_word(w):
    return wnl.lemmatize(w.lower()).lower()

def tokenize_sentences(sentences):
    # Decode only if we got raw bytes (Python 2 csv); unicode strings pass through unchanged.
    return sentences.apply(lambda s: nltk.word_tokenize(s.decode('utf-8') if isinstance(s, bytes) else s))

def lemmatize_tokens(series):
    return series.apply(lambda tokens: [normalize_word(t) for t in tokens])

def remove_stopwords(words):
    # Removes stopwords from a list of tokens
    return words.apply(lambda l: [w for w in l if w not in feature_extraction.text.ENGLISH_STOP_WORDS])

def trainTFIDF(corpus, max_ngram):
    # Integer min_df/max_df are absolute document counts: keep only 1..max_ngram grams
    # that occur in at least 10 and at most 100 of the training headlines.
    vectorizer = TfidfVectorizer(ngram_range=(1, max_ngram), lowercase=True, stop_words="english", min_df=10, max_df=100)
    vectorizer.fit(corpus)
    
    return vectorizer

def encode_pos(tokens):
    # POS-tag the token list and join each (token, tag) pair as "token_TAG"
    return ['_'.join(x) for x in nltk.pos_tag(tokens)]

def doc2vec(terms):
    # Average the word2vec vectors of the terms; OOV terms (and empty documents) fall back to zeros.
    vecs = [w2vmodel[w] if w in w2vmodel else np.zeros(300) for w in terms]
    return np.mean(vecs, axis=0).tolist() if vecs else np.zeros(300).tolist()
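
# Illustration (tokens chosen arbitrarily; uses the w2vmodel loaded above): a document
# vector is the mean of its word vectors, so it is always 300-dimensional.
print(len(doc2vec(['stock', 'market'])))   # expected: 300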

def prepare_features(dataset):
    from scipy.sparse import hstack
    
    # Useful link: https://www.dataquest.io/blog/natural-language-processing-with-python/
    tokens = tokenize_sentences(dataset['Headline'])
    lemmas = lemmatize_tokens(tokens)
    no_stop_words = remove_stopwords(lemmas)
    # POS tags are computed here but not yet included in the returned feature matrix.
    pos_tags = no_stop_words.apply(encode_pos)
    tf_idf = vectorizer.transform(dataset['Headline'])
    embeddings = np.asmatrix(no_stop_words.apply(doc2vec).tolist())
    
    return hstack((tf_idf, embeddings))

le = preprocessing.LabelEncoder()
wnl = nltk.WordNetLemmatizer()
vectorizer = trainTFIDF(train['Headline'], 2)
matrix = prepare_features(train)
train_labels = dense_to_one_hot(le.fit_transform(train['Stance']), 4)

print(matrix.shape)
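
In [ ]:
# Quick inspection of the assembled features (assumes the cell above has run): the
# matrix horizontally stacks the sparse TF-IDF block with the dense 300-dimensional
# averaged-word2vec block, so its width is len(vectorizer.vocabulary_) + 300, and
# le.classes_ shows which one-hot column corresponds to which stance.
print(len(vectorizer.vocabulary_))
print(matrix.shape[1] - len(vectorizer.vocabulary_))   # expected: 300
print(le.classes_)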

In [ ]:
import gc

# Drop references to the large helper objects so gc can reclaim memory before training.
w2vmodel = None
vectorizer = None
le = None
wnl = None

gc.collect()

In [ ]:
def train_model(x_train, y_train):
    from keras.models import Sequential
    from keras.layers import Dense, Dropout
    from keras.optimizers import SGD

    # Keras needs a dense array; read the input width off the feature matrix.
    x_train = x_train.toarray()
    input_size = x_train.shape[1]
    output_size = 4

    model = Sequential()
    # Dense(1024) is a fully-connected layer with 1024 hidden units.
    # In the first layer we must specify the expected input dimensionality:
    # here, the width of the combined TF-IDF + embedding feature matrix.
    model.add(Dense(1024, activation='relu', input_dim=input_size))
    model.add(Dropout(0.5))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(output_size, activation='softmax'))

    #sgd = SGD(lr=1e-04, decay=2, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy',
                  optimizer="rmsprop",
                  metrics=['accuracy'])

    model.fit(x_train, y_train,
              epochs=15,
              batch_size=128)
    #score = model.evaluate(x_test, y_test, batch_size=16)

    return model

model = train_model(matrix, train_labels)
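
In [ ]:
# Hedged sketch of a validation pass: prepare_features and the label encoder rely on
# w2vmodel / vectorizer / le / wnl, which the memory-freeing cell above sets to None,
# so either skip that cell or reload those objects before running this.
def evaluate_on(model, dataset):
    # Reuses the same feature pipeline and label encoding as training.
    x = prepare_features(dataset).toarray()
    y = dense_to_one_hot(le.transform(dataset['Stance']), 4)
    loss, acc = model.evaluate(x, y, batch_size=128)
    return acc

# e.g. print("Validation accuracy: %.3f" % evaluate_on(model, validation))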

In [ ]: