http://www.fakenewschallenge.org/
https://github.com/FakeNewsChallenge/fnc-1-baseline
http://textminingonline.com/dive-into-nltk-part-iv-stemming-and-lemmatization
https://www.dataquest.io/blog/natural-language-processing-with-python/
http://www.nltk.org/book/ch03.html
https://www.tensorflow.org/tutorials/recurrent
https://www.tensorflow.org/programmers_guide/reading_data
http://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing
https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet
https://keras.io/getting-started/sequential-model-guide/
In [ ]:
import sys
HOME_DIR=''
FNC_PATH='{}/fnc-1-baseline'.format(HOME_DIR)
# Add the local path to the FNC utils so we can import and reuse the challenge's helper code
sys.path.append(FNC_PATH + '/utils/')
In [ ]:
import gensim
# Google News vectors: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/view
# We could switch to fastText vectors instead, but that file is about 6 GB and very slow to download
W2V_MODEL='{}/model/GoogleNews-vectors-negative300.bin'.format(HOME_DIR)
# Load Google's pre-trained Word2Vec model.
w2vmodel = gensim.models.KeyedVectors.load_word2vec_format(W2V_MODEL, binary=True)
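A quick sanity check that the vectors loaded correctly (an optional sketch; the probe word 'news' is an arbitrary choice):
In [ ]:
# Optional sanity check on the loaded embeddings (sketch; the probe word is arbitrary)
print w2vmodel['news'].shape          # expected: (300,)
print w2vmodel.most_similar('news', topn=3)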
In [ ]:
import pandas as pd
def read_data(path=FNC_PATH + '/fnc-1'):
    # Join every stance (headline + label) with its article body on 'Body ID'
    stances = pd.read_csv(path + '/train_stances.csv')
    stances.set_index('Body ID', inplace=True)
    bodies = pd.read_csv(path + '/train_bodies.csv')
    bodies.set_index('Body ID', inplace=True)
    ds = pd.merge(bodies, stances, how='inner', right_index=True, left_index=True)
    return ds
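A quick peek at what read_data returns (optional sketch; the column names 'articleBody', 'Headline' and 'Stance' come from the FNC-1 CSV files):
In [ ]:
# Optional peek at the merged dataset (sketch): one row per (headline, body) pair,
# indexed by 'Body ID'.
preview = read_data()
print preview.shape
print preview.columns.tolist()
print preview[['Headline', 'Stance']].head()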
In [ ]:
from sklearn.model_selection import train_test_split
def get_data_split(ds, test_size=0.2):
    # Randomly hold out test_size of the rows for validation
    train, validation = train_test_split(ds, test_size=test_size)
    return train, validation
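One caveat with a purely random row split is that the same article body can appear in both sets. Below is a sketch of an alternative split that holds out whole bodies, closer in spirit to how the official FNC-1 baseline builds its hold-out folds (the function name and seed are our own):
In [ ]:
# Alternative split sketch (our own addition, not part of the original pipeline):
# hold out entire Body IDs so no article body is shared between train and validation.
import numpy as np
def get_data_split_by_body(ds, test_size=0.2, seed=42):
    rng = np.random.RandomState(seed)
    body_ids = np.array(ds.index.unique())
    rng.shuffle(body_ids)
    held_out = set(body_ids[:int(len(body_ids) * test_size)])
    mask = ds.index.isin(held_out)
    return ds[~mask], ds[mask]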
In [ ]:
ds = read_data()
train, validation = get_data_split(ds)
print "Train examples: %d"%len(train)
print "Test examples: %d"%len(validation)
print
print train['Stance'].value_counts()
In [ ]:
import numpy as np
import nltk
import re
from sklearn import feature_extraction
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
def dense_to_one_hot(labels_dense, num_classes):
    """Convert class labels from scalars to one-hot vectors."""
    num_labels = labels_dense.shape[0]
    index_offset = np.arange(num_labels) * num_classes
    labels_one_hot = np.zeros((num_labels, num_classes))
    labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
    return labels_one_hot
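# Worked example (illustrative): dense_to_one_hot(np.array([0, 2, 1]), 4) returns
# [[1, 0, 0, 0],
#  [0, 0, 1, 0],
#  [0, 1, 0, 0]]
# i.e. each row holds a single 1 at the index given by its class label.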
def normalize_word(w):
    # Lowercase and lemmatize a single token
    return wnl.lemmatize(w.lower()).lower()
def tokenize_sentences(sentences):
    # Split each headline into word tokens (decode because this notebook runs on Python 2 str)
    return sentences.apply(lambda s: nltk.word_tokenize(s.decode('utf-8')))
def lemmatize_tokens(series):
    return series.apply(lambda tokens: [normalize_word(t) for t in tokens])
def remove_stopwords(words):
    # Remove English stopwords from each list of tokens
    return words.apply(lambda l: [w for w in l if w not in feature_extraction.text.ENGLISH_STOP_WORDS])
def trainTFIDF(corpus, max_ngram):
    # Fit a TF-IDF vectorizer on the corpus; min_df/max_df are absolute document counts,
    # so a term must appear in at least 10 and at most 100 documents to be kept
    vectorizer = TfidfVectorizer(ngram_range=(1, max_ngram), lowercase=True, stop_words="english", min_df=10, max_df=100)
    vectorizer.fit(corpus)
    return vectorizer
def encode_pos(tokens):
    # Tag a token list with parts of speech and join each (word, tag) pair as 'word_TAG'
    return ['_'.join(x) for x in nltk.pos_tag(tokens)]
def doc2vec(terms):
    # Average the 300-d word2vec vectors of the terms; out-of-vocabulary words contribute zeros
    if len(terms) == 0:
        return [0.0] * 300
    return np.mean([w2vmodel[w] if w in w2vmodel.vocab else np.zeros(300) for w in terms], axis=0).tolist()
def prepare_features(dataset):
    from scipy.sparse import hstack
    # Useful link: https://www.dataquest.io/blog/natural-language-processing-with-python/
    tokens = tokenize_sentences(dataset['Headline'])
    lemmas = lemmatize_tokens(tokens)
    no_stop_words = remove_stopwords(lemmas)
    # POS tags are computed but not yet used as a feature
    pos_tags = no_stop_words.apply(encode_pos)
    # TF-IDF n-gram features from the raw headlines
    tf_idf = vectorizer.transform(dataset['Headline'])
    # Averaged word2vec embedding per headline, stacked next to the sparse TF-IDF matrix
    embeddings = np.asmatrix(no_stop_words.apply(doc2vec).tolist())
    return hstack((tf_idf, embeddings))
le = preprocessing.LabelEncoder()
wnl = nltk.WordNetLemmatizer()
vectorizer = trainTFIDF(train['Headline'], 2)
matrix = prepare_features(train)
train_labels = dense_to_one_hot(le.fit_transform(train['Stance']), 4)
print matrix.shape
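Before the cleanup cell below frees the heavy objects, it is worth building the validation features too, so the trained model can be evaluated later (a sketch; val_matrix and val_labels are names introduced here):
In [ ]:
# Sketch (our own addition): prepare validation features now, while vectorizer, wnl and
# w2vmodel are still in memory; le.transform (not fit_transform) reuses the label mapping
# learned from the training stances.
val_matrix = prepare_features(validation)
val_labels = dense_to_one_hot(le.transform(validation['Stance']), 4)
print val_matrix.shape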
In [ ]:
import gc
# Free the large objects (the word2vec model alone holds several gigabytes in RAM) before training
w2vmodel = None
vectorizer = None
le = None
wnl = None
gc.collect()
In [ ]:
def train_model(x_train, y_train):
    from keras.models import Sequential
    from keras.layers import Dense, Dropout, Activation
    from keras.optimizers import SGD
    # hstack returns a sparse matrix, but Keras Dense layers expect a dense array
    x_train = x_train.toarray()
    input_size = x_train.shape[1]
    output_size = 4
    model = Sequential()
    # Dense(1024) is a fully-connected layer with 1024 hidden units.
    # In the first layer we must specify the expected input dimensionality:
    # here, the number of TF-IDF + embedding features.
    model.add(Dense(1024, activation='relu', input_dim=input_size))
    model.add(Dropout(0.5))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(output_size, activation='softmax'))
    #sgd = SGD(lr=1e-04, decay=2, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    model.fit(x_train, y_train,
              epochs=15,
              batch_size=128)
    return model
model = train_model(matrix, train_labels)
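With the model returned by train_model and the validation features prepared earlier (val_matrix and val_labels from the sketch above), an evaluation sketch:
In [ ]:
# Evaluation sketch (our own addition): uses val_matrix / val_labels from the earlier
# sketch and the fitted model returned by train_model.
loss, acc = model.evaluate(val_matrix.toarray(), val_labels, batch_size=128)
print "Validation loss: %.4f, accuracy: %.4f" % (loss, acc)
# Note: this is plain 4-class accuracy, not the official FNC-1 weighted score.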
In [ ]: