In [1]:
import pandas as pd
import fastparquet
from gensim.models.wrappers import FastText
#import gensim
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download('punkt')
nltk.download("stopwords")
from nltk.corpus import stopwords
import re
import string
In [2]:
pd.options.mode.chained_assignment = None
In [3]:
# load data from precomputed dataframe
pfile = fastparquet.ParquetFile('5col_DFrame.parq')
df = pfile.to_pandas() # all columns
#df2 = pfile.to_pandas(columns=['floats', 'times']) # pick some columns
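For reference, fastparquet can also write such a precomputed dataframe back out, so the full round trip looks roughly like the sketch below (the output filename is hypothetical and this is not part of the notebook's original run):
# sketch: round-tripping a dataframe through fastparquet
#fastparquet.write('5col_DFrame_copy.parq', df)                              # write
#df_again = fastparquet.ParquetFile('5col_DFrame_copy.parq').to_pandas()     # read back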
In [4]:
df.tail()
Out[4]:
In [5]:
#create the FastText wrapper object; the pre-trained vectors are loaded into it below
fs = FastText()
## Create list of all words in our target dataset
words = []
for entry in df.wordlist:
    for word in entry:
        words.append(word)
        #word = word.lower()
        #i = i+1
        #if 1 <= i <= 10:
        #    if word in model:
        #        model2 = model[word]
#total number of word tokens in our dataset (not distinct words)
len(words)
Out[5]:
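The wordlist column is assumed to already hold tokenized text. A preprocessing step consistent with the NLTK imports in the first cell (tokenization, lowercasing, stopword and punctuation removal, optional stemming) might look like the following sketch; the raw-text column name 'text' is an assumption, and this code is not part of the original notebook:
# sketch: building a per-row token list from a hypothetical raw-text column
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def tokenize_row(text):
    tokens = word_tokenize(text.lower())
    # keep tokens that are neither punctuation nor stopwords
    tokens = [t for t in tokens if t not in string.punctuation and t not in stop_words]
    return [stemmer.stem(t) for t in tokens]

#df['wordlist'] = df['text'].apply(tokenize_row)   # 'text' column name is hypothetical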
In [6]:
##load the FastText pre-trained vectors from Facebook.
#this is actually not needed for our task - we do not need the high-dimensional semantic information contained within
#the model for the Kaggle challenge.
#loading the binary model through gensim caused lots of problems and takes a long time.
#using the "words" list to exclude unneeded words would be a useful extension
#model = fs.load_binary_data('wiki.en.bin')
#model = fs.load_fasttext_format('wiki.en')
model = fs.load_word2vec_format('wiki.en.vec')
#print(model.index2word)   # list of words in the model's vocabulary
#print(model['king'])      # get the vector of the word 'king'
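Assuming the vectors load as a word2vec-style model in this gensim version, a quick sanity check of lookups and nearest neighbours could look like this sketch (not part of the original run):
# sketch: sanity-check the loaded vectors
if 'king' in model:
    print(model['king'].shape)                    # vector for 'king' (300 dimensions for the wiki.en vectors)
    print(model.most_similar('king', topn=5))     # nearest neighbours by cosine similarity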
In [12]:
#stub for a function removing unneeded words from the FastText model - unexplored
# i = 0
# for word in words:
#     if word in model:
#         i = i + 1
#         if 1 <= i <= 10:
#             print(word in model)
#             print(word)
#model2 = {word: model[word] for word in words if word in model}
type(words)
Out[12]:
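The idea sketched in the commented stub above, keeping only the vectors for words that actually occur in the dataset, could be done with a plain dictionary once the word list is deduplicated; this is an untested sketch, not part of the original run:
# sketch: restrict the pre-trained vectors to the words seen in our dataset
vocab = set(words)                                    # deduplicate first
small_model = {w: model[w] for w in vocab if w in model}
len(small_model)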
In [23]:
#drop the spurious token "90" (note: remove() deletes only the first occurrence)
words.remove("90")
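If the goal is to discard every purely numeric token rather than a single occurrence, filtering the whole list is safer; a sketch, not part of the original notebook:
# sketch: drop every purely numeric token instead of a single occurrence
words = [w for w in words if not w.isdigit()]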