In [1]:
import pandas as pd
import fastparquet
from gensim.models.wrappers import FastText
#import gensim

import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download('punkt')
nltk.download("stopwords") 
from nltk.corpus import stopwords

import re
import string


C:\Users\goldaw\AppData\Local\Continuum\Anaconda3\lib\site-packages\gensim\utils.py:855: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\goldaw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\goldaw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

In [2]:
# silence pandas' SettingWithCopyWarning for chained assignments on DataFrame slices
pd.options.mode.chained_assignment = None

In [3]:
# load data from precomputed dataframe
pfile = fastparquet.ParquetFile('5col_DFrame.parq') 
df = pfile.to_pandas() # all columns 
#df2 = pfile.to_pandas(columns=['floats', 'times']) # pick some columns

In [4]:
df.tail()


Out[4]:
sentiment text text_clean1 hashtags wordlist no_names_hashtags
10494 Negative The question about God and the Veterans. What ... question god veterans softball gopdebates [gopdebates] [question, god, veterans, softball] question god veterans softball
10495 Negative I thought #LastComicStanding airs on Wednesday... thought lastcomicstanding airs wednesday night... [lastcomicstanding, gopdebates] [thought, airs, wednesday, nights] thought airs wednesday nights
10496 Negative Bingo! Put that in your article!!! #GOPDebates... bingo put article gopdebates httpstcoxaaqwagf [gopdebates] [bingo, put, article] bingo put article
10497 Negative RT @RWSurferGirl: Fox is cherry picking the ca... fox cherry picking candidates jeb softball que... [gopdebates, gopdebates] [fox, cherry, picking, candidates, jeb, softba... fox cherry picking candidates jeb softball que...
10498 Neutral Waiting on Trumps answer about God #GOPDebates... waiting trumps answer god gopdebates aintenoug... [gopdebates, aintenoughpopcornforthis] [waiting, trumps, answer, god] waiting trumps answer god

In [5]:
# create a FastText wrapper object
fs = FastText()

## Build a flat list of every word token in our target dataset
words = []
for entry in df.wordlist:
    for word in entry:
        words.append(word)

# total number of word tokens in the dataset (duplicates included)
len(words)


Out[5]:
61205
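
Later cells test membership with "word in words", which is a linear scan over roughly 61k tokens on every lookup; a set copy makes each check effectively constant-time. A minimal sketch (the word_set name is ours, not part of the original notebook):

# set copy of the token list for fast membership checks
word_set = set(words)
len(word_set)   # number of distinct tokens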

In [6]:
## Load the pre-trained FastText vectors published by Facebook.

# This is not strictly needed for our task - the high-dimensional semantic information
# in the model is not required for the Kaggle challenge.

# Loading the full model through gensim is problematic and slow.
# Using the "words" list to exclude unneeded vectors would be a useful extension
# (see the sketch after this cell).

#model = fs.load_binary_data('wiki.en.bin')
#model = fs.load_fasttext_format('wiki.en')
model = fs.load_word2vec_format('wiki.en.vec')
#print(model.words)    # list of words in the dictionary
#print(model['king'])  # get the vector for the word 'king'
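
One way to pursue that extension is to pre-filter the plain-text .vec file (a header line with the vocabulary size and dimensionality, followed by one "word v1 ... vd" line per entry) so that only words occurring in our tweets are kept before gensim loads it. A sketch under that assumption; the output filename wiki.en.small.vec is invented here:

# hedged sketch: write a reduced .vec file containing only words from our dataset
wanted = set(words)
with open('wiki.en.vec', encoding='utf-8') as src:
    dim = src.readline().split()[1]                       # header: "<vocab_size> <dim>"
    kept = [line for line in src if line.split(' ', 1)[0] in wanted]
with open('wiki.en.small.vec', 'w', encoding='utf-8') as dst:
    dst.write('{} {}\n'.format(len(kept), dim))
    dst.writelines(kept)
#model = fs.load_word2vec_format('wiki.en.small.vec')     # then load the much smaller file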

In [9]:
# stub for a function removing unneeded words from the FastText model - unexplored
# i = 0
# for word in words:
#     if (word in model):
#         i = i + 1
#         if 1 <= i <= 10:
#             print(word in model)
#             print(word)

model2 = [model[word] for word in model if word in words]   # raises the TypeError shown below


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-9-8592ed92ba49> in <module>()
      8 #             print(word)
      9 
---> 10 model2 = [model[word] for word in model if word in words]

<ipython-input-9-8592ed92ba49> in <listcomp>(.0)
      8 #             print(word)
      9 
---> 10 model2 = [model[word] for word in model if word in words]

C:\Users\goldaw\AppData\Local\Continuum\Anaconda3\lib\site-packages\gensim\models\keyedvectors.py in __getitem__(self, words)
    574             return self.word_vec(words)
    575 
--> 576         return vstack([self.word_vec(word) for word in words])
    577 
    578     def __contains__(self, word):

TypeError: 'int' object is not iterable
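
The TypeError comes from "for word in model": KeyedVectors defines no __iter__, so Python falls back to integer indexing, and model[0] then tries to iterate over the int inside __getitem__. Iterating the model's vocabulary instead avoids this; a sketch assuming this gensim version exposes model.vocab as a dict keyed by word, and reusing the word_set idea from above for fast lookups:

# iterate the vocabulary rather than the KeyedVectors object itself
word_set = set(words)
model2 = {word: model[word] for word in model.vocab if word in word_set}
len(model2)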

In [23]:
# drop the first occurrence of the token "90" from the word list
words.remove("90")
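
Note that list.remove only drops the first occurrence of "90". If the intent is to strip every purely numeric token from the word list, a single filter pass does it (a sketch, not the notebook's original step):

# keep only tokens that are not pure digit strings
words = [w for w in words if not w.isdigit()]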