In [1]:
import re
import string

import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')
In [2]:
# silence pandas' SettingWithCopyWarning for the column assignment below
pd.options.mode.chained_assignment = None
In [3]:
URL = 'train_data.csv'

def load_data(url=URL):
    return pd.read_csv(url)
In [4]:
df = load_data()
In [5]:
df.tail()
Out[5]:
(last five rows of the raw tweet DataFrame; table output not preserved)
In [6]:
def cleantweet(s):
    '''
    :param s: string; a tweet
    :return: list of stemmed, lower-cased tokens with URLs, @mentions,
             and punctuation removed
    '''
    # strip @mentions and URLs
    s = re.sub(cleantweet.pattern, '', s)
    # build a translation table mapping every punctuation character to None
    remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
    # actually remove the punctuation
    s = s.translate(remove_punctuation_map)
    # tokenize the tweet into sentences
    sents = sent_tokenize(s)
    # tokenize each sentence into a list of words
    words = [word_tokenize(sent) for sent in sents]
    # flatten the list of lists into a single list of tokens
    words = [e for sent in words for e in sent]
    return [cleantweet.stemmer.stem(e.lower()) for e in words]
In [7]:
# PorterStemmer strips morphological and inflectional endings from English words.
cleantweet.stemmer = PorterStemmer()
# matches @mentions and URLs so cleantweet can strip them, as the docstring promises
cleantweet.pattern = re.compile(r'(?:@\w+)|(?:https?://\S+)')
df.loc[:,'text'] = df.loc[:,'text'].map(cleantweet)
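As a quick sanity check, cleantweet can be exercised on a made-up tweet. The input string and the expected output below are illustrative only, assuming the pattern and stemmer attached above:
In [ ]:
# hypothetical tweet, not taken from the dataset
cleantweet('@alice Loving the new libraries! https://example.com')
# expected: ['love', 'the', 'new', 'librari']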
In [8]:
df.head()
Out[8]:
(first five rows after cleaning; the text column now holds stemmed token lists — table output not preserved)
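The stopwords corpus is downloaded in the first cell but never used. A minimal sketch of how it could be applied to the cleaned token lists; note that stemming has already run, so stemmed tokens may no longer match the unstemmed stopword list exactly, and filtering before stemming would be more robust:
In [ ]:
# minimal sketch: drop English stopwords from the cleaned token lists
stop = set(stopwords.words('english'))
df.loc[:, 'text'] = df.loc[:, 'text'].map(lambda toks: [t for t in toks if t not in stop])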
In [9]:
# gensim's fastText wrapper (gensim < 4.0; the wrappers module was removed in later releases)
from gensim.models.wrappers import FastText
In [13]:
# load_word2vec_format is a classmethod, so no throwaway FastText() instance is needed
#model = FastText.load_binary_data('wiki.en.bin')
#model = FastText.load_fasttext_format('wiki.en')
model = FastText.load_word2vec_format('wiki.en.vec')
In [15]:
#print(model.index2word)  # list of words in the vocabulary
print(model['king'])  # get the vector for the word 'king'
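With the vectors loaded, the standard gensim similarity queries are available; a brief illustration, assuming the load above succeeded:
In [ ]:
# nearest neighbours of 'king' by cosine similarity
print(model.most_similar('king', topn=5))
# cosine similarity between two word vectors
print(model.similarity('king', 'queen'))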
In [ ]: