In [1]:
import pandas as pd

import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download('punkt')
nltk.download("stopwords") 
from nltk.corpus import stopwords

import re
import string


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\goldaw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\goldaw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.

In [2]:
# Turn off pandas' chained-assignment check (no warning, no exception).
pd.set_option('mode.chained_assignment', None)

In [3]:
# Default location of the training data (a CSV path handed to pandas).
URL = 'train_data.csv'


def load_data(url=URL):
	'''
	Read a CSV file into a DataFrame.

	:url : path or file-like object passed to pandas.read_csv; defaults to URL
	:return : pandas.DataFrame holding the parsed file
	'''
	frame = pd.read_csv(url)
	return frame

In [4]:
# Load the training CSV from the default path (URL) into a DataFrame.
df = load_data()

In [5]:
# Show the last rows of the loaded frame as a quick sanity check.
df.tail()


Out[5]:
sentiment text
10494 Negative The question about God and the Veterans. What ...
10495 Negative I thought #LastComicStanding airs on Wednesday...
10496 Negative Bingo! Put that in your article!!! #GOPDebates...
10497 Negative RT @RWSurferGirl: Fox is cherry picking the ca...
10498 Neutral Waiting on Trumps answer about God #GOPDebates...

In [6]:
def cleantweet(s):
	'''
	Normalise a tweet into a flat list of lower-cased, stemmed tokens.

	Steps: strip URLs and every @mention, delete ASCII punctuation, split the
	text into sentences and then words, lower-case and stem each word.

	Relies on two function attributes that must be set before the first call:
	cleantweet.pattern (compiled regex matching @mentions) and
	cleantweet.stemmer (object exposing a stem(word) method).

	:s : string; a tweet
	:return : list; stemmed lower-case words with urls and @somebody removed
	'''
	# Drop links first: once punctuation is stripped they would otherwise
	# collapse into junk tokens such as "httptco...".
	s = re.sub(r'https?://\S+', '', s)
	# Remove ALL mentions. The previous count=1 only removed the first one,
	# so later user names leaked into the token list.
	s = re.sub(cleantweet.pattern, '', s)
	# Map every ASCII punctuation code point to None so translate() deletes it.
	remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
	s = s.translate(remove_punctuation_map)
	# Tokenize sentence by sentence and flatten the per-sentence word lists
	# into one list of words.
	words = [word for sent in sent_tokenize(s) for word in word_tokenize(sent)]
	return [cleantweet.stemmer.stem(word.lower()) for word in words]

In [7]:
# Attach the helpers cleantweet reads at call time: a regex matching @mentions
# and a Porter stemmer (removes morphological and inflectional endings from
# English words).
cleantweet.pattern = re.compile(r'@\w+')
cleantweet.stemmer = PorterStemmer()
# Replace each tweet string in the 'text' column with its cleaned token list.
df.loc[:, 'text'] = df['text'].map(cleantweet)

In [8]:
# Inspect the first rows; 'text' now holds the token lists produced by cleantweet.
df.head()


Out[8]:
sentiment text
0 Neutral [rt, how, did, everyon, feel, about, the, clim...
1 Positive [rt, didnt, catch, the, full, gopdeb, last, ni...
2 Neutral [rt, no, mention, of, tamir, rice, and, the, g...
3 Positive [rt, that, carli, fiorina, is, trend, hour, af...
4 Positive [rt, gopdeb, w, realdonaldtrump, deliv, the, h...

In [9]:
from gensim.models.wrappers import FastText


C:\Users\goldaw\AppData\Local\Continuum\Anaconda3\lib\site-packages\gensim\utils.py:855: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")

In [13]:
# Create a FastText wrapper object; the next cell uses it to reach the vector loader.
fs = FastText()

In [15]:
# Alternative loaders, kept disabled for reference:
#model = fs.load_binary_data('wiki.en.bin')
#model = fs.load_fasttext_format('wiki.en')
# Load word vectors from the text-format file 'wiki.en.vec'.
# NOTE(review): presumably the pre-trained English Wikipedia fastText vectors — confirm.
model =  fs.load_word2vec_format('wiki.en.vec') 
#print model.words # list of words in dictionary
print(model['king']) # get the vector of the word 'king'


[ -1.80510003e-02  -6.94670016e-03   1.31530002e-01   6.42630011e-02
  -3.42370011e-02  -5.53309977e-01  -1.79409996e-01  -2.88029999e-01
   8.11149999e-02   6.79829996e-03   2.55299985e-01   4.36430007e-01
   9.33870003e-02   4.62969989e-01   6.50679991e-02  -3.14610004e-01
   1.21749997e-01  -8.86579975e-02  -7.34260008e-02   3.20060015e-01
  -3.09450001e-01   1.13830000e-01  -1.47640005e-01   2.74780005e-01
  -9.99680012e-02   4.17810008e-02   1.96339995e-01   1.18110001e-01
  -1.67170003e-01   3.90130013e-01   4.02820017e-03   4.28659990e-02
  -1.37449995e-01  -1.06040001e-01  -1.52810007e-01   4.45760004e-02
  -6.00619987e-02  -1.97459996e-01   3.40440005e-01  -2.18170002e-01
   1.89219993e-02  -2.08590001e-01   7.17080012e-03   2.10400000e-01
   1.28950002e-02   1.96750000e-01   8.52750018e-02   2.68629994e-02
  -1.55200005e-01  -1.81789994e-01   1.05799995e-02   8.56330022e-02
   6.47500008e-02   1.50659993e-01  -2.86819991e-02   1.18060000e-01
   1.62330002e-01   3.84259999e-01  -1.41929999e-01   4.51689988e-01
   2.17620000e-01  -1.54660001e-01   1.23250000e-02   5.89389987e-02
  -1.23319998e-01   2.67159998e-01   4.83850002e-01  -2.40270004e-01
   3.84539999e-02  -1.19099997e-01   2.15279996e-01   1.24650002e-02
   4.63010013e-01  -6.61109982e-04   6.39880002e-01   5.09659983e-02
   3.01220000e-01   7.16700032e-02   8.81060027e-03   2.94790000e-01
  -1.88690007e-01  -2.20709994e-01  -5.43139987e-02  -1.61330000e-01
  -1.91839993e-01   5.92119992e-02  -4.55890000e-01  -2.37579998e-02
   1.88329995e-01   1.81229994e-01   3.81020010e-02   7.46729970e-02
  -1.67559996e-01  -1.35539994e-01   1.83970004e-01   4.16189991e-02
   1.58979997e-01   2.27210000e-01  -1.60820007e-01   1.22639999e-01
   6.20720029e-01   8.75430033e-02  -2.64699996e-01   5.66070005e-02
  -7.99639970e-02  -4.49900001e-01  -1.80580005e-01   1.10969998e-01
   4.36850011e-01   4.46269996e-02   4.11700010e-01   1.13339998e-01
   1.97899994e-02  -8.18869993e-02  -8.88200030e-02  -1.33769996e-02
  -1.28869995e-01   3.16819996e-01  -7.55840018e-02   1.84979998e-02
  -7.67920017e-02  -1.30510002e-01   4.04890001e-01   8.07919979e-01
   2.67679989e-01   4.89850014e-01   2.99580008e-01   1.77100003e-02
  -2.39319995e-01   2.53950000e-01   3.85010004e-01   2.23719999e-01
  -2.05329999e-01  -1.91760004e-01   1.54330000e-01   6.99899998e-03
   2.36760005e-01   9.33550000e-02  -4.90359992e-01   5.68189979e-01
  -2.83230007e-01  -1.59020007e-01  -1.32009998e-01   2.42210001e-01
   1.47489995e-01   1.33860007e-01  -2.15709999e-01   5.72090000e-02
   2.51379997e-01  -1.07139997e-01  -8.74729976e-02  -1.11460000e-01
   1.19680002e-01   2.32429996e-01   3.42589989e-02  -2.48109996e-01
   5.37679970e-01   3.33399996e-02   4.50610012e-01  -3.78730008e-03
   1.88460007e-01   4.29170012e-01   6.01379991e-01   1.95020005e-01
   2.10439995e-01   1.86140001e-01   1.85609996e-01  -1.01190001e-01
   2.30969995e-01   2.99970001e-01  -8.09179991e-02   1.22129999e-01
  -2.53659993e-01   3.31429988e-01   1.27590001e-02   1.17290001e-02
   8.34809989e-02   2.22939998e-01  -9.68180001e-02   9.07230005e-02
   2.11640000e-01  -9.94899962e-03  -1.09370001e-01   1.11769997e-01
  -2.73699999e-01   1.18749999e-01   2.60659993e-01  -1.36209995e-01
   1.02089997e-02   6.06129989e-02   1.00460000e-01   1.05690002e-01
  -2.84240007e-01  -3.12990010e-01   2.11889997e-01  -3.07630002e-02
   2.17230007e-01   3.86979997e-01  -3.70970011e-01   1.25149995e-01
   5.90330005e-01  -1.96950004e-01   2.14469999e-01   1.94890007e-01
   1.86289996e-01  -3.44650000e-02  -1.71829998e-01   1.33949995e-01
   1.89400002e-01   1.49759993e-01   2.61170007e-02   3.18129987e-01
   2.23429993e-01   2.23539993e-01   2.34139994e-01   3.03140014e-01
  -3.44850011e-02   1.16930000e-01  -1.24609999e-01  -1.69359997e-01
   1.58919999e-03   5.99300005e-02   2.11649999e-01  -4.87679988e-02
  -1.57930002e-01  -1.11560002e-01  -4.89950001e-01  -3.60799991e-02
   1.92489997e-01  -1.30689994e-01  -9.68419984e-02   4.59030002e-01
   2.51890004e-01   7.14080036e-02  -3.01459998e-01  -4.93570000e-01
   2.58619994e-01  -1.24119997e-01   4.25279997e-02   5.70029989e-02
   1.10689998e-01   3.14850003e-01  -5.43260016e-02  -3.14590000e-02
  -3.20059992e-02  -3.05310011e-01  -1.84029996e-01   3.03130001e-01
   3.15459996e-01   1.58500001e-01  -3.63319993e-01   3.55230004e-01
   1.29830003e-01   9.67010036e-02   1.25400007e-01   5.76650016e-02
   6.65669963e-02   2.50940006e-02  -1.16300002e-01   1.16690002e-01
  -1.81409996e-02  -2.42320001e-01  -1.54009998e-01   1.52399996e-02
   5.62190004e-02  -2.46559996e-02  -3.12830001e-01  -8.09200015e-03
  -1.17930003e-01  -1.63910002e-01  -1.21030003e-01  -4.10679996e-01
   4.23970014e-01   1.36529999e-02   2.67130006e-02   1.65069997e-01
   1.65439993e-01  -3.84449996e-02  -9.79940034e-03   1.50039997e-02
  -4.92170006e-02   6.61860034e-02  -1.50279999e-01  -6.10669982e-03
   2.74109989e-01   6.65780008e-02   2.82449991e-01   1.19390003e-01
   6.42499998e-02   2.33960003e-01  -4.89859998e-01   2.55519986e-01
  -7.48040006e-02  -2.83089995e-01   3.90020013e-01  -8.06410015e-02
  -2.19219998e-01  -4.79689986e-01  -8.21859986e-02   1.59559995e-01]

In [ ]: