In [1]:
import numpy as np
import pandas as pd
import time
import math
from nltk.corpus import stopwords
from pyspark import SparkContext
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark.ml.feature import Word2Vec
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import word2vecUtilities as wvu
In [2]:
t0 = time.time()
# sc and sqlContext are created automatically by the PySpark shell;
# when running standalone, create them explicitly:
# sc = SparkContext()
# sqlContext = SQLContext(sc)
datapath = '/Users/jorgecastanon/Documents/github/w2v/data/tweets.gz'
tweets = sqlContext.read.json(datapath)
tweets.registerTempTable("tweets")
twr = tweets.count()
# the count() adds ~7 seconds (from ~24.5 seconds to ~31.5 seconds)
print "Number of tweets read: ", twr
# Number of tweets read: 239082
print "Elapsed time (seconds): ", time.time() - t0
# Elapsed time (seconds): 31.9646401405
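Before filtering, it can be worth a quick look at the schema Spark inferred from the raw JSON; printSchema() and select() are standard DataFrame calls, and this sketch only assumes the lang and text fields used below are present in the feed:
# sanity check: inspect the inferred schema and a few sample records
tweets.printSchema()
tweets.select('lang', 'text').show(3)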
In [4]:
filterPath = '/Users/jorgecastanon/Documents/github/w2v/data/filter.txt'
# keyword list: one word per line, no header row
# (named filterWords to avoid shadowing the builtin filter)
filterWords = pd.read_csv(filterPath, header=None)
filterWords.head()
Out[4]:
In [5]:
# Construct the SQL filter: match tweets containing any keyword,
# in lower or upper case
t0 = time.time()
sqlString = "("
for substr in filterWords[0]: # iterate over the keyword list (at most 50-100 words)
    sqlString = sqlString + "text LIKE '%" + substr + "%' OR "
    sqlString = sqlString + "text LIKE '%" + substr.upper() + "%' OR "
sqlString = sqlString[:-4] + ")"  # drop the trailing " OR "
sqlFilterCommand = "SELECT lang, text FROM tweets WHERE (lang = 'en') AND " + sqlString
# Query tweets in English that contain at least one of the keywords
tweetsDF = sqlContext.sql(sqlFilterCommand).cache()
twf = tweetsDF.count()
print "Number of tweets after filtering: ", twf
# the count() adds ~9 seconds (from ~0.72 seconds to ~9.42 seconds)
print "Elapsed time (seconds): ", time.time() - t0
print "Percentage of tweets used: ", float(twf)/twr
In [6]:
tweetsRDD = tweetsDF.select('text').rdd

# build the stopword set once, outside the mapper, so it is not
# recomputed for every tweet (set membership is also O(1))
stop = set(stopwords.words('english'))

def parseAndRemoveStopWords(text):
    t = text[0].replace(";", " ").replace(":", " ").replace('"', ' ').replace('-', ' ')
    t = t.replace(',', ' ').replace('.', ' ')
    t = t.lower().split(" ")
    return [i for i in t if i not in stop]

tw = tweetsRDD.map(parseAndRemoveStopWords)
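Newer Spark releases (roughly 1.6 on) ship the same preprocessing as pyspark.ml.feature transformers; a sketch of an equivalent DataFrame-based pipeline using RegexTokenizer and StopWordsRemover, should the hand-rolled parser ever need replacing:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover

# split on runs of non-word characters, then drop English stopwords
tokenizer = RegexTokenizer(inputCol="text", outputCol="tokens", pattern="\\W+")
remover = StopWordsRemover(inputCol="tokens", outputCol="words")
cleanedDF = remover.transform(tokenizer.transform(tweetsDF.select('text')))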
In [8]:
# convert each list of tokens into a Row and then into a DataFrame
twDF = tw.map(lambda p: Row(text=p)).toDF()
# default minCount = 5 (we may need to try something larger, 20-100, to reduce cost)
# default vectorSize = 100 (we may want to keep the default)
t0 = time.time()
word2Vec = Word2Vec(vectorSize=100, minCount=5, inputCol="text", outputCol="result")
modelW2V = word2Vec.fit(twDF)
wordVectorsDF = modelW2V.getVectors()
print "Elapsed time (seconds) to train Word2Vec: ", time.time() - t0
In [9]:
print sc.version
In [10]:
vocabSize = wordVectorsDF.count()
print "Vocabulary Size: ", vocabSize
In [11]:
topN = 13
synonymsDF = modelW2V.findSynonyms('christmas', topN).toPandas()
synonymsDF
Out[11]:
In [12]:
synonymsDF = modelW2V.findSynonyms('dog', 5).toPandas()
synonymsDF
Out[12]:
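findSynonyms ranks the vocabulary by cosine similarity to the query word's vector. The score for any pair of words can be reproduced by hand with numpy; a sketch, assuming both words survived the minCount cutoff:
wv = {row.word: row.vector.toArray() for row in wordVectorsDF.collect()}
a, b = wv['christmas'], wv['dog']
print "cosine(christmas, dog): ", np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))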
In [13]:
# project the 100-dimensional word vectors down to 3 components for plotting
dfW2V = wordVectorsDF.select('vector').withColumnRenamed('vector', 'features')
numComponents = 3
pca = PCA(k=numComponents, inputCol='features', outputCol='pcaFeatures')
model = pca.fit(dfW2V)
dfComp = model.transform(dfW2V).select("pcaFeatures")
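A quick check on how faithful a 3-D view of 100-D vectors can be: in Spark 2.0+ the fitted PCAModel exposes explainedVariance, a vector with one entry per component. A sketch, assuming such a version:
# fraction of total variance captured by each of the 3 components (Spark 2.0+)
print "Explained variance per component: ", model.explainedVariance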
In [14]:
word = 'christmas'
nwords = 200
#############
r = wvu.topNwordsToPlot(dfComp, wordVectorsDF, word, nwords)
#############
fs = 20  # fontsize
w = r['word']
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
height = 10
width = 10
fig.set_size_inches(width, height)
ax.scatter(r['X'], r['Y'], r['Z'], color='red', s=100, marker='o', edgecolors='black')
for i, txt in enumerate(w):
    if i < 7:  # label only the first few points to keep the plot readable
        ax.text(r['X'].iloc[i], r['Y'].iloc[i], r['Z'].iloc[i], '%s' % txt, size=20, zorder=1, color='k')
ax.set_xlabel('1st Component', fontsize=fs)
ax.set_ylabel('2nd Component', fontsize=fs)
ax.set_zlabel('3rd Component', fontsize=fs)
ax.set_title('Visualization of Word2Vec via PCA', fontsize=fs)
ax.grid(True)
plt.show()
In [20]:
t0 = time.time()
K = int(math.floor(math.sqrt(float(vocabSize)/2)))
# K ~ sqrt(n/2) is a rule of thumb for choosing K,
# where n is the number of words in the model;
# feel free to choose K with a fancier algorithm
dfW2V = wordVectorsDF.select('vector').withColumnRenamed('vector', 'features')
kmeans = KMeans(k=K, seed=1)
modelK = kmeans.fit(dfW2V)
labelsDF = modelK.transform(dfW2V).select('prediction').withColumnRenamed('prediction', 'labels')
print "Number of clusters (K) used: ", K
print "Elapsed time (seconds): ", time.time() - t0