In [2]:
%load_ext autoreload
%autoreload 2
import sys, os
sys.path.append('/Users/arimorcos/Github/getRedditDataset/')
#sys.path.append('D:\Documents\GitHub\getRedditDataset')
#from celebReddit import countWords
import redditDB
import datetime
import praw
from matplotlib import pyplot as pp
from matplotlib import colorbar as cb
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as PD
import scipy
from scipy import spatial
from sklearn import cluster, decomposition
%matplotlib inline
In [3]:
def addCopyright(ax, xOffset=0, yOffset=0):
    """Stamp a copyright notice just below the axes, in axes coordinates.

    Works for both 2d and 3d axes: mplot3d's text() takes an extra z
    argument, so the 3d case must be dispatched separately.

    Parameters
    ----------
    ax : matplotlib Axes or Axes3D
    xOffset, yOffset : float
        Shifts (in axes coordinates) applied to the default position.
    """
    year = datetime.date.today().year
    notice = r'$\copyright$ Ari Morcos ' + str(year)
    # hasattr is the idiomatic form of the original dir() scan; 3d axes
    # are detected by their private '_axis3don' attribute
    if hasattr(ax, '_axis3don'):
        ax.text(0 + xOffset, -.05 + yOffset, 0, notice, transform=ax.transAxes)
    else:
        ax.text(0 + xOffset, -.05 + yOffset, notice, transform=ax.transAxes)
In [4]:
""" Establish database connection """
db = redditDB.RedditDB('March2-8_Top200')
In [5]:
""" Get comments from subreddits """
if not os.path.isfile('popSubs.npy'):
subList = praw.Reddit(user_agent='pop').get_popular_subreddits(limit=200)
subList = [sub.display_name for sub in subList]
np.save('popSubs.npy', subList)
else:
subList = list(np.load('popSubs.npy'))
comments = db.getSubredditCommentText(subList[0])
In [6]:
""" Strip formatting """
comments = [''.join(s for s in comment if ord(s)>31 and ord(s)<126) for comment in comments]
In [7]:
""" Enumerate words to get list and values """
def countNWords(allWords):
wordCounts = {}
for word in allWords:
if word in wordCounts:
wordCounts[word] += 1
else:
wordCounts[word] = 1
return wordCounts
In [8]:
""" Concatenate all comments together and split into individual words """
allComments = " ".join(comments)
allWords = allComments.split()
allWords = [word.lower() for word in allWords]
wordCounts = countNWords(allWords)
In [9]:
""" sort by values and get 100 most frequent words"""
sortedWords = sorted(wordCounts, key=wordCounts.get, reverse=True)
mostFreqWords = sortedWords[0:100]
In [10]:
# Raw counts for each of the 100 most frequent words
freqWordCounts = []
for word in mostFreqWords:
    freqWordCounts.append(wordCounts[word])
In [11]:
# Set to True to rebuild the count matrix even when a cached copy exists.
# (Renamed from `reload`, which shadowed the Python 2 builtin of that name.)
forceReload = False
usedSubs = subList[0:50]
if not os.path.isfile('allCounts.npy') or forceReload:
    # words x subreddits matrix of per-subreddit word frequencies
    allCounts = np.zeros(shape=(len(mostFreqWords), len(usedSubs)))
    for subInd, sub in enumerate(usedSubs):
        # get comments and keep only printable ASCII
        comments = db.getSubredditCommentText(sub)
        comments = [''.join(s for s in comment if ord(s) > 31 and ord(s) < 126) for comment in comments]
        # convert to a flat list of lower-case words
        allWords = " ".join(comments).lower().split()
        # count words
        wordCounts = countNWords(allWords)
        # total word count for this sub (float forces true division in py2);
        # NOTE(review): a subreddit with zero comments would divide by zero
        totWords = float(len(allWords))
        # frequency of each globally-most-frequent word within this sub
        freqWordCounts = [wordCounts.get(word, 0) / totWords for word in mostFreqWords]
        allCounts[:, subInd] = freqWordCounts
    # cache to disk so later runs skip the database entirely
    np.save('allCounts', allCounts)
else:
    allCounts = np.load('allCounts.npy')
In [12]:
""" get pairwise distance """
normMax = np.max(allCounts, axis=1)
normCounts = np.transpose(np.divide(np.transpose(allCounts), normMax))
distVec = spatial.distance.pdist(np.transpose(normCounts), 'euclidean')
distMat = spatial.distance.squareform(distVec)
In [13]:
""" print top words """
print '100 Most Frequent Words'
for word, ind in zip(mostFreqWords, range(len(mostFreqWords))):
print str(ind+1) + ': ' + word
In [47]:
%matplotlib inline
fig, ax = pp.subplots()
fig.set_figheight(15)
fig.set_figwidth(15)
axMat = ax.matshow(distMat)
ax.set_xticks(range(len(distMat)))
ax.set_xticklabels(subList, rotation=90);
ax.set_yticks(range(len(distMat)))
ax.set_yticklabels(subList);
cbar = fig.colorbar(axMat, shrink=0.8)
cbar.set_label('Euclidean distance')
addCopyright(ax)
In [15]:
""" Plot most 10 most similar pairs"""
x = distMat.copy()
np.fill_diagonal(x, np.inf)
for ind in range(30):
minInd = np.argmin(x)
inds = np.unravel_index(minInd, distMat.shape)
print usedSubs[inds[0]] + ', ' + usedSubs[inds[1]] + ': ' + str(x[inds])
x[inds] = np.inf
x[inds[1], inds[0]] = np.inf
In [16]:
""" Plot most 10 most dissimilar pairs"""
x = distMat.copy()
np.fill_diagonal(x, -1*np.inf)
for ind in range(30):
maxInd = np.argmax(x)
inds = np.unravel_index(maxInd, distMat.shape)
print usedSubs[inds[0]] + ', ' + usedSubs[inds[1]] + ': ' + str(x[inds])
x[inds] = -1*np.inf
x[inds[1], inds[0]] = -1*np.inf
In [17]:
""" perform pca """
pcaObj = decomposition.PCA()
reducedCounts = pcaObj.fit_transform(normCounts.transpose())[:,0:3]
In [48]:
""" plot cumsum of variance explained """
varExp = np.cumsum(pcaObj.explained_variance_ratio_)
fig = pp.figure(figsize=(15,15))
ax = pp.subplot()
ax.plot(varExp, marker='o');
ax.set_xlabel("Number of principal components")
ax.set_ylabel("Cumulative variance explained")
ax.set_ylim(bottom=0, top=1)
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
ax.get_xticklabels() + ax.get_yticklabels()):
item.set_fontsize(20)
addCopyright(ax)
In [19]:
""" cluster points """
apObj = cluster.AffinityPropagation(damping=0.75)
apObj.fit(reducedCounts);
len(set(apObj.labels_))
Out[19]:
In [20]:
# plot 3d
#%matplotlib qt
%matplotlib inline
fig = pp.figure(figsize=(15,15))
ax = pp.subplot(projection='3d')
ax.scatter(reducedCounts[:,0], reducedCounts[:,1], reducedCounts[:,2], s=120, c=apObj.labels_)
ax.set_xlabel('PC1'); ax.set_ylabel('PC2'); ax.set_zlabel('PC3')
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label, ax.zaxis.label] +
ax.get_xticklabels() + ax.get_yticklabels() + ax.get_zticklabels()):
item.set_fontsize(20)
addCopyright(ax)
In [22]:
%matplotlib inline
#plot 2d
fig = pp.figure(figsize=(15,15))
ax2 = pp.subplot()
# scatter the first two principal components, colored by cluster label
ax2.scatter(reducedCounts[:,0], reducedCounts[:,1], s=50, c=apObj.labels_)
ax2.set_xlabel('PC1'); ax2.set_ylabel('PC2')
# get distance between all points
dists = spatial.distance.squareform(spatial.distance.pdist(reducedCounts[:,0:2], 'euclidean'))
np.fill_diagonal(dists,np.inf)
# label points
for point in range(len(usedSubs)):
    yOffset = 0
    # nudge labels of points that have a close neighbor (< 0.05) to reduce
    # overlap.
    # NOTE(review): any(np.argwhere(...)) is falsy when the only close
    # neighbor has index 0 (argwhere yields [[0]]), and the comparison in
    # the nested if is ambiguous when more than one neighbor matches --
    # confirm intended behavior.
    if any(np.argwhere(dists[point,:] < 0.05)):
        yOffset = 0.01
        # place this label below its neighbor's when this point sits lower
        if reducedCounts[point,1] < reducedCounts[np.argwhere(dists[point,:] < 0.05),1]:
            yOffset = -1*yOffset
    # hand-tuned overrides for specific overlapping points
    if point == 37 or point == 23: # ALL THE KLUGE
        yOffset = -0.02
    elif point == 27:
        yOffset = 0.02
    ax2.text(reducedCounts[point,0]+0.02, reducedCounts[point,1]+yOffset, usedSubs[point],
             size=10, rotation=0, va='center', ha='left')
# enlarge every text element for readability
for item in ([ax2.title, ax2.xaxis.label, ax2.yaxis.label] +
             ax2.get_xticklabels() + ax2.get_yticklabels()):
    item.set_fontsize(20)
ax2.set_xlim(left=-1.6, right=1.6)
addCopyright(ax2)
In [ ]:
""" animate """
shouldSave = False
def animate(i, ax):
newVal = i*5
if newVal > 360:
newVal = 0
ax.azim = newVal
import matplotlib.animation as animation
%matplotlib qt
%pylab qt
fig = pp.figure(figsize=(10,10))
ax = pp.subplot(projection='3d')
# same 3d cluster scatter as above, rebuilt on an interactive (qt) backend
ax.scatter(reducedCounts[:,0], reducedCounts[:,1], reducedCounts[:,2], s=120, c=apObj.labels_)
ax.set_xlabel('PC1'); ax.set_ylabel('PC2'); ax.set_zlabel('PC3')
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label, ax.zaxis.label] +
             ax.get_xticklabels() + ax.get_yticklabels() + ax.get_zticklabels()):
    item.set_fontsize(20)
addCopyright(ax)
ax.elev = 20
# 72 frames x 5 degrees/frame = one full revolution; 175 ms between frames
ani = animation.FuncAnimation(fig, animate, fargs=[ax], frames=range(int(72)), interval=175)
if shouldSave:
    # NOTE(review): hardcoded Windows encoder/output paths -- only valid on
    # the author's machine; parameterize before reuse.
    pp.rcParams['animation.ffmpeg_path'] = 'C:\\FFMPEG\\bin\\ffmpeg.exe'
    pp.rcParams['animation.mencoder_path'] = 'C:\\Mencoder\\mencoder.exe'
    Writer = animation.FFMpegWriter()
    ani.save('D:\\Dropbox\\Website\\Blog Posts\\201503\\Clustering subreddits\\pc3DAnimation.avi',
             writer=Writer, dpi=300, savefig_kwargs={'transparent': True})
    print('Saved file: complete!')
In [50]:
""" plot normalized word distribution """
def plotBarNormFreq(normCounts, barSubs, usedSubs):
# get meanCounts
meanNormCounts = np.mean(normCounts, axis=1)
#generate colors
colors = 'brgmy'
fig, ax = pp.subplots(figsize=(15,15))
handles = []
width = 1./(len(barSubs))
for sub, ind in zip(barSubs, range(len(barSubs))):
# normalize
tempNorm = normCounts[:,usedSubs.index(barSubs[ind])]/meanNormCounts
# sort
sortedNorm = sorted(tempNorm, reverse=True)
sortOrder = [i[0] for i in sorted(enumerate(tempNorm), key=lambda x:x[1], reverse=True)]
# plot bar
handles.append(ax.bar([i + -0.5*width + width*ind for i in range(len(mostFreqWords))],
sortedNorm, width=width, color=colors[ind], align='center'))
# plot chance line
ax.plot([0, len(mostFreqWords)], [1, 1], ls='--', color='k')
# label axes
ax.set_ylabel('Normalized word frequency')
ax.set_xticks(range(len(mostFreqWords)));
ax.set_xticklabels([mostFreqWords[i] for i in sortOrder], rotation=-90);
ax.set_xlim(left=0-0.5*width, right=len(mostFreqWords)-0.5*width);
ax.set_title(", ".join(barSubs) + ' Normalized Word Frequency')
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
ax.get_yticklabels()):
item.set_fontsize(20)
if len(barSubs) > 1:
ax.legend(handles, barSubs, loc='upper right');
addCopyright(ax, yOffset=-0.01)
In [51]:
%matplotlib inline
# Example word-frequency profiles: a related pair, then single subreddits
plotBarNormFreq(normCounts, ['gentlemanboners','Celebs'], usedSubs)
In [40]:
plotBarNormFreq(normCounts, ['nfl'], usedSubs)
In [53]:
plotBarNormFreq(normCounts, ['TrollXChromosomes'], usedSubs)
In [42]:
plotBarNormFreq(normCounts, ['pcmasterrace'], usedSubs)
In [43]:
plotBarNormFreq(normCounts, ['circlejerk'], usedSubs)
In [45]:
plotBarNormFreq(normCounts, ['science'], usedSubs)
In [ ]:
In [44]:
plotBarNormFreq(normCounts, ['movies'], usedSubs)
In [49]:
""" plot pcs themselves """
%matplotlib inline
pcaObj.components_
barSubs = ['PC 1', 'PC 2', 'PC 3']
barSubs = ['PC 1']
#generate colors
colors = 'brgmy'
fig, ax = pp.subplots(figsize=(15,15))
handles = []
width = 1
for sub, ind in zip(barSubs, range(len(barSubs))):
# normalize
tempNorm = np.sum(np.abs(pcaObj.components_[0:2,:]), axis=0)
# sort
sortedNorm = sorted(tempNorm, reverse=True)
sortOrder = [i[0] for i in sorted(enumerate(tempNorm), key=lambda x:x[1], reverse=True)]
# plot bar
handles.append(ax.bar([i + ind for i in range(len(mostFreqWords))],
sortedNorm, width=width, color=colors[ind], align='center'))
# label axes
ax.set_ylabel('Summed principal component contribution (a.u.)')
ax.set_xticks(range(len(mostFreqWords)));
ax.set_xticklabels([mostFreqWords[i] for i in sortOrder], rotation=-90);
ax.set_xlim(left=0-0.5*width, right=len(mostFreqWords)-0.5*width);
#ax.set_title('PC contributions')
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
ax.get_yticklabels()):
item.set_fontsize(20)
addCopyright(ax, yOffset=-0.02)
In [ ]: