In [1]:
import os
import re
import shutil
import subprocess
import tempfile
from os.path import isfile
from warnings import warn

import numpy as np
import pandas as pd
import scipy.sparse as ss
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.utils import check_array
In [2]:
# Load the overlap table from the file 'subreddit-overlap' (relative to the
# working directory). Later cells read its 't1_subreddit', 't2_subreddit' and
# 'NumOverlaps' columns.
raw_data = pd.read_csv('subreddit-overlap')
In [3]:
# Preview the first rows of the loaded table.
raw_data.head()
Out[3]:
In [4]:
# Popularity score: sum of NumOverlaps over all rows sharing a t2_subreddit.
subreddit_popularity = raw_data.groupby('t2_subreddit').NumOverlaps.sum()

# Subreddit names ordered from highest to lowest popularity; the position of a
# name in this array is reused below as its row/column index.
subreddits = np.array(
    subreddit_popularity.sort_values(ascending=False).index
)
In [5]:
# Map each subreddit name to its position in `subreddits`.
# Built directly with enumerate rather than by stacking names and indices into
# one NumPy array: stacking forces both to a common dtype, so a fixed-width
# string array of names would silently turn the integer indices into strings.
index_map = {name: position for position, name in enumerate(subreddits)}
In [6]:
# Square sparse matrix of overlap counts: rows are indexed by t2_subreddit,
# columns by t1_subreddit, both translated to integers through index_map.
count_matrix = ss.coo_matrix(
    (
        raw_data['NumOverlaps'],
        (
            raw_data['t2_subreddit'].map(index_map),
            raw_data['t1_subreddit'].map(index_map),
        ),
    ),
    shape=(len(subreddits), len(subreddits)),
    dtype=np.float64,
)
In [7]:
# Convert the counts to CSR and L1-normalise them. copy=False lets sklearn
# work on the freshly created CSR matrix instead of allocating another one.
conditional_prob_matrix = normalize(
    count_matrix.tocsr(),
    norm='l1',
    copy=False,
)
In [8]:
# Project every subreddit row down to 500 dimensions with a seeded truncated SVD.
reduced_vectors = TruncatedSVD(
    random_state=1,
    n_components=500,
).fit_transform(conditional_prob_matrix)

# L2-normalise the reduced vectors.
# NOTE(review): copy=False permits in-place normalisation, so `reduced_vectors`
# may share (already normalised) storage with `norm_reduced_vectors` -- confirm
# before relying on `reduced_vectors` being un-normalised in later cells.
norm_reduced_vectors = normalize(reduced_vectors, copy=False, norm='l2')
In [9]:
class LargeVis(BaseEstimator):
    """Scikit-learn style wrapper around the external LargeVis executable.

    ``fit_transform`` writes the input matrix to a text file, runs the
    LargeVis binary through ``subprocess`` and reads the resulting layout
    back into a NumPy array.

    Parameters
    ----------
    n_components : int, default 2
        Forwarded to the binary as ``-outdim``.
    perplexity : float, default 50.0
        Forwarded as ``-perp``.
    gamma : number, default 5
        Forwarded as ``-gamma``.
    layout_samples : number or None, default None
        Forwarded as ``-samples``. When None, ``X.shape[0] / 100.0`` is used.
    n_neighbors : int or None, default None
        Forwarded as ``-neigh``. When None, ``int(perplexity * 3)`` is
        computed at fit time from the current ``perplexity``.
    negative_samples : int, default 5
        Forwarded as ``-neg``.
    alpha : float, default 1.0
        Forwarded as ``-alpha``.
    n_cores : int, default 4
        Forwarded as ``-threads``.
    knn_prop : int, default 3
        Forwarded as ``-prop``.
    trees : int, default 50
        Forwarded as ``-trees``.
    largevis_binary : str
        Path of the LargeVis executable. The default is the location that was
        previously hard-coded; override it on any other machine.

    Attributes
    ----------
    embedding_ : ndarray
        Layout loaded from the binary's output file by the last fit.
    """

    def __init__(self, n_components=2, perplexity=50.0, gamma=5,
                 layout_samples=None, n_neighbors=None, negative_samples=5,
                 alpha=1.0, n_cores=4, knn_prop=3, trees=50,
                 largevis_binary='/Users/leland/Source/LargeVis/Linux/LargeVis'):
        # Only store the arguments: derived defaults are resolved at fit time
        # so that set_params()/clone() never see a stale computed value.
        self.n_components = n_components
        self.perplexity = perplexity
        self.layout_samples = layout_samples
        self.alpha = alpha
        self.n_cores = n_cores
        self.knn_prop = knn_prop
        self.negative_samples = negative_samples
        self.n_neighbors = n_neighbors
        self.gamma = gamma
        self.trees = trees
        self.largevis_binary = largevis_binary

    def fit_transform(self, X, y=None):
        """Run LargeVis on ``X`` and return the resulting layout.

        Parameters
        ----------
        X : array-like
            Data to embed; validated and converted to float64 by
            ``check_array``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        ndarray
            The layout, also stored as ``self.embedding_``.

        Raises
        ------
        subprocess.CalledProcessError
            If the LargeVis executable exits with a non-zero status.
        """
        # Validate before touching ``.shape`` so plain lists are accepted too.
        X = check_array(X, dtype=np.float64)

        layout_samples = self.layout_samples
        if layout_samples is None:
            layout_samples = X.shape[0] / 100.0

        n_neighbors = self.n_neighbors
        if n_neighbors is None:
            n_neighbors = int(self.perplexity * 3)

        # A private scratch directory avoids clashes between concurrent runs
        # (and predictable /tmp file names), and is removed afterwards.
        workdir = tempfile.mkdtemp(prefix='largevis_')
        try:
            input_path = os.path.join(workdir, 'largevis_input')
            output_path = os.path.join(workdir, 'largevis_output')

            # The first line of the input file is "<n_rows> <n_cols>".
            np.savetxt(input_path, X,
                       header='{} {}'.format(*X.shape),
                       comments='')

            subprocess.check_call([self.largevis_binary,
                                   '-input', input_path,
                                   '-output', output_path,
                                   '-outdim', str(self.n_components),
                                   '-perp', str(self.perplexity),
                                   '-samples', str(layout_samples),
                                   '-gamma', str(self.gamma),
                                   '-prop', str(self.knn_prop),
                                   '-trees', str(self.trees),
                                   '-neigh', str(n_neighbors),
                                   '-alpha', str(self.alpha),
                                   '-neg', str(self.negative_samples),
                                   '-threads', str(self.n_cores)])

            # The first line of the output file is skipped when loading.
            self.embedding_ = np.loadtxt(output_path, skiprows=1)
        finally:
            shutil.rmtree(workdir, ignore_errors=True)

        return self.embedding_

    def fit(self, X, y=None):
        """Fit on ``X`` (populating ``embedding_``) and return ``self``."""
        self.fit_transform(X)
        return self
In [10]:
# The 2-D layout is cached on disk: LargeVis only runs (on the first 10000
# reduced vectors) when the cache file does not exist yet.
if not isfile('largevis_subreddit_map.npy'):
    subreddit_map = LargeVis().fit_transform(reduced_vectors[:10000])
    np.save('largevis_subreddit_map.npy', subreddit_map)
else:
    subreddit_map = np.load('largevis_subreddit_map.npy')
In [11]:
# Tabular view of the layout: x/y coordinates plus the matching subreddit name
# (both restricted to the first 10000 entries).
subreddit_map_df = pd.DataFrame(
    data=subreddit_map[:10000],
    columns=('x', 'y'),
).assign(subreddit=subreddits[:10000])
subreddit_map_df.head()
Out[11]:
In [12]:
import hdbscan
In [13]:
# Cluster the 2-D layout with HDBSCAN and keep one label per subreddit.
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=20,
    min_samples=5,
    cluster_selection_method='eom',
)
clusterer.fit(subreddit_map[:10000])
cluster_ids = clusterer.labels_
In [14]:
# Attach the HDBSCAN label of each point as the 'cluster' column; the plotting
# cell colours points and builds hover text from this column.
subreddit_map_df['cluster'] = cluster_ids
In [59]:
from jinja2 import Environment, FileSystemLoader, select_autoescape
# Jinja2 environment that resolves templates relative to the current working
# directory; the page template is used when rendering the final HTML.
env = Environment(loader=FileSystemLoader('.'))
template = env.get_template('bokeh_html_template.html')
In [60]:
from bokeh.plotting import figure
from bokeh.models import HoverTool, ColumnDataSource, CustomJS, value
from bokeh.models.widgets import TextInput, AutocompleteInput
from bokeh.layouts import layout, Spacer
from bokeh.models.mappers import LinearColorMapper
from bokeh.palettes import plasma
from collections import OrderedDict
from bokeh.embed import file_html
from bokeh.resources import CDN
In [61]:
# Palette: one grey entry followed by `cluster_ids.max()` plasma colours,
# mapped linearly over the range [-2, cluster_ids.max()].
palette = ['#777777'] + plasma(cluster_ids.max())
colormap = LinearColorMapper(
    palette=palette,
    low=-2,
    high=cluster_ids.max(),
)
color_dict = dict(field='cluster', transform=colormap)

# Initial opacity, using the same exp(range / 5) + 0.05 formula as the
# JavaScript callbacks defined further down in this cell.
subreddit_map_df['fill_alpha'] = 0.05 + np.exp(
    (subreddit_map.min() - subreddit_map.max()) / 5.0
)
plot_data = ColumnDataSource(subreddit_map_df)
# Main figure with pan/zoom/select tools; the mouse wheel zooms by default.
tsne_plot = figure(
    title='A Map of Subreddits',
    plot_width=700,
    plot_height=700,
    tools='pan, wheel_zoom, box_zoom,box_select, resize, reset',
    active_scroll=u'wheel_zoom',
)

# Hovering over a point shows its subreddit name and cluster label.
tsne_plot.add_tools(
    HoverTool(
        tooltips=OrderedDict([
            ('subreddit', '@subreddit'),
            ('cluster', '@cluster'),
        ])
    )
)

# One circle per subreddit, coloured by cluster and faded by 'fill_alpha'.
tsne_plot.circle(
    u'x', u'y',
    source=plot_data,
    size=10,
    fill_color=color_dict,
    fill_alpha='fill_alpha',
    line_color=None,
    hover_line_color=u'black',
)

# Cosmetics: centred 18pt title, no axes or grid lines, dark outline.
tsne_plot.title.align = 'center'
tsne_plot.title.text_font_size = value('18pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = '#222222'
# Browser-side callback for the subreddit search box: an empty value restores
# the zoom-dependent opacity on every point, otherwise the matching subreddit
# is shown fully opaque and every other point is faded to 0.01.
subreddit_input_jscode = """
var data = source.data;
var start = plot_range.start;
var end = plot_range.end;
subreddit = cb_obj.value;
subrs = data['subreddit'];
alpha = data['fill_alpha'];
if (subreddit === '') {
for (i = 0; i < alpha.length; i++) {
alpha[i] = Math.exp((start - end) / 5.0) + 0.05;
}
} else {
for (i = 0; i < subrs.length; i++) {
if (subrs[i] === subreddit) {
alpha[i] = 1.0;
} else {
alpha[i] = 0.01;
}
}
}
source.trigger('change');
"""

# Autocompleting text box offering every plotted subreddit name.
subreddit_inputbox = AutocompleteInput(
    title='Locate a subreddit:',
    completions=subreddit_map_df.subreddit.tolist(),
)
subreddit_inputbox.callback = CustomJS(
    code=subreddit_input_jscode,
    args={'source': plot_data, 'plot_range': tsne_plot.x_range},
)
# Browser-side callback for the cluster search box: an empty value restores
# the zoom-dependent opacity on every point, otherwise points whose cluster
# equals the entered number are shown fully opaque and the rest fade to 0.01.
# The data source is read through `source.data` with a `var` declaration,
# matching the other two callbacks in this cell (previously this one used
# `source.get('data')` and assigned to an implicit global).
cluster_input_jscode = """
var data = source.data;
var start = plot_range.start;
var end = plot_range.end;
cluster = cb_obj.value;
clusters = data['cluster'];
alpha = data['fill_alpha'];
if (cluster === '') {
for (i = 0; i < alpha.length; i++) {
alpha[i] = Math.exp((start - end) / 5.0) + 0.05;
}
} else {
for (i = 0; i < alpha.length; i++) {
if (clusters[i] === Number(cluster)) {
alpha[i] = 1.0;
} else {
alpha[i] = 0.01;
}
}
}
source.trigger('change');
"""

# The cluster box is created with disabled=True; the callback is still wired up.
cluster_inputbox = TextInput(title='Locate a cluster (by number)', disabled=True)
cluster_inputbox.callback = CustomJS(args=dict(source=plot_data,
                                               plot_range=tsne_plot.x_range
                                               ), code=cluster_input_jscode)
# Page layout: the plot on top, the two search boxes side by side underneath,
# then a spacer row; everything scales with the available width.
full_layout = layout(
    [
        [tsne_plot],
        [subreddit_inputbox, cluster_inputbox],
        [Spacer()],
    ],
    sizing_mode='scale_width',
)
# Browser-side callback fired when an axis range changes: while both search
# boxes are empty, every point's opacity is recomputed from the new extent of
# the range that triggered the callback.
jscode = """
var data = source.data;
var start = cb_obj.start;
var end = cb_obj.end;
alpha = data['fill_alpha'];
val1 = input1.value;
val2 = input2.value;
if ((val1 === '') && (val2 === '')) {
for (i = 0; i < alpha.length; i++) {
alpha[i] = Math.exp((start - end) / 5.0) + 0.05;
}
source.trigger('change');
}
"""

# Attach a separate (identically configured) callback to each axis range.
for _axis_range in (tsne_plot.x_range, tsne_plot.y_range):
    _axis_range.callback = CustomJS(
        code=jscode,
        args={
            'source': plot_data,
            'input1': subreddit_inputbox,
            'input2': cluster_inputbox,
        },
    )
# Render the whole layout to a standalone HTML page that loads BokehJS from
# the CDN and is wrapped in the custom Jinja2 template.
html = file_html(
    full_layout,
    CDN,
    title="A Map of Subreddits",
    template=template,
)
In [62]:
# Save the rendered page next to the notebook.
with open('subreddit_map.html', mode='w') as outfile:
    outfile.write(html)
In [ ]: