In [1]:
import os
import re
import shutil
import subprocess
import tempfile
from os.path import isfile
from warnings import warn

import numpy as np
import pandas as pd
import scipy.sparse as ss
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.utils import check_array

In [2]:
# Read the subreddit overlap table (columns t1_subreddit, t2_subreddit,
# NumOverlaps) from a file in the current working directory.
raw_data = pd.read_csv(filepath_or_buffer='subreddit-overlap')

In [3]:
# Show the first five rows as the cell output.
raw_data.head(n=5)


Out[3]:
t1_subreddit t2_subreddit NumOverlaps
0 roblox spaceengineers 20
1 madlads Guitar 29
2 Chargers BigBrother 29
3 NetflixBestOf celebnsfw 35
4 JoeRogan Glitch_in_the_Matrix 28

In [4]:
# Total overlap count per t2_subreddit, used as a popularity score.
subreddit_popularity = (
    raw_data
    .groupby('t2_subreddit')['NumOverlaps']
    .sum()
)
# Subreddit names ordered from the highest to the lowest total overlap.
subreddits = np.array(
    subreddit_popularity.sort_values(ascending=False).index
)

In [5]:
# Map each subreddit name to its position in the popularity-ordered array.
index_map = {name: position for position, name in enumerate(subreddits)}

In [6]:
# Square sparse matrix of overlap counts: row index comes from t2_subreddit,
# column index from t1_subreddit, both translated through index_map.
n_subreddits = subreddits.shape[0]
row_index = raw_data.t2_subreddit.map(index_map)
col_index = raw_data.t1_subreddit.map(index_map)
count_matrix = ss.coo_matrix(
    (raw_data.NumOverlaps, (row_index, col_index)),
    shape=(n_subreddits, n_subreddits),
    dtype=np.float64,
)

In [7]:
# Convert to CSR and L1-normalise the rows; copy=False lets sklearn work on
# the freshly converted matrix instead of allocating another one.
conditional_prob_matrix = normalize(count_matrix.tocsr(), norm='l1', copy=False)

In [8]:
# Project the row-normalised matrix onto 500 SVD components (seeded for
# reproducibility).
svd = TruncatedSVD(n_components=500, random_state=1)
reduced_vectors = svd.fit_transform(conditional_prob_matrix)
# NOTE(review): copy=False asks sklearn to normalise in place, so
# reduced_vectors itself is presumably L2-normalised after this line and
# norm_reduced_vectors is the same array -- confirm this is intended.
norm_reduced_vectors = normalize(reduced_vectors, norm='l2', copy=False)

In [9]:
class LargeVis (BaseEstimator):
    """Scikit-learn style wrapper around an external LargeVis executable.

    The input array is written to a text file, the LargeVis binary is run
    through ``subprocess`` and the embedding is read back from the file the
    binary writes.

    Parameters
    ----------
    n_components : int, default 2
        Passed to the binary as ``-outdim``.
    perplexity : float, default 50.0
        Passed as ``-perp``.
    gamma : number, default 5
        Passed as ``-gamma``.
    layout_samples : number or None, default None
        Passed as ``-samples``. When None, ``X.shape[0] / 100.0`` is used.
    n_neighbors : int or None, default None
        Passed as ``-neigh``. When None, ``int(perplexity * 3)`` is used; the
        value is resolved at fit time so it follows later ``set_params``
        changes to ``perplexity``.
    negative_samples : int, default 5
        Passed as ``-neg``.
    alpha : float, default 1.0
        Passed as ``-alpha``.
    n_cores : int, default 4
        Passed as ``-threads``.
    knn_prop : int, default 3
        Passed as ``-prop``.
    trees : int, default 50
        Passed as ``-trees``.
    executable : str
        Path of the LargeVis binary to invoke. The default is the location
        that used to be hard-coded in ``fit_transform``.

    Attributes
    ----------
    embedding_ : ndarray
        Array loaded from the LargeVis output file by the most recent fit.
    """

    def __init__(self, n_components=2, perplexity=50.0, gamma=5,
                 layout_samples=None, n_neighbors=None, negative_samples=5,
                 alpha=1.0, n_cores=4, knn_prop=3, trees=50,
                 executable='/Users/leland/Source/LargeVis/Linux/LargeVis'):
        # Constructor arguments are stored untouched (scikit-learn estimator
        # convention); defaults that depend on other values are resolved in
        # fit_transform.
        self.n_components = n_components
        self.perplexity = perplexity
        self.layout_samples = layout_samples
        self.alpha = alpha
        self.n_cores = n_cores
        self.knn_prop = knn_prop
        self.negative_samples = negative_samples
        self.n_neighbors = n_neighbors
        self.gamma = gamma
        self.trees = trees
        self.executable = executable

    def fit_transform(self, X, y=None):
        """Run LargeVis on ``X`` and return the embedding it produces.

        Parameters
        ----------
        X : array-like
            Validated with ``check_array(X, dtype=np.float64)``.
        y : ignored

        Returns
        -------
        ndarray
            The array read from the LargeVis output file; also stored as
            ``self.embedding_``.

        Raises
        ------
        subprocess.CalledProcessError
            If the LargeVis process exits with a non-zero status.
        """
        # Validate before touching ``X.shape`` so plain lists are accepted.
        X = check_array(X, dtype=np.float64)

        if self.layout_samples is None:
            layout_samples = X.shape[0] / 100.0
        else:
            layout_samples = self.layout_samples

        if self.n_neighbors is None:
            n_neighbors = int(self.perplexity * 3)
        else:
            n_neighbors = self.n_neighbors

        # A private scratch directory avoids clashes between concurrent runs
        # on shared fixed file names and is removed afterwards.
        workdir = tempfile.mkdtemp(prefix='largevis_')
        try:
            input_path = os.path.join(workdir, 'largevis_input')
            output_path = os.path.join(workdir, 'largevis_output')
            # The input file starts with a "<rows> <cols>" header line.
            np.savetxt(input_path,
                       X, header='{} {}'.format(*X.shape),
                       comments='')
            subprocess.check_call([self.executable,
                                   '-input', input_path,
                                   '-output', output_path,
                                   '-outdim', str(self.n_components),
                                   '-perp', str(self.perplexity),
                                   '-samples', str(layout_samples),
                                   '-gamma', str(self.gamma),
                                   '-prop', str(self.knn_prop),
                                   '-trees', str(self.trees),
                                   '-neigh', str(n_neighbors),
                                   '-alpha', str(self.alpha),
                                   '-neg', str(self.negative_samples),
                                   '-threads', str(self.n_cores)])
            # The first line of the output file is skipped when loading.
            self.embedding_ = np.loadtxt(output_path, skiprows=1)
        finally:
            shutil.rmtree(workdir, ignore_errors=True)
        return self.embedding_

    def fit(self, X, y=None):
        """Fit by delegating to ``fit_transform``; returns ``self``."""
        self.fit_transform(X)
        return self

In [10]:
# Reuse a previously saved embedding when one exists; otherwise embed the
# first 10000 rows of reduced_vectors with LargeVis and cache the result.
if not isfile('largevis_subreddit_map.npy'):
    subreddit_map = LargeVis().fit_transform(reduced_vectors[:10000])
    np.save('largevis_subreddit_map.npy', subreddit_map)
else:
    subreddit_map = np.load('largevis_subreddit_map.npy')

In [11]:
# Tabulate the embedding coordinates next to the matching subreddit names
# (both truncated to the first 10000 entries), then preview the result.
subreddit_map_df = pd.DataFrame(subreddit_map[:10000], columns=['x', 'y'])
subreddit_map_df['subreddit'] = subreddits[:10000]
subreddit_map_df.head(n=5)


Out[11]:
x y subreddit
0 -2.469311 2.295230 AskReddit
1 -2.801981 2.136050 pics
2 -2.734101 2.063090 funny
3 -3.564055 2.174888 todayilearned
4 -5.986312 2.277558 worldnews

In [12]:
import hdbscan

In [13]:
# Cluster the 2-D embedding with HDBSCAN and keep the per-point labels.
clusterer = hdbscan.HDBSCAN(
    min_samples=5,
    min_cluster_size=20,
    cluster_selection_method='eom',
)
clusterer.fit(subreddit_map[:10000])
cluster_ids = clusterer.labels_

In [14]:
# Attach the HDBSCAN labels as a 'cluster' column (presumably one label per
# row, since both come from subreddit_map[:10000] -- verify lengths match).
subreddit_map_df['cluster'] = cluster_ids

In [59]:
from jinja2 import Environment, FileSystemLoader, select_autoescape
# Load the HTML page template from the current working directory.
template_loader = FileSystemLoader('.')
env = Environment(loader=template_loader)
template = env.get_template('bokeh_html_template.html')

In [60]:
from bokeh.plotting import figure
from bokeh.models import HoverTool, ColumnDataSource, CustomJS, value
from bokeh.models.widgets import TextInput, AutocompleteInput
from bokeh.layouts import layout, Spacer
from bokeh.models.mappers import LinearColorMapper
from bokeh.palettes import plasma
from collections import OrderedDict
from bokeh.embed import file_html
from bokeh.resources import CDN

In [61]:
# Grey first, followed by plasma colours; the mapper spans -2 .. max label.
# NOTE(review): the grey entry is presumably meant for unclustered points
# (label -1) -- confirm against the HDBSCAN labelling.
max_cluster_id = cluster_ids.max()
palette = ['#777777'] + plasma(max_cluster_id)
colormap = LinearColorMapper(palette=palette, low=-2, high=max_cluster_id)
color_dict = {'field': 'cluster', 'transform': colormap}

# One starting opacity for every point, derived from the spread of the
# embedding values (same formula the JS callbacks apply to the plot range).
initial_alpha = np.exp((subreddit_map.min() - subreddit_map.max()) / 5.0) + 0.05
subreddit_map_df['fill_alpha'] = initial_alpha

plot_data = ColumnDataSource(subreddit_map_df)

# Square figure with pan/zoom/select tools; the mouse wheel zooms by default.
tsne_plot = figure(
    title='A Map of Subreddits',
    plot_width=700,
    plot_height=700,
    tools=('pan, wheel_zoom, box_zoom,'
           'box_select, resize, reset'),
    active_scroll=u'wheel_zoom',
)

# Hovering a point shows its subreddit name and cluster label.
hover_fields = OrderedDict([('subreddit', '@subreddit'),
                            ('cluster', '@cluster')])
tsne_plot.add_tools(HoverTool(tooltips=hover_fields))

# One circle per subreddit, coloured by cluster, opacity from 'fill_alpha'.
tsne_plot.circle(u'x', u'y', source=plot_data, size=10,
                 fill_color=color_dict, fill_alpha='fill_alpha',
                 line_color=None, hover_line_color=u'black')

# Strip axes and grid, centre an enlarged title, darken the outline.
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = '#222222'
tsne_plot.title.align = 'center'
tsne_plot.title.text_font_size = value('18pt')

# JavaScript run when the subreddit search box changes.
# - Empty value: every point gets the opacity derived from plot_range.
# - Otherwise: the matching subreddit becomes opaque, all others nearly
#   transparent.
# All working variables are declared with `var` so the callback does not
# create implicit globals (which also fail under strict mode).
subreddit_input_jscode = """
    var data = source.data;
    var start = plot_range.start;
    var end = plot_range.end;
    var subreddit = cb_obj.value;
    var subrs = data['subreddit'];
    var alpha = data['fill_alpha'];
    var i;

    if (subreddit === '') {
        for (i = 0; i < alpha.length; i++) {
             alpha[i] = Math.exp((start - end) / 5.0) + 0.05;
        }
    } else {
        for (i = 0; i < subrs.length; i++) {
            if (subrs[i] === subreddit) {
                alpha[i] = 1.0;
            } else {
                alpha[i] = 0.01;
            }
        }
    }
    source.trigger('change');
"""

# Autocompleting search box over all plotted subreddit names; its callback
# receives the data source and the plot's x range.
subreddit_inputbox = AutocompleteInput(
    completions=subreddit_map_df.subreddit.tolist(),
    title='Locate a subreddit:',
)
subreddit_callback_args = dict(source=plot_data, plot_range=tsne_plot.x_range)
subreddit_inputbox.callback = CustomJS(args=subreddit_callback_args,
                                       code=subreddit_input_jscode)

# JavaScript run when the cluster-number box changes.
# - Empty value: every point gets the opacity derived from plot_range.
# - Otherwise: points whose cluster equals the entered number become opaque,
#   all others nearly transparent.
# Reads the data through `source.data`, the same accessor the other
# callbacks in this notebook use, and declares its variables with `var`.
cluster_input_jscode = """
    var data = source.data;
    var start = plot_range.start;
    var end = plot_range.end;
    var cluster = cb_obj.value;
    var clusters = data['cluster'];
    var alpha = data['fill_alpha'];
    var i;

    if (cluster === '') {
        for (i = 0; i < alpha.length; i++) {
             alpha[i] = Math.exp((start - end) / 5.0) + 0.05;
        }
    } else {
        for (i = 0; i < alpha.length; i++) {
            if (clusters[i] === Number(cluster)) {
                alpha[i] = 1.0;
            } else {
                alpha[i] = 0.01;
            }
        }
    }
    source.trigger('change');
"""

# Text box for a cluster number; created disabled. Its callback receives
# the data source and the plot's x range.
cluster_inputbox = TextInput(
    title='Locate a cluster (by number)',
    disabled=True,
)
cluster_callback_args = dict(source=plot_data, plot_range=tsne_plot.x_range)
cluster_inputbox.callback = CustomJS(args=cluster_callback_args,
                                     code=cluster_input_jscode)

# Page layout: the plot, then the two input boxes side by side, then a spacer.
layout_rows = [
    [tsne_plot],
    [subreddit_inputbox, cluster_inputbox],
    [Spacer()],
]
full_layout = layout(layout_rows, sizing_mode='scale_width')

# JavaScript run when a plot range changes: while both input boxes are
# empty, recompute every point's opacity from the changed range's extent.
# Variables are declared with `var` so the callback creates no implicit
# globals.
jscode = """
    var data = source.data;
    var start = cb_obj.start;
    var end = cb_obj.end;
    var alpha = data['fill_alpha'];
    var val1 = input1.value;
    var val2 = input2.value;
    var i;
    if ((val1 === '') && (val2 === '')) {
        for (i = 0; i < alpha.length; i++) {
             alpha[i] = Math.exp((start - end) / 5.0) + 0.05;
        }
        source.trigger('change');
    }
"""

# Attach a separate CustomJS instance running `jscode` to the x range and
# to the y range, each given the data source and both input boxes.
for axis_range in (tsne_plot.x_range, tsne_plot.y_range):
    axis_range.callback = CustomJS(
        args=dict(source=plot_data,
                  input1=subreddit_inputbox,
                  input2=cluster_inputbox),
        code=jscode,
    )

# Render the layout to a standalone HTML page using CDN-hosted resources
# and the jinja template loaded earlier.
html = file_html(full_layout, CDN, title="A Map of Subreddits", template=template)

In [62]:
# Save the rendered page next to the notebook.
with open('subreddit_map.html', 'w') as html_file:
    html_file.write(html)

In [ ]: