notebook.community

Edit and run



In [1]:

    
import os
import re
import shutil
import subprocess
import tempfile
from os.path import isfile
from warnings import warn

import numpy as np
import pandas as pd
import scipy.sparse as ss
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.utils import check_array



In [2]:

    
# Overlap counts between subreddit pairs; the preview below shows the
# columns t1_subreddit, t2_subreddit and NumOverlaps.
overlap_csv_path = 'subreddit-overlap'
raw_data = pd.read_csv(overlap_csv_path)



In [3]:

    
# Peek at the first five rows to check the column layout.
raw_data.head(n=5)









    Out[3]:






  
    
      
      t1_subreddit
      t2_subreddit
      NumOverlaps
    
  
  
    
      0
      roblox
      spaceengineers
      20
    
    
      1
      madlads
      Guitar
      29
    
    
      2
      Chargers
      BigBrother
      29
    
    
      3
      NetflixBestOf
      celebnsfw
      35
    
    
      4
      JoeRogan
      Glitch_in_the_Matrix
      28



In [4]:

    
# Sum of NumOverlaps for every subreddit appearing in the t2 column.
subreddit_popularity = raw_data.groupby('t2_subreddit')['NumOverlaps'].sum()
# Subreddit names ordered from the largest total to the smallest; this
# ordering defines the row/column positions used further down.
ranked_popularity = subreddit_popularity.sort_values(ascending=False)
subreddits = np.array(ranked_popularity.index)



In [5]:

    
# Map each subreddit name to its position in the popularity-ranked
# `subreddits` array. Built directly with enumerate so the positions are
# always plain integers; stacking names and positions into one array only
# keeps them as integers when `subreddits` has object dtype.
index_map = {name: position for position, name in enumerate(subreddits)}



In [6]:

    
# Sparse square matrix of overlap counts: rows are indexed by the t2
# subreddit and columns by the t1 subreddit, both via index_map.
# NOTE(review): index_map is built from t2 names only -- presumably every
# t1 name also occurs as a t2 name; otherwise .map() yields NaN. Confirm.
row_positions = raw_data.t2_subreddit.map(index_map)
col_positions = raw_data.t1_subreddit.map(index_map)
n_subreddits = subreddits.shape[0]
count_matrix = ss.coo_matrix(
    (raw_data.NumOverlaps, (row_positions, col_positions)),
    shape=(n_subreddits, n_subreddits),
    dtype=np.float64,
)



In [7]:

    
# Convert to CSR and rescale each row to unit L1 norm; copy=False asks
# normalize to work on the freshly built CSR matrix rather than a copy.
conditional_prob_matrix = normalize(count_matrix.tocsr(), norm='l1', copy=False)



In [8]:

    
# Project every subreddit row onto 500 SVD components (fixed seed for
# repeatable results).
svd_model = TruncatedSVD(n_components=500, random_state=1)
reduced_vectors = svd_model.fit_transform(conditional_prob_matrix)
# Rescale each vector to unit L2 length. copy=False requests in-place
# normalisation, so `reduced_vectors` itself is presumably rescaled too and
# may share its data with `norm_reduced_vectors` -- verify before relying on
# the un-normalised values.
norm_reduced_vectors = normalize(reduced_vectors, norm='l2', copy=False)



In [9]:

    
class LargeVis(BaseEstimator):
    """Scikit-learn style wrapper that runs the external LargeVis executable.

    The input matrix is written to a text file, the LargeVis binary is run
    on it through ``subprocess``, and the layout it writes is loaded back
    into a NumPy array.

    Parameters
    ----------
    n_components : int, default 2
        Passed to LargeVis as ``-outdim``.
    perplexity : float, default 50.0
        Passed to LargeVis as ``-perp``.
    gamma : number, default 5
        Passed to LargeVis as ``-gamma``.
    layout_samples : number or None, default None
        Passed to LargeVis as ``-samples``. ``None`` means
        ``X.shape[0] / 100.0``, computed at fit time.
    n_neighbors : int or None, default None
        Passed to LargeVis as ``-neigh``. ``None`` means
        ``int(perplexity * 3)``, computed at fit time so that it follows
        later changes to ``perplexity``.
    negative_samples : int, default 5
        Passed to LargeVis as ``-neg``.
    alpha : float, default 1.0
        Passed to LargeVis as ``-alpha``.
    n_cores : int, default 4
        Passed to LargeVis as ``-threads``.
    knn_prop : int, default 3
        Passed to LargeVis as ``-prop``.
    trees : int, default 50
        Passed to LargeVis as ``-trees``.
    executable : str or None, default None
        Path of the LargeVis binary. ``None`` falls back to
        ``DEFAULT_EXECUTABLE``.

    Attributes
    ----------
    embedding_ : ndarray
        Layout loaded from the LargeVis output file by the most recent fit.
    """

    # Binary used when no `executable` is supplied; kept so existing
    # `LargeVis()` calls behave as before.
    DEFAULT_EXECUTABLE = '/Users/leland/Source/LargeVis/Linux/LargeVis'

    def __init__(self, n_components=2, perplexity=50.0, gamma=5,
                 layout_samples=None, n_neighbors=None, negative_samples=5,
                 alpha=1.0, n_cores=4, knn_prop=3, trees=50, executable=None):
        # Parameters are stored exactly as given (scikit-learn convention);
        # defaults that depend on other values are resolved in fit_transform.
        self.n_components = n_components
        self.perplexity = perplexity
        self.layout_samples = layout_samples
        self.alpha = alpha
        self.n_cores = n_cores
        self.knn_prop = knn_prop
        self.negative_samples = negative_samples
        self.n_neighbors = n_neighbors
        self.gamma = gamma
        self.trees = trees
        self.executable = executable

    def fit_transform(self, X, y=None):
        """Run LargeVis on ``X`` and return the resulting layout.

        Parameters
        ----------
        X : array-like
            Input data; validated and converted to float64 with
            ``check_array``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        ndarray
            The layout read from the LargeVis output file (also stored as
            ``self.embedding_``).

        Raises
        ------
        subprocess.CalledProcessError
            If the LargeVis process exits with a non-zero status.
        """
        # Validate first so that array-likes without `.shape` are accepted.
        X = check_array(X, dtype=np.float64)

        if self.layout_samples is None:
            # NOTE(review): this is a float; confirm the LargeVis binary
            # accepts a non-integer `-samples` value.
            layout_samples = X.shape[0] / 100.0
        else:
            layout_samples = self.layout_samples

        if self.n_neighbors is None:
            n_neighbors = int(self.perplexity * 3)
        else:
            n_neighbors = self.n_neighbors

        if self.executable is None:
            executable = self.DEFAULT_EXECUTABLE
        else:
            executable = self.executable

        # A private scratch directory avoids clashes between concurrent runs
        # (and stale files from earlier ones) and is removed afterwards.
        work_dir = tempfile.mkdtemp(prefix='largevis_')
        try:
            input_path = os.path.join(work_dir, 'largevis_input')
            output_path = os.path.join(work_dir, 'largevis_output')

            # The first line of the input file is "<n_rows> <n_cols>".
            np.savetxt(input_path,
                       X, header='{} {}'.format(*X.shape),
                       comments='')
            subprocess.check_call([executable,
                                   '-input', input_path,
                                   '-output', output_path,
                                   '-outdim', str(self.n_components),
                                   '-perp', str(self.perplexity),
                                   '-samples', str(layout_samples),
                                   '-gamma', str(self.gamma),
                                   '-prop', str(self.knn_prop),
                                   '-trees', str(self.trees),
                                   '-neigh', str(n_neighbors),
                                   '-alpha', str(self.alpha),
                                   '-neg', str(self.negative_samples),
                                   '-threads', str(self.n_cores)])
            # The first line of the output file is skipped when loading.
            self.embedding_ = np.loadtxt(output_path, skiprows=1)
        finally:
            shutil.rmtree(work_dir, ignore_errors=True)
        return self.embedding_

    def fit(self, X, y=None):
        """Fit by running ``fit_transform`` on ``X``; returns ``self``."""
        self.fit_transform(X)
        return self



In [10]:

    
# Cache the LargeVis layout on disk so re-running the notebook does not
# re-run the external tool. The cache is keyed only by file name: delete the
# file to force a recomputation after changing anything upstream.
layout_cache_path = 'largevis_subreddit_map.npy'
if isfile(layout_cache_path):
    subreddit_map = np.load(layout_cache_path)
else:
    # Lay out the first 10000 (most popular) subreddits. The L2-normalised
    # vectors are passed explicitly instead of relying on `reduced_vectors`
    # having been normalised in place by the copy=False call above.
    subreddit_map = LargeVis().fit_transform(norm_reduced_vectors[:10000])
    np.save(layout_cache_path, subreddit_map)



In [11]:

    
# One row per laid-out subreddit: its 2-D coordinates plus its name. Row i of
# the layout corresponds to subreddits[i].
layout_coordinates = subreddit_map[:10000]
subreddit_map_df = pd.DataFrame(layout_coordinates, columns=('x', 'y')).assign(
    subreddit=subreddits[:10000]
)
subreddit_map_df.head()









    Out[11]:






  
    
      
      x
      y
      subreddit
    
  
  
    
      0
      -2.469311
      2.295230
      AskReddit
    
    
      1
      -2.801981
      2.136050
      pics
    
    
      2
      -2.734101
      2.063090
      funny
    
    
      3
      -3.564055
      2.174888
      todayilearned
    
    
      4
      -5.986312
      2.277558
      worldnews



In [12]:

    
import hdbscan



In [13]:

    
# Cluster the 2-D layout with HDBSCAN and keep one label per subreddit.
clusterer = hdbscan.HDBSCAN(min_samples=5,
                            min_cluster_size=20,
                            cluster_selection_method='eom')
clusterer.fit(subreddit_map[:10000])
cluster_ids = clusterer.labels_



In [14]:

    
# Attach the HDBSCAN label of each point to its row (adds/overwrites the
# 'cluster' column in place).
subreddit_map_df['cluster'] = cluster_ids



In [59]:

    
from jinja2 import Environment, FileSystemLoader, select_autoescape
# Load the page template from the current working directory.
template_loader = FileSystemLoader('.')
env = Environment(loader=template_loader)
template = env.get_template('bokeh_html_template.html')



In [60]:

    
from bokeh.plotting import figure
from bokeh.models import HoverTool, ColumnDataSource, CustomJS, value
from bokeh.models.widgets import TextInput, AutocompleteInput
from bokeh.layouts import layout, Spacer
from bokeh.models.mappers import LinearColorMapper
from bokeh.palettes import plasma
from collections import OrderedDict
from bokeh.embed import file_html
from bokeh.resources import CDN



In [61]:

    
# Colour scheme: grey as the first palette entry, followed by one plasma
# colour per cluster id up to the largest one. list() keeps the
# concatenation working whether plasma() returns a list or a tuple.
max_cluster_id = cluster_ids.max()
palette = ['#777777'] + list(plasma(max_cluster_id))
colormap = LinearColorMapper(palette=palette, low=-2, high=max_cluster_id)
color_dict = {'field': 'cluster', 'transform': colormap}

# Initial opacity, using the same formula as the JS callbacks but fed with the
# overall min/max of the layout coordinates rather than the visible range.
initial_fill_alpha = np.exp((subreddit_map.min() -
                             subreddit_map.max()) / 5.0) + 0.05
subreddit_map_df['fill_alpha'] = initial_fill_alpha

plot_data = ColumnDataSource(subreddit_map_df)

# Square, scroll-zoomable canvas for the subreddit map.
plot_tools = 'pan, wheel_zoom, box_zoom,box_select, resize, reset'
tsne_plot = figure(title='A Map of Subreddits',
                   plot_width=700,
                   plot_height=700,
                   tools=plot_tools,
                   active_scroll=u'wheel_zoom')

# Hovering over a point shows its subreddit name and cluster id.
hover_fields = OrderedDict([('subreddit', '@subreddit'),
                            ('cluster', '@cluster')])
subreddit_hover = HoverTool(tooltips=hover_fields)
tsne_plot.add_tools(subreddit_hover)

# One circle per subreddit, coloured by cluster; opacity comes from the
# 'fill_alpha' column so the JS callbacks can change it.
tsne_plot.circle(u'x', u'y',
                 source=plot_data,
                 size=10,
                 fill_color=color_dict,
                 fill_alpha='fill_alpha',
                 line_color=None,
                 hover_line_color=u'black')

# Visual styling: centred title, no axes or grid, dark outline.
plot_title = tsne_plot.title
plot_title.text_font_size = value('18pt')
plot_title.align = 'center'
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = '#222222'

# JS run when the subreddit search box changes. An empty box restores the
# zoom-dependent opacity for every point; otherwise the matching subreddit is
# made opaque and all other points nearly transparent.
# All variables are declared with `var` so the callback neither leaks globals
# nor fails when evaluated in strict mode.
subreddit_input_jscode = """
    var data = source.data;
    var start = plot_range.start;
    var end = plot_range.end;
    var subreddit = cb_obj.value;
    var subrs = data['subreddit'];
    var alpha = data['fill_alpha'];
    var i;

    if (subreddit === '') {
        for (i = 0; i < alpha.length; i++) {
             alpha[i] = Math.exp((start - end) / 5.0) + 0.05;
        }
    } else {
        for (i = 0; i < subrs.length; i++) {
            if (subrs[i] === subreddit) {
                alpha[i] = 1.0;
            } else {
                alpha[i] = 0.01;
            }
        }
    }
    source.trigger('change');
"""

# Autocompleting text box listing every plotted subreddit name.
subreddit_inputbox = AutocompleteInput(completions=subreddit_map_df.subreddit.tolist(),
                             title='Locate a subreddit:')
subreddit_inputbox.callback = CustomJS(args=dict(source=plot_data,
                                       plot_range=tsne_plot.x_range
                                      ), code=subreddit_input_jscode)

# JS run when the cluster search box changes. An empty box restores the
# zoom-dependent opacity; otherwise points whose cluster id equals the entered
# number are made opaque and all others nearly transparent.
# The data is read through `source.data`, matching the other callbacks in this
# notebook, and every variable is declared with `var` to avoid implicit
# globals.
cluster_input_jscode = """
    var data = source.data;
    var start = plot_range.start;
    var end = plot_range.end;
    var cluster = cb_obj.value;
    var clusters = data['cluster'];
    var alpha = data['fill_alpha'];
    var i;

    if (cluster === '') {
        for (i = 0; i < alpha.length; i++) {
             alpha[i] = Math.exp((start - end) / 5.0) + 0.05;
        }
    } else {
        for (i = 0; i < alpha.length; i++) {
            if (clusters[i] === Number(cluster)) {
                alpha[i] = 1.0;
            } else {
                alpha[i] = 0.01;
            }
        }
    }
    source.trigger('change');
"""

# The cluster box is created disabled, so this callback only fires if the
# widget is enabled later.
cluster_inputbox = TextInput(title='Locate a cluster (by number)', disabled=True)
cluster_inputbox.callback = CustomJS(args=dict(source=plot_data,
                                       plot_range=tsne_plot.x_range
                                      ), code=cluster_input_jscode)

# Page layout: the plot on top, the two search boxes side by side beneath it,
# and a spacer row at the bottom.
layout_rows = [
    [tsne_plot],
    [subreddit_inputbox, cluster_inputbox],
    [Spacer()],
]
full_layout = layout(layout_rows, sizing_mode='scale_width')

# JS run whenever an axis range changes (pan/zoom). While both search boxes are
# empty, the opacity of every point is recomputed from the extent of the range
# that fired the callback; if either box has a value the highlight is left
# untouched. Variables are declared with `var` to avoid implicit globals.
jscode = """
    var data = source.data;
    var start = cb_obj.start;
    var end = cb_obj.end;
    var alpha = data['fill_alpha'];
    var val1 = input1.value;
    var val2 = input2.value;
    var i;
    if ((val1 === '') && (val2 === '')) {
        for (i = 0; i < alpha.length; i++) {
             alpha[i] = Math.exp((start - end) / 5.0) + 0.05;
        }
        source.trigger('change');
    }
"""

# Attach an independent CustomJS instance with identical arguments to each
# axis range.
for axis_range in (tsne_plot.x_range, tsne_plot.y_range):
    axis_range.callback = CustomJS(args=dict(source=plot_data,
                                             input1=subreddit_inputbox,
                                             input2=cluster_inputbox),
                                   code=jscode)

# Render the whole layout as a standalone HTML page using the custom template
# and CDN-hosted Bokeh resources.
page_title = "A Map of Subreddits"
html = file_html(full_layout, CDN, title=page_title, template=template)



In [62]:

    
# Save the rendered page next to the notebook.
output_html_path = 'subreddit_map.html'
with open(output_html_path, 'w') as html_file:
    html_file.write(html)



In [ ]:

	t1_subreddit	t2_subreddit	NumOverlaps
0	roblox	spaceengineers	20
1	madlads	Guitar	29
2	Chargers	BigBrother	29
3	NetflixBestOf	celebnsfw	35
4	JoeRogan	Glitch_in_the_Matrix	28

	x	y	subreddit
0	-2.469311	2.295230	AskReddit
1	-2.801981	2.136050	pics
2	-2.734101	2.063090	funny
3	-3.564055	2.174888	todayilearned
4	-5.986312	2.277558	worldnews