notebook.community

Edit and run



In [124]:

    
import pandas as pd
import numpy as np

from gensim.models import Word2Vec
from gensim.models.wrappers import FastText

from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import cophenet
from scipy.cluster.hierarchy import fcluster

from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from sklearn.preprocessing import MaxAbsScaler

from plotly import tools
import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go

# Initialise plotly's offline notebook mode (py is plotly.offline) so that the
# py.iplot() calls in the cells below can render their figures in the notebook.
py.init_notebook_mode()



In [3]:

    
# Load IPython's autoreload extension and switch it to mode 2, so edited
# modules are re-imported automatically instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2



In [71]:

    
# Read the per-candidate word tables: space-delimited files without a header
# row, so pandas labels the columns 0, 1, 2, ...
data_clinton, data_trump = [
    pd.read_csv(words_path, sep=' ', header=None)
    for words_path in (
        "datasets/parsed/words.clinton.all.txt",
        "datasets/parsed/words.trump.all.txt",
    )
]



In [72]:

    
# Group each candidate's table by column 5 and count the entries of the
# remaining columns within every group; reset_index() then flattens the row
# index back to a plain range.
# NOTE(review): agg(['count']) yields two-level column labels, and the plotting
# cell below indexes the result with [5] and [0] and iterates the latter as
# nested rows - the exact layout depends on the pandas version, so confirm
# after any upgrade.
# NOTE(review): what column 5 encodes is not visible here - confirm against the
# parser that wrote the words.*.all.txt files.
clinton_group = data_clinton.groupby([5], as_index = False).agg(['count']).reset_index()
trump_group = data_trump.groupby([5], as_index = False).agg(['count']).reset_index()



In [185]:

    
def _count_bar(labels, counts, color, name):
    ''' Build one bar trace of per-group counts with each value printed on its bar.

    labels -- x-axis categories (the group keys)
    counts -- bar heights, also used as the on-bar text
    color  -- fill and outline colour of the bars
    name   -- legend entry for the trace
    '''
    return go.Bar(
        x=labels,
        y=counts,
        name=name,
        text=counts,
        textposition='auto',
        textfont=dict(
            family='sans serif',
            size=12,
            color='#000000'
        ),
        marker=dict(
            color=color,
            line=dict(
                color=color,
                width=1.5),
            ),
        opacity=0.6
    )


# Each candidate keeps its own group labels: the two groupings are computed
# independently, so they are not guaranteed to contain the same keys in the
# same order. Plotting Trump's counts against Clinton's labels would silently
# mislabel bars whenever they differ.
x = clinton_group[5]
x_trump = trump_group[5]

# The count column comes back as rows of one-element lists; flatten to 1-D.
y_clinton = [item for sublist in clinton_group[0].values for item in sublist]
y_trump = [item for sublist in trump_group[0].values for item in sublist]

trace1 = _count_bar(x, y_clinton, 'blue', 'Clinton')
trace2 = _count_bar(x_trump, y_trump, 'red', 'Trump')

data = [trace1, trace2]

py.iplot(data, filename='grouped-bar-direct-labels')



In [149]:

    
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.

    row      -- 1-D sequence of tf-idf scores, one per feature
    features -- sequence of feature names, indexable by the same positions as row
    top_n    -- maximum number of (feature, score) pairs to return

    Returns a DataFrame with columns 'feature' and 'tfidf', ordered from the
    highest score to the lowest. If nothing is selected (empty row or
    top_n == 0) an empty frame with those two columns is returned.
    '''
    # argsort is ascending, so reverse it and keep the first top_n positions.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Name the columns in the constructor: assigning df.columns afterwards
    # raises a length-mismatch ValueError when top_feats is empty, because the
    # frame then has zero columns.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Rank the tfidf features of one document, i.e. a single row of Xtr.

    Xtr[row_id] is converted to a dense array, squeezed down to its scores and
    handed to top_tfidf_feats together with the feature names.
    '''
    dense_row = Xtr[row_id].toarray()
    doc_scores = np.squeeze(dense_row)
    return top_tfidf_feats(doc_scores, features, top_n)

def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        identified by indices in grp_ids.

    Xtr       -- document-term matrix supporting row indexing and .toarray()
    features  -- feature names, aligned with the columns of Xtr
    grp_ids   -- row selector (list, numpy array, single index, ...); None or
                 an empty selection means "use every row"
    min_tfidf -- scores below this threshold are set to 0 before averaging
    top_n     -- number of features to return

    Returns the DataFrame produced by top_tfidf_feats for the column means.
    '''
    # Explicit None/size test instead of `if grp_ids:` - truthiness raises for
    # multi-element numpy arrays and wrongly treats row index 0 as "no
    # selection". An empty selection still falls back to all rows, as before.
    if grp_ids is not None and np.size(grp_ids) > 0:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    # D is a fresh dense copy, so zeroing the weak scores does not touch Xtr.
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)



In [186]:

    
from sklearn.feature_extraction.text import TfidfVectorizer

# Read Trump's sentences into a frame; the sentences are taken from column 0 below.
# NOTE(review): sep='\r' presumably serves as a delimiter that never occurs
# inside a line, so every line stays whole in column 0 - confirm the file has
# no carriage returns or quoted fields that would break this.
text = pd.read_csv("datasets/parsed/sentence.trump.txt", sep='\r', header=None)

# Fit a tf-idf vectoriser (with the built-in English stop-word list) on the
# sentences; `tf` is the resulting matrix and `tf_transform` the fitted model.
tf_transform = TfidfVectorizer(stop_words="english")
tf = tf_transform.fit_transform(text[0].values)



In [187]:

    
# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed the older
# get_feature_names(); prefer the new accessor and fall back for old installs
# so the cell runs on either.
if hasattr(tf_transform, "get_feature_names_out"):
    feature_names = tf_transform.get_feature_names_out()
else:
    feature_names = tf_transform.get_feature_names()

# Rank the vocabulary by mean tf-idf score over all sentences.
top = top_mean_feats(tf, feature_names)
top



In [184]:

    
# Bar chart of the top-ranked terms, each bar labelled with its mean tf-idf score.
x = top["feature"].values
t = top["tfidf"].values

trace1 = go.Bar(
    x=x,
    y=t,
    text=t,
    textposition='auto',
    textfont={'family': 'sans serif', 'size': 12, 'color': 'white'},
    marker={'color': 'blue', 'line': {'color': 'blue', 'width': 1.5}},
    opacity=0.6,
)

data = [trace1]

py.iplot(data, filename='bar-direct-labels')

	feature	tfidf
0	say	0.020041
1	look	0.019665
2	country	0.019541
3	going	0.019500
4	just	0.018746
5	don	0.015640
6	doing	0.015048
7	think	0.015041
8	did	0.015002
9	know	0.014018
10	wrong	0.013897
11	said	0.013655
12	tell	0.012772
13	ve	0.012568
14	people	0.012022
15	thing	0.012012
16	like	0.011397
17	years	0.010965
18	secretary	0.010821
19	good	0.010541
20	clinton	0.010508
21	way	0.009980
22	jobs	0.009785
23	leaving	0.009528
24	want	0.009469