In [124]:
import pandas as pd
import numpy as np

from gensim.models import Word2Vec
from gensim.models.wrappers import FastText

from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import cophenet
from scipy.cluster.hierarchy import fcluster

from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from sklearn.preprocessing import MaxAbsScaler

from plotly import tools
import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go

# Switch plotly.offline into notebook mode so the py.iplot(...) calls in later
# cells can render their figures inline.
py.init_notebook_mode()



In [3]:
# Load IPython's autoreload extension and select mode 2, which reloads imported
# modules before each cell executes so edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [71]:
# Read the parsed word tables for both speakers. The files are space-delimited
# and have no header row, so pandas labels the columns 0, 1, 2, ...
data_clinton = pd.read_csv(
    "datasets/parsed/words.clinton.all.txt",
    sep=' ',
    header=None,
)
data_trump = pd.read_csv(
    "datasets/parsed/words.trump.all.txt",
    sep=' ',
    header=None,
)

In [72]:
# Count rows per distinct value of column 5, separately for each speaker.
# Because agg() receives a list, the result appears to carry two-level column
# labels (<column>, 'count'); the plotting cell relies on this when it flattens
# clinton_group[0].values row by row.
# NOTE(review): what column 5 holds is not visible from this notebook
# (presumably a per-word category tag) — confirm against the parser.
# NOTE(review): as_index=False combined with a list-style agg() behaves
# differently across pandas versions; verify that column 5 is still present
# after reset_index() before upgrading pandas.
clinton_group = data_clinton.groupby([5], as_index = False).agg(['count']).reset_index()
trump_group = data_trump.groupby([5], as_index = False).agg(['count']).reset_index()

In [185]:
def _count_bar(categories, counts, color, name):
    """Build one semi-transparent bar trace with each count printed on its bar.

    categories -- x-axis labels, one per bar.
    counts     -- bar heights; also used as the on-bar text labels.
    color      -- fill and outline colour of the bars.
    name       -- legend entry for the trace.
    """
    return go.Bar(
        x=categories,
        y=counts,
        name=name,
        text=counts,
        textposition='auto',
        textfont=dict(
            family='sans serif',
            size=12,
            color='#000000'
        ),
        marker=dict(
            color=color,
            line=dict(
                color=color,
                width=1.5),
            ),
        opacity=0.6
    )


# Each speaker's counts are paired with that speaker's OWN category column.
# Reusing Clinton's categories for Trump's counts would mislabel bars whenever
# the two groupings differ in which categories occur or in their order.
x = clinton_group[5]
x_trump = trump_group[5]

# The count column is iterated as rows of values, so flatten it to a plain list.
y_clinton = [item for sublist in clinton_group[0].values for item in sublist]
y_trump = [item for sublist in trump_group[0].values for item in sublist]

trace1 = _count_bar(x, y_clinton, 'blue', 'Clinton')
trace2 = _count_bar(x_trump, y_trump, 'red', 'Trump')

data = [trace1, trace2]

py.iplot(data, filename='grouped-bar-direct-labels')



In [149]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.

    row      -- 1-D array of scores, one entry per feature.
    features -- sequence of feature names, indexed by the same positions as row.
    top_n    -- maximum number of (feature, score) pairs to return.

    Returns a DataFrame with columns 'feature' and 'tfidf', ordered from the
    highest score to the lowest. An empty row (or top_n=0) yields an empty
    DataFrame that still has both columns.
    '''
    # argsort is ascending, so reverse it before taking the first top_n positions.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Name the columns at construction time: assigning df.columns afterwards
    # raises a length-mismatch error when top_feats is empty, because the
    # frame then has zero columns.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row)

    Xtr      -- matrix whose row selection supports .toarray() (used here with
                the sparse output of a TF-IDF vectorizer).
    features -- sequence of feature names, one per column of Xtr.
    row_id   -- index of the document (row) to inspect.
    top_n    -- maximum number of features to return.

    Returns the DataFrame produced by top_tfidf_feats for that row.
    '''
    # Flatten with ravel() rather than squeeze(): squeeze turns a 1x1 row
    # (single-feature vocabulary) into a 0-d array, which cannot be indexed
    # by position when the top features are looked up.
    row = np.ravel(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        identified by indices in grp_ids.

    Xtr       -- matrix supporting row selection and .toarray().
    features  -- sequence of feature names, one per column of Xtr.
    grp_ids   -- row indices (list or NumPy array) to average over; None or an
                 empty selection means all rows.
    min_tfidf -- scores below this threshold are treated as 0 before averaging.
    top_n     -- maximum number of features to return.

    Returns the DataFrame produced by top_tfidf_feats for the per-feature means.
    '''
    # Check for a selection explicitly: `if grp_ids:` raises "truth value of an
    # array is ambiguous" when grp_ids is a NumPy array with several entries.
    if grp_ids is not None and np.size(grp_ids) > 0:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    # D is a fresh dense copy, so zeroing small scores in place does not touch Xtr.
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

In [186]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the sentence file into a single column (column 0).
# NOTE(review): sep='\r' is presumably chosen as a separator that never occurs
# inside a sentence, so every input line stays in one field — confirm.
text = pd.read_csv(
    "datasets/parsed/sentence.trump.txt",
    sep='\r',
    header=None,
)

# Fit a TF-IDF vectorizer (English stop words removed) on the sentences;
# `tf` holds the fitted document-term matrix.
tf_transform = TfidfVectorizer(stop_words="english")
tf = tf_transform.fit_transform(text[0].values)

In [187]:
# Rank the vocabulary by mean TF-IDF across all rows of `tf` (grp_ids is left
# at its default) and display the resulting table.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out() — switch when the
# environment is upgraded.
top = top_mean_feats(tf, tf_transform.get_feature_names())
top


Out[187]:
feature tfidf
0 say 0.020041
1 look 0.019665
2 country 0.019541
3 going 0.019500
4 just 0.018746
5 don 0.015640
6 doing 0.015048
7 think 0.015041
8 did 0.015002
9 know 0.014018
10 wrong 0.013897
11 said 0.013655
12 tell 0.012772
13 ve 0.012568
14 people 0.012022
15 thing 0.012012
16 like 0.011397
17 years 0.010965
18 secretary 0.010821
19 good 0.010541
20 clinton 0.010508
21 way 0.009980
22 jobs 0.009785
23 leaving 0.009528
24 want 0.009469

In [184]:
# Bar chart of the top mean-TF-IDF features; each bar carries its score as a label.
x = top["feature"].values
t = top["tfidf"].values

trace1 = go.Bar(
    x=x,
    y=t,
    text=t,
    textposition='auto',
    textfont={'family': 'sans serif', 'size': 12, 'color': 'white'},
    marker={
        'color': 'blue',
        'line': {'color': 'blue', 'width': 1.5},
    },
    opacity=0.6,
)

data = [trace1]

py.iplot(data, filename='bar-direct-labels')