In [124]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.models.wrappers import FastText
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import cophenet
from scipy.cluster.hierarchy import fcluster
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from sklearn.preprocessing import MaxAbsScaler
from plotly import tools
import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
# Initialise plotly's offline notebook mode so figures produced by py.iplot render inline.
py.init_notebook_mode()
In [3]:
# Load IPython's autoreload extension and use mode 2, which reloads imported modules
# before each cell runs so that edits to local source files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
In [71]:
# Read the parsed word files for both candidates. The files are space-delimited and
# have no header row, so pandas labels the columns with integers (0, 1, 2, ...).
data_clinton = pd.read_csv(
    "datasets/parsed/words.clinton.all.txt",
    header=None,
    sep=' ',
)
data_trump = pd.read_csv(
    "datasets/parsed/words.trump.all.txt",
    header=None,
    sep=' ',
)
In [72]:
# Count rows per distinct value of column 5, separately for each candidate.
# NOTE(review): what column 5 holds is not visible in this notebook — confirm against
# the parser that produced the input files.
# Passing a list to agg() appears to be what produces the nested values that the
# plotting cell flattens via `clinton_group[0].values`; the exact result layout of this
# chain depends on the pandas version, so re-check this cell when upgrading pandas.
clinton_group = data_clinton.groupby([5], as_index = False).agg(['count']).reset_index()
trump_group = data_trump.groupby([5], as_index = False).agg(['count']).reset_index()
In [185]:
def _bar_trace(labels, counts, color, name):
    """Build one semi-transparent bar series whose bars are annotated with their values.

    Args:
        labels: x-axis categories, one per bar.
        counts: bar heights; also used as the text label drawn on each bar.
        color: fill and outline colour of the bars.
        name: series name shown in the legend.

    Returns:
        A plotly ``go.Bar`` trace.
    """
    return go.Bar(
        x=labels,
        y=counts,
        name=name,
        text=counts,
        textposition='auto',
        textfont=dict(family='sans serif', size=12, color='#000000'),
        marker=dict(color=color, line=dict(color=color, width=1.5)),
        opacity=0.6,
    )


# Pair each candidate's counts with that candidate's OWN group labels. Re-using the
# Clinton labels for the Trump counts would put bars under the wrong category whenever
# the two groupings differ in members or order.
x = clinton_group[5]
x_trump = trump_group[5]

# `<group>[0].values` is nested (one inner sequence per group), so flatten it.
y_clinton = [count for row in clinton_group[0].values for count in row]
y_trump = [count for row in trump_group[0].values for count in row]

trace1 = _bar_trace(x, y_clinton, 'blue', 'Clinton')
trace2 = _bar_trace(x_trump, y_trump, 'red', 'Trump')

data = [trace1, trace2]
py.iplot(data, filename='grouped-bar-direct-labels')
In [149]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get the top n tfidf values in row and return them with their corresponding feature names.

    Args:
        row: 1-D array-like of scores, indexable by integer position.
        features: sequence of feature names aligned position-by-position with ``row``.
        top_n: maximum number of entries to return.

    Returns:
        DataFrame with columns ``feature`` and ``tfidf``, ordered from highest to lowest
        score. When nothing is selected (empty ``row`` or ``top_n=0``) the frame is
        empty but still carries both columns.
    '''
    # argsort is ascending, so reverse it before taking the first top_n positions.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Name the columns in the constructor: assigning `df.columns` afterwards raises a
    # length-mismatch ValueError when `top_feats` is empty (the frame has no columns).
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])
def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in a specific document (matrix row).

    Args:
        Xtr: matrix whose row selection exposes ``.toarray()`` (e.g. a scipy sparse matrix).
        features: sequence of feature names aligned with the columns of ``Xtr``.
        row_id: index of the document (row) to inspect.
        top_n: maximum number of features to return.

    Returns:
        The DataFrame produced by ``top_tfidf_feats`` for the selected row.
    '''
    # ravel (not squeeze) so the row is always 1-D: squeeze would collapse a matrix
    # with a single feature column to a 0-d array, which cannot be indexed downstream.
    row = np.ravel(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)
def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
    identified by indices in grp_ids.

    Args:
        Xtr: matrix exposing ``.toarray()`` (e.g. a scipy sparse matrix), one row per document.
        features: sequence of feature names aligned with the columns of ``Xtr``.
        grp_ids: row indices to restrict the average to. ``None`` or an empty
            collection means "use every row".
        min_tfidf: scores strictly below this value are treated as 0 before averaging.
        top_n: maximum number of features to return.

    Returns:
        The DataFrame produced by ``top_tfidf_feats`` for the per-feature mean scores.
    '''
    # Explicit None/size check instead of `if grp_ids:` — truth-testing a multi-element
    # numpy index array raises ValueError, and a scalar index 0 would be mistaken for
    # "no group given".
    if grp_ids is not None and np.size(grp_ids) > 0:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    # D is a fresh dense copy from toarray(), so zeroing in place does not touch Xtr.
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)
In [186]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): sep='\r' is presumably chosen so that every line of the file loads as
# a single-column row (one sentence per row) — confirm against the file format.
text = pd.read_csv(
    "datasets/parsed/sentence.trump.txt",
    header=None,
    sep='\r',
)

# Fit a tf-idf model over column 0, dropping scikit-learn's built-in English stop words.
# `tf` holds the fitted matrix; `tf_transform` is kept for its vocabulary.
tf_transform = TfidfVectorizer(stop_words="english")
tf = tf_transform.fit_transform(text[0].values)
In [187]:
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed the older
# get_feature_names(); use whichever accessor this installation provides so the cell
# runs on both old and new versions.
_get_feature_names = (
    getattr(tf_transform, "get_feature_names_out", None)
    or tf_transform.get_feature_names
)
top = top_mean_feats(tf, _get_feature_names())
top
Out[187]:
In [184]:
# Bar chart of the top features (x) against their mean tf-idf score (bar height and label).
x = top["feature"].values
t = top["tfidf"].values

trace1 = go.Bar(
    x=x,
    y=t,
    text=t,
    textposition='auto',
    textfont={'family': 'sans serif', 'size': 12, 'color': 'white'},
    marker={'color': 'blue', 'line': {'color': 'blue', 'width': 1.5}},
    opacity=0.6,
)

data = [trace1]
py.iplot(data, filename='bar-direct-labels')