This notebook demonstrates the algorithm we used in our project. It shows an example of how we clustered using Nonnegative Matrix Factorization. We manually inspect the output of NMF to determine the best number of clusters for each group. Then, we create word clouds for specific groups and demographic splits.
In [1]:
import random
import warnings
import matplotlib as mpl
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold, permutation_test_score
from utils.clean_up import *
from utils.categorize_demographics import *
from utils.nonnegative_matrix_factorization import nmf_labels
from utils.distinctive_tokens import log_odds_ratio
from utils.wc_colors import *
from utils.splits import *
from utils.plotting import *
warnings.filterwarnings('ignore')
%matplotlib inline
In [2]:
# Global plotting defaults: 300 dpi for saved and inline figures, grid/axes
# drawn below the data, antialiased lines, and seaborn's "dark" style.
mpl.rc('savefig', dpi=300)
params = {
    'figure.dpi': 300,
    'axes.axisbelow': True,
    'lines.antialiased': True,
}
for k, v in params.items():
    plt.rcParams[k] = v
sns.set_style("dark")
In [3]:
# Maps each essay column name to the prompt it answers.
essay_dict = {
    'essay0': "My self summary",
    'essay1': "What I'm doing with my life",
    'essay2': "I'm really good at",
    'essay3': "The first thing people notice about me",
    'essay4': "Favorite books, movies, tv, food",
    'essay5': "The six things I could never do without",
    'essay6': "I spend a lot of time thinking about",
    'essay7': "On a typical Friday night I am",
    'essay8': "The most private thing I am willing to admit",
    'essay9': "You should message me if",
}
In [4]:
# Load the raw profiles, then build one cleaned and recategorized frame per
# essay of interest (clean_up yields one frame for each entry in essay_list).
df = pd.read_csv('data/profiles.20120630.csv')
essay_list = ['essay0', 'essay4']
df_0, df_4 = (recategorize(frame) for frame in clean_up(df, essay_list))
In [5]:
# Cluster the essay0 responses: build the count/TF-IDF matrices and vocabulary
# from df_0, then store the label returned by nmf_labels (run with K=25) in a
# new 'group' column.
K = 25
count_matrix, tfidf_matrix, vocab = col_to_data_matrix(df_0, 'essay0')
df_0['group'] = nmf_labels(tfidf_matrix, K)
In [6]:
# Same clustering step for essay4. This rebinds count_matrix, tfidf_matrix and
# vocab, so from here on they describe df_4 / essay4 rather than df_0.
K = 25
count_matrix, tfidf_matrix, vocab = col_to_data_matrix(df_4, 'essay4')
df_4['group'] = nmf_labels(tfidf_matrix, K)
Note: `count_matrix`, `tfidf_matrix`, and `vocab` correspond to the data found in `df_4`.
In [7]:
# Word cloud for essay4 'group' value 15 (one_vs_one=False): token counts by
# class -> diff_prop scores -> first result of wf(diffs, 100), drawn in cyan.
# NOTE(review): presumably wf returns (top, bottom) token sets and 100 is the
# number of tokens kept -- confirm against the utils module.
counts = counts_by_class(count_matrix, df_4, 'group', one_vs_one=False, vals=15)
diffs = diff_prop(counts, vocab)
t, _ = wf(diffs, 100)
wcloud(t, cyan)
In [8]:
# Lollipop chart of essay4 clusters for the 'white' and 'black' ethnicity values.
# NOTE(review): group_pct presumably computes per-cluster percentages -- confirm.
demog = 'ethnicity'
subset = subset_df(df_4, demog, ['white', 'black'])
grouped = group_pct(subset, demog)
lollipop(grouped, demog)
In [9]:
# Same chart as the ethnicity cell, now split by the 'drugs' answer
# ('yes', 'no', 'unknown'). Rebinds demog, subset and grouped.
demog = 'drugs'
subset = subset_df(df_4, demog, ['yes','no','unknown'])
grouped = group_pct(subset, demog)
lollipop(grouped, demog)
In [10]:
# Contrast essay4 token usage between drugs == 'yes' and drugs == 'no' via a
# log-odds ratio (use_variance=True), then draw both results of
# wf(log_odds, 100): `t` in blue and `b` in red.
counts = counts_by_class(count_matrix, df_4, 'drugs', one_vs_one=True, vals=['yes', 'no'])
log_odds = log_odds_ratio(counts, vocab, use_variance=True)
t, b = wf(log_odds, 100)
wcloud(t, blue)
wcloud(b, red)
In [14]:
# Recompute the 'yes' vs 'no' drug-use token counts.
# NOTE(review): the list is passed positionally here, whereas other cells pass
# one_vs_one=... and vals=... by keyword -- confirm it binds to `vals`.
counts = counts_by_class(count_matrix, df_4, 'drugs', ['yes', 'no'])
In [15]:
# Log-odds ratios for the counts above, converted to a NumPy array first.
log_odds = log_odds_ratio(np.array(counts), vocab, use_variance=True)
In [10]:
# Bar chart of the 15 largest and 15 smallest log-odds ratios in `log_odds`
# (expects 'features' and 'log_odds_ratio' columns), colored by which end of
# the ranking each token came from.
colors = ['#348ABD', '#A60628', '#7A68A6', '#467821', '#D55E00', '#CC79A7',
          '#56B4E9', '#009E73', '#F0E442', '#0072B2', '#A500FF', '#FFA500']
# sort_values replaces DataFrame.sort, which newer pandas no longer provides.
tmp = log_odds.sort_values('log_odds_ratio', ascending=False)
tmp = tmp.set_index('features')
# assign() returns new frames, so the 'group' flag is never written into a
# slice of `tmp` (avoids the chained-assignment pitfall).
top = tmp.iloc[:15].assign(group=0)
bottom = tmp.iloc[-15:].assign(group=1)
# pd.concat replaces DataFrame.append, which newer pandas no longer provides.
tmp = pd.concat([top, bottom])
f, ax = plt.subplots()
# group 0 (top) and group 1 (bottom) select the first two palette colors
tmp['log_odds_ratio'].plot(kind='bar', ax=ax,
                           color=[colors[i] for i in tmp['group']])
ax.set_ylim([-17, 17])
ax.set_xlabel('')
ax.set_ylabel('log odds ratio')
plt.legend([])
fs = 14
# Bold, light-gray text for tick labels and axis titles
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontweight('bold')
    label.set_fontsize(fs)
    label.set_color('lightgray')
for axis_label in (ax.xaxis.label, ax.yaxis.label):
    axis_label.set_color('lightgray')
    axis_label.set_fontweight('bold')
    axis_label.set_fontsize(fs)
In [8]:
# Classification inputs: densified TF-IDF features and the 'drugs' column of
# df_4 as labels (tfidf_matrix was last built from df_4 / essay4).
X = np.array(tfidf_matrix.todense())
y = df_4.drugs.values
In [9]:
# Logistic regression with scikit-learn's default settings.
logistic = LogisticRegression()
In [10]:
# NOTE(review): betas presumably fits the model with 25% of rows held out and
# returns per-feature coefficients -- confirm in utils/classification.py.
b_hats_logistic = betas(logistic, X, y, test_size=0.25)
In [11]:
# Random forest with scikit-learn defaults; no random_state is passed, so
# results can differ between runs.
rf = RandomForestClassifier()
In [12]:
# Same helper as for the logistic model; per the notes below, the random-forest
# values come from feature_importances_ rather than coefficients.
b_hats_rf = betas(rf, X, y, test_size=0.25)
In [12]:
# Display the logistic-regression result.
b_hats_logistic
Out[12]:
In [13]:
# Display the random-forest result.
b_hats_rf
Out[13]:
For the random forest classifier, the values are not the beta hats. They are based on the `feature_importances_` attribute. If this isn't useful, we could edit `utils/classification.py`.
Also, we might want to turn these into binary classification problems. For the `drugs` feature, for example, we could exclude `unknown` values.
In either case, this is just a placeholder for now.
In [15]:
# Permutation test for the logistic model: accuracy under 2-fold stratified CV
# (legacy sklearn.cross_validation API, which takes the labels directly),
# compared against 100 label permutations on a single job.
cv = StratifiedKFold(y, 2)
score, permutation_scores, pvalue = permutation_test_score(
logistic, X, y, scoring="accuracy", cv=cv, n_permutations=100, n_jobs=1)
print("Classification score %s (pvalue : %s)" % (score, pvalue))
In [29]:
# View histogram of permutation scores, with the real classification score
# marked by a dashed green vertical line.
n_classes = np.unique(y).size
plt.hist(permutation_scores, label='Permutation scores')
# Capture the histogram's y-range so the marker spans the full height; `ylim`
# was previously referenced without ever being assigned.
ylim = plt.ylim()
plt.plot(2 * [score], ylim, '--g', linewidth=3,
         label='Classification Score'
         ' (pvalue %s)' % pvalue)
# Restore the limits so the marker line does not rescale the axis.
plt.ylim(ylim)
plt.legend()
plt.xlabel('Score')
Out[29]:
In [7]:
def plot_bars(df, essay, demog_list, filenames):
    """Chart how each demographic value is distributed across clusters.

    For every column named in ``demog_list`` one figure is drawn and saved.
    Each marker shows the fraction of a demographic value's users that fall
    in a given cluster; a gray stem per cluster reaches up to the largest
    such fraction.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'group' column plus every column in ``demog_list``.
        Demographic values are expected to be strings (they are capitalized
        for the legend).
    essay : str
        Essay key. Currently unused; kept so existing calls keep working.
    demog_list : list of str
        Demographic column names, one figure each.
    filenames : list of str
        Output path for each figure, aligned by position with ``demog_list``.
    """
    colors = ['#348ABD', '#A60628', '#7A68A6', '#467821', '#D55E00', '#CC79A7',
              '#56B4E9', '#009E73', '#F0E442', '#0072B2', '#A500FF', '#FFA500']
    sns.set_style("dark")
    fs = 28
    for f, demog in enumerate(demog_list):
        # Users per (demographic value, cluster), then divided by the total
        # for that demographic value.
        this = pd.DataFrame({'count':
                             df.groupby([demog, 'group'])['group'].count()}).reset_index()
        that = this.groupby(demog, as_index=False)['count'].sum()
        this = pd.merge(this, that, on=demog)  # yields count_x / count_y
        this['pct'] = this.count_x / this.count_y

        fig, ax = plt.subplots(figsize=(12, 8))

        # Stems: one per cluster, as tall as the largest share in it.
        lineval = this.groupby('group')['pct'].max()
        for i, g in enumerate(lineval):
            plt.plot([i, i], [0, g],
                     linewidth=10,
                     color='lightgray',
                     zorder=1)

        # Markers are placed on the stem of their own cluster. Using
        # range(len(tdf)) would shift them whenever a demographic value is
        # missing from one of the clusters.
        position = {group: i for i, group in enumerate(lineval.index)}
        for i, d in enumerate(this[demog].unique()):
            tdf = this[this[demog] == d]
            plt.scatter(tdf['group'].map(position).values, tdf.pct,
                        s=400,
                        # wrap around rather than fail past 12 values
                        color=colors[i % len(colors)],
                        edgecolor='lightgray', lw=4,
                        zorder=2, label=d.capitalize())

        # Limits follow the number of clusters, not the last subset drawn.
        plt.xlim(-0.5, len(lineval) - 0.5)
        plt.ylim(0)
        plt.gca().get_yaxis().set_major_formatter(
            mpl.ticker.FuncFormatter(lambda y, p: format(y, '.0%')))
        plt.xlabel('Group')
        plt.ylabel('Normalized Percentage of Users')

        # Legend and all text in bold light gray.
        lg = ax.legend(title=demog.title(), fontsize=fs, loc='upper right',
                       bbox_to_anchor=(1.15, 1))
        for text in lg.get_texts():
            plt.setp(text, color='lightgray', weight='bold')
        lg.get_title().set_fontweight('bold')
        lg.get_title().set_color('lightgray')
        lg.get_title().set_fontsize(fs)
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontweight('bold')
            label.set_fontsize(fs)
            label.set_color('lightgray')
        for axis_label in (ax.xaxis.label, ax.yaxis.label):
            axis_label.set_color('lightgray')
            axis_label.set_fontweight('bold')
            axis_label.set_fontsize(fs)

        plt.savefig(filenames[f], transparent=True)
In [24]:
# essay0 clusters by gender/orientation, restricted to the four listed values.
df_0_simple = df_0[df_0.gender_orientation.isin(['M straight','M gay', 'F straight','F gay'])]
plot_bars(df_0_simple, 'essay0', ['gender_orientation'], ['essay0_gender_orientation.png'])
In [25]:
# essay0 clusters by ethnicity, restricted to five values (rebinds df_0_simple).
# NOTE(review): the word-cloud cells filter on 'hispanic / latin' rather than
# 'hispanic' -- confirm which label recategorize() produces.
df_0_simple = df_0[df_0.ethnicity.isin(['white','black','asian','hispanic', 'multi'])]
plot_bars(df_0_simple,'essay0', ['ethnicity'], ['essay0_ethnicity.png'])
In [26]:
# essay4 clusters by sex, using every row of df_4.
plot_bars(df_4, 'essay4', ['sex'], ['essay4_sex.png'])
In [27]:
# essay4 clusters by ethnicity, restricted to five values.
# NOTE(review): the word-cloud cells filter on 'hispanic / latin' rather than
# 'hispanic' -- confirm which label recategorize() produces.
df_4_simple = df_4[df_4.ethnicity.isin(['white','black','asian','hispanic', 'multi'])]
plot_bars(df_4_simple,'essay4', ['ethnicity'], ['essay4_ethnicity.png'])
In [43]:
# Split the count matrix into rows whose df_0 cluster is 2 and all remaining
# rows, then hand both halves to wcloud (saved as group2.png).
in_group = np.array(df_0.group == 2)
count_0 = count_matrix[in_group, :]
count_1 = count_matrix[~in_group, :]
wcloud(count_0, count_1, vocab, n, yellow, 'group2.png')
In [44]:
# Rows whose df_0 cluster is 6 versus every other row (saved as group6.png).
in_group = np.array(df_0.group == 6)
count_0 = count_matrix[in_group, :]
count_1 = count_matrix[~in_group, :]
wcloud(count_0, count_1, vocab, n, cyan, 'group6.png')
In [45]:
# Rows whose df_0 cluster is 11 versus every other row (saved as group11.png).
in_group = np.array(df_0.group == 11)
count_0 = count_matrix[in_group, :]
count_1 = count_matrix[~in_group, :]
wcloud(count_0, count_1, vocab, n, yellow, 'group11.png')
In [46]:
# Rows whose df_0 cluster is 16 versus every other row (saved as group16.png).
in_group = np.array(df_0.group == 16)
count_0 = count_matrix[in_group, :]
count_1 = count_matrix[~in_group, :]
wcloud(count_0, count_1, vocab, n, cyan, 'group16.png')
In [33]:
# 'asian' rows in cluster 4 versus rows of the four other listed ethnicities
# outside cluster 4.
# NOTE(review): df_clean is not assigned anywhere in the visible notebook and
# n is only assigned in a later cell; the filename suggests the essay0 frame
# and matrix -- confirm before re-running.
count_0 = count_matrix[np.array((df_clean.group==4) &
(df_clean.ethnicity=='asian')), :]
count_1 = count_matrix[np.array((df_clean.group!=4) &
(df_clean.ethnicity.isin(['black',
'hispanic / latin',
'multi',
'white']))), :]
wcloud(count_0, count_1, vocab, n, blue, 'essay0_group4_asian.png')
In [35]:
# 'hispanic / latin' rows in cluster 9 versus rows of the four other listed
# ethnicities outside cluster 9.
# NOTE(review): df_clean is not assigned in the visible notebook -- confirm
# which frame (presumably the essay0 one, per the filename) this expects.
count_0 = count_matrix[np.array((df_clean.group==9) &
(df_clean.ethnicity=='hispanic / latin')), :]
count_1 = count_matrix[np.array((df_clean.group!=9) &
(df_clean.ethnicity.isin(['black',
'asian',
'multi',
'white']))), :]
wcloud(count_0, count_1, vocab, n, purple, 'essay0_group9_hispanic.png')
In [36]:
# 'M gay' rows in cluster 1 versus rows that are neither 'M gay' nor in cluster 1.
# NOTE(review): df_clean is not assigned in the visible notebook -- confirm.
count_0 = count_matrix[np.array((df_clean.group==1) & (df_clean.gender_orientation=='M gay')), :]
count_1 = count_matrix[np.array((df_clean.group!=1) & (df_clean.gender_orientation!='M gay')), :]
wcloud(count_0, count_1, vocab, n, purple, 'essay0_group1_mgay.png')
In [37]:
# Rows with sex 'F' in cluster 2 versus rows with any other sex outside cluster 2.
# NOTE(review): df_clean is not assigned in the visible notebook -- confirm.
count_0 = count_matrix[np.array((df_clean.group==2) & (df_clean.sex=='F')), :]
count_1 = count_matrix[np.array((df_clean.group!=2) & (df_clean.sex!='F')), :]
wcloud(count_0, count_1, vocab, n, red_blue, 'essay0_group2_f.png')
In [39]:
# 'M gay' rows in cluster 6 versus rows that are neither 'M gay' nor in cluster 6.
# NOTE(review): df_clean is not assigned in the visible notebook -- confirm.
count_0 = count_matrix[np.array((df_clean.group==6) & (df_clean.gender_orientation=='M gay')), :]
count_1 = count_matrix[np.array((df_clean.group!=6) & (df_clean.gender_orientation!='M gay')), :]
wcloud(count_0, count_1, vocab, n, purple, 'essay0_group6_mgay.png')
In [40]:
# 'F gay' rows in cluster 7 versus rows that are neither 'F gay' nor in cluster 7.
# NOTE(review): df_clean is not assigned in the visible notebook -- confirm.
count_0 = count_matrix[np.array((df_clean.group==7) & (df_clean.gender_orientation=='F gay')), :]
count_1 = count_matrix[np.array((df_clean.group!=7) & (df_clean.gender_orientation!='F gay')), :]
wcloud(count_0, count_1, vocab, n, blue, 'essay0_group7_fgay.png')
In [47]:
# Fourth argument of the six-argument wcloud calls; presumably the number of
# tokens to draw -- TODO confirm. Cells above this one already use `n`.
n=100
In [48]:
# Rows whose df_4 cluster is 1 versus every other row (saved as group1.png).
in_group = np.array(df_4.group == 1)
count_0 = count_matrix[in_group, :]
count_1 = count_matrix[~in_group, :]
wcloud(count_0, count_1, vocab, n, yellow, 'group1.png')
In [49]:
# Rows whose df_clean cluster is 8 versus every other row (saved as group8.png).
in_group = np.array(df_clean.group == 8)
count_0 = count_matrix[in_group, :]
count_1 = count_matrix[~in_group, :]
wcloud(count_0, count_1, vocab, n, cyan, 'group8.png')
In [50]:
# Rows whose df_clean cluster is 15 versus every other row (saved as group15.png).
in_group = np.array(df_clean.group == 15)
count_0 = count_matrix[in_group, :]
count_1 = count_matrix[~in_group, :]
wcloud(count_0, count_1, vocab, n, yellow, 'group15.png')
In [51]:
# Rows whose df_clean cluster is 22 versus every other row (saved as group22.png).
in_group = np.array(df_clean.group == 22)
count_0 = count_matrix[in_group, :]
count_1 = count_matrix[~in_group, :]
wcloud(count_0, count_1, vocab, n, cyan, 'group22.png')
In [52]:
# 'black' rows in cluster 7 versus rows of the four other listed ethnicities
# outside cluster 7.
# NOTE(review): df_clean is not assigned in the visible notebook; the filename
# suggests the essay4 frame -- confirm.
count_0 = count_matrix[np.array((df_clean.group==7) &
(df_clean.ethnicity=='black')), :]
count_1 = count_matrix[np.array((df_clean.group!=7) &
(df_clean.ethnicity.isin(['asian',
'hispanic / latin',
'multi',
'white']))), :]
wcloud(count_0, count_1, vocab, n, red, 'essay4_group7_black.png')
In [53]:
# 'white' or 'multi' rows in cluster 22 versus 'asian', 'black' or
# 'hispanic / latin' rows outside cluster 22.
# NOTE(review): df_clean is not assigned in the visible notebook -- confirm.
count_0 = count_matrix[np.array((df_clean.group==22) &
(df_clean.ethnicity.isin(['white',
'multi']))), :]
count_1 = count_matrix[np.array((df_clean.group!=22) &
(df_clean.ethnicity.isin(['asian',
'black',
'hispanic / latin']))), :]
wcloud(count_0, count_1, vocab, n, green_orange, 'essay4_group22_whitemulti.png')
In [54]:
# 'asian' rows in cluster 12 versus non-'asian' rows outside cluster 12.
# NOTE(review): df_clean is not assigned in the visible notebook -- confirm.
count_0 = count_matrix[np.array((df_clean.group==12) & (df_clean.ethnicity=='asian')), :]
count_1 = count_matrix[np.array((df_clean.group!=12) & (df_clean.ethnicity!='asian')), :]
wcloud(count_0, count_1, vocab, n, blue, 'essay4_group12_asian.png')
In [55]:
count_0 = count_matrix[np.array((df_clean.group==1) & (df_clean.ethnicity=='white')), :]
count_1 = count_matrix[np.array((df_clean.group!=1) & (df_clean.ethnicity!='white')), :]
wcloud(count_0, count_1, vocab, n, orange, 'essay4_group1_white.png')
In [57]:
# Within clusters 2, 15 and 24: rows with sex 'F' versus rows with any other sex.
# The complement must be taken on `sex`; comparing `ethnicity` to 'F' is always
# true and would put the women into both halves.
# NOTE(review): df_clean is not assigned in the visible notebook -- confirm.
count_0 = count_matrix[np.array((df_clean.group.isin([2, 15, 24])) & (df_clean.sex=='F')), :]
count_1 = count_matrix[np.array((df_clean.group.isin([2, 15, 24])) & (df_clean.sex!='F')), :]
wcloud(count_0, count_1, vocab, n, blue, 'essay4_movies_women.png')
In [59]:
# Within clusters 2, 15 and 24: rows with sex 'M' versus rows with any other sex.
# The complement must be taken on `sex`; comparing `ethnicity` to 'M' is always
# true and would put the men into both halves.
# NOTE(review): df_clean is not assigned in the visible notebook -- confirm.
count_0 = count_matrix[np.array((df_clean.group.isin([2, 15, 24])) & (df_clean.sex=='M')), :]
count_1 = count_matrix[np.array((df_clean.group.isin([2, 15, 24])) & (df_clean.sex!='M')), :]
wcloud(count_0, count_1, vocab, n, red, 'essay4_movies_men.png')
In [ ]: