This notebook demonstrates the algorithm we used in our project. It shows an example of how we clustered user essays with Nonnegative Matrix Factorization (NMF). We manually inspect the NMF output to determine the best number of clusters for each group, and then create word clouds for specific groups and demographic splits.
In [1]:
import warnings
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from utils.categorize_demographics import recategorize
from utils.clean_up import clean_up, col_to_data_matrix
from utils.distinctive_tokens import log_odds_ratio
from utils.happyfuntokenizing import Tokenizer
from utils.nonnegative_matrix_factorization import nmf_labels
warnings.filterwarnings('ignore')
In [2]:
essay_dict = {'essay0': 'My self summary',
              'essay1': "What I'm doing with my life",
              'essay2': "I'm really good at",
              'essay3': 'The first thing people notice about me',
              'essay4': 'Favorite books, movies, tv, food',
              'essay5': 'The six things I could never do without',
              'essay6': 'I spend a lot of time thinking about',
              'essay7': 'On a typical Friday night I am',
              'essay8': 'The most private thing I am willing to admit',
              'essay9': 'You should message me if'}
In [3]:
df = pd.read_csv('data/profiles.20120630.csv')
essay_list = ['essay4']  # analyze only essay4: favorite books, movies, tv, food
df_4 = clean_up(df, essay_list)
df_4 = recategorize(df_4)
In [4]:
df_4_y = df_4[df_4.drugs == 'yes']  # keep only users with an explicit yes/no drug status
df_4_n = df_4[df_4.drugs == 'no']
df_4_y = df_4_y.sample(6500, random_state=42)  # subsample both the yes and no classes to the same size
df_4_n = df_4_n.sample(6500, random_state=42)
drugs = pd.concat([df_4_y, df_4_n])  # combine the two dataframes
drugs['y'] = drugs['drugs'].apply(lambda x: 1 if x == 'yes' else 0)  # binary label: 1 = drug use
In [5]:
K = 25
count_matrix, tfidf_matrix, vocab = col_to_data_matrix(drugs, 'essay4', min_df=0.001)
drugs['group'] = nmf_labels(tfidf_matrix, K) #group assignment per user (group with maximum weight)
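nmf_labels is a project utility (from utils.nonnegative_matrix_factorization). The sketch below shows the underlying idea using scikit-learn's NMF, assuming the utility factors the tf-idf matrix and assigns each user to the component carrying the largest weight; the actual implementation may differ in solver settings and initialization.
# A minimal sketch of the idea behind nmf_labels (an assumption, not the
# project's exact code): factor tf-idf into W (documents x k) and H (k x terms),
# then take each document's strongest component as its hard cluster label.
from sklearn.decomposition import NMF

def nmf_labels_sketch(tfidf_matrix, k, random_state=42):
    model = NMF(n_components=k, random_state=random_state)
    W = model.fit_transform(tfidf_matrix)  # document-topic weights
    return W.argmax(axis=1)                # label = component with maximum weight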
In [6]:
y = drugs.y.values #1/0 vector
In [7]:
X = tfidf_matrix.copy()
In [8]:
count_0 = count_matrix[np.array(drugs.drugs == 'yes'), :].sum(axis=0)  # per-token counts in the 'yes' class
count_1 = count_matrix[np.array(drugs.drugs == 'no'), :].sum(axis=0)   # per-token counts in the 'no' class
counts = np.array(np.vstack((count_0, count_1)))  # 2 x vocabulary count matrix
log_odds = log_odds_ratio(counts, vocab, use_variance=True)
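log_odds_ratio is also a project utility. A minimal sketch of one standard formulation it may follow: the log-odds ratio with an informative Dirichlet prior from Monroe, Colaresi, and Quinn's "Fightin' Words" (2008), where use_variance=True would z-score each ratio by its estimated variance. Only the output column names ('features', 'log_odds_ratio') are confirmed by the code below; everything else is an assumption.
# Sketch of a log-odds ratio with informative Dirichlet prior (assumed
# formulation, not necessarily the project's exact implementation).
def log_odds_ratio_sketch(counts, vocab):
    y1, y2 = counts[0].astype(float), counts[1].astype(float)
    alpha = y1 + y2                                 # prior from the pooled counts
    a0, n1, n2 = alpha.sum(), y1.sum(), y2.sum()
    # smoothed log-odds of each term in each class, then their difference
    delta = (np.log((y1 + alpha) / (n1 + a0 - y1 - alpha)) -
             np.log((y2 + alpha) / (n2 + a0 - y2 - alpha)))
    variance = 1.0 / (y1 + alpha) + 1.0 / (y2 + alpha)  # approximate variance
    return pd.DataFrame({'features': vocab,
                         'log_odds_ratio': delta / np.sqrt(variance)})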
In [9]:
n = 2000
ranked = log_odds.sort_values('log_odds_ratio', ascending=False)['features'].tolist()
top = ranked[:n]      # tokens most distinctive of the 'yes' class
bottom = ranked[-n:]  # tokens most distinctive of the 'no' class
log_odds_features = top + bottom
In [10]:
log_odds_feature_set = set(log_odds_features)  # set lookup is O(1) per token
log_odds_mask = np.array([t in log_odds_feature_set for t in vocab])
In [11]:
X = X[:, log_odds_mask]  # keep only the selected feature columns
In [12]:
# Optionally append one-hot NMF group indicators to the token features:
# nmf = pd.get_dummies(drugs.group, prefix='nmf').values
# X = hstack([X, nmf], format='csr')
Logistic Regression, naive Bayes, SVM, Random Forest
In [13]:
clf0 = LogisticRegression()
clf1 = MultinomialNB()
clf2 = LinearSVC()
clf3 = RandomForestClassifier()
In [14]:
for clf, name in zip([clf0, clf1, clf2, clf3],
                     ['Logistic Regression', 'naive Bayes', 'SVM', 'Random Forest']):
    yhat = cross_val_predict(clf, X, y, cv=10)
    print("Accuracy: %0.4f [%s]" % (accuracy_score(y, yhat), name))
In [15]:
print("""Without feature selection:
Accuracy: 0.6715 [Logistic Regression]
Accuracy: 0.6738 [naive Bayes]
Accuracy: 0.6387 [SVM]
Accuracy: 0.6305 [Random Forest]""")