In [1]:
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from utils.categorize_demographics import *
from utils.clean_up import clean_up
from utils.distinctive_tokens import log_odds_ratio
from utils.lexical_features import *
from utils.nonnegative_matrix_factorization import nmf_labels, nmf_inspect
from utils.plotting import lollipop_paper
from utils.splits import *
from utils.text_representation import feature_vectors
warnings.filterwarnings('ignore')
%matplotlib inline
In [2]:
df = pd.read_csv('data/profiles.20120630.csv')
essay_list = ['essay0', 'essay4']
df_0, df_4 = clean_up(df, essay_list)
df_0 = recategorize(df_0)
df_4 = recategorize(df_4)
Part-of-speech tags are based on the universal tagset of Petrov, Das, & McDonald (link):
ADJ - adjectives
ADP - adpositions (prepositions and postpositions)
ADV - adverbs
CONJ - conjunctions
DET - determiners
NOUN - nouns (common and proper)
NUM - cardinal numbers
PART - particles or other function words
PRON - pronouns
PUNCT - punctuation
VERB - verbs (all tenses and modes)
X - other: foreign words, typos, abbreviations
spaCy-specific information: https://spacy.io/docs#token-postags.
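For intuition, here is a minimal sketch of how an essay can be reduced to coarse POS counts with spaCy (illustrative only, not the pos_df implementation in utils; the en_core_web_sm model name is an assumption):

import spacy
from collections import Counter

nlp = spacy.load('en_core_web_sm')  # assumed model; any English pipeline works
doc = nlp("i love hiking, strong coffee, and terrible sci-fi movies.")
Counter(token.pos_ for token in doc)  # coarse tags: NOUN, ADJ, VERB, PUNCT, ...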
In [3]:
pos = pos_df(df_0.essay0)
pos_norm = pos_normalize(pos)
In [4]:
pos['n_tokens'] = pos.sum(axis=1)
pos_by_split(df_0, pos, 'sex', ['n_tokens'], print_levels=True)
In [5]:
profane = load_words('data/profane.txt')
profanity = pd.DataFrame(contains(profane, df_0.essay0),
                         columns=['profanity'])
print(profanity.profanity.sum() / profanity.shape[0])
pos_by_split(df_0, profanity, 'sex', ['profanity'])
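The contains helper presumably returns one binary indicator per essay; a rough stand-in (assumed behavior, not the utils code):

def contains_any(words, essays):
    # 1 if an essay mentions any word from the list, else 0 (assumed behavior)
    word_set = set(w.lower() for w in words)
    return [int(any(tok in word_set for tok in str(essay).lower().split()))
            for essay in essays]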
In [6]:
slang = load_words('data/slang.txt')
slang = pd.DataFrame(contains(slang, df_0.essay0),
                     columns=['slang'])
print(slang.slang.sum() / slang.shape[0])
pos_by_split(df_0, slang, 'sex', ['slang'])
In [7]:
pos_by_split(df_0, pos_norm, 'sex', ['ADJ', 'NOUN', 'VERB'])
In [8]:
f = subset_df(df_0, 'sex', ['F'])
m = subset_df(df_0, 'sex', ['M'])
tagged_f = tag_corpus(f.essay0)
tagged_m = tag_corpus(m.essay0)
In [9]:
top_terms(tagged_f, tagged_m, 'ADJ', diff_prop, 15)
In [10]:
top_terms(tagged_f, tagged_m, 'ADJ', log_odds_ratio, 10)
In [11]:
top_terms(tagged_f, tagged_m, 'NOUN', log_odds_ratio, 10)
In [12]:
top_terms(tagged_f, tagged_m, 'VERB', log_odds_ratio, 10)
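For intuition about the log-odds-ratio comparison, a simplified sketch with add-alpha smoothing (not the utils.distinctive_tokens implementation, which also supports a variance correction):

import numpy as np
from collections import Counter

def smoothed_log_odds(counts_a, counts_b, alpha=0.5):
    # per-term log-odds ratio between two corpora, with add-alpha smoothing
    vocab = set(counts_a) | set(counts_b)
    n_a = sum(counts_a.values()) + alpha * len(vocab)
    n_b = sum(counts_b.values()) + alpha * len(vocab)
    scores = {}
    for w in vocab:
        p_a = (counts_a.get(w, 0) + alpha) / n_a
        p_b = (counts_b.get(w, 0) + alpha) / n_b
        scores[w] = np.log(p_a / (1 - p_a)) - np.log(p_b / (1 - p_b))
    return scores

f_adj = Counter('easy going fun loving sarcastic fun'.split())
m_adj = Counter('laid back easy going sarcastic'.split())
sorted(smoothed_log_odds(f_adj, m_adj).items(), key=lambda kv: -kv[1])[:3]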
In [13]:
specs = {'stop_words' : 'english', 'ngram_range' : (1, 3), 'min_df' : 0.005}
In [14]:
counts, tfidf, vocab = feature_vectors(df_0.essay0, specs)
In [15]:
len(vocab)
Out[15]:
In [16]:
K = 25
nmf_inspect(tfidf, vocab, k_vals=[K], n_words=50)
In [17]:
labels = ['meet & greet', 'the city', 'enthusiastic', 'straight talk', 'about me', 'novelty',
'seeking', 'carefree', 'casual', 'enjoy', 'transplant', 'nots', 'moments',
'personality', 'amusing', 'review', 'region', 'career-focused', 'locals',
'unconstrained', 'active', 'creative', 'carpe diem', 'cheerful', 'jet setter']
In [18]:
df_0['group'] = nmf_labels(tfidf, k=K)
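nmf_labels presumably factorizes the tf-idf matrix and assigns each profile its highest-weight topic; a minimal scikit-learn sketch of that idea (not the utils implementation):

from sklearn.decomposition import NMF

W = NMF(n_components=K, random_state=42).fit_transform(tfidf)  # document-topic weights
sketch_labels = W.argmax(axis=1)                               # strongest topic per profile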
In [19]:
demog = 'sex'
subset = subset_df(df_0, demog, ['F', 'M'])
grouped = group_pct(subset, demog)
lollipop_paper(grouped, demog, topic_labels=labels)
In [20]:
counts, tfidf, vocab = feature_vectors(df_4.essay4, specs)
In [21]:
len(vocab)
Out[21]:
In [22]:
K = 25
nmf_inspect(tfidf, vocab, k_vals=[K], n_words=50)
In [23]:
labels = ['like', 'TV-hits', 'enthusiastic', 'favorite-0', 'genres-movies', 'genres-music',
'misc-0', 'TV-comedies-0', 'genres-food', 'nots', 'teen', 'everything',
'movies-drama-0', 'time periods', 'avid', 'miscellaneous', 'music-rock',
'movies-sci-fi', 'TV-comedies-1', 'movies-drama-1', 'kinds', 'favorite-1',
'novelty', 'TV-drama', 'genres-books']
In [24]:
df_4['group'] = nmf_labels(tfidf, k=K)
In [25]:
demog = 'sex'
subset = subset_df(df_4, demog, ['M', 'F'])
grouped = group_pct(subset, demog)
lollipop_paper(grouped, demog, topic_labels=labels)
In [26]:
mask = df_4['group'].isin([10, 12, 17, 19])
movies = counts[np.array(mask), :]
movies = counts_by_class(movies, df_4[mask], 'sex',
                         one_vs_one=True, vals=['M', 'F'])
log_odds = log_odds_ratio(movies, vocab, use_variance=True)
In [27]:
print_terms(log_odds, 15)
In [28]:
demog = 'drugs'
subset = subset_df(df_4, demog, ['yes', 'no', 'unknown'])
grouped = group_pct(subset, demog)
lollipop_paper(grouped, demog, colors=['Black', 'LightGray', 'Red'], topic_labels=labels)
In [29]:
def drug_labels(df):
    """Binary drug-usage labels: 1 for 'yes', 0 for 'no' or 'unknown'."""
    labels = []
    for i in df.index:
        if df.drugs[i] in ('no', 'unknown'):
            labels.append(0)
        elif df.drugs[i] == 'yes':
            labels.append(1)
    return labels
In [30]:
# numerical drug usage labels
df_4['labels'] = drug_labels(df_4)
# split on drug usage status
drugs_yes = subset_df(df_4, 'drugs', ['yes'])
drugs_no = subset_df(df_4, 'drugs', ['no'])
drugs_unknown = subset_df(df_4, 'drugs', ['unknown'])
# balanced sample of known drug users
drugs_known_sample = pd.concat([drugs_no.sample(drugs_yes.shape[0],
                                                random_state=42),
                                drugs_yes],
                               ignore_index=True)
In [31]:
# fix the vocabulary so the known/unknown feature matrices share the same columns
pred_4 = {'vocabulary' : vocab}
_, known, _ = feature_vectors(drugs_known_sample.essay4, pred_4)
_, unknown, _ = feature_vectors(drugs_unknown.essay4, pred_4)
In [32]:
known_train, known_test, y_train, y_test = train_test_split(
    known, drugs_known_sample['labels'], test_size=0.2, random_state=42)
In [33]:
model = LogisticRegression()
model.fit(known_train, y_train)
Out[33]:
In [34]:
yhat = model.predict(known_test)
print(accuracy_score(y_test, yhat))
In [35]:
drugs_unknown['yhat'] = model.predict(unknown)
print(drugs_unknown.yhat.sum() / drugs_unknown.shape[0])
In [36]:
drugs_unknown.groupby('group')['yhat'].mean().sort_values()
Out[36]:
"Tell me something I don't know. Can I predict drug usage status based on text alone? How well? Then, find the terms that are most indicative of drug usage.
In [37]:
essay_list = ['essay' + str(i) for i in range(10)]
df_0, df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9 = clean_up(
    df, essay_list, min_words=0)
In [38]:
# combine all essays
essays = df_0.essay0.str.cat([df_1.essay1, df_2.essay2, df_3.essay3,
                              df_4.essay4, df_5.essay5, df_6.essay6,
                              df_7.essay7, df_8.essay8, df_9.essay9], sep=' ')
df_0['essays'] = essays
df_0 = recategorize(df_0)
In [39]:
# only keep observations with more than five tokens
all_essays = df_0[df_0.essays.str.split().str.len() > 5]
all_essays.shape, df.shape
Out[39]:
In [40]:
# numerical drug usage labels
all_essays['labels'] = drug_labels(all_essays)
# split on drug usage status
drugs_yes = subset_df(all_essays, 'drugs', ['yes'])
drugs_no = subset_df(all_essays, 'drugs', ['no'])
drugs_unknown = subset_df(all_essays, 'drugs', ['unknown'])
# balanced sample of known drug users
drugs_known_sample = pd.concat([drugs_no.sample(drugs_yes.shape[0],
                                                random_state=42),
                                drugs_yes],
                               ignore_index=True)
In [41]:
# vocabulary for all essays
_, _, all_vocab = feature_vectors(all_essays.essays, specs)
print(len(all_vocab))
In [42]:
# fix the vocabulary so the known/unknown feature matrices share the same columns
pred_all = {'vocabulary' : all_vocab}
_, known, _ = feature_vectors(drugs_known_sample.essays, pred_all)
_, unknown, _ = feature_vectors(drugs_unknown.essays, pred_all)
In [43]:
known_train, known_test, y_train, y_test = train_test_split(
    known, drugs_known_sample['labels'], test_size=0.2, random_state=42)
In [44]:
model = LogisticRegression()
model.fit(known_train, y_train)
Out[44]:
In [45]:
yhat = model.predict(known_test)
print(accuracy_score(y_test, yhat))
In [46]:
drugs_unknown['yhat'] = model.predict(unknown)
In [47]:
print(drugs_unknown.yhat.sum() / drugs_unknown.shape[0])
In [48]:
# map each coefficient value back to its column index in the vocabulary
token_dict = {coeff: i for i, coeff in enumerate(model.coef_[0])}
In [49]:
# sort a copy so model.coef_ itself is left unchanged
coefficients = model.coef_[0].copy()
coefficients.sort()
In [51]:
# 25 terms with the largest coefficients, reported as odds ratios
for i in range(1, 26):
    print(all_vocab[token_dict[coefficients[-i]]] + ' (' +
          str(np.round(np.exp(coefficients[-i]), 2)) + ')', end=', ')
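An equivalent way to pull the top terms without the float-keyed dict is np.argsort (shown only as an alternative, not part of the original analysis):

top_idx = np.argsort(model.coef_[0])[::-1][:25]  # indices of the 25 largest coefficients
for i in top_idx:
    print(all_vocab[i] + ' (' + str(np.round(np.exp(model.coef_[0][i]), 2)) + ')', end=', ')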