OkNLP


In [1]:
import warnings

import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from utils.categorize_demographics import *
from utils.clean_up import clean_up
from utils.distinctive_tokens import log_odds_ratio
from utils.lexical_features import *
from utils.nonnegative_matrix_factorization import nmf_labels, nmf_inspect
from utils.plotting import lollipop_paper
from utils.splits import *
from utils.text_representation import feature_vectors


warnings.filterwarnings('ignore')

%matplotlib inline

Data


In [2]:
# Load the raw profile data.
df = pd.read_csv('data/profiles.20120630.csv')

# Essay columns analyzed in this notebook; `clean_up` returns one frame per
# listed essay (unpacked here as df_0 for essay0 and df_4 for essay4).
essay_list = ['essay0', 'essay4']
df_0, df_4 = clean_up(df, essay_list)

# NOTE(review): `recategorize` is not imported by name; presumably it comes
# from one of the `utils` star imports -- confirm.
df_0 = recategorize(df_0)
df_4 = recategorize(df_4)

Lexical

Part-of-speech tags are based on the universal tagset of Petrov, Das, & McDonald (2012), "A Universal Part-of-Speech Tagset".

ADJ - adjectives
ADP - adpositions (prepositions and postpositions)
ADV - adverbs
CONJ - conjunctions
DET - determiners
NOUN - nouns (common and proper)
NUM - cardinal numbers
PART - particles or other function words
PRON - pronouns
PUNCT - punctuation
VERB - verbs (all tenses and modes)
X - other: foreign words, typos, abbreviations

spaCy-specific information: https://spacy.io/docs#token-postags.


In [3]:
# Part-of-speech features for the essay0 texts, plus a normalized version.
# NOTE(review): presumably `pos` holds per-essay tag counts and `pos_norm`
# per-essay tag proportions -- confirm in utils.lexical_features.
pos = pos_df(df_0.essay0)
pos_norm = pos_normalize(pos)

Essay Length


In [4]:
# Essay length = row-wise sum over the columns of `pos`. Any existing
# `n_tokens` column is excluded first so that re-running this cell does not
# add the previous total into the new one.
pos['n_tokens'] = pos.drop('n_tokens', axis=1, errors='ignore').sum(axis=1)
# Compare `n_tokens` across the levels of `sex` and print the level order.
pos_by_split(df_0, pos, 'sex', ['n_tokens'], print_levels=True)


Levels (in order): ['F' 'M']

n_tokens
[means] 'a': 149.58946578 'b': 138.72696526
p-values:
  [permutation]: 0.0
  [classical]:   0.0

Profanity and Slang


In [5]:
# Check the essay0 texts against the word list in data/profane.txt and store
# the result as a one-column frame.
profane = load_words('data/profane.txt')
profanity = pd.DataFrame(contains(profane, df_0.essay0),
                         columns=['profanity'])
# Column sum divided by the number of rows (the overall rate, assuming the
# column is a 0/1 or boolean indicator -- TODO confirm).
print(profanity.profanity.sum() / profanity.shape[0])
# Compare the `profanity` column across the levels of `sex`.
pos_by_split(df_0, profanity, 'sex', ['profanity'])


0.0600098191019
profanity
[means] 'a': 0.05815862 'b': 0.06125739
p-values:
  [permutation]: 0.1439
  [classical]:   0.1390202


In [6]:
# Keep the word list under its own name so `slang` only ever refers to the
# one-column result frame (mirrors `profane` / `profanity` above).
slang_words = load_words('data/slang.txt')
slang = pd.DataFrame(contains(slang_words, df_0.essay0),
                     columns=['slang'])
# Column sum divided by the number of rows.
print(slang.slang.sum() / slang.shape[0])
# Compare the `slang` column across the levels of `sex`.
pos_by_split(df_0, slang, 'sex', ['slang'])


0.559084557574
slang
[means] 'a': 0.5411097 'b': 0.57119828
p-values:
  [permutation]: 0.0
  [classical]:   0.0

Adjectives, Nouns, and Verbs


In [7]:
# Compare the normalized ADJ, NOUN and VERB columns across the levels of `sex`.
pos_by_split(df_0, pos_norm, 'sex', ['ADJ', 'NOUN', 'VERB'])


ADJ
[means] 'a': 0.10614445 'b': 0.10162557
p-values:
  [permutation]: 0.0
  [classical]:   0.0

NOUN
[means] 'a': 0.18651103 'b': 0.18868403
p-values:
  [permutation]: 0.0
  [classical]:   3.163e-05

VERB
[means] 'a': 0.18277245 'b': 0.18272425
p-values:
  [permutation]: 0.9122
  [classical]:   0.90974631

Tokens


In [8]:
# Split the essay0 frame by `sex` into the 'F' and 'M' subsets.
f = subset_df(df_0, 'sex', ['F'])
m = subset_df(df_0, 'sex', ['M'])

# Tag each subset's essay0 texts; the results feed the `top_terms` cells below.
tagged_f = tag_corpus(f.essay0)
tagged_m = tag_corpus(m.essay0)

Adjectives


In [9]:
# Top 15 ADJ terms per corpus, scored with `diff_prop`.
# NOTE(review): presumably the first printed line is the F-distinctive terms
# and the second the M-distinctive terms -- confirm against `top_terms`.
top_terms(tagged_f, tagged_m, 'ADJ', diff_prop, 15)


my | happy | independent | favorite | sweet | silly | important | passionate | warm | amazing | beautiful | adventurous | creative | loyal | social

that | nice | more | few | other | most | its | cool | interesting | your | easy | good | which | much | last

In [10]:
# Same ADJ comparison, scored with `log_odds_ratio` and limited to 10 terms.
top_terms(tagged_f, tagged_m, 'ADJ', log_odds_ratio, 10)


independent | sweet | my | sassy | silly | happy | warm | favorite | girly | fabulous

nice | cool | its | that | few | interesting | martial | most | masculine | more

Nouns


In [11]:
# Top 10 NOUN terms per corpus, scored with `log_odds_ratio`.
top_terms(tagged_f, tagged_m, 'NOUN', log_odds_ratio, 10)


girl | family | who | yoga | men | gal | heels | love | dancing | friends

guy | computer | engineer | guitar | sports | software | women | video | technology | geek

Verbs


In [12]:
# Top 10 VERB terms per corpus, scored with `log_odds_ratio`.
top_terms(tagged_f, tagged_m, 'VERB', log_odds_ratio, 10)


love | am | laugh | laughing | dancing | adore | loving | dance | appreciate | being

m | was | play | playing | laid | 'll | working | hit | moved | been

Semantic


In [13]:
# Settings passed to `feature_vectors` for both the essay0 and essay4 text
# representations: English stop words, 1- to 3-grams, and a minimum document
# frequency of 0.005.
specs = dict(
    stop_words='english',
    ngram_range=(1, 3),
    min_df=0.005,
)

Essay 0

Text Representation


In [14]:
# Build the essay0 text representation with the shared `specs`.
# NOTE: `counts`, `tfidf` and `vocab` are rebound to the essay4 representation
# later in the notebook, so the essay0 cells below must run before that point.
counts, tfidf, vocab = feature_vectors(df_0.essay0, specs)

In [15]:
# Size of the essay0 vocabulary returned by `feature_vectors`.
len(vocab)


Out[15]:
2058

Clustering


In [16]:
# Number of NMF groups used for essay0; the same K is passed to `nmf_labels`
# in a later cell.
K = 25
# Inspect the groups for k=K, requesting n_words=50 terms per group.
nmf_inspect(tfidf, vocab, k_vals=[K], n_words=50)


25
Group 0:
people | new people | new | meet | meet new | meet new people | meeting | meeting new | meeting new people | looking meet | looking meet new | love meeting | love meeting new | new friends | meet people | like meet | people 'm | love meet | friends | enjoy meeting | interesting | 'm looking meet | city | interesting people | meeting people | interested | want meet | interested meeting | 'm new | people love | experiences | just moved | online | dating | site | new experiences | people like | cool | outgoing | hang | new places | hoping | excited | make new | recently | make new friends | people n't | moved | people make | :)

Group 1:
san | francisco | san francisco | moved san | moved san francisco | city | living san | living san francisco | living | moved | just moved san | live san | live | diego | native | san diego | lived san | grew | lived san francisco | lived | originally | years | recently | year | raised san | home | york | currently | new york | born raised san | recently moved | new | los | university | offer | just moved | exploring | angeles | los angeles | working | chicago | santa | love living | hi | graduated | love city | explore | state | college | work

Group 2:
love | travel | love travel | outdoors | love laugh | love outdoors | love love | cook | laugh | dance | love cook | especially | love going | life love | love life | food | city | love good | people love | :) | love dance | world | love people | animals | favorite | nature | things love | dogs | sports | beach | great | love music | really love | love traveling | love sports | love meet | love food | heart | explore | big | 'd love | person love | love animals | friends love | love family | absolutely | love job | love city | adventure | hike

Group 3:
know | just | want | ask | want know | message | just ask | really | know just | :) | 'll | say | 'm just | n't know | nt | wanna | profile | just want | talk | write | getting | questions | u | send | getting know | know 'm | just moved | know want | feel | thing | lol | feel free | trying | need | like know | tell | meet | send message | really want | come | maybe | hey | n't want | guy | hit | shy | site | free | right | let know

Group 4:
'm | pretty | 'm pretty | 'm really | 'm looking | really | guy | 'm just | say 'm | 'll | think 'm | 'm good | 'm trying | nerd | life 'm | 'm working | bit | 'm bit | 'm going | laid | hi 'm | shy | sure | time 'm | 'm big | big | people 'm | girl | things 'm | 'm interested | 'm happy | fan | 'm kind | usually | 'd | 'm passionate | 'm currently | 'm usually | know 'm | 'm sure | friends 'm | geek | 'm originally | 'm new | 'm doing | 'm open | 'm laid | 'm little | sarcastic | right 'm

Group 5:
new | things | new things | trying | trying new | try | trying new things | try new | try new things | places | learning | new places | learning new | learning new things | exploring | learn | things like | restaurants | explore | exploring new | learn new | new restaurants | things love | things 'm | new york | york | love trying | doing | different | like try | things life | adventures | traveling | travel | 'm trying | explore new | experience | trying new restaurants | experiences | new experiences | looking new | love learning | finding | discovering | adventure | city | foods | things enjoy | willing | finding new

Group 6:
looking | 'm looking | guy | relationship | looking meet | meet | share | woman | nice | just looking | man | partner | guy looking | girl | looking new | single | forward | looking forward | friend | just | term | friends | long term | date | hang | new friends | looking fun | looking friends | special | guys | right | dating | long | wants | looking share | great | 'm looking meet | looking meet new | looking partner | sex | maybe | interests | friendship | smart | term relationship | long term relationship | women | nice guy | crime | chemistry

Group 7:
going | easy | easy going | 'm easy | 'm easy going | guy | going guy | easy going guy | pretty easy | pretty easy going | pretty | laid | love going | 'm going | person | like going | earth | 'm pretty | simple | enjoy going | friendly | drama | likes | going movies | movies | funny | try | concerts | loves | sports | 's going | hanging | gym | girl | playing | shows | staying | nice | beach | home | just | hang | 'm laid | :) | kind | enjoys | chill | positive | honest | relaxed

Group 8:
m | nt | s | guy | u | ve | lol | chill | nice | old | pretty | alot | laid | kinda | n | really | say | hit | girl | years old | hi | wanna | hey | single | lets | :) | haha | type | shy | real | person | outgoing | good time | cool | hang | big | ca | 2 | school | just | good | sports | got | party | currently | cause | message | tattoos | student | smoke

Group 9:
like | 'd like | 'd | things | things like | really like | n't like | feel like | really | stuff | like think | feel | like people | think | people | lot | like going | watch | read | people like | games | like meet | movies | play | like travel | cook | eat | like things | like 'm | just like | like fun | like good | dance | like laugh | drink | like read | stay | like play | talk | bike | nice | life like | little | like cook | hang | time like | doing | sports | going | like stay

Group 10:
moved | sf | years | ago | coast | east | years ago | school | east coast | city | moved sf | just moved | grew | year | college | went | california | originally | recently | just | west | recently moved | living | west coast | nyc | job | months | grad | year ago | moved san | la | moved bay | grad school | spent | moved san francisco | went school | boston | months ago | 'm originally | york | new york | 2 | midwest | lived | came | couple | graduated | south | went college | transplant

Group 11:
n't | ca | ca n't | does | does n't | really | wo | wo n't | n't like | n't know | did | think | n't really | mean | 're | did n't | probably | say | seriously | n't think | mind | people | way | 'll | need | things | n't want | care | right | profile | n't mind | tell | believe | people n't | n't mean | long | thing | hate | actually | n't care | wait | does n't mean | n't seriously | try | n't believe | use | feel | read | stand | day

Group 12:
time | spend | spend time | good time | free | lot | free time | spending | spending time | lot time | good | spend lot | time friends | spend lot time | time time | time 'm | having | working | time like | time love | spent | school | having good | having good time | busy | spare | student | work time | spare time | doing | long time | long | spending time friends | time family | work | job | day | time enjoy | currently | great time | right | days | enjoy spending | enjoy spending time | spent time | thinking | taking | enjoying | quite | possible

Group 13:
good | humor | sense | sense humor | good sense | good sense humor | great | good food | food | great sense | great sense humor | good time | conversation | good conversation | sarcastic | 'm good | love good | dry | enjoy good | good company | company | good friends | good people | appreciate | listener | wine | good listener | like good | intelligent | times | guy | having good | pretty good | really good | man | bad | smart | honest | wit | witty | earth | having | friend | good friend | loyal | heart | important | nice | kind | life good

Group 14:
fun | loving | fun loving | 'm fun | having fun | having | outgoing | guy | girl | adventurous | like fun | looking fun | loves | caring | smart | likes | honest | fun 'm | love fun | spontaneous | fun love | earth | funny | loyal | intelligent | fun like | :) | energetic | lot fun | 's fun | witty | passionate | kind | hang | laid | sweet | ambitious | enjoys | friendly | adventure | creative | woman | compassionate | happy | athletic | romantic | chill | young | nice | active

Group 15:
's | let | let 's | 're | think | 'll | way | self | right | 's just | thing | say | little | best | 'd | profile | think 's | great | day | summary | life 's | maybe | ok | better | long | friend | 's good | people 's | probably | 's 'm | said | favorite | man | self summary | yes | 's pretty | really | 's really | start | write | oh | just | tell | awesome | bit | know 's | makes | place | true | 's hard

Group 16:
bay | area | bay area | moved bay | moved bay area | native | area native | bay area native | grew | living | living bay | grew bay | grew bay area | 'm bay | lived | 'm bay area | east bay | area 'm | lived bay | lived bay area | raised bay area | raised bay | college | east | originally | exploring | recently | years | new | offer | explore | moved | home | born raised bay | transplant | place | live | berkeley | oakland | working | places | year | recently moved | enjoying | currently | moving | boston | work | la | went

Group 17:
work | hard | play | work hard | working | hard play | work hard play | play hard | hard play hard | hard working | progress | work progress | harder | job | try | week | live | love work | company | school | balance | day | work time | currently | industry | busy | weekends | 's hard | work 'm | gym | tech | lot | outside | 'm working | active | career | days | business | social | eat | sleep | travel | guitar | startup | doing | run | professional | healthy | ambitious | stay

Group 18:
born | raised | born raised | raised san | born raised san | raised bay | born raised bay | california | raised bay area | college | school | sf | went | years | family | lived | berkeley | oakland | currently | girl | southern | parents | graduated | went college | bay | northern | spent | came | south | went school | chinese | hawaii | la | sports | cal | old | northern california | american | giants | san | southern california | texas | studied | state | high school | educated | san diego | diego | uc | age

Group 19:
open | minded | open minded | 'm open | creative | honest | relationship | adventurous | curious | passionate | intelligent | mind | kind | heart | strong | interested | independent | woman | nature | compassionate | loyal | self | playful | open mind | free | spontaneous | funny | caring | long | confident | earth | romantic | relationships | man | spirit | book | warm | spiritual | open new | smart | lover | seeking | sensual | generous | thoughtful | experiences | non | loving | affectionate | friendly

Group 20:
enjoy | friends | family | hiking | watching | friends family | outdoors | traveling | hanging | cooking | great | family friends | active | wine | movies | sports | reading | playing | activities | spending | home | going | running | camping | hanging friends | biking | beach | spending time | really enjoy | close | movie | city | dinner | exploring | dancing | important | outdoor | travel | enjoy going | enjoy life | skiing | staying | restaurants | watching movies | eating | taking | trips | swimming | enjoy outdoors | snowboarding

Group 21:
music | art | live | movies | live music | play | food | playing | games | love music | dancing | listening | books | guitar | shows | reading | listen | rock | video | creative | photography | dance | film | cooking | video games | design | watching | making | writing | nature | listening music | old | arts | science | artist | lover | favorite | coffee | sports | jazz | read | passion | interests | watch | beer | museums | history | kinds | nerd | hop

Group 22:
life | live | world | share | living | live life | believe | things life | want | fullest | day | man | best | happy | simple | enjoy life | important | experiences | partner | passionate | love life | enjoying | life fullest | experience | short | life 'm | life 's | woman | passion | adventure | great | positive | way | heart | life love | feel | moment | living life | life short | beautiful | special | family | things | life like | enjoying life | nature | healthy | better | journey | share life

Group 23:
person | people | make | laugh | think | funny | kind | pretty | say | try | things | happy | lot | make laugh | people laugh | believe | honest | smile | best | way | make people | really | little | better | tend | love laugh | make people laugh | making | friends | shy | positive | world | self | talk | bit | makes | important | smart | loyal | feel | consider | creative | care | usually | type | sarcastic | times | caring | interesting | thing

Group 24:
've | lived | years | 've lived | got | 've got | world | lot | traveled | year | bit | spent | past | 'd | places | travel | home | countries | different | worked | 5 | far | told | 've told | europe | 10 | 've traveled | seen | long | learned | 've spent | quite | years 've | old | couple | place | recently | 2 | 3 | feel | california | little | country | 4 | come | 6 | times | 'll | met | way



In [17]:
# Hand-assigned names for the 25 essay0 NMF groups, listed in group-index
# order (position i names "Group i" from the inspection output).
labels = [
    'meet & greet',     # 0
    'the city',         # 1
    'enthusiastic',     # 2
    'straight talk',    # 3
    'about me',         # 4
    'novelty',          # 5
    'seeking',          # 6
    'carefree',         # 7
    'casual',           # 8
    'enjoy',            # 9
    'transplant',       # 10
    'nots',             # 11
    'moments',          # 12
    'personality',      # 13
    'amusing',          # 14
    'review',           # 15
    'region',           # 16
    'career-focused',   # 17
    'locals',           # 18
    'unconstrained',    # 19
    'active',           # 20
    'creative',         # 21
    'carpe diem',       # 22
    'cheerful',         # 23
    'jet setter',       # 24
]

In [18]:
# Store an NMF group assignment (k=K) for each essay0 profile.
# NOTE(review): this relies on `tfidf` still being the essay0 matrix, and on
# `nmf_labels` producing the same group numbering that `nmf_inspect` printed
# (the hand-written `labels` are matched by index) -- confirm both.
df_0['group'] = nmf_labels(tfidf, k=K)

In [19]:
# Plot the essay0 groups by `sex`, using the hand-written topic labels.
demog = 'sex'
# Keep only the 'F' and 'M' levels.
subset = subset_df(df_0, demog, ['F', 'M'])
# NOTE(review): presumably the percentage of each level falling in each
# group -- confirm against `group_pct`.
grouped = group_pct(subset, demog)
lollipop_paper(grouped, demog, topic_labels=labels)


Essay 4

Text Representation


In [20]:
# Build the essay4 text representation with the shared `specs`.
# NOTE: this rebinds `counts`, `tfidf` and `vocab`, replacing the essay0
# versions; every later cell sees the essay4 objects.
counts, tfidf, vocab = feature_vectors(df_4.essay4, specs)

In [21]:
# Size of the essay4 vocabulary returned by `feature_vectors`.
len(vocab)


Out[21]:
2898

Clustering


In [22]:
# Number of NMF groups used for essay4 (same value as for essay0).
K = 25
# Inspect the groups for k=K, requesting n_words=50 terms per group.
nmf_inspect(tfidf, vocab, k_vals=[K], n_words=50)


25
Group 0:
like | music like | movies like | really | really like | stuff | food like | things | like music | books like | like movies | n't like | like food | movies | lot | stuff like | like lot | like read | kind | comedies | like good | music | good | shows like | things like | like books | think | like reading | make | different | like watch | especially | old | like kinds | feel | like eat | like types | just | people | bands | pretty | feel like | funny | probably | like cook | eat | ones | nt | lots | metal

Group 1:
men | mad | mad men | arrested | development | arrested development | bad | breaking | breaking bad | rock | 30 | 30 rock | tv | parks | sunny | wire | dexter | shows | office | curb | sunny philadelphia | 's sunny | philadelphia | rec | parks rec | community | curb enthusiasm | enthusiasm | recreation | parks recreation | 's sunny philadelphia | daily | seinfeld | tv shows | archer | shows arrested | weeds | shows arrested development | colbert | louie | lost | classic | sopranos | classic rock | walking | walking dead | blood | report | friday night | night lights

Group 2:
love | food love | love food | love music | music love | love movies | love love | movies love | cook | love good | eat | food | love cook | love read | love eat | good | books love | live | movies | especially | just | love reading | music | really love | love sushi | dance | love books | going | love going | great | love types | horror | actually | food love food | love actually | try | love watching | cooking | comedies | just love | love live | love kinds | romantic | pray | wine | make | old | life | eat pray | pray love

Group 3:
favorite | favorite food | favorite movies | favorite books | favorite music | favorite movie | favorite book | favorite shows | favorite tv | time favorite | time | favorite foods | music favorite | include | favorite tv shows | favorite band | foods | food favorite | movies favorite | favorite authors | band | probably | books favorite | authors | author | favorite things | tv shows | favorite bands | movies | say | bands | shows | artists | genre | listen | hard | far | moment | tv | current | really | r&b | recent | pick | genres | hands | artist | series | color | second

Group 4:
sci | fi | sci fi | action | comedy | horror | fantasy | movies | drama | comedies | fi fantasy | sci fi fantasy | romantic | rock | action movies | films | classic | adventure | thrillers | metal | flicks | romance | romantic comedies | mystery | horror movies | novels | stuff | series | anime | suspense | chick | good | classic rock | dramas | fan | punk | genres | books | chick flicks | documentaries | pop | foreign | historical | occasional | history | classics | comic | movies like | movie | movies comedies

Group 5:
hop | hip | hip hop | rock | r&b | jazz | reggae | rap | pop | country | music hip | music hip hop | classic | old | hip hop r&b | hop r&b | soul | alternative | classic rock | music | old school | school | classical | indie | metal | funk | listen | blues | house | punk | electronic | oldies | dance | dubstep | techno | trance | folk | 90 | 90 's | latin | indie rock | salsa | comedy | alternative rock | electro | electronica | 80 | 90s | 80 's | action

Group 6:
'm | fan | 've | reading | pretty | 'll | food 'm | right | 'm big | really | 'm reading | big | currently | 'm pretty | music 'm | just | open | huge | good | big fan | 'm really | picky | listening | right 'm | going | 'm currently | 'm open | time | books 'm | 'm huge | movies 'm | stuff | probably | 'm big fan | comes | 'm picky | sucker | 'd | try | 'm sucker | say | eat | sure | lately | favorites | 'm fan | currently reading | bit | usually | think

Group 7:
bang | big bang | bang theory | theory | big bang theory | big | mother | met | met mother | shows | fan | big fan | tv | big lebowski | lebowski | friends | tv shows | house | half | office | kiss | 'm big | community | criminal minds | walking dead | minds | walking | criminal | ncis | new girl | bones | scrubs | men | dead | 500 days summer | 500 days | days summer | big fish | psych | dexter | 500 | inception | man | seinfeld | try | knight | csi | sherlock | movies big | burn notice

Group 8:
italian | thai | mexican | food | indian | chinese | japanese | sushi | french | vietnamese | mexican food | korean | thai indian | food italian | mexican italian | food thai | food mexican | thai food | italian mexican | indian thai | italian food | food sushi | american | food indian | indian food | ethiopian | mexican thai | mediterranean | thai italian | greek | bbq | thai mexican | indian mexican | asian | sushi thai | pizza | italian thai | thai chinese | chinese food | eastern | thai japanese | foods | middle eastern | thai vietnamese | seafood | chinese japanese | indian italian | jazz | sushi mexican | middle

Group 9:
n't | ca | ca n't | watch | really | tv | n't watch | watch tv | know | n't really | does | n't like | does n't | think | eat | n't watch tv | just | n't know | did | say | n't read | wo | wo n't | n't think | want | n't eat | did n't | movies n't | people | make | food n't | wait | 'll | time | shows n't | n't tv | favorites | tv n't | listen | really watch | 're | care | music n't | just n't | probably | thing | netflix | 'd | anymore | seen

Group 10:
harry | potter | harry potter | series | potter series | harry potter series | games | hunger games | hunger | books harry | books harry potter | girl | books | twilight | dragon | girl dragon | tattoo | dragon tattoo | girl dragon tattoo | glee | met | pride | prejudice | pride prejudice | harry met | sally | met sally | harry met sally | disney | vampire | lady | kill | trilogy | girls | gaga | kill mockingbird | mockingbird | lady gaga | love actually | princess | movies | friends | dark | lotr | grey | pop | anatomy | met mother | mother | adele

Group 11:
books | movies | food | music | shows | music food | books movies | good | movies music | movies shows | country | yes | shows music | dance | action | lots | kind | horror | favorite books | food love | comedy | house | pop | classical | rap | tv | ones | pizza | good food | spicy | type | taste | healthy | funny | asian | art | mood | music country | food food | food good | :) | food yes | action movies | movies good | horror movies | country food | jazz | live | eclectic | alchemist

Group 12:
sunshine | mind | eternal | eternal sunshine | spotless | spotless mind | sunshine spotless | sunshine spotless mind | eternal sunshine spotless | miss sunshine | little miss sunshine | little miss | amelie | miss | little | garden state | state | garden | lost | life | beautiful | translation | lost translation | beauty | darko | donnie | donnie darko | 500 | days summer | summer | 500 days | 500 days summer | radiohead | american beauty | royal | beautiful mind | memento | tenenbaums | royal tenenbaums | requiem | american | requiem dream | dream | movies | days | sleep | city | adaptation | sky | things

Group 13:
's | 80 | 80 's | let | good | let 's | 90 | 90 's | life | 've | just | 70 | world | 70 's | 's good | time | man | guide | great | know | 60 | day | think | ender | 's just | 's game | ender 's | 's guide | 60 's | 's music | ender 's game | stuff | story | night | people | hard | food 's | got | 're | hitchhiker | hitchhiker 's | grey | yes | little | oh | pan | hitchhiker 's guide | way | galaxy | king

Group 14:
read | lot | read lot | 've | time | watch | listen | recently | books | read books | just | n't read | books read | 've read | lately | reading | love read | like read | recently read | eat | time read | usually | watch lot | nt | loved | favorites | book read | just read | tend | used | work | enjoyed | listen lot | watched | liked | stuff | times | really | best | read book | school | listening | fun | 'll | finished | magazines | books 've | pretty | lot movies | movies

Group 15:
list | just | 'll | long | ask | way | goes | list goes | things | try | favorites | far | know | time | talk | say | 'll try | tell | short | think | 'll just | conversation | later | come | forever | good | right | oh | person | let | category | meet | 're | start | seriously | probably | 'll eat | food 'll | taste | make | want | question | eclectic | :) | just finished | tastes | feel | going | 'd | finished

Group 16:
david | black | john | tom | radiohead | bob | life | yeah | brothers | man | beatles | keys | james | dylan | black keys | bowie | dead | young | la | band | smith | new | white | music | bob dylan | michael | boys | johnny | david bowie | arcade | cash | neil | old | vonnegut | waits | stones | little | sedaris | bon | tom waits | paul | cat | velvet | ray | death | davis | pink | iver | bon iver | johnny cash

Group 17:
star | lord | wars | rings | lord rings | star wars | trek | star trek | trilogy | series | matrix | princess | rings trilogy | lord rings trilogy | dark | bride | indiana | indiana jones | firefly | princess bride | movies | original | jones | park | battlestar | galactica | 2 | dead | battlestar galactica | knight | avatar | doctor | man | dark knight | galaxy | lotr | v | guide | lost | batman | 1 | books | world | war | 3 | scott | x | guide galaxy | monty | python

Group 18:
family | modern | modern family | guy | family guy | shows | tv | office | park | south | south park | mother | met mother | met | tv shows | girl | glee | simpsons | dad | house | american dad | law | law order | dexter | american | new girl | 30 | parks | friends | order | daily | watch | 30 rock | anatomy | colbert | hangover | grey | report | true | channel | true blood | colbert report | grey 's | entourage | 's anatomy | housewives | community | grey 's anatomy | blood | dragon

Group 19:
club | fight | fight club | shawshank | redemption | shawshank redemption | pulp fiction | pulp | fear | red | hot | loathing | fear loathing | peppers | red hot | vegas | chili | las | las vegas | chili peppers | american | hot chili | red hot chili | hot chili peppers | loathing las | fear loathing las | loathing las vegas | movies | dead | rye | catcher | catcher rye | park | lebowski | big lebowski | old | kill | big | suspects | usual | day | usual suspects | dark | movies shawshank | movies shawshank redemption | knight | american beauty | office | 2 | hangover

Group 20:
kinds | kinds music | love kinds | kinds food | like kinds | music | kinds movies | music kinds | food | listen | different | country | foods | comedy | music food | music country | especially | action | rap | mood | lots | movies | music especially | watch | music love | open | food especially | metal | depends | music listen | long | depends mood | heavy metal | drama | movies like | rock | nt | love | comedies | heavy | kind | picky | try | usually | music really | reader | music like | pop | country music | biographies

Group 21:
book | movie | favorite book | favorite movie | food | music | good | fav | book read | reading | great | best | read book | just | saw | finished | called | currently | m | right | time | buff | life | day | pick | seen | pizza | loved | reader | currently reading | just finished | sushi | comic | story | type | week | current | recent | watched | person | kind | :) | series | novel | night | alchemist | huge | food sushi | nt | better

Group 22:
enjoy | new | types | trying | reading | trying new | things | foods | types music | good | films | try | music | live | really enjoy | new things | different | restaurants | really | cooking | jazz | types food | favorites | enjoy reading | especially | try new | classical | watching | foreign | rock | going | variety | world | open | love trying | love trying new | live music | great | new restaurants | enjoy good | trying new things | time | listening | currently | love types | music enjoy | movies enjoy | new foods | include | wide

Group 23:
game | thrones | game thrones | 's game | ender | ender 's | ender 's game | walking | walking dead | blood | dead | games | true blood | series | true | currently | hunger | hunger games | dexter | song ice | ice | shows | song | boardwalk | currently reading | empire | boardwalk empire | breaking | breaking bad | tv | firefly | reading | wire | books | bad | battlestar | hbo | archer | galactica | battlestar galactica | fringe | community | sherlock | californication | dune | lost | shameless | girl | sons | new

Group 24:
fiction | non | non fiction | science | science fiction | fiction books | pulp fiction | pulp | read non | read non fiction | historical | non fiction books | history | historical fiction | films | books | documentaries | biographies | classical | jazz | foreign | read fiction | tend | reading | novels | prefer | especially | fantasy | philosophy | rock | usually | world | indie | folk | good | lots | stuff | foreign films | political | politics | classic | generally | independent | art | blues | live | psychology | electronic | tv | occasional



In [23]:
# Hand-assigned names for the 25 essay4 NMF groups, listed in group-index
# order (position i names "Group i" from the inspection output). This rebinds
# `labels`, replacing the essay0 names.
labels = [
    'like',             # 0
    'TV-hits',          # 1
    'enthusiastic',     # 2
    'favorite-0',       # 3
    'genres-movies',    # 4
    'genres-music',     # 5
    'misc-0',           # 6
    'TV-comedies-0',    # 7
    'genres-food',      # 8
    'nots',             # 9
    'teen',             # 10
    'everything',       # 11
    'movies-drama-0',   # 12
    'time periods',     # 13
    'avid',             # 14
    'miscellaneous',    # 15
    'music-rock',       # 16
    'movies-sci-fi',    # 17
    'TV-comedies-1',    # 18
    'movies-drama-1',   # 19
    'kinds',            # 20
    'favorite-1',       # 21
    'novelty',          # 22
    'TV-drama',         # 23
    'genres-books',     # 24
]

In [24]:
# Store an NMF group assignment (k=K) for each essay4 profile.
# NOTE(review): as for essay0, this assumes `nmf_labels` reproduces the group
# numbering printed by `nmf_inspect`, since `labels` is matched by index --
# confirm.
df_4['group'] = nmf_labels(tfidf, k=K)

In [25]:
# Plot the essay4 groups by `sex`, using the essay4 topic labels.
demog = 'sex'
# NOTE(review): the levels are listed as ['M', 'F'] here but ['F', 'M'] in the
# essay0 plot; check whether the order affects colors or legend order.
subset = subset_df(df_4, demog, ['M', 'F'])
grouped = group_pct(subset, demog)
lollipop_paper(grouped, demog, topic_labels=labels)


Superordinate


In [26]:
# Select the profiles in groups 10, 12, 17 and 19, which the essay4 labels
# name 'teen', 'movies-drama-0', 'movies-sci-fi' and 'movies-drama-1'.
mask = df_4['group'].isin([10, 12, 17, 19])
# Row-filter the essay4 count matrix with the same boolean mask.
movies = counts[np.array(mask), :]
# NOTE(review): presumably aggregates the counts per `sex` class for an
# 'M' vs 'F' comparison -- confirm against `counts_by_class`.
movies = counts_by_class(movies, df_4[mask], 'sex',
                         one_vs_one=True, vals=['M', 'F'])
log_odds = log_odds_ratio(movies, vocab, use_variance=True)

In [27]:
# Print 15 terms per side from the log-odds comparison.
# NOTE(review): presumably the first line is the 'M' side and the second the
# 'F' side, following vals=['M', 'F'] -- confirm against `print_terms`.
print_terms(log_odds, 15)


star | star wars | wars | matrix | fight | park | fight club | man | dark knight | knight | pulp | godfather | trek | star trek | rings

love | girl | harry | potter | pride | prejudice | harry potter | pride prejudice | bones | girls | anatomy | jane | hunger games | grey 's | 's anatomy

In [28]:
# Plot the essay4 groups by `drugs`, keeping the 'yes', 'no' and 'unknown'
# levels and passing one color per level.
demog = 'drugs'
subset = subset_df(df_4, demog, ['yes', 'no', 'unknown'])
grouped = group_pct(subset, demog)
lollipop_paper(grouped, demog, colors=['Black', 'LightGray', 'Red'], topic_labels=labels)


Predictions


In [29]:
def drug_labels(df):
    """Encode the `drugs` column of `df` as binary labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `drugs` column holding 'yes', 'no' or 'unknown'.

    Returns
    -------
    list of int
        One label per row, in row order: 1 for 'yes', 0 for 'no' and
        'unknown'.

    Raises
    ------
    ValueError
        If `drugs` contains any other value (including missing values).
        Skipping such rows would return a list shorter than `df`, so the
        labels would no longer line up with the rows.
    """
    codes = {'no': 0, 'unknown': 0, 'yes': 1}

    # Vectorised lookup; values without an entry in `codes` become NaN.
    encoded = df['drugs'].map(codes)

    unmapped = encoded.isnull()
    if unmapped.any():
        # Positional (array) indexing so a duplicated index cannot interfere.
        offenders = sorted(set(repr(value)
                               for value in df['drugs'].values[unmapped.values]))
        raise ValueError("unexpected value(s) in 'drugs': " + ', '.join(offenders))

    return encoded.astype(int).tolist()

Essay 4


In [30]:
# Binary target column: 0 for 'no'/'unknown', 1 for 'yes' (see drug_labels).
df_4['labels'] = drug_labels(df_4)

# One frame per reported drug-usage status.
drugs_yes, drugs_no, drugs_unknown = (
    subset_df(df_4, 'drugs', [status]) for status in ('yes', 'no', 'unknown')
)

# Balanced training pool: draw as many 'no' profiles as there are 'yes'
# profiles (fixed seed), then stack them on top of the 'yes' profiles.
drugs_known_sample = pd.concat(
    [drugs_no.sample(n=len(drugs_yes), random_state=42), drugs_yes],
    ignore_index=True,
)

In [31]:
# Pass the existing `vocab` (built in an earlier cell) as the vocabulary spec,
# presumably so both matrices below share the same columns — confirm in
# utils.text_representation.
pred_4 = {'vocabulary' : vocab}
# feature_vectors returns three items; only the middle one is kept here and is
# used below as the model's feature matrix.
_, known, _ = feature_vectors(drugs_known_sample.essay4, pred_4)
_, unknown, _ = feature_vectors(drugs_unknown.essay4, pred_4)

In [32]:
# Hold out 20% of the balanced sample for evaluation; random_state=42 fixes the split.
known_train, known_test, y_train, y_test = train_test_split(known,
                                                            drugs_known_sample['labels'],
                                                            test_size=0.2, random_state=42)

In [33]:
# Logistic regression with default settings, fit on the training split.
# The fit call is the cell's last expression, so the estimator's repr
# (including the defaults used) is displayed as the cell output.
model = LogisticRegression()
model.fit(known_train, y_train)


Out[33]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [34]:
# Accuracy of the essay4 model on the held-out test split.
yhat = model.predict(known_test)
print(accuracy_score(y_test, yhat))


0.680029154519

In [35]:
# Predict drug usage for the profiles whose status is 'unknown'.
# `assign` builds a new frame instead of writing a column into the result of
# subset_df, which may be a filtered slice of df_4 — a chained-assignment
# hazard that the notebook-wide warnings filter would hide. Re-running the
# cell is also harmless this way.
drugs_unknown = drugs_unknown.assign(yhat=model.predict(unknown))

# Share of unknown-status profiles predicted as drug users (label 1).
print(drugs_unknown.yhat.sum() / drugs_unknown.shape[0])


0.552282893071

In [36]:
# Mean predicted label (the share predicted as 1) per topic group among the
# unknown-status profiles, sorted from lowest to highest.
drugs_unknown.groupby('group')['yhat'].mean().sort_values()


Out[36]:
group
20    0.243902
7     0.245370
18    0.295720
8     0.312500
10    0.313008
22    0.352159
2     0.388422
3     0.436252
11    0.461153
17    0.482759
21    0.500000
24    0.501618
14    0.506849
4     0.517361
13    0.577193
23    0.585821
0     0.593537
6     0.598165
5     0.621931
9     0.626126
1     0.631418
15    0.649616
12    0.660633
19    0.661376
16    0.840190
Name: yhat, dtype: float64

All Essays

"Tell me something I don't know." Can I predict drug usage status based on text alone? How well? Then, find the terms that are most indicative of drug usage.


In [37]:
# Column names essay0 ... essay9.
essay_list = ['essay{}'.format(number) for number in range(10)]

# One cleaned frame per essay column, with no minimum word count enforced.
# Note: this rebinds df_0 and df_4, replacing the frames used in the sections above.
(df_0, df_1, df_2, df_3, df_4,
 df_5, df_6, df_7, df_8, df_9) = clean_up(df, essay_list, min_words=0)

In [38]:
# combine all essays
# Element-wise join of the ten essay columns into one string per profile,
# separated by single spaces.
# NOTE(review): Series.str.cat yields NaN for a row when any piece is missing
# unless na_rep is given — confirm clean_up(min_words=0) leaves no NaN essays
# and that all ten frames share the same row order.
essays = df_0.essay0.str.cat([df_1.essay1, df_2.essay2, df_3.essay3,
                              df_4.essay4, df_5.essay5, df_6.essay6,
                              df_7.essay7, df_8.essay8, df_9.essay9], sep=' ')
df_0['essays'] = essays

# Only df_0 is recategorized; it carries the demographics used from here on.
df_0 = recategorize(df_0)

In [39]:
# only keep observations with more than five tokens
# .copy() gives all_essays its own data, so the later
# `all_essays['labels'] = ...` assignment does not write into a filtered slice
# of df_0 (the resulting SettingWithCopyWarning is silenced by the
# notebook-wide warnings filter).
all_essays = df_0[df_0.essays.str.split().str.len() > 5].copy()

# Rows/columns kept versus the raw data.
all_essays.shape, df.shape


Out[39]:
((57490, 34), (59946, 32))

In [40]:
# Binary target column: 0 for 'no'/'unknown', 1 for 'yes' (see drug_labels).
all_essays['labels'] = drug_labels(all_essays)

# One frame per reported drug-usage status.
drugs_yes, drugs_no, drugs_unknown = (
    subset_df(all_essays, 'drugs', [status]) for status in ('yes', 'no', 'unknown')
)

# Balanced training pool: draw as many 'no' profiles as there are 'yes'
# profiles (fixed seed), then stack them on top of the 'yes' profiles.
drugs_known_sample = pd.concat(
    [drugs_no.sample(n=len(drugs_yes), random_state=42), drugs_yes],
    ignore_index=True,
)

In [41]:
# vocabulary for all essays
# Only the third item returned by feature_vectors (the vocabulary) is kept;
# `specs` is defined in an earlier cell.
_, _, all_vocab = feature_vectors(all_essays.essays, specs)
# Vocabulary size.
print(len(all_vocab))


6835

In [42]:
# Pass the all-essays vocabulary as the vocabulary spec, presumably so both
# matrices below share the same columns — confirm in utils.text_representation.
pred_all = {'vocabulary' : all_vocab}
# Keep only the middle item returned by feature_vectors (the feature matrix).
# These rebind `known`/`unknown` from the essay4 section.
_, known, _ = feature_vectors(drugs_known_sample.essays, pred_all)
_, unknown, _ = feature_vectors(drugs_unknown.essays, pred_all)

In [43]:
# Hold out 20% of the balanced all-essays sample; random_state=42 fixes the split.
known_train, known_test, y_train, y_test = train_test_split(known,
                                                            drugs_known_sample['labels'],
                                                            test_size=0.2, random_state=42)

In [44]:
# Fresh default logistic regression fit on the all-essays training split;
# this rebinds `model` from the essay4 section. The fit call is the cell's
# last expression, so the estimator's repr is displayed as the cell output.
model = LogisticRegression()
model.fit(known_train, y_train)


Out[44]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [45]:
# Accuracy of the all-essays model on the held-out test split.
yhat = model.predict(known_test)
print(accuracy_score(y_test, yhat))


0.727417302799

In [46]:
# Predict drug usage for the unknown-status profiles. `assign` builds a new
# frame instead of writing a column into the result of subset_df, which may be
# a filtered slice of all_essays — a chained-assignment hazard that the
# notebook-wide warnings filter would hide.
drugs_unknown = drugs_unknown.assign(yhat=model.predict(unknown))

In [47]:
# Share of unknown-status profiles predicted as drug users (label 1).
print(drugs_unknown.yhat.sum() / drugs_unknown.shape[0])


0.524143472861

Most Predictive Terms


In [48]:
# Reverse lookup from a coefficient's value to its column position in the model.
# NOTE(review): coefficients with identical values share a single key, so only
# the last such position survives.
token_dict = dict((weight, column) for column, weight in enumerate(model.coef_[0]))

In [49]:
# Sort a copy of the coefficients in ascending order. model.coef_[0] is a view
# into the fitted model's weight array, so sorting it in place would reorder
# the model's own weights: later predictions would be wrong, and a rebuilt
# value-to-column lookup would point at the wrong terms.
coefficients = model.coef_[0].copy()
coefficients.sort()

In [50]:
import numpy as np

In [51]:
# Print the 25 terms with the largest coefficients, biggest first. Each term is
# followed by exp(coefficient), rounded to two decimals, in parentheses.
for rank in range(1, 26):
    weight = coefficients[-rank]
    term = all_vocab[token_dict[weight]]
    odds = str(np.round(np.exp(weight), 2))
    print(term + ' (' + odds + ')', end=', ')


sex (68.96), shit (45.51), music (20.95), weed (18.46), party (15.54), beer (14.18), dubstep (13.86), fuck (12.28), drinking (11.48), smoking (11.39), partying (10.59), chill (9.45), hair (8.84), park (8.09), fucking (7.93), dj (7.9), burning (7.78), electronic (7.05), drunk (6.67), ass (6.36), reggae (6.18), robbins (5.81), dude (5.74), smoke (5.68), cat (5.5), 

In [ ]: