This notebook sets up the workflow for the various functions we have implemented. It shows an example of how we clustered using Non-negative Matrix Factorization (NMF). We manually inspect the output of NMF to determine the best number of clusters for each group.


In [1]:
import pickle
import warnings

from utils.hash import make
from utils.calculate_pmi_features import *
from utils.clean_up import *
from utils.categorize_demographics import *
from utils.reduce_dimensions import run_kmeans
from utils.nonnegative_matrix_factorization import nmf_inspect, nmf_labels
# Suppress every warning for the rest of the session: no category or module
# filter is passed, so the 'ignore' action applies to all of them.
warnings.filterwarnings('ignore')

Getting the data, cleaning it, and categorizing demographic data


In [2]:
# Load the raw dataset into `df`. get_data is not imported by name, so it
# presumably comes from one of the star-imported utils modules -- confirm which.
df = get_data()

In [3]:
# Essay columns passed to clean_up together with the raw frame.
# NOTE(review): only 'essay0' is used by the matrix-building cells further down.
essay_list = ['essay0','essay4','essay5']
df_clean = clean_up(df, essay_list)

In [4]:
# Replace missing values with empty strings. Rebinding the name to the frame
# returned by fillna avoids mutating the object returned by clean_up in place.
df_clean = df_clean.fillna('')

In [5]:
# List the column names of the raw frame `df` (this is not `df_clean`).
df.columns.values


Out[5]:
array(['username', 'age', 'body_type', 'diet', 'drinks', 'drugs',
       'education', 'essay0', 'essay1', 'essay2', 'essay3', 'essay4',
       'essay5', 'essay6', 'essay7', 'essay8', 'essay9', 'ethnicity',
       'height', 'income', 'job', 'last_online', 'location', 'offspring',
       'orientation', 'pets', 'religion', 'sex', 'sign', 'smokes',
       'speaks', 'status', 'TotalEssays'], dtype=object)

In [6]:
# Pair each demographic column with the helper used to categorize its values.
# The pairs are processed in the order listed, one column at a time.
demographic_categorizers = [
    ('religion', religion_categories),
    ('job', job_categories),
    ('drugs', drug_categories),
    ('diet', diet_categories),
    ('body_type', body_categories),
    ('drinks', drink_categories),
    ('sign', sign_categories),
    ('ethnicity', ethnicity_categories),
    ('pets', pets_categories),
    ('speaks', language_categories),
]
for demographic_column, categorizer_fn in demographic_categorizers:
    # Overwrite the column with its categorized version.
    df_clean[demographic_column] = df_clean[demographic_column].apply(categorizer_fn)

Split the dataframe by gender (the `sex` column) and run the clustering separately on each subset.


In [7]:
# Keep only the rows whose 'sex' value is 'm'.
df_male = df_clean.loc[df_clean['sex'] == 'm']

In [8]:
# Keep only the rows whose 'sex' value is 'f'.
df_female = df_clean.loc[df_clean['sex'] == 'f']

In [9]:
# Build a count matrix, a TF-IDF matrix and a vocabulary (going by the unpacked
# names) from the 'essay0' column of the male subset.
count_matrix_m, tfidf_matrix_m, vocab_m = col_to_data_matrix(df_male, 'essay0') # "save out": presumably a reminder to persist these results -- confirm

In [11]:
# Build a count matrix, a TF-IDF matrix and a vocabulary (going by the unpacked
# names) from the 'essay0' column of the female subset.
count_matrix_f, tfidf_matrix_f, vocab_f = col_to_data_matrix(df_female, 'essay0')

In [10]:
# Display the male vocabulary list for inspection.
# NOTE(review): the displayed entries include punctuation-only tokens, and the
# full list is long; consider slicing (e.g. vocab_m[:20]) to keep the output short.
vocab_m


Out[10]:
['!',
 '! !',
 '! ! !',
 '! )',
 '! ) .',
 "! i'm",
 '! love',
 '"',
 '" "',
 '" ,',
 '" -',
 '" .',
 '%',
 '&',
 "'",
 "' s",
 '(',
 '( )',
 "( i'm",
 ')',
 ') ,',
 ') .',
 ") . i'm",
 ") i'm",
 '*',
 '* *',
 '+',
 ',',
 ', "',
 ', (',
 ', ,',
 ', .',
 ', ...',
 ', adventurous',
 ', art',
 ', art ,',
 ', believe',
 ', biking',
 ', biking ,',
 ', camping',
 ', camping ,',
 ", can't",
 ', caring',
 ', cooking',
 ', cooking ,',
 ', creative',
 ', creative ,',
 ', dancing',
 ', doing',
 ", don't",
 ', easy',
 ', eating',
 ', enjoy',
 ', especially',
 ', exploring',
 ', family',
 ', feel',
 ', food',
 ', friendly',
 ', friends',
 ', fun',
 ', fun ,',
 ', funny',
 ', funny ,',
 ', getting',
 ', going',
 ', good',
 ', great',
 ', hanging',
 ', happy',
 ', having',
 ', hiking',
 ', hiking ,',
 ', honest',
 ', honest ,',
 ", i'd",
 ", i'll",
 ", i'm",
 ", i'm looking",
 ", i've",
 ', intelligent',
 ', intelligent ,',
 ", it's",
 ', just',
 ', kind',
 ', know',
 ', learning',
 ', life',
 ', like',
 ', little',
 ', live',
 ', lived',
 ', living',
 ', long',
 ', looking',
 ', love',
 ', loving',
 ', loyal',
 ', make',
 ', making',
 ', maybe',
 ', moved',
 ', movies',
 ', movies ,',
 ', music',
 ', music ,',
 ', need',
 ', new',
 ', nice',
 ', open',
 ', outgoing',
 ', passionate',
 ', people',
 ', photography',
 ', play',
 ', playing',
 ', pretty',
 ', probably',
 ', read',
 ', reading',
 ', reading ,',
 ', really',
 ', right',
 ', rock',
 ', running',
 ', sarcastic',
 ', say',
 ', smart',
 ', smart ,',
 ', snowboarding',
 ', spontaneous',
 ', swimming',
 ', taking',
 ', tend',
 ", that's",
 ', things',
 ', think',
 ', time',
 ', travel',
 ', travel ,',
 ', traveling',
 ', traveling ,',
 ', try',
 ', trying',
 ', usually',
 ', want',
 ', watching',
 ', went',
 ', work',
 ', working',
 ", you're",
 '-',
 '- -',
 "- i'm",
 '.',
 '. "',
 '. (',
 '. )',
 '. ,',
 ". , i'm",
 '. -',
 '. .',
 '. ?',
 '. believe',
 '. best',
 '. big',
 '. born',
 ". can't",
 '. com',
 '. com /',
 '. consider',
 '. currently',
 ". don't",
 '. enjoy',
 '. family',
 '. favorite',
 '. feel',
 '. friends',
 '. fun',
 '. good',
 '. got',
 '. great',
 '. grew',
 '. guess',
 '. hope',
 ". i'd",
 ". i'd like",
 ". i'll",
 ". i'm",
 ". i'm big",
 ". i'm looking",
 ". i'm pretty",
 ". i'm really",
 ". i've",
 ". i've lived",
 '. im',
 '. interested',
 ". it's",
 '. just',
 '. kind',
 '. know',
 ". let's",
 '. life',
 '. like',
 '. live',
 '. lived',
 '. looking',
 '. lot',
 '. love',
 '. love travel',
 '. make',
 '. maybe',
 '. moved',
 '. music',
 '. need',
 '. new',
 '. oh',
 '. oh ,',
 '. open',
 '. people',
 '. play',
 '. prefer',
 '. pretty',
 '. probably',
 '. read',
 '. really',
 '. recently',
 '. right',
 '. said',
 '. said ,',
 '. say',
 '. spend',
 '. spent',
 '. tend',
 ". that's",
 ". there's",
 '. things',
 '. think',
 '. time',
 '. travel',
 '. try',
 '. used',
 '. usually',
 '. want',
 '. went',
 '. work',
 '. yes',
 ". you're",
 '..',
 '...',
 "... i'm",
 '....',
 '.....',
 '/',
 '1',
 '10',
 '10 years',
 '12',
 '2',
 '20',
 '3',
 '30',
 '4',
 '5',
 '5 years',
 '6',
 '7',
 '8',
 ':',
 ': "',
 ": i'm",
 ':)',
 ':/',
 ':/ /',
 ';',
 ';)',
 '?',
 '? )',
 "? i'm",
 'able',
 'abroad',
 'absolutely',
 'act',
 'active',
 'active ,',
 'activities',
 'activity',
 'actually',
 'add',
 'admit',
 'adventure',
 'adventure .',
 'adventures',
 'adventures .',
 'adventurous',
 'adventurous ,',
 'affectionate',
 'afraid',
 'age',
 'ago',
 'ago .',
 'amazing',
 'ambitious',
 'america',
 'american',
 'animals',
 'answer',
 'appreciate',
 'area',
 'area ,',
 'area .',
 "aren't",
 'art',
 'art ,',
 'artist',
 'artistic',
 'arts',
 'ask',
 'ask .',
 'ass',
 'athletic',
 'attention',
 'attitude',
 'attracted',
 'attractive',
 'average',
 'away',
 'awesome',
 'awesome .',
 'awkward',
 'backpacking',
 'bad',
 'balance',
 'band',
 'bar',
 'bars',
 'baseball',
 'basically',
 'basketball',
 'bay',
 'bay .',
 'bay area',
 'bay area ,',
 'bay area .',
 'beach',
 'beach ,',
 'beautiful',
 'beauty',
 'beer',
 'believe',
 'berkeley',
 'best',
 'best .',
 'better',
 'better .',
 'big',
 'bike',
 'biking',
 'biking ,',
 'bit',
 'black',
 'board',
 'board games',
 'body',
 'book',
 'books',
 'books ,',
 'bored',
 'boring',
 'born',
 'born raised',
 'boston',
 'box',
 'boy',
 'break',
 'bring',
 'build',
 'building',
 'bunch',
 'business',
 'busy',
 'ca',
 'california',
 'california .',
 'called',
 'came',
 'camping',
 'camping ,',
 "can't",
 'car',
 'care',
 'care .',
 'career',
 'caring',
 'caring ,',
 'cars',
 'case',
 'casual',
 'catch',
 'cats',
 'cause',
 'certain',
 'challenge',
 'chance',
 'change',
 'character',
 'check',
 'chemistry',
 'chicago',
 'child',
 'children',
 'chill',
 'cities',
 'city',
 'city ,',
 'city .',
 'class',
 'clean',
 'climbing',
 'climbing ,',
 'close',
 'club',
 'clubs',
 'coast',
 'coffee',
 'cold',
 'college',
 'college .',
 'com',
 'com /',
 'come',
 'comedy',
 'comes',
 'comfortable',
 'coming',
 'common',
 'communication',
 'community',
 'company',
 'company .',
 'compassionate',
 'complete',
 'completely',
 'computer',
 'computers',
 'concerts',
 'concerts ,',
 'confident',
 'connection',
 'consider',
 'constantly',
 'conversation',
 'conversation ,',
 'conversation .',
 'conversations',
 'cook',
 'cook ,',
 'cooking',
 'cooking ,',
 'cool',
 'countries',
 'country',
 'couple',
 'course',
 'crazy',
 'create',
 'creating',
 'creative',
 'creative ,',
 'creativity',
 'culture',
 'cultures',
 'curious',
 'curious ,',
 'current',
 'currently',
 'cute',
 'cycling',
 'dad',
 'daily',
 'damn',
 'dance',
 'dancing',
 'dancing ,',
 'dark',
 'date',
 'dating',
 'day',
 'day ,',
 'day .',
 'days',
 'deal',
 'decent',
 'decided',
 'deep',
 'deeply',
 'definitely',
 'degree',
 'described',
 'design',
 'designer',
 'desire',
 'despite',
 'did',
 "didn't",
 'different',
 'difficult',
 'dinner',
 'dive',
 'does',
 "doesn't",
 'dog',
 'dogs',
 'doing',
 'doing .',
 "don't",
 "don't know",
 "don't like",
 "don't really",
 "don't think",
 "don't want",
 'dont',
 'drama',
 'dream',
 'dreams',
 'drink',
 'drinking',
 'drinks',
 'drive',
 'driven',
 'driving',
 'dry',
 'dude',
 'early',
 'earth',
 'easily',
 'east',
 'east bay',
 'east coast',
 'easy',
 'easy going',
 'easy going ,',
 'easy-going',
 'eat',
 'eating',
 'educated',
 'education',
 'emotional',
 'emotionally',
 'end',
 'energetic',
 'energy',
 'engineer',
 'engineering',
 'english',
 'enjoy',
 'enjoy .',
 'enjoy going',
 'enjoy life',
 'enjoying',
 'enjoys',
 'equally',
 'especially',
 'europe',
 'events',
 'eventually',
 'everyday',
 'exactly',
 'excited',
 'exciting',
 'exercise',
 'expect',
 'experience',
 'experiences',
 'experiences .',
 'explore',
 'exploring',
 'extremely',
 'eyes',
 'face',
 'fact',
 'fair',
 'fairly',
 'fall',
 'family',
 'family ,',
 'family .',
 'family friends',
 'fan',
 'far',
 'fast',
 'father',
 'favorite',
 'feel',
 'feel free',
 'feel like',
 'feeling',
 'feels',
 'figure',
 'film',
 'finally',
 'finding',
 'fine',
 'finished',
 'fit',
 'focus',
 'follow',
 'food',
 'food ,',
 'food .',
 'foods',
 'football',
 'form',
 'forward',
 'francisco',
 'francisco ,',
 'francisco .',
 'free',
 'free time',
 'french',
 'fresh',
 'friend',
 'friend .',
 'friendly',
 'friendly ,',
 'friends',
 'friends ,',
 'friends .',
 'friends family',
 'friendship',
 'fullest',
 'fun',
 'fun ,',
 'fun .',
 'funny',
 'funny ,',
 'future',
 'game',
 'games',
 'games ,',
 'games .',
 'gay',
 'geek',
 'geeky',
 'general',
 'generally',
 'generous',
 'gentleman',
 'genuine',
 'gets',
 'getting',
 'getting know',
 'giants',
 'girl',
 'girls',
 'given',
 'giving',
 'glass',
 'goal',
 'goals',
 'god',
 'goes',
 'going',
 'going ,',
 'going .',
 'golf',
 'good',
 'good ,',
 'good .',
 'good conversation',
 'good food',
 'good friends',
 'good sense',
 'good sense humor',
 'good time',
 'good time .',
 'goofy',
 'got',
 'grad',
 'grad school',
 'graduate',
 'graduated',
 'great',
 'grew',
 'group',
 'grow',
 'growing',
 'grown',
 'guess',
 'guitar',
 'guy',
 'guy ,',
 'guy .',
 'guys',
 'gym',
 'haha',
 'hair',
 'half',
 'hand',
 'hands',
 'hang',
 'hanging',
 'hanging friends',
 'happen',
 'happens',
 'happiness',
 'happy',
 'happy ,',
 'happy .',
 'hard',
 'hate',
 "haven't",
 'having',
 'having fun',
 'head',
 'health',
 'healthy',
 'hear',
 'heart',
 'heart ,',
 'heart .',
 'hell',
 'hello',
 'help',
 'helping',
 "here's",
 'hey',
 'hi',
 'hi ,',
 'high',
 'high school',
 'highly',
 'hike',
 'hiking',
 'hiking ,',
 'history',
 'hit',
 'hobbies',
 'hold',
 'home',
 'home ,',
 'home .',
 'honest',
 'honest ,',
 'honestly',
 'honesty',
 'hope',
 'hopefully',
 'hoping',
 'hot',
 'hours',
 'house',
 'http',
 'http :/',
 'http :/ /',
 'huge',
 'human',
 'humble',
 'humor',
 'humor ,',
 'humor .',
 "i'd",
 "i'd like",
 "i'd love",
 "i'll",
 "i'm",
 "i'm .",
 "i'm big",
 "i'm bit",
 "i'm currently",
 "i'm easy",
 "i'm going",
 "i'm good",
 "i'm happy",
 "i'm interested",
 "i'm just",
 "i'm kind",
 "i'm looking",
 "i'm open",
 "i'm originally",
 "i'm passionate",
 "i'm pretty",
 "i'm really",
 "i'm sure",
 "i'm trying",
 "i'm working",
 "i've",
 "i've got",
 "i've lived",
 'ice',
 'idea',
 'ideas',
 'im',
 'important',
 'important .',
 'include',
 'including',
 'independent',
 'individual',
 'industry',
 'inside',
 'instead',
 'intellectual',
 'intelligent',
 'intelligent ,',
 'intense',
 'interested',
 'interesting',
 'interests',
 'internet',
 'involved',
 "isn't",
 "it's",
 'japan',
 'jazz',
 'job',
 'job ,',
 'job .',
 'joke',
 'jokes',
 'journey',
 'joy',
 'just',
 'just .',
 'just like',
 'just looking',
 'just moved',
 'keeping',
 'keeps',
 'key',
 'kick',
 'kid',
 'kids',
 'kind',
 'kind ,',
 'kind guy',
 'kinda',
 'kinds',
 'know',
 'know ,',
 'know .',
 'knowledge',
 'known',
 'knows',
 'la',
 'laid',
 'laid ,',
 'language',
 'languages',
 'large',
 'late',
 'lately',
 'later',
 'laugh',
 'laugh ,',
 'laugh .',
 'laughing',
 'laughter',
 'lazy',
 'lead',
 'learn',
 'learned',
 'learning',
 'learning new',
 'leave',
 'left',
 'let',
 "let's",
 'lets',
 'level',
 'liberal',
 'life',
 'life ,',
 'life .',
 "life . i'm",
 "life's",
 'lifestyle',
 'light',
 'like',
 'like ,',
 'like .',
 'like going',
 'like good',
 'like meet',
 'like people',
 'like think',
 'likely',
 'likes',
 'line',
 'list',
 'listen',
 'listener',
 'listening',
 'little',
 'little bit',
 'live',
 'live .',
 'live life',
 'live music',
 'lived',
 'lives',
 'living',
 'living san',
 'living san francisco',
 'local',
 'lol',
 'long',
 'long term',
 'longer',
 'look',
 'looking',
 'looking forward',
 'looking meet',
 'looks',
 'lost',
 'lot',
 'lot ,',
 'lot .',
 'lot time',
 'lots',
 'loud',
 'love',
 'love ,',
 'love .',
 'love going',
 'love good',
 'love laugh',
 'love life',
 'love music',
 'love outdoors',
 'love travel',
 'loved',
 'lover',
 'loves',
 'loving',
 'loving ,',
 'low',
 'loyal',
 'loyal ,',
 'lucky',
 'major',
 'make',
 'make laugh',
 'make people',
 'makes',
 'making',
 'male',
 'man',
 'man ,',
 'man .',
 'married',
 'match',
 'matter',
 'maybe',
 'meal',
 'mean',
 'means',
 'meet',
 'meet .',
 'meet new',
 'meet new people',
 'meet people',
 'meeting',
 'meeting new',
 'meeting new people',
 'men',
 'message',
 'met',
 'mexico',
 'middle',
 'midwest',
 'mind',
 'mind .',
 'minded',
 'miss',
 'mission',
 'mix',
 'mom',
 'moment',
 'moment .',
 'moments',
 'money',
 'month',
 'months',
 'morning',
 'motorcycle',
 'mountain',
 'mountains',
 'moved',
 'moved bay',
 'moved bay area',
 'moved san',
 'moved san francisco',
 'moved sf',
 'movie',
 'movies',
 'movies ,',
 'movies .',
 'moving',
 'museums',
 ...]

In [12]:
# Print the NMF term groups for the male essays. The printed output covers runs
# with 3, 5, 7 and 9 groups, which are compared by eye to pick a cluster count.
nmf_inspect(tfidf_matrix_m, vocab_m)


3
Group 0:
, ( ) ) , ... ! music / : , i'm

Group 1:
- " - - ) ( . - . . " : ...

Group 2:
. i'm . i'm like love people life just . love . like


5
Group 0:
, ( ) ) , music / : ... , i'm music ,

Group 1:
. i'm . i'm like . like love don't life people just

Group 2:
" . " ... . ! ? ) ( , " " "

Group 3:
- - - . - ) ( : - i'm / . '

Group 4:
new ! san francisco san francisco bay moved . area years


7
Group 0:
. like love . like life . love people time don't things

Group 1:
, music music , , , , love good : , good movies hiking

Group 2:
san . new francisco san francisco bay moved area bay area years

Group 3:
" . " . , " , " " " . ? " , " -

Group 4:
- - - . - - i'm : . / , " - '

Group 5:
i'm . i'm , i'm . guy pretty looking i'm looking i'm pretty just

Group 6:
) ( ! ... ) . / ) , ? ! ! *


9
Group 0:
, music , , music , : art . , , good , . , love

Group 1:
. like . like don't life think want know time . don't

Group 2:
- - - . - - i'm : / " - ' ? ;

Group 3:
i'm . i'm , i'm . pretty guy i'm pretty i'm looking i've looking

Group 4:
! ... ! ! ? im ! ! ! just .. know ....

Group 5:
) ( ) . ) , / . ( : ! ) , . )

Group 6:
love new people . love enjoy . things going like friends

Group 7:
san . francisco san francisco moved bay area years bay area city

Group 8:
" . " , " " " " . " , ? , " - :



In [14]:
# Print the NMF term groups for the female essays. The printed output covers runs
# with 3, 5, 7 and 9 groups, which are compared by eye to pick a cluster count.
nmf_inspect(tfidf_matrix_f, vocab_f)


3
Group 0:
, ( ) ) , music : , love ... ! good

Group 1:
. i'm love . i'm like . love people life . like just

Group 2:
- " - - ) ( ! . - ... . " :


5
Group 0:
, ( ) : ) , music , love music , . /

Group 1:
. i'm like . i'm love . love . like life don't people

Group 2:
" . " . ... , * , " ) ? (

Group 3:
- - - . - ( ) : - i'm / - love .

Group 4:
! new love ... i'm bay san area moved francisco


7
Group 0:
, ( ) ) , : music , love music , / good

Group 1:
. like love . love . like life people don't time want

Group 2:
i'm . i'm . , i'm i'm looking pretty i've i'm pretty , looking

Group 3:
! ... ! ! ! ! ! love ) ( just ? ..

Group 4:
- - - . - ( ) : / - i'm - love ) .

Group 5:
new . bay san area san francisco francisco love moved bay area

Group 6:
" . " . , " , " " * ) ( ?


9
Group 0:
, music , love music , good , good : friends , dancing friends

Group 1:
. like love . love . like life people time want don't

Group 2:
- - - . - - i'm - love : / , & ?

Group 3:
" . " . , " " " , ? " . " , ...

Group 4:
i'm . i'm . , i'm i'm looking pretty i'm pretty i've looking ,

Group 5:
! ... ! ! love ! ! ! just im fun .. :)

Group 6:
* * * . * / . , : ' ? profile

Group 7:
new san bay . area san francisco francisco love bay area moved

Group 8:
) ( ) . ) , , / . : ... ! )



In [ ]: