This notebook walks through the workflow built from the functions we have implemented. It shows an example of how we clustered essays using Nonnegative Matrix Factorization (NMF). We manually inspect the NMF output to determine the best number of clusters for each group.
In [1]:
import pickle
import warnings
from utils.hash import make
from utils.calculate_pmi_features import *
from utils.clean_up import *
from utils.categorize_demographics import *
from utils.reduce_dimensions import run_kmeans
from utils.nonnegative_matrix_factorization import nmf_inspect, nmf_labels
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning); consider narrowing the filter.
warnings.filterwarnings('ignore')
Getting the data, cleaning it, and categorizing demographic data
In [2]:
# Load the raw profile data. `get_data` is not imported by name, so it presumably
# comes from one of the `utils` star-imports above — TODO confirm.
df = get_data()
In [3]:
# Essay columns handed to the cleaning step.
essay_list = ['essay0','essay4','essay5']
# NOTE(review): what `clean_up` filters or normalizes is not visible in this notebook; see utils.clean_up.
df_clean = clean_up(df, essay_list)
In [4]:
# Replace every missing value with an empty string so no column contains NaN
# in the steps below. The name is rebound instead of using inplace=True so the
# cell is idempotent and cannot silently modify `df` should `df_clean` share
# data with it.
df_clean = df_clean.fillna('')
In [5]:
# List the column names of the raw frame (`df`, not `df_clean`).
df.columns.values
Out[5]:
array(['username', 'age', 'body_type', 'diet', 'drinks', 'drugs',
'education', 'essay0', 'essay1', 'essay2', 'essay3', 'essay4',
'essay5', 'essay6', 'essay7', 'essay8', 'essay9', 'ethnicity',
'height', 'income', 'job', 'last_online', 'location', 'offspring',
'orientation', 'pets', 'religion', 'sex', 'sign', 'smokes',
'speaks', 'status', 'TotalEssays'], dtype=object)
In [6]:
# Categorize the demographic columns. Each pair is (column name, categorizer
# applied element-wise to that column); the categorizers presumably come from
# the `utils.categorize_demographics` star-import. Order matches the original
# one-assignment-per-column version.
for demo_col, to_category in (
    ('religion', religion_categories),
    ('job', job_categories),
    ('drugs', drug_categories),
    ('diet', diet_categories),
    ('body_type', body_categories),
    ('drinks', drink_categories),
    ('sign', sign_categories),
    ('ethnicity', ethnicity_categories),
    ('pets', pets_categories),
    ('speaks', language_categories),
):
    df_clean[demo_col] = df_clean[demo_col].apply(to_category)
Splitting the dataframe by gender and running the clustering separately on each subset.
In [7]:
# Rows whose `sex` value is exactly 'm'.
df_male = df_clean.loc[df_clean['sex'].eq('m')]
In [8]:
# Rows whose `sex` value is exactly 'f'.
df_female = df_clean.loc[df_clean['sex'].eq('f')]
In [9]:
# Build the feature matrices for the male subset from the `essay0` column.
# Judging by the target names, the helper returns a count matrix, a TF-IDF
# matrix and the vocabulary list — TODO confirm against utils.
count_matrix_m, tfidf_matrix_m, vocab_m = col_to_data_matrix(df_male, 'essay0') #save out
In [11]:
# Same feature extraction for the female subset, again on `essay0`.
# Presumably returns (count matrix, TF-IDF matrix, vocabulary) — TODO confirm against utils.
count_matrix_f, tfidf_matrix_f, vocab_f = col_to_data_matrix(df_female, 'essay0')
In [10]:
# Display the male vocabulary to sanity-check the extracted terms.
# NOTE(review): this shows the whole list (the recorded output is truncated by
# the display); `vocab_m[:20]` would keep the notebook readable. Many entries
# are punctuation n-grams.
vocab_m
Out[10]:
['!',
'! !',
'! ! !',
'! )',
'! ) .',
"! i'm",
'! love',
'"',
'" "',
'" ,',
'" -',
'" .',
'%',
'&',
"'",
"' s",
'(',
'( )',
"( i'm",
')',
') ,',
') .',
") . i'm",
") i'm",
'*',
'* *',
'+',
',',
', "',
', (',
', ,',
', .',
', ...',
', adventurous',
', art',
', art ,',
', believe',
', biking',
', biking ,',
', camping',
', camping ,',
", can't",
', caring',
', cooking',
', cooking ,',
', creative',
', creative ,',
', dancing',
', doing',
", don't",
', easy',
', eating',
', enjoy',
', especially',
', exploring',
', family',
', feel',
', food',
', friendly',
', friends',
', fun',
', fun ,',
', funny',
', funny ,',
', getting',
', going',
', good',
', great',
', hanging',
', happy',
', having',
', hiking',
', hiking ,',
', honest',
', honest ,',
", i'd",
", i'll",
", i'm",
", i'm looking",
", i've",
', intelligent',
', intelligent ,',
", it's",
', just',
', kind',
', know',
', learning',
', life',
', like',
', little',
', live',
', lived',
', living',
', long',
', looking',
', love',
', loving',
', loyal',
', make',
', making',
', maybe',
', moved',
', movies',
', movies ,',
', music',
', music ,',
', need',
', new',
', nice',
', open',
', outgoing',
', passionate',
', people',
', photography',
', play',
', playing',
', pretty',
', probably',
', read',
', reading',
', reading ,',
', really',
', right',
', rock',
', running',
', sarcastic',
', say',
', smart',
', smart ,',
', snowboarding',
', spontaneous',
', swimming',
', taking',
', tend',
", that's",
', things',
', think',
', time',
', travel',
', travel ,',
', traveling',
', traveling ,',
', try',
', trying',
', usually',
', want',
', watching',
', went',
', work',
', working',
", you're",
'-',
'- -',
"- i'm",
'.',
'. "',
'. (',
'. )',
'. ,',
". , i'm",
'. -',
'. .',
'. ?',
'. believe',
'. best',
'. big',
'. born',
". can't",
'. com',
'. com /',
'. consider',
'. currently',
". don't",
'. enjoy',
'. family',
'. favorite',
'. feel',
'. friends',
'. fun',
'. good',
'. got',
'. great',
'. grew',
'. guess',
'. hope',
". i'd",
". i'd like",
". i'll",
". i'm",
". i'm big",
". i'm looking",
". i'm pretty",
". i'm really",
". i've",
". i've lived",
'. im',
'. interested',
". it's",
'. just',
'. kind',
'. know',
". let's",
'. life',
'. like',
'. live',
'. lived',
'. looking',
'. lot',
'. love',
'. love travel',
'. make',
'. maybe',
'. moved',
'. music',
'. need',
'. new',
'. oh',
'. oh ,',
'. open',
'. people',
'. play',
'. prefer',
'. pretty',
'. probably',
'. read',
'. really',
'. recently',
'. right',
'. said',
'. said ,',
'. say',
'. spend',
'. spent',
'. tend',
". that's",
". there's",
'. things',
'. think',
'. time',
'. travel',
'. try',
'. used',
'. usually',
'. want',
'. went',
'. work',
'. yes',
". you're",
'..',
'...',
"... i'm",
'....',
'.....',
'/',
'1',
'10',
'10 years',
'12',
'2',
'20',
'3',
'30',
'4',
'5',
'5 years',
'6',
'7',
'8',
':',
': "',
": i'm",
':)',
':/',
':/ /',
';',
';)',
'?',
'? )',
"? i'm",
'able',
'abroad',
'absolutely',
'act',
'active',
'active ,',
'activities',
'activity',
'actually',
'add',
'admit',
'adventure',
'adventure .',
'adventures',
'adventures .',
'adventurous',
'adventurous ,',
'affectionate',
'afraid',
'age',
'ago',
'ago .',
'amazing',
'ambitious',
'america',
'american',
'animals',
'answer',
'appreciate',
'area',
'area ,',
'area .',
"aren't",
'art',
'art ,',
'artist',
'artistic',
'arts',
'ask',
'ask .',
'ass',
'athletic',
'attention',
'attitude',
'attracted',
'attractive',
'average',
'away',
'awesome',
'awesome .',
'awkward',
'backpacking',
'bad',
'balance',
'band',
'bar',
'bars',
'baseball',
'basically',
'basketball',
'bay',
'bay .',
'bay area',
'bay area ,',
'bay area .',
'beach',
'beach ,',
'beautiful',
'beauty',
'beer',
'believe',
'berkeley',
'best',
'best .',
'better',
'better .',
'big',
'bike',
'biking',
'biking ,',
'bit',
'black',
'board',
'board games',
'body',
'book',
'books',
'books ,',
'bored',
'boring',
'born',
'born raised',
'boston',
'box',
'boy',
'break',
'bring',
'build',
'building',
'bunch',
'business',
'busy',
'ca',
'california',
'california .',
'called',
'came',
'camping',
'camping ,',
"can't",
'car',
'care',
'care .',
'career',
'caring',
'caring ,',
'cars',
'case',
'casual',
'catch',
'cats',
'cause',
'certain',
'challenge',
'chance',
'change',
'character',
'check',
'chemistry',
'chicago',
'child',
'children',
'chill',
'cities',
'city',
'city ,',
'city .',
'class',
'clean',
'climbing',
'climbing ,',
'close',
'club',
'clubs',
'coast',
'coffee',
'cold',
'college',
'college .',
'com',
'com /',
'come',
'comedy',
'comes',
'comfortable',
'coming',
'common',
'communication',
'community',
'company',
'company .',
'compassionate',
'complete',
'completely',
'computer',
'computers',
'concerts',
'concerts ,',
'confident',
'connection',
'consider',
'constantly',
'conversation',
'conversation ,',
'conversation .',
'conversations',
'cook',
'cook ,',
'cooking',
'cooking ,',
'cool',
'countries',
'country',
'couple',
'course',
'crazy',
'create',
'creating',
'creative',
'creative ,',
'creativity',
'culture',
'cultures',
'curious',
'curious ,',
'current',
'currently',
'cute',
'cycling',
'dad',
'daily',
'damn',
'dance',
'dancing',
'dancing ,',
'dark',
'date',
'dating',
'day',
'day ,',
'day .',
'days',
'deal',
'decent',
'decided',
'deep',
'deeply',
'definitely',
'degree',
'described',
'design',
'designer',
'desire',
'despite',
'did',
"didn't",
'different',
'difficult',
'dinner',
'dive',
'does',
"doesn't",
'dog',
'dogs',
'doing',
'doing .',
"don't",
"don't know",
"don't like",
"don't really",
"don't think",
"don't want",
'dont',
'drama',
'dream',
'dreams',
'drink',
'drinking',
'drinks',
'drive',
'driven',
'driving',
'dry',
'dude',
'early',
'earth',
'easily',
'east',
'east bay',
'east coast',
'easy',
'easy going',
'easy going ,',
'easy-going',
'eat',
'eating',
'educated',
'education',
'emotional',
'emotionally',
'end',
'energetic',
'energy',
'engineer',
'engineering',
'english',
'enjoy',
'enjoy .',
'enjoy going',
'enjoy life',
'enjoying',
'enjoys',
'equally',
'especially',
'europe',
'events',
'eventually',
'everyday',
'exactly',
'excited',
'exciting',
'exercise',
'expect',
'experience',
'experiences',
'experiences .',
'explore',
'exploring',
'extremely',
'eyes',
'face',
'fact',
'fair',
'fairly',
'fall',
'family',
'family ,',
'family .',
'family friends',
'fan',
'far',
'fast',
'father',
'favorite',
'feel',
'feel free',
'feel like',
'feeling',
'feels',
'figure',
'film',
'finally',
'finding',
'fine',
'finished',
'fit',
'focus',
'follow',
'food',
'food ,',
'food .',
'foods',
'football',
'form',
'forward',
'francisco',
'francisco ,',
'francisco .',
'free',
'free time',
'french',
'fresh',
'friend',
'friend .',
'friendly',
'friendly ,',
'friends',
'friends ,',
'friends .',
'friends family',
'friendship',
'fullest',
'fun',
'fun ,',
'fun .',
'funny',
'funny ,',
'future',
'game',
'games',
'games ,',
'games .',
'gay',
'geek',
'geeky',
'general',
'generally',
'generous',
'gentleman',
'genuine',
'gets',
'getting',
'getting know',
'giants',
'girl',
'girls',
'given',
'giving',
'glass',
'goal',
'goals',
'god',
'goes',
'going',
'going ,',
'going .',
'golf',
'good',
'good ,',
'good .',
'good conversation',
'good food',
'good friends',
'good sense',
'good sense humor',
'good time',
'good time .',
'goofy',
'got',
'grad',
'grad school',
'graduate',
'graduated',
'great',
'grew',
'group',
'grow',
'growing',
'grown',
'guess',
'guitar',
'guy',
'guy ,',
'guy .',
'guys',
'gym',
'haha',
'hair',
'half',
'hand',
'hands',
'hang',
'hanging',
'hanging friends',
'happen',
'happens',
'happiness',
'happy',
'happy ,',
'happy .',
'hard',
'hate',
"haven't",
'having',
'having fun',
'head',
'health',
'healthy',
'hear',
'heart',
'heart ,',
'heart .',
'hell',
'hello',
'help',
'helping',
"here's",
'hey',
'hi',
'hi ,',
'high',
'high school',
'highly',
'hike',
'hiking',
'hiking ,',
'history',
'hit',
'hobbies',
'hold',
'home',
'home ,',
'home .',
'honest',
'honest ,',
'honestly',
'honesty',
'hope',
'hopefully',
'hoping',
'hot',
'hours',
'house',
'http',
'http :/',
'http :/ /',
'huge',
'human',
'humble',
'humor',
'humor ,',
'humor .',
"i'd",
"i'd like",
"i'd love",
"i'll",
"i'm",
"i'm .",
"i'm big",
"i'm bit",
"i'm currently",
"i'm easy",
"i'm going",
"i'm good",
"i'm happy",
"i'm interested",
"i'm just",
"i'm kind",
"i'm looking",
"i'm open",
"i'm originally",
"i'm passionate",
"i'm pretty",
"i'm really",
"i'm sure",
"i'm trying",
"i'm working",
"i've",
"i've got",
"i've lived",
'ice',
'idea',
'ideas',
'im',
'important',
'important .',
'include',
'including',
'independent',
'individual',
'industry',
'inside',
'instead',
'intellectual',
'intelligent',
'intelligent ,',
'intense',
'interested',
'interesting',
'interests',
'internet',
'involved',
"isn't",
"it's",
'japan',
'jazz',
'job',
'job ,',
'job .',
'joke',
'jokes',
'journey',
'joy',
'just',
'just .',
'just like',
'just looking',
'just moved',
'keeping',
'keeps',
'key',
'kick',
'kid',
'kids',
'kind',
'kind ,',
'kind guy',
'kinda',
'kinds',
'know',
'know ,',
'know .',
'knowledge',
'known',
'knows',
'la',
'laid',
'laid ,',
'language',
'languages',
'large',
'late',
'lately',
'later',
'laugh',
'laugh ,',
'laugh .',
'laughing',
'laughter',
'lazy',
'lead',
'learn',
'learned',
'learning',
'learning new',
'leave',
'left',
'let',
"let's",
'lets',
'level',
'liberal',
'life',
'life ,',
'life .',
"life . i'm",
"life's",
'lifestyle',
'light',
'like',
'like ,',
'like .',
'like going',
'like good',
'like meet',
'like people',
'like think',
'likely',
'likes',
'line',
'list',
'listen',
'listener',
'listening',
'little',
'little bit',
'live',
'live .',
'live life',
'live music',
'lived',
'lives',
'living',
'living san',
'living san francisco',
'local',
'lol',
'long',
'long term',
'longer',
'look',
'looking',
'looking forward',
'looking meet',
'looks',
'lost',
'lot',
'lot ,',
'lot .',
'lot time',
'lots',
'loud',
'love',
'love ,',
'love .',
'love going',
'love good',
'love laugh',
'love life',
'love music',
'love outdoors',
'love travel',
'loved',
'lover',
'loves',
'loving',
'loving ,',
'low',
'loyal',
'loyal ,',
'lucky',
'major',
'make',
'make laugh',
'make people',
'makes',
'making',
'male',
'man',
'man ,',
'man .',
'married',
'match',
'matter',
'maybe',
'meal',
'mean',
'means',
'meet',
'meet .',
'meet new',
'meet new people',
'meet people',
'meeting',
'meeting new',
'meeting new people',
'men',
'message',
'met',
'mexico',
'middle',
'midwest',
'mind',
'mind .',
'minded',
'miss',
'mission',
'mix',
'mom',
'moment',
'moment .',
'moments',
'money',
'month',
'months',
'morning',
'motorcycle',
'mountain',
'mountains',
'moved',
'moved bay',
'moved bay area',
'moved san',
'moved san francisco',
'moved sf',
'movie',
'movies',
'movies ,',
'movies .',
'moving',
'museums',
...]
In [12]:
# Print the top terms of each NMF group for the male essays so the number of
# clusters can be chosen by eye (the recorded output covers 3, 5, 7 and 9 groups).
# NOTE(review): several groups are dominated by punctuation n-grams; consider
# dropping punctuation tokens before factorizing. No random seed is visible
# here — confirm nmf_inspect fixes one so the groups are reproducible.
nmf_inspect(tfidf_matrix_m, vocab_m)
3
Group 0:
, ( ) ) , ... ! music / : , i'm
Group 1:
- " - - ) ( . - . . " : ...
Group 2:
. i'm . i'm like love people life just . love . like
5
Group 0:
, ( ) ) , music / : ... , i'm music ,
Group 1:
. i'm . i'm like . like love don't life people just
Group 2:
" . " ... . ! ? ) ( , " " "
Group 3:
- - - . - ) ( : - i'm / . '
Group 4:
new ! san francisco san francisco bay moved . area years
7
Group 0:
. like love . like life . love people time don't things
Group 1:
, music music , , , , love good : , good movies hiking
Group 2:
san . new francisco san francisco bay moved area bay area years
Group 3:
" . " . , " , " " " . ? " , " -
Group 4:
- - - . - - i'm : . / , " - '
Group 5:
i'm . i'm , i'm . guy pretty looking i'm looking i'm pretty just
Group 6:
) ( ! ... ) . / ) , ? ! ! *
9
Group 0:
, music , , music , : art . , , good , . , love
Group 1:
. like . like don't life think want know time . don't
Group 2:
- - - . - - i'm : / " - ' ? ;
Group 3:
i'm . i'm , i'm . pretty guy i'm pretty i'm looking i've looking
Group 4:
! ... ! ! ? im ! ! ! just .. know ....
Group 5:
) ( ) . ) , / . ( : ! ) , . )
Group 6:
love new people . love enjoy . things going like friends
Group 7:
san . francisco san francisco moved bay area years bay area city
Group 8:
" . " , " " " " . " , ? , " - :
In [14]:
# Same inspection for the female essays: top terms per NMF group for each
# candidate number of groups (3, 5, 7 and 9 in the recorded output).
# NOTE(review): several groups are dominated by punctuation n-grams; consider
# dropping punctuation tokens before factorizing.
nmf_inspect(tfidf_matrix_f, vocab_f)
3
Group 0:
, ( ) ) , music : , love ... ! good
Group 1:
. i'm love . i'm like . love people life . like just
Group 2:
- " - - ) ( ! . - ... . " :
5
Group 0:
, ( ) : ) , music , love music , . /
Group 1:
. i'm like . i'm love . love . like life don't people
Group 2:
" . " . ... , * , " ) ? (
Group 3:
- - - . - ( ) : - i'm / - love .
Group 4:
! new love ... i'm bay san area moved francisco
7
Group 0:
, ( ) ) , : music , love music , / good
Group 1:
. like love . love . like life people don't time want
Group 2:
i'm . i'm . , i'm i'm looking pretty i've i'm pretty , looking
Group 3:
! ... ! ! ! ! ! love ) ( just ? ..
Group 4:
- - - . - ( ) : / - i'm - love ) .
Group 5:
new . bay san area san francisco francisco love moved bay area
Group 6:
" . " . , " , " " * ) ( ?
9
Group 0:
, music , love music , good , good : friends , dancing friends
Group 1:
. like love . love . like life people time want don't
Group 2:
- - - . - - i'm - love : / , & ?
Group 3:
" . " . , " " " , ? " . " , ...
Group 4:
i'm . i'm . , i'm i'm looking pretty i'm pretty i've looking ,
Group 5:
! ... ! ! love ! ! ! just im fun .. :)
Group 6:
* * * . * / . , : ' ? profile
Group 7:
new san bay . area san francisco francisco love bay area moved
Group 8:
) ( ) . ) , , / . : ... ! )
In [ ]:
Content source: juanshishido/okcupid
Similar notebooks: