In [5]:
## Import libraries for these exercises
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from tokenize_resumes import *

In [6]:
## Initialize sklearn vectorizer object with our tokenization function
vectorizer = CountVectorizer(tokenizer=spacy_tokenize)
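
The spacy_tokenize function used above is imported from tokenize_resumes; a minimal sketch of what such a tokenizer might look like, assuming spaCy's small English model (the real implementation may differ):

In [ ]:
## Hypothetical sketch of a spaCy-based tokenizer (the actual
## spacy_tokenize lives in tokenize_resumes.py and may differ)
import spacy

nlp_sketch = spacy.load('en_core_web_sm')

def spacy_tokenize_sketch(text):
    ## Lowercase each token, dropping pure-whitespace tokens
    return [token.lower_ for token in nlp_sketch(text) if not token.is_space]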

In [8]:
## Calculate vectors for the entire dataset
resume_dataset = get_resumes_dataset()
vectors = vectorizer.fit_transform(resume_dataset.resumes)
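
The result is a sparse document-term matrix, one row per resume and one column per vocabulary term; a quick shape check confirms this:

In [ ]:
## Rows = resumes, columns = vocabulary terms
vectors.shape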

In [9]:
## Define a helper that returns the cosine similarity of two resumes

def compare_resumes(res1, res2):
    resvec1 = vectorizer.transform([res1])
    resvec2 = vectorizer.transform([res2])

    ## [0, 0] unwraps the 1x1 similarity matrix into a plain float,
    ## which is safe to use as a sort key below
    return cosine_similarity(resvec1, resvec2)[0, 0]
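
A quick sanity check of compare_resumes (the indices here are illustrative; any two resumes from the dataset work):

In [ ]:
## A resume compared with itself scores 1.0; distinct resumes score lower
print(compare_resumes(resume_dataset.resumes[0], resume_dataset.resumes[0]))
print(compare_resumes(resume_dataset.resumes[0], resume_dataset.resumes[1]))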

In [10]:
## Example of cosine similarity between one resume and the entire dataset
## (commented out because it prints one similarity score per resume)

#cosine_similarity(vectorizer.transform([resume_dataset.resumes[61]]), vectors)
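
The same row of similarities can be turned into a ranking with argsort; a short sketch, reusing index 61 from the commented example above:

In [ ]:
## Rank every resume against one query resume in a single vectorized call
similarities = cosine_similarity(vectorizer.transform([resume_dataset.resumes[61]]), vectors)[0]
## argsort is ascending, so reverse it and keep the five most similar indices
similarities.argsort()[::-1][:5]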

In [11]:
## Rank the first 100 resumes by similarity to the first resume ("Kevin")

like_kevin = sorted(resume_dataset.resumes[0:100],
                    key=lambda x: compare_resumes(resume_dataset.resumes[0], x),
                    reverse=True)

In [12]:
## As above, but using word vectors from spaCy

## Parse the query resume once rather than once per comparison
kevin_doc = NLP(resume_dataset.resumes[0])
like_kevin_wv = sorted(resume_dataset.resumes[0:100],
                       key=lambda x: kevin_doc.similarity(NLP(x)),
                       reverse=True)
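
A quick look at the top of each ranking shows how far the count-based and word-vector methods agree (output will vary with the dataset; the query resume itself should sort first in both):

In [ ]:
## Print a short prefix of the top matches from each method,
## skipping the query resume at position 0
for resume in like_kevin[1:4]:
    print(resume[:80])
print('---')
for resume in like_kevin_wv[1:4]:
    print(resume[:80])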

Term Frequency analysis


In [13]:
## Total tokens per resume (row sums of the document-term matrix)
document_lengths = vectors.sum(axis=1).T.tolist()[0]
pd.Series(document_lengths).plot.hist()


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x11643a5c0>
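
The same document lengths can also be summarized numerically alongside the histogram:

In [ ]:
## Summary statistics for the per-resume token counts
pd.Series(document_lengths).describe()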

In [14]:
## Total corpus frequency for each vocabulary term (column sums)
terms = pd.DataFrame(vectors.sum(axis=0).T, columns=['frequency'])
terms['name'] = vectorizer.get_feature_names()
terms


Out[14]:
frequency name
0 1079 $
1 7 'd
2 23 'm
3 2 're
4 1829 's
5 23 've
6 816 +
7 1 -11
8 2 -12
9 1 -15
10 2 -1996
11 1 -2001
12 3 -2004
13 1 -2012
14 1 -3
15 2 -3%
16 1 -5
17 1 -accompany
18 1 -accomplished
19 2 -accountable
20 3 -act
21 2 -actively
22 1 -additional
23 2 -adobe
24 1 -aided
25 1 -among
26 2 -analyze
27 2 -analyzes
28 1 -answered
29 1 -asean
... ... ...
25284 176
25285 25
25286 80
25287 221
25288 1 ㆍmaking
25289 1 ㆍreported
25290 1 ㆍresolved
25291 1 ㆍsolved
25292 5 上海市
25293 7 北京市
25294 1 台中市
25295 4 台北市
25296 2 天津市
25297 1 go
25298 1 microsoft
25299 1 quickbooks
25300 1 ability
25301 1 data
25302 1 dedicated
25303 1 excellent
25304 2 experience
25305 1 outstanding
25306 1 self
25307 1 thorough
25308 158
25309 40
25310 26
25311 15
25312 1 
25313 1 􏰀participated

25314 rows × 2 columns


In [15]:
terms.frequency.sort_values(ascending=False)[0:100].plot.hist(bins=100)


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x1222a5278>

In [16]:
terms.sort_values(by='frequency', ascending=False)


Out[16]:
frequency name
19941 6658 sales
14106 5379 ma
4657 3511 business
14389 3183 marketing
15787 3160 new
7435 2882 development
14244 2860 management
21042 2553 software
24674 2483 work
6843 2433 customer
22328 2136 team
9104 1977 experience
24931 1853 years
4308 1850 boston
20868 1845 skills
4 1829 's
18048 1772 product
14249 1676 manager
6973 1625 data
20459 1614 service
7430 1604 developed
5944 1509 company
21964 1433 support
7368 1394 design
20469 1384 services
18115 1376 project
22130 1370 system
22137 1355 systems
919 1315 2014
913 1286 2013
... ... ...
14579 1 mcm
14580 1 mcmullan
14581 1 mcmurry
14582 1 mco
14583 1 mcp
14585 1 mcpherson.susanj@gmail.com
2571 1 aliemeke
14540 1 mb
14538 1 mazel
14531 1 mayhew
14487 1 mathew
14489 1 mathsoft
14491 1 matilda
14493 1 matlab(written
14494 1 matra
14496 1 matriculation
14497 1 matrikon
2587 1 allergens
14503 1 mattell
2586 1 allentown
2583 1 allegra
14512 1 mavis
14515 1 maxi
14516 1 maxie
14517 1 maxima
14518 1 maximacreances
14522 1 maximizes
2577 1 aligns
14529 1 may-83
25313 1 􏰀participated

25314 rows × 2 columns

TF-IDF and stopwords example


In [17]:
## TF-IDF with all terms kept (no stopword filtering)
vectorizer_stopwords = TfidfVectorizer(stop_words=None)
vectors_stopwords = vectorizer_stopwords.fit_transform(resume_dataset.resumes)

## TF-IDF with common English stopwords removed
vectorizer_nostopwords = TfidfVectorizer(stop_words='english')
vectors_nostopwords = vectorizer_nostopwords.fit_transform(resume_dataset.resumes)


## For TF-IDF vectors the column sums are aggregate weights, not raw counts
terms_stopwords = pd.DataFrame(vectors_stopwords.sum(axis=0).T, columns=['frequency'])
terms_stopwords['name'] = vectorizer_stopwords.get_feature_names()

terms_nostopwords = pd.DataFrame(vectors_nostopwords.sum(axis=0).T, columns=['frequency'])
terms_nostopwords['name'] = vectorizer_nostopwords.get_feature_names()
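
A side-by-side look at the highest-weight terms under each vectorizer makes the effect of stopword removal visible (a quick sketch; the sums are aggregate TF-IDF weights rather than raw counts):

In [ ]:
## Top terms by aggregate TF-IDF weight, with and without English stopwords
print(terms_stopwords.sort_values(by='frequency', ascending=False).head(10))
print(terms_nostopwords.sort_values(by='frequency', ascending=False).head(10))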


