In [5]:
## Import libraries for these exercises
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from tokenize_resumes import *

In [6]:
## Initialize sklearn vectorizer object with our tokenization function
vectorizer = CountVectorizer(tokenizer=spacy_tokenize)
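
The spacy_tokenize function used above is imported from tokenize_resumes; a minimal sketch of what such a tokenizer might look like, assuming spaCy's small English model (the real implementation may differ):

In [ ]:
## Hypothetical sketch of a spaCy-based tokenizer (the actual
## spacy_tokenize lives in tokenize_resumes.py and may differ)
import spacy

nlp_sketch = spacy.load('en_core_web_sm')

def spacy_tokenize_sketch(text):
    ## Lowercase each token, dropping pure-whitespace tokens
    return [token.lower_ for token in nlp_sketch(text) if not token.is_space]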

In [8]:
## Calculate vectors for the entire dataset
resume_dataset = get_resumes_dataset()
vectors = vectorizer.fit_transform(resume_dataset.resumes)
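
The result is a sparse document-term matrix, one row per resume and one column per vocabulary term; a quick shape check confirms this:

In [ ]:
## Rows = resumes, columns = vocabulary terms
vectors.shape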

In [9]:
## Define a helper that returns the cosine similarity of two resumes

def compare_resumes(res1, res2):
    resvec1 = vectorizer.transform([res1])
    resvec2 = vectorizer.transform([res2])

    ## [0, 0] unwraps the 1x1 similarity matrix into a plain float,
    ## which is safe to use as a sort key below
    return cosine_similarity(resvec1, resvec2)[0, 0]
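
A quick sanity check of compare_resumes (the indices here are illustrative; any two resumes from the dataset work):

In [ ]:
## A resume compared with itself scores 1.0; distinct resumes score lower
print(compare_resumes(resume_dataset.resumes[0], resume_dataset.resumes[0]))
print(compare_resumes(resume_dataset.resumes[0], resume_dataset.resumes[1]))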

In [10]:
## Example of cosine similarity between one resume and the entire dataset
## (commented out because it prints one similarity score per resume)

#cosine_similarity(vectorizer.transform([resume_dataset.resumes[61]]), vectors)
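
The same row of similarities can be turned into a ranking with argsort; a short sketch, reusing index 61 from the commented example above:

In [ ]:
## Rank every resume against one query resume in a single vectorized call
similarities = cosine_similarity(vectorizer.transform([resume_dataset.resumes[61]]), vectors)[0]
## argsort is ascending, so reverse it and keep the five most similar indices
similarities.argsort()[::-1][:5]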

In [11]:
## Rank the first 100 resumes by similarity to the first resume ("Kevin")

like_kevin = sorted(resume_dataset.resumes[0:100],
                    key=lambda x: compare_resumes(resume_dataset.resumes[0], x),
                    reverse=True)

In [12]:
## As above, but using word vectors from spaCy

## Parse the query resume once rather than once per comparison
kevin_doc = NLP(resume_dataset.resumes[0])
like_kevin_wv = sorted(resume_dataset.resumes[0:100],
                       key=lambda x: kevin_doc.similarity(NLP(x)),
                       reverse=True)
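
A quick look at the top of each ranking shows how far the count-based and word-vector methods agree (output will vary with the dataset; the query resume itself should sort first in both):

In [ ]:
## Print a short prefix of the top matches from each method,
## skipping the query resume at position 0
for resume in like_kevin[1:4]:
    print(resume[:80])
print('---')
for resume in like_kevin_wv[1:4]:
    print(resume[:80])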

Term Frequency analysis


In [13]:
## Total tokens per resume (row sums of the document-term matrix)
document_lengths = vectors.sum(axis=1).T.tolist()[0]
pd.Series(document_lengths).plot.hist()


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x11643a5c0>
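
The same document lengths can also be summarized numerically alongside the histogram:

In [ ]:
## Summary statistics for the per-resume token counts
pd.Series(document_lengths).describe()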

In [14]:
## Total corpus frequency for each vocabulary term (column sums)
terms = pd.DataFrame(vectors.sum(axis=0).T, columns=['frequency'])
terms['name'] = vectorizer.get_feature_names()
terms


Out[14]:
frequency name
0 1079 $
1 7 'd
2 23 'm
3 2 're
4 1829 's
5 23 've
6 816 +
7 1 -11
8 2 -12
9 1 -15
10 2 -1996
11 1 -2001
12 3 -2004
13 1 -2012
14 1 -3
15 2 -3%
16 1 -5
17 1 -accompany
18 1 -accomplished
19 2 -accountable
20 3 -act
21 2 -actively
22 1 -additional
23 2 -adobe
24 1 -aided
25 1 -among
26 2 -analyze
27 2 -analyzes
28 1 -answered
29 1 -asean
... ... ...
25284 176
25285 25
25286 80
25287 221
25288 1 ㆍmaking
25289 1 ㆍreported
25290 1 ㆍresolved
25291 1 ㆍsolved
25292 5 上海市
25293 7 北京市
25294 1 台中市
25295 4 台北市
25296 2 天津市
25297 1 go
25298 1 microsoft
25299 1 quickbooks
25300 1 ability
25301 1 data
25302 1 dedicated
25303 1 excellent
25304 2 experience
25305 1 outstanding
25306 1 self
25307 1 thorough
25308 158
25309 40
25310 26
25311 15
25312 1 
25313 1 􏰀participated

25314 rows × 2 columns


In [15]:
terms.frequency.sort_values(ascending=False)[0:100].plot.hist(bins=100)


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x1222a5278>

In [16]:
terms.sort_values(by='frequency', ascending=False)


Out[16]:
frequency name
19941 6658 sales
14106 5379 ma
4657 3511 business
14389 3183 marketing
15787 3160 new
7435 2882 development
14244 2860 management
21042 2553 software
24674 2483 work
6843 2433 customer
22328 2136 team
9104 1977 experience
24931 1853 years
4308 1850 boston
20868 1845 skills
4 1829 's
18048 1772 product
14249 1676 manager
6973 1625 data
20459 1614 service
7430 1604 developed
5944 1509 company
21964 1433 support
7368 1394 design
20469 1384 services
18115 1376 project
22130 1370 system
22137 1355 systems
919 1315 2014
913 1286 2013
... ... ...
14579 1 mcm
14580 1 mcmullan
14581 1 mcmurry
14582 1 mco
14583 1 mcp
14585 1 mcpherson.susanj@gmail.com
2571 1 aliemeke
14540 1 mb
14538 1 mazel
14531 1 mayhew
14487 1 mathew
14489 1 mathsoft
14491 1 matilda
14493 1 matlab(written
14494 1 matra
14496 1 matriculation
14497 1 matrikon
2587 1 allergens
14503 1 mattell
2586 1 allentown
2583 1 allegra
14512 1 mavis
14515 1 maxi
14516 1 maxie
14517 1 maxima
14518 1 maximacreances
14522 1 maximizes
2577 1 aligns
14529 1 may-83
25313 1 􏰀participated

25314 rows × 2 columns

TF-IDF and stopwords example


In [17]:
## TF-IDF with all terms kept (no stopword filtering)
vectorizer_stopwords = TfidfVectorizer(stop_words=None)
vectors_stopwords = vectorizer_stopwords.fit_transform(resume_dataset.resumes)

## TF-IDF with common English stopwords removed
vectorizer_nostopwords = TfidfVectorizer(stop_words='english')
vectors_nostopwords = vectorizer_nostopwords.fit_transform(resume_dataset.resumes)


## For TF-IDF vectors the column sums are aggregate weights, not raw counts
terms_stopwords = pd.DataFrame(vectors_stopwords.sum(axis=0).T, columns=['frequency'])
terms_stopwords['name'] = vectorizer_stopwords.get_feature_names()

terms_nostopwords = pd.DataFrame(vectors_nostopwords.sum(axis=0).T, columns=['frequency'])
terms_nostopwords['name'] = vectorizer_nostopwords.get_feature_names()
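
A side-by-side look at the highest-weight terms under each vectorizer makes the effect of stopword removal visible (a quick sketch; the sums are aggregate TF-IDF weights rather than raw counts):

In [ ]:
## Top terms by aggregate TF-IDF weight, with and without English stopwords
print(terms_stopwords.sort_values(by='frequency', ascending=False).head(10))
print(terms_nostopwords.sort_values(by='frequency', ascending=False).head(10))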


