In [1]:
import requests
In [3]:
from bs4 import BeautifulSoup
In [4]:
from sklearn.feature_extraction.text import CountVectorizer
In [11]:
# Scrape Indeed search-result pages for "Data Scientist" jobs in San Francisco
# and collect the text of each job-summary snippet. Results are paginated via
# the `start` query parameter in steps of 10 (100 pages total).
texts = []
base_url = 'http://www.indeed.com/jobs?q=Data+Scientist&l=San+Francisco,+CA&start='
for start in range(0, 1000, 10):
    # Timeout prevents a hung request from stalling the whole notebook;
    # raise_for_status fails loudly instead of silently parsing an error page.
    resp = requests.get(base_url + str(start), timeout=10)
    resp.raise_for_status()
    # Explicit parser avoids bs4's "no parser specified" warning and keeps
    # parsing behavior reproducible across environments.
    soup = BeautifulSoup(resp.text, 'html.parser')
    # find_all is the modern name; findAll is a legacy bs3-era alias.
    texts += [span.text for span in soup.find_all('span', {'class': 'summary'})]
print(len(texts), " Job descriptions")
In [12]:
# Peek at the first scraped job-summary snippet (rich display via bare last expression).
texts[0]
Out[12]:
In [15]:
# Build unigram-through-trigram term counts over the scraped summaries,
# dropping English stop words. `matrix` is a sparse (n_docs x n_terms) count matrix.
vect = CountVectorizer(ngram_range=(1, 3), stop_words='english')
matrix = vect.fit_transform(texts)
# Vocabulary size. get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is the supported replacement.
len(vect.get_feature_names_out())
Out[15]:
In [20]:
# First 20 vocabulary terms (alphabetical). get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() is the supported replacement.
vect.get_feature_names_out()[:20]
Out[20]:
In [39]:
# Total occurrences of each term across the whole corpus.
# Summing the sparse matrix once along axis 0 is far faster than extracting a
# separate sparse column with getcol() for every vocabulary entry.
column_totals = matrix.sum(axis=0)  # 1 x n_terms dense matrix of counts
freq = [(word, column_totals[0, idx]) for word, idx in vect.vocabulary_.items()]
In [41]:
# Show the 25 most frequent phrases, most common first.
top_phrases = sorted(freq, key=lambda pair: pair[1], reverse=True)
for phrase, count in top_phrases[:25]:
    print(phrase, count)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: