In [1]:
import requests
  • BeautifulSoup is now distributed as the bs4 package, so it must be imported from bs4.

In [3]:
from bs4 import BeautifulSoup

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
  • Grab data science jobs in San Francisco, CA from Indeed

In [11]:
texts = []
# Page through Indeed search results, 10 postings per page (start=0,10,...,990).
for start in range(0, 1000, 10):
    url = ('http://www.indeed.com/jobs?q=Data+Scientist'
           '&l=San+Francisco,+CA&start=' + str(start))
    # timeout keeps one hung request from stalling the whole crawl;
    # an explicit parser avoids bs4's GuessedAtParserWarning and makes
    # parsing deterministic across environments.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Each job-summary snippet lives in <span class="summary">.
    # find_all is the modern bs4 name for the legacy findAll.
    texts += [a.text for a in soup.find_all('span', {'class': 'summary'})]
print (len(texts), " Job descriptions")


1000  Job descriptions

In [12]:
# Peek at the first scraped summary to sanity-check the extraction.
texts[0]


Out[12]:
'\nPig, Hive, Hadoop, Python, R, SQL to wrangle large data sets into actionable insights. Work closely with the predictive analytics, data acquisition, customer...'

In [15]:
# Build unigram-through-trigram counts over the scraped summaries,
# dropping English stop words.
vect = CountVectorizer(ngram_range=(1,3), stop_words='english')
matrix = vect.fit_transform(texts)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2; get_feature_names_out() is the supported replacement.
len(vect.get_feature_names_out())


Out[15]:
15825

In [20]:
# First 20 vocabulary terms (alphabetical). get_feature_names_out()
# replaces the removed get_feature_names(); it returns an ndarray, so
# wrap in list() to keep the original list-style display.
list(vect.get_feature_names_out()[:20])


Out[20]:
['000',
 '000 data',
 '000 data scientists',
 '000 developers',
 '000 developers designers',
 '10',
 '10 gigabytes',
 '10 gigabytes second',
 '10 research',
 '10 research groups',
 '10 years',
 '10 years bachelor',
 '100',
 '100 promising',
 '100 promising companies',
 '100 year',
 '100 year old',
 '10b',
 '10b 100',
 '10b 100 year']

In [39]:
# Total occurrences of each vocabulary term: sum that term's column of
# the document-term matrix.
freq = []
for term, col_idx in vect.vocabulary_.items():
    freq.append((term, matrix.getcol(col_idx).sum()))

In [41]:
# Show the 25 most frequent terms, largest count first. reverse=True is
# a stable descending sort, equivalent to sorting on the negated count.
top_25 = sorted(freq, key=lambda pair: pair[1], reverse=True)[:25]
for phrase, times in top_25:
    print (phrase, times)


data 808
scientists 361
clinical 190
scientific 161
experience 161
scientist 151
team 141
analytics 139
machine 136
learning 135
data scientists 132
machine learning 129
engineers 123
work 116
analysis 106
research 98
health 88
development 84
data scientist 83
world 74
statistical 73
mining 66
data mining 65
project 64
laboratory 63

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: