This is a little bit messy, but I wanted to get something done quickly while the kids were in school today. It is just a basic scrape job on CyberCoders (which I hear isn't all that great). The goal is to feed IBM's Watson a lot of data and have it automagically spit out relevant information. Easy peasy.
In [1]:
import os
import requests
from bs4 import BeautifulSoup
def get_job_posts(search_term='python', pages=12):
    '''Fetch CyberCoders search-result pages and collect their job-title elements.

    Args:
        search_term: Value sent as the ``searchterms`` query parameter.
        pages: Number of result pages to request; page indices run from
            0 to ``pages - 1``.

    Returns:
        A list with one entry per requested page; each entry is the result of
        ``soup.select('div.job-title')`` for that page.
    '''
    job_posts = []
    for page_number in range(pages):
        # Let requests build and encode the query string, and never wait
        # forever on a stalled connection.
        res = requests.get(
            'http://www.cybercoders.com/search/',
            params={
                'page': page_number,
                'searchterms': search_term,
                'searchlocation': '',
                'newsearch': 'true',
                'sorttype': '',
            },
            timeout=30,
        )
        soup = BeautifulSoup(res.text, 'lxml')
        job_posts.append(soup.select('div.job-title'))
    print("Total pages scraped: %s \n" % len(job_posts))
    return job_posts
def get_job_pages(posts):
    '''Download the detail page behind every link found in the scraped listings.

    Args:
        posts: Iterable of per-page collections of job-title elements, as
            returned by get_job_posts(). Each element must support
            ``select('a')``.

    Returns:
        A list of the response objects returned by ``requests.get``, one per
        anchor that carries an ``href``.
    '''
    from urllib.parse import urljoin

    job_pages = []
    for items in posts:
        for listing in items:
            for link in listing.select('a'):
                href = link.get('href')
                if not href:
                    # An anchor without an href would otherwise be fetched
                    # as the literal URL ".../None".
                    continue
                # urljoin resolves root-relative hrefs without doubling the slash.
                url = urljoin('http://www.cybercoders.com/', href)
                job_pages.append(requests.get(url, timeout=30))
    print("Total pages collected: %s \n" % (len(job_pages)))
    return job_pages
def write_job_text(pages, path='job_text.txt'):
    '''Write the text of every ``div.job-details`` element in the pages to a file.

    Args:
        pages: Iterable of objects exposing the page HTML as ``.text`` (the
            responses collected by get_job_pages()).
        path: File to (over)write; it is opened in ``'w'`` mode.

    Returns:
        None. The number of postings written is printed.
    '''
    job_counter = 0
    # Open the file inside the function so it is always closed (even if
    # parsing raises) and so the function can be called more than once.
    with open(path, 'w') as job_file:
        for page in pages:
            soup1 = BeautifulSoup(page.text, 'lxml')
            for job_desc in soup1.select('div.job-details'):
                job_counter += 1
                job_file.write(str(job_desc.text))
    print("Total job postings written to file: %s \n" % job_counter)
# Run the pipeline: search-result pages -> individual job pages -> text file.
j = get_job_posts()
k = get_job_pages(j)
write_job_text(k)
In [2]:
import re
import collections
# Ugly, but effective method to filter common words from a corpus of text.
# The first group is a general English stop-word list; the last, long
# statement adds terms specific to the scraped job postings.
# NOTE: every entry must be followed by a comma -- adjacent string literals
# are silently concatenated into a single (bogus) entry.
stopwords = ['a', 'about', 'above', 'across', 'after', 'afterwards']
stopwords += ['again', 'against', 'all', 'almost', 'alone', 'along']
stopwords += ['already', 'also', 'although', 'always', 'am', 'among']
stopwords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']
stopwords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']
stopwords += ['are', 'around', 'as', 'at', 'back', 'be', 'became']
stopwords += ['because', 'become', 'becomes', 'becoming', 'been']
stopwords += ['before', 'beforehand', 'behind', 'being', 'below']
stopwords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']
stopwords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']
stopwords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']
stopwords += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due']
stopwords += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else']
stopwords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']
stopwords += ['every', 'everyone', 'everything', 'everywhere', 'except']
stopwords += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first']
stopwords += ['five', 'for', 'former', 'formerly', 'forty', 'found']
stopwords += ['four', 'from', 'front', 'full', 'further', 'get', 'give']
stopwords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']
stopwords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']
stopwords += ['herself', 'him', 'himself', 'his', 'how', 'however']
stopwords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']
stopwords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']
stopwords += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made']
stopwords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']
stopwords += ['more', 'moreover', 'most', 'mostly', 'move', 'much']
stopwords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']
stopwords += ['nevertheless', 'next', 'nine', 'no', 'nobody', 'none']
stopwords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']
stopwords += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or']
stopwords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']
stopwords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']
stopwords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']
stopwords += ['seeming', 'seems', 'serious', 'several', 'she', 'should']
stopwords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']
stopwords += ['some', 'somehow', 'someone', 'something', 'sometime']
stopwords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']
stopwords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']
stopwords += ['then', 'thence', 'there', 'thereafter', 'thereby', 'today']
stopwords += ['therefore', 'therein', 'thereupon', 'these', 'they']
stopwords += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three']
stopwords += ['three', 'through', 'throughout', 'thru', 'thus', 'to']
stopwords += ['together', 'too', 'top', 'toward', 'towards', 'twelve']
stopwords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']
stopwords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what', 'want']
stopwords += ['whatever', 'when', 'whence', 'whenever', 'where']
stopwords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']
stopwords += ['wherever', 'whether', 'which', 'while', 'whither', 'who']
stopwords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']
stopwords += ['within', 'without','work', 'would', 'yet', 'you', 'your']
stopwords += ['yours', 'yourself', 'yourselves']
stopwords += ['experience', 'job', 'details', 'u', 'posted', 'need', 'based', 'years', 'new', 'world', 'knowledge', 'join',
              'reasons', 'salary', 'position', 'skills', 'exciting', 'market', 'small', 'excellent', 'great', 'year',
              'compensation', 'love', 'provide', 'usl', 'highly', 'e', 'day', 'growth', 'way', 'people', 'facing',
              'big', 'll', 'g', 'amazing', 'passion', '14', 'office', 'vacation', 'located', '5', 'medical', 'pto', 'offer',
              'used', 'exposure', 'captial', 'ca', 'help', 'funded', 'ideally', 'country', 'preferred', 'immediately', 'search',
              'active', 'improve', '100', '150k', 'dental', 'innovation', 'desire', 'clean', '401k', 'life', 'd', 'paid', 'non',
              'founded', 'changing', 'helps', '24', 'lunches', 'talent', 'familiarity', 'generous', '120', 'good', 'early',
              'pay', 'l', 'matching', 'advice', 'directly', 'ma', 'rapidly', 'don', 'money', 'experienced', '3', 'strong',
              'read', 'company', 'positionat', 'doing', '2015', '2', 'opportunity', '1', 'currently', 'social', 'oriented',
              'like', 'self', 'vision', 'clients', 'understanding', 'impact', 'using', 't', 'passionate', 'fit', 'related',
              'employees', 'health', 'make', 'hands', 'real', 'just', 'investment', 'comprehensive', 'doe', 'continue', 'extremely',
              'com', 'vc', 'flagship', 'problems', 'effective', 'afficient', '401', 'schedule', 'taking', 'methods', 'commercial',
              'success', 'k', 'solid', 'quickly', 'outings', 'successful', 'create', 'provided', 'culture', '4', 'fun', 'bs',
              'awesome', 'users', 'talented', 'doingyou', 'b', 'free', 'grow', 'boston', 'san', 'francisco', 'retreats', 'offers',
              'deliver', 'holiday', 'holidays', 'lots', '104k', 'frisco', 'ambitious', 'minimize', 'bring', 'sprit', 'bright',
              'dinners', 'm', 'catered', 'uses', 'perks', 'ms', 'happy', 'know', 'opportunites', 'chat', 'days', 'revolutionizing',
              'crowd', 'tx', 'enable', 'seeking', 'seek', 'interviewing', 'soon', 'sized', 'award', 'lopez', 'needed', 'preferably',
              'doingas', 'making', 'r', '2009', 'starting', 'pa', 'apply', 'authorized', 'looking', 'end', 'benefits', '12', 'growing',
              'hire', 'ideal', 'proven', 'aspects', 'deadlines', 'promise', 'fast', 'live', 'bonus', 'hours', 'best', 'plus', 'user',
              'high', 'ability', 'level', 'cutting', 'edge', 'oriented', 'like', 'us1', 'large', 'equity', 'creative', 'options',
              'casual', 'future', 'candidate', '10', 'cybercoders', '0', 'place', 'area', 'group', 'youcompetitive', 'occasional',
              'fully', 'use', 'excited', 'bachelor', 'tv', 'better', 'employee','applicants', 'team', 'time', 'startup',
              'customer', 'advanced', 'stock', 'management']
# Read the scraped text once and close the file right away.
with open('job_text.txt') as job_text:
    raw_text = job_text.read()
# Insert a space at lower->Upper and UPPER->Upper+lower boundaries so that
# run-together words such as "doingYou" are tokenised separately. This must
# happen before lower-casing, and its result must be what gets tokenised.
spaced_text = re.sub(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', raw_text)
words = re.findall(r'\w+', spaced_text.lower())
c = collections.Counter(words)
print(type(c))
# Deleting a key that is absent from a Counter is a no-op, so no membership
# check is needed here.
for sword in stopwords:
    del c[sword]
c.most_common()
Out[2]:
In [3]:
# Number of tokens currently held in `words` (the unfiltered token list).
len(words)
Out[3]:
In [4]:
# Fancy way to count words that are wanted instead of filtering words that are unwanted
import re
import collections
wanted = ['automated', 'database', 'flask', 'postgresql', 'javascript', 'linux', 'unix', 'ansible',
          'puppet', 'selenium', 'mysql', 'restful', 'gui', 'artificial', 'intelligence', 'django', 'api', 'full', 'stack',
          'automation', 'sql', 'data', 'security', 'testing', 'test', 'git', 'json', 'cloud', 'devops', 'celery',
          'analytics', 'visualization', 'redis', 'mongodb', 'nosql', 'openstack', 'aws', 'salt', 'analysis', 'golang',
          'qa', 'distributed', 'chef',]
# Tokenise the lower-cased corpus; the with-block closes the file handle.
with open('job_text.txt') as job_text:
    matches = re.findall(r'\w+', job_text.read().lower())
counts = collections.Counter(matches)
# Pair every wanted term with its frequency (0 when it never occurs).
my_wants = [(term, counts[term]) for term in wanted]
# Most frequent terms first.
sorted(my_wants, key=lambda wanted_words: wanted_words[1], reverse=True)
Out[4]:
In [13]:
# A much easier technique to count wanted words than the method posted above...
wanted = ['automated', 'database', 'flask', 'postgresql', 'javascript', 'linux', 'unix', 'ansible',
          'puppet', 'selenium', 'mysql', 'restful', 'gui', 'artificial', 'intelligence', 'django', 'api', 'full', 'stack',
          'automation', 'sql', 'data', 'security', 'testing', 'test', 'git', 'json', 'cloud', 'devops', 'celery',
          'analytics', 'visualization', 'redis', 'mongodb', 'nosql', 'openstack', 'aws', 'salt', 'analysis', 'golang',
          'qa', 'distributed', 'chef', 'docker']
# Build the lookup set once; testing membership in the list would rescan it
# for every token in the corpus.
wanted_terms = frozenset(wanted)
cnt = collections.Counter()
# Tokenise the lower-cased corpus; the with-block closes the file handle.
with open('job_text.txt') as job_text:
    words = re.findall(r'\w+', job_text.read().lower())
for word in words:
    if word in wanted_terms:
        cnt[word] += 1
print(cnt)
Here we search for devops and DBA jobs to see what is relevant. To search for a different keyword, simply change the `searchterms` value used in the get_job_posts() function.
In [1]:
import os
import requests
from bs4 import BeautifulSoup
def get_job_posts(search_term='devops', pages=8):
    '''Fetch CyberCoders search-result pages and collect their job-title elements.

    Args:
        search_term: Value sent as the ``searchterms`` query parameter.
        pages: Number of result pages to request; page indices run from
            0 to ``pages - 1``.

    Returns:
        A list with one entry per requested page; each entry is the result of
        ``soup.select('div.job-title')`` for that page.
    '''
    job_posts = []
    for page_number in range(pages):
        # Let requests build and encode the query string, and never wait
        # forever on a stalled connection.
        res = requests.get(
            'http://www.cybercoders.com/search/',
            params={
                'page': page_number,
                'searchterms': search_term,
                'searchlocation': '',
                'newsearch': 'true',
                'sorttype': '',
            },
            timeout=30,
        )
        soup = BeautifulSoup(res.text, 'lxml')
        job_posts.append(soup.select('div.job-title'))
    print("Total pages scraped: %s \n" % len(job_posts))
    return job_posts
def get_job_pages(posts):
    '''Download the detail page behind every link found in the scraped listings.

    Args:
        posts: Iterable of per-page collections of job-title elements, as
            returned by get_job_posts(). Each element must support
            ``select('a')``.

    Returns:
        A list of the response objects returned by ``requests.get``, one per
        anchor that carries an ``href``.
    '''
    from urllib.parse import urljoin

    job_pages = []
    for items in posts:
        for listing in items:
            for link in listing.select('a'):
                href = link.get('href')
                if not href:
                    # An anchor without an href would otherwise be fetched
                    # as the literal URL ".../None".
                    continue
                # urljoin resolves root-relative hrefs without doubling the slash.
                url = urljoin('http://www.cybercoders.com/', href)
                job_pages.append(requests.get(url, timeout=30))
    print("Total pages collected: %s \n" % (len(job_pages)))
    return job_pages
def write_job_text(pages, path='job_text_dba.txt'):
    '''Write the text of every ``div.job-details`` element in the pages to a file.

    Args:
        pages: Iterable of objects exposing the page HTML as ``.text`` (the
            responses collected by get_job_pages()).
        path: File to (over)write; it is opened in ``'w'`` mode.

    Returns:
        None. The number of postings written is printed.
    '''
    job_counter = 0
    # Open the file inside the function so it is always closed (even if
    # parsing raises) and so the function can be called more than once.
    with open(path, 'w') as job_file:
        for page in pages:
            soup1 = BeautifulSoup(page.text, 'lxml')
            for job_desc in soup1.select('div.job-details'):
                job_counter += 1
                job_file.write(str(job_desc.text))
    print("Total job postings written to file: %s \n" % job_counter)
# Run the pipeline: search-result pages -> individual job pages -> text file.
j = get_job_posts()
k = get_job_pages(j)
write_job_text(k)
In [2]:
import re
import collections
# Ugly, but effective method to filter common words from a corpus of text.
# The first group is a general English stop-word list; the last, long
# statement adds terms specific to the scraped job postings.
# NOTE: every entry must be followed by a comma -- adjacent string literals
# are silently concatenated into a single (bogus) entry.
stopwords = ['a', 'about', 'above', 'across', 'after', 'afterwards']
stopwords += ['again', 'against', 'all', 'almost', 'alone', 'along']
stopwords += ['already', 'also', 'although', 'always', 'am', 'among']
stopwords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']
stopwords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']
stopwords += ['are', 'around', 'as', 'at', 'back', 'be', 'became']
stopwords += ['because', 'become', 'becomes', 'becoming', 'been']
stopwords += ['before', 'beforehand', 'behind', 'being', 'below']
stopwords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']
stopwords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']
stopwords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']
stopwords += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due']
stopwords += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else']
stopwords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']
stopwords += ['every', 'everyone', 'everything', 'everywhere', 'except']
stopwords += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first']
stopwords += ['five', 'for', 'former', 'formerly', 'forty', 'found']
stopwords += ['four', 'from', 'front', 'full', 'further', 'get', 'give']
stopwords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']
stopwords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']
stopwords += ['herself', 'him', 'himself', 'his', 'how', 'however']
stopwords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']
stopwords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']
stopwords += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made']
stopwords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']
stopwords += ['more', 'moreover', 'most', 'mostly', 'move', 'much']
stopwords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']
stopwords += ['nevertheless', 'next', 'nine', 'no', 'nobody', 'none']
stopwords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']
stopwords += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or']
stopwords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']
stopwords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']
stopwords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']
stopwords += ['seeming', 'seems', 'serious', 'several', 'she', 'should']
stopwords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']
stopwords += ['some', 'somehow', 'someone', 'something', 'sometime']
stopwords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']
stopwords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']
stopwords += ['then', 'thence', 'there', 'thereafter', 'thereby', 'today']
stopwords += ['therefore', 'therein', 'thereupon', 'these', 'they']
stopwords += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three']
stopwords += ['three', 'through', 'throughout', 'thru', 'thus', 'to']
stopwords += ['together', 'too', 'top', 'toward', 'towards', 'twelve']
stopwords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']
stopwords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what', 'want']
stopwords += ['whatever', 'when', 'whence', 'whenever', 'where']
stopwords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']
stopwords += ['wherever', 'whether', 'which', 'while', 'whither', 'who']
stopwords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']
stopwords += ['within', 'without','work', 'would', 'yet', 'you', 'your']
stopwords += ['yours', 'yourself', 'yourselves']
stopwords += ['experience', 'job', 'details', 'u', 'posted', 'need', 'based', 'years', 'new', 'world', 'knowledge', 'join',
              'reasons', 'salary', 'position', 'skills', 'exciting', 'market', 'small', 'excellent', 'great', 'year',
              'compensation', 'love', 'provide', 'usl', 'highly', 'e', 'day', 'growth', 'way', 'people', 'facing',
              'big', 'll', 'g', 'amazing', 'passion', '14', 'office', 'vacation', 'located', '5', 'medical', 'pto', 'offer',
              'used', 'exposure', 'captial', 'ca', 'help', 'funded', 'ideally', 'country', 'preferred', 'immediately', 'search',
              'active', 'improve', '100', '150k', 'dental', 'innovation', 'desire', 'clean', '401k', 'life', 'd', 'paid', 'non',
              'founded', 'changing', 'helps', '24', 'lunches', 'talent', 'familiarity', 'generous', '120', 'good', 'early',
              'pay', 'l', 'matching', 'advice', 'directly', 'ma', 'rapidly', 'don', 'money', 'experienced', '3', 'strong',
              'read', 'company', 'positionat', 'doing', '2015', '2', 'opportunity', '1', 'currently', 'social', 'oriented',
              'like', 'self', 'vision', 'clients', 'understanding', 'impact', 'using', 't', 'passionate', 'fit', 'related',
              'employees', 'health', 'make', 'hands', 'real', 'just', 'investment', 'comprehensive', 'doe', 'continue', 'extremely',
              'com', 'vc', 'flagship', 'problems', 'effective', 'afficient', '401', 'schedule', 'taking', 'methods', 'commercial',
              'success', 'k', 'solid', 'quickly', 'outings', 'successful', 'create', 'provided', 'culture', '4', 'fun', 'bs',
              'awesome', 'users', 'talented', 'doingyou', 'b', 'free', 'grow', 'boston', 'san', 'francisco', 'retreats', 'offers',
              'deliver', 'holiday', 'holidays', 'lots', '104k', 'frisco', 'ambitious', 'minimize', 'bring', 'sprit', 'bright',
              'dinners', 'm', 'catered', 'uses', 'perks', 'ms', 'happy', 'know', 'opportunites', 'chat', 'days', 'revolutionizing',
              'crowd', 'tx', 'enable', 'seeking', 'seek', 'interviewing', 'soon', 'sized', 'award', 'lopez', 'needed', 'preferably',
              'doingas', 'making', 'r', '2009', 'starting', 'pa', 'apply', 'authorized', 'looking', 'end', 'benefits', '12', 'growing',
              'hire', 'ideal', 'proven', 'aspects', 'deadlines', 'promise', 'fast', 'live', 'bonus', 'hours', 'best', 'plus', 'user',
              'high', 'ability', 'level', 'cutting', 'edge', 'oriented', 'like', 'us1', 'large', 'equity', 'creative', 'options',
              'casual', 'future', 'candidate', '10', 'cybercoders', '0', 'place', 'area', 'group', 'youcompetitive', 'occasional',
              'fully', 'use', 'excited', 'bachelor', 'tv', 'better', 'employee','applicants', 'team', 'time', 'startup',
              'customer', 'advanced', 'stock', 'management', '2016']
# Read the scraped text once and close the file right away.
with open('job_text_dba.txt') as job_text:
    raw_text = job_text.read()
# Insert a space at lower->Upper and UPPER->Upper+lower boundaries so that
# run-together words such as "doingYou" are tokenised separately. This must
# happen before lower-casing, and its result must be what gets tokenised.
spaced_text = re.sub(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', raw_text)
words = re.findall(r'\w+', spaced_text.lower())
x = collections.Counter(words)
# Deleting a key that is absent from a Counter is a no-op, so no membership
# check is needed here.
for sword in stopwords:
    del x[sword]
x.most_common()
Out[2]:
In [4]:
# A much easier technique to count wanted words than the method posted above...
# NOTE: every entry must be followed by a comma (adjacent string literals are
# concatenated) and must be lower-case, because the corpus is lower-cased
# before matching.
wanted = ['automated', 'database', 'flask', 'postgresql', 'javascript', 'linux', 'unix', 'ansible',
          'puppet','rhel', 'coreos', 'aws', 'boto', 'selenium', 'mysql', 'restful', 'gui', 'artificial',
          'intelligence', 'django', 'api', 'full', 'stack', 'dba', 'certification', 'oracle', '11g', 'mssql',
          'automation', 'sql', 'data', 'security', 'testing', 'test', 'git', 'json', 'cloud', 'devops', 'celery',
          'analytics', 'visualization', 'redis', 'mongodb', 'nosql', 'openstack', 'aws', 'salt', 'analysis', 'golang',
          'qa', 'distributed', 'chef', 'docker', 'hadoop', 'sqlite', 'monitoring', 'micro', 'python', 'ruby', 'ibm',
          'jenkins', 'travis', 'git', 'ec2', 's3', 'amazon', 'azure' , 'openstack', 'proxmox', 'vm']
# Build the lookup set once; testing membership in the list would rescan it
# for every token in the corpus.
wanted_terms = frozenset(wanted)
cnt = collections.Counter()
# Tokenise the lower-cased corpus; the with-block closes the file handle.
with open('job_text_dba.txt') as job_text:
    words = re.findall(r'\w+', job_text.read().lower())
for word in words:
    if word in wanted_terms:
        cnt[word] += 1
print(cnt)
In [ ]: