This is a little bit messy, but I wanted to get something done quick while the kids were in school today. Just a basic scrape job on cybercoders (which I hear isn't all that great). Goal is to feed IBM's watson a lot of data and have it automagicially spit out relevant information. Easy peasy.


In [1]:
import os 
import requests
from bs4 import BeautifulSoup

def get_job_posts():
    '''This method goes to cyber coders and returns a list of the jobs as listed on the main
    page.'''
    job_posts = []
    for x in range(12):
        res = requests.get('http://www.cybercoders.com/search/?page='+ str(x) +'&searchterms=python&searchlocation=&newsearch=true&sorttype=')
        soup = BeautifulSoup(res.text, 'lxml')
        posts = soup.select('div.job-title')
        job_posts.append(posts)
    print("Total pages scraped: %s \n" %len(job_posts))
    return job_posts

def get_job_pages(posts):
    '''This method dives into the listings and pulls the href... then it appends the html for 
    processing in the write_job_text method '''
    job_pages = []
    for items in posts:
        for links in items:
            #print(links.select('a'))
            for link in links.select('a'):
                ipage = requests.get('http://www.cybercoders.com/%s' %link.get('href'))
                job_pages.append(ipage)
    print("Total pages collected: %s \n" %(len(job_pages)))
    return job_pages


job_file = open('job_text.txt', 'w')

def write_job_text(pages):
    '''This method simply pastes the text of the div job details to a text file... '''
    job_counter = 0
    for page in pages:
        soup1 = BeautifulSoup(page.text, 'lxml')
        job_details = soup1.select('div.job-details')
        for job_desc in job_details:
            #print(type(job_desc))
            #print(job_desc.text)
            job_counter += 1
            job_file.write(str(job_desc.text))
    print("Total job postings written to file: %s \n" %job_counter)
    job_file.close()

j = get_job_posts()
k = get_job_pages(j)
write_job_text(k)


Total pages scraped: 12 

Total pages collected: 240 

Total job postings written to file: 240 


In [2]:
import re
import collections


# Ugly, but effective method to filter common words from a corpus of text.
stopwords = ['a', 'about', 'above', 'across', 'after', 'afterwards']
stopwords += ['again', 'against', 'all', 'almost', 'alone', 'along']
stopwords += ['already', 'also', 'although', 'always', 'am', 'among']
stopwords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']
stopwords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']
stopwords += ['are', 'around', 'as', 'at', 'back', 'be', 'became']
stopwords += ['because', 'become', 'becomes', 'becoming', 'been']
stopwords += ['before', 'beforehand', 'behind', 'being', 'below']
stopwords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']
stopwords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']
stopwords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']
stopwords += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due']
stopwords += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else']
stopwords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']
stopwords += ['every', 'everyone', 'everything', 'everywhere', 'except']
stopwords += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first']
stopwords += ['five', 'for', 'former', 'formerly', 'forty', 'found']
stopwords += ['four', 'from', 'front', 'full', 'further', 'get', 'give']
stopwords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']
stopwords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']
stopwords += ['herself', 'him', 'himself', 'his', 'how', 'however']
stopwords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']
stopwords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']
stopwords += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made']
stopwords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']
stopwords += ['more', 'moreover', 'most', 'mostly', 'move', 'much']
stopwords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']
stopwords += ['nevertheless', 'next', 'nine', 'no', 'nobody', 'none']
stopwords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']
stopwords += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or']
stopwords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']
stopwords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']
stopwords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']
stopwords += ['seeming', 'seems', 'serious', 'several', 'she', 'should']
stopwords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']
stopwords += ['some', 'somehow', 'someone', 'something', 'sometime']
stopwords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']
stopwords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']
stopwords += ['then', 'thence', 'there', 'thereafter', 'thereby', 'today']
stopwords += ['therefore', 'therein', 'thereupon', 'these', 'they']
stopwords += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three']
stopwords += ['three', 'through', 'throughout', 'thru', 'thus', 'to']
stopwords += ['together', 'too', 'top', 'toward', 'towards', 'twelve']
stopwords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']
stopwords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what', 'want']
stopwords += ['whatever', 'when', 'whence', 'whenever', 'where']
stopwords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']
stopwords += ['wherever', 'whether', 'which', 'while', 'whither', 'who']
stopwords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']
stopwords += ['within', 'without','work', 'would', 'yet', 'you', 'your']
stopwords += ['yours', 'yourself', 'yourselves']
stopwords += ['experience', 'job', 'details', 'u', 'posted', 'need', 'based', 'years', 'new', 'world', 'knowledge', 'join', 
             'reasons', 'salary', 'position', 'skills', 'exciting', 'market', 'small', 'excellent', 'great', 'year', 
             'compensation', 'love', 'provide', 'usl', 'highly', 'e', 'day', 'growth', 'way', 'people', 'facing', 
             'big', 'll', 'g', 'amazing', 'passion', '14', 'office', 'vacation', 'located', '5', 'medical', 'pto', 'offer',
             'used', 'exposure', 'captial', 'ca', 'help', 'funded', 'ideally', 'country', 'preferred', 'immediately', 'search',
             'active', 'improve', '100', '150k', 'dental', 'innovation', 'desire', 'clean', '401k', 'life', 'd', 'paid', 'non',
             'founded', 'changing', 'helps', '24', 'lunches', 'talent', 'familiarity', 'generous', '120', 'good', 'early',
             'pay', 'l', 'matching', 'advice', 'directly', 'ma', 'rapidly', 'don', 'money', 'experienced', '3', 'strong',
             'read', 'company', 'positionat', 'doing', '2015', '2', 'opportunity', '1', 'currently', 'social', 'oriented'
             'like', 'self', 'vision', 'clients', 'understanding', 'impact', 'using', 't', 'passionate', 'fit', 'related',
             'employees', 'health', 'make', 'hands', 'real', 'just', 'investment', 'comprehensive', 'doe', 'continue', 'extremely',
             'com', 'vc', 'flagship', 'problems', 'effective', 'afficient', '401', 'schedule', 'taking', 'methods', 'commercial', 
             'success', 'k', 'solid', 'quickly', 'outings', 'successful', 'create', 'provided', 'culture', '4', 'fun', 'bs',
             'awesome', 'users', 'talented', 'doingyou', 'b', 'free', 'grow', 'boston', 'san', 'francisco', 'retreats', 'offers',
             'deliver', 'holiday', 'holidays', 'lots', '104k', 'frisco', 'ambitious', 'minimize', 'bring', 'sprit', 'bright',
             'dinners', 'm', 'catered', 'uses', 'perks', 'ms', 'happy', 'know', 'opportunites', 'chat', 'days', 'revolutionizing',
             'crowd', 'tx', 'enable', 'seeking', 'seek', 'interviewing', 'soon', 'sized', 'award', 'lopez', 'needed', 'preferably',
             'doingas', 'making', 'r', '2009', 'starting', 'pa', 'apply', 'authorized', 'looking', 'end', 'benefits', '12', 'growing',
             'hire', 'ideal', 'proven', 'aspects', 'deadlines', 'promise', 'fast', 'live', 'bonus', 'hours', 'best', 'plus', 'user',
             'high', 'ability', 'level', 'cutting', 'edge', 'oriented', 'like', 'us1', 'large', 'equity', 'creative', 'options',
             'casual', 'future', 'candidate', '10', 'cybercoders', '0', 'place', 'area', 'group', 'youcompetitive', 'occasional',
             'fully', 'use', 'excited', 'bachelor', 'tv', 'better', 'employee','applicants', 'team', 'time', 'startup',
             'customer', 'advanced', 'stock', 'management']


words = re.sub(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', open('job_text.txt').read())
words = re.findall('\w+', open('job_text.txt').read().lower())
print(type(collections.Counter(words)))
c = collections.Counter(words)
for sword in stopwords:
    del c[sword]
c.most_common()


<class 'collections.Counter'>
Out[2]:
[('python', 808),
 ('development', 394),
 ('software', 358),
 ('data', 319),
 ('developer', 314),
 ('engineer', 231),
 ('django', 212),
 ('web', 209),
 ('working', 187),
 ('design', 160),
 ('product', 159),
 ('competitive', 152),
 ('platform', 147),
 ('technologies', 139),
 ('environment', 136),
 ('systems', 128),
 ('technology', 121),
 ('build', 117),
 ('code', 112),
 ('applications', 111),
 ('javascript', 106),
 ('2016', 104),
 ('stack', 98),
 ('services', 97),
 ('start', 97),
 ('java', 97),
 ('engineering', 96),
 ('senior', 93),
 ('mobile', 93),
 ('industry', 92),
 ('cloud', 89),
 ('sql', 89),
 ('building', 89),
 ('test', 86),
 ('science', 85),
 ('linux', 84),
 ('application', 83),
 ('solutions', 82),
 ('learning', 82),
 ('backend', 80),
 ('c', 78),
 ('products', 76),
 ('testing', 73),
 ('agile', 68),
 ('programming', 65),
 ('02', 65),
 ('develop', 65),
 ('aws', 65),
 ('api', 64),
 ('engineers', 63),
 ('performance', 63),
 ('base', 63),
 ('mysql', 62),
 ('architecture', 62),
 ('tools', 62),
 ('technical', 62),
 ('developers', 61),
 ('right', 58),
 ('machine', 58),
 ('processing', 57),
 ('scalable', 56),
 ('features', 55),
 ('business', 54),
 ('databases', 54),
 ('companies', 53),
 ('open', 52),
 ('framework', 52),
 ('support', 51),
 ('lead', 51),
 ('leading', 51),
 ('integration', 50),
 ('projects', 49),
 ('analytics', 49),
 ('degree', 49),
 ('implement', 49),
 ('automation', 48),
 ('including', 48),
 ('developing', 48),
 ('package', 47),
 ('flexible', 47),
 ('quality', 45),
 ('nosql', 45),
 ('server', 44),
 ('database', 43),
 ('source', 43),
 ('git', 42),
 ('01', 41),
 ('driven', 41),
 ('security', 41),
 ('html', 41),
 ('distributed', 41),
 ('postgresql', 40),
 ('innovative', 40),
 ('financial', 40),
 ('infrastructure', 39),
 ('complex', 39),
 ('object', 38),
 ('availability', 38),
 ('role', 38),
 ('04', 37),
 ('maintain', 37),
 ('requirements', 37),
 ('resume', 37),
 ('css', 37),
 ('communication', 36),
 ('career', 36),
 ('customers', 36),
 ('huge', 35),
 ('scale', 35),
 ('unit', 35),
 ('mongodb', 35),
 ('required', 35),
 ('multiple', 34),
 ('redis', 34),
 ('js', 34),
 ('service', 34),
 ('comfortable', 34),
 ('apis', 33),
 ('practices', 33),
 ('production', 33),
 ('send', 33),
 ('professional', 33),
 ('class', 33),
 ('backed', 32),
 ('room', 32),
 ('writing', 32),
 ('scientist', 31),
 ('teams', 31),
 ('deployment', 31),
 ('paced', 30),
 ('unix', 30),
 ('restful', 30),
 ('languages', 30),
 ('plan', 30),
 ('leadership', 30),
 ('insurance', 29),
 ('ruby', 29),
 ('core', 29),
 ('space', 29),
 ('flask', 29),
 ('tech', 29),
 ('interested', 29),
 ('platforms', 29),
 ('existing', 29),
 ('creating', 28),
 ('jquery', 28),
 ('background', 28),
 ('coding', 28),
 ('rest', 28),
 ('project', 27),
 ('write', 27),
 ('learn', 26),
 ('enterprise', 26),
 ('businesses', 26),
 ('different', 26),
 ('similar', 26),
 ('integrate', 26),
 ('analysis', 26),
 ('beautiful', 26),
 ('mentor', 25),
 ('process', 25),
 ('participate', 25),
 ('field', 25),
 ('custom', 25),
 ('cassandra', 25),
 ('frameworks', 25),
 ('relational', 25),
 ('algorithms', 25),
 ('automated', 24),
 ('various', 24),
 ('needs', 24),
 ('expertise', 24),
 ('collaborative', 24),
 ('scalability', 24),
 ('video', 24),
 ('transaction', 24),
 ('php', 24),
 ('relocation', 24),
 ('remote', 24),
 ('hard', 24),
 ('solving', 24),
 ('collaborate', 23),
 ('language', 23),
 ('tests', 23),
 ('healthcare', 23),
 ('000', 22),
 ('ip', 22),
 ('contribute', 22),
 ('implementation', 22),
 ('generation', 22),
 ('components', 22),
 ('expert', 22),
 ('online', 22),
 ('continuous', 22),
 ('latest', 22),
 ('built', 21),
 ('information', 21),
 ('daily', 21),
 ('million', 21),
 ('amazon', 21),
 ('manage', 21),
 ('junior', 21),
 ('client', 21),
 ('hadoop', 21),
 ('venture', 21),
 ('windows', 21),
 ('able', 21),
 ('members', 21),
 ('proficiency', 21),
 ('global', 20),
 ('rapid', 20),
 ('angular', 20),
 ('scripting', 20),
 ('stage', 20),
 ('tcp', 20),
 ('care', 20),
 ('network', 20),
 ('include', 20),
 ('optimization', 20),
 ('html5', 19),
 ('streaming', 19),
 ('word', 19),
 ('following', 19),
 ('capital', 19),
 ('storage', 19),
 ('location', 19),
 ('candidates', 19),
 ('dynamic', 19),
 ('architect', 19),
 ('app', 19),
 ('significant', 19),
 ('ve', 18),
 ('05', 18),
 ('solve', 18),
 ('things', 18),
 ('mid', 18),
 ('allows', 18),
 ('qa', 18),
 ('tool', 18),
 ('assistance', 18),
 ('designing', 18),
 ('largest', 18),
 ('saas', 18),
 ('believe', 18),
 ('program', 18),
 ('necessary', 18),
 ('challenging', 18),
 ('internet', 18),
 ('designers', 17),
 ('smart', 17),
 ('opportunities', 17),
 ('delivery', 17),
 ('issues', 17),
 ('control', 17),
 ('home', 17),
 ('millions', 17),
 ('near', 17),
 ('communicate', 17),
 ('format', 17),
 ('predictive', 17),
 ('challenges', 17),
 ('solution', 17),
 ('internal', 17),
 ('ground', 16),
 ('intelligence', 16),
 ('providers', 16),
 ('member', 16),
 ('teamwhat', 16),
 ('add', 16),
 ('access', 16),
 ('headquartered', 16),
 ('contributing', 16),
 ('potential', 16),
 ('modeling', 16),
 ('selenium', 16),
 ('analytical', 16),
 ('forward', 16),
 ('scaling', 16),
 ('unlimited', 16),
 ('media', 16),
 ('city', 16),
 ('interface', 16),
 ('park', 16),
 ('optimize', 16),
 ('energy', 16),
 ('efficient', 16),
 ('thinking', 16),
 ('proprietary', 16),
 ('advertising', 16),
 ('postgres', 16),
 ('cross', 16),
 ('lives', 16),
 ('uswe', 15),
 ('marketplace', 15),
 ('maintaining', 15),
 ('models', 15),
 ('define', 15),
 ('basic', 15),
 ('consumer', 15),
 ('oracle', 15),
 ('alongside', 15),
 ('balance', 15),
 ('apps', 15),
 ('experts', 15),
 ('unique', 15),
 ('set', 15),
 ('come', 15),
 ('github', 15),
 ('credit', 15),
 ('fastest', 15),
 ('austin', 15),
 ('private', 15),
 ('disruptive', 15),
 ('chance', 15),
 ('results', 15),
 ('motivated', 14),
 ('turn', 14),
 ('mission', 14),
 ('ready', 14),
 ('lunch', 14),
 ('mining', 14),
 ('state', 14),
 ('expand', 14),
 ('independently', 14),
 ('methodologies', 14),
 ('funding', 14),
 ('computing', 14),
 ('offering', 14),
 ('cases', 14),
 ('enjoy', 14),
 ('problem', 14),
 ('consumers', 14),
 ('managing', 14),
 ('ensure', 14),
 ('plans', 14),
 ('external', 14),
 ('140k', 14),
 ('local', 14),
 ('leader', 14),
 ('skilled', 13),
 ('person', 13),
 ('functional', 13),
 ('elasticsearch', 13),
 ('meet', 13),
 ('chef', 13),
 ('tdd', 13),
 ('environments', 13),
 ('ideas', 13),
 ('proficient', 13),
 ('collection', 13),
 ('css3', 13),
 ('managers', 13),
 ('provides', 13),
 ('equivalent', 13),
 ('spirit', 13),
 ('simple', 13),
 ('nice', 13),
 ('cool', 13),
 ('focus', 13),
 ('written', 13),
 ('downtown', 13),
 ('basis', 13),
 ('functions', 13),
 ('snacks', 13),
 ('cycle', 13),
 ('york', 13),
 ('bonuses', 12),
 ('track', 12),
 ('specifications', 12),
 ('rewarded', 12),
 ('willing', 12),
 ('packages', 12),
 ('applying', 12),
 ('statistical', 12),
 ('critical', 12),
 ('brand', 12),
 ('speak', 12),
 ('art', 12),
 ('organization', 12),
 ('celery', 12),
 ('environmentwhat', 12),
 ('planning', 12),
 ('sports', 12),
 ('flexibility', 12),
 ('50', 12),
 ('reach', 12),
 ('nyc', 12),
 ('pluswhat', 12),
 ('libraries', 12),
 ('partner', 12),
 ('massive', 12),
 ('complete', 12),
 ('mvc', 12),
 ('rabbitmq', 12),
 ('analyze', 12),
 ('finance', 12),
 ('patients', 11),
 ('annual', 11),
 ('responsible', 11),
 ('debug', 11),
 ('7', 11),
 ('available', 11),
 ('ownership', 11),
 ('interact', 11),
 ('foundation', 11),
 ('investors', 11),
 ('payment', 11),
 ('extend', 11),
 ('revenue', 11),
 ('robust', 11),
 ('marketing', 11),
 ('received', 11),
 ('100k', 11),
 ('environmentso', 11),
 ('servers', 11),
 ('utilizing', 11),
 ('coverage', 11),
 ('saving', 11),
 ('sets', 11),
 ('developmentwhat', 11),
 ('difference', 11),
 ('key', 11),
 ('important', 11),
 ('google', 11),
 ('winning', 11),
 ('secure', 11),
 ('week', 11),
 ('food', 11),
 ('closely', 11),
 ('view', 11),
 ('positionrequirements', 11),
 ('known', 11),
 ('public', 11),
 ('deep', 11),
 ('6', 11),
 ('floor', 11),
 ('loves', 11),
 ('benefit', 10),
 ('santa', 10),
 ('away', 10),
 ('node', 10),
 ('exceptional', 10),
 ('helping', 10),
 ('store', 10),
 ('functionality', 10),
 ('voice', 10),
 ('ui', 10),
 ('cs', 10),
 ('documentation', 10),
 ('improvements', 10),
 ('content', 10),
 ('embedded', 10),
 ('providing', 10),
 ('familiar', 10),
 ('ios', 10),
 ('pragmatism', 10),
 ('connect', 10),
 ('interpersonal', 10),
 ('principals', 10),
 ('ux', 10),
 ('backbone', 10),
 ('brands', 10),
 ('implementing', 10),
 ('qualified', 10),
 ('globally', 10),
 ('sources', 10),
 ('doingwe', 10),
 ('shape', 10),
 ('tons', 10),
 ('webdriver', 10),
 ('firms', 10),
 ('joining', 10),
 ('exercising', 10),
 ('patterns', 10),
 ('individually', 10),
 ('asynchronous', 10),
 ('second', 10),
 ('collaboratively', 10),
 ('addition', 10),
 ('billion', 10),
 ('record', 10),
 ('automate', 10),
 ('stocked', 10),
 ('close', 10),
 ('irvine', 10),
 ('extensive', 10),
 ('monitoring', 10),
 ('processes', 10),
 ('experiences', 10),
 ('device', 10),
 ('healthy', 10),
 ('education', 10),
 ('sdlc', 10),
 ('encryption', 10),
 ('facebook', 10),
 ('suite', 10),
 ('emergency', 10),
 ('multi', 10),
 ('input', 9),
 ('initiatives', 9),
 ('decision', 9),
 ('having', 9),
 ('jmeter', 9),
 ('event', 9),
 ('direction', 9),
 ('http', 9),
 ('takes', 9),
 ('month', 9),
 ('reading', 9),
 ('past', 9),
 ('especially', 9),
 ('reviews', 9),
 ('structures', 9),
 ('pipelines', 9),
 ('minimum', 9),
 ('actively', 9),
 ('engagement', 9),
 ('operations', 9),
 ('assist', 9),
 ('held', 9),
 ('visibility', 9),
 ('70', 9),
 ('weekly', 9),
 ('contract', 9),
 ('ad', 9),
 ('marketers', 9),
 ('strategic', 9),
 ('docker', 9),
 ('master', 9),
 ('25', 9),
 ('document', 9),
 ('minimal', 9),
 ('speed', 9),
 ('play', 9),
 ('lending', 9),
 ('integrating', 9),
 ('sqlalchemy', 9),
 ('grinder', 9),
 ('capacity', 9),
 ('scrum', 9),
 ('hundreds', 9),
 ('encouraged', 9),
 ('bachelors', 9),
 ('constantly', 9),
 ('site', 9),
 ('mysqlwhat', 9),
 ('apache', 9),
 ('levels', 9),
 ('game', 9),
 ('angularjs', 9),
 ('elegant', 9),
 ('brightest', 9),
 ('devops', 9),
 ('monica', 9),
 ('doing1', 9),
 ('bleeding', 9),
 ('messaging', 9),
 ('composed', 9),
 ('gym', 9),
 ('json', 9),
 ('review', 9),
 ('sr', 9),
 ('areas', 9),
 ('stability', 9),
 ('15', 8),
 ('points', 8),
 ('lucene', 8),
 ('setting', 8),
 ('analyzing', 8),
 ('medicine', 8),
 ('entire', 8),
 ('ecommerce', 8),
 ('seattle', 8),
 ('desired', 8),
 ('handle', 8),
 ('tell', 8),
 ('savvy', 8),
 ('effectively', 8),
 ('challenge', 8),
 ('leverage', 8),
 ('digital', 8),
 ('apple', 8),
 ('interviews', 8),
 ('delivering', 8),
 ('120k', 8),
 ('serviceswhat', 8),
 ('responsibilities', 8),
 ('participating', 8),
 ('entrepreneurs', 8),
 ('creators', 8),
 ('administration', 8),
 ('single', 8),
 ('running', 8),
 ('notch', 8),
 ('buy', 8),
 ('bash', 8),
 ('option', 8),
 ('pride', 8),
 ('tier', 8),
 ('pandas', 8),
 ('matlab', 8),
 ('easy', 8),
 ('idea', 8),
 ('programs', 8),
 ('substantial', 8),
 ('recognized', 8),
 ('couple', 8),
 ('phone', 8),
 ('kitchen', 8),
 ('interesting', 8),
 ('partners', 8),
 ('tasks', 8),
 ('immediate', 8),
 ('power', 8),
 ('bringing', 8),
 ('applied', 8),
 ('efforts', 8),
 ('coming', 8),
 ('moving', 8),
 ('300', 8),
 ('journey', 8),
 ('payments', 8),
 ('ok', 8),
 ('focused', 8),
 ('image', 8),
 ('ny', 8),
 ('load', 8),
 ('lifecycle', 8),
 ('statistics', 8),
 ('ones', 8),
 ('component', 8),
 ('frontend', 8),
 ('ror', 8),
 ('worked', 8),
 ('scala', 8),
 ('email', 8),
 ('skill', 8),
 ('individuals', 8),
 ('fortune', 8),
 ('ups', 8),
 ('21', 8),
 ('coordinate', 8),
 ('understand', 8),
 ('feel', 7),
 ('producing', 7),
 ('performing', 7),
 ('welcome', 7),
 ('nginx', 7),
 ('continuing', 7),
 ('going', 7),
 ('handling', 7),
 ('dedicated', 7),
 ('essential', 7),
 ('simply', 7),
 ('complexity', 7),
 ('mentoring', 7),
 ('true', 7),
 ('receiving', 7),
 ('youwe', 7),
 ('etl', 7),
 ('reliability', 7),
 ('experiencewhat', 7),
 ('comprised', 7),
 ('farming', 7),
 ('robotics', 7),
 ('initiative', 7),
 ('rdbms', 7),
 ('palo', 7),
 ('zero', 7),
 ('worldwide', 7),
 ('bug', 7),
 ('truly', 7),
 ('requirement', 7),
 ('staff', 7),
 ('function', 7),
 ('county', 7),
 ('party', 7),
 ('resolve', 7),
 ('extreme', 7),
 ('20', 7),
 ('clearance', 7),
 ('previous', 7),
 ('usfast', 7),
 ('students', 7),
 ('brings', 7),
 ('bay', 7),
 ('history', 7),
 ('principles', 7),
 ('events', 7),
 ('deal', 7),
 ('130k', 7),
 ('visualization', 7),
 ('costs', 7),
 ('mix', 7),
 ('thought', 7),
 ('shell', 7),
 ('risk', 7),
 ('techniques', 7),
 ('privately', 7),
 ('csswhat', 7),
 ('training', 7),
 ('sdet', 7),
 ('solr', 7),
 ('increase', 7),
 ('stores', 7),
 ('transforming', 7),
 ('110', 7),
 ('invent', 7),
 ('volume', 7),
 ('improvement', 7),
 ('lauren', 7),
 ('execute', 7),
 ('travel', 7),
 ('semantic', 7),
 ('collaboration', 7),
 ('alto', 7),
 ('order', 7),
 ('regular', 7),
 ('established', 7),
 ('typically', 7),
 ('rates', 7),
 ('caltrain', 7),
 ('scientists', 6),
 ('recognition', 6),
 ('disrupt', 6),
 ('lamp', 6),
 ('difficult', 6),
 ('relevant', 6),
 ('ship', 6),
 ('incredible', 6),
 ('main', 6),
 ('house', 6),
 ('places', 6),
 ('android', 6),
 ('you1', 6),
 ('etcwhat', 6),
 ('hear', 6),
 ('assisting', 6),
 ('secret', 6),
 ('relaxed', 6),
 ('capabilities', 6),
 ('reporting', 6),
 ('interacting', 6),
 ('technologieswhat', 6),
 ('gitwhat', 6),
 ('identify', 6),
 ('minds', 6),
 ('growthwhat', 6),
 ('masters', 6),
 ('pre', 6),
 ('literally', 6),
 ('error', 6),
 ('transactions', 6),
 ('style', 6),
 ('sell', 6),
 ('blockchain', 6),
 ('additional', 6),
 ('value', 6),
 ('supporting', 6),
 ('phd', 6),
 ('microsoft', 6),
 ('run', 6),
 ('orm', 6),
 ('attitude', 6),
 ('sharing', 6),
 ('expect', 6),
 ('stehle', 6),
 ('asap', 6),
 ('hackers', 6),
 ('participation', 6),
 ('benefits3', 6),
 ('bigger', 6),
 ('los', 6),
 ('balancewhat', 6),
 ('primary', 6),
 ('traditional', 6),
 ('really', 6),
 ('player', 6),
 ('california', 6),
 ('specification', 6),
 ('3rd', 6),
 ('ins', 6),
 ('visionso', 6),
 ('distribution', 6),
 ('bugs', 6),
 ('assess', 6),
 ('director', 6),
 ('developments', 6),
 ('billions', 6),
 ('precision', 6),
 ('orange', 6),
 ('allow', 6),
 ('newest', 6),
 ('financing', 6),
 ('perl', 6),
 ('bootstrap', 6),
 ('oo', 6),
 ('talk', 6),
 ('title', 6),
 ('ben', 6),
 ('accurate', 6),
 ('discuss', 6),
 ('launch', 6),
 ('youstrong', 6),
 ('position1', 6),
 ('research', 6),
 ('engineerlocation', 6),
 ('keeping', 6),
 ('compliance', 6),
 ('jira', 6),
 ('version', 6),
 ('policy', 6),
 ('marionette', 6),
 ('memcache', 6),
 ('beach', 6),
 ('practiceswhat', 6),
 ('breakfast', 6),
 ('sf', 6),
 ('angeles', 6),
 ('generating', 6),
 ('wants', 6),
 ('concurrency', 6),
 ('chicago', 6),
 ('blocks', 6),
 ('traffic', 6),
 ('optimal', 6),
 ('lot', 6),
 ('gaming', 6),
 ('28', 6),
 ('standard', 6),
 ('community', 6),
 ('cyber', 6),
 ('90k', 6),
 ('shopping', 6),
 ('outs', 6),
 ('30', 6),
 ('gain', 6),
 ('safety', 6),
 ('technologists', 6),
 ('page', 6),
 ('tackling', 6),
 ('physicians', 6),
 ('point', 6),
 ('ecosystem', 6),
 ('greater', 6),
 ('backgrounds', 6),
 ('does', 6),
 ('station', 6),
 ('ibm', 6),
 ('positionmore', 6),
 ('creation', 6),
 ('releases', 6),
 ('sharp', 6),
 ('heart', 6),
 ('plenty', 6),
 ('mathematics', 6),
 ('pluses', 6),
 ('improving', 6),
 ('look', 6),
 ('o', 5),
 ('talents', 5),
 ('responsive', 5),
 ('query', 5),
 ('major', 5),
 ('depending', 5),
 ('leave', 5),
 ('couchdb', 5),
 ('pyramid', 5),
 ('retailers', 5),
 ('layers', 5),
 ('commuter', 5),
 ('db', 5),
 ('modular', 5),
 ('fullstack', 5),
 ('involving', 5),
 ('jenkins', 5),
 ('diving', 5),
 ('firm', 5),
 ('hit', 5),
 ('integrated', 5),
 ('gets', 5),
 ('minded', 5),
 ('sass', 5),
 ('redwood', 5),
 ('environmentif', 5),
 ('troubleshoot', 5),
 ('retail', 5),
 ('evaluation', 5),
 ('moderate', 5),
 ('endless', 5),
 ('meetings', 5),
 ('achieving', 5),
 ('monitors', 5),
 ('wellness', 5),
 ('schedules', 5),
 ('abundant', 5),
 ('mentality', 5),
 ('groups', 5),
 ('excel', 5),
 ('browser', 5),
 ('leveraging', 5),
 ('hbase', 5),
 ('ansible', 5),
 ('numpy', 5),
 ('teammates', 5),
 ('appium', 5),
 ('cambridge', 5),
 ('games', 5),
 ('dna', 5),
 ('prove', 5),
 ('11', 5),
 ('attention', 5),
 ('produces', 5),
 ('youexcellent', 5),
 ('hospitals', 5),
 ('discussions', 5),
 ('8', 5),
 ('successfully', 5),
 ('direct', 5),
 ('promoting', 5),
 ('iterative', 5),
 ('algorithm', 5),
 ('specific', 5),
 ('assurance', 5),
 ('hyper', 5),
 ('resources', 5),
 ('visit', 5),
 ('decisions', 5),
 ('parking', 5),
 ('ticket', 5),
 ('advancement', 5),
 ('2014', 5),
 ('adaptable', 5),
 ('ajax', 5),
 ('deploy', 5),
 ('recent', 5),
 ('size', 5),
 ('versed', 5),
 ('goal', 5),
 ('follow', 5),
 ('consists', 5),
 ('patient', 5),
 ('fixing', 5),
 ('meals', 5),
 ('stakeholders', 5),
 ('enhance', 5),
 ('leverages', 5),
 ('ec2', 5),
 ('tremendous', 5),
 ('purchase', 5),
 ('massively', 5),
 ('metrics', 5),
 ('company2', 5),
 ('share', 5),
 ('commerce', 5),
 ('debugging', 5),
 ('organizations', 5),
 ('storm', 5),
 ('little', 5),
 ('researching', 5),
 ('installationswhat', 5),
 ('monthly', 5),
 ('gather', 5),
 ('short', 5),
 ('modern', 5),
 ('telecommute', 5),
 ('drive', 5),
 ('qualifications', 5),
 ('mono', 5),
 ('demonstrate', 5),
 ('mode', 5),
 ('eat', 5),
 ('maintenance', 5),
 ('demand', 5),
 ('interfaces', 5),
 ('virtual', 5),
 ('establish', 5),
 ('values', 5),
 ('individual', 5),
 ('accommodate', 5),
 ('goals', 5),
 ('engine', 5),
 ('maintainable', 5),
 ('requires', 5),
 ('embrace', 5),
 ('agriculture', 5),
 ('fault', 5),
 ('duties', 5),
 ('eager', 5),
 ('developmentso', 5),
 ('common', 5),
 ('guide', 5),
 ('airport', 5),
 ('folks', 5),
 ('thousands', 5),
 ('presence', 5),
 ('puppet', 5),
 ('turnoverwhat', 5),
 ('selection', 5),
 ('pathology', 5),
 ...]

In [3]:
len(words)


Out[3]:
65816

In [4]:
# Fancy way to count words that are wanted instead of filtering words that are unwanted
import re
import collections


wanted = ['automated', 'database', 'flask', 'postgresql', 'javascript', 'linux', 'unix', 'ansible',
          'puppet', 'selenium', 'mysql', 'restful', 'gui', 'artificial', 'intelligence', 'django', 'api', 'full', 'stack',
         'automation', 'sql', 'data', 'security', 'testing', 'test', 'git', 'json', 'cloud', 'devops', 'celery',
         'analytics', 'visualization', 'redis', 'mongodb', 'nosql', 'openstack', 'aws', 'salt', 'analysis', 'golang',
          'qa', 'distributed', 'chef',]
matches = re.findall('\w+', open('job_text.txt').read().lower())
counts = collections.Counter(matches)
my_wants = list(map(lambda x:(x,counts[x]),wanted)) 
sorted(my_wants, key=lambda wanted_words: wanted_words[1], reverse=True)


Out[4]:
[('data', 319),
 ('django', 212),
 ('full', 113),
 ('javascript', 106),
 ('stack', 98),
 ('sql', 89),
 ('cloud', 89),
 ('test', 86),
 ('linux', 84),
 ('testing', 73),
 ('aws', 65),
 ('api', 64),
 ('mysql', 62),
 ('analytics', 49),
 ('automation', 48),
 ('nosql', 45),
 ('database', 43),
 ('git', 42),
 ('security', 41),
 ('distributed', 41),
 ('postgresql', 40),
 ('mongodb', 35),
 ('redis', 34),
 ('unix', 30),
 ('restful', 30),
 ('flask', 29),
 ('analysis', 26),
 ('automated', 24),
 ('qa', 18),
 ('selenium', 16),
 ('intelligence', 16),
 ('chef', 13),
 ('celery', 12),
 ('json', 9),
 ('devops', 9),
 ('visualization', 7),
 ('ansible', 5),
 ('puppet', 5),
 ('artificial', 4),
 ('golang', 3),
 ('openstack', 2),
 ('gui', 1),
 ('salt', 0)]

In [13]:
# A much easier technique to count wanted words than the method posted above...
wanted = ['automated', 'database', 'flask', 'postgresql', 'javascript', 'linux', 'unix', 'ansible',
          'puppet', 'selenium', 'mysql', 'restful', 'gui', 'artificial', 'intelligence', 'django', 'api', 'full', 'stack',
         'automation', 'sql', 'data', 'security', 'testing', 'test', 'git', 'json', 'cloud', 'devops', 'celery',
         'analytics', 'visualization', 'redis', 'mongodb', 'nosql', 'openstack', 'aws', 'salt', 'analysis', 'golang',
          'qa', 'distributed', 'chef', 'docker']
cnt = collections.Counter()
words = re.findall('\w+', open('job_text.txt').read().lower())
for word in words:
    if word in wanted:
        cnt[word] += 1
print(cnt)


Counter({'data': 319, 'django': 212, 'full': 113, 'javascript': 106, 'stack': 98, 'cloud': 89, 'sql': 89, 'test': 86, 'linux': 84, 'testing': 73, 'aws': 65, 'api': 64, 'mysql': 62, 'analytics': 49, 'automation': 48, 'nosql': 45, 'database': 43, 'git': 42, 'distributed': 41, 'security': 41, 'postgresql': 40, 'mongodb': 35, 'redis': 34, 'restful': 30, 'unix': 30, 'flask': 29, 'analysis': 26, 'automated': 24, 'qa': 18, 'intelligence': 16, 'selenium': 16, 'chef': 13, 'celery': 12, 'docker': 9, 'devops': 9, 'json': 9, 'visualization': 7, 'ansible': 5, 'puppet': 5, 'artificial': 4, 'golang': 3, 'openstack': 2, 'gui': 1})

Devops and DBA

Here we are searching for devops and dba jobs to see what is relevant. Simply change the word in search terms in the get_jobs_posts() method to change the value


In [1]:
import os 
import requests
from bs4 import BeautifulSoup

def get_job_posts():
    '''This method goes to cyber coders and returns a list of the jobs as listed on the main
    page.'''
    job_posts = []
    for x in range(8):
        res = requests.get('http://www.cybercoders.com/search/?page='+ str(x) +'&searchterms=devops&searchlocation=&newsearch=true&sorttype=')
        soup = BeautifulSoup(res.text, 'lxml')
        posts = soup.select('div.job-title')
        job_posts.append(posts)
    print("Total pages scraped: %s \n" %len(job_posts))
    return job_posts

def get_job_pages(posts):
    '''This method dives into the listings and pulls the href... then it appends the html for 
    processing in the write_job_text method '''
    job_pages = []
    for items in posts:
        for links in items:
            #print(links.select('a'))
            for link in links.select('a'):
                ipage = requests.get('http://www.cybercoders.com/%s' %link.get('href'))
                job_pages.append(ipage)
    print("Total pages collected: %s \n" %(len(job_pages)))
    return job_pages


job_file = open('job_text_dba.txt', 'w')

def write_job_text(pages):
    '''This method simply pastes the text of the div job details to a text file... '''
    job_counter = 0
    for page in pages:
        soup1 = BeautifulSoup(page.text, 'lxml')
        job_details = soup1.select('div.job-details')
        for job_desc in job_details:
            #print(type(job_desc))
            #print(job_desc.text)
            job_counter += 1
            job_file.write(str(job_desc.text))
    print("Total job postings written to file: %s \n" %job_counter)
    job_file.close()

j = get_job_posts()
k = get_job_pages(j)
write_job_text(k)


Total pages scraped: 8 

Total pages collected: 160 

Total job postings written to file: 160 


In [2]:
import re
import collections


# Ugly, but effective method to filter common words from a corpus of text.
stopwords = ['a', 'about', 'above', 'across', 'after', 'afterwards']
stopwords += ['again', 'against', 'all', 'almost', 'alone', 'along']
stopwords += ['already', 'also', 'although', 'always', 'am', 'among']
stopwords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']
stopwords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']
stopwords += ['are', 'around', 'as', 'at', 'back', 'be', 'became']
stopwords += ['because', 'become', 'becomes', 'becoming', 'been']
stopwords += ['before', 'beforehand', 'behind', 'being', 'below']
stopwords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']
stopwords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']
stopwords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']
stopwords += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due']
stopwords += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else']
stopwords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']
stopwords += ['every', 'everyone', 'everything', 'everywhere', 'except']
stopwords += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first']
stopwords += ['five', 'for', 'former', 'formerly', 'forty', 'found']
stopwords += ['four', 'from', 'front', 'full', 'further', 'get', 'give']
stopwords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']
stopwords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']
stopwords += ['herself', 'him', 'himself', 'his', 'how', 'however']
stopwords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']
stopwords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']
stopwords += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made']
stopwords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']
stopwords += ['more', 'moreover', 'most', 'mostly', 'move', 'much']
stopwords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']
stopwords += ['nevertheless', 'next', 'nine', 'no', 'nobody', 'none']
stopwords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']
stopwords += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or']
stopwords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']
stopwords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']
stopwords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']
stopwords += ['seeming', 'seems', 'serious', 'several', 'she', 'should']
stopwords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']
stopwords += ['some', 'somehow', 'someone', 'something', 'sometime']
stopwords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']
stopwords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']
stopwords += ['then', 'thence', 'there', 'thereafter', 'thereby', 'today']
stopwords += ['therefore', 'therein', 'thereupon', 'these', 'they']
stopwords += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three']
stopwords += ['three', 'through', 'throughout', 'thru', 'thus', 'to']
stopwords += ['together', 'too', 'top', 'toward', 'towards', 'twelve']
stopwords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']
stopwords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what', 'want']
stopwords += ['whatever', 'when', 'whence', 'whenever', 'where']
stopwords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']
stopwords += ['wherever', 'whether', 'which', 'while', 'whither', 'who']
stopwords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']
stopwords += ['within', 'without','work', 'would', 'yet', 'you', 'your']
stopwords += ['yours', 'yourself', 'yourselves']
stopwords += ['experience', 'job', 'details', 'u', 'posted', 'need', 'based', 'years', 'new', 'world', 'knowledge', 'join', 
             'reasons', 'salary', 'position', 'skills', 'exciting', 'market', 'small', 'excellent', 'great', 'year', 
             'compensation', 'love', 'provide', 'usl', 'highly', 'e', 'day', 'growth', 'way', 'people', 'facing', 
             'big', 'll', 'g', 'amazing', 'passion', '14', 'office', 'vacation', 'located', '5', 'medical', 'pto', 'offer',
             'used', 'exposure', 'captial', 'ca', 'help', 'funded', 'ideally', 'country', 'preferred', 'immediately', 'search',
             'active', 'improve', '100', '150k', 'dental', 'innovation', 'desire', 'clean', '401k', 'life', 'd', 'paid', 'non',
             'founded', 'changing', 'helps', '24', 'lunches', 'talent', 'familiarity', 'generous', '120', 'good', 'early',
             'pay', 'l', 'matching', 'advice', 'directly', 'ma', 'rapidly', 'don', 'money', 'experienced', '3', 'strong',
             'read', 'company', 'positionat', 'doing', '2015', '2', 'opportunity', '1', 'currently', 'social', 'oriented'
             'like', 'self', 'vision', 'clients', 'understanding', 'impact', 'using', 't', 'passionate', 'fit', 'related',
             'employees', 'health', 'make', 'hands', 'real', 'just', 'investment', 'comprehensive', 'doe', 'continue', 'extremely',
             'com', 'vc', 'flagship', 'problems', 'effective', 'afficient', '401', 'schedule', 'taking', 'methods', 'commercial', 
             'success', 'k', 'solid', 'quickly', 'outings', 'successful', 'create', 'provided', 'culture', '4', 'fun', 'bs',
             'awesome', 'users', 'talented', 'doingyou', 'b', 'free', 'grow', 'boston', 'san', 'francisco', 'retreats', 'offers',
             'deliver', 'holiday', 'holidays', 'lots', '104k', 'frisco', 'ambitious', 'minimize', 'bring', 'sprit', 'bright',
             'dinners', 'm', 'catered', 'uses', 'perks', 'ms', 'happy', 'know', 'opportunites', 'chat', 'days', 'revolutionizing',
             'crowd', 'tx', 'enable', 'seeking', 'seek', 'interviewing', 'soon', 'sized', 'award', 'lopez', 'needed', 'preferably',
             'doingas', 'making', 'r', '2009', 'starting', 'pa', 'apply', 'authorized', 'looking', 'end', 'benefits', '12', 'growing',
             'hire', 'ideal', 'proven', 'aspects', 'deadlines', 'promise', 'fast', 'live', 'bonus', 'hours', 'best', 'plus', 'user',
             'high', 'ability', 'level', 'cutting', 'edge', 'oriented', 'like', 'us1', 'large', 'equity', 'creative', 'options',
             'casual', 'future', 'candidate', '10', 'cybercoders', '0', 'place', 'area', 'group', 'youcompetitive', 'occasional',
             'fully', 'use', 'excited', 'bachelor', 'tv', 'better', 'employee','applicants', 'team', 'time', 'startup',
             'customer', 'advanced', 'stock', 'management', '2016']


words = re.sub(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', open('job_text_dba.txt').read())
words = re.findall('\w+', open('job_text_dba.txt').read().lower())
#print(type(collections.Counter(words)))
x = collections.Counter(words)
for sword in stopwords:
    del x[sword]
x.most_common()


Out[2]:
[('devops', 501),
 ('engineer', 344),
 ('systems', 198),
 ('linux', 195),
 ('cloud', 156),
 ('environment', 152),
 ('tools', 150),
 ('development', 137),
 ('software', 133),
 ('aws', 128),
 ('infrastructure', 122),
 ('deployment', 111),
 ('python', 107),
 ('automation', 104),
 ('working', 104),
 ('chef', 102),
 ('support', 100),
 ('services', 98),
 ('monitoring', 95),
 ('build', 91),
 ('puppet', 89),
 ('competitive', 87),
 ('technology', 87),
 ('technologies', 86),
 ('continuous', 83),
 ('web', 81),
 ('production', 78),
 ('applications', 77),
 ('scripting', 76),
 ('including', 75),
 ('data', 74),
 ('application', 74),
 ('configuration', 74),
 ('administration', 73),
 ('maintain', 73),
 ('performance', 70),
 ('environments', 69),
 ('operations', 69),
 ('product', 68),
 ('engineering', 68),
 ('solutions', 65),
 ('products', 62),
 ('ruby', 62),
 ('building', 61),
 ('issues', 61),
 ('security', 61),
 ('server', 60),
 ('02', 55),
 ('base', 54),
 ('technical', 54),
 ('engineers', 54),
 ('industry', 52),
 ('integration', 51),
 ('platform', 51),
 ('java', 50),
 ('scale', 49),
 ('availability', 48),
 ('teams', 47),
 ('career', 47),
 ('servers', 45),
 ('flexible', 44),
 ('design', 42),
 ('windows', 41),
 ('jenkins', 41),
 ('role', 41),
 ('develop', 40),
 ('manage', 40),
 ('source', 39),
 ('git', 38),
 ('mobile', 38),
 ('start', 38),
 ('perl', 37),
 ('amazon', 37),
 ('open', 36),
 ('bash', 36),
 ('implement', 36),
 ('ansible', 35),
 ('mysql', 34),
 ('networking', 33),
 ('processes', 33),
 ('developing', 33),
 ('delivery', 33),
 ('ensure', 32),
 ('multiple', 32),
 ('troubleshooting', 32),
 ('business', 32),
 ('01', 32),
 ('code', 32),
 ('companies', 32),
 ('unix', 31),
 ('professional', 30),
 ('load', 29),
 ('practices', 29),
 ('maintenance', 29),
 ('troubleshoot', 29),
 ('senior', 29),
 ('room', 28),
 ('customers', 28),
 ('degree', 28),
 ('network', 28),
 ('ec2', 28),
 ('service', 27),
 ('equivalent', 27),
 ('automated', 27),
 ('process', 27),
 ('04', 26),
 ('languages', 26),
 ('docker', 26),
 ('test', 26),
 ('maintaining', 26),
 ('science', 26),
 ('platforms', 25),
 ('c', 25),
 ('agile', 25),
 ('unlimited', 25),
 ('operational', 25),
 ('automate', 25),
 ('shell', 25),
 ('database', 24),
 ('release', 24),
 ('projects', 24),
 ('scripts', 24),
 ('package', 23),
 ('resume', 23),
 ('tech', 23),
 ('background', 23),
 ('manager', 23),
 ('nagios', 23),
 ('sql', 23),
 ('s3', 22),
 ('stack', 22),
 ('control', 22),
 ('enterprise', 22),
 ('able', 22),
 ('balance', 22),
 ('requirements', 22),
 ('managing', 22),
 ('collaborative', 21),
 ('lead', 21),
 ('deployments', 21),
 ('operating', 21),
 ('saas', 21),
 ('marketing', 21),
 ('hardware', 21),
 ('designing', 21),
 ('responsible', 21),
 ('relevant', 21),
 ('deploying', 21),
 ('global', 20),
 ('innovative', 20),
 ('internal', 20),
 ('reliability', 20),
 ('distributed', 20),
 ('implementation', 20),
 ('supporting', 20),
 ('tuning', 20),
 ('send', 20),
 ('powershell', 20),
 ('cool', 19),
 ('closely', 19),
 ('microsoft', 19),
 ('various', 19),
 ('databases', 19),
 ('developers', 19),
 ('communication', 19),
 ('resolve', 19),
 ('hard', 19),
 ('tasks', 18),
 ('understand', 18),
 ('interested', 18),
 ('salt', 18),
 ('learning', 18),
 ('plan', 18),
 ('scaling', 18),
 ('class', 18),
 ('balancing', 18),
 ('leadership', 18),
 ('mongodb', 18),
 ('provisioning', 18),
 ('problem', 18),
 ('smart', 18),
 ('automating', 17),
 ('storage', 17),
 ('paced', 17),
 ('ready', 17),
 ('encouraged', 17),
 ('capacity', 17),
 ('dynamic', 17),
 ('right', 17),
 ('media', 17),
 ('video', 17),
 ('critical', 17),
 ('potential', 17),
 ('postgresql', 17),
 ('digital', 16),
 ('redis', 16),
 ('fastest', 16),
 ('leading', 16),
 ('testing', 16),
 ('sounds', 16),
 ('interesting', 16),
 ('extensive', 16),
 ('established', 16),
 ('hosting', 16),
 ('tons', 16),
 ('apache', 16),
 ('vmware', 16),
 ('apps', 15),
 ('title', 15),
 ('space', 15),
 ('define', 15),
 ('include', 15),
 ('installation', 15),
 ('ops', 15),
 ('asap', 15),
 ('architect', 15),
 ('configuring', 15),
 ('000', 15),
 ('bonuses', 15),
 ('key', 14),
 ('financial', 14),
 ('7', 14),
 ('deploy', 14),
 ('architecture', 14),
 ('net', 14),
 ('rapid', 14),
 ('available', 14),
 ('50', 14),
 ('initiatives', 14),
 ('required', 14),
 ('developer', 14),
 ('nginx', 14),
 ('million', 14),
 ('monitor', 14),
 ('document', 14),
 ('scalable', 14),
 ('commerce', 14),
 ('quality', 14),
 ('online', 14),
 ('opportunities', 14),
 ('environmentwhat', 14),
 ('creating', 14),
 ('azure', 14),
 ('writing', 14),
 ('driven', 14),
 ('collaborate', 14),
 ('play', 14),
 ('huge', 14),
 ('efficiency', 13),
 ('backed', 13),
 ('public', 13),
 ('current', 13),
 ('assist', 13),
 ('unique', 13),
 ('organization', 13),
 ('willing', 13),
 ('hadoop', 13),
 ('splunk', 13),
 ('custom', 13),
 ('contribute', 13),
 ('expertise', 13),
 ('builds', 13),
 ('atmosphere', 13),
 ('php', 13),
 ('identify', 13),
 ('access', 13),
 ('ip', 13),
 ('variety', 13),
 ('dev', 13),
 ('administrator', 13),
 ('consumer', 13),
 ('offices', 13),
 ('set', 13),
 ('tcp', 13),
 ('kafka', 13),
 ('firm', 13),
 ('written', 12),
 ('clustering', 12),
 ('scheduling', 12),
 ('centos', 12),
 ('administering', 12),
 ('8', 12),
 ('focus', 12),
 ('solution', 12),
 ('latest', 12),
 ('nice', 12),
 ('daily', 12),
 ('change', 12),
 ('leader', 12),
 ('nosql', 12),
 ('education', 12),
 ('enjoy', 12),
 ('solve', 12),
 ('expert', 12),
 ('solving', 12),
 ('recovery', 12),
 ('challenging', 12),
 ('qa', 12),
 ('project', 12),
 ('firewalls', 12),
 ('cassandra', 12),
 ('home', 12),
 ('client', 11),
 ('offering', 11),
 ('programming', 11),
 ('goals', 11),
 ('ci', 11),
 ('ground', 11),
 ('running', 11),
 ('tomcat', 11),
 ('github', 11),
 ('stable', 11),
 ('location', 11),
 ('implementing', 11),
 ('tier', 11),
 ('beautiful', 11),
 ('joining', 11),
 ('dns', 11),
 ('lunch', 11),
 ('following', 11),
 ('analytics', 11),
 ('learn', 11),
 ('participate', 11),
 ('come', 11),
 ('procedures', 11),
 ('ubuntu', 11),
 ('ad', 11),
 ('streamline', 11),
 ('machine', 11),
 ('methodologies', 11),
 ('sales', 11),
 ('version', 11),
 ('things', 10),
 ('city', 10),
 ('virtualization', 10),
 ('vendors', 10),
 ('driving', 10),
 ('fixing', 10),
 ('capabilities', 10),
 ('complex', 10),
 ('pre', 10),
 ('robust', 10),
 ('experiencewhat', 10),
 ('iis', 10),
 ('notch', 10),
 ('value', 10),
 ('easy', 10),
 ('packages', 10),
 ('tooling', 10),
 ('youfor', 10),
 ('configure', 10),
 ('brightest', 10),
 ('step', 10),
 ('relocation', 10),
 ('ipo', 10),
 ('groups', 10),
 ('pipeline', 10),
 ('app', 10),
 ('places', 10),
 ('virtual', 10),
 ('http', 10),
 ('overall', 10),
 ('information', 10),
 ('multi', 10),
 ('analytical', 10),
 ('planning', 10),
 ('plans', 10),
 ('different', 10),
 ('care', 10),
 ('advertising', 10),
 ('deep', 9),
 ('100k', 9),
 ('complete', 9),
 ('focused', 9),
 ('principles', 9),
 ('major', 9),
 ('setting', 9),
 ('content', 9),
 ('comfortable', 9),
 ('person', 9),
 ('managed', 9),
 ('necessary', 9),
 ('program', 9),
 ('consumers', 9),
 ('members', 9),
 ('operate', 9),
 ('reporting', 9),
 ('jira', 9),
 ('language', 9),
 ('tool', 9),
 ('elastic', 9),
 ('2012', 9),
 ('rotation', 9),
 ('improving', 9),
 ('investigating', 9),
 ('site', 9),
 ('stability', 9),
 ('improvements', 9),
 ('performing', 9),
 ('postgres', 9),
 ('urgent', 9),
 ('html', 9),
 ('ensuring', 9),
 ('functional', 9),
 ('built', 9),
 ('documentation', 9),
 ('newest', 9),
 ('mac', 9),
 ('operation', 9),
 ('installing', 9),
 ('brand', 9),
 ('youwe', 8),
 ('includes', 8),
 ('meet', 8),
 ('limited', 8),
 ('venture', 8),
 ('needs', 8),
 ('protocols', 8),
 ('bamboo', 8),
 ('email', 8),
 ('svn', 8),
 ('game', 8),
 ('uswe', 8),
 ('basis', 8),
 ('coordinate', 8),
 ('going', 8),
 ('analyze', 8),
 ('javascript', 8),
 ('responsibilities', 8),
 ('possible', 8),
 ('affiliate', 8),
 ('setup', 8),
 ('continues', 8),
 ('mission', 8),
 ('standard', 8),
 ('heart', 8),
 ('policy', 8),
 ('rackspace', 8),
 ('presence', 8),
 ('field', 8),
 ('immediate', 8),
 ('training', 8),
 ('sharing', 8),
 ('term', 8),
 ('features', 8),
 ('satisfaction', 8),
 ('15', 8),
 ('sites', 8),
 ('computing', 8),
 ('firms', 8),
 ('metrics', 8),
 ('brands', 8),
 ('additional', 8),
 ('install', 8),
 ('positionrequirements', 8),
 ('significant', 8),
 ('aggressive', 8),
 ('center', 8),
 ('rabbitmq', 8),
 ('ownership', 8),
 ('effectively', 8),
 ('vpc', 8),
 ('areas', 8),
 ('heavily', 8),
 ('headquartered', 8),
 ('graphite', 8),
 ('worldwide', 8),
 ('interface', 8),
 ('word', 8),
 ('concepts', 8),
 ('really', 8),
 ('exceptional', 8),
 ('cross', 7),
 ('secure', 7),
 ('upside', 7),
 ('30', 7),
 ('rds', 7),
 ('mode', 7),
 ('engage', 7),
 ('funding', 7),
 ('switches', 7),
 ('google', 7),
 ('requires', 7),
 ('excel', 7),
 ('profitable', 7),
 ('look', 7),
 ('generation', 7),
 ('state', 7),
 ('simply', 7),
 ('insurance', 7),
 ('weekly', 7),
 ('framework', 7),
 ('zabbix', 7),
 ('failover', 7),
 ('21', 7),
 ('promote', 7),
 ('launch', 7),
 ('telecommute', 7),
 ('140k', 7),
 ('personal', 7),
 ('measure', 7),
 ('mentor', 7),
 ('architectures', 7),
 ('alex', 7),
 ('resources', 7),
 ('disaster', 7),
 ('mraz', 7),
 ('mind', 7),
 ('positionmore', 7),
 ('elasticsearch', 7),
 ('certification', 7),
 ('challenges', 7),
 ('centers', 7),
 ('helping', 7),
 ('advertisement', 7),
 ('party', 7),
 ('motivated', 7),
 ('24x7', 7),
 ('dedicated', 7),
 ('businesses', 7),
 ('addition', 7),
 ('thousands', 7),
 ('cd', 7),
 ('assistance', 7),
 ('responsibility', 7),
 ('timely', 7),
 ('devices', 7),
 ('scalability', 7),
 ('volume', 7),
 ('cycle', 7),
 ('evangelist', 7),
 ('invest', 7),
 ('logstash', 7),
 ('run', 7),
 ('remote', 7),
 ('coding', 7),
 ('candidates', 7),
 ('entrepreneurial', 7),
 ('chain', 7),
 ('minds', 7),
 ('transportation', 7),
 ('uptime', 7),
 ('cacti', 7),
 ('elk', 7),
 ('doingwe', 7),
 ('orchestration', 7),
 ('known', 7),
 ('intelligence', 7),
 ('supported', 6),
 ('fault', 6),
 ('changes', 6),
 ('managers', 6),
 ('provides', 6),
 ('ups', 6),
 ('policies', 6),
 ('away', 6),
 ('meals', 6),
 ('activities', 6),
 ('foundation', 6),
 ('billions', 6),
 ('review', 6),
 ('cause', 6),
 ('industries', 6),
 ('backups', 6),
 ('cisco', 6),
 ('configurations', 6),
 ('santa', 6),
 ('engagement', 6),
 ('cost', 6),
 ('tests', 6),
 ('winning', 6),
 ('ve', 6),
 ('lifecycle', 6),
 ('container', 6),
 ('getting', 6),
 ('gain', 6),
 ('providers', 6),
 ('entire', 6),
 ('log', 6),
 ('chance', 6),
 ('stocked', 6),
 ('bay', 6),
 ('cloudformation', 6),
 ('stage', 6),
 ('family', 6),
 ('6', 6),
 ('efficient', 6),
 ('maven', 6),
 ('list', 6),
 ('relational', 6),
 ('speed', 6),
 ('travel', 6),
 ('taskswhat', 6),
 ('infrastructures', 6),
 ('order', 6),
 ('demand', 6),
 ('120k', 6),
 ('rewarded', 6),
 ('kind', 6),
 ('improvement', 6),
 ('saltstack', 6),
 ('professionals', 6),
 ('virtualized', 6),
 ('studio', 6),
 ('reading', 6),
 ('backup', 6),
 ('balancers', 6),
 ('android', 6),
 ('long', 6),
 ('common', 6),
 ('redhat', 6),
 ('track', 6),
 ('fantastic', 6),
 ('scheduling3', 6),
 ('educational', 6),
 ('line', 6),
 ('oracle', 6),
 ('js', 6),
 ('upward', 6),
 ('consul', 6),
 ('ny', 6),
 ('proficiency', 6),
 ('analysis', 6),
 ('175k', 6),
 ('visibility', 6),
 ('approach', 6),
 ('direct', 6),
 ('proactively', 6),
 ('api', 6),
 ('experiencing', 6),
 ('adtech', 6),
 ('coming', 6),
 ('york', 6),
 ('160k', 6),
 ('asp', 6),
 ('connections', 5),
 ('research', 5),
 ('connect', 5),
 ('enhance', 5),
 ('add', 5),
 ('brilliant', 5),
 ('collection', 5),
 ('shop', 5),
 ('proficient', 5),
 ('sense', 5),
 ('powering', 5),
 ('hosted', 5),
 ('spirit', 5),
 ('evolve', 5),
 ('week', 5),
 ('match', 5),
 ('tune', 5),
 ('revenue', 5),
 ('times', 5),
 ('truly', 5),
 ('atlassian', 5),
 ('seriously', 5),
 ('modern', 5),
 ('power', 5),
 ('infrastructurewhat', 5),
 ('increase', 5),
 ('heavy', 5),
 ('execute', 5),
 ('local', 5),
 ('tens', 5),
 ('roles', 5),
 ('biggest', 5),
 ('utilize', 5),
 ('dallas', 5),
 ('laid', 5),
 ('important', 5),
 ('skillswhat', 5),
 ('primary', 5),
 ('hour', 5),
 ('logs', 5),
 ('goal', 5),
 ('covered', 5),
 ('executing', 5),
 ('engine', 5),
 ('floor', 5),
 ('excellence', 5),
 ('positionexperience', 5),
 ('relic', 5),
 ('member', 5),
 ('similar', 5),
 ('developmentwhat', 5),
 ('takes', 5),
 ('old', 5),
 ('predictive', 5),
 ('hats', 5),
 ('total', 5),
 ('lives', 5),
 ('past', 5),
 ('prior', 5),
 ('script', 5),
 ('instrumentation', 5),
 ('aid', 5),
 ('ios', 5),
 ('perform', 5),
 ('terraform', 5),
 ('caching', 5),
 ('mobility', 5),
 ('openstack', 5),
 ('upgrades', 5),
 ('cfengine', 5),
 ('stores', 5),
 ('monica', 5),
 ('deeper', 5),
 ('80', 5),
 ('believe', 5),
 ('internet', 5),
 ('standards', 5),
 ('event', 5),
 ('130k', 5),
 ('moving', 5),
 ('report', 5),
 ('expand', 5),
 ('05', 5),
 ('18', 5),
 ('route', 5),
 ('ptoso', 5),
 ('auto', 5),
 ('verbal', 5),
 ('operationsdevops', 5),
 ('jose', 5),
 ('articulate', 5),
 ('specialize', 5),
 ('depending', 5),
 ('routing', 5),
 ('requests', 5),
 ('controls', 5),
 ('ago', 5),
 ('kitchen', 5),
 ('foster', 5),
 ('doingthe', 5),
 ('agencies', 5),
 ('gaming', 5),
 ('evaluate', 5),
 ('events', 5),
 ('components', 5),
 ('single', 5),
 ('efforts', 5),
 ('individuals', 5),
 ('private', 5),
 ('drive', 5),
 ('advantage', 5),
 ('admin', 5),
 ('energy', 5),
 ('awswhat', 5),
 ('bleeding', 5),
 ('traditional', 5),
 ('allow', 5),
 ('summary', 5),
 ('speak', 5),
 ('fortune', 5),
 ('connected', 5),
 ('involves', 5),
 ('led', 5),
 ('stop', 5),
 ('hybrid', 5),
 ('food', 5),
 ('clusters', 5),
 ('skill', 5),
 ('tfs', 5),
 ('startups', 5),
 ('lauren', 5),
 ('partnerships', 5),
 ('achieve', 5),
 ('prioritize', 5),
 ('director', 5),
 ('hiring', 5),
 ('designs', 5),
 ('external', 5),
 ('human', 5),
 ('saving', 5),
 ('reduce', 5),
 ('incredible', 5),
 ('assists', 5),
 ('nfs', 5),
 ('execution', 4),
 ('earn', 4),
 ('format', 4),
 ('los', 4),
 ('optimize', 4),
 ('downright', 4),
 ('collaboration', 4),
 ('scala', 4),
 ('hit', 4),
 ('practiceswhat', 4),
 ('incident', 4),
 ('ecommerce', 4),
 ('patching', 4),
 ('collectd', 4),
 ('alongside', 4),
 ('relaxed', 4),
 ('correct', 4),
 ('serve', 4),
 ('salaries', 4),
 ('templates', 4),
 ('bridge', 4),
 ('exhibits', 4),
 ('meaningful', 4),
 ('specializes', 4),
 ('scales', 4),
 ('collaborating', 4),
 ('affordable', 4),
 ('advancement', 4),
 ('f5', 4),
 ('risk', 4),
 ('existing', 4),
 ('involved', 4),
 ('healthcare', 4),
 ('added', 4),
 ('transforming', 4),
 ('500', 4),
 ('downloads', 4),
 ('funds', 4),
 ('jeans', 4),
 ('blogs', 4),
 ('corporate', 4),
 ('recent', 4),
 ('rdbms', 4),
 ('vsphere', 4),
 ('snacks', 4),
 ('experiences', 4),
 ('individually', 4),
 ('faults', 4),
 ('gap', 4),
 ('block', 4),
 ('locations', 4),
 ('familiar', 4),
 ('turn', 4),
 ('strategic', 4),
 ('django', 4),
 ('hat', 4),
 ('minimum', 4),
 ('fine', 4),
 ('individual', 4),
 ('push', 4),
 ('close', 4),
 ('switching', 4),
 ('face', 4),
 ('streaming', 4),
 ('03', 4),
 ('integrating', 4),
 ('specifically', 4),
 ('nyc', 4),
 ('plenty', 4),
 ('cloudera', 4),
 ('icinga', 4),
 ('wifi', 4),
 ('points', 4),
 ('ebooks', 4),
 ('incentives', 4),
 ('benefit', 4),
 ('inspiring', 4),
 ('ssl', 4),
 ('snippets', 4),
 ('manual', 4),
 ('formation', 4),
 ('reimbursement', 4),
 ('pipelines', 4),
 ('visual', 4),
 ('matter', 4),
 ('linus', 4),
 ('prestigious', 4),
 ('address', 4),
 ('starts', 4),
 ('conglomerate', 4),
 ('lines', 4),
 ('cs', 4),
 ('series', 4),
 ('css', 4),
 ('53', 4),
 ('promising', 4),
 ('faster', 4),
 ('cdn', 4),
 ('house', 4),
 ('you1', 4),
 ('recently', 4),
 ('enjoyable', 4),
 ('point', 4),
 ('expected', 4),
 ('progress', 4),
 ('personnel', 4),
 ('profit', 4),
 ('submit', 4),
 ('playful', 4),
 ('frameworks', 4),
 ('offshore', 4),
 ('laptop', 4),
 ('automationwhat', 4),
 ('display', 4),
 ('entertainment', 4),
 ('portland', 4),
 ('specialists', 4),
 ('object', 4),
 ('associated', 4),
 ('whitepapers', 4),
 ('worked', 4),
 ('downtown', 4),
 ('issueswhat', 4),
 ('storm', 4),
 ('accelerating', 4),
 ('coverage', 4),
 ('loves', 4),
 ('minded', 4),
 ('20', 4),
 ('departments', 4),
 ('partial', 4),
 ('workflow', 4),
 ('premise', 4),
 ('administer', 4),
 ('surrounding', 4),
 ('la', 4),
 ('vagrant', 4),
 ('greatest', 4),
 ('constantly', 4),
 ('sign', 4),
 ('cubical', 4),
 ('tell', 4),
 ('ssh', 4),
 ('onshore', 4),
 ('primed', 4),
 ('wear', 4),
 ('website', 4),
 ('decor', 4),
 ('articles', 4),
 ('positionrequired', 4),
 ('repositories', 4),
 ('opening', 4),
 ('teamwork', 4),
 ('detection', 4),
 ('scalabilitywhat', 4),
 ('solvers', 4),
 ('coworkers', 4),
 ('possess', 4),
 ('fabric', 4),
 ('numerous', 4),
 ('expanding', 4),
 ('arm', 4),
 ('uscompetitive', 4),
 ('ebs', 4),
 ('ranges', 4),
 ('space3', 4),
 ('jquery', 4),
 ('fashion', 4),
 ('enhancing', 4),
 ('jboss', 4),
 ('gym', 4),
 ('smtp', 4),
 ('administrationwhat', 4),
 ('integrate', 4),
 ('rotating', 4),
 ('mid', 4),
 ('international', 4),
 ('angeles', 4),
 ('editor', 4),
 ('analyzing', 4),
 ('affiliates', 4),
 ('entrepreneurs', 4),
 ('fueled', 4),
 ('architects', 4),
 ('red', 4),
 ('subject', 4),
 ('talents', 4),
 ('ventures', 4),
 ('assigned', 4),
 ('having', 4),
 ('90k', 4),
 ('communicate', 4),
 ('jvm', 4),
 ('collect', 4),
 ('millions', 4),
 ('campaigns', 4),
 ('couchbase', 4),
 ('rails', 4),
 ('feeds', 4),
 ('iam', 4),
 ('evolution', 4),
 ('discovery', 4),
 ('300', 3),
 ('automatic', 3),
 ('attract', 3),
 ('attitude', 3),
 ('provision', 3),
 ('thriving', 3),
 ('dnetal', 3),
 ('ice', 3),
 ('kpis', 3),
 ('tolerant', 3),
 ('sf', 3),
 ('dhcp', 3),
 ('talk', 3),
 ('groovy', 3),
 ('action', 3),
 ('resilient', 3),
 ('curious', 3),
 ('preferredso', 3),
 ('allowing', 3),
 ...]

In [4]:
# A much easier technique to count wanted words than the method posted above...
wanted = ['automated', 'database', 'flask', 'postgresql', 'javascript', 'linux', 'unix', 'ansible',
          'puppet','rhel', 'coreos', 'aws', 'boto', 'selenium', 'mysql', 'restful', 'gui', 'artificial', 
          'intelligence', 'django', 'api', 'full', 'stack', 'dba', 'certification', 'oracle', '11g', 'MSSQL'
         'automation', 'sql', 'data', 'security', 'testing', 'test', 'git', 'json', 'cloud', 'devops', 'celery',
         'analytics', 'visualization', 'redis', 'mongodb', 'nosql', 'openstack', 'aws', 'salt', 'analysis', 'golang',
          'qa', 'distributed', 'chef', 'docker', 'hadoop', 'sqlite', 'monitoring', 'micro', 'python', 'ruby', 'ibm',
         'jenkins', 'travis', 'git', 'ec2', 's3', 'amazon', 'azure' , 'openstack', 'proxmox', 'vm']
cnt = collections.Counter()
words = re.findall('\w+', open('job_text_dba.txt').read().lower())
for word in words:
    if word in wanted:
        cnt[word] += 1
print(cnt)


Counter({'devops': 501, 'linux': 195, 'cloud': 156, 'aws': 128, 'python': 107, 'chef': 102, 'monitoring': 95, 'puppet': 89, 'data': 74, 'ruby': 62, 'security': 61, 'full': 49, 'jenkins': 41, 'git': 38, 'amazon': 37, 'ansible': 35, 'mysql': 34, 'unix': 31, 'ec2': 28, 'automated': 27, 'test': 26, 'docker': 26, 'database': 24, 'sql': 23, 'stack': 22, 's3': 22, 'distributed': 20, 'mongodb': 18, 'salt': 18, 'postgresql': 17, 'testing': 16, 'redis': 16, 'azure': 14, 'hadoop': 13, 'nosql': 12, 'qa': 12, 'analytics': 11, 'javascript': 8, 'certification': 7, 'intelligence': 7, 'oracle': 6, 'api': 6, 'analysis': 6, 'openstack': 5, 'django': 4, 'golang': 3, 'travis': 3, 'json': 3, 'vm': 2, 'boto': 2, 'flask': 2, 'dba': 2, 'rhel': 2, 'restful': 1, 'celery': 1})

In [ ]: