In [1]:
%pylab inline
from astropy.table import Table, Column
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib as mpl
import matplotlib.pyplot as plt
In [2]:
# Read the itemized announcements into a list, one entry per line of the file
# (each entry keeps its trailing newline). The context manager guarantees the
# file is closed even if reading raises.
with open('data/ItemizedAnnouncements.txt', 'r') as announceFile:
    announce = list(announceFile)

# The master list of jobs, kept separate from the announcement text.
# The file is semicolon-delimited.
dat = pd.read_csv('data/AllAASjobReg_abbreviated.dat', sep=';')
Does the job ad include these stems?
Ideally, we'd also like to find out:
Let's attack the first question since it is relatively easy.
In [3]:
# Stems / keywords to look for in each announcement. Matching is a
# case-insensitive substring search, so a stem such as 'theor' also hits
# longer words that contain it.
# NOTE(review): because the match is a lower-cased substring search, short
# acronyms (e.g. 'HST', 'ALMA', 'VLT') can also match inside unrelated words --
# confirm that this is acceptable for the analysis.
searchWords = ['instrumenta', 'theor', 'observation', 'data-intensive', 'computational',
               'galax', 'exoplanet', 'star formation', 'SDSS', 'GRB', 'hardware', 'software', 'data science',
               'nano', 'optics', 'x-ray', 'brown dwarf', 'LSST', 'HST', 'statistics', 'GAIA',
               'LBT', 'adaptive optics', 'Kepler', 'Keck', 'ALMA', 'VLT', 'Spitzer', 'simulations',
               'interdisciplinary', 'MHD', 'high performance computing', 'planetary', 'quantum', 'infrared']

# Lower-case every announcement once, instead of once per search word.
lowerAdverts = [advert.lower() for advert in announce]

# totHits[i] is the number of announcements containing searchWords[i].
totHits = []
for thisWord in searchWords:
    needle = thisWord.lower()
    # Non-overlapping occurrence count of the word in each announcement.
    occurences = np.asarray([advert.count(needle) for advert in lowerAdverts])
    # Store the per-announcement counts as a new column named after the word.
    # This assignment needs `announce` and `dat` to have the same length.
    dat[thisWord] = occurences
    hits = (occurences > 0)
    nHits = int(hits.sum())
    totHits.append(nHits)
    # %-formatting prints "<count> <word>" identically on Python 2 and 3.
    print("%d %s" % (nHits, thisWord))
In [4]:
# Tabulate the total number of matching announcements for each search word.
dct = dict(word=searchWords, counts=totHits)
tdf = pd.DataFrame(dct)

# Search words ordered by ascending hit count. The (count, word) tuples are
# compared element by element, so ties are broken by the word itself.
countWordPairs = sorted(zip(totHits, searchWords))
sortedvi = [pair[1] for pair in countWordPairs]
In [5]:
# Apply seaborn's default styling, then select the "paper" plotting context
# with a larger font scale and a thicker default line width.
sns.set()
contextOverrides = {"lines.linewidth": 2.5}
sns.set_context(context="paper", font_scale=1.5, rc=contextOverrides)
In [18]:
# Plot the number of matching announcements per search word, with the words
# ordered by ascending count (sortedvi).
# NOTE(review): `factorplot` and its `x_order` keyword come from early seaborn
# releases; later releases renamed them (`catplot` / `order`) -- confirm against
# the installed seaborn version before re-running this cell.
sns.factorplot('word', 'counts', data=tdf, x_order=sortedvi)
# Rotate the word labels on the x axis to vertical.
plt.xticks(rotation=90)
# Call savefig through plt rather than the bare name injected by %pylab, and
# use a tight bounding box so the rotated tick labels are not cut off in the PNG.
plt.savefig('AAS_wordFreq.png', bbox_inches='tight')
In [7]:
# Preview the first rows of the job table. When the cells are run in order it
# also carries one count column per search word by this point.
dat.head()
Out[7]:
In [8]:
# Write the job table as it stands at this point (including any per-word count
# columns added earlier) to an Excel file in the working directory.
dat.to_excel('AASjobRegExcel_basicNLP.xls')
A few key comparisons. First, the frequency of "hardware" relative to "software":
In [9]:
# Number of announcements that mention "software" at least once.
# np.nonzero returns a tuple of index arrays (one per dimension), so
# len(np.nonzero(...)) is the number of dimensions -- always 1 for a single
# column -- not the number of matches. Count the non-zero entries directly.
np.count_nonzero(dat['software'])
Out[9]:
In [ ]: