In [1]:
import re
import matplotlib.pyplot as plt
import numpy as np
import prettytable
# Matches one comma-separated field at a time. A parenthesised group such as
# "(a, b)" is consumed as part of the field, so commas inside parentheses do
# not split it. Nested parentheses are not handled by this pattern.
splitter = re.compile(r'(?:[^,(]|\([^)]*\))+')
In [2]:
# Each line of disciplines.txt is treated as one student's answer. The line is
# cut into fields with `splitter`, and the whitespace-stripped fields are
# stored as a set, giving one set of discipline strings per student.
with open('disciplines.txt') as f:
    students = [
        {field.strip() for field in splitter.findall(line)}
        for line in f
    ]
In [3]:
# Every distinct discipline string that appears in any student's answer.
# Built eagerly with set.union: the previous map(domains.update, students)
# relied on map() running for its side effect, which silently does nothing on
# Python 3 where map() is lazy. It is kept under its own name so it is not
# overwritten when `domains` is assigned the list of domain groups below.
all_disciplines = set().union(*students)
# Hand-built grouping of the free-text discipline strings into five broad
# domains. Strings are matched exactly, so spelling and capitalisation
# variants of the same discipline are listed separately.
physics = {
    'Aerospace Engineering',
    'Astrophysics',
    'Atmospheric Science',
    'Earth sciences (geology, oceanography, meteorology)',
    'Engineering (civil, mechanical, chemical)',
    'Geography',
    'Geological Engineering : Environmental',
    'Geological Engineering : Geotechnical',
    'Geophysics',
    'Hydrogeology',
    'Hydrology',
    'Physical Oceanography',
    'Physics',
    'Space sciences',
    'astronomy',
    'biomedical engineering',
}

chemistry = {
    'Chemical/Geochemical Oceanography',
    'Chemistry',
    'Materials Science',
    'Materials Science and Engineering',
    'materials science',
}

biology = {
    'Biological Oceanography',
    'Biomechanics',
    'Brain and neurosciences',
    'Environmental Science',
    'Life science (biology, genetics)',
    'Life science (ecology, zoology, botany)',
    'Medicine',
    'Nursing/research',
}

mathematics_cs = {
    'Applied Mathematics',
    'Applied math',
    'Computer science and electrical engineering',
    'Data analysis',
    'Mathematics',
    'Statistics',
    'Tech support',
    'data analysis',
    'lab tech',
    # NOTE(review): presumably a fragment left by splitting a comma-containing
    # answer (e.g. "Tech support, lab tech, or support programmer") -- confirm.
    'or support programmer',
    'statistics',
}

human_social = {
    'Admin',
    'Business',
    'Design',
    'Economics',
    'Human factors & applied psychology',
    'Humanities and social sciences',
    'Law',
    'Legal',
    'Librarianship',
    'Library Science',
    'Library Systems',
    'Library science',
    'Non-profit',
    'editing/publishing',
    'finance',
}

# `domains` and `domain_labels` are parallel lists: entry i of one describes
# entry i of the other.
domains = [
    physics,
    chemistry,
    biology,
    mathematics_cs,
    human_social,
]
domain_labels = [
    "Physics",
    "Chemistry",
    "Biology",
    "Mathematics & Computer Science",
    "Humanities & Social Sciences",
]
Note that the per-domain counts may sum to more than the total number of students (and the percentages to more than 100%), because a student may select disciplines from more than one domain.
In [4]:
# counts[i] is the number of students who listed at least one discipline
# belonging to domains[i]. A student is counted at most once per domain, but
# may be counted in several domains.
# The length is taken from `domains` itself instead of a hard-coded 5, so
# adding or removing a domain group cannot leave the array the wrong size.
counts = np.array(
    [
        # `not isdisjoint` is True exactly when the two sets share a member.
        sum(not student.isdisjoint(domain) for student in students)
        for domain in domains
    ],
    dtype=int,
)
In [5]:
# Express each domain's count as a percentage of all students.
ndat = len(domains)
dat = (100 * np.asarray(counts, dtype=float)) / len(students)

# One red bar per domain, centred on integer x positions 0..ndat-1.
fig = plt.figure(1)
p = fig.add_subplot(1, 1, 1)
p.bar(range(ndat), dat, align='center', facecolor='r')

# Half a unit of padding on each side of the bars; y axis fixed to 0-100 %.
p.axis([-0.5, ndat - 0.5, 0, 100])
p.set_xticks(range(ndat))
p.set_xticklabels(domain_labels, rotation=90)
p.set_ylabel("Percentage (%)")

# Write the figure to disk before showing it, then release the figure.
plt.savefig("students_by_domain.svg", bbox_inches="tight")
plt.show()
plt.close(fig)
In [6]:
# Two-column summary table: domain label and its percentage of students,
# formatted to two decimal places.
tab = prettytable.PrettyTable(["Domain", "Percentage (%)"])
for label, percentage in zip(domain_labels, dat):
    row = [label, "{0:.2f}".format(percentage)]
    tab.add_row(row)
In [7]:
# Plain-text rendering of the table. Written as a parenthesised call so the
# cell runs on Python 3 as well as Python 2 (a single argument prints the same).
print(tab.get_string())
In [8]:
# HTML source of the table, printed as text. Written as a parenthesised call
# so the cell runs on Python 3 as well as Python 2.
print(tab.get_html_string())
In [8]: