In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib
In [2]:
#%matplotlib inline
In [112]:
df = pd.read_excel('/Users/matt/projects/EPRV/data/AttendeeReportCrop_20150703.xls', encoding='utf-8')
In [113]:
df.columns
Out[113]:
In [114]:
df.loc[36:37]
Out[114]:
In [115]:
sundf = df[df['AgendaItem'].str.contains('Doppler Primer:')].copy()
len(sundf)
Out[115]:
Let's create two new columns in our DataFrame: the Primer, and the PrimerID. The Primer column will contain the name of the Doppler Primer session (minus the Doppler Primer: prefix), and the PrimerID will be a session identifier that will later be used in plotting.
In [116]:
sundf['PrimerID'] = 0
In [117]:
sundf['Primer'] = [re.search(r'(.*):\s(.*)$', item).group(2) for item in sundf['AgendaItem']]
In [118]:
sundf[['AgendaItem', 'Primer']].head(3)
Out[118]:
In [119]:
sundf['Primer'].unique()
Out[119]:
Now loop through the five unique sessions, updating the PrimerID column for each participant:
In [120]:
dopID = 0
for agItem in sundf['Primer'].unique():
sundf.loc[sundf['Primer'] == agItem, 'PrimerID'] = dopID
dopID += 1
Create an abbreviated code for each session. This will be added to the nametag to spark conversation among participants.
In [121]:
sun_ses = ['IC', 'DC', 'SM', 'SA', 'NA']
A quick preview of the first few rows to see the result:
In [122]:
sundf[['AgendaItem', 'Primer', 'PrimerID']].head(4)
Out[122]:
In [123]:
mondf = df[df['AgendaItem'].str.contains('Monday Break-out:')].copy()
len(mondf)
Out[123]:
In [124]:
mondf['MonID'] = 0
mondf['Monday'] = [re.search(r'(.*):\s(.*)$', item).group(2) for item in mondf['AgendaItem']]
mondf['Monday'].unique()
monID = 0
for agItem in mondf['Monday'].unique():
mondf.loc[mondf['Monday'] == agItem, 'MonID'] = monID
monID += 1
In [125]:
mondf['Monday'].unique()
Out[125]:
In [126]:
mon_ses = ['FS', 'NA', 'TC', 'BC', 'FC']
In [127]:
mondf[['AgendaItem', 'Monday', 'MonID']].head(4)
Out[127]:
In [128]:
tuedf = df[df['AgendaItem'].str.contains('Tuesday Break-out:')].copy()
len(tuedf)
Out[128]:
In [129]:
tuedf['TueID'] = 0
tuedf['Tuesday'] = [re.search(r'(.*):\s(.*)$', item).group(2) for item in tuedf['AgendaItem']]
tuedf['Tuesday'].unique()
tuesID = 0
for agItem in tuedf['Tuesday'].unique():
tuedf.loc[tuedf['Tuesday'] == agItem, 'TueID'] = tuesID
tuesID += 1
In [130]:
tuedf['Tuesday'].unique()
Out[130]:
In [131]:
tue_ses = ['ST', 'DC', 'LB', 'PS', 'NA']
In [132]:
tuedf[['AgendaItem', 'Tuesday', 'TueID']].head(4)
Out[132]:
In [133]:
fulldf = df[['RegId', 'GroupId', 'FirstName', 'LastName', 'Company']]
In [134]:
print(len(fulldf))
fulldf = fulldf.drop_duplicates()
print(len(fulldf))
print(len(sundf))
print(len(mondf))
print(len(tuedf))
In [135]:
fulldf.columns
Out[135]:
In [136]:
sundf.columns
Out[136]:
In [137]:
newdf = pd.merge(fulldf, sundf, on=['RegId', 'GroupId', 'FirstName', 'LastName', 'Company'], how='left')
print(len(newdf))
newdf = pd.merge(newdf, mondf, on=['RegId', 'GroupId', 'FirstName', 'LastName', 'Company'], how='left')
print(len(newdf))
newdf = pd.merge(newdf, tuedf, on=['RegId', 'GroupId', 'FirstName', 'LastName', 'Company'], how='left')
print(len(newdf))
In [138]:
newdf.head(5)
Out[138]:
In [139]:
newdf.columns
Out[139]:
Now create a new DataFrame that is a subset of the newdf with only the columns of interest. Also, make sure the DataFrame is sorted by lastname, the index is reset, and it's a copy of newdf instead of a pointer to newdf.
In [140]:
finaldf = newdf[['FirstName', 'LastName', 'Company', 'Primer', 'PrimerID', 'Monday', 'MonID', 'Tuesday', 'TueID']].sort('LastName').reset_index().copy()
In [141]:
finaldf.head(5)
Out[141]:
In [142]:
len(finaldf)
Out[142]:
In [143]:
finaldf.columns
Out[143]:
Now replace all empty cells for "Company" to a very general location:
In [144]:
finaldf.Company = ['Earth' if pd.isnull(company_el) else company_el for company_el in finaldf.Company]
Replace NaNs for PrimerID with the "Not Attending" ID:
In [145]:
finaldf.PrimerID = [4 if pd.isnull(primerid_el) else primerid_el for primerid_el in finaldf.PrimerID]
Check for NaNs in the Monday ID:
In [146]:
len(finaldf[pd.isnull(finaldf['MonID'])])
Out[146]:
Replace NaNs for the MonID with the "Not Attending" ID:
In [147]:
finaldf.MonID = [4 if pd.isnull(monid_el) else monid_el for monid_el in finaldf.MonID]
In [148]:
len(finaldf[pd.isnull(finaldf['MonID'])])
Out[148]:
Replace NaNs for the TueID with the "Not Attending" ID:
In [149]:
len(finaldf[pd.isnull(finaldf['TueID'])])
Out[149]:
In [150]:
finaldf.TueID = [4 if pd.isnull(tueid_el) else tueid_el for tueid_el in finaldf.TueID]
In [151]:
len(finaldf[pd.isnull(finaldf['TueID'])])
Out[151]:
Test out the wrap-around text for institute for participants that have long institution names. This regular expression will look for institutions (or Companies, as RegOnline refers to them), and find items that have a '/', and if no '/', either a '-', ',', or 'at' in the text. If so, add a newline character to make the text wrap around to the next line.
We'll first test the output on a participant's institution that contains both a '/' and a '-':
In [152]:
p = re.compile ('(/|^(?!.*/).*-|^(?!.*/).*,|^(?!.*/).*\sat\s)')
p.subn(r'\1\n', finaldf.loc[2].Company)[0]
Out[152]:
And test a cell that is long, contains at, but at is part of a longer word:
In [153]:
p.subn(r'\1\n', finaldf.loc[53].Company)[0]
Out[153]:
And a quick test on a few more institutions:
In [154]:
[p.sub(r'\1\n', company_el) if len(company_el) > 30 else company_el for company_el in finaldf.head(5).Company.values]
Out[154]:
Now update the full Company column of the DataFrame:
In [155]:
finaldf.Company = [p.sub(r'\1\n', company_el) if len(company_el) > 30 else company_el for company_el in finaldf.Company.values]
In [156]:
png = mpimg.imread('/Users/matt/projects/EPRV/images/NameTag2.png')
In [157]:
png.shape
Out[157]:
In [158]:
import matplotlib.font_manager as mfm
fontpaths = fontpaths=['/System/Library/Fonts/',
'/Library/Fonts',
'/Library/Fonts/Microsoft',
'/usr/X11/lib/X11/fonts',
'/opt/X11/share/fonts',
'/Users/matt/Library/Fonts']
blaa = mfm.findSystemFonts(fontpaths=fontpaths)
In [159]:
colors = ['#FFE2A9', '#4BA4D8', '#768085', '#BF5338', '#335B8F']
colors2 = ['#335B8F', '#BF5338', '#768085', '#4BA4D8', '#FFE2A9']
colors3 = ['#4BA4D8', '#FFE2A9', '#BF5338', '#768085', '#335B8F']
circ_ypos = 775
name_dict = {'family': 'YaleNew-Roman',
'color': '#D6E8E1',
'weight': 'bold',
'size': 28
}
company_dict = {'family': 'YaleNew-Roman',
'color': '#D6E8E1',
'weight': 'bold',
'size': 16
}
circle_dict = {'family': 'YaleNew-Roman',
'color': '#1D2523',
'weight': 'normal',
'size': 20
}
def change_name_size(name, name_dict):
if len(name) < 16:
name_dict['size'] = 28
elif ((len(name) >= 16) and (len(name) < 19)):
name_dict['size'] = 24
elif ((len(name) >= 19) and (len(name) < 24)):
name_dict['size'] = 20
elif ((len(name) >= 24) and (len(name) < 30)):
name_dict['size'] = 17
else:
name_dict['size'] = 16
return name_dict
def change_company_size(company, company_dict):
newlines = len(re.findall(r'\n', finaldf.loc[0].Company))
if newlines == 0:
if len(company) < 15:
company_dict['size'] = 18
elif ((len(company) >= 15) and (len(company) < 30)):
company_dict['size'] = 14
elif ((len(company) >= 30) and (len(company) < 40)):
company_dict['size'] = 12
elif ((len(company) >= 40) and (len(company) < 50)):
company_dict['size'] = 10
else:
company_dict['size'] = 8
else:
if len(company) < 15:
company_dict['size'] = 18
elif ((len(company) >= 15) and (len(company) < 40)):
company_dict['size'] = 14
elif ((len(company) >= 40) and (len(company) < 50)):
company_dict['size'] = 12
else:
company_dict['size'] = 10
return company_dict
# The HP Color LaserJet CP4020 offsets things by 1/16th of an inch left-to-right.
# This fudge factor should fix that:
hrz_fdg = 1. / 16./ 8.5
leftarr = np.array([0.0294, 0.5, 0.0294, 0.5, 0.0294, 0.5]) + hrz_fdg
bottomarr = [0.091, 0.091, 0.364, 0.364, 0.637, 0.637]
width = 0.4706
height = 0.273
# loop through the total number of pages:
for page in range(int(np.ceil((len(finaldf))/6.))):
print('Now on page: {}'.format(page))
fig = plt.figure(figsize=(8.5, 11))
for indx in range(6):
# add an if statement to handle the last page if there are less than
# six participants remaining:
if ((page*6 + indx) < len(finaldf)):
rect = [leftarr[indx], bottomarr[indx], width, height]
ax = fig.add_axes(rect)
ax.imshow(png)
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)
print(u'Now making name tag for: {} {}'.format(finaldf.loc[page*6 + indx].FirstName, finaldf.loc[page*6 + indx].LastName))
#add name text:
name = finaldf.loc[page*6 + indx].FirstName + ' ' + finaldf.loc[page*6 + indx].LastName
this_name_dict = change_name_size(name, name_dict)
ax.text(600, 500, name, fontdict=this_name_dict, horizontalalignment='center')
#add company text:
company = finaldf.loc[page*6 + indx].Company
this_co_dict = change_company_size(company, company_dict)
ax.text(600, 625, company, fontdict=this_co_dict, horizontalalignment='center')
#add circles for sessions:
circ1 = plt.Circle((750, circ_ypos), 70, color=colors[int(finaldf.loc[page*6 + indx].PrimerID)])
fig.gca().add_artist(circ1)
ax.text(750, circ_ypos + 27.5, sun_ses[int(finaldf.loc[page*6 + indx].PrimerID)], fontdict=circle_dict, horizontalalignment='center')
circ2 = plt.Circle((925, circ_ypos), 70, color=colors2[int(finaldf.loc[page*6 + indx].MonID)])
fig.gca().add_artist(circ2)
ax.text(925, circ_ypos + 27.5, mon_ses[int(finaldf.loc[page*6 + indx].MonID)], fontdict=circle_dict, horizontalalignment='center')
circ3 = plt.Circle((1100, circ_ypos), 70, color=colors3[int(finaldf.loc[page*6 + indx].TueID)])
fig.gca().add_artist(circ3)
ax.text(1100, circ_ypos + 27.5, tue_ses[int(finaldf.loc[page*6 + indx].TueID)], fontdict=circle_dict, horizontalalignment='center')
plt.savefig('../nametags/nameTags_bold_p'+str(page)+'.png', dpi=300)
In [52]:
finaldf.columns
Out[52]:
In [54]:
finaldf.FirstName.values
Out[54]:
In [62]:
finaldf.LastName.values
Out[62]:
In [59]:
hrz_fdg = 1. / 16./ 8.5
leftarr = np.array([0.0294, 0.5, 0.0294, 0.5, 0.0294, 0.5])
In [60]:
leftarr + hrz_fdg
Out[60]:
In [ ]: