In [220]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib
In [2]:
#%matplotlib inline
In [221]:
df = pd.read_excel('/Users/matt/projects/EPRV/data/missings2.xls', encoding='utf-8')
In [222]:
df.columns
Out[222]:
In [223]:
df.loc[16:17]
Out[223]:
In [224]:
#df['AgendaItem'].str.contains('Doppler Primer:')
In [225]:
sundf = df[df['AgendaItem'].str.contains('Doppler Primer:')].copy()
len(sundf)
Out[225]:
Let's create two new columns in our DataFrame: the Primer, and the PrimerID. The Primer column will contain the name of the Doppler Primer session (minus the Doppler Primer: prefix), and the PrimerID will be a session identifier that will later be used in plotting.
In [226]:
sundf['PrimerID'] = 0
In [227]:
sundf['Primer'] = [re.search(r'(.*):\s(.*)$', item).group(2) for item in sundf['AgendaItem']]
In [228]:
sundf[['AgendaItem', 'Primer']].head(3)
Out[228]:
In [229]:
sundf['Primer'].unique()
Out[229]:
Now loop through the five unique sessions, updating the PrimerID column for each participant:
In [230]:
dopID = 0
for agItem in sundf['Primer'].unique():
sundf.loc[sundf['Primer'] == agItem, 'PrimerID'] = dopID
dopID += 1
Create an abbreviated code for each session. This will be added to the nametag to spark conversation among participants.
In [231]:
sun_ses = ['NA', 'SA', 'IC', 'DC', 'SM']
A quick preview of the first few rows to see the result:
In [232]:
sundf[['AgendaItem', 'Primer', 'PrimerID']].head(4)
Out[232]:
In [233]:
mondf = df[df['AgendaItem'].str.contains('Monday Break-out:')].copy()
len(mondf)
Out[233]:
In [234]:
mondf['MonID'] = 0
mondf['Monday'] = [re.search(r'(.*):\s(.*)$', item).group(2) for item in mondf['AgendaItem']]
mondf['Monday'].unique()
monID = 0
for agItem in mondf['Monday'].unique():
mondf.loc[mondf['Monday'] == agItem, 'MonID'] = monID
monID += 1
In [235]:
mondf['Monday'].unique()
Out[235]:
In [236]:
mon_ses = ['NA', 'FS', 'TC', 'BC', 'FC']
In [237]:
mondf[['AgendaItem', 'Monday', 'MonID']].head(4)
Out[237]:
In [238]:
tuedf = df[df['AgendaItem'].str.contains('Tuesday Break-out:')].copy()
len(tuedf)
Out[238]:
In [239]:
tuedf['TueID'] = 0
tuedf['Tuesday'] = [re.search(r'(.*):\s(.*)$', item).group(2) for item in tuedf['AgendaItem']]
tuedf['Tuesday'].unique()
tuesID = 0
for agItem in tuedf['Tuesday'].unique():
tuedf.loc[tuedf['Tuesday'] == agItem, 'TueID'] = tuesID
tuesID += 1
In [240]:
tuedf['Tuesday'].unique()
Out[240]:
In [241]:
tue_ses = ['NA', 'ST', 'DC', 'LB', 'PS']
In [242]:
tuedf[['AgendaItem', 'Tuesday', 'TueID']].head(4)
Out[242]:
In [243]:
fulldf = df[['RegId', 'GroupId', 'FirstName', 'LastName', 'Company']]
In [244]:
print(len(fulldf))
fulldf = fulldf.drop_duplicates()
print(len(fulldf))
print(len(sundf))
print(len(mondf))
print(len(tuedf))
In [245]:
fulldf.columns
Out[245]:
In [246]:
sundf.columns
Out[246]:
In [247]:
newdf = pd.merge(fulldf, sundf, on=['RegId', 'GroupId', 'FirstName', 'LastName', 'Company'], how='left')
print(len(newdf))
newdf = pd.merge(newdf, mondf, on=['RegId', 'GroupId', 'FirstName', 'LastName', 'Company'], how='left')
print(len(newdf))
newdf = pd.merge(newdf, tuedf, on=['RegId', 'GroupId', 'FirstName', 'LastName', 'Company'], how='left')
print(len(newdf))
In [248]:
newdf.head(5)
Out[248]:
In [249]:
newdf.columns
Out[249]:
Now create a new DataFrame that is a subset of the newdf with only the columns of interest. Also, make sure the DataFrame is sorted by lastname, the index is reset, and it's a copy of newdf instead of a pointer to newdf.
In [250]:
finaldf = newdf[['FirstName', 'LastName', 'Company', 'Primer', 'PrimerID', 'Monday', 'MonID', 'Tuesday', 'TueID']].sort('LastName').reset_index().copy()
In [251]:
finaldf.head(5)
Out[251]:
In [252]:
len(finaldf)
Out[252]:
In [253]:
finaldf.columns
Out[253]:
Now replace all empty cells for "Company" to a very general location:
In [254]:
finaldf.Company = ['Earth' if pd.isnull(company_el) else company_el for company_el in finaldf.Company]
Replace NaNs for PrimerID with the "Not Attending" ID:
In [255]:
finaldf.PrimerID = [4 if pd.isnull(primerid_el) else primerid_el for primerid_el in finaldf.PrimerID]
Check for NaNs in the Monday ID:
In [256]:
len(finaldf[pd.isnull(finaldf['MonID'])])
Out[256]:
Replace NaNs for the MonID with the "Not Attending" ID:
In [257]:
finaldf.MonID = [4 if pd.isnull(monid_el) else monid_el for monid_el in finaldf.MonID]
In [258]:
len(finaldf[pd.isnull(finaldf['MonID'])])
Out[258]:
Replace NaNs for the TueID with the "Not Attending" ID:
In [259]:
len(finaldf[pd.isnull(finaldf['TueID'])])
Out[259]:
In [260]:
finaldf.TueID = [4 if pd.isnull(tueid_el) else tueid_el for tueid_el in finaldf.TueID]
In [261]:
len(finaldf[pd.isnull(finaldf['TueID'])])
Out[261]:
Test out the wrap-around text for institute for participants that have long institution names. This regular expression will look for institutions (or Companies, as RegOnline refers to them), and find items that have a '/', and if no '/', either a '-', ',', or 'at' in the text. If so, add a newline character to make the text wrap around to the next line.
We'll first test the output on a participant's institution that contains both a '/' and a '-':
In [262]:
p = re.compile ('(/|^(?!.*/).*-|^(?!.*/).*,|^(?!.*/).*\sat\s)')
p.subn(r'\1\n', finaldf.loc[2].Company)[0]
Out[262]:
And test a cell that is long, contains at, but at is part of a longer word:
In [263]:
#p.subn(r'\1\n', finaldf.loc[53].Company)[0]
And a quick test on a few more institutions:
In [264]:
[p.sub(r'\1\n', company_el) if len(company_el) > 30 else company_el for company_el in finaldf.head(5).Company.values]
Out[264]:
Now update the full Company column of the DataFrame:
In [265]:
finaldf.Company = [p.sub(r'\1\n', company_el) if len(company_el) > 30 else company_el for company_el in finaldf.Company.values]
In [266]:
png = mpimg.imread('/Users/matt/projects/EPRV/images/NameTag2.png')
In [267]:
png.shape
Out[267]:
In [268]:
import matplotlib.font_manager as mfm
fontpaths = fontpaths=['/System/Library/Fonts/',
'/Library/Fonts',
'/Library/Fonts/Microsoft',
'/usr/X11/lib/X11/fonts',
'/opt/X11/share/fonts',
'/Users/matt/Library/Fonts']
blaa = mfm.findSystemFonts(fontpaths=fontpaths)
In [269]:
colors = ['#FFE2A9', '#4BA4D8', '#768085', '#BF5338', '#335B8F']
colors2 = ['#335B8F', '#BF5338', '#768085', '#4BA4D8', '#FFE2A9']
colors3 = ['#4BA4D8', '#FFE2A9', '#BF5338', '#768085', '#335B8F']
circ_ypos = 775
name_dict = {'family': 'YaleNew-Roman',
'color': '#D6E8E1',
'weight': 'bold',
'size': 28
}
company_dict = {'family': 'YaleNew-Roman',
'color': '#D6E8E1',
'weight': 'bold',
'size': 16
}
circle_dict = {'family': 'YaleNew-Roman',
'color': '#1D2523',
'weight': 'normal',
'size': 20
}
def change_name_size(name, name_dict):
if len(name) < 16:
name_dict['size'] = 28
elif ((len(name) >= 16) and (len(name) < 19)):
name_dict['size'] = 24
elif ((len(name) >= 19) and (len(name) < 24)):
name_dict['size'] = 20
elif ((len(name) >= 24) and (len(name) < 30)):
name_dict['size'] = 17
else:
name_dict['size'] = 16
return name_dict
def change_company_size(company, company_dict):
newlines = len(re.findall(r'\n', finaldf.loc[0].Company))
if newlines == 0:
if len(company) < 15:
company_dict['size'] = 18
elif ((len(company) >= 15) and (len(company) < 30)):
company_dict['size'] = 14
elif ((len(company) >= 30) and (len(company) < 40)):
company_dict['size'] = 12
elif ((len(company) >= 40) and (len(company) < 50)):
company_dict['size'] = 10
else:
company_dict['size'] = 8
else:
if len(company) < 15:
company_dict['size'] = 18
elif ((len(company) >= 15) and (len(company) < 40)):
company_dict['size'] = 14
elif ((len(company) >= 40) and (len(company) < 50)):
company_dict['size'] = 12
else:
company_dict['size'] = 10
return company_dict
# The HP Color LaserJet CP4020 offsets things by 1/16th of an inch left-to-right.
# This fudge factor should fix that:
hrz_fdg = 1. / 16./ 8.5
leftarr = np.array([0.0294, 0.5, 0.0294, 0.5, 0.0294, 0.5]) + hrz_fdg
bottomarr = [0.091, 0.091, 0.364, 0.364, 0.637, 0.637]
width = 0.4706
height = 0.273
# loop through the total number of pages:
for page in range(int(np.ceil((len(finaldf))/6.))):
print('Now on page: {}'.format(page))
fig = plt.figure(figsize=(8.5, 11))
for indx in range(6):
# add an if statement to handle the last page if there are less than
# six participants remaining:
if ((page*6 + indx) < len(finaldf)):
rect = [leftarr[indx], bottomarr[indx], width, height]
ax = fig.add_axes(rect)
ax.imshow(png)
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)
print(u'Now making name tag for: {} {}'.format(finaldf.loc[page*6 + indx].FirstName, finaldf.loc[page*6 + indx].LastName))
#add name text:
name = finaldf.loc[page*6 + indx].FirstName + ' ' + finaldf.loc[page*6 + indx].LastName
this_name_dict = change_name_size(name, name_dict)
ax.text(600, 500, name, fontdict=this_name_dict, horizontalalignment='center')
#add company text:
company = finaldf.loc[page*6 + indx].Company
this_co_dict = change_company_size(company, company_dict)
ax.text(600, 625, company, fontdict=this_co_dict, horizontalalignment='center')
#add circles for sessions:
circ1 = plt.Circle((750, circ_ypos), 70, color=colors[int(finaldf.loc[page*6 + indx].PrimerID)])
fig.gca().add_artist(circ1)
ax.text(750, circ_ypos + 27.5, sun_ses[int(finaldf.loc[page*6 + indx].PrimerID)], fontdict=circle_dict, horizontalalignment='center')
circ2 = plt.Circle((925, circ_ypos), 70, color=colors2[int(finaldf.loc[page*6 + indx].MonID)])
fig.gca().add_artist(circ2)
ax.text(925, circ_ypos + 27.5, mon_ses[int(finaldf.loc[page*6 + indx].MonID)], fontdict=circle_dict, horizontalalignment='center')
circ3 = plt.Circle((1100, circ_ypos), 70, color=colors3[int(finaldf.loc[page*6 + indx].TueID)])
fig.gca().add_artist(circ3)
ax.text(1100, circ_ypos + 27.5, tue_ses[int(finaldf.loc[page*6 + indx].TueID)], fontdict=circle_dict, horizontalalignment='center')
plt.savefig('../nametags/more_missing_nameTags_bold_p'+str(page)+'.png', dpi=300)
In [270]:
finaldf.columns
Out[270]:
In [271]:
finaldf.FirstName.values
Out[271]:
In [272]:
finaldf.LastName.values
Out[272]:
In [273]:
hrz_fdg = 1. / 16./ 8.5
leftarr = np.array([0.0294, 0.5, 0.0294, 0.5, 0.0294, 0.5])
In [274]:
leftarr + hrz_fdg
Out[274]:
In [ ]: