In [56]:
import pandas as pd 
import glob
from bs4 import BeautifulSoup

In [57]:
def store_data(check1,master_dict,tags,soup,email,user_df_list,keys,prin_pi,pal_count,usernames):
        #store palmetto user count 
        if pal_count > 1: 
            palm_user = ','.join(usernames)
            master_dict[keys[5]].append(palm_user)
        elif pal_count == 1:
            try:
                master_dict[keys[5]].append(usernames[0])
            except IndexError:
                master_dict[keys[5]].append('Error')
                pass
        elif pal_count == 0:
            master_dict[keys[5]].append('Not Found')
        #store ID
        awardID = soup.find(tags[0]).string
        master_dict[keys[0]].append(awardID)
        
        #Store Date 
        date = soup.find(tags[1]).string
        master_dict[keys[1]].append(date)
        
        #store cu count 
        master_dict[keys[2]].append(check1) 
        key = [x.string for x in soup.find_all(tags[2])]
        values = [x.string for x in soup.find_all(tags[3])]
        role_dict = dict(zip(key,values))
        #store role check result
        mail = ''
        try:
            for k,v in role_dict.items():
                if v == prin_pi:
                    user = k.split('@')[0]
                    mail = k
            if email in mail:
                if user in user_df_list: 
                    master_dict[keys[4]].append("Yes")
                else: 
                    master_dict[keys[4]].append("No") 
            else:
                master_dict[keys[4]].append("No")
        except KeyError:
            master_dict[keys[4]].append("Error")
            pass

In [58]:
%%time 

prin_pi = 'Principal Investigator'
email = 'clemson.edu'
tagl = len(tags)
check = 0
pal_count = 0 
usernames = []

tags = [
       'AwardID',
       'AwardEffectiveDate',
       'EmailAddress',
       'RoleCode'
       ]

master_dict = {
               'AwardID':[],
               'Date': [],
               'CU_Email_Count':[],
               'Palmetto_User_Count':[],
               'Principal_Investigator':[],
               'Palmetto_Username':[]
              }
keys = [
        'AwardID',
        'Date',
        'CU_Email_Count',
        'Palmetto_User_Count',
        'Principal_Investigator',
        'Palmetto_Username'
        ]

#csv converted to list 
df = pd.read_csv('../Research-Trend/palmetto/palmetto_data/Users.csv') #type = dataframe
user_df_Series = df['LocalUserId'] #type = Series, palmetto user ids 
user_df_list = list(user_df_Series)
#list of awards 
awards = glob.glob('20*/*.xml')
#parse awards 
for docs in awards:
    #open award
    with open(docs) as award_doc: 
        xml = award_doc.read()
        soup = BeautifulSoup(xml,'xml')
    try: 
        for emails in soup.find_all(tags[2]):
                at = emails.string #email address
                us = emails.string.split('@')[0] #username
                if email in at: 
                    check+=1
                    if us in user_df_list:
                        usernames.append(us)
                        pal_count+=1 #number of palmetto users found 
    except AttributeError:
        check = 0 
        continue
    if check > 0:
        check1 = check 
        check = 0 
        #store palmetto user count  
        master_dict[keys[3]].append(pal_count)
        #store other variables 
        store_data(check1,master_dict,tags,soup,email,user_df_list,keys,
                   prin_pi,pal_count,usernames)
        #reset values
        check1 = 0 
        pal_count = 0
        usernames = []


CPU times: user 10min 14s, sys: 7.92 s, total: 10min 22s
Wall time: 11min 2s

In [59]:
master_df = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in master_dict.items()]))
m_df = master_df[['AwardID','Date','CU_Email_Count','Palmetto_User_Count',
                   'Principal_Investigator','Palmetto_Username']]
m_df


Out[59]:
AwardID Date CU_Email_Count Palmetto_User_Count Principal_Investigator Palmetto_Username
0 0700508 05/15/2007 1 0 No Not Found
1 0701187 09/01/2007 1 0 No Not Found
2 0703042 08/29/2006 1 0 No Not Found
3 0703061 06/15/2007 3 1 No mkurz
4 0703117 09/01/2007 2 0 No Not Found
5 0703547 08/01/2007 3 1 No kevja
6 0706426 06/01/2007 1 0 No Not Found
7 0708899 09/01/2007 1 1 Yes sbritt
8 0710797 11/01/2006 1 0 No Not Found
9 0722313 08/01/2007 5 1 No geist
10 0722625 09/01/2007 1 1 Yes aapon
11 0722841 08/01/2007 4 0 No Not Found
12 0726023 09/01/2007 1 1 No sebgoa
13 0728035 09/01/2007 2 1 No kwang
14 0728626 09/15/2007 1 0 No Not Found
15 0728868 09/01/2007 2 1 Yes calkin
16 0730694 09/01/2007 1 0 No Not Found
17 0733125 08/01/2007 1 1 Yes mason
18 0733441 07/01/2007 2 0 No Not Found
19 0733711 09/15/2007 3 0 No Not Found
20 0733944 08/15/2007 1 1 Yes hubing
21 0735655 01/01/2008 2 0 No Not Found
22 0736007 11/01/2007 1 0 No Not Found
23 0736037 06/01/2007 1 0 No Not Found
24 0737514 06/01/2008 1 0 No Not Found
25 0738162 07/15/2007 1 0 No Not Found
26 0742296 10/01/2007 2 0 No Not Found
27 0744040 02/15/2008 1 0 No Not Found
28 0750814 03/01/2008 1 1 No lmark
29 0751132 04/15/2008 1 0 No Not Found
... ... ... ... ... ... ...
517 1620695 07/01/2016 1 1 Yes jb
518 1620922 09/30/2015 1 0 No Not Found
519 1624641 09/01/2016 1 0 No Not Found
520 1624705 09/15/2016 1 0 No Not Found
521 1629437 08/01/2016 1 0 No Not Found
522 1629934 09/15/2016 5 1 No sez
523 1632881 09/01/2016 1 1 Yes foulger
524 1633608 09/15/2016 5 4 Yes jb,khan,sez,smoysey
525 1633952 09/01/2016 1 0 No Not Found
526 1638321 09/01/2017 3 2 No switzef,isafro
527 1638888 08/15/2016 4 0 No Not Found
528 1640578 09/15/2016 1 0 No Not Found
529 1640645 09/01/2016 3 0 No Not Found
530 1640664 09/01/2016 1 0 No Not Found
531 1642102 10/01/2016 4 2 Yes nuyun,gemmill
532 1642143 01/01/2017 4 2 No kwang,nuyun
533 1642542 08/01/2016 1 1 Yes aapon
534 1643020 11/01/2016 2 1 Yes kwang
535 1644552 07/15/2016 4 1 No smoysey
536 1646691 08/01/2016 5 1 No etkraem
537 1647361 08/01/2016 2 1 Yes isafro
538 1660329 08/09/2016 1 0 No Not Found
539 1701290 08/01/2017 3 1 No kevja
540 1705448 11/01/2016 1 1 Yes xianl
541 1705450 11/01/2016 1 1 Yes xianl
542 1710898 09/30/2016 1 0 No Not Found
543 1719461 08/01/2016 1 0 No Not Found
544 1722482 01/01/2017 2 1 No bcampb7
545 1722997 10/01/2016 1 0 No Not Found
546 1725377 02/01/2017 4 0 No Not Found

547 rows × 6 columns


In [60]:
check = m_df.loc[m_df['Palmetto_User_Count'] > 0]
check


Out[60]:
AwardID Date CU_Email_Count Palmetto_User_Count Principal_Investigator Palmetto_Username
3 0703061 06/15/2007 3 1 No mkurz
5 0703547 08/01/2007 3 1 No kevja
7 0708899 09/01/2007 1 1 Yes sbritt
9 0722313 08/01/2007 5 1 No geist
10 0722625 09/01/2007 1 1 Yes aapon
12 0726023 09/01/2007 1 1 No sebgoa
13 0728035 09/01/2007 2 1 No kwang
15 0728868 09/01/2007 2 1 Yes calkin
17 0733125 08/01/2007 1 1 Yes mason
20 0733944 08/15/2007 1 1 Yes hubing
28 0750814 03/01/2008 1 1 No lmark
30 0753335 04/01/2008 1 1 No sebgoa
35 0800474 05/01/2008 1 1 Yes gli
37 0801150 08/01/2008 2 1 No kevja
43 0809820 06/01/2008 5 2 No abdkhan,smoysey
44 0811728 09/15/2008 1 1 No lflower
51 0820345 02/15/2009 1 1 Yes ffeltus
55 0825468 10/01/2008 1 1 Yes bcampb7
56 0825655 07/15/2008 1 1 No pisup
69 0838639 02/15/2009 1 1 Yes dhouse
73 0845593 02/15/2009 1 1 Yes bcdean
80 0855878 02/15/2009 1 1 No hubing
81 0856046 02/15/2009 2 1 Yes foulger
85 0901732 08/01/2009 3 1 No kevja
92 0907390 08/01/2009 1 1 Yes dperahi
95 0911122 03/01/2009 1 1 Yes jfrugol
96 0914478 07/15/2009 1 1 Yes rebholz
98 0915085 07/01/2009 2 1 No dhouse
99 0915214 09/01/2009 1 1 Yes khan
100 0916387 08/01/2009 1 1 Yes smithmc
... ... ... ... ... ... ...
478 1551262 05/18/2015 1 1 Yes rge
479 1551511 05/18/2015 1 1 Yes rge
480 1551534 09/01/2015 2 1 No ladner
482 1552214 08/01/2016 1 1 Yes mlarsen
486 1553945 04/01/2016 1 1 Yes fding
487 1554385 05/01/2016 1 1 Yes rgetman
491 1560070 09/01/2016 2 1 No sosolik
494 1563315 07/01/2016 1 1 Yes ptallap
495 1563426 05/01/2016 1 1 Yes xzhao5
496 1563435 08/01/2016 3 2 Yes ab7,sez
501 1566346 06/01/2016 1 1 Yes sriggs
503 1600767 02/01/2016 2 2 Yes macaule,spoznan
506 1601485 05/15/2016 2 1 Yes sears3
512 1611136 07/01/2016 1 1 Yes dperahi
517 1620695 07/01/2016 1 1 Yes jb
522 1629934 09/15/2016 5 1 No sez
523 1632881 09/01/2016 1 1 Yes foulger
524 1633608 09/15/2016 5 4 Yes jb,khan,sez,smoysey
526 1638321 09/01/2017 3 2 No switzef,isafro
531 1642102 10/01/2016 4 2 Yes nuyun,gemmill
532 1642143 01/01/2017 4 2 No kwang,nuyun
533 1642542 08/01/2016 1 1 Yes aapon
534 1643020 11/01/2016 2 1 Yes kwang
535 1644552 07/15/2016 4 1 No smoysey
536 1646691 08/01/2016 5 1 No etkraem
537 1647361 08/01/2016 2 1 Yes isafro
539 1701290 08/01/2017 3 1 No kevja
540 1705448 11/01/2016 1 1 Yes xianl
541 1705450 11/01/2016 1 1 Yes xianl
544 1722482 01/01/2017 2 1 No bcampb7

228 rows × 6 columns


In [61]:
check2 = check.loc[m_df['Principal_Investigator'] == 'Yes']
check2


Out[61]:
AwardID Date CU_Email_Count Palmetto_User_Count Principal_Investigator Palmetto_Username
7 0708899 09/01/2007 1 1 Yes sbritt
10 0722625 09/01/2007 1 1 Yes aapon
15 0728868 09/01/2007 2 1 Yes calkin
17 0733125 08/01/2007 1 1 Yes mason
20 0733944 08/15/2007 1 1 Yes hubing
35 0800474 05/01/2008 1 1 Yes gli
51 0820345 02/15/2009 1 1 Yes ffeltus
55 0825468 10/01/2008 1 1 Yes bcampb7
69 0838639 02/15/2009 1 1 Yes dhouse
73 0845593 02/15/2009 1 1 Yes bcdean
81 0856046 02/15/2009 2 1 Yes foulger
92 0907390 08/01/2009 1 1 Yes dperahi
95 0911122 03/01/2009 1 1 Yes jfrugol
96 0914478 07/15/2009 1 1 Yes rebholz
99 0915214 09/01/2009 1 1 Yes khan
100 0916387 08/01/2009 1 1 Yes smithmc
104 0919440 09/15/2009 2 2 Yes gemmill,jb
111 0927962 08/15/2009 1 1 Yes vblouin
122 0934299 07/15/2009 1 1 Yes hubing
134 0946932 09/01/2009 1 1 Yes dperahi
135 0947679 10/01/2009 1 1 Yes aapon
136 0948132 09/15/2009 1 1 Yes jmarty
138 0949790 03/01/2010 1 1 Yes mcateri
139 0950691 03/15/2010 1 1 Yes bcampb7
140 0950700 09/01/2010 1 1 Yes jfrugol
143 0953783 03/15/2010 1 1 Yes dominy
145 0954811 09/01/2010 1 1 Yes sbritt
146 0955096 04/15/2010 1 1 Yes gli
149 0960100 03/15/2010 4 2 Yes sbritt,sosolik
151 0961135 10/01/2009 1 1 Yes aapon
... ... ... ... ... ... ...
448 1522751 07/15/2015 1 1 Yes isafro
450 1527193 09/01/2015 1 1 Yes burr2
454 1531127 09/01/2015 4 2 Yes jmarty,kwang
456 1534304 01/01/2016 3 2 Yes ladner,ssarupr
464 1540702 08/01/2015 5 2 Yes pcarbaj,smoysey
465 1541944 03/15/2016 1 1 Yes ssarupr
467 1543373 04/01/2016 1 1 Yes xianl
469 1544910 10/01/2015 3 2 Yes jmarty,pisup
472 1547107 09/01/2015 3 2 Yes qsc,rebholz
478 1551262 05/18/2015 1 1 Yes rge
479 1551511 05/18/2015 1 1 Yes rge
482 1552214 08/01/2016 1 1 Yes mlarsen
486 1553945 04/01/2016 1 1 Yes fding
487 1554385 05/01/2016 1 1 Yes rgetman
494 1563315 07/01/2016 1 1 Yes ptallap
495 1563426 05/01/2016 1 1 Yes xzhao5
496 1563435 08/01/2016 3 2 Yes ab7,sez
501 1566346 06/01/2016 1 1 Yes sriggs
503 1600767 02/01/2016 2 2 Yes macaule,spoznan
506 1601485 05/15/2016 2 1 Yes sears3
512 1611136 07/01/2016 1 1 Yes dperahi
517 1620695 07/01/2016 1 1 Yes jb
523 1632881 09/01/2016 1 1 Yes foulger
524 1633608 09/15/2016 5 4 Yes jb,khan,sez,smoysey
531 1642102 10/01/2016 4 2 Yes nuyun,gemmill
533 1642542 08/01/2016 1 1 Yes aapon
534 1643020 11/01/2016 2 1 Yes kwang
537 1647361 08/01/2016 2 1 Yes isafro
540 1705448 11/01/2016 1 1 Yes xianl
541 1705450 11/01/2016 1 1 Yes xianl

142 rows × 6 columns