In [1]:
import pandas as pd 
import glob
from bs4 import BeautifulSoup

In [2]:
def userlist(soup,email,user_df_list,pal_count):
    usernames = [] 
        for emails in soup.find_all(tags[2]):
            try:
                at = emails.string #email
                us = emails.string.split('@')[0] #username
                if email in at:
                    if us in user_df_list:
                        usernames.append(us)
            except IndexError:
                continue 
        return usernames

In [3]:
def store_data(check,master_dict,tags,soup,email,user_df_list,keys,prin_pi,pal_count,usernames):
        #store palmetto user count 
        if pal_count > 1: 
            palm_user = ','.join(usernames)
            master_dict[keys[5]].append(palm_user)
        elif pal_count == 1:
            try:
                master_dict[keys[5]].append(usernames[0])
            except IndexError:
                pass
        else:
            master_dict[keys[5]].append('NaN')
        #store ID
        awardID = soup.find(tags[0]).string
        master_dict[keys[0]].append(awardID)
        
        #Store Date 
        date = soup.find(tags[1]).string
        master_dict[keys[1]].append(date)
        
        #store cu count 
        master_dict[keys[2]].append(check) 
        role_dict = {x.string:y.string 
                     for x in soup.find(tags[3]) 
                     for y in soup.find(tags[2]) 
                     if x.string == prin_pi}
        #store role check result 
        try:
            user = role_dict[prin_pi].split('@')[0] #username
            mail = role_dict[prin_pi] #email
            if email in mail:
                if user in user_df_list: 
                    master_dict[keys[4]].append("Yes")
                else: 
                    master_dict[keys[4]].append("No") 
            else:
                master_dict[keys[4]].append("No")
        except KeyError: 
            pass

In [4]:
%%time 

prin_pi = 'Principal Investigator'
email = 'clemson.edu'
tags = [
       'AwardID',
       'AwardEffectiveDate',
       'EmailAddress',
       'RoleCode'
       ]

master_dict = {
               'AwardID':[],
               'Date': [],
               'CU_Email_Count':[],
               'Palmetto_User_Count':[],
               'Principal_Investigator':[],
               'Palmetto_Username':[]
              }
keys = [
        'AwardID',
        'Date',
        'CU_Email_Count',
        'Palmetto_User_Count',
        'Principal_Investigator',
        'Palmetto_Username'
        ]
tagl = len(tags)
check = 0
pal_count = 0 

#csv converted to list 
df = pd.read_csv('../Research-Trend/palmetto/palmetto_data/Users.csv') #type = dataframe
user_df_Series = df['LocalUserId'] #type = Series, palmetto user ids 
user_df_list = list(user_df_Series)

awards = glob.glob('20*/*.xml')

for docs in awards: 
    with open(docs) as award_doc: 
        xml = award_doc.read()
        soup = BeautifulSoup(xml,'xml')
    try: 
        for emails in soup.find_all(tags[2]):
            try:
                at = emails.string #email address
                us = emails.string.split('@')[0] #username
                if email in at: 
                    check+=1
                    if us in user_df_list:
                        pal_count+=1 #number of palmetto users found 
            except IndexError:
                continue 
    except AttributeError:
        continue
    if check > 0:
        #store palmetto username
        usernames = userlist(soup,email,user_df_list,pal_count)
        store_data(check,master_dict,tags,soup,email,user_df_list,keys,
                   prin_pi,pal_count,usernames)
        check = 0
        #store the number of palmetto users
        if pal_count > 0:
            master_dict[keys[3]].append(pal_count)
        pal_count = 0


CPU times: user 10min 14s, sys: 12.9 s, total: 10min 27s
Wall time: 28min 9s

In [5]:
master_df = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in master_dict.items()]))
m_df = master_df[['AwardID','Date','CU_Email_Count','Palmetto_User_Count',
                   'Principal_Investigator','Palmetto_Username']]
m_df


Out[5]:
AwardID Date CU_Email_Count Palmetto_User_Count Principal_Investigator Palmetto_Username
0 0700508 05/15/2007 1 1.0 No mkurz
1 0701187 09/01/2007 1 1.0 No kevja
2 0703042 08/29/2006 1 1.0 No sbritt
3 0703061 06/15/2007 3 1.0 No geist
4 0703117 09/01/2007 2 1.0 Yes aapon
5 0703547 08/01/2007 3 1.0 No sebgoa
6 0706426 06/01/2007 1 1.0 No kwang
7 0708899 09/01/2007 1 1.0 Yes calkin
8 0710797 11/01/2006 1 1.0 No mason
9 0722313 08/01/2007 5 1.0 Yes hubing
10 0722625 09/01/2007 1 1.0 Yes lmark
11 0722841 08/01/2007 4 1.0 No sebgoa
12 0726023 09/01/2007 1 1.0 No gli
13 0728035 09/01/2007 2 1.0 No kevja
14 0728626 09/15/2007 1 2.0 No abdkhan,smoysey
15 0728868 09/01/2007 2 1.0 No lflower
16 0730694 09/01/2007 1 1.0 No ffeltus
17 0733125 08/01/2007 1 1.0 No bcampb7
18 0733441 07/01/2007 2 1.0 Yes pisup
19 0733711 09/15/2007 3 1.0 No dhouse
20 0733944 08/15/2007 1 1.0 No bcdean
21 0735655 01/01/2008 2 1.0 No hubing
22 0736007 11/01/2007 1 1.0 No foulger
23 0736037 06/01/2007 1 1.0 No kevja
24 0737514 06/01/2008 1 1.0 No dperahi
25 0738162 07/15/2007 1 1.0 Yes jfrugol
26 0742296 10/01/2007 2 1.0 No rebholz
27 0744040 02/15/2008 1 1.0 No dhouse
28 0750814 03/01/2008 1 1.0 No khan
29 0751132 04/15/2008 1 1.0 No smithmc
... ... ... ... ... ... ...
518 1620695 07/01/2016 1 NaN NaN NaN
519 1620922 09/30/2015 1 NaN NaN NaN
520 1624641 09/01/2016 1 NaN NaN NaN
521 1624705 09/15/2016 1 NaN NaN NaN
522 1629437 08/01/2016 1 NaN NaN NaN
523 1629934 09/15/2016 5 NaN NaN NaN
524 1632881 09/01/2016 1 NaN NaN NaN
525 1633608 09/15/2016 5 NaN NaN NaN
526 1633952 09/01/2016 1 NaN NaN NaN
527 1638321 09/01/2017 3 NaN NaN NaN
528 1638888 08/15/2016 4 NaN NaN NaN
529 1640578 09/15/2016 1 NaN NaN NaN
530 1640645 09/01/2016 3 NaN NaN NaN
531 1640664 09/01/2016 1 NaN NaN NaN
532 1642102 10/01/2016 4 NaN NaN NaN
533 1642143 01/01/2017 4 NaN NaN NaN
534 1642542 08/01/2016 1 NaN NaN NaN
535 1643020 11/01/2016 2 NaN NaN NaN
536 1644552 07/15/2016 4 NaN NaN NaN
537 1646691 08/01/2016 5 NaN NaN NaN
538 1647361 08/01/2016 2 NaN NaN NaN
539 1660329 08/09/2016 1 NaN NaN NaN
540 1701290 08/01/2017 3 NaN NaN NaN
541 1705448 11/01/2016 1 NaN NaN NaN
542 1705450 11/01/2016 1 NaN NaN NaN
543 1710898 09/30/2016 1 NaN NaN NaN
544 1719461 08/01/2016 1 NaN NaN NaN
545 1722482 01/01/2017 2 NaN NaN NaN
546 1722997 10/01/2016 1 NaN NaN NaN
547 1725377 02/01/2017 4 NaN NaN NaN

548 rows × 6 columns


In [6]:
check = m_df.loc[m_df['Palmetto_User_Count'] > 0]
check


Out[6]:
AwardID Date CU_Email_Count Palmetto_User_Count Principal_Investigator Palmetto_Username
0 0700508 05/15/2007 1 1.0 No mkurz
1 0701187 09/01/2007 1 1.0 No kevja
2 0703042 08/29/2006 1 1.0 No sbritt
3 0703061 06/15/2007 3 1.0 No geist
4 0703117 09/01/2007 2 1.0 Yes aapon
5 0703547 08/01/2007 3 1.0 No sebgoa
6 0706426 06/01/2007 1 1.0 No kwang
7 0708899 09/01/2007 1 1.0 Yes calkin
8 0710797 11/01/2006 1 1.0 No mason
9 0722313 08/01/2007 5 1.0 Yes hubing
10 0722625 09/01/2007 1 1.0 Yes lmark
11 0722841 08/01/2007 4 1.0 No sebgoa
12 0726023 09/01/2007 1 1.0 No gli
13 0728035 09/01/2007 2 1.0 No kevja
14 0728626 09/15/2007 1 2.0 No abdkhan,smoysey
15 0728868 09/01/2007 2 1.0 No lflower
16 0730694 09/01/2007 1 1.0 No ffeltus
17 0733125 08/01/2007 1 1.0 No bcampb7
18 0733441 07/01/2007 2 1.0 Yes pisup
19 0733711 09/15/2007 3 1.0 No dhouse
20 0733944 08/15/2007 1 1.0 No bcdean
21 0735655 01/01/2008 2 1.0 No hubing
22 0736007 11/01/2007 1 1.0 No foulger
23 0736037 06/01/2007 1 1.0 No kevja
24 0737514 06/01/2008 1 1.0 No dperahi
25 0738162 07/15/2007 1 1.0 Yes jfrugol
26 0742296 10/01/2007 2 1.0 No rebholz
27 0744040 02/15/2008 1 1.0 No dhouse
28 0750814 03/01/2008 1 1.0 No khan
29 0751132 04/15/2008 1 1.0 No smithmc
... ... ... ... ... ... ...
198 1058885 09/01/2011 1 1.0 No rge
199 1059812 10/01/2010 1 1.0 Yes rge
200 1060545 06/01/2011 1 1.0 No ladner
201 1061524 03/01/2011 2 1.0 No mlarsen
202 1062155 11/01/2010 2 1.0 Yes fding
203 1062873 03/15/2011 2 1.0 No rgetman
204 1063679 10/01/2010 4 1.0 No sosolik
205 1064230 10/01/2010 3 1.0 No ptallap
206 1066567 02/01/2011 1 1.0 No xzhao5
207 1066658 08/01/2011 1 2.0 Yes ab7,sez
208 1067995 02/01/2011 1 1.0 Yes sriggs
209 1068906 03/01/2011 2 2.0 No macaule,spoznan
210 1068977 04/15/2011 2 1.0 No sears3
211 1100185 08/01/2011 1 1.0 No dperahi
212 1100356 07/15/2011 1 1.0 No jb
213 1100752 06/01/2011 1 1.0 No sez
214 1100765 04/01/2011 1 1.0 No foulger
215 1101251 08/15/2011 1 4.0 No jb,khan,sez,smoysey
216 1101301 08/01/2011 3 2.0 No switzef,isafro
217 1101845 06/01/2011 1 2.0 Yes nuyun,gemmill
218 1102889 09/01/2010 1 2.0 No kwang,nuyun
219 1104181 09/01/2011 1 1.0 No aapon
220 1104527 08/15/2011 1 1.0 No kwang
221 1104646 01/01/2011 1 1.0 No smoysey
222 1105307 09/01/2011 1 1.0 Yes etkraem
223 1107786 08/15/2011 1 1.0 No isafro
224 1112593 07/15/2011 1 1.0 No kevja
225 1112704 06/01/2013 1 1.0 No xianl
226 1115520 08/15/2011 1 1.0 Yes xianl
227 1116075 09/01/2011 2 1.0 No bcampb7

228 rows × 6 columns


In [9]:
check2 = check.loc[m_df['Principal_Investigator'] == 'Yes']
check2


Out[9]:
AwardID Date CU_Email_Count Palmetto_User_Count Principal_Investigator Palmetto_Username
4 0703117 09/01/2007 2 1.0 Yes aapon
7 0708899 09/01/2007 1 1.0 Yes calkin
9 0722313 08/01/2007 5 1.0 Yes hubing
10 0722625 09/01/2007 1 1.0 Yes lmark
18 0733441 07/01/2007 2 1.0 Yes pisup
25 0738162 07/15/2007 1 1.0 Yes jfrugol
35 0800474 05/01/2008 1 1.0 Yes pisup
38 0801435 05/01/2008 1 1.0 Yes marisak
44 0811728 09/15/2008 1 1.0 Yes jb
52 0821918 09/01/2008 1 2.0 Yes sbritt,sosolik
55 0825468 10/01/2008 1 3.0 Yes gemmill,pepin,jb
56 0825655 07/15/2008 1 1.0 Yes aapon
58 0825942 09/01/2008 1 1.0 Yes gli
59 0826067 08/15/2008 2 1.0 Yes kevja
65 0834219 09/01/2008 1 1.0 Yes aapon
71 0844954 09/01/2008 2 1.0 Yes pepin
75 0847132 07/01/2009 1 1.0 Yes zhu3
76 0850695 07/01/2009 2 1.0 Yes stewari
77 0853835 08/15/2009 1 1.0 Yes dperahi
78 0853873 06/01/2009 1 1.0 Yes rebholz
80 0855878 02/15/2009 1 1.0 Yes rge
82 0900163 08/15/2009 1 1.0 Yes thrash5
83 0900182 08/15/2009 1 2.0 Yes geist,sebgoa
85 0901732 08/01/2009 3 1.0 Yes jb
96 0914478 07/15/2009 1 1.0 Yes ssarupr
102 0918970 09/01/2009 1 1.0 Yes dhouse
108 0923379 08/01/2009 5 1.0 Yes bcampb7
120 0933218 08/15/2009 1 4.0 Yes aapon,pepin,duffy2,galen
122 0934299 07/15/2009 1 1.0 Yes bcampb7
123 0934300 08/01/2009 3 1.0 Yes bcdean
... ... ... ... ... ... ...
132 0944315 04/01/2010 1 1.0 Yes spoznan
135 0947679 10/01/2009 1 1.0 Yes fwillia
141 0950710 01/01/2010 2 1.0 Yes xianl
142 0952160 02/15/2010 1 1.0 Yes sjoerg
143 0953783 03/15/2010 1 2.0 Yes jmarty,kwang
145 0954811 09/01/2010 1 1.0 Yes sbritt
146 0955096 04/15/2010 1 2.0 Yes jb,ooyekoy
148 0959124 05/01/2010 1 1.0 Yes blenner
150 0960586 07/01/2010 3 1.0 Yes walt
154 0965624 07/01/2010 1 1.0 Yes edimit
155 0965814 06/01/2010 1 1.0 Yes kwang
160 0969474 08/01/2010 1 3.0 Yes jb,gemmill,kwang
161 1000667 07/15/2010 2 1.0 Yes jb
162 1001553 08/01/2010 3 1.0 Yes ffeltus
170 1008600 09/01/2010 1 1.0 Yes sears3
171 1011478 07/01/2010 3 1.0 Yes seksiog
177 1026385 09/15/2010 5 1.0 Yes lflower
178 1028074 04/01/2010 1 1.0 Yes foulger
182 1037544 07/01/2010 1 2.0 Yes rebholz,heister
185 1041631 10/01/2010 1 2.0 Yes jmarty,kwang
187 1043129 06/01/2011 1 1.0 Yes luofeng
191 1049765 09/01/2010 2 1.0 Yes xianl
196 1055950 01/15/2011 1 2.0 Yes qsc,rebholz
199 1059812 10/01/2010 1 1.0 Yes rge
202 1062155 11/01/2010 2 1.0 Yes fding
207 1066658 08/01/2011 1 2.0 Yes ab7,sez
208 1067995 02/01/2011 1 1.0 Yes sriggs
217 1101845 06/01/2011 1 2.0 Yes nuyun,gemmill
222 1105307 09/01/2011 1 1.0 Yes etkraem
226 1115520 08/15/2011 1 1.0 Yes xianl

63 rows × 6 columns


In [11]:
def test(x, debug = False):
    x = x * 2
    if debug: print (x)
    return x

y = test(2)
y = test(2, True)


4