In [1]:
import pandas as pd
import glob
from bs4 import BeautifulSoup
In [2]:
def userlist(soup, email, user_df_list, pal_count, email_tag='EmailAddress'):
    """Collect the usernames of known users found among a document's e-mail elements.

    Parameters
    ----------
    soup
        Parsed document exposing ``find_all(name)``; every returned element
        must expose a ``string`` attribute (its text, or ``None``).
    email : str
        Fragment an address must contain (substring match) to be considered.
    user_df_list
        Known user ids; only ``in`` membership tests are performed on it.
    pal_count
        Unused. Kept so existing call sites keep working.
    email_tag : str, optional
        Name of the element holding an e-mail address. Defaults to
        ``'EmailAddress'`` (the value of ``tags[2]`` in this notebook) so the
        function no longer depends on a global defined in a later cell.

    Returns
    -------
    list of str
        The part before the first ``@`` of every matching address, in
        document order, duplicates included.
    """
    usernames = []
    for element in soup.find_all(email_tag):
        address = element.string
        if not address:
            # Element without text: there is no address to inspect.
            continue
        if email not in address:
            continue
        username = address.split('@')[0]
        if username in user_df_list:
            usernames.append(username)
    return usernames
In [3]:
def store_data(check, master_dict, tags, soup, email, user_df_list, keys, prin_pi, pal_count, usernames):
    """Append one award's values to the column lists held in ``master_dict``.

    Exactly one value is appended to each of ``keys[0]``, ``keys[1]``,
    ``keys[2]``, ``keys[4]`` and ``keys[5]`` so those columns stay row-aligned.
    ``keys[3]`` is not written here.

    Parameters
    ----------
    check
        Value stored in the ``keys[2]`` column (count of matching addresses).
    master_dict : dict of list
        Column name -> list of values; mutated in place.
    tags : sequence of str
        Element names: ``[0]`` award id, ``[1]`` date, ``[2]`` e-mail address,
        ``[3]`` role.
    soup
        Parsed document exposing ``find(name)`` and ``find_all(name)``.
    email : str
        Fragment the principal investigator's address must contain.
    user_df_list
        Known user ids; only ``in`` membership tests are performed on it.
    keys : sequence of str
        Column names of ``master_dict`` in the order used above.
    prin_pi : str
        Role text identifying the principal investigator.
    pal_count : int
        Number of known users found in the document.
    usernames : list of str
        Usernames of those users.

    Raises
    ------
    AttributeError
        If the award id or date element is missing. All values are resolved
        before ``master_dict`` is touched, so nothing is appended in that case.
    """
    # Resolve every value first; appending only afterwards guarantees a
    # failure cannot leave the columns with different lengths.
    award_id = soup.find(tags[0]).string
    date = soup.find(tags[1]).string
    if pal_count > 0 and usernames:
        palmetto_users = ','.join(usernames)
    else:
        palmetto_users = 'NaN'
    pi_status = _principal_investigator_status(soup, tags, email, user_df_list, prin_pi)

    master_dict[keys[5]].append(palmetto_users)
    master_dict[keys[0]].append(award_id)
    master_dict[keys[1]].append(date)
    master_dict[keys[2]].append(check)
    master_dict[keys[4]].append(pi_status)


def _principal_investigator_status(soup, tags, email, user_df_list, prin_pi):
    """Return "Yes"/"No" for whether a principal investigator is a known user, or 'NaN' if none has an address."""
    found_address = False
    for role in soup.find_all(tags[3]):
        if role.string != prin_pi:
            continue
        # NOTE(review): assumes a role element and the matching e-mail element
        # share the same parent element -- confirm against the award XML schema.
        owner = role.parent
        address_tag = owner.find(tags[2]) if owner is not None else None
        address = address_tag.string if address_tag is not None else None
        if not address:
            continue
        found_address = True
        if email in address and address.split('@')[0] in user_df_list:
            return "Yes"
    # 'NaN' (the missing-value marker already used for usernames) keeps the
    # column aligned when no principal investigator address exists.
    return "No" if found_address else 'NaN'
In [4]:
%%time
prin_pi = 'Principal Investigator'
email = 'clemson.edu'
tags = [
'AwardID',
'AwardEffectiveDate',
'EmailAddress',
'RoleCode'
]
master_dict = {
'AwardID':[],
'Date': [],
'CU_Email_Count':[],
'Palmetto_User_Count':[],
'Principal_Investigator':[],
'Palmetto_Username':[]
}
keys = [
'AwardID',
'Date',
'CU_Email_Count',
'Palmetto_User_Count',
'Principal_Investigator',
'Palmetto_Username'
]
tagl = len(tags)
check = 0
pal_count = 0
#csv converted to list
df = pd.read_csv('../Research-Trend/palmetto/palmetto_data/Users.csv') #type = dataframe
user_df_Series = df['LocalUserId'] #type = Series, palmetto user ids
user_df_list = list(user_df_Series)
awards = glob.glob('20*/*.xml')
for docs in awards:
with open(docs) as award_doc:
xml = award_doc.read()
soup = BeautifulSoup(xml,'xml')
try:
for emails in soup.find_all(tags[2]):
try:
at = emails.string #email address
us = emails.string.split('@')[0] #username
if email in at:
check+=1
if us in user_df_list:
pal_count+=1 #number of palmetto users found
except IndexError:
continue
except AttributeError:
continue
if check > 0:
#store palmetto username
usernames = userlist(soup,email,user_df_list,pal_count)
store_data(check,master_dict,tags,soup,email,user_df_list,keys,
prin_pi,pal_count,usernames)
check = 0
#store the number of palmetto users
if pal_count > 0:
master_dict[keys[3]].append(pal_count)
pal_count = 0
In [5]:
# Turn the collected column lists into a DataFrame. Each list is wrapped in a
# Series so that columns of different lengths can still be combined (pandas
# fills the missing positions with NaN).
master_df = pd.DataFrame(
    {column: pd.Series(values) for column, values in master_dict.items()}
)
# Fix the column order for display.
m_df = master_df[[
    'AwardID',
    'Date',
    'CU_Email_Count',
    'Palmetto_User_Count',
    'Principal_Investigator',
    'Palmetto_Username',
]]
m_df
Out[5]:
In [6]:
# Keep only the rows whose Palmetto user count is positive.
check = m_df[m_df['Palmetto_User_Count'].gt(0)]
check
Out[6]:
In [9]:
# Rows of `check` whose principal investigator was flagged "Yes".
# The mask is built from `check` itself, so it has exactly the index of the
# frame being filtered and does not rely on aligning with a different frame.
check2 = check.loc[check['Principal_Investigator'] == 'Yes']
check2
Out[9]:
In [11]:
def test(x, debug = False):
x = x * 2
if debug: print (x)
return x
# Call once silently, then once with the debug print enabled.
y = test(2)
y = test(2, debug=True)