In [1]:
import pandas as pd
import glob
import re
import string
from bs4 import BeautifulSoup
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Indices into the XML tag list (tags) defined below
tag0 = 0  # AwardID
tag1 = 1  # AwardEffectiveDate
tag2 = 2  # AwardAmount
tag3 = 3  # EmailAddress
tag4 = 4  # AbstractNarration
In [2]:
def dict_build(soup, tags, store, sorted_keys, keyword_list, email='clemson.edu',
               alt_email='g.clemson.edu'):
    """Pull the fields named in tags out of a parsed award file (soup) into store."""
    count = 0
    value = 0
    for x in range(len(tags)):
        # Award ID, effective date, dollar amount
        if tags[x] != tags[tag3] and tags[x] != tags[tag4]:
            try:
                store[sorted_keys[x]].append(soup.find(tags[x]).string)
            except AttributeError:
                continue
        elif tags[x] == tags[tag3]:
            # Count the Clemson email addresses and store that count
            try:
                for e_mails in soup.find_all(tags[tag3]):
                    e_check = e_mails.string.split('@')[1]
                    if e_check == email or e_check == alt_email:
                        value += 1
                store[sorted_keys[x]].append(value)
                value = 0
            except (AttributeError, IndexError):
                continue
        elif tags[x] == tags[tag4]:
            # Keyword check against the abstract text
            try:
                abst = soup.find(tags[tag4]).string
                regex = re.compile('[%s]' % re.escape(string.punctuation))
                abs_punc_free = regex.sub(' ', str(abst))
                abs_word_list = abs_punc_free.split()
                for words in abs_word_list:
                    if words in keyword_list:
                        count += 1
                if count > 0:
                    store[sorted_keys[x]].append('Found')
                else:
                    store[sorted_keys[x]].append('NOT Found')
                count = 0
            except AttributeError:
                continue
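# A quick, illustrative sanity check of dict_build on a tiny hand-written record.
# The XML snippet and its values below are made up for demonstration only; the
# real inputs are the 20*/*.xml NSF award files processed in the next cell.
demo_xml = """<rootTag><Award>
  <AwardID>1234567</AwardID>
  <AwardEffectiveDate>01/01/2015</AwardEffectiveDate>
  <AwardAmount>500000</AwardAmount>
  <Investigator><EmailAddress>someone@clemson.edu</EmailAddress></Investigator>
  <AbstractNarration>Large-scale simulation of molecular systems.</AbstractNarration>
</Award></rootTag>"""
demo_soup = BeautifulSoup(demo_xml, 'xml')
demo_keys = ['ID', 'Date', 'Dollar_Amount', 'Inspector_Count', 'Data_Word_Check']
demo_store = {k: [] for k in demo_keys}
dict_build(demo_soup,
           ['AwardID', 'AwardEffectiveDate', 'AwardAmount', 'EmailAddress', 'AbstractNarration'],
           demo_store, demo_keys, ['simulation', 'molecular'])
print(demo_store)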
In [3]:
%%time
keyword_list = ['computation', 'compute', 'simulation', 'computational', 'simulate', 'genome',
'sequence', 'sequencing', 'molecule', 'large scale', 'large-scale', 'massive',
'hpc', 'molecular', 'simulations', 'genomic']
#*****Dictionary to set up for DataFrame************
store = {'ID':[],'Date':[],'Dollar_Amount':[],
'Inspector_Count':[],'Data_Word_Check':[]}
sorted_keys = ['ID','Date','Dollar_Amount',
'Inspector_Count','Data_Word_Check']
#***************************************************
tags =['AwardID','AwardEffectiveDate','AwardAmount',
'EmailAddress','AbstractNarration']
files = glob.glob('20*/*.xml')
p = 0  # counts Clemson email addresses found in the current file
email = 'clemson.edu'
alt_email = 'g.clemson.edu'
for x in files:
    with open(x) as file2:
        xml = file2.read()
    soup = BeautifulSoup(xml, 'xml')
    try:
        for e in soup.find_all(tags[tag3]):
            try:
                inspector_email = e.string.split('@')[1]
            except IndexError:
                continue
            if inspector_email == email or inspector_email == alt_email:
                p += 1  # increment every time a Clemson email is found in the file
    except AttributeError:
        continue
    if p > 0:
        p = 0  # reset the counter for the next file
        dict_build(soup, tags, store, sorted_keys, keyword_list, email, alt_email)
    else:
        continue
In [5]:
fund_frame = pd.DataFrame({k: pd.Series(v) for k, v in store.items()})
fund = fund_frame[['ID','Date','Dollar_Amount','Inspector_Count','Data_Word_Check']]
fund.head()
Out[5]:
In [6]:
fund
Out[6]:
In [7]:
check = fund.loc[fund['Data_Word_Check'] == 'Found']
In [8]:
check
Out[8]:
In [9]:
d1 = fund['Dollar_Amount']  # Series of award amounts
d1 = d1.astype(float)  # convert the amounts to float
d2 = pd.to_datetime(fund["Date"])  # convert the Date strings to datetimes
d1.index = d2  # index the amounts by date
byyear = d1.groupby(d1.index.year).sum()  # sum the award amounts per year
print(byyear)
my_plot = byyear.plot.bar(figsize = (12,6),color='#F66733',legend=True,label='CU Award Amounts')
my_plot.set_title('NSF Award Totals (2007-2017)')
my_plot.set_ylabel('10 million ($)')
my_plot.set_xlabel('Year')
Out[9]:
In [10]:
#csv converted to list
df = pd.read_csv('palmetto/palmetto_data/Users.csv') #type = dataframe
user_df_Series = df['LocalUserId'] #type = Series, palmetto user ids
user_df_list = list(user_df_Series)
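# Optional tweak (not in the original run): membership tests against a list are
# linear-time, so a set copy of the IDs could speed up the "user in ..." checks
# in the next cell if the Palmetto user list is large.
user_df_set = set(user_df_list)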
In [12]:
%%time
files = glob.glob('20*/*.xml')
username = []  # Clemson user IDs found in the NSF award files
#length = 802
pal_user_fund = []
test = 0
amount = {'Date': [], 'Award_Amount': []}
for xml_f in files:
    # Open and read the file
    with open(xml_f) as new_file:
        xml = new_file.read()
    # Parse the XML
    soup = BeautifulSoup(xml, 'xml')
    try:
        # Loop through the file and store every Clemson user ID
        for e in soup.find_all(tags[tag3]):
            try:
                mail = e.string.split('@')[1]
                if mail == email or mail == alt_email:
                    user = e.string.split('@')[0]
                    username.append(user)
                    if user in user_df_list:
                        test += 1
            except IndexError:
                continue
    except AttributeError:
        continue
    if test > 0:
        test = 0
        try:
            amount['Award_Amount'].append(soup.find(tags[tag2]).string)
            amount['Date'].append(soup.find(tags[tag1]).string)
        except AttributeError:
            continue
In [13]:
print(mail)  # leftover check: the last email domain parsed in the previous cell
In [14]:
print(len(amount['Award_Amount']))  # number of award amounts found for Palmetto users
print(len(fund['Dollar_Amount']))  # number of award amounts found for all CU users
In [15]:
similar = [userID for userID in username if userID in user_df_list] #length = 282
print('Number of Palmetto usernames: '+ str(len(similar)))
print('Number of total Clemson usernames: '+ str(len(username)))
In [16]:
palmetto = len(similar)
all_users = len(username)
percentage = palmetto/all_users #35%
print("Palmetto Users: " + str(round(percentage*100)) + '%\n' +
      "Note: value out of the total number of Clemson usernames.")
In [21]:
pal_user_fund = pd.DataFrame(amount)  # DataFrame of dates and award amounts
pal_fund = pal_user_fund[['Date', 'Award_Amount']]  # DataFrame
pal_s = pal_fund['Award_Amount']  # Series
pal_s = pal_s.astype(float)  # convert the amounts to float
pal2 = pd.to_datetime(pal_fund['Date'])  # convert the dates to datetimes
pal_s.index = pal2  # index the amounts by date
g_pal_s = pal_s.groupby(pal_s.index.year).sum()  # sum the award amounts per year
stack_fund = pd.concat([g_pal_s,byyear],axis=1) #combine series now dataframe
stack_fund.columns = ['CU Awards: Palmetto Users Only','CU Awards']#set column names
stack_fund.index.name = "Year"
In [22]:
print(stack_fund.fillna(0))
#*****************************Graph Properties**************************************
my_plt = stack_fund.plot.bar(figsize = (12,6),color=['#522D80','#F66733'],legend=True)
my_plt.set_title('NSF Award Totals (2007-2017)')
my_plt.set_ylabel('10 million ($)')
my_plt.set_xlabel('Year')
Out[22]:
In [23]:
stack_fund.to_csv(path_or_buf='CU_Fund.csv')
In [24]:
import csv
csvfile = 'Cu_NSF_users.csv'
with open(csvfile, "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    for val in username:
        writer.writerow([val])
In [35]:
userlist = []
with open('Cu_NSF_users.csv', 'r') as infile:
    wordreader = csv.reader(infile)
    for row in wordreader:
        if row:  # skip any blank lines
            userlist.append(row[0])
print(len(userlist))
In [41]:
prin_pi = 'Principal Investigator'
co_pi = 'Co-Principal Investigator'
with open('2014/1400009.xml') as test_file:
    xml_file = test_file.read()
soup = BeautifulSoup(xml_file, 'xml')
role = soup.find_all('RoleCode')
for roles in role:
    if roles.string == prin_pi:
        print('Principal')
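# Sketch (not part of the original analysis): tally Principal vs Co-Principal
# Investigator roles in the same test file, using the co_pi label defined above.
role_counts = {prin_pi: 0, co_pi: 0}
for r in soup.find_all('RoleCode'):
    if r.string in role_counts:
        role_counts[r.string] += 1
print(role_counts)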
In [ ]: