In [1]:
import pandas as pd 
import glob
import re 
import string
from bs4 import BeautifulSoup
import numpy as np
import matplotlib.pyplot as plt 

%matplotlib inline
# Indices into the `tags` list: AwardID, AwardEffectiveDate, AwardAmount,
# EmailAddress, AbstractNarration.
tag0, tag1, tag2, tag3, tag4 = range(5)

In [2]:
def dict_build(tags, store, sorted_keys, keyword_list, email='clemson.edu',
               alt_email='g.clemson.edu', soup=None):
    """Append one award's fields from a parsed XML document into `store`.

    Parameters
    ----------
    tags : list of XML tag names, ordered
        [AwardID, AwardEffectiveDate, AwardAmount, EmailAddress, AbstractNarration].
    store : dict, column name -> list of values; mutated in place.
    sorted_keys : column names aligned index-for-index with `tags`.
    keyword_list : words that mark an abstract as data/HPC related.
    email, alt_email : email domains that count as Clemson users.
    soup : parsed BeautifulSoup document.  Defaults to the notebook-global
        `soup`, so the existing call sites keep working unchanged.
    """
    if soup is None:
        # Backward compatibility: the original implementation read the
        # notebook-global `soup` set by the scanning loop.
        soup = globals()['soup']
    # Strip punctuation from the abstract before tokenizing; compiled once
    # instead of once per call through the keyword branch.
    punct_re = re.compile('[%s]' % re.escape(string.punctuation))
    for x in range(len(tags)):
        if x not in (3, 4):
            # Scalar fields: AwardID, AwardEffectiveDate, AwardAmount.
            try:
                store[sorted_keys[x]].append(soup.find(tags[x]).string)
            except AttributeError:
                # Tag missing from this file (find() returned None).
                continue
        elif x == 3:
            # Count how many email addresses belong to a Clemson domain.
            value = 0
            try:
                for e_mails in soup.find_all(tags[3]):
                    try:
                        e_check = e_mails.string.split('@')[1]
                    except IndexError:
                        # Bug fix: an address without '@' used to raise out of
                        # the function; skip it, matching the scan loop's
                        # handling of the same case.
                        continue
                    if e_check == email or e_check == alt_email:
                        value += 1
                store[sorted_keys[x]].append(value)
            except AttributeError:
                # An empty <EmailAddress/> (.string is None) aborts this
                # file's count, as in the original implementation.
                continue
        else:  # x == 4: keyword check on the abstract text
            try:
                abst = soup.find(tags[4]).string
                abs_word_list = punct_re.sub(' ', str(abst)).split()
                hit = any(word in keyword_list for word in abs_word_list)
                store[sorted_keys[x]].append('Found' if hit else 'NOT Found')
            except AttributeError:
                continue

In [3]:
%%time
    
keyword_list = ['computation', 'compute', 'simulation', 'computational', 'simulate', 'genome',
   'sequence', 'sequencing', 'molecule', 'large scale', 'large-scale', 'massive', 
    'hpc', 'molecular', 'simulations', 'genomic']

#*****Dictionary to set up for DataFrame************
store = {'ID':[],'Date':[],'Dollar_Amount':[],
'Inspector_Count':[],'Data_Word_Check':[]}
sorted_keys = ['ID','Date','Dollar_Amount',
'Inspector_Count','Data_Word_Check']
#***************************************************

tags =['AwardID','AwardEffectiveDate','AwardAmount',
       'EmailAddress','AbstractNarration']
file = glob.glob('20*/*.xml')

p = 0 #check for the existance of clemson email 

email = 'clemson.edu'
alt_email = 'g.clemson.edu'

    for x in file: 
        with open(x) as file2:
            xml = file2.read()
            soup = BeautifulSoup(xml,'xml') 
            try:
                for e in soup.find_all(tags[tag3]):
                    try:
                        inspector_email = e.string.split('@')[1]
                    except IndexError:
                        continue
                    if inspector_email == email or inspector_email == alt_email:
                        p+=1 #increment every time a clemson is found in a xml file 
            except AttributeError: 
                continue
            if p > 0:
                p = 0 #set test value back to 0 for next loop
                dict_build(tags,store,sorted_keys,keyword_list,email,alt_email)
            else:
                continue


CPU times: user 12min 58s, sys: 33.1 s, total: 13min 31s
Wall time: 26min

In [5]:
# Build the DataFrame column by column; wrapping each list in a Series
# tolerates ragged column lengths (shorter columns are NaN-padded).
fund_frame = pd.DataFrame({key: pd.Series(vals) for key, vals in store.items()})
# Fix the display order of the columns explicitly.
fund = fund_frame[['ID', 'Date', 'Dollar_Amount', 'Inspector_Count', 'Data_Word_Check']]
fund.head()


Out[5]:
ID Date Dollar_Amount Inspector_Count Data_Word_Check
0 1500007 10/15/2014 151149 1 Found
1 1501695 09/01/2015 2022723 1 Found
2 1502293 08/01/2015 13615 3 NOT Found
3 1503358 06/01/2015 99192 1 NOT Found
4 1503562 06/01/2015 15000 1 Found

In [6]:
# Rich display of the full award table.
# NOTE(review): this dumps all 528 rows; prefer fund.head() / fund.sample(5)
# in a shared notebook.
fund


Out[6]:
ID Date Dollar_Amount Inspector_Count Data_Word_Check
0 1500007 10/15/2014 151149 1 Found
1 1501695 09/01/2015 2022723 1 Found
2 1502293 08/01/2015 13615 3 NOT Found
3 1503358 06/01/2015 99192 1 NOT Found
4 1503562 06/01/2015 15000 1 Found
5 1504576 09/15/2015 176183 1 NOT Found
6 1504619 07/01/2015 227369 3 NOT Found
7 1505098 08/26/2014 335830 1 NOT Found
8 1507266 08/01/2015 375000 1 Found
9 1507529 08/01/2015 390000 1 Found
10 1510790 08/15/2015 359066 3 NOT Found
11 1511051 07/01/2015 339870 2 NOT Found
12 1512342 08/16/2014 206536 1 NOT Found
13 1513875 10/01/2015 304924 1 NOT Found
14 1517014 09/01/2015 282067 1 Found
15 1518455 09/01/2015 850000 4 NOT Found
16 1522191 09/15/2015 234780 2 Found
17 1522751 07/15/2015 180000 1 Found
18 1527165 09/01/2015 450000 1 NOT Found
19 1527193 09/01/2015 246411 1 Found
20 1527421 09/01/2015 307767 2 NOT Found
21 1529927 08/01/2015 332654 2 NOT Found
22 1531041 08/01/2015 1002447 1 NOT Found
23 1531127 09/01/2015 614582 4 NOT Found
24 1531607 09/15/2015 380599 3 NOT Found
25 1534304 01/01/2016 969089 3 Found
26 1537756 08/15/2015 349999 2 NOT Found
27 1537924 09/01/2015 255680 4 NOT Found
28 1538215 09/01/2015 300000 1 NOT Found
29 1539536 09/01/2015 396011 4 NOT Found
... ... ... ... ... ...
498 1442131 06/01/2014 98625 1 NOT Found
499 1443040 01/01/2015 1485021 1 Found
500 1444461 01/01/2016 1276249 3 Found
501 1444552 04/15/2015 2237255 1 Found
502 1444962 08/15/2014 204633 1 NOT Found
503 1446323 09/15/2014 93255 1 NOT Found
504 1447771 09/01/2014 300000 2 Found
505 1453607 09/01/2015 287754 1 NOT Found
506 1453775 02/01/2015 178018 1 Found
507 1454139 02/01/2015 500000 1 NOT Found
508 1456582 03/01/2015 128216 1 NOT Found
509 1457909 05/01/2015 396983 1 NOT Found
510 1458177 08/01/2015 130560 1 NOT Found
511 1460110 07/15/2015 210459 2 NOT Found
512 1460863 03/15/2015 330000 2 NOT Found
513 1460895 11/01/2015 335187 2 NOT Found
514 1462064 04/01/2015 4000 2 NOT Found
515 1462420 08/15/2014 258235 1 NOT Found
516 1462804 07/15/2015 375000 2 Found
517 1463808 06/15/2015 143956 1 NOT Found
518 1464459 06/01/2015 11499 1 NOT Found
519 1464637 04/01/2015 15999 2 Found
520 1701290 08/01/2017 22233 3 NOT Found
521 1705448 11/01/2016 53844 1 Found
522 1705450 11/01/2016 33224 1 NOT Found
523 1710898 09/30/2016 192469 1 NOT Found
524 1719461 08/01/2016 71870 1 Found
525 1722482 01/01/2017 49980 2 Found
526 1722997 10/01/2016 57172 1 Found
527 1725377 02/01/2017 152268 4 NOT Found

528 rows × 5 columns


In [7]:
# Keep only the awards whose abstract matched a data/HPC keyword.
keyword_hits = fund['Data_Word_Check'] == 'Found'
check = fund.loc[keyword_hits]

In [8]:
# Rich display of the keyword-matched subset (181 of 528 awards).
# NOTE(review): consider check.head() instead of the full table.
check


Out[8]:
ID Date Dollar_Amount Inspector_Count Data_Word_Check
0 1500007 10/15/2014 151149 1 Found
1 1501695 09/01/2015 2022723 1 Found
4 1503562 06/01/2015 15000 1 Found
8 1507266 08/01/2015 375000 1 Found
9 1507529 08/01/2015 390000 1 Found
14 1517014 09/01/2015 282067 1 Found
16 1522191 09/15/2015 234780 2 Found
17 1522751 07/15/2015 180000 1 Found
19 1527193 09/01/2015 246411 1 Found
25 1534304 01/01/2016 969089 3 Found
34 1541944 03/15/2016 122306 1 Found
35 1542727 06/01/2015 49991 1 Found
37 1543655 08/01/2015 49998 1 Found
38 1544910 10/01/2015 800000 3 Found
40 1546711 01/01/2016 41500 2 Found
41 1547107 09/01/2015 9808 3 Found
43 1547236 08/15/2015 248703 1 Found
44 1547399 08/01/2016 1633485 5 Found
45 1549977 09/01/2015 90075 1 Found
48 1551511 05/18/2015 178018 1 Found
49 1551534 09/01/2015 64214 2 Found
55 1553945 04/01/2016 515569 1 Found
56 1554385 05/01/2016 503922 1 Found
59 1559711 07/09/2015 63325 1 Found
62 1561190 06/01/2016 126191 1 Found
64 1563426 05/01/2016 272665 1 Found
65 1563435 08/01/2016 427724 3 Found
80 1608663 09/01/2016 532191 1 Found
81 1611136 07/01/2016 142000 1 Found
85 1619950 10/01/2016 212169 2 Found
... ... ... ... ... ...
456 1254609 07/01/2013 330202 1 Found
457 1254670 03/01/2013 400000 1 Found
460 1261359 10/01/2012 773996 1 Found
463 1264579 09/01/2013 300000 1 Found
465 1265410 08/15/2012 273320 1 Found
472 1403873 08/15/2014 327824 2 Found
474 1404981 05/01/2014 699843 1 Found
477 1407480 08/15/2014 150000 1 Found
478 1407623 09/01/2014 253554 1 Found
479 1409111 08/01/2014 100000 1 Found
482 1411174 02/01/2014 7500 1 Found
484 1418960 08/01/2014 206248 1 Found
485 1419023 01/01/2015 100000 1 Found
486 1419038 01/01/2015 100002 1 Found
487 1419100 08/01/2014 180000 1 Found
489 1423189 08/01/2014 499347 2 Found
495 1438325 08/01/2014 358386 2 Found
496 1440609 10/01/2014 291040 3 Found
497 1440659 09/01/2014 149009 1 Found
499 1443040 01/01/2015 1485021 1 Found
500 1444461 01/01/2016 1276249 3 Found
501 1444552 04/15/2015 2237255 1 Found
504 1447771 09/01/2014 300000 2 Found
506 1453775 02/01/2015 178018 1 Found
516 1462804 07/15/2015 375000 2 Found
519 1464637 04/01/2015 15999 2 Found
521 1705448 11/01/2016 53844 1 Found
524 1719461 08/01/2016 71870 1 Found
525 1722482 01/01/2017 49980 2 Found
526 1722997 10/01/2016 57172 1 Found

181 rows × 5 columns


In [9]:
# Yearly totals of all Clemson NSF award dollars, then a bar chart.
amounts = fund['Dollar_Amount'].astype(float)       # dollars as floats (copy)
amounts.index = pd.to_datetime(fund['Date'])        # index rows by award date
byyear = amounts.groupby(amounts.index.year).sum()  # name reused by a later cell
print(byyear)
my_plot = byyear.plot.bar(figsize=(12, 6), color='#F66733', legend=True,
                          label='CU Award Amounts')
my_plot.set_title('NSF Award Totals (2007-2017)')
my_plot.set_ylabel('10 million ($)')
my_plot.set_xlabel('Year')


2006      547565.0
2007     9241641.0
2008    13431600.0
2009    47706863.0
2010    26512890.0
2011    24445595.0
2012    21615326.0
2013    14194995.0
2014    27252198.0
2015    22153118.0
2016    26024463.0
2017     1224156.0
Name: Dollar_Amount, dtype: float64
Out[9]:
<matplotlib.text.Text at 0x2b656a6461d0>

In [10]:
# Palmetto cluster account list; the LocalUserId column holds the campus
# usernames we compare NSF email handles against.
df = pd.read_csv('palmetto/palmetto_data/Users.csv')
user_df_Series = df['LocalUserId']
user_df_list = user_df_Series.tolist()  # name reused by the scan loop below

In [12]:
%%time 

# Second pass over the NSF award XML files: collect every Clemson username
# seen in an EmailAddress tag, and for files naming at least one Palmetto
# cluster user, record that award's amount and effective date.
files = glob.glob('20*/*.xml')
username = [] #The clemson userIDs in the NSF files 
              #length = 802
pal_user_fund = []  # NOTE(review): unused here; rebound to a DataFrame in a later cell
test = 0  # flag: count of Palmetto users seen in the current file
amount = {'Date':[],'Award_Amount':[]}

for xml_f in files:
#Open files
    with open(xml_f) as new_file:
    #Store Data
        xml = new_file.read()
        #Parse the raw XML text
        soup = BeautifulSoup(xml,'xml')
        try:
            # Loop through the file's EmailAddress tags; keep every Clemson
            # handle and note whether it is a Palmetto account.
            for e in soup.find_all(tags[tag3]):
                try: 
                    mail = e.string.split('@')[1]
                    if mail == email or mail == alt_email:
                        user = e.string.split('@')[0]
                        username.append(user)
                        if user in user_df_list:
                            test+=1
                except IndexError: 
                    # Address without '@'; skip this entry.
                    continue
        except AttributeError:
            # An empty tag (.string is None) abandons this file entirely.
            continue
        if test > 0:
            test = 0 
            # At least one Palmetto user on this award: record its funding.
            try:
                amount['Award_Amount'].append(soup.find(tags[tag2]).string)
                amount['Date'].append(soup.find(tags[tag1]).string)
            except AttributeError: 
                continue


CPU times: user 12min 17s, sys: 25.1 s, total: 12min 42s
Wall time: 13min 29s

In [13]:
# NOTE(review): `mail` is leaked loop state from the scan cell above — this
# shows whatever domain happened to be split last, not a chosen value.
print(mail)


hotmail.com

In [14]:
# Compare how many awards each pass captured.
print(len(amount['Award_Amount'])) # awards involving at least one Palmetto user
print(len(fund['Dollar_Amount']))  # awards involving any Clemson user


220
528

In [15]:
# Usernames that appear in both the NSF files and the Palmetto account list.
# Perf fix: membership in a set is O(1) per lookup; the original scanned the
# whole user list for each of the 802 usernames (O(n*m)). Order and
# duplicates in `similar` are unchanged.
palmetto_accounts = set(user_df_list)
similar = [userID for userID in username if userID in palmetto_accounts]  # length = 282
print('Number of Palmetto usernames: '+ str(len(similar)))
print('Number of total Clemson usernames: '+ str(len(username)))


Number of Palmetto usernames: 282
Number of total Clemson usernames: 802

In [16]:
# Share of Clemson NSF usernames that also hold Palmetto accounts.
palmetto = len(similar)
all_users = len(username)

percentage = palmetto / all_users  # ~35%
note = ("Palmetto Users: " + str(round(percentage*100)) + '%\n' + 'Note: '
        + "Value out of total number of Clemson usernames.")
print(note)


Palmetto Users: 35%
Note: Value out of total number of Clemson usernames.

In [21]:
# Build the Palmetto-only funding series and line it up with the campus-wide
# yearly totals computed earlier (`byyear`).
pal_user_fund = pd.DataFrame(amount)                # records -> DataFrame
pal_fund = pal_user_fund[['Date', 'Award_Amount']]  # fix column order

pal_s = pal_fund['Award_Amount'].astype(float)      # dollars as floats
pal_s.index = pd.to_datetime(pal_fund['Date'])      # index by award date

g_pal_s = pal_s.groupby(pal_s.index.year).sum()     # yearly Palmetto totals

# Two aligned columns: Palmetto-user awards vs all Clemson awards.
stack_fund = pd.concat([g_pal_s, byyear], axis=1)
stack_fund.columns = ['CU Awards: Palmetto Users Only', 'CU Awards']
stack_fund.index.name = "Year"

In [22]:
# Show the yearly table (NaN -> 0 for years with no Palmetto-user awards),
# then draw grouped bars in Clemson purple/orange.
print(stack_fund.fillna(0))

#*****************************Graph Properties**************************************
my_plt = stack_fund.plot.bar(figsize=(12, 6),
                             color=['#522D80', '#F66733'],
                             legend=True)
my_plt.set_title('NSF Award Totals (2007-2017)')
my_plt.set_ylabel('10 million ($)')
my_plt.set_xlabel('Year')


      CU Awards: Palmetto Users Only   CU Awards
Year                                            
2006                             0.0    547565.0
2007                       2535162.0   9241641.0
2008                       4594505.0  13431600.0
2009                      11644465.0  47706863.0
2010                      13796148.0  26512890.0
2011                       5980112.0  24445595.0
2012                      13878261.0  21615326.0
2013                       5362704.0  14194995.0
2014                      22436154.0  27252198.0
2015                      12434660.0  22153118.0
2016                      18987197.0  26024463.0
2017                       1071888.0   1224156.0
Out[22]:
<matplotlib.text.Text at 0x2b658906e940>

In [23]:
# Persist the per-year comparison table for use outside the notebook.
stack_fund.to_csv('CU_Fund.csv')

In [24]:
import csv
csvfile = 'Cu_NSF_users.csv'

# Write one username per row.
# Fix: the csv module requires the file be opened with newline='' so the
# text layer does not translate line endings; without it, the explicit
# lineterminator='\n' could still become '\r\n' on Windows.
with open(csvfile, "w", newline='') as output:
    writer = csv.writer(output, lineterminator='\n')
    for val in username:
        writer.writerow([val])

In [35]:
# Read the usernames back from the CSV written above.
# NOTE(review): delimiter='\n' makes each physical line a single-field row,
# so '\n'.join(v) just unwraps that one field — a plain csv.reader(file)
# with the default delimiter would presumably behave the same here; verify.
# `file` also shadows a builtin name — consider renaming.
userlist = []
with open('Cu_NSF_users.csv','r') as file: 
    wordreader = csv.reader(file,delimiter='\n')
    for v in wordreader:
        userlist.append('\n'.join(v))
print(len(userlist))


802

In [41]:
prin_pi = 'Principal Investigator'
co_pi = 'Co-Principal Investigator'

# Spot-check one award file for a Principal Investigator role code.
with open('2014/1400009.xml') as test_file:
    xml_file = test_file.read()
    soup = BeautifulSoup(xml_file, 'xml')
    role = soup.find_all('RoleCode')
    for roles in role:
        # Bug fix: the original compared against `prin_val`, a name that only
        # existed in stale kernel state and raises NameError on a fresh run;
        # `prin_pi` is the constant defined above.
        if roles.string == prin_pi:
            print('Principle')  # sic — original output string preserved


Principle

In [ ]: