In [1]:
import pandas as pd 
import glob
import re 
import string
from bs4 import BeautifulSoup
import numpy as np
import matplotlib.pyplot as plt 

%matplotlib inline
# Indices into the `tags` list: AwardID, AwardEffectiveDate, AwardAmount,
# EmailAddress, AbstractNarration.
tag0, tag1, tag2, tag3, tag4 = range(5)

In [2]:
def dict_build(tags, store, sorted_keys, keyword_list, email='clemson.edu',
               alt_email='g.clemson.edu', soup=None):
    """Append one award's fields from a parsed XML document into `store`.

    Parameters
    ----------
    tags : list of XML tag names, ordered
        [AwardID, AwardEffectiveDate, AwardAmount, EmailAddress, AbstractNarration].
    store : dict, column name -> list of values; mutated in place.
    sorted_keys : column names aligned index-for-index with `tags`.
    keyword_list : words that mark an abstract as data/HPC related.
    email, alt_email : email domains that count as Clemson users.
    soup : parsed BeautifulSoup document.  Defaults to the notebook-global
        `soup`, so the existing call sites keep working unchanged.
    """
    if soup is None:
        # Backward compatibility: the original implementation read the
        # notebook-global `soup` set by the scanning loop.
        soup = globals()['soup']
    # Strip punctuation from the abstract before tokenizing; compiled once
    # instead of once per call through the keyword branch.
    punct_re = re.compile('[%s]' % re.escape(string.punctuation))
    for x in range(len(tags)):
        if x not in (3, 4):
            # Scalar fields: AwardID, AwardEffectiveDate, AwardAmount.
            try:
                store[sorted_keys[x]].append(soup.find(tags[x]).string)
            except AttributeError:
                # Tag missing from this file (find() returned None).
                continue
        elif x == 3:
            # Count how many email addresses belong to a Clemson domain.
            value = 0
            try:
                for e_mails in soup.find_all(tags[3]):
                    try:
                        e_check = e_mails.string.split('@')[1]
                    except IndexError:
                        # Bug fix: an address without '@' used to raise out of
                        # the function; skip it, matching the scan loop's
                        # handling of the same case.
                        continue
                    if e_check == email or e_check == alt_email:
                        value += 1
                store[sorted_keys[x]].append(value)
            except AttributeError:
                # An empty <EmailAddress/> (.string is None) aborts this
                # file's count, as in the original implementation.
                continue
        else:  # x == 4: keyword check on the abstract text
            try:
                abst = soup.find(tags[4]).string
                abs_word_list = punct_re.sub(' ', str(abst)).split()
                hit = any(word in keyword_list for word in abs_word_list)
                store[sorted_keys[x]].append('Found' if hit else 'NOT Found')
            except AttributeError:
                continue

In [3]:
%%time
    
keyword_list = ['computation', 'compute', 'simulation', 'computational', 'simulate', 'genome',
   'sequence', 'sequencing', 'molecule', 'large scale', 'large-scale', 'massive', 
    'hpc', 'molecular', 'simulations', 'genomic']

#*****Dictionary to set up for DataFrame************
store = {'ID':[],'Date':[],'Dollar_Amount':[],
'Inspector_Count':[],'Data_Word_Check':[]}
sorted_keys = ['ID','Date','Dollar_Amount',
'Inspector_Count','Data_Word_Check']
#***************************************************

tags =['AwardID','AwardEffectiveDate','AwardAmount',
       'EmailAddress','AbstractNarration']
file = glob.glob('20*/*.xml')

p = 0 #check for the existance of clemson email 

email = 'clemson.edu'
alt_email = 'g.clemson.edu'

    for x in file: 
        with open(x) as file2:
            xml = file2.read()
            soup = BeautifulSoup(xml,'xml') 
            try:
                for e in soup.find_all(tags[tag3]):
                    try:
                        inspector_email = e.string.split('@')[1]
                    except IndexError:
                        continue
                    if inspector_email == email or inspector_email == alt_email:
                        p+=1 #increment every time a clemson is found in a xml file 
            except AttributeError: 
                continue
            if p > 0:
                p = 0 #set test value back to 0 for next loop
                dict_build(tags,store,sorted_keys,keyword_list,email,alt_email)
            else:
                continue


CPU times: user 12min 58s, sys: 33.1 s, total: 13min 31s
Wall time: 26min

In [5]:
# Build the DataFrame column by column; wrapping each list in a Series
# tolerates ragged column lengths (shorter columns are NaN-padded).
fund_frame = pd.DataFrame({key: pd.Series(vals) for key, vals in store.items()})
# Fix the display order of the columns explicitly.
fund = fund_frame[['ID', 'Date', 'Dollar_Amount', 'Inspector_Count', 'Data_Word_Check']]
fund.head()


Out[5]:
ID Date Dollar_Amount Inspector_Count Data_Word_Check
0 1500007 10/15/2014 151149 1 Found
1 1501695 09/01/2015 2022723 1 Found
2 1502293 08/01/2015 13615 3 NOT Found
3 1503358 06/01/2015 99192 1 NOT Found
4 1503562 06/01/2015 15000 1 Found

In [6]:
# Rich display of the full award table.
# NOTE(review): this dumps all 528 rows; prefer fund.head() / fund.sample(5)
# in a shared notebook.
fund


Out[6]:
ID Date Dollar_Amount Inspector_Count Data_Word_Check
0 1500007 10/15/2014 151149 1 Found
1 1501695 09/01/2015 2022723 1 Found
2 1502293 08/01/2015 13615 3 NOT Found
3 1503358 06/01/2015 99192 1 NOT Found
4 1503562 06/01/2015 15000 1 Found
5 1504576 09/15/2015 176183 1 NOT Found
6 1504619 07/01/2015 227369 3 NOT Found
7 1505098 08/26/2014 335830 1 NOT Found
8 1507266 08/01/2015 375000 1 Found
9 1507529 08/01/2015 390000 1 Found
10 1510790 08/15/2015 359066 3 NOT Found
11 1511051 07/01/2015 339870 2 NOT Found
12 1512342 08/16/2014 206536 1 NOT Found
13 1513875 10/01/2015 304924 1 NOT Found
14 1517014 09/01/2015 282067 1 Found
15 1518455 09/01/2015 850000 4 NOT Found
16 1522191 09/15/2015 234780 2 Found
17 1522751 07/15/2015 180000 1 Found
18 1527165 09/01/2015 450000 1 NOT Found
19 1527193 09/01/2015 246411 1 Found
20 1527421 09/01/2015 307767 2 NOT Found
21 1529927 08/01/2015 332654 2 NOT Found
22 1531041 08/01/2015 1002447 1 NOT Found
23 1531127 09/01/2015 614582 4 NOT Found
24 1531607 09/15/2015 380599 3 NOT Found
25 1534304 01/01/2016 969089 3 Found
26 1537756 08/15/2015 349999 2 NOT Found
27 1537924 09/01/2015 255680 4 NOT Found
28 1538215 09/01/2015 300000 1 NOT Found
29 1539536 09/01/2015 396011 4 NOT Found
... ... ... ... ... ...
498 1442131 06/01/2014 98625 1 NOT Found
499 1443040 01/01/2015 1485021 1 Found
500 1444461 01/01/2016 1276249 3 Found
501 1444552 04/15/2015 2237255 1 Found
502 1444962 08/15/2014 204633 1 NOT Found
503 1446323 09/15/2014 93255 1 NOT Found
504 1447771 09/01/2014 300000 2 Found
505 1453607 09/01/2015 287754 1 NOT Found
506 1453775 02/01/2015 178018 1 Found
507 1454139 02/01/2015 500000 1 NOT Found
508 1456582 03/01/2015 128216 1 NOT Found
509 1457909 05/01/2015 396983 1 NOT Found
510 1458177 08/01/2015 130560 1 NOT Found
511 1460110 07/15/2015 210459 2 NOT Found
512 1460863 03/15/2015 330000 2 NOT Found
513 1460895 11/01/2015 335187 2 NOT Found
514 1462064 04/01/2015 4000 2 NOT Found
515 1462420 08/15/2014 258235 1 NOT Found
516 1462804 07/15/2015 375000 2 Found
517 1463808 06/15/2015 143956 1 NOT Found
518 1464459 06/01/2015 11499 1 NOT Found
519 1464637 04/01/2015 15999 2 Found
520 1701290 08/01/2017 22233 3 NOT Found
521 1705448 11/01/2016 53844 1 Found
522 1705450 11/01/2016 33224 1 NOT Found
523 1710898 09/30/2016 192469 1 NOT Found
524 1719461 08/01/2016 71870 1 Found
525 1722482 01/01/2017 49980 2 Found
526 1722997 10/01/2016 57172 1 Found
527 1725377 02/01/2017 152268 4 NOT Found

528 rows × 5 columns


In [7]:
# Keep only the awards whose abstract matched a data/HPC keyword.
keyword_hits = fund['Data_Word_Check'] == 'Found'
check = fund.loc[keyword_hits]

In [8]:
# Rich display of the keyword-matched subset (181 of 528 awards).
# NOTE(review): consider check.head() instead of the full table.
check


Out[8]:
ID Date Dollar_Amount Inspector_Count Data_Word_Check
0 1500007 10/15/2014 151149 1 Found
1 1501695 09/01/2015 2022723 1 Found
4 1503562 06/01/2015 15000 1 Found
8 1507266 08/01/2015 375000 1 Found
9 1507529 08/01/2015 390000 1 Found
14 1517014 09/01/2015 282067 1 Found
16 1522191 09/15/2015 234780 2 Found
17 1522751 07/15/2015 180000 1 Found
19 1527193 09/01/2015 246411 1 Found
25 1534304 01/01/2016 969089 3 Found
34 1541944 03/15/2016 122306 1 Found
35 1542727 06/01/2015 49991 1 Found
37 1543655 08/01/2015 49998 1 Found
38 1544910 10/01/2015 800000 3 Found
40 1546711 01/01/2016 41500 2 Found
41 1547107 09/01/2015 9808 3 Found
43 1547236 08/15/2015 248703 1 Found
44 1547399 08/01/2016 1633485 5 Found
45 1549977 09/01/2015 90075 1 Found
48 1551511 05/18/2015 178018 1 Found
49 1551534 09/01/2015 64214 2 Found
55 1553945 04/01/2016 515569 1 Found
56 1554385 05/01/2016 503922 1 Found
59 1559711 07/09/2015 63325 1 Found
62 1561190 06/01/2016 126191 1 Found
64 1563426 05/01/2016 272665 1 Found
65 1563435 08/01/2016 427724 3 Found
80 1608663 09/01/2016 532191 1 Found
81 1611136 07/01/2016 142000 1 Found
85 1619950 10/01/2016 212169 2 Found
... ... ... ... ... ...
456 1254609 07/01/2013 330202 1 Found
457 1254670 03/01/2013 400000 1 Found
460 1261359 10/01/2012 773996 1 Found
463 1264579 09/01/2013 300000 1 Found
465 1265410 08/15/2012 273320 1 Found
472 1403873 08/15/2014 327824 2 Found
474 1404981 05/01/2014 699843 1 Found
477 1407480 08/15/2014 150000 1 Found
478 1407623 09/01/2014 253554 1 Found
479 1409111 08/01/2014 100000 1 Found
482 1411174 02/01/2014 7500 1 Found
484 1418960 08/01/2014 206248 1 Found
485 1419023 01/01/2015 100000 1 Found
486 1419038 01/01/2015 100002 1 Found
487 1419100 08/01/2014 180000 1 Found
489 1423189 08/01/2014 499347 2 Found
495 1438325 08/01/2014 358386 2 Found
496 1440609 10/01/2014 291040 3 Found
497 1440659 09/01/2014 149009 1 Found
499 1443040 01/01/2015 1485021 1 Found
500 1444461 01/01/2016 1276249 3 Found
501 1444552 04/15/2015 2237255 1 Found
504 1447771 09/01/2014 300000 2 Found
506 1453775 02/01/2015 178018 1 Found
516 1462804 07/15/2015 375000 2 Found
519 1464637 04/01/2015 15999 2 Found
521 1705448 11/01/2016 53844 1 Found
524 1719461 08/01/2016 71870 1 Found
525 1722482 01/01/2017 49980 2 Found
526 1722997 10/01/2016 57172 1 Found

181 rows × 5 columns


In [9]:
# Yearly totals of all Clemson NSF award dollars, then a bar chart.
amounts = fund['Dollar_Amount'].astype(float)       # dollars as floats (copy)
amounts.index = pd.to_datetime(fund['Date'])        # index rows by award date
byyear = amounts.groupby(amounts.index.year).sum()  # name reused by a later cell
print(byyear)
my_plot = byyear.plot.bar(figsize=(12, 6), color='#F66733', legend=True,
                          label='CU Award Amounts')
my_plot.set_title('NSF Award Totals (2007-2017)')
my_plot.set_ylabel('10 million ($)')
my_plot.set_xlabel('Year')


2006      547565.0
2007     9241641.0
2008    13431600.0
2009    47706863.0
2010    26512890.0
2011    24445595.0
2012    21615326.0
2013    14194995.0
2014    27252198.0
2015    22153118.0
2016    26024463.0
2017     1224156.0
Name: Dollar_Amount, dtype: float64
Out[9]:
<matplotlib.text.Text at 0x2b656a6461d0>

In [10]:
# Palmetto cluster account list; the LocalUserId column holds the campus
# usernames we compare NSF email handles against.
df = pd.read_csv('palmetto/palmetto_data/Users.csv')
user_df_Series = df['LocalUserId']
user_df_list = user_df_Series.tolist()  # name reused by the scan loop below

In [12]:
%%time 

# Second pass over the NSF award XML files: collect every Clemson username
# seen in an EmailAddress tag, and for files naming at least one Palmetto
# cluster user, record that award's amount and effective date.
files = glob.glob('20*/*.xml')
username = [] #The clemson userIDs in the NSF files 
              #length = 802
pal_user_fund = []  # NOTE(review): unused here; rebound to a DataFrame in a later cell
test = 0  # flag: count of Palmetto users seen in the current file
amount = {'Date':[],'Award_Amount':[]}

for xml_f in files:
#Open files
    with open(xml_f) as new_file:
    #Store Data
        xml = new_file.read()
        #Parse the raw XML text
        soup = BeautifulSoup(xml,'xml')
        try:
            # Loop through the file's EmailAddress tags; keep every Clemson
            # handle and note whether it is a Palmetto account.
            for e in soup.find_all(tags[tag3]):
                try: 
                    mail = e.string.split('@')[1]
                    if mail == email or mail == alt_email:
                        user = e.string.split('@')[0]
                        username.append(user)
                        if user in user_df_list:
                            test+=1
                except IndexError: 
                    # Address without '@'; skip this entry.
                    continue
        except AttributeError:
            # An empty tag (.string is None) abandons this file entirely.
            continue
        if test > 0:
            test = 0 
            # At least one Palmetto user on this award: record its funding.
            try:
                amount['Award_Amount'].append(soup.find(tags[tag2]).string)
                amount['Date'].append(soup.find(tags[tag1]).string)
            except AttributeError: 
                continue


CPU times: user 12min 17s, sys: 25.1 s, total: 12min 42s
Wall time: 13min 29s

In [13]:
# NOTE(review): `mail` is leaked loop state from the scan cell above — this
# shows whatever domain happened to be split last, not a chosen value.
print(mail)


hotmail.com

In [14]:
# Compare how many awards each pass captured.
print(len(amount['Award_Amount'])) # awards involving at least one Palmetto user
print(len(fund['Dollar_Amount']))  # awards involving any Clemson user


220
528

In [15]:
# Usernames that appear in both the NSF files and the Palmetto account list.
# Perf fix: membership in a set is O(1) per lookup; the original scanned the
# whole user list for each of the 802 usernames (O(n*m)). Order and
# duplicates in `similar` are unchanged.
palmetto_accounts = set(user_df_list)
similar = [userID for userID in username if userID in palmetto_accounts]  # length = 282
print('Number of Palmetto usernames: '+ str(len(similar)))
print('Number of total Clemson usernames: '+ str(len(username)))


Number of Palmetto usernames: 282
Number of total Clemson usernames: 802

In [16]:
# Share of Clemson NSF usernames that also hold Palmetto accounts.
palmetto = len(similar)
all_users = len(username)

percentage = palmetto / all_users  # ~35%
note = ("Palmetto Users: " + str(round(percentage*100)) + '%\n' + 'Note: '
        + "Value out of total number of Clemson usernames.")
print(note)


Palmetto Users: 35%
Note: Value out of total number of Clemson usernames.

In [21]:
# Build the Palmetto-only funding series and line it up with the campus-wide
# yearly totals computed earlier (`byyear`).
pal_user_fund = pd.DataFrame(amount)                # records -> DataFrame
pal_fund = pal_user_fund[['Date', 'Award_Amount']]  # fix column order

pal_s = pal_fund['Award_Amount'].astype(float)      # dollars as floats
pal_s.index = pd.to_datetime(pal_fund['Date'])      # index by award date

g_pal_s = pal_s.groupby(pal_s.index.year).sum()     # yearly Palmetto totals

# Two aligned columns: Palmetto-user awards vs all Clemson awards.
stack_fund = pd.concat([g_pal_s, byyear], axis=1)
stack_fund.columns = ['CU Awards: Palmetto Users Only', 'CU Awards']
stack_fund.index.name = "Year"

In [22]:
# Show the yearly table (NaN -> 0 for years with no Palmetto-user awards),
# then draw grouped bars in Clemson purple/orange.
print(stack_fund.fillna(0))

#*****************************Graph Properties**************************************
my_plt = stack_fund.plot.bar(figsize=(12, 6),
                             color=['#522D80', '#F66733'],
                             legend=True)
my_plt.set_title('NSF Award Totals (2007-2017)')
my_plt.set_ylabel('10 million ($)')
my_plt.set_xlabel('Year')


      CU Awards: Palmetto Users Only   CU Awards
Year                                            
2006                             0.0    547565.0
2007                       2535162.0   9241641.0
2008                       4594505.0  13431600.0
2009                      11644465.0  47706863.0
2010                      13796148.0  26512890.0
2011                       5980112.0  24445595.0
2012                      13878261.0  21615326.0
2013                       5362704.0  14194995.0
2014                      22436154.0  27252198.0
2015                      12434660.0  22153118.0
2016                      18987197.0  26024463.0
2017                       1071888.0   1224156.0
Out[22]:
<matplotlib.text.Text at 0x2b658906e940>

In [23]:
# Persist the per-year comparison table for use outside the notebook.
stack_fund.to_csv('CU_Fund.csv')

In [24]:
import csv
csvfile = 'Cu_NSF_users.csv'

# Write one username per row.
# Fix: the csv module requires the file be opened with newline='' so the
# text layer does not translate line endings; without it, the explicit
# lineterminator='\n' could still become '\r\n' on Windows.
with open(csvfile, "w", newline='') as output:
    writer = csv.writer(output, lineterminator='\n')
    for val in username:
        writer.writerow([val])

In [35]:
# Read the usernames back from the CSV written above.
# NOTE(review): delimiter='\n' makes each physical line a single-field row,
# so '\n'.join(v) just unwraps that one field — a plain csv.reader(file)
# with the default delimiter would presumably behave the same here; verify.
# `file` also shadows a builtin name — consider renaming.
userlist = []
with open('Cu_NSF_users.csv','r') as file: 
    wordreader = csv.reader(file,delimiter='\n')
    for v in wordreader:
        userlist.append('\n'.join(v))
print(len(userlist))


802

In [41]:
prin_pi = 'Principal Investigator'
co_pi = 'Co-Principal Investigator'

# Spot-check one award file for a Principal Investigator role code.
with open('2014/1400009.xml') as test_file:
    xml_file = test_file.read()
    soup = BeautifulSoup(xml_file, 'xml')
    role = soup.find_all('RoleCode')
    for roles in role:
        # Bug fix: the original compared against `prin_val`, a name that only
        # existed in stale kernel state and raises NameError on a fresh run;
        # `prin_pi` is the constant defined above.
        if roles.string == prin_pi:
            print('Principle')  # sic — original output string preserved


Principle

In [ ]: