In [2]:
import pandas as pd 
import glob
import re 
import string
from bs4 import BeautifulSoup
import numpy as np
import matplotlib.pyplot as plt 

%matplotlib inline
tag0 = 0
tag1 = 1 
tag2 = 2
tag3 = 3
tag4 = 4

In [3]:
def dict_build(tags,tag3,tag4,store,sorted_keys,keyword_list,email='clemson.edu',
alt_email='g.clemson.edu'):
    count = 0
    value = 0 
    for x in range(len(tags)):
                #Store Id, Effective Date, Dollar Amount
                if tags[x] != tags[tag3] and tags[x] != tags[tag4]:
                    try: 
                        store[sorted_keys[x]].append(soup.find(tags[x]).string)
                    except AttributeError: 
                        continue
                elif tags[x] == tags[tag3]: 
                    #User count is stored 
                    try:
                        for e_mails in soup.find_all(tags[tag3]):
                            e_check = e_mails.string.split('@')[1]
                            if e_check == email or e_check == alt_email:
                                value+=1
                        store[sorted_keys[x]].append(value)
                        value = 0 
                    except AttributeError: 
                        continue
                elif tags[x] == tags[tag4]: 
                    #Keyword check 
                    try:
                        abst = soup.find(tags[tag4]).string
                        regex = re.compile('[%s]' % re.escape(string.punctuation))
                        abs_punc_free = regex.sub(' ', str(abst))
                        abs_word_list = abs_punc_free.split()
                        for words in abs_word_list: 
                            if words in keyword_list:
                                count+=1
                        if count > 0: 
                            store[sorted_keys[x]].append('Found')
                            count = 0 
                        else:
                            store[sorted_keys[x]].append('NOT Found')
                            count = 0 
                    except AttributeError:
                        continue

In [4]:
%%time
    
keyword_list = ['computation', 'compute', 'simulation', 'computational', 'simulate', 'genome',
   'sequence', 'sequencing', 'molecule', 'large scale', 'large-scale', 'massive', 
    'hpc', 'molecular', 'simulations', 'genomic']

#*****Dictionary to set up for DataFrame************
store = {'ID':[],'Date':[],'Dollar_Amount':[],
'Inspector_Count':[],'Data_Word_Check':[]}
sorted_keys = ['ID','Date','Dollar_Amount',
'Inspector_Count','Data_Word_Check']
#***************************************************

tags =['AwardID','AwardEffectiveDate','AwardAmount',
       'EmailAddress','AbstractNarration']
file = glob.glob('20*/*.xml')

email = 'clemson.edu'
alt_email = 'g.clemson.edu'

    for x in file: 
        with open(x) as file2:
            xml = file2.read()
            soup = BeautifulSoup(xml,'xml')
            try: 
                inspector_email = soup.find(tags[tag3]).string.split('@')[1]
            except AttributeError:
                pass
            if inspector_email == email or inspector_email == alt_email:
                dict_build(tags,tag3,tag4,store,sorted_keys,keyword_list,email,alt_email)
            else:
                continue


CPU times: user 10min 50s, sys: 10.8 s, total: 11min 1s
Wall time: 11min 55s

In [5]:
print(store.values())


dict_values([['10/15/2014', '07/01/2015', '08/01/2015', '08/01/2015', '08/15/2015', '08/16/2014', '10/01/2015', '09/01/2015', '09/01/2015', '09/15/2015', '07/15/2015', '09/01/2015', '09/01/2015', '08/01/2015', '09/01/2015', '01/01/2016', '08/15/2015', '09/01/2015', '09/01/2015', '04/01/2015', '08/15/2015', '09/01/2015', '08/01/2015', '03/15/2016', '06/01/2015', '04/01/2016', '10/01/2015', '09/01/2015', '01/01/2016', '08/15/2015', '08/01/2016', '09/01/2015', '10/01/2015', '05/18/2015', '05/18/2015', '09/01/2015', '10/15/2015', '08/01/2016', '09/01/2015', '07/01/2015', '09/01/2016', '04/01/2016', '05/01/2016', '07/01/2016', '09/01/2015', '07/09/2015', '09/01/2016', '09/01/2016', '06/01/2016', '07/01/2016', '05/01/2016', '08/01/2016', '10/01/2016', '09/01/2015', '07/01/2016', '06/01/2016', '02/01/2016', '06/01/2016', '05/15/2016', '10/01/2016', '05/01/2016', '11/01/2015', '09/01/2016', '07/01/2016', '07/01/2016', '03/15/2016', '10/01/2016', '09/30/2015', '09/01/2016', '09/15/2016', '08/01/2016', '09/15/2016', '09/01/2016', '09/15/2016', '09/01/2016', '08/15/2016', '09/15/2016', '09/01/2016', '09/01/2016', '10/01/2016', '01/01/2017', '08/01/2016', '11/01/2016', '07/15/2016', '08/01/2016', '08/01/2016', '08/09/2016', '05/15/2007', '08/29/2006', '06/15/2007', '09/01/2007', '06/01/2007', '09/01/2007', '08/01/2007', '09/01/2007', '08/01/2007', '07/01/2007', '09/15/2007', '08/15/2007', '06/01/2007', '06/01/2008', '07/15/2007', '10/01/2007', '02/15/2008', '08/15/2008', '05/01/2008', '05/01/2008', '02/15/2009', '06/15/2008', '06/01/2008', '02/01/2008', '06/01/2008', '11/01/2008', '02/15/2009', '09/01/2008', '09/01/2008', '08/15/2008', '08/15/2008', '08/15/2008', '08/15/2008', '08/15/2009', '09/01/2008', '09/15/2008', '11/15/2008', '09/01/2009', '09/01/2008', '02/01/2009', '07/01/2009', '07/01/2009', '08/15/2009', '06/01/2009', '02/15/2009', '08/01/2013', '08/16/2012', '09/01/2013', '09/15/2013', '08/01/2013', '09/01/2013', '09/15/2013', '08/15/2013', '09/15/2013', '07/01/2013', '06/01/2013', '06/01/2013', '06/01/2013', '06/15/2013', '10/01/2013', '03/01/2013', '09/01/2013', '06/01/2013', '07/01/2013', '10/01/2013', '07/15/2013', '08/01/2013', '09/01/2013', '07/15/2013', '09/01/2013', '06/15/2013', '01/01/2014', '10/01/2013', '08/01/2013', '09/01/2013', '10/01/2013', '03/01/2014', '09/01/2013', '07/01/2015', '09/15/2013', '08/15/2009', '08/15/2009', '09/01/2009', '07/01/2009', '08/01/2009', '08/01/2009', '09/01/2009', '08/15/2009', '08/01/2009', '09/01/2009', '09/15/2009', '03/01/2009', '07/15/2009', '08/15/2009', '09/01/2009', '08/01/2009', '08/15/2009', '08/01/2009', '09/01/2009', '08/01/2009', '08/15/2009', '08/15/2009', '08/01/2009', '08/15/2009', '08/15/2009', '05/01/2009', '07/01/2009', '09/01/2009', '08/15/2009', '07/15/2009', '08/01/2009', '09/01/2009', '10/01/2009', '04/01/2010', '09/01/2009', '10/01/2009', '09/15/2009', '09/01/2010', '01/01/2010', '02/15/2010', '03/15/2010', '06/01/2010', '09/01/2010', '04/15/2010', '03/15/2010', '07/15/2010', '07/01/2010', '05/01/2010', '07/01/2010', '09/01/2010', '05/15/2010', '07/15/2010', '06/01/2010', '09/01/2010', '10/01/2010', '07/15/2010', '09/01/2010', '07/01/2010', '08/15/2010', '09/15/2010', '08/15/2010', '09/15/2010', '08/01/2010', '07/15/2010', '09/01/2010', '09/01/2010', '09/01/2010', '08/01/2011', '05/01/2011', '05/01/2011', '01/15/2011', '09/15/2010', '09/01/2011', '06/01/2011', '03/01/2011', '11/01/2010', '03/15/2011', '10/01/2010', '10/01/2010', '02/01/2011', '02/01/2011', '03/01/2011', '04/15/2011', '06/01/2011', '08/15/2011', '06/01/2011', '09/01/2010', '08/15/2011', '01/01/2011', '09/01/2011', '07/15/2011', '09/01/2011', '06/15/2011', '05/15/2011', '09/01/2011', '09/01/2011', '10/01/2011', '09/01/2011', '10/01/2011', '05/01/2011', '08/15/2011', '09/01/2011', '09/01/2011', '09/01/2011', '09/15/2011', '01/01/2012', '08/01/2011', '03/01/2013', '09/15/2012', '04/01/2012', '03/01/2012', '08/15/2012', '03/01/2012', '05/01/2012', '08/15/2012', '05/01/2013', '01/01/2012', '06/01/2012', '09/15/2012', '05/01/2012', '08/15/2012', '06/01/2012', '05/15/2012', '02/01/2012', '06/15/2012', '09/01/2012', '07/15/2012', '04/01/2012', '03/01/2012', '04/01/2012', '09/01/2012', '08/15/2012', '09/01/2012', '08/15/2012', '10/01/2012', '09/01/2012', '01/01/2012', '01/01/2012', '07/01/2012', '08/15/2012', '10/01/2012', '09/01/2012', '11/01/2012', '01/01/2012', '07/15/2012', '09/01/2012', '09/01/2012', '10/01/2012', '09/01/2012', '10/01/2012', '01/01/2012', '07/01/2013', '03/01/2013', '05/01/2013', '10/01/2012', '05/15/2013', '09/01/2013', '10/01/2012', '08/15/2012', '03/15/2013', '03/01/2013', '01/01/2014', '08/01/2014', '01/01/2014', '06/01/2014', '09/01/2014', '09/01/2014', '07/01/2014', '08/15/2014', '05/01/2014', '09/01/2014', '08/15/2014', '09/01/2014', '08/01/2014', '07/01/2014', '02/01/2014', '09/01/2014', '08/01/2014', '01/01/2015', '01/01/2015', '08/01/2014', '10/01/2014', '08/01/2014', '08/15/2014', '08/01/2014', '01/01/2015', '08/01/2014', '06/01/2014', '01/01/2016', '08/15/2014', '09/15/2014', '09/01/2014', '09/01/2015', '02/01/2015', '02/01/2015', '03/01/2015', '05/01/2015', '08/01/2015', '07/15/2015', '03/15/2015', '11/01/2015', '04/01/2015', '08/15/2014', '07/15/2015', '06/15/2015', '06/01/2015', '11/01/2016', '11/01/2016', '09/30/2016', '08/01/2016', '01/01/2017', '10/01/2016', '02/01/2017'], ['1500007', '1504619', '1507266', '1507529', '1510790', '1512342', '1513875', '1517014', '1518455', '1522191', '1522751', '1527193', '1527421', '1529927', '1531127', '1534304', '1537756', '1537924', '1539536', '1539688', '1540025', '1540623', '1540702', '1541944', '1542727', '1543373', '1544910', '1547107', '1547164', '1547236', '1547399', '1549977', '1550242', '1551262', '1551511', '1551534', '1551605', '1552214', '1552794', '1553126', '1553565', '1553945', '1554385', '1555224', '1556563', '1559711', '1560070', '1560300', '1561190', '1563315', '1563426', '1563435', '1565268', '1565508', '1565809', '1566346', '1600767', '1600874', '1601485', '1601983', '1602006', '1602451', '1608663', '1611136', '1611714', '1617040', '1619950', '1620922', '1624641', '1624705', '1629437', '1629934', '1632881', '1633608', '1633952', '1638888', '1640578', '1640645', '1640664', '1642102', '1642143', '1642542', '1643020', '1644552', '1646691', '1647361', '1660329', '0700508', '0703042', '0703061', '0703117', '0706426', '0708899', '0722841', '0730694', '0733125', '0733441', '0733711', '0733944', '0736037', '0737514', '0738162', '0742296', '0744040', '0756457', '0800474', '0801435', '0808740', '0809129', '0809820', '0813637', '0814338', '0817794', '0820345', '0821918', '0824443', '0825773', '0826067', '0826441', '0828699', '0830581', '0834219', '0836068', '0837540', '0841636', '0844954', '0846898', '0847132', '0850695', '0853835', '0853873', '0856046', '1303254', '1304208', '1304211', '1305267', '1305338', '1305382', '1307078', '1307740', '1308298', '1310962', '1310963', '1310967', '1310973', '1312817', '1314342', '1314725', '1319084', '1331728', '1332007', '1332964', '1333489', '1335049', '1335163', '1335995', '1336632', '1339532', '1342763', '1343437', '1346632', '1348166', '1355395', '1359223', '1359716', '1360594', '1360664', '0900163', '0900182', '0901693', '0903795', '0904116', '0905322', '0905570', '0907167', '0907390', '0907395', '0908342', '0911122', '0914478', '0914903', '0915214', '0916387', '0919113', '0920274', '0923311', '0923379', '0925424', '0927962', '0928533', '0928744', '0928807', '0929532', '0930035', '0932606', '0933218', '0934299', '0934300', '0936672', '0937985', '0944315', '0946932', '0947679', '0948132', '0950700', '0950710', '0952160', '0953783', '0954318', '0954811', '0955096', '0960100', '0963199', '0965624', '0966581', '0967423', '0967425', '0968909', '1000667', '1004413', '1005369', '1007539', '1008073', '1008600', '1011478', '1011820', '1016182', '1017007', '1026385', '1028146', '1034979', '1048325', '1049765', '1049947', '1052671', '1055254', '1055419', '1055950', '1057633', '1058885', '1060545', '1061524', '1062155', '1062873', '1063679', '1064230', '1066567', '1067995', '1068906', '1068977', '1100752', '1101251', '1101845', '1102889', '1104527', '1104646', '1105307', '1112593', '1116691', '1118656', '1123052', '1124859', '1126407', '1127957', '1128023', '1128481', '1129017', '1129969', '1130819', '1130825', '1132168', '1136248', '1139048', '1142905', '1143514', '1144846', '1145993', '1146014', '1149644', '1150670', '1151294', '1152892', '1153294', '1156247', '1156761', '1159622', '1200117', '1200560', '1200787', '1201026', '1201546', '1207080', '1211691', '1213912', '1218345', '1219473', '1223688', '1228312', '1234859', '1236070', '1236759', '1237077', '1240327', '1240620', '1242325', '1242516', '1243436', '1243467', '1245607', '1245936', '1246547', '1246875', '1247198', '1248199', '1249541', '1249656', '1251544', '1254559', '1254609', '1254670', '1255535', '1261359', '1263802', '1264579', '1265279', '1265410', '1266013', '1266155', '1400361', '1400370', '1402387', '1402393', '1402411', '1403062', '1403099', '1403873', '1404981', '1405723', '1407480', '1407623', '1409111', '1410727', '1411174', '1412694', '1418960', '1419023', '1419038', '1419100', '1419199', '1423189', '1428620', '1435261', '1437836', '1438325', '1442131', '1444461', '1444962', '1446323', '1447771', '1453607', '1453775', '1454139', '1456582', '1457909', '1458177', '1460110', '1460863', '1460895', '1462064', '1462420', '1462804', '1463808', '1464459', '1705448', '1705450', '1710898', '1719461', '1722482', '1722997', '1725377'], ['Found', 'NOT Found', 'Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'Found', 'Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'Found', 'NOT Found', 'Found', 'Found', 'NOT Found', 'Found', 'Found', 'Found', 'NOT Found', 'NOT Found', 'Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'Found', 'Found', 'Found', 'NOT Found', 'NOT Found', 'Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'Found', 'Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'Found', 'Found', 'Found', 'Found', 'Found', 'NOT Found', 'Found', 'NOT Found', 'Found', 'Found', 'NOT Found', 'Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'Found', 'Found', 'Found', 'NOT Found', 'NOT Found', 'Found', 'Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'Found', 'Found', 'Found', 'NOT Found', 'Found', 'Found', 'Found', 'Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'Found', 'NOT Found', 'Found', 'NOT Found', 'Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'Found', 'NOT Found', 'Found', 'Found', 'Found', 'NOT Found', 'Found', 'NOT Found', 'Found', 'Found', 'Found', 'Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'Found', 'NOT Found', 'NOT Found', 'Found', 'Found', 'Found', 'NOT Found'], ['151149', '227369', '375000', '390000', '359066', '206536', '304924', '282067', '850000', '234780', '180000', '246411', '307767', '332654', '614582', '969089', '349999', '255680', '396011', '50000', '90000', '799128', '499855', '122306', '49991', '55083', '800000', '9808', '249915', '248703', '1633485', '90075', '49500', '251532', '178018', '64214', '195010', '46947', '154123', '169570', '500000', '515569', '503922', '500000', '50000', '63325', '352644', '315000', '126191', '300000', '272665', '427724', '230277', '130560', '175000', '174807', '16000', '138000', '15000', '593218', '200000', '51019', '532191', '142000', '213596', '49961', '212169', '154652', '150000', '171473', '14455', '1355893', '6000000', '2989899', '192604', '99987', '200000', '200000', '175536', '140504', '499805', '308000', '249999', '50000', '41280', '151074', '326427', '181746', '239397', '886886', '930238', '430000', '269786', '231345', '324999', '25000', '4457', '552854', '10000', '72999', '150000', '162000', '497997', '410928', '210000', '238448', '288606', '430000', '600000', '48167', '10080', '157583', '294026', '158508', '283814', '234000', '236020', '250000', '348536', '358349', '221767', '99878', '150000', '149168', '498863', '50706', '612998', '550000', '272616', '300000', '249565', '10000', '12696', '50443', '150000', '94585', '398092', '308000', '434250', '375000', '390000', '17873', '5070', '5070', '5070', '129020', '524680', '45963', '458000', '153952', '48000', '420000', '180000', '360000', '335000', '331339', '226041', '9100', '363158', '162000', '298090', '160000', '30021', '265988', '335000', '131963', '264123', '128781', '306000', '120000', '16000000', '400000', '200977', '99999', '290000', '360000', '379047', '299563', '19950', '256583', '30900', '182334', '309416', '14658', '752967', '496070', '299986', '324060', '249986', '260000', '470363', '260268', '39982', '149961', '285438', '750000', '232000', '10000', '499945', '2019878', '275326', '4875', '99736', '75000', '97934', '415430', '563299', '587711', '406150', '597664', '400000', '1648901', '1431340', '220694', '310433', '334970', '381651', '250000', '250000', '311760', '210000', '265853', '45000', '535000', '700000', '400000', '209922', '417908', '127196', '10000', '400000', '79927', '100000', '137400', '400000', '428294', '410000', '481706', '530000', '330000', '551998', '326731', '575025', '270000', '299887', '203280', '323857', '10000', '66777', '377084', '603856', '101946', '313861', '103645', '150000', '5000', '475000', '150000', '53655', '25940', '3000', '349000', '683437', '184000', '150000', '200000', '297402', '442186', '232500', '149729', '450000', '524667', '151274', '299796', '245001', '504150', '330622', '653999', '477144', '407654', '413291', '384218', '75186', '24900', '238691', '262086', '406664', '200448', '300000', '79464', '8190', '518372', '85027', '402080', '223188', '34868', '300000', '1024160', '218589', '325285', '299998', '821066', '75000', '214916', '53071', '50000', '600000', '75000', '8000', '990898', '308405', '2072668', '262654', '4000', '85540', '49995', '298870', '129386', '330202', '400000', '526216', '773996', '24900', '300000', '50000', '273320', '146968', '1095', '23175', '207000', '50000', '216000', '150000', '210000', '313127', '327824', '699843', '10967', '150000', '253554', '100000', '448497', '7500', '326972', '206248', '100000', '100002', '180000', '10999999', '499347', '584229', '242600', '330751', '358386', '98625', '1276249', '204633', '93255', '300000', '287754', '178018', '500000', '128216', '396983', '130560', '210459', '330000', '335187', '4000', '258235', '375000', '143956', '11499', '53844', '33224', '192469', '71870', '49980', '57172', '152268'], [1, 3, 1, 1, 3, 1, 1, 1, 4, 2, 1, 1, 2, 2, 4, 3, 2, 4, 4, 1, 1, 2, 5, 1, 1, 1, 3, 3, 1, 1, 5, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 2, 1, 1, 1, 2, 1, 2, 3, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 5, 1, 5, 1, 4, 1, 3, 1, 4, 4, 1, 2, 4, 4, 2, 1, 1, 1, 3, 2, 1, 1, 4, 1, 1, 2, 3, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 5, 1, 1, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 4, 1, 2, 1, 1, 2, 1, 1, 2, 3, 1, 3, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 5, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 3, 1, 5, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 3, 3, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1, 3, 1, 1, 3, 5, 1, 4, 2, 2, 1, 1, 1, 1, 1, 3, 1, 1, 2, 2, 2, 3, 3, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 2, 1, 5, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 3, 1, 1, 2, 1, 4, 6, 1, 3, 1, 3, 1, 1, 1, 2, 4, 1, 2, 2, 1, 3, 2, 2, 2, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 4]])

In [6]:
fund_frame = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in store.items()]))
fund = fund_frame[['ID','Date','Dollar_Amount','Inspector_Count','Data_Word_Check']]
fund.head()


Out[6]:
ID Date Dollar_Amount Inspector_Count Data_Word_Check
0 1500007 10/15/2014 151149 1.0 Found
1 1504619 07/01/2015 227369 3.0 NOT Found
2 1507266 08/01/2015 375000 1.0 Found
3 1507529 08/01/2015 390000 1.0 Found
4 1510790 08/15/2015 359066 3.0 NOT Found

In [7]:
fund


Out[7]:
ID Date Dollar_Amount Inspector_Count Data_Word_Check
0 1500007 10/15/2014 151149 1.0 Found
1 1504619 07/01/2015 227369 3.0 NOT Found
2 1507266 08/01/2015 375000 1.0 Found
3 1507529 08/01/2015 390000 1.0 Found
4 1510790 08/15/2015 359066 3.0 NOT Found
5 1512342 08/16/2014 206536 1.0 NOT Found
6 1513875 10/01/2015 304924 1.0 NOT Found
7 1517014 09/01/2015 282067 1.0 Found
8 1518455 09/01/2015 850000 4.0 NOT Found
9 1522191 09/15/2015 234780 2.0 Found
10 1522751 07/15/2015 180000 1.0 Found
11 1527193 09/01/2015 246411 1.0 Found
12 1527421 09/01/2015 307767 2.0 NOT Found
13 1529927 08/01/2015 332654 2.0 NOT Found
14 1531127 09/01/2015 614582 4.0 NOT Found
15 1534304 01/01/2016 969089 3.0 Found
16 1537756 08/15/2015 349999 2.0 NOT Found
17 1537924 09/01/2015 255680 4.0 NOT Found
18 1539536 09/01/2015 396011 4.0 NOT Found
19 1539688 04/01/2015 50000 1.0 NOT Found
20 1540025 08/15/2015 90000 1.0 NOT Found
21 1540623 09/01/2015 799128 2.0 NOT Found
22 1540702 08/01/2015 499855 5.0 NOT Found
23 1541944 03/15/2016 122306 1.0 Found
24 1542727 06/01/2015 49991 1.0 Found
25 1543373 04/01/2016 55083 1.0 NOT Found
26 1544910 10/01/2015 800000 3.0 Found
27 1547107 09/01/2015 9808 3.0 Found
28 1547164 01/01/2016 249915 1.0 NOT Found
29 1547236 08/15/2015 248703 1.0 Found
... ... ... ... ... ...
350 1428620 08/15/2014 584229 1.0 NOT Found
351 1435261 08/01/2014 242600 2.0 NOT Found
352 1437836 01/01/2015 330751 1.0 NOT Found
353 1438325 08/01/2014 358386 1.0 Found
354 1442131 06/01/2014 98625 1.0 NOT Found
355 1444461 01/01/2016 1276249 1.0 Found
356 1444962 08/15/2014 204633 1.0 NOT Found
357 1446323 09/15/2014 93255 1.0 NOT Found
358 1447771 09/01/2014 300000 2.0 Found
359 1453607 09/01/2015 287754 2.0 NOT Found
360 1453775 02/01/2015 178018 2.0 Found
361 1454139 02/01/2015 500000 2.0 NOT Found
362 1456582 03/01/2015 128216 1.0 NOT Found
363 1457909 05/01/2015 396983 2.0 NOT Found
364 1458177 08/01/2015 130560 1.0 NOT Found
365 1460110 07/15/2015 210459 1.0 NOT Found
366 1460863 03/15/2015 330000 1.0 NOT Found
367 1460895 11/01/2015 335187 1.0 NOT Found
368 1462064 04/01/2015 4000 1.0 NOT Found
369 1462420 08/15/2014 258235 1.0 NOT Found
370 1462804 07/15/2015 375000 2.0 Found
371 1463808 06/15/2015 143956 1.0 NOT Found
372 1464459 06/01/2015 11499 4.0 NOT Found
373 1705448 11/01/2016 53844 NaN Found
374 1705450 11/01/2016 33224 NaN NOT Found
375 1710898 09/30/2016 192469 NaN NOT Found
376 1719461 08/01/2016 71870 NaN Found
377 1722482 01/01/2017 49980 NaN Found
378 1722997 10/01/2016 57172 NaN Found
379 1725377 02/01/2017 152268 NaN NOT Found

380 rows × 5 columns


In [8]:
check = fund.loc[fund['Data_Word_Check'] == 'Found']

In [9]:
check.head()


Out[9]:
ID Date Dollar_Amount Inspector_Count Data_Word_Check
0 1500007 10/15/2014 151149 1.0 Found
2 1507266 08/01/2015 375000 1.0 Found
3 1507529 08/01/2015 390000 1.0 Found
7 1517014 09/01/2015 282067 1.0 Found
9 1522191 09/15/2015 234780 2.0 Found

In [19]:
d1 = fund['Dollar_Amount']#series
d1 = d1.astype(float) #type conversion to float
d2 = pd.to_datetime(fund["Date"]) #convert Date to datetime
d1.index = d2 #set d1's index to Date values 
byyear = d1.groupby(d1.index.year).sum() #aggregate the date and sum it
print(byyear)
my_plot = byyear.plot.bar(figsize = (12,6),color='#F66733',legend=True,label='CU Award Amounts')
my_plot.set_title('NSF Award Totals (2007-2017)')
my_plot.set_ylabel('10 million ($)')
my_plot.set_xlabel('Year')


2006      239397.0
2007     4580307.0
2008     4568309.0
2009    29983040.0
2010    13833306.0
2011     9821422.0
2012    15521590.0
2013     8715603.0
2014    18506847.0
2015    13549741.0
2016    24084718.0
2017      702053.0
Name: Dollar_Amount, dtype: float64
Out[19]:
<matplotlib.text.Text at 0x2af607fdd048>

In [11]:
#csv converted to list 
df = pd.read_csv('palmetto/palmetto_data/Users.csv') #type = dataframe
user_df_Series = df['LocalUserId'] #type = Series, palmetto user ids 
user_df_list = list(user_df_Series)

In [29]:
%%time 

files = glob.glob('20*/*.xml')
username = [] #The clemson userIDs in the NSF files 
              #length = 802
pal_user_fund = []
test = 0
amount = {'Date':[],'Award_Amount':[]}

for xml_f in files:
#Open files
    with open(xml_f) as new_file:
    #Store Data
        xml = new_file.read()
        #Convert data to text 
        soup = BeautifulSoup(xml,'xml')
        try:
            #loop through the xml file and store all user email
            for e in soup.find_all(tags[tag3]):
                try: 
                    mail = e.string.split('@')[1]
                    if mail == email or mail == alt_email: 
                        username.append(e.string.split('@')[0])
                        test+=1
                        if test == 1: 
                            try:
                                amount['Award_Amount'].append(soup.find(tags[tag2]).string)
                                amount['Date'].append(soup.find(tags[tag1]).string)
                            except AttributeError: 
                                continue
                except IndexError: 
                    continue 
        except AttributeError:
            continue
        test = 0


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-29-50916abccfe9> in <module>()
----> 1 get_ipython().run_cell_magic('time', '', "\nfiles = glob.glob('20*/*.xml')\nusername = [] #The clemson userIDs in the NSF files \n              #length = 802\npal_user_fund = []\ntest = 0\namount = {'Date':[],'Award_Amount':[]}\n\nfor xml_f in files:\n#Open files\n    with open(xml_f) as new_file:\n    #Store Data\n        xml = new_file.read()\n        #Convert data to text \n        soup = BeautifulSoup(xml,'xml')\n        try:\n            #loop through the xml file and store all user email\n            for e in soup.find_all(tags[tag3]):\n                try: \n                    mail = e.string.split('@')[1]\n                    if mail == email or mail == alt_email: \n                        username.append(e.string.split('@')[0])\n                    else:\n                        test = 'False'\n                except IndexError: \n                    continue \n        except AttributeError:\n            continue\n        \n        #**********Adding new condition for the money earned by palmetto users \n        try:\n            mail_test = soup.find(tags[tag3]).string.split('@')[0]\n            if mail_test in user_df_list:\n                amount['Award_Amount'].append(soup.find(tags[tag2]).string)\n                amount['Date'].append(soup.find(tags[tag1]).string)\n        except AttributeError: \n            continue\n            ")

/software/anaconda3/4.2.0/lib/python3.5/site-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
   2101             magic_arg_s = self.var_expand(line, stack_depth)
   2102             with self.builtin_trap:
-> 2103                 result = fn(magic_arg_s, cell)
   2104             return result
   2105 

<decorator-gen-59> in time(self, line, cell, local_ns)

/software/anaconda3/4.2.0/lib/python3.5/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
    186     # but it's overkill for just that one bit of state.
    187     def magic_deco(arg):
--> 188         call = lambda f, *a, **k: f(*a, **k)
    189 
    190         if callable(arg):

/software/anaconda3/4.2.0/lib/python3.5/site-packages/IPython/core/magics/execution.py in time(self, line, cell, local_ns)
   1174         else:
   1175             st = clock2()
-> 1176             exec(code, glob, local_ns)
   1177             end = clock2()
   1178             out = None

<timed exec> in <module>()

/software/anaconda3/4.2.0/lib/python3.5/site-packages/bs4/__init__.py in __init__(self, markup, features, builder, parse_only, from_encoding, exclude_encodings, **kwargs)
    226             self.reset()
    227             try:
--> 228                 self._feed()
    229                 break
    230             except ParserRejectedMarkup:

/software/anaconda3/4.2.0/lib/python3.5/site-packages/bs4/__init__.py in _feed(self)
    287         self.builder.reset()
    288 
--> 289         self.builder.feed(self.markup)
    290         # Close out any unfinished strings and close all the open tags.
    291         self.endData()

/software/anaconda3/4.2.0/lib/python3.5/site-packages/bs4/builder/_lxml.py in feed(self, markup)
    135                 data = markup.read(self.CHUNK_SIZE)
    136                 if len(data) != 0:
--> 137                     self.parser.feed(data)
    138             self.parser.close()
    139         except (UnicodeDecodeError, LookupError, etree.ParserError) as e:

src/lxml/parser.pxi in lxml.etree._FeedParser.feed (src/lxml/lxml.etree.c:112202)()

src/lxml/parser.pxi in lxml.etree._FeedParser.feed (src/lxml/lxml.etree.c:112077)()

src/lxml/parsertarget.pxi in lxml.etree._TargetParserContext._handleParseResult (src/lxml/lxml.etree.c:128526)()

src/lxml/parsertarget.pxi in lxml.etree._TargetParserContext._handleParseResult (src/lxml/lxml.etree.c:128396)()

src/lxml/lxml.etree.pyx in lxml.etree._ExceptionContext._raise_if_stored (src/lxml/lxml.etree.c:10741)()

src/lxml/saxparser.pxi in lxml.etree._handleSaxTargetStart (src/lxml/lxml.etree.c:120346)()

src/lxml/saxparser.pxi in lxml.etree._callTargetSaxStart (src/lxml/lxml.etree.c:121259)()

src/lxml/parsertarget.pxi in lxml.etree._PythonSaxParserTarget._handleSaxStart (src/lxml/lxml.etree.c:127508)()

/software/anaconda3/4.2.0/lib/python3.5/site-packages/bs4/builder/_lxml.py in start(self, name, attrs, nsmap)
    145     def start(self, name, attrs, nsmap={}):
    146         # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
--> 147         attrs = dict(attrs)
    148         nsprefix = None
    149         # Invert each namespace map as it comes in.

/software/anaconda3/4.2.0/lib/python3.5/_collections_abc.py in keys(self)
    607     def keys(self):
    608         "D.keys() -> a set-like object providing a view on D's keys"
--> 609         return KeysView(self)
    610 
    611     def items(self):

/software/anaconda3/4.2.0/lib/python3.5/_collections_abc.py in __init__(self, mapping)
    629     __slots__ = '_mapping',
    630 
--> 631     def __init__(self, mapping):
    632         self._mapping = mapping
    633 

KeyboardInterrupt: 

In [24]:
similar = [userID for userID in username if userID in user_df_list] #length = 282
print('Number of Palmetto usernames: '+ str(len(similar)))
print('Number of total Clemson usernames: '+ str(len(username)))


Number of Palmetto usernames: 282
Number of total Clemson usernames: 802

In [25]:
palmetto = len(similar)
all_users = len(username)

percentage = palmetto/all_users #35%
print("Palmetto Users: "+ str(round(percentage*100)) + '%\n'+'Note: ' + "Value out of total number of Clemson usernames.")


Palmetto Users: 35%
Note: Value out of total number of Clemson usernames.

In [27]:
pal_user_fund = pd.DataFrame(amount) #Dateframe
pal_fund = pal_user_fund[['Date','Award_Amount']] #Dataframe

pal_s = pal_fund['Award_Amount'] #Series
pal_s = pal_s.astype(float) #convert to float 

pal2 = pd.to_datetime(pal_fund['Date']) #convert to date type
pal_s.index = pal2 #Set index to date 

g_pal_s = pal_s.groupby(pal_s.index.year).sum() #sum dollar amounts
stack_fund = pd.concat([g_pal_s,byyear],axis=1) #combine series


stack_fund.columns = ['CU Awards: Palmetto Users Only','CU Awards']#set column names
print(stack_fund)

#*****************************Graph Properties**************************************
plot1 = stack_fund.plot.bar(stacked=True,figsize = (12,6),
                              color=['#522D80','#F66733'],
                              legend=True)
plot1.set_title('NSF Award Totals (2007-2017)')
plot1.set_ylabel('10 million ($)')
plot1.set_xlabel('Year')


      Award_Amount  Dollar_Amount
2006      547565.0       239397.0
2007    14977193.0      4580307.0
2008    17173581.0      4568309.0
2009    64442893.0     29983040.0
2010    40823264.0     13833306.0
2011    41480604.0      9821422.0
2012    43714433.0     15521590.0
2013    19862183.0      8715603.0
2014    32441117.0     18506847.0
2015    37781328.0     13549741.0
2016    59165119.0     24084718.0
2017     4274561.0       702053.0
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-27-f38216d1d665> in <module>()
     18 plot1 = stack_fund.plot.bar(stacked=True,figsize = (12,6),
     19                               color=['#522D80','#F66733'],
---> 20                               legend=True,columns = ['CU Awards: Palmetto Users Only','CU Awards'])
     21 plot1.set_title('NSF Award Totals (2007-2017)')
     22 plot1.set_ylabel('10 million ($)')

/software/anaconda3/4.2.0/lib/python3.5/site-packages/pandas/tools/plotting.py in bar(self, x, y, **kwds)
   3777         axes : matplotlib.AxesSubplot or np.array of them
   3778         """
-> 3779         return self(kind='bar', x=x, y=y, **kwds)
   3780 
   3781     def barh(self, x=None, y=None, **kwds):

/software/anaconda3/4.2.0/lib/python3.5/site-packages/pandas/tools/plotting.py in __call__(self, x, y, kind, ax, subplots, sharex, sharey, layout, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, secondary_y, sort_columns, **kwds)
   3738                           fontsize=fontsize, colormap=colormap, table=table,
   3739                           yerr=yerr, xerr=xerr, secondary_y=secondary_y,
-> 3740                           sort_columns=sort_columns, **kwds)
   3741     __call__.__doc__ = plot_frame.__doc__
   3742 

/software/anaconda3/4.2.0/lib/python3.5/site-packages/pandas/tools/plotting.py in plot_frame(data, x, y, kind, ax, subplots, sharex, sharey, layout, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, secondary_y, sort_columns, **kwds)
   2612                  yerr=yerr, xerr=xerr,
   2613                  secondary_y=secondary_y, sort_columns=sort_columns,
-> 2614                  **kwds)
   2615 
   2616 

/software/anaconda3/4.2.0/lib/python3.5/site-packages/pandas/tools/plotting.py in _plot(data, x, y, subplots, ax, kind, **kwds)
   2439         plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
   2440 
-> 2441     plot_obj.generate()
   2442     plot_obj.draw()
   2443     return plot_obj.result

/software/anaconda3/4.2.0/lib/python3.5/site-packages/pandas/tools/plotting.py in generate(self)
   1026         self._compute_plot_data()
   1027         self._setup_subplots()
-> 1028         self._make_plot()
   1029         self._add_table()
   1030         self._make_legend()

/software/anaconda3/4.2.0/lib/python3.5/site-packages/pandas/tools/plotting.py in _make_plot(self)
   1962                 rect = self._plot(ax, self.ax_pos + w, y, self.bar_width,
   1963                                   start=start, label=label,
-> 1964                                   log=self.log, **kwds)
   1965                 pos_prior = pos_prior + np.where(mask, y, 0)
   1966                 neg_prior = neg_prior + np.where(mask, 0, y)

/software/anaconda3/4.2.0/lib/python3.5/site-packages/pandas/tools/plotting.py in _plot(cls, ax, x, y, w, start, log, **kwds)
   1916     @classmethod
   1917     def _plot(cls, ax, x, y, w, start=0, log=False, **kwds):
-> 1918         return ax.bar(x, y, w, bottom=start, log=log, **kwds)
   1919 
   1920     @property

/software/anaconda3/4.2.0/lib/python3.5/site-packages/matplotlib/__init__.py in inner(ax, *args, **kwargs)
   1817                     warnings.warn(msg % (label_namer, func.__name__),
   1818                                   RuntimeWarning, stacklevel=2)
-> 1819             return func(ax, *args, **kwargs)
   1820         pre_doc = inner.__doc__
   1821         if pre_doc is None:

/software/anaconda3/4.2.0/lib/python3.5/site-packages/matplotlib/axes/_axes.py in bar(self, left, height, width, bottom, **kwargs)
   2087                 label='_nolegend_'
   2088                 )
-> 2089             r.update(kwargs)
   2090             r.get_path()._interpolation_steps = 100
   2091             #print r.get_label(), label, 'label' in kwargs

/software/anaconda3/4.2.0/lib/python3.5/site-packages/matplotlib/artist.py in update(self, props)
    857                 func = getattr(self, 'set_' + k, None)
    858                 if func is None or not six.callable(func):
--> 859                     raise AttributeError('Unknown property %s' % k)
    860                 func(v)
    861             changed = True

AttributeError: Unknown property columns

In [ ]: