notebook.community

Edit and run



In [19]:

    
import pandas as pd 
import glob
from bs4 import BeautifulSoup



In [3]:

    
# Collect the local part (text before '@') of every <EmailAddress> element
# found in the award XML files under the 20*/ directories.
email_list = []
# Paths of award files that contained an <EmailAddress> element without a
# single text value (BeautifulSoup reports its .string as None).
test = []
awards = glob.iglob('20*/*.xml')
# parse awards
for docs in awards:
    # Read the award, then parse once the file handle has been released.
    with open(docs) as award_doc:
        xml = award_doc.read()
    soup = BeautifulSoup(xml, 'xml')
    for emails in soup.find_all('EmailAddress'):
        address = emails.string
        if address is None:
            # Handle the bad element on its own so the remaining addresses
            # in this file are still collected, and record where it was
            # found instead of storing a meaningless None.
            test.append(docs)
            continue
        email_list.append(address.split('@')[0])









    



None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None






    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-3-3122301a891f> in <module>()
      6     with open(docs) as award_doc:
      7         xml = award_doc.read()
----> 8         soup = BeautifulSoup(xml,'xml')
      9         try:
     10             for emails in soup.find_all('EmailAddress'):

/software/anaconda3/4.2.0/lib/python3.5/site-packages/bs4/__init__.py in __init__(self, markup, features, builder, parse_only, from_encoding, exclude_encodings, **kwargs)
    226             self.reset()
    227             try:
--> 228                 self._feed()
    229                 break
    230             except ParserRejectedMarkup:

/software/anaconda3/4.2.0/lib/python3.5/site-packages/bs4/__init__.py in _feed(self)
    287         self.builder.reset()
    288 
--> 289         self.builder.feed(self.markup)
    290         # Close out any unfinished strings and close all the open tags.
    291         self.endData()

/software/anaconda3/4.2.0/lib/python3.5/site-packages/bs4/builder/_lxml.py in feed(self, markup)
    135                 data = markup.read(self.CHUNK_SIZE)
    136                 if len(data) != 0:
--> 137                     self.parser.feed(data)
    138             self.parser.close()
    139         except (UnicodeDecodeError, LookupError, etree.ParserError) as e:

src/lxml/parser.pxi in lxml.etree._FeedParser.feed (src/lxml/lxml.etree.c:112202)()

src/lxml/parser.pxi in lxml.etree._FeedParser.feed (src/lxml/lxml.etree.c:112077)()

src/lxml/parsertarget.pxi in lxml.etree._TargetParserContext._handleParseResult (src/lxml/lxml.etree.c:128526)()

src/lxml/parsertarget.pxi in lxml.etree._TargetParserContext._handleParseResult (src/lxml/lxml.etree.c:128396)()

src/lxml/lxml.etree.pyx in lxml.etree._ExceptionContext._raise_if_stored (src/lxml/lxml.etree.c:10741)()

src/lxml/saxparser.pxi in lxml.etree._handleSaxTargetStart (src/lxml/lxml.etree.c:120346)()

src/lxml/saxparser.pxi in lxml.etree._callTargetSaxStart (src/lxml/lxml.etree.c:121259)()

src/lxml/parsertarget.pxi in lxml.etree._PythonSaxParserTarget._handleSaxStart (src/lxml/lxml.etree.c:127508)()

/software/anaconda3/4.2.0/lib/python3.5/site-packages/bs4/builder/_lxml.py in start(self, name, attrs, nsmap)
    143         self.nsmaps = [self.DEFAULT_NSMAPS]
    144 
--> 145     def start(self, name, attrs, nsmap={}):
    146         # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
    147         attrs = dict(attrs)

KeyboardInterrupt:



In [59]:

    
# Build a mapping of investigator e-mail address -> role code for one award.
award = glob.glob('2007/0703061.xml')
key1 = 'Principal Investigator'

# Bind role_dict up front so the cells that read it still work (with an empty
# mapping) when the glob matches no file, instead of failing with NameError.
role_dict = {}
for a in award:
    with open(a) as award_file:
        xml = award_file.read()
    soup = BeautifulSoup(xml, 'xml')
    key = [x.string for x in soup.find_all('EmailAddress')]
    values = [x.string for x in soup.find_all('RoleCode')]
    # Tags are paired purely by document order.
    # NOTE(review): this assumes every <EmailAddress> has a matching
    # <RoleCode> in the same position - confirm against the award schema.
    # update() accumulates across files rather than keeping only the last
    # one; on a duplicate address the later file wins.
    role_dict.update(zip(key, values))



In [62]:

    
# Show every address whose role equals key1, followed by the local part of
# that address (the text before '@').
for email, role in role_dict.items():
    if role != key1:
        continue
    print(email, ':', role)
    value = email.split('@')[0]
    print(value)









    



Carl.Washburn@gvltec.edu : Principal Investigator
Carl.Washburn

carl.washburn@gvltec.edu



In [64]:

    
# Dump every address/role pair held in role_dict, one per line.
for email, role in role_dict.items():
    print(email, ':', role)









    



mkurz@clemson.edu : Co-Principal Investigator
James.Crocker@gvltec.edu : Co-Principal Investigator
Bill.Kendall@gvltec.edu : Former Co-Principal Investigator
agramop@clemson.edu : Co-Principal Investigator
Carl.Washburn@gvltec.edu : Principal Investigator
duchowski@clemson.edu : Co-Principal Investigator



In [ ]:

    
def test(x, debug = False):
    x = x * 2
    if debug: print (x)
    return x

# Quiet call: nothing is printed.
y = test(2)
# Same call with debug output switched on: the doubled value is printed too.
y = test(2, debug=True)