In [151]:
import argparse
import mysql.connector
#from elementtree.ElementTree import Element                                                                                                                                                                         
#import cElementTree as ElementTree                                                                                                                                                                                  
import xml.etree.cElementTree as ET
import logging

In [152]:
def load_data(datafile):    
    context = ET.iterparse(datafile, events=("start", "end"))
    logging.debug("Got context")
    context = iter(context)    
    return context

In [153]:
def extract_editions(wos_id, elem):    
    return [{'wos_id':wos_id, 
             'edition': i.attrib['value']} for i in elem.iterfind('./static_data/summary/EWUID/edition')] 


def extract_authors(wos_id, elem):
    authors = []    
    
    for names in elem.iterfind('./static_data/summary/names'):
        for name in names:
            author = {'wos_id'   : wos_id,
                      'position' : name.attrib.get('seq_no', 'NULL'),
                      'reprint'  : name.attrib.get('reprint', 'NULL'),
                      'cluster_id': name.attrib.get('dais_id', 'NULL'),
                      'role'     : name.attrib.get('role','NULL')}
            for item in name.iter():
                author[str(item.tag)] = str(item.text)
            print author
            authors.extend(author)
        
    return authors

def extract_publisher(wos_id, elem):
    publisher = {'wos_id': wos_id}
    
    for publishers in elem.iterfind('./static_data/summary/publishers'):
        for item in publishers.iter():
            if item.tag in ['display_name', 'full_name', 'full_address', 'city']:
                publisher[item.tag] = item.text
    print publisher        
    return publisher

In [210]:
context = load_data("sample2.xml")
count  = 0 
def extract_pub_info(wos_id, elem):
    pub = {'wos_id': wos_id}
    
    try:
        # Add the page info
        pub.update(list(elem.iterfind('./static_data/summary/pub_info'))[0].attrib)
        #for item in list(elem.iterfind('.//static_data/summary'))[0].iter():
        #    print item.tag, item.attrib, item.text
        
    except Exception as e:
        #print elem
        #for item in list(elem.iterfind('.//static_data/summary'))[0].iter():
        #    print item.tag, item.attrib, item.text
        
        print "[ERROR] {0} Could not capture pub_info, returning None.".format(wos_id)
        print "[ERROR] {0} CANNOT PROCESS THIS DOCUMENT".format(wos_id)
        
        return None
    
    # Add the publication info    
    pub.update(list(elem.iterfind('./static_data/summary/pub_info'))[0].attrib)
    
    # Get title, source, and source abbreviations   
    for i in elem.iterfind('./static_data/summary/titles/title'):
        pub[str(i.attrib['type'])] = i.text 
    
    # Get document type    
    try:
        pub['doc_type'] = list(elem.iterfind('./static_data/summary/doctypes/doctype'))[0].text
    except Exception as e:
        print "[ERROR] {0} Could not capture doctype, setting to default NULL".format(wos_id)
        pub['doc_type'] = 'NULL'
    
    languages = []
    for lang in list(elem.iterfind('./static_data/fullrecord_metadata/languages/language')):
        #print lang.tag, lang.attrib, lang.text
        languages.extend([{'wos_id': wos_id,
                          'language' : lang.text}])
    # Get categorical data
    headings = []
    for x in list(elem.iterfind('./static_data/fullrecord_metadata/category_info/headings/heading')):
        headings.extend([{'wos_id': wos_id,
                           'heading': x.text }])
        
    subheadings = []
    for sub in list(elem.iterfind('./static_data/fullrecord_metadata/category_info/subheadings/subheading')):
        #print sub.tag, sub.attrib, sub.text
        subheadings.extend([{'wos_id': wos_id,
                             'subheading': sub.text }])        
    #print "Subheadings : ", subheadings
    
    subjects = []
    for sub in list(elem.iterfind('./static_data/fullrecord_metadata/category_info/subjects/subject')):
        #print sub.tag, sub.attrib, sub.text
        subjects.extend([{'wos_id': wos_id,
                          'ascatype' : sub.attrib['ascatype'],
                          'subjects': sub.text }])
    #print "Subjects : ", subjects
    
    
    
    for item in list(elem.iterfind('./dynamic_data/cluster_related/identifiers/identifier')):
        #print item.tag, item.attrib, item.text
        pub[item.attrib['type']] = item.attrib['value']
    #print pub
    
    for item in list(elem.iterfind('./dynamic_data/ic_related/oases/oas')):
        print item.tag, item.attrib, item.text
        if item.text == 'Yes' and item.attrib['type'] == 'gold':
            pub['oases_type_gold'] = 'Yes'
            print "Gold = Yes"
        
    abstract_text = 'NULL'
    for ab in list(elem.iterfind('./static_data/fullrecord_metadata/abstracts/abstract/abstract_text/p')):
        if abstract_text == 'NULL':
            abstract_text = ''
        abstract_text = abstract_text + '\n<p>' + ab.text + '</p>'
    pub['abstract_text'] = abstract_text
    
    return pub
    
def extract_addresses(wos_id, elem):
    addresslist = []
    print wos_id
    name_address_relation = []
    for addresses in elem.iterfind('./static_data/fullrecord_metadata/addresses/address_name'):
        
        print "-"*50
                
        addr = {'wos_id'   : wos_id,
                'addr_num' : list(addresses.iterfind('./address_spec'))[0].attrib['addr_no'],
                'organization' : 'NULL'                
                }
        
        for address in addresses.iter():
            if address.tag in ['full_address', 'city', 'state', 'country', 'zip']:
                #print address.tag
                addr[str(address.tag)] =  str(address.text)        
                
        orgs = []
        for item in addresses.iter():            
            if item.tag == 'organization':                            
                orgs.extend([item.text])
        if not orgs :
            orgs = ['NULL']
        print "Organizations : ", orgs
                
        suborgs = []
        for item in addresses.iter():            
            if item.tag == 'suborganization':
                suborgs.extend([item.text])
        if not suborgs :
            suborgs = ['NULL']
            
        print "SubOrganizations : ", suborgs
                    
        for org in orgs:            
            for suborg in suborgs:
                t = {'organization'    : org,
                     'suborganization' : suborg}
                temp = addr.copy()
                temp.update(t)
                addresslist.extend([temp])
                        
        for name in list(addresses.iterfind('./names/name')):
            print "Name: ", name.tag, name.attrib
            print "{0} {1} {2}".format(wos_id, name.attrib['seq_no'], name.attrib['addr_no'])
            name_address_relation.extend([{'wos_id' : wos_id,
                 'position' : name.attrib['seq_no'],
                 'addr_num' : name.attrib['addr_no']}])
        #print addresslist
        
    #print addresslist
    #print publisher        
    return addresslist, name_address_relation
    
# uid -> wos_id, citedAuthor, year , page, volume, citedTitle, citedWork, doi
def extract_references(wos_id, elem):
    references = []
    for refs in list(elem.iterfind('./static_data/fullrecord_metadata/references/reference')):
        #print sub.tag, sub.attrib, sub.text
        cur = {}
        #print "-"*50
        for ref in refs.iter():
            #print ref.tag, ref.text
            cur[str(ref.tag)] = ref.text        
        references.extend([cur])
        
    return references

def extract_funding(wos_id, elem):
    print wos_id
    funding = {}
    text = ""
    for t in list(elem.iterfind('./static_data/fullrecord_metadata/fund_ack/fund_text')):
        for para in t.iter():
            #print para.tag, para.text
            text = text + str(para.text) + '\n'
    for g in list(elem.iterfind('./static_data/fullrecord_metadata/fund_ack/grants/grant')):
        print g.tag
        for agency in g.iterfind('./grant_agency') :
            print agency.tag, agency.text
        for grant_id in g.iterfind('./grant_ids/grant_id') :
            print grant_id.tag, grant_id.text
    funding['funding_text'] = text
    return funding

def extract_conferences(wos_id, elem):
    conference = {'wos_id' : wos_id}
    sponsors   = []
    
    for conf in list(elem.iterfind('./static_data/summary/conferences/conference')):        
        # Do try catch on each of these
        print conf
        conference['conf_id'] = conf.attrib.get('conf_id', 'NULL')
        print conference
        try : 
            conference['info']  = list(conf.iterfind('./conf_infos/conf_info'))[0].text
        except Exception as e:
            conference['info'] = 'NULL'
        try : 
            conference['title'] = list(conf.iterfind('./conf_titles/conf_title'))[0].text      
        except Exception as e:
            conference['title'] = 'NULL'
        try : 
            conference['dates'] = list(conf.iterfind('./conf_dates/conf_dates'))[0].text
        except Exception as e:
            conference['dates'] = 'NULL'
        try : 
            conference.update(list(conf.iterfind('./conf_dates/conf_date'))[0].attrib)
        except Exception as e:
            pass        
        try : 
            conference['conf_city']  = list(conf.iterfind('./conf_locations/conf_location/conf_city'))[0].text
        except Exception as e:
            conference['conf_city']  = 'NULL'
        try : 
            conference['conf_state'] = list(conf.iterfind('./conf_locations/conf_location/conf_state'))[0].text
        except Exception as e:
            conference['conf_state'] = 'NULL'
        try : 
            conference['conf_host']  = list(conf.iterfind('./conf_locations/conf_location/conf_host'))[0].text
        except Exception as e:
            conference['conf_host']  = 'NULL'
            
        for sponsor in list(conf.iterfind('./sponsors/sponsor')):
            print "Sponsor {0}/{1}: {2}".format(wos_id, conference['conf_id'], sponsor.text)
            sponsors.extend([{'wos_id' : wos_id,
                              'conf_id' : conference['conf_id'],
                              'sponsor' : sponsor.text}])
        
        
    #print conference
    print sponsors
    return conference, sponsors

def extract_keywords(wos_id, elem):
    keywords = []
    keywordsplus = []
    
    for keyword in list(elem.iterfind('./static_data/fullrecord_metadata/keywords/keyword')):        
        keywords.extend([{'wos_id' : wos_id,
                          'keyword' : keyword.text}])
    
    
    for keyword in list(elem.iterfind('./static_data/item/keywords_plus/keyword')):
        print "Plus ", keyword.tag, keyword.text
        keywordsplus.extend([{'wos_id' : wos_id,
                              'keyword' : keyword.text}])
    
    return keywords, keywordsplus

for event, elem in context:
    if event != "start" :
        continue
    pub = {} 
    if elem.tag == 'REC': 
        
        wos_id = list(elem.iterfind('UID'))[0].text
        print "{0} -----------------".format(wos_id)
        #extract_conferences(wos_id, elem)
        #pub = extract_pub_info(wos_id, elem)
        #extract_addresses(wos_id, elem)
        extract_keywords(wos_id, elem)
        #if not pub:
        #    break
        #publisher = extract_publisher(wos_id, elem)
        print "DONE-----------------"
        #print extract_editions(wos_id, elem)        
        #authors = extract_authors(wos_id, elem)
        #refs = extract_references(wos_id, elem)  
        #funding = extract_funding(wos_id, elem)
        
    count += 1
    
print "Done"
    #if count == 20 :
    #    break
    #print "Event:{0} \nElem:{1} \nAttr:{2} \nValue:{3}".format(event, elem.tag, elem.attrib, elem.value)


WOS:000354056000012 -----------------
DONE-----------------
WOS:000353952100024 -----------------
DONE-----------------
WOS:000353975000017 -----------------
Plus  keyword TENDON
Plus  keyword SURGERY
Plus  keyword REPAIR
DONE-----------------
WOS:000353977600022 -----------------
DONE-----------------
WOS:000353832200020 -----------------
DONE-----------------
WOS:000357413500004 -----------------
DONE-----------------
WOS:000353996400021 -----------------
DONE-----------------
WOS:000354001900019 -----------------
DONE-----------------
WOS:000354027200009 -----------------
DONE-----------------
WOS:000354078100010 -----------------
DONE-----------------
WOS:000357426400013 -----------------
DONE-----------------
WOS:000354139800022 -----------------
DONE-----------------
WOS:000353755100054 -----------------
DONE-----------------
WOS:000354071000003 -----------------
DONE-----------------
WOS:000354159400007 -----------------
DONE-----------------
WOS:000353743200009 -----------------
DONE-----------------
WOS:000353751700035 -----------------
DONE-----------------
WOS:000357353500006 -----------------
DONE-----------------
WOS:000354157700005 -----------------
DONE-----------------
WOS:000348489200051 -----------------
DONE-----------------
WOS:000348460100014 -----------------
DONE-----------------
WOS:000357414400019 -----------------
DONE-----------------
WOS:000348501500021 -----------------
DONE-----------------
WOS:000348555300011 -----------------
DONE-----------------
WOS:000346392900002 -----------------
DONE-----------------
WOS:000358036902681 -----------------
DONE-----------------
WOS:000348555500060 -----------------
DONE-----------------
WOS:000348490600002 -----------------
DONE-----------------
WOS:000358036901811 -----------------
DONE-----------------
WOS:000348457200006 -----------------
DONE-----------------
WOS:000358036902060 -----------------
DONE-----------------
WOS:000358157600316 -----------------
DONE-----------------
WOS:000358386901167 -----------------
DONE-----------------
WOS:000358036902621 -----------------
DONE-----------------
WOS:000358264900080 -----------------
DONE-----------------
WOS:000358036900204 -----------------
DONE-----------------
WOS:000358377700158 -----------------
DONE-----------------
WOS:000358456100002 -----------------
DONE-----------------
WOS:000358036901943 -----------------
DONE-----------------
WOS:000348555500053 -----------------
DONE-----------------
WOS:000348516900007 -----------------
DONE-----------------
WOS:000354187000008 -----------------
DONE-----------------
WOS:000354147700005 -----------------
DONE-----------------
WOS:000358264900012 -----------------
DONE-----------------
WOS:000354157700008 -----------------
DONE-----------------
WOS:000358385100220 -----------------
DONE-----------------
WOS:000348634000002 -----------------
DONE-----------------
WOS:000358323800003 -----------------
DONE-----------------
WOS:000348457800004 -----------------
DONE-----------------
WOS:000354199900123 -----------------
DONE-----------------
WOS:000354114700004 -----------------
DONE-----------------
WOS:000354202200016 -----------------
DONE-----------------
WOS:000358229500018 -----------------
DONE-----------------
WOS:000354164400010 -----------------
DONE-----------------
WOS:000358197600144 -----------------
DONE-----------------
WOS:000352667900006 -----------------
DONE-----------------
WOS:000358424900073 -----------------
DONE-----------------
WOS:000352643100480 -----------------
DONE-----------------
WOS:000353885800037 -----------------
DONE-----------------
WOS:000352578900790 -----------------
DONE-----------------
WOS:000352611400002 -----------------
DONE-----------------
WOS:000352512700014 -----------------
DONE-----------------
WOS:000352464400011 -----------------
DONE-----------------
WOS:000358572600009 -----------------
DONE-----------------
WOS:000353131800001 -----------------
DONE-----------------
WOS:000352956900002 -----------------
DONE-----------------
WOS:000353146600002 -----------------
DONE-----------------
WOS:000352855600304 -----------------
DONE-----------------
WOS:000351278300003 -----------------
DONE-----------------
WOS:000352819200013 -----------------
DONE-----------------
WOS:000353154400043 -----------------
DONE-----------------
WOS:000352782300024 -----------------
DONE-----------------
WOS:000352748600431 -----------------
DONE-----------------
WOS:000352675200022 -----------------
DONE-----------------
WOS:000353347500009 -----------------
DONE-----------------
WOS:000347250800007 -----------------
DONE-----------------
WOS:000353251800065 -----------------
DONE-----------------
WOS:000353251500862 -----------------
DONE-----------------
WOS:000353183600027 -----------------
DONE-----------------
WOS:000353168600015 -----------------
DONE-----------------
None -----------------
DONE-----------------
WOS:000350881900004 -----------------
DONE-----------------
WOS:000358692000007 -----------------
DONE-----------------
WOS:000358632500014 -----------------
DONE-----------------
WOS:000358587000026 -----------------
DONE-----------------
WOS:000351900900002 -----------------
DONE-----------------
WOS:000358635400032 -----------------
DONE-----------------
WOS:000351657400010 -----------------
DONE-----------------
WOS:000351828700001 -----------------
DONE-----------------
WOS:000348207300006 -----------------
DONE-----------------
WOS:000348155000001 -----------------
DONE-----------------
WOS:000348215900045 -----------------
DONE-----------------
WOS:000347753900017 -----------------
DONE-----------------
WOS:000350740400024 -----------------
DONE-----------------
WOS:000351019500002 -----------------
DONE-----------------
WOS:000351015600148 -----------------
DONE-----------------
WOS:000350824200006 -----------------
DONE-----------------
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-210-535112b72346> in <module>()
    239     if elem.tag == 'REC':
    240 
--> 241         wos_id = list(elem.iterfind('UID'))[0].text
    242         print "{0} -----------------".format(wos_id)
    243         #extract_conferences(wos_id, elem)

IndexError: list index out of range

In [105]:
print global_val


None

In [ ]: