In [1]:
from bs4 import BeautifulSoup
from gensim import corpora
from pprint import pprint
from collections import defaultdict
import glob
# Scan every award XML file and collect award IDs, abstracts and
# Directorate/Division labels, then write each collection to a text file
# with one entry per line. The institution filter is commented out in this
# cell, so every award is included.

# Accumulators (kept at module level: they are notebook state).
awardabs_ar = []  # abstract text of each award that has one
aID_ar = []       # award ID of each award processed
doc_div = []      # "Directorate<TAB>Division" label per award, duplicates kept

for xml_path in glob.glob("201*/*.xml"):  # every .xml file directly inside a 201* directory
    # Read raw bytes so the XML parser detects the encoding itself instead of
    # relying on the platform default; `with` guarantees the handle is closed
    # even when there are thousands of files.
    with open(xml_path, "rb") as xml_file:
        soup = BeautifulSoup(xml_file.read(), 'xml')

    # Every award contributes its ID.
    # NOTE(review): awards without an abstract still add an ID, so line N of
    # cu_IDs.txt does not necessarily match line N of cu_tigers.txt.
    aID_ar.append(soup.AwardID.string)

    # Keep the abstract only when the tag holds a single text node.
    abstract = soup.AbstractNarration.string
    if abstract is not None:
        awardabs_ar.append(abstract)

    # Build the Directorate/Division label only when BOTH names are present;
    # otherwise a value left over from a previous file (or an undefined name
    # on the first file) would be used.
    directorate = soup.Directorate.LongName.string
    division = soup.Division.LongName.string
    if directorate is not None and division is not None:
        doc_div.append(directorate + "\t" + division)

# Unique labels in first-seen order (dicts preserve insertion order).
doct_div2 = list(dict.fromkeys(doc_div))

# The number of distinct Directorate/Division combinations is the topic count.
topic = len(doct_div2)
print("\nNumber of Topics:", topic)

# (file name, entries) pairs; each entry is written on its own line.
output_files = (
    ("cu_tigers.txt", awardabs_ar),   # all abstracts
    ("cu_IDs.txt", aID_ar),           # all award IDs
    ("cu_doc_div2.txt", doct_div2),   # unique Directorate/Division combinations
    ("cu_doc_div.txt", doc_div),      # all Directorate/Division combinations
)
for out_name, entries in output_files:
    # Explicit UTF-8 so non-ASCII text in abstracts does not fail on platforms
    # whose default encoding cannot represent it.
    with open(out_name, "w", encoding="utf-8") as out_file:
        out_file.writelines(entry + "\n" for entry in entries)

print("done")
In [2]:
from bs4 import BeautifulSoup
from gensim import corpora
from pprint import pprint
from collections import defaultdict
import glob
# Scan every award XML file, keep only awards whose institution name is
# "Clemson University", and collect their award IDs, abstracts and
# Directorate/Division labels. Each collection is then written to a text
# file with one entry per line, plus a file holding the topic count.

# Accumulators (kept at module level: they are notebook state).
awardabs_ar = []  # abstract text of each matching award that has one
aID_ar = []       # award ID of each matching award
doc_div = []      # "Directorate Division" label per matching award, duplicates kept

for xml_path in glob.glob("201*/*.xml"):  # every .xml file directly inside a 201* directory
    # Read raw bytes so the XML parser detects the encoding itself instead of
    # relying on the platform default; `with` guarantees the handle is closed
    # even when there are thousands of files.
    with open(xml_path, "rb") as xml_file:
        soup = BeautifulSoup(xml_file.read(), 'xml')

    # `soup.Name` is the first <Name> tag in the document, which this notebook
    # treats as the institution name. A file without that tag is skipped
    # rather than aborting the whole scan with an AttributeError.
    name_tag = soup.Name
    clemson = name_tag.string if name_tag is not None else None
    if clemson != "Clemson University":
        continue

    # Every matching award contributes its ID.
    # NOTE(review): awards without an abstract still add an ID, so line N of
    # cu_IDs.txt does not necessarily match line N of cu_tigers.txt.
    aID_ar.append(soup.AwardID.string)

    # Keep the abstract only when the tag holds a single text node.
    abstract = soup.AbstractNarration.string
    if abstract is not None:
        awardabs_ar.append(abstract)

    # Build the Directorate/Division label only when BOTH names are present;
    # otherwise a value left over from a previous file (or an undefined name
    # on the first match) would be used.
    # NOTE(review): cell In [1] joins the two names with a tab, this cell with
    # a space — confirm which separator downstream readers expect.
    directorate = soup.Directorate.LongName.string
    division = soup.Division.LongName.string
    if directorate is not None and division is not None:
        doc_div.append(directorate + " " + division)

# Unique labels in first-seen order (dicts preserve insertion order).
doct_div2 = list(dict.fromkeys(doc_div))

# The number of distinct Directorate/Division combinations is the topic count.
topic = len(doct_div2)
print("\nNumber of Topics:", topic)

# (file name, entries) pairs; each entry is written on its own line.
output_files = (
    ("cu_tigers.txt", awardabs_ar),   # all abstracts
    ("cu_IDs.txt", aID_ar),           # all award IDs
    ("cu_doc_div2.txt", doct_div2),   # unique Directorate/Division combinations
    ("cu_doc_div.txt", doc_div),      # all Directorate/Division combinations
)
for out_name, entries in output_files:
    # Explicit UTF-8 so non-ASCII text in abstracts does not fail on platforms
    # whose default encoding cannot represent it.
    with open(out_name, "w", encoding="utf-8") as out_file:
        out_file.writelines(entry + "\n" for entry in entries)

# Total topic count, stored as a bare number with no trailing newline.
with open("TotalTopicCount.txt", "w", encoding="utf-8") as count_file:
    count_file.write(str(topic))

print("done")
In [ ]:
In [ ]: