In [ ]:
#Copies files containing potentially valid "rs" terms from source to destination directory
import os
import shutil
def file_mentions_rs(path):
    """Return True as soon as any whitespace-separated token in the file at
    *path* looks like a potential refSNP ("rs") term: the token starts with
    'rs', or starts with a single non-alphanumeric character (e.g. an opening
    bracket) immediately followed by 'rs'.
    """
    # `with` guarantees the file is closed even when we return early;
    # the original used manual break flags and a trailing close().
    with open(path) as handle:
        for line in handle:
            for word in line.split():
                # word is non-empty (str.split drops empty tokens), so
                # word[0] is always safe to index.
                if word.startswith('rs') or (not word[0].isalnum() and "rs" in word[1:3]):
                    return True
    return False

directory = "/home/werner/Desktop/Source/articles.O-Z/"  # source directory
total_files = 0  # files scanned
rs_files = 0     # files containing a candidate rs term
for root, dirs, files in os.walk(directory):
    for file in files:
        total_files += 1
        source_path = os.path.join(root, file)
        if file_mentions_rs(source_path):
            # destination directory is flat; files keep their base name
            shutil.copy(source_path, "/home/werner/Desktop/Destination/" + file)
            rs_files += 1
# print ratio of files scanned to files containing rsSNPs
print(total_files, rs_files)
In [ ]:
#Create flat file database
import xml.etree.ElementTree as ET
import os
import json
import unicodedata
def remove_control_characters(s):
    """Sanitize XML text for the downstream tab-delimited CSV / hand-built JSON.

    Replaces every Unicode control character (general category "C", which
    includes tabs and newlines) with a single space, and drops commas and
    double quotes, which would otherwise break the CSV and JSON assembly.
    """
    # Single pass with a list accumulator. The original concatenated with
    # `+=` (quadratic) and appended control characters only to strip them
    # again in a redundant second pass inside the final join.
    cleaned = []
    for ch in s:
        if unicodedata.category(ch)[0] == "C":
            cleaned.append(" ")  # preserve word separation
        elif ch not in (",", "\""):
            cleaned.append(ch)
    return "".join(cleaned)
# Walk the destination directory of candidate articles, extract metadata and
# rs numbers from each XML file, and append one tab-delimited row per unique
# (rs number, metadata) combination — with its occurrence count — to TEXTdb.csv.
directory = "/home/werner/Desktop/Destination/"
rslist = []
for root, dirs, files in os.walk(directory):
    for file in files:
        # Per-file defaults so missing XML elements fall back to placeholders.
        email = "not available"
        pmid = "not available"
        year = "not available"
        doi = "not available"
        kwds = []
        title = ""
        tree = ET.parse(root + "/" + file)
        # Get email addresses (if several are present, the last one wins)
        for node in tree.iter('email'):
            email = node.text
        # Get publication year from <pub-date> elements carrying a pub-type
        for node in tree.iter('pub-date'):
            for subnode in node.iter('year'):
                if "pub-type" in node.attrib:
                    year = subnode.text
        # Get the article title, flattening nested markup into plain text
        for node in tree.iter('title-group'):
            for subnode in node.iter('article-title'):
                for part in subnode.itertext():
                    title += part
                title = remove_control_characters(title)
        # Get PMC ID and DOI from <article-id> attribute values
        for node in tree.iter('article-id'):
            if "pmc" in node.attrib.values():
                pmid = node.text
            if "doi" in node.attrib.values():
                doi = node.text
        # Get author-defined keywords as JSON-quoted strings
        for node in tree.iter('kwd'):
            kwd = node.text
            if kwd is not None:
                kwds.append(json.dumps(kwd))
        # str(kwds) yields e.g. ['"word"']; stripping single quotes and
        # backslashes leaves a valid JSON array string: ["word"]
        kwdstring = str(kwds).replace("'", "").replace("\\", "")
        # Get rs numbers from every text node in the document
        for node in tree.iter():
            if node.text is None:
                continue
            for rsnumber in node.text.split():
                # trim rs numbers preceded by an opening bracket or similar
                if "rs" in rsnumber[1:3]:
                    rsnumber = rsnumber[1:]
                # require alphanumeric content and plausible length
                if rsnumber.isalnum() and 4 < len(rsnumber) <= 12:
                    # ensure that a digit follows the "rs" prefix
                    if rsnumber.startswith("rs") and rsnumber[2].isdigit():
                        # ensure that rs numbers end with digits
                        while not rsnumber[-1].isdigit():
                            rsnumber = rsnumber[:-1]
                        # create list item without surrounding white space
                        rslist.append(rsnumber.strip() + "\t" + email.strip() + "\t" +
                                      year.strip() + "\t" + pmid.strip() + "\t" +
                                      doi.strip() + "\t" + file.strip() + "\t" +
                                      str(kwdstring) + "\t" + str(title))
# Count occurrences of each unique row to avoid data duplication.
# BUG FIX: the original inserted new items with count 1 and then immediately
# incremented them in a second `if`, so every count came out one too high.
rsdict = {}
for item in rslist:
    rsdict[item] = rsdict.get(item, 0) + 1
rslist = []
# Append rows (with occurrence counts) to the tab-delimited CSV file.
with open("/home/werner/Desktop/TEXTdb.csv", "a") as writetofile:
    for item, count in rsdict.items():
        writetofile.write(item + "\t" + str(count) + "\n")
In [ ]:
#Convert tab delimited CSV file to JSON
import json
# Each valid CSV row has 9 tab-separated fields:
# rs number, email, year, pmid, doi, file name, keywords (JSON array), title, count.
# Build one hand-assembled JSON object string per row. (Field values were
# stripped of quotes and control characters upstream, so direct embedding is safe.)
entries = []
with open("/home/werner/Desktop/TEXTdb.csv", "r") as csvfile:
    for line in csvfile:
        fields = line.strip().split("\t")
        if len(fields) != 9:
            continue  # skip malformed rows
        entries.append(
            "{" +
            "\"rs_number\": \"" + fields[0] + "\", " +
            "\"email_address\": \"" + fields[1] + "\", " +
            "\"publication_date\": \"" + fields[2] + "\", " +
            "\"pubmed_id\": \"" + fields[3] + "\", " +
            "\"doi\": \"" + fields[4] + "\", " +
            "\"pubmed_file_name\": \"" + fields[5] + "\", " +
            "\"rs_number_cited_in_article\": \"" + fields[8] + "\", " +
            "\"article_title\": \"" + fields[7] + "\", " +
            "\"keywords\": " + fields[6] +  # already a JSON array string
            "}")
# BUG FIX: the original always chopped the last two characters of the
# accumulated string, which corrupted the opening "[" whenever the CSV
# contained no valid rows. Joining the entries avoids the trailing-comma
# problem entirely.
with open("/home/werner/Desktop/JSONdbKeys.json", "w") as jsonfile:
    jsonfile.write("{\"PMCOAI_rs_articles\": [" + ", ".join(entries) + "]}")
In [ ]:
#Generate keyword list and count keywords in database
import json
# Load the JSON database and collect every cleaned author keyword:
# non-empty, purely alphanumeric words/phrases that are not rs identifiers.
with open("/home/werner/Desktop/JSONdbKeys.json") as data_file:
    articles = json.load(data_file)["PMCOAI_rs_articles"]
unique_keywords = set()  # set deduplicates as we go
for article in articles:
    for raw_keyword in article["keywords"]:
        term = raw_keyword.strip()
        if (term
                and not term.startswith("rs")
                and all(c.isalnum() or c.isspace() for c in term)):
            unique_keywords.add(term)
# Sorted list for stable output
keywordList = sorted(unique_keywords)
with open("/home/werner/Desktop/keywords.js", "w") as data_file:
    data_file.write(json.dumps(keywordList))
print(len(keywordList))
In [ ]:
#Generate rs number list and count rs numbers in database
import json
# Load the JSON database and collect every well-formed rs number:
# "rs" followed by at least four digits (total length > 5, digits after "rs").
with open("/home/werner/Desktop/JSONdbKeys.json") as data_file:
    articles = json.load(data_file)["PMCOAI_rs_articles"]
unique_rs = set()  # set deduplicates as we go
for article in articles:
    candidate = article["rs_number"]
    if len(candidate) > 5 and candidate[2:].isdigit():
        unique_rs.add(candidate)
# Sorted list for stable output
keywordList = sorted(unique_rs)
with open("/home/werner/Desktop/rsnumbers.js", "w") as data_file:
    data_file.write(json.dumps(keywordList))
print(len(keywordList))
In [ ]:
#Return a list containing each object's DOI (using Python 3) if the object contains reference SNP ID rs5743810
#Published on web page as example for API usage
import httplib2
import json
# Query the SNiPhunter API for every article citing reference SNP rs5743810
# and print each article's DOI.
http_client = httplib2.Http()
response, payload = http_client.request(
    "http://sniphunter.sanbi.ac.za/PMCOAI_rs_articles?rs_number=rs5743810")
assert response.status == 200
# Decode the binary payload to text and parse it as JSON (a list of records).
records = json.loads(payload.decode("utf-8"))
for record in records:
    for field, value in record.items():
        if field == "doi":
            print(value)