Data sets can be found here.
Raw data from PubMed comes as XML files, and we'd like to extract author and date information into a spreadsheet for easier analysis.
The first thing to do is set the working directory to where the data lives.
In [1]:
import os
print(os.getcwd())
In [2]:
os.chdir("../data/pubs")
os.listdir()
Out[2]:
Next, we'll need to parse the XML files. Since several of the data files are huge, we don't want to use the standard-library xml module, which would require loading the entire file into memory. Instead, we'll use lxml.etree.iterparse(), which lets us grab one article at a time, pull out its info, and then clear it from memory.
In [ ]:
import lxml.etree as ET
import datetime

# Iterate over one PubmedArticle at a time so the whole file never sits in memory
for event, element in ET.iterparse('git.xml', tag="PubmedArticle", events=("end",)):
    pmid = element.xpath('.//PMID')[0].text
    pubdate = datetime.date(
        int(element.xpath('.//DateCreated/Year')[0].text),   # year
        int(element.xpath('.//DateCreated/Month')[0].text),  # month
        int(element.xpath('.//DateCreated/Day')[0].text),    # day
    )
    journal = element.xpath('.//Journal//ISOAbbreviation')
    if journal:
        journal = journal[0].text
    else:
        journal = None
    title = element.xpath('.//Article/ArticleTitle')
    if title:
        title = title[0].text
    else:
        title = None
    abstract = element.xpath('.//Article/Abstract')
    if abstract:
        abstract = abstract[0].text
    else:
        abstract = None
    author_records = element.xpath('.//Article/AuthorList/Author')
    authors = []
    for name in author_records:
        try:
            # (last name, first name); some Author records are missing one or both
            authors.append((name[0].text, name[1].text))
        except IndexError:
            pass
    print("{}, {}:{}".format(pmid, journal, authors))
    element.clear()  # free the element's memory before moving on
In [11]:
class Article(object):
    """Container for publication info"""

    def __init__(self, article_id, pubdate, journal, title, abstract, authors):
        self.article_id = article_id
        self.pubdate = pubdate
        self.journal = journal
        self.title = title
        self.abstract = abstract
        self.authors = authors

    def __repr__(self):
        return "<Article ID: {}>".format(self.article_id)

    def get_authors(self):
        for author in self.authors:
            yield author


class Author(object):
    """Container for an author's name, split into last name, first name, and initials"""

    def __init__(self, last_name, first_name=None):
        assert type(last_name) == str
        self.last_name = last_name
        if first_name:
            assert type(first_name) == str
            # First token is the given name; any remaining tokens are kept as initials
            name_parts = first_name.split()
            self.first_name = name_parts[0]
            self.initials = " ".join(name_parts[1:]) or None
        else:
            self.first_name = None
            self.initials = None
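As a quick sanity check of the name splitting (using a made-up name, not one from the data):
In [ ]:
# Made-up example name, just to see how it gets split
a = Author("Curie", "Marie S")
print(a.last_name, a.first_name, a.initials)   # -> Curie Marie S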
In [12]:
from lxml.etree import iterparse

def iter_parse_pubmed(xml_file):
    """Yield an Article for each PubmedArticle record in xml_file."""
    # get an iterable over the records
    for event, element in iterparse(xml_file, tag="PubmedArticle", events=("end",)):
        pmid = element.xpath('.//PMID')[0].text
        pubdate = datetime.date(
            int(element.xpath('.//DateCreated/Year')[0].text),   # year
            int(element.xpath('.//DateCreated/Month')[0].text),  # month
            int(element.xpath('.//DateCreated/Day')[0].text),    # day
        )
        journal = element.xpath('.//Journal//ISOAbbreviation')
        if journal:
            journal = journal[0].text
        else:
            journal = None
        title = element.xpath('.//Article/ArticleTitle')
        if title:
            title = title[0].text
        else:
            title = None
        abstract = element.xpath('.//Article/Abstract')
        if abstract:
            abstract = abstract[0].text
        else:
            abstract = None
        author_records = element.xpath('.//Article/AuthorList/Author')
        authors = []
        for name in author_records:
            try:
                authors.append(Author(name[0].text, name[1].text))
            except IndexError:
                pass
        element.clear()  # release the parsed element before yielding
        yield Article(pmid, pubdate, journal, title, abstract, authors)
In [13]:
iter_parse_pubmed('git.xml')
Out[13]:
Usage:
In [ ]:
for article in iter_parse_pubmed('github_pubs.xml'):
    print(article)
    print(article.pubdate)
    for author in article.get_authors():
        print("{}, {} {}".format(author.last_name, author.first_name, author.initials))
    print()
Author position matters, but it matters in sort of a weird way: first author and last author are the most important, with importance decreasing as you work your way in toward the middle of the list. Practically, though, there's not much distinction between the 3rd and 4th author (or the 3rd from last and 4th from last), so we'll generate scores for first, second, last, penultimate, and everyone else. The trick is to avoid index errors when the author list has fewer than five names, so we need to handle some special cases.
In [14]:
def score_authors(author_list):
    if not author_list:
        first = None
    else:
        first = author_list[0]
    others, penultimate, second, last = None, None, None, None
    list_length = len(author_list)
    if list_length > 4:
        others = [author for author in author_list[2:-2]]
    if list_length > 3:
        penultimate = author_list[-2]
    if list_length > 2:
        second = author_list[1]
    if list_length > 1:
        last = author_list[-1]
    return first, last, second, penultimate, others
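Just to sanity-check the special cases, here's what it returns for a made-up five-name list:
In [ ]:
# Made-up list, just to see which positions get filled
demo_authors = ["A", "B", "C", "D", "E"]
print(score_authors(demo_authors))   # -> ('A', 'E', 'B', 'D', ['C'])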
In order to get the data into a usable spreadsheet-like form, and for later analysis, I'm going to use DataFrames from the pandas package. This might be overkill, but I know how to use it (sort of).
In [16]:
import pandas as pd

col_names = ["Date", "Journal", "Author Name", "Position"]
df = pd.DataFrame(columns=col_names)

for article in iter_parse_pubmed('git.xml'):
    first, last, second, penultimate, others = score_authors(article.authors)
    if first:
        row = pd.Series([article.pubdate, article.journal, first.first_name, "first"],
                        name=article.article_id, index=col_names)
        df = df.append(row)
    else:
        continue
    try:
        row = pd.Series([article.pubdate, article.journal, last.first_name, "last"],
                        name=article.article_id, index=col_names)
        df = df.append(row)
    except AttributeError:  # no separate last author (single-author paper)
        pass
    try:
        row = pd.Series([article.pubdate, article.journal, second.first_name, "second"],
                        name=article.article_id, index=col_names)
        df = df.append(row)
    except AttributeError:  # no distinct second author
        pass
    try:
        row = pd.Series([article.pubdate, article.journal, penultimate.first_name, "penultimate"],
                        name=article.article_id, index=col_names)
        df = df.append(row)
    except AttributeError:  # no distinct penultimate author
        pass
    try:
        for x in others:
            row = pd.Series([article.pubdate, article.journal, x.first_name, "other"],
                            name=article.article_id, index=col_names)
            df = df.append(row)
    except TypeError:  # others is None when the list has fewer than five authors
        pass

print(df[1:10])
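One caveat for anyone running this today: DataFrame.append worked when this notebook was written, but recent pandas releases (2.0 and later) have removed it. If you hit an AttributeError there, pd.concat should be a drop-in substitute, e.g.:
In [ ]:
# pandas >= 2.0: build a one-row frame from the Series and concatenate
df = pd.concat([df, row.to_frame().T])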
Data downloaded from the arXiv preprint server is formatted a bit differently, so I'll write a parser that looks a lot like iter_parse_pubmed(). Since arXiv papers don't really have journals, I'm instead going to include a list of subject tags in place of the journal.
In [1]:
def iter_parse_arxiv(xml_file):
    """Yield an Article for each record in an arXiv OAI dump."""
    ns = {
        "a": "http://arxiv.org/OAI/arXiv/",
        "o": "http://www.openarchives.org/OAI/2.0/"}
    for event, element in iterparse(xml_file, tag="{http://www.openarchives.org/OAI/2.0/}record", events=("end",)):
        ident = element.xpath('./o:header/o:identifier', namespaces=ns)[0].text
        pubdate = element.xpath('.//o:datestamp', namespaces=ns)[0].text.split("-")
        pubdate = datetime.date(*[int(d) for d in pubdate])
        author_records = element.xpath('.//o:metadata//a:authors/a:author', namespaces=ns)
        authors = []
        for name in author_records:
            last_name = name.xpath('./a:keyname', namespaces=ns)[0].text
            try:
                first_name = name.xpath('./a:forenames', namespaces=ns)[0].text
            except IndexError:
                first_name = None
            authors.append(Author(last_name, first_name))
        try:
            title = element.xpath('.//o:metadata//a:title', namespaces=ns)[0].text
        except IndexError:
            title = None
        try:
            abstract = element.xpath('.//o:metadata//a:abstract', namespaces=ns)[0].text
        except IndexError:
            abstract = None
        try:
            cat = element.xpath('.//o:metadata//a:categories', namespaces=ns)[0].text.split(" ")
        except IndexError:
            cat = None
        element.clear()
        # Subject categories stand in for the journal field
        yield Article(ident, pubdate, cat, title, abstract, authors)
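Usage mirrors the pubmed parser; the file name below is just a placeholder for whatever arXiv OAI dump you've downloaded:
In [ ]:
for article in iter_parse_arxiv('arxiv_pubs.xml'):   # placeholder file name
    print(article)
    print(article.journal)   # for arXiv records this holds the subject categories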
Now the conclusion: I'll write a function that takes a pubmed or arXiv xml file, parses it with iter_parse_pubmed() or iter_parse_arxiv(), scores the authors with score_authors(), puts them into a data frame as shown above, and writes out a CSV file. Note: if you just want to parse a file without going through this notebook, you can use the included xml_parsing.py script:
$ python xml_parsing.py --pubmed /path/to/pubmed.xml /path/to/output.csv
or
$ python xml_parsing.py --arxiv /path/to/arxiv.xml /path/to/output.csv
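For the curious, here is a minimal sketch of how a wrapper like that could be put together with argparse; the actual xml_parsing.py bundled with the data may be organized differently. It just hands the parsed arguments to the CSV-writing function defined in the next cell:
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Extract author/date info from pubmed or arxiv xml")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--pubmed", action="store_true", help="treat the input as a pubmed xml file")
    group.add_argument("--arxiv", action="store_true", help="treat the input as an arxiv OAI xml file")
    parser.add_argument("in_file", help="path to the input xml")
    parser.add_argument("out_file", help="path for the output csv")
    args = parser.parse_args()

    pub_type = "arxiv" if args.arxiv else "pubmed"
    write_names_to_file(args.in_file, args.out_file, pub_type=pub_type)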
In [ ]:
def write_names_to_file(in_file, out_file, pub_type="pubmed"):
    """Parse in_file and write one row per scored author position to out_file (CSV)."""
    col_names = ["Date", "Journal", "Author Name", "Position"]
    df = pd.DataFrame(columns=col_names)
    # Write the header once, then append rows in batches below
    with open(out_file, 'w+') as out:
        df.to_csv(out, columns=col_names)
    counter = 0
    if pub_type == "arxiv":
        articles = iter_parse_arxiv(in_file)
    elif pub_type == "pubmed":
        articles = iter_parse_pubmed(in_file)
    else:
        raise ValueError("pub_type must be 'pubmed' or 'arxiv'")
    for article in articles:
        first, last, second, penultimate, others = score_authors(article.authors)
        if first:
            row = pd.Series([article.pubdate, article.journal, first.first_name, "first"],
                            name=article.article_id, index=col_names)
            df = df.append(row)
        else:
            continue
        try:
            row = pd.Series([article.pubdate, article.journal, last.first_name, "last"],
                            name=article.article_id, index=col_names)
            df = df.append(row)
        except AttributeError:  # no separate last author
            pass
        try:
            row = pd.Series([article.pubdate, article.journal, second.first_name, "second"],
                            name=article.article_id, index=col_names)
            df = df.append(row)
        except AttributeError:  # no distinct second author
            pass
        try:
            row = pd.Series([article.pubdate, article.journal, penultimate.first_name, "penultimate"],
                            name=article.article_id, index=col_names)
            df = df.append(row)
        except AttributeError:  # no distinct penultimate author
            pass
        try:
            for x in others:
                row = pd.Series([article.pubdate, article.journal, x.first_name, "other"],
                                name=article.article_id, index=col_names)
                df = df.append(row)
        except TypeError:  # others is None for short author lists
            pass
        # Flush to disk every 1000 articles so the DataFrame stays small
        if counter % 1000 == 0:
            print(counter)
            with open(out_file, 'a+') as out:
                df.to_csv(out, columns=col_names, header=False)
            df = pd.DataFrame(columns=col_names)
        counter += 1
    # Write whatever is left over after the last full batch
    with open(out_file, 'a+') as out:
        df.to_csv(out, columns=col_names, header=False)
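Usage (the output file name here is just a placeholder):
In [ ]:
write_names_to_file('github_pubs.xml', 'github_authors.csv', pub_type="pubmed")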
Now the tough part: getting genders.
I played around with sexmachine and GenderComputer, but ran into some issues, and those projects don't seem to be actively maintained, so I thought I'd try genderize.io and gender-api.com. The trouble is that these are web APIs, which take more time than something run locally, and they limit the number of requests you can make. The owners of both APIs generously provided me with enough requests to use them for free for this project, but I'll show how to use all three methods.
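As a small taste of what the web-API approach involves (the real work is in the next notebook), a single genderize.io lookup is just an HTTP GET. This sketch assumes the requests library and the public, rate-limited endpoint:
In [ ]:
import requests

# One lookup against the free, rate-limited genderize.io endpoint (example name)
resp = requests.get("https://api.genderize.io", params={"name": "kevin"})
print(resp.json())   # e.g. {'name': 'kevin', 'gender': 'male', 'probability': ..., 'count': ...}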
On to a new notebook...