In [1]:
from datetime import datetime
start = datetime.utcnow() # For measuring the total processing time
In [2]:
import json
from urllib.request import urlopen
import pandas as pd
import numpy as np
In [3]:
AMC_URL = "http://articlemeta.scielo.org/api/v1/collection/identifiers/"
amc_data = pd.DataFrame(json.load(urlopen(AMC_URL)))
print("Number of collections: " + str(amc_data.shape[0]+1))
amc_data.head(2)
Out[3]:
Some collections won't be analyzed, mainly to avoid duplicates (some articles appear in more than one collection). Part of the spa (Public Health) collection should be kept in the result, but it isn't a collection whose journals/articles are assigned to a single country. The collections below are linked to a single country:
In [4]:
dont_evaluate = ["bio", "cci", "cic", "ecu", "psi", "pry", "rve", "rvo", "rvt", "sss", "spa", "wid"]
amc_names_map = {"code": "collection"}
amc_pairs = amc_data[(amc_data["acron2"].str.len() == 2) &
~amc_data["code"].isin(dont_evaluate)]\
.rename(columns=amc_names_map)\
# "~amc_data["code"].isin(dont_evaluate)]" is denying the list "dont_evaluate"
print("Number of collections: " + str(amc_pairs.shape[0]+1))
collections = amc_pairs[['collection']].copy()
collections
Out[4]:
These are the journals kept from the spa collection, selected by ISSN:
In [5]:
spa_issn_country = pd.DataFrame([
    "0021-2571",
    "0042-9686",
    "1020-4989",
    "1555-7960",
], columns=["issn"])
spa_issn_country  # Applies to the "spa" collection only
Out[5]:
This dataset is the Network spreadsheet/CSV pack, available on the SciELO Analytics report web page. Its first two rows are shown a couple of cells below:
In [6]:
import zipfile
# Extract the documents_dates.csv table from the zip file in jcatalog/data/scielo
with zipfile.ZipFile("../../data/scielo/tabs_network_190210.zip", 'r') as zip_ref:
    zip_ref.extract('documents_dates.csv', 'csv_files')
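To see what else ships in the pack before extracting, the archive contents can be listed; a quick sketch, reusing the zipfile import and the same local path as above:

# List every table bundled in the Network pack
with zipfile.ZipFile("../../data/scielo/tabs_network_190210.zip", 'r') as zip_ref:
    for name in zip_ref.namelist():
        print(name)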
In [7]:
df0 = pd.read_csv('csv_files/documents_dates.csv', keep_default_na=False, low_memory=False)
df0.shape
Out[7]:
In [8]:
names_map = {
    "ISSN SciELO": "issn",
    "title at SciELO": "title",
    "document publishing ID (PID SciELO)": "docs",
    "document type": "type",
    "document is citable": "is_citable",
    "document publishing year": "year",
}
df0.rename(columns=names_map, inplace=True)
df0.head(2)
Out[8]:
In [9]:
# Keep non-spa rows only for the single-country collections, and spa rows
# only for the four ISSNs selected above
df = pd.concat([
    pd.merge(df0[df0["collection"] != "spa"], collections, how="inner", on="collection"),
    pd.merge(df0[df0["collection"] == "spa"], spa_issn_country, how="inner", on="issn"),
])
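As a quick sanity check (an optional sketch), the spa rows that survive the merge should come only from the four ISSNs listed above:

# Should print at most the four ISSNs from spa_issn_country
print(sorted(df.loc[df["collection"] == "spa", "issn"].unique()))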
In [10]:
df.head(2)
Out[10]:
In [11]:
# Row count before the collection filtering, for comparison:
df0.shape
Out[11]:
In [12]:
df.shape
Out[12]:
In [13]:
set(df.collection)
Out[13]:
In [14]:
df["pub_year"] = np.where(df['year'] <= 1996, 'ate_1996', df["year"])
In [15]:
# Coerce the date parts to numbers; invalid entries become NaN.
# (The "published in SciELO" columns are assumed to be numeric already.)
for col in ['document published at year', 'document published at month',
            'document accepted at year', 'document accepted at month',
            'document submitted at year', 'document submitted at month']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
In [16]:
current_year = datetime.now().year
print(current_year)
In [17]:
# 0 = the SciELO publication date looks valid, 1 = missing or out of range
df['check_doc_pub_scielo'] = np.where(
    (df['document published in SciELO at year'] >= 1997) &
    (df['document published in SciELO at year'] <= current_year) &
    (df['document published in SciELO at month'] >= 1) &
    (df['document published in SciELO at month'] <= 12) &
    (df['document published in SciELO at day'] >= 1) &
    (df['document published in SciELO at day'] <= 31), 0, 1)
In [18]:
# Same 0/1 validity flag for the journal publication date
df['check_doc_pub'] = np.where(
(df['document published at year'] >= 1997) &
(df['document published at year'] <= current_year) &
(df['document published at month'] >= 1) &
(df['document published at month'] <= 12), 0, 1)
In [19]:
# Same 0/1 validity flag for the acceptance date
df['check_doc_accepted'] = np.where(
(df['document accepted at year'] >= 1997) &
(df['document accepted at year'] <= current_year) &
(df['document accepted at month'] >= 1) &
(df['document accepted at month'] <= 12), 0, 1)
In [20]:
# Same 0/1 validity flag for the submission date
df['check_doc_submitted'] = np.where(
(df['document submitted at year'] >= 1997) &
(df['document submitted at year'] <= current_year) &
(df['document submitted at month'] >= 1) &
(df['document submitted at month'] <= 12), 0, 1)
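The four flags above share one pattern; a compact alternative (a sketch only, not used below) could factor it into a helper:

def date_check(frame, prefix, current_year):
    """Return 0 where '<prefix> at year/month' form a plausible date, else 1."""
    ok = ((frame[f'{prefix} at year'] >= 1997) &
          (frame[f'{prefix} at year'] <= current_year) &
          (frame[f'{prefix} at month'] >= 1) &
          (frame[f'{prefix} at month'] <= 12))
    return np.where(ok, 0, 1)

# e.g. df['check_doc_submitted'] = date_check(df, 'document submitted', current_year)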
In [21]:
# meses_sub_aprov: months from submission to acceptance ("meses" = "months"),
# computed only where both dates passed the checks above
df['meses_sub_aprov'] = np.where(
    (df.check_doc_submitted == 0) & (df.check_doc_accepted == 0),
    (df['document accepted at year'] * 12 + df['document accepted at month']) -
    (df['document submitted at year'] * 12 + df['document submitted at month']), np.nan)
In [22]:
# meses_aprov_pub: months from acceptance to journal publication
df['meses_aprov_pub'] = np.where(
(df.check_doc_accepted == 0) & (df.check_doc_pub == 0),
(df['document published at year'] * 12 + df['document published at month']) -
(df['document accepted at year'] * 12 + df['document accepted at month']), np.nan)
In [23]:
# meses_sub_pub: months from submission to journal publication
df['meses_sub_pub'] = np.where(
(df.check_doc_submitted == 0) & (df.check_doc_pub == 0),
(df['document published at year'] * 12 + df['document published at month']) -
(df['document submitted at year'] * 12 + df['document submitted at month']), np.nan)
In [24]:
# meses_aprov_pub_scielo: months from acceptance to publication in SciELO
df['meses_aprov_pub_scielo'] = np.where(
(df.check_doc_accepted == 0) & (df.check_doc_pub_scielo == 0),
(df['document published in SciELO at year'] * 12 + df['document published in SciELO at month']) -
(df['document accepted at year'] * 12 + df['document accepted at month']), np.nan)
In [25]:
# meses_sub_pub_scielo: months from submission to publication in SciELO
df['meses_sub_pub_scielo'] = np.where(
(df.check_doc_submitted == 0) & (df.check_doc_pub_scielo == 0),
(df['document published in SciELO at year'] * 12 + df['document published in SciELO at month']) -
(df['document submitted at year'] * 12 + df['document submitted at month']), np.nan)
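The (year * 12 + month) encoding maps each date to a running month count, so the difference of two encodings is the number of whole months between them. A tiny worked example:

# Submitted 2018-11, accepted 2019-03: expect 4 months
submitted = 2018 * 12 + 11   # 24227
accepted = 2019 * 12 + 3     # 24231
print(accepted - submitted)  # 4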
In [26]:
# Keep only citable documents for the duration statistics
dfcit = df[df['is_citable'] == 1]
dfcit.shape
Out[26]:
In [27]:
values_list = ['meses_sub_aprov',
               'meses_aprov_pub',
               'meses_sub_pub',
               'meses_aprov_pub_scielo',
               'meses_sub_pub_scielo']
# Mean and standard deviation of each duration, per journal (ISSN) and year
td = dfcit.pivot_table(
    index=["issn"],
    values=values_list,
    columns=["pub_year"],
    aggfunc=[np.nanmean, np.nanstd],
    fill_value="")
In [28]:
td.head(10).T
Out[28]:
In [29]:
td.columns.levels
Out[29]:
In [30]:
# Each column key is an (aggfunc, variable, pub_year) tuple
for k in td.keys():
    print(k)
In [31]:
# Flatten each MultiIndex key into a single string, translating the aggregator
# names: "media" = mean, "desvp" = standard deviation ("desvio padrão")
newlabel = []
for k in td.keys():
    newlabel.append(k[0]
                    .replace('nanmean', 'media')
                    .replace('nanstd', 'desvp') + '_' + k[1] + '_' + k[2])
In [32]:
newlabel
Out[32]:
In [33]:
newlabel[0::24]
Out[33]:
In [34]:
td.columns = newlabel
In [35]:
td.T
Out[35]:
In [36]:
td.to_csv("output/td_documents_dates_network.csv")
In [37]:
print(f"Notebook processing duration: {datetime.utcnow() - start}")
In [38]:
# Sanity check: inspect the dates of a single document by its PID
b = df[df['docs'].str.contains('S0100-40421998000500015')]
In [40]:
b[['document submitted at year',
   'document accepted at year',
   'document published at year',
   'document published in SciELO at year']].astype(int)
Out[40]:
In [ ]: