by Kristian Garza
In this exploration, the first question we asked was: how big is our dataset? We are looking at events that represent links between journal articles and scholarly resources. Our dataset looks like this: there are around 50K links from journal articles to scholarly resources and around 900K links from scholarly resources to journal articles.
In [63]:
library(ggplot2)
library(jsonlite)
library(plyr)
library(scales)
library(dplyr)
library(stringr)
library(RColorBrewer)
library(httr)
library(tidyr)
library(psych)
source("../functions/graph_functions.r")
subject,year,count,percentage,sum copper,2006,32,79,5255 silver,2006,4176,79,5255
In [64]:
# Earlier snapshot, kept for reference:
# load("../data/2018-10-03_source_datacite-crossref_meta.Rda",verbose=TRUE)

# Restore the saved link metadata; the file is expected to provide `meta`.
load(
  "../data/2018-10-28_source_datacite-crossref_meta.Rda",
  verbose = TRUE
)

# Quick look at the yearly breakdown stored for the first registrant.
print(meta$registrants$years[1])

# Split the metadata into the facets used by the cells below.
pairings <- meta$pairings
relation_types <- meta$`relation-types`
citation_types <- meta$`citation-types`
registrants <- meta$registrants
In [65]:
#' Look up the link total recorded for one year of a registrant.
#'
#' @param years A length-one list (for example `registrants$years[row]`) whose
#'   first element is a data frame with a `title` column holding the year
#'   label and a `sum` column holding the total for that year.
#' @param year Year label compared against `title`. Defaults to "2017" so
#'   existing one-argument calls keep their meaning.
#' @return The `sum` value(s) of the rows whose `title` equals `year`. The
#'   result has length zero when no row matches (or when the first element
#'   carries no yearly table), so callers can test `length()`.
flat_year <- function(years, year = "2017") {
  yearly <- years[[1]]
  # which() discards NA comparisons, so rows with a missing title are
  # dropped rather than returned as NA.
  yearly$sum[which(yearly$title == year)]
}

#' Look up the 2018 link total of a registrant.
#'
#' Thin wrapper kept so existing callers continue to work.
#'
#' @param years Same structure as the `years` argument of `flat_year()`.
#' @return See `flat_year()`.
flat_year_8 <- function(years) {
  flat_year(years, year = "2018")
}
# registrants %>% mutate(`2017` = "",`2018` = "" )

# Create both year columns up front (zero-filled) instead of relying on the
# first scalar assignment being recycled across the whole column.
registrants$`2017` <- numeric(nrow(registrants))
registrants$`2018` <- numeric(nrow(registrants))

# seq_len() yields an empty sequence for a zero-row frame, whereas
# 1:nrow() would iterate over c(1, 0).
for (row in seq_len(nrow(registrants))) {
  first <- flat_year(registrants$years[row])
  second <- flat_year_8(registrants$years[row])
  # A registrant without an entry for a year counts as zero links.
  if (length(first) == 0) {
    first <- 0
  }
  if (length(second) == 0) {
    second <- 0
  }
  registrants$`2017`[row] <- first
  registrants$`2018`[row] <- second
}

# Derive the slope (`m`), a `client` label and each year's share of the
# registrant's overall `count`, then keep only titles starting with "datacite".
registrants <- registrants %>%
  mutate(
    m = (`2018` - `2017`) / 10000,
    client = title,
    `2018-p` = 100 * (`2018` / count),
    `2017-p` = 100 * (`2017` / count)
  ) %>%
  filter(startsWith(title, "datacite"))
# head(registrants,5)
In [66]:
# Restore the registrant lookup table; the file is expected to provide
# `datacite_reg`.
load(
  "../data/2018-10-11_datacite_registrants.Rda",
  verbose = TRUE
)

# Attach the lookup columns to every registrant row.
# NOTE(review): no `by` is supplied, so the join key is whatever column names
# the two tables share; confirm that is the intended key.
# NOTE(review): rowwise() is applied before the join; verify it is needed.
registrants <- left_join(rowwise(registrants), datacite_reg)
In [67]:
# Slope graph restricted to the first 15 registrant rows.
plot_slopegraph(
  slope_df = head(registrants, 15),
  y_label = "Datasets to Article Links"
)
In [68]:
# Share of links per relation type.
# `total` is the grand total and therefore identical on every row, so sorting
# by it leaves the rows in their incoming order. Rank by `percentage` instead
# so that head() really selects the seven most frequent types.
types <- relation_types %>%
  mutate(
    total = sum(count),
    percentage = (count / total) * 100,
    type = title,
    column = "Type"
  ) %>%
  arrange(desc(percentage))
hundred_plot(head(types, 7), "Links by relationship type (%)", TRUE)
In [21]:
# Share of links per citation type.
# `total` is the grand total and therefore identical on every row, so sorting
# by it leaves the rows in their incoming order. Rank by `percentage` instead
# so that head() really selects the seven most frequent types.
citation <- citation_types %>%
  mutate(
    total = sum(count),
    percentage = (count / total) * 100,
    type = title,
    column = "Type"
  ) %>%
  arrange(desc(percentage))
hundred_plot(head(citation, 7), "Links by type (%)", TRUE)
In [22]:
# Switch to the snapshot covering all DataCite citation types; the file is
# expected to provide a fresh `meta` object.
load(
  "../data/2018-10-28_source_datacite_all_citations_types_meta.Rda",
  verbose = TRUE
)
citation_types <- meta$`citation-types`

# Percentage share per citation type, largest share first.
types <- citation_types %>%
  mutate(total = sum(count)) %>%
  mutate(percentage = 100 * (count / total)) %>%
  mutate(type = title, column = "Type") %>%
  arrange(desc(percentage))

# Plot the seven largest shares.
hundred_plot(head(types, 7), "Types of Citation (%)", TRUE)
In [23]:
# Expand the nested `registrants` column of the pairings table into rows.
# Note: this overwrites `pairings`, so the cell is meant to run once per load.
pairings <- unnest(pairings, registrants)
In [24]:
# Keep the rows whose title starts with "datacite", expose the two sides of
# each pairing as factors, and order by `sum`, largest first.
pairings <- pairings %>%
  filter(startsWith(title, "datacite")) %>%
  mutate(datacenter = as.factor(title)) %>%
  mutate(publisher = as.factor(id1)) %>%
  arrange(desc(sum))

# Inspect the ten largest pairings and the distribution of `count`.
head(pairings, 10)
summary(pairings$count)
Another interesting thing we can look at is the citation relationships between Publishers and Data Centers.
Figure: Parallel set graph for data citations between particular Publishers and particular Data Centers. Publishers are shown as the top category and Data Centers as the bottom category. The width of each bar denotes the absolute number of citations for that Publisher–Data Center match. The dataset corresponds to links collected as of September 2018.
In [25]:
# Parallel-set plot over all pairings; the `sum` column is passed as the
# frequency of each datacenter/publisher combination.
with(
  pairings,
  parallelset(
    datacenter,
    publisher,
    alpha = 0.4,
    col = "#008888",
    freq = sum
  )
)
In [26]:
# Mark the pairings whose datacenter is "datacite.bl.ccdc".
pairings_h <- mutate(
  pairings,
  highlighted = ifelse(datacenter == "datacite.bl.ccdc", "Yes", "No")
)

# Turn the flag into a factor with "Yes" first and derive a per-row colour:
# teal for the highlighted datacenter, grey for everything else.
myt <- pairings_h %>%
  mutate(
    highlighted = factor(highlighted, levels = c("Yes", "No")),
    color = ifelse(highlighted == "Yes", "#008888", "#9e99a3")
  )

# Same parallel-set plot as before, now coloured per row.
with(
  myt,
  parallelset(datacenter, publisher, alpha = 0.4, col = color, freq = sum)
)
In [27]:
# Mark the pairings whose datacenter is "datacite.tib.pangaea".
pairings_h <- mutate(
  pairings,
  highlighted = ifelse(datacenter == "datacite.tib.pangaea", "Yes", "No")
)

# Turn the flag into a factor with "Yes" first and derive a per-row colour:
# teal for the highlighted datacenter, grey for everything else.
myt <- pairings_h %>%
  mutate(
    highlighted = factor(highlighted, levels = c("Yes", "No")),
    color = ifelse(highlighted == "Yes", "#008888", "#9e99a3")
  )

# Same parallel-set plot as before, now coloured per row.
with(
  myt,
  parallelset(datacenter, publisher, alpha = 0.4, col = color, freq = sum)
)