DataCite metadata were pulled using the rdatacite
package in November 2022. Each of the six institutions was searched
using the name of each University in the
creators.affiliation.name
metadata field. Results were
filtered to include DOIs with a publicationYear
of 2012 or
later, and a resourceTypeGeneral
of dataset or software. As
the search terms returned other institutions with similar names, results
were filtered to include DOIs only from the relevant institutional
affiliations.
Following recommendations of the Crossref API, metadata was pulled
from the April 2022 Public Release file (http://dx.doi.org/10.13003/83b2gq). Records were filtered to those
with a created-dateparts
year of 2012 or newer,
that had a type
of datasets (Crossref does not have
software as an available type), and had an author affiliation with one
of the six institutions.
Upon initial examination of the affiliation data, we realized that our own institutional repositories were not represented in the data because the affiliation metadata field was not completed as part of the DOI generation process.
To pull data shared in our institutional repositories as a
comparison, a second search was performed to retrieve DOIs published by
the institutional repositories at each university. For the institutional
repositories using DataCite to issue DOIs (5 out of the 6 institutions
at the time), the DataCite API was queried by the names of the institutional
repositories in the publisher
metadata field. For the one
institution using Crossref to issue DOIs (Duke), the Crossref API was
used to retrieve all DOIs published using the Duke member prefixes.
Institutional repository data was then filtered to include only the relevant repositories, datasets and software resource types, and DOIs published in 2012 or later.
Affiliation data from DataCite, affiliation data from Crossref, and the institutional repository data were combined into a single dataset.
Load required packages and read in combined data.
# NOTE(review): rm(list = ls()) wipes every object in the calling workspace.
# It is kept so existing behaviour does not change, but starting from a fresh
# R session is the safer way to get a clean environment.
rm(list = ls())

# packages
# for radar graph: ggradar is installed from GitHub. Only install it when it is
# not already available, so that re-running the script does not reinstall the
# package (and need network access) every time.
if (!requireNamespace("ggradar", quietly = TRUE)) {
  devtools::install_github("ricardo-bion/ggradar",
                           dependencies = TRUE)
}

# load
pacman::p_load(dplyr,
               tidyr,
               ggplot2,
               rjson,
               rdatacite,
               cowplot,
               stringr,
               knitr,
               DT,
               ggbreak,
               ggradar,
               janitor)

# Load the combined data from 3_Combined_data.R
# (the .Rdata file is expected to provide the object `combined_dois3`)
load(file="data_rdata_files/Combined_ALL_data.Rdata")

# rename object
all_dois <- combined_dois3

# re-factor group so that datacite appears before cross ref
all_dois$group <- factor(all_dois$group, levels = c("Affiliation - Datacite", "Affiliation - CrossRef", "IR_publisher"))

# rename "Washington U" --> WashU
all_dois$institution[which(all_dois$institution == "Washington U")] <- "WashU"
Some repositories (such as Harvard’s Dataverse and Qualitative Data Repository) assign DOIs at the level of the file, rather than the study. Similarly, Zenodo often has many related DOIs for multiple figures within a study. To compare study-to-study counts of data sharing, we look at the DOIs collapsed by “container”.
# One row per container DOI (with its publisher, title and institution) and the
# number of DOIs that point to that container, largest containers first.
# Rows without a container identifier are left out.
by_container <- all_dois %>%
  filter(!is.na(container_identifier)) %>%
  group_by(container_identifier, publisher, title, institution) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
How many publishers have container DOIs?
# Number of distinct containers per publisher, shown as an interactive table
by_container %>%
  group_by(publisher) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  datatable()
Collapsing by container for counts
# Flag every row whose container identifier has already been seen, so that only
# the first DOI of each container is kept. Rows without a container identifier
# are never flagged.
is_container_dup <- !is.na(all_dois$container_identifier) &
  duplicated(all_dois$container_identifier)

# Row positions of the duplicates (kept under the original name)
containerdups <- which(is_container_dup)

# Drop the duplicates with a logical mask rather than a negative index:
# all_dois[-integer(0), ] selects ZERO rows, so the negative-index form would
# silently empty the data set whenever there are no duplicates to remove.
all_dois_collapsed <- all_dois[!is_container_dup, ]
This leaves a total of 143633 cases.
DOI types by resource
# Collapsed DOI counts per resource type, one column per source group
# (combinations that do not occur are shown as 0)
all_dois_collapsed %>%
  group_by(resourceTypeGeneral, group) %>%
  summarise(count = n()) %>%
  pivot_wider(names_from = group, values_from = count, values_fill = 0) %>%
  kable()
resourceTypeGeneral | Affiliation - Datacite | Affiliation - CrossRef | IR_publisher |
---|---|---|---|
Dataset | 11334 | 125635 | 2103 |
Software | 4500 | 0 | 61 |
DOI by institutional affiliation/publisher
# Collapsed DOI counts per institution, one column per source group, with row
# and column totals added
all_dois_collapsed %>%
  group_by(group, institution) %>%
  summarise(count = n()) %>%
  pivot_wider(names_from = group, values_from = count) %>%
  adorn_totals(where = c("row", "col")) %>%
  kable()
institution | Affiliation - Datacite | Affiliation - CrossRef | IR_publisher | Total |
---|---|---|---|---|
Cornell | 3887 | 655 | 174 | 4716 |
Duke | 2370 | 2969 | 225 | 5564 |
Michigan | 4187 | 119942 | 645 | 124774 |
Minnesota | 2322 | 1514 | 692 | 4528 |
Virginia Tech | 1442 | 64 | 333 | 1839 |
WashU | 1626 | 491 | 95 | 2212 |
Total | 15834 | 125635 | 2164 | 143633 |
How many non-ENCODE DOIs?
# Same institution-by-group table, leaving out DOIs published by the
# ENCODE Data Coordination Center
all_dois_collapsed %>%
  filter(publisher != "ENCODE Data Coordination Center") %>%
  group_by(institution, group) %>%
  summarise(count = n()) %>%
  pivot_wider(names_from = group, values_from = count) %>%
  adorn_totals(where = c("row", "col")) %>%
  kable()
institution | Affiliation - Datacite | Affiliation - CrossRef | IR_publisher | Total |
---|---|---|---|---|
Cornell | 3887 | 649 | 174 | 4710 |
Duke | 2370 | 2443 | 225 | 5038 |
Michigan | 4187 | 2373 | 645 | 7205 |
Minnesota | 2322 | 1514 | 692 | 4528 |
Virginia Tech | 1442 | 64 | 333 | 1839 |
WashU | 1626 | 491 | 95 | 2212 |
Total | 15834 | 7534 | 2164 | 25532 |
Look at all the Institutional Repositories Captured
# Count the DOIs from the institutional-repository pull per `publisher_plus`
IR_pubs <- all_dois_collapsed %>%
  filter(group == "IR_publisher") %>%
  group_by(publisher_plus) %>%
  summarise(count = n())

kable(IR_pubs, col.names = c("Institutional Repository", "Count"))
Institutional Repository | Count |
---|---|
Cornell | 174 |
Duke-Duke Digital Repository | 78 |
Duke-Research Data Repository, Duke University | 147 |
Michigan | 10 |
Michigan-Deep Blue | 515 |
Michigan-ICPSR/ISR | 109 |
Michigan-Other | 11 |
Minnesota | 692 |
Virginia Tech | 333 |
Washington U | 95 |
Replace all of these publishers with “Institutional Repository” so that they will be represented in a single bar.
# Every publisher_plus value seen in the IR pull becomes "Institutional Repository"
all_dois_collapsed$publisher[
  all_dois_collapsed$publisher_plus %in% unique(IR_pubs$publisher_plus)
] <- "Institutional Repository"

# catch the remaining spellings: the rest of the "Cornell University Library",
# the stray VT records, and DRUM
all_dois_collapsed$publisher[
  all_dois_collapsed$publisher %in% c(
    "Cornell University Library",
    "University Libraries, Virginia Tech",
    "Data Repository for the University of Minnesota (DRUM)"
  )
] <- "Institutional Repository"

## ICPSR is also inconsistent: any publisher containing this phrase is ICPSR
all_dois_collapsed$publisher[
  grepl("Consortium for Political", all_dois_collapsed$publisher)
] <- "ICPSR"
Think we just keep these together for the main analysis…
# Collapsed DOI count for every publisher / institution pair, ordered by
# institution and then from the largest to the smallest count
by_publisher_collapse <- all_dois_collapsed %>%
  group_by(publisher, institution) %>%
  summarise(count = n()) %>%
  arrange(institution, desc(count))
Table of publisher counts
# Wide table: one row per publisher, one column per institution (0 when the
# pair does not occur), a row total, and the running share of all DOIs after
# sorting publishers from largest to smallest
by_publisher_collapse_table <- by_publisher_collapse %>%
  pivot_wider(names_from = institution, values_from = count, values_fill = 0) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`WashU`))) %>%
  ungroup() %>%
  arrange(desc(Total)) %>%
  mutate(Cumulative_Percent = round(cumsum(Total)/sum(Total)*100, 1))

datatable(by_publisher_collapse_table)
Write out the table of data & software publishers
# NOTE(review): "Insitituion" in the file name looks like a typo; the path is
# left untouched because anything that reads this file would need the same name.
write.csv(
  by_publisher_collapse_table,
  file = "data_summary_data/Counts of Publishers By Insitituion - Collapsed by container.csv",
  row.names = FALSE
)
Total By institution
# Sum the publisher-level counts up to one total per institution and append a
# grand-total row
by_publisher_collapse %>%
  group_by(institution) %>%
  summarise(TotalN = sum(count)) %>%
  adorn_totals(where = "row") %>%
  kable()
institution | TotalN |
---|---|
Cornell | 4716 |
Duke | 5564 |
Michigan | 124774 |
Minnesota | 4528 |
Virginia Tech | 1839 |
WashU | 2212 |
Total | 143633 |
How many IRs were included in the original DataCite and CrossRef pulls?
# Number of "Institutional Repository" DOIs coming from each source group
all_dois_collapsed %>%
  filter(publisher == "Institutional Repository") %>%
  group_by(group) %>%
  tally() %>%
  kable()
group | n |
---|---|
Affiliation - Datacite | 5 |
IR_publisher | 2164 |
By institution
# Same count, broken down by source group and institution
all_dois_collapsed %>%
  filter(publisher == "Institutional Repository") %>%
  group_by(group, institution) %>%
  tally() %>%
  kable()
group | institution | n |
---|---|---|
Affiliation - Datacite | Cornell | 1 |
Affiliation - Datacite | Duke | 1 |
Affiliation - Datacite | Michigan | 1 |
Affiliation - Datacite | Virginia Tech | 2 |
IR_publisher | Cornell | 174 |
IR_publisher | Duke | 225 |
IR_publisher | Michigan | 645 |
IR_publisher | Minnesota | 692 |
IR_publisher | Virginia Tech | 333 |
IR_publisher | WashU | 95 |
# Previously a separate by_publisher_dc_collapse summary was built here:
# by_publisher_dc_collapse <- all_dois_collapsed %>%
#   group_by(publisher, institution) %>%
#   summarize(count=n()) %>%
#   arrange(institution, desc(count))

# table of publishers - data
# One row per publisher, one column per institution (0 where absent), plus a
# row total; publishers are sorted from the largest total to the smallest.
by_publisher_dc_collapse_table <- by_publisher_collapse %>%
  pivot_wider(names_from = institution, values_from = count, values_fill = 0) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`WashU`))) %>%
  arrange(desc(Total))
Look at publishers based on rank of number of DOIs
# Bar chart of DOI counts for the 20 largest publishers, by rank.
by_publisher_dc_collapse_table %>%
  group_by(publisher) %>%
  summarize(count = sum(Total)) %>%
  arrange(desc(count)) %>%
  # The rows are sorted by descending count, so the row position IS the rank.
  # (order() returns a sorting permutation, not a rank; it only gave 1..n here
  # because the data were already sorted.)
  mutate(pubrank = row_number()) %>%
  ggplot(aes(x = pubrank, y = count)) +
  geom_bar(stat = "identity") +
  labs(x = "Publisher Rank (Top 20)", y = "Number of DOIs") +
  scale_y_break(breaks = c(10000, 100000), scales = .15) +
  # The bar for rank 20 spans 19.55-20.45. With an upper limit of exactly 20 its
  # right edge is out of bounds and the bar is dropped, leaving 19 bars; the
  # limit is extended by half a bar width so that all 20 are drawn.
  scale_x_continuous(limits = c(0, 20.5), sec.axis = dup_axis(labels = NULL, breaks = NULL)) +
  theme_bw()
Look at the top 10 publishers - how many does this capture?
# The table is sorted by descending Total, so the first rows are the largest
# publishers
top10pubs <- by_publisher_dc_collapse_table$publisher[seq_len(10)]

# for graph without ENCODE/LTD, grab top 12
top12pubs <- by_publisher_dc_collapse_table$publisher[seq_len(12)]

# DOIs and number of publishers inside vs. outside the top 10, and the share of
# all DOIs that each side holds
by_publisher_dc_collapse_table %>%
  group_by(publisher) %>%
  summarise(count = sum(Total)) %>%
  mutate(intop10pub = publisher %in% top10pubs) %>%
  group_by(intop10pub) %>%
  summarise(totalDOIs = sum(count), nrepos = n()) %>%
  ungroup() %>%
  mutate(propDOIs = totalDOIs/sum(totalDOIs)) %>%
  kable(digits = 2)
intop10pub | totalDOIs | nrepos | propDOIs |
---|---|---|---|
FALSE | 1369 | 163 | 0.01 |
TRUE | 142264 | 10 | 0.99 |
# By institution
# Share of each institution's DOIs held by the top-10 publishers vs. the rest.
by_publisher_dc_collapse_table %>%
  select(-Total) %>%
  pivot_longer(cols = -publisher,
               names_to = "Institution",
               values_to = "Total") %>%
  group_by(publisher, Institution) %>%
  summarize(count = sum(Total)) %>%
  mutate(intop10pub = publisher %in% top10pubs) %>%
  group_by(intop10pub, Institution) %>%
  # The wide table is zero-filled, so every publisher has a row for every
  # institution. Counting rows with n() therefore reports the same number of
  # repositories for all institutions; count only the publishers that actually
  # hold at least one DOI for the institution.
  summarize(totalDOIs = sum(count), nrepos = sum(count > 0)) %>%
  group_by(Institution) %>%
  mutate(propDOIs = totalDOIs/sum(totalDOIs)) %>%
  kable(digits = 3)
intop10pub | Institution | totalDOIs | nrepos | propDOIs |
---|---|---|---|---|
FALSE | Cornell | 254 | 163 | 0.054 |
FALSE | Duke | 258 | 163 | 0.046 |
FALSE | Michigan | 303 | 163 | 0.002 |
FALSE | Minnesota | 250 | 163 | 0.055 |
FALSE | Virginia Tech | 203 | 163 | 0.110 |
FALSE | WashU | 101 | 163 | 0.046 |
TRUE | Cornell | 4462 | 10 | 0.946 |
TRUE | Duke | 5306 | 10 | 0.954 |
TRUE | Michigan | 124471 | 10 | 0.998 |
TRUE | Minnesota | 4278 | 10 | 0.945 |
TRUE | Virginia Tech | 1636 | 10 | 0.890 |
TRUE | WashU | 2111 | 10 | 0.954 |
# Fill colour for each of the ten publishers shown in the "with ENCODE" figures
top10colors <- c(
  "Harvard Dataverse" = "dodgerblue2",
  "Zenodo" = "darkorange1",
  "ICPSR" = "darkcyan",
  "Dryad" = "lightgray",
  "figshare" = "purple",
  "Institutional Repository" = "lightblue",
  "ENCODE Data Coordination Center" = "gold2",
  "Faculty Opinions Ltd" = "darkgreen",
  "Taylor & Francis" = "red",
  "Neotoma Paleoecological Database" = "pink"
)

# Palette for the figures without ENCODE / Faculty Opinions: the same colours
# minus those two publishers, plus the two publishers that take their place
top12colors <- c(
  top10colors[!names(top10colors) %in% c("ENCODE Data Coordination Center",
                                         "Faculty Opinions Ltd")],
  "VTTI" = "lightgreen",
  "MassIVE" = "darkblue"
)
# Dodged bars: collapsed DOI count per institution for each top-10 publisher,
# drawn on a broken y axis. The outer parentheses print the plot as well as
# assigning it.
(by_publisher_plot_collapse <- by_publisher_collapse %>%
   filter(publisher %in% top10pubs) %>%
   ggplot(aes(x = institution, y = count, fill = publisher)) +
   geom_col(position = position_dodge(preserve = "single")) +
   scale_fill_manual(values = top10colors, name = "Publisher") +
   # scale_y_continuous(breaks = seq(from = 0, to=5000, by=500)) +
   scale_y_break(breaks = c(3000, 120000), scales = .15) +
   coord_cartesian(ylim = c(0, 5000)) +
   labs(x = "Institution", y = "Count of Collapsed DOIs") +
   theme_bw() +
   # three-row legend with its title on top
   guides(fill = guide_legend(nrow = 3, title.position = "top")) +
   theme(legend.position = "bottom", legend.title.align = .5))

ggsave(
  filename = "figures/Counts of DOIs by Institution_DOIcollapsed.png",
  plot = by_publisher_plot_collapse,
  device = "png",
  width = 8,
  height = 6,
  units = "in"
)
# Figure 3: one facet per institution with a horizontal bar per top-10
# publisher. Counts above 4000 are cut off by the coordinate limits (see the
# caption). The outer parentheses print the plot as well as assigning it.
(by_publisher_plot_collapse2 <- by_publisher_collapse %>%
   filter(publisher %in% top10pubs) %>%
   # reversed levels so that, after the flip, the largest publisher is on top
   mutate(publisher = factor(publisher, levels = rev(top10pubs))) %>%
   ggplot(aes(x = publisher, y = count, fill = publisher)) +
   geom_col(position = position_dodge(preserve = "single"), show.legend = FALSE) +
   scale_fill_manual(values = top10colors, name = "Publisher") +
   guides(fill = guide_legend(title.position = "top")) +
   # scale_y_continuous(breaks = seq(from = 0, to=5000, by=500)) +
   # scale_y_break(breaks =c(3000, 120000),scales = .15) +
   # coord_cartesian(ylim = c(0,5000)) +
   coord_flip(ylim = c(0, 4000)) +
   facet_wrap(~institution, scales = "free_x") +
   labs(x = "Publisher",
        y = "Count of data and software DOIs",
        caption = "*Michigan ENCODE bar truncated for visualization") +
   theme_bw() +
   theme(legend.position = "bottom",
         legend.title.align = .5,
         text = element_text(family = "Arial", size = 10)))

# Save the figure as PNG and as TIFF
ggsave(
  filename = "figures/fig3_DOI_by_Institution.png",
  plot = by_publisher_plot_collapse2,
  device = "png", width = 8, height = 6, units = "in", dpi = 300
)
ggsave(
  filename = "figures/fig3_DOI_by_Institution.tif",
  plot = by_publisher_plot_collapse2,
  device = "tiff", width = 8, height = 6, units = "in", dpi = 300
)
# Stacked bars: each top-10 publisher's share of an institution's DOIs.
# Percent is computed over ALL of the institution's DOIs before the filter, so
# the stacks stop short of 100% by the share held by the other publishers.
by_publisher_percent_plot1 <- by_publisher_collapse %>%
  group_by(institution) %>%
  mutate(Percent = count/sum(count)*100) %>%
  filter(publisher %in% top10pubs) %>%
  ggplot(aes(x = institution, y = Percent)) +
  geom_col(aes(fill = publisher)) +
  scale_fill_manual(values = top10colors, name = "Publisher") +
  labs(x = "Institution", y = "Percent of Total Data DOIs") +
  guides(fill = guide_legend(title.position = "top", nrow = 3)) +
  theme_bw() +
  theme(legend.position = "bottom", legend.title.align = .5)

ggsave(
  filename = "figures/Percent DOIs Top Publisher Percents - With ENCODE.png",
  plot = by_publisher_percent_plot1,
  device = "png", width = 8, height = 5.29, units = "in"
)

# Keep the legend for the combined figure, then strip it from this panel
publegend <- get_legend(by_publisher_percent_plot1)
by_publisher_percent_plot1 <- by_publisher_percent_plot1 +
  theme(legend.position = "none")
# Same stacked-percent chart with ENCODE and Faculty Opinions removed BEFORE
# the percentages are computed, so shares are relative to the remaining DOIs
by_publisher_percent_plot2 <- by_publisher_collapse %>%
  filter(
    publisher != "ENCODE Data Coordination Center",
    publisher != "Faculty Opinions Ltd"
  ) %>%
  group_by(institution) %>%
  mutate(Percent = count/sum(count)*100) %>%
  filter(publisher %in% top12pubs) %>%
  ggplot(aes(x = institution, y = Percent)) +
  geom_col(aes(fill = publisher)) +
  scale_fill_manual(values = top12colors, name = "Publisher") +
  labs(x = "Institution", y = "Percent of Total Data DOIs") +
  theme_bw() +
  theme(legend.position = "bottom", legend.title.align = .5)

ggsave(
  filename = "figures/Percent DOIs Top Publisher Percents - No ENCODE.png",
  plot = by_publisher_percent_plot2,
  device = "png", width = 8.5, height = 5.29, units = "in"
)

# Strip the legend before the panel goes into the combined figure
by_publisher_percent_plot2 <- by_publisher_percent_plot2 +
  theme(legend.position = "none")
# Panels A and B side by side on the top row, shared legend underneath.
# The outer parentheses print the figure as well as assigning it.
(combined_pub_plots <- plot_grid(
  plot_grid(
    by_publisher_percent_plot1,
    by_publisher_percent_plot2,
    labels = c("A", "B")
  ),
  publegend,
  nrow = 2,
  rel_heights = c(2, .5),
  align = "v",
  axis = "t"
))

# No height is given, so ggsave() picks the height itself
ggsave(
  filename = "figures/Percent DOIs Top Publisher Percents.png",
  plot = combined_pub_plots,
  device = "png", width = 10.5, units = "in"
)
Try in B&W safe format with an “other” bar.
# Palette for the ten largest publishers plus a grey "Other" bucket
top10colorsother <- c(top10colors, "Other" = "darkgray")

# Short codes printed inside the bars. The codes are keyed by publisher NAME
# rather than by position in top12pubs, so a change in the ranking (new data,
# ties) can no longer attach a code to the wrong publisher.
pubshortnames <- local({
  short_codes <- c(
    "ENCODE Data Coordination Center" = "EN",
    "Faculty Opinions Ltd" = "FOL",
    "Zenodo" = "Z",
    "Dryad" = "DR",
    "figshare" = "FG",
    "Institutional Repository" = "IR",
    "ICPSR" = "ICP",
    "Harvard Dataverse" = "HD",
    "Taylor & Francis" = "TF",
    "Neotoma Paleoecological Database" = "NPD",
    "VTTI" = "VTI",
    "MassIVE" = "MIV",
    "Other" = "O"
  )
  publishers <- as.character(c(top12pubs, "Other"))
  missing_code <- setdiff(publishers, names(short_codes))
  if (length(missing_code) > 0) {
    # a publisher without a code gets NA; say so instead of mislabelling it
    warning("No short code defined for: ",
            paste(missing_code, collapse = ", "), call. = FALSE)
  }
  data.frame(publisher = publishers,
             shortname = unname(short_codes[publishers]))
})

# add short hands labels ("<code>: <publisher>"), named by publisher so they
# can be matched to the colour vector
top10colorsotherLabels <- paste0(pubshortnames$shortname[match(names(top10colorsother), pubshortnames$publisher)], ": ", names(top10colorsother))
names(top10colorsotherLabels) <- names(top10colorsother)

# create top 12 colors with other
top12colorsother <- c(top12colors, "Other" = "darkgray")

# and labels
top12colorsotherLabels <- paste0(pubshortnames$shortname[match(names(top12colorsother), pubshortnames$publisher)], ": ", names(top12colorsother))
names(top12colorsotherLabels) <- names(top12colorsother)
## LEGEND PLOT
# do a temp plot for the legend with all the repositories on it
# (the former `institution = ifelse(institution == "WashU", "WashU", institution)`
# recode mapped "WashU" onto itself and has been dropped)
dataforlegend <- by_publisher_collapse %>%
  mutate(publisher = ifelse(publisher %in% top12pubs, publisher, "Other")) %>%
  group_by(institution, publisher) %>%
  summarize(count = sum(count)) %>%
  mutate(Percent = count/sum(count)*100) %>%
  full_join(pubshortnames, by = "publisher") %>%
  # the full join adds rows for publishers that have no data; drop them
  filter(!is.na(institution))

plotforlegend <- local({
  # The two palettes (and the two label vectors) share most of their names.
  # Concatenating them as-is gives vectors with duplicated names and different
  # lengths; keep the first entry for each publisher so that colours and labels
  # line up one-to-one.
  legend_colors <- c(top10colors, top12colorsother)
  legend_colors <- legend_colors[!duplicated(names(legend_colors))]
  legend_labels <- c(top10colorsotherLabels, top12colorsotherLabels)
  legend_labels <- legend_labels[!duplicated(names(legend_labels))]

  dataforlegend %>%
    ggplot(aes(x = institution, y = Percent)) +
    geom_col(aes(fill = publisher)) +
    scale_fill_manual(values = legend_colors, name = "Publisher", labels = legend_labels) +
    guides(fill = guide_legend(title.position = "top",
                               nrow = 4)) +
    theme_bw() +
    theme(legend.position = "bottom",
          legend.title.align = .5)
})

# Only the legend of this plot is used
publegend <- ggfun::get_legend(plotforlegend)
## PLOT A
# Share of each institution's DOIs per publisher; publishers outside the top 10
# are pooled into "Other", so every stack adds up to 100%.
plotAdata <- by_publisher_collapse %>%
  mutate(
    publisher = ifelse(publisher %in% top10pubs, publisher, "Other"),
    # this recode maps "WashU" onto itself (no change to the data)
    institution = ifelse(institution == "WashU", "WashU", institution)
  ) %>%
  group_by(institution, publisher) %>%
  summarise(count = sum(count)) %>%
  mutate(Percent = count/sum(count)*100) %>%
  left_join(pubshortnames, by = "publisher") %>%
  mutate(publisher = factor(publisher, levels = rev(c(top10pubs, "Other")))) %>%
  # remove short name labels if small counts (segments under 4%)
  mutate(shortname = ifelse(Percent < 4, "", shortname))

# The outer parentheses print the plot as well as assigning it
(by_publisher_percent_plot1a <- plotAdata %>%
   ggplot(aes(x = institution, y = Percent)) +
   geom_col(aes(fill = publisher)) +
   # short code centred inside each stacked segment
   geom_text(aes(label = shortname, group = publisher),
             position = position_stack(vjust = .5),
             size = 2) +
   scale_fill_manual(values = top10colorsother,
                     name = "Publisher",
                     labels = top10colorsotherLabels) +
   coord_cartesian(ylim = c(0, 100)) +
   labs(x = "Institution", y = "Percent of Total Data DOIs") +
   guides(fill = guide_legend(title.position = "top", nrow = 3)) +
   theme_bw() +
   theme(legend.position = "bottom",
         legend.title.align = .5,
         text = element_text(family = "Arial", size = 10)))

# ggsave(plot = by_publisher_percent_plot1, filename="figures/Percent DOIs Top Publisher Percents - With ENCODE.png", device = "png", width = 8, height = 5.29, units = "in")

# remove legend
by_publisher_percent_plot1a <- by_publisher_percent_plot1a +
  theme(legend.position = "none")
### PLOT B
# Same stacked-percent chart as panel A, but without ENCODE and Faculty
# Opinions; publishers outside the top 12 are pooled into "Other".
# The institution used to be recoded from "WashU" to "Wash U" here, which made
# this panel's axis label differ from panel A and from every table in the
# report. The label is now left as "WashU" so both panels agree.
plotBdata <- by_publisher_collapse %>%
  mutate(publisher = ifelse(publisher %in% top12pubs, publisher, "Other")) %>%
  filter(publisher != "ENCODE Data Coordination Center",
         publisher != "Faculty Opinions Ltd") %>%
  group_by(institution, publisher) %>%
  summarize(count = sum(count)) %>%
  mutate(Percent = count/sum(count)*100) %>%
  full_join(pubshortnames, by = "publisher") %>%
  mutate(publisher = factor(publisher, levels = rev(c(top12pubs, "Other")))) %>%
  # the full join adds rows for publishers that have no data; drop them
  filter(!is.na(institution)) %>%
  # remove short name labels if small counts (segments under 4%)
  mutate(shortname = ifelse(Percent < 4, "", shortname))

# The outer parentheses print the plot as well as assigning it
(by_publisher_percent_plot2a <- plotBdata %>%
   ggplot(aes(x = institution, y = Percent)) +
   geom_col(aes(fill = publisher)) +
   # short code centred inside each stacked segment
   geom_text(aes(label = shortname, group = publisher), position = position_stack(vjust = .5), size = 2) +
   scale_fill_manual(values = top12colorsother, name = "Publisher", labels = top12colorsotherLabels) +
   coord_cartesian(ylim = c(0, 100)) +
   labs(x = "Institution", y = "Percent of Total Data DOIs") +
   guides(fill = guide_legend(title.position = "top",
                              nrow = 3)) +
   theme_bw() +
   theme(legend.position = "bottom",
         legend.title.align = .5,
         text = element_text(family = "Arial", size = 10)))

# ggsave(plot = by_publisher_percent_plot2, filename="figures/Percent DOIs Top Publisher Percents - No ENCODE.png", device = "png", width = 8.5, height = 5.29, units = "in")

# remove legend
by_publisher_percent_plot2a <- by_publisher_percent_plot2a +
  theme(legend.position = "none")
## COMBINED PLOT
# Figure 4: panels A and B on the top row, shared legend underneath.
# The outer parentheses print the figure as well as assigning it.
(combined_pub_plots <- plot_grid(
  plot_grid(
    by_publisher_percent_plot1a,
    by_publisher_percent_plot2a,
    labels = c("A", "B")
  ),
  publegend,
  nrow = 2,
  rel_heights = c(2, .5),
  align = "v",
  axis = "t"
))

# Save as PNG and as TIFF on a white background
ggsave(
  filename = "figures/Fig4_DOIDistribution.png",
  plot = combined_pub_plots,
  device = "png", width = 8.5, height = 7, units = "in", dpi = 300, bg = "white"
)
ggsave(
  filename = "figures/Fig4_DOIDistribution.tif",
  plot = combined_pub_plots,
  device = "tiff", width = 8.5, height = 7, units = "in", dpi = 300, bg = "white"
)
Overall Proportion of Data/Software DOIs in Top 10 publishers by institution
# Per institution: number and percentage of its DOIs that sit with the top-10
# publishers. Percent is computed before the filter, i.e. relative to all of
# the institution's DOIs; the data stay grouped by institution throughout.
by_publisher_collapse %>%
  group_by(institution) %>%
  mutate(Percent = count/sum(count)*100) %>%
  filter(publisher %in% top10pubs) %>%
  summarise(TotalCount = sum(count), TotalPercent = sum(Percent)) %>%
  kable(digits = 2)
institution | TotalCount | TotalPercent |
---|---|---|
Cornell | 4462 | 94.61 |
Duke | 5306 | 95.36 |
Michigan | 124471 | 99.76 |
Minnesota | 4278 | 94.48 |
Virginia Tech | 1636 | 88.96 |
WashU | 2111 | 95.43 |
How many different publishers are researchers sharing their data with, and how does this change over time?
# Step 1: DOIs per year / publisher / institution.
# Step 2: per year and institution, the number of distinct publishers (rows
# from step 1) and the total number of DOIs.
by_year_nrepos <- all_dois_collapsed %>%
  group_by(publicationYear, publisher, institution) %>%
  summarise(nDOIs = n()) %>%
  group_by(publicationYear, institution) %>%
  summarise(npublishers = n(), totalDOIs = sum(nDOIs))

# One line per institution: number of publishers used in each year
by_year_nrepos %>%
  ggplot(aes(x = publicationYear, y = npublishers, group = institution)) +
  geom_line(aes(color = institution)) +
  labs(
    x = "Year",
    y = "Number of Repositories",
    title = "Number of Repositories Where Data and Software are Shared Across Time"
  ) +
  theme_bw() +
  theme(legend.title = element_blank())
Labeled version
# Line colour per institution
instcolors <- c(
  "Cornell" = "#B31B1B",
  "Duke" = "#00539B",
  "Michigan" = "#FFCB05", # #00274C
  "Minnesota" = "#7a0019",
  "Virginia Tech" = "#E87722",
  "WashU" = "#6c7373"
)

# Figure 2: the same lines without a legend. Each line is labelled to the right
# of the plot at the height of its 2022 value; some labels are nudged up or
# down by a fixed offset. Clipping is switched off and the right margin widened
# so the labels can sit outside the panel.
by_year_nrepos %>%
  ggplot(aes(x = publicationYear, y = npublishers, group = institution)) +
  geom_line(aes(color = institution), linewidth = .8, show.legend = FALSE) +
  scale_color_manual(values = instcolors) +
  labs(x = "Year", y = "Number of Repositories") +
  annotate(
    geom = "text", x = 11.2, hjust = 0, label = "Virginia Tech",
    y = with(by_year_nrepos, npublishers[which(institution == "Virginia Tech" & publicationYear == 2022)])
  ) +
  annotate(
    geom = "text", x = 11.2, hjust = 0, label = "Cornell",
    y = with(by_year_nrepos, npublishers[which(institution == "Cornell" & publicationYear == 2022)])
  ) +
  annotate(
    geom = "text", x = 11.2, hjust = 0, label = "Duke",
    y = (with(by_year_nrepos, npublishers[which(institution == "Duke" & publicationYear == 2022)]) + 1)
  ) +
  annotate(
    geom = "text", x = 11.2, hjust = 0, label = "Michigan",
    y = (with(by_year_nrepos, npublishers[which(institution == "Michigan" & publicationYear == 2022)]) + 1)
  ) +
  annotate(
    geom = "text", x = 11.2, hjust = 0, label = "Minnesota",
    y = (with(by_year_nrepos, npublishers[which(institution == "Minnesota" & publicationYear == 2022)]) - 1)
  ) +
  annotate(
    geom = "text", x = 11.2, hjust = 0, label = "WashU",
    y = (with(by_year_nrepos, npublishers[which(institution == "WashU" & publicationYear == 2022)]) - .5)
  ) +
  coord_cartesian(clip = "off") +
  theme_classic() +
  theme(
    panel.grid = element_blank(),
    plot.margin = unit(c(1, 5, 1, 1), "lines"),
    text = element_text(family = "Arial", size = 10)
  )

# No plot is passed, so ggsave() writes the most recently displayed plot
ggsave(filename = "figures/Fig2_N_Repo_by_Year.png", device = "png", dpi = 300)
ggsave(filename = "figures/Fig2_N_Repo_by_Year.tiff", device = "tiff", dpi = 300)
We can also look at the data collapsed by version of a record. This was motivated because some repositories have multiple entries for the different versions of the same dataset/collection. And some entries have many versions.
Explore versions
Some Repositories attach “vX” to the doi.
# Flag DOIs that end in ".v" followed by one or more digits
all_dois_collapsed <- all_dois_collapsed %>%
  mutate(hasversion = grepl("\\.v[[:digit:]]+$", DOI))

# Number of flagged DOIs per publisher, largest first
all_dois_collapsed %>%
  filter(hasversion) %>%
  group_by(publisher, hasversion) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  datatable()
Some repositories use the “VersionCount”
# Per publisher: number of DOIs with a positive versionCount and their mean
# versionCount (2 decimals), largest publishers first
all_dois_collapsed %>%
  filter(versionCount > 0) %>%
  group_by(publisher) %>%
  summarise(
    count = n(),
    AvgNversions = round(mean(versionCount), 2)
  ) %>%
  arrange(desc(count)) %>%
  datatable()
Some use “metadataVersion”
# Per publisher: number of DOIs with a positive metadataVersion and their mean
# metadataVersion (2 decimals), largest publishers first
all_dois_collapsed %>%
  filter(metadataVersion > 0) %>%
  group_by(publisher) %>%
  summarise(
    count = n(),
    AvgNversions = round(mean(metadataVersion), 2)
  ) %>%
  arrange(desc(count)) %>%
  datatable()
How to collapse by version? Maybe that’s for another day…
Look at repositories with affiliation and publication years prior to 2014
DataCite released affiliation as a metadata option on Oct 16, 2014. The repositories with affiliations for things published before then may have been back-updated?
What repositories have publications with affiliation before then?
# DOI counts per publisher, one column per publication year (earliest year
# first), sorted on the earliest year columns.
all_dois_collapsed %>%
  group_by(publisher, publicationYear) %>%
  summarize(count = n()) %>%
  arrange(publicationYear) %>%
  pivot_wider(names_from = publicationYear,
              values_from = count) %>%
  # arrange(2012, 2013, 2014, 2015) sorted by four numeric constants, which
  # leaves the row order unchanged. Refer to the year COLUMNS instead; any_of()
  # skips a year that has no column rather than failing.
  arrange(across(any_of(c("2012", "2013", "2014", "2015")))) %>%
  datatable()
Looking at fields that are recommended by OSTP Only:
Subset the data to these fields
# Keep only the fields needed for the metadata-completeness checks and give
# every record a running row number to join the per-field tables back on.
for_metadata <- all_dois_collapsed %>%
  select(institution, publisher, group, DOI, creators, publicationYear, relatedIdentifiers, fundingReferences, dates) %>%
  ungroup() %>%
  # row_number() gives 1..n and, unlike 1:nrow(.), stays valid for zero rows
  # (1:0 is c(1, 0), which has the wrong length)
  mutate(RowID = row_number())
# create function to return if at least one is not NA or NULL
#
# Returns TRUE when `x` holds at least one usable value, i.e. at least one
# element that is not NA, not the empty string "" and not the text "NULL";
# otherwise FALSE. Always returns a single TRUE/FALSE, never NA.
#
# x: a vector (or a list of length-one values) to inspect.
#
# The three conditions are tested on the SAME element. Testing them separately
# with sum() had two problems: any NA in `x` turned the result into NA, and
# c("", "NULL") counted as valid because each element passed one of the tests.
atleastonevalid <- function(x) {
  is_valid <- !is.na(x) & x != "" & x != "NULL"
  any(is_valid, na.rm = TRUE)
}
Creator fields (author name, affiliation, name identifiers, affiliation identifier)
#each are nested in an item within a list, so need to unnest
# each are nested in an item within a list, so need to unnest:
# take the first element of every `creators` entry
for_metadata$creators1 <- lapply(for_metadata$creators, `[[`, 1)

# One row per creator; records without any creator keep a single row
creators <- for_metadata %>%
  select(RowID, publisher, creators1) %>%
  unnest(cols = creators1, keep_empty = TRUE)

# make each column of interest a vector:
# collapse each creator's affiliation / name-identifier entry into one
# comma-separated string
creators$affiliation1 <- lapply(
  creators$affiliation,
  function(entry) paste(entry, collapse = ",")
)
creators$nameIdentifier <- lapply(
  creators$nameIdentifiers,
  function(entry) paste(entry, collapse = ",")
)

# Per record: is there at least one usable name / affiliation / name
# identifier, and how many creator rows does it have
creator_table <- creators %>%
  group_by(RowID) %>%
  summarise(
    has_name = atleastonevalid(name),
    has_affiliation = atleastonevalid(affiliation1),
    has_nameIdentifier = atleastonevalid(nameIdentifier),
    count = n()
  )

# Some quick accuracy checks: the records flagged as having no creator name
noname <- filter(
  for_metadata,
  RowID %in% creator_table$RowID[which(creator_table$has_name == FALSE)]
)
# seem to be the crossref ones
Publication Year and dates
# Take the first element of every `dates` entry and expand it to one row per
# date; records without dates keep a single row
for_metadata$dates1 <- lapply(for_metadata$dates, `[[`, 1)

dates <- unnest(for_metadata, dates1, keep_empty = TRUE)

# Spread the date types into date_<type> columns, then flag per record whether
# the publication year and the Created / Issued / Collected dates are present
date_table <- dates %>%
  pivot_wider(
    names_from = dateType,
    values_from = date,
    names_prefix = "date_"
  ) %>%
  group_by(RowID) %>%
  summarise(
    has_pubYear = atleastonevalid(publicationYear),
    has_dateCreated = atleastonevalid(date_Created),
    has_dateIssued = atleastonevalid(date_Issued),
    has_dateCollected = atleastonevalid(date_Collected)
  )
Funder Information
# Take the first element of every `fundingReferences` entry
for_metadata$fundingReferences1 <- lapply(for_metadata$fundingReferences, `[[`, 1)

# pull out the unique funder variables in the data
# (every field name that occurs in any record's funding reference)
fundervariables <- unique(unlist(lapply(for_metadata$fundingReferences1, names)))

# add to dataset: switch to a plain data frame before adding the flag columns
for_metadata <- as.data.frame(for_metadata)

# fill in with whether that variable is present in each row of metadata:
# one TRUE/FALSE per funder variable and record, stacked into one row per
# record and stored under the funder variable names
containsfundervar <- lapply(
  for_metadata$fundingReferences1,
  function(ref) fundervariables %in% names(ref)
)
names(containsfundervar) <- for_metadata$RowID
for_metadata[, fundervariables] <- do.call(rbind, containsfundervar)

funding_table <- select(for_metadata, RowID, all_of(fundervariables))
Related identifiers
# Take the first element of every `relatedIdentifiers` entry.
# NOTE(review): unlike creators/dates/funding, this overwrites the original
# column in place instead of writing to a new "...1" column, so the step is
# not safe to run twice on the same object.
for_metadata$relatedIdentifiers <- lapply(for_metadata$relatedIdentifiers, `[[`, 1)

# every field name that occurs in any record's related identifiers
idvariables <- unique(unlist(lapply(for_metadata$relatedIdentifiers, names)))

# fill in with whether that variable is present in each row of metadata
containsidvar <- lapply(
  for_metadata$relatedIdentifiers,
  function(rel) idvariables %in% names(rel)
)
names(containsidvar) <- for_metadata$RowID
for_metadata[, idvariables] <- do.call(rbind, containsidvar)

relid_table <- select(for_metadata, RowID, all_of(idvariables))
Combine all of them and select relevant fields
# Join the creator, date, funding and related-identifier flags onto the record
# identifiers (by RowID) and keep the eight completeness indicators of interest
all_dois_collapsed_completeness <- for_metadata %>%
  select(RowID, DOI, publisher, institution, group) %>%
  full_join(creator_table, by = "RowID") %>%
  full_join(date_table, by = "RowID") %>%
  full_join(funding_table, by = "RowID") %>%
  full_join(relid_table, by = "RowID") %>%
  select(
    RowID, DOI, publisher, institution, group,
    has_name, has_affiliation, has_nameIdentifier, has_pubYear,
    funderName, awardNumber, funderIdentifier, relatedIdentifier
  )
Then create dataset with indicators for whether fields have information in them (only indicates presence of information, not quality of information).
# Long format: one row per record and metadata field, with the "has_" part
# removed from the field names
all_dois_collapsed_completenessl <- all_dois_collapsed_completeness %>%
  pivot_longer(
    cols = has_name:relatedIdentifier,
    names_to = "variable",
    values_to = "value"
  ) %>%
  mutate(variable = gsub("has_", "", variable))
Table of metadata completeness for 10 sample DOIs from each publisher
# First ten records of every top-10 publisher, DataCite records only
all_dois_collapsed_completeness %>%
  filter(publisher %in% top10pubs) %>%
  filter(group != "Affiliation - CrossRef") %>%
  # also remove DOIs from Duke institutional repo (CrossRef)
  filter(!(publisher == "Institutional Repository" & institution == "Duke")) %>%
  select(-RowID, -institution, -group) %>%
  group_by(publisher) %>%
  slice_head(n = 10) %>%
  datatable(options = list(pageLength = 20, scrollX = TRUE))
# Per top-10 publisher and metadata field (DataCite records only): number of
# records with the field filled in, number of records, and the percentage
by_publisher_complete_dc <- all_dois_collapsed_completenessl %>%
  filter(publisher %in% top10pubs) %>%
  filter(group != "Affiliation - CrossRef") %>%
  # also remove DOIs from Duke institutional repo (CrossRef)
  filter(!(publisher == "Institutional Repository" & institution == "Duke")) %>%
  group_by(publisher, variable) %>%
  summarise(complete = sum(value, na.rm = TRUE), total = n()) %>%
  mutate(percent_complete = complete/total*100)

# organize the variables by completeness: mean completeness across publishers,
# most complete field first, with a readable label for each field
compvarorder <- by_publisher_complete_dc %>%
  group_by(variable) %>%
  summarise(avgcomp = mean(percent_complete, na.rm = TRUE)) %>%
  arrange(desc(avgcomp)) %>%
  mutate(
    variableLabel = case_when(
      variable == "name" ~ "Creator Name",
      variable == "pubYear" ~ "Publication Year",
      variable == "affiliation" ~ "Creator Affiliation",
      variable == "relatedIdentifier" ~ "Related Identifiers",
      variable == "nameIdentifier" ~ "Creator Name Identifier",
      variable == "funderName" ~ "Funder Name",
      variable == "awardNumber" ~ "Funder Award Number",
      variable == "funderIdentifier" ~ "Funder Identifier"
    )
  )
# Line chart: percent of records complete per metadata field, one (slightly
# jittered) line per publisher, fields ordered from most to least complete.
# The outer parentheses print the plot as well as assigning it.
(completepub <- by_publisher_complete_dc %>%
   mutate(variable = factor(variable, levels = compvarorder$variable)) %>%
   ggplot(aes(x = variable, y = percent_complete, group = publisher)) +
   geom_line(
     aes(color = publisher),
     position = position_jitter(height = 1, width = .1),
     linewidth = 1
   ) +
   scale_color_manual(values = top10colors, name = "Publisher") +
   scale_x_discrete(labels = compvarorder$variableLabel) +
   labs(x = "DataCite Metadata Field", y = "Percent Records Complete") +
   theme_bw() +
   guides(color = guide_legend(nrow = 2, title.position = "top")) +
   theme(
     legend.position = "bottom",
     legend.title.align = .5,
     axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5)
   ))

# NOTE(review): the heat-map version of this figure is saved to the same file
# name further down, which replaces this file.
ggsave(
  filename = "figures/CompletenessData_allDatacite.png",
  plot = completepub,
  width = 10, height = 5.25, units = "in"
)
Figure 5, try 2
# Heat-map version: one tile per publisher and metadata field, coloured by
# publisher, with opacity showing the percent of records complete.
# The outer parentheses print the plot as well as assigning it.
(completepub2 <- by_publisher_complete_dc %>%
   mutate(variable = factor(variable, levels = compvarorder$variable)) %>%
   ggplot(aes(x = variable, y = publisher)) +
   geom_tile(aes(fill = publisher, alpha = percent_complete)) +
   scale_fill_manual(values = top10colors, name = "Publisher") +
   scale_x_discrete(labels = compvarorder$variableLabel) +
   # only the opacity needs a legend; the publisher is already on the y axis
   guides(fill = "none", alpha = guide_legend("Percent Complete")) +
   labs(y = "Publisher", x = "Metadata Field") +
   theme_bw() +
   theme(
     legend.position = "right",
     legend.title.align = .5,
     text = element_text(family = "Arial", size = 10),
     axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5),
     panel.grid.major = element_blank()
   )
)

# Save as PNG and as TIFF
ggsave(
  filename = "figures/CompletenessData_allDatacite.png",
  plot = completepub2,
  width = 8, height = 5.25, units = "in"
)
ggsave(
  filename = "figures/CompletenessData_allDatacite.tif",
  plot = completepub2,
  width = 8, height = 5.25, units = "in", device = "tiff"
)
# Faceted version of the line chart: one panel per publisher (two rows), no
# legend. The outer parentheses print the plot as well as assigning it.
(completepub3 <- by_publisher_complete_dc %>%
   mutate(variable = factor(variable, levels = compvarorder$variable)) %>%
   ggplot(aes(x = variable, y = percent_complete, group = publisher)) +
   geom_line(
     aes(color = publisher),
     position = position_jitter(height = 1, width = .1),
     linewidth = 1
   ) +
   scale_color_manual(values = top10colors, name = "Publisher") +
   scale_x_discrete(labels = compvarorder$variableLabel) +
   labs(x = "DataCite Metadata Field", y = "Percent Records Complete") +
   facet_wrap(~publisher, nrow = 2) +
   theme_bw() +
   guides(color = guide_legend(nrow = 2, title.position = "top")) +
   theme(
     legend.position = "none",
     legend.title.align = .5,
     text = element_text(family = "Arial", size = 9),
     axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5)
   ))

# Saving of this version is currently switched off:
# ggsave(plot = completepub3, filename = "figures/fig5_CompletenessData_allDatacite.png", width = 7.5, height = 4.5, units="in", dpi=300)
# ggsave(plot = completepub3, filename = "figures/fig5_CompletenessData_allDatacite.tif", width = 7.5, device="tiff", height = 4.5, units="in", dpi=300)
# Add short axis labels for the radar charts, one per metadata field.
# NOTE(review): these are assigned by position, so they assume the rows of
# `compvarorder` are in the order publication year, name, affiliation,
# relatedIdentifier, nameIdentifier, funder name, award number,
# funderIdentifier -- confirm whenever the underlying data change.
compvarorder$abbrev <- c("Year", "Name", "Affiliation", "RI", "NI", "Funder", "Award", "FI")

# Radar chart with every publisher overlaid on one set of axes.
fig5_all <- by_publisher_complete_dc %>%
  mutate(variable = factor(variable, levels = compvarorder$variable)) %>%
  # pivot_wider() orders the new columns by first appearance, not by factor
  # level. Sorting on the factor first guarantees the columns come out in
  # `compvarorder` order, which is what the positional axis.labels assume.
  arrange(variable) %>%
  select(publisher, variable, percent_complete) %>%
  pivot_wider(
    names_from = variable,
    values_from = percent_complete
  ) %>%
  ggradar(
    axis.labels = compvarorder$abbrev,
    # percent_complete is on a 0-100 scale; set the grid explicitly (as the
    # single-publisher radar charts do) instead of using ggradar's defaults.
    grid.min = 0,
    grid.mid = 50,
    grid.max = 100,
    background.circle.colour = "white"
  )
fig5_all
# Build the Figure 5 radar chart for a single publisher.
#
# Arguments:
#   publisher_name  Value of the `publisher` column in
#                   `by_publisher_complete_dc` to plot.
#   plot_title      Title drawn above the chart; defaults to `publisher_name`.
#
# Returns the ggplot object produced by ggradar(), with a centred title.
#
# Reads `by_publisher_complete_dc` and `compvarorder` (including its `abbrev`
# column) from the enclosing environment. Axis labels come from the pivoted
# column names, so each label is matched to its field by name.
make_publisher_radar <- function(publisher_name, plot_title = publisher_name) {
  radar_data <- by_publisher_complete_dc %>%
    filter(publisher == publisher_name) %>%
    mutate(abbrev = compvarorder$abbrev[match(variable, compvarorder$variable)]) %>%
    select(publisher, abbrev, percent_complete) %>%
    pivot_wider(
      names_from = abbrev,
      values_from = percent_complete
    )

  # Fail with a readable message instead of an obscure ggradar() error when
  # the publisher name is misspelled or absent from the data.
  if (nrow(radar_data) == 0) {
    stop(
      "No completeness rows found for publisher '", publisher_name, "'",
      call. = FALSE
    )
  }

  ggradar(
    radar_data,
    grid.min = 0,
    grid.mid = 50,
    grid.max = 100,
    group.colours = "black",
    background.circle.colour = "white",
    axis.label.size = 3,
    group.line.width = 1,
    group.point.size = 3,
    grid.label.size = 4,
    fill = TRUE,
    fill.alpha = .15
  ) +
    labs(title = plot_title) +
    theme(plot.title = element_text(size = 13, hjust = .5))
}

# One radar chart per publisher. Only figshare needs a title that differs
# from the publisher value stored in the data.
fig5_dryad <- make_publisher_radar("Dryad")
fig5_figshare <- make_publisher_radar("figshare", plot_title = "Figshare")
fig5_dataverse <- make_publisher_radar("Harvard Dataverse")
fig5_icpsr <- make_publisher_radar("ICPSR")
fig5_ir <- make_publisher_radar("Institutional Repository")
fig5_npd <- make_publisher_radar("Neotoma Paleoecological Database")
fig5_tf <- make_publisher_radar("Taylor & Francis")
fig5_zenodo <- make_publisher_radar("Zenodo")
# Arrange the eight single-publisher radar charts in a two-row grid on a
# white background, display it, and save it as PNG and TIFF.
combined_radar <- plot_grid(
  plotlist = list(
    fig5_dryad,
    fig5_figshare,
    fig5_dataverse,
    fig5_icpsr,
    fig5_ir,
    fig5_npd,
    fig5_tf,
    fig5_zenodo
  ),
  nrow = 2
) +
  theme(plot.background = element_rect(fill = "white"))
combined_radar

ggsave(
  filename = "figures/fig5_CompletenessData_allDatacite_radar.png",
  plot = combined_radar,
  width = 12,
  height = 5,
  units = "in",
  dpi = 300
)
ggsave(
  filename = "figures/fig5_CompletenessData_allDatacite_radar.tif",
  plot = combined_radar,
  device = "tiff",
  width = 12,
  height = 5,
  units = "in",
  dpi = 300
)
Examine ICPSR’s funder fields
# Completeness rows for ICPSR where a funder-related field is flagged complete.
ICPSR_fundercomplete <- all_dois_collapsed_completenessl %>%
  filter(publisher == "ICPSR") %>%
  filter(grepl("funder", variable)) %>%
  filter(value == TRUE)

# Funding metadata for exactly those DOIs, for manual inspection.
ICPSR_funderspecific <- all_dois_collapsed %>%
  filter(DOI %in% ICPSR_fundercomplete$DOI) %>%
  select(DOI, publisher, fundingReferences)
# Print a metadata-field by publisher table whose cells read
# "<complete count> (<percent>%)", with rows in `compvarorder` order.
by_publisher_complete_dc %>%
  mutate(
    variable = factor(variable, levels = compvarorder$variable),
    completepercent = paste0(complete, " (", round(percent_complete, digits = 1), "%)")
  ) %>%
  select(variable, publisher, completepercent) %>%
  pivot_wider(names_from = publisher, values_from = completepercent) %>%
  arrange(variable) %>%
  kable()
variable | Dryad | Harvard Dataverse | ICPSR | Institutional Repository | Neotoma Paleoecological Database | Taylor & Francis | Zenodo | figshare |
---|---|---|---|---|---|---|---|---|
pubYear | 2451 (100%) | 999 (100%) | 1549 (100%) | 1943 (100%) | 214 (100%) | 460 (100%) | 6578 (100%) | 2280 (100%) |
name | 2451 (100%) | 999 (100%) | 1549 (100%) | 1914 (98.5%) | 214 (100%) | 460 (100%) | 6578 (100%) | 2280 (100%) |
affiliation | 2451 (100%) | 999 (100%) | 1549 (100%) | 259 (13.3%) | 214 (100%) | 460 (100%) | 6578 (100%) | 2280 (100%) |
relatedIdentifier | 2016 (82.3%) | 615 (61.6%) | 888 (57.3%) | 1212 (62.4%) | 214 (100%) | 460 (100%) | 6550 (99.6%) | 2121 (93%) |
nameIdentifier | 917 (37.4%) | 389 (38.9%) | 0 (0%) | 146 (7.5%) | 0 (0%) | 0 (0%) | 2266 (34.4%) | 815 (35.7%) |
funderName | 933 (38.1%) | 0 (0%) | 1041 (67.2%) | 710 (36.5%) | 0 (0%) | 0 (0%) | 150 (2.3%) | 2 (0.1%) |
awardNumber | 844 (34.4%) | 0 (0%) | 776 (50.1%) | 458 (23.6%) | 0 (0%) | 0 (0%) | 150 (2.3%) | 2 (0.1%) |
funderIdentifier | 878 (35.8%) | 0 (0%) | 0 (0%) | 313 (16.1%) | 0 (0%) | 0 (0%) | 150 (2.3%) | 0 (0%) |
# Write the metadata-field by publisher completeness table to CSV; each cell
# reads "<complete count> (<percent>%)" and rows follow `compvarorder` order.
by_publisher_complete_dc %>%
  mutate(
    variable = factor(variable, levels = compvarorder$variable),
    completepercent = paste0(complete, " (", round(percent_complete, digits = 1), "%)")
  ) %>%
  select(variable, publisher, completepercent) %>%
  pivot_wider(names_from = publisher, values_from = completepercent) %>%
  arrange(variable) %>%
  write.csv(
    file = "data_summary_data/Metadata_completeness_by_Repo.csv",
    row.names = FALSE
  )
# Per-institution completeness of each metadata field, restricted to
# institutional-repository records and leaving Duke out.
by_publisher_complete_ir <- all_dois_collapsed_completenessl %>%
  filter(
    publisher == "Institutional Repository",
    institution != "Duke"
  ) %>%
  group_by(institution, variable) %>%
  summarize(
    complete = sum(value, na.rm = TRUE),
    total = n()
  ) %>%
  mutate(percent_complete = complete / total * 100)
# Rank the metadata fields from most to least complete, using the mean
# percent complete across institutions.
compvarorderIR <- by_publisher_complete_ir %>%
  group_by(variable) %>%
  summarize(avgcomp = mean(percent_complete, na.rm = TRUE)) %>%
  arrange(desc(avgcomp))
# Plot colour for each institution, keyed by institution name.
instcolors <- c(
  Cornell         = "#B31B1B",
  Duke            = "#00539B",
  Michigan        = "#FFCB05", # alternative: #00274C
  Minnesota       = "#7a0019",
  `Virginia Tech` = "#E87722",
  WashU           = "#6c7373"
)
Combined plot for IRs
# Combined completeness plot for the institutional repositories: one jittered
# line per institution across the metadata fields, ordered by compvarorderIR.
completeIRpub <- by_publisher_complete_ir %>%
  mutate(variable = factor(variable, levels = compvarorderIR$variable)) %>%
  ggplot(aes(x = variable, y = percent_complete, group = institution)) +
  geom_line(
    aes(color = institution),
    position = position_jitter(height = 1, width = .1),
    linewidth = 1
  ) +
  scale_color_manual(values = instcolors, name = "Institutional Repository") +
  labs(x = "DataCite Metadata Field", y = "Percent Records Complete") +
  theme_bw() +
  guides(color = guide_legend(nrow = 2, title.position = "top")) +
  theme(
    legend.position = "bottom",
    legend.title.align = .5,
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5)
  )
completeIRpub

# Save the institutional-repository plot built above. The previous call passed
# `completepub`, which wrote the all-publisher figure under the IR file name.
ggsave(
  plot = completeIRpub,
  filename = "figures/CompletenessData_IRDatacite.png",
  width = 10,
  height = 5.25,
  units = "in"
)
NOTE: Duke's metadata came from CrossRef rather than DataCite, so Duke is excluded from this plot.
Write out CSV files for each institution:
# Export two CSV files per institution: the full DOI table and the collapsed
# DOI table. File names are "<prefix><institution><YYYYMMDD>.csv".
# The date stamp is computed once so every file written in this run carries
# the same date, even if the loop happens to run across midnight.
export_date_stamp <- format(Sys.Date(), "%Y%m%d")

for (i in unique(all_dois$institution)) {
  all_dois %>%
    filter(institution == i) %>%
    write.csv(
      file = paste0("data_all_dois/All_dois_", i, export_date_stamp, ".csv"),
      row.names = FALSE
    )

  all_dois_collapsed %>%
    filter(institution == i) %>%
    write.csv(
      file = paste0("data_all_dois/All_dois_collapsed_", i, export_date_stamp, ".csv"),
      row.names = FALSE
    )
}