This document contains the codes that generate the data analysis results and figures presented in the manuscript “Meta-research: The changing career paths of PhDs and postdocs trained at EMBL” (previously published on BioRXiv as “PhD and postdoc training outcomes at EMBL: changing career paths for life scientists in Europe”).
library(gridExtra)
library(cowplot)
library(ggbeeswarm)
library(Hmisc)
library(ggpubr)
library(reshape2)
library(psych)
library(plotly)
library(orca)
library(RColorBrewer)
library(survival)
library(glmnet)
library(survminer)
library(tidyverse)
library(reticulate)
#load helper functions
source("code/utils.R")
knitr::opts_chunk$set(fig.path = "./fig/", dev = c("png","pdf"))
#silent warnings and messages
knitr::opts_chunk$set(warning = FALSE, message = FALSE, autodep = TRUE)
Due to date protection, the original dataset will not be included in the public version.
#original data table
originalData = read_csv("data/datafinal_forR_050222.csv")
Years as integer
careerData <- mutate_at(originalData,
vars(`from.year`, to_year,phd_year_if_known),
as.integer) %>% rename(from_year = `from.year`)
Calculate Year at EMBL
careerData <- mutate(careerData, year_at_EMBL = to_year - from_year)
Change unknown to NA in all character columns
subStr <- function(x) {
x[x %in% c("unknown","Unknown")] <- NA
x
}
careerData <- mutate_if(careerData, is.character, subStr)
Change CV_wasNonR to CV_wasNonRes
careerData <- dplyr::rename(careerData, CV_wasNonRes = CV_wasNonR)
Change PhDtoGL to PhDtoPI
careerData <- dplyr::rename(careerData, PhDtoPI = PhDtoGL)
tempdata <- originalData
tempdata$timepointLast <- as.character(tempdata$timepointLast)
tempdata$timepointLast <- gsub('AcFac', 'AcPI', tempdata$timepointLast)
tempdata$type_pre_postdoc <- gsub('predoc', 'PhD', tempdata$type_pre_postdoc)
tempdata$timepointLast <- factor(tempdata$timepointLast,
levels = c("unknown", "NonSci", "NonRes", "IndR", "AcPD", "AcOt", "AcPI"))
tempdata$gender <- as.factor(tempdata$gender)
predoc <- tempdata[ which(tempdata$type_pre_postdoc=='PhD'),]
postdoc <- tempdata[ which(tempdata$type_pre_postdoc=='postdoc'),]
tempdata$type_pre_postdoc <- factor(tempdata$type_pre_postdoc, levels = c("PhD", "postdoc"))
##change colours###
mycols<-setNames(c("#D0D0CE","#E2E868","#F4C61F","#F49E17","#D41645","#734595","#3B6FB6"),
c('unknown','NonSci','NonRes','IndR','AcPD','AcOt', 'AcPI'))
#postdoc_2021
mydata_type <- tempdata %>%
dplyr::group_by(type_pre_postdoc, timepointLast) %>%
dplyr::summarise(count = n())
ggplot(mydata_type, aes(fill=timepointLast, y=count, x=type_pre_postdoc),col=mycols) +
geom_bar( stat="identity", position="fill") +
scale_fill_manual(values=mycols) +
labs(x = "Role at EMBL", y="Percentage in each role") +
ggtitle("Current role in 2021")+ theme_minimal(base_size = 16) + theme(axis.title=element_text(size=14)) + theme(plot.title = element_text(size=14), axis.text.x = element_text(size = 12,angle=45, hjust=1))+
theme(axis.title=element_text(size=14)) + theme(plot.title=element_text(size=14))+
theme(legend.key.height = unit(1, 'lines'),legend.key.width = unit(0.4, 'lines'),legend.title = element_text(size=14), legend.text = element_text(size=12)) + scale_y_continuous(labels = scales::label_percent(accuracy = 1L))
write.csv(mydata_type, "source1A.csv")
mydata2 <- tempdata
mydata2$cohort <- factor(mydata2$cohort, levels = c("1997-2004","2005-2012","2013-2020"))
mydata2$position_1 <- as.character(mydata2$position_1)
mydata2$position_5 <- as.character(mydata2$position_5)
mydata2$position_9 <- as.character(mydata2$position_9)
mydata2$position_13 <- as.character(mydata2$position_13)
mydata2$position_17 <- as.character(mydata2$position_17)
mydata2$timepointLast <- as.character(mydata2$timepointLast)
mydata2$position_1 <- gsub('laterPI', 'unknown', mydata2$position_1)
mydata2$position_5 <- gsub('laterPI', 'unknown', mydata2$position_5)
mydata2$position_9 <- gsub('laterPI', 'unknown', mydata2$position_9)
mydata2$position_13 <- gsub('laterPI', 'unknown', mydata2$position_13)
mydata2$position_17 <- gsub('laterPI', 'unknown', mydata2$position_17)
mydata2$timepointLast <- gsub('laterPI', 'unknown', mydata2$timepointLast)
mydata2$position_17[is.na(mydata2$position_17)]<-'timepoint not reached'
mydata2$position_13[is.na(mydata2$position_13)]<-'timepoint not reached'
mydata2$position_9[is.na(mydata2$position_9)]<-'timepoint not reached'
mydata2$position_5[is.na(mydata2$position_5)]<-'timepoint not reached'
mydata2$position_1[is.na(mydata2$position_1)]<-'timepoint not reached'
mydata2$position_1 <- gsub('AcFac', 'AcPI', mydata2$position_1)
mydata2$position_5 <- gsub('AcFac', 'AcPI', mydata2$position_5)
mydata2$position_9 <- gsub('AcFac', 'AcPI', mydata2$position_9)
mydata2$position_13 <- gsub('AcFac', 'AcPI', mydata2$position_13)
mydata2$position_17 <- gsub('AcFac', 'AcPI', mydata2$position_17)
mydata2$timepointLast <- gsub('AcFac', 'AcPI', mydata2$timepointLast)
mydata2$position_1 <- factor(mydata2$position_1, levels = c("unknown","NonSci", "NonRes", "IndR", "AcPD", "AcOt", "AcPI"))
mydata2$position_5 <- factor(mydata2$position_5, levels = c('timepoint not reached',"unknown","NonSci", "NonRes", "IndR", "AcPD", "AcOt", "AcPI"))
mydata2$position_9 <- factor(mydata2$position_9, levels = c('timepoint not reached',"unknown","NonSci", "NonRes", "IndR", "AcPD", "AcOt", "AcPI"))
mydata2$position_13 <- factor(mydata2$position_13, levels = c('timepoint not reached',"unknown","NonSci", "NonRes", "IndR", "AcPD", "AcOt", "AcPI"))
mydata2$position_17 <- factor(mydata2$position_17, levels = c('timepoint not reached',"unknown","NonSci", "NonRes", "IndR", "AcPD", "AcOt", "AcPI"))
mydata2$timepointLast <- factor(mydata2$timepointLast, levels = c("unknown","NonSci", "NonRes", "IndR", "AcPD", "AcOt", "AcPI"))
predoc <- mydata2[ which(mydata2$type_pre_postdoc=='PhD'),]
postdoc <- mydata2[ which(mydata2$type_pre_postdoc=='postdoc'),]
##change colours###
mycols_pos1<-c("#D0D0CE", "#E2E868","#F4C61F","#F49E17","#D41645","#734595","#3B6FB6")
names(mycols_pos1)=c('unknown', 'NonSci','NonRes','IndR','AcPD','AcOt','AcPI')
mycols<-c("grey90","#D0D0CE", "#E2E868","#F4C61F","#F49E17","#D41645","#734595","#3B6FB6")
names(mycols)=c('timepoint not reached','unknown','NonSci','NonRes','IndR','AcPD','AcOt', 'AcPI')
#postdoc_5
postdoc_cohort <- postdoc %>%
dplyr::group_by(cohort, position_5) %>%
dplyr::summarise(count = n())
write_csv(postdoc_cohort, "Source_1B_PD.csv")
bp <- ggplot(postdoc_cohort[which(postdoc_cohort$position_5!='timepoint not reached'),],
aes(fill=position_5, y=count, x=cohort),col=mycols) +
geom_bar( stat="identity", position="fill") +
scale_fill_manual(values=mycols) +
labs(x = "cohort", y="Percentage in each role") +
ggtitle("Postdoc alumni,
5 yrs after EMBL postdoc ended")+ theme_minimal(base_size = 16) + theme(axis.title=element_text(size=12)) + theme(plot.title = element_text(size=10), axis.text.x = element_text(size = 12,angle=45, hjust=1))+
theme(legend.key.height = unit(1, 'lines'),legend.key.width = unit(0.4, 'lines'),legend.title = element_text(size=12), legend.text = element_text(size=12)) + scale_y_continuous(labels = scales::label_percent(accuracy = 1L))
#predoc_5
predoc_cohort <- predoc %>%
dplyr::group_by(cohort, position_5) %>%
dplyr::summarise(count = n())
write_csv(predoc_cohort, "Source_1B_PhD.csv")
b <- ggplot(predoc_cohort[which(predoc_cohort$position_5!='timepoint not reached'),],
aes(fill=position_5, y=count, x=cohort),col=mycols) +
geom_bar(stat="identity", position="fill") +
scale_fill_manual(values=mycols) + labs(x = "cohort", y="Percentage in each role") +
ggtitle("PhD alumni,
5 yrs after PhD defense")+ theme_minimal(base_size = 16) + theme(axis.title=element_text(size=12)) + theme(plot.title = element_text(size=10), axis.text.x = element_text(size = 12,angle=45, hjust=1))+
theme(legend.key.height = unit(1, 'lines'),legend.key.width = unit(0.4, 'lines'),legend.title = element_text(size=12), legend.text = element_text(size=12)) + scale_y_continuous(labels = scales::label_percent(accuracy = 1L))
#extract legend
g_legend<-function(a.gplot){
tmp <- ggplot_gtable(ggplot_build(a.gplot))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
legend <- tmp$grobs[[leg]]
return(legend)}
for_legend <-bp+theme(legend.position="right")
mylegend<-g_legend(for_legend)
grid.arrange(arrangeGrob(b + theme(legend.position="none"),
bp + theme(legend.position="none"),
nrow=1,widths=c(4,4)))
#postdoc_1
postdoc_cohort <- postdoc %>%
dplyr::group_by(cohort, position_1) %>%
dplyr::summarise(count = n())
write_csv(postdoc_cohort, "Source_F1S2B.csv")
ap <- ggplot(postdoc_cohort,
aes(fill=position_1, y=count, x=cohort),col=mycols_pos1) +
geom_bar( stat="identity", position="fill") +
scale_fill_manual(values=mycols_pos1) +
labs(x = "cohort", y="Percentage in each role") +
ggtitle("Postdocs, 1 year after EMBL postdoc ended") +
theme_minimal(base_size = 20)+ theme(plot.title = element_text(size=16)) + theme(axis.title=element_text(size=16)) + scale_y_continuous(labels = scales::label_percent(accuracy = 1L))
#postdoc_5
postdoc_cohort <- postdoc %>%
dplyr::group_by(cohort, position_5) %>%
dplyr::summarise(count = n())
write_csv(postdoc_cohort, "Source_F1S2D.csv")
bp <- ggplot(postdoc_cohort[which(postdoc_cohort$position_5!="timepoint not reached"),],
aes(fill=position_5, y=count, x=cohort),col=mycols) +
geom_bar( stat="identity", position="fill") +
scale_fill_manual(values=mycols) +
labs(x = "cohort", y="Percentage in each role") +
ggtitle("Postdocs, 5 years after EMBL postdoc ended") +
theme_minimal(base_size = 20)+ theme(plot.title = element_text(size=16))+ theme(axis.title=element_text(size=16)) + scale_y_continuous(labels = scales::label_percent(accuracy = 1L))
#postdoc_9
postdoc_cohort <- postdoc%>%
dplyr::group_by(cohort, position_9) %>%
dplyr::summarise(count = n())
write_csv(postdoc_cohort, "Source_F1S2F.csv")
cp<-ggplot(postdoc_cohort[which(postdoc_cohort$position_9!="timepoint not reached"),],
aes(fill=position_9, y=count, x=cohort),col=mycols) +
geom_bar(stat="identity", position="fill") +
scale_fill_manual(values=mycols) +
labs(x = "cohort", y="Percentage in each role") +
ggtitle("Postdocs, 9 years after EMBL postdoc ended") +
theme_minimal(base_size = 20)+ theme(plot.title = element_text(size=16))+ theme(axis.title=element_text(size=16)) + scale_y_continuous(labels = scales::label_percent(accuracy = 1L))
#postdoc_13
postdoc_cohort <- postdoc %>%
dplyr::group_by(cohort, position_13) %>%
dplyr::summarise(count = n())
write_csv(postdoc_cohort, "Source_F1S2H.csv")
dp <- ggplot(postdoc_cohort[which(postdoc_cohort$position_13!="timepoint not reached"),],
aes(fill=position_13, y=count, x=cohort),col=mycols) +
geom_bar( stat="identity", position="fill") +
scale_fill_manual(values=mycols) +
labs(x = "cohort", y="Percentage in each role") +
ggtitle("Postdocs, 13 years after EMBL postdoc ended") +
theme_minimal(base_size = 20)+ theme(plot.title = element_text(size=16))+ theme(axis.title=element_text(size=16)) + scale_y_continuous(labels = scales::label_percent(accuracy = 1L))
#postdoc_17
postdoc_cohort <- postdoc %>%
dplyr::group_by(cohort, position_17) %>%
dplyr::summarise(count = n())
write_csv(postdoc_cohort, "Source_F1S2J.csv")
ep <- ggplot(postdoc_cohort[which(postdoc_cohort$position_17!="timepoint not reached"),],
aes(fill=position_17, y=count, x=cohort),col=mycols) +
geom_bar( stat="identity", position="fill") + scale_fill_manual(values=c(mycols)) +
labs(x = "cohort", y="Percentage in each role") +
ggtitle("Postdocs, 17 years after EMBL postdoc ended")+
theme_minimal(base_size = 20)+ theme(plot.title = element_text(size=16))+ theme(axis.title=element_text(size=16)) + scale_y_continuous(labels = scales::label_percent(accuracy = 1L))
#predoc_1
predoc_cohort <- predoc %>%
dplyr::group_by(cohort, position_1) %>%
dplyr::summarise(count = n())
write_csv(predoc_cohort, "Source_F1S2A.csv")
a <- ggplot(predoc_cohort[which(predoc_cohort$position_1!="timepoint not reached"),],
aes(fill=position_1, y=count, x=cohort),col=mycols_pos1) +
geom_bar(stat="identity", position="fill") + scale_fill_manual(values=mycols_pos1) +
labs(x = "cohort", y="Percentage in each role") + ggtitle("PhD students, 1 year after PhD defence") +
theme_minimal(base_size = 20)+ theme(plot.title = element_text(size=16))+ theme(axis.title=element_text(size=16)) + scale_y_continuous(labels = scales::label_percent(accuracy = 1L))
#predoc_5
predoc_cohort <- predoc %>%
dplyr::group_by(cohort, position_5) %>%
dplyr::summarise(count = n())
write_csv(predoc_cohort, "Source_F1S2C.csv")
b <- ggplot(predoc_cohort[which(predoc_cohort$position_5!="timepoint not reached"),],
aes(fill=position_5, y=count, x=cohort),col=mycols) +
geom_bar(stat="identity", position="fill") + scale_fill_manual(values=mycols) +
labs(x = "cohort", y="Percentage in each role") + ggtitle("PhD students, 5 years after PhD defence") +
theme_minimal(base_size = 20)+ theme(plot.title = element_text(size=16))+ theme(axis.title=element_text(size=16)) + scale_y_continuous(labels = scales::label_percent(accuracy = 1L))
#predoc_9
predoc_cohort <- predoc %>%
dplyr::group_by(cohort, position_9) %>%
dplyr::summarise(count = n())
write_csv(predoc_cohort, "Source_F1S2E.csv")
c <- ggplot(predoc_cohort[which(predoc_cohort$position_9!="timepoint not reached"),],
aes(fill=position_9, y=count, x=cohort),col=mycols) +
geom_bar( stat="identity", position="fill") + scale_fill_manual(values=mycols) +
labs(x = "cohort", y="Percentage in each role") +
ggtitle("PhD students, 9 years after PhD defence") +
theme_minimal(base_size = 20)+ theme(plot.title = element_text(size=16))+ theme(axis.title=element_text(size=16)) + scale_y_continuous(labels = scales::label_percent(accuracy = 1L))
#predoc_13
predoc_cohort <- predoc %>%
dplyr::group_by(cohort, position_13) %>%
dplyr::summarise(count = n())
write_csv(predoc_cohort, "Source_F1S2G.csv")
d <- ggplot(predoc_cohort[which(predoc_cohort$position_13!="timepoint not reached"),],
aes(fill=position_13, y=count, x=cohort),col=mycols) +
geom_bar(stat="identity", position="fill") + scale_fill_manual(values=mycols) +
labs(x = "cohort", y="Percentage in each role") + ggtitle("PhD students, 13 years after PhD defence") +
theme_minimal(base_size = 20)+ theme(plot.title = element_text(size=16))+ theme(axis.title=element_text(size=16)) + scale_y_continuous(labels = scales::label_percent(accuracy = 1L))
#predoc_17
predoc_cohort <- predoc %>%
dplyr::group_by(cohort, position_17) %>%
dplyr::summarise(count = n())
write_csv(predoc_cohort, "Source_F1S2I.csv")
e <- ggplot(predoc_cohort[which(predoc_cohort$position_17!="timepoint not reached"),],
aes(fill=position_17, y=count, x=cohort),col=mycols) +
geom_bar( stat="identity", position="fill") + scale_fill_manual(values=c(mycols)) +
labs(x = "cohort", y="Percentage in each role") + ggtitle("PhD students, 17 years after PhD defence") +
theme_minimal(base_size = 20)+ theme(plot.title = element_text(size=16))+ theme(axis.title=element_text(size=12)) + scale_y_continuous(labels = scales::label_percent(accuracy = 1L))
#combine the figures
grid.arrange(arrangeGrob(a + theme(legend.position="none"),
ap + theme(legend.position="none"),
b + theme(legend.position="none"),
bp + theme(legend.position="none"),
c + theme(legend.position="none"),
cp + theme(legend.position="none"),
d + theme(legend.position="none"),
dp + theme(legend.position="none"),
e + theme(legend.position="none"),
ep + theme(legend.position="none") , nrow=5,ncol=2))
#load data file with values
otherCohort = read_csv("data/in_year_all.csv")
my_col <- c("grey40","#3B6FB6")
mydata <- melt(otherCohort[which (otherCohort$institution=="Stanford University (SU) Bioscience | 2018 dataset."),], id=c("institution","Cohorts","Cohort_short","ratio", "ratio_text"))
mydata$variable <- factor(mydata$variable , levels = c("EMBL", "NorthAmerica"))
levels(mydata$variable) <- c("EMBL", "Stanford University (SU) Bioscience | 2018 dataset.")
plot_b<-ggplot(mydata, aes(x=Cohorts, y=value, fill=variable)) +geom_bar(position="dodge",stat="identity") +theme_bw(base_size = 16) + theme(legend.position="top")+
guides(fill = guide_legend(nrow = 2, byrow = TRUE, title.position="top")) +labs(x = "cohort", y="Alumni in PI-like roles
(Percentage)")+ scale_fill_manual(values=c("#3B6FB6","grey40"),name = "Institution:")+ theme(legend.key.height = unit(1, 'lines'),legend.title = element_text(size=11), legend.text = element_text(size=11))+ scale_y_continuous(labels = scales::label_percent(accuracy = 1L),limits=c(0,0.5))
plot_d<-ggplot(mydata[ which(mydata$variable=='EMBL'),], aes(x=Cohort_short, y=ratio)) +geom_point() +
theme_bw(base_size = 10)+geom_hline(yintercept = 1, linetype=3)+
theme( axis.text.x = element_text(size = 8),plot.background = element_rect(fill = "grey90"))+labs(x = "cohort", y="EMBL / SU")+ scale_y_continuous(breaks=seq(0,2,1))+ coord_cartesian(ylim=c(-.2, 2.2))
mydata_plot <- ggdraw() +
draw_plot(plot_b) +
draw_plot(plot_d, x = 0.75, y = .4, width= 0.22, height = .3)
mydata_plot
institions<-sort(unique(otherCohort$institution[otherCohort$institution!="Stanford University (SU) Bioscience | 2018 dataset."]))
pList <- lapply(institions, function(i) {
mydata <- melt(otherCohort[which (otherCohort$institution==i),], id=c("institution","Cohorts","Cohort_short","ratio","ratio_text"))
mydata$variable <- factor(mydata$variable , levels = c("EMBL", "NorthAmerica"))
yaxistext<-unique(mydata$ratio_text)
levels(mydata$variable) <- c("EMBL", i)
plot_b<-ggplot(mydata, aes(x=Cohorts, y=value, fill=variable)) +geom_bar(position="dodge",stat="identity") +theme_bw(base_size = 22)+ylim(0,0.5) +
theme(legend.position="top")+
guides(fill = guide_legend(nrow = 2, byrow = TRUE, title.position="left")) +labs(x = "cohort", y="Alumni in PI-like roles
(Percentage)")+ scale_fill_manual(values=c("#3B6FB6","grey40"),name = "Institution:")+ scale_y_continuous(labels = scales::label_percent(accuracy = 1L),limits=c(0,0.5))
plot_d<-ggplot(mydata[ which(mydata$variable=='EMBL'),], aes(x=Cohort_short, y=ratio)) +geom_point() +
theme_bw(base_size = 16)+geom_hline(yintercept = 1, linetype=3)+
theme( axis.text.x = element_text(size = 14),plot.background = element_rect(fill = "grey90"))+labs(x = "cohort", y=yaxistext)+ scale_y_continuous(breaks=seq(0,2,1))+ coord_cartesian(ylim=c(-.2, 2.2))
width_count<-0.05+(0.075*length(unique(mydata$Cohort_short)))
mydata_plot <- ggdraw() +
draw_plot(plot_b) +
draw_plot(plot_d, x = 0.7, y = .4, width = width_count, height = .3)
mydata_plot
})
plot_grid(plotlist = pList, ncol=2)
#configuration of python environment
use_condaenv("r-reticulate")
colList <- list(NonSci = "#E2E868", NonRes = "#F4C61F",IndR = "#F49E17",AcPD = "#D41645", AcPhD = "#707372", AcOt = "#734595", AcPI = "#3B6FB6")
dummyTab <- data.frame(var = names(colList), val = 1)
g <- ggplot(dummyTab, aes(x=var, y=val, fill = var )) + geom_bar(stat="identity") +
scale_fill_manual(values = colList, name = "role")
legendPlot <- cowplot::get_legend(g)
plot_grid(legendPlot)
Function for producing Sankey diagram for career path
plotSankey <- function(tempdata, role) {
rolePrev <- list(was_GL = "new_AcFac_previous",
CV_wasNonSci = "new_NonSci_previous",
CV_wasIndR = "new_IndR_previous",
CV_wasNonR = "new_SciR_previous",
CV_wasAcOt = "new_AcOt_previous")
#first filter for alumni with full CV who became PIs
Type_data <- tempdata[which(tempdata$completeness=='Complete CV'),]
Type_data <- Type_data[which(Type_data[[role]]=='y'),]
role_code <- rolePrev[[role]]
#create summary tables for previous and current roles and swap names for nodes
Type_count <- Type_data %>% dplyr::group_by_at(rolePrev[[role]]) %>% dplyr::summarize(count = n())
Type_count <- add_column(Type_count, target = paste0(role_code), .after = 1)
Type_count<-as.data.frame(Type_count)
names(Type_count)[1]<- 'source'
names(Type_count)[3]<- "value"
Type_count$source = gsub("AcFac",0,Type_count$source)
Type_count$source = gsub("AcOt",1,Type_count$source)
Type_count$source = gsub("AcPD",2,Type_count$source)
Type_count$source = gsub("AcPhD",3,Type_count$source)
Type_count$source = gsub("IndR",4,Type_count$source)
Type_count$source = gsub("NonRes",5,Type_count$source)
Type_count$source = gsub("NonSci",6,Type_count$source)
Type_count$target = gsub("new_AcFac_previous",7,Type_count$target)
Type_count$target = gsub("new_AcOt_previous",8,Type_count$target)
Type_count$target = gsub("new_IndR_previous",11,Type_count$target)
Type_count$target = gsub("new_SciR_previous",12,Type_count$target)
Type_count$target = gsub("new_NonSci_previous",13,Type_count$target)
Type_now <- Type_data %>%
dplyr::group_by(timepointLast) %>%
dplyr::summarize(count = n())
Type_now <- add_column(Type_now, source = paste0(role_code), .before = 1)
Type_now<-as.data.frame(Type_now)
names(Type_now)[2]<- 'target'
names(Type_now)[3]<- "value"
Type_now$target = gsub("AcFac",14,Type_now$target)
Type_now$target = gsub("AcOt",15,Type_now$target)
Type_now$target = gsub("AcPD",16,Type_now$target)
Type_now$target = gsub("AcPhD",17,Type_now$target)
Type_now$target = gsub("IndR",18,Type_now$target)
Type_now$target = gsub("NonRes",19,Type_now$target)
Type_now$target = gsub("NonSci",20,Type_now$target)
Type_now$source = gsub("new_AcFac_previous",7,Type_now$source)
Type_now$source = gsub("new_AcOt_previous",8,Type_now$source)
Type_now$source = gsub("new_IndR_previous",11,Type_now$source)
Type_now$source = gsub("new_SciR_previous",12,Type_now$source)
Type_now$source = gsub("new_NonSci_previous",13,Type_now$source)
#merge and plot
links <- rbind(Type_count, Type_now)
write.csv(links, paste0("Source_F1SF1_",rolePrev[[role]],".csv"))
p <- plot_ly(
type = "sankey",
orientation = "h",
node = list(
label = c("","", "", "","","","",
"","","","","","","",
"", "","","","","",""),
color = c(colList$AcPI, colList$AcOt, colList$AcPD, colList$AcPhD, colList$IndR, colList$NonRes, colList$NonSci,
colList$AcPI, colList$AcOt, colList$AcPD, colList$AcPhD, colList$IndR, colList$NonRes, colList$NonSci,
colList$AcPI, colList$AcOt, colList$AcPD, colList$AcPhD, colList$IndR, colList$NonRes, colList$NonSci),
pad = 25,
thickness = 75, width=900,
line = list(
color = "black",
width = 0
)
),
link =links
)
p
}
plotSankey(originalData, "was_GL")
plotSankey(originalData, "CV_wasAcOt")
plotSankey(originalData, "CV_wasIndR")
plotSankey(originalData, "CV_wasNonR")
plotSankey(originalData, "CV_wasNonSci")
The text labels in this supplementary figure are added manually
corhortOut <- plotCohort(careerData, "AcPI", showTable = FALSE, maxY=0.5, legendPos = "bottom")
## [1] "Number of subjects with negative time: 2"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2138"
## [1] "Number of events (being found in a position): 539"
## [1] "Total subject number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 933 1205
## [1] "Event number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 196 343
corhortOut$plot
tab1_co <- corhortOut$table %>% mutate(Position = "AcPI")
write.csv(tab1_co,"Figure 2A - Source Data 1.csv")
corhortOut <- plotCohort(careerData, "AcOt", showTable = FALSE, maxY = 0.5, legendPos = "bottom")
## [1] "Number of subjects with negative time: 4"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2094"
## [1] "Number of events (being found in a position): 477"
## [1] "Total subject number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 928 1166
## [1] "Event number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 148 329
corhortOut$plot
tab2_co <- corhortOut$table %>% mutate(Position = "AcOt")
write.csv(tab2_co,"Figure 2B - Source Data 1.csv")
corhortOut <- plotCohort(careerData, "IndR", showTable = FALSE, maxY=0.5, legendPos = "top")
## [1] "Number of subjects with negative time: 9"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2225"
## [1] "Number of events (being found in a position): 414"
## [1] "Total subject number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 948 1277
## [1] "Event number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 196 218
corhortOut$plot
tab3_co <- corhortOut$table %>% mutate(Position = "IndR")
write.csv(tab3_co,"Figure 2C - Source Data 1.csv")
corhortOut <- plotCohort(careerData, "NonRes", showTable = FALSE, maxY=0.5, legendPos = "top")
## [1] "Number of subjects with negative time: 3"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2221"
## [1] "Number of events (being found in a position): 364"
## [1] "Total subject number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 944 1277
## [1] "Event number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 194 170
corhortOut$plot
tab4_co <- corhortOut$table %>% mutate(Position = "NonRes")
write.csv(tab4_co,"Figure 2D - Source Data 1.csv")
corhortOut <- plotCohort(careerData, "NonSci", showTable = FALSE, maxY=0.5, legendPos = "top")
## [1] "Number of subjects with negative time: 6"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2253"
## [1] "Number of events (being found in a position): 131"
## [1] "Total subject number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 961 1292
## [1] "Event number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 81 50
corhortOut$plot
tab5_co <- corhortOut$table %>% mutate(Position = "NonSci")
write.csv(tab5_co,"Figure 2E - Source Data 1.csv")
##PhD to PI length by cohort
Box-plot with overlaid dot-plot showing the distribution in calendar years between PhD conferral and start of first PI role for PhD alumni who defended in 1997-2012 who became a principal investigator within 9-years, and for whom we have a detailed career path (n=157).
mydatatime <- as.data.frame(tempdata)
mydatatime <- mydatatime[ which(mydatatime$completeness=="Complete CV" ), ]
predoc_data <- mydatatime[ which(mydatatime$type_pre_postdoc=='PhD'), ]
predoc_to2012 <- predoc_data[ which(predoc_data$to_year<2013), ]
predoc_to2012_exit_GL <-predoc_to2012[ which(predoc_to2012$PhDtoGL<10& predoc_to2012$completeness=="Complete CV" ), ]
predoc_to2012_exit_GL$cohort <- as.factor(predoc_to2012_exit_GL$cohort)
stat <- t.test(predoc_to2012_exit_GL$PhDtoGL~predoc_to2012_exit_GL$cohort, var.equal = FALSE)[["p.value"]]
f10 <- -log10(stat)
fstat <- round (stat, f10 +3)
fstat[stat<2.2e-16] <- "<2.2e-16"
fstat <- paste("p=", fstat, " ", if (stat<0.05) {"*"},if (stat<0.001) {"*"},if (stat<0.0001) {"*"}, sep="")
fstat <- toString(fstat)
p <- ggboxplot(predoc_to2012_exit_GL, x="cohort", y="PhDtoGL", fill='#ffffff', color="grey58")
p <- p + geom_dotplot(aes(predoc_to2012_exit_GL$cohort, predoc_to2012_exit_GL$PhDtoGL),
binaxis='y', stackdir='center',stackratio=1.0, dotsize=0.25) +
stat_summary(fun.y = "mean", geom = "point", color = "RED", shape=4, size=10) + labs(x="PhD cohort", y = "Calendar years, PhD to first PI role") +
theme_classic(base_size = 16) + geom_hline(yintercept = 9,linetype="dashed")+ scale_y_continuous(breaks=seq(0,9,3))+coord_cartesian(ylim=c(0, 10))+ggtitle(fstat)+theme(plot.title = element_text(size=12))
p
source_temp <- predoc_to2012_exit_GL %>%
dplyr::group_by(cohort, PhDtoGL) %>%
dplyr::summarise(count = n())
write_csv(source_temp, "Figure2_fs1_1A_sourcedata.csv")
Box-plot with overlaid dot-plot showing the distribution in calendar years between completion of an EMBL postdoc and start of first PI role for postdoc alumni who completed their postdoc in 1997-2012 who became a principal investigator within 9-years, and for whom we have a detailed career path (n=218).
# postdoc by cohort
postdoc_data <- mydatatime[ which(mydatatime$type_pre_postdoc=='postdoc'), ]
data_sum <- postdoc_data %>% dplyr::group_by(cohort,PhDtoGL) %>% dplyr::summarise(n = n())
postdoc_to2012 <- postdoc_data[ which(postdoc_data$to_year<2013), ]
postdoc_to2012_exit_GL <- postdoc_to2012[ which(postdoc_to2012$PhDtoGL<10 & postdoc_to2012$completeness=="Complete CV" ), ]
postdoc_to2012_exit_GL$cohort <- as.factor(postdoc_to2012_exit_GL$cohort)
postdoc_to2012_exit_GLEMBL <- postdoc_to2012[ which(postdoc_to2012$EMBLtoPI<10 & postdoc_to2012$EMBLtoPI>-1 & postdoc_to2012$completeness=="Complete CV" ), ]
postdoc_to2012_exit_GLEMBL$cohort <- as.factor(postdoc_to2012_exit_GLEMBL$cohort)
stat <- t.test(postdoc_to2012_exit_GLEMBL$EMBLtoPI~postdoc_to2012_exit_GLEMBL$cohort, var.equal = FALSE)[["p.value"]]
f10 <- -log10(stat)
fstat <- round (stat, f10 +3)
fstat[stat<2.2e-16] <- "<2.2e-16"
fstat <- paste("p=", fstat, " ", if (stat<0.05) {"*"},if (stat<0.001) {"*"},if (stat<0.0001) {"*"}, sep="")
fstat <- toString(fstat)
p <- ggboxplot(postdoc_to2012_exit_GLEMBL, x="cohort", y="EMBLtoPI", fill='#ffffff', color="grey58")
p <- p + geom_dotplot(aes(postdoc_to2012_exit_GLEMBL$cohort, postdoc_to2012_exit_GLEMBL$EMBLtoPI),
binaxis='y', stackdir='center',stackratio=1.0, dotsize=0.25) +
stat_summary(fun.y = "mean", geom = "point", color = "RED", shape=4, size=10) + labs(x="Postdoc cohort", y = "Calendar years, EMBL to first PI role") +
theme_classic(base_size = 16) + geom_hline(yintercept = 9,linetype="dashed")+ scale_y_continuous(breaks=seq(0,9,3))+coord_cartesian(ylim=c(0, 10))+ggtitle(fstat)+theme(plot.title = element_text(size=12))
p
source_temp <- postdoc_to2012_exit_GLEMBL %>%
dplyr::group_by(cohort, EMBLtoPI) %>%
dplyr::summarise(count = n())
write_csv(source_temp, "Figure2_fs1_1B_sourcedata.csv")
Same as above but for postdoc alumni for whom we were able to identify a PhD conferral year
stat <- t.test(postdoc_to2012_exit_GL$PhDtoGL~postdoc_to2012_exit_GL$cohort, var.equal = FALSE)[["p.value"]]
f10 <- -log10(stat)
fstat <- round (stat, f10 +3)
fstat[stat<2.2e-16] <- "<2.2e-16"
fstat <- paste("p=", fstat, " ", if (stat<0.05) {"*"},if (stat<0.001) {"*"},if (stat<0.0001) {"*"}, sep="")
fstat <- toString(fstat)
p <- ggboxplot(postdoc_to2012_exit_GL, x="cohort", y="PhDtoGL", fill='#ffffff', color="grey58")
p <- p + geom_dotplot(aes(postdoc_to2012_exit_GL$cohort, postdoc_to2012_exit_GL$PhDtoGL),
binaxis='y', stackdir='center',stackratio=1.0, dotsize=0.25)+
stat_summary(fun.y = "mean", geom = "point", color = "RED", shape=4, size=10) + labs(x="Postdoc cohort", y = "Calendar years, PhD to first PI role") +
theme_classic(base_size = 16)+geom_hline(yintercept = 9,linetype="dashed")+ scale_y_continuous(breaks=seq(0,9,3))+coord_cartesian(ylim=c(0, 10))+ggtitle(fstat)+theme(plot.title = element_text(size=12))
p
source_temp <- postdoc_to2012_exit_GL %>%
dplyr::group_by(cohort, PhDtoGL) %>%
dplyr::summarise(count = n())
write_csv(source_temp, "Figure2_fs1_1C_sourcedata.csv")
source_temp <- tempdata %>%
dplyr::group_by(type_pre_postdoc, gender, timepointLast) %>%
dplyr::summarise(count = n())
write_csv(source_temp, "Figure3A_sourcedata.csv")
#predoc
predoc_cohort <- tempdata[ which(mydatatime$type_pre_postdoc=="PhD" ), ] %>%
dplyr::group_by(gender, timepointLast) %>%
dplyr::summarize(count = n())
a <- ggplot(predoc_cohort, aes(fill=timepointLast, y=count, x=gender),col=mycols_pos1) +
geom_bar( stat="identity", position="fill") + scale_fill_manual(values=c(mycols_pos1)) +
labs(x = "Gender", y="Percentage in each role") +
theme_minimal(base_size = 16)+ ggtitle("PhD students,
current role") +
theme(axis.title=element_text(size=12))+ theme(plot.title=element_text(size=12)) + scale_y_continuous(labels = scales::label_percent(accuracy = 1L))
#postdoc
postdoc_cohort <- tempdata[ which(mydatatime$type_pre_postdoc=="postdoc" ), ] %>%
dplyr::group_by(gender, timepointLast) %>%
dplyr::summarise(count = n())
ep <- ggplot(postdoc_cohort, aes(fill=timepointLast, y=count, x=gender),col=mycols_pos1) +
geom_bar( stat="identity", position="fill") +
scale_fill_manual(values=c(mycols_pos1)) +
labs(x = "Gender", y="Percentage in each role") +
theme_minimal(base_size = 16) + ggtitle("Postdocs,
current role") +
theme(axis.title=element_text(size=12)) + theme(plot.title=element_text(size=12))+
theme(legend.key.height = unit(1, 'lines'),legend.key.width = unit(0.4, 'lines'),legend.title = element_text(size=12), legend.text = element_text(size=12)) + scale_y_continuous(labels = scales::label_percent(accuracy = 1L))
# make legend
g_legend <- function(a.gplot){
tmp <- ggplot_gtable(ggplot_build(a.gplot))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
legend <- tmp$grobs[[leg]]
return(legend)
}
mylegend <- g_legend(ep)
#combine plots
grid.arrange(arrangeGrob(a + theme(legend.position="none"),
ep + theme(legend.position="none"),
mylegend,
nrow=1,widths=c(4,4,3)))
corhortOut <- plotGender(careerData, "AcPI", showTable = FALSE, maxY=0.5, legendPos = "bottom")
## [1] "Number of subjects with negative time: 2"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2138"
## [1] "Number of events (being found in a position): 539"
## [1] "Total subject number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 933 1205
## [1] "Event number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 196 343
corhortOut$plot
tab1_gender <- corhortOut$table %>% mutate(Position = "AcPI")
write_csv(tab1_gender, "Figure3B_km_acPI_gender_sourcedata.csv")
corhortOut <- plotGender(careerData, "AcOt", showTable = FALSE, maxY = 0.5, legendPos = "bottom")
## [1] "Number of subjects with negative time: 4"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2094"
## [1] "Number of events (being found in a position): 477"
## [1] "Total subject number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 928 1166
## [1] "Event number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 148 329
corhortOut$plot
tab2_gender <- corhortOut$table %>% mutate(Position = "AcOt")
write_csv(tab2_gender, "Figure3_SF2A_sourcedata.csv")
corhortOut <- plotGender(careerData, "IndR", showTable = FALSE, maxY=0.5, legendPos = "top")
## [1] "Number of subjects with negative time: 9"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2225"
## [1] "Number of events (being found in a position): 414"
## [1] "Total subject number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 948 1277
## [1] "Event number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 196 218
corhortOut$plot
tab3_gender <- corhortOut$table %>% mutate(Position = "IndR")
write_csv(tab3_gender, "Figure3_SF2B_sourcedata.csv")
corhortOut <- plotGender(careerData, "NonRes", showTable = FALSE, maxY=0.5, legendPos = "top")
## [1] "Number of subjects with negative time: 3"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2221"
## [1] "Number of events (being found in a position): 364"
## [1] "Total subject number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 944 1277
## [1] "Event number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 194 170
corhortOut$plot
tab4_gender <- corhortOut$table %>% mutate(Position = "NonRes")
write_csv(tab4_gender, "Figure3C_sourcedata.csv")
corhortOut <- plotGender(careerData, "NonSci", showTable = FALSE, maxY=0.5, legendPos = "top")
## [1] "Number of subjects with negative time: 6"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2253"
## [1] "Number of events (being found in a position): 131"
## [1] "Total subject number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 961 1292
## [1] "Event number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 81 50
corhortOut$plot
tab5_gender <- corhortOut$table %>% mutate(Position = "NonSci")
write_csv(tab5_gender, "Figure3_SF2C_sourcedata.csv")
Box-plot with overlaid dot-plot showing the distribution in calendar years between PhD conferral and start of first PI role for PhD alumni who defended in 1997-2012 who became a principal investigator within 9-years, and for whom we have a detailed career path (n=157).
predoc_to2012_exit_GL$gender <- as.factor(predoc_to2012_exit_GL$gender)
stat <- t.test(predoc_to2012_exit_GL$PhDtoGL~predoc_to2012_exit_GL$gender, var.equal = FALSE)[["p.value"]]
f10 <- -log10(stat)
fstat <- round (stat, f10 +3)
fstat[stat<2.2e-16] <- "<2.2e-16"
fstat <- paste("p=", fstat, " ", if (stat<0.05) {"*"},if (stat<0.001) {"*"},if (stat<0.0001) {"*"}, sep="")
fstat <- toString(fstat)
p <- ggboxplot(predoc_to2012_exit_GL, x="gender", y="PhDtoGL", fill='#ffffff', color="grey58")
p <- p + geom_dotplot(aes(predoc_to2012_exit_GL$gender, predoc_to2012_exit_GL$PhDtoGL),
binaxis='y', stackdir='center',stackratio=1.0, dotsize=0.25) +
stat_summary(fun.y = "mean", geom = "point", color = "RED", shape=4, size=10) + labs(x="PhD gender", y = "Calendar years, PhD to first PI role") +
theme_classic(base_size = 16) + geom_hline(yintercept = 9,linetype="dashed")+ scale_y_continuous(breaks=seq(0,9,3))+coord_cartesian(ylim=c(0, 10))+ggtitle(fstat)+theme(plot.title = element_text(size=12))
p
source_temp <- predoc_to2012_exit_GL %>%
dplyr::group_by(gender, PhDtoGL) %>%
dplyr::summarise(count = n())
write_csv(source_temp, "Figure3_FigureSupplement_1A_SourceData1.csv")
Box-plot with overlaid dot-plot showing the distribution in calendar years between completion of an EMBL postdoc and start of first PI role for postdoc alumni who completed their postdoc in 1997-2012 who became a principal investigator within 9-years, and for whom we have a detailed career path (n=218).
# postdoc by cohort
postdoc_to2012_exit_GLEMBL$gender <- as.factor(postdoc_to2012_exit_GLEMBL$gender)
stat <- t.test(postdoc_to2012_exit_GLEMBL$EMBLtoPI~postdoc_to2012_exit_GLEMBL$gender, var.equal = FALSE)[["p.value"]]
f10 <- -log10(stat)
fstat <- round (stat, f10 +3)
fstat[stat<2.2e-16] <- "<2.2e-16"
fstat <- paste("p=", fstat, " ", if (stat<0.05) {"*"},if (stat<0.001) {"*"},if (stat<0.0001) {"*"}, sep="")
fstat <- toString(fstat)
p <- ggboxplot(postdoc_to2012_exit_GLEMBL, x="gender", y="EMBLtoPI", fill='#ffffff', color="grey58")
p <- p + geom_dotplot(aes(postdoc_to2012_exit_GLEMBL$gender, postdoc_to2012_exit_GLEMBL$EMBLtoPI),
binaxis='y', stackdir='center',stackratio=1.0, dotsize=0.25) +
stat_summary(fun.y = "mean", geom = "point", color = "RED", shape=4, size=10) + labs(x="Postdoc gender", y = "Calendar years, EMBL to first PI role") +
theme_classic(base_size = 16) + geom_hline(yintercept = 9,linetype="dashed")+ scale_y_continuous(breaks=seq(0,9,3))+coord_cartesian(ylim=c(0, 10))+ggtitle(fstat)+theme(plot.title = element_text(size=12))
p
source_temp <- postdoc_to2012_exit_GLEMBL %>%
dplyr::group_by(gender, EMBLtoPI) %>%
dplyr::summarise(count = n())
write_csv(source_temp, "Figure3_FigureSupplement_1B_SourceData1.csv")
Same as above but for postdoc alumni for whom we were able to identify a PhD conferral year
postdoc_to2012_exit_GL$gender <- as.factor(postdoc_to2012_exit_GL$gender)
stat <- t.test(postdoc_to2012_exit_GL$PhDtoGL~postdoc_to2012_exit_GL$gender, var.equal = FALSE)[["p.value"]]
f10 <- -log10(stat)
fstat <- round (stat, f10 +3)
fstat[stat<2.2e-16] <- "<2.2e-16"
fstat <- paste("p=", fstat, " ", if (stat<0.05) {"*"},if (stat<0.001) {"*"},if (stat<0.0001) {"*"}, sep="")
fstat <- toString(fstat)
p <- ggboxplot(postdoc_to2012_exit_GL, x="gender", y="PhDtoGL", fill='#ffffff', color="grey58")
p <- p + geom_dotplot(aes(postdoc_to2012_exit_GL$gender, postdoc_to2012_exit_GL$PhDtoGL),
binaxis='y', stackdir='center',stackratio=1.0, dotsize=0.25)+
stat_summary(fun.y = "mean", geom = "point", color = "RED", shape=4, size=10) + labs(x="Postdoc gender", y = "Calendar years, PhD to first PI role") +
theme_classic(base_size = 16)+geom_hline(yintercept = 9,linetype="dashed")+ scale_y_continuous(breaks=seq(0,9,3))+coord_cartesian(ylim=c(0, 10))+ggtitle(fstat)+theme(plot.title = element_text(size=12))
p
source_temp <- postdoc_to2012_exit_GL %>%
dplyr::group_by(gender, PhDtoGL) %>%
dplyr::summarise(count = n())
write_csv(source_temp, "Figure3_FigureSupplement1C_SourceData1.csv")
Histograms of the natural logarithm of the highest category normalized citation of first author EMBL publications per ECR publishing at least one first author paper linked to EMBL and with a CNCI value in Clarivate InCites’ database) for alumni who have not (yet) become a PI (PI) and those who became a PI.
df_mean <- mydata %>% group_by(was_GL) %>% summarise(meant = mean(pubs_FIRST_RA_CNCI_max, na.rm=TRUE))
df_mean
## # A tibble: 2 × 2
## was_GL meant
## <chr> <dbl>
## 1 n 3.10
## 2 y 5.67
p <- ggplot(mydata, aes(x=log(pubs_FIRST_RA_CNCI_max))) +
geom_histogram(binwidth=1, colour="black", fill="grey80") +facet_grid(was_GL ~ .) +
theme(strip.text.y = element_text(angle = 0)) +
labs(x = "log(max CNCI, first-author research articles)", y="number of alumni")+
theme_bw(base_size = 16)
p<- p +
geom_vline(data=filter(df_mean, was_GL=="n"), aes(xintercept=log(meant)), colour="red",linetype="dashed") +
geom_vline(data=filter(df_mean, was_GL=="y"), aes(xintercept=log(meant)), colour="red",linetype="dashed")
p<- p + geom_vline(aes(xintercept=log(1)), colour="black")
p<- p + geom_text(data=filter(df_mean, was_GL=="n"), aes(label = paste0("log(",round(meant,1),")"), x = log(meant), y = 250),
size=6,colour="red", angle=-90, vjust = -1) +
geom_text(data=filter(df_mean, was_GL=="y"), aes(label = paste0("log(",round(meant,1),")"), x = log(meant), y = 250),
size=6,colour="red", angle=-90, vjust = -1)
#do t.test and extract p-value
stat<-t.test(mydata$pubs_FIRST_RA_CNCI_max~mydata$was_GL, var.equal = FALSE)[["p.value"]]
f10 <- -log10(stat)
fstat <- round (stat, f10 +3)
fstat[stat<2.2e-16] <- "<2.2e-16"
fstat <- paste("p=", fstat, " ", if (stat<0.05) {"*"},if (stat<0.001) {"*"},if (stat<0.0001) {"*"}, sep="")
fstat <- toString(fstat)
#plot
p<-p + ggtitle(fstat)+theme(plot.title = element_text(size=12), axis.text=element_text(size=12))
p
#extract source data
histogram_rawdata <- ggplot_build(p)
write.csv(histogram_rawdata$data[[1]],"Figure 4B - Source Data 1.csv")
Summary statistics
summary_stats <-group_by(mydata, was_GL) %>%
dplyr::summarise(
count = n(),
mean = mean(pubs_FIRST_RA_CNCI_max, na.rm = TRUE),
sd = sd(pubs_FIRST_RA_CNCI_max, na.rm = TRUE)
)
summary_stats
## # A tibble: 2 × 4
## was_GL count mean sd
## <chr> <int> <dbl> <dbl>
## 1 n 1599 3.10 5.51
## 2 y 685 5.67 12.7
Preprocessing data
#get survival table
survT <- processSurvivalTable(careerData, "AcPI","EMBL")
## [1] "Number of subjects with negative time: 2"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2138"
## [1] "Number of events (being found in a position): 539"
testTab <- survT %>%
left_join(select(careerData, unique_ID, type_pre_postdoc, cohort, pubs_FIRST_ra_only_TOTAL), by = "unique_ID") %>%
filter(!is.na(timeToPos), !is.na(pubs_FIRST_ra_only_TOTAL), !is.na(type_pre_postdoc)) %>%
mutate(cohort = factor(cohort),
type_pre_postdoc = ifelse(type_pre_postdoc == "predoc","PhD alumni", "Postdoc alumni")) %>%
mutate(numPub = ifelse(pubs_FIRST_ra_only_TOTAL > 1, "2+", pubs_FIRST_ra_only_TOTAL)) %>%
mutate(type_pre_postdoc = factor(type_pre_postdoc, levels = c("PhD alumni","Postdoc alumni")),
numPub = factor(numPub, levels = c("0", "1", "2+")))
print("Total subject number stratified by pre/post-doc")
## [1] "Total subject number stratified by pre/post-doc"
print(table(testTab$type_pre_postdoc))
##
## PhD alumni Postdoc alumni
## 933 1205
print("Event number stratified by pre/post-doc")
## [1] "Event number stratified by pre/post-doc"
print(table(filter(testTab, ifPos)$type_pre_postdoc))
##
## PhD alumni Postdoc alumni
## 196 343
Plot KM curves stratified by pre-postdoc
plotOut <- km(testTab, "numPub", maxTime = 25, titlePlot = "",
titleLegend = "Number of\nfirst-author\npublications",
xlab = sprintf("Time after %s (years)", "EMBL"),
ylab = paste0("Probability of being found as ", "AcPI"), maxY=0.7) +
theme(legend.position = "right")
plotOut
pubOut <- plotPublication(careerData, "AcPI", showTable = FALSE)
## [1] "Number of subjects with negative time: 2"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2138"
## [1] "Number of events (being found in a position): 539"
## [1] "Total subject number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 933 1205
## [1] "Event number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 196 343
tab1 <- pubOut$table %>% mutate(Position = "AcPI")
write_csv(tab1, "Figure_4C_km_AcPI_PubNum-Source Data 1.csv")
p <- plotHarrelsC(careerData, "AcPI", preOrPost = "both")
## [1] "Number of subjects with negative time: 0"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 1788"
## [1] "Number of events (being found in a position): 489"
p$plot
tabOut <- p$table
pubOut <- plotPublication(careerData, "AcOt", showTable = FALSE, maxY= 0.7)
## [1] "Number of subjects with negative time: 4"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2094"
## [1] "Number of events (being found in a position): 477"
## [1] "Total subject number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 928 1166
## [1] "Event number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 148 329
pubOut$plot
tab2 <- pubOut$table %>% mutate(Position = "AcOt")
write_csv(tab2, "Figure5FS1A_km_AcOt_numPub_sourcedata.csv")
pubOut <- plotPublication(careerData, "IndR", showTable = FALSE, maxY = 0.7)
## [1] "Number of subjects with negative time: 9"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2225"
## [1] "Number of events (being found in a position): 414"
## [1] "Total subject number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 948 1277
## [1] "Event number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 196 218
pubOut$plot
tab3 <- pubOut$table %>% mutate(Position = "IndR")
write_csv(tab3, "Figure 5_FS1B_km_IndR_numPub_sourcedata.csv")
pubOut <- plotPublication(careerData, "NonRes", showTable = FALSE, maxY=0.7)
## [1] "Number of subjects with negative time: 3"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2221"
## [1] "Number of events (being found in a position): 364"
## [1] "Total subject number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 944 1277
## [1] "Event number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 194 170
pubOut$plot
tab4 <- pubOut$table %>% mutate(Position = "NonRes")
write_csv(tab4, "Figure 5_FS1Ckm_NonRes_numPub-sourcedata1.csv")
putOut <- plotPublication(careerData, "NonSci", showTable = FALSE, maxY=0.7)
## [1] "Number of subjects with negative time: 6"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2253"
## [1] "Number of events (being found in a position): 131"
## [1] "Total subject number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 961 1292
## [1] "Event number stratified by pre/post-doc"
##
## PhD alumni Postdoc alumni
## 81 50
putOut$plot
tab5 <- pubOut$table %>% mutate(Position = "NonSci")
write_csv(tab5, "Figure 5_FS1D_km_NonSci_numPub-sourcedata-1.csv")
p <- plotHarrelsC(careerData, "AcOt", preOrPost = "both")
## [1] "Number of subjects with negative time: 2"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 1745"
## [1] "Number of events (being found in a position): 396"
p$plot
tabOut <- bind_rows(tabOut, p$table)
p <- plotHarrelsC(careerData, "IndR", preOrPost = "both")
## [1] "Number of subjects with negative time: 6"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 1802"
## [1] "Number of events (being found in a position): 363"
p$plot
tabOut <- bind_rows(tabOut, p$table)
p <- plotHarrelsC(careerData, "NonRes", preOrPost = "both")
## [1] "Number of subjects with negative time: 1"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 1806"
## [1] "Number of events (being found in a position): 314"
p$plot
tabOut <- bind_rows(tabOut, p$table)
p <- plotHarrelsC(careerData, "NonSci", preOrPost = "both")
## [1] "Number of subjects with negative time: 4"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 1843"
## [1] "Number of events (being found in a position): 117"
p$plot
tabOut <- bind_rows(tabOut, p$table)
write_csv(tabOut, "Figure4D and Figure 5 - source data 1.csv")
#Cohort vs ra articles - boxplot
#do anova on all data, extract p-value
stat<-aov(tempdata$pubs_RAonly_TOTAL~tempdata$cohort)
stat<- summary(stat)[[1]][["Pr(>F)"]][[1]]
f10 <- -log10(stat)
fstat <-round (stat, f10 +3)
fstat<-paste("ANOVA p=", fstat, " ", if (stat<0.05) {"*"},if (stat<0.001) {"*"},if (stat<0.0001) {"*"}, sep="")
fstat <- toString(fstat)
levels(tempdata$cohort)
## NULL
#plot, excluding long-tail outliers
upper_lim<-quantile(tempdata$pubs_RAonly_TOTAL, 0.975)
p1 <- ggplot(tempdata, aes(x=cohort, y=pubs_RAonly_TOTAL)) +
geom_boxplot(lwd=0.5,color="grey40", fill="grey100", width=0.5)+
stat_summary(fun.y = "mean", geom = "point",shape=4, color="red",size=4) +
theme_classic(base_size = 14) +
theme(panel.grid.major=element_line(colour="grey90")) +
labs(x = "cohort", y="# research articles")
p1<-p1+ ggtitle(fstat)+ylim(0,upper_lim)+ theme(plot.title=element_text(size=12))
p1
RA_author_stats<-tempdata
RA_author_stats$pubs_RAonly_TOTAL[RA_author_stats$pubs_RAonly_TOTAL> quantile(RA_author_stats$pubs_RAonly_TOTAL, 0.975, na.rm=TRUE)] <-paste( ">",upper_lim)
RA_author_stats<- dplyr::group_by(RA_author_stats, cohort, pubs_RAonly_TOTAL) %>%
dplyr::summarise(
count = n(),)
write.csv(RA_author_stats,"Figure 6B - Source Data 1.csv" )
##Cohort vs first.author ra articles - boxplot Figure 6C
#do anova on all data, extract p-value
stat<-aov(tempdata$pubs_FIRST_ra_only_TOTAL~tempdata$cohort)
stat<- summary(stat)[[1]][["Pr(>F)"]][[1]]
f10 <- -log10(stat)
fstat <-round (stat, f10 +3)
fstat<-paste("ANOVA p=", fstat, " ", if (stat<0.05) {"*"},if (stat<0.001) {"*"},if (stat<0.0001) {"*"}, sep="")
fstat <- toString(fstat)
#plot, excluding long-tail outliers
upper_lim<-quantile(tempdata$pubs_FIRST_ra_only_TOTAL, 0.975)
p2 <- ggplot(tempdata, aes(x=cohort, y=pubs_FIRST_ra_only_TOTAL)) +
geom_boxplot(lwd=0.5,color="grey40", fill="grey100", width=0.5)+
stat_summary(fun.y = "mean", geom = "point",shape=4, color="red",size=4) +
theme_classic(base_size = 14) +
theme(panel.grid.major=element_line(colour="grey90")) +
labs(x = "cohort", y="# first-author research articles")
p2<-p2+ ggtitle(fstat)+ylim(0,upper_lim)+ theme(plot.title=element_text(size=12))
p2
first_author_stats<-tempdata
first_author_stats$pubs_FIRST_ra_only_TOTAL[first_author_stats$pubs_FIRST_ra_only_TOTAL> quantile(first_author_stats$pubs_FIRST_ra_only_TOTAL, 0.975, na.rm=TRUE)] <-paste( ">",upper_lim)
first_author_stats<- dplyr::group_by(first_author_stats, cohort,pubs_FIRST_ra_only_TOTAL) %>%
dplyr::summarise(
count = n())
write.csv(first_author_stats,"Figure 6C - Source Data 1.csv" )
pubdata <- read_csv("data/WOSpapers_unique.csv") %>%
rename_with(make.names)
pubdata_art <- pubdata[ which(pubdata$articletype=='Article'), ]
pubdata_oth <- pubdata[ which(pubdata$articletype=='Other'), ]
a<- dplyr::group_by(pubdata_art, Publication.Date) %>%
dplyr::summarise(
count = n(),
wmean = winsor.means(count_authors, na.rm = TRUE),
mean = mean(count_authors, na.rm = TRUE),
median = median(count_authors, na.rm = TRUE),
sd = sd(count_authors, na.rm = TRUE)
)
# Create Line Chart
p <- ggplot(a, aes(Publication.Date, y=wmean)) + geom_point(color = "black") +
ylim(0,12)+ xlim(1995,2020) +
labs(x="Year", y = "Winsorized mean,
number of authors
per paper") +
theme_classic(base_size = 14)+theme(panel.grid.major=element_line(colour="grey90"))
p
a <- a %>% filter(Publication.Date >1994 & Publication.Date < 2021)
write.csv(a,"Figure 6A - Source Data 1.csv")
b <- dplyr::group_by(pubdata_art, Publication.Date) %>%
dplyr::summarise(
count = n(),
mean = mean(Category.Normalized.Citation.Impact, na.rm = TRUE),
median = median(Category.Normalized.Citation.Impact, na.rm = TRUE),
sd = sd(Category.Normalized.Citation.Impact, na.rm = TRUE)
)
p <- ggplot(b, aes(Publication.Date, y=mean)) + geom_point(color = "black") +
ylim(0,6) + xlim(1995,2020) + labs(x="Year", y = "mean,
Category Normalized
Citation Impact") +
theme_classic(base_size = 14)+theme(panel.grid.major=element_line(colour="grey90"))
p
b <- b %>% filter(Publication.Date >1994 & Publication.Date < 2021)
write.csv(b,"Figure 6D - Source Data 1.csv")
#Session information
sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] reticulate_1.25 forcats_0.5.1 stringr_1.4.1 dplyr_1.0.9
## [5] purrr_0.3.4 readr_2.1.2 tidyr_1.2.0 tibble_3.1.8
## [9] tidyverse_1.3.2 survminer_0.4.9 glmnet_4.1-4 Matrix_1.5-4
## [13] RColorBrewer_1.1-3 orca_1.1-1 plotly_4.10.1 psych_2.2.5
## [17] reshape2_1.4.4 ggpubr_0.4.0 Hmisc_4.7-0 Formula_1.2-4
## [21] survival_3.4-0 lattice_0.20-45 ggbeeswarm_0.6.0 ggplot2_3.4.1
## [25] cowplot_1.1.1 gridExtra_2.3 BiocStyle_2.24.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.4.0 backports_1.4.1 plyr_1.8.7
## [4] lazyeval_0.2.2 sp_1.5-0 splines_4.2.0
## [7] crosstalk_1.2.0 digest_0.6.30 foreach_1.5.2
## [10] htmltools_0.5.4 magick_2.7.3 fansi_1.0.3
## [13] magrittr_2.0.3 checkmate_2.1.0 googlesheets4_1.0.0
## [16] cluster_2.1.3 tzdb_0.3.0 modelr_0.1.8
## [19] vroom_1.5.7 jpeg_0.1-9 colorspace_2.0-3
## [22] rvest_1.0.2 haven_2.5.0 xfun_0.31
## [25] crayon_1.5.2 jsonlite_1.8.3 zoo_1.8-10
## [28] iterators_1.0.14 glue_1.6.2 gtable_0.3.0
## [31] gargle_1.2.0 car_3.1-0 shape_1.4.6
## [34] abind_1.4-5 scales_1.2.0 DBI_1.1.3
## [37] rstatix_0.7.0 Rcpp_1.0.9 viridisLite_0.4.0
## [40] xtable_1.8-4 htmlTable_2.4.1 foreign_0.8-82
## [43] bit_4.0.4 km.ci_0.5-6 dismo_1.3-5
## [46] htmlwidgets_1.5.4 httr_1.4.3 ellipsis_0.3.2
## [49] pkgconfig_2.0.3 farver_2.1.1 nnet_7.3-17
## [52] sass_0.4.2 dbplyr_2.2.1 deldir_1.0-6
## [55] utf8_1.2.2 tidyselect_1.1.2 labeling_0.4.2
## [58] rlang_1.0.6 munsell_0.5.0 cellranger_1.1.0
## [61] tools_4.2.0 cachem_1.0.6 cli_3.4.1
## [64] generics_0.1.3 broom_1.0.0 evaluate_0.15
## [67] fastmap_1.1.0 yaml_2.3.5 knitr_1.39
## [70] bit64_4.0.5 fs_1.5.2 survMisc_0.5.6
## [73] nlme_3.1-158 xml2_1.3.3 compiler_4.2.0
## [76] rstudioapi_0.13 beeswarm_0.4.0 png_0.1-7
## [79] ggsignif_0.6.3 reprex_2.0.1 bslib_0.4.1
## [82] stringi_1.7.8 highr_0.9 KMsurv_0.1-5
## [85] vctrs_0.5.2 pillar_1.8.0 lifecycle_1.0.3
## [88] BiocManager_1.30.18 jquerylib_0.1.4 data.table_1.14.8
## [91] raster_3.5-21 R6_2.5.1 latticeExtra_0.6-30
## [94] bookdown_0.27 vipor_0.4.5 codetools_0.2-18
## [97] assertthat_0.2.1 withr_2.5.0 mnormt_2.1.0
## [100] parallel_4.2.0 hms_1.1.1 terra_1.5-21
## [103] grid_4.2.0 rpart_4.1.16 rmarkdown_2.14
## [106] carData_3.0-5 googledrive_2.0.0 lubridate_1.8.0
## [109] base64enc_0.1-3 interp_1.1-3