This document contains the codes that generate the data analysis results and figures presented in the manuscript “Meta-research: The changing career paths of PhDs and postdocs trained at EMBL” (previously published on BioRXiv as “PhD and postdoc training outcomes at EMBL: changing career paths for life scientists in Europe”).

1 Section 1: preparation

1.1 Load required R packages

library(gridExtra)
library(cowplot)
library(ggbeeswarm)
library(Hmisc)
library(ggpubr)
library(reshape2)
library(psych)
library(plotly)
library(orca)
library(RColorBrewer)
library(survival)
library(glmnet)
library(survminer)
library(tidyverse)
library(reticulate)
#load helper functions
source("code/utils.R")
knitr::opts_chunk$set(fig.path = "./fig/", dev = c("png","pdf"))
#silent warnings and messages
knitr::opts_chunk$set(warning = FALSE, message = FALSE, autodep = TRUE)

1.2 Read and pre-process dataset

Due to date protection, the original dataset will not be included in the public version.

#original data table
originalData = read_csv("data/datafinal_forR_050222.csv")

1.2.1 Pre-processing

Years as integer

careerData <- mutate_at(originalData, 
                        vars(`from.year`, to_year,phd_year_if_known),
                        as.integer) %>% rename(from_year = `from.year`)

Calculate Year at EMBL

careerData <- mutate(careerData, year_at_EMBL = to_year - from_year)

Change unknown to NA in all character columns

subStr <- function(x) {
  x[x %in% c("unknown","Unknown")] <- NA
  x
}
careerData <- mutate_if(careerData, is.character, subStr)

1.2.1.1 Some additonal changes of the variable names

Change CV_wasNonR to CV_wasNonRes

careerData <- dplyr::rename(careerData, CV_wasNonRes = CV_wasNonR)

Change PhDtoGL to PhDtoPI

careerData <- dplyr::rename(careerData, PhDtoPI = PhDtoGL)

2 Section 2: Overview of career outcomes

2.1 Career outcomes for EMBL PhD and postdoc alumni

2.1.1 Prepare data for plotting

tempdata <- originalData
tempdata$timepointLast <- as.character(tempdata$timepointLast)
tempdata$timepointLast <- gsub('AcFac', 'AcPI', tempdata$timepointLast)
tempdata$type_pre_postdoc <- gsub('predoc', 'PhD', tempdata$type_pre_postdoc)
tempdata$timepointLast <- factor(tempdata$timepointLast, 
                                 levels = c("unknown", "NonSci", "NonRes", "IndR", "AcPD", "AcOt", "AcPI"))
tempdata$gender <- as.factor(tempdata$gender)
predoc <- tempdata[ which(tempdata$type_pre_postdoc=='PhD'),]
postdoc <- tempdata[ which(tempdata$type_pre_postdoc=='postdoc'),]
tempdata$type_pre_postdoc <- factor(tempdata$type_pre_postdoc, levels = c("PhD", "postdoc"))
##change colours###
mycols<-setNames(c("#D0D0CE","#E2E868","#F4C61F","#F49E17","#D41645","#734595","#3B6FB6"),
                 c('unknown','NonSci','NonRes','IndR','AcPD','AcOt', 'AcPI'))

2.1.2 Positions in 2021 (Figure 1A)

#postdoc_2021
mydata_type <- tempdata %>%
  dplyr::group_by(type_pre_postdoc, timepointLast) %>%
  dplyr::summarise(count = n())

ggplot(mydata_type, aes(fill=timepointLast, y=count, x=type_pre_postdoc),col=mycols) + 
  geom_bar( stat="identity", position="fill") + 
  scale_fill_manual(values=mycols) + 
  labs(x = "Role at EMBL", y="Percentage in each role") + 
  ggtitle("Current role in 2021")+ theme_minimal(base_size = 16) + theme(axis.title=element_text(size=14)) + theme(plot.title = element_text(size=14), axis.text.x = element_text(size = 12,angle=45, hjust=1))+ 
  theme(axis.title=element_text(size=14)) + theme(plot.title=element_text(size=14))+
  theme(legend.key.height = unit(1, 'lines'),legend.key.width = unit(0.4, 'lines'),legend.title = element_text(size=14), legend.text = element_text(size=12)) +          scale_y_continuous(labels = scales::label_percent(accuracy = 1L))

write.csv(mydata_type, "source1A.csv")

2.2 Positions stratified by years after EMBL

2.2.1 Prepare data for plotting

mydata2 <- tempdata
mydata2$cohort <- factor(mydata2$cohort, levels = c("1997-2004","2005-2012","2013-2020"))
mydata2$position_1 <- as.character(mydata2$position_1)
mydata2$position_5 <- as.character(mydata2$position_5)
mydata2$position_9 <- as.character(mydata2$position_9)
mydata2$position_13 <- as.character(mydata2$position_13)
mydata2$position_17 <- as.character(mydata2$position_17)
mydata2$timepointLast <- as.character(mydata2$timepointLast)
mydata2$position_1 <- gsub('laterPI', 'unknown', mydata2$position_1)
mydata2$position_5 <- gsub('laterPI', 'unknown', mydata2$position_5)
mydata2$position_9 <- gsub('laterPI', 'unknown', mydata2$position_9)
mydata2$position_13 <- gsub('laterPI', 'unknown', mydata2$position_13)
mydata2$position_17 <- gsub('laterPI', 'unknown', mydata2$position_17)
mydata2$timepointLast <- gsub('laterPI', 'unknown', mydata2$timepointLast)
mydata2$position_17[is.na(mydata2$position_17)]<-'timepoint not reached'
mydata2$position_13[is.na(mydata2$position_13)]<-'timepoint not reached'
mydata2$position_9[is.na(mydata2$position_9)]<-'timepoint not reached'
mydata2$position_5[is.na(mydata2$position_5)]<-'timepoint not reached'
mydata2$position_1[is.na(mydata2$position_1)]<-'timepoint not reached'
mydata2$position_1 <- gsub('AcFac', 'AcPI', mydata2$position_1)
mydata2$position_5 <- gsub('AcFac', 'AcPI', mydata2$position_5)
mydata2$position_9 <- gsub('AcFac', 'AcPI', mydata2$position_9)
mydata2$position_13 <- gsub('AcFac', 'AcPI', mydata2$position_13)
mydata2$position_17 <- gsub('AcFac', 'AcPI', mydata2$position_17)
mydata2$timepointLast <- gsub('AcFac', 'AcPI', mydata2$timepointLast)
mydata2$position_1 <- factor(mydata2$position_1, levels = c("unknown","NonSci", "NonRes", "IndR", "AcPD", "AcOt", "AcPI"))
mydata2$position_5 <- factor(mydata2$position_5, levels = c('timepoint not reached',"unknown","NonSci", "NonRes", "IndR", "AcPD", "AcOt", "AcPI"))
mydata2$position_9 <- factor(mydata2$position_9, levels = c('timepoint not reached',"unknown","NonSci", "NonRes", "IndR", "AcPD", "AcOt", "AcPI"))
mydata2$position_13 <- factor(mydata2$position_13, levels = c('timepoint not reached',"unknown","NonSci", "NonRes", "IndR", "AcPD", "AcOt", "AcPI"))
mydata2$position_17 <- factor(mydata2$position_17, levels = c('timepoint not reached',"unknown","NonSci", "NonRes", "IndR", "AcPD", "AcOt", "AcPI"))
mydata2$timepointLast <- factor(mydata2$timepointLast, levels = c("unknown","NonSci", "NonRes", "IndR", "AcPD", "AcOt", "AcPI"))
predoc <- mydata2[ which(mydata2$type_pre_postdoc=='PhD'),]
postdoc <- mydata2[ which(mydata2$type_pre_postdoc=='postdoc'),]

##change colours###
mycols_pos1<-c("#D0D0CE", "#E2E868","#F4C61F","#F49E17","#D41645","#734595","#3B6FB6")
names(mycols_pos1)=c('unknown', 'NonSci','NonRes','IndR','AcPD','AcOt','AcPI')
mycols<-c("grey90","#D0D0CE", "#E2E868","#F4C61F","#F49E17","#D41645","#734595","#3B6FB6")
names(mycols)=c('timepoint not reached','unknown','NonSci','NonRes','IndR','AcPD','AcOt', 'AcPI')

2.2.2 Positions after five years (Figure 1B)

#postdoc_5
postdoc_cohort <- postdoc %>%
  dplyr::group_by(cohort, position_5) %>%
  dplyr::summarise(count = n())
write_csv(postdoc_cohort, "Source_1B_PD.csv")
bp <- ggplot(postdoc_cohort[which(postdoc_cohort$position_5!='timepoint not reached'),], 
             aes(fill=position_5, y=count, x=cohort),col=mycols) + 
  geom_bar( stat="identity", position="fill") + 
  scale_fill_manual(values=mycols) + 
  labs(x = "cohort", y="Percentage in each role") + 
  ggtitle("Postdoc alumni, 
5 yrs after EMBL postdoc ended")+ theme_minimal(base_size = 16) + theme(axis.title=element_text(size=12)) + theme(plot.title = element_text(size=10), axis.text.x = element_text(size = 12,angle=45, hjust=1))+ 
  theme(legend.key.height = unit(1, 'lines'),legend.key.width = unit(0.4, 'lines'),legend.title = element_text(size=12), legend.text = element_text(size=12)) +          scale_y_continuous(labels = scales::label_percent(accuracy = 1L))
  

#predoc_5
predoc_cohort <- predoc %>%
  dplyr::group_by(cohort, position_5) %>%
  dplyr::summarise(count = n())
write_csv(predoc_cohort, "Source_1B_PhD.csv")
b <- ggplot(predoc_cohort[which(predoc_cohort$position_5!='timepoint not reached'),], 
            aes(fill=position_5, y=count, x=cohort),col=mycols) + 
  geom_bar(stat="identity", position="fill") + 
  scale_fill_manual(values=mycols) + labs(x = "cohort", y="Percentage in each role") +
  ggtitle("PhD alumni, 
5 yrs after PhD defense")+  theme_minimal(base_size = 16) + theme(axis.title=element_text(size=12)) + theme(plot.title = element_text(size=10), axis.text.x = element_text(size = 12,angle=45, hjust=1))+ 
  theme(legend.key.height = unit(1, 'lines'),legend.key.width = unit(0.4, 'lines'),legend.title = element_text(size=12), legend.text = element_text(size=12)) +          scale_y_continuous(labels = scales::label_percent(accuracy = 1L))

#extract legend
g_legend<-function(a.gplot){
  tmp <- ggplot_gtable(ggplot_build(a.gplot))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)}

for_legend <-bp+theme(legend.position="right")
mylegend<-g_legend(for_legend)


grid.arrange(arrangeGrob(b + theme(legend.position="none"),
                               bp + theme(legend.position="none"),
                               nrow=1,widths=c(4,4)))

2.2.3 Other time points (Figure 1 - Supplement 2)

#postdoc_1
postdoc_cohort <- postdoc %>%
  dplyr::group_by(cohort, position_1) %>%
  dplyr::summarise(count = n())
write_csv(postdoc_cohort, "Source_F1S2B.csv")
ap <- ggplot(postdoc_cohort, 
             aes(fill=position_1, y=count, x=cohort),col=mycols_pos1) + 
  geom_bar( stat="identity", position="fill") + 
  scale_fill_manual(values=mycols_pos1) + 
  labs(x = "cohort", y="Percentage in each role") + 
  ggtitle("Postdocs, 1 year after EMBL postdoc ended") + 
   theme_minimal(base_size = 20)+ theme(plot.title = element_text(size=16)) + theme(axis.title=element_text(size=16)) +          scale_y_continuous(labels = scales::label_percent(accuracy = 1L))

#postdoc_5
postdoc_cohort <- postdoc %>%
  dplyr::group_by(cohort, position_5) %>%
  dplyr::summarise(count = n())
write_csv(postdoc_cohort, "Source_F1S2D.csv")
bp <- ggplot(postdoc_cohort[which(postdoc_cohort$position_5!="timepoint not reached"),], 
             aes(fill=position_5, y=count, x=cohort),col=mycols) + 
  geom_bar( stat="identity", position="fill") + 
  scale_fill_manual(values=mycols) + 
  labs(x = "cohort", y="Percentage in each role") + 
  ggtitle("Postdocs, 5 years after EMBL postdoc ended") + 
   theme_minimal(base_size = 20)+ theme(plot.title = element_text(size=16))+ theme(axis.title=element_text(size=16)) +          scale_y_continuous(labels = scales::label_percent(accuracy = 1L))

#postdoc_9
postdoc_cohort <- postdoc%>%
  dplyr::group_by(cohort, position_9) %>%
  dplyr::summarise(count = n())
write_csv(postdoc_cohort, "Source_F1S2F.csv")
cp<-ggplot(postdoc_cohort[which(postdoc_cohort$position_9!="timepoint not reached"),], 
           aes(fill=position_9, y=count, x=cohort),col=mycols) + 
  geom_bar(stat="identity", position="fill") + 
  scale_fill_manual(values=mycols) + 
  labs(x = "cohort", y="Percentage in each role") + 
  ggtitle("Postdocs, 9 years after EMBL postdoc ended") + 
   theme_minimal(base_size = 20)+ theme(plot.title = element_text(size=16))+ theme(axis.title=element_text(size=16)) +          scale_y_continuous(labels = scales::label_percent(accuracy = 1L))

#postdoc_13
postdoc_cohort <- postdoc %>%
  dplyr::group_by(cohort, position_13) %>%
  dplyr::summarise(count = n())
write_csv(postdoc_cohort, "Source_F1S2H.csv")
dp <- ggplot(postdoc_cohort[which(postdoc_cohort$position_13!="timepoint not reached"),], 
             aes(fill=position_13, y=count, x=cohort),col=mycols) + 
  geom_bar( stat="identity", position="fill") + 
  scale_fill_manual(values=mycols) + 
  labs(x = "cohort", y="Percentage in each role") + 
  ggtitle("Postdocs, 13 years after EMBL postdoc ended") + 
  theme_minimal(base_size = 20)+ theme(plot.title = element_text(size=16))+ theme(axis.title=element_text(size=16)) +          scale_y_continuous(labels = scales::label_percent(accuracy = 1L))

#postdoc_17
postdoc_cohort <- postdoc %>%
  dplyr::group_by(cohort, position_17) %>%
  dplyr::summarise(count = n())
write_csv(postdoc_cohort, "Source_F1S2J.csv")
ep <- ggplot(postdoc_cohort[which(postdoc_cohort$position_17!="timepoint not reached"),], 
            aes(fill=position_17, y=count, x=cohort),col=mycols) + 
  geom_bar( stat="identity", position="fill") + scale_fill_manual(values=c(mycols)) +
  labs(x = "cohort", y="Percentage in each role") + 
  ggtitle("Postdocs, 17 years after EMBL postdoc ended")+ 
   theme_minimal(base_size = 20)+ theme(plot.title = element_text(size=16))+ theme(axis.title=element_text(size=16)) +          scale_y_continuous(labels = scales::label_percent(accuracy = 1L))



#predoc_1
predoc_cohort <- predoc %>%
  dplyr::group_by(cohort, position_1) %>%
  dplyr::summarise(count = n())
write_csv(predoc_cohort, "Source_F1S2A.csv")

a <- ggplot(predoc_cohort[which(predoc_cohort$position_1!="timepoint not reached"),], 
            aes(fill=position_1, y=count, x=cohort),col=mycols_pos1) + 
  geom_bar(stat="identity", position="fill") + scale_fill_manual(values=mycols_pos1) + 
  labs(x = "cohort", y="Percentage in each role") + ggtitle("PhD students, 1 year after PhD defence") + 
   theme_minimal(base_size = 20)+ theme(plot.title = element_text(size=16))+ theme(axis.title=element_text(size=16)) +          scale_y_continuous(labels = scales::label_percent(accuracy = 1L))

#predoc_5
predoc_cohort <- predoc %>%
  dplyr::group_by(cohort, position_5) %>%
  dplyr::summarise(count = n())
write_csv(predoc_cohort, "Source_F1S2C.csv")
b <- ggplot(predoc_cohort[which(predoc_cohort$position_5!="timepoint not reached"),], 
            aes(fill=position_5, y=count, x=cohort),col=mycols) + 
  geom_bar(stat="identity", position="fill") + scale_fill_manual(values=mycols) + 
  labs(x = "cohort", y="Percentage in each role") + ggtitle("PhD students, 5 years after PhD defence") + 
   theme_minimal(base_size = 20)+ theme(plot.title = element_text(size=16))+ theme(axis.title=element_text(size=16)) +          scale_y_continuous(labels = scales::label_percent(accuracy = 1L))

#predoc_9
predoc_cohort <- predoc %>%
  dplyr::group_by(cohort, position_9) %>%
  dplyr::summarise(count = n())
write_csv(predoc_cohort, "Source_F1S2E.csv")
c <- ggplot(predoc_cohort[which(predoc_cohort$position_9!="timepoint not reached"),], 
            aes(fill=position_9, y=count, x=cohort),col=mycols) + 
  geom_bar( stat="identity", position="fill") + scale_fill_manual(values=mycols) + 
  labs(x = "cohort", y="Percentage in each role") + 
  ggtitle("PhD students, 9 years after  PhD defence") + 
   theme_minimal(base_size = 20)+ theme(plot.title = element_text(size=16))+ theme(axis.title=element_text(size=16)) +          scale_y_continuous(labels = scales::label_percent(accuracy = 1L))

#predoc_13
predoc_cohort <- predoc %>%
  dplyr::group_by(cohort, position_13) %>%
  dplyr::summarise(count = n())
write_csv(predoc_cohort, "Source_F1S2G.csv")
d <- ggplot(predoc_cohort[which(predoc_cohort$position_13!="timepoint not reached"),], 
            aes(fill=position_13, y=count, x=cohort),col=mycols) + 
  geom_bar(stat="identity", position="fill") + scale_fill_manual(values=mycols) + 
  labs(x = "cohort", y="Percentage in each role") + ggtitle("PhD students, 13 years after PhD defence") + 
   theme_minimal(base_size = 20)+ theme(plot.title = element_text(size=16))+ theme(axis.title=element_text(size=16)) +          scale_y_continuous(labels = scales::label_percent(accuracy = 1L))

#predoc_17
predoc_cohort <- predoc %>%
  dplyr::group_by(cohort, position_17) %>%
  dplyr::summarise(count = n())
write_csv(predoc_cohort, "Source_F1S2I.csv")
e <- ggplot(predoc_cohort[which(predoc_cohort$position_17!="timepoint not reached"),], 
            aes(fill=position_17, y=count, x=cohort),col=mycols) + 
  geom_bar( stat="identity", position="fill") + scale_fill_manual(values=c(mycols)) + 
  labs(x = "cohort", y="Percentage in each role") + ggtitle("PhD students, 17 years after PhD defence") + 
   theme_minimal(base_size = 20)+ theme(plot.title = element_text(size=16))+ theme(axis.title=element_text(size=12))  +          scale_y_continuous(labels = scales::label_percent(accuracy = 1L))



#combine the figures
grid.arrange(arrangeGrob(a + theme(legend.position="none"),
                         ap + theme(legend.position="none"),
                         b + theme(legend.position="none"),
                         bp + theme(legend.position="none"),
                         c + theme(legend.position="none"),
                         cp + theme(legend.position="none"),
                         d + theme(legend.position="none"),
                         dp + theme(legend.position="none"),
                         e + theme(legend.position="none"),
                         ep + theme(legend.position="none") , nrow=5,ncol=2))

2.3 Compare with other cohorts (Figure 1C and Figure 1 - Figure Supplement 3)

2.3.1 Data preparation for plotting

#load data file with values
otherCohort = read_csv("data/in_year_all.csv")
my_col <- c("grey40","#3B6FB6")

   mydata <- melt(otherCohort[which (otherCohort$institution=="Stanford University (SU) Bioscience | 2018 dataset."),], id=c("institution","Cohorts","Cohort_short","ratio", "ratio_text"))
  mydata$variable <- factor(mydata$variable , levels = c("EMBL", "NorthAmerica"))
  
  levels(mydata$variable) <- c("EMBL", "Stanford University (SU) Bioscience | 2018 dataset.")
    plot_b<-ggplot(mydata, aes(x=Cohorts, y=value, fill=variable)) +geom_bar(position="dodge",stat="identity") +theme_bw(base_size = 16)  + theme(legend.position="top")+
  guides(fill = guide_legend(nrow = 2, byrow = TRUE, title.position="top")) +labs(x = "cohort", y="Alumni in PI-like roles
(Percentage)")+ scale_fill_manual(values=c("#3B6FB6","grey40"),name = "Institution:")+  theme(legend.key.height = unit(1, 'lines'),legend.title = element_text(size=11), legend.text = element_text(size=11))+          scale_y_continuous(labels = scales::label_percent(accuracy = 1L),limits=c(0,0.5))
  
  plot_d<-ggplot(mydata[ which(mydata$variable=='EMBL'),], aes(x=Cohort_short, y=ratio)) +geom_point() +
    theme_bw(base_size = 10)+geom_hline(yintercept = 1, linetype=3)+
 theme( axis.text.x = element_text(size = 8),plot.background = element_rect(fill = "grey90"))+labs(x = "cohort", y="EMBL / SU")+ scale_y_continuous(breaks=seq(0,2,1))+ coord_cartesian(ylim=c(-.2, 2.2))
  
  mydata_plot <-  ggdraw() +
    draw_plot(plot_b) +
    draw_plot(plot_d, x = 0.75, y = .4, width= 0.22, height = .3)
  
  mydata_plot

institions<-sort(unique(otherCohort$institution[otherCohort$institution!="Stanford University (SU) Bioscience | 2018 dataset."]))
pList <- lapply(institions, function(i) {
  
  mydata <- melt(otherCohort[which (otherCohort$institution==i),], id=c("institution","Cohorts","Cohort_short","ratio","ratio_text"))
  mydata$variable <- factor(mydata$variable , levels = c("EMBL", "NorthAmerica"))
  yaxistext<-unique(mydata$ratio_text)
  levels(mydata$variable) <- c("EMBL", i)
    plot_b<-ggplot(mydata, aes(x=Cohorts, y=value, fill=variable)) +geom_bar(position="dodge",stat="identity") +theme_bw(base_size = 22)+ylim(0,0.5) +
    theme(legend.position="top")+
  guides(fill = guide_legend(nrow = 2, byrow = TRUE, title.position="left")) +labs(x = "cohort", y="Alumni in PI-like roles
(Percentage)")+ scale_fill_manual(values=c("#3B6FB6","grey40"),name = "Institution:")+          scale_y_continuous(labels = scales::label_percent(accuracy = 1L),limits=c(0,0.5))
  
  plot_d<-ggplot(mydata[ which(mydata$variable=='EMBL'),], aes(x=Cohort_short, y=ratio)) +geom_point() +
    theme_bw(base_size = 16)+geom_hline(yintercept = 1, linetype=3)+
 theme( axis.text.x = element_text(size = 14),plot.background = element_rect(fill = "grey90"))+labs(x = "cohort", y=yaxistext)+ scale_y_continuous(breaks=seq(0,2,1))+ coord_cartesian(ylim=c(-.2, 2.2))
  
  width_count<-0.05+(0.075*length(unique(mydata$Cohort_short)))
  
  mydata_plot <-  ggdraw() +
    draw_plot(plot_b) +
    draw_plot(plot_d, x = 0.7, y = .4, width = width_count, height = .3)
  
  mydata_plot
})
plot_grid(plotlist = pList, ncol=2)

2.4 Simplified career paths for alumni who have held different types of role (Figure 1 Figure Supplement 1)

#configuration of python environment
use_condaenv("r-reticulate")

2.4.1 Color code for the Sankey diagram

colList <- list(NonSci = "#E2E868", NonRes = "#F4C61F",IndR = "#F49E17",AcPD = "#D41645", AcPhD = "#707372", AcOt = "#734595",  AcPI = "#3B6FB6")
             
dummyTab <- data.frame(var = names(colList), val = 1)
g <- ggplot(dummyTab, aes(x=var, y=val, fill = var )) + geom_bar(stat="identity") +
  scale_fill_manual(values = colList, name = "role")
legendPlot <- cowplot::get_legend(g)
plot_grid(legendPlot)

Function for producing Sankey diagram for career path

plotSankey <- function(tempdata, role) {
  
  rolePrev <- list(was_GL = "new_AcFac_previous",
                   CV_wasNonSci = "new_NonSci_previous",
                   CV_wasIndR = "new_IndR_previous", 
                   CV_wasNonR = "new_SciR_previous",
                   CV_wasAcOt = "new_AcOt_previous")

  #first filter for alumni with full CV who became PIs
  Type_data <- tempdata[which(tempdata$completeness=='Complete CV'),]
  Type_data <- Type_data[which(Type_data[[role]]=='y'),]
  role_code <- rolePrev[[role]]
  #create summary tables for previous and current roles and swap names for nodes
  Type_count <- Type_data  %>% dplyr::group_by_at(rolePrev[[role]]) %>% dplyr::summarize(count = n())
  
  Type_count <- add_column(Type_count, target = paste0(role_code), .after = 1)
  Type_count<-as.data.frame(Type_count)
  names(Type_count)[1]<- 'source'
  names(Type_count)[3]<- "value"
  Type_count$source = gsub("AcFac",0,Type_count$source)
  Type_count$source = gsub("AcOt",1,Type_count$source)
  Type_count$source = gsub("AcPD",2,Type_count$source)
  Type_count$source = gsub("AcPhD",3,Type_count$source)
  Type_count$source = gsub("IndR",4,Type_count$source)
  Type_count$source = gsub("NonRes",5,Type_count$source)
  Type_count$source = gsub("NonSci",6,Type_count$source)
Type_count$target = gsub("new_AcFac_previous",7,Type_count$target)
  Type_count$target = gsub("new_AcOt_previous",8,Type_count$target)
  Type_count$target = gsub("new_IndR_previous",11,Type_count$target)
  Type_count$target = gsub("new_SciR_previous",12,Type_count$target)
  Type_count$target = gsub("new_NonSci_previous",13,Type_count$target)
  Type_now <- Type_data  %>%
    dplyr::group_by(timepointLast) %>%
    dplyr::summarize(count = n())
  Type_now <- add_column(Type_now, source = paste0(role_code), .before = 1)
  Type_now<-as.data.frame(Type_now)
  names(Type_now)[2]<- 'target'
  names(Type_now)[3]<- "value"
  Type_now$target = gsub("AcFac",14,Type_now$target)
  Type_now$target = gsub("AcOt",15,Type_now$target)
  Type_now$target = gsub("AcPD",16,Type_now$target)
  Type_now$target = gsub("AcPhD",17,Type_now$target)
  Type_now$target = gsub("IndR",18,Type_now$target)
  Type_now$target = gsub("NonRes",19,Type_now$target)
  Type_now$target = gsub("NonSci",20,Type_now$target)
  Type_now$source = gsub("new_AcFac_previous",7,Type_now$source)
  Type_now$source = gsub("new_AcOt_previous",8,Type_now$source)
  Type_now$source = gsub("new_IndR_previous",11,Type_now$source)
  Type_now$source = gsub("new_SciR_previous",12,Type_now$source)
  Type_now$source = gsub("new_NonSci_previous",13,Type_now$source)

  #merge and plot
  links <- rbind(Type_count, Type_now)
  write.csv(links, paste0("Source_F1SF1_",rolePrev[[role]],".csv"))
  p <- plot_ly(
    type = "sankey",
    orientation = "h",
    
    node = list(
      label = c("","", "", "","","","",
                "","","","","","","",
                "", "","","","","",""),
      
      color = c(colList$AcPI, colList$AcOt, colList$AcPD, colList$AcPhD, colList$IndR, colList$NonRes, colList$NonSci,
                colList$AcPI, colList$AcOt, colList$AcPD, colList$AcPhD, colList$IndR, colList$NonRes, colList$NonSci,
                colList$AcPI, colList$AcOt, colList$AcPD, colList$AcPhD, colList$IndR, colList$NonRes, colList$NonSci),
      pad = 25,
      thickness = 75, width=900,
      line = list(
        color = "black",
        width = 0
      )
    ),
    
    link =links
    )
  p
}

2.4.2 AcPI

plotSankey(originalData, "was_GL")

2.4.3 AcOt

plotSankey(originalData, "CV_wasAcOt")

2.4.4 IndR

plotSankey(originalData, "CV_wasIndR")

2.4.5 NonRes

plotSankey(originalData, "CV_wasNonR")

2.4.6 NonSci

plotSankey(originalData, "CV_wasNonSci")

The text labels in this supplementary figure are added manually

3 Section 3: Rate of entry into different career areas

3.1 Kaplan-Meier plots of time to position, stratified by cohorts

3.1.1 AcPI

corhortOut <- plotCohort(careerData, "AcPI", showTable = FALSE, maxY=0.5, legendPos = "bottom")

## [1] "Number of subjects with negative time: 2"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2138"
## [1] "Number of events (being found in a position): 539"
## [1] "Total subject number stratified by pre/post-doc"
## 
##     PhD alumni Postdoc alumni 
##            933           1205 
## [1] "Event number stratified by pre/post-doc"
## 
##     PhD alumni Postdoc alumni 
##            196            343

corhortOut$plot

tab1_co <- corhortOut$table %>% mutate(Position = "AcPI")
write.csv(tab1_co,"Figure 2A - Source Data 1.csv")

3.1.2 AcOt

corhortOut <- plotCohort(careerData, "AcOt", showTable = FALSE, maxY = 0.5, legendPos = "bottom")

## [1] "Number of subjects with negative time: 4"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2094"
## [1] "Number of events (being found in a position): 477"
## [1] "Total subject number stratified by pre/post-doc"
## 
##     PhD alumni Postdoc alumni 
##            928           1166 
## [1] "Event number stratified by pre/post-doc"
## 
##     PhD alumni Postdoc alumni 
##            148            329

corhortOut$plot

tab2_co <- corhortOut$table %>% mutate(Position = "AcOt")
write.csv(tab2_co,"Figure 2B - Source Data 1.csv")

3.1.3 IndR

corhortOut <- plotCohort(careerData, "IndR", showTable = FALSE, maxY=0.5, legendPos = "top")

## [1] "Number of subjects with negative time: 9"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2225"
## [1] "Number of events (being found in a position): 414"
## [1] "Total subject number stratified by pre/post-doc"
## 
##     PhD alumni Postdoc alumni 
##            948           1277 
## [1] "Event number stratified by pre/post-doc"
## 
##     PhD alumni Postdoc alumni 
##            196            218

corhortOut$plot

tab3_co <- corhortOut$table %>% mutate(Position = "IndR")
write.csv(tab3_co,"Figure 2C - Source Data 1.csv")

3.1.4 NonRes

corhortOut <- plotCohort(careerData, "NonRes", showTable = FALSE, maxY=0.5, legendPos = "top")

## [1] "Number of subjects with negative time: 3"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2221"
## [1] "Number of events (being found in a position): 364"
## [1] "Total subject number stratified by pre/post-doc"
## 
##     PhD alumni Postdoc alumni 
##            944           1277 
## [1] "Event number stratified by pre/post-doc"
## 
##     PhD alumni Postdoc alumni 
##            194            170

corhortOut$plot

tab4_co <- corhortOut$table %>% mutate(Position = "NonRes")
write.csv(tab4_co,"Figure 2D - Source Data 1.csv")

3.1.5 NonSci

corhortOut <- plotCohort(careerData, "NonSci", showTable = FALSE, maxY=0.5, legendPos = "top")

## [1] "Number of subjects with negative time: 6"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2253"
## [1] "Number of events (being found in a position): 131"
## [1] "Total subject number stratified by pre/post-doc"
## 
##     PhD alumni Postdoc alumni 
##            961           1292 
## [1] "Event number stratified by pre/post-doc"
## 
##     PhD alumni Postdoc alumni 
##             81             50

corhortOut$plot

tab5_co <- corhortOut$table %>% mutate(Position = "NonSci")
write.csv(tab5_co,"Figure 2E - Source Data 1.csv")

##PhD to PI length by cohort

Box-plot with overlaid dot-plot showing the distribution in calendar years between PhD conferral and start of first PI role for PhD alumni who defended in 1997-2012 who became a principal investigator within 9-years, and for whom we have a detailed career path (n=157).

mydatatime <- as.data.frame(tempdata) 
mydatatime <- mydatatime[ which(mydatatime$completeness=="Complete CV" ), ]
predoc_data <- mydatatime[ which(mydatatime$type_pre_postdoc=='PhD'), ]
predoc_to2012 <- predoc_data[ which(predoc_data$to_year<2013), ]
predoc_to2012_exit_GL <-predoc_to2012[ which(predoc_to2012$PhDtoGL<10& predoc_to2012$completeness=="Complete CV" ), ]
predoc_to2012_exit_GL$cohort <- as.factor(predoc_to2012_exit_GL$cohort)

stat <- t.test(predoc_to2012_exit_GL$PhDtoGL~predoc_to2012_exit_GL$cohort, var.equal = FALSE)[["p.value"]]
f10 <- -log10(stat)
fstat <- round (stat, f10 +3)
fstat[stat<2.2e-16] <- "<2.2e-16"
fstat <- paste("p=", fstat, " ", if (stat<0.05) {"*"},if (stat<0.001) {"*"},if (stat<0.0001) {"*"}, sep="")
fstat <- toString(fstat)



p <- ggboxplot(predoc_to2012_exit_GL, x="cohort", y="PhDtoGL", fill='#ffffff', color="grey58")
p <- p + geom_dotplot(aes(predoc_to2012_exit_GL$cohort, predoc_to2012_exit_GL$PhDtoGL), 
                      binaxis='y', stackdir='center',stackratio=1.0, dotsize=0.25) + 
  stat_summary(fun.y = "mean", geom = "point", color = "RED", shape=4, size=10)   + labs(x="PhD cohort", y = "Calendar years, PhD to first PI role") + 
  theme_classic(base_size = 16) + geom_hline(yintercept = 9,linetype="dashed")+ scale_y_continuous(breaks=seq(0,9,3))+coord_cartesian(ylim=c(0, 10))+ggtitle(fstat)+theme(plot.title = element_text(size=12))
p

source_temp <- predoc_to2012_exit_GL %>%
  dplyr::group_by(cohort, PhDtoGL) %>%
  dplyr::summarise(count = n())
write_csv(source_temp, "Figure2_fs1_1A_sourcedata.csv")

Box-plot with overlaid dot-plot showing the distribution in calendar years between completion of an EMBL postdoc and start of first PI role for postdoc alumni who completed their postdoc in 1997-2012 who became a principal investigator within 9-years, and for whom we have a detailed career path (n=218).

# postdoc by cohort
postdoc_data <- mydatatime[ which(mydatatime$type_pre_postdoc=='postdoc'), ]
data_sum <- postdoc_data %>% dplyr::group_by(cohort,PhDtoGL) %>% dplyr::summarise(n = n())
postdoc_to2012 <- postdoc_data[ which(postdoc_data$to_year<2013), ]
postdoc_to2012_exit_GL <- postdoc_to2012[ which(postdoc_to2012$PhDtoGL<10 & postdoc_to2012$completeness=="Complete CV" ), ]
postdoc_to2012_exit_GL$cohort <- as.factor(postdoc_to2012_exit_GL$cohort)
postdoc_to2012_exit_GLEMBL <- postdoc_to2012[ which(postdoc_to2012$EMBLtoPI<10 & postdoc_to2012$EMBLtoPI>-1 & postdoc_to2012$completeness=="Complete CV" ), ]
postdoc_to2012_exit_GLEMBL$cohort <- as.factor(postdoc_to2012_exit_GLEMBL$cohort)

stat <- t.test(postdoc_to2012_exit_GLEMBL$EMBLtoPI~postdoc_to2012_exit_GLEMBL$cohort, var.equal = FALSE)[["p.value"]]
f10 <- -log10(stat)
fstat <- round (stat, f10 +3)
fstat[stat<2.2e-16] <- "<2.2e-16"
fstat <- paste("p=", fstat, " ", if (stat<0.05) {"*"},if (stat<0.001) {"*"},if (stat<0.0001) {"*"}, sep="")
fstat <- toString(fstat)

p <- ggboxplot(postdoc_to2012_exit_GLEMBL, x="cohort", y="EMBLtoPI", fill='#ffffff', color="grey58")
p <- p + geom_dotplot(aes(postdoc_to2012_exit_GLEMBL$cohort, postdoc_to2012_exit_GLEMBL$EMBLtoPI), 
                      binaxis='y', stackdir='center',stackratio=1.0, dotsize=0.25) +
  stat_summary(fun.y = "mean", geom = "point", color = "RED", shape=4, size=10)   + labs(x="Postdoc cohort", y = "Calendar years, EMBL to first PI role") + 
  theme_classic(base_size = 16) + geom_hline(yintercept = 9,linetype="dashed")+ scale_y_continuous(breaks=seq(0,9,3))+coord_cartesian(ylim=c(0, 10))+ggtitle(fstat)+theme(plot.title = element_text(size=12))
p

source_temp <- postdoc_to2012_exit_GLEMBL %>%
  dplyr::group_by(cohort, EMBLtoPI) %>%
  dplyr::summarise(count = n())
write_csv(source_temp, "Figure2_fs1_1B_sourcedata.csv")

Same as above but for postdoc alumni for whom we were able to identify a PhD conferral year

stat <- t.test(postdoc_to2012_exit_GL$PhDtoGL~postdoc_to2012_exit_GL$cohort, var.equal = FALSE)[["p.value"]]

f10 <- -log10(stat)
fstat <- round (stat, f10 +3)
fstat[stat<2.2e-16] <- "<2.2e-16"
fstat <- paste("p=", fstat, " ", if (stat<0.05) {"*"},if (stat<0.001) {"*"},if (stat<0.0001) {"*"}, sep="")
fstat <- toString(fstat)

p <- ggboxplot(postdoc_to2012_exit_GL, x="cohort", y="PhDtoGL", fill='#ffffff', color="grey58")
p <- p + geom_dotplot(aes(postdoc_to2012_exit_GL$cohort, postdoc_to2012_exit_GL$PhDtoGL), 
                      binaxis='y', stackdir='center',stackratio=1.0, dotsize=0.25)+
  stat_summary(fun.y = "mean", geom = "point", color = "RED", shape=4, size=10)  + labs(x="Postdoc cohort", y = "Calendar years, PhD to first PI role") + 
   theme_classic(base_size = 16)+geom_hline(yintercept = 9,linetype="dashed")+ scale_y_continuous(breaks=seq(0,9,3))+coord_cartesian(ylim=c(0, 10))+ggtitle(fstat)+theme(plot.title = element_text(size=12))
p

source_temp <- postdoc_to2012_exit_GL %>%
  dplyr::group_by(cohort, PhDtoGL) %>%
  dplyr::summarise(count = n())
write_csv(source_temp, "Figure2_fs1_1C_sourcedata.csv")

4 Section 4 - Gender

4.0.1 Plot (Figure 3)

source_temp <- tempdata %>%
  dplyr::group_by(type_pre_postdoc, gender, timepointLast) %>%
  dplyr::summarise(count = n())
write_csv(source_temp, "Figure3A_sourcedata.csv")


#predoc
predoc_cohort <- tempdata[ which(mydatatime$type_pre_postdoc=="PhD" ), ] %>%
  dplyr::group_by(gender, timepointLast) %>%
  dplyr::summarize(count = n())

a <- ggplot(predoc_cohort, aes(fill=timepointLast, y=count, x=gender),col=mycols_pos1) + 
  geom_bar( stat="identity", position="fill") + scale_fill_manual(values=c(mycols_pos1)) + 
  labs(x = "Gender", y="Percentage in each role") + 
  theme_minimal(base_size = 16)+ ggtitle("PhD students,
current role") + 
  theme(axis.title=element_text(size=12))+ theme(plot.title=element_text(size=12)) + scale_y_continuous(labels = scales::label_percent(accuracy = 1L))


#postdoc
postdoc_cohort <- tempdata[ which(mydatatime$type_pre_postdoc=="postdoc" ), ]  %>%
  dplyr::group_by(gender, timepointLast) %>%
  dplyr::summarise(count = n())

ep <- ggplot(postdoc_cohort,  aes(fill=timepointLast, y=count, x=gender),col=mycols_pos1) + 
  geom_bar( stat="identity", position="fill") + 
  scale_fill_manual(values=c(mycols_pos1)) + 
  labs(x = "Gender", y="Percentage in each role") + 
  theme_minimal(base_size = 16) + ggtitle("Postdocs,
current role") + 
  theme(axis.title=element_text(size=12)) + theme(plot.title=element_text(size=12))+
  theme(legend.key.height = unit(1, 'lines'),legend.key.width = unit(0.4, 'lines'),legend.title = element_text(size=12), legend.text = element_text(size=12))  + scale_y_continuous(labels = scales::label_percent(accuracy = 1L))


# make legend
g_legend <- function(a.gplot){
  
  tmp <- ggplot_gtable(ggplot_build(a.gplot))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)
  
  }

mylegend <- g_legend(ep)

#combine plots
grid.arrange(arrangeGrob(a + theme(legend.position="none"),
                               ep + theme(legend.position="none"),
                               mylegend,
                               nrow=1,widths=c(4,4,3)))

4.1 Kaplan-Meier plots of time to position, stratified by gender

4.1.1 AcPI

corhortOut <- plotGender(careerData, "AcPI", showTable = FALSE, maxY=0.5, legendPos = "bottom")

## [1] "Number of subjects with negative time: 2"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2138"
## [1] "Number of events (being found in a position): 539"
## [1] "Total subject number stratified by pre/post-doc"
## 
##     PhD alumni Postdoc alumni 
##            933           1205 
## [1] "Event number stratified by pre/post-doc"
## 
##     PhD alumni Postdoc alumni 
##            196            343

corhortOut$plot

tab1_gender <- corhortOut$table %>% mutate(Position = "AcPI")
write_csv(tab1_gender, "Figure3B_km_acPI_gender_sourcedata.csv")

4.1.2 AcOt

corhortOut <- plotGender(careerData, "AcOt", showTable = FALSE, maxY = 0.5, legendPos = "bottom")

## [1] "Number of subjects with negative time: 4"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2094"
## [1] "Number of events (being found in a position): 477"
## [1] "Total subject number stratified by pre/post-doc"
## 
##     PhD alumni Postdoc alumni 
##            928           1166 
## [1] "Event number stratified by pre/post-doc"
## 
##     PhD alumni Postdoc alumni 
##            148            329

corhortOut$plot

tab2_gender <- corhortOut$table %>% mutate(Position = "AcOt")
write_csv(tab2_gender, "Figure3_SF2A_sourcedata.csv")

4.1.3 IndR

corhortOut <- plotGender(careerData, "IndR", showTable = FALSE, maxY=0.5, legendPos = "top")

## [1] "Number of subjects with negative time: 9"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2225"
## [1] "Number of events (being found in a position): 414"
## [1] "Total subject number stratified by pre/post-doc"
## 
##     PhD alumni Postdoc alumni 
##            948           1277 
## [1] "Event number stratified by pre/post-doc"
## 
##     PhD alumni Postdoc alumni 
##            196            218

corhortOut$plot

tab3_gender <- corhortOut$table %>% mutate(Position = "IndR")
write_csv(tab3_gender, "Figure3_SF2B_sourcedata.csv")

4.1.4 NonRes

corhortOut <- plotGender(careerData, "NonRes", showTable = FALSE, maxY=0.5, legendPos = "top")

## [1] "Number of subjects with negative time: 3"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2221"
## [1] "Number of events (being found in a position): 364"
## [1] "Total subject number stratified by pre/post-doc"
## 
##     PhD alumni Postdoc alumni 
##            944           1277 
## [1] "Event number stratified by pre/post-doc"
## 
##     PhD alumni Postdoc alumni 
##            194            170

corhortOut$plot

tab4_gender <- corhortOut$table %>% mutate(Position = "NonRes")
write_csv(tab4_gender, "Figure3C_sourcedata.csv")

4.1.5 NonSci

corhortOut <- plotGender(careerData, "NonSci", showTable = FALSE, maxY=0.5, legendPos = "top")

## [1] "Number of subjects with negative time: 6"
## [1] "Negative values are changed to zero"
## [1] "Total number of subjects used in the analysis: 2253"
## [1] "Number of events (being found in a position): 131"
## [1] "Total subject number stratified by pre/post-doc"
## 
##     PhD alumni Postdoc alumni 
##            961           1292 
## [1] "Event number stratified by pre/post-doc"
## 
##     PhD alumni Postdoc alumni 
##             81             50

corhortOut$plot

tab5_gender <- corhortOut$table %>% mutate(Position = "NonSci")
write_csv(tab5_gender, "Figure3_SF2C_sourcedata.csv")

4.2 PhD to PI length by gender (Figure 3 FigureSupplement 1)

predoc_to2012_exit_GL$gender <- as.factor(predoc_to2012_exit_GL$gender)
stat <- t.test(predoc_to2012_exit_GL$PhDtoGL~predoc_to2012_exit_GL$gender, var.equal = FALSE)[["p.value"]]
f10 <- -log10(stat)
fstat <- round (stat, f10 +3)
fstat[stat<2.2e-16] <- "<2.2e-16"
fstat <- paste("p=", fstat, " ", if (stat<0.05) {"*"},if (stat<0.001) {"*"},if (stat<0.0001) {"*"}, sep="")
fstat <- toString(fstat)



p <- ggboxplot(predoc_to2012_exit_GL, x="gender", y="PhDtoGL", fill='#ffffff', color="grey58")
p <- p + geom_dotplot(aes(predoc_to2012_exit_GL$gender, predoc_to2012_exit_GL$PhDtoGL), 
                      binaxis='y', stackdir='center',stackratio=1.0, dotsize=0.25) + 
  stat_summary(fun.y = "mean", geom = "point", color = "RED", shape=4, size=10) + labs(x="PhD gender", y = "Calendar years, PhD to first PI role") +  
  theme_classic(base_size = 16) + geom_hline(yintercept = 9,linetype="dashed")+ scale_y_continuous(breaks=seq(0,9,3))+coord_cartesian(ylim=c(0, 10))+ggtitle(fstat)+theme(plot.title = element_text(size=12))
p

source_temp <- predoc_to2012_exit_GL %>%
  dplyr::group_by(gender, PhDtoGL) %>%
  dplyr::summarise(count = n())
write_csv(source_temp, "Figure3_FigureSupplement_1A_SourceData1.csv")

# postdoc by cohort
postdoc_to2012_exit_GLEMBL$gender <- as.factor(postdoc_to2012_exit_GLEMBL$gender)
stat <- t.test(postdoc_to2012_exit_GLEMBL$EMBLtoPI~postdoc_to2012_exit_GLEMBL$gender, var.equal = FALSE)[["p.value"]]
f10 <- -log10(stat)
fstat <- round (stat, f10 +3)
fstat[stat<2.2e-16] <- "<2.2e-16"
fstat <- paste("p=", fstat, " ", if (stat<0.05) {"*"},if (stat<0.001) {"*"},if (stat<0.0001) {"*"}, sep="")
fstat <- toString(fstat)

p <- ggboxplot(postdoc_to2012_exit_GLEMBL, x="gender", y="EMBLtoPI", fill='#ffffff', color="grey58")
p <- p + geom_dotplot(aes(postdoc_to2012_exit_GLEMBL$gender, postdoc_to2012_exit_GLEMBL$EMBLtoPI), 
                      binaxis='y', stackdir='center',stackratio=1.0, dotsize=0.25) +
  stat_summary(fun.y = "mean", geom = "point", color = "RED", shape=4, size=10)  + labs(x="Postdoc gender", y = "Calendar years, EMBL to first PI role") + 
  theme_classic(base_size = 16) + geom_hline(yintercept = 9,linetype="dashed")+ scale_y_continuous(breaks=seq(0,9,3))+coord_cartesian(ylim=c(0, 10))+ggtitle(fstat)+theme(plot.title = element_text(size=12))
p

source_temp <- postdoc_to2012_exit_GLEMBL %>%
  dplyr::group_by(gender, EMBLtoPI) %>%
  dplyr::summarise(count = n())
write_csv(source_temp, "Figure3_FigureSupplement_1B_SourceData1.csv")

Same as above but for postdoc alumni for whom we were able to identify a PhD conferral year

postdoc_to2012_exit_GL$gender <- as.factor(postdoc_to2012_exit_GL$gender)
stat <- t.test(postdoc_to2012_exit_GL$PhDtoGL~postdoc_to2012_exit_GL$gender, var.equal = FALSE)[["p.value"]]
f10 <- -log10(stat)
fstat <- round (stat, f10 +3)
fstat[stat<2.2e-16] <- "<2.2e-16"
fstat <- paste("p=", fstat, " ", if (stat<0.05) {"*"},if (stat<0.001) {"*"},if (stat<0.0001) {"*"}, sep="")
fstat <- toString(fstat)
p <- ggboxplot(postdoc_to2012_exit_GL, x="gender", y="PhDtoGL", fill='#ffffff', color="grey58")
p <- p + geom_dotplot(aes(postdoc_to2012_exit_GL$gender, postdoc_to2012_exit_GL$PhDtoGL), 
                      binaxis='y', stackdir='center',stackratio=1.0, dotsize=0.25)+
  stat_summary(fun.y = "mean", geom = "point", color = "RED", shape=4, size=10)  + labs(x="Postdoc gender", y = "Calendar years, PhD to first PI role") + 
   theme_classic(base_size = 16)+geom_hline(yintercept = 9,linetype="dashed")+ scale_y_continuous(breaks=seq(0,9,3))+coord_cartesian(ylim=c(0, 10))+ggtitle(fstat)+theme(plot.title = element_text(size=12))
p

source_temp <- postdoc_to2012_exit_GL %>%
  dplyr::group_by(gender, PhDtoGL) %>%
  dplyr::summarise(count = n())
write_csv(source_temp, "Figure3_FigureSupplement1C_SourceData1.csv")

5 Section 5: Predictors of academic PI positions

5.1 First author publications by PI status, Figure 4A

Histograms of the number of first author EMBL publications per ECR for alumni who have not (yet) become a PI (PI) and those who became a PI.

Plot

mydata <- as.data.frame(tempdata) 
mydata$was_GL = replace_na(mydata$was_GL,"n")

#calculate mean by group
df_mean <- mydata %>% group_by(was_GL) %>% summarise(meant = mean(pubs_FIRST_ra_only_TOTAL))
upper_lim<-quantile(mydata$pubs_FIRST_ra_only_TOTAL, 0.975)
first_author_stats<-tempdata
first_author_stats$pubs_FIRST_ra_only_TOTAL[first_author_stats$pubs_FIRST_ra_only_TOTAL> quantile(first_author_stats$pubs_FIRST_ra_only_TOTAL, 0.975, na.rm=TRUE)] <-paste( ">",upper_lim)
first_author_stats<- first_author_stats %>% group_by(was_GL, pubs_FIRST_ra_only_TOTAL) %>% summarize(n = n())
write.csv(first_author_stats,"Figure 4A - Source Data 1.csv" )
p <- ggplot(mydata, aes(x=pubs_FIRST_ra_only_TOTAL)) + 
  geom_histogram(binwidth=1, colour="black", fill="grey80") + facet_grid(was_GL ~ .) +
  theme(strip.text.y = element_text(angle = 0)) + 
  labs(x = "# first-author research articles", y="number of alumni") +
  theme_bw(base_size = 16)

p<- p +  geom_vline(data=filter(df_mean, was_GL=="n"), 
                    aes(xintercept=meant), colour="red",linetype="dashed") + 
  geom_vline(data=filter(df_mean, was_GL=="y"), aes(xintercept=meant), colour="red",linetype="dashed")

p <- p +  geom_text(data=filter(df_mean, was_GL=="n"), aes(label = round(meant,1), x = meant, y = 400),  
                    size=6,colour="red", angle=-90, vjust = -1) +  
  geom_text(data=filter(df_mean, was_GL=="y"), aes(label = round(meant,1), x = meant, y = 400),
            size=6,colour="red", angle=-90, vjust = -1) + xlim(-0.5,upper_lim+0.5)

#do t.test and extract p-value
stat <- t.test(mydata$pubs_FIRST_ra_only_TOTAL~mydata$was_GL, var.equal = FALSE)[["p.value"]]
f10 <- -log10(stat)
fstat <- round (stat, f10 +3)
fstat[stat<2.2e-16] <- "<2.2e-16"
fstat <- paste("p=", fstat, " ",  if (stat<0.05) {"*"},if (stat<0.001) {"*"},if (stat<0.0001) {"*"}, sep="")
fstat <- toString(fstat)

#plot
p <- p + ggtitle(fstat)+theme(plot.title = element_text(size=12), axis.text=element_text(size=12))
p

Summary statistics

summary_stats <-group_by(mydata, was_GL) %>%
  dplyr::summarise(
    count = n(),
    mean = mean(pubs_FIRST_ra_only_TOTAL, na.rm = TRUE),
    sd = sd(pubs_FIRST_ra_only_TOTAL, na.rm = TRUE)
  )

5.2 CNCI by PI status, Figure 4B

Histograms of the natural logarithm of the highest category normalized citation of first author EMBL publications per ECR publishing at least one first author paper linked to EMBL and with a CNCI value in Clarivate InCites’ database) for alumni who have not (yet) become a PI (PI) and those who became a PI.

df_mean <- mydata %>% group_by(was_GL) %>% summarise(meant = mean(pubs_FIRST_RA_CNCI_max, na.rm=TRUE))
df_mean

## # A tibble: 2 × 2
##   was_GL meant
##   <chr>  <dbl>
## 1 n       3.10
## 2 y       5.67

p <- ggplot(mydata, aes(x=log(pubs_FIRST_RA_CNCI_max))) + 
  geom_histogram(binwidth=1, colour="black", fill="grey80") +facet_grid(was_GL ~ .) +
  theme(strip.text.y = element_text(angle = 0)) + 
  labs(x = "log(max CNCI, first-author research articles)", y="number of alumni")+
  theme_bw(base_size = 16)

p<- p +  
  geom_vline(data=filter(df_mean, was_GL=="n"), aes(xintercept=log(meant)), colour="red",linetype="dashed") + 
  geom_vline(data=filter(df_mean, was_GL=="y"), aes(xintercept=log(meant)), colour="red",linetype="dashed")

p<- p +  geom_vline(aes(xintercept=log(1)), colour="black") 
p<- p +  geom_text(data=filter(df_mean, was_GL=="n"), aes(label = paste0("log(",round(meant,1),")"), x = log(meant), y = 250),  
                  size=6,colour="red", angle=-90, vjust = -1) +  
  geom_text(data=filter(df_mean, was_GL=="y"), aes(label = paste0("log(",round(meant,1),")"), x = log(meant), y = 250),
            size=6,colour="red", angle=-90, vjust = -1) 


#do t.test and extract p-value
stat<-t.test(mydata$pubs_FIRST_RA_CNCI_max~mydata$was_GL, var.equal = FALSE)[["p.value"]]
f10 <- -log10(stat)
fstat <- round (stat, f10 +3)
fstat[stat<2.2e-16] <- "<2.2e-16"
fstat <- paste("p=", fstat, " ", if (stat<0.05) {"*"},if (stat<0.001) {"*"},if (stat<0.0001) {"*"}, sep="")
fstat <- toString(fstat)

#plot
p<-p + ggtitle(fstat)+theme(plot.title = element_text(size=12), axis.text=element_text(size=12))
p

#extract source data
histogram_rawdata <- ggplot_build(p)
write.csv(histogram_rawdata$data[[1]],"Figure 4B - Source Data 1.csv")