1 Load libraries

Global variables

#set the global ggplot theme
theme_set(theme_bw() + theme(axis.text = element_text(size=12), 
                             axis.title = element_text(size=14),
                             plot.title = element_text(size = 15, hjust =0.5, face="bold")))

2 Prepare datasets

2.1 1000CPS screen data

#only use the filtered data for main analysis
load("../../var/CPS1000_mainAnalysis.RData")

#choose the first sample
viabMat <- arrange(pheno1000_main, screenDate) %>%
  filter(diagnosis == "CLL") %>%
  distinct(patientID, Drug, concIndex, .keep_all = TRUE) %>%
  filter(! Drug %in% c("DMSO","PBS")) %>%
  mutate(id = paste0(Drug,"_",concIndex)) %>%
  select(patientID, id, normVal.adj.sigm) %>%
  spread(key = patientID, value = normVal.adj.sigm) %>%
  data.frame() %>% column_to_rownames("id") %>% 
  as.matrix()

2.2 RNAseq data

# load the most up to date datasets
loadData("ddsrna", "../../var")

## [1] "Loading data ../../var/ddsrna_180717.RData"

#only CLL samples
dds <- dds[,dds$diag %in% "CLL"]
dds <- estimateSizeFactors(dds)

#filter out none protein coding genes
dds<-dds[rowData(dds)$biotype %in% "protein_coding",]
dds <- dds[! rowData(dds)$symbol %in% c("",NA),]

#filter out low count genes
minrs <- 100
rs  <- rowSums(counts(dds, normalized = TRUE))
dds<-dds[ rs >= minrs, ]

#variance stabilize the data
#(includes normalizing for library size and dispsersion estimation) 
dds<-vst(dds)
exprMat <- assay(dds)

#filter out low variable genes
ntop<-5000
sds <- genefilter::rowSds(exprMat)
exprMat<-exprMat[order(sds,decreasing = T)[1:ntop],]


#scale and center RNAseq data
exprMat <- jyluMisc::mscale(exprMat, center = TRUE, scale = TRUE, useMad = FALSE)

2.3 Genetic data (CNV, SNV): remove features with >40% missing data

## [1] "Loading data ../../var/patmeta_181120.RData"

2.4 Methylation array data: data from BlooodCancerMultiOmics2017, top 5000 most variable

3 Assemble a multiAssayExperiment object and MOFA object

Generate object

cpsData <- list(drugs = viabMat, 
                rna = exprMat,
                gene = genData, 
                meth = methData)

# check dimensionalities, samples are columns, features are rows
lapply(cpsData, dim)

## $drugs
## [1] 315 275
## 
## $rna
## [1] 5000  206
## 
## $gene
## [1] 110 275
## 
## $meth
## [1] 5000  196

# Create MultiAssayExperiment object 
mae_cps <- MultiAssayExperiment(
  experiments = cpsData
)

# Build the MOFA object
MOFAobject <- createMOFAobject(mae_cps)

## Creating MOFA object from a MultiAssayExperiment object...

MOFAobject

## Untrained MOFA model with the following characteristics: 
##   Number of views: 4 
##   View names: drugs rna gene meth 
##   Number of features per view: 315 5000 110 5000
##   Number of samples: 349

Overview of data

plotTilesData(MOFAobject)

4 Run MOFA

4.1 Set up parameters

Define data options

DataOptions <- getDefaultDataOptions()
DataOptions$scaleViews <- FALSE

Define model options

ModelOptions <- getDefaultModelOptions(MOFAobject)
ModelOptions$numFactors <- 30
ModelOptions$sparsity <- FALSE
ModelOptions

## $likelihood
##       drugs         rna        gene        meth 
##  "gaussian"  "gaussian" "bernoulli"  "gaussian" 
## 
## $numFactors
## [1] 30
## 
## $sparsity
## [1] FALSE

Define training options

TrainOptions <- getDefaultTrainOptions()

# Automatically drop factors that explain less than 2% of variance in all omics
TrainOptions$DropFactorThreshold <- 0.05

#TrainOptions$seed <- 2018

TrainOptions

## $maxiter
## [1] 5000
## 
## $tolerance
## [1] 0.1
## 
## $DropFactorThreshold
## [1] 0.05
## 
## $verbose
## [1] FALSE
## 
## $seed
## NULL

4.2 Prepare MOFA

MOFAobject <- prepareMOFA(
  MOFAobject, 
  DataOptions = DataOptions,
  ModelOptions = ModelOptions,
  TrainOptions = TrainOptions
)

## Checking data options...

## Checking training options...

## Checking model options...

Save mofa input

save(MOFAobject, file = "mofaIn_noSparsity.RData")

Run mofa and save results

MOFAobject <- runMOFA(MOFAobject)
save(MOFAobject, file = "mofaRes.RData")

Load MOFA model

load("./mofaRes.RData")

5 Analysis of trained MOFA

5.1 Variance explained by MOFA model

# Calculate the variance explained (R2) per factor in each view 
r2 <- calculateVarianceExplained(MOFAobject)
r2$R2Total

##     drugs      gene      meth       rna 
## 0.2921765 0.2260048 0.2770163 0.3658035

# Variance explained by each factor in each view
head(r2$R2PerFactor)

##            drugs         gene         meth        rna
## LF1 1.004250e-01 1.769737e-01 1.867190e-01 0.08157859
## LF2 4.367826e-02 4.899725e-02 6.867120e-03 0.05042249
## LF3 1.186995e-01 7.417882e-06 6.575994e-03 0.01544383
## LF4 2.661160e-02 8.728655e-06 1.681635e-02 0.02560368
## LF5 6.861238e-07 5.402758e-07 1.730650e-04 0.06527149
## LF6 8.287495e-04 1.343789e-06 8.027143e-05 0.06065077

# Plot it
plotVarianceExplained(MOFAobject)

5.2 Meaning of the first 2 factors

Get weights and factors

allWeights <- getWeights(MOFAobject,
                         views = "all",
                         factors = "all",
                         as.data.frame = TRUE) %>% as.tibble()

allFactors <- getFactors(
  MOFAobject, 
  factors = "all",
  as.data.frame = TRUE
) %>% as.tibble()

allFactors <- left_join(allFactors, 
                        select(patMeta, Patient.ID, IGHV.status, trisomy12),
                        by = c(sample = "Patient.ID"))

## Warning: Column `sample`/`Patient.ID` joining factor and character vector,
## coercing into character vector

Plot the separation of the samples by the first two factor

plotTab <- filter(allFactors, factor %in% c("LF1","LF2")) %>%
  spread(key =factor, value = value) %>% mutate(trisomy12 = factor(trisomy12)) %>%
  filter(!is.na(IGHV.status), !is.na(trisomy12))
p <- ggplot(plotTab, aes(x=LF1, y=LF2, color = trisomy12, 
                         shape = IGHV.status, label = sample)) + 
  geom_point() +
  scale_shape_manual(values = c(M = 16, U =1))

p

#ggplotly(p)

5.3 Clinical correlations with factors

Load survival data

load("../../var/survival_CPS1000.RData")
#first sample of each patient
survT <- arrange(survT, sampleID) %>%
  distinct(patientID, .keep_all = TRUE)

function for cox regression

com <- function(response, time, endpoint, scale =FALSE) {  
  
  if (scale) {
    #calculate z-score
    response <- (response - mean(response, na.rm = TRUE))/sd(response, na.rm=TRUE)
  }
  surv <- coxph(Surv(time, endpoint) ~ response) 
  
  
  tibble(p = summary(surv)[[7]][,5], 
         HR = summary(surv)[[7]][,2], 
         lower = summary(surv)[[8]][,3], 
         higher = summary(surv)[[8]][,4])
}

Cox regression test

testTab <- left_join(allFactors, survT, by = c(sample = "patientID"))

#for OS
resOS <- filter(testTab, !is.na(OS)) %>%
  group_by(factor) %>%
  do(com(.$value, .$OS, .$died, TRUE)) %>% ungroup() %>%
  arrange(p) %>% mutate(p.adj = p.adjust(p, method = "BH")) %>%
  mutate(Endpoint = "OS")

resOS

## # A tibble: 10 x 7
##    factor         p    HR lower higher    p.adj Endpoint
##    <fct>      <dbl> <dbl> <dbl>  <dbl>    <dbl> <chr>   
##  1 LF1    0.0000103 2.15  1.53   3.02  0.000103 OS      
##  2 LF7    0.000312  0.601 0.456  0.793 0.00156  OS      
##  3 LF9    0.00409   1.40  1.11   1.76  0.0136   OS      
##  4 LF10   0.0751    1.39  0.967  2.00  0.188    OS      
##  5 LF2    0.175     0.758 0.507  1.13  0.318    OS      
##  6 LF3    0.191     0.802 0.577  1.12  0.318    OS      
##  7 LF4    0.552     0.900 0.637  1.27  0.655    OS      
##  8 LF8    0.586     1.10  0.784  1.54  0.655    OS      
##  9 LF5    0.589     0.909 0.643  1.29  0.655    OS      
## 10 LF6    0.786     0.951 0.661  1.37  0.786    OS

#for TTT
resTTT <- filter(testTab, !is.na(TTT)) %>%
  group_by(factor) %>%
  do(com(.$value, .$TTT, .$treatedAfter, TRUE)) %>% ungroup() %>%
  arrange(p) %>% mutate(p.adj = p.adjust(p, method = "BH")) %>%
  mutate(Endpoint = "TTT")

resTTT

## # A tibble: 10 x 7
##    factor        p    HR lower higher    p.adj Endpoint
##    <fct>     <dbl> <dbl> <dbl>  <dbl>    <dbl> <chr>   
##  1 LF7    7.45e-12 0.424 0.331  0.542 7.45e-11 TTT     
##  2 LF1    1.77e- 8 2.11  1.63   2.73  8.84e- 8 TTT     
##  3 LF4    5.53e- 3 1.33  1.09   1.62  1.84e- 2 TTT     
##  4 LF3    2.69e- 2 0.751 0.583  0.968 6.73e- 2 TTT     
##  5 LF9    3.90e- 2 1.31  1.01   1.69  7.81e- 2 TTT     
##  6 LF8    7.29e- 2 0.784 0.602  1.02  1.22e- 1 TTT     
##  7 LF2    1.05e- 1 0.769 0.560  1.06  1.49e- 1 TTT     
##  8 LF6    1.40e- 1 1.22  0.937  1.58  1.75e- 1 TTT     
##  9 LF5    3.95e- 1 1.13  0.852  1.50  4.38e- 1 TTT     
## 10 LF10   8.34e- 1 1.03  0.768  1.39  8.34e- 1 TTT

Plot p value and hazard ratio

plotTab <- bind_rows(resOS, resTTT) %>%
  filter(factor %in% unique(filter(.,p.adj < 0.05)$factor))


ggplot(plotTab, aes(x=factor, y = HR, col = Endpoint, dodge = Endpoint)) + 
  geom_hline(yintercept = 1, linetype = "dotted") +
  geom_point(position = position_dodge(width=0.8)) +
  geom_errorbar(position = position_dodge(width =0.8), 
                aes(ymin = lower, ymax = higher), width = 0.3) + 
  geom_text(position = position_dodge(width =0.8),
            aes(y = higher + 0.5, label = sprintf("p=%1.2f",p))) +
  ylim(-0.5,4) + 
  coord_flip() + theme_bw()

Function for KM plot

km <- function(response, time, endpoint, titlePlot = "KM plot", pval = NULL, stat = "maxstat") { 
  #function for km plot
  survS <- data.frame(time = time,
                      endpoint = endpoint)
  
  if (stat == "maxstat") {
    ms <- maxstat.test(Surv(time, endpoint)  ~ response, 
                               data = survS,
                               smethod = "LogRank",
                               minprop = 0.2, 
                               maxprop = 0.8, 
                               alpha = NULL)
    
    survS$group <- factor(ifelse(response >= ms$estimate, "high", "low"))
    
  } else if (stat == "median") {
    med <- median(response, na.rm = TRUE)
    survS$group <- factor(ifelse(response >= med, "high", "low"))
  } else if (stat == "binary") {
    survS$group <- factor(response)
  }
  
  if (is.null(pval)) pval = TRUE
  p <- ggsurvplot(survfit(Surv(time, endpoint) ~ group, data = survS), 
                              data = survS, pval = TRUE,  conf.int = FALSE, palette = "Set1",
                              ylab = "Fraction", xlab = "Time (years)", title = titlePlot,
                  ggtheme = theme_bw() + theme(plot.title = element_text(hjust =0.5)))$plot
  
  p
}

KM plot for overall survival

facList <- c("LF1","LF4","LF7","LF9")
plotList <- lapply(facList, function(x) {
  eachTab <- filter(testTab, factor == x) %>%
    select(value, OS, died) %>% filter(!is.na(OS))
  km(eachTab$value, eachTab$OS, eachTab$died, sprintf("%s VS Overall Survival", x), stat = "maxstat")
})

grid.arrange(grobs = plotList, ncol = 2)

KM plot for TTT

facList <- c("LF1","LF4","LF7","LF9")
plotList <- lapply(facList, function(x) {
  eachTab <- filter(testTab, factor == x) %>%
    select(value, TTT, treatedAfter) %>% filter(!is.na(TTT))
  km(eachTab$value, eachTab$TTT, eachTab$treatedAfter, sprintf("%s VS Time to next treatment", x), stat = "maxstat")
})

grid.arrange(grobs = plotList, ncol = 2)

5.4 Biological meaning of factor 4, 7, 9

5.4.1 LF7

plotTopWeights(MOFAobject, "gene", 7)

Function to plot customized MOFA examine heatmap

plotMofaHeatmap <- function(mofaObj, view, factor, features, geneAnno = NULL, censor = c(-5,5)) {
  # get plot features
  weightTab <- getWeights(mofaObj, views = view, factors = factor)
  seleFeatures <- rownames(weightTab[[1]])[order(abs(weightTab[[1]][,1]), decreasing = TRUE)[seq(1,features)]]
  
  #get original data matrix for plot
  plotMat <- getTrainData(
    mofaObj,
    as.data.frame = FALSE, 
    views = view)
  plotMat <- plotMat[[1]][seleFeatures,]
  plotMat <- plotMat[,complete.cases(t(plotMat))]
  
  if (!is.null(censor)) {
    plotMat[plotMat > censor[2]] <- censor[2]
    plotMat[plotMat < censor[1]] <- censor[1]
  }
  
  #get factor annotation
  facAnno <- getFactors(
    mofaObj, 
    factors = factor)
  facAnno <- data.frame(facAnno[colnames(plotMat),,drop=FALSE])
  plotMat <- plotMat[,order(facAnno[,1])]
  
  if (!is.null(geneAnno)) {
    facAnno <- cbind(facAnno, geneAnno[match(rownames(facAnno), rownames(geneAnno)),,drop=FALSE])
  }
  
  pheatmap(plotMat, cluster_cols = FALSE, 
           annotation_col =  facAnno, labels_col = "")
}

Plot heatmap

p53Anno <- data.frame(row.names = patMeta$Patient.ID, 
                      IGHV = patMeta$IGHV.status,
                      TP53 = as.factor(patMeta$TP53),
                      treatment = patMeta$treatment) 

plotMofaHeatmap(MOFAobject, "drugs", 7, 20, geneAnno = p53Anno, censor = c(-4,4))

Gene enrichment analysis

inputTab <- getWeights(MOFAobject, views = "rna", factor = 7, as.data.frame = TRUE) %>%
  mutate(symbol = rowData(dds)[match(feature, rownames(dds)),]$symbol) %>%
  filter(!is.na(symbol)) %>%
  arrange(desc(abs(value))) %>% distinct(symbol, .keep_all = TRUE) %>%
  dplyr::rename(stat = value) %>% select(symbol, stat) %>% 
  data.frame() %>% column_to_rownames("symbol")

enRes <- list()
enRes[["Hallmark"]] <- runGSEA(inputTab, gmtFile = "../../data/commonFiles/h.all.v5.1.symbols_customized.gmt", GSAmethod = "page")

## Loading required package: piano

## Warning: package 'piano' was built under R version 3.5.1

enRes[["KEGG"]] <- runGSEA(inputTab, gmtFile = "../../data/commonFiles/c2.cp.kegg.v5.1.symbols.gmt", GSAmethod = "page")
p <- plotEnrichmentBar(enRes, pCut = 0.01)

## Coordinate system already present. Adding new coordinate system, which will replace the existing one.

## Coordinate system already present. Adding new coordinate system, which will replace the existing one.

plot(p)

5.4.2 LF4

plotTopWeights(MOFAobject, "gene",4)

Plot heatmap

geneAnno <- data.frame(row.names = patMeta$Patient.ID, SF3B1 = as.factor(patMeta$SF3B1),
                       treatment = patMeta$treatment)
plotMofaHeatmap(MOFAobject, "drugs", 4, 20, geneAnno = geneAnno)

Gene enrichment analysis

inputTab <- getWeights(MOFAobject, views = "rna", factor =4, as.data.frame = TRUE) %>%
  mutate(symbol = rowData(dds)[match(feature, rownames(dds)),]$symbol) %>%
  filter(!is.na(symbol)) %>%
  arrange(desc(abs(value))) %>% distinct(symbol, .keep_all = TRUE) %>%
  dplyr::rename(stat = value) %>% select(symbol, stat) %>% 
  data.frame() %>% column_to_rownames("symbol")

enRes <- list()
enRes[["Hallmark"]] <- runGSEA(inputTab, gmtFile = "../../data/commonFiles/h.all.v5.1.symbols_customized.gmt", GSAmethod = "page")
enRes[["KEGG"]] <- runGSEA(inputTab, gmtFile = "../../data/commonFiles/c2.cp.kegg.v5.1.symbols.gmt", GSAmethod = "page")
p <- plotEnrichmentBar(enRes, pCut = 0.01)

## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.

plot(p)

5.4.3 LF9

plotTopWeights(MOFAobject, "gene", 9)

Plot heatmap

geneAnno <- data.frame(row.names = patMeta$Patient.ID, 
                       IGHV = as.factor(patMeta$IGHV.status), 
                       del11q = as.factor(patMeta$del11q),
                       treatment = patMeta$treatment)
plotMofaHeatmap(MOFAobject, "drugs", 9, 20, geneAnno = geneAnno)

Gene enrichment analysis

inputTab <- getWeights(MOFAobject, views = "rna", factor = 9, as.data.frame = TRUE) %>%
  mutate(symbol = rowData(dds)[match(feature, rownames(dds)),]$symbol) %>%
  filter(!is.na(symbol)) %>%
  arrange(desc(abs(value))) %>% distinct(symbol, .keep_all = TRUE) %>%
  dplyr::rename(stat = value) %>% select(symbol, stat) %>% 
  data.frame() %>% column_to_rownames("symbol")

enRes <- list()
enRes[["Hallmark"]] <- runGSEA(inputTab, gmtFile = "../../data/commonFiles/h.all.v5.1.symbols_customized.gmt", GSAmethod = "page")
enRes[["KEGG"]] <- runGSEA(inputTab, gmtFile = "../../data/commonFiles/c2.cp.kegg.v5.1.symbols.gmt", GSAmethod = "page")
p <- plotEnrichmentBar(enRes, pCut = 0.01)

## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.

plot(p)

5.5 Define subgroup using LF1, 4, 7, 9?

clusters <- clusterSamples(MOFAobject,
                           k=4,
                           factors=c(1,4,7,9))

lfMat <- getFactors(MOFAobject, factors = c(1,4,7,9))

colAnno <- data.frame(row.names = names(clusters),
                      clusters = factor(clusters),
                      TP53 = patMeta[match(names(clusters), patMeta$Patient.ID),]$TP53,
                      IGHV = patMeta[match(names(clusters), patMeta$Patient.ID),]$IGHV.status,
                      SF3B1 = patMeta[match(names(clusters), patMeta$Patient.ID),]$SF3B1,
                      trisomy12 = patMeta[match(names(clusters), patMeta$Patient.ID),]$trisomy12)


#using ggplot color for group annotation
gg_color_hue <- function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

#reorder by cluster
plotMat <- t(lfMat)
plotMat <- plotMat[,rownames(colAnno[order(colAnno$clusters),])]

annoColor <- list(clusters = structure(RColorBrewer::brewer.pal(4, "Set1"), names = seq(4)))
pheatmap(plotMat, clustering_method = "complete", annotation_col = colAnno,
         cluster_rows = FALSE, cluster_cols = FALSE,
         show_colnames = FALSE, annotation_colors = annoColor)

Survial analysis using clusters from MOFA

testTab <- survT %>% mutate(cluster = factor(clusters[patientID]))

#OS
com(testTab$cluster, testTab$OS, testTab$died, FALSE)

## # A tibble: 3 x 4
##           p     HR  lower higher
##       <dbl>  <dbl>  <dbl>  <dbl>
## 1 0.0844    0.456  0.187   1.11 
## 2 0.0000844 0.0842 0.0245  0.289
## 3 0.591     1.23   0.579   2.61

#TTT
com(testTab$cluster, testTab$TTT, testTab$treatedAfter, FALSE)

## # A tibble: 3 x 4
##             p    HR  lower higher
##         <dbl> <dbl>  <dbl>  <dbl>
## 1 0.710       0.886 0.469   1.68 
## 2 0.000000435 0.130 0.0592  0.287
## 3 0.218       1.59  0.760   3.32

km(testTab$cluster, testTab$TTT, testTab$treatedAfter, 
   sprintf("%s VS Time to next treatment", "cluster"), stat = "binary")

km(testTab$cluster, testTab$OS, testTab$died, sprintf("%s VS Overall survival", "cluster"), stat = "binary")

Multi-omics Factor Analysis (MOFA) on CPS1000 dataset

8/30/2018 (Updated on 3 December 2018)

1 Load libraries

2 Prepare datasets

2.1 1000CPS screen data

2.2 RNAseq data

2.3 Genetic data (CNV, SNV): remove features with >40% missing data

2.4 Methylation array data: data from BlooodCancerMultiOmics2017, top 5000 most variable

3 Assemble a multiAssayExperiment object and MOFA object

4 Run MOFA

4.1 Set up parameters

4.2 Prepare MOFA

5 Analysis of trained MOFA

5.1 Variance explained by MOFA model

5.2 Meaning of the first 2 factors

5.3 Clinical correlations with factors

5.4 Biological meaning of factor 4, 7, 9

5.4.1 LF7

5.4.2 LF4

5.4.3 LF9

5.5 Define subgroup using LF1, 4, 7, 9?