Contents

1 Load libraries and datasets

Libraries

library(DESeq2)
library(DEXSeq)
library(limma)
library(pheatmap)
library(ggbeeswarm)
library(ggrepel)
library(tidyverse)

#set the global ggplot theme
theme_set(theme_bw() + theme(axis.text = element_text(size=12), 
                             axis.title = element_text(size=14),
                             plot.title = element_text(size = 15, hjust =0.5, face="bold")))

Datasets

load("../../var/ddsrna_180717.RData")
load("../../var/patmeta_191004.RData")

2 Gene expression analysis

2.1 Preprocessing

#subset samples
dds <- dds[,dds$diag == "CLL"]

colData(dds)$DDX3X <- patMeta[match(dds$PatID,patMeta$Patient.ID),]$DDX3X
dds <- dds[,!is.na(dds$DDX3X)]
dds <- estimateSizeFactors(dds)
ddxID <- "ENSG00000215301"

2.2 DDX3X expression

plotTab <- tibble(expr = counts(dds,normalized = TRUE)[ddxID,],
                  status = dds$DDX3X)
ggplot(plotTab, aes(x=status, y=log10(expr))) + geom_boxplot() + geom_point()

There are too many samples with lower DDX3X expression than DDX3X mutated samples.
Other mutations may also affect DDX3X expression.

2.3 Subset WT sample for samples without any mutations

geneSum <- dplyr::select(patMeta, Patient.ID,del10p:ZNF292) %>% 
  filter(Patient.ID %in% dds$PatID) %>% gather(-Patient.ID, key = "gene", value = "status") %>%
  mutate(status = as.integer(status)) %>%
  group_by(Patient.ID) %>% summarise(nTotal = sum(!is.na(status)),nMut = sum(status, na.rm = TRUE)) %>%
  arrange(desc(nTotal)) %>% filter(nMut ==0)
## Warning: attributes are not identical across measure variables;
## they will be dropped
ddsSub <- dds[,dds$DDX3X  == 1 | dds$PatID %in% geneSum$Patient.ID]

DDX3X expression

plotTab <- tibble(expr = counts(ddsSub,normalized = TRUE)[ddxID,],
                  status = ddsSub$DDX3X,
                  patID = ddsSub$PatID)
ggplot(plotTab, aes(x=status, y=log10(expr), label = patID)) + geom_boxplot() + geom_point() + ggrepel::geom_text_repel()

Change the status of the three WT samples with low DDX3X expression to mutated samples

sampleList <- c("P0995","P0716","P0515")
ddsSub[,sampleList]$DDX3X <- 1

plotTab <- tibble(expr = counts(ddsSub,normalized = TRUE)[ddxID,],
                  status = ddsSub$DDX3X,
                  patID = ddsSub$PatID)
ggplot(plotTab, aes(x=status, y=log10(expr), label = patID)) + geom_boxplot() + geom_point() + ggrepel::geom_text_repel()

Filter and VST transformation

#remove zero count genes
ddsSub <- ddsSub[rowSums(counts(ddsSub)) > 0,]

#only consider protein coding genes
ddsSub <- ddsSub[rowData(ddsSub)$biotype %in% c("protein_coding"),]

# vst and voom transformation
dds.vst <- varianceStabilizingTransformation(ddsSub)
dds.voom <- dds.vst
assay(dds.voom) <- voom(counts(ddsSub),lib.size = sizeFactors(ddsSub))$E

#how many genes and samples left?
dim(dds.vst)
## [1] 18433    14

2.4 Differential expression

Test using DESeq2

design(ddsSub) <- ~ DDX3X
DEres <- results(DESeq(ddsSub), tidy=TRUE) %>%
  mutate(symbol = rowData(ddsSub[row,])$symbol)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 392 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing

P-value histogram

hist(DEres$pvalue, col = "lightblue", breaks = 50)

Looks OK.

List of all differentially expressed genes (10% FDR cut-off, adj < 0.1)

sigDE <- DEres %>% select(symbol, row, log2FoldChange, pvalue, padj) %>%
  filter(padj <= 0.1) %>% arrange(pvalue) 

sigDE %>%
  mutate_if(is.numeric, formatC, digits=2, format= "e") %>%
  DT::datatable()

Positive log2FoldChange indicates that the gene is up-regulated in DDX3X mutated samples and vice versa.

Plot of the top 9 most differentially expressed genes

plotTab <- assay(dds.vst)[sigDE$row[1:10],] %>% data.frame() %>%
  rownames_to_column("id") %>% gather(key = "sample", value = "expr", -id) %>%
  mutate(symbol = rowData(dds[id,])$symbol, DDX3X = colData(ddsSub[,sample])$DDX3X) %>%
  mutate(DDX3X =ifelse(DDX3X==1, "MUT","WT"))

ggplot(plotTab, aes(x=DDX3X, y = expr)) + geom_boxplot(aes(fill = DDX3X)) + ggbeeswarm::geom_beeswarm() +
  facet_wrap(~symbol, scales = "free", ncol=3) + theme(legend.position = "none") +
  ylab("Normalized expression")

2.5 Pathway enrichment

2.5.1 Enrichment using Hallmark genesets

Run Camera enrichment

exprMat <- assay(dds.voom)
designMat <- model.matrix(~ 1 + dds.voom$DDX3X)

enrichRes <- runCamera(exprMat, designMat, gmts$H, id = rowData(dds.voom)$symbol,
                       method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_",
                       plotTitle = "Hallmarks (5% FDR)")
## Registered S3 method overwritten by 'sets':
##   method        from   
##   print.element ggplot2
enrichRes$enrichPlot

EMT pathway is up-regulated in samples with DDX3X mutation.

2.5.2 Enrichment using KEGG genesets

Run Camera enrichment

enrichRes <- runCamera(exprMat, designMat, gmts$KEGG, id = rowData(dds.voom)$symbol,
                       method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "KEGG_",
                       plotTitle = "KEGG (5% FDR)")
enrichRes$enrichPlot

2.5.3 Enrichment using C6 (oncogenic) genesets

Run Camera enrichment

enrichRes <- runCamera(exprMat, designMat, gmts$C6, id = rowData(dds.voom)$symbol,
                       method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_",
                       plotTitle = "C6 (5% FDR)")
enrichRes$enrichPlot

3 Clustering using signature genes

3.1 DE genes in HALLMARK_INFLAMMATORY_RESPONSE

geneList <- piano::loadGSC(gmts$H)$gsc$HALLMARK_INFLAMMATORY_RESPONSE
deInflam <- filter(sigDE, symbol %in% geneList)
exprMat <- assay(dds.vst[deInflam$row,])

3.1.1 Heatmap (Hierarchical clustering)

colAnno <- data.frame(row.names = dds.vst$PatID,
                      IGHV = patMeta[match(dds.vst$PatID, patMeta$Patient.ID),]$IGHV.status,
                      trisomy12 = patMeta[match(dds.vst$PatID,patMeta$Patient.ID),]$trisomy12,
                       DDX3X = dds.vst$DDX3X)

plotMat <- t(scale(t(exprMat)))
plotMat[plotMat >= 4] <- 4
plotMat[plotMat <= -4] <- -4
    
pheatmap(plotMat, color = colorRampPalette(c("navy","white","firebrick"))(100),
         cluster_cols = TRUE, cluster_rows = TRUE,
         annotation_col = colAnno, labels_row = deInflam$symbol,
         show_colnames = FALSE, fontsize_row = 8, breaks = seq(-5,5, length.out = 101), treeheight_row = 0,
         border_color = NA, clustering_method = "ward.D2")

3.1.2 PCA

pcRes <- prcomp(t(exprMat), center= TRUE, scale = FALSE)
plotTab <- pcRes$x %>% data.frame() %>% rownames_to_column("patID") %>%
  left_join(rownames_to_column(colAnno, "patID"))
## Joining, by = "patID"
ggplot(plotTab, aes(x=PC1, y=PC2, col = DDX3X, shape = IGHV)) + 
  geom_point(size=3) + ggrepel::geom_text_repel(aes(label = patID))

The samples with DDX3X mutations do form a cluster here. But the caveat is that those genes are selected based on differential expression.

3.2 All genes in HALLMARK_INFLAMMATORY_RESPONSE

geneList <- piano::loadGSC(gmts$H)$gsc$HALLMARK_INFLAMMATORY_RESPONSE
exprMat <- assay(dds.vst[rowData(dds.vst)$symbol %in% geneList,])

3.2.1 Heatmap (Hierarchical clustering)

colAnno <- data.frame(row.names = dds.vst$PatID,
                      IGHV = patMeta[match(dds.vst$PatID, patMeta$Patient.ID),]$IGHV.status,
                      trisomy12 = patMeta[match(dds.vst$PatID,patMeta$Patient.ID),]$trisomy12,
                       DDX3X = dds.vst$DDX3X)

plotMat <- t(scale(t(exprMat)))
plotMat[plotMat >= 4] <- 4
plotMat[plotMat <= -4] <- -4
    
pheatmap(plotMat, color = colorRampPalette(c("navy","white","firebrick"))(100),
         cluster_cols = TRUE, cluster_rows = TRUE,
         annotation_col = colAnno, labels_row = rowData(dds.vst[rownames(exprMat),])$symbol,
         show_colnames = FALSE, fontsize_row = 8, breaks = seq(-5,5, length.out = 101), treeheight_row = 0,
         border_color = NA, clustering_method = "ward.D2")

3.2.2 PCA

pcRes <- prcomp(t(exprMat), center= TRUE, scale = FALSE)
plotTab <- pcRes$x %>% data.frame() %>% rownames_to_column("patID") %>%
  left_join(rownames_to_column(colAnno, "patID"))
## Joining, by = "patID"
ggplot(plotTab, aes(x=PC1, y=PC2, col = DDX3X, shape = IGHV)) + 
  geom_point(size=3) + ggrepel::geom_text_repel(aes(label = patID))

Using all the genes from HALLMARK_INFLAMMATORY_RESPONSE, the DDX3X mutated samples still cluster together, but less obvious.

3.3 DE genes in HALLMARK_TNFA_SIGNALING_VIA_NFKB

geneList <- piano::loadGSC(gmts$H)$gsc$HALLMARK_TNFA_SIGNALING_VIA_NFKB
deInflam <- filter(sigDE, symbol %in% geneList)
exprMat <- assay(dds.vst[deInflam$row,])

3.3.1 Heatmap (Hierarchical clustering)

colAnno <- data.frame(row.names = dds.vst$PatID,
                      IGHV = patMeta[match(dds.vst$PatID, patMeta$Patient.ID),]$IGHV.status,
                      trisomy12 = patMeta[match(dds.vst$PatID,patMeta$Patient.ID),]$trisomy12,
                       DDX3X = dds.vst$DDX3X)

plotMat <- t(scale(t(exprMat)))
plotMat[plotMat >= 4] <- 4
plotMat[plotMat <= -4] <- -4
    
pheatmap(plotMat, color = colorRampPalette(c("navy","white","firebrick"))(100),
         cluster_cols = TRUE, cluster_rows = TRUE,
         annotation_col = colAnno, labels_row = deInflam$symbol,
         show_colnames = FALSE, fontsize_row = 8, breaks = seq(-5,5, length.out = 101), treeheight_row = 0,
         border_color = NA, clustering_method = "ward.D2")

3.3.2 PCA

pcRes <- prcomp(t(exprMat), center= TRUE, scale = FALSE)
plotTab <- pcRes$x %>% data.frame() %>% rownames_to_column("patID") %>%
  left_join(rownames_to_column(colAnno, "patID"))
## Joining, by = "patID"
ggplot(plotTab, aes(x=PC1, y=PC2, col = DDX3X, shape = IGHV)) + 
  geom_point(size=3) + ggrepel::geom_text_repel(aes(label = patID))

3.4 All genes in HALLMARK_TNFA_SIGNALING_VIA_NFKB

geneList <- piano::loadGSC(gmts$H)$gsc$HALLMARK_TNFA_SIGNALING_VIA_NFKB
exprMat <- assay(dds.vst[rowData(dds.vst)$symbol %in% geneList,])

3.4.1 Heatmap (Hierarchical clustering)

colAnno <- data.frame(row.names = dds.vst$PatID,
                      IGHV = patMeta[match(dds.vst$PatID, patMeta$Patient.ID),]$IGHV.status,
                      trisomy12 = patMeta[match(dds.vst$PatID,patMeta$Patient.ID),]$trisomy12,
                       DDX3X = dds.vst$DDX3X)

plotMat <- t(scale(t(exprMat)))
plotMat[plotMat >= 4] <- 4
plotMat[plotMat <= -4] <- -4
    
pheatmap(plotMat, color = colorRampPalette(c("navy","white","firebrick"))(100),
         cluster_cols = TRUE, cluster_rows = TRUE,
         annotation_col = colAnno, labels_row = rowData(dds.vst[rownames(exprMat),])$symbol,
         show_colnames = FALSE, fontsize_row = 8, breaks = seq(-5,5, length.out = 101), treeheight_row = 0,
         border_color = NA, clustering_method = "ward.D2")

3.4.2 PCA

pcRes <- prcomp(t(exprMat), center= TRUE, scale = FALSE)
plotTab <- pcRes$x %>% data.frame() %>% rownames_to_column("patID") %>%
  left_join(rownames_to_column(colAnno, "patID"))
## Joining, by = "patID"
ggplot(plotTab, aes(x=PC1, y=PC2, col = DDX3X, shape = IGHV)) + 
  geom_point(size=3) + ggrepel::geom_text_repel(aes(label = patID))

4 IGHV gene usage analysis

Read IGHV sequencing data

ighvTab <- read_csv("../../ShinyApps/tumorbank/tables/ighv_gene_analysis_190318.csv")

Annotate DDX3X status

ighvTab <- mutate(ighvTab,DDX3X = patMeta[match(patientID,patMeta$Patient.ID),]$DDX3X) %>%
  filter(!is.na(DDX3X), status == "U")

Define samples without frequent driver mutations as WT samples

geneSum <- dplyr::select(patMeta, Patient.ID, DDX3X, TP53, ATM, NOTCH1, SF3B1, trisomy12, del17p, del13q) %>% 
  filter(Patient.ID %in% dds$PatID) %>% gather(-Patient.ID, key = "gene", value = "status") %>%
  mutate(status = as.integer(status)) %>%
  group_by(Patient.ID) %>% summarise(nTotal = sum(!is.na(status)),nMut = sum(status, na.rm = TRUE)) %>%
  arrange(desc(nTotal)) %>% filter(nMut ==0)

usefulSamples <- c(filter(ighvTab, DDX3X == 1)$patientID, geneSum$Patient.ID)
ighvTab <- filter(ighvTab, patientID %in% usefulSamples)
nrow(ighvTab)
## [1] 10

IGHV information for all the samples

select(ighvTab, patientID, DDX3X, status, Vgroup, Dgroup, Dframe, Jgroup, CDR3seq, CDR3len, seq_homology) %>%
  DT::datatable()

5 Expression analysis in U-CLL samples only

5.1 Preprocessing

dds$IGHV <- patMeta[match(dds$PatID, patMeta$Patient.ID),]$IGHV.status
ddsU <- dds[,dds$IGHV %in% "U"]

#how many DDX3X mutated samples?
table(ddsU$DDX3X)
## 
##  0  1 
## 87  4

5.2 DDX3X expression

plotTab <- tibble(expr = counts(ddsU,normalized = TRUE)[ddxID,],
                  status = ddsU$DDX3X,
                  patID = ddsU$PatID)
ggplot(plotTab, aes(x=status, y=log10(expr))) + geom_boxplot() + geom_point()

Remove on potential outlier from the WT group

outlierID <- arrange(plotTab, expr)$patID[1]
ddsU <- ddsU[,ddsU$PatID != outlierID]

Filter and VST transformation

#remove zero count genes
ddsU <- ddsU[rowSums(counts(ddsU)) > 0,]

#only consider protein coding genes
ddsU <- ddsU[rowData(ddsU)$biotype %in% c("protein_coding"),]

# vst and voom transformation
ddsU.vst <- varianceStabilizingTransformation(ddsU)
ddsU.voom <- ddsU.vst
assay(ddsU.voom) <- voom(counts(ddsU),lib.size = sizeFactors(ddsU))$E

#how many genes and samples left?
dim(ddsU.voom)
## [1] 18856    90

5.3 Differential expression

Annotate for major mutations

patAnno <- patMeta[match(ddsU$PatID,patMeta$Patient.ID),c("TP53","NOTCH1","trisomy12","SF3B1","del17p","del13q")] %>%
  mutate_all(replace_na, 0) %>% data.frame()
rownames(patAnno) <- ddsU$PatID
colData(ddsU) <- cbind(colData(ddsU),patAnno)

Test using DESeq2

design(ddsU) <- ~ DDX3X
DEres <- results(DESeq(ddsU), tidy=TRUE, contrast = c("DDX3X", 1,0)) %>%
  mutate(symbol = rowData(ddsU[row,])$symbol)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 871 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing

P-value histogram

hist(DEres$pvalue, col = "lightblue", breaks = 50)

Looks OK.

List of all differentially expressed genes (10% FDR cut-off, adj < 0.1)

sigDE <- DEres %>% select(symbol, row, log2FoldChange, pvalue, padj) %>%
  filter(padj <= 0.1) %>% arrange(pvalue) 

sigDE %>%
  mutate_if(is.numeric, formatC, digits=2, format= "e") %>%
  DT::datatable()

Positive log2FoldChange indicates that the gene is up-regulated in DDX3X mutated samples and vice versa.

Plot of the top 9 most differentially expressed genes

plotTab <- assay(ddsU.vst)[sigDE$row[1:10],] %>% data.frame() %>%
  rownames_to_column("id") %>% gather(key = "sample", value = "expr", -id) %>%
  mutate(symbol = rowData(dds[id,])$symbol, DDX3X = colData(ddsU.vst[,sample])$DDX3X) %>%
  mutate(DDX3X =ifelse(DDX3X==1, "MUT","WT"))

ggplot(plotTab, aes(x=DDX3X, y = expr)) + geom_boxplot(aes(fill = DDX3X)) + ggbeeswarm::geom_beeswarm() +
  facet_wrap(~symbol, scales = "free", ncol=3) + theme(legend.position = "none") +
  ylab("Normalized expression")

5.4 Pathway enrichment

5.4.1 Enrichment using Hallmark genesets

Run Camera enrichment

exprMat <- assay(ddsU.vst)
designMat <- model.matrix(~ 1 + ddsU.vst$DDX3X)

enrichRes <- runCamera(exprMat, designMat, gmts$H, id = rowData(dds.voom)$symbol,
                       method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_",
                       plotTitle = "Hallmarks (5% FDR)")
## [1] "No sets passed the criteria"
enrichRes$enrichPlot
## NULL

EMT pathway is up-regulated in samples with DDX3X mutation.

5.4.2 Enrichment using KEGG genesets

Run Camera enrichment

enrichRes <- runCamera(exprMat, designMat, gmts$KEGG, id = rowData(dds.voom)$symbol,
                       method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "KEGG_",
                       plotTitle = "KEGG (5% FDR)")
## [1] "No sets passed the criteria"
enrichRes$enrichPlot
## NULL

5.4.3 Enrichment using C6 (oncogenic) genesets

Run Camera enrichment

enrichRes <- runCamera(exprMat, designMat, gmts$C6, id = rowData(dds.voom)$symbol,
                       method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_",
                       plotTitle = "C6 (5% FDR)")
## [1] "No sets passed the criteria"
enrichRes$enrichPlot
## NULL

6 Clustering using signature genes

6.1 All genes in HALLMARK_INFLAMMATORY_RESPONSE

geneList <- piano::loadGSC(gmts$H)$gsc$HALLMARK_INFLAMMATORY_RESPONSE
exprMat <- assay(ddsU.vst[rowData(ddsU.vst)$symbol %in% geneList,])

6.1.1 Heatmap (Hierarchical clustering)

colAnno <- data.frame(row.names = ddsU.vst$PatID,
                      IGHV = patMeta[match(ddsU.vst$PatID, patMeta$Patient.ID),]$IGHV.status,
                      trisomy12 = patMeta[match(ddsU.vst$PatID,patMeta$Patient.ID),]$trisomy12,
                      DDX3X = ddsU.vst$DDX3X)

plotMat <- t(scale(t(exprMat)))
plotMat[plotMat >= 4] <- 4
plotMat[plotMat <= -4] <- -4
    
pheatmap(plotMat, color = colorRampPalette(c("navy","white","firebrick"))(100),
         cluster_cols = TRUE, cluster_rows = TRUE,
         annotation_col = colAnno, labels_row = rowData(dds.vst[rownames(exprMat),])$symbol,
         show_colnames = FALSE, fontsize_row = 8, breaks = seq(-5,5, length.out = 101), treeheight_row = 0,
         border_color = NA, clustering_method = "ward.D2")

6.1.2 PCA

pcRes <- prcomp(t(exprMat), center= TRUE, scale = FALSE)
plotTab <- pcRes$x %>% data.frame() %>% rownames_to_column("patID") %>%
  left_join(rownames_to_column(colAnno, "patID"))
## Joining, by = "patID"
ggplot(plotTab, aes(x=PC1, y=PC2, col = DDX3X, shape = IGHV)) + 
  geom_point(size=3) + ggrepel::geom_text_repel(aes(label = patID))

Using all the genes from HALLMARK_INFLAMMATORY_RESPONSE, the DDX3X mutated samples still cluster together, but less obvious.

6.2 All genes in HALLMARK_TNFA_SIGNALING_VIA_NFKB

geneList <- piano::loadGSC(gmts$H)$gsc$HALLMARK_TNFA_SIGNALING_VIA_NFKB
exprMat <- assay(ddsU.vst[rowData(ddsU.vst)$symbol %in% geneList,])

6.2.1 Heatmap (Hierarchical clustering)

colAnno <- data.frame(row.names = ddsU.vst$PatID,
                      IGHV = patMeta[match(ddsU.vst$PatID, patMeta$Patient.ID),]$IGHV.status,
                      trisomy12 = patMeta[match(ddsU.vst$PatID,patMeta$Patient.ID),]$trisomy12,
                       DDX3X = ddsU.vst$DDX3X)

plotMat <- t(scale(t(exprMat)))
plotMat[plotMat >= 4] <- 4
plotMat[plotMat <= -4] <- -4
    
pheatmap(plotMat, color = colorRampPalette(c("navy","white","firebrick"))(100),
         cluster_cols = TRUE, cluster_rows = TRUE,
         annotation_col = colAnno, labels_row = rowData(dds.vst[rownames(exprMat),])$symbol,
         show_colnames = FALSE, fontsize_row = 8, breaks = seq(-5,5, length.out = 101), treeheight_row = 0,
         border_color = NA, clustering_method = "ward.D2")

6.2.2 PCA

pcRes <- prcomp(t(exprMat), center= TRUE, scale = FALSE)
plotTab <- pcRes$x %>% data.frame() %>% rownames_to_column("patID") %>%
  left_join(rownames_to_column(colAnno, "patID"))
## Joining, by = "patID"
ggplot(plotTab, aes(x=PC1, y=PC2, col = DDX3X, shape = IGHV)) + 
  geom_point(size=3) + ggrepel::geom_text_repel(aes(label = patID))