Libraries
library(DESeq2)
library(DEXSeq)
library(limma)
library(pheatmap)
library(ggbeeswarm)
library(ggrepel)
library(tidyverse)
#set the global ggplot theme
theme_set(theme_bw() + theme(axis.text = element_text(size=12),
axis.title = element_text(size=14),
plot.title = element_text(size = 15, hjust =0.5, face="bold")))
Datasets
load("../../var/ddsrna_180717.RData")
load("../../var/patmeta_191004.RData")
#subset samples
dds <- dds[,dds$diag == "CLL"]
colData(dds)$DDX3X <- patMeta[match(dds$PatID,patMeta$Patient.ID),]$DDX3X
dds <- dds[,!is.na(dds$DDX3X)]
dds <- estimateSizeFactors(dds)
ddxID <- "ENSG00000215301"
plotTab <- tibble(expr = counts(dds,normalized = TRUE)[ddxID,],
status = dds$DDX3X)
ggplot(plotTab, aes(x=status, y=log10(expr))) + geom_boxplot() + geom_point()
There are too many samples with lower DDX3X expression than DDX3X mutated samples.
Other mutations may also affect DDX3X expression.
geneSum <- dplyr::select(patMeta, Patient.ID,del10p:ZNF292) %>%
filter(Patient.ID %in% dds$PatID) %>% gather(-Patient.ID, key = "gene", value = "status") %>%
mutate(status = as.integer(status)) %>%
group_by(Patient.ID) %>% summarise(nTotal = sum(!is.na(status)),nMut = sum(status, na.rm = TRUE)) %>%
arrange(desc(nTotal)) %>% filter(nMut ==0)
## Warning: attributes are not identical across measure variables;
## they will be dropped
ddsSub <- dds[,dds$DDX3X == 1 | dds$PatID %in% geneSum$Patient.ID]
DDX3X expression
plotTab <- tibble(expr = counts(ddsSub,normalized = TRUE)[ddxID,],
status = ddsSub$DDX3X,
patID = ddsSub$PatID)
ggplot(plotTab, aes(x=status, y=log10(expr), label = patID)) + geom_boxplot() + geom_point() + ggrepel::geom_text_repel()
Change the status of the three WT samples with low DDX3X expression to mutated samples
sampleList <- c("P0995","P0716","P0515")
ddsSub[,sampleList]$DDX3X <- 1
plotTab <- tibble(expr = counts(ddsSub,normalized = TRUE)[ddxID,],
status = ddsSub$DDX3X,
patID = ddsSub$PatID)
ggplot(plotTab, aes(x=status, y=log10(expr), label = patID)) + geom_boxplot() + geom_point() + ggrepel::geom_text_repel()
Filter and VST transformation
#remove zero count genes
ddsSub <- ddsSub[rowSums(counts(ddsSub)) > 0,]
#only consider protein coding genes
ddsSub <- ddsSub[rowData(ddsSub)$biotype %in% c("protein_coding"),]
# vst and voom transformation
dds.vst <- varianceStabilizingTransformation(ddsSub)
dds.voom <- dds.vst
assay(dds.voom) <- voom(counts(ddsSub),lib.size = sizeFactors(ddsSub))$E
#how many genes and samples left?
dim(dds.vst)
## [1] 18433 14
Test using DESeq2
design(ddsSub) <- ~ DDX3X
DEres <- results(DESeq(ddsSub), tidy=TRUE) %>%
mutate(symbol = rowData(ddsSub[row,])$symbol)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 392 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
P-value histogram
hist(DEres$pvalue, col = "lightblue", breaks = 50)
Looks OK.
List of all differentially expressed genes (10% FDR cut-off, adj < 0.1)
sigDE <- DEres %>% select(symbol, row, log2FoldChange, pvalue, padj) %>%
filter(padj <= 0.1) %>% arrange(pvalue)
sigDE %>%
mutate_if(is.numeric, formatC, digits=2, format= "e") %>%
DT::datatable()
Positive log2FoldChange indicates that the gene is up-regulated in DDX3X mutated samples and vice versa.
Plot of the top 9 most differentially expressed genes
plotTab <- assay(dds.vst)[sigDE$row[1:10],] %>% data.frame() %>%
rownames_to_column("id") %>% gather(key = "sample", value = "expr", -id) %>%
mutate(symbol = rowData(dds[id,])$symbol, DDX3X = colData(ddsSub[,sample])$DDX3X) %>%
mutate(DDX3X =ifelse(DDX3X==1, "MUT","WT"))
ggplot(plotTab, aes(x=DDX3X, y = expr)) + geom_boxplot(aes(fill = DDX3X)) + ggbeeswarm::geom_beeswarm() +
facet_wrap(~symbol, scales = "free", ncol=3) + theme(legend.position = "none") +
ylab("Normalized expression")
Run Camera enrichment
exprMat <- assay(dds.voom)
designMat <- model.matrix(~ 1 + dds.voom$DDX3X)
enrichRes <- runCamera(exprMat, designMat, gmts$H, id = rowData(dds.voom)$symbol,
method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_",
plotTitle = "Hallmarks (5% FDR)")
## Registered S3 method overwritten by 'sets':
## method from
## print.element ggplot2
enrichRes$enrichPlot
EMT pathway is up-regulated in samples with DDX3X mutation.
Run Camera enrichment
enrichRes <- runCamera(exprMat, designMat, gmts$KEGG, id = rowData(dds.voom)$symbol,
method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "KEGG_",
plotTitle = "KEGG (5% FDR)")
enrichRes$enrichPlot
Run Camera enrichment
enrichRes <- runCamera(exprMat, designMat, gmts$C6, id = rowData(dds.voom)$symbol,
method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_",
plotTitle = "C6 (5% FDR)")
enrichRes$enrichPlot
geneList <- piano::loadGSC(gmts$H)$gsc$HALLMARK_INFLAMMATORY_RESPONSE
deInflam <- filter(sigDE, symbol %in% geneList)
exprMat <- assay(dds.vst[deInflam$row,])
colAnno <- data.frame(row.names = dds.vst$PatID,
IGHV = patMeta[match(dds.vst$PatID, patMeta$Patient.ID),]$IGHV.status,
trisomy12 = patMeta[match(dds.vst$PatID,patMeta$Patient.ID),]$trisomy12,
DDX3X = dds.vst$DDX3X)
plotMat <- t(scale(t(exprMat)))
plotMat[plotMat >= 4] <- 4
plotMat[plotMat <= -4] <- -4
pheatmap(plotMat, color = colorRampPalette(c("navy","white","firebrick"))(100),
cluster_cols = TRUE, cluster_rows = TRUE,
annotation_col = colAnno, labels_row = deInflam$symbol,
show_colnames = FALSE, fontsize_row = 8, breaks = seq(-5,5, length.out = 101), treeheight_row = 0,
border_color = NA, clustering_method = "ward.D2")
pcRes <- prcomp(t(exprMat), center= TRUE, scale = FALSE)
plotTab <- pcRes$x %>% data.frame() %>% rownames_to_column("patID") %>%
left_join(rownames_to_column(colAnno, "patID"))
## Joining, by = "patID"
ggplot(plotTab, aes(x=PC1, y=PC2, col = DDX3X, shape = IGHV)) +
geom_point(size=3) + ggrepel::geom_text_repel(aes(label = patID))
The samples with DDX3X mutations do form a cluster here. But the caveat is that those genes are selected based on differential expression.
geneList <- piano::loadGSC(gmts$H)$gsc$HALLMARK_INFLAMMATORY_RESPONSE
exprMat <- assay(dds.vst[rowData(dds.vst)$symbol %in% geneList,])
colAnno <- data.frame(row.names = dds.vst$PatID,
IGHV = patMeta[match(dds.vst$PatID, patMeta$Patient.ID),]$IGHV.status,
trisomy12 = patMeta[match(dds.vst$PatID,patMeta$Patient.ID),]$trisomy12,
DDX3X = dds.vst$DDX3X)
plotMat <- t(scale(t(exprMat)))
plotMat[plotMat >= 4] <- 4
plotMat[plotMat <= -4] <- -4
pheatmap(plotMat, color = colorRampPalette(c("navy","white","firebrick"))(100),
cluster_cols = TRUE, cluster_rows = TRUE,
annotation_col = colAnno, labels_row = rowData(dds.vst[rownames(exprMat),])$symbol,
show_colnames = FALSE, fontsize_row = 8, breaks = seq(-5,5, length.out = 101), treeheight_row = 0,
border_color = NA, clustering_method = "ward.D2")
pcRes <- prcomp(t(exprMat), center= TRUE, scale = FALSE)
plotTab <- pcRes$x %>% data.frame() %>% rownames_to_column("patID") %>%
left_join(rownames_to_column(colAnno, "patID"))
## Joining, by = "patID"
ggplot(plotTab, aes(x=PC1, y=PC2, col = DDX3X, shape = IGHV)) +
geom_point(size=3) + ggrepel::geom_text_repel(aes(label = patID))
Using all the genes from HALLMARK_INFLAMMATORY_RESPONSE, the DDX3X mutated samples still cluster together, but less obvious.
geneList <- piano::loadGSC(gmts$H)$gsc$HALLMARK_TNFA_SIGNALING_VIA_NFKB
deInflam <- filter(sigDE, symbol %in% geneList)
exprMat <- assay(dds.vst[deInflam$row,])
colAnno <- data.frame(row.names = dds.vst$PatID,
IGHV = patMeta[match(dds.vst$PatID, patMeta$Patient.ID),]$IGHV.status,
trisomy12 = patMeta[match(dds.vst$PatID,patMeta$Patient.ID),]$trisomy12,
DDX3X = dds.vst$DDX3X)
plotMat <- t(scale(t(exprMat)))
plotMat[plotMat >= 4] <- 4
plotMat[plotMat <= -4] <- -4
pheatmap(plotMat, color = colorRampPalette(c("navy","white","firebrick"))(100),
cluster_cols = TRUE, cluster_rows = TRUE,
annotation_col = colAnno, labels_row = deInflam$symbol,
show_colnames = FALSE, fontsize_row = 8, breaks = seq(-5,5, length.out = 101), treeheight_row = 0,
border_color = NA, clustering_method = "ward.D2")
pcRes <- prcomp(t(exprMat), center= TRUE, scale = FALSE)
plotTab <- pcRes$x %>% data.frame() %>% rownames_to_column("patID") %>%
left_join(rownames_to_column(colAnno, "patID"))
## Joining, by = "patID"
ggplot(plotTab, aes(x=PC1, y=PC2, col = DDX3X, shape = IGHV)) +
geom_point(size=3) + ggrepel::geom_text_repel(aes(label = patID))
geneList <- piano::loadGSC(gmts$H)$gsc$HALLMARK_TNFA_SIGNALING_VIA_NFKB
exprMat <- assay(dds.vst[rowData(dds.vst)$symbol %in% geneList,])
colAnno <- data.frame(row.names = dds.vst$PatID,
IGHV = patMeta[match(dds.vst$PatID, patMeta$Patient.ID),]$IGHV.status,
trisomy12 = patMeta[match(dds.vst$PatID,patMeta$Patient.ID),]$trisomy12,
DDX3X = dds.vst$DDX3X)
plotMat <- t(scale(t(exprMat)))
plotMat[plotMat >= 4] <- 4
plotMat[plotMat <= -4] <- -4
pheatmap(plotMat, color = colorRampPalette(c("navy","white","firebrick"))(100),
cluster_cols = TRUE, cluster_rows = TRUE,
annotation_col = colAnno, labels_row = rowData(dds.vst[rownames(exprMat),])$symbol,
show_colnames = FALSE, fontsize_row = 8, breaks = seq(-5,5, length.out = 101), treeheight_row = 0,
border_color = NA, clustering_method = "ward.D2")
pcRes <- prcomp(t(exprMat), center= TRUE, scale = FALSE)
plotTab <- pcRes$x %>% data.frame() %>% rownames_to_column("patID") %>%
left_join(rownames_to_column(colAnno, "patID"))
## Joining, by = "patID"
ggplot(plotTab, aes(x=PC1, y=PC2, col = DDX3X, shape = IGHV)) +
geom_point(size=3) + ggrepel::geom_text_repel(aes(label = patID))
Read IGHV sequencing data
ighvTab <- read_csv("../../ShinyApps/tumorbank/tables/ighv_gene_analysis_190318.csv")
Annotate DDX3X status
ighvTab <- mutate(ighvTab,DDX3X = patMeta[match(patientID,patMeta$Patient.ID),]$DDX3X) %>%
filter(!is.na(DDX3X), status == "U")
Define samples without frequent driver mutations as WT samples
geneSum <- dplyr::select(patMeta, Patient.ID, DDX3X, TP53, ATM, NOTCH1, SF3B1, trisomy12, del17p, del13q) %>%
filter(Patient.ID %in% dds$PatID) %>% gather(-Patient.ID, key = "gene", value = "status") %>%
mutate(status = as.integer(status)) %>%
group_by(Patient.ID) %>% summarise(nTotal = sum(!is.na(status)),nMut = sum(status, na.rm = TRUE)) %>%
arrange(desc(nTotal)) %>% filter(nMut ==0)
usefulSamples <- c(filter(ighvTab, DDX3X == 1)$patientID, geneSum$Patient.ID)
ighvTab <- filter(ighvTab, patientID %in% usefulSamples)
nrow(ighvTab)
## [1] 10
IGHV information for all the samples
select(ighvTab, patientID, DDX3X, status, Vgroup, Dgroup, Dframe, Jgroup, CDR3seq, CDR3len, seq_homology) %>%
DT::datatable()
dds$IGHV <- patMeta[match(dds$PatID, patMeta$Patient.ID),]$IGHV.status
ddsU <- dds[,dds$IGHV %in% "U"]
#how many DDX3X mutated samples?
table(ddsU$DDX3X)
##
## 0 1
## 87 4
plotTab <- tibble(expr = counts(ddsU,normalized = TRUE)[ddxID,],
status = ddsU$DDX3X,
patID = ddsU$PatID)
ggplot(plotTab, aes(x=status, y=log10(expr))) + geom_boxplot() + geom_point()
Remove on potential outlier from the WT group
outlierID <- arrange(plotTab, expr)$patID[1]
ddsU <- ddsU[,ddsU$PatID != outlierID]
Filter and VST transformation
#remove zero count genes
ddsU <- ddsU[rowSums(counts(ddsU)) > 0,]
#only consider protein coding genes
ddsU <- ddsU[rowData(ddsU)$biotype %in% c("protein_coding"),]
# vst and voom transformation
ddsU.vst <- varianceStabilizingTransformation(ddsU)
ddsU.voom <- ddsU.vst
assay(ddsU.voom) <- voom(counts(ddsU),lib.size = sizeFactors(ddsU))$E
#how many genes and samples left?
dim(ddsU.voom)
## [1] 18856 90
Annotate for major mutations
patAnno <- patMeta[match(ddsU$PatID,patMeta$Patient.ID),c("TP53","NOTCH1","trisomy12","SF3B1","del17p","del13q")] %>%
mutate_all(replace_na, 0) %>% data.frame()
rownames(patAnno) <- ddsU$PatID
colData(ddsU) <- cbind(colData(ddsU),patAnno)
Test using DESeq2
design(ddsU) <- ~ DDX3X
DEres <- results(DESeq(ddsU), tidy=TRUE, contrast = c("DDX3X", 1,0)) %>%
mutate(symbol = rowData(ddsU[row,])$symbol)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 871 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
P-value histogram
hist(DEres$pvalue, col = "lightblue", breaks = 50)
Looks OK.
List of all differentially expressed genes (10% FDR cut-off, adj < 0.1)
sigDE <- DEres %>% select(symbol, row, log2FoldChange, pvalue, padj) %>%
filter(padj <= 0.1) %>% arrange(pvalue)
sigDE %>%
mutate_if(is.numeric, formatC, digits=2, format= "e") %>%
DT::datatable()
Positive log2FoldChange indicates that the gene is up-regulated in DDX3X mutated samples and vice versa.
Plot of the top 9 most differentially expressed genes
plotTab <- assay(ddsU.vst)[sigDE$row[1:10],] %>% data.frame() %>%
rownames_to_column("id") %>% gather(key = "sample", value = "expr", -id) %>%
mutate(symbol = rowData(dds[id,])$symbol, DDX3X = colData(ddsU.vst[,sample])$DDX3X) %>%
mutate(DDX3X =ifelse(DDX3X==1, "MUT","WT"))
ggplot(plotTab, aes(x=DDX3X, y = expr)) + geom_boxplot(aes(fill = DDX3X)) + ggbeeswarm::geom_beeswarm() +
facet_wrap(~symbol, scales = "free", ncol=3) + theme(legend.position = "none") +
ylab("Normalized expression")
Run Camera enrichment
exprMat <- assay(ddsU.vst)
designMat <- model.matrix(~ 1 + ddsU.vst$DDX3X)
enrichRes <- runCamera(exprMat, designMat, gmts$H, id = rowData(dds.voom)$symbol,
method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_",
plotTitle = "Hallmarks (5% FDR)")
## [1] "No sets passed the criteria"
enrichRes$enrichPlot
## NULL
EMT pathway is up-regulated in samples with DDX3X mutation.
Run Camera enrichment
enrichRes <- runCamera(exprMat, designMat, gmts$KEGG, id = rowData(dds.voom)$symbol,
method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "KEGG_",
plotTitle = "KEGG (5% FDR)")
## [1] "No sets passed the criteria"
enrichRes$enrichPlot
## NULL
Run Camera enrichment
enrichRes <- runCamera(exprMat, designMat, gmts$C6, id = rowData(dds.voom)$symbol,
method = "camera", pCut = 0.05, ifFDR = TRUE, removePrefix = "HALLMARK_",
plotTitle = "C6 (5% FDR)")
## [1] "No sets passed the criteria"
enrichRes$enrichPlot
## NULL
geneList <- piano::loadGSC(gmts$H)$gsc$HALLMARK_INFLAMMATORY_RESPONSE
exprMat <- assay(ddsU.vst[rowData(ddsU.vst)$symbol %in% geneList,])
colAnno <- data.frame(row.names = ddsU.vst$PatID,
IGHV = patMeta[match(ddsU.vst$PatID, patMeta$Patient.ID),]$IGHV.status,
trisomy12 = patMeta[match(ddsU.vst$PatID,patMeta$Patient.ID),]$trisomy12,
DDX3X = ddsU.vst$DDX3X)
plotMat <- t(scale(t(exprMat)))
plotMat[plotMat >= 4] <- 4
plotMat[plotMat <= -4] <- -4
pheatmap(plotMat, color = colorRampPalette(c("navy","white","firebrick"))(100),
cluster_cols = TRUE, cluster_rows = TRUE,
annotation_col = colAnno, labels_row = rowData(dds.vst[rownames(exprMat),])$symbol,
show_colnames = FALSE, fontsize_row = 8, breaks = seq(-5,5, length.out = 101), treeheight_row = 0,
border_color = NA, clustering_method = "ward.D2")
pcRes <- prcomp(t(exprMat), center= TRUE, scale = FALSE)
plotTab <- pcRes$x %>% data.frame() %>% rownames_to_column("patID") %>%
left_join(rownames_to_column(colAnno, "patID"))
## Joining, by = "patID"
ggplot(plotTab, aes(x=PC1, y=PC2, col = DDX3X, shape = IGHV)) +
geom_point(size=3) + ggrepel::geom_text_repel(aes(label = patID))
Using all the genes from HALLMARK_INFLAMMATORY_RESPONSE, the DDX3X mutated samples still cluster together, but less obvious.
geneList <- piano::loadGSC(gmts$H)$gsc$HALLMARK_TNFA_SIGNALING_VIA_NFKB
exprMat <- assay(ddsU.vst[rowData(ddsU.vst)$symbol %in% geneList,])
colAnno <- data.frame(row.names = ddsU.vst$PatID,
IGHV = patMeta[match(ddsU.vst$PatID, patMeta$Patient.ID),]$IGHV.status,
trisomy12 = patMeta[match(ddsU.vst$PatID,patMeta$Patient.ID),]$trisomy12,
DDX3X = ddsU.vst$DDX3X)
plotMat <- t(scale(t(exprMat)))
plotMat[plotMat >= 4] <- 4
plotMat[plotMat <= -4] <- -4
pheatmap(plotMat, color = colorRampPalette(c("navy","white","firebrick"))(100),
cluster_cols = TRUE, cluster_rows = TRUE,
annotation_col = colAnno, labels_row = rowData(dds.vst[rownames(exprMat),])$symbol,
show_colnames = FALSE, fontsize_row = 8, breaks = seq(-5,5, length.out = 101), treeheight_row = 0,
border_color = NA, clustering_method = "ward.D2")
pcRes <- prcomp(t(exprMat), center= TRUE, scale = FALSE)
plotTab <- pcRes$x %>% data.frame() %>% rownames_to_column("patID") %>%
left_join(rownames_to_column(colAnno, "patID"))
## Joining, by = "patID"
ggplot(plotTab, aes(x=PC1, y=PC2, col = DDX3X, shape = IGHV)) +
geom_point(size=3) + ggrepel::geom_text_repel(aes(label = patID))