colData(dxdCLL)$DDX3X <- patMeta[match(dxdCLL$patID,patMeta$Patient.ID),]$DDX3X
colData(dxdCLL)$batch <- as.factor(colData(dxdCLL)$batch)
I didn’t include the two samples with intronic mutations on DDX3X, since including them led to worse testing results.
Get samples without any known mutations
dxdSub <- dxdCLL[,!is.na(dxdCLL$DDX3X)]
dxdSub$condition <- as.factor(dxdSub$DDX3X)
geneSum <- dplyr::select(patMeta, Patient.ID,del10p:ZNF292) %>%
filter(Patient.ID %in% dxdSub$patID) %>% gather(-Patient.ID, key = "gene", value = "status") %>%
mutate(status = as.integer(status)) %>%
group_by(Patient.ID) %>% summarise(nTotal = sum(!is.na(status)),nMut = sum(status, na.rm = TRUE)) %>%
arrange(desc(nTotal)) %>% filter(nMut ==0)
dxdSub <- dxdSub[,dxdSub$DDX3X == 1 | dxdSub$patID %in% geneSum$Patient.ID]
Because differential exon usage estimation is very time demanding, I subsetted the wild type samples and only use the samples with out any known mutations as the control group.
Filter counts
keep <- rowSums(counts(dxdSub,normalized = TRUE)) > 100
dxdSub <- dxdSub[keep,]
sds <- genefilter::rowSds(counts(dxdSub, normalized=TRUE))
dxdSub <- dxdSub[sds > genefilter::shorth(sds),]
id <- "ENSG00000215301"
dxdOnly <- dxdSub[rowData(dxdSub)$groupID == id,]
dxdOnly$condition <- droplevels(dxdOnly$condition)
dxdOnly$sample <- droplevels(dxdOnly$sample)
formulaFullModel <- ~ sample + exon + condition:exon
formulaReducedModel <- ~ sample + exon
dxdOnly <- estimateDispersions(dxdOnly, formula = formulaFullModel)
dxdOnly <- testForDEU(dxdOnly, reducedModel = formulaReducedModel,
fullModel = formulaFullModel)
dxrSub <- DEXSeqResults(dxdOnly)
plotDEXSeq(dxrSub, "ENSG00000215301", displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)
No significant differential exon usage can be identified.
List of exons with p value < 0.05
filter(data.frame(dxrSub), pvalue <= 0.05 )
## groupID featureID exonBaseMean dispersion stat pvalue
## 1 ENSG00000215301 E006 1053.317 0.0535367850 6.084576 0.013636710
## 2 ENSG00000215301 E022 6357.577 0.0003588887 7.176584 0.007386121
## padj genomicData.seqnames genomicData.start genomicData.end
## 1 0.1568222 X 41196944 41197762
## 2 0.1568222 X 41206893 41207289
## genomicData.width genomicData.strand countData.9ELC countData.CY4B
## 1 819 + 2320 592
## 2 397 + 8311 4225
## countData.PFDT countData.RMU2 countData.U1TH countData.WHTN countData.A7KR
## 1 833 1490 1364 1862 797
## 2 5149 7148 6652 13338 9363
## countData.TIW9 countData.1NTHJK countData.3NS8ER countData.4RBL4B
## 1 1206 1716 1904 2524
## 2 10295 13763 9778 11532
## countData.APD8 countData.MT4S countData.P1WSWP countData.L8HC49
## 1 1121 987 1005 949
## 2 5091 11339 3889 3348
## transcripts
## 1 ENST0000....
## 2 ENST0000....
Those two exons didn’t pass 10% FDR control.
Process
#add intronic mutation
dxdSub <- dxdCLL[,!is.na(dxdCLL$DDX3X)]
dxdSub$intron <- 0
colData(dxdSub)$intron[dxdSub$patID %in% c("P0070")] <- 1
dxdSub$condition <- as.factor(dxdSub$intron)
dxdSub <- dxdSub[,dxdSub$intron == 1 | dxdSub$patID %in% geneSum$Patient.ID]
id <- "ENSG00000215301"
dxdOnly <- dxdSub[rowData(dxdSub)$groupID == id,]
dxdOnly$condition <- droplevels(dxdOnly$condition)
dxdOnly$sample <- droplevels(dxdOnly$sample)
formulaFullModel <- ~ sample + exon + condition:exon
formulaReducedModel <- ~ sample + exon
dxdOnly <- estimateDispersions(dxdOnly, formula = formulaFullModel)
dxdOnly <- testForDEU(dxdOnly, reducedModel = formulaReducedModel,
fullModel = formulaFullModel)
dxrSub <- DEXSeqResults(dxdOnly)
plotDEXSeq(dxrSub, "ENSG00000215301", displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)
List of exons with p value < 0.05
filter(data.frame(dxrSub), pvalue <= 0.05 )
## groupID featureID exonBaseMean dispersion stat pvalue
## 1 ENSG00000215301 E003 1173.949 0.03169048 4.610297 3.178053e-02
## 2 ENSG00000215301 E005 1453.107 0.03537576 4.562301 3.268313e-02
## 3 ENSG00000215301 E012 1848.620 0.00855084 17.321763 3.155523e-05
## padj genomicData.seqnames genomicData.start genomicData.end
## 1 0.2505706351 X 41193427 41193550
## 2 0.2505706351 X 41196661 41196718
## 3 0.0007257702 X 41202990 41203075
## genomicData.width genomicData.strand countData.0O1N countData.9ELC
## 1 124 + 207 1490
## 2 58 + 249 1780
## 3 86 + 327 2413
## countData.CY4B countData.U1TH countData.WHTN countData.A7KR countData.TIW9
## 1 374 1149 1968 1320 2156
## 2 441 1421 2426 1612 2625
## 3 977 1866 3435 2217 2966
## countData.1NTHJK countData.3NS8ER countData.APD8 countData.MT4S
## 1 2719 2384 1066 2171
## 2 3527 3088 1259 2754
## 3 4203 3279 1697 2957
## transcripts
## 1 ENST0000....
## 2 ENST0000....
## 3 ENST0000....
The intronic mutation in P0070 is located at 41203078 (A -> T), right after E012.
Process
#add intronic mutation
dxdSub <- dxdCLL[,!is.na(dxdCLL$DDX3X) | dxdCLL$patID == "P0681"]
dxdSub$intron <- 0
colData(dxdSub)$intron[dxdSub$patID %in% c("P0681")] <- 1
dxdSub$condition <- as.factor(dxdSub$intron)
dxdSub <- dxdSub[,dxdSub$intron == 1 | dxdSub$patID %in% geneSum$Patient.ID]
id <- "ENSG00000215301"
dxdOnly <- dxdSub[rowData(dxdSub)$groupID == id,]
dxdOnly$condition <- droplevels(dxdOnly$condition)
dxdOnly$sample <- droplevels(dxdOnly$sample)
formulaFullModel <- ~ sample + exon + condition:exon
formulaReducedModel <- ~ sample + exon
dxdOnly <- estimateDispersions(dxdOnly, formula = formulaFullModel)
dxdOnly <- testForDEU(dxdOnly, reducedModel = formulaReducedModel,
fullModel = formulaFullModel)
dxrSub <- DEXSeqResults(dxdOnly)
plotDEXSeq(dxrSub, "ENSG00000215301", displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)
List of exons with p value < 0.05
filter(data.frame(dxrSub), pvalue <= 0.05 )
## groupID featureID exonBaseMean dispersion stat pvalue
## 1 ENSG00000215301 E001 52.19497 0.0166880963 6.166810 1.301691e-02
## 2 ENSG00000215301 E015 3326.20801 0.0018123571 6.951426 8.375243e-03
## 3 ENSG00000215301 E016 3818.81359 0.0020682883 5.436902 1.971561e-02
## 4 ENSG00000215301 E017 4244.10121 0.0025061966 5.884011 1.527898e-02
## 5 ENSG00000215301 E022 6896.38787 0.0002558147 18.710633 1.521318e-05
## padj genomicData.seqnames genomicData.start genomicData.end
## 1 0.0878541373 X 41192651 41193414
## 2 0.0878541373 X 41204433 41204577
## 3 0.0906918008 X 41204657 41204801
## 4 0.0878541373 X 41205482 41205663
## 5 0.0003499031 X 41206893 41207289
## genomicData.width genomicData.strand countData.9ELC countData.CY4B
## 1 764 + 69 29
## 2 145 + 4182 1809
## 3 145 + 4734 2118
## 4 182 + 5097 2391
## 5 397 + 8311 4225
## countData.U1TH countData.WHTN countData.A7KR countData.TIW9
## 1 72 84 61 81
## 2 3283 6700 4074 5182
## 3 3702 7474 4643 5873
## 4 4079 7925 5335 6335
## 5 6652 13338 9363 10295
## countData.1NTHJK countData.3NS8ER countData.APD8 countData.MT4S
## 1 88 95 50 102
## 2 6406 5092 2467 5341
## 3 7545 5994 2969 6258
## 4 8826 6846 3320 6982
## 5 13763 9778 5091 11339
## countData.82KWXX transcripts
## 1 16 ENST0000....
## 2 2410 ENST0000....
## 3 2748 ENST0000....
## 4 3121 ENST0000....
## 5 4743 ENST0000....
The intronic mutation in P0681 is located at 41194068 (G->C), between E004 and E005.
Processing
dxdSub <- dxdCLL[,!is.na(dxdCLL$DDX3X)]
dxdSub$condition <- as.factor(dxdSub$DDX3X)
dxdSub <- dxdSub[,dxdSub$DDX3X == 1 | dxdSub$patID %in% geneSum$Patient.ID]
Annotate sample
dxdSub$condition <- droplevels(dxdSub$condition)
dxdSub$sample <- droplevels(dxdSub$sample)
Perform differential exon usage test by DEXSeq
formulaFullModel <- ~ sample + exon + condition:exon
formulaReducedModel <- ~ sample + exon
dxdSub <- estimateDispersions(dxdSub, formula = formulaFullModel)
dxdSub <- testForDEU(dxdSub, reducedModel = formulaReducedModel,
fullModel = formulaFullModel)
#save results
#save(dxdSub, file = "dxdSub_inclIntronic.RData")
Load saved data
load("dxdSub.RData")
dxrSub <- DEXSeqResults(dxdSub)
P value histogram
hist(dxrSub$pvalue, breaks = 20)
The signal is weak.
A table shown differentially used exons (10% FDR )
dxrSub %>% data.frame() %>%
rownames_to_column("id") %>% arrange(pvalue) %>%
filter(padj < 0.1) %>%
mutate_if(is.numeric, format, digits =3) %>%
mutate(geneID = rowData(dxdSub)[match(id, rownames(dxdSub)),]$groupID,
symbol = rowData(dxdSub)[match(id, rownames(dxdSub)),]$symbol) %>%
select(geneID, symbol, featureID, stat, pvalue, padj, genomicData.start, genomicData.end) %>%
DT::datatable()
Get significant results (10% FDR)
dxrSub.sig <- dxrSub %>% data.frame() %>%
rownames_to_column("id") %>% arrange(pvalue) %>%
filter( padj < 0.1) %>%
mutate(geneID = rowData(dxdSub)[match(id, rownames(dxdSub)),]$groupID,
symbol = rowData(dxdSub)[match(id, rownames(dxdSub)),]$symbol)
Gene names and number of exons affected
sort(table(dxrSub.sig$symbol), decreasing = TRUE)
##
## RHOH KDM2B TNFRSF14 WASHC3 ARPC1B COL9A2 CYHR1 EMP3
## 5 4 3 3 2 2 2 2
## FOSL2 MSI2 PDE4DIP RABGAP1L RIC8A SH3D21 AAMP ACOT9
## 2 2 2 2 2 2 1 1
## AK6 ARHGAP24 BCL6 BRD2 CACNB2 CAPN12 CCNC CCND3
## 1 1 1 1 1 1 1 1
## CCZ1B CD6 CLASP2 CLEC2D CNP DIDO1 ETV3 FNDC3A
## 1 1 1 1 1 1 1 1
## HHEX IFI16 ITPA KDELR2 KLF7 LAIR1 LENG8 LGMN
## 1 1 1 1 1 1 1 1
## METTL4 MFSD2A MYBL1 MYL6B NDUFAB1 NDUFAF6 P2RX5 PIAS1
## 1 1 1 1 1 1 1 1
## PISD PLIN2 POLR2E PTP4A1 RASGEF1B RELA RSRC2 RTF2
## 1 1 1 1 1 1 1 1
## SEPT9 SLC7A5 TAPT1 TBC1D10C TFEB WASH7P YBEY ZCCHC10
## 1 1 1 1 1 1 1 1
## ZDHHC4 ZFAND6 ZNF292 ZNF644 ZNF777
## 1 1 1 1 1
Visualising exon usage of RHOH
id <- unique(filter(dxrSub.sig, symbol == "RHOH")$geneID)
plotDEXSeq(dxrSub, id, displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)
Visualising exon usage of KDM2B
id <- unique(filter(dxrSub.sig, symbol == "KDM2B")$geneID)
plotDEXSeq(dxrSub, id, displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)
Visualising exon usage of BCL6
id <- unique(filter(dxrSub.sig, symbol == "BCL6")$geneID)
plotDEXSeq(dxrSub, id, displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)
Visualising exon usage of CCNC
id <- unique(filter(dxrSub.sig, symbol == "CCNC")$geneID)
plotDEXSeq(dxrSub, id, displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)
Visualising exon usage of CCND3
id <- unique(filter(dxrSub.sig, symbol == "CCND3")$geneID)
plotDEXSeq(dxrSub, id, displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)
Visualising exon usage of HHEX
id <- unique(filter(dxrSub.sig, symbol == "HHEX")$geneID)
plotDEXSeq(dxrSub, id, displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)
Visualising exon usage of IFI16
id <- unique(filter(dxrSub.sig, symbol == "IFI16")$geneID)
plotDEXSeq(dxrSub, id, displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)
Visualising exon usage of MYBL1
id <- unique(filter(dxrSub.sig, symbol == "MYBL1")$geneID)
plotDEXSeq(dxrSub, id, displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)
Visualising exon usage of RELA
id <- unique(filter(dxrSub.sig, symbol == "RELA")$geneID)
plotDEXSeq(dxrSub, id, displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)