Contents

1 Preprocessing

colData(dxdCLL)$DDX3X <- patMeta[match(dxdCLL$patID,patMeta$Patient.ID),]$DDX3X
colData(dxdCLL)$batch <- as.factor(colData(dxdCLL)$batch)

I didn’t include the two samples with intronic mutations on DDX3X, since including them led to worse testing results.

Get samples without any known mutations

dxdSub <- dxdCLL[,!is.na(dxdCLL$DDX3X)]
dxdSub$condition <- as.factor(dxdSub$DDX3X)

geneSum <- dplyr::select(patMeta, Patient.ID,del10p:ZNF292) %>% 
  filter(Patient.ID %in% dxdSub$patID) %>% gather(-Patient.ID, key = "gene", value = "status") %>%
  mutate(status = as.integer(status)) %>%
  group_by(Patient.ID) %>% summarise(nTotal = sum(!is.na(status)),nMut = sum(status, na.rm = TRUE)) %>%
  arrange(desc(nTotal)) %>% filter(nMut ==0)



dxdSub <- dxdSub[,dxdSub$DDX3X  == 1 | dxdSub$patID %in% geneSum$Patient.ID]

Because differential exon usage estimation is very time demanding, I subsetted the wild type samples and only use the samples with out any known mutations as the control group.

Filter counts

keep <- rowSums(counts(dxdSub,normalized = TRUE)) > 100
dxdSub <- dxdSub[keep,]

sds <- genefilter::rowSds(counts(dxdSub, normalized=TRUE))
dxdSub <- dxdSub[sds > genefilter::shorth(sds),]

2 Check the effect of DDX3X mutation on the splicing of DDX3X itself

2.1 Samples with extronic mutations

id <- "ENSG00000215301"
dxdOnly <- dxdSub[rowData(dxdSub)$groupID == id,]
dxdOnly$condition <- droplevels(dxdOnly$condition)
dxdOnly$sample <- droplevels(dxdOnly$sample)

formulaFullModel <- ~ sample + exon + condition:exon
formulaReducedModel <- ~ sample + exon 
dxdOnly <- estimateDispersions(dxdOnly, formula = formulaFullModel)
dxdOnly <- testForDEU(dxdOnly, reducedModel = formulaReducedModel,
                  fullModel = formulaFullModel)
dxrSub <- DEXSeqResults(dxdOnly)
plotDEXSeq(dxrSub, "ENSG00000215301", displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)

No significant differential exon usage can be identified.

List of exons with p value < 0.05

filter(data.frame(dxrSub), pvalue <= 0.05 )
##           groupID featureID exonBaseMean   dispersion     stat      pvalue
## 1 ENSG00000215301      E006     1053.317 0.0535367850 6.084576 0.013636710
## 2 ENSG00000215301      E022     6357.577 0.0003588887 7.176584 0.007386121
##        padj genomicData.seqnames genomicData.start genomicData.end
## 1 0.1568222                    X          41196944        41197762
## 2 0.1568222                    X          41206893        41207289
##   genomicData.width genomicData.strand countData.9ELC countData.CY4B
## 1               819                  +           2320            592
## 2               397                  +           8311           4225
##   countData.PFDT countData.RMU2 countData.U1TH countData.WHTN countData.A7KR
## 1            833           1490           1364           1862            797
## 2           5149           7148           6652          13338           9363
##   countData.TIW9 countData.1NTHJK countData.3NS8ER countData.4RBL4B
## 1           1206             1716             1904             2524
## 2          10295            13763             9778            11532
##   countData.APD8 countData.MT4S countData.P1WSWP countData.L8HC49
## 1           1121            987             1005              949
## 2           5091          11339             3889             3348
##    transcripts
## 1 ENST0000....
## 2 ENST0000....

Those two exons didn’t pass 10% FDR control.

2.2 Sample with intronic mutations

2.2.1 P0070

Process

#add intronic mutation
dxdSub <- dxdCLL[,!is.na(dxdCLL$DDX3X)]
dxdSub$intron <- 0
colData(dxdSub)$intron[dxdSub$patID %in% c("P0070")] <- 1
dxdSub$condition <- as.factor(dxdSub$intron)

dxdSub <- dxdSub[,dxdSub$intron  == 1 | dxdSub$patID %in% geneSum$Patient.ID]
id <- "ENSG00000215301"
dxdOnly <- dxdSub[rowData(dxdSub)$groupID == id,]
dxdOnly$condition <- droplevels(dxdOnly$condition)
dxdOnly$sample <- droplevels(dxdOnly$sample)

formulaFullModel <- ~ sample + exon + condition:exon
formulaReducedModel <- ~ sample + exon 
dxdOnly <- estimateDispersions(dxdOnly, formula = formulaFullModel)
dxdOnly <- testForDEU(dxdOnly, reducedModel = formulaReducedModel,
                  fullModel = formulaFullModel)
dxrSub <- DEXSeqResults(dxdOnly)
plotDEXSeq(dxrSub, "ENSG00000215301", displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)

List of exons with p value < 0.05

filter(data.frame(dxrSub), pvalue <= 0.05 )
##           groupID featureID exonBaseMean dispersion      stat       pvalue
## 1 ENSG00000215301      E003     1173.949 0.03169048  4.610297 3.178053e-02
## 2 ENSG00000215301      E005     1453.107 0.03537576  4.562301 3.268313e-02
## 3 ENSG00000215301      E012     1848.620 0.00855084 17.321763 3.155523e-05
##           padj genomicData.seqnames genomicData.start genomicData.end
## 1 0.2505706351                    X          41193427        41193550
## 2 0.2505706351                    X          41196661        41196718
## 3 0.0007257702                    X          41202990        41203075
##   genomicData.width genomicData.strand countData.0O1N countData.9ELC
## 1               124                  +            207           1490
## 2                58                  +            249           1780
## 3                86                  +            327           2413
##   countData.CY4B countData.U1TH countData.WHTN countData.A7KR countData.TIW9
## 1            374           1149           1968           1320           2156
## 2            441           1421           2426           1612           2625
## 3            977           1866           3435           2217           2966
##   countData.1NTHJK countData.3NS8ER countData.APD8 countData.MT4S
## 1             2719             2384           1066           2171
## 2             3527             3088           1259           2754
## 3             4203             3279           1697           2957
##    transcripts
## 1 ENST0000....
## 2 ENST0000....
## 3 ENST0000....

The intronic mutation in P0070 is located at 41203078 (A -> T), right after E012.

2.2.2 P0681

Process

#add intronic mutation
dxdSub <- dxdCLL[,!is.na(dxdCLL$DDX3X) | dxdCLL$patID == "P0681"]
dxdSub$intron <- 0
colData(dxdSub)$intron[dxdSub$patID %in% c("P0681")] <- 1
dxdSub$condition <- as.factor(dxdSub$intron)

dxdSub <- dxdSub[,dxdSub$intron  == 1 | dxdSub$patID %in% geneSum$Patient.ID]
id <- "ENSG00000215301"
dxdOnly <- dxdSub[rowData(dxdSub)$groupID == id,]
dxdOnly$condition <- droplevels(dxdOnly$condition)
dxdOnly$sample <- droplevels(dxdOnly$sample)

formulaFullModel <- ~ sample + exon + condition:exon
formulaReducedModel <- ~ sample + exon 
dxdOnly <- estimateDispersions(dxdOnly, formula = formulaFullModel)
dxdOnly <- testForDEU(dxdOnly, reducedModel = formulaReducedModel,
                  fullModel = formulaFullModel)
dxrSub <- DEXSeqResults(dxdOnly)
plotDEXSeq(dxrSub, "ENSG00000215301", displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)

List of exons with p value < 0.05

filter(data.frame(dxrSub), pvalue <= 0.05 )
##           groupID featureID exonBaseMean   dispersion      stat       pvalue
## 1 ENSG00000215301      E001     52.19497 0.0166880963  6.166810 1.301691e-02
## 2 ENSG00000215301      E015   3326.20801 0.0018123571  6.951426 8.375243e-03
## 3 ENSG00000215301      E016   3818.81359 0.0020682883  5.436902 1.971561e-02
## 4 ENSG00000215301      E017   4244.10121 0.0025061966  5.884011 1.527898e-02
## 5 ENSG00000215301      E022   6896.38787 0.0002558147 18.710633 1.521318e-05
##           padj genomicData.seqnames genomicData.start genomicData.end
## 1 0.0878541373                    X          41192651        41193414
## 2 0.0878541373                    X          41204433        41204577
## 3 0.0906918008                    X          41204657        41204801
## 4 0.0878541373                    X          41205482        41205663
## 5 0.0003499031                    X          41206893        41207289
##   genomicData.width genomicData.strand countData.9ELC countData.CY4B
## 1               764                  +             69             29
## 2               145                  +           4182           1809
## 3               145                  +           4734           2118
## 4               182                  +           5097           2391
## 5               397                  +           8311           4225
##   countData.U1TH countData.WHTN countData.A7KR countData.TIW9
## 1             72             84             61             81
## 2           3283           6700           4074           5182
## 3           3702           7474           4643           5873
## 4           4079           7925           5335           6335
## 5           6652          13338           9363          10295
##   countData.1NTHJK countData.3NS8ER countData.APD8 countData.MT4S
## 1               88               95             50            102
## 2             6406             5092           2467           5341
## 3             7545             5994           2969           6258
## 4             8826             6846           3320           6982
## 5            13763             9778           5091          11339
##   countData.82KWXX  transcripts
## 1               16 ENST0000....
## 2             2410 ENST0000....
## 3             2748 ENST0000....
## 4             3121 ENST0000....
## 5             4743 ENST0000....

The intronic mutation in P0681 is located at 41194068 (G->C), between E004 and E005.

3 Test the effect of DDX3X mutation on splicing at whole transcriptomic level

Processing

dxdSub <- dxdCLL[,!is.na(dxdCLL$DDX3X)]
dxdSub$condition <- as.factor(dxdSub$DDX3X)
dxdSub <- dxdSub[,dxdSub$DDX3X  == 1 | dxdSub$patID %in% geneSum$Patient.ID]

Annotate sample

dxdSub$condition <- droplevels(dxdSub$condition)
dxdSub$sample <- droplevels(dxdSub$sample)

Perform differential exon usage test by DEXSeq

formulaFullModel <- ~ sample + exon + condition:exon
formulaReducedModel <- ~ sample + exon 
dxdSub <- estimateDispersions(dxdSub, formula = formulaFullModel)
dxdSub <- testForDEU(dxdSub, reducedModel = formulaReducedModel,
                  fullModel = formulaFullModel)

#save results
#save(dxdSub, file = "dxdSub_inclIntronic.RData")

Load saved data

load("dxdSub.RData")
dxrSub <- DEXSeqResults(dxdSub)

P value histogram

hist(dxrSub$pvalue, breaks = 20)

The signal is weak.

A table shown differentially used exons (10% FDR )

dxrSub %>% data.frame() %>%
  rownames_to_column("id") %>% arrange(pvalue) %>%
  filter(padj < 0.1) %>%
  mutate_if(is.numeric, format, digits =3) %>%
  mutate(geneID = rowData(dxdSub)[match(id, rownames(dxdSub)),]$groupID,
         symbol = rowData(dxdSub)[match(id, rownames(dxdSub)),]$symbol) %>%
  select(geneID, symbol, featureID, stat, pvalue, padj, genomicData.start, genomicData.end) %>%
  DT::datatable()

Get significant results (10% FDR)

dxrSub.sig <- dxrSub %>% data.frame() %>%
  rownames_to_column("id") %>% arrange(pvalue) %>%
  filter( padj < 0.1) %>%
  mutate(geneID = rowData(dxdSub)[match(id, rownames(dxdSub)),]$groupID,
         symbol = rowData(dxdSub)[match(id, rownames(dxdSub)),]$symbol)

Gene names and number of exons affected

sort(table(dxrSub.sig$symbol), decreasing = TRUE)
## 
##     RHOH    KDM2B TNFRSF14   WASHC3   ARPC1B   COL9A2    CYHR1     EMP3 
##        5        4        3        3        2        2        2        2 
##    FOSL2     MSI2  PDE4DIP RABGAP1L    RIC8A   SH3D21     AAMP    ACOT9 
##        2        2        2        2        2        2        1        1 
##      AK6 ARHGAP24     BCL6     BRD2   CACNB2   CAPN12     CCNC    CCND3 
##        1        1        1        1        1        1        1        1 
##    CCZ1B      CD6   CLASP2   CLEC2D      CNP    DIDO1     ETV3   FNDC3A 
##        1        1        1        1        1        1        1        1 
##     HHEX    IFI16     ITPA   KDELR2     KLF7    LAIR1    LENG8     LGMN 
##        1        1        1        1        1        1        1        1 
##   METTL4   MFSD2A    MYBL1    MYL6B  NDUFAB1  NDUFAF6    P2RX5    PIAS1 
##        1        1        1        1        1        1        1        1 
##     PISD    PLIN2   POLR2E   PTP4A1 RASGEF1B     RELA    RSRC2     RTF2 
##        1        1        1        1        1        1        1        1 
##    SEPT9   SLC7A5    TAPT1 TBC1D10C     TFEB   WASH7P     YBEY  ZCCHC10 
##        1        1        1        1        1        1        1        1 
##   ZDHHC4   ZFAND6   ZNF292   ZNF644   ZNF777 
##        1        1        1        1        1

Visualising exon usage of RHOH

id <- unique(filter(dxrSub.sig, symbol == "RHOH")$geneID)
plotDEXSeq(dxrSub, id, displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)

Visualising exon usage of KDM2B

id <- unique(filter(dxrSub.sig, symbol == "KDM2B")$geneID)
plotDEXSeq(dxrSub, id, displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)

Visualising exon usage of BCL6

id <- unique(filter(dxrSub.sig, symbol == "BCL6")$geneID)
plotDEXSeq(dxrSub, id, displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)

Visualising exon usage of CCNC

id <- unique(filter(dxrSub.sig, symbol == "CCNC")$geneID)
plotDEXSeq(dxrSub, id, displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)

Visualising exon usage of CCND3

id <- unique(filter(dxrSub.sig, symbol == "CCND3")$geneID)
plotDEXSeq(dxrSub, id, displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)

Visualising exon usage of HHEX

id <- unique(filter(dxrSub.sig, symbol == "HHEX")$geneID)
plotDEXSeq(dxrSub, id, displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)

Visualising exon usage of IFI16

id <- unique(filter(dxrSub.sig, symbol == "IFI16")$geneID)
plotDEXSeq(dxrSub, id, displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)

Visualising exon usage of MYBL1

id <- unique(filter(dxrSub.sig, symbol == "MYBL1")$geneID)
plotDEXSeq(dxrSub, id, displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)

Visualising exon usage of RELA

id <- unique(filter(dxrSub.sig, symbol == "RELA")$geneID)
plotDEXSeq(dxrSub, id, displayTranscripts = TRUE, legend = TRUE, norCounts = TRUE)