Last updated: 2020-06-02

Load libraries and dataset


Prepare datasets

Using LUMOS dataset

# Fix the level order of IGHV status so that "U" is the first factor level and
# "M" the second.
protCLL$IGHV.status <- factor(protCLL$IGHV.status, levels = c("U","M"))

Using the protein complex information from database CORUM

# Read the tab-separated table of protein pairs that belong to the same
# complex (first row is the header). The columns ProtA and ProtB are used
# further below. TRUE is spelled out because T can be reassigned.
int_pairs <- read.table("../data/proteins_in_complexes", sep = "\t",
                        stringsAsFactors = FALSE, header = TRUE)

The patients with unmutated IGHV status are defined as reference.

The goal of this analysis is to see whether IGHV status affects the protein complex landscape. No gene dosage effect is involved here.

Differential protein expression analysis

Detect protein abundance changes related to IGHV

# Fit one proDA model per protein on the "count" assay, with IGHV status and
# trisomy12 as covariates.
exprMat <- assays(protCLL)[["count"]]
designMat <- data.frame(
  row.names = colnames(protCLL),
  IGHV = protCLL$IGHV.status,
  trisomy12 = protCLL$trisomy12
)
fit <- proDA(exprMat, design = ~ ., col_data = designMat)

# Test the "IGHVM" coefficient, rename the result columns, attach the gene
# symbol and sort by p-value.
corRes <- fit %>%
  test_diff("IGHVM") %>%
  dplyr::rename(
    id = name,
    logFC = diff,
    t = t_statistic,
    P.Value = pval,
    adj.P.Val = adj_pval
  ) %>%
  mutate(name = rowData(protCLL[id, ])$hgnc_symbol) %>%
  select(name, id, logFC, t, P.Value, adj.P.Val) %>%
  arrange(P.Value) %>%
  as_tibble()

# Proteins with adjusted p-value below 0.05, labelled by the sign of t.
corRes.sig <- corRes %>%
  filter(adj.P.Val < 0.05) %>%
  mutate(direction = ifelse(t > 0, "Up", "Down"))

Detect differential complex formations based on the algorithm from Marija Buljan

Run the AlteredPQR algorithm to detect changes in protein ratios within complexes

# Load the script that provides Altered_PQR().
source("../code/AlteredPQR.R")

# NOTE(review): Altered_PQR() receives no data arguments, so it presumably
# reads `quant_data_all` and `cols_with_reference_data` from the calling
# environment - confirm in AlteredPQR.R before renaming either object.
quant_data_all <- assays(protCLL)[["QRILC"]]
# Column indices of the samples with IGHV status "U".
cols_with_reference_data <- which(protCLL$IGHV.status %in% "U")
RepresentativePairs <- Altered_PQR(
  modif_z_score_threshold = 3.0,
  fraction_of_samples_threshold = 0.3
)
[1] "Running"
[1] "..."
[1] "..."
[1] "Top 0.1, 1 and 5% upper and lower z-score values are: 9.20327216455267 4.39550324313987 2.35629365075977 and -7.21186423614476 -3.74156821383189 -2.07263543774116."
[1] "Top 1% of the absolute values for the modified z-scores is 5.13854615164948."

Re-format output

# Turn the Altered_PQR() result into one table: split the pair identifier into
# its two row identifiers, look up symbol and chromosome for both members, and
# add an order-independent pair key ("<smaller id>-<larger id>").
protRes.pqr <- lapply(RepresentativePairs, function(x) x) %>%
  bind_cols() %>%
  separate(Protein_pair, into = c("idA", "idB"), sep = "-") %>%
  mutate(
    protA = rowData(protCLL[idA, ])$hgnc_symbol,
    protB = rowData(protCLL[idB, ])$hgnc_symbol,
    chrA = rowData(protCLL[idA, ])$chromosome_name,
    chrB = rowData(protCLL[idB, ])$chromosome_name
  ) %>%
  # seq_len() instead of seq(): seq(0) is c(1, 0) and would fail on an empty
  # result table.
  mutate(idx = seq_len(nrow(.))) %>%
  mutate(pair = map2_chr(idA, idB, ~ paste0(sort(c(.x, .y)), collapse = "-")))

Run the same algorithm on RNA expression data

# Annotate the RNA data set with IGHV status and with the protCLL row
# identifier matched through the Ensembl gene id.
dds$IGHV.status <- patMeta[match(dds$PatID, patMeta$Patient.ID), ]$IGHV.status
rowData(dds)$uniprotID <- rownames(protCLL)[
  match(rownames(dds), rowData(protCLL)$ensembl_gene_id)
]

# Keep genes that have a matching protein and samples that are also in
# protCLL. The row filter was garbled in the source (`dds[!$uniprotID), ...]`);
# restored as a missing-value test on the identifier assigned just above.
ddsSub <- dds[!is.na(rowData(dds)$uniprotID), dds$PatID %in% colnames(protCLL)]
rownames(ddsSub) <- rowData(ddsSub)$uniprotID
ddsSub.vst <- varianceStabilizingTransformation(ddsSub)

# NOTE(review): Altered_PQR() receives no data arguments, so it presumably
# reads `quant_data_all` and `cols_with_reference_data` from the calling
# environment - confirm in AlteredPQR.R.
source("../code/AlteredPQR.R")
quant_data_all <- assay(ddsSub.vst)
cols_with_reference_data <-
  seq_len(ncol(ddsSub.vst))[ddsSub.vst$IGHV.status %in% "U"]
RepresentativePairs <- Altered_PQR(
  modif_z_score_threshold = 3.0,
  fraction_of_samples_threshold = 0.3
)
[1] "Running"
[1] "..."
[1] "..."
[1] "Top 0.1, 1 and 5% upper and lower z-score values are: 9.58881644825238 3.85797889859497 2.32724046637626 and -6.54342340545538 -3.44243808190723 -2.07406753271959."
[1] "Top 1% of the absolute values for the modified z-scores is 4.43159081827063."
# RNA-level result reduced to the order-independent pair key plus score and
# direction, renamed so they do not clash with the protein-level columns.
rnaRes.pqr <- lapply(RepresentativePairs, function(x) x) %>%
  bind_cols() %>%
  separate(Protein_pair, into = c("idA", "idB"), "-") %>%
  transmute(
    pair = map2_chr(idA, idB, function(a, b) {
      paste0(sort(c(a, b)), collapse = "-")
    }),
    rnaScore = Score,
    rnaChange = Change
  )

Combine protein and RNA result

# Attach the RNA-level result to every protein-level pair. A pair counts as
# explained by RNA only if it was also detected on RNA level with the same
# direction of change. The first ifelse() condition was empty in the source;
# restored as a test for pairs without an RNA match (NA after the left join).
comRes.pqr <- left_join(protRes.pqr, rnaRes.pqr, by = "pair") %>%
  mutate(explainedByRNA = ifelse(is.na(rnaChange), "no",
                                 ifelse(Change == rnaChange, "yes", "no")))

Exploring results

List of detected pairs

# Table of detected pairs with the score formatted for display.
# NOTE(review): the source ended in a dangling `%>%`; the final (rendering)
# step cannot be recovered from this file, so the table is simply printed.
comRes.pqr %>%
  select(protA, protB, Score, Change, chrA, chrB, explainedByRNA) %>%
  mutate(Score = format(Score, digits = 1))

How many of those changes can be explained at RNA level


 no yes 
156   3 

Visualization using network plot

All detected pairs are shown. Build network

# Pairs with a positive score are used for the network.
comRes.filt <- filter(comRes.pqr, Score > 0)

# Node list: one node per protein symbol with a 0-based id. seq_along() keeps
# the id vector empty when there are no nodes (seq(0) would give c(1, 0)).
allNodes <- union(comRes.filt$protA, comRes.filt$protB)

nodeList <- data.frame(id = seq_along(allNodes) - 1, name = allNodes,
                       stringsAsFactors = FALSE) %>%
  mutate(group = corRes.sig[match(name, corRes.sig$name), ]$direction) %>%
  # Proteins that are absent from corRes.sig get no direction (NA) and are
  # labelled "ns". The condition was empty in the source; restored as is.na().
  mutate(group = ifelse(is.na(group), "ns", group))

# Edge list: protein symbols replaced by the node ids.
edgeList <- select(comRes.filt, protA, protB, Change, explainedByRNA) %>%
  dplyr::rename(Source = protA, Target = protB) %>%
  mutate(Source = nodeList[match(Source, nodeList$name), ]$id,
         Target = nodeList[match(Target, nodeList$name), ]$id) %>%
  data.frame(stringsAsFactors = FALSE)

net <- graph_from_data_frame(vertices = nodeList, d = edgeList,
                             directed = FALSE)

Visualize using ggraph

# Draw the network: edges coloured by direction of change and line type by
# whether the change is explained on RNA level, nodes coloured by group.
# NOTE(review): the source ended in a dangling `+`; the last layer (probably a
# theme) cannot be recovered from this file and is omitted.
tidyNet <- as_tbl_graph(net)
ggraph(tidyNet) +
  geom_edge_link(aes(color = Change, edge_linetype = explainedByRNA),
                 width = 1) +
  geom_node_point(aes(color = group), size = 4) +
  geom_node_text(aes(label = name), repel = TRUE) +
  scale_color_manual(values = c(Up = "pink", Down = "lightblue", ns = "grey"))

Proteins up-regulated in M-CLL samples are colored pink, down-regulated proteins are colored light blue, and proteins with no significant change are colored grey. The color of an edge indicates whether the ratio of the two proteins in the pair is increased or decreased in the reference group.

Inspecting some potentially interesting pairs

# For every pair in `comRes` that involves a protein in `protList`, plot the
# distribution of the pair's log-ratio and the scatter plot of the two
# proteins, both coloured by a sample annotation.
#
# Arguments:
#   comRes   - pair table with columns idA/idB (row identifiers in `protCLL`)
#              and protA/protB (labels used for columns, axes and titles).
#   protList - character vector; pairs with protA or protB in it are plotted.
#   protCLL  - object providing assays() with a "count" assay, column
#              subsetting by sample name and `[[` for sample annotations.
#   gene     - name of the sample annotation used for colouring.
# Returns a list with one combined plot (histogram + scatter) per pair.
#
# NOTE(review): the tail of this function (closing of the inner function and
# the return value) was missing in the source; returning `plotList` is a
# reconstruction - confirm against the original script.
plotPair <- function(comRes, protList, protCLL, gene) {
  pairList <- filter(comRes, protA %in% protList | protB %in% protList)
  # seq_len() yields an empty loop when no pair matches (seq(0) is c(1, 0)).
  plotList <- lapply(seq_len(nrow(pairList)), function(i) {
    idPair <- c(pairList[i, ]$idA, pairList[i, ]$idB)
    protPair <- c(pairList[i, ]$protA, pairList[i, ]$protB)
    # Sort the pair by label so the direction of the ratio does not depend on
    # which member is listed first.
    ord <- order(protPair)
    idPair <- idPair[ord]
    protPair <- protPair[ord]

    plotTab <- assays(protCLL)[["count"]][idPair, ] %>%
      t() %>%
      data.frame()
    colnames(plotTab) <- protPair
    # NOTE(review): if the "count" assay is already log-transformed, the plain
    # difference would be the log-ratio - confirm.
    plotTab$logRatio <- log2(plotTab[, 1]) - log2(plotTab[, 2])
    # NOTE(review): this pipeline ended in a dangling `%>%` in the source; a
    # final step may have been lost.
    plotTab <- rownames_to_column(plotTab, "patID") %>%
      mutate(status = factor(protCLL[, patID][[gene]]))

    # Title and axes use the sorted labels so they agree with the ratio
    # (first column minus second column) and with plotPair.rna().
    histP <- ggplot(plotTab, aes(x = logRatio, fill = status, col = status)) +
      geom_histogram(position = "identity", alpha = 0.5) +
      ggtitle(sprintf("Stoichiometry: %s ~ %s", protPair[1], protPair[2]))
    # .data[[...]] instead of aes_string(): aes_string() parses its input as
    # code, so a label such as "HLA-A" would be read as a subtraction.
    corP <- ggplot(plotTab, aes(x = .data[[protPair[1]]],
                                y = .data[[protPair[2]]],
                                col = status)) +
      geom_point() +
      geom_smooth(formula = y ~ x, method = "lm") +
      scale_color_discrete(name = gene) +
      labs(x = protPair[1], y = protPair[2])
    plot_grid(histP, corP)
  })
  plotList
}
# RNA-level counterpart of the pair plots: for every pair in `comRes` that
# involves a protein in `protList`, plot the distribution of the pair's
# log-ratio and the scatter plot of the two genes, coloured by a sample
# annotation.
#
# Arguments:
#   comRes     - pair table with columns idA/idB (row identifiers in
#                `ddsSub.vst`) and protA/protB (labels for axes and titles).
#   protList   - character vector; pairs with protA or protB in it are plotted.
#   ddsSub.vst - object providing assay(), column subsetting by sample name
#                and `[[` for sample annotations.
#   gene       - name of the sample annotation used for colouring.
# Returns a list with one combined plot (histogram + scatter) per pair.
#
# NOTE(review): the tail of this function (closing of the inner function and
# the return value) was missing in the source; returning `plotList` is a
# reconstruction - confirm against the original script.
plotPair.rna <- function(comRes, protList, ddsSub.vst, gene) {
  pairList <- filter(comRes, protA %in% protList | protB %in% protList)
  # seq_len() yields an empty loop when no pair matches (seq(0) is c(1, 0)).
  plotList <- lapply(seq_len(nrow(pairList)), function(i) {
    idPair <- c(pairList[i, ]$idA, pairList[i, ]$idB)
    protPair <- c(pairList[i, ]$protA, pairList[i, ]$protB)
    # Sort the pair by label so the direction of the ratio does not depend on
    # which member is listed first.
    ord <- order(protPair)
    idPair <- idPair[ord]
    protPair <- protPair[ord]

    plotTab <- assay(ddsSub.vst)[idPair, ] %>%
      t() %>%
      data.frame()
    colnames(plotTab) <- protPair
    # NOTE(review): if the assay holds variance-stabilised (log-scale) values,
    # the plain difference would be the log-ratio - confirm.
    plotTab$logRatio <- log2(plotTab[, 1]) - log2(plotTab[, 2])
    # NOTE(review): this pipeline ended in a dangling `%>%` in the source; a
    # final step may have been lost.
    plotTab <- rownames_to_column(plotTab, "patID") %>%
      mutate(status = factor(ddsSub.vst[, patID][[gene]]))

    histP <- ggplot(plotTab, aes(x = logRatio, fill = status, col = status)) +
      geom_histogram(position = "identity", alpha = 0.5) +
      ggtitle(sprintf("Stoichiometry: %s ~ %s", protPair[1], protPair[2]))
    # .data[[...]] instead of aes_string(): aes_string() parses its input as
    # code, so a label such as "HLA-A" would be read as a subtraction.
    corP <- ggplot(plotTab, aes(x = .data[[protPair[1]]],
                                y = .data[[protPair[2]]],
                                col = status)) +
      geom_point() +
      geom_smooth(formula = y ~ x, method = "lm") +
      scale_color_discrete(name = gene) +
      labs(x = protPair[1], y = protPair[2])
    plot_grid(histP, corP)
  })
  plotList
}

Pairs involving ZAP70

protList <- "ZAP70"
plotPair(comRes = comRes.pqr, protList = protList, protCLL = protCLL,
         gene = "IGHV.status")

Pairs involving IGHM

protList <- "IGHM"
plotPair(comRes = comRes.pqr, protList = protList, protCLL = protCLL,
         gene = "IGHV.status")



Pairs involving IGHD

protList <- "IGHD"
plotPair(comRes = comRes.pqr, protList = protList, protCLL = protCLL,
         gene = "IGHV.status")



Check those pairs at RNA level

Pairs involving ZAP70

protList <- "ZAP70"
plotPair.rna(comRes = comRes.pqr, protList = protList,
             ddsSub.vst = ddsSub.vst, gene = "IGHV.status")

Pairs involving IGHM

protList <- "IGHM"
plotPair.rna(comRes = comRes.pqr, protList = protList,
             ddsSub.vst = ddsSub.vst, gene = "IGHV.status")



Pairs involving IGHD

protList <- "IGHD"
plotPair.rna(comRes = comRes.pqr, protList = protList,
             ddsSub.vst = ddsSub.vst, gene = "IGHV.status")



Detect differential complex formations based on correlation test

Differential correlation detection using DGCA package

# Protein matrix with rows sorted by identifier.
quant_data_all <- assays(protCLL)[["QRILC"]]
quant_data_all <- quant_data_all[order(rownames(quant_data_all)), ]

# One indicator column per IGHV level, renamed by position to "WT" (first
# level) and "IGHV" (second level).
IGHV <- protCLL$IGHV.status
designMat <- model.matrix(~ IGHV + 0)
colnames(designMat) <- c("WT", "IGHV")

# Differential correlation between the two groups for all pairs.
ddcor_res <- ddcorAll(
  inputMat = quant_data_all,
  design = designMat,
  compare = c("WT", "IGHV"),
  adjust = "BH",
  heatmapPlot = FALSE,
  nPerm = 0,
  nPairs = "all"
)

Reformat output

# Known complex pairs with both members in sorted order, so they line up with
# Gene1/Gene2 of the correlation result.
comTab <- int_pairs %>%
  mutate(pair = map2_chr(ProtA, ProtB,
                         ~ paste0(sort(c(.x, .y)), collapse = "-"))) %>%
  separate(pair, c("Gene1", "Gene2"), sep = "-", remove = FALSE) %>%
  select(Gene1, Gene2) %>%
  # A pair listed more than once (or in both orientations) would otherwise
  # duplicate rows of the correlation result in the join below.
  distinct() %>%
  mutate(inComplex = TRUE)

# Flag every tested pair as inside / outside a known complex. The ifelse()
# condition was empty in the source; pairs without a match in comTab are NA
# after the left join and become FALSE.
allRes <- left_join(ddcor_res, comTab, by = c("Gene1", "Gene2")) %>%
  mutate(inComplex = !is.na(inComplex))

Distribution of p-values for protein in complexes and not in complexes

# P-value histograms for pairs inside and outside known complexes, one panel
# each with free axes (`scales` spelled out instead of the partial `scale`).
# NOTE(review): the source ended in a dangling `+`; the last layer cannot be
# recovered from this file and is omitted.
ggplot(allRes, aes(x = pValDiff, fill = inComplex)) +
  geom_histogram() +
  facet_wrap(~inComplex, scales = "free")

Not much difference.

Differential correlation detection on RNA level

# RNA matrix with rows sorted by identifier.
quant_data_all <- assay(ddsSub.vst)
quant_data_all <- quant_data_all[order(rownames(quant_data_all)), ]

# The design columns are renamed by position below, so the level order is
# fixed explicitly ("U" first, "M" second). Otherwise "WT"/"IGHV" could refer
# to the opposite groups compared with the protein-level test, which would
# flip `Classes`.
IGHV <- factor(ddsSub.vst$IGHV.status, levels = c("U", "M"))
designMat <- model.matrix(~ IGHV + 0)
colnames(designMat) <- c("WT", "IGHV")
ddcor_res <- ddcorAll(
  inputMat = quant_data_all,
  design = designMat,
  compare = c("WT", "IGHV"),
  adjust = "BH",
  heatmapPlot = FALSE,
  nPerm = 0,
  nPairs = "all"
)

# Keep the columns needed for the comparison with the protein-level result,
# renamed so they do not clash after joining.
rnaRes.cor <- ddcor_res %>%
  select(Gene1, Gene2, pValDiff, pValDiff_adj, Classes) %>%
  dplyr::rename(p.rna = pValDiff, padj.rna = pValDiff_adj,
                Classes.rna = Classes)

Exploring the results

Select protein pairs involved in known complexes

# Restrict to pairs in known complexes and look up symbol and chromosome for
# both members.
comRes.cor <- filter(allRes, inComplex) %>%
  mutate(
    protA = rowData(protCLL[Gene1, ])$hgnc_symbol,
    protB = rowData(protCLL[Gene2, ])$hgnc_symbol,
    chrA = rowData(protCLL[Gene1, ])$chromosome_name,
    chrB = rowData(protCLL[Gene2, ])$chromosome_name
  ) %>%
  # seq_len() instead of seq(): seq(0) is c(1, 0) and would fail on an empty
  # table.
  mutate(idx = seq_len(nrow(.))) %>%
  mutate(p = pValDiff, padj = pValDiff_adj)

Add test results from RNA

# Attach the RNA-level test. A pair counts as explained by RNA when the RNA
# test passes the 0.25 cut-off with the same correlation classes. The first
# ifelse() condition was empty in the source; restored as a test for pairs
# without an RNA result (NA after the left join).
comRes.cor <- left_join(comRes.cor, rnaRes.cor, by = c("Gene1", "Gene2")) %>%
  mutate(explainedByRNA = ifelse(
    is.na(padj.rna), "no",
    ifelse(padj.rna < 0.25 & Classes == Classes.rna, "yes", "no")
  ))

List of significant pairs (25% FDR). As this test is very stringent, I use a looser FDR cut-off here.

# Re-adjust the p-values over the complex pairs only and keep the pairs that
# pass the 25% FDR cut-off.
# NOTE(review): in the source the assignment ended in a dangling `%>%` that ran
# into the next statement; the lost step is reconstructed as the filter on
# `ifSig`, which is otherwise unused - confirm.
comRes.sig <- comRes.cor %>%
  mutate(padj = p.adjust(p, method = "BH"),
         ifSig = padj < 0.25) %>%
  filter(ifSig)

# Display table with numeric columns in scientific notation. The final
# (rendering) step after the last pipe was lost, so the table is printed.
comRes.sig %>%
  select(protA, protB, p, padj, chrA, chrB, Classes, explainedByRNA) %>%
  mutate_if(is.numeric, formatC, digits = 2, format = "e")

“Classes” is the direction of the change. “+” means positive correlation, “-” means negative correlation, “0” means no correlation. “0/+” means no correlation in U-CLL samples but positive correlation in M-CLL samples; “+/0” means positive correlation in U-CLL samples but no correlation in M-CLL samples. Any type of correlation change may suggest a change in complex formation behavior.


Visualize in network plot

# Use the significant pairs, with the identifier columns named idA/idB as the
# pair-plot functions expect.
comRes.filt <- comRes.sig %>% dplyr::rename(idA = Gene1, idB = Gene2)

# Node list: one node per protein symbol with a 0-based id. seq_along() keeps
# the id vector empty when there are no nodes (seq(0) would give c(1, 0)).
allNodes <- union(comRes.filt$protA, comRes.filt$protB)

nodeList <- data.frame(id = seq_along(allNodes) - 1, name = allNodes,
                       stringsAsFactors = FALSE) %>%
  mutate(group = corRes.sig[match(name, corRes.sig$name), ]$direction) %>%
  # Proteins that are absent from corRes.sig get no direction (NA) and are
  # labelled "ns". The condition was empty in the source; restored as is.na().
  mutate(group = ifelse(is.na(group), "ns", group))

# Edge list: protein symbols replaced by the node ids.
edgeList <- select(comRes.filt, protA, protB, p, Classes) %>%
  dplyr::rename(Source = protA, Target = protB) %>%
  mutate(Source = nodeList[match(Source, nodeList$name), ]$id,
         Target = nodeList[match(Target, nodeList$name), ]$id,
         Classes = as.character(Classes)) %>%
  data.frame(stringsAsFactors = FALSE)

net <- graph_from_data_frame(vertices = nodeList, d = edgeList,
                             directed = FALSE)

Visualize using ggraph

# Draw the network: edges coloured by correlation class, nodes by group.
# NOTE(review): the source ended in a dangling `+`; the last layer (probably a
# theme) cannot be recovered from this file and is omitted.
tidyNet <- as_tbl_graph(net)
ggraph(tidyNet) +
  geom_edge_link(aes(color = Classes), width = 1) +
  geom_node_point(aes(color = group), size = 4) +
  geom_node_text(aes(label = name), repel = TRUE) +
  scale_color_manual(values = c(Up = "pink", Down = "lightblue", ns = "grey"))

Inspecting some potentially interesting pairs

Pairs involving IGHD

protList <- "IGHD"
plotPair(comRes = comRes.filt, protList = protList, protCLL = protCLL,
         gene = "IGHV.status")


Pairs involving RIPK1

protList <- "RIPK1"
plotPair(comRes = comRes.filt, protList = protList, protCLL = protCLL,
         gene = "IGHV.status")


Pairs involving MAPK1

protList <- "MAPK1"
plotPair(comRes = comRes.filt, protList = protList, protCLL = protCLL,
         gene = "IGHV.status")

Pairs involving CHEK2

protList <- "CHEK2"
plotPair(comRes = comRes.filt, protList = protList, protCLL = protCLL,
         gene = "IGHV.status")

Pairs involving MTOR

protList <- "MTOR"
plotPair(comRes = comRes.filt, protList = protList, protCLL = protCLL,
         gene = "IGHV.status")

Check those pairs at RNA expression level

Pairs involving IGHD

protList <- "IGHD"
plotPair.rna(comRes = comRes.filt, protList = protList,
             ddsSub.vst = ddsSub.vst, gene = "IGHV.status")


Pairs involving RIPK1

protList <- "RIPK1"
plotPair.rna(comRes = comRes.filt, protList = protList,
             ddsSub.vst = ddsSub.vst, gene = "IGHV.status")


Pairs involving MAPK1

protList <- "MAPK1"
plotPair.rna(comRes = comRes.filt, protList = protList,
             ddsSub.vst = ddsSub.vst, gene = "IGHV.status")

Pairs involving CHEK2

protList <- "CHEK2"
plotPair.rna(comRes = comRes.filt, protList = protList,
             ddsSub.vst = ddsSub.vst, gene = "IGHV.status")

Pairs involving MTOR

protList <- "MTOR"
plotPair.rna(comRes = comRes.filt, protList = protList,
             ddsSub.vst = ddsSub.vst, gene = "IGHV.status")

