7 Pathway Analysis

Pathway analysis enables exploration of different aggregate gene sets for our experimental questions. Each individual ROI/AOI segment is scored for every pathway of interest, which we can then use to investigate biological differences. We will perform analogous analyses as those outlined in the Differential Expression and Spatial Deconvolution sections of the report for gene set enrichment.

7.1 Scoring Gene Sets

Pathways and gene sets were defined from the Reactome database. We use an R software package called Gene Set Variation Analysis to score each segment within our study. For more information on pathway analysis, please see the Appendix.

# point the function to a folder containing GMT files or to a gene set list
if(file.exists(file.path(object_dir, "geneSetObject.RDS"))){
 geneSet_data <- readRDS(file.path(object_dir, "geneSetObject.RDS"))
} else {
 geneSet_data <- geneSetAnalysis(object = target_data,
                                 elt = "log_q",
                                 geneSet = "../sDAS/inst/extdata/reactome/",
                                 convertFrom = "SYMBOL",
                                 species = "Hs",
                                 minSize = mingenes,
                                 maxSize = maxgenes)
 saveRDS(geneSet_data, file.path(object_dir, "geneSetObject.RDS"))
}
# to view information about the gene sets scored, view the phenoData using:
#  pData(geneSet_data)

# check for gene sets of interest being captured in the analysis:
poi_missing <- poi[!poi %in% fData(geneSet_data)$GeneSet]
poi <- poi[poi %in% fData(geneSet_data)$GeneSet]

Analyst notes: We scored 1467 gene sets. We calculate scores for any gene set with at least 5 genes and fewer than 500 genes.

7.2 Glomeruli vs Tubules

We compare gene set enrichment for differences between glomeruli and tubules. The following formula was used to model differences for a given pathway:

\[ pathway \sim region + (1+region|slide) \]

pData(geneSet_data)$slide <- as.factor(pData(geneSet_data)$`slide name`)
pData(geneSet_data)$region <- as.factor(pData(geneSet_data)$region)
pData(geneSet_data)$class <- as.factor(pData(geneSet_data)$class)

contrast_factor <- "region"
gsde1_samples <- rownames(pData(geneSet_data)) # no filtering for this test

if(file.exists(file.path(ge_data_dir, "de_gs_contrast1.RDS"))){
 geneSet_results1 <- readRDS(file.path(ge_data_dir, "de_gs_contrast1.RDS"))
} else {
 geneSetDE1 <-  
  mixedModelDE(geneSet_data[, gsde1_samples],
               modelFormula = ~ region + (1 + region|slide),
               groupVar = contrast_factor,
               nCores=parallel::detectCores()-1,
               multiCore = FALSE)

 geneSet_results1 <- formatLMMResults(geneSetDE1, p_adjust_method = p_adjust_method_selected)

 saveRDS(geneSet_results1, file.path(ge_data_dir, "de_gs_contrast1.RDS"))
}

strings <- do.call(rbind, strsplit(unique(geneSet_results1$Comparison), split=" vs "))
write.table(geneSet_results1, 
            file.path(ge_data_dir, 
                      paste0("Results_DE1_GeneSets_", strings[1],"_vs_", strings[2],".txt")), 
            sep ="\t", row.names = FALSE)

Analyst Notes: Of the gene sets scored, 625 gene sets were significant at FDR < 0.05 (False Discovery Rate). 929 gene sets were significant at FDR < 0.2 (False Discovery Rate).

7.2.1 Differential Enrichment

In the volcano plot below, we label the top 5 gene sets based on their significance on either side of the volcano plot. The horizontal line indicates a p-value of 0.05 and vertical lines indicate of fold-change of 0.5 for z-scored pathway enrichment.

top_feat_gs1 <- getTopFeatures(geneSet_results1, 
                               n_features = 5,
                               est_thr = fc_threshold1, 
                               p_adjust_column = p_adjust_method, 
                               p_adjust_thr = p_adjust_threshold1)

if(!is.null(poi) & length(poi) > 0){
  poi_down1 <- poi[geneSet_results1[geneSet_results1$Feature %in% poi,]$Estimate < 0]
  top_feat_gs1$down <- unique(c(top_feat_gs1$down, poi_down1))
  top_feat_gs1$all <- unique(c(top_feat_gs1$all, poi_down1))
  poi_up1 <- poi[geneSet_results1[geneSet_results1$Feature %in% poi,]$Estimate >= 0]
  top_feat_gs1$up <- unique(c(top_feat_gs1$up, poi_up1))
  top_feat_gs1$all <- unique(c(top_feat_gs1$all, poi_up1))
 }

gs_contrast1_volcano_list <- 
 makeVolcano(df=geneSet_results1,
             to_label=top_feat_gs1$all,
             label_color="grey20",
             SCALE=1.4, LWD=1,
             fc_cutoff=volcano_fc_thr, 
             pval_cutoff=volcano_p_thr,
             p_adjust_column = p_adjust_method)

write.table(gs_contrast1_volcano_list[[2]], 
            file.path(ge_data_dir, "contrast1_de_geneset_volcano.tsv"), sep="\t",
            quote=FALSE, col.names=TRUE, row.names=FALSE)

p <- gs_contrast1_volcano_list[[1]]
ggsave(p, filename = file.path(ge_plot_dir, "contrast1_de_geneset_volcano.svg"), width=14, height=10)
p

Here we show the top results with FDR < 0.2 (False Discovery Rate) and |FC| > 0.5. The heatmap is clustered across segments using unsupervised hierarchical clustering (using Euclidean distances), and the bar plot on the right shows the absolute value of the enrichment fold change, colored based on the region.

if(!any(geneSet_results1[[p_adjust_method]] < p_adjust_threshold1)) {
 paste0("No gene sets were significant at ",p_adjust_class, " (",p_adjust_class, ") < ", p_adjust_threshold1)
} else {
 # save top 10 features
 top_feat_gs1 <- getTopFeatures(geneSet_results1, 
                                n_features = 10,
                                p_adjust_thr = p_adjust_threshold1,
                                p_adjust_column = p_adjust_method, 
                                est_thr = fc_threshold1)
 if(!is.null(poi) & length(poi) > 0){
   poi_down1 <- poi[geneSet_results1[geneSet_results1$Feature %in% poi,]$Estimate < 0]
   top_feat_gs1$down <- unique(c(top_feat_gs1$down, poi_down1))
   top_feat_gs1$all <- unique(c(top_feat_gs1$all, poi_down1))
   poi_up1 <- poi[geneSet_results1[geneSet_results1$Feature %in% poi,]$Estimate >= 0]
   top_feat_gs1$up <- unique(c(top_feat_gs1$up, poi_up1))
   top_feat_gs1$all <- unique(c(top_feat_gs1$all, poi_up1))
 }
 
 # Group expression data of the ROI segments
 exp_getTopFeatures <- exprs(geneSet_data)[top_feat_gs1$all, gsde1_samples]
 rownames(exp_getTopFeatures) <- trimNames(rownames(exp_getTopFeatures), 40)
 # Create annotation for heatmap columns
 HM_cols <- pal_heatmaps[contrast_factor]
 feat_ind <- match(top_feat_gs1$all, geneSet_results1$Feature)
 ha1 <- 
  HeatmapAnnotation(`|FC|` = 
                     anno_barplot(abs(geneSet_results1[feat_ind, 'Estimate']),
                                  width = unit(2, "cm"),
                                  bar_width = 0.7,
                                  gp = gpar(fill = c(rep(HM_cols[[1]][1], length(top_feat_gs1$up)),
                                                     rep(HM_cols[[1]][2], length(top_feat_gs1$down))))),
                    width = unit(3, "cm"),
                    border = FALSE,
                    which = "row")
 heatmap_params$top_annotation <-
  HeatmapAnnotation(df = pData(geneSet_data)[gsde1_samples, factors_of_interest],
                    col = pal_heatmaps)
 heatmap_params$right_annotation <- ha1
 heatmap_params$matrix <- exp_getTopFeatures
 h1 <- do.call(Heatmap, heatmap_params)
 # draw the heatmap
 svglite::svglite(filename = file.path(ge_plot_dir, "contrast1_de_geneset_heatmap.svg"),
                  width=12, height=8)
 draw(h1, merge_legend = TRUE, heatmap_legend_side = "bottom", 
      annotation_legend_side = "bottom", adjust_annotation_extension = TRUE)
 dev.off()
 draw(h1, merge_legend = TRUE, heatmap_legend_side = "bottom", 
      annotation_legend_side = "bottom", adjust_annotation_extension = TRUE)
}

The violin plots below further detail the differences between the tubule and glomerulus for differentially enriched pathways and pathways of interest.

# Select pathways to feature. Here, selecting all pathways used in heatmap and pathway of interest selected by client
top_feat_gs1 <- getTopFeatures(geneSet_results1, 
                               n_features = 5,
                               p_adjust_thr = p_adjust_threshold1, 
                               p_adjust_column = p_adjust_method,
                               est_thr = fc_threshold1)
label_ge1 <- unique(c(poi, top_feat_gs1$all))

# set up violin plot data frame
violin_df <- cbind(pData(geneSet_data) %>% 
                    dplyr::select(eval(contrast_factor)),
                   t(exprs(geneSet_data[label_ge1,])))

# Select expression of getTopFeatures
vexp_getTopFeatures <- exprs(geneSet_data)[label_ge1, ]

# Create expression vector for violin plot
violin_df <- cbind(pData(geneSet_data) %>% dplyr::select(eval(contrast_factor)),
                  t(vexp_getTopFeatures))

violin_df <- violin_df %>% tidyr::pivot_longer(cols=-1, names_to = "Feature", values_to = "Enrichment")

colnames(violin_df)[1] <- "contrast_factor"
violin_df$Feature <- trimNames(violin_df$Feature, 25)

# Format summary stats
violin_p_df <- filter(geneSet_results1, Feature %in% label_ge1)
violin_p_df <- violin_p_df %>% tidyr::separate(col = Comparison, into=c("group1", "group2"), sep=" vs ")
violin_p_df[[p_adjust_method]] <- signif(violin_p_df[[p_adjust_method]], 3)
violin_p_df$Feature <- trimNames(violin_p_df$Feature, 25)
violin_exp_max <- ddply(violin_df, .(Feature), summarize,
                        y.position=max(Enrichment)+0.5) # +1 for safe log2
violin_p_df <- base::merge(violin_p_df, violin_exp_max, by="Feature")

p <- ggplot(violin_df, 
            aes(x=contrast_factor, y=Enrichment, fill=contrast_factor)) + 
  geom_violin() +
  geom_jitter(width=0.25, height=0, size = 1.5) + 
  scale_fill_manual(values = pal_main[[contrast_factor]]) +
  facet_wrap(~Feature, scales = "free_y") +
  labs(x = eval(contrast_factor), y = "Enrichment (z-score)") +
  scale_y_continuous(expand = expansion(mult = c(0.1, 0.2)),
                     labels = scales::number_format(accuracy = 0.01)) +
  theme_bw(base_size = 12) + 
  theme(legend.position = "bottom") +
  guides(fill=guide_legend(title = eval(contrast_factor)))

p <- p + ggprism::add_pvalue(
  violin_p_df,
  label="{p_adjust_class} = {violin_p_df[[p_adjust_method]]}", label.size = 3.6,
  y.position = violin_p_df$y.position
)

ggsave(p, filename = file.path(ge_plot_dir, "contrast1_POI.svg"), width=12, height=8)
p

7.2.2 Results Table

The table is sorted based on FDR. Estimate reflects the relative difference in enrichment scores between the two groups in the Comparison column.

geneSet_results1$TargetNumber <-
 fData(geneSet_data)[match(geneSet_results1$Feature, rownames(geneSet_data)), 
                     "PresentTargets"]
geneSet_results1$Targets <-
 fData(geneSet_data)[match(geneSet_results1$Feature, rownames(geneSet_data)), 
                     "TargetIds"]
geneSet_results1$Pathway <- trimNames(geneSet_results1$Feature, 40)

# Change the adjusted P-value column name to its classified adjusted P-value method for display.
if(p_adjust_method_selected == "none"){
 dt1 <- geneSet_results1[, c("Pathway", "Comparison", "Estimate", "P", "TargetNumber", "Targets", "Feature")]
} else {
 dt1 <- geneSet_results1[, c("Pathway", "Comparison", "Estimate", "P","FDR", "TargetNumber", "Targets", "Feature")]
 colnames(dt1)[which(colnames(geneSet_results1) == p_adjust_method)] <- p_adjust_class
}

dt_params_gs <- dt_params
dt_params_gs$order <- list(list(3, "asc"))
dt_params_gs$autoWidth <- TRUE
dt_params_gs$buttons <- 
 list(list(extend = "copy"),
      list(extend = "csv", filename = paste0("PathwayAnalysis1_", strings[1], "vs", strings[2], ".csv")),
      list(extend = "excel", filename = paste0("PathwayAnalysis1_", strings[1], "vs", strings[2], ".xlsx")))
DT::formatRound(
 DT::formatSignif(
  DT::datatable(
   dt1,
   extensions = c("Buttons", "Scroller", "FixedColumns"),
   options = dt_params_gs,
   rownames = FALSE),
  columns = c("P", p_adjust_class)),
 columns = c("Estimate"))

7.3 Healthy vs Diseased (DKD) Glomeruli

We compare gene set enrichment for disease status-related changes within glomeruli. The following formula was used to model differences for a given pathway:

\[ pathway \sim class + (1|slide) \]

gsde2_samples <- 
 rownames(filter(pData(geneSet_data),
                 region=="glomerulus"))
contrast_factor <- "class"


if(file.exists(file.path(ge_data_dir, "de_gs_contrast2.RDS"))){
 geneSet_results2 <- readRDS(file.path(ge_data_dir, "de_gs_contrast2.RDS"))
} else {
 geneSetDE2 <-  
  mixedModelDE(geneSet_data[, gsde2_samples],
               modelFormula = ~ class + (1|slide),
               groupVar = contrast_factor,
               nCores = parallel::detectCores()-1,
               multiCore = FALSE)

 geneSet_results2 <- formatLMMResults(geneSetDE2, p_adjust_method = p_adjust_method_selected)

 saveRDS(geneSet_results2, file.path(ge_data_dir, "de_gs_contrast2.RDS"))
}

strings <- do.call(rbind, strsplit(unique(geneSet_results2$Comparison), split=" vs "))
write.table(geneSet_results2, 
            file.path(ge_data_dir, 
                      paste0("Results_DE2_GeneSets_", strings[1],"_vs_", strings[2],".txt")), 
            sep ="\t", row.names = FALSE)

Analyst Notes: Of the gene sets scored, 10 gene sets were significant at FDR < 0.05 (False Discovery Rate). 16 gene sets were significant at FDR < 0.2 (False Discovery Rate).

7.3.1 Differential Enrichment

In the volcano plot below, we label the top 5 gene sets based on their significance on either side of the volcano plot. The horizontal line indicates a p-value of 0.05 and vertical lines indicate a fold-change of 0.5 for z-scored pathway enrichment.

top_feat_gs2 <- getTopFeatures(geneSet_results2, 
                               n_features = 5,
                               p_adjust_column = p_adjust_method,
                               p_adjust_thr = p_adjust_threshold2, 
                               est_thr = fc_threshold2)

if(!is.null(poi) & length(poi) > 0){
  poi_down2 <- poi[geneSet_results2[geneSet_results2$Feature %in% poi,]$Estimate < 0]
  top_feat_gs2$down <- unique(c(top_feat_gs2$down, poi_down2))
  top_feat_gs2$all <- unique(c(top_feat_gs2$all, poi_down2))
  poi_up2 <- poi[geneSet_results2[geneSet_results2$Feature %in% poi,]$Estimate >= 0]
  top_feat_gs2$up <- unique(c(top_feat_gs2$up, poi_up2))
  top_feat_gs2$all <- unique(c(top_feat_gs2$all, poi_up2))
 }


gs_contrast2_volcano_list <- 
 makeVolcano(df=geneSet_results2, 
             to_label=top_feat_gs2$all,
             label_color="grey20",
             SCALE=1.4, LWD=1,
             fc_cutoff=volcano_fc_thr, 
             pval_cutoff=volcano_p_thr,
             p_adjust_column = p_adjust_method)

write.table(gs_contrast2_volcano_list[[2]], 
            file.path(ge_data_dir, "contrast4_de_geneset_volcano.tsv"), sep="\t",
            quote=FALSE, col.names=TRUE, row.names=FALSE)

p <- gs_contrast2_volcano_list[[1]]
ggsave(p, filename = file.path(ge_plot_dir, "contrast2_de_geneset_volcano.svg"), width=14, height=9)
p

Here we show the top results with an FDR < 0.2 (False Discovery Rate) and |FC| > 0.5. The heatmap is clustered across segments using unsupervised hierarchical clustering (using Euclidean distances), and the bar plot on the right shows the absolute value of the enrichment fold change, colored based on the class.

if(!any(geneSet_results2[[p_adjust_method]] < p_adjust_threshold2)) {
 paste0("No gene sets were significant at ", p_adjust_method," < ", p_adjust_threshold2)
} else {
 # save top 10 features
 top_feat_gs2 <- getTopFeatures(geneSet_results2, 
                                n_features = 10,
                                p_adjust_column = p_adjust_method,
                                p_adjust_thr = p_adjust_threshold2, 
                                est_thr = fc_threshold2)
 if(!is.null(poi) & length(poi) > 0){
   poi_down2 <- poi[geneSet_results2[geneSet_results2$Feature %in% poi,]$Estimate < 0]
   top_feat_gs2$down <- unique(c(top_feat_gs2$down, poi_down2))
   top_feat_gs2$all <- unique(c(top_feat_gs2$all, poi_down2))
   poi_up2 <- poi[geneSet_results2[geneSet_results2$Feature %in% poi,]$Estimate >= 0]
   top_feat_gs2$up <- unique(c(top_feat_gs2$up, poi_up2))
   top_feat_gs2$all <- unique(c(top_feat_gs2$all, poi_up2))
 }

 # subset geneset data
 geneSet_data2 <- geneSet_data[, gsde2_samples]
 # Group expression data of the ROI segments
 exp_getTopFeatures <- exprs(geneSet_data2)[top_feat_gs2$all, gsde2_samples]
 rownames(exp_getTopFeatures) <- trimNames(rownames(exp_getTopFeatures), 40)
 # Create annotation for heatmap columns
 HM_cols <- pal_heatmaps[contrast_factor]
 # annotation heatmap, right bar plot
 feat_ind <- match(top_feat_gs2$all, geneSet_results2$Feature)
 ha2 <- 
  HeatmapAnnotation(`|FC|` = 
                     anno_barplot(abs(geneSet_results2[feat_ind, 'Estimate']),
                                  width = unit(2, "cm"),
                                  bar_width = 0.7,
                                  gp = gpar(fill = c(rep(HM_cols[[1]][1], length(top_feat_gs2$up)),
                                                     rep(HM_cols[[1]][2], length(top_feat_gs2$down))))),
                    width = unit(3, "cm"),
                    border = FALSE,
                    which = "row")
 # heatmap
 heatmap_params$top_annotation <- 
  HeatmapAnnotation(df = pData(geneSet_data)[gsde2_samples, factors_of_interest],
                    col = pal_heatmaps)
 heatmap_params$right_annotation <- ha2
 heatmap_params$matrix <- exp_getTopFeatures
 h2 <- do.call(Heatmap, heatmap_params)
 # draw the heatmap
 svglite::svglite(filename = file.path(ge_plot_dir, "contrast2_de_geneset_heatmap.svg"),
                  width=12, height=8)
 draw(h2, merge_legend = TRUE, heatmap_legend_side = "bottom", 
      annotation_legend_side = "bottom", adjust_annotation_extension = TRUE)
 dev.off()
 draw(h2, merge_legend = TRUE, heatmap_legend_side = "bottom",
      annotation_legend_side = "bottom", adjust_annotation_extension = TRUE)
}

The violin plots below further detail the differences between glomeruli from normal and DKD segments for differentially enriched pathways and pathways of interest.

# Select pathways to feature. Here, selecting all pathways used in heatmap and pathway of interest selected by client
top_feat_gs2 <- getTopFeatures(geneSet_results2, 
                               n_features = 5,
                               p_adjust_column = p_adjust_method,
                               p_adjust_thr = p_adjust_threshold2, 
                               est_thr = fc_threshold2)
label_ge2 <- unique(c(poi, top_feat_gs2$all))

# set up violin plot data frame
violin_df <- cbind(pData(geneSet_data) %>% 
                    dplyr::select(eval(contrast_factor)),
                   t(exprs(geneSet_data[label_ge2,])))

# Select expression of getTopFeatures
vexp_getTopFeatures <- exprs(geneSet_data)[label_ge2, ]

# Create expression vector for violin plot
violin_df <- cbind(pData(geneSet_data) %>% dplyr::select(eval(contrast_factor)),
                  t(vexp_getTopFeatures))

violin_df <- violin_df %>% tidyr::pivot_longer(cols=-1, names_to = "Feature", values_to = "Enrichment")
colnames(violin_df)[1] <- "contrast_factor"
violin_df$Feature <- trimNames(violin_df$Feature, 25)

# Format summary stats
violin_p_df <- filter(geneSet_results2, Feature %in% label_ge2)
violin_p_df <- violin_p_df %>% tidyr::separate(col = Comparison, into=c("group1", "group2"), sep=" vs ")
violin_p_df[[p_adjust_method]] <- signif(violin_p_df[[p_adjust_method]], 3)
violin_p_df$Feature <- trimNames(violin_p_df$Feature, 25)
violin_exp_max <- ddply(violin_df, .(Feature), summarize,
                        y.position=max(Enrichment)+0.5) # +1 for safe log2
violin_p_df <- base::merge(violin_p_df, violin_exp_max, by="Feature")

p <- ggplot(violin_df, 
            aes(x=contrast_factor, y=Enrichment, fill=contrast_factor)) + 
  geom_violin() +
  geom_jitter(width=0.25, height=0, size = 1.5) + 
  scale_fill_manual(values = pal_main[[contrast_factor]]) +
  facet_wrap(~Feature, scales = "free_y") +
  labs(x = eval(contrast_factor), y = "Enrichment (z-score)") +
  scale_y_continuous(expand = expansion(mult = c(0.1, 0.2)), labels = scales::number_format(accuracy = 0.01)) +
  theme_bw(base_size = 12) + 
  theme(legend.position = "bottom") +
  guides(fill=guide_legend(title = eval(contrast_factor)))

p <- p + ggprism::add_pvalue(
  violin_p_df,
  label="{p_adjust_class} = {violin_p_df[[p_adjust_method]]}", label.size = 3.6,
  y.position = violin_p_df$y.position
)

ggsave(p, filename = file.path(ge_plot_dir, "contrast2_POI.svg"), width=12, height=8)
p

7.3.2 Results Table

The table is sorted based on FDR. Estimate reflects the relative difference in enrichment scores between the two groups in the Comparison column.

geneSet_results2$TargetNumber <-
 fData(geneSet_data)[match(geneSet_results2$Feature, rownames(geneSet_data)), 
                     "PresentTargets"]
geneSet_results2$Targets <-
 fData(geneSet_data)[match(geneSet_results2$Feature, rownames(geneSet_data)), 
                     "TargetIds"]
geneSet_results2$Pathway <- trimNames(geneSet_results2$Feature, 40)

# Change the adjusted P-value column name to its classified adjusted P-value method for display.
if(p_adjust_method_selected == "none"){
 dt2 <- geneSet_results2[, c("Pathway", "Comparison", "Estimate", "P", "TargetNumber", "Targets", "Feature")]
} else {
 dt2 <- geneSet_results2[, c("Pathway", "Comparison", "Estimate", "P","FDR", "TargetNumber", "Targets", "Feature")]
 colnames(dt2)[which(colnames(geneSet_results2) == p_adjust_method)] <- p_adjust_class
}

dt_params_gs$buttons <- 
 list(list(extend = "copy"),
      list(extend = "csv", filename = paste0("PathwayAnalysis2_", strings[1], "vs", strings[2], ".csv")),
      list(extend = "excel", filename = paste0("PathwayAnalysis2_", strings[1], "vs", strings[2], ".xlsx")))
DT::formatRound(
 DT::formatSignif(
  DT::datatable(
   dt2,
   extensions = c("Buttons", "Scroller", "FixedColumns"),
   options = dt_params_gs,
   rownames = FALSE),
  columns = c("P", p_adjust_class)),
 columns = c("Estimate"))