4 Exploratory Data Analysis
The following section is used to understand the general structure of the data and whether the hypotheses that were defined initially are likely to identify significant findings. This qualitative exploration can also help identify any study design considerations for downstream analyses.
4.1 Dimensional Reduction
In this section, we use different dimensional reduction techniques to identify broad patterns in the expression data.
Analyst Notes: Clear separation between Geometric segments (i.e., glomerulus) and PanCK segmentation (i.e., tubules). tSNE shows further clustering by PanCK status. Clustering based on disease type is also present in glomeruli (see shape clustering of grey points).
target_data <- getPCA(target_data, elt="log_q", the_prefix = "PCA")
p <- ggplot(data=pData(target_data),
aes(x=PCA_coords_1, y=PCA_coords_2)) +
geom_point(aes_string(color=dim_reduction_color_by,
shape=dim_reduction_shape_by), alpha=0.5, size=3) +
labs(x=paste0("PCA 1 (", round(experimentData(target_data)@other$PCA[1], 2), "%)"),
y=paste0("PCA 2 (", round(experimentData(target_data)@other$PCA[2], 2), "%)"),
title="PCA") +
theme_bw() + theme(legend.position = "right")
if(!is.null(dim_reduction_color_by)){
p <- p + scale_color_manual(values = pal_main[[dim_reduction_color_by]])
}
ggsave(p, filename = file.path(qc_dir, "PCA_log2_norm.svg"), width=8, height=5)
print(p)
p <- ggplot(data=pData(target_data),
aes(x=PCA_coords_1, y=PCA_coords_2)) +
geom_point(aes_string(color=dim_reduction_color_by,
shape=dim_reduction_shape_by), alpha=0.5, size=3) +
facet_wrap(~`slide name`, nrow=2) +
labs(x=paste0("PCA 1 (", round(experimentData(target_data)@other$PCA[1], 2), "%)"),
y=paste0("PCA 2 (", round(experimentData(target_data)@other$PCA[2], 2), "%)")) +
theme_bw() + theme(legend.position = "bottom")
if(!is.null(dim_reduction_color_by)){
p <- p + scale_color_manual(values = pal_main[[dim_reduction_color_by]])
}
ggsave(p, filename = file.path(qc_dir, "PCA_log2_norm_faceted.svg"), width=8, height=5)
print(p)
target_data <- gettSNE(target_data, elt="log_q", the_prefix = "tSNE")
p <- ggplot(data=pData(target_data),
aes(x=tSNE_coords_1, y=tSNE_coords_2)) +
geom_point(aes_string(color=dim_reduction_color_by,
shape=dim_reduction_shape_by), alpha=0.5, size=3) +
labs(x="tSNE 1", y="tSNE 2", title = "tSNE") +
theme_bw() + theme(legend.position = "right")
if(!is.null(dim_reduction_color_by)){
p <- p + scale_color_manual(values = pal_main[[dim_reduction_color_by]])
}
ggsave(p, filename = file.path(qc_dir, "tSNE_log2_norm.svg"), width=6, height=5)
print(p)
p <- ggplot(data=pData(target_data),
aes(x=tSNE_coords_1, y=tSNE_coords_2)) +
geom_point(aes_string(color=dim_reduction_color_by,
shape=dim_reduction_shape_by), alpha=0.5, size=3) +
facet_wrap(~`slide name`, nrow=2) +
labs(x="tSNE 1", y="tSNE 2") +
theme_bw() +
theme(legend.position = "bottom", legend.box="vertical", legend.margin=margin())
if(!is.null(dim_reduction_color_by)){
p <- p + scale_color_manual(values = pal_main[[dim_reduction_color_by]])
}
ggsave(p, filename = file.path(qc_dir, "tSNE_log2_norm_faceted.svg"), width=10, height=5)
print(p)
target_data <- getUMAP(target_data, elt="log_q", the_prefix = "UMAP")
p <- ggplot(data=pData(target_data),
aes(x=UMAP_coords_1, y=UMAP_coords_2)) +
geom_point(aes_string(color=dim_reduction_color_by,
shape=dim_reduction_shape_by), alpha=0.5, size=3) +
labs(x="UMAP 1", y="UMAP 2", title="UMAP") +
theme_bw() + theme(legend.position = "right")
if(!is.null(dim_reduction_color_by)){
p <- p + scale_color_manual(values = pal_main[[dim_reduction_color_by]])
}
ggsave(p, filename = file.path(qc_dir, "UMAP_log2_norm.svg"), width=6, height=5)
print(p)
p <- ggplot(data=pData(target_data),
aes(x=UMAP_coords_1, y=UMAP_coords_2)) +
geom_point(aes_string(color=dim_reduction_color_by,
shape=dim_reduction_shape_by), alpha=0.5, size=3) +
facet_wrap(~`slide name`, nrow=2) +
labs(x="UMAP 1", y="UMAP 2") +
theme_bw() + theme(legend.position = "bottom", legend.box="vertical", legend.margin=margin())
if(!is.null(dim_reduction_color_by)){
p <- p + scale_color_manual(values = pal_main[[dim_reduction_color_by]])
}
ggsave(p, filename = file.path(qc_dir, "UMAP_log2_norm_faceted.svg"), width=10, height=5)
print(p)
4.2 Clustering features with high coefficient of variation
Another approach to explore the data is to calculate the coefficient of variation (CV) for each gene (\(g\)) using the formula \(CV_g = SD_g/mean_g\). We then identify genes with high CVs that should have large differences across the various profiled segments. This unbiased approach can reveal highly variable genes across the study. For this study, we examined the 1013 genes that represent the 90th quantile.
We plot the results using unsupervised hierarchical clustering based on Pearson distance, displayed as a heatmap.
Analyst Notes: Top CV genes cluster by region. Tubule segments are represented in cluster #1 and glomerulus segments largely cluster in #2. Finer clustering is observed based on differences between segment type within tubules.
CV_dat <- assayDataApply(target_data, elt = "log_q", MARGIN = 1, getCV)
cv_feats <- names(CV_dat)[CV_dat > quantile(CV_dat, top_quantile)]
if(file.exists(file.path(qc_dir, "cv_heatmap.RDS"))){
cv_heatmap <- readRDS(file.path(qc_dir, "cv_heatmap.RDS"))
} else {
cv_heatmap <- Heatmap(t(scale(t(assayDataElement(target_data[cv_feats, ],
elt = "log_q")))),
col = colorRamp2(c(seq(-2.5, 2.5, 0.05)),
c(colorRampPalette(c("#0092b5", "white", "#a6ce39"))(101))),
name = 'z-score',
clustering_distance_rows = "pearson", clustering_method_rows = "average",
clustering_distance_columns = "pearson", clustering_method_columns = "average",
column_split = 2, row_split = 2,
border_gp = gpar(col = "darkgray"),
show_row_names = FALSE, show_column_names = FALSE,
use_raster = TRUE,
top_annotation =
HeatmapAnnotation(df= pData(target_data)[, factors_of_interest],
col = pal_heatmaps,
gp = gpar(col = "gray"))
)
saveRDS(cv_heatmap, file=file.path(qc_dir, "cv_heatmap.RDS"))
}
#svglite::svglite(filename = file.path(qc_dir, "cv_heatmap.svg"),
# width=10, height=6.5)
#draw(cv_heatmap, heatmap_legend_side = "left")
#dev.off()
draw(cv_heatmap, heatmap_legend_side = "left")