scDiffCom: GO Annotation
Cyril Lagger, Eugen Ursu
scDiffCom-GO-vignette.Rmd
The objectives of this vignette are:
- Show some of the details on how the GO Term annotation takes place
in
scDiffCom
. - Investigate the properties of GO Term merging modes.
Let’s start by presenting the relevant data structures used within
scDiffCom
.
GO Terms data structures in scDiffCom
First, there is the gene ontology data.table
that
describes the location of each GO term in the GO ontology graph. In
particular, note the integer column LEVEL
that is a proxy
for the distance to the GO term in the GO ontology graph from the
“roots” (biological process, molecular function, cellular
component).
library(knitr)
library(glue)
library(data.table)
library(ggplot2)
library(easyGgplot2)
library(scDiffCom)
data(gene_ontology_level)
head(gene_ontology_level)
Second, there is another data.table
that connects LRIs
to GO terms. For example, for mice, there are two ways of accessing
it:
- Compute
- Precomputed - internal
scDiffCom
data:scDiffCom::LRI_mouse$LRI_curated_GO
, computed with (1)
So internally, get_GO_interactions
performs the
assignment of GO Terms to LRI. We allow for several modes of merging the
GO terms for the ligands and receptors into a set of GO terms associated
with a LRI. Below we compute the LRI_curated_GO
for each
such mode. In what follows, we will compare them.
Let’s see how different are the assignments of GO Terms to LRI as a function of merging modes. Firstly, we check the distribution across LRI of the maximum level of the assigned GO Terms. In other words, for each LRI, we extract the levels of each of the assigned GO Terms and take the maximum over this set.
plots_max = list()
plots_median = list()
for (i in 1:length(LRI_curated_GO_list)) {
GO = LRI_curated_GO_list[[i]]
merge_mode = names(LRI_curated_GO_list)[i]
GO = merge(GO, gene_ontology_level, by.x="GO_ID", by.y="ID", all.x=TRUE)
GO = GO[!is.na(GO$LEVEL)]
GO_max_level = GO[, list(max_level = max(LEVEL)), by="LRI"]
plot_max = (ggplot(data=GO_max_level, aes(x=max_level))
+ geom_histogram(binwidth = 0.5)
+ xlim(0, 18)
# + scale_x_continuous(breaks = 1:18)
+ ylim(0, 1200)
+ ggtitle(glue::glue("{merge_mode}\nNum GO:LRI={nrow(GO)}\nN={nrow(GO_max_level)}\n"))
)
plots_max[[merge_mode]] = plot_max
GO_median_level = GO[, list(median_level = round(median(LEVEL))), by="LRI"]
# GO_median_level = GO_median_level[!is.na(GO_median_level$median_level), ]
plot_median = (ggplot(data=GO_median_level, aes(x=median_level))
+ geom_histogram(binwidth = 0.5)
+ xlim(0, 13)
# + scale_x_continuous(breaks = 1:18)
+ ylim(0, 2100)
+ ggtitle(glue::glue("{merge_mode}\nNum GO:LRI={nrow(GO)}\nN={nrow(GO_median_level)}"))
)
plots_median[[merge_mode]] = plot_median
}
ggplot2.multiplot(plotlist=plots_max, cols=4)
Now let’s check the median level distributions:
ggplot2.multiplot(plotlist=plots_median, cols=4)
Overall GO term level distribution
We present below the distribution of levels from the GO Terms. This provides a baseline value of what to expect when examining the levels of the relevant GO Terms in other contexts.
GO_LEVEL_NO_OTHER = gene_ontology_level[ASPECT != "other"]
(ggplot(data=GO_LEVEL_NO_OTHER, aes(x=LEVEL))
+ geom_histogram(binwidth = 0.5)
+ scale_x_continuous(breaks = 1:18)
+ ggtitle(glue::glue("GO Terms level distribution.\nN={nrow(GO_LEVEL_NO_OTHER)}"))
)
We compute the median level for reference.
median(GO_LEVEL_NO_OTHER$LEVEL)
Exploring different merging modes on scDiffCom
toy
object.
Now we have the LRI GO annotations obtained by the different merging
modes. We test them on the toy object available in the package
scDiffCom::seurat_sample_tms_liver
. First we perform the
raw scDiffCom
analysis. Then we perform ORA
on
the results using the different LRI GO annotations and compare the
results. Finally, we plot the ORA results from different merging
modes.
params = list(
LRI_species = "mouse",
seurat_celltype_id = "cell_type",
seurat_condition_id = list(
column_name = "age_group",
cond1_name = "YOUNG",
cond2_name = "OLD"
),
object_name = "scDiffCom_object",
seurat_assay = "RNA",
seurat_slot = "data",
log_scale = FALSE,
score_type = "geometric_mean",
threshold_min_cells = 5,
threshold_pct = 0.1,
iterations = 1000,
threshold_quantile_score = 0.2,
threshold_p_value_specificity = 0.05,
threshold_p_value_de = 0.05,
threshold_logfc = log(1.5), # log(1.5)
return_distributions = FALSE,
seed = 42,
verbose = TRUE
)
results = scDiffCom:::run_internal_raw_analysis(
seurat_object = scDiffCom::seurat_sample_tms_liver,
LRI_table = scDiffCom::LRI_mouse$LRI_curated,
LRI_species = "mouse",
params = params
)
results = scDiffCom::FilterCCI(results)
results_ora = list()
for (i in 1:length(LRI_curated_GO_list)) {
GO = LRI_curated_GO_list[[i]]
merge_mode = names(LRI_curated_GO_list)[i]
tryCatch(
{
results_ora[[merge_mode]] = scDiffCom:::run_ora(
object = results,
categories = "GO_TERMS",
extra_annotations = NULL,
overwrite = TRUE,
stringent_or_default = "default",
stringent_logfc_threshold = NULL,
verbose = TRUE,
class_signature = "scDiffCom",
global = FALSE,
LRI_curated_GO = GO
)
},
error=function (cond) {
print(cond)
}
)
}
dt_ora = data.table::data.table()
for (i in 1:length(results_ora)) {
merge_mode = names(results_ora)[i]
res = results_ora[[merge_mode]]
tryCatch(
{
dt = res@ora_table$GO_TERMS
dt_m = data.table::melt(
dt,
id.vars = c("VALUE_BIS", "VALUE", "CATEGORY", "LEVEL"),
measure.vars = c("BH_P_VALUE_UP", "BH_P_VALUE_DOWN", "BH_P_VALUE_FLAT"),
value.name = c("P_VAL")
)
dt_m$variable = as.character(dt_m$variable)
dt_m$TYPE = purrr::map_chr(dt_m$variable, function(s) {strsplit(s, split="_")[[1]][4]})
dt_m = dt_m[dt_m$P_VAL < 0.05, ]
dt_m$MERGE_MODE = merge_mode
dt_ora = rbind(dt_ora, dt_m)
},
error=function (cond) {
print(cond)
}
)
}
dt_ora = dt_ora[!is.na(LEVEL)]
dt_ora = dt_ora[TYPE %in% c("UP", "DOWN")]
violin_plot = ggplot(data=dt_ora, aes(y=LEVEL, x=MERGE_MODE)) +
geom_violin() +
scale_x_discrete(labels = c(
"ancestors_intersection" = glue("ancestors_intersection\nN={table(dt_ora$MERGE_MODE)['ancestors_intersection']}"),
"intersection" = glue("intersection\nN={table(dt_ora$MERGE_MODE)['intersection']}"),
"ancestors_union" = glue("ancestors_union\nN={table(dt_ora$MERGE_MODE)['ancestors_union']}"),
"union" = glue("union\nN={table(dt_ora$MERGE_MODE)['union']}"),
"ancestors_ligands" = glue("ancestors_ligands\nN={table(dt_ora$MERGE_MODE)['ancestors_ligands']}"),
"ligands" = glue("ligands\nN={table(dt_ora$MERGE_MODE)['ligands']}"),
"ancestors_receptors" = glue("ancestors_receptors\nN={table(dt_ora$MERGE_MODE)['ancestors_receptors']}"),
"receptors" = glue("receptors\nN={table(dt_ora$MERGE_MODE)['receptors']}")
)) +
xlab("GO Terms merging modes") +
ylab("Level") +
ggtitle("Distribution of GO Term levels by merging modes.") +
theme(axis.text.x = element_text(size=12, angle=0))
violin_plot