## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup, message=FALSE, warning=FALSE--------------------------------------
library(pathdb)
library(clusterProfiler)
library(dplyr)

# Preview the first few rows
head(hypoxia_deseq)

## ----define_deg---------------------------------------------------------------
# Define up- and down- regulation based on LFC and padj
df <- hypoxia_deseq |>
  as.data.frame() |>
  mutate(diffexp = case_when(
    log2FoldChange > 0 & padj < 0.05 ~ "UP",
    log2FoldChange < 0 & padj < 0.05 ~ "DOWN",
    TRUE ~ "NO" # using TRUE as catch-all for padj >= 0.1 or NA
  ))

# Filter out non-significant or missing adjusted p-value genes
df <- df[df$diffexp != "NO" & !is.na(df$diffexp), ]

## ----convert_ids, eval=FALSE--------------------------------------------------
# # Convert ids to Ensembl format to ensure compatibility
# df <- convert_id(
#   genes = rownames(df),
#   data = df,
#   species_id = 96
# )

## ----split_lists--------------------------------------------------------------
# Split into up or down groups in a list
deg_results_list <- split(df, df$diffexp)

## ----t2g_prep, eval = FALSE---------------------------------------------------
# bg_genes <- T2G_prep(
#   species_id = 96,
#   category = c("KEGG"),
#   genes = rownames(hypoxia_deseq)
# )

## ----t2g_load, echo=FALSE-----------------------------------------------------
bg_genes <- hypoxia_T2G

## ----run_ora------------------------------------------------------------------
padj_cutoff <- 0.1
genecount_cutoff <- 5

# Apply enricher across Up/Down lists
res <- lapply(names(deg_results_list), function(direction) {
  enricher(
    gene = rownames(deg_results_list[[direction]]),
    pvalueCutoff = 0.05,
    TERM2GENE = bg_genes
  )
})

# Name the lists according to regulation
names(res) <- names(deg_results_list)

## ----format_results-----------------------------------------------------------
# Bind results by row
res_df <- lapply(names(res), function(x) {
  res_chunk <- res[[x]]@result
  res_chunk$diffexp <- x # Keep track of the regulation direction
  return(res_chunk)
})
res_df <- do.call(rbind, res_df)

# Filter rows that meet our significance and count standards
target_pws <- res_df$p.adjust < padj_cutoff & res_df$Count > genecount_cutoff

# Filter and select top terms
res_sig <- res_df[target_pws, ] |>
  arrange(p.adjust) |>
  group_by(diffexp) |>
  slice_head(n = 20) |> # get up to 20 top pathways per direction
  ungroup()

# View enriched pathways for UP-regulated genes
res_up <- res_sig |> filter(diffexp == "UP")
head(res_up[, c("GeneRatio", "FoldEnrichment", "p.adjust")])
# View enriched pathways for DOWN-regulated genes
res_down <- res_sig |> filter(diffexp == "DOWN")
head(res_up[, c("GeneRatio", "FoldEnrichment", "p.adjust")])

## ----gsea_prep----------------------------------------------------------------
# Create a ranked geneList
geneList <- setNames(
  object = hypoxia_deseq$log2FoldChange,
  nm = rownames(hypoxia_deseq)
)

# Order the list from highest to lowest
geneList <- sort(geneList, decreasing = TRUE)

## ----run_gsea, warning=FALSE--------------------------------------------------
# Execute GSEA
gsea <- GSEA(
  geneList = geneList,
  TERM2GENE = bg_genes,
  pvalueCutoff = 0.05,
  maxGSSize = 2000
)

# Extract and filter top results
gsea_res <- gsea@result |>
  arrange(p.adjust) |>
  slice(1:20)

head(gsea_res[, c("setSize", "enrichmentScore", "NES", "p.adjust")])