Select window_size, overlap_size, and
PCA/K-NN Parameters
- We simulate the output of the
{methylKit} package.
- Note: WGBS/EM-seq data should be grouped by
chromosome before performing sliding window imputation.
- Here, we simulate 1000 sites.
- Importantly, we simulate the location vector of each feature
(genomic position) with varying spacing of 50 to 500 bp apart from each
other.
- The
locations vector contains the genomic position of
each feature (column). It is used to determine which columns are grouped
together given a window size.
set.seed(1234)
sample_names <- paste0("S", 1:10)
n_sites <- 1000
# Simulate positions with 50–500 bp between each site
distances_between <- sample(50:500, size = n_sites, replace = TRUE)
locations <- cumsum(distances_between) # <- important, location vector
methyl <- data.frame(
chr = "chr1",
start = locations,
end = locations,
strand = "+"
)
for (i in seq_along(sample_names)) {
methyl[[paste0("numCs", i)]] <- sample.int(100, size = n_sites, replace = TRUE)
methyl[[paste0("numTs", i)]] <- sample.int(100, size = n_sites, replace = TRUE)
methyl[[paste0("coverage", i)]] <- methyl[[paste0("numCs", i)]] + methyl[[paste0("numTs", i)]]
}
methyl[1:5, 1:10]
#> chr start end strand numCs1 numTs1 coverage1 numCs2 numTs2 coverage2
#> 1 chr1 333 333 + 83 68 151 84 36 120
#> 2 chr1 718 718 + 82 2 84 54 99 153
#> 3 chr1 1173 1173 + 48 2 50 44 16 60
#> 4 chr1 1323 1323 + 70 48 118 48 56 104
#> 5 chr1 1483 1483 + 88 15 103 83 19 102
- First, we convert the data into a beta matrix with features in the
columns.
numCs_matrix <- as.matrix(methyl[, paste0("numCs", seq_along(sample_names))])
cov_matrix <- as.matrix(methyl[, paste0("coverage", seq_along(sample_names))])
beta_matrix <- numCs_matrix / cov_matrix
colnames(beta_matrix) <- sample_names
rownames(beta_matrix) <- methyl$start
beta_matrix <- t(beta_matrix)
# Set 10% of the data to missing
set.seed(1234)
beta_matrix[sample.int(length(beta_matrix), floor(length(beta_matrix) * 0.1))] <- NA
beta_matrix[1:4, 1:4]
#> 333 718 1173 1323
#> S1 0.54966887 0.9761905 0.96000000 0.5932203
#> S2 0.70000000 0.3529412 0.73333333 0.4615385
#> S3 0.06521739 0.7053571 NA 0.6507937
#> S4 0.69444444 0.2846154 0.04347826 0.7636364
- Then, in a real dataset, we would tune hyperparameters using
chr22. Here, as a demonstration, we use the whole data
since the size is small.
- We use 2 repetitions of cross-validation (increase to 10–30 in real
analyses). We are selecting between:
ncp (number of principal components) of 2 or 4,
indicating that we are performing sliding PCA imputation. Pass
k for sliding K-NN imputation.
window_size of 5,000 or 10,000 bp.
overlap_size fixed at 1,000 bp (does not affect results
much in real analyses).
params <- expand.grid(ncp = c(2, 4), window_size = c(5000, 10000))
params$overlap_size <- 1000
params$min_window_n <- 20 # windows with less than 20 columns are dropped
# Increase n_reps from 2 in actual analyses and use another chromosome (i.e., chr22)
tune_slide_pca <- tune_imp(
obj = beta_matrix,
parameters = params,
.f = "slide_imp",
n_reps = 2,
location = locations
)
#> Tuning slide_imp
#> Step 1/2: Resolving NA locations
#> ℹ Using default `num_na` = 500 (~5% of cells).
#> Increase for more reliability or decrease if missing is dense.
#> Running Mode: sequential...
#>
#> Step 2/2: Tuning
metrics <- compute_metrics(tune_slide_pca)
aggregate(.estimate ~ .metric + ncp + window_size, data = metrics, FUN = mean)
#> .metric ncp window_size .estimate
#> 1 mae 2 5000 0.2251033
#> 2 rmse 2 5000 0.2830913
#> 3 mae 4 5000 0.2391354
#> 4 rmse 4 5000 0.2976971
#> 5 mae 2 10000 0.2116169
#> 6 rmse 2 10000 0.2639824
#> 7 mae 4 10000 0.2220744
#> 8 rmse 4 10000 0.2754316
- Finally, we can use
slide_imp() to impute the full
beta_matrix. Use the best parameter combination from the
cross-validation metrics.
- First, we use
dry_run = TRUE to examine the columns to
be imputed.
start and end are window location
vectors.
window_n is the number of features included in the
window.
slide_imp(
obj = beta_matrix,
location = locations,
window_size = 5000,
overlap_size = 1000,
ncp = 2,
min_window_n = 20,
dry_run = TRUE # <- dry_run to inspect the windows
)
#> Dropping 43 window(s) with fewer than 20 columns.
#> # slideimp table: 24 x 4
#> start end window_n subset_local
#> 15 34 20 <double [20]>
#> 104 124 21 <double [21]>
#> 188 209 22 <double [22]>
#> 205 225 21 <double [21]>
#> 222 241 20 <double [20]>
#> 239 258 20 <double [20]>
#> 254 273 20 <double [20]>
#> 369 388 20 <double [20]>
#> 385 404 20 <double [20]>
#> 402 424 23 <double [23]>
#> # ... with 14 more rows
- Turn off
dry_run to impute the data
slide_imp(
obj = beta_matrix,
location = locations,
window_size = 5000,
overlap_size = 1000,
ncp = 2,
min_window_n = 20,
dry_run = FALSE,
.progress = FALSE
)
#> Method: slide_imp (PCA imputation)
#> Dimensions: 10 x 1000
#> Note: requested columns still contain NA values
#>
#> 333 718 1173 1323 1483 1925
#> S1 0.54966887 0.9761905 0.96000000 0.5932203 NA 0.08695652
#> S2 0.70000000 0.3529412 0.73333333 0.4615385 0.8137255 0.45454545
#> S3 0.06521739 0.7053571 NA 0.6507937 0.4393064 0.37735849
#> S4 0.69444444 0.2846154 0.04347826 0.7636364 0.9595960 0.54621849
#> S5 0.83505155 0.5777778 0.47517730 NA 0.7368421 0.21666667
#> S6 0.26612903 0.3451327 0.53072626 0.5000000 0.6465517 0.37288136
#>
#> # Showing [1:6, 1:6] of full matrix
Subset and Flanking Mode
- Use this mode when you only need to impute specific target features
(e.g., clock CpGs) rather than the entire dataset.
- Pass the desired feature names to the
subset argument.
Only windows containing these features will be imputed.
- Set
flank = TRUE to build windows centered on
each feature in the subset. Each window will extend
window_size bp on either side of the target feature
(flanking mode).
- In this mode, the
overlap_size argument is
ignored.
- In this example, we only want to impute the features
"1323" and "33810" by creating 5,000 bp
flanking windows around each feature:
slide_imp(
obj = beta_matrix,
location = locations,
window_size = 5000,
ncp = 2,
min_window_n = 20,
subset = c("1323", "33810"),
flank = TRUE,
dry_run = TRUE
)
#> # slideimp table: 2 x 5
#> start end window_n target subset_local
#> 1 22 22 4 <integer [1]>
#> 101 138 38 122 <integer [1]>