#remotes::install_github("wanglabcsu/bregr", dependencies = TRUE)
library(bregr)
#> Welcome to 'bregr' package!
#> =======================================================================
#> You are using bregr version 1.0.0.9000
#>
#> Project home : https://github.com/WangLabCSU/bregr
#> Documentation: https://wanglabcsu.github.io/bregr/
#> Cite as : arXiv:2110.14232
#> =======================================================================
#>
library(UCSCXenaTools)
#> =========================================================================================
#> UCSCXenaTools version 1.6.0
#> Project URL: https://github.com/ropensci/UCSCXenaTools
#> Usages: https://cran.r-project.org/web/packages/UCSCXenaTools/vignettes/USCSXenaTools.html
#>
#> If you use it in published research, please cite:
#> Wang et al., (2019). The UCSCXenaTools R package: a toolkit for accessing genomics data
#> from UCSC Xena platform, from cancer multi-omics to single-cell RNA-seq.
#> Journal of Open Source Software, 4(40), 1627, https://doi.org/10.21105/joss.01627
#> =========================================================================================
#> --Enjoy it--
library(UCSCXenaShiny)
#> =========================================================================================
#> UCSCXenaShiny version 2.2.0
#> Project URL: https://github.com/openbiox/UCSCXenaShiny
#> Usages: https://openbiox.github.io/UCSCXenaShiny/
#>
#> If you use it in published research, please cite:
#> Shensuo Li, Yuzhong Peng, Minjun Chen, Yankun Zhao, Yi Xiong, Jianfeng Li, Peng Luo,
#> Haitao Wang, Fei Zhao, Qi Zhao, Yanru Cui, Sujun Chen, Jian-Guo Zhou, Shixiang Wang,
#> Facilitating integrative and personalized oncology omics analysis with UCSCXenaShiny,
#> Communications Biology, 1200 (2024), https://doi.org/10.1038/s42003-024-06891-2
#> =========================================================================================
#> --Enjoy it--
data = XenaPrepare("HiSeqV2_PANCAN.gz")
head(data[, 1:10])
#> # A tibble: 6 × 10
#> sample `TCGA-AR-A5QQ-01` `TCGA-D8-A1JA-01` `TCGA-BH-A0BQ-01` `TCGA-BH-A0BT-01`
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 ARHGE… -0.357 -2.43 -0.542 -0.844
#> 2 HIF3A -3.98 -1.89 -2.83 -4.21
#> 3 RNF17 -0.531 0.0935 0.0216 -0.531
#> 4 RNF10 -0.410 0.141 0.189 1.41
#> 5 RNF11 0.161 2.56 0.443 0.123
#> 6 RNF13 -0.518 0.380 -0.0497 -0.0659
#> # ℹ 5 more variables: `TCGA-A8-A06X-01` <dbl>, `TCGA-A8-A096-01` <dbl>,
#> # `TCGA-BH-A0C7-01` <dbl>, `TCGA-AC-A5XU-01` <dbl>, `TCGA-PE-A5DE-01` <dbl>
Firstly, we need to convert genes to columns.
data_t = tidyr::pivot_longer(data, cols = colnames(data)[-1]) |>
tidyr::pivot_wider(id_cols = "name", names_from = "sample", values_from = "value", values_fill = NA_real_)
head(data_t[, 1:10])
#> # A tibble: 6 × 10
#> name ARHGEF10L HIF3A RNF17 RNF10 RNF11 RNF13 GTF2IP1 REM1 MTVR2
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 TCGA-AR… -0.357 -3.98 -0.531 -0.410 0.161 -0.518 -0.896 -1.02 -0.423
#> 2 TCGA-D8… -2.43 -1.89 0.0935 0.141 2.56 0.380 -0.188 -1.71 -0.423
#> 3 TCGA-BH… -0.542 -2.83 0.0216 0.189 0.443 -0.0497 0.326 0.449 -0.423
#> 4 TCGA-BH… -0.844 -4.21 -0.531 1.41 0.123 -0.0659 0.150 -1.51 -0.423
#> 5 TCGA-A8… -0.222 -4.97 -0.531 0.226 0.287 -0.342 0.169 0.0139 1.38
#> 6 TCGA-A8… -0.0975 -5.28 0.345 0.0342 -0.112 -0.0621 -0.0119 0.0606 -0.423
Next join the overall survival data.
data2 = dplyr::inner_join(
tcga_surv |> dplyr::select(sample, OS, OS.time) |>
dplyr::filter(!is.na(OS), !is.na(OS.time)),
data_t, by = c("sample" = "name")
)
head(data2[, 1:10])
#> sample OS OS.time ARHGEF10L HIF3A RNF17 RNF10
#> 1 TCGA-3C-AAAU-01 0 4047 0.6073075 -3.194126 -0.53103501 -0.14587199
#> 2 TCGA-3C-AALI-01 0 4005 -0.6411925 -4.928226 0.09546499 0.09812801
#> 3 TCGA-3C-AALJ-01 0 1474 1.0828075 -4.623726 -0.53103501 0.48402801
#> 4 TCGA-3C-AALK-01 0 1448 0.1216075 -2.881526 -0.53103501 0.17912801
#> 5 TCGA-4H-AAAK-01 0 348 0.4202075 -3.282726 -0.53103501 -0.02097199
#> 6 TCGA-5L-AAT0-01 0 1477 0.4748075 -2.775226 -0.53103501 0.24602801
#> RNF11 RNF13 GTF2IP1
#> 1 0.23742187 -0.2992099 -0.14269449
#> 2 -0.54197813 -0.3229099 -0.04469449
#> 3 -0.18367813 -0.9190099 0.26110551
#> 4 0.03922187 -0.4549099 0.18030551
#> 5 -0.11797813 -0.5578099 -0.17379449
#> 6 -0.37837813 -0.8308099 0.33890551
So we next run bregr with 1210 samples, 2.053^{4} variables.
gene_names = names(data_t)[-1]
# options(
# bregr.save_model = TRUE,
# bregr.path = "./bregr_models"
# )
With parallel computation:
system.time(
br_pipeline(
data2,
y = c("OS.time", "OS"),
x = gene_names,
method = "coxph",
run_parallel = 40
) |>
suppressWarnings()
)
#> exponentiate estimates of model(s) constructed from coxph method at default
#> ■■■ 5% | ETA: 22s
#>
#> ■■■■■■■■■ 26% | ETA: 13s
#>
#> ■■■■■■■■■■■■■■■ 47% | ETA: 8s
#>
#> ■■■■■■■■■■■■■■■■■■■ 60% | ETA: 7s
#>
#> ■■■■■■■■■■■■■■■■■■■■■■■■ 78% | ETA: 4s
#>
#> ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 98% | ETA: 0s
#>
#> ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 100% | ETA: 0s
#> user system elapsed
#> 3295.860 1466.084 3821.251
Without parallel computation: