TCGA-BRCA Gene Expression Analysis with bregr

Shixiang Wang

Central South University
wangshx@csu.edu.cn

2025-08-09

#remotes::install_github("wanglabcsu/bregr", dependencies = TRUE)

library(bregr)
#> Welcome to 'bregr' package!
#> =======================================================================
#> You are using bregr version 1.0.0.9000
#> 
#> Project home : https://github.com/WangLabCSU/bregr
#> Documentation: https://wanglabcsu.github.io/bregr/
#> Cite as      : arXiv:2110.14232
#> =======================================================================
#> 
library(UCSCXenaTools)
#> =========================================================================================
#> UCSCXenaTools version 1.6.0
#> Project URL: https://github.com/ropensci/UCSCXenaTools
#> Usages: https://cran.r-project.org/web/packages/UCSCXenaTools/vignettes/USCSXenaTools.html
#> 
#> If you use it in published research, please cite:
#> Wang et al., (2019). The UCSCXenaTools R package: a toolkit for accessing genomics data
#>   from UCSC Xena platform, from cancer multi-omics to single-cell RNA-seq.
#>   Journal of Open Source Software, 4(40), 1627, https://doi.org/10.21105/joss.01627
#> =========================================================================================
#>                               --Enjoy it--
library(UCSCXenaShiny)
#> =========================================================================================
#> UCSCXenaShiny version 2.2.0
#> Project URL: https://github.com/openbiox/UCSCXenaShiny
#> Usages: https://openbiox.github.io/UCSCXenaShiny/
#> 
#> If you use it in published research, please cite:
#>   Shensuo Li, Yuzhong Peng, Minjun Chen, Yankun Zhao, Yi Xiong, Jianfeng Li, Peng Luo, 
#>   Haitao Wang, Fei Zhao, Qi Zhao, Yanru Cui, Sujun Chen, Jian-Guo Zhou, Shixiang Wang,  
#>   Facilitating integrative and personalized oncology omics analysis with UCSCXenaShiny, 
#>   Communications Biology, 1200 (2024),  https://doi.org/10.1038/s42003-024-06891-2
#> =========================================================================================
#>                               --Enjoy it--

Download Data

XenaGenerate() |>
  XenaFilter(filterDatasets = "TCGA.BRCA.sampleMap/HiSeqV2_PANCAN")
#> class: XenaHub 
#> hosts():
#>   https://tcga.xenahubs.net
#> cohorts() (1 total):
#>   TCGA Breast Cancer (BRCA)
#> datasets() (1 total):
#>   TCGA.BRCA.sampleMap/HiSeqV2_PANCAN
XenaGenerate() |>
  XenaFilter(filterDatasets = "TCGA.BRCA.sampleMap/HiSeqV2_PANCAN") |> 
  XenaQuery() |> 
  XenaDownload(destdir = "")

Preprocessing Data

data = XenaPrepare("HiSeqV2_PANCAN.gz")
head(data[, 1:10])
#> # A tibble: 6 × 10
#>   sample `TCGA-AR-A5QQ-01` `TCGA-D8-A1JA-01` `TCGA-BH-A0BQ-01` `TCGA-BH-A0BT-01`
#>   <chr>              <dbl>             <dbl>             <dbl>             <dbl>
#> 1 ARHGE…            -0.357           -2.43             -0.542            -0.844 
#> 2 HIF3A             -3.98            -1.89             -2.83             -4.21  
#> 3 RNF17             -0.531            0.0935            0.0216           -0.531 
#> 4 RNF10             -0.410            0.141             0.189             1.41  
#> 5 RNF11              0.161            2.56              0.443             0.123 
#> 6 RNF13             -0.518            0.380            -0.0497           -0.0659
#> # ℹ 5 more variables: `TCGA-A8-A06X-01` <dbl>, `TCGA-A8-A096-01` <dbl>,
#> #   `TCGA-BH-A0C7-01` <dbl>, `TCGA-AC-A5XU-01` <dbl>, `TCGA-PE-A5DE-01` <dbl>

Firstly, we need to convert genes to columns.

data_t = tidyr::pivot_longer(data, cols = colnames(data)[-1]) |> 
  tidyr::pivot_wider(id_cols = "name", names_from = "sample", values_from = "value", values_fill = NA_real_)
head(data_t[, 1:10])
#> # A tibble: 6 × 10
#>   name     ARHGEF10L HIF3A   RNF17   RNF10  RNF11   RNF13 GTF2IP1    REM1  MTVR2
#>   <chr>        <dbl> <dbl>   <dbl>   <dbl>  <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
#> 1 TCGA-AR…   -0.357  -3.98 -0.531  -0.410   0.161 -0.518  -0.896  -1.02   -0.423
#> 2 TCGA-D8…   -2.43   -1.89  0.0935  0.141   2.56   0.380  -0.188  -1.71   -0.423
#> 3 TCGA-BH…   -0.542  -2.83  0.0216  0.189   0.443 -0.0497  0.326   0.449  -0.423
#> 4 TCGA-BH…   -0.844  -4.21 -0.531   1.41    0.123 -0.0659  0.150  -1.51   -0.423
#> 5 TCGA-A8…   -0.222  -4.97 -0.531   0.226   0.287 -0.342   0.169   0.0139  1.38 
#> 6 TCGA-A8…   -0.0975 -5.28  0.345   0.0342 -0.112 -0.0621 -0.0119  0.0606 -0.423

Next join the overall survival data.

data2 = dplyr::inner_join(
  tcga_surv |> dplyr::select(sample, OS, OS.time) |> 
    dplyr::filter(!is.na(OS), !is.na(OS.time)),
  data_t, by = c("sample" = "name")
)
head(data2[, 1:10])
#>            sample OS OS.time  ARHGEF10L     HIF3A       RNF17       RNF10
#> 1 TCGA-3C-AAAU-01  0    4047  0.6073075 -3.194126 -0.53103501 -0.14587199
#> 2 TCGA-3C-AALI-01  0    4005 -0.6411925 -4.928226  0.09546499  0.09812801
#> 3 TCGA-3C-AALJ-01  0    1474  1.0828075 -4.623726 -0.53103501  0.48402801
#> 4 TCGA-3C-AALK-01  0    1448  0.1216075 -2.881526 -0.53103501  0.17912801
#> 5 TCGA-4H-AAAK-01  0     348  0.4202075 -3.282726 -0.53103501 -0.02097199
#> 6 TCGA-5L-AAT0-01  0    1477  0.4748075 -2.775226 -0.53103501  0.24602801
#>         RNF11      RNF13     GTF2IP1
#> 1  0.23742187 -0.2992099 -0.14269449
#> 2 -0.54197813 -0.3229099 -0.04469449
#> 3 -0.18367813 -0.9190099  0.26110551
#> 4  0.03922187 -0.4549099  0.18030551
#> 5 -0.11797813 -0.5578099 -0.17379449
#> 6 -0.37837813 -0.8308099  0.33890551

So we next run bregr with 1210 samples, 2.053^{4} variables.

Execute bregr Pipeline

gene_names = names(data_t)[-1]

# options(
#   bregr.save_model = TRUE,
#   bregr.path = "./bregr_models"
# )

With parallel computation:

system.time(
  br_pipeline(
    data2,
    y = c("OS.time", "OS"),
    x = gene_names, 
    method = "coxph",
    run_parallel = 40
  ) |>
    suppressWarnings()
)
#> exponentiate estimates of model(s) constructed from coxph method at default
#> ■■■                                5% | ETA: 22s
#> 
#> ■■■■■■■■■                         26% | ETA: 13s
#> 
#> ■■■■■■■■■■■■■■■                   47% | ETA:  8s
#> 
#> ■■■■■■■■■■■■■■■■■■■               60% | ETA:  7s
#> 
#> ■■■■■■■■■■■■■■■■■■■■■■■■          78% | ETA:  4s
#> 
#> ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■    98% | ETA:  0s
#> 
#> ■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■  100% | ETA:  0s
#>     user   system  elapsed 
#> 3295.860 1466.084 3821.251

Without parallel computation:

system.time(
  br_pipeline(
    data2,
    y = c("OS.time", "OS"),
    x = gene_names, 
    method = "coxph"
  ) |>
    suppressWarnings()
)
#> exponentiate estimates of model(s) constructed from coxph method at default
#>      user    system   elapsed 
#> 15349.575     6.829 15418.882