TileDBArray 1.0.0
TileDB implements a framework for local and remote storage of dense and sparse arrays.
We can use this as a DelayedArray backend to provide an array-level abstraction,
thus allowing the data to be used in many places where an ordinary array or matrix might be used.
The TileDBArray package implements the necessary wrappers around TileDB-R
to support read/write operations on TileDB arrays within the DelayedArray framework.
TileDBArrayCreating a TileDBArray is as easy as:
X <- matrix(rnorm(1000), ncol=10)
library(TileDBArray)
writeTileDBArray(X)
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.4695075 -0.1536033 -0.8322050 . -0.216530081 -0.820035789
## [2,] 0.4714332 1.1065538 0.8135870 . -1.267760754 -0.230147046
## [3,] -0.7638455 -1.3537554 -0.1561337 . -0.007176995 1.528866753
## [4,] -0.3068718 0.4714274 -0.4029580 . -0.336199112 0.112503679
## [5,] 0.9349748 -0.4700607 0.3885031 . -1.393425861 0.223380459
## ... . . . . . .
## [96,] 0.5279343 0.1023838 -1.7291151 . 0.2098264 1.7144877
## [97,] 2.0783636 0.9704089 0.5229847 . -0.8869093 -1.6604553
## [98,] 0.7390380 1.4580415 0.3324778 . 1.0397619 -0.8634210
## [99,] -1.2263313 -0.5689368 1.0927582 . -0.1971321 0.4283889
## [100,] 0.3854366 -0.6961820 -2.0814346 . -0.9528907 -1.9242366
Alternatively, we can use coercion methods:
as(X, "TileDBArray")
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.4695075 -0.1536033 -0.8322050 . -0.216530081 -0.820035789
## [2,] 0.4714332 1.1065538 0.8135870 . -1.267760754 -0.230147046
## [3,] -0.7638455 -1.3537554 -0.1561337 . -0.007176995 1.528866753
## [4,] -0.3068718 0.4714274 -0.4029580 . -0.336199112 0.112503679
## [5,] 0.9349748 -0.4700607 0.3885031 . -1.393425861 0.223380459
## ... . . . . . .
## [96,] 0.5279343 0.1023838 -1.7291151 . 0.2098264 1.7144877
## [97,] 2.0783636 0.9704089 0.5229847 . -0.8869093 -1.6604553
## [98,] 0.7390380 1.4580415 0.3324778 . 1.0397619 -0.8634210
## [99,] -1.2263313 -0.5689368 1.0927582 . -0.1971321 0.4283889
## [100,] 0.3854366 -0.6961820 -2.0814346 . -0.9528907 -1.9242366
This process works also for sparse matrices:
Y <- Matrix::rsparsematrix(1000, 1000, density=0.01)
writeTileDBArray(Y)
## <1000 x 1000> sparse matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] 0 0 0 . 0 0
## [2,] 0 0 0 . 0 0
## [3,] 0 0 0 . 0 0
## [4,] 0 0 0 . 0 0
## [5,] 0 0 0 . 0 0
## ... . . . . . .
## [996,] 0 0 0 . 0 0
## [997,] 0 0 0 . 0 0
## [998,] 0 0 0 . 0 0
## [999,] 0 0 0 . 0 0
## [1000,] 0 0 0 . 0 0
Logical and integer matrices are supported:
writeTileDBArray(Y > 0)
## <1000 x 1000> sparse matrix of class TileDBMatrix and type "logical":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] FALSE FALSE FALSE . FALSE FALSE
## [2,] FALSE FALSE FALSE . FALSE FALSE
## [3,] FALSE FALSE FALSE . FALSE FALSE
## [4,] FALSE FALSE FALSE . FALSE FALSE
## [5,] FALSE FALSE FALSE . FALSE FALSE
## ... . . . . . .
## [996,] FALSE FALSE FALSE . FALSE FALSE
## [997,] FALSE FALSE FALSE . FALSE FALSE
## [998,] FALSE FALSE FALSE . FALSE FALSE
## [999,] FALSE FALSE FALSE . FALSE FALSE
## [1000,] FALSE FALSE FALSE . FALSE FALSE
As are matrices with dimension names:
rownames(X) <- sprintf("GENE_%i", seq_len(nrow(X)))
colnames(X) <- sprintf("SAMP_%i", seq_len(ncol(X)))
writeTileDBArray(X)
## <100 x 10> matrix of class TileDBMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 0.4695075 -0.1536033 -0.8322050 . -0.216530081 -0.820035789
## GENE_2 0.4714332 1.1065538 0.8135870 . -1.267760754 -0.230147046
## GENE_3 -0.7638455 -1.3537554 -0.1561337 . -0.007176995 1.528866753
## GENE_4 -0.3068718 0.4714274 -0.4029580 . -0.336199112 0.112503679
## GENE_5 0.9349748 -0.4700607 0.3885031 . -1.393425861 0.223380459
## ... . . . . . .
## GENE_96 0.5279343 0.1023838 -1.7291151 . 0.2098264 1.7144877
## GENE_97 2.0783636 0.9704089 0.5229847 . -0.8869093 -1.6604553
## GENE_98 0.7390380 1.4580415 0.3324778 . 1.0397619 -0.8634210
## GENE_99 -1.2263313 -0.5689368 1.0927582 . -0.1971321 0.4283889
## GENE_100 0.3854366 -0.6961820 -2.0814346 . -0.9528907 -1.9242366
TileDBArraysTileDBArrays are simply DelayedArray objects and can be manipulated as such.
The usual conventions for extracting data from matrix-like objects work as expected:
out <- as(X, "TileDBArray")
dim(out)
## [1] 100 10
head(rownames(out))
## [1] "GENE_1" "GENE_2" "GENE_3" "GENE_4" "GENE_5" "GENE_6"
head(out[,1])
## [1] 0.4695075 0.4714332 -0.7638455 -0.3068718 0.9349748 0.8697633
We can also perform manipulations like subsetting and arithmetic.
Note that these operations do not affect the data in the TileDB backend;
rather, they are delayed until the values are explicitly required,
hence the creation of the DelayedMatrix object.
out[1:5,1:5]
## <5 x 5> matrix of class DelayedMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5
## GENE_1 0.4695075 -0.1536033 -0.8322050 0.7789347 1.4411733
## GENE_2 0.4714332 1.1065538 0.8135870 1.0687610 -0.4234327
## GENE_3 -0.7638455 -1.3537554 -0.1561337 0.1199645 -0.6845167
## GENE_4 -0.3068718 0.4714274 -0.4029580 1.4858483 1.3235803
## GENE_5 0.9349748 -0.4700607 0.3885031 -0.1659683 -1.9235338
out * 2
## <100 x 10> matrix of class DelayedMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 0.9390150 -0.3072067 -1.6644101 . -0.43306016 -1.64007158
## GENE_2 0.9428664 2.2131076 1.6271740 . -2.53552151 -0.46029409
## GENE_3 -1.5276909 -2.7075109 -0.3122675 . -0.01435399 3.05773351
## GENE_4 -0.6137436 0.9428549 -0.8059159 . -0.67239822 0.22500736
## GENE_5 1.8699496 -0.9401214 0.7770062 . -2.78685172 0.44676092
## ... . . . . . .
## GENE_96 1.0558686 0.2047677 -3.4582303 . 0.4196528 3.4289754
## GENE_97 4.1567271 1.9408179 1.0459693 . -1.7738187 -3.3209106
## GENE_98 1.4780760 2.9160830 0.6649556 . 2.0795239 -1.7268419
## GENE_99 -2.4526625 -1.1378736 2.1855164 . -0.3942643 0.8567777
## GENE_100 0.7708731 -1.3923641 -4.1628691 . -1.9057814 -3.8484732
We can also do more complex matrix operations that are supported by DelayedArray:
colSums(out)
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5 SAMP_6 SAMP_7
## 2.3190548 -3.1883724 14.0665424 0.6398062 0.6598219 6.2458670 2.5465798
## SAMP_8 SAMP_9 SAMP_10
## -0.7081215 -1.8437946 3.6637104
out %*% runif(ncol(out))
## <100 x 1> matrix of class DelayedMatrix and type "double":
## y
## GENE_1 0.6881455
## GENE_2 -2.3869472
## GENE_3 -0.8666771
## GENE_4 1.8945255
## GENE_5 -3.5186043
## ... .
## GENE_96 1.9499076
## GENE_97 -0.6950101
## GENE_98 0.2777651
## GENE_99 -0.6061603
## GENE_100 -5.1366841
We can adjust some parameters for creating the backend with appropriate arguments to writeTileDBArray().
For example, the example below allows us to control the path to the backend
as well as the name of the attribute containing the data.
X <- matrix(rnorm(1000), ncol=10)
path <- tempfile()
writeTileDBArray(X, path=path, attr="WHEE")
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.21130651 -2.32981644 1.13528679 . 0.46050819 -0.18808307
## [2,] 0.93485724 0.65682005 -0.07566573 . 0.35540957 0.78044244
## [3,] 0.88793409 -0.28957070 -1.21296278 . -0.01649357 0.15664396
## [4,] 1.82639416 0.12663312 -0.03304823 . -0.31730696 -0.92544908
## [5,] 1.51023744 -0.39219455 -0.61895208 . 0.77813708 -1.43326219
## ... . . . . . .
## [96,] 0.66982467 -0.05857117 1.38288407 . -0.1227409 -1.6044565
## [97,] 0.37571997 -0.62590522 0.50470006 . 0.4817874 1.2799410
## [98,] -1.17677924 -2.13071217 1.45959223 . -0.9470521 -2.6803048
## [99,] -0.37855747 0.84210955 0.05854815 . -0.2582330 0.7997802
## [100,] 0.41686218 0.03320671 1.23553003 . 0.5419969 2.1028142
As these arguments cannot be passed during coercion, we instead provide global variables that can be set or unset to affect the outcome.
path2 <- tempfile()
setTileDBPath(path2)
as(X, "TileDBArray") # uses path2 to store the backend.
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.21130651 -2.32981644 1.13528679 . 0.46050819 -0.18808307
## [2,] 0.93485724 0.65682005 -0.07566573 . 0.35540957 0.78044244
## [3,] 0.88793409 -0.28957070 -1.21296278 . -0.01649357 0.15664396
## [4,] 1.82639416 0.12663312 -0.03304823 . -0.31730696 -0.92544908
## [5,] 1.51023744 -0.39219455 -0.61895208 . 0.77813708 -1.43326219
## ... . . . . . .
## [96,] 0.66982467 -0.05857117 1.38288407 . -0.1227409 -1.6044565
## [97,] 0.37571997 -0.62590522 0.50470006 . 0.4817874 1.2799410
## [98,] -1.17677924 -2.13071217 1.45959223 . -0.9470521 -2.6803048
## [99,] -0.37855747 0.84210955 0.05854815 . -0.2582330 0.7997802
## [100,] 0.41686218 0.03320671 1.23553003 . 0.5419969 2.1028142
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows Server 2012 R2 x64 (build 9600)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=C
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] TileDBArray_1.0.0 DelayedArray_0.16.0 IRanges_2.24.0
## [4] S4Vectors_0.28.0 MatrixGenerics_1.2.0 matrixStats_0.57.0
## [7] BiocGenerics_0.36.0 Matrix_1.2-18 BiocStyle_2.18.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.5 RcppCCTZ_0.2.9 knitr_1.30
## [4] magrittr_1.5 bit_4.0.4 nanotime_0.3.2
## [7] lattice_0.20-41 rlang_0.4.8 stringr_1.4.0
## [10] tools_4.0.3 grid_4.0.3 xfun_0.18
## [13] tiledb_0.8.2 htmltools_0.5.0 bit64_4.0.5
## [16] yaml_2.2.1 digest_0.6.27 bookdown_0.21
## [19] BiocManager_1.30.10 evaluate_0.14 rmarkdown_2.5
## [22] stringi_1.5.3 compiler_4.0.3 zoo_1.8-8