| h5mread {HDF5Array} | R Documentation |
An alternative to rhdf5::h5read
h5mread is the result of experimenting with alternative
rhdf5::h5read implementations.
It should still be considered experimental!
h5mread(filepath, name, starts=NULL, counts=NULL, noreduce=FALSE,
as.integer=FALSE, as.sparse=FALSE,
method=0L, use.H5Dread_chunk=FALSE)
get_h5mread_returned_type(filepath, name, as.integer=FALSE)
filepath |
The path (as a single string) to the HDF5 file where the dataset to read from is located, or an H5File object. Note that you must create and use an H5File object if the HDF5
file to access is stored in an Amazon S3 bucket. See ?H5File for details. Also please note that H5File objects must NOT be used in the context of parallel evaluation at the moment. |
name |
The name of the dataset in the HDF5 file. |
starts, counts |
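starts and counts are used to specify the array selection. Each argument can be either NULL or a list with one list element per dimension in the dataset.
If starts and counts are both NULL, then the entire dataset is read.
If starts is a list, each list element in it must be a vector of valid positive indices along the corresponding dimension in the dataset. An empty vector (integer(0)) is accepted and indicates an empty selection along that dimension. A NULL is accepted and indicates a full selection along that dimension.
Each list element in counts must be NULL or a vector of non-negative integers of the same length as the corresponding list element in starts. Each value in the vector indicates how many positions to select starting from the associated start value. A NULL indicates that a single position is selected for each value along the corresponding dimension.
If counts is NULL, then each index in each starts list element indicates a single position selection along the corresponding dimension.
Finally note that when counts is not NULL then the selection described by starts and counts must be strictly ascending along each dimension. |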
noreduce |
TODO |
as.integer |
TODO |
as.sparse |
TODO |
method |
TODO |
use.H5Dread_chunk |
TODO |
COMING SOON...
An array for h5mread.
The type of the array that will be returned by h5mread for
get_h5mread_returned_type.
Equivalent to:
typeof(h5mread(filepath, name, rep(list(integer(0)), ndim)))
where ndim is the number of dimensions (a.k.a. the rank
in HDF5 jargon) of the dataset. get_h5mread_returned_type is
provided for convenience.
H5File objects.
h5read in the rhdf5 package.
extract_array in the DelayedArray
package.
The TENxBrainData dataset (in the
TENxBrainData package).
h5mread_from_reshaped to read data from a virtually
reshaped HDF5 dataset.
## ---------------------------------------------------------------------
## BASIC USAGE
## ---------------------------------------------------------------------

## Write a 50 x 12 matrix of random doubles to an HDF5 file, then read
## the whole dataset back (no 'starts' supplied).
m0 <- matrix(10 * (runif(600) - 0.5), ncol=12)
M0 <- writeHDF5Array(m0, name="M0")
m <- h5mread(path(M0), "M0")
stopifnot(identical(m, m0))

## A NULL list element in 'starts' selects everything along that
## dimension (here: all the rows, and columns 3, 12, 11, 10, 9, 8).
m <- h5mread(path(M0), "M0", starts=list(NULL, c(3, 12:8)))
stopifnot(identical(m, m0[ , c(3, 12:8)]))

## An integer(0) list element selects nothing along that dimension.
## Note that with `[` it is a NULL subscript that selects nothing.
m <- h5mread(path(M0), "M0", starts=list(integer(0), c(3, 12:8)))
stopifnot(identical(m, m0[NULL, c(3, 12:8)]))

## Read the first 5 rows with 'as.integer=TRUE', and compare against
## the original matrix once its storage mode is switched to integer.
m <- h5mread(path(M0), "M0", starts=list(1:5, NULL), as.integer=TRUE)
storage.mode(m0) <- "integer"
stopifnot(identical(m, m0[1:5, ]))

## Add a 3D dataset to the same file and list the file content.
a0 <- array(seq_len(350), dim=c(10, 5, 7))
A0 <- writeHDF5Array(a0, filepath=path(M0), name="A0")
h5ls(path(A0))

## Combine 'starts' and 'counts': along the 1st dimension read 4
## positions starting at 2 and 2 positions starting at 7.
a <- h5mread(path(A0), "A0",
             starts=list(c(2, 7), NULL, 6),
             counts=list(c(4, 2), NULL, NULL))
stopifnot(identical(a, a0[c(2:5, 7:8), , 6, drop=FALSE]))

## Load the data in a sparse array representation:
m1 <- matrix(c(5:-2, rep(c(0L, 99L), times=11)), ncol=6)
M1 <- writeHDF5Array(m1, name="M1", chunkdim=c(3L, 2L))
index <- list(5:3, NULL)
m <- h5mread(path(M1), "M1", starts=index)
sas <- h5mread(path(M1), "M1", starts=index, as.sparse=TRUE)
class(sas)  # SparseArraySeed object (see ?SparseArraySeed)
as(sas, "dgCMatrix")
## The dense and sparse reads must describe the same values.
stopifnot(identical(sparse2dense(sas), m))
## ---------------------------------------------------------------------
## PERFORMANCE
## ---------------------------------------------------------------------
## Each random selection below is seeded, so the indices that get timed
## (and compared with stopifnot()) are the same from one run to the next.
library(ExperimentHub)
hub <- ExperimentHub()

## With the "sparse" TENxBrainData dataset
## ---------------------------------------
fname0 <- hub[["EH1039"]]
h5ls(fname0)  # all datasets are 1D datasets

set.seed(33)
index <- list(77 * sample(34088679, 5000, replace=TRUE))
## h5mread() is about 4x faster than h5read():
system.time(a <- h5mread(fname0, "mm10/data", index))
system.time(b <- h5read(fname0, "mm10/data", index=index))
stopifnot(identical(a, b))

index <- list(sample(1306127, 7500, replace=TRUE))
## h5mread() is about 20x faster than h5read():
system.time(a <- h5mread(fname0, "mm10/barcodes", index))
system.time(b <- h5read(fname0, "mm10/barcodes", index=index))
stopifnot(identical(a, b))

## With the "dense" TENxBrainData dataset
## --------------------------------------
fname1 <- hub[["EH1040"]]
h5ls(fname1)  # "counts" is a 2D dataset

set.seed(33)
index <- list(sample(27998, 300), sample(1306127, 450))
## h5mread() is about 2x faster than h5read():
system.time(a <- h5mread(fname1, "counts", index))
system.time(b <- h5read(fname1, "counts", index=index))
stopifnot(identical(a, b))

## Alternatively 'as.sparse=TRUE' can be used to reduce memory usage:
system.time(sas <- h5mread(fname1, "counts", index, as.sparse=TRUE))
stopifnot(identical(a, sparse2dense(sas)))

## The bigger the selection, the greater the speedup between
## h5read() and h5mread():
## Not run: 
set.seed(33)
index <- list(sample(27998, 1000), sample(1306127, 1000))
## h5mread() about 8x faster than h5read() (20s vs 2m30s):
system.time(a <- h5mread(fname1, "counts", index))
system.time(b <- h5read(fname1, "counts", index=index))
stopifnot(identical(a, b))

## With 'as.sparse=TRUE' (about the same speed as with 'as.sparse=FALSE'):
system.time(sas <- h5mread(fname1, "counts", index, as.sparse=TRUE))
stopifnot(identical(a, sparse2dense(sas)))

## End(Not run)