From 97dcc4f330842d8d5e5e174949e2185d18301cab Mon Sep 17 00:00:00 2001 From: Fabian Gans Date: Fri, 30 Jan 2026 16:08:03 +0000 Subject: [PATCH 1/3] Add possibility to open zipped zarrs --- Project.toml | 5 +- ext/ZarrExt.jl | 148 ++++++++++++++++++++++++++++++++----------------- 2 files changed, 100 insertions(+), 53 deletions(-) diff --git a/Project.toml b/Project.toml index 706673d..635d745 100644 --- a/Project.toml +++ b/Project.toml @@ -18,6 +18,7 @@ DimensionalData = "0.27, 0.28, 0.29" NetCDF = "0.11, 0.12" Zarr = "0.8, 0.9" + [extensions] ArchGDALExt = "ArchGDAL" AxisArraysExt = "AxisArrays" @@ -25,7 +26,7 @@ AxisKeysExt = "AxisKeys" DimensionalDataExt = "DimensionalData" NamedDimsExt = "NamedDims" NetCDFExt = "NetCDF" -ZarrExt = "Zarr" +ZarrExt = ["Zarr", "ZipArchives", "DiskArrays"] [weakdeps] ArchGDAL = "c9ce4bd3-c3d5-55b8-8973-c0e20141b8c3" @@ -33,6 +34,8 @@ AxisArrays = "39de3d68-74b9-583c-8d2d-e117c070f3a9" AxisKeys = "94b1ba4f-4ee9-5380-92f1-94cde586c3c5" DimensionalData = "0703355e-b756-11e9-17c0-8b28908087d0" Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +DiskArrays = "3c3547ce-8d99-4f5e-a174-61eb10b00ae3" NamedDims = "356022a1-0364-5f58-8944-0da4b18d706f" NetCDF = "30363a11-5582-574a-97bb-aa9a979735b9" Zarr = "0a941bbe-ad1d-11e8-39d9-ab76183a1d99" +ZipArchives = "49080126-0e18-4c2a-b176-c102e4b3760c" diff --git a/ext/ZarrExt.jl b/ext/ZarrExt.jl index 6319899..c6782ea 100644 --- a/ext/ZarrExt.jl +++ b/ext/ZarrExt.jl @@ -1,64 +1,108 @@ module ZarrExt - using YAXArrayBase - using Zarr: ZArray, ZGroup, zgroup, zcreate, to_zarrtype, zopen, Compressor - import YAXArrayBase: YAXArrayBase as YAB - export ZarrDataset - - function __init__() - @debug "new driver key :zarr, updating backendlist."
- YAB.backendlist[:zarr] = ZarrDataset - push!(YAB.backendregex, r"(.zarr$)|(.zarr/$)"=>ZarrDataset) - end +using YAXArrayBase +using Zarr: ZArray, ZGroup, zgroup, zcreate, to_zarrtype, zopen, Compressor, ZipStore +import DiskArrays: AbstractDiskArray, DiskArrays, Unchunked, Chunked, GridChunks +using ZipArchives: ZipReader +import YAXArrayBase: YAXArrayBase as YAB +export ZarrDataset - struct ZarrDataset - g::ZGroup - end - ZarrDataset(g::String;mode="r") = ZarrDataset(zopen(g,mode,fill_as_missing=false)) - - YAB.get_var_dims(ds::ZarrDataset,name) = reverse(ds[name].attrs["_ARRAY_DIMENSIONS"]) - YAB.get_varnames(ds::ZarrDataset) = collect(keys(ds.g.arrays)) - function YAB.get_var_attrs(ds::ZarrDataset, name) - #We add the fill value to the attributes to be consistent with NetCDF - a = ds[name] - if a.metadata.fill_value !== nothing - merge(ds[name].attrs,Dict("_FillValue"=>a.metadata.fill_value)) - else - ds[name].attrs - end +function __init__() + @debug "new driver key :zarr, updating backendlist." + YAB.backendlist[:zarr] = ZarrDataset + push!(YAB.backendregex, r"(.zarr$)|(.zarr/$)|(zarr.zip$)" => ZarrDataset) +end + +struct ZarrDataset + g::ZGroup +end +function ZarrDataset(g::String; mode="r") + store = if endswith(g, "zip") + ZipStore(ZipReader(SimpleFileDiskArray(g))) + else + g end - YAB.get_global_attrs(ds::ZarrDataset) = ds.g.attrs - Base.getindex(ds::ZarrDataset, i) = ds.g[i] - Base.haskey(ds::ZarrDataset,k) = haskey(ds.g,k) - - # function add_var(p::ZarrDataset, T::Type{>:Missing}, varname, s, dimnames, attr; kwargs...) - # S = Base.nonmissingtype(T) - # add_var(p,S, varname, s, dimnames, attr; fill_value = defaultfillval(S), fill_as_missing=true, kwargs...) - # end - - function YAB.add_var(p::ZarrDataset, T::Type, varname, s, dimnames, attr; - chunksize=s, fill_as_missing=false, kwargs...) 
- attr2 = merge(attr,Dict("_ARRAY_DIMENSIONS"=>reverse(collect(dimnames)))) - fv = get(attr,"_FillValue",get(attr,"missing_value",YAB.defaultfillval(T))) - za = zcreate(T, p.g, varname,s...;fill_value = fv,fill_as_missing,attrs=attr2,chunks=chunksize,kwargs...) - za + ZarrDataset(zopen(store, mode, fill_as_missing=false)) +end + +YAB.get_var_dims(ds::ZarrDataset, name) = reverse(ds[name].attrs["_ARRAY_DIMENSIONS"]) +YAB.get_varnames(ds::ZarrDataset) = collect(keys(ds.g.arrays)) +function YAB.get_var_attrs(ds::ZarrDataset, name) + #We add the fill value to the attributes to be consistent with NetCDF + a = ds[name] + if a.metadata.fill_value !== nothing + merge(ds[name].attrs, Dict("_FillValue" => a.metadata.fill_value)) + else + ds[name].attrs end +end +YAB.get_global_attrs(ds::ZarrDataset) = ds.g.attrs +Base.getindex(ds::ZarrDataset, i) = ds.g[i] +Base.haskey(ds::ZarrDataset, k) = haskey(ds.g, k) - #Special case for init with Arrays - function YAB.add_var(p::ZarrDataset, a::AbstractArray, varname, dimnames, attr; - kwargs...) - T = to_zarrtype(a) - b = add_var(p,T,varname,size(a),dimnames,attr;kwargs...) - b .= a - a +# function add_var(p::ZarrDataset, T::Type{>:Missing}, varname, s, dimnames, attr; kwargs...) +# S = Base.nonmissingtype(T) +# add_var(p,S, varname, s, dimnames, attr; fill_value = defaultfillval(S), fill_as_missing=true, kwargs...) +# end + +function YAB.add_var(p::ZarrDataset, T::Type, varname, s, dimnames, attr; + chunksize=s, fill_as_missing=false, kwargs...) + attr2 = merge(attr, Dict("_ARRAY_DIMENSIONS" => reverse(collect(dimnames)))) + fv = get(attr, "_FillValue", get(attr, "missing_value", YAB.defaultfillval(T))) + attr3 = filter(attr2) do (k, v) + isa(v, AbstractFloat) && !isnan(v) end + za = zcreate(T, p.g, varname, s...; fill_value=fv, fill_as_missing, attrs=attr3, chunks=chunksize, kwargs...) + za +end + +#Special case for init with Arrays +function YAB.add_var(p::ZarrDataset, a::AbstractArray, varname, dimnames, attr; + kwargs...) 
+ T = to_zarrtype(a) + b = add_var(p, T, varname, size(a), dimnames, attr; kwargs...) + b .= a + a +end + +YAB.create_empty(::Type{ZarrDataset}, path, gatts=Dict()) = ZarrDataset(zgroup(path, attrs=gatts)) - YAB.create_empty(::Type{ZarrDataset}, path, gatts=Dict()) = ZarrDataset(zgroup(path, attrs=gatts)) +YAB.allow_parallel_write(::ZarrDataset) = true +YAB.allow_missings(::ZarrDataset) = false +YAB.to_dataset(g::ZGroup; kwargs...) = ZarrDataset(g) +YAB.iscompressed(a::ZArray{<:Any,<:Any,<:Compressor}) = true + + +#Add ability to read zipped zarrs + + +struct SimpleFileDiskArray{C<:Union{Int,Nothing}} <: AbstractDiskArray{UInt8,1} + file::String + s::Int + chunksize::C +end +Base.size(s::SimpleFileDiskArray) = (s.s,) +function SimpleFileDiskArray(filename; chunksize=nothing) + isfile(filename) || throw(ArgumentError("File $filename does not exist")) + s = filesize(filename) + SimpleFileDiskArray(filename, s, chunksize) +end +function DiskArrays.readblock!(a::SimpleFileDiskArray, aout, i::AbstractUnitRange) + open(a.file) do f + seek(f, first(i) - 1) + read!(f, aout) + end +end +DiskArrays.haschunks(a::SimpleFileDiskArray) = a.chunksize === nothing ? Unchunked() : Chunked() +function DiskArrays.eachchunk(a::SimpleFileDiskArray) + if a.chunksize === nothing + DiskArrays.estimate_chunksize(a) + else + GridChunks((a.s,), (a.chunksize,)) + end +end + - YAB.allow_parallel_write(::ZarrDataset) = true - YAB.allow_missings(::ZarrDataset) = false - YAB.to_dataset(g::ZGroup; kwargs...) 
= ZarrDataset(g) - YAB.iscompressed(a::ZArray{<:Any,<:Any,<:Compressor}) = true end \ No newline at end of file From 0544f24783a00280bca02f1fb53afce3d79694ce Mon Sep 17 00:00:00 2001 From: Fabian Gans Date: Fri, 30 Jan 2026 16:12:24 +0000 Subject: [PATCH 2/3] update Project.toml --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 635d745..d580299 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "YAXArrayBase" uuid = "90b8fcef-0c2d-428d-9c56-5f86629e9d14" authors = ["Fabian Gans "] -version = "0.7.7" +version = "0.7.8" [deps] DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" From 3ba18fb2aa0f9c0e6d0e4dd03d36f07521c13da7 Mon Sep 17 00:00:00 2001 From: Fabian Gans Date: Fri, 30 Jan 2026 16:28:17 +0000 Subject: [PATCH 3/3] Fix bug --- ext/ZarrExt.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/ZarrExt.jl b/ext/ZarrExt.jl index c6782ea..9d89266 100644 --- a/ext/ZarrExt.jl +++ b/ext/ZarrExt.jl @@ -49,7 +49,7 @@ function YAB.add_var(p::ZarrDataset, T::Type, varname, s, dimnames, attr; attr2 = merge(attr, Dict("_ARRAY_DIMENSIONS" => reverse(collect(dimnames)))) fv = get(attr, "_FillValue", get(attr, "missing_value", YAB.defaultfillval(T))) attr3 = filter(attr2) do (k, v) - isa(v, AbstractFloat) && !isnan(v) + !isa(v, AbstractFloat) || !isnan(v) end za = zcreate(T, p.g, varname, s...; fill_value=fv, fill_as_missing, attrs=attr3, chunks=chunksize, kwargs...) za