from __future__ import annotations
import base64
import re
from dataclasses import dataclass
from typing import ClassVar, Literal, Self, TypedDict, TypeGuard, cast, overload
import numpy as np
from zarr.core.common import JSON, NamedConfig, ZarrFormat
from zarr.core.dtype.common import (
DataTypeValidationError,
DTypeConfig_V2,
DTypeJSON,
HasItemSize,
HasLength,
HasObjectCodec,
check_dtype_spec_v2,
v3_unstable_dtype_warning,
)
from zarr.core.dtype.npy.common import check_json_str
from zarr.core.dtype.wrapper import TBaseDType, ZDType
# Union of the scalar types that the ``_check_scalar`` methods of ``NullTerminatedBytes`` and
# ``VariableLengthBytes`` accept (it is passed directly to ``isinstance``).
BytesLike = np.bytes_ | str | bytes | int
class FixedLengthBytesConfig(TypedDict):
    """
    Configuration mapping for data types parametrized by a byte length.

    Attributes
    ----------
    length_bytes : int
        Number of bytes occupied by the data that this configuration describes.

    Examples
    --------
    .. code-block:: python

        {"length_bytes": 12}
    """

    length_bytes: int
[docs]
class NullterminatedBytesJSON_V2(DTypeConfig_V2[str, None]):
    """
    Zarr V2 JSON form of the ``NullTerminatedBytes`` data type.

    The ``name`` field holds the value stored under the ``dtype`` key of Zarr V2 array
    metadata (for example ``"|S10"``); ``object_codec_id`` is always ``None``.

    References
    ----------
    The format of the ``name`` field is described in the Zarr V2
    `specification document <https://github.com/zarr-developers/zarr-specs/blob/main/docs/v2/v2.0.rst#data-type-encoding>`__.

    Examples
    --------
    .. code-block:: python

        {
            "name": "|S10",
            "object_codec_id": None
        }
    """
[docs]
class NullTerminatedBytesJSON_V3(
    NamedConfig[Literal["null_terminated_bytes"], FixedLengthBytesConfig]
):
    """
    Zarr V3 JSON form of the ``NullTerminatedBytes`` data type.

    A mapping with a fixed ``name`` of ``"null_terminated_bytes"`` and a ``configuration``
    that carries the ``length_bytes`` parameter.

    References
    ----------
    This representation is not currently defined in an external specification.

    Examples
    --------
    .. code-block:: python

        {
            "name": "null_terminated_bytes",
            "configuration": {
                "length_bytes": 12
            }
        }
    """
[docs]
class RawBytesJSON_V2(DTypeConfig_V2[str, None]):
    """
    Zarr V2 JSON form of the ``RawBytes`` data type.

    The ``name`` field holds the value stored under the ``dtype`` key of Zarr V2 array
    metadata (for example ``"|V10"``); ``object_codec_id`` is always ``None``.

    References
    ----------
    The format of the ``name`` field is described in the Zarr V2
    `specification document <https://github.com/zarr-developers/zarr-specs/blob/main/docs/v2/v2.0.rst#data-type-encoding>`__.

    Examples
    --------
    .. code-block:: python

        {
            "name": "|V10",
            "object_codec_id": None
        }
    """
[docs]
class RawBytesJSON_V3(NamedConfig[Literal["raw_bytes"], FixedLengthBytesConfig]):
    """
    Zarr V3 JSON form of the ``RawBytes`` data type.

    A mapping with a fixed ``name`` of ``"raw_bytes"`` and a ``configuration`` that carries
    the ``length_bytes`` parameter.

    References
    ----------
    This representation is not currently defined in an external specification.

    Examples
    --------
    .. code-block:: python

        {
            "name": "raw_bytes",
            "configuration": {
                "length_bytes": 12
            }
        }
    """
[docs]
class VariableLengthBytesJSON_V2(DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]]):
    """
    Zarr V2 JSON form of the ``VariableLengthBytes`` data type.

    The ``name`` field holds the value stored under the ``dtype`` key of Zarr V2 array
    metadata, which is always ``"|O"``. The ``object_codec_id`` field is always
    ``"vlen-bytes"``.

    References
    ----------
    The format of the ``name`` field is described in the Zarr V2
    `specification document <https://github.com/zarr-developers/zarr-specs/blob/main/docs/v2/v2.0.rst#data-type-encoding>`__.

    Examples
    --------
    .. code-block:: python

        {
            "name": "|O",
            "object_codec_id": "vlen-bytes"
        }
    """
[docs]
@dataclass(frozen=True, kw_only=True)
class NullTerminatedBytes(ZDType[np.dtypes.BytesDType[int], np.bytes_], HasLength, HasItemSize):
    """
    A Zarr data type for arrays containing fixed-length null-terminated byte sequences.

    Wraps the ``np.dtypes.BytesDType`` data type. Scalars for this data type are instances of
    ``np.bytes_``.

    This data type is parametrized by an integral length which specifies the size in bytes of
    each scalar. Because this data type uses null-terminated semantics, indexing into
    NumPy arrays with this data type may return fewer than ``length`` bytes.

    Attributes
    ----------
    dtype_cls : ClassVar[type[np.dtypes.BytesDType[int]]]
        The NumPy data type wrapped by this ZDType (``np.dtypes.BytesDType``).
    _zarr_v3_name : ClassVar[Literal["null_terminated_bytes"]]
        The name used for this data type in its Zarr V3 JSON form.
    length : int
        The length of the bytes.

    Notes
    -----
    This data type is designed for compatibility with NumPy arrays that use the NumPy
    ``bytes`` data type. It may not be desirable for usage outside of that context. If
    compatibility with the NumPy ``bytes`` data type is not essential, consider using the
    ``RawBytes`` or ``VariableLengthBytes`` data types instead.
    """

    dtype_cls = np.dtypes.BytesDType
    _zarr_v3_name: ClassVar[Literal["null_terminated_bytes"]] = "null_terminated_bytes"

    def __post_init__(self) -> None:
        """
        Reject lengths below 1.

        A data type with a length less than 1 cannot contain actual data, so such instances
        are not allowed.

        Raises
        ------
        ValueError
            If ``self.length`` is less than 1.
        """
        if self.length < 1:
            raise ValueError(f"length must be >= 1, got {self.length}.")

    @classmethod
    def from_native_dtype(cls, dtype: TBaseDType) -> Self:
        """
        Create an instance of NullTerminatedBytes from an instance of np.dtypes.BytesDType.

        Parameters
        ----------
        dtype : TBaseDType
            The native dtype to convert.

        Returns
        -------
        NullTerminatedBytes
            An instance whose ``length`` equals ``dtype.itemsize``.

        Raises
        ------
        DataTypeValidationError
            If ``cls._check_native_dtype`` rejects the dtype.
        """
        if cls._check_native_dtype(dtype):
            return cls(length=dtype.itemsize)
        raise DataTypeValidationError(
            f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}"
        )

    def to_native_dtype(self) -> np.dtypes.BytesDType[int]:
        """
        Create a NumPy bytes dtype from this NullTerminatedBytes ZDType.

        Returns
        -------
        np.dtypes.BytesDType[int]
            A NumPy bytes data type constructed with ``self.length``.
        """
        return self.dtype_cls(self.length)

    @classmethod
    def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[NullterminatedBytesJSON_V2]:
        """
        Check that the input is a valid JSON representation of NullTerminatedBytes in Zarr V2.

        The input must pass ``check_dtype_spec_v2``, have a string ``"name"`` of the exact
        form ``"|S<number>"``, and an ``"object_codec_id"`` that is ``None``.

        Parameters
        ----------
        data : DTypeJSON
            The JSON data to check.

        Returns
        -------
        bool
            True if the input data is a valid representation, False otherwise.
        """
        return (
            check_dtype_spec_v2(data)
            and isinstance(data["name"], str)
            # fullmatch, not match + "$": "$" would also accept a trailing newline.
            and re.fullmatch(r"\|S\d+", data["name"]) is not None
            and data["object_codec_id"] is None
        )

    @classmethod
    def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[NullTerminatedBytesJSON_V3]:
        """
        Check that the input is a valid JSON representation of this class in Zarr V3.

        The input must be a dict with exactly the keys ``"name"`` and ``"configuration"``,
        where ``"name"`` equals ``cls._zarr_v3_name`` and ``"configuration"`` is a dict
        containing an integer ``"length_bytes"``.

        Parameters
        ----------
        data : DTypeJSON
            The JSON data to check.

        Returns
        -------
        TypeGuard[NullTerminatedBytesJSON_V3]
            True if the input is a valid representation of this class in Zarr V3, False
            otherwise.
        """
        return (
            isinstance(data, dict)
            and set(data.keys()) == {"name", "configuration"}
            and data["name"] == cls._zarr_v3_name
            and isinstance(data["configuration"], dict)
            and "length_bytes" in data["configuration"]
            and isinstance(data["configuration"]["length_bytes"], int)
        )

    @classmethod
    def _from_json_v2(cls, data: DTypeJSON) -> Self:
        """
        Create an instance of this class from Zarr V2-flavored JSON.

        Parameters
        ----------
        data : DTypeJSON
            The JSON data to parse.

        Returns
        -------
        Self
            An instance whose ``length`` is the number following ``"|S"`` in ``data["name"]``.

        Raises
        ------
        DataTypeValidationError
            If the input data is not a valid representation of this class.
        """
        if cls._check_json_v2(data):
            name = data["name"]
            # Strip the two-character "|S" prefix; the remainder is the length.
            return cls(length=int(name[2:]))
        msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string like '|S1', '|S2', etc"
        raise DataTypeValidationError(msg)

    @classmethod
    def _from_json_v3(cls, data: DTypeJSON) -> Self:
        """
        Create an instance of this class from Zarr V3-flavored JSON.

        Parameters
        ----------
        data : DTypeJSON
            The JSON data to parse.

        Returns
        -------
        Self
            An instance whose ``length`` is ``data["configuration"]["length_bytes"]``.

        Raises
        ------
        DataTypeValidationError
            If the input data is not a valid representation of this class.
        """
        if cls._check_json_v3(data):
            return cls(length=data["configuration"]["length_bytes"])
        # The V3 form of this data type is a mapping, not a bare string, so say so.
        msg = (
            f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a dict "
            f"with a 'name' of {cls._zarr_v3_name!r} and a 'configuration' containing an "
            "integer 'length_bytes'"
        )
        raise DataTypeValidationError(msg)

    @overload
    def to_json(self, zarr_format: Literal[2]) -> NullterminatedBytesJSON_V2: ...

    @overload
    def to_json(self, zarr_format: Literal[3]) -> NullTerminatedBytesJSON_V3: ...

    def to_json(
        self, zarr_format: ZarrFormat
    ) -> NullterminatedBytesJSON_V2 | NullTerminatedBytesJSON_V3:
        """
        Generate a JSON representation of this data type.

        Parameters
        ----------
        zarr_format : ZarrFormat
            The zarr format version.

        Returns
        -------
        NullterminatedBytesJSON_V2 | NullTerminatedBytesJSON_V3
            The JSON-serializable representation of the data type.

        Raises
        ------
        ValueError
            If ``zarr_format`` is neither 2 nor 3.
        """
        if zarr_format == 2:
            return {"name": self.to_native_dtype().str, "object_codec_id": None}
        elif zarr_format == 3:
            v3_unstable_dtype_warning(self)
            return {
                "name": self._zarr_v3_name,
                "configuration": {"length_bytes": self.length},
            }
        raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}")  # pragma: no cover

    def _check_scalar(self, data: object) -> TypeGuard[BytesLike]:
        """
        Check if the provided data is of type BytesLike.

        Parameters
        ----------
        data : object
            The data to check.

        Returns
        -------
        TypeGuard[BytesLike]
            True if ``data`` is an instance of one of the ``BytesLike`` types, False otherwise.
        """
        return isinstance(data, BytesLike)

    def _cast_scalar_unchecked(self, data: BytesLike) -> np.bytes_:
        """
        Cast the provided scalar data to ``np.bytes_``, truncating if necessary.

        Integers are first converted to their decimal string form. The value is then sliced
        to at most ``self.length`` items before being handed to the NumPy scalar type.

        Parameters
        ----------
        data : BytesLike
            The data to cast.

        Returns
        -------
        np.bytes_
            The casted data as a NumPy bytes scalar.

        Notes
        -----
        This method does not perform any type checking.
        The input data must be bytes-like.
        """
        sliceable = str(data) if isinstance(data, int) else data
        return self.to_native_dtype().type(sliceable[: self.length])

    def cast_scalar(self, data: object) -> np.bytes_:
        """
        Attempt to cast a given object to a NumPy bytes scalar.

        The data is first validated with ``_check_scalar``; if that succeeds, the unchecked
        cast is performed.

        Parameters
        ----------
        data : object
            The data to be cast to a NumPy bytes scalar.

        Returns
        -------
        ``np.bytes_``
            The data cast as a NumPy bytes scalar.

        Raises
        ------
        TypeError
            If the data cannot be converted to a NumPy bytes scalar.
        """
        if self._check_scalar(data):
            return self._cast_scalar_unchecked(data)
        msg = (
            f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the "
            f"data type {self}."
        )
        raise TypeError(msg)

    def default_scalar(self) -> np.bytes_:
        """
        Return a default scalar value, which for this data type is an empty byte string.

        Returns
        -------
        ``np.bytes_``
            The default scalar value.
        """
        return np.bytes_(b"")

    def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str:
        """
        Convert a scalar to a JSON-serializable string representation.

        The scalar is cast with ``cast_scalar`` and the resulting bytes are base64-encoded.

        Parameters
        ----------
        data : object
            The scalar to convert.
        zarr_format : ZarrFormat
            The zarr format version. Not used by this implementation.

        Returns
        -------
        str
            The base64 encoding of the scalar, as an ASCII string.

        Raises
        ------
        TypeError
            If ``cast_scalar`` rejects ``data``.
        """
        as_bytes = self.cast_scalar(data)
        return base64.standard_b64encode(as_bytes).decode("ascii")

    def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.bytes_:
        """
        Read a JSON-serializable value as ``np.bytes_``.

        Parameters
        ----------
        data : JSON
            The JSON-serializable base64-encoded string.
        zarr_format : ZarrFormat
            The zarr format version. Not used by this implementation.

        Returns
        -------
        ``np.bytes_``
            The NumPy bytes scalar obtained from decoding the base64 string.

        Raises
        ------
        TypeError
            If ``data`` does not pass the ``check_json_str`` check.
        """
        if check_json_str(data):
            return self.to_native_dtype().type(base64.standard_b64decode(data.encode("ascii")))
        raise TypeError(
            f"Invalid type: {data}. Expected a base64-encoded string."
        )  # pragma: no cover

    @property
    def item_size(self) -> int:
        """
        The size of a single scalar in bytes.

        Returns
        -------
        int
            The size of a single scalar in bytes, equal to ``self.length``.
        """
        return self.length
[docs]
@dataclass(frozen=True, kw_only=True)
class RawBytes(ZDType[np.dtypes.VoidDType[int], np.void], HasLength, HasItemSize):
    """
    A Zarr data type for arrays containing fixed-length sequences of raw bytes.

    Wraps the NumPy ``void`` data type. Scalars for this data type are instances of ``np.void``.

    This data type is parametrized by an integral length which specifies the size in bytes of
    each scalar belonging to this data type.

    Attributes
    ----------
    dtype_cls : ClassVar[type[np.dtypes.VoidDType[int]]]
        The NumPy data type wrapped by this ZDType (``np.dtypes.VoidDType``).
    _zarr_v3_name : ClassVar[Literal["raw_bytes"]]
        The name used for this data type in its Zarr V3 JSON form.
    length : int
        The length of the bytes.

    Notes
    -----
    Although the NumPy "Void" data type is used to create "structured" data types in NumPy,
    this class does not support structured data types.
    See the ``Structured`` data type for this functionality.
    """

    # np.dtypes.VoidDType is specified in an odd way in NumPy
    # it cannot be used to create instances of the dtype
    # so we have to tell mypy to ignore this here
    dtype_cls = np.dtypes.VoidDType  # type: ignore[assignment]
    _zarr_v3_name: ClassVar[Literal["raw_bytes"]] = "raw_bytes"

    def __post_init__(self) -> None:
        """
        Reject lengths below 1.

        A data type with a length less than 1 cannot contain actual data, so such instances
        are not allowed.

        Raises
        ------
        ValueError
            If ``self.length`` is less than 1.
        """
        if self.length < 1:
            raise ValueError(f"length must be >= 1, got {self.length}.")

    @classmethod
    def _check_native_dtype(
        cls: type[Self], dtype: TBaseDType
    ) -> TypeGuard[np.dtypes.VoidDType[int]]:
        """
        Check that the input is a NumPy void dtype with no fields.

        NumPy void dtypes come in two forms:

        * If the ``fields`` attribute is ``None``, the dtype represents N raw bytes.
        * If the ``fields`` attribute is not ``None``, the dtype represents a structured dtype.

        This check requires that ``fields`` is ``None``.

        Parameters
        ----------
        dtype : TBaseDType
            The dtype to check.

        Returns
        -------
        bool
            True if the type of ``dtype`` is exactly ``cls.dtype_cls`` and it has no fields,
            False otherwise.
        """
        return cls.dtype_cls is type(dtype) and dtype.fields is None

    @classmethod
    def from_native_dtype(cls, dtype: TBaseDType) -> Self:
        """
        Create an instance of RawBytes from an instance of np.dtypes.VoidDType.

        The input must be an instance of np.dtypes.VoidDType and have no fields.

        Parameters
        ----------
        dtype : TBaseDType
            The native dtype to convert.

        Returns
        -------
        RawBytes
            An instance whose ``length`` equals ``dtype.itemsize``.

        Raises
        ------
        DataTypeValidationError
            If the dtype is not compatible with RawBytes.
        """
        if cls._check_native_dtype(dtype):
            return cls(length=dtype.itemsize)
        raise DataTypeValidationError(
            f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}"
        )

    def to_native_dtype(self) -> np.dtypes.VoidDType[int]:
        """
        Create a NumPy void dtype from this RawBytes ZDType.

        Returns
        -------
        np.dtypes.VoidDType[int]
            The NumPy data type ``V<length>``.
        """
        # Numpy does not allow creating a void type
        # by invoking np.dtypes.VoidDType directly
        return cast("np.dtypes.VoidDType[int]", np.dtype(f"V{self.length}"))

    @classmethod
    def _check_json_v2(cls, data: DTypeJSON) -> TypeGuard[RawBytesJSON_V2]:
        """
        Check that the input is a valid representation of this class in Zarr V2.

        The input must pass ``check_dtype_spec_v2``, have a string ``"name"`` of the exact
        form ``"|V<number>"``, and an ``"object_codec_id"`` that is ``None``.

        Parameters
        ----------
        data : DTypeJSON
            The JSON data to check.

        Returns
        -------
        bool
            True if the input is a valid representation of this class in Zarr V2, False
            otherwise.
        """
        return (
            check_dtype_spec_v2(data)
            and isinstance(data["name"], str)
            # fullmatch, not match + "$": "$" would also accept a trailing newline.
            and re.fullmatch(r"\|V\d+", data["name"]) is not None
            and data["object_codec_id"] is None
        )

    @classmethod
    def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[RawBytesJSON_V3]:
        """
        Check that the input is a valid JSON representation of this class in Zarr V3.

        The input must be a dict with exactly the keys ``"name"`` and ``"configuration"``,
        where ``"name"`` equals ``cls._zarr_v3_name`` and ``"configuration"`` is a dict whose
        only key is an integer ``"length_bytes"``.

        Parameters
        ----------
        data : DTypeJSON
            The JSON data to check.

        Returns
        -------
        TypeGuard[RawBytesJSON_V3]
            True if the input is a valid representation of this class in Zarr V3, False
            otherwise.
        """
        return (
            isinstance(data, dict)
            and set(data.keys()) == {"name", "configuration"}
            and data["name"] == cls._zarr_v3_name
            and isinstance(data["configuration"], dict)
            and set(data["configuration"].keys()) == {"length_bytes"}
            and isinstance(data["configuration"]["length_bytes"], int)
        )

    @classmethod
    def _from_json_v2(cls, data: DTypeJSON) -> Self:
        """
        Create an instance of RawBytes from Zarr V2-flavored JSON.

        Parameters
        ----------
        data : DTypeJSON
            The JSON data to parse.

        Returns
        -------
        Self
            An instance whose ``length`` is the number following ``"|V"`` in ``data["name"]``.

        Raises
        ------
        DataTypeValidationError
            If the input data is not a valid representation of this class.
        """
        if cls._check_json_v2(data):
            name = data["name"]
            # Strip the two-character "|V" prefix; the remainder is the length.
            return cls(length=int(name[2:]))
        msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a string like '|V1', '|V2', etc"
        raise DataTypeValidationError(msg)

    @classmethod
    def _from_json_v3(cls, data: DTypeJSON) -> Self:
        """
        Create an instance of RawBytes from Zarr V3-flavored JSON.

        Parameters
        ----------
        data : DTypeJSON
            The JSON data to parse.

        Returns
        -------
        RawBytes
            An instance whose ``length`` is ``data["configuration"]["length_bytes"]``.

        Raises
        ------
        DataTypeValidationError
            If the input data is not a valid representation of this class.
        """
        if cls._check_json_v3(data):
            return cls(length=data["configuration"]["length_bytes"])
        # The V3 form of this data type is a mapping, not a bare string, so say so.
        msg = (
            f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a dict "
            f"with a 'name' of {cls._zarr_v3_name!r} and a 'configuration' containing only "
            "an integer 'length_bytes'"
        )
        raise DataTypeValidationError(msg)

    @overload
    def to_json(self, zarr_format: Literal[2]) -> RawBytesJSON_V2: ...

    @overload
    def to_json(self, zarr_format: Literal[3]) -> RawBytesJSON_V3: ...

    def to_json(self, zarr_format: ZarrFormat) -> RawBytesJSON_V2 | RawBytesJSON_V3:
        """
        Generate a JSON representation of this data type.

        Parameters
        ----------
        zarr_format : ZarrFormat
            The zarr format version.

        Returns
        -------
        RawBytesJSON_V2 | RawBytesJSON_V3
            The JSON-serializable representation of the data type.

        Raises
        ------
        ValueError
            If ``zarr_format`` is neither 2 nor 3.
        """
        if zarr_format == 2:
            return {"name": self.to_native_dtype().str, "object_codec_id": None}
        elif zarr_format == 3:
            v3_unstable_dtype_warning(self)
            return {"name": self._zarr_v3_name, "configuration": {"length_bytes": self.length}}
        raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}")  # pragma: no cover

    def _check_scalar(self, data: object) -> TypeGuard[np.bytes_ | str | bytes | np.void]:
        """
        Check if the provided data can be cast to np.void.

        Accepted types are ``np.bytes_``, ``np.void``, ``str`` and ``bytes``.

        Parameters
        ----------
        data : object
            The data to check.

        Returns
        -------
        TypeGuard[np.bytes_ | str | bytes | np.void]
            True if the data is void-scalar-like, False otherwise.
        """
        return isinstance(data, np.bytes_ | str | bytes | np.void)

    def _cast_scalar_unchecked(self, data: object) -> np.void:
        """
        Cast the provided scalar data to np.void.

        Parameters
        ----------
        data : object
            The data to cast.

        Returns
        -------
        np.void
            The casted data as a NumPy void scalar.

        Notes
        -----
        This method does not perform any type checking.
        The input data must be castable to np.void.
        """
        native_dtype = self.to_native_dtype()
        # Without the second argument, NumPy will return a void scalar for dtype V1.
        # The second argument ensures that, if native_dtype is something like V10,
        # the result will actually be a V10 scalar.
        return native_dtype.type(data, native_dtype)

    def cast_scalar(self, data: object) -> np.void:
        """
        Attempt to cast a given object to a NumPy void scalar.

        The data is first validated with ``_check_scalar``; if that succeeds, the unchecked
        cast is performed.

        Parameters
        ----------
        data : object
            The data to be cast to a NumPy void scalar.

        Returns
        -------
        np.void
            The data cast as a NumPy void scalar.

        Raises
        ------
        TypeError
            If the data cannot be converted to a NumPy void scalar.
        """
        if self._check_scalar(data):
            return self._cast_scalar_unchecked(data)
        msg = (
            f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the "
            f"data type {self}."
        )
        raise TypeError(msg)

    def default_scalar(self) -> np.void:
        """
        Return the default scalar value for this data type.

        The default scalar is a NumPy void scalar built from ``self.length`` zero bytes.

        Returns
        -------
        np.void
            The default scalar value.
        """
        return self.to_native_dtype().type(b"\x00" * self.length)

    def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str:
        """
        Convert a scalar to a JSON-serializable string representation.

        The scalar is cast with ``cast_scalar``, converted to bytes, and base64-encoded.

        Parameters
        ----------
        data : object
            The scalar to convert.
        zarr_format : ZarrFormat
            The zarr format version. Not used by this implementation.

        Returns
        -------
        str
            The base64 encoding of the scalar, as an ASCII string.

        Raises
        ------
        TypeError
            If ``cast_scalar`` rejects ``data``.
        """
        as_bytes = self.cast_scalar(data)
        return base64.standard_b64encode(as_bytes.tobytes()).decode("ascii")

    def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void:
        """
        Read a JSON-serializable value as a np.void.

        Parameters
        ----------
        data : JSON
            The JSON-serializable base64-encoded string.
        zarr_format : ZarrFormat
            The zarr format version. Not used by this implementation.

        Returns
        -------
        np.void
            The NumPy void scalar built from the decoded bytes.

        Raises
        ------
        TypeError
            If ``data`` does not pass the ``check_json_str`` check.

        Notes
        -----
        Errors raised by ``base64.standard_b64decode`` for malformed input are not caught
        here and propagate to the caller.
        """
        if check_json_str(data):
            return self.to_native_dtype().type(base64.standard_b64decode(data))
        raise TypeError(f"Invalid type: {data}. Expected a string.")  # pragma: no cover

    @property
    def item_size(self) -> int:
        """
        The size of a single scalar in bytes.

        Returns
        -------
        int
            The size of a single scalar in bytes, equal to ``self.length``.
        """
        return self.length
[docs]
@dataclass(frozen=True, kw_only=True)
class VariableLengthBytes(ZDType[np.dtypes.ObjectDType, bytes], HasObjectCodec):
    """
    A Zarr data type for arrays containing variable-length sequences of bytes.

    Wraps the NumPy "object" data type. Scalars for this data type are instances of ``bytes``.

    Attributes
    ----------
    dtype_cls : ClassVar[type[np.dtypes.ObjectDType]]
        The NumPy data type wrapped by this ZDType (``np.dtypes.ObjectDType``).
    _zarr_v3_name : ClassVar[Literal["variable_length_bytes"]]
        The name of this data type in Zarr V3.
    object_codec_id : ClassVar[Literal["vlen-bytes"]]
        The object codec ID for this data type.

    Notes
    -----
    Because this data type uses the NumPy "object" data type, it does not guarantee a compact
    memory representation of array data. Therefore a "vlen-bytes" codec is needed to ensure
    that the array data can be persisted to storage.
    """

    dtype_cls = np.dtypes.ObjectDType
    _zarr_v3_name: ClassVar[Literal["variable_length_bytes"]] = "variable_length_bytes"
    object_codec_id: ClassVar[Literal["vlen-bytes"]] = "vlen-bytes"

    @classmethod
    def from_native_dtype(cls, dtype: TBaseDType) -> Self:
        """
        Build a VariableLengthBytes from a NumPy object dtype.

        Parameters
        ----------
        dtype : TBaseDType
            The native dtype to convert.

        Returns
        -------
        VariableLengthBytes
            A new instance of this class.

        Raises
        ------
        DataTypeValidationError
            If ``cls._check_native_dtype`` rejects the dtype.
        """
        if not cls._check_native_dtype(dtype):
            raise DataTypeValidationError(
                f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}"
            )
        return cls()

    def to_native_dtype(self) -> np.dtypes.ObjectDType:
        """
        Build the NumPy object dtype that corresponds to this ZDType.

        Returns
        -------
        np.dtypes.ObjectDType
            A fresh instance of ``self.dtype_cls``.
        """
        return self.dtype_cls()

    @classmethod
    def _check_json_v2(
        cls,
        data: DTypeJSON,
    ) -> TypeGuard[VariableLengthBytesJSON_V2]:
        """
        Decide whether the input is the Zarr V2 JSON form of this data type.

        The input must pass ``check_dtype_spec_v2``, have a ``"name"`` of ``"|O"``, and an
        ``"object_codec_id"`` equal to ``cls.object_codec_id``.

        Parameters
        ----------
        data : DTypeJSON
            The JSON data to check.

        Returns
        -------
        bool
            True if the input is a valid representation of this class in Zarr V2, False
            otherwise.
        """
        # Only look at the codec id once the spec shape and the "|O" name are confirmed.
        if check_dtype_spec_v2(data) and data["name"] == "|O":
            return data["object_codec_id"] == cls.object_codec_id
        return False

    @classmethod
    def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["variable_length_bytes"]]:
        """
        Decide whether the input is the Zarr V3 JSON form of this data type.

        Parameters
        ----------
        data : DTypeJSON
            The JSON data to check.

        Returns
        -------
        TypeGuard[Literal["variable_length_bytes"]]
            True if ``data`` equals ``cls._zarr_v3_name``, False otherwise.
        """
        return data == cls._zarr_v3_name

    @classmethod
    def _from_json_v2(cls, data: DTypeJSON) -> Self:
        """
        Build an instance of this class from Zarr V2-flavored JSON.

        Parameters
        ----------
        data : DTypeJSON
            The JSON data to parse.

        Returns
        -------
        Self
            An instance of this data type.

        Raises
        ------
        DataTypeValidationError
            If the input data is not a valid representation of this class.
        """
        if not cls._check_json_v2(data):
            msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string '|O' and an object_codec_id of {cls.object_codec_id}"
            raise DataTypeValidationError(msg)
        return cls()

    @classmethod
    def _from_json_v3(cls, data: DTypeJSON) -> Self:
        """
        Build an instance of this class from Zarr V3-flavored JSON.

        Parameters
        ----------
        data : DTypeJSON
            The JSON data to parse.

        Returns
        -------
        VariableLengthBytes
            An instance of VariableLengthBytes.

        Raises
        ------
        DataTypeValidationError
            If the input data is not a valid representation of this class.
        """
        if not cls._check_json_v3(data):
            msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected the string {cls._zarr_v3_name!r}"
            raise DataTypeValidationError(msg)
        return cls()

    @overload
    def to_json(self, zarr_format: Literal[2]) -> VariableLengthBytesJSON_V2: ...

    @overload
    def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_bytes"]: ...

    def to_json(
        self, zarr_format: ZarrFormat
    ) -> VariableLengthBytesJSON_V2 | Literal["variable_length_bytes"]:
        """
        Produce the JSON-serializable form of this data type.

        Parameters
        ----------
        zarr_format : ZarrFormat
            The zarr format version. Accepted values are 2 and 3.

        Returns
        -------
        ``DTypeConfig_V2[Literal["|O"], Literal["vlen-bytes"]] | Literal["variable_length_bytes"]``
            For zarr_format 2, a dictionary with "name" and "object_codec_id".
            For zarr_format 3, the string identifier "variable_length_bytes".

        Raises
        ------
        ValueError
            If zarr_format is not 2 or 3.
        """
        if zarr_format == 2:
            return {"name": "|O", "object_codec_id": self.object_codec_id}
        if zarr_format == 3:
            v3_unstable_dtype_warning(self)
            return self._zarr_v3_name
        raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}")  # pragma: no cover

    def default_scalar(self) -> bytes:
        """
        Return the default scalar value for the variable-length bytes data type.

        Returns
        -------
        bytes
            The default scalar value, which is an empty byte string.
        """
        return b""

    def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str:
        """
        Convert a scalar to a JSON-serializable string representation.

        The input is passed directly to ``base64.standard_b64encode`` and the result is
        decoded as ASCII.

        Parameters
        ----------
        data : object
            The scalar to convert.
        zarr_format : ZarrFormat
            The zarr format version. Not used by this implementation.

        Returns
        -------
        str
            The base64 encoding of the scalar, as an ASCII string.
        """
        # NOTE(review): unlike NullTerminatedBytes and RawBytes in this module, the input is
        # not routed through ``cast_scalar`` first - confirm whether that is intentional.
        encoded = base64.standard_b64encode(data)  # type: ignore[arg-type]
        return encoded.decode("ascii")

    def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> bytes:
        """
        Decode a base64-encoded JSON string to bytes.

        Parameters
        ----------
        data : JSON
            The JSON-serializable base64-encoded string.
        zarr_format : ZarrFormat
            The zarr format version. Not used by this implementation.

        Returns
        -------
        bytes
            The decoded bytes from the base64 string.

        Raises
        ------
        TypeError
            If ``data`` does not pass the ``check_json_str`` check.
        """
        if not check_json_str(data):
            raise TypeError(f"Invalid type: {data}. Expected a string.")  # pragma: no cover
        return base64.standard_b64decode(data.encode("ascii"))

    def _check_scalar(self, data: object) -> TypeGuard[BytesLike]:
        """
        Check if the provided data is of type BytesLike.

        Parameters
        ----------
        data : object
            The data to check.

        Returns
        -------
        TypeGuard[BytesLike]
            True if ``data`` is an instance of one of the ``BytesLike`` types, False otherwise.
        """
        return isinstance(data, BytesLike)

    def _cast_scalar_unchecked(self, data: BytesLike) -> bytes:
        """
        Cast the provided scalar data to bytes.

        Strings are encoded as UTF-8; every other input is passed to the ``bytes``
        constructor.

        Parameters
        ----------
        data : BytesLike
            The data to cast.

        Returns
        -------
        bytes
            The casted data as bytes.

        Notes
        -----
        This method does not perform any type checking.
        The input data must be bytes-like.
        """
        # NOTE: for an int ``n``, ``bytes(n)`` is ``n`` zero bytes (so 0 maps to b""), not the
        # decimal text of the number.
        return bytes(data, encoding="utf-8") if isinstance(data, str) else bytes(data)

    def cast_scalar(self, data: object) -> bytes:
        """
        Attempt to cast a given object to a bytes scalar.

        The data is first validated with ``_check_scalar``; if that succeeds, the unchecked
        cast is performed.

        Parameters
        ----------
        data : object
            The data to be cast to a bytes scalar.

        Returns
        -------
        bytes
            The data cast as a bytes scalar.

        Raises
        ------
        TypeError
            If the data cannot be converted to a bytes scalar.
        """
        if not self._check_scalar(data):
            msg = (
                f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the "
                f"data type {self}."
            )
            raise TypeError(msg)
        return self._cast_scalar_unchecked(data)