Source code for zarr.core.dtype.npy.structured

from __future__ import annotations

from collections.abc import Sequence
from dataclasses import dataclass
from typing import TYPE_CHECKING, ClassVar, Literal, Self, TypeGuard, cast, overload

import numpy as np

from zarr.core.common import NamedConfig
from zarr.core.dtype.common import (
    DataTypeValidationError,
    DTypeConfig_V2,
    DTypeJSON,
    HasItemSize,
    StructuredName_V2,
    check_dtype_spec_v2,
    check_structured_dtype_name_v2,
    v3_unstable_dtype_warning,
)
from zarr.core.dtype.npy.common import (
    bytes_from_json,
    bytes_to_json,
    check_json_str,
)
from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType

if TYPE_CHECKING:
    from zarr.core.common import JSON, ZarrFormat

StructuredScalarLike = list[object] | tuple[object, ...] | bytes | int


class StructuredJSON_V2(DTypeConfig_V2[StructuredName_V2, None]):
    """
    A wrapper around the JSON representation of the ``Structured`` data type in Zarr V2.

    The ``name`` field is a sequence of sequences, where each inner sequence has two
    values: the field name and the data type name for that field (which could be
    another sequence). The data type names are strings, and the object codec ID is
    always None.

    References
    ----------
    The structure of the ``name`` field is defined in the Zarr V2 `specification
    document
    <https://github.com/zarr-developers/zarr-specs/blob/main/docs/v2/v2.0.rst#data-type-encoding>`__.

    Examples
    --------
    .. code-block:: python

        {
            "name": [
                ["f0", "<m8[10s]"],
                ["f1", "<m8[10s]"],
            ],
            "object_codec_id": None
        }
    """
class StructuredJSON_V3(
    NamedConfig[Literal["structured"], dict[str, Sequence[Sequence[str | DTypeJSON]]]]
):
    """
    A JSON representation of a structured data type in Zarr V3.

    References
    ----------
    This representation is not currently defined in an external specification.

    Examples
    --------
    .. code-block:: python

        {
            "name": "structured",
            "configuration": {
                "fields": [
                    ["f0", "int32"],
                    ["f1", "float64"],
                ]
            }
        }
    """
@dataclass(frozen=True, kw_only=True)
class Structured(ZDType[np.dtypes.VoidDType[int], np.void], HasItemSize):
    """
    A Zarr data type for arrays containing structured scalars, AKA "record arrays".

    Wraps the NumPy `np.dtypes.VoidDType` if the data type has fields. Scalars for this
    data type are instances of `np.void`, with a ``fields`` attribute.

    Attributes
    ----------
    fields : Sequence[tuple[str, ZDType]]
        The fields of the structured dtype.

    References
    ----------
    This data type does not have a Zarr V3 specification.

    The Zarr V2 data type specification can be found `here
    <https://github.com/zarr-developers/zarr-specs/blob/main/docs/v2/v2.0.rst#data-type-encoding>`__.
    """

    # Identifier used for this data type in Zarr V3 metadata documents.
    _zarr_v3_name: ClassVar[Literal["structured"]] = "structured"
    # NumPy's void dtype is the native counterpart of a structured dtype.
    dtype_cls = np.dtypes.VoidDType  # type: ignore[assignment]
    # Ordered (field name, wrapped dtype) pairs; a tuple because the dataclass is frozen.
    fields: tuple[tuple[str, ZDType[TBaseDType, TBaseScalar]], ...]

    def __post_init__(self) -> None:
        # A structured dtype with zero fields is not representable in NumPy.
        if len(self.fields) < 1:
            raise ValueError(f"must have at least one field. Got {self.fields!r}")

    @classmethod
    def _check_native_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]:
        """
        Check that this dtype is a numpy structured dtype

        Parameters
        ----------
        dtype : np.dtypes.DTypeLike
            The dtype to check.

        Returns
        -------
        TypeGuard[np.dtypes.VoidDType]
            True if the dtype matches, False otherwise.
        """
        # A plain void dtype (no fields) is NOT a structured dtype; require fields.
        return isinstance(dtype, cls.dtype_cls) and dtype.fields is not None
[docs] @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: """ Create a Structured ZDType from a native NumPy data type. Parameters ---------- dtype : TBaseDType The native data type. Returns ------- Self An instance of this data type. Raises ------ DataTypeValidationError If the input data type is not an instance of np.dtypes.VoidDType with a non-null ``fields`` attribute. Notes ----- This method attempts to resolve the fields of the structured dtype using the data type registry. """ from zarr.core.dtype import get_data_type_from_native_dtype fields: list[tuple[str, ZDType[TBaseDType, TBaseScalar]]] = [] if cls._check_native_dtype(dtype): # fields of a structured numpy dtype are either 2-tuples or 3-tuples. we only # care about the first element in either case. for key, (dtype_instance, *_) in dtype.fields.items(): # type: ignore[union-attr] dtype_wrapped = get_data_type_from_native_dtype(dtype_instance) fields.append((key, dtype_wrapped)) return cls(fields=tuple(fields)) raise DataTypeValidationError( f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" )
[docs] def to_native_dtype(self) -> np.dtypes.VoidDType[int]: """ Convert the structured Zarr data type to a native NumPy void dtype. This method constructs a NumPy dtype with fields corresponding to the fields of the structured Zarr data type, by converting each field's data type to its native dtype representation. Returns ------- np.dtypes.VoidDType[int] The native NumPy void dtype representing the structured data type. """ return cast( "np.dtypes.VoidDType[int]", np.dtype([(key, dtype.to_native_dtype()) for (key, dtype) in self.fields]), )
    @classmethod
    def _check_json_v2(
        cls,
        data: DTypeJSON,
    ) -> TypeGuard[StructuredJSON_V2]:
        """
        Check if the input is a valid JSON representation of a Structured data type for
        Zarr V2.

        The input data must be a mapping that contains a "name" key that is not a str,
        and an "object_codec_id" key that is None.

        Parameters
        ----------
        data : DTypeJSON
            The JSON data to check.

        Returns
        -------
        TypeGuard[StructuredJSON_V2]
            True if the input is a valid JSON representation of a Structured data type
            for Zarr V2, False otherwise.
        """
        return (
            check_dtype_spec_v2(data)
            # a plain string name denotes a non-structured dtype
            and not isinstance(data["name"], str)
            and check_structured_dtype_name_v2(data["name"])
            and data["object_codec_id"] is None
        )

    @classmethod
    def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[StructuredJSON_V3]:
        """
        Check that the input is a valid JSON representation of this class in Zarr V3.

        Parameters
        ----------
        data : DTypeJSON
            The JSON data to check.

        Returns
        -------
        TypeGuard[StructuredJSON_V3]
            True if the input is a valid JSON representation of a structured data type
            for Zarr V3, False otherwise.
        """
        return (
            isinstance(data, dict)
            # require exactly the "name" and "configuration" keys, no extras
            and set(data.keys()) == {"name", "configuration"}
            and data["name"] == cls._zarr_v3_name
            and isinstance(data["configuration"], dict)
            and set(data["configuration"].keys()) == {"fields"}
        )

    @classmethod
    def _from_json_v2(cls, data: DTypeJSON) -> Self:
        """Build an instance from Zarr V2 JSON metadata, or raise DataTypeValidationError."""
        # avoid circular import
        from zarr.core.dtype import get_data_type_from_json

        if cls._check_json_v2(data):
            # structured dtypes are constructed directly from a list of lists
            # note that we do not handle the object codec here! this will prevent
            # structured dtypes from containing object dtypes.
            return cls(
                fields=tuple(  # type: ignore[misc]
                    (  # type: ignore[misc]
                        f_name,
                        get_data_type_from_json(
                            {"name": f_dtype, "object_codec_id": None}, zarr_format=2
                        ),
                    )
                    for f_name, f_dtype in data["name"]
                )
            )
        msg = (
            f"Invalid JSON representation of {cls.__name__}. "
            f"Got {data!r}, expected a JSON array of arrays"
        )
        raise DataTypeValidationError(msg)

    @classmethod
    def _from_json_v3(cls, data: DTypeJSON) -> Self:
        """Build an instance from Zarr V3 JSON metadata, or raise DataTypeValidationError."""
        # avoid circular import
        from zarr.core.dtype import get_data_type_from_json

        if cls._check_json_v3(data):
            config = data["configuration"]
            meta_fields = config["fields"]
            return cls(
                fields=tuple(
                    (f_name, get_data_type_from_json(f_dtype, zarr_format=3))  # type: ignore[misc]
                    for f_name, f_dtype in meta_fields
                )
            )
        msg = (
            f"Invalid JSON representation of {cls.__name__}. "
            f"Got {data!r}, expected a JSON object with the key {cls._zarr_v3_name!r}"
        )
        raise DataTypeValidationError(msg)

    @overload
    def to_json(self, zarr_format: Literal[2]) -> StructuredJSON_V2: ...
    @overload
    def to_json(self, zarr_format: Literal[3]) -> StructuredJSON_V3: ...
[docs] def to_json(self, zarr_format: ZarrFormat) -> StructuredJSON_V2 | StructuredJSON_V3: """ Convert the structured data type to a JSON-serializable form. Parameters ---------- zarr_format : ZarrFormat The Zarr format version. Accepted values are 2 and 3. Returns ------- StructuredJSON_V2 | StructuredJSON_V3 The JSON representation of the structured data type. Raises ------ ValueError If the zarr_format is not 2 or 3. """ if zarr_format == 2: fields = [ [f_name, f_dtype.to_json(zarr_format=zarr_format)["name"]] for f_name, f_dtype in self.fields ] return {"name": fields, "object_codec_id": None} elif zarr_format == 3: v3_unstable_dtype_warning(self) fields = [ [f_name, f_dtype.to_json(zarr_format=zarr_format)] # type: ignore[list-item] for f_name, f_dtype in self.fields ] base_dict = { "name": self._zarr_v3_name, "configuration": {"fields": fields}, } return cast("StructuredJSON_V3", base_dict) raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover
def _check_scalar(self, data: object) -> TypeGuard[StructuredScalarLike]: # TODO: implement something more precise here! """ Check that the input is a valid scalar value for this structured data type. Parameters ---------- data : object The scalar value to check. Returns ------- TypeGuard[StructuredScalarLike] Whether the input is a valid scalar value for this structured data type. """ return isinstance(data, (bytes, list, tuple, int, np.void)) def _cast_scalar_unchecked(self, data: StructuredScalarLike) -> np.void: """ Cast a python object to a numpy structured scalar without type checking. Parameters ---------- data : StructuredScalarLike The data to cast. Returns ------- np.void The casted data as a numpy structured scalar. Notes ----- This method does not perform any type checking. The input data must be castable to a numpy structured scalar. """ na_dtype = self.to_native_dtype() if isinstance(data, bytes): res = np.frombuffer(data, dtype=na_dtype)[0] elif isinstance(data, list | tuple): res = np.array([tuple(data)], dtype=na_dtype)[0] else: res = np.array([data], dtype=na_dtype)[0] return cast("np.void", res)
[docs] def cast_scalar(self, data: object) -> np.void: """ Cast a Python object to a NumPy structured scalar. This function attempts to cast the provided data to a NumPy structured scalar. If the data is compatible with the structured scalar type, it is cast without type checking. Otherwise, a TypeError is raised. Parameters ---------- data : object The data to be cast to a NumPy structured scalar. Returns ------- np.void The data cast as a NumPy structured scalar. Raises ------ TypeError If the data cannot be converted to a NumPy structured scalar. """ if self._check_scalar(data): return self._cast_scalar_unchecked(data) msg = ( f"Cannot convert object {data!r} with type {type(data)} to a scalar compatible with the " f"data type {self}." ) raise TypeError(msg)
[docs] def default_scalar(self) -> np.void: """ Get the default scalar value for this structured data type. Returns ------- np.void The default scalar value, which is the scalar representation of 0 cast to this structured data type. """ return self._cast_scalar_unchecked(0)
[docs] def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: """ Read a JSON-serializable value as a NumPy structured scalar. Parameters ---------- data : JSON The JSON-serializable value. zarr_format : ZarrFormat The zarr format version. Returns ------- np.void The NumPy structured scalar. Raises ------ TypeError If the input is not a base64-encoded string. """ if check_json_str(data): as_bytes = bytes_from_json(data, zarr_format=zarr_format) dtype = self.to_native_dtype() return cast("np.void", np.array([as_bytes]).view(dtype)[0]) raise TypeError(f"Invalid type: {data}. Expected a string.")
[docs] def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: """ Convert a scalar to a JSON-serializable string representation. Parameters ---------- data : object The scalar to convert. zarr_format : ZarrFormat The zarr format version. Returns ------- str A string representation of the scalar, which is a base64-encoded string of the bytes that make up the scalar. """ return bytes_to_json(self.cast_scalar(data).tobytes(), zarr_format)
@property def item_size(self) -> int: """ The size of a single scalar in bytes. Returns ------- int The size of a single scalar in bytes. """ return self.to_native_dtype().itemsize