# Source code for zarr.core.dtype.common

from __future__ import annotations

import warnings
from collections.abc import Mapping, Sequence
from dataclasses import dataclass
from typing import (
    ClassVar,
    Final,
    Generic,
    Literal,
    TypedDict,
    TypeGuard,
    TypeVar,
)

from typing_extensions import ReadOnly

from zarr.core.common import NamedConfig
from zarr.errors import UnstableSpecificationWarning

EndiannessStr = Literal["little", "big"]
ENDIANNESS_STR: Final = "little", "big"

SpecialFloatStrings = Literal["NaN", "Infinity", "-Infinity"]
SPECIAL_FLOAT_STRINGS: Final = ("NaN", "Infinity", "-Infinity")

JSONFloatV2 = float | SpecialFloatStrings
JSONFloatV3 = float | SpecialFloatStrings | str

ObjectCodecID = Literal["vlen-utf8", "vlen-bytes", "vlen-array", "pickle", "json2", "msgpack2"]
# These are the ids of the known object codecs for zarr v2.
OBJECT_CODEC_IDS: Final = ("vlen-utf8", "vlen-bytes", "vlen-array", "pickle", "json2", "msgpack2")

# This is a wider type than our standard JSON type because we need
# to work with typeddict objects which are assignable to Mapping[str, object]
DTypeJSON = str | int | float | Sequence["DTypeJSON"] | None | Mapping[str, object]

# The DTypeSpec_V2 type (an alias of DTypeConfig_V2, both defined below) exists because
# ZDType.from_json takes a single argument, which must contain all the information necessary to
# decode the data type. Zarr v2 supports multiple distinct data types that all use the "|O" data
# type identifier. These data types can only be discriminated on the basis of their
# "object codec", i.e. a special data-type-specific compressor or filter. So to figure out what
# data type a zarr v2 array has, we need the data type identifier from metadata, as well as an
# object codec id if the data type identifier is "|O".
# So we pack the name of the dtype alongside the object codec id, if applicable,
# in a single dict, and pass that to the data type inference logic.
# The type variables defined below have a very wide bound because the individual zdtype
# classes can perform a very specific type check.

# This is the JSON representation of a structured dtype in zarr v2.
# The alias is recursive: each item is either a string or another nested sequence of this type.
StructuredName_V2 = Sequence["str | StructuredName_V2"]

# This models the type of the name a dtype might have in zarr v2 array metadata:
# either a structured name (a sequence) or a plain string.
DTypeName_V2 = StructuredName_V2 | str

# Covariant type variables that parametrize DTypeConfig_V2: the first is bounded by the
# dtype name type, the second by "a string or None" (the object codec id).
TDTypeNameV2_co = TypeVar("TDTypeNameV2_co", bound=DTypeName_V2, covariant=True)
TObjectCodecID_co = TypeVar("TObjectCodecID_co", bound=None | str, covariant=True)


class DTypeConfig_V2(TypedDict, Generic[TDTypeNameV2_co, TObjectCodecID_co]):
    """
    A typed dict that packs a zarr v2 dtype name together with an object codec id.

    Both keys are read-only. The value types are supplied through the two type parameters,
    so concrete data types can declare exactly which name and codec id they accept.
    """

    # The dtype name as it appears in zarr v2 array metadata.
    name: ReadOnly[TDTypeNameV2_co]
    # The id of the object codec paired with the dtype; the type variable's bound allows None.
    object_codec_id: ReadOnly[TObjectCodecID_co]


# The widest form of DTypeConfig_V2: any zarr v2 dtype name, with an object codec id that is
# either a string or None.
DTypeSpec_V2 = DTypeConfig_V2[DTypeName_V2, None | str]


def check_structured_dtype_v2_inner(data: object) -> TypeGuard[StructuredName_V2]:
    """
    A type guard for the inner elements of a structured dtype. This is a recursive check because
    the type is itself recursive.

    This check ensures that all the elements are 2-element sequences beginning with a string
    and ending with either another string or another 2-element sequence beginning with a string and
    ending with another instance of that type.
    """
    if isinstance(data, (str, Mapping)):
        return False
    if not isinstance(data, Sequence):
        return False
    if len(data) != 2:
        return False
    if not (isinstance(data[0], str)):
        return False
    if isinstance(data[-1], str):
        return True
    elif isinstance(data[-1], Sequence):
        return check_structured_dtype_v2_inner(data[-1])
    return False


def check_structured_dtype_name_v2(data: Sequence[object]) -> TypeGuard[StructuredName_V2]:
    """
    Check that all the elements of a sequence are valid zarr v2 structured dtype identifiers
    """
    return all(check_structured_dtype_v2_inner(d) for d in data)


def check_dtype_name_v2(data: object) -> TypeGuard[DTypeName_V2]:
    """
    Type guard for narrowing the type of a python object to an valid zarr v2 dtype name.
    """
    if isinstance(data, str):
        return True
    elif isinstance(data, Sequence):
        return check_structured_dtype_name_v2(data)
    return False


def check_dtype_spec_v2(data: object) -> TypeGuard[DTypeSpec_V2]:
    """
    Type guard for narrowing a python object to an instance of DTypeSpec_V2
    """
    if not isinstance(data, Mapping):
        return False
    if set(data.keys()) != {"name", "object_codec_id"}:
        return False
    if not check_dtype_name_v2(data["name"]):
        return False
    return isinstance(data["object_codec_id"], str | None)


# By comparison, the JSON representation of a dtype in zarr v3 is much simpler.
# It's either a string, or a structured dict (a NamedConfig with a string name and a
# configuration mapping that has string keys).
DTypeSpec_V3 = str | NamedConfig[str, Mapping[str, object]]


def check_dtype_spec_v3(data: object) -> TypeGuard[DTypeSpec_V3]:
    """
    Type guard for narrowing the type of a python object to an instance of
    DTypeSpec_V3, i.e either a string or a dict with a "name" field that's a string and a
    "configuration" field that's a mapping with string keys.
    """
    if isinstance(data, str) or (  # noqa: SIM103
        isinstance(data, Mapping)
        and set(data.keys()) == {"name", "configuration"}
        and isinstance(data["configuration"], Mapping)
        and all(isinstance(k, str) for k in data["configuration"])
    ):
        return True
    return False


def unpack_dtype_json(data: DTypeSpec_V2 | DTypeSpec_V3) -> DTypeJSON:
    """
    Convert a dtype spec into the form that is stored in array metadata.

    A mapping whose keys are exactly ``"name"`` and ``"object_codec_id"`` (the Zarr V2 form)
    is reduced to its ``"name"`` value. Any other input (the Zarr V3 form) is returned
    unchanged.
    """
    if not isinstance(data, Mapping) or set(data.keys()) != {"name", "object_codec_id"}:
        return data
    return data["name"]


class DataTypeValidationError(ValueError):
    """
    A ``ValueError`` subclass for data type validation failures.

    NOTE(review): nothing in this module raises it; presumably it is raised by the data type
    classes when an input cannot be interpreted as the expected data type -- confirm against
    the callers.
    """
class ScalarTypeValidationError(ValueError): ... @dataclass(frozen=True, kw_only=True) class HasLength: """ A mix-in class for data types with a length attribute, such as fixed-size collections of unicode strings, or bytes. Attributes ---------- length : int The length of the scalars belonging to this data type. Note that this class does not assign a unit to the length. Child classes may assign units. """ length: int @dataclass(frozen=True, kw_only=True) class HasEndianness: """ A mix-in class for data types with an endianness attribute """ endianness: EndiannessStr = "little" @dataclass(frozen=True, kw_only=True) class HasItemSize: """ A mix-in class for data types with an item size attribute. This mix-in bears a property ``item_size``, which denotes the size of each element of the data type, in bytes. """ @property def item_size(self) -> int: raise NotImplementedError @dataclass(frozen=True, kw_only=True) class HasObjectCodec: """ A mix-in class for data types that require an object codec id. This class bears the property ``object_codec_id``, which is the string name of an object codec that is required to encode and decode the data type. In zarr-python 2.x certain data types like variable-length strings or variable-length arrays used the catch-all numpy "object" data type for their in-memory representation. But these data types cannot be stored as numpy object data types, because the object data type does not define a fixed memory layout. So these data types required a special codec, called an "object codec", that effectively defined a compact representation for the data type, which was used to encode and decode the data type. Zarr-python 2.x would not allow the creation of arrays with the "object" data type if an object codec was not specified, and thus the name of the object codec is effectively part of the data type model. 
""" object_codec_id: ClassVar[str] def v3_unstable_dtype_warning(dtype: object) -> None: """ Emit this warning when a data type does not have a stable zarr v3 spec """ msg = ( f"The data type ({dtype}) does not have a Zarr V3 specification. " "That means that the representation of arrays saved with this data type may change without " "warning in a future version of Zarr Python. " "Arrays stored with this data type may be unreadable by other Zarr libraries. " "Use this data type at your own risk! " "Check https://github.com/zarr-developers/zarr-extensions/tree/main/data-types for the " "status of data type specifications for Zarr V3." ) warnings.warn(msg, category=UnstableSpecificationWarning, stacklevel=2)