"""
This module provides utilities for collection manipulation:
- unique_list: Remove duplicates from a list while preserving order using object id, hash or str
- sort_dict_keys: Sort dictionary keys using a given order list (ascending or descending)
- upsert_in_dataframe: Insert or update a row in a Polars DataFrame based on primary keys
- array_to_disk: Easily handle large numpy arrays on disk using zarr for efficient storage and access.
.. image:: https://raw.githubusercontent.com/Stoupy51/stouputils/refs/heads/main/assets/collections_module.gif
:alt: stouputils collections examples
"""
# Imports
import atexit
import os
import shutil
import tempfile
from typing import TYPE_CHECKING, Any, Literal, TypeVar
# Lazy imports for typing
if TYPE_CHECKING:
    import numpy as np
    import polars as pl
    import zarr  # pyright: ignore[reportMissingTypeStubs]
    from numpy.typing import NDArray
# Typing
T = TypeVar("T")
# Functions
def unique_list(list_to_clean: list[Any], method: Literal["id", "hash", "str"] = "str") -> list[Any]:
    """ Remove duplicates from the list while keeping the order, using str (default), id, or hash

    Args:
        list_to_clean (list[Any]): The list to clean
        method (Literal["id", "hash", "str"]): The method used to identify duplicates

    Returns:
        list[Any]: The cleaned list

    Examples:
        >>> unique_list([1, 2, 3, 2, 1], method="id")
        [1, 2, 3]

        >>> s1 = {1, 2, 3}
        >>> s2 = {2, 3, 4}
        >>> s3 = {1, 2, 3}
        >>> unique_list([s1, s2, s1, s1, s3, s2, s3], method="id")
        [{1, 2, 3}, {2, 3, 4}, {1, 2, 3}]

        >>> unique_list([s1, s2, s1, s1, s3, s2, s3], method="str")
        [{1, 2, 3}, {2, 3, 4}]
    """
    # Initialize the set of seen identifiers and the result list
    seen: set[Any] = set()
    result: list[Any] = []

    # Iterate over each item in the list
    for item in list_to_clean:
        if method == "id":
            item_identifier = id(item)
        elif method == "hash":
            item_identifier = hash(item)
        elif method == "str":
            item_identifier = str(item)
        else:
            raise ValueError(f"Invalid method: {method}")

        # If the item's identifier has not been seen yet, remember it and keep the item
        if item_identifier not in seen:
            seen.add(item_identifier)
            result.append(item)

    # Return the cleaned list
    return result
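
# A quick sketch of the "hash" method, which the docstring above does not demonstrate:
# equal hashable values collapse to a single entry even when they are distinct objects
# (the tuples below are illustrative).
# >>> t1 = (1, 2)
# >>> t2 = (1, 2)
# >>> unique_list([t1, t2, (3, 4)], method="hash")
# [(1, 2), (3, 4)]
# Note: unhashable items such as sets raise TypeError with method="hash".
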
def sort_dict_keys(dictionary: dict[T, Any], order: list[T], reverse: bool = False) -> dict[T, Any]:
    """ Sort dictionary keys using a given order list (optionally reversed)

    Keys absent from the order list are treated as coming after every listed key.

    Args:
        dictionary (dict[T, Any]): The dictionary to sort
        order (list[T]): The order list
        reverse (bool): Whether to sort in reverse order (passed to sorted(); this is not the same as reversing the order list)

    Returns:
        dict[T, Any]: The sorted dictionary

    Examples:
        >>> sort_dict_keys({'b': 2, 'a': 1, 'c': 3}, order=["a", "b", "c"])
        {'a': 1, 'b': 2, 'c': 3}

        >>> sort_dict_keys({'b': 2, 'a': 1, 'c': 3}, order=["a", "b", "c"], reverse=True)
        {'c': 3, 'b': 2, 'a': 1}

        >>> sort_dict_keys({'b': 2, 'a': 1, 'c': 3, 'd': 4}, order=["c", "b"])
        {'c': 3, 'b': 2, 'a': 1, 'd': 4}
    """
    # Keys in the order list sort by their index; missing keys sort last (index == len(order))
    return dict(sorted(
        dictionary.items(),
        key=lambda x: order.index(x[0]) if x[0] in order else len(order),
        reverse=reverse,
    ))
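
# Since the keys are generic (TypeVar T), the same call works for non-string keys;
# a minimal sketch with illustrative integer keys:
# >>> sort_dict_keys({3: "c", 1: "a", 2: "b"}, order=[1, 2, 3])
# {1: 'a', 2: 'b', 3: 'c'}
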
def upsert_in_dataframe(
    df: "pl.DataFrame",
    new_entry: dict[str, Any],
    primary_keys: dict[str, Any] | None = None
) -> "pl.DataFrame":
    """ Insert or update a row in the Polars DataFrame based on primary keys.

    Args:
        df (pl.DataFrame): The Polars DataFrame to update.
        new_entry (dict[str, Any]): The new entry to insert or update.
        primary_keys (dict[str, Any] | None): The primary keys identifying the row (default: None, which always appends).

    Returns:
        pl.DataFrame: The updated Polars DataFrame.
    """
    # Imports
    import polars as pl

    # Create a new DataFrame if the current one is empty
    if df.is_empty():
        return pl.DataFrame([new_entry])

    # If no primary keys are provided, return the DataFrame with the new entry appended
    if not primary_keys:
        new_row_df = pl.DataFrame([new_entry])
        return pl.concat([df, new_row_df], how="diagonal_relaxed")

    # Build a mask based on the primary keys
    mask: pl.Expr = pl.lit(True)
    for key, value in primary_keys.items():
        if key in df.columns:
            mask = mask & (pl.col(key) == value)
        else:
            # Primary key column doesn't exist, so no match is possible
            mask = pl.lit(False)
            break

    # Insert or update the row based on the primary keys
    if df.select(mask).to_series().any():
        # Update the existing row
        for key, value in new_entry.items():
            if key in df.columns:
                df = df.with_columns(pl.when(mask).then(pl.lit(value)).otherwise(pl.col(key)).alias(key))
            else:
                # Add a new column if it doesn't exist
                df = df.with_columns(pl.when(mask).then(pl.lit(value)).otherwise(None).alias(key))
        return df
    else:
        # Insert a new row
        new_row_df = pl.DataFrame([new_entry])
        return pl.concat([df, new_row_df], how="diagonal_relaxed")
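
# Example usage of upsert_in_dataframe (a minimal sketch; the "id" and "name"
# columns are illustrative, not part of the API):
# >>> import polars as pl
# >>> df = pl.DataFrame({"id": [1, 2], "name": ["a", "b"]})
# >>> upsert_in_dataframe(df, {"id": 2, "name": "B"}, primary_keys={"id": 2}).rows()
# [(1, 'a'), (2, 'B')]
# >>> upsert_in_dataframe(df, {"id": 3, "name": "c"}, primary_keys={"id": 3}).rows()
# [(1, 'a'), (2, 'b'), (3, 'c')]
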
def array_to_disk(
    data: "NDArray[Any] | zarr.Array",
    delete_input: bool = True,
    more_data: "NDArray[Any] | zarr.Array | None" = None
) -> tuple["zarr.Array", str, int]:
    """ Easily handle large numpy arrays on disk using zarr for efficient storage and access.

    Zarr provides a simpler and more efficient alternative to np.memmap, with better compression
    and chunking capabilities.

    Args:
        data (NDArray | zarr.Array): The data to save/load as a zarr array
        delete_input (bool): Whether to delete the input data after creating the zarr array
        more_data (NDArray | zarr.Array | None): Additional data to append to the zarr array

    Returns:
        tuple[zarr.Array, str, int]: The zarr array, the directory path, and the total size in bytes

    Examples:
        >>> import numpy as np
        >>> data = np.random.rand(1000, 1000)
        >>> zarr_array = array_to_disk(data)[0]
        >>> zarr_array.shape
        (1000, 1000)

        >>> more_data = np.random.rand(500, 1000)
        >>> longer_array, dir_path, total_size = array_to_disk(zarr_array, more_data=more_data)
    """
    def dir_size(directory: str) -> int:
        """ Return the total size in bytes of every file under the given directory. """
        return sum(
            os.path.getsize(os.path.join(dirpath, filename))
            for dirpath, _, filenames in os.walk(directory)
            for filename in filenames
        )

    # Imports
    import zarr  # pyright: ignore[reportMissingTypeStubs]

    # If data is already a zarr.Array and more_data is present, just append and return
    if isinstance(data, zarr.Array) and more_data is not None:
        original_size: int = data.shape[0]
        new_shape: tuple[int, ...] = (original_size + more_data.shape[0], *data.shape[1:])
        data.resize(new_shape)
        data[original_size:] = more_data[:]

        # Delete more_data if specified, compute the on-disk size, and return
        if delete_input:
            del more_data
        store_path: str = str(data.store.path if hasattr(data.store, "path") else data.store)  # type: ignore
        return data, store_path, dir_size(store_path)

    # Create a temporary directory to store the zarr array (compressed, with auto-chunking for good performance)
    temp_dir: str = tempfile.mkdtemp()
    zarr_array: zarr.Array = zarr.open_array(temp_dir, mode="w", shape=data.shape, dtype=data.dtype, chunks=True)  # pyright: ignore[reportUnknownMemberType]
    zarr_array[:] = data[:]

    # If additional data is provided, resize and append
    if more_data is not None:
        original_size = data.shape[0]
        new_shape = (original_size + more_data.shape[0], *data.shape[1:])
        zarr_array.resize(new_shape)
        zarr_array[original_size:] = more_data[:]

    # Delete the original data from memory if specified
    if delete_input:
        del data
        if more_data is not None:
            del more_data

    # Register a cleanup function to delete the zarr directory at exit
    atexit.register(lambda: shutil.rmtree(temp_dir, ignore_errors=True))

    # Return the zarr array, its directory, and its total size on disk
    return zarr_array, temp_dir, dir_size(temp_dir)
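
# Because the backing directory is deleted at interpreter exit (see the atexit call
# above), copy the data elsewhere if it must outlive the process. A minimal sketch,
# where "saved.zarr" is an illustrative path:
# >>> import numpy as np, zarr
# >>> arr, tmp_dir, _ = array_to_disk(np.zeros((10, 10)))
# >>> zarr.save_array("saved.zarr", arr[:])  # materialize and write to a persistent store
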
if __name__ == "__main__":
# Example usage of array_to_disk (now using zarr)
print("\nZarr Example:")
data = np.random.rand(1000, 1000)
zarr_array, dir_path, total_size = array_to_disk(data, delete_input=True)
print(f"Zarr array shape: {zarr_array.shape}, directory: {dir_path}, size: {total_size:,} bytes")
print(f"Compression ratio: {(data.nbytes / total_size):.2f}x")
# Make it longer (1000x1000 -> 1500x1000)
data2 = np.random.rand(500, 1000)
longer_array, dir_path, total_size = array_to_disk(zarr_array, more_data=data2)
print(f"\nLonger zarr array shape: {longer_array.shape}, directory: {dir_path}, size: {total_size:,} bytes")
print(f"Compression ratio: {(1500 * 1000 * 8 / total_size):.2f}x")