Source code for stouputils.data_science.utils

"""
This module contains the Utils class, which provides static methods for common operations.

This class contains static methods for:

- Safe division (handling a zero denominator or None operands)
- Safe multiplication (handling None operands)
- Converting between one-hot encoding and class indices
- Calculating ROC and Precision-Recall curves with AUC scores
"""
# pyright: reportUnknownMemberType=false
# pyright: reportUnknownVariableType=false

# Imports
from typing import Any

import numpy as np
from numpy.typing import NDArray
from ..ctx import Muffle
from ..decorators import handle_error
from .config.get import DataScienceConfig


# Class
class Utils:
    """ Utility class providing common operations. """
    @staticmethod
    def safe_divide_float(a: float, b: float) -> float:
        """ Safe division of two numbers, return 0 if the denominator is 0.

        Args:
            a (float): First number
            b (float): Second number
        Returns:
            float: Result of the division

        Examples:
            >>> Utils.safe_divide_float(10, 2)
            5.0
            >>> Utils.safe_divide_float(0, 5)
            0.0
            >>> Utils.safe_divide_float(10, 0)
            0
            >>> Utils.safe_divide_float(-10, 2)
            -5.0
        """
        return a / b if b != 0 else 0
    @staticmethod
    def safe_divide_none(a: float | None, b: float | None) -> float | None:
        """ Safe division of two numbers, return None if either number is None or the denominator is 0.

        Args:
            a (float | None): First number
            b (float | None): Second number
        Returns:
            float | None: Result of the division, or None if either number is None or the denominator is 0

        Examples:
            >>> None == Utils.safe_divide_none(None, 2)
            True
            >>> None == Utils.safe_divide_none(10, None)
            True
            >>> None == Utils.safe_divide_none(10, 0)
            True
            >>> Utils.safe_divide_none(10, 2)
            5.0
        """
        return a / b if a is not None and b is not None and b != 0 else None
    @staticmethod
    def safe_multiply_none(a: float | None, b: float | None) -> float | None:
        """ Safe multiplication of two numbers, return None if either number is None.

        Args:
            a (float | None): First number
            b (float | None): Second number
        Returns:
            float | None: Result of the multiplication, or None if either number is None

        Examples:
            >>> None == Utils.safe_multiply_none(None, 2)
            True
            >>> None == Utils.safe_multiply_none(10, None)
            True
            >>> Utils.safe_multiply_none(10, 2)
            20
            >>> Utils.safe_multiply_none(-10, 2)
            -20
        """
        return a * b if a is not None and b is not None else None
    @staticmethod
    @handle_error(error_log=DataScienceConfig.ERROR_LOG)
    def convert_to_class_indices(y: NDArray[np.intc | np.single] | list[NDArray[np.intc | np.single]]) -> NDArray[Any]:
        """ Convert an array from one-hot encoded format to class indices.
        If the input is already class indices, the same array is returned.

        Args:
            y (NDArray[intc | single] | list[NDArray[intc | single]]): Input array (either one-hot encoded or class indices)
        Returns:
            NDArray[Any]: Array of class indices: [[0, 0, 1, 0], [1, 0, 0, 0]] -> [2, 0]

        Examples:
            >>> Utils.convert_to_class_indices(np.array([[0, 0, 1, 0], [1, 0, 0, 0]])).tolist()
            [2, 0]
            >>> Utils.convert_to_class_indices(np.array([2, 0, 1])).tolist()
            [2, 0, 1]
            >>> Utils.convert_to_class_indices(np.array([[1], [0]])).tolist()
            [[1], [0]]
            >>> Utils.convert_to_class_indices(np.array([])).tolist()
            []
        """
        y = np.array(y)

        # If the array is one-hot encoded (2D with more than one column), take the argmax along the class axis
        if y.ndim > 1 and y.shape[1] > 1:
            return np.argmax(y, axis=1)

        # Otherwise, the array is already class indices
        return y
    @staticmethod
    @handle_error(error_log=DataScienceConfig.ERROR_LOG)
    def convert_to_one_hot(
        y: NDArray[np.intc | np.single] | list[NDArray[np.intc | np.single]], num_classes: int
    ) -> NDArray[Any]:
        """ Convert an array from class indices to one-hot encoded format.
        If the input is already one-hot encoded, the same array is returned.

        Args:
            y (NDArray[intc|single] | list[NDArray[intc|single]]): Input array (either class indices or one-hot encoded)
            num_classes (int): Total number of classes
        Returns:
            NDArray[Any]: One-hot encoded array: [2, 0] -> [[0, 0, 1, 0], [1, 0, 0, 0]]

        Examples:
            >>> Utils.convert_to_one_hot(np.array([2, 0]), 4).tolist()
            [[0.0, 0.0, 1.0, 0.0], [1.0, 0.0, 0.0, 0.0]]
            >>> Utils.convert_to_one_hot(np.array([[0, 0, 1, 0], [1, 0, 0, 0]]), 4).tolist()
            [[0, 0, 1, 0], [1, 0, 0, 0]]
            >>> Utils.convert_to_one_hot(np.array([0, 1, 2]), 3).shape
            (3, 3)
            >>> Utils.convert_to_one_hot(np.array([]), 3)
            array([], shape=(0, 3), dtype=float32)
            >>> array = np.array([[0.1, 0.9], [0.2, 0.8]])
            >>> array = Utils.convert_to_class_indices(array)
            >>> array = Utils.convert_to_one_hot(array, 2)
            >>> array.tolist()
            [[0.0, 1.0], [0.0, 1.0]]
        """
        y = np.array(y)

        # Convert only if the input is not already one-hot encoded with the expected number of classes
        if y.ndim == 1 or y.shape[1] != num_classes:

            # Get the number of samples and create a zero-filled one-hot array
            n_samples: int = len(y)
            one_hot: NDArray[np.float32] = np.zeros((n_samples, num_classes), dtype=np.float32)

            if n_samples > 0:
                # Create the one-hot encoding by setting specific positions to 1.0:
                # - np.arange(n_samples) creates an array [0, 1, 2, ..., n_samples-1] for row indices
                # - y.astype(int) contains the class indices that determine which column gets the 1.0
                # - Together they form coordinate pairs (row_idx, class_idx) whose values are set to 1.0
                row_indices: NDArray[np.intc] = np.arange(n_samples)
                one_hot[row_indices, y.astype(int)] = 1.0
            return one_hot

        # Otherwise, the array is already one-hot encoded
        return y
    @staticmethod
    @handle_error(error_log=DataScienceConfig.ERROR_LOG)
    def get_roc_curve_and_auc(
        y_true: NDArray[np.intc | np.single], y_pred: NDArray[np.single]
    ) -> tuple[float, NDArray[np.single], NDArray[np.single], NDArray[np.single]]:
        """ Calculate the ROC curve and AUC score.

        Args:
            y_true (NDArray[intc | single]): True class labels (either one-hot encoded or class indices)
            y_pred (NDArray[single]): Predicted probabilities (must be probability scores, not class indices)
        Returns:
            tuple[float, NDArray[np.single], NDArray[np.single], NDArray[np.single]]:
                Tuple containing the AUC score, False Positive Rate, True Positive Rate, and Thresholds

        Examples:
            >>> # Binary classification example
            >>> y_true = np.array([0.0, 1.0, 0.0, 1.0, 0.0])
            >>> y_pred = np.array([[0.2, 0.8], [0.1, 0.9], [0.8, 0.2], [0.2, 0.8], [0.7, 0.3]])
            >>> auc_value, fpr, tpr, thresholds = Utils.get_roc_curve_and_auc(y_true, y_pred)
            >>> round(auc_value, 2)
            0.92
            >>> [round(x, 2) for x in fpr.tolist()]
            [0.0, 0.0, 0.33, 0.67, 1.0]
            >>> [round(x, 2) for x in tpr.tolist()]
            [0.0, 0.5, 1.0, 1.0, 1.0]
            >>> [round(x, 2) for x in thresholds.tolist()]
            [inf, 0.9, 0.8, 0.3, 0.2]
        """
        # For predictions, assert they are probabilities (one-hot encoded)
        assert y_pred.ndim > 1 and y_pred.shape[1] > 1, "Predictions must be probability scores in one-hot format"
        pred_probs: NDArray[np.single] = y_pred[:, 1]  # Take the probability of the positive class only

        # Calculate the ROC curve and AUC score using probabilities
        with Muffle(mute_stderr=True):  # Suppress "UndefinedMetricWarning: No positive samples in y_true [...]"

            # Import functions
            try:
                from sklearn.metrics import roc_auc_score, roc_curve
            except ImportError as e:
                raise ImportError(
                    "scikit-learn is required for ROC curve calculation. Install with 'pip install scikit-learn'"
                ) from e

            # Convert y_true to class indices for both functions
            y_true_indices: NDArray[np.intc] = Utils.convert_to_class_indices(y_true)

            # Calculate the AUC score directly using roc_auc_score
            auc_value: float = float(roc_auc_score(y_true_indices, pred_probs))

            # Calculate the ROC curve points
            results: tuple[Any, Any, Any] = roc_curve(y_true_indices, pred_probs, drop_intermediate=False)
            fpr: NDArray[np.single] = results[0]
            tpr: NDArray[np.single] = results[1]
            thresholds: NDArray[np.single] = results[2]

        return auc_value, fpr, tpr, thresholds
    @staticmethod
    @handle_error(error_log=DataScienceConfig.ERROR_LOG)
    def get_pr_curve_and_auc(
        y_true: NDArray[np.intc | np.single], y_pred: NDArray[np.single], negative: bool = False
    ) -> tuple[float, float, NDArray[np.single], NDArray[np.single], NDArray[np.single]]:
        """ Calculate the Precision-Recall curve (or negative Precision-Recall curve) and AUC score.

        Args:
            y_true (NDArray[intc | single]): True class labels (either one-hot encoded or class indices)
            y_pred (NDArray[single]): Predicted probabilities (must be probability scores, not class indices)
            negative (bool): Whether to calculate the negative Precision-Recall curve
        Returns:
            tuple[float, float, NDArray[np.single], NDArray[np.single], NDArray[np.single]]: Tuple containing either:

                - AUC score, Average Precision, Precision, Recall, and Thresholds
                - AUC score, Average Precision, Negative Predictive Value, Specificity, and Thresholds for the negative class

        Examples:
            >>> # Binary classification example
            >>> y_true = np.array([0.0, 1.0, 0.0, 1.0, 0.0])
            >>> y_pred = np.array([[0.2, 0.8], [0.1, 0.9], [0.8, 0.2], [0.2, 0.8], [0.7, 0.3]])
            >>> auc_value, average_precision, precision, recall, thresholds = Utils.get_pr_curve_and_auc(y_true, y_pred)
            >>> round(auc_value, 2)
            0.92
            >>> round(average_precision, 2)
            0.83
            >>> [round(x, 2) for x in precision.tolist()]
            [0.4, 0.5, 0.67, 1.0, 1.0]
            >>> [round(x, 2) for x in recall.tolist()]
            [1.0, 1.0, 1.0, 0.5, 0.0]
            >>> [round(x, 2) for x in thresholds.tolist()]
            [0.2, 0.3, 0.8, 0.9]
        """
        # For predictions, assert they are probabilities (one-hot encoded)
        assert y_pred.ndim > 1 and y_pred.shape[1] > 1, "Predictions must be probability scores in one-hot format"
        pred_probs: NDArray[np.single] = y_pred[:, 1] if not negative else y_pred[:, 0]

        # Calculate the Precision-Recall curve and AUC score using probabilities
        with Muffle(mute_stderr=True):  # Suppress "UndefinedMetricWarning: No positive samples in y_true [...]"

            # Import functions
            try:
                from sklearn.metrics import auc, average_precision_score, precision_recall_curve
            except ImportError as e:
                raise ImportError(
                    "scikit-learn is required for PR Curve calculation. Install with 'pip install scikit-learn'"
                ) from e

            # Convert y_true to class indices for both functions
            y_true_indices: NDArray[np.intc] = Utils.convert_to_class_indices(y_true)

            # Calculate the Precision-Recall curve points (class 0 is the positive label for the negative curve)
            pos_label: int = 0 if negative else 1
            results: tuple[Any, Any, Any] = precision_recall_curve(y_true_indices, pred_probs, pos_label=pos_label)
            precision: NDArray[np.single] = results[0]
            recall: NDArray[np.single] = results[1]
            thresholds: NDArray[np.single] = results[2]

            # Calculate the AUC score and Average Precision
            auc_value: float = float(auc(recall, precision))
            average_precision: float = float(average_precision_score(y_true_indices, pred_probs, pos_label=pos_label))

        return auc_value, average_precision, precision, recall, thresholds
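

# Minimal usage sketch (illustrative only, guarded so it never runs on import): it shows how the
# conversion helpers and the curve helpers fit together, including the negative=True mode of
# get_pr_curve_and_auc, which is not covered by the doctests above. It assumes scikit-learn is
# installed; the labels and probabilities below are arbitrary illustrative data.
if __name__ == "__main__":
    # Ground-truth labels as class indices and predictions as per-class probability scores
    labels = np.array([0, 1, 0, 1, 0])
    probabilities = np.array([[0.2, 0.8], [0.1, 0.9], [0.8, 0.2], [0.2, 0.8], [0.7, 0.3]], dtype=np.single)

    # Round-trip between class indices and one-hot encoding
    one_hot_labels = Utils.convert_to_one_hot(labels, num_classes=2)
    assert Utils.convert_to_class_indices(one_hot_labels).tolist() == labels.tolist()

    # ROC curve and AUC for the positive class
    roc_auc, fpr, tpr, roc_thresholds = Utils.get_roc_curve_and_auc(labels, probabilities)
    print(f"ROC AUC: {roc_auc:.2f}")

    # Standard and negative Precision-Recall curves
    pr_auc, ap, precision, recall, pr_thresholds = Utils.get_pr_curve_and_auc(labels, probabilities)
    npr_auc, nap, npv, specificity, npr_thresholds = Utils.get_pr_curve_and_auc(labels, probabilities, negative=True)
    print(f"PR AUC: {pr_auc:.2f} (AP: {ap:.2f}), negative PR AUC: {npr_auc:.2f} (AP: {nap:.2f})")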