stouputils.data_science.metric_utils module#

This module contains the MetricUtils class, which provides static methods for calculating various metrics for machine learning tasks.

This class contains static methods for:

  • Calculating various metrics (accuracy, precision, recall, etc.)

  • Computing confusion matrix and related metrics

  • Generating ROC curves and finding optimal thresholds

  • Calculating F-beta scores

The metrics are calculated based on the predictions made by a model and the true labels from a dataset. The class supports both binary and multiclass classification tasks.

class MetricUtils[source]#

Bases: object

Class containing static methods for calculating metrics.

static metrics(dataset: Dataset, predictions: Iterable[Any], run_name: str, mode: Literal['binary', 'multiclass', 'none'] = 'binary') → dict[str, float][source]#

Method to calculate as many metrics as possible for the given dataset and predictions.

Parameters:
  • dataset (Dataset) – Dataset containing the true labels

  • predictions (Iterable) – Predictions made by the model

  • run_name (str) – Name of the run, used to save the ROC curve

  • mode (Literal) – Classification mode ('binary', 'multiclass', or 'none'), defaults to “binary”

Returns:

Dictionary containing the calculated metrics

Return type:

dict[str, float]

Examples

>>> # Prepare a test dataset
>>> from .dataset import XyTuple
>>> test_data = XyTuple(X=np.array([[1], [2], [3]]), y=np.array([0, 1, 0]))
>>> dataset = Dataset(training_data=test_data, test_data=test_data, name="osef")
>>> # Prepare predictions
>>> predictions = np.array([[0.9, 0.1], [0.2, 0.8], [0.2, 0.8]])
>>> # Calculate metrics
>>> from stouputils.ctx import Muffle
>>> with Muffle():
...     metrics = MetricUtils.metrics(dataset, predictions, run_name="")
>>> # Check metrics
>>> round(float(metrics[MetricDictionnary.ACCURACY]), 2)
0.67
>>> round(float(metrics[MetricDictionnary.PRECISION]), 2)
0.5
>>> round(float(metrics[MetricDictionnary.RECALL]), 2)
1.0
>>> round(float(metrics[MetricDictionnary.F1_SCORE]), 2)
0.67
>>> round(float(metrics[MetricDictionnary.AUC]), 2)
0.75
>>> round(float(metrics[MetricDictionnary.MATTHEWS_CORRELATION_COEFFICIENT]), 2)
0.5
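
The headline values above can be cross-checked with scikit-learn, taking the predicted class as the argmax of each probability row and class 1 as the positive class. This is a verification sketch, not the module's implementation:

>>> import numpy as np
>>> from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
>>> y_true = np.array([0, 1, 0])
>>> y_probs = np.array([[0.9, 0.1], [0.2, 0.8], [0.2, 0.8]])
>>> y_pred = y_probs.argmax(axis=1)  # -> [0, 1, 1]
>>> round(float(accuracy_score(y_true, y_pred)), 2)
0.67
>>> round(float(precision_score(y_true, y_pred)), 2)
0.5
>>> round(float(recall_score(y_true, y_pred)), 2)
1.0
>>> round(float(f1_score(y_true, y_pred)), 2)
0.67
>>> round(float(roc_auc_score(y_true, y_probs[:, 1])), 2)
0.75
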
static confusion_matrix(true_classes: ndarray[Any, dtype[int32]], pred_classes: ndarray[Any, dtype[int32]], labels: tuple[str, ...], run_name: str = '') → dict[str, float][source]#

Calculate metrics based on confusion matrix.

Parameters:
  • true_classes (NDArray[np.intc]) – True class labels

  • pred_classes (NDArray[np.intc]) – Predicted class labels

  • labels (tuple[str, ...]) – Tuple of class label names (strings)

  • run_name (str) – Name for saving the plot

Returns:

Dictionary of confusion matrix based metrics

Return type:

dict[str, float]

Examples

>>> # Prepare data
>>> true_classes = np.array([0, 1, 0])
>>> pred_probs = np.array([[0.9, 0.1], [0.1, 0.9], [0.1, 0.9]])
>>> pred_classes = Utils.convert_to_class_indices(pred_probs)   # [0, 1, 1]
>>> labels = ["class_0", "class_1"]
>>> # Calculate metrics
>>> from stouputils.ctx import Muffle
>>> with Muffle():
...     metrics = MetricUtils.confusion_matrix(true_classes, pred_classes, labels, run_name="")
>>> # Check metrics
>>> int(metrics[MetricDictionnary.CONFUSION_MATRIX_TN])
1
>>> int(metrics[MetricDictionnary.CONFUSION_MATRIX_FP])
1
>>> int(metrics[MetricDictionnary.CONFUSION_MATRIX_FN])
0
>>> int(metrics[MetricDictionnary.CONFUSION_MATRIX_TP])
1
>>> round(float(metrics[MetricDictionnary.FALSE_POSITIVE_RATE]), 2)
0.5
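
The derived rates follow directly from the four cells, e.g. the false positive rate is FP / (FP + TN). A quick cross-check with scikit-learn (a sketch, assuming class 1 is the positive class):

>>> from sklearn.metrics import confusion_matrix
>>> tn, fp, fn, tp = confusion_matrix([0, 1, 0], [0, 1, 1]).ravel()
>>> (int(tn), int(fp), int(fn), int(tp))
(1, 1, 0, 1)
>>> round(float(fp / (fp + tn)), 2)  # false positive rate
0.5
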
static f_scores(precision: float, recall: float) → dict[str, float][source]#

Calculate F-beta scores for different beta values.

Parameters:
  • precision (float) – Precision value

  • recall (float) – Recall value

Returns:

Dictionary of F-beta scores

Return type:

dict[str, float]

Examples

>>> from stouputils.ctx import Muffle
>>> with Muffle():
...     metrics = MetricUtils.f_scores(precision=0.5, recall=1.0)
>>> [round(float(x), 2) for x in metrics.values()]
[0.5, 0.51, 0.54, 0.58, 0.62, 0.67, 0.71, 0.75, 0.78, 0.81, 0.83]
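
For reference, the F-beta score weights recall β times as much as precision: F_β = (1 + β²) · P · R / (β² · P + R). The eleven values above are consistent with β running from 0.0 to 2.0 in steps of 0.2, though that grid is an assumption here. A verification sketch:

>>> def f_beta(precision: float, recall: float, beta: float) -> float:
...     return (1 + beta**2) * precision * recall / (beta**2 * precision + recall)
>>> [round(f_beta(0.5, 1.0, b / 5), 2) for b in range(11)]  # beta = 0.0, 0.2, ..., 2.0
[0.5, 0.51, 0.54, 0.58, 0.62, 0.67, 0.71, 0.75, 0.78, 0.81, 0.83]
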
static matthews_correlation(true_classes: ndarray[Any, dtype[int32]], pred_classes: ndarray[Any, dtype[int32]]) → dict[str, float][source]#

Calculate Matthews Correlation Coefficient.

Parameters:
  • true_classes (NDArray[np.intc]) – True class labels

  • pred_classes (NDArray[np.intc]) – Predicted class labels

Returns:

Dictionary containing MCC

Return type:

dict[str, float]

Examples

>>> true_classes = np.array([0, 1, 0])
>>> pred_classes = np.array([0, 1, 1])
>>> from stouputils.ctx import Muffle
>>> with Muffle():
...     metrics = MetricUtils.matthews_correlation(true_classes, pred_classes)
>>> float(metrics[MetricDictionnary.MATTHEWS_CORRELATION_COEFFICIENT])
0.5
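
MCC is derived from the confusion-matrix cells as (TP·TN − FP·FN) / √((TP+FP)(TP+FN)(TN+FP)(TN+FN)). A sketch cross-checking the value above against the formula and scikit-learn:

>>> tp, tn, fp, fn = 1, 1, 1, 0
>>> round((tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5, 2)
0.5
>>> from sklearn.metrics import matthews_corrcoef
>>> round(float(matthews_corrcoef([0, 1, 0], [0, 1, 1])), 2)
0.5
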
static roc_curve_and_auc(true_classes: ndarray[Any, dtype[int32]] | ndarray[Any, dtype[float32]], pred_probs: ndarray[Any, dtype[float32]], fold_number: int = -1, run_name: str = '') → dict[str, float][source]#

Calculate ROC curve and AUC score.

Parameters:
  • true_classes (NDArray[np.intc | np.single]) – True class labels (one-hot encoded or class indices)

  • pred_probs (NDArray[np.single]) – Predicted probabilities (must be probability scores, not class indices)

  • fold_number (int) – Fold number, used for naming the plot file: -1 for the final model evaluated on the test set, 0 for the final model evaluated on the validation set, and >0 for individual folds evaluated on their own validation set

  • run_name (str) – Name for saving the plot

Returns:

Dictionary containing AUC score and optimal thresholds

Return type:

dict[str, float]

Examples

>>> true_classes = np.array([0, 1, 0])
>>> pred_probs = np.array([[0.9, 0.1], [0.1, 0.9], [0.1, 0.9]])
>>> from stouputils.ctx import Muffle
>>> with Muffle():
...     metrics = MetricUtils.roc_curve_and_auc(true_classes, pred_probs, run_name="")
>>> # Check metrics
>>> round(float(metrics[MetricDictionnary.AUC]), 2)
0.75
>>> round(float(metrics[MetricDictionnary.OPTIMAL_THRESHOLD_YOUDEN]), 2)
0.9
>>> float(metrics[MetricDictionnary.OPTIMAL_THRESHOLD_COST])
inf
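
The AUC and the Youden threshold (the point maximising TPR − FPR) can be reproduced with scikit-learn, using the second probability column as the positive-class score. This is an illustrative sketch rather than the module's implementation:

>>> import numpy as np
>>> from sklearn.metrics import roc_curve, roc_auc_score
>>> y_true = np.array([0, 1, 0])
>>> pos_scores = np.array([0.1, 0.9, 0.9])  # probability of class 1
>>> round(float(roc_auc_score(y_true, pos_scores)), 2)
0.75
>>> fpr, tpr, thresholds = roc_curve(y_true, pos_scores)
>>> float(thresholds[np.argmax(tpr - fpr)])  # Youden's J = TPR - FPR
0.9
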
static pr_curve_and_auc(true_classes: ndarray[Any, dtype[int32]] | ndarray[Any, dtype[float32]], pred_probs: ndarray[Any, dtype[float32]], fold_number: int = -1, run_name: str = '') → dict[str, float][source]#

Calculate the Precision-Recall curve and its AUC score, along with the NPV-Specificity curve and its AUC.

Parameters:
  • true_classes (NDArray[np.intc | np.single]) – True class labels (one-hot encoded or class indices)

  • pred_probs (NDArray[np.single]) – Predicted probabilities (must be probability scores, not class indices)

  • fold_number (int) – Fold number, used for naming the plot file: -1 for the final model evaluated on the test set, 0 for the final model evaluated on the validation set, and >0 for individual folds evaluated on their own validation set

  • run_name (str) – Name for saving the plot

Returns:

Dictionary containing AUC score and optimal thresholds

Return type:

dict[str, float]

Examples

>>> true_classes = np.array([0, 1, 0])
>>> pred_probs = np.array([[0.9, 0.1], [0.1, 0.9], [0.1, 0.9]])
>>> from stouputils.ctx import Muffle
>>> with Muffle():
...     metrics = MetricUtils.pr_curve_and_auc(true_classes, pred_probs, run_name="")
>>> # Check metrics
>>> round(float(metrics[MetricDictionnary.AUPRC]), 2)
0.75
>>> round(float(metrics[MetricDictionnary.NEGATIVE_AUPRC]), 2)
0.92
>>> round(float(metrics[MetricDictionnary.PR_AVERAGE]), 2)
0.5
>>> round(float(metrics[MetricDictionnary.PR_AVERAGE_NEGATIVE]), 2)
0.33
>>> round(float(metrics[MetricDictionnary.OPTIMAL_THRESHOLD_F1]), 2)
0.9
>>> round(float(metrics[MetricDictionnary.OPTIMAL_THRESHOLD_F1_NEGATIVE]), 2)
0.1
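
The positive-class AUPRC can be cross-checked by integrating precision over recall; the negative-class curve can be obtained the same way after flipping labels and scores, and the F1-optimal threshold is the one maximising F1 over the curve's thresholds. A verification sketch (assuming class 1 is the positive class; not the module's exact implementation):

>>> import numpy as np
>>> from sklearn.metrics import precision_recall_curve, auc
>>> y_true = np.array([0, 1, 0])
>>> pos_scores = np.array([0.1, 0.9, 0.9])
>>> precision, recall, thresholds = precision_recall_curve(y_true, pos_scores)
>>> round(float(auc(recall, precision)), 2)
0.75
>>> neg_precision, neg_recall, _ = precision_recall_curve(1 - y_true, 1 - pos_scores)
>>> round(float(auc(neg_recall, neg_precision)), 2)
0.92
>>> f1 = 2 * precision[:-1] * recall[:-1] / (precision[:-1] + recall[:-1])
>>> float(thresholds[np.argmax(f1)])
0.9
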
static all_curves(true_classes: ndarray[Any, dtype[int32]] | ndarray[Any, dtype[float32]], pred_probs: ndarray[Any, dtype[float32]], fold_number: int = -1, run_name: str = '') → dict[str, float][source]#

Run every *_curve_and_auc method (ROC and Precision-Recall) and return a combined dictionary of metrics.

Parameters:
  • true_classes (NDArray[np.intc | np.single]) – True class labels (one-hot encoded or class indices)

  • pred_probs (NDArray[np.single]) – Predicted probabilities (must be probability scores, not class indices)

  • fold_number (int) – Fold number, used for naming the plot file: -1 for the final model evaluated on the test set, 0 for the final model evaluated on the validation set, and >0 for individual folds evaluated on their own validation set

  • run_name (str) – Name for saving the plot

Returns:

Dictionary containing AUC score and optimal thresholds for ROC and PR curves

Return type:

dict[str, float]
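
Since this method wraps roc_curve_and_auc and pr_curve_and_auc, the returned dictionary is expected to merge the metrics documented for those two methods. A usage sketch mirroring the examples above (the expected values follow from the ROC and PR examples, assuming the dictionaries are simply combined):

>>> import numpy as np
>>> true_classes = np.array([0, 1, 0])
>>> pred_probs = np.array([[0.9, 0.1], [0.1, 0.9], [0.1, 0.9]])
>>> from stouputils.ctx import Muffle
>>> with Muffle():
...     metrics = MetricUtils.all_curves(true_classes, pred_probs, run_name="")
>>> round(float(metrics[MetricDictionnary.AUC]), 2)
0.75
>>> round(float(metrics[MetricDictionnary.AUPRC]), 2)
0.75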

static plot_metric_curves(all_history: list[dict[str, list[float]]], metric_name: str, run_name: str = '') → None[source]#

Plot training and validation curves for a specific metric.

Generates two plots for the given metric:

  1. A combined plot with both training and validation curves

  2. A validation-only plot

The plots show the metric’s progression across training epochs for each fold. Special formatting distinguishes between folds and curve types:

  • Fold 0 (final model) uses thicker lines (2.0 width vs 1.0)

  • Training curves use solid lines, validation curves use dashed lines

  • Each curve is clearly labeled in the legend

The plots are saved to the temp folder and logged to MLflow before cleanup.

Parameters:
  • all_history (list[dict[str, list[float]]]) – List of history dictionaries for each fold

  • metric_name (str) – Name of the metric to plot (e.g. “accuracy”, “loss”)

  • run_name (str) – Name of the run

Examples

>>> # Prepare data with 2 folds for instance
>>> all_history = [
...     {'loss': [0.1, 0.09, 0.08, 0.07, 0.06], 'val_loss': [0.11, 0.1, 0.09, 0.08, 0.07]},
...     {'loss': [0.12, 0.11, 0.1, 0.09, 0.08], 'val_loss': [0.13, 0.12, 0.11, 0.1, 0.09]}
... ]
>>> MetricUtils.plot_metric_curves(metric_name="loss", all_history=all_history, run_name="")
static plot_every_metric_curves(all_history: list[dict[str, list[float]]], metrics_names: tuple[str, ...] = (), run_name: str = '') → None[source]#

Plot and save training and validation curves for each metric.

Parameters:
  • all_history (list[dict[str, list[float]]]) – List of history dictionaries for each fold

  • metrics_names (tuple[str, ...]) – Metric names to plot, defaults to (“loss”,)

  • run_name (str) – Name of the run

Examples

>>> # Prepare data with 2 folds for instance
>>> all_history = [
...     {'loss': [0.1, 0.09], 'val_loss': [0.11, 0.1], "accuracy": [0.9, 0.8], "val_accuracy": [0.8, 0.7]},
...     {'loss': [0.12, 0.11], 'val_loss': [0.13, 0.12], "accuracy": [0.8, 0.7], "val_accuracy": [0.7, 0.6]}
... ]
>>> MetricUtils.plot_every_metric_curves(all_history, metrics_names=["loss", "accuracy"], run_name="")
static find_best_x_and_plot(x_values: list[float], y_values: list[float], best_idx: int | None = None, smoothen: bool = True, use_steep: bool = True, run_name: str = '', x_label: str = 'Learning Rate', y_label: str = 'Loss', plot_title: str = 'Learning Rate Finder', log_x: bool = True, y_limits: tuple[float, ...] | None = None) → float[source]#

Find the best x value (where y is minimized) and plot the curve.

Parameters:
  • x_values (list[float]) – List of x values (e.g. learning rates)

  • y_values (list[float]) – List of corresponding y values (e.g. losses)

  • best_idx (int | None) – Index of the best x value (if None, a robust approach is used)

  • smoothen (bool) – Whether to apply smoothing to the y values

  • use_steep (bool) – Whether to use steepest slope strategy to determine best index

  • run_name (str) – Name of the run for saving the plot

  • x_label (str) – Label for the x-axis

  • y_label (str) – Label for the y-axis

  • plot_title (str) – Title for the plot

  • log_x (bool) – Whether to use a logarithmic x-axis (e.g. learning rate)

  • y_limits (tuple[float, ...] | None) – Limits for the y-axis, defaults to None (no limit)

Returns:

The best x value found (where y is minimized)

Return type:

float

This function creates a plot showing the relationship between x and y values to help identify the optimal x (where y is minimized). The plot can use a logarithmic x-axis for better visualization if desired.

The ideal x is typically found where y is still decreasing but before it starts to increase dramatically.

Examples

>>> x_values = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
>>> y_values = [0.1, 0.09, 0.07, 0.06, 0.09]
>>> best_x = MetricUtils.find_best_x_and_plot(x_values, y_values, use_steep=True)
>>> print(f"Best x: {best_x:.0e}")
Best x: 1e-03
>>> best_x = MetricUtils.find_best_x_and_plot(x_values, y_values, use_steep=False)
>>> print(f"Best x: {best_x:.0e}")
Best x: 1e-02
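
The two picks in the example can be understood as follows: without use_steep the best x is simply the one at the minimum y (1e-02 above), while the steepest-slope strategy favours the x where y is falling fastest, which lands one decade earlier (1e-03). A minimal sketch of both ideas on the same data, not the module's exact rule (which also applies smoothing):

>>> import numpy as np
>>> x_values = np.array([1e-5, 1e-4, 1e-3, 1e-2, 1e-1])
>>> y_values = np.array([0.1, 0.09, 0.07, 0.06, 0.09])
>>> float(x_values[np.argmin(y_values)])               # minimum-loss pick
0.01
>>> float(x_values[np.argmin(np.diff(y_values)) + 1])  # steepest-descent pick
0.001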