Source code for stouputils.data_science.scripts.augment_dataset


# Imports
import argparse
from typing import Literal

from ...decorators import handle_error, measure_time
from ...print import info
from ...io import clean_path
from ..config.get import DataScienceConfig
from ..data_processing.image_augmentation import ImageDatasetAugmentation
from ..data_processing.technique import ProcessingTechnique

# Constants
CONFIRMATION_HELP: str = "Don't ask for confirmation"
TYPE_HELP: str = "Type of data to augment"
INPUT_HELP: str = "Path to input dataset"
OUTPUT_HELP: str = "Path to save augmented dataset (Defaults to input path prefixed with 'aug_')"
PARSER_DESCRIPTION: str = "Command-line interface for augmenting a dataset with various techniques."
FINAL_DATASET_SIZE_HELP: str = "Final size of the dataset"


# Main function
@measure_time(info, "Total execution time of the script")
@handle_error(exceptions=(KeyboardInterrupt, Exception), error_log=DataScienceConfig.ERROR_LOG)
def augment_dataset(
    techniques: list[ProcessingTechnique],
    default_type: Literal["image"] = "image",
    default_input: str = f"{DataScienceConfig.DATA_FOLDER}/hip_implant",
    default_output: str = "",
    default_final_dataset_size: int = 1000,
) -> None:
    """ Augment a dataset with various data processing techniques.

    This script takes a dataset path and applies configurable processing techniques
    to generate an expanded dataset. The augmented data is saved to a destination path.
    The augmentation can be done for images or other data types.

    Args:
        techniques                 (list[ProcessingTechnique]): Processing techniques to apply to the dataset.
        default_type               (str): Default type of data to augment.
        default_input              (str): Default path to the input dataset.
        default_output             (str): Default path to save the augmented dataset.
        default_final_dataset_size (int): Default final size of the dataset.

    Returns:
        None: This function does not return anything.
    """
    info("Starting the script...")

    # Parse the arguments
    parser = argparse.ArgumentParser(description=PARSER_DESCRIPTION)
    parser.add_argument("-y", action="store_true", help=CONFIRMATION_HELP)
    parser.add_argument("--type", type=str, default=default_type, choices=["image"], help=TYPE_HELP)
    parser.add_argument("--input", type=str, default=default_input, help=INPUT_HELP)
    parser.add_argument("--output", type=str, default=default_output, help=OUTPUT_HELP)
    parser.add_argument("--final_dataset_size", type=int, default=default_final_dataset_size, help=FINAL_DATASET_SIZE_HELP)
    args: argparse.Namespace = parser.parse_args()
    data_type: str = args.type
    input_path: str = clean_path(args.input, trailing_slash=False)
    output_path: str = clean_path(args.output, trailing_slash=False)
    final_dataset_size: int = args.final_dataset_size
    info(f"Augmenting dataset from '{input_path}' to '{output_path}' with {final_dataset_size} samples")

    # Check if the output path is provided; if not,
    # use the input path prefixed with "aug_" (ex: .../data/hip_implant -> .../data/aug_hip_implant)
    if not output_path:
        splitted: list[str] = input_path.split("/")
        splitted[-1] = DataScienceConfig.AUGMENTED_DIRECTORY_PREFIX + splitted[-1]
        output_path = "/".join(splitted)
        info(f"Output path not provided, using variant of input path: '{output_path}'")

    # Augment the dataset
    if data_type == "image":
        augmentation = ImageDatasetAugmentation(final_dataset_size, techniques)
        augmentation.process_dataset(input_path, output_path, ignore_confirmation=args.y)
    return
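

# Example usage (hypothetical sketch, not part of the original module): only the
# augment_dataset() signature above is taken from the source; the empty techniques
# list is a placeholder and must be filled with real ProcessingTechnique objects
# provided by stouputils before the augmentation produces any variation.
if __name__ == "__main__":
    # Placeholder: build the list of processing techniques to apply
    # (e.g. image transformations defined in data_processing.technique).
    techniques: list[ProcessingTechnique] = []

    # Run the CLI entry point; defaults can be overridden on the command line
    # via --type, --input, --output and --final_dataset_size.
    augment_dataset(
        techniques,
        default_type="image",
        default_input=f"{DataScienceConfig.DATA_FOLDER}/hip_implant",
        default_output="",
        default_final_dataset_size=1000,
    )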