Source code for stouputils.data_science.scripts.preprocess_dataset
# Imports
import argparse
from typing import Literal
from ...decorators import handle_error, measure_time
from ...print import info
from ...io import clean_path
from ..config.get import DataScienceConfig
from ..data_processing.image_preprocess import ImageDatasetPreprocess
from ..data_processing.technique import ProcessingTechnique
# Constants
# Help texts and description handed to argparse by the command-line entry point
# defined in this module; kept at module level so the CLI wording lives in one place.
CONFIRMATION_HELP: str = "Don't ask for confirmation"
TYPE_HELP: str = "Type of data to preprocess"
INPUT_HELP: str = "Path to input dataset"
OUTPUT_HELP: str = "Path to save preprocessed dataset"
PARSER_DESCRIPTION: str = (
    "Command-line interface for preprocessing a dataset "
    "with various techniques."
)
# Main function
[docs]
@measure_time(info, "Total execution time of the script")
@handle_error(exceptions=(KeyboardInterrupt, Exception), error_log=DataScienceConfig.ERROR_LOG)
def preprocess_dataset(
    techniques: list[ProcessingTechnique],
    default_type: Literal["image"] = "image",
    default_input: str = f"{DataScienceConfig.DATA_FOLDER}/hip_implant",
    default_output: str = "",
) -> None:
    """ Command-line entry point that preprocesses a dataset with the given techniques.

    The function reads ``-y``, ``--type``, ``--input`` and ``--output`` from the
    command line (falling back to the ``default_*`` parameters), resolves the
    input and output paths, then delegates the work to ``ImageDatasetPreprocess``.

    When no output path is given, the output path is the input path with
    ``DataScienceConfig.PREPROCESSED_DIRECTORY_SUFFIX`` appended to it.

    Args:
        techniques (list[ProcessingTechnique]): Techniques handed to the dataset preprocessor.
        default_type (Literal["image"]): Value used for ``--type`` when the flag is absent.
        default_input (str): Value used for ``--input`` when the flag is absent.
        default_output (str): Value used for ``--output`` when the flag is absent;
            an empty string means "derive it from the input path".

    Returns:
        None: Results are produced by ``process_dataset``; nothing is returned here.
    """
    info("Starting the script...")

    # Build the command-line interface (argument order is kept as-is: it drives --help output)
    cli = argparse.ArgumentParser(description=PARSER_DESCRIPTION)
    cli.add_argument("-y", action="store_true", help=CONFIRMATION_HELP)
    cli.add_argument("--type", type=str, default=default_type, choices=["image"], help=TYPE_HELP)
    cli.add_argument("--input", type=str, default=default_input, help=INPUT_HELP)
    cli.add_argument("--output", type=str, default=default_output, help=OUTPUT_HELP)
    options: argparse.Namespace = cli.parse_args()

    # Resolve both paths through clean_path, asking for no trailing slash
    source_dir: str = clean_path(options.input, trailing_slash=False)
    target_dir: str = clean_path(options.output, trailing_slash=False)

    # No explicit output: reuse the input path and append the "preprocessed" suffix.
    # Appending to the whole path is the same as appending to its last "/" component.
    if not target_dir:
        target_dir = source_dir + DataScienceConfig.PREPROCESSED_DIRECTORY_SUFFIX
        info(f"Output path not provided, using variant of input path: '{target_dir}'")

    # Only image datasets are handled; any other type leaves nothing to do
    if options.type != "image":
        return

    # Preprocess the dataset
    image_preprocessor: ImageDatasetPreprocess = ImageDatasetPreprocess(techniques)
    image_preprocessor.process_dataset(source_dir, target_dir, ignore_confirmation=options.y)