Source code for configuration.data.datafactory

"""Contains functions to prepare the data."""

# Copyright  2025  Institute of Light and Matter, CNRS UMR 5306, University Claude Bernard Lyon 1
# Contributors: Oscar DUFOUR, Maxime STAPELLE, Alexandre NICOLAS

# This software is a computer program designed to generate a realistic crowd from anthropometric data and
# simulate the mechanical interactions that occur within it and with obstacles.

# This software is governed by the CeCILL-B license under French law and abiding by the rules of distribution
# of free software.  You can  use, modify and/ or redistribute the software under the terms of the CeCILL-B
# license as circulated by CEA, CNRS and INRIA at the following URL "http://www.cecill.info".

# As a counterpart to the access to the source code and  rights to copy, modify and redistribute granted by
# the license, users are provided only with a limited warranty  and the software's author,  the holder of the
# economic rights,  and the successive licensors  have only  limited liability.

# In this respect, the user's attention is drawn to the risks associated with loading,  using,  modifying
# and/or developing or reproducing the software by the user in light of its specific status of free software,
# that may mean  that it is complicated to manipulate,  and  that  also therefore means  that it is reserved
# for developers  and  experienced professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their requirements in conditions enabling
# the security of their systems and/or data to be ensured and,  more generally, to use and operate it in the
# same conditions as regards security.

# The fact that you are presently reading this means that you have had knowledge of the CeCILL-B license and that
# you accept its terms.

import logging
from pathlib import Path
from typing import get_args

import numpy as np
import pandas as pd
from shapely.geometry import MultiPolygon

import configuration.utils.constants as cst
import configuration.utils.functions as fun
from configuration.utils.typing_custom import Sex



[docs]
def read_anthropometric_data(sex: Sex, data_dir_path: Path) -> pd.DataFrame:
    """
    Read and process anthropometric data from a sex-specific CSV file.

    The file ``<data_dir_path>/csv/ANSURII<SEX>Public.csv`` (``<SEX>`` being the upper-cased `sex`)
    is loaded, tagged with a "sex" column, stripped of rows whose weight is zero, and its
    four measurement columns are rescaled and renamed.

    Parameters
    ----------
    sex : Sex
        The sex of the individuals whose data is to be read ("male" or "female").
    data_dir_path : Path
        Path to the root data directory containing the "csv" subdirectory.

    Returns
    -------
    pd.DataFrame
        Processed DataFrame containing:

        - All columns of the CSV file, with the four measurement columns listed in the Notes
          rescaled and renamed to include their unit in brackets
        - An added "sex" column holding the value of `sex` on every row
        - Only the rows whose "Weightlbs" value is different from zero (the original row
          labels are kept, the index is not reset)

    Raises
    ------
    ValueError
        If the provided `sex` is not one of the values allowed by the `Sex` type.
    FileNotFoundError
        If the specified CSV file does not exist in the data directory.
    KeyError
        If one of the expected columns is missing from the CSV file.

    Notes
    -----
    - Performs the following unit conversions (factors are taken from `configuration.utils.constants`):
        * "Heightin" → "height [cm]" (multiplied by ``INCH_TO_CM``)
        * "Weightlbs" → "weight [kg]" (multiplied by ``LB_TO_KG``)
        * "chestdepth" → "chest depth [cm]" (multiplied by ``MM_TO_CM``)
        * "bideltoidbreadth" → "bideltoid breadth [cm]" (multiplied by ``MM_TO_CM``)
    """
    # Check if the sex is valid
    if sex not in get_args(Sex):
        raise ValueError("The sex should be either 'male' or 'female'.")

    # Read the CSV file
    csv_path = data_dir_path / "csv" / f"ANSURII{sex.upper()}Public.csv"
    df = pd.read_csv(csv_path, encoding="latin1")

    # Add a column sex (explicit object array, sized from the frame itself rather than from a data column)
    df["sex"] = np.full(len(df), sex, dtype=object)

    # Remove rows with zero weight. The explicit copy guarantees that the column assignments
    # below write to an independent frame and never to a slice of the frame that was read.
    df = df[df["Weightlbs"] != 0].copy()

    # Original column -> (new column name, multiplicative conversion factor)
    conversions = {
        "chestdepth": ("chest depth [cm]", cst.MM_TO_CM),  # mm to cm
        "bideltoidbreadth": ("bideltoid breadth [cm]", cst.MM_TO_CM),  # mm to cm
        "Heightin": ("height [cm]", cst.INCH_TO_CM),  # inches to cm
        "Weightlbs": ("weight [kg]", cst.LB_TO_KG),  # pounds to kg
    }

    # Standardize units in place (column positions are preserved), then rename in a single pass
    for original_name, (_, factor) in conversions.items():
        df[original_name] = df[original_name] * factor

    return df.rename(columns={original_name: new_name for original_name, (new_name, _) in conversions.items()})




[docs]
def prepare_anthropometric_data(data_dir_path: Path) -> None:
    """
    Build the combined anthropometric dataset and store it as a pickle file.

    The male table is read first, then the female table, both through `read_anthropometric_data`.
    The two tables are stacked into a single DataFrame whose index is renumbered from zero, and
    the result is written to ``<data_dir_path>/pkl/ANSUREIIPublic.pkl``.

    Parameters
    ----------
    data_dir_path : Path
        The path to the root data directory. It is passed on to `read_anthropometric_data`
        for the input files, and its "pkl" subdirectory receives the output pickle file.
    """
    # Order matters: male rows come before female rows in the combined table
    per_sex_tables = [read_anthropometric_data(sex, data_dir_path) for sex in ("male", "female")]
    combined = pd.concat(per_sex_tables, ignore_index=True)

    output_path = data_dir_path / "pkl" / "ANSUREIIPublic.pkl"
    fun.save_pickle(combined, output_path)




[docs]
def prepare_bike_data(data_dir_path: Path) -> None:
    """
    Convert the bike CSV table into a pickle file.

    The semicolon-separated file ``csv/geometrics.mtb-news.de.csv`` is loaded into a DataFrame
    and saved, without further transformation, as ``pkl/bike_data.pkl``.

    Parameters
    ----------
    data_dir_path : Path
        The path to the root data directory containing "csv" and "pkl" subdirectories.
    """
    source_csv = data_dir_path / "csv" / "geometrics.mtb-news.de.csv"
    target_pkl = data_dir_path / "pkl" / "bike_data.pkl"

    bike_table = pd.read_csv(source_csv, sep=";")
    fun.save_pickle(bike_table, target_pkl)




[docs]
def prepare_data() -> None:
    """
    Generate the preprocessed pickle files of the application when any of them is missing.

    The data directory is the "data" folder located under the fourth-level parent of this file.
    Its "pkl" subdirectory is expected to hold four files: ``bike_data.pkl``, ``ANSUREIIPublic.pkl``,
    ``male_3dBody_light.pkl`` and ``female_3dBody_light.pkl``. If all four exist, nothing is done.
    As soon as one is missing, the whole preparation is run again:

    1. Prepares anthropometric data by calling `prepare_anthropometric_data()`.
    2. Prepares bike data by calling `prepare_bike_data()`.
    3. Prepares 3D body data by calling `prepare_3D_body_data()`.

    Progress is reported through `logging.info`.
    """
    data_dir_path = Path(__file__).parent.parent.parent.parent.absolute() / "data"
    pkl_dir = data_dir_path / "pkl"

    # Files produced by the three preparation steps below
    required_files = (
        "bike_data.pkl",
        "ANSUREIIPublic.pkl",
        "male_3dBody_light.pkl",
        "female_3dBody_light.pkl",
    )

    # Nothing to do when every preprocessed file is already on disk
    if all((pkl_dir / file_name).exists() for file_name in required_files):
        return

    logging.info("Preparing anthropometric data and bike data...")
    prepare_anthropometric_data(data_dir_path)
    prepare_bike_data(data_dir_path)

    logging.info("Preparing 3D body data...")
    prepare_3D_body_data(data_dir_path)

    logging.info("Data prepared successfully")




[docs]
def prepare_3D_body_data(data_dir_path: Path) -> None:
    """
    Build a lighter version of the 3D body data by thinning the altitudes and simplifying the shapes.

    For each member of `cst.Sex`:

    1. Loads the original shapes (a mapping altitude -> MultiPolygon) from ``pkl/<sex name>_3dBody.pkl``.
    2. Builds a regular grid of target altitudes starting at 0.0, spaced by
       ``cst.DISTANCE_BTW_TARGET_KEYS_ALTITUDES`` and stopping before the highest stored altitude plus one.
    3. Walks the stored altitudes in increasing order and assigns each one to its nearest target altitude;
       only the first (i.e. lowest) stored altitude assigned to a given target is kept.
    4. Rebuilds each kept MultiPolygon from its Polygons simplified with ``simplify`` (Douglas-Peucker algorithm),
       using ``cst.POLYGON_TOLERANCE`` as tolerance and preserving topology.
    5. Saves the reduced mapping (keyed by the altitudes converted to float) to ``pkl/<sex name>_3dBody_light.pkl``.

    If the loaded mapping is empty, that sex is skipped and no output file is written for it.

    Parameters
    ----------
    data_dir_path : Path
        Path to root directory containing input/output subdirectories. Requires "pkl" subdirectory with original pickle files.

    Raises
    ------
    FileNotFoundError
        If the source pickle file for a sex is missing.
    """
    pkl_dir = data_dir_path / "pkl"

    for sex in cst.Sex:
        source_path = pkl_dir / f"{sex.name}_3dBody.pkl"
        if not source_path.exists():
            raise FileNotFoundError(f"Pickle file not found: {source_path}")

        full_shapes: dict[float, MultiPolygon] = fun.load_pickle(str(source_path))
        altitudes = sorted(map(float, full_shapes.keys()))
        if len(altitudes) == 0:
            continue

        # Regular grid of altitudes against which the stored altitudes are binned
        highest_altitude = altitudes[-1]
        target_altitudes = np.arange(0.0, highest_altitude + 1, cst.DISTANCE_BTW_TARGET_KEYS_ALTITUDES)

        light_shapes: dict[float, MultiPolygon] = {}
        claimed_targets = set()

        for altitude in altitudes:
            nearest_target = target_altitudes[np.argmin(np.abs(target_altitudes - altitude))]

            # A target altitude is represented only once: by the lowest stored altitude that maps to it
            if nearest_target in claimed_targets:
                continue

            simplified_parts = [
                part.simplify(tolerance=cst.POLYGON_TOLERANCE, preserve_topology=True)
                for part in full_shapes[altitude].geoms
            ]
            light_shapes[altitude] = MultiPolygon(simplified_parts)
            claimed_targets.add(nearest_target)

        fun.save_pickle(light_shapes, pkl_dir / f"{sex.name}_3dBody_light.pkl")