Dataset Utilities

These utility functions are meant to help you do complex modifications to your recipe with a single step. We encourage you to see how these are implemented by opening the drop down below each function to see the source code.

For any modification to a recipe's ingredients, you need to get a Pantry of ingredients:

from leip_recipe_designer import Pantry
pantry = Pantry.build("./local_pantry")

Generic Data Generator Functions¶

leip_recipe_designer.helpers.data.new_data_generator_by_format ¶

new_data_generator_by_format(type: LoadableDataType, root_path, pantry, nclasses: Optional[int] = None)

Creates a data_generator ingredient of the provided type, given that the data is structured in the canonical format of the type specified.

Parameters:

type (LoadableDataType) –

One of the supported data types.
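nclasses (Optional[int], default: None ) –

Number of classes in your dataset, excluding the background class. If None, it is determined from the dataset at root_path by the format-specific get_classes_* helper.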
pantry –

The ingredients storage.
root_path –

The absolute path to the root directory of the dataset.

Source code in leip_recipe_designer/helpers/data.py

def new_data_generator_by_format(type: LoadableDataType, root_path, pantry, nclasses: Optional[int] = None):
    """
    Creates a data_generator ingredient of the provided type,
    given that the data is structured in the canonical format of the type specified.

    Parameters
    ----------
    type:
        One of the supported data types.
    root_path:
        The absolute path to the root directory of the dataset.
    pantry:
        The ingredients storage.
    nclasses:
        Number of classes in your dataset, excluding the background class.
        When None, it is obtained from the dataset through the format-specific
        `get_classes_*` helper.

    Returns
    -------
    data_generator:
        The ingredient returned by the format-specific `new_*_data_generator` helper.

    Raises
    ------
    ValueError
        If `type` is not one of PASCAL_VOC, COCO, YOLO or KITTI.
    """
    # Extra arguments passed only when the dataset is not already split on disk.
    auto_split = {"trainval_split_ratio": 0.80, "trainval_split_seed": 42}

    if type == LoadableDataType.PASCAL_VOC:
        if nclasses is None:
            nclasses = get_classes_voc(root_path)
        is_split = _check_is_split_VOC(root_path)
        split_kwargs = {} if is_split else auto_split
        data_gen = new_pascal_data_generator(
            pantry=pantry,
            root_path=root_path,
            nclasses=nclasses,
            is_split=is_split,
            **split_kwargs,
        )

    elif type == LoadableDataType.COCO:
        if nclasses is None:
            nclasses = get_classes_coco(root_path)
        data_gen = new_coco_data_generator(
            pantry=pantry,
            root_path=root_path,
            nclasses=nclasses,
        )

    elif type == LoadableDataType.YOLO:
        # The split check must run even when nclasses is supplied by the caller;
        # otherwise is_split would be unbound below.
        is_split = _check_is_split_YOLO(root_path)
        if nclasses is None:
            nclasses = get_classes_yolo(root_path, is_split=is_split)
        split_kwargs = {} if is_split else auto_split
        data_gen = new_yolo_data_generator(
            pantry=pantry,
            root_path=root_path,
            nclasses=nclasses,
            is_split=is_split,
            **split_kwargs,
        )

    elif type == LoadableDataType.KITTI:
        # NOTE(review): KITTI reuses the YOLO split/class helpers, as in the
        # original code - confirm this is intended.
        # As for YOLO, the split check runs regardless of nclasses.
        is_split = _check_is_split_YOLO(root_path)
        if nclasses is None:
            nclasses = get_classes_yolo(root_path, is_split=is_split)
        split_kwargs = {} if is_split else auto_split
        data_gen = new_kitti_data_generator(
            pantry=pantry,
            root_path=root_path,
            nclasses=nclasses,
            is_split=is_split,
            **split_kwargs,
        )

    else:
        raise ValueError("Unknown Format entered!")

    return data_gen

leip_recipe_designer.helpers.data.get_data_generator_by_name ¶

get_data_generator_by_name(pantry: IngredientCache, regex_ingredient_name: str) -> RecipeNode

Instantiates a single data_generator node based on a regex_ingredient_name. This method is used to instantiate some data that has already been integrated and offered. To list available off-the-shelf data run recipe.options("data_generator"). The returned value can replace another data_generator in a recipe via replace_data_generator(recipe, data_generator_node)

Parameters:

pantry (IngredientCache) –

The ingredients storage.
regex_ingredient_name (str) –

A string to regex match to a data_generator name.

Returns:

data_generator ( RecipeNode ) –

A data_generator.vision.detection.2d ingredient node.

Examples:

>>> from leip_recipe_designer.helpers.data import get_data_generator_by_name, replace_data_generator
>>> my_data = get_data_generator_by_name(pantry = mypantry, regex_ingredient_name = "Face Mask")
>>> replace_data_generator(some_existing_recipe, my_data)

Source code in leip_recipe_designer/helpers/data.py

def get_data_generator_by_name(pantry: "IngredientCache", regex_ingredient_name: str) -> "RecipeNode":
    """Instantiate one off-the-shelf data_generator node selected by name.

    The name is matched as a regular expression against the data_generator
    ingredients in the `data_generator.vision.detection.2d` category.
    Run `recipe.options("data_generator")` to list the available data.
    The node returned here can be swapped into a recipe with
    `replace_data_generator(recipe, data_generator_node)`.

    Parameters
    ----------
    pantry:
        The ingredients storage.
    regex_ingredient_name:
        A string to regex match to a data_generator name.

    Returns
    -------
    data_generator:
        A `data_generator.vision.detection.2d` ingredient node.

    Examples
    --------
    >>> from leip_recipe_designer.helpers.data import get_data_generator_by_name, replace_data_generator
    >>> my_data = get_data_generator_by_name(pantry = mypantry, regex_ingredient_name = "Face Mask")
    >>> replace_data_generator(some_existing_recipe, my_data)
    """
    category = "data_generator.vision.detection.2d"
    node = instantiate_single_node_by_category(category, regex_ingredient_name, pantry)
    return node

Data Generator Manipulation¶

leip_recipe_designer.helpers.data.replace_data_generator ¶

replace_data_generator(recipe: RecipeNode, data: RecipeNode, keep_recipe_augmentation: bool = True, keep_recipe_composition: bool = True)

Replace the current dataset with the provided one.

Parameters:

recipe (RecipeNode) –

The recipe to be modified.
data (RecipeNode) –

The data_generator ingredient node.
keep_recipe_augmentation (bool, default: True ) –

If True, the data augmentations of the current dataset are transferred to the newly added ingredient.
keep_recipe_composition (bool, default: True ) –

If True, the composite transformation or its absence in the recipe will be preserved; otherwise, it'll be borrowed from the provided data.

Examples:

>>> from leip_recipe_designer.helpers.data import new_pascal_data_generator, replace_data_generator
>>> my_data = new_pascal_data_generator(pantry=pantry, root_path="/some/place", nclasses=1)
>>> replace_data_generator(some_existing_recipe, my_data)

Source code in leip_recipe_designer/helpers/data.py

def replace_data_generator(
    recipe: "RecipeNode",
    data: "RecipeNode",
    keep_recipe_augmentation: bool = True,
    keep_recipe_composition: bool = True,
):
    """Swap the dataset used by a recipe for the provided data_generator.

    The recipe is modified in place; nothing is returned.

    Parameters
    ----------
    recipe:
        The recipe to be modified.
    data:
        The `data_generator` ingredient node.
    keep_recipe_augmentation:
        If True, the training and validation augmentations currently in the
        recipe are saved before the swap and written back afterwards.
    keep_recipe_composition:
        If True, the composite transformation or its absence in the recipe will be preserved;
        otherwise, it'll be borrowed from the provided data.

    Raises
    ------
    KeyError
        If the plain `^data_generator$` slot is targeted and the recipe does
        not contain exactly one match for it.

    Examples
    --------
    >>> from leip_recipe_designer.helpers.data import new_pascal_data_generator, replace_data_generator
    >>> my_data = new_pascal_data_generator(pantry=pantry, root_path="/some/place", nclasses=1)
    >>> replace_data_generator(some_existing_recipe, my_data)

    """
    mosaic_foreground = "^data_generator.composite.mosaic.foreground$"
    augmentation_keys = ("data.augmentation.training", "data.augmentation.validation")

    recipe_is_mosaiced = _is_mosaiced(recipe)
    data_is_mosaiced = _is_mosaiced(data)

    # Pick the slot to overwrite: the mosaic foreground when the recipe's
    # composition is kept, otherwise the (unique) top-level data_generator.
    target = "^data_generator$"
    if recipe_is_mosaiced and keep_recipe_composition:
        target = mosaic_foreground
    else:
        found = recipe.iterate_matches(target)
        if len(found) != 1:
            msg = f"Found zero or more than one {target} in recipe"
            logger.error(msg)
            raise KeyError(msg)

    # When the recipe's composition wins, only the foreground of a mosaiced
    # data node is used.
    if data_is_mosaiced and keep_recipe_composition:
        data = data[mosaic_foreground]

    # Snapshot the augmentations before the swap so they can be restored after.
    saved_augmentations = None
    if keep_recipe_augmentation:
        saved_augmentations = {key: recipe[key]._portable() for key in augmentation_keys}

    recipe[target] = data

    if keep_recipe_augmentation:
        for key, portable in saved_augmentations.items():
            recipe[key] = portable

    # keep for backward compatibility
    training_key = "data.augmentation.training"
    final_flag = "data.normalization.is_final"
    if final_flag in recipe[training_key]:
        recipe[training_key][final_flag] = not _is_mosaiced(recipe)

leip_recipe_designer.helpers.data.attach_fiftyone_data_generator ¶

attach_fiftyone_data_generator(pantry: IngredientCache, dataset_name: str, nclasses: int, label_map: Optional[str] = None, view_name_train: Optional[str] = 'train_view', view_name_val: Optional[str] = 'val_view', groundtruth_field_name: Optional[str] = 'ground_truth') -> RecipeNode

Creates a data generator for your FiftyOne Dataset

Parameters:

pantry (IngredientCache) –

The ingredients storage.
dataset_name (str) –

The name used while creating the FiftyOne Dataset.
nclasses (int) –

The number of classes in your dataset.
view_name_train (Optional[str], default: 'train_view' ) –

The name of a view on the FiftyOne Dataset you want to use for training.
view_name_val (Optional[str], default: 'val_view' ) –

The name of a view on the FiftyOne Dataset you want to use for evaluating.
groundtruth_field_name (Optional[str], default: 'ground_truth' ) –

The field name for ground truth in your FiftyOne Dataset (Print out a sample of the dataset for a detailed description).
label_map (Optional[str], default: None ) –

A dictionary enumerating the class names and the indices to map them to, as `{"class_0_name": 0, "class_1_name": 1,...}`

Returns:

data_generator ( RecipeNode ) –

A data_generator.vision.detection.2d ingredient node.

Source code in leip_recipe_designer/helpers/data.py

def attach_fiftyone_data_generator(
    pantry: "IngredientCache",
    dataset_name: str,
    nclasses: int,
    label_map: Optional[str] = None,
    view_name_train: Optional[str] = "train_view",
    view_name_val: Optional[str] = "val_view",
    groundtruth_field_name: Optional[str] = "ground_truth",
) -> "RecipeNode":
    """
    Build a data generator ingredient that attaches to a FiftyOne Dataset.

    The "Attach FiftyOne Dataset" ingredient is instantiated from the pantry
    and configured with the values given here.

    Parameters
    ----------
    pantry:
        The ingredients storage.
    dataset_name:
        The name used while creating the FiftyOne Dataset.
    nclasses:
        The number of classes in your dataset.
    label_map:
        A dictionary enumerating the class names and the indices to map them to,
        as `{"class_0_name": 0, "class_1_name": 1,...}`.
        NOTE(review): the signature annotates this as Optional[str] - confirm
        which form is expected.
    view_name_train:
        The name of a view on the FiftyOne Dataset you want to use for training.
    view_name_val:
        The name of a view on the FiftyOne Dataset you want to use for evaluating.
    groundtruth_field_name:
        The field name for ground truth in your FiftyOne Dataset
        (Print out a sample of the dataset for a detailed description).

    Returns
    -------
    data_generator:
        A `data_generator.vision.detection.2d` ingredient node.
    """
    ingredient = instantiators.instantiate_single_node_by_category(
        "data_generator.vision.detection.2d",
        "Attach FiftyOne Dataset",
        pantry,
    )

    # Applied in insertion order, one ingredient key per argument.
    settings = {
        "data.number_of_classes": nclasses,
        "data_generator.dataset_name": dataset_name,
        "data.fiftyone_train_view": view_name_train,
        "data.fiftyone_val_view": view_name_val,
        "data.fiftyone_groundtruth_field_name": groundtruth_field_name,
        "data.fiftyone_label_map": label_map,
    }
    for key, value in settings.items():
        ingredient[key] = value

    logger.info("The created ingredient is: \n")
    logger.info(ingredient)

    return ingredient

Format-Specific Data Generators¶

leip_recipe_designer.helpers.data.new_pascal_data_generator ¶

new_pascal_data_generator(pantry: IngredientCache, root_path: str, nclasses: int, is_split: bool = True, annotations_dir: str = 'Annotations', images_dir: str = 'JPEGImages', trainval_split_ratio: Optional[float] = None, trainval_split_seed: Optional[int] = None, train_set: Optional[str] = 'ImageSets/train.txt', val_set: Optional[str] = 'ImageSets/val.txt', dataset_name: str = 'pascal-like-data', image_extension: Optional[str] = 'jpg', download_url: Optional[str] = None, image_filename_from_xml_contents: bool = True) -> RecipeNode

Creates a new data_generator ingredient to ingest new pascal formatted detection dataset.

Parameters:

pantry (IngredientCache) –

The ingredients storage.
root_path (str) –

The absolute path to the root directory of the dataset.
annotations_dir (str, default: 'Annotations' ) –

The path to folder containing xml files, relative to root_path.
images_dir (str, default: 'JPEGImages' ) –

The path to folder containing only images, relative to root_path.
nclasses (int) –

The number of classes in your dataset.
is_split (bool, default: True ) –

True or False. If set to True, text files containing the list of samples for training and validation should be specified using train_set and val_set. If set to false, data will be split by the ingestor given the trainval_split_ratio and trainval_split_seed
trainval_split_ratio (Optional[float], default: None ) –

The ratio to use to split the dataset. Used only if is_split: false
trainval_split_seed (Optional[int], default: None ) –

The seed to use to pseudo randomly split the dataset. Used only if is_split: false
train_set (Optional[str], default: 'ImageSets/train.txt' ) –

Used only if is_split: true. The path to the text file containing the names (no extensions) of the training samples.
val_set (Optional[str], default: 'ImageSets/val.txt' ) –

Used only if is_split: true. The path to the text file containing the names (no extensions) of the validation samples.
dataset_name (str, default: 'pascal-like-data' ) –

This string will be used to name any generated artifacts.
image_extension (Optional[str], default: 'jpg' ) –

File extension of the images.
download_url (Optional[str], default: None ) –

URL to download the dataset from. If data is not already on root_path, it can download into root path.
image_filename_from_xml_contents (bool, default: True ) –

If True, the image name is retrieved from the XML annotations. Otherwise, the XML filename + the provided image extension is used.

Returns:

data_generator ( RecipeNode ) –

A data_generator.vision.detection.2d ingredient node.

Examples:

The example of the dataset structure could be the following:

my_dataset
├── Annotations
│   ├── image1.xml
│   └── image2.xml
├── JPEGImages
│   ├── image1.jpg
│   └── image2.jpg
└── ImageSets
    ├── train.txt
    └── val.txt

# Annotations/image1.xml
<annotation>
    <folder></folder>
    <filename>image1.jpg</filename>
    <size>
        <width>image1_width</width>
        <height>image1_height</height>
        <depth>image1_num_channels</depth>
    </size>
    <object>
        <name>class_name1</name>
        ...
        <bndbox>
            <xmin>bounding_box_1_xmin</xmin>
            <ymin>bounding_box_1_ymin</ymin>
            <xmax>bounding_box_1_xmax</xmax>
            <ymax>bounding_box_1_ymax</ymax>
        </bndbox>
        ...
    </object>
</annotation>

# ImageSets is an optional folder and provides a split list of image names for each dataset.
# ImageSets/train.txt
image1
image2

You can create a new data generator for this type of structure by running the following:

from leip_recipe_designer.helpers.data import new_pascal_data_generator, replace_data_generator

new_data = new_pascal_data_generator(
    pantry=pantry,
    root_path="/my/path/my_dataset",
    nclasses=1,
    )
replace_data_generator(recipe, new_data)

Source code in leip_recipe_designer/helpers/data.py

def new_pascal_data_generator(
    pantry: "IngredientCache",
    root_path: str,
    nclasses: int,
    is_split: bool = True,
    annotations_dir: str = "Annotations",
    images_dir: str = "JPEGImages",
    trainval_split_ratio: Optional[float] = None,
    trainval_split_seed: Optional[int] = None,
    train_set: Optional[str] = "ImageSets/train.txt",
    val_set: Optional[str] = "ImageSets/val.txt",
    dataset_name: str = "pascal-like-data",
    image_extension: Optional[str] = "jpg",
    download_url: Optional[str] = None,
    image_filename_from_xml_contents: bool = True,
) -> "RecipeNode":
    """
    Build a data_generator ingredient for a PASCAL formatted detection dataset.

    Parameters
    ----------
    pantry:
        The ingredients storage.
    root_path:
        The absolute path to the root directory of the dataset.
    nclasses:
        The number of classes in your dataset.
    is_split:
        True or False. If set to True, text files containing the list of samples for training and
        validation should be specified using train_set and val_set. If set to False,
        data will be split by the ingestor given the trainval_split_ratio and trainval_split_seed.
    annotations_dir:
        The path to folder containing xml files, relative to root_path.
    images_dir:
        The path to folder containing only images, relative to root_path.
    trainval_split_ratio:
        The ratio to use to split the dataset. Used only if is_split: false
    trainval_split_seed:
        The seed to use to pseudo randomly split the dataset. Used only if is_split: false
    train_set:
        Used only if is_split: true.
        The path to the text file containing the names (no extensions) of the training samples.
    val_set:
        Used only if is_split: true.
        The path to the text file containing the names (no extensions) of the validation samples.
    dataset_name:
        This string will be used to name any generated artifacts.
    image_extension:
        File extension of the images.
    download_url:
        URL to download the dataset from. If data is not already on root_path,
        it can download into root path. When given, the
        "BYOD from Url - PASCAL format" ingredient is used instead of
        "BYOD - PASCAL format".
    image_filename_from_xml_contents:
        If True, the image name is retrieved from the XML annotations.
        Otherwise, the XML filename + the provided image extension is used.

    Returns
    -------
    data_generator:
        A `data_generator.vision.detection.2d` ingredient node.

    Examples
    --------
    The example of the dataset structure could be the following:

    ```
    my_dataset
    ├── Annotations
    │   ├── image1.xml
    │   └── image2.xml
    ├── JPEGImages
    │   ├── image1.jpg
    │   └── image2.jpg
    └── ImageSets
        ├── train.txt
        └── val.txt

    # Annotations/image1.xml
    <annotation>
        <folder></folder>
        <filename>image1.jpg</filename>
        <size>
            <width>image1_width</width>
            <height>image1_height</height>
            <depth>image1_num_channels</depth>
        </size>
        <object>
            <name>class_name1</name>
            ...
            <bndbox>
                <xmin>bounding_box_1_xmin</xmin>
                <ymin>bounding_box_1_ymin</ymin>
                <xmax>bounding_box_1_xmax</xmax>
                <ymax>bounding_box_1_ymax</ymax>
            </bndbox>
            ...
        </object>
    </annotation>

    # ImageSets is an optional folder and provides a split list of image names for each dataset.
    # ImageSets/train.txt
    image1
    image2
    ```

    You can create a new data generator for this type of structure by running the following:

    ``` py
    from leip_recipe_designer.helpers.data import new_pascal_data_generator, replace_data_generator

    new_data = new_pascal_data_generator(
        pantry=pantry,
        root_path="/my/path/my_dataset",
        nclasses=1,
        )
    replace_data_generator(recipe, new_data)
    ```
    """
    # A download URL selects the ingredient variant that can fetch the data.
    if download_url is None:
        ingredient_name = "BYOD - PASCAL format"
    else:
        ingredient_name = "BYOD from Url - PASCAL format"

    ingredient = instantiators.instantiate_single_node_by_category(
        "data_generator.vision.detection.2d", ingredient_name, pantry
    )

    # Applied in insertion order, one ingredient key per argument.
    settings = {
        "data_generator.root_path": root_path,
        "data_generator.annotation_subdirectory": annotations_dir,
        "data_generator.image_subdirectory": images_dir,
        "data_generator.image_extension": image_extension,
        "data.number_of_classes": nclasses,
        "data_generator.data_is_presplit": is_split,
        "data_generator.train_val_split_ratio": trainval_split_ratio,
        "data_generator.split_seed": trainval_split_seed,
        "data_generator.dataset_name": dataset_name,
        "data_generator.training_set_file": train_set,
        "data_generator.validation_set_file": val_set,
        "data_generator.image_filename_from_xml_contents": image_filename_from_xml_contents,
    }
    for key, value in settings.items():
        ingredient[key] = value

    if download_url is not None:
        ingredient["data_generator.download_url"] = download_url

    logger.info("The created ingredient is: \n")
    logger.info(ingredient)

    return ingredient

leip_recipe_designer.helpers.data.new_coco_data_generator ¶

new_coco_data_generator(pantry: IngredientCache, root_path: str, nclasses: int, train_annotations_json: str = 'annotations/instances_train.json', val_annotations_json: str = 'annotations/instances_val.json', train_images_dir: str = 'train', val_images_dir: str = 'val', label_indexing: str = '1-indexed-no-background', dataset_name: str = 'coco-like-data', download_url: Optional[str] = None) -> RecipeNode

Creates a new data_generator ingredient to ingest new COCO formatted detection dataset.

Parameters:

pantry (IngredientCache) –

The ingredients storage.
root_path (str) –

The absolute path to the root directory of the dataset.
train_annotations_json (str, default: 'annotations/instances_train.json' ) –

Path to .json file of training annotations, relative to root_path.
val_annotations_json (str, default: 'annotations/instances_val.json' ) –

Path to .json file of validation annotations, relative to root_path.
train_images_dir (str, default: 'train' ) –

Path to folder containing only training images, relative to root_path.
val_images_dir (str, default: 'val' ) –

Path to folder containing only validation images, relative to root_path.
nclasses (int) –

The number of classes in your dataset.
label_indexing (str, default: '1-indexed-no-background' ) –

Parameter that helps the data ingestor recognize if there is a background class. One of 0-indexed-no-background, 1-indexed-no-background, 0-indexed-with-background.
dataset_name (str, default: 'coco-like-data' ) –

This string will be used to name any generated artifacts.
download_url (Optional[str], default: None ) –

URL to download the dataset from. If data is not already on root_path, it can download into root path.

Returns:

data_generator ( RecipeNode ) –

A data_generator.vision.detection.2d ingredient node.

Examples:

The example of the dataset structure could be the following:

my_dataset
├── annotations
│   ├── instances_train.json
│   └── instances_val.json
├── train
│   ├── image1.jpg
│   └── image2.jpg
└── val
    ├── image3.jpg
    └── image4.jpg


# annotations/instances_train.json
{
    "images": [
        {
            "id": 1,
            "width": image1_width,
            "height": image1_height,
            "file_name": image1.jpg
        }
        ...
    ],
    "annotations": [
        {
            "id": 1,
            "image_id": 1,
            "category_id": 1,
            "bbox": [xmin, ymin, width, height],
            "area": bbox_width x bbox_height,
            "iscrowd": 0
        }
        ...
    ],
    "categories": [
        {
            "id": 1,
            "name": class_name1
        }
        ...
    ]
}

You can create a new data generator for this type of structure by running the following:

from leip_recipe_designer.helpers.data import new_coco_data_generator, replace_data_generator

new_data = new_coco_data_generator(
    pantry=pantry,
    root_path="/my/path/my_dataset",
    nclasses=1,
)
replace_data_generator(recipe, new_data)

Source code in leip_recipe_designer/helpers/data.py

def new_coco_data_generator(
    pantry: "IngredientCache",
    root_path: str,
    nclasses: int,
    train_annotations_json: str = "annotations/instances_train.json",
    val_annotations_json: str = "annotations/instances_val.json",
    train_images_dir: str = "train",
    val_images_dir: str = "val",
    label_indexing: str = "1-indexed-no-background",
    dataset_name: str = "coco-like-data",
    download_url: Optional[str] = None,
) -> "RecipeNode":
    """
    Build a data_generator ingredient for a COCO formatted detection dataset.

    Parameters
    ----------
    pantry:
        The ingredients storage.
    root_path:
        The absolute path to the root directory of the dataset.
    nclasses:
        The number of classes in your dataset.
    train_annotations_json:
        Path to .json file of training annotations, relative to `root_path`.
    val_annotations_json:
        Path to .json file of validation annotations, relative to `root_path`.
    train_images_dir:
        Path to folder containing only training images, relative to `root_path`.
    val_images_dir:
        Path to folder containing only validation images, relative to `root_path`.
    label_indexing:
        Parameter that helps the data ingestor recognize if there is a background class.
        One of 0-indexed-no-background, 1-indexed-no-background, 0-indexed-with-background.
    dataset_name:
        This string will be used to name any generated artifacts.
    download_url:
        URL to download the dataset from. If data is not already on root_path,
        it can download into root path. When given, the
        "BYOD from Url - COCO format" ingredient is used instead of
        "BYOD - COCO format".

    Returns
    -------
    data_generator:
        A `data_generator.vision.detection.2d` ingredient node.

    Examples
    --------
    The example of the dataset structure could be the following:

    ```
    my_dataset
    ├── annotations
    │   ├── instances_train.json
    │   └── instances_val.json
    ├── train
    │   ├── image1.jpg
    │   └── image2.jpg
    └── val
        ├── image3.jpg
        └── image4.jpg


    # annotations/instances_train.json
    {
        "images": [
            {
                "id": 1,
                "width": image1_width,
                "height": image1_height,
                "file_name": image1.jpg
            }
            ...
        ],
        "annotations": [
            {
                "id": 1,
                "image_id": 1,
                "category_id": 1,
                "bbox": [xmin, ymin, width, height],
                "area": bbox_width x bbox_height,
                "iscrowd": 0
            }
            ...
        ],
        "categories": [
            {
                "id": 1,
                "name": class_name1
            }
            ...
        ]
    }
    ```

    You can create a new data generator for this type of structure by running the following:

    ``` py
    from leip_recipe_designer.helpers.data import new_coco_data_generator, replace_data_generator

    new_data = new_coco_data_generator(
        pantry=pantry,
        root_path="/my/path/my_dataset",
        nclasses=1,
    )
    replace_data_generator(recipe, new_data)

    ```
    """
    # A download URL selects the ingredient variant that can fetch the data.
    if download_url is None:
        ingredient_name = "BYOD - COCO format"
    else:
        ingredient_name = "BYOD from Url - COCO format"

    ingredient = instantiators.instantiate_single_node_by_category(
        "data_generator.vision.detection.2d", ingredient_name, pantry
    )

    # Applied in insertion order, one ingredient key per argument.
    settings = {
        "data_generator.root_path": root_path,
        "data_generator.train_annotations_json": train_annotations_json,
        "data_generator.val_annotations_json": val_annotations_json,
        "data_generator.train_images_dir": train_images_dir,
        "data_generator.val_images_dir": val_images_dir,
        "data_generator.label_indexing": label_indexing,
        "data.number_of_classes": nclasses,
        "data_generator.dataset_name": dataset_name,
    }
    for key, value in settings.items():
        ingredient[key] = value

    if download_url is not None:
        ingredient["data_generator.download_url"] = download_url

    logger.info("The created ingredient is: \n")
    logger.info(ingredient)

    return ingredient

leip_recipe_designer.helpers.data.new_yolo_data_generator ¶

new_yolo_data_generator(pantry: IngredientCache, root_path: str, nclasses: int, annotations_dir: str = 'labels', images_dir: str = 'images', is_split: bool = True, trainval_split_ratio: Optional[float] = None, trainval_split_seed: Optional[int] = None, dataset_name: str = 'yolo-like-data', train_subdir: Optional[str] = 'train', val_subdir: Optional[str] = 'val', download_url: Optional[str] = None) -> RecipeNode

Creates a new data_generator ingredient to ingest new YOLO formatted detection dataset.

Expected folder structure for split data:
|---root_path
|------train_subdir (relative to root_path)
|---------images_dir (relative to train_subdir)
|---------annotations_dir (relative to train_subdir)
|------val_subdir (relative to root_path)
|---------images_dir (relative to val_subdir)
|---------annotations_dir (relative to val_subdir)

Expected folder structure for not split data:
|---root_path
|------images_dir (relative to root_path)
|------annotations_dir (relative to root_path)

Parameters:

pantry (IngredientCache) –

The ingredients storage.
root_path (str) –

The absolute path to the root directory of the dataset.
annotations_dir (str, default: 'labels' ) –

Relative path to folder containing txt files, one per sample, samplename.txt. Each txt file will have one row per bounding box, formatted as label_index, x_center, y_center, box_width, box_height
images_dir (str, default: 'images' ) –

Relative path to folder containing only images.
nclasses (int) –

The number of classes in your dataset.
is_split (bool, default: True ) –

True or False. If set to True, subfolders containing training and validation images and annotations need to be specified with train_subdir and val_subdir. If set to false, data will be split by the ingestor using the trainval_split_ratio and trainval_split_seed
trainval_split_ratio (Optional[float], default: None ) –

The ratio to use to split the dataset. Used only if is_split: false
trainval_split_seed (Optional[int], default: None ) –

The seed to use to pseudo randomly split the dataset. Used only if is_split: false
train_subdir (Optional[str], default: 'train' ) –

Used only if is_split: true. The subfolder, relative to root_path, that contains the training images_dir and annotations_dir.
val_subdir (Optional[str], default: 'val' ) –

Used only if is_split: true. The subfolder, relative to root_path, that contains the validation images_dir and annotations_dir.
dataset_name (str, default: 'yolo-like-data' ) –

This string will be used to name any generated artifacts.
download_url (Optional[str], default: None ) –

URL to download the dataset from. If data is not already on root_path, it can download into root path.

Returns:

data_generator ( RecipeNode ) –

A data_generator.vision.detection.2d ingredient node.

Examples:

The example of the dataset structure could be the following:

my_dataset
├── train
│   ├── images
│   │   ├── image1.jpg
│   │   └── image2.jpg
│   └── labels
│       ├── image1.txt
│       └── image2.txt
└── val
    ├── images
    │   ├── image3.jpg
    │   └── image4.jpg
    └── labels
        ├── image3.txt
        └── image4.txt


# train/labels/image1.txt
class_name1 bbox1_x_center bbox1_y_center bbox1_width bbox1_height
class_name1 bbox2_x_center bbox2_y_center bbox2_width bbox2_height

You can create a new data generator for this type of structure by running the following:

from leip_recipe_designer.helpers.data import new_yolo_data_generator, replace_data_generator

new_data = new_yolo_data_generator(
    pantry=pantry,
    root_path="/my/path/my_dataset",
    nclasses=1,
    dataset_name="my-custom-name",
)

replace_data_generator(recipe, new_data)

Source code in leip_recipe_designer/helpers/data.py

def new_yolo_data_generator(
    pantry: "IngredientCache",
    root_path: str,
    nclasses: int,
    annotations_dir: str = "labels",
    images_dir: str = "images",
    is_split: bool = True,
    trainval_split_ratio: Optional[float] = None,
    trainval_split_seed: Optional[int] = None,
    dataset_name: str = "yolo-like-data",
    train_subdir: Optional[str] = "train",
    val_subdir: Optional[str] = "val",
    download_url: Optional[str] = None,
) -> "RecipeNode":
    """
    Creates a new data_generator ingredient to ingest a YOLO formatted detection dataset.
    ```
    Expected folder structure for split data:
    |---root_path
    |------train_subdir (relative to root_path)
    |---------images_dir (relative to train_subdir)
    |---------annotations_dir (relative to train_subdir)
    |------val_subdir (relative to root_path)
    |---------images_dir (relative to val_subdir)
    |---------annotations_dir (relative to val_subdir)

    Expected folder structure for not split data:
    |---root_path
    |------images_dir (relative to root_path)
    |------annotations_dir (relative to root_path)
    ```
    Parameters
    ----------
    pantry:
        The ingredients storage.
    root_path:
        The absolute path to the root directory of the dataset.
    annotations_dir:
        Relative path to folder containing txt files, one per sample, samplename.txt.
        Each txt file will have one row per bounding box, formatted as
        `label_index, x_center, y_center, box_width, box_height`
    images_dir:
        Relative path to folder containing only images.
    nclasses:
        The number of classes in your dataset.
    is_split:
        True or False. If set to True, subfolders containing training and validation
        images and annotations need to be specified with `train_subdir` and `val_subdir`.
        If set to false, data will be split by the ingestor using the trainval_split_ratio
        and trainval_split_seed
    trainval_split_ratio:
        The ratio to use to split the dataset. Used only if is_split: false
    trainval_split_seed:
        The seed to use to pseudo randomly split the dataset. Used only if is_split: false
    train_subdir:
        Used only if is_split: true
        The subfolder, relative to root_path, containing the training images and annotations.
    val_subdir:
        Used only if is_split: true
        The subfolder, relative to root_path, containing the validation images and annotations.
    dataset_name:
        This string will be used to name any generated artifacts.
    download_url:
        URL to download the dataset from. If data is not already on root_path,
        it can download into root path.

    Returns
    -------
    data_generator:
        A `data_generator.vision.detection.2d` ingredient node.

    Raises
    ------
    AssertionError
        If `is_split` is True and `train_subdir` or `val_subdir` is None.

    Examples
    --------
    The example of the dataset structure could be the following:

    ```
    my_dataset
    ├── train
    │   ├── images
    |   │   ├── image1.jpg
    |   │   └── image2.jpg
    │   └── labels
    |       ├── image1.txt
    |       └── image2.txt
    └── val
        ├── images
        │   ├── image3.jpg
        │   └── image4.jpg
        └── labels
            ├── image3.txt
            └── image4.txt


    # train/labels/image1.txt
    label_index bbox1_x_center bbox1_y_center bbox1_width bbox1_height
    label_index bbox2_x_center bbox2_y_center bbox2_width bbox2_height
    ```

    You can create a new data generator for this type of structure by running the following:

    ``` py
    from leip_recipe_designer.helpers.data import new_yolo_data_generator, replace_data_generator

    new_data = new_yolo_data_generator(
        pantry=pantry,
        root_path="/my/path/my_dataset",
        nclasses=1,
        dataset_name="my-custom-name",
    )

    replace_data_generator(recipe, new_data)
    ```

    """
    # Validate before touching the pantry. An explicit raise is used instead of
    # an `assert` statement because asserts are removed when Python runs with -O,
    # which would let None sub-directories slip into the ingredient. The
    # exception type is kept as AssertionError so existing callers are unaffected.
    if is_split and (train_subdir is None or val_subdir is None):
        raise AssertionError("train_subdir and val_subdir cannot be 'None' when is_split=True")

    domain = "data_generator.vision.detection.2d"
    # The downloadable variant of the ingredient is selected only when a URL is given.
    my_byod = "BYOD - YOLO format" if download_url is None else "BYOD from Url - YOLO format"

    ingredient = instantiators.instantiate_single_node_by_category(domain, my_byod, pantry)
    ingredient["data.number_of_classes"] = nclasses
    ingredient["data_generator.root_path"] = root_path
    ingredient["data_generator.train_subdir"] = train_subdir
    ingredient["data_generator.val_subdir"] = val_subdir
    ingredient["data_generator.image_subdirectory"] = images_dir
    ingredient["data_generator.annotation_subdirectory"] = annotations_dir
    ingredient["data_generator.data_is_presplit"] = is_split
    ingredient["data_generator.train_val_split_ratio"] = trainval_split_ratio
    ingredient["data_generator.split_seed"] = trainval_split_seed
    ingredient["data_generator.dataset_name"] = dataset_name

    # The download option is only set on the "from Url" ingredient variant.
    if download_url is not None:
        ingredient["data_generator.download_url"] = download_url

    logger.info("The created ingredient is: \n")
    logger.info(ingredient)

    return ingredient

leip_recipe_designer.helpers.data.new_kitti_data_generator ¶

new_kitti_data_generator(pantry: IngredientCache, root_path: str, nclasses: int, labels_dir: str = 'labels', images_dir: str = 'images', is_split: bool = True, sub_dir: Optional[str] = None, trainval_split_ratio: Optional[float] = None, trainval_split_seed: Optional[int] = 42, train_subdir: Optional[str] = 'train', val_subdir: Optional[str] = 'val', dataset_name: str = 'kitti-like-data', download_url: Optional[str] = None) -> RecipeNode

Creates a new data_generator ingredient to ingest a new KITTI formatted detection dataset.

Parameters:

pantry (IngredientCache) –

The ingredients storage.
root_path (str) –

The absolute path to the root directory of the dataset.
sub_dir (Optional[str], default: None ) –

The path to subdirectory, relative to root_path.
labels_dir (str, default: 'labels' ) –

The path to folder containing only labels, relative to root_path.
images_dir (str, default: 'images' ) –

The path to folder containing only images, relative to root_path.
nclasses (int) –

The number of classes in your dataset.
is_split (bool, default: True ) –

True or False. If set to True, the subfolders containing the training and validation samples should be specified using train_subdir and val_subdir. If set to false, data will be split by the ingestor given the trainval_split_ratio and trainval_split_seed
trainval_split_ratio (Optional[float], default: None ) –

The ratio to use to split the dataset. Used only if is_split: false
trainval_split_seed (Optional[int], default: 42 ) –

The seed to use to pseudo randomly split the dataset. Used only if is_split: false
train_subdir (Optional[str], default: 'train' ) –

Used only if is_split: true The path to the training samples.
val_subdir (Optional[str], default: 'val' ) –

Used only if is_split: true The path to the validation samples.
dataset_name (str, default: 'kitti-like-data' ) –

This string will be used to name any generated artifacts.
download_url (Optional[str], default: None ) –

URL to download the dataset from. If data is not already on root_path, it can download into root path.

Returns:

data_generator ( RecipeNode ) –

A data_generator.vision.detection.2d ingredient node.

Examples:

This is the expected structure of a split KITTI dataset:

my_dataset
├── train
│   ├── images
|   │   ├── image1.jpg
|   │   └── image2.jpg
│   └── labels
|       ├── image1.txt
|       └── image2.txt
└── val
    ├── images
    │   ├── image3.jpg
    │   └── image4.jpg
    └── labels
        ├── image3.txt
        └── image4.txt

Each txt file should contain the following structure: https://docs.nvidia.com/tao/tao-toolkit/text/data_annotation_format.html#object-detection-kitti-format

from leip_recipe_designer.helpers.data import new_kitti_data_generator, replace_data_generator

new_data = new_kitti_data_generator(
    pantry=pantry,
    root_path="/my/path/my_dataset",
    nclasses=1,
    dataset_name="my-custom-name",
)

replace_data_generator(recipe, new_data)

Source code in leip_recipe_designer/helpers/data.py

def new_kitti_data_generator(
    pantry: "IngredientCache",
    root_path: str,
    nclasses: int,
    labels_dir: str = "labels",
    images_dir: str = "images",
    is_split: bool = True,
    sub_dir: Optional[str] = None,
    trainval_split_ratio: Optional[float] = None,
    trainval_split_seed: Optional[int] = 42,
    train_subdir: Optional[str] = "train",
    val_subdir: Optional[str] = "val",
    dataset_name: str = "kitti-like-data",
    download_url: Optional[str] = None,
) -> "RecipeNode":
    """
    Creates a new data_generator ingredient to ingest a new KITTI formatted detection dataset.

    Parameters
    ----------
    pantry:
        The ingredients storage.
    root_path:
        The absolute path to the root directory of the dataset.
    sub_dir:
        The path to subdirectory, relative to root_path.
        `None` is stored as an empty string.
    labels_dir:
        The path to folder containing only labels, relative to root_path.
    images_dir:
        The path to folder containing only images, relative to root_path.
    nclasses:
        The number of classes in your dataset.
    is_split:
        True or False. If set to True, the subfolders containing the training and
        validation samples should be specified using `train_subdir` and `val_subdir`.
        If set to false, data will be split by the ingestor given the
        trainval_split_ratio and trainval_split_seed
    trainval_split_ratio:
        The ratio to use to split the dataset. Used only if is_split: false
    trainval_split_seed:
        The seed to use to pseudo randomly split the dataset. Used only if is_split: false
    train_subdir:
        Used only if is_split: true
        The path to the training samples.
    val_subdir:
        Used only if is_split: true
        The path to the validation samples.
    dataset_name:
        This string will be used to name any generated artifacts.
    download_url:
        URL to download the dataset from. If data is not already on root_path,
        it can download into root path.

    Returns
    -------
    data_generator:
        A `data_generator.vision.detection.2d` ingredient node.

    Examples
    --------

    This is the expected structure of a split KITTI dataset:
    ```
    my_dataset
    ├── train
    │   ├── images
    |   │   ├── image1.jpg
    |   │   └── image2.jpg
    │   └── labels
    |       ├── image1.txt
    |       └── image2.txt
    └── val
        ├── images
        │   ├── image3.jpg
        │   └── image4.jpg
        └── labels
            ├── image3.txt
            └── image4.txt

    ```
    Each txt file should contain the following structure:
    https://docs.nvidia.com/tao/tao-toolkit/text/data_annotation_format.html#object-detection-kitti-format
    ``` py
    from leip_recipe_designer.helpers.data import new_kitti_data_generator, replace_data_generator

    new_data = new_kitti_data_generator(
        pantry=pantry,
        root_path="/my/path/my_dataset",
        nclasses=1,
        dataset_name="my-custom-name",
    )

    replace_data_generator(recipe, new_data)
    ```
    """
    # The downloadable variant of the ingredient is selected only when a URL is given.
    if download_url is not None:
        byod_name = "BYOD from Url - KITTI format"
    else:
        byod_name = "BYOD - KITTI format"

    ingredient = instantiators.instantiate_single_node_by_category(
        "data_generator.vision.detection.2d", byod_name, pantry
    )

    # Option name -> value, listed in the order they are applied to the ingredient.
    # NOTE(review): the original comment suggests options may be addressed either
    # by synonym or by relative path -- confirm against RecipeNode.
    settings = {
        "data_generator.root_path": root_path,
        "data_generator.sub_dir": sub_dir or "",
        "data_generator.annotation_subdirectory": labels_dir,
        "data_generator.image_subdirectory": images_dir,
        "data.number_of_classes": nclasses,
        "data_generator.data_is_presplit": is_split,
        "data_generator.train_val_split_ratio": trainval_split_ratio,
        "data_generator.split_seed": trainval_split_seed,
        "data_generator.dataset_name": dataset_name,
        "data_generator.train_subdir": train_subdir,
        "data_generator.val_subdir": val_subdir,
    }
    # The download option is only set when a URL was provided.
    if download_url is not None:
        settings["data_generator.download_url"] = download_url

    for option, value in settings.items():
        ingredient[option] = value

    logger.info("The created ingredient is: \n")
    logger.info(ingredient)

    return ingredient

Data modification helpers¶

leip_recipe_designer.helpers.data.mosaicify ¶

mosaicify(recipe: RecipeNode)

In-place function that applies mosaic training augmentation to the recipe's dataset. Mosaic is a data augmentation technique that combines 4 images into a single image. Mosaic does this by resizing each of the four images, stitching them together, and then taking a random cutout of the stitched images to get the final Mosaic image.

Parameters:

recipe (RecipeNode) –

The recipe to be modified.

Examples:

>>> from leip_recipe_designer.helpers.data import new_pascal_data_generator, replace_data_generator, mosaicify
>>> my_data = new_pascal_data_generator(root_path="/some/place")
>>> replace_data_generator(some_existing_recipe, my_data)
>>> mosaicify(some_existing_recipe)

Source code in leip_recipe_designer/helpers/data.py

def mosaicify(recipe: "RecipeNode") -> None:
    """
    In-place function that applies mosaic training augmentation to the recipe's dataset.
    Mosaic is a data augmentation technique that combines 4 images into a single image.
    Mosaic does this by resizing each of the four images, stitching them together,
    and then taking a random cutout of the stitched images to get the final Mosaic image.

    Parameters
    ----------
    recipe:
        The recipe to be modified.

    Raises
    ------
    RuntimeError
        If the recipe is already mosaiced.

    Examples
    --------
    >>> from leip_recipe_designer.helpers.data import new_pascal_data_generator, replace_data_generator, mosaicify
    >>> my_data = new_pascal_data_generator(root_path="/some/place")
    >>> replace_data_generator(some_existing_recipe, my_data)
    >>> mosaicify(some_existing_recipe)

    """

    # Refuse to mosaic twice: log the problem and raise.
    is_mosaiced = _is_mosaiced(recipe)
    if is_mosaiced:
        logger.error("Recipe is already mosaiced.")
        raise RuntimeError("Recipe is already mosaiced.")

    # Keep a copy of the current data generator so it can be re-attached as the
    # mosaic generator's foreground after the generator itself is replaced.
    dg = recipe["^data_generator$"].duplicate()
    # Use the first pantry match for "mosaic" as the new data generator.
    # NOTE(review): options[0] presumes the search returns at least one match -- confirm.
    options = recipe_pantry_operations.search_pantry(dg, regex="mosaic")
    recipe["^data_generator$"] = options[0]["ingredient_id"]
    recipe["^data_generator.composite.mosaic.foreground$"] = dg

    # Look up an "efficientdet" alternative for the current training augmentation.
    alternative_train = recipe_pantry_operations.search_pantry(
        recipe["data.augmentation.training"], regex="efficientdet"
    )

    # Place that alternative in the composite training slot and empty its augmentation list.
    recipe["data.augmentation.composite.training"] = alternative_train[0]["ingredient_id"]
    recipe["data.augmentation.composite.training"]["slot:augmentations"] = []

    # Swap the two slots: the emptied "efficientdet" ingredient becomes the
    # training augmentation, and the previous training augmentation moves to
    # the composite training slot.
    aug = recipe["data.augmentation.training"]
    recipe["data.augmentation.training"] = recipe["data.augmentation.composite.training"]
    recipe["data.augmentation.composite.training"] = aug

    # keep for backward compatibility
    # The "is_final" normalization flag is only set on ingredients that expose it:
    # False on the training augmentation, True on the composite one.
    is_final_aug = "data.normalization.is_final"
    if is_final_aug in recipe["data.augmentation.training"]:
        recipe["data.augmentation.training"][is_final_aug] = False
    if is_final_aug in recipe["data.augmentation.composite.training"]:
        recipe["data.augmentation.composite.training"][is_final_aug] = True

leip_recipe_designer.helpers.data.change_label_map ¶

change_label_map(recipe: RecipeNode, label_mapping_dict: Dict)

Select what classes to use in your dataset, combine multiple classes into one, or leave some classes out of training.

Parameters:

recipe (RecipeNode) –

The recipe to be modified.
label_mapping_dict (Dict) –

Keys are the string name of the class, value is the integer label to map it to. If there are multiple string labels mapped to the same integer value, the classes will be merged.

Examples:

Let's assume my_recipe has the COCO dataset as its data_generator.

Example 1: Train on only the vehicles in COCO data, and skip all the samples that don't contain any vehicle. This will now be a 5 class dataset:

>>> from leip_recipe_designer.helpers.data import change_label_map
>>> label_mapping_dict = {"car": 1,
>>>                       "bus": 2,
>>>                       "truck": 3,
>>>                       "motorcycle": 4,
>>>                       "bicycle": 5,
>>>                       }
>>> change_label_map(my_recipe, label_mapping_dict)

Example 2: Combine all vehicles into one class. This will now be a single class dataset:

>>> label_mapping_dict = {"car": 1,
>>>                       "bus": 1,
>>>                       "truck": 1,
>>>                       "motorcycle": 1,
>>>                       "bicycle": 1,
>>>                       }
>>> change_label_map(my_recipe, label_mapping_dict)

Source code in leip_recipe_designer/helpers/data.py

def change_label_map(recipe: "RecipeNode", label_mapping_dict: Dict) -> None:
    """
    Select what classes to use in your dataset, combine multiple classes into one,
    or leave some classes out of training.

    The recipe is modified in place: its dataset generator is replaced by the
    "Class selector" ingredient, which wraps a copy of the original data generator.

    Parameters
    ----------
    recipe:
        The recipe to be modified.
    label_mapping_dict:
        Keys are the string name of the class, value is the integer label to map it to.
        If there are multiple string labels mapped to the same integer value, the classes will be merged.

    Raises
    ------
    AssertionError
        If `label_mapping_dict` is not a dict.

    Examples
    --------

    Let's assume `my_recipe` has the COCO dataset as its data_generator.

    Example 1: Train on only the vehicles in COCO data, and skip all the samples that don't contain any vehicle.
    This will now be a 5 class dataset:
    >>> from leip_recipe_designer.helpers.data import change_label_map
    >>> label_mapping_dict = {"car": 1,
    ...                       "bus": 2,
    ...                       "truck": 3,
    ...                       "motorcycle": 4,
    ...                       "bicycle": 5,
    ...                       }
    >>> change_label_map(my_recipe, label_mapping_dict)

    Example 2: Combine all vehicles into one class. This will now be a single class dataset:
    >>> label_mapping_dict = {"car": 1,
    ...                       "bus": 1,
    ...                       "truck": 1,
    ...                       "motorcycle": 1,
    ...                       "bicycle": 1,
    ...                       }
    >>> change_label_map(my_recipe, label_mapping_dict)

    """
    # Explicit raise instead of an `assert` statement: asserts are removed when
    # Python runs with -O, which would let a non-dict mapping reach the recipe.
    # The exception type stays AssertionError so existing callers are unaffected.
    if not isinstance(label_mapping_dict, dict):
        raise AssertionError(
            f"label_mapping_dict must be a dict, got {type(label_mapping_dict).__name__}"
        )

    # Copy the current data generator; it becomes the selector's underlying dataset.
    dg = recipe["^data_generator$"].duplicate()
    # Use the first pantry match whose name ends in "Class selector".
    options = recipe_pantry_operations.search_pantry(dg, regex="Class selector$")
    recipe["slot:data"]["slot:module.dataset_generator"] = options[0]["ingredient_id"]
    recipe["slot:data"]["slot:module.dataset_generator"]["slot:actual.custom_labelmap"] = label_mapping_dict
    recipe["slot:data"]["slot:module.dataset_generator"]["slot:actual.underlying_dataset"] = dg

leip_recipe_designer.helpers.data.replace_augmentations ¶

replace_augmentations(recipe: RecipeNode, regex_ingredient_name: str, phase: str = 'train')

Replace current augmentations with the provided one.

Parameters:

recipe (RecipeNode) –

The recipe to be modified.
regex_ingredient_name (str) –

The name or regex of the augmentations ingredient to be assigned.
phase (str, default: 'train' ) –

One of "train" or "val".

Examples:

>>> from leip_recipe_designer.helpers.data import replace_augmentations
>>> replace_augmentations(some_existing_recipe, regex_ingredient_name="Yolo", phase="train")

Source code in leip_recipe_designer/helpers/data.py

def replace_augmentations(recipe: "RecipeNode", regex_ingredient_name: str, phase: str = "train"):
    """Replace current augmentations with the provided one.

    Parameters
    ----------
    recipe:
        The recipe to be modified.
    regex_ingredient_name:
        The name or regex of the augmentations ingredient to be assigned.
    phase:
        One of "train" or "val".

    Raises
    ------
    AssertionError
        If `phase` is neither "train" nor "val".

    Examples
    --------
    >>> from leip_recipe_designer.helpers.data import replace_augmentations
    >>> replace_augmentations(some_existing_recipe, regex_ingredient_name="Yolo", phase="train")

    """
    # Explicit raise instead of an `assert` statement: asserts are removed when
    # Python runs with -O, in which case an unsupported phase would silently be
    # treated as "val" by the else-branch below. The exception type stays
    # AssertionError so existing callers are unaffected.
    if phase not in ("train", "val"):
        raise AssertionError("The provided phase is not supported. Choose one of 'train' or 'val'.")

    if phase == "train":
        key = "data.augmentation.training"
        if _is_mosaiced(recipe):
            # remove nested transforms
            recipe.assign_ingredients(key, regex="efficientdet")
            # keep for backward compatibility
            if "data.normalization.is_final" in recipe[key]:
                recipe[key]["data.normalization.is_final"] = False
            recipe[key]["augmentation_list"] = []

            # redirect to the main augmentations: on a mosaiced recipe the
            # requested ingredient is assigned to the composite training slot
            key = "data.augmentation.composite.training"
    else:
        key = "data.augmentation.validation"

    recipe.assign_ingredients(key, regex=regex_ingredient_name)