Enhance your Machine Learning workflows with LEIP Design's new data visualization integration!
In this tutorial, we'll explore how to streamline machine learning workflows by leveraging our powerful data visualization integrations.
We'll demonstrate how to visualize your data, manage data versions, filter noisy samples, and seamlessly ingest your refined dataset into your LEIP recipe!
Let’s dive into building efficient, performance-driven ML pipelines!
Step 1: Loading the Dataset in FiftyOne
In this step:
- We load the dataset using FiftyOne from a VOC Detection format directory.
- The dataset is split into train (75%) and val (25%) subsets.
- Finally, we launch the FiftyOne web app on port 8882 to visually inspect the dataset.
# Import necessary libraries
import os
import datetime
import fiftyone as fo
from fiftyone import ViewField as F
import fiftyone.utils.random as four
from pathlib import Path
# Define paths for your dataset
data_path = 'path/to/your/dataset'
dataset_name = 'surface_defect_detection'
images_dir = 'data'
labels_dir = 'labels'
images_dir = os.path.join(data_path, images_dir)
labels_dir = os.path.join(data_path, labels_dir)
# Generate a unique name for this dataset using the current timestamp
current_time = datetime.datetime.now()
new_dataset_name = dataset_name + f"_{current_time.month}_{current_time.day}_{current_time.hour}_{current_time.minute}_{current_time.second}"
# Specify the dataset type (VOCDetection in this case)
dataset_type = fo.types.VOCDetectionDataset
# Load the dataset from the specified directory
dataset = fo.Dataset.from_dir(
    dataset_type=dataset_type,
    data_path=images_dir,
    labels_path=labels_dir,
    name=new_dataset_name,
)
# Split the dataset into training (75%) and validation (25%) sets
four.random_split(
    dataset,
    {"train": 0.75, "val": 0.25},
    seed=42,
)
# Launch the FiftyOne app to explore the dataset
session = fo.launch_app(dataset, port=8882, auto=False)
session.open_tab()
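Before moving on, it can be worth sanity-checking the split. On a live dataset, `dataset.count_sample_tags()` reports how many samples carry each tag after `four.random_split` runs; the standalone sketch below (using hypothetical sample IDs, not the real dataset) just illustrates the expected 75/25 proportions.

```python
import random

# Illustrative only: shuffle 100 hypothetical sample IDs and cut them
# 75/25, mirroring the proportions requested from four.random_split
sample_ids = list(range(100))
random.Random(42).shuffle(sample_ids)

cut = int(0.75 * len(sample_ids))
train_ids, val_ids = sample_ids[:cut], sample_ids[cut:]

print(len(train_ids), len(val_ids))  # 75 25
```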
Let's understand the dataset by printing the ground truth annotations of the first sample, including its labels and bounding boxes.
first_sample = dataset.first()
# Get the ground truth detections
ground_truth = first_sample.ground_truth # Replace 'ground_truth' with the name of your field if different
# Print ground truth detections
print("Ground Truth Detections:")
for detection in ground_truth.detections:
    print(f"Label: {detection.label}, Bounding Box: {detection.bounding_box}")
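Beyond one sample, it also helps to see the overall class balance. On a live dataset, FiftyOne's aggregation `dataset.count_values("ground_truth.detections.label")` returns this histogram directly; the plain-Python sketch below shows the same idea on a toy list of detections (the dicts stand in for real annotations).

```python
from collections import Counter

# Toy detections standing in for dataset annotations; a real run would
# iterate over every sample's ground_truth.detections instead
toy_detections = [
    {"label": "inclusion"},
    {"label": "patches"},
    {"label": "inclusion"},
]

# Tally how often each class appears across all detections
label_counts = Counter(d["label"] for d in toy_detections)
print(dict(label_counts))  # {'inclusion': 2, 'patches': 1}
```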
Step 2: Tailoring the Dataset to Your Specific Needs (Optional)
Here, we filter the dataset to include only samples with detections for the classes inclusion and patches.
- A new dataset is created with only the filtered samples.
- We preserve the train and val subsets as separate views.
- Finally, we inspect the filtered dataset in the FiftyOne app.
# Define the list of classes we are interested in detecting
things_i_want_to_detect = ['inclusion', 'patches']
# Create a new dataset to store filtered samples
custom_dataset_name = f"{new_dataset_name}_filtered"
custom_dataset = fo.Dataset(custom_dataset_name)
# Iterate over samples in the original dataset
for sample in dataset:
    # Filter detections to keep only the ones matching our target classes
    filtered_detections = [
        det for det in sample.ground_truth.detections
        if det.label in things_i_want_to_detect
    ]

    # Skip samples without relevant detections
    if not filtered_detections:
        continue

    # Create a copy of the sample
    new_sample = sample.copy()

    # Overwrite the `ground_truth.detections` with the filtered detections
    new_sample["ground_truth"] = fo.Detections(detections=filtered_detections)

    # Add the modified sample to the new dataset
    custom_dataset.add_sample(new_sample)
# Save the train and validation subsets as views
train_view = custom_dataset.match_tags("train")
val_view = custom_dataset.match_tags("val")
custom_dataset.save_view("train_view", train_view)
custom_dataset.save_view("val_view", val_view)
# Compute metadata for the filtered dataset
custom_dataset.compute_metadata()
# Launch the FiftyOne app for the filtered dataset
session = fo.launch_app(custom_dataset, port=8882, auto=False)
session.open_tab()
first_sample = custom_dataset.first()
# Get the ground truth detections
ground_truth = first_sample.ground_truth # Replace 'ground_truth' with the name of your field if different
# Print ground truth detections
print("Ground Truth Detections:")
for detection in ground_truth.detections:
    print(f"Label: {detection.label}, Bounding Box: {detection.bounding_box}")
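The filtering loop above can also be exercised as plain Python before touching a real dataset. The sketch below applies the same keep-only-matching-labels rule to toy dict-based samples (this structure is a simplification, not FiftyOne's actual schema). On a live dataset, FiftyOne's built-in view stage `dataset.filter_labels("ground_truth", F("label").is_in(things_i_want_to_detect))` achieves a similar result without copying samples into a new dataset.

```python
# A framework-free sketch of the same selection rule, handy for
# unit-testing the filtering logic in isolation. The dict-based sample
# structure is a simplification of FiftyOne's schema.
TARGET_LABELS = {"inclusion", "patches"}

def filter_samples(samples, target_labels=TARGET_LABELS):
    """Keep samples with at least one matching detection, dropping
    non-matching detections from the kept samples."""
    kept = []
    for sample in samples:
        detections = [d for d in sample["detections"] if d["label"] in target_labels]
        if detections:
            kept.append({**sample, "detections": detections})
    return kept

toy_samples = [
    {"id": 1, "detections": [{"label": "inclusion"}, {"label": "scratches"}]},
    {"id": 2, "detections": [{"label": "scratches"}]},
    {"id": 3, "detections": [{"label": "patches"}]},
]

filtered = filter_samples(toy_samples)
print([s["id"] for s in filtered])  # [1, 3]
```

Note that sample 1 is kept but its non-matching "scratches" detection is dropped, mirroring how the loop overwrites `ground_truth` with only the filtered detections.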
Step 3: Ingesting Your Customized Dataset into LEIP
In this step:
- We create a basic object detection recipe using LEIP Recipe Designer.
- The customized dataset is easily ingested into your recipe via the attach_fiftyone_data_generator helper.
# Import LEIP Recipe Designer and helper functions
import leip_recipe_designer as rd
from leip_recipe_designer.create import empty_detection_recipe
from leip_recipe_designer.helpers.data import replace_data_generator, attach_fiftyone_data_generator
# Define the workspace and pantry paths
workspace = Path(os.getcwd())
pantry = rd.Pantry.build(workspace / "my_combined_pantry", force_rebuild=False)
# Create an empty recipe for object detection
recipe = empty_detection_recipe(pantry=pantry)
recipe.fill_empty_recursively()
# Attach the filtered FiftyOne dataset as a data generator
datagen = attach_fiftyone_data_generator(
    pantry=pantry,
    dataset_name=custom_dataset_name,
    nclasses=2,
    groundtruth_field_name='ground_truth',
)
# Replace the recipe's data generator with the one attached above
replace_data_generator(recipe, datagen)
# Fill the recipe again to ensure completeness
_ = recipe.fill_empty_recursively()
Let's print the data_generator ingredient of the recipe to verify that our custom dataset has been ingested successfully.
recipe['data_generator']
Congratulations! You have successfully visualized, customized, and ingested your dataset into LEIP in just a few minutes.
From here, you can follow the steps shown in the Getting Started tutorial to train, evaluate, optimize, and deploy your model!