Filter and combine images (custom scenario)

This notebook shows how to filter images by name across different datasets and combine them into a single dataset.

Input:

We have a project with the following structure:

├── dataset_01
│   ├── dataset_1_img_1.jpg
│   ├── dataset_1_img_10.jpg
│   ├── dataset_1_img_11.jpg
│   ├── dataset_1_img_12.jpg
│   ├── dataset_1_img_2.jpg
│   ├── dataset_1_img_3.jpg
│   ├── dataset_1_img_4.jpg
│   ├── dataset_1_img_5.jpg
│   ├── dataset_1_img_6.jpg
│   ├── dataset_1_img_7.jpg
│   ├── dataset_1_img_8.jpg
│   └── dataset_1_img_9.jpg
├── dataset_02
│   ├── dataset_2_img_1.jpg
│   ├── dataset_2_img_2.jpg
│   └── dataset_2_img_3.jpg
└── dataset_03
    ├── dataset_3_img_1.jpg
    ├── dataset_3_img_2.jpg
    ├── dataset_3_img_3.jpg
    ├── dataset_3_img_4.jpg
    ├── dataset_3_img_5.jpg
    ├── dataset_3_img_6.jpg
    └── dataset_3_img_7.jpg

Output: We would like to create a new project containing only the images that satisfy a custom criterion (e.g. images with names from dataset_1_img_3 to dataset_X_img_11). The resulting project will contain:

├── dataset_01
│   ├── dataset_1_img_10.jpg
│   ├── dataset_1_img_11.jpg
│   ├── dataset_1_img_3.jpg
│   ├── dataset_1_img_4.jpg
│   ├── dataset_1_img_5.jpg
│   ├── dataset_1_img_6.jpg
│   ├── dataset_1_img_7.jpg
│   ├── dataset_1_img_8.jpg
│   └── dataset_1_img_9.jpg
├── dataset_02
│   └── dataset_2_img_3.jpg
└── dataset_03
    ├── dataset_3_img_3.jpg
    ├── dataset_3_img_4.jpg
    ├── dataset_3_img_5.jpg
    ├── dataset_3_img_6.jpg
    └── dataset_3_img_7.jpg

Imports

[98]:
import supervisely_lib as sly
import os
[99]:
import re

Initialize API access with your credentials

[100]:
# Obtain the Supervisely server address and your API token from environment
# variables. Edit these values if you run this notebook on your own PC.
# Both lookups raise KeyError if the variable is not set.
address = os.environ['SERVER_ADDRESS']
token = os.environ['API_TOKEN']

Initialize the API access object

[101]:
# Create the API client object used for all server requests below.
api = sly.Api(address, token)

Script parameters

[102]:
# Names of the existing team / workspace / source project to read from.
team_name = "max"
workspace_name = "ipynb_filter_combine_images"
project_name = "project_x"

# Name of the project that will receive the filtered images.
result_project_name = "project_filtered"

# If None, the filtered images are put into datasets with their original names
# in the result project. If a name is given, a dataset with that name is
# created in the result project and all filtered images are combined into it.
result_dataset_name = None #"combined_ds"

# Inclusive [low, high] bounds for the image index used by the filter below.
image_ids_range = [3, 11]

Verify input values

[103]:
# Resolve team -> workspace -> project, failing fast with a clear error if any
# of the configured names does not exist on the server. Each lookup needs the
# id of its parent, so the order of the calls matters.
team = api.team.get_info_by_name(team_name)
if team is None:
    raise RuntimeError("Team {!r} not found".format(team_name))

workspace = api.workspace.get_info_by_name(team.id, workspace_name)
if workspace is None:
    raise RuntimeError("Workspace {!r} not found".format(workspace_name))

project = api.project.get_info_by_name(workspace.id, project_name)
if project is None:
    raise RuntimeError("Project {!r} not found".format(project_name))

print("Team: id={}, name={}".format(team.id, team.name))
print("Workspace: id={}, name={}".format(workspace.id, workspace.name))
print("Project: id={}, name={}".format(project.id, project.name))
Team: id=600, name=max
Workspace: id=27434, name=ipynb_filter_combine_images
Project: id=62482, name=project_x

Create resulting project

[104]:
# Create the destination project; change_name_if_conflict=True makes the
# server pick a free name instead of failing if the name is already taken.
res_project = api.project.create(workspace.id, result_project_name, change_name_if_conflict=True)
print("Resulting project: id={}, name={}".format(res_project.id, res_project.name))
Resulting project: id=62502, name=project_filtered
[105]:
# Clone the project meta (list of classes and tags) from the input project to
# the resulting one, so annotations can be copied without schema conflicts.
project_meta_json = api.project.get_meta(project.id)
api.project.update_meta(res_project.id, project_meta_json)
[106]:
# Pre-create the single combined destination dataset, but only when the user
# asked for one; otherwise datasets are created per source dataset later on.
res_dataset = (
    api.dataset.create(res_project.id, result_dataset_name, change_name_if_conflict=False)
    if result_dataset_name is not None
    else None
)

Iterate over all images, filter and combine them if needed

[107]:
# Filtering function: decide whether an image should be copied, based on the
# numeric index embedded in its file name.
def filter_image_by_name(image_name, image_ids_range=(3, 11)):
    """Return True if the image's index falls inside the inclusive range.

    The index is the second number found in the file name, e.g. for
    "dataset_1_img_7.jpg" the extracted numbers are [1, 7] and the index is 7.
    A name containing fewer than two numbers never matches (the original code
    raised IndexError on such names).

    :param image_name: image file name, e.g. "dataset_1_img_7.jpg"
    :param image_ids_range: inclusive (low, high) bounds for the index;
        a tuple default avoids the shared mutable-default-argument pitfall
    :return: True if the image should be kept, False otherwise
    """
    numbers = [int(token) for token in re.findall(r'\d+', image_name)]
    if len(numbers) < 2:
        # No recognizable image index in the name -> do not keep the image.
        return False
    image_id = numbers[1]
    return image_ids_range[0] <= image_id <= image_ids_range[1]
[108]:
# Walk every dataset of the source project, keep the images whose names pass
# the filter, and copy them (with annotations) into the resulting project.
for dataset in api.dataset.get_list(project.id):
    print('Dataset: {}'.format(dataset.name), flush=True)

    # Destination: a per-dataset clone, or the single pre-created combined one.
    if result_dataset_name is None:
        dst_dataset = api.dataset.create(res_project.id, dataset.name)
    else:
        dst_dataset = res_dataset

    images_in_dataset = api.image.get_list(dataset.id)

    # Keep only images whose names pass the filter. The configured
    # image_ids_range is passed explicitly (previously the script parameter
    # was silently ignored and the function's default was used instead).
    filtered_image_ids = [
        image_info.id
        for image_info in images_in_dataset
        if filter_image_by_name(image_info.name, image_ids_range)
    ]

    if filtered_image_ids:
        # with_annotations=True copies the labels along with the images.
        api.image.copy_batch(dst_dataset.id, filtered_image_ids,
                             change_name_if_conflict=False, with_annotations=True)

    print("Number of copied images = {}".format(len(filtered_image_ids)))
Dataset: dataset_01
Number of copied images = 9
Dataset: dataset_03
Number of copied images = 5
Dataset: dataset_02
Number of copied images = 1
[ ]: