# download_data.py
# Python script, ASCII text executable
"""Download annotated waste photos and write them out as image/label pairs.

The script talks to the datasets.roundabout-host.com API and, when run:

1. resolves each base waste category to the set of object ids below it,
2. pages through every picture matching ``query_yaml``,
3. wipes and recreates ``data/`` and ``val_data/``,
4. saves each photo (EXIF-rotated, at most 640x640) as ``<id>.jpg`` and a
   sibling ``<id>.txt`` holding one ``class cx cy w h`` line per region.

Each photo lands in ``val_data/`` with probability ``VALIDATION_SPLIT`` and
in ``data/`` otherwise.
"""
import os
import random
import shutil
import tempfile
from pathlib import Path

import httpx
import tqdm
from PIL import Image, ImageOps

VALIDATION_SPLIT = 0.1

query_yaml = """
want:
- has: ["waste"]
- nature: ["photo"]
"""

# NOTE(review): this response is not referenced anywhere below; the request is
# kept only so the script's sequence of API calls stays as it was.
object_types = httpx.get("https://datasets.roundabout-host.com/api/object-types").json()

# Category name -> set of object ids that count as that category.  The sets
# start empty and are filled in from the API further down.
base_types = {
    "plastic household waste": set(),
    "glass household waste": set(),
    "metal household waste": set(),
    "paper and cardboard": set(),
    "organic household waste": set(),
    "household waste": set(),
    "waste": set(),
}

# Category name -> integer class id written to the label files.
class_mapping = {
    "plastic household waste": 0,
    "glass household waste": 1,
    "metal household waste": 2,
    "paper and cardboard": 3,
    "organic household waste": 4,
    "household waste": 5,
    "waste": 6,
}

# Object id -> already-computed result of get_objects(), so every object is
# fetched and expanded at most once.
memoisation = {}


def _fetch_object(object_id):
    """Return the API's JSON description of the object named *object_id*.

    Raises ``httpx.HTTPStatusError`` on a non-success response instead of
    letting an error body reach the JSON decoder.
    """
    response = httpx.get("https://datasets.roundabout-host.com/api/object/" + object_id)
    response.raise_for_status()
    return response.json()


def get_objects(object_info):
    """Return the ids of *object_info* and of everything reachable below it.

    *object_info* is an object description as returned by the API; only its
    ``"id"`` and ``"children"`` (an iterable of child object ids) are read.
    Children are fetched and expanded recursively.  Results are cached in
    ``memoisation``, and the cached set itself is returned, so callers should
    not mutate it.
    """
    object_id = object_info["id"]
    if object_id in memoisation:
        return memoisation[object_id]

    objects = {object_id}
    for child_id in object_info["children"]:
        objects |= get_objects(_fetch_object(child_id))

    memoisation[object_id] = objects
    return objects


def _region_class(region):
    """Return the class id for *region*, or ``None`` if no category matches.

    Categories are tried in ``base_types`` order and the first hit wins, so
    that order decides the class when several sets contain the same object.
    """
    for base_type, members in base_types.items():
        if region["object"] in members:
            return class_mapping[base_type]
    return None


def _region_box(region):
    """Return ``(cx, cy, w, h)`` for *region*, or ``None`` if it has no box.

    ``bbox`` regions are converted from corner + size to centre + size;
    ``polygon`` regions are reduced to their axis-aligned bounding box.
    Coordinates are passed through in whatever units the API uses.
    NOTE(review): presumably already normalised to 0..1 -- confirm.
    """
    if region["type"] == "bbox":
        shape = region["shape"]
        w = shape["w"]
        h = shape["h"]
        return shape["x"] + w / 2, shape["y"] + h / 2, w, h

    if region["type"] == "polygon":
        points = region["shape"]
        if not points:
            # min()/max() of an empty sequence would raise.
            return None
        xs = [point["x"] for point in points]
        ys = [point["y"] for point in points]
        return (
            (min(xs) + max(xs)) / 2,
            (min(ys) + max(ys)) / 2,
            max(xs) - min(xs),
            max(ys) - min(ys),
        )

    # Unknown region type: there is nothing sensible to write.
    return None


for object_name in base_types:
    base_types[object_name] = get_objects(_fetch_object(object_name))
    print(object_name, base_types[object_name])

photos = []
offset = 0
limit = 192
output = Path("data/")
val_output = Path("val_data/")

print("Downloading photo metadata...")

# Page through the query results until the API returns an empty page.
while True:
    result = httpx.post(
        f"https://datasets.roundabout-host.com/api/query-pictures?offset={offset}&limit={limit}",
        data={"query": query_yaml},
    )
    result.raise_for_status()
    page = result.json()["resources"]
    if not page:
        break
    photos += page
    print(f"Received photos {offset} to {offset + len(page)}")
    offset += limit

# Start from empty output directories on every run.
shutil.rmtree(output, ignore_errors=True)
os.makedirs(output)
shutil.rmtree(val_output, ignore_errors=True)
os.makedirs(val_output)

# One scratch file is reused for every download.  delete_on_close=False
# (Python 3.12+) keeps it on disk until the ``with`` block exits, so it can be
# reopened by name while still open here.
with tempfile.NamedTemporaryFile(delete_on_close=False, mode="wb") as temporary_file:
    for photo in tqdm.tqdm(photos):
        # Download the photo
        response = httpx.get(photo["download"], follow_redirects=True, timeout=15)
        response.raise_for_status()

        # Rewind AND truncate: without truncating, a photo smaller than an
        # earlier one would be followed by the earlier photo's leftover bytes.
        temporary_file.seek(0)
        temporary_file.truncate()
        temporary_file.write(response.content)
        temporary_file.flush()

        with Image.open(temporary_file.name, formats=["JPEG"]) as image:
            ImageOps.exif_transpose(image, in_place=True)
            image.thumbnail((640, 640))
            if random.random() < VALIDATION_SPLIT:
                directory = val_output
            else:
                directory = output
            image.save(directory / (str(photo["id"]) + ".jpg"))

        # Download the annotations
        with open(directory / (str(photo["id"]) + ".txt"), "w") as annotation_file:
            for region in photo["regions"]:
                klass = _region_class(region)
                if klass is None:
                    continue

                box = _region_box(region)
                if box is None:
                    continue

                cx, cy, w, h = box
                annotation_file.write(f"{klass} {cx} {cy} {w} {h}\n")