download_data.py
Python script, ASCII text executable
"""Download waste photos and their bounding-box annotations.

The script queries the roundabout-host datasets API for every photo that
contains waste, stores a downscaled JPEG of each one in ``data/`` and writes
a sibling ``.txt`` file with one ``class cx cy w h`` line per annotated
region (this looks like the YOLO label layout).

NOTE(review): the coordinates are written exactly as the API returns them;
presumably they are already normalised to the 0..1 range -- confirm against
the API before training on the output.
"""
import os
import shutil
import tempfile
from pathlib import Path

import httpx
import tqdm
from PIL import Image, ImageOps

API_ROOT = "https://datasets.roundabout-host.com/api"

query_yaml = """
want:
- has: ["waste"]
- nature: ["photo"]
"""


def _get_json(url):
    """GET *url* and return the decoded JSON body.

    Raises:
        httpx.HTTPStatusError: if the server answers with a 4xx/5xx status,
            so a failed request is reported as such instead of surfacing
            later as a confusing KeyError on the error payload.
    """
    response = httpx.get(url)
    response.raise_for_status()
    return response.json()


# NOTE(review): this value is not referenced anywhere else in the visible
# script; the request is kept only so the script's behaviour is unchanged.
object_types = httpx.get(API_ROOT + "/object-types").json()

# Maps each output class name to the set of object ids that belong to it
# (the object itself plus all of its descendants). The sets are filled in
# below. Insertion order matters: the broader categories at the end also
# contain the narrower ones, and the first matching entry wins when a region
# is classified.
base_types = {
    "plastic household waste": set(),
    "glass household waste": set(),
    "metal household waste": set(),
    "paper and cardboard": set(),
    "organic household waste": set(),
    "household waste": set(),
    "waste": set(),
}

# Cache of object id -> set of ids in its subtree, shared by get_objects().
memoisation = {}


def get_objects(object_info):
    """Return the ids of *object_info* and of all of its descendants.

    Args:
        object_info: object description as returned by the ``/api/object/``
            endpoint; its ``"id"`` and ``"children"`` keys are read.

    Returns:
        A set containing ``object_info["id"]`` and the ids collected
        recursively from every entry of ``object_info["children"]``.
        Results are cached in ``memoisation`` by id.
    """
    object_id = object_info["id"]
    if object_id in memoisation:
        return memoisation[object_id]

    objects = set()
    for child in object_info["children"]:
        child_info = _get_json(API_ROOT + "/object/" + child)
        objects |= get_objects(child_info)
    objects.add(object_id)

    memoisation[object_id] = objects
    return objects


def _class_for_object(object_id):
    """Return the class index for *object_id*, or -1 if it is not waste.

    ``base_types`` is scanned in insertion order, so the first category
    listed there that contains the object wins.
    """
    for base_type, members in base_types.items():
        if object_id in members:
            return class_mapping[base_type]
    return -1


def _bounding_box(region):
    """Return ``(cx, cy, w, h)`` for *region*, or ``None`` if it has no box.

    ``bbox`` regions carry ``x``/``y``/``w``/``h`` directly; for ``polygon``
    regions the axis-aligned box around all points is used. Regions of any
    other type, and polygons without points, yield ``None`` so the caller
    can skip them instead of writing stale or undefined coordinates.
    """
    shape = region["shape"]
    if region["type"] == "bbox":
        w = shape["w"]
        h = shape["h"]
        return shape["x"] + w / 2, shape["y"] + h / 2, w, h
    if region["type"] == "polygon":
        if not shape:
            return None
        xs = [point["x"] for point in shape]
        ys = [point["y"] for point in shape]
        left, right = min(xs), max(xs)
        top, bottom = min(ys), max(ys)
        return (left + right) / 2, (top + bottom) / 2, right - left, bottom - top
    return None


for object_name in base_types:
    object_info = _get_json(API_ROOT + "/object/" + object_name)
    base_types[object_name] = get_objects(object_info)
    print(object_name, base_types[object_name])

photos = []
offset = 0
limit = 192
output = Path("data/")

print("Downloading photo metadata...")

# Page through the query results until the server returns an empty page.
while True:
    response = httpx.post(
        f"{API_ROOT}/query-pictures?offset={offset}&limit={limit}",
        data={"query": query_yaml},
    )
    response.raise_for_status()
    result = response.json()
    photos += result["resources"]
    offset += limit
    print(f"Received photos {offset-limit} to {offset}")
    if not result["resources"]:
        break

# Start from a clean output directory on every run.
shutil.rmtree(output, ignore_errors=True)
os.makedirs(output)

class_mapping = {
    "plastic household waste": 0,
    "glass household waste": 1,
    "metal household waste": 2,
    "paper and cardboard": 3,
    "organic household waste": 4,
    "household waste": 5,
    "waste": 6,
}

with tempfile.NamedTemporaryFile(delete_on_close=False, mode="wb") as temporary_file:
    for photo in tqdm.tqdm(photos):
        # Download the photo
        result = httpx.get(photo["download"], follow_redirects=True)
        result.raise_for_status()

        # The same temporary file is reused for every photo: rewind, write,
        # then cut off whatever a larger previous download left behind and
        # flush so that reopening the file by name sees exactly this photo.
        temporary_file.seek(0)
        temporary_file.write(result.content)
        temporary_file.truncate()
        temporary_file.flush()

        # The context manager closes the file handle Pillow opens by name.
        with Image.open(temporary_file.name, formats=["JPEG"]) as image:
            # Bake the EXIF orientation into the pixels before resizing.
            ImageOps.exif_transpose(image, in_place=True)
            image.thumbnail((640, 640))
            image.save(output / (str(photo["id"]) + ".jpg"))

        # Download the annotations
        with open(output / (str(photo["id"]) + ".txt"), "w", encoding="utf-8") as annotation_file:
            for region in photo["regions"]:
                klass = _class_for_object(region["object"])
                if klass == -1:
                    continue

                box = _bounding_box(region)
                if box is None:
                    continue

                cx, cy, w, h = box
                annotation_file.write(f"{klass} {cx} {cy} {w} {h}\n")