download_data.py

text/x-script.python • 3.55 kiB
Python script, ASCII text executable
        
            1
            import io
            import os
            import shutil
            import tempfile
            from pathlib import Path

            import httpx
            import tqdm
            from PIL import Image, ImageOps
        
            8
            
        
            9
            # Query text posted as the "query" form field to the picture-query endpoint.
            # It is assembled from single-line literals so that neither this file's
            # indentation nor any stray text between lines can leak into the YAML.
            # NOTE(review): presumably selects pictures that contain a "waste" object
            # and whose nature is "photo" — confirm against the API documentation.
            query_yaml = (
                "\n"
                "want:\n"
                '    - has: ["waste"]\n'
                '    - nature: ["photo"]\n'
            )
        
            14
            
        
            15
            # Fetch the object-types listing.  raise_for_status() makes an HTTP error
            # fail right here instead of surfacing as a confusing JSON decoding error.
            # NOTE(review): `object_types` is not read anywhere else in this script —
            # confirm whether this request is still needed.
            object_types_response = httpx.get("https://datasets.roundabout-host.com/api/object-types")
            object_types_response.raise_for_status()
            object_types = object_types_response.json()
        
            16
            
        
            17
            # Categories of interest, each mapped to the set of object ids belonging to
            # it.  The sets start out empty and are filled in later in the script.
            # Key order is significant: the annotation writer assigns a region to the
            # FIRST category whose set contains the region's object, so the order below
            # decides which category wins when an object appears in several sets.
            base_types = {
                "plastic household waste": set(),
                "glass household waste": set(),
                "metal household waste": set(),
                "paper and cardboard": set(),
                "organic household waste": set(),
                "household waste": set(),
                "waste": set(),
            }
        
            26
            
        
            27
            # Cache of already-resolved objects: object id -> set of ids reachable from it.
            memoisation = {}


            def get_objects(object_info):
                """Return the id of *object_info* together with the ids of all its descendants.

                Args:
                    object_info: Decoded JSON description of one object.  It must provide
                        an ``"id"`` and a ``"children"`` list; each child entry is appended
                        to the ``/api/object/`` URL to fetch that child's description.

                Returns:
                    A set containing ``object_info["id"]`` and every id found by
                    recursively following ``"children"``.  The very same set object is
                    stored in ``memoisation`` and handed out again on later calls for the
                    same id, so callers should treat it as read-only.

                Raises:
                    httpx.HTTPStatusError: If fetching a child returns an error status.
                """
                object_id = object_info["id"]
                if object_id in memoisation:
                    return memoisation[object_id]

                objects = {object_id}
                for child in object_info["children"]:
                    response = httpx.get("https://datasets.roundabout-host.com/api/object/" + child)
                    # Fail on an HTTP error instead of trying to parse an error page.
                    response.raise_for_status()
                    objects |= get_objects(response.json())

                memoisation[object_id] = objects
                return objects
        
            40
            
        
            41
            # Resolve every category name to the full set of object ids beneath it.
            # Only the values of existing keys are reassigned, so iterating the dict
            # while updating it is safe.
            for object_name in base_types:
                response = httpx.get("https://datasets.roundabout-host.com/api/object/" + object_name)
                # Fail on an HTTP error instead of trying to parse an error page.
                response.raise_for_status()
                object_info = response.json()
                base_types[object_name] = get_objects(object_info)
                print(object_name, base_types[object_name])
        
            45
            
        
            46
            photos = []
            offset = 0
            limit = 192  # page size requested from the server
            output = Path("data/")

            print("Downloading photo metadata...")

            # Page through the query results until the server returns an empty page.
            while True:
                response = httpx.post(
                    "https://datasets.roundabout-host.com/api/query-pictures",
                    params={"offset": offset, "limit": limit},
                    data={"query": query_yaml},
                )
                # Fail on an HTTP error instead of trying to parse an error page.
                response.raise_for_status()
                result = response.json()
                page = result["resources"]
                if not page:
                    break
                photos += page
                # Report what was actually received, not the size that was requested.
                print(f"Received photos {offset} to {offset + len(page)}")
                offset += limit
        
            59
            
        
            60
            # Start from an empty output directory.  Only "directory does not exist" is
            # tolerated: any other removal failure is raised with its real cause rather
            # than showing up afterwards as an unrelated FileExistsError from makedirs.
            try:
                shutil.rmtree(output)
            except FileNotFoundError:
                pass
            os.makedirs(output)
        
            62
            
        
            63
            # Numeric class id written to the annotation files for each category.
            # The keys must match the keys of `base_types`.
            class_mapping = {
                "plastic household waste": 0,
                "glass household waste": 1,
                "metal household waste": 2,
                "paper and cardboard": 3,
                "organic household waste": 4,
                "household waste": 5,
                "waste": 6,
            }
        
            72
            
        
            73
            def _bounding_box(region):
                """Return ``(cx, cy, w, h)`` for *region*, or ``None`` if no box can be derived.

                ``"bbox"`` regions carry ``x``, ``y``, ``w`` and ``h`` directly.  For
                ``"polygon"`` regions the axis-aligned box enclosing all points is used.
                Regions of any other type, and polygons without points, yield ``None``.
                """
                shape = region["shape"]
                if region["type"] == "bbox":
                    return (
                        shape["x"] + shape["w"] / 2,
                        shape["y"] + shape["h"] / 2,
                        shape["w"],
                        shape["h"],
                    )
                if region["type"] == "polygon":
                    if not shape:
                        return None
                    xs = [point["x"] for point in shape]
                    ys = [point["y"] for point in shape]
                    left, right = min(xs), max(xs)
                    top, bottom = min(ys), max(ys)
                    return (left + right) / 2, (top + bottom) / 2, right - left, bottom - top
                return None


            # For every photo: store a thumbnail as "<id>.jpg" and its annotations as
            # "<id>.txt" (one "class cx cy w h" line per recognised region).
            for photo in tqdm.tqdm(photos):
                # Download the photo
                response = httpx.get(photo["download"], follow_redirects=True)
                # Fail on an HTTP error instead of handing an error page to the decoder.
                response.raise_for_status()

                # Decode straight from memory.  Reusing one temporary file for all
                # photos left trailing bytes of a previous, larger image behind the
                # current one; an in-memory buffer per photo cannot do that.
                with Image.open(io.BytesIO(response.content), formats=["JPEG"]) as image:
                    ImageOps.exif_transpose(image, in_place=True)
                    image.thumbnail((640, 640))
                    image.save(output / (str(photo["id"]) + ".jpg"))

                # Write the annotations
                # NOTE(review): coordinates are written exactly as the API returns them.
                # If they are pixel values of the full-size (or un-rotated) image they
                # will not match the thumbnail — confirm that they are normalised.
                with open(output / (str(photo["id"]) + ".txt"), "w") as annotation_file:
                    for region in photo["regions"]:
                        # The first category (in `base_types` order) containing the
                        # region's object decides the class.
                        klass = -1
                        for base_type, members in base_types.items():
                            if region["object"] in members:
                                klass = class_mapping[base_type]
                                break
                        if klass == -1:
                            continue

                        # Skip regions whose shape is not understood instead of reusing
                        # the previous region's box (or failing on the very first one).
                        box = _bounding_box(region)
                        if box is None:
                            continue
                        cx, cy, w, h = box

                        annotation_file.write(f"{klass} {cx} {cy} {w} {h}\n")
        
            108
            
        
            109