-
Notifications
You must be signed in to change notification settings - Fork 19
Expand file tree
/
Copy pathckan.py
More file actions
114 lines (90 loc) · 4.58 KB
/
ckan.py
File metadata and controls
114 lines (90 loc) · 4.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from processor import Processor
from loguru import logger
import time
class ProcessorCKAN(Processor):
    """Harvests dataset metadata from a CKAN portal via its v3 action API."""

    def __init__(self):
        # Register this processor under the "ckan" portal type.
        super().__init__(type="ckan")

    @staticmethod
    def _resource_size(resource):
        """Best-available file size in bytes for a CKAN resource dict.

        Prefers the archiver extension's measured size, falls back to the
        resource's own "size" field, else 0.
        """
        if "archiver" in resource and "size" in resource["archiver"]:
            return resource["archiver"]["size"]
        if "size" in resource:
            return resource["size"]
        return 0

    @staticmethod
    def _resource_format(resource):
        """Best-available file format string for a CKAN resource dict.

        Checks fields in priority order: the native "format", the qa
        extension, the legacy "resource:format", a service type, and
        finally the WFS flag. Returns "" when nothing matches.
        """
        if resource["format"]:
            return resource["format"]
        if "qa" in resource and "format" in resource["qa"]:
            return resource["qa"]["format"]
        if "resource:format" in resource:
            return resource["resource:format"]
        if "service_type" in resource:
            return resource["service_type"]
        if "is_wfs" in resource and resource["is_wfs"] == "yes":
            return "WFS"
        return ""

    def get_datasets(self, portal_owner, start_url, fname):
        """Fetch every dataset on a CKAN portal and write one CSV row per resource.

        Args:
            portal_owner: Fallback owner name used when a dataset has no
                organization title of its own.
            start_url: Base URL of the CKAN portal, with or without a
                trailing "/".
            fname: Output filename handed to write_csv.
        """
        logger.info("Processing {}", start_url)
        url = start_url
        # Normalise the base URL so API paths can be appended directly.
        if not url.endswith("/"):
            url += "/"
        datasets = self.get_json(f"{url}api/3/action/package_list")
        if datasets == "NULL":
            # get_json's sentinel for a failed request: nothing to do.
            return
        logger.info("Found {} datasets", len(datasets["result"]))
        prepped = []
        for dataset_name in datasets["result"]:
            # Rate limit us a little to avoid abusing the API
            time.sleep(1)
            # NOTE: url already ends with "/"; the previous f"{url}/api/..."
            # produced a double slash in the request path.
            dataset_metadata = self.get_json(
                f"{url}api/3/action/package_show?id={dataset_name}"
            )
            try:
                logger.info(
                    "Got {} with success status: {}",
                    dataset_name,
                    dataset_metadata["success"],
                )
            except (TypeError, KeyError):
                # get_json returned the "NULL" sentinel (string indexing
                # raises TypeError) or a payload without "success".
                logger.warning(
                    "Failed to get metadata for {}. Skipping...", dataset_name
                )
                continue
            dataset_metadata = dataset_metadata["result"]
            # Use the dataset's own organization title if present, else the
            # owner of the portal.
            if (
                "organization" in dataset_metadata
                and "title" in dataset_metadata["organization"]
            ):
                owner = dataset_metadata["organization"]["title"]
            else:
                owner = portal_owner
            # TEMP FIX: PHS uses CKAN org objects as categories for some reason,
            # overwrite them with PHS until we can make an org filtering system
            if portal_owner == "Public Health Scotland":
                owner = portal_owner
            # Tags and description are per-dataset, so compute them once
            # here instead of once per resource.
            tags = [tag["name"] for tag in dataset_metadata["tags"]]
            description = dataset_metadata["notes"]
            # TEMP FIX: PHS, Dundee and Stirling have some unicode chars that
            # break the CSV. Long term we will sort this by using JSON
            if portal_owner in (
                "Public Health Scotland",
                "Dundee City Council",
                "Stirling Council",
            ):
                description = description.encode("unicode_escape").decode()
            for resource in dataset_metadata["resources"]:
                prepped.append(
                    [
                        dataset_metadata["title"],  # Title
                        owner,  # Owner
                        f"{url}dataset/{dataset_name}",  # PageURL
                        resource["url"],  # AssetURL
                        resource["name"],  # FileName
                        dataset_metadata["metadata_created"],  # DateCreated
                        dataset_metadata["metadata_modified"],  # DateUpdated
                        self._resource_size(resource),  # FileSize
                        "B",  # FileSizeUnit
                        self._resource_format(resource),  # FileType
                        None,  # NumRecords
                        ";".join(tags),  # OriginalTags
                        None,  # ManualTags
                        dataset_metadata["license_title"],  # License
                        description,  # Description
                    ]
                )
        self.write_csv(fname, prepped)
# Module-level singleton: get_datasets above calls get_json/write_csv through
# this global name, so it must exist at import time, not only under __main__.
processor = ProcessorCKAN()
# Run the full harvesting pipeline when executed as a script.
if __name__ == "__main__":
    processor.process()