-
Notifications
You must be signed in to change notification settings - Fork 19
Expand file tree
/
Copy pathckan.py
More file actions
114 lines (90 loc) · 4.58 KB
/
ckan.py
File metadata and controls
114 lines (90 loc) · 4.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from processor import Processor
from loguru import logger
import time
class ProcessorCKAN(Processor):
    """Harvests dataset metadata from a CKAN portal via its v3 action API."""

    def __init__(self):
        # Register this processor under the "ckan" portal type.
        super().__init__(type="ckan")

    @staticmethod
    def _resource_size(resource):
        """Best-available file size in bytes for a CKAN resource dict.

        Prefers the archiver extension's measured size, falls back to the
        resource's own "size" field, else 0.
        """
        if "archiver" in resource and "size" in resource["archiver"]:
            return resource["archiver"]["size"]
        if "size" in resource:
            return resource["size"]
        return 0

    @staticmethod
    def _resource_format(resource):
        """Best-available file format string for a CKAN resource dict.

        Checks fields in priority order: the native "format", the qa
        extension, the legacy "resource:format", a service type, and
        finally the WFS flag. Returns "" when nothing matches.
        """
        if resource["format"]:
            return resource["format"]
        if "qa" in resource and "format" in resource["qa"]:
            return resource["qa"]["format"]
        if "resource:format" in resource:
            return resource["resource:format"]
        if "service_type" in resource:
            return resource["service_type"]
        if "is_wfs" in resource and resource["is_wfs"] == "yes":
            return "WFS"
        return ""

    def get_datasets(self, portal_owner, start_url, fname):
        """Fetch every dataset on a CKAN portal and write one CSV row per resource.

        Args:
            portal_owner: Fallback owner name used when a dataset has no
                organization title of its own.
            start_url: Base URL of the CKAN portal, with or without a
                trailing "/".
            fname: Output filename handed to write_csv.
        """
        logger.info("Processing {}", start_url)
        url = start_url
        # Normalise the base URL so API paths can be appended directly.
        if not url.endswith("/"):
            url += "/"
        datasets = self.get_json(f"{url}api/3/action/package_list")
        if datasets == "NULL":
            # get_json's sentinel for a failed request: nothing to do.
            return
        logger.info("Found {} datasets", len(datasets["result"]))
        prepped = []
        for dataset_name in datasets["result"]:
            # Rate limit us a little to avoid abusing the API
            time.sleep(1)
            # NOTE: url already ends with "/"; the previous f"{url}/api/..."
            # produced a double slash in the request path.
            dataset_metadata = self.get_json(
                f"{url}api/3/action/package_show?id={dataset_name}"
            )
            try:
                logger.info(
                    "Got {} with success status: {}",
                    dataset_name,
                    dataset_metadata["success"],
                )
            except (TypeError, KeyError):
                # get_json returned the "NULL" sentinel (string indexing
                # raises TypeError) or a payload without "success".
                logger.warning(
                    "Failed to get metadata for {}. Skipping...", dataset_name
                )
                continue
            dataset_metadata = dataset_metadata["result"]
            # Use the dataset's own organization title if present, else the
            # owner of the portal.
            if (
                "organization" in dataset_metadata
                and "title" in dataset_metadata["organization"]
            ):
                owner = dataset_metadata["organization"]["title"]
            else:
                owner = portal_owner
            # TEMP FIX: PHS uses CKAN org objects as categories for some reason,
            # overwrite them with PHS until we can make an org filtering system
            if portal_owner == "Public Health Scotland":
                owner = portal_owner
            # Tags and description are per-dataset, so compute them once
            # here instead of once per resource.
            tags = [tag["name"] for tag in dataset_metadata["tags"]]
            description = dataset_metadata["notes"]
            # TEMP FIX: PHS, Dundee and Stirling have some unicode chars that
            # break the CSV. Long term we will sort this by using JSON
            if portal_owner in (
                "Public Health Scotland",
                "Dundee City Council",
                "Stirling Council",
            ):
                description = description.encode("unicode_escape").decode()
            for resource in dataset_metadata["resources"]:
                prepped.append(
                    [
                        dataset_metadata["title"],  # Title
                        owner,  # Owner
                        f"{url}dataset/{dataset_name}",  # PageURL
                        resource["url"],  # AssetURL
                        resource["name"],  # FileName
                        dataset_metadata["metadata_created"],  # DateCreated
                        dataset_metadata["metadata_modified"],  # DateUpdated
                        self._resource_size(resource),  # FileSize
                        "B",  # FileSizeUnit
                        self._resource_format(resource),  # FileType
                        None,  # NumRecords
                        ";".join(tags),  # OriginalTags
                        None,  # ManualTags
                        dataset_metadata["license_title"],  # License
                        description,  # Description
                    ]
                )
        self.write_csv(fname, prepped)
# Module-level singleton: get_datasets above calls get_json/write_csv through
# this global name, so it must exist at import time, not only under __main__.
processor = ProcessorCKAN()
# Run the full harvesting pipeline when executed as a script.
if __name__ == "__main__":
    processor.process()