diff --git a/to_rdf.py b/to_rdf.py
index 5eba8ac..2ab57cc 100755
--- a/to_rdf.py
+++ b/to_rdf.py
@@ -1,13 +1,16 @@
 #!/bin/env python3
+import re
+import time
+from datetime import datetime
 from rdflib import Graph, URIRef, Literal, ConjunctiveGraph
-from rdflib.namespace import Namespace, DCTERMS, RDF, RDFS
+from rdflib.namespace import Namespace, DCTERMS, RDF
 from rdflib.store import NO_STORE
 from requests import Session
-from dateutil.parser import parse
 from urllib.parse import urljoin
 
 DCAT = Namespace('http://www.w3.org/ns/dcat#')
+GDP = Namespace('http://gss-data.org.uk/def/gdp#')
 
 ds = ConjunctiveGraph('Sleepycat')
 if ds.open('datasets.db') == NO_STORE:
@@ -29,47 +32,66 @@ gss.close()
 
-datasets_url_base = 'https://www.gov.uk/government/statistical-data-sets.json'
+datasets_url_base = 'https://www.gov.uk/government/statistics.json'
 
 s = Session()
 still_going = True
-page = 1
 datasets_url = datasets_url_base
+# Both pattern literals are reconstructions (the originals were lost); they
+# assume the fields hold small HTML fragments -- see the note below the diff.
+abbr_re = re.compile(r'<abbr title="([^"]+)">')
+collection_re = re.compile(r'Part of a collection: <a href="[^"]+">([^<]+)</a>')
+
+
+def fetch_carefully(url):
+    tries = 0
+    holdoff = 5
+    while tries < 10:
+        resp = s.get(url)
+        if resp.status_code == 200:
+            try:
+                return resp.json()
+            except ValueError:
+                pass
+        time.sleep(holdoff)
+        tries = tries + 1
+        holdoff = holdoff * 2
+
 
 while still_going:
-    datasets = s.get(datasets_url).json()
+    datasets = fetch_carefully(datasets_url)
     fresh_datasets = False
-    for doc_obj in datasets['documents']:
-        doc = doc_obj['document']
+    for res_obj in datasets['results']:
+        res = res_obj['result']
         publisher = None
         issued = None
-        for md in doc['metadata']:
-            if md['label'] == 'From' and md['id'] == 'organisations':
-                for who in md['labels']:
-                    if who in orgs:
-                        publisher = orgs[who]
-                        break
-            elif md['label'] == 'Published at':
-                issued = parse(md['machine_date']).date()
-        if 'link' in doc:
-            doc_url = urljoin(datasets_url, doc['link'])
-            doc_node = URIRef(doc_url)
-            prev_publish_date = ds.value(subject=doc_node,
-                                         predicate=DCTERMS.issued,
-                                         any=False)
-            if prev_publish_date is None or prev_publish_date.value != issued:
-                print(f"Updating {doc['title']}")
-                fresh_datasets = True
-                ds.set((doc_node, RDF.type, DCAT.Dataset))
-                ds.set((doc_node, RDFS.label, Literal(doc['title'], lang="en-gb")))
-                ds.set((doc_node, RDFS.comment, Literal(doc['summary'], lang="en-gb")))
-                if publisher is not None:
-                    ds.set((doc_node, DCTERMS.publisher, publisher))
-                if issued is not None:
-                    ds.set((doc_node, DCTERMS.issued, Literal(issued)))
-    if fresh_datasets and page < datasets['page_count']:
-        page = page + 1
-        datasets_url = f'{datasets_url_base}?page={page}'
+        collection = None
+        abbr_match = abbr_re.match(res['organisations'])
+        if abbr_match:
+            publisher = orgs.get(abbr_match.group(1), None)
+        else:
+            publisher = orgs.get(res['organisations'], None)
+        issued = datetime.fromisoformat(res['public_timestamp'])
+        if 'publication_collections' in res and res['publication_collections'] is not None:
+            coll_match = collection_re.match(res['publication_collections'])
+            if coll_match:
+                collection = coll_match.group(1)
+        landingPage = URIRef(urljoin(datasets_url, res['url']))
+        prev_publish_date = ds.value(subject=landingPage, predicate=DCTERMS.issued, any=False)
+        if prev_publish_date is None or prev_publish_date.value != issued:
+            print(f"Updating {res['title']}")
+            fresh_datasets = True
+            ds.set((landingPage, RDF.type, DCAT.Dataset))
+            if res['display_type'] == 'National Statistics':
+                ds.add((landingPage, RDF.type, GDP.NationalStatistics))
+            elif res['display_type'] == 'Official Statistics':
+                ds.add((landingPage, RDF.type, GDP.OfficialStatistics))
+            ds.set((landingPage, DCTERMS.title, Literal(res['title'], lang="en-gb")))
+            if publisher is not None:
+                ds.set((landingPage, DCTERMS.publisher, publisher))
+            if issued is not None:
+                ds.set((landingPage, DCTERMS.issued, Literal(issued)))
+
+    if fresh_datasets and 'next_page_url' in datasets:
+        datasets_url = urljoin(datasets_url, datasets['next_page_url'])
         still_going = True
     else:
         still_going = False
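
Note on the two regex literals: the original patterns did not survive, so the ones in the diff are hedged reconstructions. They assume the gov.uk statistics search JSON serialises `organisations` and `publication_collections` as small HTML fragments: an `<abbr>` tag whose `title` attribute carries the full organisation name (matching the keys of `orgs`), and an anchor whose text carries the collection name. A minimal sketch of that assumption, with made-up sample values:

import re

# Reconstructed patterns; the <abbr>/<a> markup shape is an assumption,
# not confirmed against the live API.
abbr_re = re.compile(r'<abbr title="([^"]+)">')
collection_re = re.compile(r'Part of a collection: <a href="[^"]+">([^<]+)</a>')

# Hypothetical sample field values, for illustration only.
org_html = '<abbr title="Office for National Statistics">ONS</abbr>'
coll_html = 'Part of a collection: <a href="/government/collections/example">Example collection</a>'

assert abbr_re.match(org_html).group(1) == 'Office for National Statistics'
assert collection_re.match(coll_html).group(1) == 'Example collection'

If the fields turn out to be plain strings instead, the `else` branch in the loop (`orgs.get(res['organisations'], None)`) already covers the organisation case, and `collection` simply stays None.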