diff --git a/fetch_whitehall.py b/fetch_whitehall.py
new file mode 100755
index 0000000..2ab57cc
--- /dev/null
+++ b/fetch_whitehall.py
@@ -0,0 +1,100 @@
+#!/bin/env python3
+import re
+import time
+from datetime import datetime
+
+from rdflib import Graph, URIRef, Literal, ConjunctiveGraph
+from rdflib.namespace import Namespace, DCTERMS, RDF
+from rdflib.store import NO_STORE
+from requests import Session
+from urllib.parse import urljoin
+
+DCAT = Namespace('http://www.w3.org/ns/dcat#')
+GDP = Namespace('http://gss-data.org.uk/def/gdp#')
+
+ds = ConjunctiveGraph('Sleepycat')
+if ds.open('datasets.db') == NO_STORE:
+    ds.open('datasets.db', create=True)
+print(f'Datasets store has {len(ds)} triples')
+print(f' and {len(list(ds.triples((None, RDF.type, DCAT.Dataset))))} datasets.')
+
+gss = Graph('SPARQLStore', identifier='http://gss-data.org.uk')
+gss.open("http://gss-data.org.uk/sparql")
+
+orgs = {org.label.value: org.org for org in gss.query(
+    """
+PREFIX org: <http://www.w3.org/ns/org#>
+SELECT DISTINCT ?org ?label
+WHERE {
+  ?org a org:Organization ;
+       rdfs:label ?label .
+}""")}
+
+gss.close()
+
+datasets_url_base = 'https://www.gov.uk/government/statistics.json'
+
+s = Session()
+still_going = True
+datasets_url = datasets_url_base
+abbr_re = re.compile(r'<abbr title="([^"]+)">')
+collection_re = re.compile(r'Part of a collection: <a href="([^"]+)">')
+
+
+def fetch_carefully(url):
+    tries = 0
+    holdoff = 5
+    while tries < 10:
+        resp = s.get(url)
+        if resp.status_code == 200:
+            try:
+                return resp.json()
+            except:
+                pass
+        time.sleep(holdoff)
+        tries = tries + 1
+        holdoff = holdoff * 2
+
+
+while still_going:
+    datasets = fetch_carefully(datasets_url)
+    fresh_datasets = False
+    for res_obj in datasets['results']:
+        res = res_obj['result']
+        publisher = None
+        issued = None
+        collection = None
+        abbr_match = abbr_re.match(res['organisations'])
+        if abbr_match:
+            publisher = orgs.get(abbr_match.group(1), None)
+        else:
+            publisher = orgs.get(res['organisations'], None)
+        issued = datetime.fromisoformat(res['public_timestamp'])
+        if 'publication_collections' in res and res['publication_collections'] is not None:
+            coll_match = collection_re.match(res['publication_collections'])
+            if coll_match:
+                collection = coll_match.group(1)
+        landingPage = URIRef(urljoin(datasets_url, res['url']))
+        prev_publish_date = ds.value(subject=landingPage, predicate=DCTERMS.issued, any=False)
+        if True or prev_publish_date is None or prev_publish_date.value != issued:
+            print(f"Updating {res['title']}")
+            fresh_datasets = True
+            ds.set((landingPage, RDF.type, DCAT.Dataset))
+            if res['display_type'] == 'National Statistics':
+                ds.add((landingPage, RDF.type, GDP.NationalStatistics))
+            elif res['display_type'] == 'Official Statistics':
+                ds.add((landingPage, RDF.type, GDP.OfficialStatistics))
+            ds.set((landingPage, DCTERMS.title, Literal(res['title'], lang="en-gb")))
+            if publisher is not None:
+                ds.set((landingPage, DCTERMS.publisher, publisher))
+            if issued is not None:
+                ds.set((landingPage, DCTERMS.issued, Literal(issued)))
+
+    if fresh_datasets and 'next_page_url' in datasets:
+        datasets_url = urljoin(datasets_url, datasets['next_page_url'])
+        still_going = True
+    else:
+        still_going = False
+
+print(f'Datasets store has {len(ds)} triples.')
+ds.close()
diff --git a/to_rdf.py b/to_rdf.py
deleted file mode 100755
index 2ab57cc..0000000
--- a/to_rdf.py
+++ /dev/null
@@ -1,100 +0,0 @@
-#!/bin/env python3
-import re
-import time
-from datetime import datetime
-
-from rdflib import Graph, URIRef, Literal, ConjunctiveGraph
-from rdflib.namespace import Namespace, DCTERMS, RDF
-from rdflib.store import NO_STORE
-from requests import Session
-from urllib.parse import urljoin
-
-DCAT = Namespace('http://www.w3.org/ns/dcat#')
-GDP = Namespace('http://gss-data.org.uk/def/gdp#')
-
-ds = ConjunctiveGraph('Sleepycat')
-if ds.open('datasets.db') == NO_STORE:
-    ds.open('datasets.db', create=True)
-print(f'Datasets store has {len(ds)} triples')
-print(f' and {len(list(ds.triples((None, RDF.type, DCAT.Dataset))))} datasets.')
-
-gss = Graph('SPARQLStore', identifier='http://gss-data.org.uk')
-gss.open("http://gss-data.org.uk/sparql")
-
-orgs = {org.label.value: org.org for org in gss.query(
-    """
-PREFIX org: <http://www.w3.org/ns/org#>
-SELECT DISTINCT ?org ?label
-WHERE {
-  ?org a org:Organization ;
-       rdfs:label ?label .
-}""")}
-
-gss.close()
-
-datasets_url_base = 'https://www.gov.uk/government/statistics.json'
-
-s = Session()
-still_going = True
-datasets_url = datasets_url_base
-abbr_re = re.compile(r'<abbr title="([^"]+)">')
-collection_re = re.compile(r'Part of a collection: <a href="([^"]+)">')
-
-
-def fetch_carefully(url):
-    tries = 0
-    holdoff = 5
-    while tries < 10:
-        resp = s.get(url)
-        if resp.status_code == 200:
-            try:
-                return resp.json()
-            except:
-                pass
-        time.sleep(holdoff)
-        tries = tries + 1
-        holdoff = holdoff * 2
-
-
-while still_going:
-    datasets = fetch_carefully(datasets_url)
-    fresh_datasets = False
-    for res_obj in datasets['results']:
-        res = res_obj['result']
-        publisher = None
-        issued = None
-        collection = None
-        abbr_match = abbr_re.match(res['organisations'])
-        if abbr_match:
-            publisher = orgs.get(abbr_match.group(1), None)
-        else:
-            publisher = orgs.get(res['organisations'], None)
-        issued = datetime.fromisoformat(res['public_timestamp'])
-        if 'publication_collections' in res and res['publication_collections'] is not None:
-            coll_match = collection_re.match(res['publication_collections'])
-            if coll_match:
-                collection = coll_match.group(1)
-        landingPage = URIRef(urljoin(datasets_url, res['url']))
-        prev_publish_date = ds.value(subject=landingPage, predicate=DCTERMS.issued, any=False)
-        if True or prev_publish_date is None or prev_publish_date.value != issued:
-            print(f"Updating {res['title']}")
-            fresh_datasets = True
-            ds.set((landingPage, RDF.type, DCAT.Dataset))
-            if res['display_type'] == 'National Statistics':
-                ds.add((landingPage, RDF.type, GDP.NationalStatistics))
-            elif res['display_type'] == 'Official Statistics':
-                ds.add((landingPage, RDF.type, GDP.OfficialStatistics))
-            ds.set((landingPage, DCTERMS.title, Literal(res['title'], lang="en-gb")))
-            if publisher is not None:
-                ds.set((landingPage, DCTERMS.publisher, publisher))
-            if issued is not None:
-                ds.set((landingPage, DCTERMS.issued, Literal(issued)))
-
-    if fresh_datasets and 'next_page_url' in datasets:
-        datasets_url = urljoin(datasets_url, datasets['next_page_url'])
-        still_going = True
-    else:
-        still_going = False
-
-print(f'Datasets store has {len(ds)} triples.')
-ds.close()
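
A minimal sketch of reading the store back after a run, assuming a datasets.db Sleepycat store already populated by fetch_whitehall.py and an rdflib installation with the Berkeley DB bindings the Sleepycat store requires; the store path and vocabulary terms are the ones used in the script above, the rest is illustrative:

    #!/usr/bin/env python3
    # Sketch: open the Sleepycat store written by fetch_whitehall.py and list
    # the datasets it recorded, with their titles and issue dates.
    from rdflib import ConjunctiveGraph
    from rdflib.namespace import Namespace, DCTERMS, RDF

    DCAT = Namespace('http://www.w3.org/ns/dcat#')

    ds = ConjunctiveGraph('Sleepycat')
    ds.open('datasets.db')  # assumes the store already exists; no create=True

    for dataset in ds.subjects(RDF.type, DCAT.Dataset):
        title = ds.value(dataset, DCTERMS.title)
        issued = ds.value(dataset, DCTERMS.issued)
        print(f'{dataset} | {title} | {issued}')

    ds.close()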