diff --git a/check_datasets.py b/check_datasets.py
new file mode 100644
index 0000000..dfb1388
--- /dev/null
+++ b/check_datasets.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+"""Compare dataset issue dates in the local store against gss-data.org.uk."""
+from rdflib import ConjunctiveGraph, Graph
+from rdflib.namespace import RDF, Namespace, DCTERMS
+
+DCAT = Namespace('http://www.w3.org/ns/dcat#')
+
+# Local Berkeley-DB-backed store, populated by to_rdf.py.
+ds = ConjunctiveGraph('Sleepycat')
+ds.open('datasets.db')
+print(f'Datasets store has {len(ds)} triples')
+print(f' and {len(list(ds.triples((None, RDF.type, DCAT.Dataset))))} datasets.')
+
+# Remote SPARQL endpoint holding the published datasets.
+gss = Graph('SPARQLStore', identifier='http://gss-data.org.uk')
+gss.open("http://gss-data.org.uk/sparql")
+
+for dataset in gss.query("""
+PREFIX dcat: <http://www.w3.org/ns/dcat#>
+PREFIX dct: <http://purl.org/dc/terms/>
+
+SELECT ?id ?url ?issued WHERE {
+    ?id a dcat:Dataset ;
+        dcat:landingPage ?url ;
+        dct:issued ?issued .
+}"""):
+    # any=False makes ds.value raise if the store holds conflicting dates.
+    latest_pub_date = ds.value(subject=dataset.url, predicate=DCTERMS.issued, any=False)
+    if latest_pub_date is None:
+        print(f"Dataset {dataset.url} not listed in gov.uk statistical datasets.")
+    elif latest_pub_date != dataset.issued:
+        print(f"Dataset {dataset.url} has more recent update.")
+    else:
+        print(f"Dataset {dataset.url} is up to date.")
+
+gss.close()
+ds.close()
diff --git a/to_rdf.py b/to_rdf.py
new file mode 100755
index 0000000..5eba8ac
--- /dev/null
+++ b/to_rdf.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+"""Harvest gov.uk statistical data sets into a local RDF store as DCAT datasets."""
+
+from rdflib import Graph, URIRef, Literal, ConjunctiveGraph
+from rdflib.namespace import Namespace, DCTERMS, RDF, RDFS
+from rdflib.store import NO_STORE
+from requests import Session
+from dateutil.parser import parse
+from urllib.parse import urljoin
+
+DCAT = Namespace('http://www.w3.org/ns/dcat#')
+
+# Open the local Berkeley-DB-backed store, creating it on first run.
+ds = ConjunctiveGraph('Sleepycat')
+if ds.open('datasets.db') == NO_STORE:
+    ds.open('datasets.db', create=True)
+print(f'Datasets store has {len(ds)} triples')
+print(f' and {len(list(ds.triples((None, RDF.type, DCAT.Dataset))))} datasets.')
+
+gss = Graph('SPARQLStore', identifier='http://gss-data.org.uk')
+gss.open("http://gss-data.org.uk/sparql")
+
+# Map organisation labels to their URIs so publishers can be linked by name.
+orgs = {org.label.value: org.org for org in gss.query(
+    """
+PREFIX org: <http://www.w3.org/ns/org#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+SELECT DISTINCT ?org ?label
+WHERE {
+    ?org a org:Organization ;
+        rdfs:label ?label .
+}""")}
+
+gss.close()
+
+datasets_url_base = 'https://www.gov.uk/government/statistical-data-sets.json'
+
+s = Session()
+still_going = True
+page = 1
+datasets_url = datasets_url_base
+
+# Page through the gov.uk JSON index until a page yields no new/updated datasets.
+while still_going:
+    datasets = s.get(datasets_url).json()
+    fresh_datasets = False
+    for doc_obj in datasets['documents']:
+        doc = doc_obj['document']
+        publisher = None
+        issued = None
+        for md in doc['metadata']:
+            if md['label'] == 'From' and md['id'] == 'organisations':
+                for who in md['labels']:
+                    if who in orgs:
+                        publisher = orgs[who]
+                        break
+            elif md['label'] == 'Published at':
+                issued = parse(md['machine_date']).date()
+        if 'link' in doc:
+            doc_url = urljoin(datasets_url, doc['link'])
+            doc_node = URIRef(doc_url)
+            prev_publish_date = ds.value(subject=doc_node,
+                                         predicate=DCTERMS.issued,
+                                         any=False)
+            # Only (re)write triples when the dataset is new or republished.
+            if prev_publish_date is None or prev_publish_date.value != issued:
+                print(f"Updating {doc['title']}")
+                fresh_datasets = True
+                ds.set((doc_node, RDF.type, DCAT.Dataset))
+                ds.set((doc_node, RDFS.label, Literal(doc['title'], lang="en-gb")))
+                ds.set((doc_node, RDFS.comment, Literal(doc['summary'], lang="en-gb")))
+                if publisher is not None:
+                    ds.set((doc_node, DCTERMS.publisher, publisher))
+                if issued is not None:
+                    ds.set((doc_node, DCTERMS.issued, Literal(issued)))
+    if fresh_datasets and page < datasets['page_count']:
+        page = page + 1
+        datasets_url = f'{datasets_url_base}?page={page}'
+        still_going = True
+    else:
+        still_going = False
+
+print(f'Datasets store has {len(ds)} triples.')
+ds.close()