#!/usr/bin/env python3
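"""Harvest the statistical data set listings from gov.uk and record them
as DCAT Dataset metadata in a local, persistent RDF store, linking each
document to a publisher URI taken from gss-data.org.uk."""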

from rdflib import Graph, URIRef, Literal, ConjunctiveGraph
from rdflib.namespace import Namespace, DCTERMS, RDF, RDFS
from rdflib.store import NO_STORE
from requests import Session
from dateutil.parser import parse
from urllib.parse import urljoin

DCAT = Namespace('http://www.w3.org/ns/dcat#')

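# Persistent Sleepycat (BerkeleyDB) store; create it only if it doesn't
# already exist, so later runs can diff against previously stored dates.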
ds = ConjunctiveGraph('Sleepycat')
if ds.open('datasets.db') == NO_STORE:
    ds.open('datasets.db', create=True)
print(f'Datasets store has {len(ds)} triples and '
      f'{len(list(ds.triples((None, RDF.type, DCAT.Dataset))))} datasets.')

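# Query the gss-data.org.uk SPARQL endpoint for known organisations.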
gss = Graph('SPARQLStore', identifier='http://gss-data.org.uk')
gss.open("http://gss-data.org.uk/sparql")

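# Map organisation labels to their URIs so gov.uk's human-readable 'From'
# metadata can be turned into dcterms:publisher links.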
orgs = {row.label.value: row.org for row in gss.query(
    """
PREFIX org: <http://www.w3.org/ns/org#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?org ?label
WHERE {
  ?org a org:Organization ;
    rdfs:label ?label .
}""")}

gss.close()

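# Paginated JSON listing of statistical data sets on gov.uk.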
datasets_url_base = 'https://www.gov.uk/government/statistical-data-sets.json'

s = Session()
still_going = True
page = 1
datasets_url = datasets_url_base

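# Page through the listing, stopping once a page produces no updates (this
# assumes the feed lists recently updated documents first) or we run out
# of pages.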
while still_going:
    datasets = s.get(datasets_url).json()
    fresh_datasets = False
    for doc_obj in datasets['documents']:
        doc = doc_obj['document']
        publisher = None
        issued = None
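        # Scan the document's metadata entries for its publishing
        # organisation and its publication date.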
        for md in doc['metadata']:
            if md['label'] == 'From' and md['id'] == 'organisations':
                for who in md['labels']:
                    if who in orgs:
                        publisher = orgs[who]
                        break
            elif md['label'] == 'Published at':
                issued = parse(md['machine_date']).date()
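        # Documents without a link can't be identified by URI, so skip them.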
        if 'link' in doc:
            doc_url = urljoin(datasets_url, doc['link'])
            doc_node = URIRef(doc_url)
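            # Fetch the issued date recorded on a previous run, if any;
            # any=False raises if the store somehow holds more than one.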
            prev_publish_date = ds.value(subject=doc_node,
                                         predicate=DCTERMS.issued,
                                         any=False)
            if prev_publish_date is None or prev_publish_date.value != issued:
                print(f"Updating {doc['title']}")
                fresh_datasets = True
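                # Graph.set() replaces any existing (subject, predicate)
                # triple, so re-runs update records rather than duplicate them.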
                ds.set((doc_node, RDF.type, DCAT.Dataset))
                ds.set((doc_node, RDFS.label, Literal(doc['title'], lang="en-gb")))
                ds.set((doc_node, RDFS.comment, Literal(doc['summary'], lang="en-gb")))
                if publisher is not None:
                    ds.set((doc_node, DCTERMS.publisher, publisher))
                if issued is not None:
                    ds.set((doc_node, DCTERMS.issued, Literal(issued)))
    if fresh_datasets and page < datasets['page_count']:
        page += 1
        datasets_url = f'{datasets_url_base}?page={page}'
    else:
        still_going = False

print(f'Datasets store has {len(ds)} triples.')
ds.close()