#!/usr/bin/env python3
from rdflib import Graph, URIRef, Literal, ConjunctiveGraph
from rdflib.namespace import Namespace, DCTERMS, RDF, RDFS
from rdflib.store import NO_STORE
from requests import Session
from dateutil.parser import parse
from urllib.parse import urljoin

DCAT = Namespace('http://www.w3.org/ns/dcat#')

# Open (or create) the local Sleepycat-backed store of dataset metadata.
ds = ConjunctiveGraph('Sleepycat')
if ds.open('datasets.db') == NO_STORE:
    ds.open('datasets.db', create=True)

print(f'Datasets store has {len(ds)} triples')
print(f' and {len(list(ds.triples((None, RDF.type, DCAT.Dataset))))} datasets.')

# Fetch the known publishing organisations from the GSS SPARQL endpoint,
# keyed by their label so they can be matched against gov.uk metadata.
gss = Graph('SPARQLStore', identifier='http://gss-data.org.uk')
gss.open('http://gss-data.org.uk/sparql')
orgs = {org.label.value: org.org for org in gss.query("""
    PREFIX org: <http://www.w3.org/ns/org#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT DISTINCT ?org ?label WHERE {
        ?org a org:Organization ;
            rdfs:label ?label .
    }""")}
gss.close()

# Page through the gov.uk statistical data sets listing until a page
# yields no new or updated documents, or we run out of pages.
datasets_url_base = 'https://www.gov.uk/government/statistical-data-sets.json'
s = Session()
still_going = True
page = 1
datasets_url = datasets_url_base

while still_going:
    datasets = s.get(datasets_url).json()
    fresh_datasets = False
    for doc_obj in datasets['documents']:
        doc = doc_obj['document']
        publisher = None
        issued = None
        # Pull the publishing organisation and publication date out of the
        # document's metadata entries.
        for md in doc['metadata']:
            if md['label'] == 'From' and md['id'] == 'organisations':
                for who in md['labels']:
                    if who in orgs:
                        publisher = orgs[who]
                        break
            elif md['label'] == 'Published at':
                issued = parse(md['machine_date']).date()
        if 'link' in doc:
            doc_url = urljoin(datasets_url, doc['link'])
            doc_node = URIRef(doc_url)
            # Only (re)write the dataset's triples if the publication date
            # has changed since we last saw it.
            prev_publish_date = ds.value(subject=doc_node, predicate=DCTERMS.issued, any=False)
            if prev_publish_date is None or prev_publish_date.value != issued:
                print(f"Updating {doc['title']}")
                fresh_datasets = True
                ds.set((doc_node, RDF.type, DCAT.Dataset))
                ds.set((doc_node, RDFS.label, Literal(doc['title'], lang="en-gb")))
                ds.set((doc_node, RDFS.comment, Literal(doc['summary'], lang="en-gb")))
                if publisher is not None:
                    ds.set((doc_node, DCTERMS.publisher, publisher))
                if issued is not None:
                    ds.set((doc_node, DCTERMS.issued, Literal(issued)))
    # Keep paging only while the current page produced updates and more pages remain.
    if fresh_datasets and page < datasets['page_count']:
        page = page + 1
        datasets_url = f'{datasets_url_base}?page={page}'
        still_going = True
    else:
        still_going = False

print(f'Datasets store has {len(ds)} triples.')
ds.close()