diff --git a/check_datasets.py b/check_datasets.py index dfb1388..2aea7e1 100644 --- a/check_datasets.py +++ b/check_datasets.py @@ -1,13 +1,20 @@ #!/bin/env python3 +import datetime + from rdflib import ConjunctiveGraph, Graph from rdflib.namespace import RDF, Namespace, DCTERMS DCAT = Namespace('http://www.w3.org/ns/dcat#') +GDP = Namespace('http://gss-data.org.uk/def/gdp#') ds = ConjunctiveGraph('Sleepycat') ds.open('datasets.db') print(f'Datasets store has {len(ds)} triples') -print(f' and {len(list(ds.triples((None, RDF.type, DCAT.Dataset))))} datasets.') +datasets = set(ds.subjects(RDF.type, DCAT.Dataset)) +datasets.update(ds.subjects(RDF.type, GDP.NationalStatistics)) +datasets.update(ds.subjects(RDF.type, GDP.OfficialStatistics)) + +print(f' and {len(datasets)} datasets.') gss = Graph('SPARQLStore', identifier='http://gss-data.org.uk') gss.open("http://gss-data.org.uk/sparql") @@ -22,9 +29,12 @@ dct:issued ?issued . }"""): latest_pub_date = ds.value(subject=dataset.url, predicate=DCTERMS.issued, any=False) + gss_issued = dataset.issued.value + if type(gss_issued) == datetime.datetime: + gss_issued = gss_issued.date() if latest_pub_date is None: print(f"Dataset {dataset.url} not listed in gov.uk statistical datasets.") - elif latest_pub_date != dataset.issued: + elif latest_pub_date.value.date() > gss_issued: print(f"Dataset {dataset.url} has more recent update.") else: print(f"Dataset {dataset.url} is up to date.")