Rename, considering we want to fetch lists from other places.
1 parent 1b02e60 commit d3ebc80a2912528068da8920e9c5682305c88859
Alex Tucker authored on 4 Apr 2019
Showing 2 changed files
fetch_whitehall.py 0 → 100755
#!/usr/bin/env python3
import re
import time
from datetime import datetime
from urllib.parse import urljoin

from rdflib import Graph, URIRef, Literal, ConjunctiveGraph
from rdflib.namespace import Namespace, DCTERMS, RDF
from rdflib.store import NO_STORE
from requests import Session
 
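# DCAT vocabulary plus the gss-data.org.uk 'gdp' extension namespace used for statistics types.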
DCAT = Namespace('http://www.w3.org/ns/dcat#')
GDP = Namespace('http://gss-data.org.uk/def/gdp#')
 
# Open (or create on first run) the persistent Sleepycat/Berkeley DB triple store.
ds = ConjunctiveGraph('Sleepycat')
if ds.open('datasets.db') == NO_STORE:
    ds.open('datasets.db', create=True)
print(f'Datasets store has {len(ds)} triples')
print(f' and {len(list(ds.triples((None, RDF.type, DCAT.Dataset))))} datasets.')
 
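# Read-only connection to the gss-data.org.uk SPARQL endpoint, used only to look up organisations.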
gss = Graph('SPARQLStore', identifier='http://gss-data.org.uk')
gss.open("http://gss-data.org.uk/sparql")
 
# Map organisation labels to their URIs so gov.uk publisher names can be resolved.
orgs = {org.label.value: org.org for org in gss.query(
    """
    PREFIX org: <http://www.w3.org/ns/org#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT DISTINCT ?org ?label
    WHERE {
      ?org a org:Organization ;
        rdfs:label ?label .
    }""")}
 
gss.close()
 
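# gov.uk exposes its statistics listing as paginated JSON at this URL.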
datasets_url_base = 'https://www.gov.uk/government/statistics.json'
 
s = Session()
still_going = True
datasets_url = datasets_url_base
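# The gov.uk JSON embeds small HTML fragments; these regexes pull out the
# organisation abbreviation and the "Part of a collection" link.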
abbr_re = re.compile(r'<abbr title="([^"]+)">')
collection_re = re.compile(r'Part of a collection: <a href="([^"]+)">')
 
 
def fetch_carefully(url):
    """Fetch a JSON document, retrying with exponential back-off."""
    tries = 0
    holdoff = 5
    while tries < 10:
        resp = s.get(url)
        if resp.status_code == 200:
            try:
                return resp.json()
            except ValueError:
                pass
        time.sleep(holdoff)
        tries = tries + 1
        holdoff = holdoff * 2
    # Fail loudly after exhausting retries rather than returning None.
    raise RuntimeError(f'Giving up on {url} after {tries} attempts')
 
 
while still_going:
    datasets = fetch_carefully(datasets_url)
    fresh_datasets = False
    for res_obj in datasets['results']:
        res = res_obj['result']
        publisher = None
        issued = None
        collection = None
        # The organisations field is an HTML snippet; prefer the <abbr> title if present.
        abbr_match = abbr_re.match(res['organisations'])
        if abbr_match:
            publisher = orgs.get(abbr_match.group(1), None)
        else:
            publisher = orgs.get(res['organisations'], None)
        issued = datetime.fromisoformat(res['public_timestamp'])
        # Capture the collection URL if present (not stored yet).
        if 'publication_collections' in res and res['publication_collections'] is not None:
            coll_match = collection_re.match(res['publication_collections'])
            if coll_match:
                collection = coll_match.group(1)
        landingPage = URIRef(urljoin(datasets_url, res['url']))
        prev_publish_date = ds.value(subject=landingPage, predicate=DCTERMS.issued, any=False)
        # 'True or' forces every dataset to be refreshed; drop it to update only
        # datasets whose publication date has changed since the last run.
        if True or prev_publish_date is None or prev_publish_date.value != issued:
            print(f"Updating {res['title']}")
            fresh_datasets = True
            ds.set((landingPage, RDF.type, DCAT.Dataset))
            if res['display_type'] == 'National Statistics':
                ds.add((landingPage, RDF.type, GDP.NationalStatistics))
            elif res['display_type'] == 'Official Statistics':
                ds.add((landingPage, RDF.type, GDP.OfficialStatistics))
            ds.set((landingPage, DCTERMS.title, Literal(res['title'], lang="en-gb")))
            if publisher is not None:
                ds.set((landingPage, DCTERMS.publisher, publisher))
            if issued is not None:
                ds.set((landingPage, DCTERMS.issued, Literal(issued)))

    # Follow pagination only while pages are still yielding fresh datasets.
    if fresh_datasets and 'next_page_url' in datasets:
        datasets_url = urljoin(datasets_url, datasets['next_page_url'])
        still_going = True
    else:
        still_going = False
 
print(f'Datasets store has {len(ds)} triples.')
ds.close()
to_rdf.py 100755 → 0