diff --git a/Jenkinsfile b/Jenkinsfile
new file mode 100644
index 0000000..4bb2409
--- /dev/null
+++ b/Jenkinsfile
@@ -0,0 +1,30 @@
+pipeline {
+    agent {
+        label 'master'
+    }
+    stages {
+        stage('Clean') {
+            steps {
+                sh 'rm -rf out'
+                sh 'mkdir out'
+            }
+        }
+        stage('Family scope report') {
+            agent {
+                docker {
+                    image 'cloudfluff/databaker-docker'
+                    reuseNode true
+                }
+            }
+            steps {
+                sh 'cd family-scope && jupyter-nbconvert --to python --stdout family-scope.ipynb | ipython'
+                sh 'cp family-scope/dataset-dimensions.html out/'
+            }
+        }
+    }
+    post {
+        always {
+            archiveArtifacts 'out/*'
+        }
+    }
+}
diff --git a/family-scope/dataset-structure.sparql b/family-scope/dataset-structure.sparql
new file mode 100644
index 0000000..80bf0c9
--- /dev/null
+++ b/family-scope/dataset-structure.sparql
@@ -0,0 +1,23 @@
+PREFIX qb: <http://purl.org/linked-data/cube#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
+PREFIX pmdqb: <http://publishmydata.com/def/qb/>
+
+SELECT ?dataset ?dataset_uri ?component_spec_uri ?dimension_uri ?dimension ?codelist ?codelist_uri
+WHERE {
+    ?dataset_uri
+        a qb:DataSet;
+        rdfs:label ?dataset;
+        qb:structure/qb:component ?component_spec_uri .
+
+    ?component_spec_uri
+        qb:dimension ?dimension_uri .
+
+    ?dimension_uri rdfs:label ?dimension .
+
+    ?component_spec_uri pmdqb:codesUsed/skos:member/skos:inScheme ?codelist_uri .
+
+    ?codelist_uri rdfs:label ?codelist .
+}
+GROUP BY ?dataset ?dataset_uri ?component_spec_uri ?dimension ?dimension_uri ?codelist ?codelist_uri
+ORDER BY ?dataset_uri ?component_spec_uri ?dimension_uri ?codelist_uri
diff --git a/family-scope/family-scope.ipynb b/family-scope/family-scope.ipynb
new file mode 100644
index 0000000..697ee61
--- /dev/null
+++ b/family-scope/family-scope.ipynb
@@ -0,0 +1,334 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Investigating the coverage of the trade datasets family\n",
+    "## Set up the SPARQL API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from SPARQLWrapper import SPARQLWrapper2\n",
+    "import pandas as pd\n",
+    "from IPython.display import HTML\n",
+    "\n",
+    "endpoint = \"https://production-drafter-ons-alpha.publishmydata.com/v1/sparql/live\"\n",
+    "sparql = SPARQLWrapper2(endpoint)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Retrieve Data Structure Definitions\n",
+    "This retrieves the codelist(s) used by each dimension of each dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open('./dataset-structure.sparql', 'r') as query_file:\n",
+    "    query_str = query_file.read()\n",
+    "\n",
+    "sparql.setQuery(query_str)\n",
+    "\n",
+    "results = pd.DataFrame(sparql.query().bindings)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create URI presenters\n",
+    "We can create presenters that render each URI as an HTML link to the resource (i.e. the URI itself), with its `rdfs:label` used as the link text."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def uri_to_label(uri_column,label_column):\n",
+    "    return(dict(zip(results[uri_column].map(lambda x: x.value),\n",
+    "                    results[label_column].map(lambda x: x.value))))\n",
+    "\n",
+    "labeller = {**uri_to_label('dataset_uri','dataset'),\n",
+    "            **uri_to_label('dimension_uri','dimension'),\n",
+    "            **uri_to_label('codelist_uri','codelist')}\n",
+    "\n",
+    "def presenter(uri):\n",
+    "    return(f'<a href=\"{uri}\">{labeller[uri]}</a> ')\n",
+    "\n",
+    "def lister(presenters):\n",
+    "    return(presenters.str.join(''))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Present Codelists in a matrix"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Prepare a matrix of datasets vs dimensions, with each cell summarising the codelists being used."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "codelists = results[['dataset_uri','dimension_uri','codelist_uri']].applymap(lambda x: presenter(x.value))\n",
+    "codelists_pivot = codelists.pivot_table(index='dataset_uri',columns='dimension_uri',values='codelist_uri',aggfunc=lister)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Sort the matrix such that the most commonly re-used dimensions are on the left-hand side."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr><th>dimension_uri</th><th>Flow</th><th>Product</th><th>Trade Partner Geography</th><th>Seasonal Adjustment</th><th>Price Classification</th><th>Employment</th><th>Age of Business</th><th>Turnover</th><th>Trade Reporter Geography</th><th>International Trade Basis</th><th>Export and Import Activity</th><th>Country of Ownership</th></tr>\n",
+       "    <tr><th>dataset_uri</th><th></th><th></th><th></th><th></th><th></th><th></th><th></th><th></th><th></th><th></th><th></th><th></th></tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr><th>HMRC Overseas Trade Statistics</th><td>Flow Directions</td><td>[Combined Nomenclature, 2012, Combined Nomenclature, 2013, Combined Nomenclature, 2014, Combined Nomenclature, 2015, Combined Nomenclature, 2016]</td><td>HMRC Geographies</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr>\n",
+       "    <tr><th>HMRC Regional Trade Statistics</th><td>Flow Directions</td><td>SITC 4</td><td>HMRC Geographies</td><td></td><td></td><td></td><td></td><td></td><td>HMRC Regions</td><td></td><td></td><td></td></tr>\n",
+       "    <tr><th>HMRC UK Trade in Goods Statistics by Business Characteristics 2015</th><td>Flow Directions</td><td>HMRC Industry Groups</td><td></td><td></td><td></td><td>Employment Size Bands</td><td>Age of Business</td><td></td><td></td><td></td><td></td><td></td></tr>\n",
+       "    <tr><th>ONS ABS</th><td></td><td></td><td></td><td></td><td></td><td>Employment Size Bands</td><td>Age of Business</td><td>Turnover Size Bands</td><td></td><td></td><td>Export and Import Activity</td><td>Countries of Ownership</td></tr>\n",
+       "    <tr><th>ONS Balance of Payments</th><td>Flow Directions</td><td>BOP Services</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr>\n",
+       "    <tr><th>ONS BoP Individual Country Data</th><td>Flow Directions</td><td></td><td>ONS Trade Areas CORD</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr>\n",
+       "    <tr><th>ONS CPA</th><td>Flow Directions</td><td>Statistical Classification of Products by Activity in the European Economic Community, 2008 version</td><td>ONS Trade Areas Legacy</td><td>Seasonal Adjustments</td><td>Price Classifications</td><td></td><td></td><td></td><td></td><td>International Trade Basis</td><td></td><td></td></tr>\n",
+       "    <tr><th>ONS Pink Book Chapter 3</th><td>Flow Directions</td><td>Pink Book Services</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr>\n",
+       "    <tr><th>ONS Trade in goods MRETS</th><td>Flow Directions</td><td>CORD SITC</td><td>[ONS Trade Areas CORD, ONS Trade Areas Legacy]</td><td>Seasonal Adjustments</td><td>Price Classifications</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr>\n",
+       "  </tbody>\n",
+       "</table>"
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dimension_counts = codelists_pivot.count('rows').sort_values(ascending=False)\n", + "codelists_pivot = codelists_pivot.reindex(columns=dimension_counts.index).fillna('')\n", + "pd.set_option('max_colwidth', -1) # don't truncate cells\n", + "with open(\"dataset-dimensions.html\", \"w\") as f:\n", + " f.write(codelists_pivot.to_html(escape=False))\n", + "HTML(codelists_pivot.to_html(escape=False))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that although multiple datasets re-use the same common dimensions, they often use their own codelists. We know that there exist correspondances between these (e.g. BPM6 codes can be used to relate BOP and PB definitions) although we've not published the links for those yet." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}