Gather some statistics about the datasets, vocabularies and codelists loaded into PMD.

In [1]:
from SPARQLWrapper import SPARQLWrapper2
import pandas as pd
from IPython.display import HTML

# SPARQL endpoint that every query in this notebook is sent to.
endpoint = "https://production-drafter-ons-alpha.publishmydata.com/v1/sparql/live"
# Shared query client; cells below call setQuery() on it and read the rows
# back from query().bindings.
sparql = SPARQLWrapper2(endpoint)

Find the distinct datasets, together with each dataset's label, graph and (where present) family.

In [2]:
# Select every qb:DataSet with its label and PMD graph; the family label is
# optional, so some rows come back without a ?family binding.
sparql.setQuery("""
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX qb:   <http://purl.org/linked-data/cube#>
PREFIX pmd:  <http://publishmydata.com/def/dataset#>
PREFIX gdp:  <http://gss-data.org.uk/def/gdp#>

SELECT DISTINCT ?dataset ?datasetLabel ?graph ?family
WHERE {
  ?dataset a qb:DataSet ;
         rdfs:label ?datasetLabel ;
         pmd:graph ?graph .
  OPTIONAL {
    ?dataset gdp:family [rdfs:label ?family]
  }
}
""")

results = sparql.query().bindings

# One (label, graph, family) record per dataset URI, gathered in a single pass.
# If a dataset URI occurs in several rows, the last row wins.
# Rows with no family binding fall back to 'Trade'.
dataset_records = {
    row['dataset'].value: (
        row['datasetLabel'].value,
        row['graph'].value,
        row['family'].value if 'family' in row else 'Trade',
    )
    for row in results
}

# Frame indexed by dataset URI.
table = pd.DataFrame.from_dict(
    dataset_records,
    orient='index',
    columns=['Label', 'Graph', 'Family'],
)
table

Unnamed: 0,Label,Graph,Family
http://gss-data.org.uk/data/ons-pink-book-chapter-3,ONS Pink Book Chapter 3,http://gss-data.org.uk/graph/ons-pink-book-cha...,Trade
http://gss-data.org.uk/data/ons-bop-individual-country-data,ONS BoP Individual Country Data,http://gss-data.org.uk/graph/ons-bop-individua...,Trade
http://gss-data.org.uk/data/gss_data/migration/nisra-ni-migration-estimates,Mid-Year Population Estimates,http://gss-data.org.uk/graph/gss_data/migratio...,Migration
http://gss-data.org.uk/data/gss_data/migration/scotland-overseas,Migration between Scotland and Overseas,http://gss-data.org.uk/graph/gss_data/migratio...,Migration
http://gss-data.org.uk/data/gss_data/migration/dwp-nin-registrations-to-overseas-nationals,National Insurance number allocations to adult...,http://gss-data.org.uk/graph/gss_data/migratio...,Migration
http://gss-data.org.uk/data/gss_data/migration/ons-local-area-migration-indicators,"Local area migration indicators, UK",http://gss-data.org.uk/graph/gss_data/migratio...,Migration
http://gss-data.org.uk/data/gss_data/migration/ons-ltim-age-and-sex,"Long-term international migration 2.07, age an...",http://gss-data.org.uk/graph/gss_data/migratio...,Migration
http://gss-data.org.uk/data/gss_data/migration/ons-ltim-uk-destination-or-origin,"Long-term international migration 2.06, area o...",http://gss-data.org.uk/graph/gss_data/migratio...,Migration
http://gss-data.org.uk/data/gss_data/migration/ons-ltim-citizenship,"Long-term international migration 2.01a, citiz...",http://gss-data.org.uk/graph/gss_data/migratio...,Migration
http://gss-data.org.uk/data/gss_data/migration/ons-ltim-country-of-residence,"Long-term international migration 2.02, countr...",http://gss-data.org.uk/graph/gss_data/migratio...,Migration


Find the size of each graph, measured as the number of triples it contains.

In [3]:
# Count the triples held in each named graph.
sparql.setQuery("""
SELECT (COUNT(*) as ?size) ?graph
WHERE {
  GRAPH ?graph {
    ?s ?p ?o
  }
} GROUP BY ?graph
""")

# Map graph URI -> triple count; the count arrives as a string and is
# converted to int here.
triples_by_graph = {
    row['graph'].value: int(row['size'].value)
    for row in sparql.query().bindings
}

# Single-column frame indexed by graph URI.
sizes = pd.DataFrame({'Triples': pd.Series(triples_by_graph)})
sizes

Unnamed: 0,Triples
http://purl.org/dc/terms/,866
http://gss-data.org.uk/graph/semstats/cpav2008-cpav21,28071
http://gss-data.org.uk/graph/semstats/cpav2008,47707
http://gss-data.org.uk/graph/semstats/cpav21,44275
http://gss-data.org.uk/graph/semstats/cpcv11,29269
http://gss-data.org.uk/graph/semstats/cpcv2,44159
http://gss-data.org.uk/graph/semstats/cpcv21,36837
http://gss-data.org.uk/graph/semstats/cpcv11-cpcv2,15202
http://gss-data.org.uk/graph/semstats/cpcv2-cpcv21,14788
http://gss-data.org.uk/graph/semstats/isicr31,5438


In [4]:
# Attach the triple count of each dataset's graph by matching the 'Graph'
# column against the index of `sizes`. This is an inner join (the pandas
# default), so a dataset whose graph has no row in `sizes` is dropped.
# 'Graph' is only needed for the join and is removed afterwards.
table = (
    table
    .merge(sizes, how='inner', left_on='Graph', right_index=True)
    .drop(columns=['Graph'])
)
table

Unnamed: 0,Label,Family,Triples
http://gss-data.org.uk/data/ons-pink-book-chapter-3,ONS Pink Book Chapter 3,Trade,150770
http://gss-data.org.uk/data/ons-bop-individual-country-data,ONS BoP Individual Country Data,Trade,1777303
http://gss-data.org.uk/data/gss_data/migration/nisra-ni-migration-estimates,Mid-Year Population Estimates,Migration,194613
http://gss-data.org.uk/data/gss_data/migration/scotland-overseas,Migration between Scotland and Overseas,Migration,219560
http://gss-data.org.uk/data/gss_data/migration/dwp-nin-registrations-to-overseas-nationals,National Insurance number allocations to adult...,Migration,53874
http://gss-data.org.uk/data/gss_data/migration/ons-local-area-migration-indicators,"Local area migration indicators, UK",Migration,153643
http://gss-data.org.uk/data/gss_data/migration/ons-ltim-age-and-sex,"Long-term international migration 2.07, age an...",Migration,35264
http://gss-data.org.uk/data/gss_data/migration/ons-ltim-uk-destination-or-origin,"Long-term international migration 2.06, area o...",Migration,13223
http://gss-data.org.uk/data/gss_data/migration/ons-ltim-citizenship,"Long-term international migration 2.01a, citiz...",Migration,20269
http://gss-data.org.uk/data/gss_data/migration/ons-ltim-country-of-residence,"Long-term international migration 2.02, countr...",Migration,17579


In [5]:
from urllib.parse import urlencode
def gss_url(uri):
    """Return the gss-data.org.uk resource URL for ``uri``.

    ``uri`` is passed as the URL-encoded ``uri`` query parameter.
    """
    base = 'http://gss-data.org.uk/resource?'
    query_string = urlencode([('uri', uri)])
    return base + query_string

from html import escape as html_escape

# Turn each dataset into a link to its resource page. The table is rendered
# with escape=False so the anchors survive, which means pandas will not escape
# anything for us: the label (e.g. "County & UA ...") and the href are escaped
# here instead. A list comprehension is used rather than apply(axis=1) so an
# empty table yields an empty column instead of misbehaving.
table['Dataset'] = [
    f'<a href="{html_escape(gss_url(uri))}">{html_escape(str(label))}</a>'
    for uri, label in zip(table.index, table['Label'])
]

# Keep only the reporting columns (this drops 'Label'), order the rows by
# family then dataset link, and use both as the row index.
table = (
    table[['Dataset', 'Family', 'Triples']]
    .sort_values(by=['Family', 'Dataset'])
    .set_index(['Family', 'Dataset'])
)

# Do not truncate long cell contents, otherwise the anchors would be cut off.
# None is the supported "no limit" value; the old -1 sentinel is deprecated
# and rejected by newer pandas.
pd.set_option('display.max_colwidth', None)

# Render once; write the same markup to disk and show it inline.
stats_html = table.to_html(escape=False, index=True)
# Explicit encoding so non-ASCII labels are written the same on every platform.
with open('dataset-stats.html', 'w', encoding='utf-8') as f:
    f.write(stats_html)
HTML(stats_html)

Unnamed: 0_level_0,Unnamed: 1_level_0,Triples
Family,Dataset,Unnamed: 2_level_1
Disability,Family Resources Survey: financial year 2017/18,2657
Disability,"Children living in long-term workless households, by disability status (Table H)",917
Disability,A08: Labour market status of disabled people,61872
Disability,Co-occurring substance misuse and mental health issues: Country of birth,732
Disability,Co-occurring substance misuse and mental health issues: County & UA deprivation deciles in England (IMD2010),3503
Disability,"Co-occurring substance misuse and mental health issues: County & UA deprivation deciles in England (IMD2015, 419 geog)",8964
Disability,"Co-occurring substance misuse and mental health issues: County & UA deprivation deciles in England (IMD2019, 419 geog)",17376
Disability,Co-occurring substance misuse and mental health issues: County & UA (pre Apr2019) deprivation deciles in England (IMD2015),38862
Disability,Co-occurring substance misuse and mental health issues: District & UA deprivation deciles in England (IMD2010),2059
Disability,"Co-occurring substance misuse and mental health issues: District & UA deprivation deciles in England (IMD2015, 419 geog)",2783
