Newer
Older
DataReport / combined / data_report.py
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.3.4
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

from airtable import Airtable
from jenkins import Jenkins, JenkinsException


# +
from os import environ

# Jenkins and Airtable settings are read from the environment in one pass;
# any variable that is not set comes back as None instead of raising.
JENKINS_USER, JENKINS_TOKEN, AIRTABLE_TOKEN, AIRTABLE_BASE = map(
    environ.get,
    ('JENKINS_USER', 'JENKINS_TOKEN', 'AIRTABLE_TOKEN', 'AIRTABLE_BASE'),
)

from collections import namedtuple

# Record type with one slot per Airtable table used by this report.
COGS = namedtuple(
    'Airtable',
    ['sources', 'families', 'superfamilies', 'producers', 'types'],
)

# Fetch every table in full and index its rows as {record id: fields}.
# The mapping pairs each COGS field with the Airtable table that feeds it;
# tables are requested in the order listed.
at = COGS(**{
    field: {
        row['id']: row['fields']
        for row in Airtable(
            AIRTABLE_BASE, table_name, api_key=AIRTABLE_TOKEN).get_all()
    }
    for field, table_name in {
        'sources': 'Source Data',
        'families': 'Family',
        'superfamilies': 'Superfamily',
        'producers': 'Dataset Producer',
        'types': 'Type',
    }.items()
})

# +
import pandas as pd

# Build one row per source dataset: [superfamily name, family name, title, stage].
datasets = []

for source in at.sources.values():
    if 'Family' not in source:
        # Sources that are not linked to a family are left out of the report.
        continue

    # Only the first linked family is used.
    family = at.families[source['Family'][0]]

    # Airtable omits empty fields from its records, so a family with no linked
    # superfamily simply has no 'Superfamily' key.  Report it under an empty
    # superfamily name (the same fallback used for a missing 'Stage') instead
    # of aborting the whole report with a KeyError.
    superfamily_ids = family.get('Superfamily')
    if superfamily_ids:
        superfamily_name = at.superfamilies[superfamily_ids[0]]['Name']
    else:
        superfamily_name = ''

    datasets.append([
        superfamily_name,
        family['Name'],
        source['Name'],
        source.get('Stage', ''),
    ])

table = pd.DataFrame(datasets, columns=('Superfamily', 'Family', 'Title', 'Stage'))
table
# -

# Collapse the per-source rows into counts: for every (Superfamily, Family)
# pair, how many sources are at each Stage.  The tally is stored in a 'Count'
# column and the result replaces `table`.
table = (
    table
    .groupby(['Superfamily', 'Family'])['Stage']
    .value_counts()
    .reset_index(name='Count')
)
table

# Stacked bar chart: one bar per family, one coloured segment per stage.
(
    table
    # Add the counts up explicitly.  pivot_table averages by default, which
    # would under-report a family name that occurs under several superfamilies.
    .pivot_table(index='Family', columns='Stage', values='Count',
                 aggfunc='sum', fill_value=0)
    # Fix the column order so each stage always gets the same colour.  Unlike
    # .loc with a list of labels, reindex does not raise KeyError when a stage
    # currently has no sources; it adds that column filled with zeros.  Any
    # other stage value (including the empty string) is dropped from the plot.
    .reindex(columns=['Backlog', 'Candidate', 'Prioritized', 'Published'],
             fill_value=0)
    .plot.bar(stacked=True, figsize=(10, 7), color=[
        'xkcd:burnt orange', 'xkcd:denim', 'xkcd:sky blue', 'xkcd:pastel green'])
)