# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.3.4
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# Data report (combined/data_report.py): loads the Airtable tables, lists every
# "Source Data" record that is linked to a family, counts the records per
# superfamily / family / stage, and plots the counts per family as a stacked
# bar chart.

# +
# !pip install airtable-python-wrapper
# !pip install python-jenkins
# !pip install matplotlib

from airtable import Airtable
from jenkins import Jenkins, JenkinsException


# +
from collections import namedtuple
from os import environ

# Credentials and the Airtable base id come from the environment;
# environ.get() leaves a value as None when the variable is not set.
JENKINS_USER = environ.get('JENKINS_USER')
JENKINS_TOKEN = environ.get('JENKINS_TOKEN')
AIRTABLE_TOKEN = environ.get('AIRTABLE_TOKEN')
AIRTABLE_BASE = environ.get('AIRTABLE_BASE')

COGS = namedtuple('Airtable', 'sources families superfamilies producers types')

# One {record id: record fields} dict per Airtable table. The table names are
# listed in the same order as the COGS fields they populate.
at = COGS._make([
    {
        record['id']: record['fields']
        for record in Airtable(
            AIRTABLE_BASE, table_name, api_key=AIRTABLE_TOKEN).get_all()
    }
    for table_name in [
        'Source Data', 'Family', 'Superfamily', 'Dataset Producer', 'Type']
])

# +
import pandas as pd

# One row per source dataset: [superfamily name, family name, title, stage].
datasets = []

for source in at.sources.values():
    # Sources that are not linked to any family are left out of the report.
    if 'Family' not in source:
        continue
    # Link fields hold lists of record ids; only the first link is followed.
    family = at.families[source['Family'][0]]
    superfamily = at.superfamilies[family['Superfamily'][0]]
    datasets.append([
        superfamily['Name'],
        family['Name'],
        source['Name'],
        source.get('Stage', ''),  # a source without a stage gets ''
    ])

# Kept under its own name so that re-running the aggregation cell below always
# starts from the per-dataset rows rather than from already-aggregated counts.
dataset_table = pd.DataFrame(
    datasets, columns=('Superfamily', 'Family', 'Title', 'Stage'))
dataset_table

# +
# Number of datasets in each stage, per (superfamily, family).
table = (
    dataset_table
    .groupby(['Superfamily', 'Family'])['Stage']
    .value_counts()
    .reset_index(name='Count')
)
table

# +
# Stages shown in the chart, in stacking order, and the colour used for each.
STAGES = ['Backlog', 'Candidate', 'Prioritized', 'Published']
STAGE_COLOURS = [
    'xkcd:burnt orange', 'xkcd:denim', 'xkcd:sky blue', 'xkcd:pastel green']

(
    table
    # Counts are added up (not averaged) if a family name occurs in more than
    # one superfamily.
    .pivot_table(index='Family', columns='Stage', values='Count',
                 aggfunc='sum', fill_value=0)
    # reindex() rather than .loc[:, STAGES]: a stage with no datasets becomes
    # an all-zero column instead of raising KeyError, and stages outside
    # STAGES (including the '' placeholder) are dropped.
    .reindex(columns=STAGES, fill_value=0)
    .plot.bar(stacked=True, figsize=(10, 7), color=STAGE_COLOURS)
)