botlib/examples/podcast-download/radiolab.py

#!/usr/bin/env python3
import os
from sys import stderr

from botlib.cli import Cli
from botlib.curl import Curl, URLError
from botlib.helper import StrFormat, FileWrite
from botlib.oncedb import OnceDB

# Base URL of the WNYC API (v3); all request URLs below are built on it.
API = 'http://api.wnyc.org/api/v3'
# Cohort name passed to both OnceDB stores on every lookup and insert.
COHORT = 'radiolab'
# db_ids is queried by episode id (cms-pk) to skip already processed
# episodes; db_slugs stores the slug that belongs to each id.
db_ids = OnceDB('radiolab_ids.sqlite')
db_slugs = OnceDB('radiolab_slugs.sqlite')
# publish-at does not contain timezone info, but is assumed to be EST
# NOTE(review): time.tzset() is not called after changing TZ; whether the
# new zone is picked up without it may be platform dependent -- confirm.
os.environ['TZ'] = 'America/New_York'


def main():
    """Parse the command line and process all configured episode lists.

    Command line: a positional ``dest_dir`` (episodes are stored below
    ``dest_dir/year/``) and an optional ``--dry-run`` switch.

    Any exception raised while processing is reported on stderr and the
    process terminates with exit status 1.
    """
    cli = Cli()
    cli.arg_dir('dest_dir', help='Download all episodes to dest_dir/year/')
    cli.arg_bool('--dry-run', help='Do not download, just parse')
    args = cli.parse()

    try:
        for title, query in (
            ('Podcasts', 'radiolab/podcasts'),
            ('Radio Shows', 'radiolab/radio-shows'),
            # ('Broadcasts', 'radiolabmoreperfect/radio-broadcasts'),
        ):
            processEpisodeList(args.dest_dir, title, query,
                               dry_run=args.dry_run)
    except Exception as e:
        # Top-level boundary: report the failure and stop with a non-zero
        # status. SystemExit is raised directly because the bare `exit()`
        # helper is only injected by the `site` module and is not
        # guaranteed to exist (e.g. `python -S`, frozen executables).
        print('  ERROR: ' + str(e), file=stderr)
        raise SystemExit(1)

    print('\nDone.\n\nNow check MP3 tags (consistency).')


def processEpisodeList(basedir, title, query, index=1, *, dry_run=False):
    """Walk the paginated episode list of one show, starting at page index.

    Each page is fetched from ``{API}/channel/shows/{query}/{page}`` (nine
    entries per page) and every entry in its ``included`` list is handed to
    processEpisode. The next page is only requested if the current page
    produced something new and is not the last one.
    """
    page = index
    while True:
        print('\nProcessing: {}'.format(title), end='')
        url = '{}/channel/shows/{}/{}?limit=9'.format(API, query, page)
        listing = Curl.json(url)
        page_count = listing['data']['attributes']['total-pages']
        print(' ({}/{})'.format(page, page_count))

        found_new = False
        for entry in listing['included']:
            found_new |= processEpisode(entry['attributes'], basedir,
                                        dry_run=dry_run)

        # Stop as soon as a page brought nothing new or was the last page.
        if not (found_new and page < page_count):
            break
        page += 1


def processEpisode(obj, basedir, *, dry_run=False):
    """Download a single episode with its image, description and transcript.

    Args:
        obj: The episode's ``attributes`` mapping from the API response.
            The keys read here are cms-pk, title, slug, newsdate,
            publish-at, audio, video, image-main, body, transcript, url.
        basedir: Root directory; files go into ``basedir/<year>/``.
        dry_run: If true, no directory is created and no state is saved.
            The flag is also forwarded to Curl.once and FileWrite.once.

    Returns:
        False if the episode id is already stored in db_ids or if it has
        no valid audio/video URL. Otherwise the combined flag of
        Curl.once and the two text writers; the caller uses a truthy
        result as "something new, the next page may be needed as well".
    """
    uid = obj['cms-pk']
    if db_ids.contains(COHORT, uid):
        return False  # Already exists

    title = obj['title'].strip()
    slug = obj['slug'].strip()
    # [newsdate] 2009-11-03T00:35:34-05:00 [publish-at] 2009-11-03T00:35:34
    date_a = StrFormat.to_date(obj['newsdate'].strip())
    date_b = StrFormat.to_date(obj['publish-at'].strip())
    # use whichever of the two dates is the earlier point in time
    date = date_a if date_a.timestamp() <= date_b.timestamp() else date_b

    # create by-year subdir
    dest_dir = os.path.join(basedir, str(date.year))
    if not dry_run:
        # exist_ok avoids the check-then-create race of exists() + mkdir()
        # and missing parent directories are created as well.
        os.makedirs(dest_dir, exist_ok=True)

    # make filename and download list
    fname = '{} - {}'.format(date.strftime('%Y-%m-%d'),
                             StrFormat.safe_filename(title))
    urllist = [obj['audio'], obj['video']]
    urllist = [x for x in urllist if isinstance(x, str) and Curl.valid_url(x)]
    if not urllist:
        print('\ndownloading: {} ({}, {})'.format(fname, uid, slug))
        print('  No downloadable media found.')
        return False
    # get image
    img_url, img_desc = get_img_desc(obj['image-main'])
    if img_url:
        urllist.append(img_url)
    # download files
    intro = '\ndownloading: {} ({})'.format(fname, uid)
    flag = Curl.once(dest_dir, fname, urllist, date, override=False,
                     dry_run=dry_run, verbose=True, intro=intro)

    # The two functions below are never called explicitly in this file.
    # FileWrite.once is expected to invoke the decorated function itself
    # when the file has to be written (TODO confirm in botlib.helper);
    # each function then marks the episode as new by setting flag.
    # `flag or intro` is evaluated at decoration time -- presumably so the
    # intro line is not printed again once an earlier step reported
    # something (assumption, verify against FileWrite.once).
    @FileWrite.once(dest_dir, fname + '.txt', date, override=False,
                    dry_run=dry_run, verbose=True, intro=flag or intro)
    def write_description():
        nonlocal flag
        flag = True
        desc = title + '\n' + '=' * len(title)
        desc += '\n\n' + StrFormat.strip_html(obj['body'])
        if img_desc:
            desc += '\n\n' + img_desc
        return desc + '\n\n\n' + obj['url'].strip() + '\n'  # link to article

    @FileWrite.once(dest_dir, fname + '.transcript.txt', date, override=False,
                    dry_run=dry_run, verbose=True, intro=flag or intro)
    def write_transcript():
        nonlocal flag
        flag = True
        data = StrFormat.strip_html(obj['transcript'])
        # None is returned for an empty transcript
        return data + '\n' if data else None

    # success! now save state
    if flag and not dry_run:
        db_ids.put(COHORT, uid, fname)
        db_slugs.put(COHORT, uid, slug)
        print('  SLUG: {}'.format(slug))
    return flag  # potentially need to query the next page too


def get_img_desc(obj):
    """Extract the image URL and a credit line from an image mapping.

    Args:
        obj: Mapping with the keys ``url``, ``credits-name`` and
            ``credits-url``, or a falsy value if there is no image.
            Missing keys and None values are treated like empty strings.

    Returns:
        A tuple ``(url, text)``. ``(None, None)`` if there is no image or
        its URL is empty. ``text`` is None if neither a credit name nor a
        credit URL is present, otherwise one of ``'Image by NAME'``,
        ``'Image by NAME @ URL'`` or ``'Image source: URL'``.
    """
    if not obj:
        return (None, None)
    url = (obj.get('url') or '').strip()
    if not url:
        return (None, None)
    # Credits get the same None-guard as the URL: a null value in the API
    # response must not raise AttributeError on .strip().
    cred_name = (obj.get('credits-name') or '').strip()
    cred_url = (obj.get('credits-url') or '').strip()

    txt = None
    if cred_name:
        txt = 'Image by ' + cred_name
    if cred_url:
        if txt:
            txt += ' @ ' + cred_url
        else:
            txt = 'Image source: ' + cred_url
    return (url, txt)


# Slugs of individual episodes/segments were taken from Google searches:
# -> inurl:radiolab/segments site:wnycstudios.org
# -> inurl:radiolab/episodes site:wnycstudios.org
# Then extracted with the regex:  /episodes/([^;]*?)" onmousedown

def processSingle(slug, basedir):
    """Process one episode given by its slug unless the slug is known.

    All slugs stored in db_slugs are collected first. An unknown slug is
    printed, its story is fetched from ``{API}/story/{slug}`` and the
    attributes are passed to processEpisode with dry_run=True. A URLError
    raised by processEpisode is printed to stderr instead of propagating.
    """
    # cms-pk = 91947 , slug = '91947-do-i-know-you'
    known_slugs = {known for _, _, _, known in db_slugs}
    if slug in known_slugs:
        return

    print(slug)
    story = Curl.json('{}/story/{}'.format(API, slug))
    try:
        processEpisode(story['data']['attributes'], basedir, dry_run=True)
    except URLError as err:
        print('  ERROR: ' + str(err), file=stderr)

# Run the downloader only when executed as a script, so that importing this
# module (e.g. to call processSingle) does not start a full download run.
if __name__ == '__main__':
    main()