#!/usr/bin/env python3
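'''
Incremental Radiolab episode downloader: fetches audio/video, cover image,
description, and transcript from the WNYC v3 API into per-year subdirectories,
remembering finished episodes in two small sqlite databases so reruns only
fetch what is new.
'''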
import os
import time
from sys import stderr
from typing import Dict, Tuple, Optional, Any
from botlib.cli import Cli
from botlib.curl import Curl, URLError
from botlib.helper import StrFormat, FileWrite
from botlib.oncedb import OnceDB
API = 'http://api.wnyc.org/api/v3'
COHORT = 'radiolab'
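# seen-episode state, keyed by (COHORT, uid); lets reruns skip known episodes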
db_ids = OnceDB('radiolab_ids.sqlite')
db_slugs = OnceDB('radiolab_slugs.sqlite')
# published-at does not contain timezone info, but is assumed to be EST
os.environ['TZ'] = 'America/New_York'
time.tzset()  # required for the TZ change to affect local-time conversions (Unix only)


def main() -> None:
''' CLI entry. '''
cli = Cli()
cli.arg_dir('dest_dir', help='Download all episodes to dest_dir/year/')
cli.arg_bool('--dry-run', help='Do not download, just parse')
args = cli.parse()
try:
for title, query in (
('Podcasts', 'radiolab/podcasts'),
('Radio Shows', 'radiolab/radio-shows'),
# ('Broadcasts', 'radiolabmoreperfect/radio-broadcasts'),
):
processEpisodeList(args.dest_dir, title, query,
dry_run=args.dry_run)
except Exception as e:
print(' ERROR: ' + str(e), file=stderr)
        raise SystemExit(1)
    print('\nDone.\n\nNow check MP3 tags (consistency).')


def processEpisodeList(
basedir: str,
title: str,
query: str,
index: int = 1,
*, dry_run: bool = False
) -> None:
''' Parse full podcast category. '''
print('\nProcessing: {}'.format(title), end='')
url = '{}/channel/shows/{}/{}?limit=9'.format(API, query, index)
dat = Curl.json(url) # type: Dict[str, Any]
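    # the listing is paged; 'total-pages' in the response drives the recursion below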
total = dat['data']['attributes']['total-pages']
print(' ({}/{})'.format(index, total))
anything_new = False
for inc in dat['included']:
anything_new |= processEpisode(inc['attributes'], basedir,
dry_run=dry_run)
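    # the feed is assumed newest-first: a page with nothing new means all
    # following pages were fetched on earlier runs, so stop paginating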
if anything_new and index < total:
        processEpisodeList(basedir, title, query, index + 1, dry_run=dry_run)


def processEpisode(
obj: Dict[str, Any],
basedir: str,
*, dry_run: bool = False
) -> bool:
''' Parse a single podcast episode. '''
uid = obj['cms-pk']
if db_ids.contains(COHORT, uid):
return False # Already exists
title = obj['title'].strip()
slug = obj['slug'].strip()
# [newsdate] 2009-11-03T00:35:34-05:00 [publish-at] 2009-11-03T00:35:34
date_a = StrFormat.to_date(obj['newsdate'].strip())
date_b = StrFormat.to_date(obj['publish-at'].strip())
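    # newsdate carries a UTC offset while publish-at does not (see sample above);
    # comparing via timestamp() avoids mixing aware and naive datetimes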
date = date_a if date_a.timestamp() <= date_b.timestamp() else date_b
# create by-year subdir
    dest_dir = os.path.join(basedir, str(date.year))
    if not dry_run:
        os.makedirs(dest_dir, exist_ok=True)
# make filename and download list
fname = '{} - {}'.format(date.strftime('%Y-%m-%d'),
StrFormat.safe_filename(title))
urllist = [obj['audio'], obj['video']]
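    # audio/video fields may be null or not URLs; keep only usable string entries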
urllist = [x for x in urllist if isinstance(x, str) and Curl.valid_url(x)]
if not urllist:
print('\ndownloading: {} ({}, {})'.format(fname, uid, slug))
print(' No downloadable media found.')
return False
# get image
    img_url, img_desc = get_img_desc(obj.get('image-main'))
if img_url:
urllist.append(img_url)
# download files
intro = '\ndownloading: {} ({})'.format(fname, uid)
flag = Curl.once(dest_dir, fname, urllist, date, override=False,
dry_run=dry_run, verbose=True, intro=intro)
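    # FileWrite.once presumably mirrors Curl.once: it runs the decorated
    # function immediately and writes its return value to dest_dir/<name>
    # unless the file already exists; the writers are never called directly.
    # Each writer sets `flag` so the episode is recorded as done below.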
@FileWrite.once(dest_dir, fname + '.txt', date, override=False,
dry_run=dry_run, verbose=True, intro=flag or intro)
def write_description() -> str:
nonlocal flag
flag = True
desc = '{}\n{}\n\n{}'.format(
title, '=' * len(title), StrFormat.strip_html(obj['body']))
if img_desc:
desc += '\n\n' + img_desc
return '{}\n\n\n{}\n'.format(desc, obj['url'].strip()) # article link
@FileWrite.once(dest_dir, fname + '.transcript.txt', date, override=False,
dry_run=dry_run, verbose=True, intro=flag or intro)
def write_transcript() -> Optional[str]:
nonlocal flag
flag = True
data = StrFormat.strip_html(obj['transcript'])
return data + '\n' if data else None
# success! now save state
if flag and not dry_run:
db_ids.put(COHORT, uid, fname)
db_slugs.put(COHORT, uid, slug)
print(' SLUG: {}'.format(slug))
    return flag  # potentially need to query the next page too


def get_img_desc(obj: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
''' Extract image description. '''
if not obj:
return (None, None)
url = (obj['url'] or '').strip()
if not url:
return (None, None)
txt = None
    cred_name = (obj['credits-name'] or '').strip()
    cred_url = (obj['credits-url'] or '').strip()
if cred_name:
txt = 'Image by ' + cred_name
if cred_url:
if txt:
txt += ' @ ' + cred_url
else:
txt = 'Image source: ' + cred_url
    return (url, txt)


# Individual episode slugs taken from Google search
# -> inurl:radiolab/segments site:wnycstudios.org
# -> inurl:radiolab/episodes site:wnycstudios.org
# Then regex: /episodes/([^;]*?)" onmousedown
def processSingle(slug: str, basedir: str) -> None:
''' [internal] process single episode if only the slug is known. '''
# cms-pk = 91947 , slug = '91947-do-i-know-you'
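    # example (run manually, the path is a placeholder):
    #   processSingle('91947-do-i-know-you', '/path/to/dest')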
    all_slugs = [s for _, _, _, s in db_slugs]
if slug not in all_slugs:
print(slug)
        try:
            data = Curl.json('{}/story/{}'.format(API, slug))
processEpisode(data['data']['attributes'], basedir, dry_run=True)
except URLError as e:
print(' ERROR: ' + str(e), file=stderr)


if __name__ == '__main__':
    main()