add examples
This commit is contained in:
88
examples/podcast-download/generic_rss.py
Executable file
88
examples/podcast-download/generic_rss.py
Executable file
@@ -0,0 +1,88 @@
|
||||
#!/usr/bin/env python3
|
||||
import os
|
||||
from sys import stderr
|
||||
|
||||
from botlib.cli import Cli
|
||||
from botlib.curl import Curl
|
||||
from botlib.feed2list import Feed2List
|
||||
from botlib.helper import StrFormat, FileWrite
|
||||
|
||||
|
||||
def main():
    """Parse command-line arguments and download every entry of an RSS feed.

    Exits with status 1 on failure so shell scripts can detect errors
    (consistent with the sibling radiolab.py script).
    """
    cli = Cli()
    cli.arg_dir('dest_dir', help='Download all entries here')
    cli.arg('source', help='RSS file or web-url')
    cli.arg_bool('--dry-run', help='Do not download, just parse')
    cli.arg_bool('--by-year', help='Place episodes in dest_dir/year/')
    args = cli.parse()

    try:
        print('Processing:', args.dest_dir)
        process(args.source, args.dest_dir,
                by_year=args.by_year, dry_run=args.dry_run)
        print('Done.')
    except Exception as e:
        print('ERROR: ' + str(e), file=stderr)
        exit(1)  # non-zero exit: previously errors still exited with 0
|
||||
|
||||
|
||||
def process(source, dest_dir, *, by_year=False, dry_run=False):
    """Parse the RSS feed at *source* and download all entries to *dest_dir*.

    source   -- path of a local RSS file, or a web URL
    dest_dir -- existing destination directory
    by_year  -- place files in dest_dir/<year>/ subdirectories
    dry_run  -- parse only; create no directories, download nothing
    Raises AttributeError if *source* is neither a file nor a valid URL.
    """
    # open source
    if os.path.isfile(source):
        fp = open(source)  # closed in Feed2List
    elif Curl.valid_url(source):
        fp = Curl.get(source)  # closed in Feed2List
    else:
        raise AttributeError('Not a valid file or URL: "{}"'.format(source))

    # process (reversed: oldest entries first)
    dest = dest_dir
    for entry in reversed(Feed2List(fp, keys=[
        'link', 'title', 'description', 'enclosure',  # audio
        'pubDate', 'media:content',  # image
        # 'itunes:image', 'itunes:duration', 'itunes:summary'
    ])):
        date = entry.get('pubDate')  # try RSS only
        if date is None:
            # without a date we can neither name nor timestamp the files;
            # previously this crashed on date.year / date.strftime below
            print(' ERROR: missing pubDate for "{}"'.format(
                entry.get('title', '?')), file=stderr)
            continue
        if by_year:
            dest = os.path.join(dest_dir, str(date.year))
            if not dry_run:
                # exist_ok avoids the exists()/mkdir() race of the old code
                os.makedirs(dest, exist_ok=True)
        process_entry(entry, date, dest, dry_run=dry_run)
    return True
|
||||
|
||||
|
||||
def process_entry(entry, date, dest_dir, *, dry_run=False):
    """Download one feed entry: its audio, the largest available image,
    and a plain-text description file, all into *dest_dir*."""
    title = entry['title']

    # <enclosure url="*.mp3" length="47216000" type="audio/mpeg"/>
    audio_url = entry.get('enclosure', {}).get('url')
    if not audio_url:
        print(' ERROR: URL not found for "{}"'.format(title), file=stderr)
        return

    # <media:content url="*.jpg" width="300" rel="full_image" height="300" />
    images = entry.get('media:content', [])
    if not isinstance(images, list):
        images = [images]
    # keep the image with the largest pixel area (width * height)
    image_url = None
    best_area = 0
    for candidate in images:
        area = int(candidate.get('width', 0)) * int(candidate.get('height', 0))
        if area > best_area:
            best_area, image_url = area, candidate.get('url')

    # make request
    fname = '{} - {}'.format(date.strftime('%Y-%m-%d'),
                             StrFormat.safe_filename(title))
    intro = '\ndownloading: ' + fname
    urllist = [audio_url]
    if image_url:
        urllist.append(image_url)
    flag = Curl.once(dest_dir, fname, urllist, date, override=False,
                     dry_run=dry_run, verbose=True, intro=intro)

    @FileWrite.once(dest_dir, fname + '.txt', date, override=False,
                    dry_run=dry_run, verbose=True, intro=flag or intro)
    def _description():
        text = title + '\n' + '=' * len(title)
        text += '\n\n' + StrFormat.strip_html(entry.get('description', ''))
        return text + '\n\n\n' + entry.get('link', '') + '\n'
|
||||
|
||||
|
||||
# Script entry point: run only when executed directly, not on import.
if __name__ == '__main__':
    main()
|
||||
150
examples/podcast-download/radiolab.py
Executable file
150
examples/podcast-download/radiolab.py
Executable file
@@ -0,0 +1,150 @@
|
||||
#!/usr/bin/env python3
|
||||
import os
|
||||
from sys import stderr
|
||||
|
||||
from botlib.cli import Cli
|
||||
from botlib.curl import Curl, URLError
|
||||
from botlib.helper import StrFormat, FileWrite
|
||||
from botlib.oncedb import OnceDB
|
||||
|
||||
# WNYC public JSON API endpoint and the show identifier used as cohort key
# in the state databases below.
API = 'http://api.wnyc.org/api/v3'
COHORT = 'radiolab'
# Persistent download state (created at import time): which episode ids and
# slugs have already been fetched successfully.
db_ids = OnceDB('radiolab_ids.sqlite')
db_slugs = OnceDB('radiolab_slugs.sqlite')
# published-at does not contain timezone info, but is assumed to be EST
os.environ['TZ'] = 'America/New_York'
|
||||
|
||||
|
||||
def main():
    """Walk all configured Radiolab channels and download new episodes.

    Exits with status 1 on the first error.
    """
    cli = Cli()
    cli.arg_dir('dest_dir', help='Download all episodes to dest_dir/year/')
    cli.arg_bool('--dry-run', help='Do not download, just parse')
    args = cli.parse()

    channels = (
        ('Podcasts', 'radiolab/podcasts'),
        ('Radio Shows', 'radiolab/radio-shows'),
        # ('Broadcasts', 'radiolabmoreperfect/radio-broadcasts'),
    )
    try:
        for title, query in channels:
            processEpisodeList(args.dest_dir, title, query,
                               dry_run=args.dry_run)
    except Exception as e:
        print(' ERROR: ' + str(e), file=stderr)
        exit(1)

    print('\nDone.\n\nNow check MP3 tags (consistency).')
|
||||
|
||||
|
||||
def processEpisodeList(basedir, title, query, index=1, *, dry_run=False):
    """Fetch page *index* of channel *query* and process every episode on it;
    keep recursing into the next page while new episodes still appear."""
    print('\nProcessing: {}'.format(title), end='')
    page = Curl.json('{}/channel/shows/{}/{}?limit=9'.format(API, query, index))
    total = page['data']['attributes']['total-pages']
    print(' ({}/{})'.format(index, total))
    # materialize the full list first: every episode must be processed for
    # its side effects, so no short-circuit evaluation here
    results = [processEpisode(item['attributes'], basedir, dry_run=dry_run)
               for item in page['included']]
    if any(results) and index < total:
        processEpisodeList(basedir, title, query, index + 1, dry_run=dry_run)
|
||||
|
||||
|
||||
def processEpisode(obj, basedir, *, dry_run=False):
    """Download one episode (media, description, transcript) into a
    by-year subdirectory of *basedir*.

    obj     -- the 'attributes' dict of one API item (cms-pk, title, slug,
               newsdate, publish-at, audio, video, image-main, body,
               transcript, url)
    Returns True if anything was (or would be) downloaded, False if the
    episode is already in the database or has no downloadable media.
    """
    uid = obj['cms-pk']
    if db_ids.contains(COHORT, uid):
        return False  # Already exists

    title = obj['title'].strip()
    slug = obj['slug'].strip()
    # [newsdate] 2009-11-03T00:35:34-05:00 [publish-at] 2009-11-03T00:35:34
    # use the earlier of the two timestamps as the episode date
    date_a = StrFormat.to_date(obj['newsdate'].strip())
    date_b = StrFormat.to_date(obj['publish-at'].strip())
    date = date_a if date_a.timestamp() <= date_b.timestamp() else date_b

    # create by-year subdir
    dest_dir = os.path.join(basedir, str(date.year))
    if not dry_run and not os.path.exists(dest_dir):
        os.mkdir(dest_dir)

    # make filename and download list
    fname = '{} - {}'.format(date.strftime('%Y-%m-%d'),
                             StrFormat.safe_filename(title))
    urllist = [obj['audio'], obj['video']]
    # audio/video may be None or non-string; keep only valid URLs
    urllist = [x for x in urllist if isinstance(x, str) and Curl.valid_url(x)]
    if not urllist:
        print('\ndownloading: {} ({}, {})'.format(fname, uid, slug))
        print(' No downloadable media found.')
        return False
    # get image
    img_url, img_desc = get_img_desc(obj['image-main'])
    if img_url:
        urllist.append(img_url)
    # download files
    intro = '\ndownloading: {} ({})'.format(fname, uid)
    flag = Curl.once(dest_dir, fname, urllist, date, override=False,
                     dry_run=dry_run, verbose=True, intro=intro)

    # NOTE: the decorated writers run at definition time; each sets flag so
    # the state DBs below are updated even when only a text file was new.
    @FileWrite.once(dest_dir, fname + '.txt', date, override=False,
                    dry_run=dry_run, verbose=True, intro=flag or intro)
    def write_description():
        nonlocal flag
        flag = True
        desc = title + '\n' + '=' * len(title)
        desc += '\n\n' + StrFormat.strip_html(obj['body'])
        if img_desc:
            desc += '\n\n' + img_desc
        return desc + '\n\n\n' + obj['url'].strip() + '\n'  # link to article

    @FileWrite.once(dest_dir, fname + '.transcript.txt', date, override=False,
                    dry_run=dry_run, verbose=True, intro=flag or intro)
    def write_transcript():
        nonlocal flag
        flag = True
        data = StrFormat.strip_html(obj['transcript'])
        # returning None tells FileWrite.once to skip the file — TODO confirm
        return data + '\n' if data else None

    # success! now save state
    if flag and not dry_run:
        db_ids.put(COHORT, uid, fname)
        db_slugs.put(COHORT, uid, slug)
        print(' SLUG: {}'.format(slug))
    return flag  # potentially need to query the next page too
|
||||
|
||||
|
||||
def get_img_desc(obj):
    """Extract (url, credit_text) from an 'image-main' API object.

    obj -- image dict from the API, or None/empty
    Returns (None, None) when there is no image or it has no URL;
    credit_text is None when the image carries no credit information.
    """
    if not obj:
        return (None, None)
    url = (obj.get('url') or '').strip()
    if not url:
        return (None, None)
    txt = None
    # credits fields may be missing or None, not only empty strings —
    # the old code only guarded 'url' this way and crashed on None credits
    cred_name = (obj.get('credits-name') or '').strip()
    cred_url = (obj.get('credits-url') or '').strip()
    if cred_name:
        txt = 'Image by ' + cred_name
    if cred_url:
        if txt:
            txt += ' @ ' + cred_url
        else:
            txt = 'Image source: ' + cred_url
    return (url, txt)
|
||||
|
||||
|
||||
# Individuals taken from Google search
|
||||
# -> inurl:radiolab/segments site:wnycstudios.org
|
||||
# -> inurl:radiolab/episodes site:wnycstudios.org
|
||||
# Then regex: /episodes/([^;]*?)" onmousedown
|
||||
|
||||
def processSingle(slug, basedir):
    """Dry-run a single episode identified by *slug*, unless it is
    already recorded in the slug database."""
    # cms-pk = 91947 , slug = '91947-do-i-know-you'
    known = {row[3] for row in db_slugs}
    if slug in known:
        return
    print(slug)
    data = Curl.json('{}/story/{}'.format(API, slug))
    try:
        processEpisode(data['data']['attributes'], basedir, dry_run=True)
    except URLError as e:
        print(' ERROR: ' + str(e), file=stderr)
|
||||
|
||||
|
||||
# Guard the entry point so the module can be imported (e.g. to call
# processSingle) without triggering a full download run; previously main()
# ran unconditionally at import time, unlike generic_rss.py.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user