From e871e6f03e36132a6063db9f11ecf15299e920b3 Mon Sep 17 00:00:00 2001
From: relikd
Date: Fri, 8 Apr 2022 20:22:08 +0200
Subject: [PATCH] add examples

---
 .gitattributes                              |   1 +
 examples/podcast-download/generic_rss.py    |  88 ++++++++++
 examples/podcast-download/radiolab.py       | 150 ++++++++++++++++++++
 examples/telegram-send/info-get-chat-id.py  |   6 +
 examples/telegram-send/main-interactive.py  |  61 ++++++++
 examples/telegram-send/main-recurring.py    |  54 +++++++
 examples/web-scraper/README.md              |  35 +++++
 examples/web-scraper/news_vice.py           |  28 ++++
 examples/web-scraper/playground.py          |  20 +++
 examples/web-scraper/shopping_craigslist.py |  45 ++++++
 10 files changed, 488 insertions(+)
 create mode 100644 .gitattributes
 create mode 100755 examples/podcast-download/generic_rss.py
 create mode 100755 examples/podcast-download/radiolab.py
 create mode 100755 examples/telegram-send/info-get-chat-id.py
 create mode 100755 examples/telegram-send/main-interactive.py
 create mode 100755 examples/telegram-send/main-recurring.py
 create mode 100644 examples/web-scraper/README.md
 create mode 100755 examples/web-scraper/news_vice.py
 create mode 100755 examples/web-scraper/playground.py
 create mode 100755 examples/web-scraper/shopping_craigslist.py

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..7dd95dc
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+examples/** linguist-documentation
diff --git a/examples/podcast-download/generic_rss.py b/examples/podcast-download/generic_rss.py
new file mode 100755
index 0000000..84e3cb5
--- /dev/null
+++ b/examples/podcast-download/generic_rss.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+import os
+from sys import stderr
+
+from botlib.cli import Cli
+from botlib.curl import Curl
+from botlib.feed2list import Feed2List
+from botlib.helper import StrFormat, FileWrite
+
+
+def main():
+    cli = Cli()
+    cli.arg_dir('dest_dir', help='Download all entries here')
+    cli.arg('source', help='RSS file or web-url')
+    cli.arg_bool('--dry-run', help='Do not download, just parse')
+    cli.arg_bool('--by-year', help='Place episodes in dest_dir/year/')
+    args = cli.parse()
+
+    try:
+        print('Processing:', args.dest_dir)
+        process(args.source, args.dest_dir,
+                by_year=args.by_year, dry_run=args.dry_run)
+        print('Done.')
+    except Exception as e:
+        print('ERROR: ' + str(e), file=stderr)
+
+
+def process(source, dest_dir, *, by_year=False, dry_run=False):
+    # open source
+    if os.path.isfile(source):
+        fp = open(source)  # closed in Feed2List
+    elif Curl.valid_url(source):
+        fp = Curl.get(source)  # closed in Feed2List
+    else:
+        raise AttributeError('Not a valid file or URL: "{}"'.format(source))
+
+    # process
+    dest = dest_dir
+    for entry in reversed(Feed2List(fp, keys=[
+        'link', 'title', 'description', 'enclosure',  # audio
+        'pubDate', 'media:content',  # image
+        # 'itunes:image', 'itunes:duration', 'itunes:summary'
+    ])):
+        date = entry.get('pubDate')  # try RSS only
+        if by_year:
+            dest = os.path.join(dest_dir, str(date.year))
+            if not dry_run and not os.path.exists(dest):
+                os.mkdir(dest)
+        process_entry(entry, date, dest, dry_run=dry_run)
+    return True
+
+
+def process_entry(entry, date, dest_dir, *, dry_run=False):
+    title = entry['title']
+    #
+    audio_url = entry.get('enclosure', {}).get('url')
+    if not audio_url:
+        print(' ERROR: URL not found for "{}"'.format(title), file=stderr)
+        return
+    #
+    images = entry.get('media:content', [])
+    if not isinstance(images, list):
+        images = [images]
+    maxRes = 0
+    image_url = None
+    for img in images:
+        res = int(img.get('width', 0)) * int(img.get('height', 0))
+        if res > maxRes:
+            maxRes = res
+            image_url = img.get('url')
+    # make request
+    fname = '{} - {}'.format(date.strftime('%Y-%m-%d'),
+                             StrFormat.safe_filename(title))
+    intro = '\ndownloading: ' + fname
+    urllist = [audio_url, image_url] if image_url else [audio_url]
+    flag = Curl.once(dest_dir, fname, urllist, date, override=False,
+                     dry_run=dry_run, verbose=True, intro=intro)
+
+    @FileWrite.once(dest_dir, fname + '.txt', date, override=False,
+                    dry_run=dry_run, verbose=True, intro=flag or intro)
+    def _description():
+        desc = title + '\n' + '=' * len(title)
+        desc += '\n\n' + StrFormat.strip_html(entry.get('description', ''))
+        return desc + '\n\n\n' + entry.get('link', '') + '\n'
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/podcast-download/radiolab.py b/examples/podcast-download/radiolab.py
new file mode 100755
index 0000000..68dca5d
--- /dev/null
+++ b/examples/podcast-download/radiolab.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+import os
+from sys import stderr
+
+from botlib.cli import Cli
+from botlib.curl import Curl, URLError
+from botlib.helper import StrFormat, FileWrite
+from botlib.oncedb import OnceDB
+
+API = 'http://api.wnyc.org/api/v3'
+COHORT = 'radiolab'
+db_ids = OnceDB('radiolab_ids.sqlite')
+db_slugs = OnceDB('radiolab_slugs.sqlite')
+# published-at does not contain timezone info, but is assumed to be EST
+os.environ['TZ'] = 'America/New_York'
+
+
+def main():
+    cli = Cli()
+    cli.arg_dir('dest_dir', help='Download all episodes to dest_dir/year/')
+    cli.arg_bool('--dry-run', help='Do not download, just parse')
+    args = cli.parse()
+
+    try:
+        for title, query in (
+            ('Podcasts', 'radiolab/podcasts'),
+            ('Radio Shows', 'radiolab/radio-shows'),
+            # ('Broadcasts', 'radiolabmoreperfect/radio-broadcasts'),
+        ):
+            processEpisodeList(args.dest_dir, title, query,
+                               dry_run=args.dry_run)
+    except Exception as e:
+        print(' ERROR: ' + str(e), file=stderr)
+        exit(1)
+
+    print('\nDone.\n\nNow check MP3 tags (consistency).')
+
+
+def processEpisodeList(basedir, title, query, index=1, *, dry_run=False):
+    print('\nProcessing: {}'.format(title), end='')
+    dat = Curl.json('{}/channel/shows/{}/{}?limit=9'.format(API, query, index))
+    total = dat['data']['attributes']['total-pages']
+    print(' ({}/{})'.format(index, total))
+    anything_new = False
+    for inc in dat['included']:
+        anything_new |= processEpisode(inc['attributes'], basedir,
+                                       dry_run=dry_run)
+    if anything_new and index < total:
+        processEpisodeList(basedir, title, query, index + 1, dry_run=dry_run)
+
+
+def processEpisode(obj, basedir, *, dry_run=False):
+    uid = obj['cms-pk']
+    if db_ids.contains(COHORT, uid):
+        return False  # Already exists
+
+    title = obj['title'].strip()
+    slug = obj['slug'].strip()
+    # [newsdate] 2009-11-03T00:35:34-05:00  [publish-at] 2009-11-03T00:35:34
+    date_a = StrFormat.to_date(obj['newsdate'].strip())
+    date_b = StrFormat.to_date(obj['publish-at'].strip())
+    date = date_a if date_a.timestamp() <= date_b.timestamp() else date_b
+
+    # create by-year subdir
+    dest_dir = os.path.join(basedir, str(date.year))
+    if not dry_run and not os.path.exists(dest_dir):
+        os.mkdir(dest_dir)
+
+    # make filename and download list
+    fname = '{} - {}'.format(date.strftime('%Y-%m-%d'),
+                             StrFormat.safe_filename(title))
+    urllist = [obj['audio'], obj['video']]
+    urllist = [x for x in urllist if isinstance(x, str) and Curl.valid_url(x)]
+    if not urllist:
+        print('\ndownloading: {} ({}, {})'.format(fname, uid, slug))
+        print(' No downloadable media found.')
+        return False
+    # get image
+    img_url, img_desc = get_img_desc(obj['image-main'])
+    if img_url:
+        urllist.append(img_url)
+    # download files
+    intro = '\ndownloading: {} ({})'.format(fname, uid)
+    flag = Curl.once(dest_dir, fname, urllist, date, override=False,
+                     dry_run=dry_run, verbose=True, intro=intro)
+
+    @FileWrite.once(dest_dir, fname + '.txt', date, override=False,
+                    dry_run=dry_run, verbose=True, intro=flag or intro)
+    def write_description():
+        nonlocal flag
+        flag = True
+        desc = title + '\n' + '=' * len(title)
+        desc += '\n\n' + StrFormat.strip_html(obj['body'])
+        if img_desc:
+            desc += '\n\n' + img_desc
+        return desc + '\n\n\n' + obj['url'].strip() + '\n'  # link to article
+
+    @FileWrite.once(dest_dir, fname + '.transcript.txt', date, override=False,
+                    dry_run=dry_run, verbose=True, intro=flag or intro)
+    def write_transcript():
+        nonlocal flag
+        flag = True
+        data = StrFormat.strip_html(obj['transcript'])
+        return data + '\n' if data else None
+
+    # success! now save state
+    if flag and not dry_run:
+        db_ids.put(COHORT, uid, fname)
+        db_slugs.put(COHORT, uid, slug)
+        print(' SLUG: {}'.format(slug))
+    return flag  # potentially need to query the next page too
+
+
+def get_img_desc(obj):
+    if not obj:
+        return (None, None)
+    url = (obj['url'] or '').strip()
+    if not url:
+        return (None, None)
+    txt = None
+    cred_name = obj['credits-name'].strip()
+    cred_url = obj['credits-url'].strip()
+    if cred_name:
+        txt = 'Image by ' + cred_name
+    if cred_url:
+        if txt:
+            txt += ' @ ' + cred_url
+        else:
+            txt = 'Image source: ' + cred_url
+    return (url, txt)
+
+
+# Individuals taken from Google search
+#  -> inurl:radiolab/segments site:wnycstudios.org
+#  -> inurl:radiolab/episodes site:wnycstudios.org
+# Then regex: /episodes/([^;]*?)" onmousedown
+
+def processSingle(slug, basedir):
+    # cms-pk = 91947 , slug = '91947-do-i-know-you'
+    all_slugs = [s for _, _, _, s in db_slugs]
+    if slug not in all_slugs:
+        print(slug)
+        data = Curl.json('{}/story/{}'.format(API, slug))
+        try:
+            processEpisode(data['data']['attributes'], basedir, dry_run=True)
+        except URLError as e:
+            print(' ERROR: ' + str(e), file=stderr)
+
+
+main()
diff --git a/examples/telegram-send/info-get-chat-id.py b/examples/telegram-send/info-get-chat-id.py
new file mode 100755
index 0000000..4e7dd82
--- /dev/null
+++ b/examples/telegram-send/info-get-chat-id.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python3
+from botlib.tgclient import TGClient
+
+print('open a new telegram chat window with your bot and send /start')
+
+TGClient.listen_chat_info(__API_KEY__, 'my-username')
diff --git a/examples/telegram-send/main-interactive.py b/examples/telegram-send/main-interactive.py
new file mode 100755
index 0000000..1b45e0c
--- /dev/null
+++ b/examples/telegram-send/main-interactive.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+from botlib.tgclient import TGClient
+
+bot = TGClient(__API_KEY__, polling=True, allowedUsers=['my-username'])
+
+
+@bot.message_handler(commands=['hi'])
+def bot_reply(message):
+    if bot.allowed(message):  # only reply to a single user (my-username)
+        bot.reply_to(message, 'Good evening my dear.')
+
+
+@bot.message_handler(commands=['set'])
+def update_config(message):
+    if bot.allowed(message):
+        try:
+            config = data_store.get(message.chat.id)
+        except KeyError:
+            bot.reply_to(message, 'Not found.')
+            return
+
+        if message.text == '/set day':
+            config.param = 'day'
+        elif message.text == '/set night':
+            config.param = 'night'
+        else:
+            bot.reply_to(message, 'Usage: /set [day|night]')
+
+
+@bot.message_handler(commands=['start'])
+def new_chat_info(message):
+    bot.log_chat_info(message.chat)
+    if bot.allowed(message):
+        if data_store.get(message.chat.id):
+            bot.reply_to(message, 'Already exists')
+        else:
+            CreateNew(message)
+
+
+class CreateNew:
+    def __init__(self, message):
+        self.ask_name(message)
+
+    def ask_name(self, message):
+        msg = bot.send_force_reply(message.chat.id, 'Enter Name:')
+        bot.register_next_step_handler(msg, self.ask_interval)
+
+    def ask_interval(self, message):
+        self.name = message.text
+        msg = bot.send_buttons(message.chat.id, 'Update interval (minutes):',
+                               options=[3, 5, 10, 15, 30, 60])
+        bot.register_next_step_handler(msg, self.finish)
+
+    def finish(self, message):
+        try:
+            interval = int(message.text)
+        except ValueError:
+            bot.send_abort_keyboard(message, 'Not a number. Aborting.')
+            return
+        print('Name:', self.name, 'interval:', interval)
+        bot.send_message(message.chat.id, 'done.')
diff --git a/examples/telegram-send/main-recurring.py b/examples/telegram-send/main-recurring.py
new file mode 100755
index 0000000..063ab6e
--- /dev/null
+++ b/examples/telegram-send/main-recurring.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+from botlib.cron import Cron
+from botlib.helper import Log
+from botlib.oncedb import OnceDB
+from botlib.tgclient import TGClient
+# the pipeline process logic is split up:
+# - you can have one file for generating the entries and writing to db (import)
+#   e.g., import an example from web-scraper and call download()
+# - and another file to read db and send its entries to telegram (this file)
+# of course, you can put your download logic inside this file as well
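+#
+# a minimal sketch of what such an import file could look like (hypothetical;
+# no sub_job_a.py ships with this patch, the scraping part is up to you):
+#
+#     from botlib.oncedb import OnceDB
+#
+#     def download(topic, cohort):
+#         uid, text = my_scrape(topic)  # placeholder, e.g. HTML2List + MatchGroup
+#         db = OnceDB('cache.sqlite')
+#         if not db.contains(cohort, uid):
+#             db.put(cohort, uid, text)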
+import sub_job_a as jobA
+import sub_job_b as jobB
+
+cron = Cron()
+bot = TGClient(__API_KEY__, polling=True, allowedUsers=['my-username'])
+bot.set_on_kill(cron.stop)
+
+
+def main():
+    def clean_db(_):
+        Log.info('[clean up]')
+        OnceDB('cache.sqlite').cleanup(limit=150)
+
+    def notify_jobA(_):
+        jobA.download(topic='development', cohort='dev:py')
+        send2telegram(__A_CHAT_ID__)
+
+    def notify_jobB(_):
+        jobB.download()
+        send2telegram(__ANOTHER_CHAT_ID__)
+
+    # Log.info('Ready')
+    cron.add_job(10, notify_jobA)  # every 10 min
+    cron.add_job(30, notify_jobB)  # every 30 min
+    cron.add_job(1440, clean_db)  # daily
+    cron.start()
+    # cron.fire()
+
+
+def send2telegram(chat_id):
+    db = OnceDB('cache.sqlite')
+    # db.mark_all_done()
+
+    def _send(cohort, uid, obj):
+        Log.info('[push] {} {}'.format(cohort, uid))
+        return bot.send(chat_id, obj, parse_mode='HTML',
+                        disable_web_page_preview=True)
+
+    if not db.foreach(_send):
+        # send() sleeps 45 sec (on error), safe to call immediately
+        send2telegram(chat_id)
+
+
+main()
diff --git a/examples/web-scraper/README.md b/examples/web-scraper/README.md
new file mode 100644
index 0000000..4100ac0
--- /dev/null
+++ b/examples/web-scraper/README.md
@@ -0,0 +1,35 @@
+# How-to web scraping
+
+Use the `playground.py` script for quick testing.
+Initially, you have to set `cache_only=False`, otherwise no data is downloaded.
+After the first download, re-enable `cache_only` so you don't have to download the data over and over again.
+Also, when you feel ready, comment out the `break` statement to see if it works for all entries.
+
+## Finding a proper `select`
+
+The hardest part is getting all regex matches right.
+Open the browser devtools and choose the element picker.
+Hover over the first element / row of the data you'd like to retrieve.
+Pick whatever tag or class seems appropriate; also look at neighboring tags.
+The `select` must match all entries but no unnecessary ones.
+Although you can always filter unnecessary ones later...
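+
+A quick sanity check for your pick, using the same calls as `playground.py` (URL and class below are the ones from that script):
+
+```python
+from botlib.curl import Curl
+from botlib.html2list import HTML2List
+
+url = 'https://www.vice.com/en/topic/motherboard'
+rows = HTML2List('.vice-card__content').parse(Curl.get(url, cache_only=True))
+print(len(rows))  # should equal the number of entries visible on the page
+```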
+
+## Finding the regex
+
+The matches for the individual data fields are tricky too.
+Select and right-click on the element you picked above.
+Important: Either edit or copy as raw HTML.
+The devtools will omit whitespace and display `'` as `"`, so you have to make sure you know what you are trying to match.
+
+Now begins the playing-around part.
+The regex will match the first occurrence, so if there are two anchor tags and you need the second one, you have to get creative.
+For example, this is the case in the craigslist example.
+Here I can match the second anchor because it is contained in a `h3` heading.
+
+Try to match as compactly as possible; this makes the regex more robust against source code changes.
+For example, use `<a[^>]*>` to match an opening anchor with arbitrary attributes.
+Some sites will put the `href` immediately after the `<a`, others don't.
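+
+For instance, given this made-up craigslist-like row, anchoring on the `h3` skips the image anchor and captures the title anchor:
+
+```python
+import re
+
+html = '''<li class="result-row">
+  <a href="/post/123"><img src="thumb.jpg"></a>
+  <h3 class="result-heading"><a href="/post/123">Blue bicycle</a></h3>
+</li>'''
+
+print(re.search(r'<h3[^>]*><a[^>]*>([\s\S]*?)</a>', html).group(1))
+# -> Blue bicycle
+```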

diff --git a/examples/web-scraper/news_vice.py b/examples/web-scraper/news_vice.py
new file mode 100755
--- /dev/null
+++ b/examples/web-scraper/news_vice.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+from botlib.curl import Curl
+from botlib.html2list import HTML2List, MatchGroup
+from botlib.oncedb import OnceDB
+
+
+def download(topic='motherboard', cohort='vice'):
+    url = 'https://www.vice.com/en/topic/' + topic
+    db = OnceDB('cache.sqlite')
+    select = '.vice-card__content'
+    match = MatchGroup({
+        'url': r'<a href="([^"]*)"',
+        'title': r'<h3[^>]*><a[^>]*>([\s\S]*?)</a>[\s\S]*?</h3>',
+        'desc': r'<p[^>]*>([\s\S]*?)</p>',
+    })
+    for elem in reversed(HTML2List(select).parse(Curl.get(url))):
+        match.set_html(elem)
+        x_uid = match['url']
+        if not x_uid or db.contains(cohort, x_uid):
+            continue
+        txt = '<a href="{url}">{title}</a>'.format(**match)
+        txt += '\n' + str(match['desc'])
+        if txt:
+            db.put(cohort, x_uid, txt)
+
+
+# download()
diff --git a/examples/web-scraper/playground.py b/examples/web-scraper/playground.py
new file mode 100755
index 0000000..a905bc7
--- /dev/null
+++ b/examples/web-scraper/playground.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+from botlib.curl import Curl
+from botlib.html2list import HTML2List, MatchGroup
+
+URL = 'https://www.vice.com/en/topic/motherboard'
+SOURCE = Curl.get(URL, cache_only=True)
+
+SELECT = '.vice-card__content'
+match = MatchGroup({
+    'url': r'<a href="([^"]*)"',
+    'title': r'<h3[^>]*><a[^>]*>([\s\S]*?)</a>[\s\S]*?</h3>',
+    'desc': r'<p[^>]*>([\s\S]*?)</p>',
+    'wrong-regex': r'<div[^>]*>([\s\S]*?)</div>[\s\S]*?',
+})
+
+for elem in HTML2List(SELECT).parse(SOURCE):
+    match.set_html(elem)
+    print(match.to_dict())
+    break  # comment out to check all entries

diff --git a/examples/web-scraper/shopping_craigslist.py b/examples/web-scraper/shopping_craigslist.py
new file mode 100755
--- /dev/null
+++ b/examples/web-scraper/shopping_craigslist.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+from botlib.curl import Curl
+from botlib.html2list import HTML2List, MatchGroup
+from botlib.oncedb import OnceDB
+
+
+def load(url):
+    return Curl.get(url)
+
+
+def proc(cohort, source, select, match, fn=None):
+    db = OnceDB('cache.sqlite')
+    for elem in reversed(HTML2List(select).parse(source)):
+        match.set_html(elem)
+        x_uid = match['url']
+        if not x_uid or db.contains(cohort, x_uid):
+            continue
+        txt = fn(match) if fn else str(match)
+        if txt:
+            db.put(cohort, x_uid, txt)
+
+
+def download(city, query, cohort='craigslist'):
+    url = 'https://{}.craigslist.org/search/sss?query={}'.format(city, query)
+    select = 'li.result-row'
+    match = MatchGroup({
+        'url': r'<a href="([^"]*)"',
+        'title': r'<h3[^>]*><a[^>]*>([\s\S]*?)</a>',
+        'price': r'<span class="result-price">([\s\S]*?)</span>',
+        'hood': r'<span class="result-hood">([\s\S]*?)</span>',
+    }, lambda match: '''
+<a href="{url}">{title}</a>
+{price}, {hood}'''.format(**match))
+    proc(cohort, load(url), select, match)
+
+    # process another source ...
+    # def fn(match):
+    #     print(match.to_dict())
+    #     return advanced_fn(match)
+    # proc(cohort, load(url), select, match, fn)
+
+
+# download()