Files
botlib/examples/web-scraper/news_vice.py
2022-04-08 20:22:08 +02:00

29 lines
897 B
Python
Executable File

#!/usr/bin/env python3
from botlib.curl import Curl
from botlib.html2list import HTML2List, MatchGroup
from botlib.oncedb import OnceDB
def download(*, topic='motherboard', cohort='vice:motherboard'):
    """Fetch a vice.com topic page and record not-yet-seen articles.

    Every element selected by ``.vice-card__content`` is scanned for a link,
    a title and a description. The link is used as the entry's unique id:
    elements without a link, or whose link ``db.contains`` already reports
    for ``cohort``, are skipped. All other elements are stored via ``db.put``
    as an HTML anchor followed by a newline and the description.

    Args:
        topic: Topic slug appended to ``https://www.vice.com/en/topic/``.
        cohort: Cohort name passed to the ``OnceDB`` lookups and inserts.

    Returns:
        None. Results are written to ``cache.sqlite``.
    """
    db = OnceDB('cache.sqlite')
    url = 'https://www.vice.com/en/topic/{}'.format(topic)
    select = '.vice-card__content'
    match = MatchGroup({
        'url': r'<a href="([^"]*)"',
        'title': r'<h3[^>]*><a [^>]*>([\s\S]*?)</a>[\s\S]*?</h3>',
        'desc': r'<p[^>]*>([\s\S]*?)</p>',
    })
    # The parsed elements are visited in reverse page order.
    # NOTE(review): presumably so older articles get stored first -- confirm.
    for elem in reversed(HTML2List(select).parse(Curl.get(url))):
        match.set_html(elem)
        x_uid = match['url']
        if not x_uid or db.contains(cohort, x_uid):
            continue
        # The matched href is appended to the site root, so it is expected
        # to be a site-relative path.
        txt = '<a href="https://www.vice.com{url}">{title}</a>'.format(**match)
        # NOTE(review): str() presumably guards against a non-string result
        # when no description matched; in that case its string form (e.g.
        # "None") is stored verbatim -- confirm against MatchGroup.
        txt += '\n' + str(match['desc'])
        # txt always starts with the anchor markup above and is therefore
        # never empty, so it can be stored unconditionally.
        db.put(cohort, x_uid, txt)
# download()