add examples
This commit is contained in:
28
examples/web-scraper/news_vice.py
Executable file
28
examples/web-scraper/news_vice.py
Executable file
@@ -0,0 +1,28 @@
|
||||
#!/usr/bin/env python3
|
||||
from botlib.curl import Curl
|
||||
from botlib.html2list import HTML2List, MatchGroup
|
||||
from botlib.oncedb import OnceDB
|
||||
|
||||
|
||||
def download(*, topic='motherboard', cohort='vice:motherboard'):
|
||||
db = OnceDB('cache.sqlite')
|
||||
url = 'https://www.vice.com/en/topic/{}'.format(topic)
|
||||
|
||||
select = '.vice-card__content'
|
||||
match = MatchGroup({
|
||||
'url': r'<a href="([^"]*)"',
|
||||
'title': r'<h3[^>]*><a [^>]*>([\s\S]*?)</a>[\s\S]*?</h3>',
|
||||
'desc': r'<p[^>]*>([\s\S]*?)</p>',
|
||||
})
|
||||
for elem in reversed(HTML2List(select).parse(Curl.get(url))):
|
||||
match.set_html(elem)
|
||||
x_uid = match['url']
|
||||
if not x_uid or db.contains(cohort, x_uid):
|
||||
continue
|
||||
txt = '<a href="https://www.vice.com{url}">{title}</a>'.format(**match)
|
||||
txt += '\n' + str(match['desc'])
|
||||
if txt:
|
||||
db.put(cohort, x_uid, txt)
|
||||
|
||||
|
||||
# download()
|
||||
Reference in New Issue
Block a user