Files
botlib/examples/web-scraper/playground.py
2022-04-08 20:22:08 +02:00

21 lines
597 B
Python
Executable File

#!/usr/bin/env python3
from botlib.curl import Curl
from botlib.html2list import HTML2List, MatchGroup
# Playground for trying out HTML2List selectors and MatchGroup regexes
# against a single page.
# NOTE(review): cache_only=True presumably makes Curl.get serve a previously
# cached copy instead of hitting the network -- confirm against botlib.curl.
URL = 'https://www.vice.com/en/topic/motherboard'
SOURCE = Curl.get(URL, cache_only=True)

# Selector handed to HTML2List to pick the elements that get inspected.
SELECT = '.vice-card__content'

# Named regex patterns; each key shows up as a field in match.to_dict().
match = MatchGroup({
    'url': r'<a href="([^"]*)"',
    'title': r'<h3[^>]*><a [^>]*>([\s\S]*?)</a>[\s\S]*?</h3>',
    'desc': r'<p[^>]*>([\s\S]*?)</p>',
    # "xref" is not a real anchor attribute, so this pattern is not expected
    # to match; presumably kept to see how a failed match is reported
    # -- TODO confirm.
    'wrong-regex': r'<a xref="([\s\S]*?)"',
})

# Visit the parsed elements last-to-first and dump every field of the first
# one visited; the trailing `break` limits the output to that one element.
# NOTE(review): the loop bodies had lost their indentation. print() and
# break are placed in the outer loop body, which is the reading that prints
# all fields of an element before stopping -- confirm.
for elem in reversed(HTML2List(SELECT).parse(SOURCE)):
    match.set_html(elem)
    for k, v in match.to_dict().items():
        print(k, '=', v)
    print()
    break