add examples

examples/web-scraper/README.md (new file, 35 lines)
@@ -0,0 +1,35 @@
# How-to web scraping

Use `playground.py` for quick testing.
Initially, you have to set `cache_only=False`, otherwise no data is downloaded.
After the first download, re-enable `cache_only` so you don't download the same data over and over again; the toggle is sketched below.
When you feel ready, remove the `break` statement to see if the matching works for all entries.
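For reference, this is the flag as `playground.py` passes it to `Curl.get` (a minimal sketch; the exact caching behavior lives in `botlib.curl`):

```python
from botlib.curl import Curl

URL = 'https://www.vice.com/en/topic/motherboard'

# first run: actually hit the network and fill the cache
SOURCE = Curl.get(URL, cache_only=False)
# every run after that: serve from the local cache
SOURCE = Curl.get(URL, cache_only=True)
```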

## Finding a proper `select`

The hardest part is getting the `select` to match exactly the right elements.
Open the browser devtools and choose the element picker.
Hover over the first element / row of the data you'd like to retrieve.
Pick whatever tag or class seems appropriate, and also look at neighboring tags.
The `select` must match all entries but no unnecessary ones,
although you can always filter out unnecessary ones later.
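A quick way to sanity-check a candidate `select` is to count what it matches (a sketch; it assumes, as the example files suggest, that `HTML2List(select).parse(...)` returns one HTML string per matched element):

```python
from botlib.curl import Curl
from botlib.html2list import HTML2List

SOURCE = Curl.get('https://www.vice.com/en/topic/motherboard', cache_only=True)

rows = HTML2List('.vice-card__content').parse(SOURCE)
print(len(rows))      # should equal the number of cards visible on the page
print(rows[0][:200])  # eyeball the first entry
```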

## Finding the regex

The matches for the individual data fields are tricky too.
Select and right-click the element you picked above.
Important: either edit the element as HTML or copy its raw HTML.
The rendered devtools tree omits whitespace and displays `'` as `"`, so make sure you know what you are actually trying to match.

Now begins the playing-around part.
A regex will match the first occurrence, so if there are two anchor tags and you need the second one, you have to get creative.
This is the case in the craigslist example: there the second anchor can be matched because it is contained in an `h3` heading.
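Here is that trick with plain `re`, on a made-up snippet in the spirit of the craigslist markup:

```python
import re

html = '''<a href="/thumb"><img src="t.jpg"></a>
<h3 class="result-heading"><a href="/listing">Sailboat</a></h3>'''

# the naive pattern grabs the first anchor, i.e. the thumbnail link
print(re.search(r'<a href="([^"]*)"', html).group(1))             # /thumb
# anchoring on the surrounding <h3> skips ahead to the second one
print(re.search(r'<h3[\s\S]*?<a href="([^"]*)"', html).group(1))  # /listing
```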

Try to match as compactly as possible; this makes the pattern more robust against source code changes.
For example, use `<a [^>]*>` to match an opening anchor with arbitrary attributes.
Some sites will put the `href` immediately after `<a`, others somewhere in between.
Be creative.
Use `[\s\S]*?` instead of `.*?` to match anything, including whitespace and newlines.
And finally, have at least one matching group (`()`).
Note: whitespace will be stripped from the matching group; both points are demonstrated in the sketch below.
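A small stand-alone check of the `[\s\S]*?` advice (the HTML is made up; the whitespace stripping that `MatchGroup` does automatically is done here by hand with `.strip()`):

```python
import re

html = '<p class="desc">\n  Runs great, new sails.\n</p>'

# '.' does not cross newlines, so this finds nothing
print(re.search(r'<p[^>]*>(.*?)</p>', html))     # None
# [\s\S] matches any character, including '\n'
m = re.search(r'<p[^>]*>([\s\S]*?)</p>', html)
print(m.group(1).strip())                        # Runs great, new sails.
```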

examples/web-scraper/news_vice.py (new executable file, 28 lines)
@@ -0,0 +1,28 @@
```python
#!/usr/bin/env python3
from botlib.curl import Curl
from botlib.html2list import HTML2List, MatchGroup
from botlib.oncedb import OnceDB


def download(*, topic='motherboard', cohort='vice:motherboard'):
    db = OnceDB('cache.sqlite')
    url = 'https://www.vice.com/en/topic/{}'.format(topic)

    select = '.vice-card__content'
    match = MatchGroup({
        'url': r'<a href="([^"]*)"',
        'title': r'<h3[^>]*><a [^>]*>([\s\S]*?)</a>[\s\S]*?</h3>',
        'desc': r'<p[^>]*>([\s\S]*?)</p>',
    })
    # iterate the page bottom-up, so the oldest entries are stored first
    for elem in reversed(HTML2List(select).parse(Curl.get(url))):
        match.set_html(elem)
        x_uid = match['url']
        # skip rows without a link and rows we have already seen
        if not x_uid or db.contains(cohort, x_uid):
            continue
        txt = '<a href="https://www.vice.com{url}">{title}</a>'.format(**match)
        txt += '\n' + str(match['desc'])
        if txt:
            db.put(cohort, x_uid, txt)


# download()
```
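The call at the bottom stays commented out, so the module can be imported without side effects. A hypothetical one-off run (topic and cohort names are made up; both arguments are keyword-only):

```python
from news_vice import download

download(topic='tech', cohort='vice:tech')
```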

examples/web-scraper/playground.py (new executable file, 20 lines)
@@ -0,0 +1,20 @@
```python
#!/usr/bin/env python3
from botlib.curl import Curl
from botlib.html2list import HTML2List, MatchGroup

URL = 'https://www.vice.com/en/topic/motherboard'
# set cache_only=False for the first run, then re-enable it
SOURCE = Curl.get(URL, cache_only=True)

SELECT = '.vice-card__content'
match = MatchGroup({
    'url': r'<a href="([^"]*)"',
    'title': r'<h3[^>]*><a [^>]*>([\s\S]*?)</a>[\s\S]*?</h3>',
    'desc': r'<p[^>]*>([\s\S]*?)</p>',
    # deliberately broken pattern, to see what a failed match looks like
    'wrong-regex': r'<a xref="([\s\S]*?)"',
})
for elem in reversed(HTML2List(SELECT).parse(SOURCE)):
    match.set_html(elem)
    for k, v in match.to_dict().items():
        print(k, '=', v)
    print()
    break  # remove this to test the patterns against every entry
```

examples/web-scraper/shopping_craigslist.py (new executable file, 45 lines)
@@ -0,0 +1,45 @@
```python
#!/usr/bin/env python3
from botlib.curl import Curl
from botlib.html2list import HTML2List, MatchGroup
from botlib.oncedb import OnceDB

CRAIGSLIST = 'https://newyork.craigslist.org/search/boo'


def load(url):
    # return open('test.html')
    return Curl.get(url)


def download():
    db = OnceDB('cache.sqlite')

    def proc(cohort, source, select, regex={}, fn=str):
        match = MatchGroup(regex)
        for elem in reversed(HTML2List(select).parse(source)):
            match.set_html(elem)
            x_uid = match['url']
            if not x_uid or db.contains(cohort, x_uid):
                continue
            txt = (fn(match) or '').strip()
            if txt:
                print(txt)
                db.put(cohort, x_uid, txt)

    proc('boat:craigslist', load(CRAIGSLIST), 'li.result-row', {
        'url': r'<a href="([^"]*)"',
        # the title anchor is the second <a> in the row; anchoring on <h3> skips the first
        'title': r'<h3[\s\S]*?<a [^>]*>([\s\S]*?)</a>[\s\S]*?</h3>',
        'price': r'<span class="result-price">([\s\S]*?)</span>',
        'hood': r'<span class="result-hood">([\s\S]*?)</span>',
    }, lambda match: '''
<a href="{url}">{title}</a>
<strong>{price}</strong>, {hood}'''.format(**match))

    # process another source ...
    # def fn(match):
    #     print(match.to_dict())
    #     return advanced_fn(match)
    # proc(cohort, load(url), select, match, fn)


# download()
```