add examples

examples/web-scraper/README.md (new file, 35 lines)
@@ -0,0 +1,35 @@
# How-to web scraping

Use `playground.py` for quick testing.
Initially, you have to set `cache_only=False`, otherwise no data is downloaded.
After the first download, re-enable `cache_only` so you don't download the same data over and over again; the toggle is sketched below.
When you feel ready, remove the `break` statement to see if the matching works for all entries.
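For reference, this is the flag as `playground.py` passes it to `Curl.get` (a minimal sketch; the exact caching behavior lives in `botlib.curl`):

```python
from botlib.curl import Curl

URL = 'https://www.vice.com/en/topic/motherboard'

# first run: actually hit the network and fill the cache
SOURCE = Curl.get(URL, cache_only=False)
# every run after that: serve from the local cache
SOURCE = Curl.get(URL, cache_only=True)
```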

## Finding a proper `select`

The hardest part is getting the `select` to match exactly the right elements.
Open the browser devtools and choose the element picker.
Hover over the first element / row of the data you'd like to retrieve.
Pick whatever tag or class seems appropriate, and also look at neighboring tags.
The `select` must match all entries but no unnecessary ones,
although you can always filter out unnecessary ones later.
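A quick way to sanity-check a candidate `select` is to count what it matches (a sketch; it assumes, as the example files suggest, that `HTML2List(select).parse(...)` returns one HTML string per matched element):

```python
from botlib.curl import Curl
from botlib.html2list import HTML2List

SOURCE = Curl.get('https://www.vice.com/en/topic/motherboard', cache_only=True)

rows = HTML2List('.vice-card__content').parse(SOURCE)
print(len(rows))      # should equal the number of cards visible on the page
print(rows[0][:200])  # eyeball the first entry
```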

## Finding the regex

The matches for the individual data fields are tricky too.
Select and right-click the element you picked above.
Important: either edit the element as HTML or copy its raw HTML.
The rendered devtools tree omits whitespace and displays `'` as `"`, so make sure you know what you are actually trying to match.

Now begins the playing-around part.
A regex will match the first occurrence, so if there are two anchor tags and you need the second one, you have to get creative.
This is the case in the craigslist example: there the second anchor can be matched because it is contained in an `h3` heading.
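Here is that trick with plain `re`, on a made-up snippet in the spirit of the craigslist markup:

```python
import re

html = '''<a href="/thumb"><img src="t.jpg"></a>
<h3 class="result-heading"><a href="/listing">Sailboat</a></h3>'''

# the naive pattern grabs the first anchor, i.e. the thumbnail link
print(re.search(r'<a href="([^"]*)"', html).group(1))             # /thumb
# anchoring on the surrounding <h3> skips ahead to the second one
print(re.search(r'<h3[\s\S]*?<a href="([^"]*)"', html).group(1))  # /listing
```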

Try to match as compactly as possible; this makes the pattern more robust against source code changes.
For example, use `<a [^>]*>` to match an opening anchor with arbitrary attributes.
Some sites will put the `href` immediately after `<a`, others somewhere in between.
Be creative.
Use `[\s\S]*?` instead of `.*?` to match anything, including whitespace and newlines.
And finally, have at least one matching group (`()`).
Note: whitespace will be stripped from the matching group; both points are demonstrated in the sketch below.
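A small stand-alone check of the `[\s\S]*?` advice (the HTML is made up; the whitespace stripping that `MatchGroup` does automatically is done here by hand with `.strip()`):

```python
import re

html = '<p class="desc">\n  Runs great, new sails.\n</p>'

# '.' does not cross newlines, so this finds nothing
print(re.search(r'<p[^>]*>(.*?)</p>', html))     # None
# [\s\S] matches any character, including '\n'
m = re.search(r'<p[^>]*>([\s\S]*?)</p>', html)
print(m.group(1).strip())                        # Runs great, new sails.
```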

examples/web-scraper/news_vice.py (new executable file, 28 lines)
@@ -0,0 +1,28 @@
```python
#!/usr/bin/env python3
from botlib.curl import Curl
from botlib.html2list import HTML2List, MatchGroup
from botlib.oncedb import OnceDB


def download(*, topic='motherboard', cohort='vice:motherboard'):
    db = OnceDB('cache.sqlite')
    url = 'https://www.vice.com/en/topic/{}'.format(topic)

    select = '.vice-card__content'
    match = MatchGroup({
        'url': r'<a href="([^"]*)"',
        'title': r'<h3[^>]*><a [^>]*>([\s\S]*?)</a>[\s\S]*?</h3>',
        'desc': r'<p[^>]*>([\s\S]*?)</p>',
    })
    # iterate the page bottom-up, so the oldest entries are stored first
    for elem in reversed(HTML2List(select).parse(Curl.get(url))):
        match.set_html(elem)
        x_uid = match['url']
        # skip rows without a link and rows we have already seen
        if not x_uid or db.contains(cohort, x_uid):
            continue
        txt = '<a href="https://www.vice.com{url}">{title}</a>'.format(**match)
        txt += '\n' + str(match['desc'])
        if txt:
            db.put(cohort, x_uid, txt)


# download()
```
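The call at the bottom stays commented out, so the module can be imported without side effects. A hypothetical one-off run (topic and cohort names are made up; both arguments are keyword-only):

```python
from news_vice import download

download(topic='tech', cohort='vice:tech')
```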

examples/web-scraper/playground.py (new executable file, 20 lines)
@@ -0,0 +1,20 @@
```python
#!/usr/bin/env python3
from botlib.curl import Curl
from botlib.html2list import HTML2List, MatchGroup

URL = 'https://www.vice.com/en/topic/motherboard'
# set cache_only=False for the first run, then re-enable it
SOURCE = Curl.get(URL, cache_only=True)

SELECT = '.vice-card__content'
match = MatchGroup({
    'url': r'<a href="([^"]*)"',
    'title': r'<h3[^>]*><a [^>]*>([\s\S]*?)</a>[\s\S]*?</h3>',
    'desc': r'<p[^>]*>([\s\S]*?)</p>',
    # deliberately broken pattern, to see what a failed match looks like
    'wrong-regex': r'<a xref="([\s\S]*?)"',
})
for elem in reversed(HTML2List(SELECT).parse(SOURCE)):
    match.set_html(elem)
    for k, v in match.to_dict().items():
        print(k, '=', v)
    print()
    break  # remove this to test the patterns against every entry
```

examples/web-scraper/shopping_craigslist.py (new executable file, 45 lines)
@@ -0,0 +1,45 @@
```python
#!/usr/bin/env python3
from botlib.curl import Curl
from botlib.html2list import HTML2List, MatchGroup
from botlib.oncedb import OnceDB

CRAIGSLIST = 'https://newyork.craigslist.org/search/boo'


def load(url):
    # return open('test.html')
    return Curl.get(url)


def download():
    db = OnceDB('cache.sqlite')

    def proc(cohort, source, select, regex={}, fn=str):
        match = MatchGroup(regex)
        for elem in reversed(HTML2List(select).parse(source)):
            match.set_html(elem)
            x_uid = match['url']
            if not x_uid or db.contains(cohort, x_uid):
                continue
            txt = (fn(match) or '').strip()
            if txt:
                print(txt)
                db.put(cohort, x_uid, txt)

    proc('boat:craigslist', load(CRAIGSLIST), 'li.result-row', {
        'url': r'<a href="([^"]*)"',
        # the title anchor is the second <a> in the row; anchoring on <h3> skips the first
        'title': r'<h3[\s\S]*?<a [^>]*>([\s\S]*?)</a>[\s\S]*?</h3>',
        'price': r'<span class="result-price">([\s\S]*?)</span>',
        'hood': r'<span class="result-hood">([\s\S]*?)</span>',
    }, lambda match: '''
<a href="{url}">{title}</a>
<strong>{price}</strong>, {hood}'''.format(**match))

    # process another source ...
    # def fn(match):
    #     print(match.to_dict())
    #     return advanced_fn(match)
    # proc(cohort, load(url), select, match, fn)


# download()
```