#!/usr/bin/env python3
"""Scrape a Craigslist search page and print entries not seen before.

Each matched entry is recorded in a local ``OnceDB`` cache
(``cache.sqlite``) keyed by cohort and URL, so subsequent runs skip
entries that are already stored.
"""
from botlib.curl import Curl
from botlib.html2list import HTML2List, MatchGroup
from botlib.oncedb import OnceDB

CRAIGSLIST = 'https://newyork.craigslist.org/search/boo'


def load(url):
    """Fetch ``url`` via ``Curl.get`` and return whatever it returns."""
    # For offline testing, swap in a local file instead:
    # return open('test.html')
    return Curl.get(url)


def download():
    """Process all configured sources and print their new entries."""
    db = OnceDB('cache.sqlite')

    def proc(cohort, source, select, regex=None, fn=str):
        """Print and remember every not-yet-seen element of ``source``.

        Args:
            cohort: Namespace under which entries are stored in the cache.
            source: Document handed to ``HTML2List(select).parse``.
            select: Selector passed to ``HTML2List``.
            regex: Mapping of field name to pattern, passed to
                ``MatchGroup``. Defaults to a fresh empty dict per call.
            fn: Callable that receives the ``MatchGroup`` and returns the
                text to print; a falsy result is treated as ``''``.
        """
        # A ``{}`` default would be one dict shared by every call; build a
        # new one per call instead so no state can leak between sources.
        match = MatchGroup({} if regex is None else regex)
        # The parsed elements are walked in reverse order.
        for elem in reversed(HTML2List(select).parse(source)):
            match.set_html(elem)
            x_uid = match['url']
            # Skip elements without a URL and ones already in the cache.
            if not x_uid or db.contains(cohort, x_uid):
                continue
            txt = (fn(match) or '').strip()
            if txt:
                print(txt)
                # NOTE(review): storing only non-empty texts is assumed
                # here; confirm that put() belongs inside this branch.
                db.put(cohort, x_uid, txt)

    # NOTE(review): the patterns and the template below look like they lost
    # their HTML markup (the template uses {title} but there is no 'title'
    # pattern) -- verify against the intended markup before relying on them.
    proc(
        'boat:craigslist',
        load(CRAIGSLIST),
        'li.result-row',
        {
            'url': r']*>([\s\S]*?)[\s\S]*?',
            'price': r'([\s\S]*?)',
            'hood': r'([\s\S]*?)',
        },
        lambda match: ''' {title} {price}, {hood}'''.format(**match),
    )

    # process another source ...
    # def fn(match):
    #     print(match.to_dict())
    #     return advanced_fn(match)
    # proc(cohort, load(url), select, match, fn)
    #


download()