feat: update urls

Author: relikd
Date:   2024-03-09 02:30:07 +01:00
Parent: e1d74f49fb
Commit: cea4d18672
2 changed files with 178 additions and 31 deletions


@@ -65,12 +65,6 @@ Similar commands exist on Linux and Windows.
 ## Development
-### TODO
-- Reindexing of previous URLs (should remove dead-links and add new ones)
-- Periodic check on outdated URLs (see previous)
 ### Requirements
 - `ipa_archive.py` has a dependency on [RemoteZip](https://github.com/gtsystem/python-remotezip) (`pip install remotezip`)
@@ -99,6 +93,13 @@ To add files to the archive follow these steps:
 4. `./tools/image_optim.sh` (this will convert all .png files to .jpg)
 5. `python3 ipa_archive.py export json`
 
+To update:
+
+- `python3 ipa_archive.py update` # check all links (if not updated recently)
+- `python3 ipa_archive.py update [url|base_url_id]` # force update
+- Then run the same steps as after adding a URL
+
 Useful helpers:
 - `./tools/check_error_no_plist.sh` # checks that no plist exists for a done=4 entry
 - `./tools/check_missing_img.sh` # checks that for each .plist a .jpg exists


@@ -40,6 +40,9 @@ def main():
     cmd.add_argument('urls', metavar='URL', nargs='+',
                      help='Search URLs for .ipa links')
 
+    cmd = cli.add_parser('update', help='Update all urls')
+    cmd.add_argument('urls', metavar='URL', nargs='*', help='URLs or index')
+
     cmd = cli.add_parser('run', help='Download and process pending urls')
     cmd.add_argument('-force', '-f', action='store_true',
                      help='Reindex local data / populate DB.'
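
For readers unfamiliar with the CLI layout, here is a minimal sketch of how the new `update` sub-parser behaves. The surrounding `parser` and `cli = parser.add_subparsers(dest='cmd')` wiring is an assumption inferred from the `args.cmd` dispatch further down; it is not part of this hunk.

```python
import argparse

parser = argparse.ArgumentParser(prog='ipa_archive.py')
cli = parser.add_subparsers(dest='cmd')  # assumed wiring, matches the `args.cmd` dispatch below

cmd = cli.add_parser('update', help='Update all urls')
cmd.add_argument('urls', metavar='URL', nargs='*', help='URLs or index')

# `nargs='*'` allows zero arguments (update everything that is due),
# a numeric base_url_id, or a full archive.org URL:
print(parser.parse_args(['update']))       # Namespace(cmd='update', urls=[])
print(parser.parse_args(['update', '3']))  # Namespace(cmd='update', urls=['3'])
print(parser.parse_args(['update', 'https://archive.org/download/some-item']))
```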
@@ -69,9 +72,18 @@ def main():
     if args.cmd == 'add':
         for url in args.urls:
-            crawler(url)
+            addNewUrl(url)
         print('done.')
 
+    elif args.cmd == 'update':
+        queue = args.urls or CacheDB().getUpdateUrlIds(sinceNow='-7 days')
+        if queue:
+            for i, url in enumerate(queue):
+                updateUrl(url, i + 1, len(queue))
+            print('done.')
+        else:
+            print('Nothing to do.')
+
     elif args.cmd == 'run':
         DB = CacheDB()
         if args.pk:
@@ -136,7 +148,8 @@ class CacheDB:
         self._db.execute('''
             CREATE TABLE IF NOT EXISTS urls(
                 pk INTEGER PRIMARY KEY,
-                url TEXT NOT NULL UNIQUE
+                url TEXT NOT NULL UNIQUE,
+                date INTEGER DEFAULT (strftime('%s','now'))
             );
         ''')
         self._db.execute('''
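
The new `date` column is stamped with the current Unix time whenever a row is inserted without an explicit value; that stamp is what `getUpdateUrlIds` later compares against. A standalone sketch of the behaviour, using an in-memory database, a trimmed `urls` table and a made-up URL rather than the project's real `CacheDB`:

```python
import sqlite3

db = sqlite3.connect(':memory:')
db.execute('''
    CREATE TABLE urls(
        pk INTEGER PRIMARY KEY,
        url TEXT NOT NULL UNIQUE,
        date INTEGER DEFAULT (strftime('%s','now'))
    );
''')
# the DEFAULT fills `date` with the insert timestamp ...
db.execute('INSERT INTO urls (url) VALUES (?);',
           ['https://archive.org/download/example'])  # example URL, invented
# ... and the update queue only picks rows that are unstamped or older than the cutoff:
due = db.execute('''SELECT pk FROM urls
    WHERE date IS NULL OR date < strftime('%s','now', ?)''', ['-7 days']).fetchall()
print(due)  # [] right after the insert; the row appears once its stamp is >7 days old
```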
@@ -161,7 +174,31 @@ class CacheDB:
     def __del__(self) -> None:
         self._db.close()
 
-    # insert URLs
+    # Get URL
+
+    def getIdForBaseUrl(self, url: str) -> 'int|None':
+        x = self._db.execute('SELECT pk FROM urls WHERE url=?', [url])
+        row = x.fetchone()
+        return row[0] if row else None
+
+    def getBaseUrlForId(self, uid: int) -> 'str|None':
+        x = self._db.execute('SELECT url FROM urls WHERE pk=?', [uid])
+        row = x.fetchone()
+        return row[0] if row else None
+
+    def getId(self, baseUrlId: int, pathName: str) -> 'int|None':
+        x = self._db.execute('''SELECT pk FROM idx
+            WHERE base_url=? AND path_name=?;''', [baseUrlId, pathName])
+        row = x.fetchone()
+        return row[0] if row else None
+
+    def getUrl(self, uid: int) -> str:
+        x = self._db.execute('''SELECT url, path_name FROM idx
+            INNER JOIN urls ON urls.pk=base_url WHERE idx.pk=?;''', [uid])
+        base, path = x.fetchone()
+        return base + '/' + quote(path)
+
+    # Insert URL
 
     def insertBaseUrl(self, base: str) -> int:
         try:
@@ -172,18 +209,42 @@ class CacheDB:
         x = self._db.execute('SELECT pk FROM urls WHERE url = ?;', [base])
         return x.fetchone()[0]
 
-    def insertIpaUrls(self, entries: 'Iterable[tuple[int, str, int]]') -> int:
+    def insertIpaUrls(
+        self, baseUrlId: int, entries: 'Iterable[tuple[str, int, str]]'
+    ) -> int:
+        ''' :entries: must be iterable of `(path_name, filesize, crc32)` '''
         self._db.executemany('''
             INSERT OR IGNORE INTO idx (base_url, path_name, fsize) VALUES (?,?,?);
-        ''', entries)
+        ''', ((baseUrlId, path, size) for path, size, _crc in entries))
         self._db.commit()
         return self._db.total_changes
 
-    def getUrl(self, uid: int) -> str:
-        x = self._db.execute('''SELECT url, path_name FROM idx
-            INNER JOIN urls ON urls.pk=base_url WHERE idx.pk=?;''', [uid])
-        base, path = x.fetchone()
-        return base + '/' + quote(path)
+    # Update URL
+
+    def getUpdateUrlIds(self, *, sinceNow: str) -> 'list[int]':
+        x = self._db.execute('''SELECT pk FROM urls
+            WHERE date IS NULL OR date < strftime('%s','now', ?)
+        ''', [sinceNow])
+        return [row[0] for row in x.fetchall()]
+
+    def markBaseUrlUpdated(self, uid: int) -> None:
+        self._db.execute('''
+            UPDATE urls SET date=strftime('%s','now') WHERE pk=?''', [uid])
+        self._db.commit()
+
+    def updateIpaUrl(self, baseUrlId: int, entry: 'tuple[str, int, str]') \
+            -> 'int|None':
+        ''' :entry: must be `(path_name, filesize, crc32)` '''
+        uid = self.getId(baseUrlId, entry[0])
+        if uid:
+            self._db.execute('UPDATE idx SET done=0, fsize=? WHERE pk=?;',
+                             [entry[1], uid])
+            self._db.commit()
+            return uid
+        if self.insertIpaUrls(baseUrlId, [entry]) > 0:
+            x = self._db.execute('SELECT MAX(pk) FROM idx;')
+            return x.fetchone()[0]
+        return None
 
     # Export JSON
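
To make the new call shape concrete, this is roughly the data callers now hand to `insertIpaUrls()` and `updateIpaUrl()`; the paths, sizes, checksums and the `baseUrlId` below are invented for illustration:

```python
# entries follow the documented `(path_name, filesize, crc32)` shape
entries = [
    ('Games/SomeGame v1.0.ipa', 52_428_800, '1a2b3c4d'),
    ('Apps/SomeApp v2.1.ipa', 10_485_760, 'deadbeef'),
]
baseUrlId = 7  # hypothetical pk as returned by insertBaseUrl()

# insertIpaUrls() drops the crc32 before writing, exactly like its generator expression:
rows = [(baseUrlId, path, size) for path, size, _crc in entries]
print(rows)
# [(7, 'Games/SomeGame v1.0.ipa', 52428800), (7, 'Apps/SomeApp v2.1.ipa', 10485760)]
```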
@@ -295,22 +356,43 @@ class CacheDB:
 # [add] Process HTML link list
 ###############################################
 
-def crawler(url: str) -> None:
+def addNewUrl(url: str) -> None:
+    archiveId = extractArchiveOrgId(url)
+    if not archiveId:
+        return
+    baseUrlId = CacheDB().insertBaseUrl(urlForArchiveOrgId(archiveId))
+    json_file = pathToListJson(baseUrlId)
+    entries = downloadListArchiveOrg(archiveId, json_file)
+    inserted = CacheDB().insertIpaUrls(baseUrlId, entries)
+    print(f'new links added: {inserted} of {len(entries)}')
+
+
+def extractArchiveOrgId(url: str) -> 'str|None':
     match = re_archive_url.match(url)
     if not match:
         print(f'[WARN] not an archive.org url. Ignoring "{url}"', file=stderr)
-        return
-    downloadListArchiveOrg(match.group(1))
+        return None
+    return match.group(1)
 
 
-def downloadListArchiveOrg(archiveId: str) -> None:
-    baseUrl = f'https://archive.org/download/{archiveId}'
-    baseUrlId = CacheDB().insertBaseUrl(baseUrl)
-    json_file = CACHE_DIR / 'url_cache' / (str(baseUrlId) + '.json.gz')
-    json_file.parent.mkdir(exist_ok=True)
+def urlForArchiveOrgId(archiveId: str) -> str:
+    return f'https://archive.org/download/{archiveId}'
+
+
+def pathToListJson(baseUrlId: int, *, tmp: bool = False) -> Path:
+    if tmp:
+        return CACHE_DIR / 'url_cache' / f'tmp_{baseUrlId}.json.gz'
+    return CACHE_DIR / 'url_cache' / f'{baseUrlId}.json.gz'
+
+
+def downloadListArchiveOrg(
+    archiveId: str, json_file: Path, *, force: bool = False
+) -> 'list[tuple[str, int, str]]':
+    ''' :returns: List of `(path_name, file_size, crc32)` '''
     # store json for later
-    if not json_file.exists():
-        print(f'load: [{baseUrlId}] {baseUrl}')
+    if force or not json_file.exists():
+        json_file.parent.mkdir(exist_ok=True)
+        print(f'load: {archiveId}')
         req = Request(f'https://archive.org/metadata/{archiveId}/files')
         req.add_header('Accept-Encoding', 'deflate, gzip')
         with urlopen(req) as page:
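
The following hunk turns the cached archive.org metadata into `(path_name, file_size, crc32)` tuples. A small sketch of that filtering step on a made-up response that mirrors the shape the code reads from `data['result']` (the field names come from the code; the concrete values are invented):

```python
# shape mirrors what the diff reads from data['result']
data = {'result': [
    {'name': 'SomeApp v1.0.ipa', 'source': 'original', 'size': '10485760', 'crc32': 'deadbeef'},
    {'name': 'SomeApp v1.0.ipa_meta.xml', 'source': 'derivative', 'size': '2048'},
    {'name': 'readme.txt', 'source': 'original', 'size': '512', 'crc32': '0badc0de'},
]}

# same comprehension as in the next hunk: keep only original .ipa uploads
entries = [(x['name'], int(x.get('size', 0)), x.get('crc32'))
           for x in data['result']
           if x['source'] == 'original' and x['name'].endswith('.ipa')]
print(entries)  # [('SomeApp v1.0.ipa', 10485760, 'deadbeef')]
```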
@@ -324,11 +406,75 @@ def downloadListArchiveOrg(archiveId: str) -> None:
     with gzip.open(json_file, 'rb') as fp:
         data = json.load(fp)
     # process and add to DB
-    entries = [(baseUrlId, x['name'], int(x.get('size', 0)))
-               for x in data['result']
-               if x['source'] == 'original' and x['name'].endswith('.ipa')]
-    inserted = CacheDB().insertIpaUrls(entries)
-    print(f'new links added: {inserted} of {len(entries)}')
+    return [(x['name'], int(x.get('size', 0)), x.get('crc32'))
+            for x in data['result']
+            if x['source'] == 'original' and x['name'].endswith('.ipa')]
+
+
+###############################################
+# [update] Re-index existing URL caches
+###############################################
+
+def updateUrl(url_or_uid: 'str|int', proc_i: int, proc_total: int):
+    baseUrlId, url = _lookupBaseUrl(url_or_uid)
+    if not baseUrlId or not url:
+        print(f'[ERROR] Ignoring "{url_or_uid}". Not found in DB', file=stderr)
+        return
+    archiveId = extractArchiveOrgId(url) or ''  # guaranteed to return str
+    print(f'Updating [{proc_i}/{proc_total}] {archiveId}')
+    old_json_file = pathToListJson(baseUrlId)
+    new_json_file = pathToListJson(baseUrlId, tmp=True)
+    old_entries = set(downloadListArchiveOrg(archiveId, old_json_file))
+    new_entries = set(downloadListArchiveOrg(archiveId, new_json_file))
+    old_diff = old_entries - new_entries
+    new_diff = new_entries - old_entries
+    DB = CacheDB()
+    if old_diff or new_diff:
+        c_del = 0
+        c_new = 0
+        for old_entry in old_diff:  # no need to sort
+            uid = DB.getId(baseUrlId, old_entry[0])
+            if uid:
+                print(f' rm: [{uid}] {old_entry}')
+                DB.setPermanentError(uid)
+                c_del += 1
+            else:
+                print(f' [ERROR] could not find old entry {old_entry[0]}',
+                      file=stderr)
+        for new_entry in sorted(new_diff):
+            uid = DB.updateIpaUrl(baseUrlId, new_entry)
+            if uid:
+                print(f' add: [{uid}] {new_entry}')
+                c_new += 1
+            else:
+                print(f' [ERROR] updating {new_entry[0]}', file=stderr)
+        print(f' updated -{c_del}/+{c_new} entries.')
+        os.rename(new_json_file, old_json_file)
+    else:
+        print(' no changes.')
+    DB.markBaseUrlUpdated(baseUrlId)
+    if new_json_file.exists():
+        os.remove(new_json_file)
+
+
+def _lookupBaseUrl(url_or_index: 'str|int') -> 'tuple[int|None, str|None]':
+    if isinstance(url_or_index, str):
+        if url_or_index.isnumeric():
+            url_or_index = int(url_or_index)
+    if isinstance(url_or_index, int):
+        baseUrlId = url_or_index
+        url = CacheDB().getBaseUrlForId(baseUrlId)
+    else:
+        archiveId = extractArchiveOrgId(url_or_index)
+        if not archiveId:
+            return None, None
+        url = urlForArchiveOrgId(archiveId)
+        baseUrlId = CacheDB().getIdForBaseUrl(url)
+    return baseUrlId, url
+
+
 ###############################################
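
`updateUrl` compares the cached and the freshly downloaded file lists as sets of `(path_name, filesize, crc32)` tuples, so a file whose size or checksum changed shows up in both difference sets: it is first retired via `setPermanentError` and then re-queued through `updateIpaUrl`. A standalone sketch of that set arithmetic with invented entries:

```python
old_entries = {
    ('A.ipa', 100, 'aaaa'),
    ('B.ipa', 200, 'bbbb'),   # will change size/crc in the new listing
    ('C.ipa', 300, 'cccc'),   # will disappear from the new listing
}
new_entries = {
    ('A.ipa', 100, 'aaaa'),   # unchanged -> not touched
    ('B.ipa', 250, 'b2b2'),   # changed   -> appears in both diffs
    ('D.ipa', 400, 'dddd'),   # brand new -> appears in new_diff only
}

old_diff = old_entries - new_entries   # retired via setPermanentError()
new_diff = new_entries - old_entries   # (re)added via updateIpaUrl()

print(sorted(old_diff))  # [('B.ipa', 200, 'bbbb'), ('C.ipa', 300, 'cccc')]
print(sorted(new_diff))  # [('B.ipa', 250, 'b2b2'), ('D.ipa', 400, 'dddd')]
```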