Files
botlib/botlib/feed2list.py
2022-04-09 03:45:48 +02:00

68 lines
2.4 KiB
Python

#!/usr/bin/env python3
import xml.etree.ElementTree as ET
from typing import List, Dict, Any, Optional, Union, TextIO, BinaryIO
from .helper import StrFormat
def Feed2List(
fp: Optional[Union[TextIO, BinaryIO]],
*, keys: Optional[List[str]] = None
) -> List[Dict[str, Any]]:
''' Parse RSS or Atom feed and return list of entries. '''
def parse_without_namespace(file: Union[TextIO, BinaryIO]) -> ET.Element:
ns = {}
xml_iter = ET.iterparse(file, ('start-ns', 'start'))
for event, elem in xml_iter:
if event == 'start-ns':
ns['{' + elem[1]] = elem[0] + ':' if elem[0] else ''
elif event == 'start':
tag = elem.tag.split('}')
elem.tag = ''.join(ns[x] for x in tag[:-1]) + tag[-1]
return xml_iter.root
if not fp:
return []
# detect feed format (RSS / Atom)
root = parse_without_namespace(fp)
fp.close()
if root.tag == 'rss': # RSS
selector = 'channel/item'
date_fields = ['pubDate', 'lastBuildDate']
elif root.tag == 'feed': # Atom
selector = 'entry'
date_fields = ['updated', 'published']
else:
raise NotImplementedError('Unrecognizable feed format')
# parse XML
result = []
for item in root.findall(selector):
obj = {} # type: Dict[str, Any]
for child in item:
tag = child.tag
# Filter keys that are clearly not wanted by user
if keys and tag not in keys:
continue
value = (child.text or '').strip()
# For date-fields, create and return date
if tag in date_fields:
value = StrFormat.to_date(value)
# Return dict if has attributes or string without attribs
attr = child.attrib
if attr:
if value:
attr[''] = value
value = attr
# Auto-create list type if duplicate keys are used
try:
prev_val = obj[tag]
if not isinstance(prev_val, list):
obj[tag] = [prev_val]
obj[tag].append(value)
except KeyError:
obj[tag] = value
# Each entry has a key-value-dict. Value may be string or attrib-dict.
# Value may also be a list of mixed string and attrib-dict values.
result.append(obj)
return result