Parsing compressed xml feed into ElementTree

You can pass the value returned by urlopen() directly to GzipFile() and in turn you can pass it to ElementTree methods such as iterparse():

#!/usr/bin/env python3
import xml.etree.ElementTree as etree
from gzip import GzipFile
from urllib.request import urlopen, Request

with urlopen(Request("http://smarkets.s3.amazonaws.com/oddsfeed.xml",
                     headers={"Accept-Encoding": "gzip"})) as response, \
     GzipFile(fileobj=response) as xml_file:
    for elem in getelements(xml_file, 'interesting_tag'):
        process(elem)

where getelements() allows to parse files that do not fit in memory.

def getelements(filename_or_file, tag):
    """Yield *tag* elements from *filename_or_file* xml incrementaly."""
    context = iter(etree.iterparse(filename_or_file, events=('start', 'end')))
    _, root = next(context) # get root element
    for event, elem in context:
        if event == 'end' and elem.tag == tag:
            yield elem
            root.clear() # free memory

To preserve memory, the constructed xml tree is cleared on each tag element.

Leave a Comment