Getting Unicode decode error using lxml.iterparse

digitig at digitig at
Tue May 22 18:56:43 EDT 2018

I'm trying to read my iTunes library in Python using iterparse. My current stub is:

---- Snip ----

import sys
import datetime
import xml.etree.ElementTree as ET
import argparse
import re

class Library:
    unmarshallers = {
        # collections
        "array": lambda x: [v.text for v in x],
        "dict": lambda x:
            dict((x[i].text, x[i+1].text) for i in range(0, len(x), 2)),
        "key": lambda x: x.text or "",

        # simple types
        "string": lambda x: x.text or "",
        "data": lambda x: base64.decodestring(x.text or ""),
        "date": lambda x: datetime.datetime(*map(int, re.findall("\d+", x.text))),
        "true": lambda x: True,
        "false": lambda x: False,
        "real": lambda x: float(x.text),
        "integer": lambda x: int(x.text)

    def load(self, file):
        parser = ET.iterparse(file)
        for action, elem in parser:
            unmarshal = self.unmarshallers.get(elem.tag)
            if unmarshal:
                data = unmarshal(elem)
                elem.text = data
            elif elem.tag != "plist":
                raise IOError("unknown plist type: %r" % elem.tag)
        return parser.root[0].text
    def __init__(self, infile):
        self.root = self.load(infile)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description = "Parse an iTunes library file to a set of CSV files suitable for import to a database.")
    parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    print('Infile = ', args.infile)
    library = Library(args.infile)

My input file (reduced to home in on the error) is:

---- snip -----

<?xml version="1.0" encoding="UTF-8"?>
<plist version="1.0">
			<key>Name</key><string>Part 2. The Death Of Enkidu. Skon Přitele Mého Mne Zdeptal Težče</string>

---- snip ----

<?xml version="1.0" encoding="UTF-8"?>
<plist version="1.0">
			<key>Name</key><string>Part 2. The Death Of Enkidu. Skon Přitele Mého Mne Zdeptal Težče</string>

I'm getting an error on one part of the XML:

 File "C:\Users\digit\Anaconda3\lib\encodings\", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]

UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d in position 202: character maps to <undefined>

I suspect the issue is that it's using, which I don't think is UTF-8 as specified in the XML prolog. Is this an iterparse problem, or am I using it wrongly?


More information about the Python-list mailing list