Writing an XML Header Using LXML

I am currently writing a script to convert a bunch of XML files from different encodings into unified UTF-8.

First, I will try to determine the encoding using LXML:

def get_source_encoding(self):
    tree = etree.parse(self.inputfile)
    encoding = tree.docinfo.encoding
    self.inputfile.seek(0)
    return (encoding or '').lower()

If it is empty, I try to get it from chardet:

def guess_source_encoding(self):
    chunk = self.inputfile.read(1024 * 10)
    self.inputfile.seek(0)
    return chardet.detect(chunk).lower()

Then I use codecsto convert the file encoding:

def convert_encoding(self, source_encoding, input_filename, output_filename):
    chunk_size = 16 * 1024

    with codecs.open(input_filename, "rb", source_encoding) as source:
        with codecs.open(output_filename, "wb", "utf-8") as destination:
            while True:
                chunk = source.read(chunk_size)

                if not chunk:
                    break;

                destination.write(chunk)

Finally, I am trying to rewrite the XML header. If originally there was an XML header

<?xml version="1.0"?>

or

<?xml version="1.0" encoding="windows-1255"?>

I would like to convert it to

<?xml version="1.0" encoding="UTF-8"?>

My current code is not working:

def edit_header(self, input_filename):
    output_filename = tempfile.mktemp(suffix=".xml")

    with open(input_filename, "rb") as source:
        parser = etree.XMLParser(encoding="UTF-8")
        tree = etree.parse(source, parser)

        with open(output_filename, "wb") as destination:
            tree.write(destination, encoding="UTF-8")

The file I'm testing has a header that does not indicate the encoding. How can I get it to correctly display the header with the specified encoding?

+4
source share
1

Try:

tree.write(destination, xml_declaration=True, encoding='UTF-8')

API:

xml_declaration, XML . False , True , None US-ASCII UTF-8 ( None).

ipython:

In [15]:  etree.ElementTree(etree.XML('<hi/>')).write(sys.stdout, xml_declaration=True, encoding='UTF-8')
<?xml version='1.0' encoding='UTF-8'?>
<hi/>

, , . lxml .

, , ( , Python2.7):

def convert_encoding(self, source_encoding, input_filename, output_filename):
    tree = etree.parse(input_filename)
    with open(output_filename, 'w') as destination:
        tree.write(destination, encoding='utf-8', xml_declaration=True)
+5

All Articles