使用Python实现XML文件转为Excel文件

由于项目组中原来的文件使用的XML格式作为配置，扩展性很好，但是阅读起来不是很直观，编辑起来也是不很方便，特别一些规则的二维表，个人觉得使用EXCEL文件会方便很多。所以为了方便阅读与编辑这些规则的二维表，花了一些时间写了一个Python脚本，以实现将XML文件转为Excel文件。这里支持XML文件转为一个Sheet或者多个Sheet：

如果第二层所有标签都相同则会转为一个Sheet，所有第二层的标签都会作为行数据
如果第二层的标签有多种，则会把第二层的不同标签作为不同的Sheet，第三层的数据作为行数据

其它情况未作测试。

注意：脚本会自动根据所有同层标签计算所有字段以及所有子标签及其重复数，并作为列的扩展，如果XML的嵌套过深或者太复杂，可能会导致扩展后的列数超过EXCEL所支持的最大列数（16384列），导致失败。

附上源码以共享，当然，如果有更好的Idea进行改进欢迎留言。

  1#
  2#  XML文件转换成Excel文件
  3#
  4#  版本：1.0
  5#
  6#  作者：Witton Bell
  7#  E_Mail:witton@163.com
  8#
  9#
 10#  功能描述：
 11#
 12#  自动检测源文件编码是否为UTF8，如果不是，则使用GB18030进行解码，并转换成UTF8编码的文件，再进行XML源文件的读取
 13#
 14#  自动提取XML中的标签作为Excel的标题列，支持嵌套与重复，但是所有列不能超过EXCEL支持的最大列（16384列)
 15#
 16#  仅支持XML的第一层只有一个标签，
 17#  第二层标签可以只有一个标签（允许重复），此时第一层标签作为Excel的WorkSheet，第二层标签及其嵌套作为一行数据）
 18#  第二层标签也可以有多个标签（允许重复），此时第二层标签作为Excel的WorkSheet，第三层标签及其嵌套作为一行数据）
 19#
 20#  由于openpyxl默认为Excel的xlsx格式，所以本脚本的所有Excel文件格式都为xlsx
 21#
 22#
 23import openpyxl
 24from openpyxl.styles import Alignment
 25import xml.dom.minidom
 26from xml.dom.minidom import Document
 27from openpyxl.styles import Border, Side
 28import chardet
 29import os
 30
 31
 32class XMLNode:
 33    def __init__(self):
 34        self.name = ""
 35        self.properties = []
 36        self.child = []
 37        self.layer = 0
 38        self.index = 0
 39        self.parent = None
 40        self.node_info = None
 41
 42
 43class XMLNodeInfo:
 44    def __init__(self, node):
 45        self.map_properties = {}
 46        self.map_child = {}
 47        self.max_index = 0
 48        self.layer = node.layer
 49        self.name = node.name
 50
 51        self.register(node)
 52
 53    def register(self, node):
 54        for k in node.properties:
 55            if self.map_properties.get(k[0]) is None:
 56                self.map_properties[k[0]] = self.map_properties.__len__()
 57
 58        for ch in node.child:
 59            v = self.map_child.get(ch.name)
 60            if v is None:
 61                self.map_child[ch.name] = ch.node_info
 62
 63        if node.index > self.max_index:
 64            self.max_index = node.index
 65
 66
 67class XMLReader:
 68    # is_auto_convert2utf8：是否自动转换为UTF8编码
 69    # is_convert_to_original_file：如果转换为UTF8，是否覆盖原文件
 70    def __init__(self, file_path, is_auto_convert2utf8, is_convert_to_original_file):
 71        self.__root = []
 72        self.__map_node_info = {}
 73        if is_auto_convert2utf8:
 74            is_tmp_file, tmp_file_path = self.__convert_to_utf8(file_path)
 75            fd = xml.dom.minidom.parse(tmp_file_path)
 76            if is_tmp_file:
 77                if is_convert_to_original_file:
 78                    os.remove(file_path)
 79                    os.rename(tmp_file_path, file_path)
 80                else:
 81                    os.remove(tmp_file_path)
 82        else:
 83            fd = xml.dom.minidom.parse(file_path)
 84        index = 0
 85        for child in fd.childNodes:
 86            if child.nodeType != xml.dom.Node.ELEMENT_NODE:
 87                continue
 88
 89            self.__read_node(child, self.__root, index, None)
 90            index += 1
 91
 92    def get_root_node(self):
 93        return self.__root
 94
 95    @staticmethod
 96    def __convert_to_utf8(file_path):
 97        fd = open(file_path, "rb")
 98        fd.seek(0, 2)
 99        size = fd.tell()
100        if size > 1024 * 1024:
101            size = 1024 * 1024
102        fd.seek(0, 0)
103        text = fd.read(size)
104        ret = chardet.detect(text)
105        if ret['encoding'].lower().find("utf-8") != -1:
106            return False, file_path
107
108        tmp_file = file_path + ".tmp"
109        file = open(tmp_file, "w", encoding="utf-8", newline="\n")
110        fd.seek(0, 0)
111        line = fd.readline()
112        while line.__len__() > 0:
113            file.write(line.decode("gb18030"))
114            line = fd.readline()
115        file.close()
116        fd.close()
117
118        return True, tmp_file
119
120    @staticmethod
121    def __get_attrib(node, rc):
122        if node._attrs is None:
123            return
124        for key in node._attrs:
125            v = node._attrs[key]._value
126            rc.append([key, v])
127
128    def __read_node(self, node, root, index, parent, layer=1):
129        xml_node = XMLNode()
130        xml_node.name = node.nodeName
131        xml_node.layer = layer
132        xml_node.index = index
133        xml_node.parent = parent
134        self.__get_attrib(node, xml_node.properties)
135        i = 0
136        for child in node.childNodes:
137            if child.nodeType != xml.dom.Node.ELEMENT_NODE:
138                continue
139
140            self.__read_node(child, xml_node.child, i, xml_node, layer + 1)
141            i += 1
142        root.append(xml_node)
143        self.__register(xml_node)
144
145    def __register(self, node):
146        key = node.name + str(node.layer)
147        nd = self.__map_node_info.get(key)
148        if nd is None:
149            nd = XMLNodeInfo(node)
150            node.node_info = nd
151            self.__map_node_info[key] = nd
152        else:
153            nd.register(node)
154            node.node_info = nd
155
156
157class XMLWriter:
158    def __init__(self, xml_file_path, xml_node_list):
159        doc = Document()
160        for node in xml_node_list:
161            ele = self.__write_node(node, doc)
162            doc.appendChild(ele)
163
164        f = open(xml_file_path, 'w', encoding='utf-8')
165        doc.writexml(f, indent='\t', addindent='\t', encoding='utf-8', newl='\n')
166
167    def __write_node(self, node, doc):
168        ele = doc.createElement(node.name)
169        for prop in node.properties:
170            ele.setAttribute(prop[0], prop[1])
171
172        for child in node.childs:
173            ret = self.__write_node(child, doc)
174            ele.appendChild(ret)
175
176        return ele
177
178
179class XmlToXls:
180    # read_from_xml_file_path：XML源文件完整路径
181    # save_to_xls_file_path：保存转换后的Excel文件完整路径
182    # is_auto_convert2utf8：是否自动转换为UTF8编码
183    # is_convert_to_original_file：如果转换为UTF8，是否覆盖原文件
184    # is_merge_head：如果表头有多行，是否合并层属关系的表头
185    # is_border_cell：是否添加单元格的边框
186    # is_alignment_center：单元格是否居中对齐
187    def __init__(self, read_from_xml_file_path, save_to_xls_file_path,
188                 is_auto_convert2utf8=True, is_convert_to_original_file=False,
189                 is_merge_head=False, is_border_cell=False, is_alignment_center=True):
190        try:
191            self.__is_merge_head = is_merge_head
192            if is_alignment_center:
193                self.__alignment = Alignment(horizontal='center', vertical='center')
194            else:
195                self.__alignment = None
196            if is_border_cell:
197                side = Side(border_style='thin', color='000000')
198                self.__border = Border(side, side, side, side, side)
199            else:
200                self.__border = None
201            wb = openpyxl.Workbook()
202            wb.encoding = 'utf-8'
203            for sh in wb.worksheets:
204                wb.remove_sheet(sh)
205
206            reader = XMLReader(read_from_xml_file_path,
207                               is_auto_convert2utf8,
208                               is_convert_to_original_file)
209            self.__write(reader.get_root_node(), wb)
210            wb.save(save_to_xls_file_path)
211            wb.close()
212        except Exception as e:
213            print(e)
214
215    def __write(self, xml_node, wb):
216        self.__map_field = {}
217        for node in xml_node:
218            if node.node_info.map_child.__len__() == 1:
219                self.__start_layer = 1
220                self.__write_sheet(node, wb)
221            else:
222                self.__start_layer = 2
223                for child in node.child:
224                    self.__write_sheet(child, wb)
225
226    def __write_sheet(self, node, wb):
227        sh = wb.create_sheet(node.name)
228        self.__write_head(node.node_info, sh)
229        self.__write_data(node, sh)
230
231    def __write_head(self, node, sh):
232        for key in node.map_child:
233            col = 1
234            child = node.map_child[key]
235            for k in child.map_properties:
236                c = child.map_properties[k]
237                self.__write_cell(sh, child.layer - self.__start_layer, c + col, k)
238            col += child.map_properties.__len__()
239            self.__write_head_ext(child, sh, col)
240            break
241
242    def __write_head_ext(self, node, sh, col):
243        for key in node.map_child:
244            child = node.map_child[key]
245            num = child.map_properties.__len__()
246            for i in range(0, child.max_index + 1):
247                if col > 16384:
248                    raise Exception("超过EXCEL最大列数(16384列)限制，转换失败")
249
250                old_col = col
251                self.__write_cell(sh, child.layer - self.__start_layer - 1, col, child.name)
252                for k in child.map_properties:
253                    c = child.map_properties[k]
254                    self.__write_cell(sh, child.layer - self.__start_layer, c + col, k)
255                col += num
256                col = self.__write_head_ext(child, sh, col)
257                if self.__is_merge_head:
258                    merge_row = child.layer - self.__start_layer - 1
259                    sh.merge_cells(start_row=merge_row, end_row=merge_row,
260                                   start_column=old_col, end_column=col - 1)
261        return col
262
263    def __write_data(self, node, sh):
264        row = sh.max_row + 1
265        sh.freeze_panes = sh.cell(row, 1)
266        for child in node.child:
267            col = 1
268            self.__write_data_ext(child, sh, row, col)
269            row += 1
270
271    def __write_data_ext(self, node, sh, row, col):
272        m = node.node_info.map_properties
273        max_prop_num = m.__len__()
274        for prop in node.properties:
275            c = m[prop[0]]
276            self.__write_cell(sh, row, c + col, prop[1])
277        col += max_prop_num
278
279        map_child = {}
280        for child in node.child:
281            if map_child.get(child.name) is None:
282                map_child[child.name] = 1
283            else:
284                map_child[child.name] += 1
285            col = self.__write_data_ext(child, sh, row, col)
286
287        for key in node.node_info.map_child:
288            child = node.node_info.map_child[key]
289            all_count = child.max_index + 1
290            count = map_child.get(key)
291            if count is not None:
292                all_count -= count
293
294            for i in range(0, all_count):
295                col += child.map_properties.__len__()
296
297        return col
298
299    def __write_cell(self, sh, row, col, value):
300        if value.isdigit():
301            cell = sh.cell(row, col, int(value))
302        else:
303            cell = sh.cell(row, col, value)
304        cell.alignment = self.__alignment
305        cell.border = self.__border

由于使用的XML解析器不支持值中有大于(>)，小于(<)这些特殊符号，所以如果值中有这些符号的XML文件会解析失败，报错：

1not well-formed (invalid token)

比如下面的XML文件就会报上面的错：

1<?xml version='1.0' encoding='UTF-8'?>
2<test>
3<testData value="<测试数据>"/>
4</test>

也不支持没有根节点的XML文件：比如：

 1<?xml version='1.0' encoding='UTF-8'?>
 2<A Value="A1">
 3    <AA value="a"/>
 4</A>
 5<B Value="B1">
 6    <BB value="b"/>
 7</B>
 8<C Value="C1">
 9    <CC value="c"/>
10</C>

会报错：

1junk after document element

C++使用的tinyxml是可以正常解析大于小于等特殊符号的，网上有一个 pytinyxml2开源项目，让python可以使用tinyxml进行解析。

安装pytinyxml2之前需要先安装swig，Windows下可以下载： swigwin-4.0.1.zip，这是已经编译好的版本，解压出来后添加路径到PATH环境变量即可。非Windows可以下载 swig-4.0.1.tar.gz进行编译安装，也可以直接：

1yum install swig

安装好swig后，在pytinyxml2源码目录中执行：

1python setup.py install

不能使用

1pip install pytinyxml2

进行安装，我遇到有报错:

 1running bdist_wheel
 2  running build
 3  running build_py
 4  creating build
 5  creating build/lib.linux-x86_64-3.5
 6  copying pytinyxml2.py -> build/lib.linux-x86_64-3.5
 7  running build_ext
 8  building '_pytinyxml2' extension
 9  swigging pytinyxml2.i to pytinyxml2_wrap.cpp
10  swig -python -c++ -o pytinyxml2_wrap.cpp pytinyxml2.i
11  pytinyxml2.i:5: Error: Unable to find 'tinyxml2.h'
12  error: command 'swig' failed with exit status 1

查看 pytinyxml2的包可以看到，里面缺少tinyxml2.h

项目中原来的程序只能读取XML格式的文档，我们为了方便编辑与查阅，所以把XML转为Excel，编辑完成后，还需要把Excel转为原来的XML，所以实现了XLS转XML，并对之前的代码作了部分修改，附上源码：

  1# -*- coding: UTF-8 -*-
  2#
  3#  XML文件与Excel文件互转
  4#
  5#  版本：1.1
  6#
  7#  作者：Witton Bell
  8#  E_Mail:witton@163.com
  9#
 10#
 11#  功能描述：
 12#
 13#  XmlToXls:
 14#
 15#  自动检测源文件编码是否为UTF8，如果不是，则使用GB18030进行解码，并转换成UTF8编码的文件，再进行XML源文件的读取
 16#
 17#  自动提取XML中的标签作为Excel的标题列，支持嵌套与重复，但是所有列不能超过EXCEL支持的最大列（16384列)
 18#
 19#  使用XMLReader仅支持XML的只有一个根标签（标准XML格式），使用TinyXMLReader支持有多个根标签
 20#  第二层标签可以只有一个标签（允许重复），此时第一层标签作为Excel的WorkSheet，第二层标签及其嵌套作为一行数据）
 21#  第二层标签也可以有多个标签（允许重复），此时第二层标签作为Excel的WorkSheet，第三层标签及其嵌套作为一行数据）
 22#
 23#  由于openpyxl默认为Excel的xlsx格式，所以本脚本的所有Excel文件格式都为xlsx
 24#
 25#  XlsToXml:
 26# 同样XMLWriter仅支持XML只有一个根标签（标准XML格式），使用TinyXMLWriter支持有多个根标签
 27# Excel文件需要有层次分明的表头，并且需要冻结表头
 28#
 29import openpyxl
 30from openpyxl.styles import Alignment
 31from openpyxl.styles import Border, Side
 32from openpyxl.comments import Comment
 33import openpyxl.utils as xls_util
 34import chardet
 35import os
 36import xml.dom.minidom
 37import copy
 38
 39class XMLNode:
 40    def __init__(self):
 41        self.name = ""
 42        self.properties = []
 43        self.child = []
 44        self.layer = 0
 45        self.index = 0
 46        self.parent = None
 47        self.node_info = None
 48
 49
 50class XMLNodeInfo:
 51    def __init__(self, node):
 52        self.map_properties = {}
 53        self.map_child = {}
 54        self.max_index = 0
 55        self.layer = node.layer
 56        self.name = node.name
 57        self.parent = node.parent
 58
 59        self.register(node)
 60
 61    def register(self, node):
 62        for k in node.properties:
 63            if self.map_properties.get(k[0]) is None:
 64                self.map_properties[k[0]] = self.map_properties.__len__()
 65
 66        for ch in node.child:
 67            v = self.map_child.get(ch.name)
 68            if v is None:
 69                self.map_child[ch.name] = ch.node_info
 70
 71        if node.index > self.max_index:
 72            self.max_index = node.index
 73
 74
 75class _XMLReaderBase:
 76    def __init__(self):
 77        self._root = []
 78        self._map_node_info = {}
 79
 80    def __calc_node_key(self, node):
 81        key = ""
 82        if node.parent is not None:
 83            key = self.__calc_node_key(node.parent)
 84        return "%s_%s" % (key, node.name)
 85
 86    def _register(self, node):
 87        key = self.__calc_node_key(node)
 88        nd = self._map_node_info.get(key)
 89        if nd is None:
 90            nd = XMLNodeInfo(node)
 91            node.node_info = nd
 92            self._map_node_info[key] = nd
 93        else:
 94            nd.register(node)
 95            node.node_info = nd
 96
 97    @staticmethod
 98    def _convert_to_utf8(file_path):
 99        fd = open(file_path, "rb")
100        fd.seek(0, 2)
101        size = fd.tell()
102        if size > 1024 * 1024:
103            size = 1024 * 1024
104        fd.seek(0, 0)
105        text = fd.read(size)
106        ret = chardet.detect(text)
107        if ret['encoding'].lower().find("utf-8") != -1:
108            return False, file_path
109
110        tmp_file = file_path + ".tmp"
111        file = open(tmp_file, "w", encoding="utf-8", newline="\n")
112        fd.seek(0, 0)
113        line = fd.readline()
114        while line.__len__() > 0:
115            file.write(line.decode("gb18030"))
116            line = fd.readline()
117        file.close()
118        fd.close()
119
120        return True, tmp_file
121
122
123# 该类使用xml不能解析值中带特殊符号的文件，也不支持没有统一根节点的XML文件，建议使用TinyXMLReader
124class XMLReader(_XMLReaderBase):
125    # is_auto_convert2utf8：是否自动转换为UTF8编码
126    # is_convert_to_original_file：如果转换为UTF8，是否覆盖原文件
127    def read(self, file_path, is_auto_convert2utf8=True, is_convert_to_original_file=False):
128        _XMLReaderBase.__init__(self)
129        if is_auto_convert2utf8:
130            is_tmp_file, tmp_file_path = self._convert_to_utf8(file_path)
131            fd = xml.dom.minidom.parse(tmp_file_path)
132            if is_tmp_file:
133                if is_convert_to_original_file:
134                    os.remove(file_path)
135                    os.rename(tmp_file_path, file_path)
136                else:
137                    os.remove(tmp_file_path)
138        else:
139            fd = xml.dom.minidom.parse(file_path)
140        index = 0
141        for child in fd.childNodes:
142            if child.nodeType != xml.dom.Node.ELEMENT_NODE:
143                continue
144
145            self.__read_node(child, self._root, index, None)
146            index += 1
147
148        return self._root
149
150    @staticmethod
151    def __get_attrib(node, rc):
152        if node._attrs is None:
153            return
154        for key in node._attrs:
155            v = node._attrs[key]._value
156            rc.append([key, v])
157
158    def __read_node(self, node, root, index, parent, layer=1):
159        xml_node = XMLNode()
160        xml_node.name = node.nodeName
161        xml_node.layer = layer
162        xml_node.index = index
163        xml_node.parent = parent
164        self.__get_attrib(node, xml_node.properties)
165        i = 0
166        for child in node.childNodes:
167            if child.nodeType != xml.dom.Node.ELEMENT_NODE:
168                continue
169
170            self.__read_node(child, xml_node.child, i, xml_node, layer + 1)
171            i += 1
172        root.append(xml_node)
173        self._register(xml_node)
174
175
176# 该类需要安装pytinyxml2，参见：https://blog.csdn.net/witton/article/details/100302498
177class TinyXMLReader(_XMLReaderBase):
178    # is_auto_convert2utf8：是否自动转换为UTF8编码
179    # is_convert_to_original_file：如果转换为UTF8，是否覆盖原文件
180    def read(self, file_path, is_auto_convert2utf8=True, is_convert_to_original_file=False):
181        _XMLReaderBase.__init__(self)
182        import pytinyxml2
183        doc = pytinyxml2.XMLDocument()
184        if is_auto_convert2utf8:
185            is_tmp_file, tmp_file_path = self._convert_to_utf8(file_path)
186            doc.LoadFile(tmp_file_path)
187            if is_tmp_file:
188                if is_convert_to_original_file:
189                    os.remove(file_path)
190                    os.rename(tmp_file_path, file_path)
191                else:
192                    os.remove(tmp_file_path)
193        else:
194            doc.LoadFile(file_path)
195        node = doc.RootElement()
196        index = 0
197        while node is not None:
198            self.__read_node(node, self._root, index, None)
199            node = node.NextSiblingElement()
200            index += 1
201
202        return self._root
203
204    @staticmethod
205    def __get_attrib(node, rc):
206        attrib = node.FirstAttribute()
207        while attrib is not None:
208            key = attrib.Name()
209            v = attrib.Value()
210            rc.append([key, v])
211            attrib = attrib.Next()
212
213    def __read_node(self, node, root, index, parent, layer=1):
214        xml_node = XMLNode()
215        xml_node.name = node.Value()
216        xml_node.layer = layer
217        xml_node.index = index
218        xml_node.parent = parent
219        self.__get_attrib(node, xml_node.properties)
220        i = 0
221        child = node.FirstChildElement()
222        while child is not None:
223            self.__read_node(child, xml_node.child, i, xml_node, layer + 1)
224            child = child.NextSiblingElement()
225            i += 1
226        root.append(xml_node)
227        self._register(xml_node)
228
229
230class XMLWriter:
231    def __init__(self, xml_file_path, xml_node_list):
232        doc = xml.dom.minidom.Document()
233        for node in xml_node_list:
234            ele = self.__write_node(node, doc)
235            doc.appendChild(ele)
236
237        f = open(xml_file_path, 'w', encoding='utf-8')
238        doc.writexml(f, indent='\t', addindent='\t', encoding='utf-8', newl='\n')
239        f.close()
240
241    def __write_node(self, node, doc):
242        ele = doc.createElement(node.name)
243        for prop in node.properties:
244            ele.setAttribute(prop[0], prop[1])
245
246        for child in node.child:
247            ret = self.__write_node(child, doc)
248            ele.appendChild(ret)
249
250        return ele
251
252
253class TinyXMLWriter:
254    # is_convert_spec_character: 是否转换特殊字符
255    def __init__(self, xml_file_path, xml_node_list, is_convert_spec_character=True):
256        import pytinyxml2
257        doc = pytinyxml2.XMLDocument(is_convert_spec_character)
258        decl = doc.NewDeclaration()
259        doc.LinkEndChild(decl)
260        for node in xml_node_list:
261            ele = self.__write_node(node, doc)
262            doc.LinkEndChild(ele)
263
264        doc.SaveFile(xml_file_path)
265
266    def __write_node(self, node, doc):
267        ele = doc.NewElement(node.name)
268        for prop in node.properties:
269            ele.SetAttribute(prop[0], prop[1])
270
271        for child in node.child:
272            ret = self.__write_node(child, doc)
273            ele.LinkEndChild(ret)
274
275        return ele
276
277
278class XmlToXls:
279    def __init__(self):
280        self.__is_merge_head = False
281        self.__alignment_center = None
282        self.__border = None
283
284    # read_from_xml_file_path：XML源文件完整路径
285    # save_to_xls_file_path：保存转换后的Excel文件完整路径
286    # is_auto_convert2utf8：是否自动转换为UTF8编码
287    # is_convert_to_original_file：如果转换为UTF8，是否覆盖原文件
288    # is_merge_head：如果表头有多行，是否合并层属关系的表头
289    # is_border_cell：是否添加单元格的边框
290    # is_use_tiny_xml：是否使用tinyXML
291    def convert(self, read_from_xml_file_path, save_to_xls_file_path,
292                is_auto_convert2utf8=True, is_convert_to_original_file=False,
293                is_merge_head=False, is_border_cell=False, is_use_tiny_xml=True):
294        try:
295            self.__is_merge_head = is_merge_head
296            self.__alignment_center = Alignment(horizontal='center', vertical='center')
297            if is_border_cell:
298                side = Side(border_style='thin', color='000000')
299                self.__border = Border(side, side, side, side, side)
300            else:
301                self.__border = None
302            wb = openpyxl.Workbook()
303            wb.encoding = 'utf-8'
304            for sh in wb.worksheets:
305                wb.remove_sheet(sh)
306
307            if is_use_tiny_xml:
308                reader = TinyXMLReader()
309            else:
310                reader = XMLReader()
311            root = reader.read(read_from_xml_file_path,
312                               is_auto_convert2utf8,
313                               is_convert_to_original_file)
314            self.__write(root, wb)
315            wb.save(save_to_xls_file_path)
316            wb.close()
317        except Exception as e:
318            print(e)
319
320    # src_path_dir：XML源目录完整路径
321    # dst_path_dir：保存转换后的Excel文件完整目录路径，如果为None或者为空，则直接转换在源文件应对目录下
322    # is_auto_convert2utf8：是否自动转换为UTF8编码
323    # is_convert_to_original_file：如果转换为UTF8，是否覆盖原文件
324    # is_merge_head：如果表头有多行，是否合并层属关系的表头
325    # is_border_cell：是否添加单元格的边框
326    # is_use_tiny_xml：是否使用tinyXML
327    def convert_dirs(self, src_path_dir, dst_path_dir=None,
328                     is_auto_convert2utf8=True, is_convert_to_original_file=False,
329                     is_merge_head=False, is_border_cell=False, is_use_tiny_xml=True):
330        if dst_path_dir is not None and dst_path_dir != "":
331            if not os.path.exists(dst_path_dir):
332                os.mkdir(dst_path_dir)
333
334        for root, dirs, files in os.walk(src_path_dir):
335            for name in files:
336                basename, ext = os.path.splitext(name)
337                if ext != ".xml":
338                    continue
339
340                src = os.path.join(root, name)
341                target = basename + ".xlsx"
342                print("处理%s" % src)
343                if dst_path_dir is None or dst_path_dir == "":
344                    dst = os.path.join(root, target)
345                else:
346                    dst = os.path.join(dst_path_dir, target)
347                self.convert(src, dst, is_auto_convert2utf8, is_convert_to_original_file,
348                             is_merge_head, is_border_cell, is_use_tiny_xml)
349
350    def __write(self, xml_node, wb):
351        self.__map_field = {}
352        for node in xml_node:
353            if node.node_info.map_child.__len__() == 1:
354                self.__is_multi_sheet = False
355                self.__write_sheet(node, wb)
356            else:
357                self.__is_multi_sheet = True
358                for child in node.child:
359                    self.__write_sheet(child, wb)
360
361    def __write_sheet(self, node, wb):
362        sh = wb.create_sheet(node.name)
363        self.__write_head(node, sh)
364        self.__write_data(node, sh)
365
366    def __write_head(self, node, sh):
367        node_info = node.node_info
368        if self.__is_multi_sheet:
369            self.__write_head_cell(sh, node_info.parent.layer, 1, node_info.parent.name)
370            self.__write_head_cell(sh, node_info.layer, 1, node_info.name)
371            comment_str = ""
372            for prop in node.properties:
373                comment_str += '%s="%s"\n' % (prop[0], prop[1])
374            if comment_str != "":
375                sh.cell(node_info.layer, 1).comment = Comment(comment_str, "", width=300)
376        elif not self.__is_multi_sheet:
377            self.__write_head_cell(sh, node_info.layer, 1, node_info.name)
378
379        child_name = None
380        for key in node_info.map_child:
381            col = 1
382            child = node_info.map_child[key]
383            child_name = child.name
384            for k in child.map_properties:
385                c = child.map_properties[k]
386                self.__write_head_cell(sh, child.layer, c + col, k)
387            col += child.map_properties.__len__()
388            self.__write_head_ext(child, sh, col)
389            break
390
391        if self.__is_multi_sheet:
392            row = 3
393        else:
394            row = 2
395
396        if child_name is not None:
397            sh.insert_rows(row)
398            self.__write_head_cell(sh, row, 1, child_name)
399
400    def __write_head_ext(self, node, sh, col):
401        for key in node.map_child:
402            child = node.map_child[key]
403            num = child.map_properties.__len__()
404            for i in range(0, child.max_index + 1):
405                if col > 16384:
406                    raise Exception("超过EXCEL最大列数(16384列)限制，转换失败")
407
408                old_col = col
409                self.__write_head_cell(sh, child.layer - 1, col, child.name)
410                for k in child.map_properties:
411                    c = child.map_properties[k]
412                    self.__write_head_cell(sh, child.layer, c + col, k)
413                col += num
414                col = self.__write_head_ext(child, sh, col)
415                if self.__is_merge_head:
416                    merge_row = child.layer - 1
417                    sh.merge_cells(start_row=merge_row, end_row=merge_row,
418                                   start_column=old_col, end_column=col - 1)
419        return col
420
421    def __write_data(self, node, sh):
422        row = sh.max_row + 1
423        sh.freeze_panes = sh.cell(row, 1)
424        for child in node.child:
425            col = 1
426            self.__write_data_ext(child, sh, row, col)
427            row += 1
428
429    def __write_data_ext(self, node, sh, row, col):
430        m = node.node_info.map_properties
431        max_prop_num = m.__len__()
432        for prop in node.properties:
433            c = m[prop[0]]
434            self.__write_cell(sh, row, c + col, prop[1])
435        col += max_prop_num
436
437        map_child = {}
438        for child in node.child:
439            if map_child.get(child.name) is None:
440                map_child[child.name] = 1
441            else:
442                map_child[child.name] += 1
443            col = self.__write_data_ext(child, sh, row, col)
444
445        for key in node.node_info.map_child:
446            child = node.node_info.map_child[key]
447            all_count = child.max_index + 1
448            count = map_child.get(key)
449            if count is not None:
450                all_count -= count
451
452            for i in range(0, all_count):
453                col += child.map_properties.__len__()
454
455        return col
456
457    def __write_head_cell(self, sh, row, col, value):
458        cell = sh.cell(row, col, value)
459        cell.border = self.__border
460        cell.alignment = self.__alignment_center
461
462    def __write_cell(self, sh, row, col, value):
463        if value.isdigit():
464            cell = sh.cell(row, col, int(value))
465        else:
466            cell = sh.cell(row, col, value)
467        cell.border = self.__border
468
469
470class XlsToXml:
471    def __init__(self):
472        pass
473
474    @staticmethod
475    def __add_prop(map_field, col, value):
476        if map_field.__len__() == 0:
477            return
478
479        if map_field.get(col) is None:
480            # 找本节点
481            c = col - 1
482            while c >= 1:
483                if map_field.get(c) is not None:
484                    node = map_field[c][0]
485                    break
486                c -= 1
487        else:
488            node = map_field[col][0]
489        node.properties.append([value, col])
490
491    @staticmethod
492    def __add_node(map_field, row, col, value):
493        node = XMLNode()
494        node.name = value
495
496        if map_field.get(col) is not None:
497            node.parent = map_field[col][0]
498        else:
499            # 找父节点
500            c = col - 1
501            while c >= 1:
502                if map_field.get(c) is not None:
503                    if row > map_field[c][1]:
504                        node.parent = map_field[c][0]
505                        break
506                c -= 1
507
508        if node.parent is not None:
509            node.parent.child.append(node)
510            node.layer = node.parent.layer + 1
511        else:
512            node.layer = 1
513        map_field[col] = [node, row]
514        return node
515
516    def __read_xls(self, file_path):
517        wb = openpyxl.load_workbook(file_path)
518        root_data = XMLNode()
519        is_multi_sheet = wb.worksheets.__len__() > 1
520        for sh in wb.worksheets:
521            max_row = sh.max_row + 1
522            max_col = sh.max_column + 1
523            if sh.freeze_panes is None:
524                raise Exception("文件[%s]表单[%s]的无冻结窗口，无法确定表头，转为XML失败" %
525                                (os.path.basename(file_path), sh.title))
526
527            head_row, head_col = xls_util.coordinate_to_tuple(sh.freeze_panes)
528            if head_col != 1:
529                raise Exception("文件[%s]表单[%s]的冻结窗口列不为1，无法转为XML" % (os.path.basename(file_path), sh.title))
530
531            root = None
532
533            map_field = {}
534            for col in range(1, max_col):
535                for row in range(1, head_row):
536                    cell = sh.cell(row, col)
537                    value = cell.value
538                    comment = cell.comment
539
540                    if value is None:
541                        continue
542
543                    next_row = row + 1
544                    if next_row >= head_row:
545                        self.__add_prop(map_field, col, value)
546                        continue
547
548                    next_row_value = sh.cell(next_row, col).value
549                    if next_row_value is None:
550                        self.__add_prop(map_field, col, value)
551                        continue
552
553                    node = self.__add_node(map_field, row, col, value)
554                    if root is None:
555                        root = node
556
557                    if comment is None:
558                        continue
559
560                    comment = comment.text
561                    lines = comment.splitlines()
562                    for line in lines:
563                        props = line.split('=')
564                        kv = []
565                        for prop in props:
566                            prop = prop.replace('"', '')
567                            kv.append(prop)
568                            if kv.__len__() == 2:
569                                node.properties.append(kv)
570                                kv = []
571
572            root_data.name = root.name
573            root_data.layer = root.layer
574
575            if is_multi_sheet and root.child.__len__() > 0:
576                child_list = copy.deepcopy(root.child[0].child)
577                root.child[0].child = []
578                root_data.child.append(root.child[0])
579                root_data_child = root.child[0].child
580            else:
581                child_list = copy.deepcopy(root.child)
582                root_data_child = root_data.child
583
584            for row in range(head_row, max_row):
585                clone = copy.deepcopy(child_list)
586                for child in clone:
587                    self.__read_node_data(child, sh, row)
588                    root_data_child.append(child)
589
590        return root_data
591
592    def __read_node_data(self, node, sh, row):
593        prop_list = []
594        for prop in node.properties:
595            col = prop[1]
596            value = sh.cell(row, col).value
597            if value is not None:
598                prop_list.append([prop[0], value])
599
600        child_list = []
601        for child in node.child:
602            self.__read_node_data(child, sh, row)
603            if child.properties.__len__() > 0 or child.child.__len__() > 0:
604                child_list.append(child)
605                copy.copy(child)
606
607        node.properties = prop_list
608        node.child = child_list
609
610    def convert(self, src_file_path, dst_file_path, is_use_tiny_xml=True, is_convert_spec_character=False):
611        root = self.__read_xls(src_file_path)
612        if is_use_tiny_xml:
613            TinyXMLWriter(dst_file_path, [root], is_convert_spec_character)
614        else:
615            XMLWriter(dst_file_path, [root])
616
617    def convert_dirs(self, src_path_dir, dst_path_dir=None, is_use_tiny_xml=True, is_convert_spec_character=False):
618        for root, dirs, files in os.walk(src_path_dir):
619            for name in files:
620                basename, ext = os.path.splitext(name)
621                if ext != ".xls" and ext != ".xlsx":
622                    continue
623
624                src = os.path.join(root, name)
625                target = basename + "1" + ".xml"
626                print("处理%s" % src)
627                if dst_path_dir is None or dst_path_dir == "":
628                    dst = os.path.join(root, target)
629                else:
630                    dst = os.path.join(dst_path_dir, target)
631
632                try:
633                    self.convert(src, dst, is_use_tiny_xml, is_convert_spec_character)
634                except Exception as e:
635                    print(e)

有人在问如何使用，给一个XML转Excel的示例，假如有一个test.xml文件：

 1<?xml version="1.0" encoding="UTF-8"?>
 2<Test>
 3	<Data Type="1001" Desc = "这是描述">
 4		<Condition Value="10" Name="名字1">
 5			<AwardItem ItemId = "5" SubId = "10" Count = "10" />
 6		</Condition>
 7		<Condition Value="20" Name="名字2">
 8			<AwardItem ItemId = "5" SubId = "100" Count = "50" />
 9		</Condition>
10		<Condition Value="30" Name="名字3">
11			<AwardItem ItemId = "5" SubId = "1000" Count = "100" />
12		</Condition>
13	</Data>
14	<Data Type="1002" Desc = "这是描述">
15		<Condition Value="100" Name="名字10">
16			<AwardItem ItemId = "5" SubId = "10" Count = "10" />
17		</Condition>
18		<Condition Value="200" Name="名字20">
19			<AwardItem ItemId = "5" SubId = "100" Count = "50" />
20		</Condition>
21		<Condition Value="300" Name="名字30">
22			<AwardItem ItemId = "5" SubId = "1000" Count = "100" />
23		</Condition>
24		<Condition Value="400" Name="名字40">
25			<AwardItem ItemId = "5" SubId = "5000" Count = "200" />
26		</Condition>
27	</Data>
28</Test>

我们写一个XML转XLSX的调用程序test.py

 1import sys
 2import xml2xls
 3import traceback
 4
 5def main():
 6    if sys.argv.__len__() < 3:
 7        print("格式错误，格式：<命令> <XML源文件> <XLSX目标文件>")
 8        return
 9        
10    c = xml2xls.XmlToXls()
11    try:
12        c.convert(sys.argv[1], sys.argv[2])
13        print("处理文件%s成功" % sys.argv[1])
14    except Exception as e:
15        print("处理文件%s失败, 异常:%s" % (sys.argv[1], e))
16        print(traceback.format_exc())
17
18if __name__ == '__main__':
19    main()

在命令行执行：

1python test.py test.xml test.xlsx

就会生成一个test.xlsx的Excel文件，使用Excel打开，结果如图：

从前面的XML可以看到，第一行只有3组Condition，第二行有4组Condition，所以最后按最大值4组来展示，第一行没第4组Condition，则为空。同时，转换后的Excel表头是锁定状态，并且表头是有层次结构的，对应XML的层次。同样，如果要将一个Excel表转换成XML，也需要有锁定状态的且有层次结构的表头。

祝好！

使用Python实现XML文件转为Excel文件

相关文章

最近文章

分类

标签

友情链接

其它