使用Python实现XML文件转为Excel文件
由于项目组中原来的文件使用的XML格式作为配置,扩展性很好,但是阅读起来不是很直观,编辑起来也是不很方便,特别一些规则的二维表,个人觉得使用EXCEL文件会方便很多。所以为了方便阅读与编辑这些规则的二维表,花了一些时间写了一个Python脚本,以实现将XML文件转为Excel文件。 这里支持XML文件转为一个Sheet或者多个Sheet:
- 如果第二层所有标签都相同则会转为一个Sheet,所有第二层的标签都会作为行数据
- 如果第二层的标签有多种,则会把第二层的不同标签作为不同的Sheet,第三层的数据作为行数据
其它情况未作测试。
注意:脚本会自动根据所有同层标签计算所有字段以及所有子标签及其重复数,并作为列的扩展,如果XML的嵌套过深或者太复杂,可能会导致扩展后的列数超过EXCEL所支持的最大列数(16384列),导致失败。
附上源码以共享,当然,如果有更好的Idea进行改进欢迎留言。
1#
2# XML文件转换成Excel文件
3#
4# 版本:1.0
5#
6# 作者:Witton Bell
7# E_Mail:witton@163.com
8#
9#
10# 功能描述:
11#
12# 自动检测源文件编码是否为UTF8,如果不是,则使用GB18030进行解码,并转换成UTF8编码的文件,再进行XML源文件的读取
13#
14# 自动提取XML中的标签作为Excel的标题列,支持嵌套与重复,但是所有列不能超过EXCEL支持的最大列(16384列)
15#
16# 仅支持XML的第一层只有一个标签,
17# 第二层标签可以只有一个标签(允许重复),此时第一层标签作为Excel的WorkSheet,第二层标签及其嵌套作为一行数据)
18# 第二层标签也可以有多个标签(允许重复),此时第二层标签作为Excel的WorkSheet,第三层标签及其嵌套作为一行数据)
19#
20# 由于openpyxl默认为Excel的xlsx格式,所以本脚本的所有Excel文件格式都为xlsx
21#
22#
23import openpyxl
24from openpyxl.styles import Alignment
25import xml.dom.minidom
26from xml.dom.minidom import Document
27from openpyxl.styles import Border, Side
28import chardet
29import os
30
31
32class XMLNode:
33 def __init__(self):
34 self.name = ""
35 self.properties = []
36 self.child = []
37 self.layer = 0
38 self.index = 0
39 self.parent = None
40 self.node_info = None
41
42
43class XMLNodeInfo:
44 def __init__(self, node):
45 self.map_properties = {}
46 self.map_child = {}
47 self.max_index = 0
48 self.layer = node.layer
49 self.name = node.name
50
51 self.register(node)
52
53 def register(self, node):
54 for k in node.properties:
55 if self.map_properties.get(k[0]) is None:
56 self.map_properties[k[0]] = self.map_properties.__len__()
57
58 for ch in node.child:
59 v = self.map_child.get(ch.name)
60 if v is None:
61 self.map_child[ch.name] = ch.node_info
62
63 if node.index > self.max_index:
64 self.max_index = node.index
65
66
67class XMLReader:
68 # is_auto_convert2utf8:是否自动转换为UTF8编码
69 # is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件
70 def __init__(self, file_path, is_auto_convert2utf8, is_convert_to_original_file):
71 self.__root = []
72 self.__map_node_info = {}
73 if is_auto_convert2utf8:
74 is_tmp_file, tmp_file_path = self.__convert_to_utf8(file_path)
75 fd = xml.dom.minidom.parse(tmp_file_path)
76 if is_tmp_file:
77 if is_convert_to_original_file:
78 os.remove(file_path)
79 os.rename(tmp_file_path, file_path)
80 else:
81 os.remove(tmp_file_path)
82 else:
83 fd = xml.dom.minidom.parse(file_path)
84 index = 0
85 for child in fd.childNodes:
86 if child.nodeType != xml.dom.Node.ELEMENT_NODE:
87 continue
88
89 self.__read_node(child, self.__root, index, None)
90 index += 1
91
92 def get_root_node(self):
93 return self.__root
94
95 @staticmethod
96 def __convert_to_utf8(file_path):
97 fd = open(file_path, "rb")
98 fd.seek(0, 2)
99 size = fd.tell()
100 if size > 1024 * 1024:
101 size = 1024 * 1024
102 fd.seek(0, 0)
103 text = fd.read(size)
104 ret = chardet.detect(text)
105 if ret['encoding'].lower().find("utf-8") != -1:
106 return False, file_path
107
108 tmp_file = file_path + ".tmp"
109 file = open(tmp_file, "w", encoding="utf-8", newline="\n")
110 fd.seek(0, 0)
111 line = fd.readline()
112 while line.__len__() > 0:
113 file.write(line.decode("gb18030"))
114 line = fd.readline()
115 file.close()
116 fd.close()
117
118 return True, tmp_file
119
120 @staticmethod
121 def __get_attrib(node, rc):
122 if node._attrs is None:
123 return
124 for key in node._attrs:
125 v = node._attrs[key]._value
126 rc.append([key, v])
127
128 def __read_node(self, node, root, index, parent, layer=1):
129 xml_node = XMLNode()
130 xml_node.name = node.nodeName
131 xml_node.layer = layer
132 xml_node.index = index
133 xml_node.parent = parent
134 self.__get_attrib(node, xml_node.properties)
135 i = 0
136 for child in node.childNodes:
137 if child.nodeType != xml.dom.Node.ELEMENT_NODE:
138 continue
139
140 self.__read_node(child, xml_node.child, i, xml_node, layer + 1)
141 i += 1
142 root.append(xml_node)
143 self.__register(xml_node)
144
145 def __register(self, node):
146 key = node.name + str(node.layer)
147 nd = self.__map_node_info.get(key)
148 if nd is None:
149 nd = XMLNodeInfo(node)
150 node.node_info = nd
151 self.__map_node_info[key] = nd
152 else:
153 nd.register(node)
154 node.node_info = nd
155
156
157class XMLWriter:
158 def __init__(self, xml_file_path, xml_node_list):
159 doc = Document()
160 for node in xml_node_list:
161 ele = self.__write_node(node, doc)
162 doc.appendChild(ele)
163
164 f = open(xml_file_path, 'w', encoding='utf-8')
165 doc.writexml(f, indent='\t', addindent='\t', encoding='utf-8', newl='\n')
166
167 def __write_node(self, node, doc):
168 ele = doc.createElement(node.name)
169 for prop in node.properties:
170 ele.setAttribute(prop[0], prop[1])
171
172 for child in node.childs:
173 ret = self.__write_node(child, doc)
174 ele.appendChild(ret)
175
176 return ele
177
178
179class XmlToXls:
180 # read_from_xml_file_path:XML源文件完整路径
181 # save_to_xls_file_path:保存转换后的Excel文件完整路径
182 # is_auto_convert2utf8:是否自动转换为UTF8编码
183 # is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件
184 # is_merge_head:如果表头有多行,是否合并层属关系的表头
185 # is_border_cell:是否添加单元格的边框
186 # is_alignment_center:单元格是否居中对齐
187 def __init__(self, read_from_xml_file_path, save_to_xls_file_path,
188 is_auto_convert2utf8=True, is_convert_to_original_file=False,
189 is_merge_head=False, is_border_cell=False, is_alignment_center=True):
190 try:
191 self.__is_merge_head = is_merge_head
192 if is_alignment_center:
193 self.__alignment = Alignment(horizontal='center', vertical='center')
194 else:
195 self.__alignment = None
196 if is_border_cell:
197 side = Side(border_style='thin', color='000000')
198 self.__border = Border(side, side, side, side, side)
199 else:
200 self.__border = None
201 wb = openpyxl.Workbook()
202 wb.encoding = 'utf-8'
203 for sh in wb.worksheets:
204 wb.remove_sheet(sh)
205
206 reader = XMLReader(read_from_xml_file_path,
207 is_auto_convert2utf8,
208 is_convert_to_original_file)
209 self.__write(reader.get_root_node(), wb)
210 wb.save(save_to_xls_file_path)
211 wb.close()
212 except Exception as e:
213 print(e)
214
215 def __write(self, xml_node, wb):
216 self.__map_field = {}
217 for node in xml_node:
218 if node.node_info.map_child.__len__() == 1:
219 self.__start_layer = 1
220 self.__write_sheet(node, wb)
221 else:
222 self.__start_layer = 2
223 for child in node.child:
224 self.__write_sheet(child, wb)
225
226 def __write_sheet(self, node, wb):
227 sh = wb.create_sheet(node.name)
228 self.__write_head(node.node_info, sh)
229 self.__write_data(node, sh)
230
231 def __write_head(self, node, sh):
232 for key in node.map_child:
233 col = 1
234 child = node.map_child[key]
235 for k in child.map_properties:
236 c = child.map_properties[k]
237 self.__write_cell(sh, child.layer - self.__start_layer, c + col, k)
238 col += child.map_properties.__len__()
239 self.__write_head_ext(child, sh, col)
240 break
241
242 def __write_head_ext(self, node, sh, col):
243 for key in node.map_child:
244 child = node.map_child[key]
245 num = child.map_properties.__len__()
246 for i in range(0, child.max_index + 1):
247 if col > 16384:
248 raise Exception("超过EXCEL最大列数(16384列)限制,转换失败")
249
250 old_col = col
251 self.__write_cell(sh, child.layer - self.__start_layer - 1, col, child.name)
252 for k in child.map_properties:
253 c = child.map_properties[k]
254 self.__write_cell(sh, child.layer - self.__start_layer, c + col, k)
255 col += num
256 col = self.__write_head_ext(child, sh, col)
257 if self.__is_merge_head:
258 merge_row = child.layer - self.__start_layer - 1
259 sh.merge_cells(start_row=merge_row, end_row=merge_row,
260 start_column=old_col, end_column=col - 1)
261 return col
262
263 def __write_data(self, node, sh):
264 row = sh.max_row + 1
265 sh.freeze_panes = sh.cell(row, 1)
266 for child in node.child:
267 col = 1
268 self.__write_data_ext(child, sh, row, col)
269 row += 1
270
271 def __write_data_ext(self, node, sh, row, col):
272 m = node.node_info.map_properties
273 max_prop_num = m.__len__()
274 for prop in node.properties:
275 c = m[prop[0]]
276 self.__write_cell(sh, row, c + col, prop[1])
277 col += max_prop_num
278
279 map_child = {}
280 for child in node.child:
281 if map_child.get(child.name) is None:
282 map_child[child.name] = 1
283 else:
284 map_child[child.name] += 1
285 col = self.__write_data_ext(child, sh, row, col)
286
287 for key in node.node_info.map_child:
288 child = node.node_info.map_child[key]
289 all_count = child.max_index + 1
290 count = map_child.get(key)
291 if count is not None:
292 all_count -= count
293
294 for i in range(0, all_count):
295 col += child.map_properties.__len__()
296
297 return col
298
299 def __write_cell(self, sh, row, col, value):
300 if value.isdigit():
301 cell = sh.cell(row, col, int(value))
302 else:
303 cell = sh.cell(row, col, value)
304 cell.alignment = self.__alignment
305 cell.border = self.__border
由于使用的XML解析器不支持值中有大于(>),小于(<)这些特殊符号,所以如果值中有这些符号的XML文件会解析失败,报错:
1not well-formed (invalid token)
比如下面的XML文件就会报上面的错:
1<?xml version='1.0' encoding='UTF-8'?>
2<test>
3<testData value="<测试数据>"/>
4</test>
也不支持没有根节点的XML文件: 比如:
1<?xml version='1.0' encoding='UTF-8'?>
2<A Value="A1">
3 <AA value="a"/>
4</A>
5<B Value="B1">
6 <BB value="b"/>
7</B>
8<C Value="C1">
9 <CC value="c"/>
10</C>
会报错:
1junk after document element
C++使用的tinyxml是可以正常解析大于小于等特殊符号的,网上有一个 pytinyxml2开源项目,让python可以使用tinyxml进行解析。
安装pytinyxml2之前需要先安装swig,Windows下可以下载: swigwin-4.0.1.zip,这是已经编译好的版本,解压出来后添加路径到PATH环境变量即可。 非Windows可以下载 swig-4.0.1.tar.gz进行编译安装,也可以直接:
1yum install swig
安装好swig后,在pytinyxml2源码目录中执行:
1python setup.py install
不能使用
1pip install pytinyxml2
进行安装,我遇到有报错:
1running bdist_wheel
2 running build
3 running build_py
4 creating build
5 creating build/lib.linux-x86_64-3.5
6 copying pytinyxml2.py -> build/lib.linux-x86_64-3.5
7 running build_ext
8 building '_pytinyxml2' extension
9 swigging pytinyxml2.i to pytinyxml2_wrap.cpp
10 swig -python -c++ -o pytinyxml2_wrap.cpp pytinyxml2.i
11 pytinyxml2.i:5: Error: Unable to find 'tinyxml2.h'
12 error: command 'swig' failed with exit status 1
查看 pytinyxml2的包可以看到,里面缺少tinyxml2.h
项目中原来的程序只能读取XML格式的文档,我们为了方便编辑与查阅,所以把XML转为Excel,编辑完成后,还需要把Excel转为原来的XML,所以实现了XLS转XML,并对之前的代码作了部分修改,附上源码:
1# -*- coding: UTF-8 -*-
2#
3# XML文件与Excel文件互转
4#
5# 版本:1.1
6#
7# 作者:Witton Bell
8# E_Mail:witton@163.com
9#
10#
11# 功能描述:
12#
13# XmlToXls:
14#
15# 自动检测源文件编码是否为UTF8,如果不是,则使用GB18030进行解码,并转换成UTF8编码的文件,再进行XML源文件的读取
16#
17# 自动提取XML中的标签作为Excel的标题列,支持嵌套与重复,但是所有列不能超过EXCEL支持的最大列(16384列)
18#
19# 使用XMLReader仅支持XML的只有一个根标签(标准XML格式),使用TinyXMLReader支持有多个根标签
20# 第二层标签可以只有一个标签(允许重复),此时第一层标签作为Excel的WorkSheet,第二层标签及其嵌套作为一行数据)
21# 第二层标签也可以有多个标签(允许重复),此时第二层标签作为Excel的WorkSheet,第三层标签及其嵌套作为一行数据)
22#
23# 由于openpyxl默认为Excel的xlsx格式,所以本脚本的所有Excel文件格式都为xlsx
24#
25# XlsToXml:
26# 同样XMLWriter仅支持XML只有一个根标签(标准XML格式),使用TinyXMLWriter支持有多个根标签
27# Excel文件需要有层次分明的表头,并且需要冻结表头
28#
29import openpyxl
30from openpyxl.styles import Alignment
31from openpyxl.styles import Border, Side
32from openpyxl.comments import Comment
33import openpyxl.utils as xls_util
34import chardet
35import os
36import xml.dom.minidom
37import copy
38
39class XMLNode:
40 def __init__(self):
41 self.name = ""
42 self.properties = []
43 self.child = []
44 self.layer = 0
45 self.index = 0
46 self.parent = None
47 self.node_info = None
48
49
50class XMLNodeInfo:
51 def __init__(self, node):
52 self.map_properties = {}
53 self.map_child = {}
54 self.max_index = 0
55 self.layer = node.layer
56 self.name = node.name
57 self.parent = node.parent
58
59 self.register(node)
60
61 def register(self, node):
62 for k in node.properties:
63 if self.map_properties.get(k[0]) is None:
64 self.map_properties[k[0]] = self.map_properties.__len__()
65
66 for ch in node.child:
67 v = self.map_child.get(ch.name)
68 if v is None:
69 self.map_child[ch.name] = ch.node_info
70
71 if node.index > self.max_index:
72 self.max_index = node.index
73
74
75class _XMLReaderBase:
76 def __init__(self):
77 self._root = []
78 self._map_node_info = {}
79
80 def __calc_node_key(self, node):
81 key = ""
82 if node.parent is not None:
83 key = self.__calc_node_key(node.parent)
84 return "%s_%s" % (key, node.name)
85
86 def _register(self, node):
87 key = self.__calc_node_key(node)
88 nd = self._map_node_info.get(key)
89 if nd is None:
90 nd = XMLNodeInfo(node)
91 node.node_info = nd
92 self._map_node_info[key] = nd
93 else:
94 nd.register(node)
95 node.node_info = nd
96
97 @staticmethod
98 def _convert_to_utf8(file_path):
99 fd = open(file_path, "rb")
100 fd.seek(0, 2)
101 size = fd.tell()
102 if size > 1024 * 1024:
103 size = 1024 * 1024
104 fd.seek(0, 0)
105 text = fd.read(size)
106 ret = chardet.detect(text)
107 if ret['encoding'].lower().find("utf-8") != -1:
108 return False, file_path
109
110 tmp_file = file_path + ".tmp"
111 file = open(tmp_file, "w", encoding="utf-8", newline="\n")
112 fd.seek(0, 0)
113 line = fd.readline()
114 while line.__len__() > 0:
115 file.write(line.decode("gb18030"))
116 line = fd.readline()
117 file.close()
118 fd.close()
119
120 return True, tmp_file
121
122
123# 该类使用xml不能解析值中带特殊符号的文件,也不支持没有统一根节点的XML文件,建议使用TinyXMLReader
124class XMLReader(_XMLReaderBase):
125 # is_auto_convert2utf8:是否自动转换为UTF8编码
126 # is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件
127 def read(self, file_path, is_auto_convert2utf8=True, is_convert_to_original_file=False):
128 _XMLReaderBase.__init__(self)
129 if is_auto_convert2utf8:
130 is_tmp_file, tmp_file_path = self._convert_to_utf8(file_path)
131 fd = xml.dom.minidom.parse(tmp_file_path)
132 if is_tmp_file:
133 if is_convert_to_original_file:
134 os.remove(file_path)
135 os.rename(tmp_file_path, file_path)
136 else:
137 os.remove(tmp_file_path)
138 else:
139 fd = xml.dom.minidom.parse(file_path)
140 index = 0
141 for child in fd.childNodes:
142 if child.nodeType != xml.dom.Node.ELEMENT_NODE:
143 continue
144
145 self.__read_node(child, self._root, index, None)
146 index += 1
147
148 return self._root
149
150 @staticmethod
151 def __get_attrib(node, rc):
152 if node._attrs is None:
153 return
154 for key in node._attrs:
155 v = node._attrs[key]._value
156 rc.append([key, v])
157
158 def __read_node(self, node, root, index, parent, layer=1):
159 xml_node = XMLNode()
160 xml_node.name = node.nodeName
161 xml_node.layer = layer
162 xml_node.index = index
163 xml_node.parent = parent
164 self.__get_attrib(node, xml_node.properties)
165 i = 0
166 for child in node.childNodes:
167 if child.nodeType != xml.dom.Node.ELEMENT_NODE:
168 continue
169
170 self.__read_node(child, xml_node.child, i, xml_node, layer + 1)
171 i += 1
172 root.append(xml_node)
173 self._register(xml_node)
174
175
176# 该类需要安装pytinyxml2,参见:https://blog.csdn.net/witton/article/details/100302498
177class TinyXMLReader(_XMLReaderBase):
178 # is_auto_convert2utf8:是否自动转换为UTF8编码
179 # is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件
180 def read(self, file_path, is_auto_convert2utf8=True, is_convert_to_original_file=False):
181 _XMLReaderBase.__init__(self)
182 import pytinyxml2
183 doc = pytinyxml2.XMLDocument()
184 if is_auto_convert2utf8:
185 is_tmp_file, tmp_file_path = self._convert_to_utf8(file_path)
186 doc.LoadFile(tmp_file_path)
187 if is_tmp_file:
188 if is_convert_to_original_file:
189 os.remove(file_path)
190 os.rename(tmp_file_path, file_path)
191 else:
192 os.remove(tmp_file_path)
193 else:
194 doc.LoadFile(file_path)
195 node = doc.RootElement()
196 index = 0
197 while node is not None:
198 self.__read_node(node, self._root, index, None)
199 node = node.NextSiblingElement()
200 index += 1
201
202 return self._root
203
204 @staticmethod
205 def __get_attrib(node, rc):
206 attrib = node.FirstAttribute()
207 while attrib is not None:
208 key = attrib.Name()
209 v = attrib.Value()
210 rc.append([key, v])
211 attrib = attrib.Next()
212
213 def __read_node(self, node, root, index, parent, layer=1):
214 xml_node = XMLNode()
215 xml_node.name = node.Value()
216 xml_node.layer = layer
217 xml_node.index = index
218 xml_node.parent = parent
219 self.__get_attrib(node, xml_node.properties)
220 i = 0
221 child = node.FirstChildElement()
222 while child is not None:
223 self.__read_node(child, xml_node.child, i, xml_node, layer + 1)
224 child = child.NextSiblingElement()
225 i += 1
226 root.append(xml_node)
227 self._register(xml_node)
228
229
230class XMLWriter:
231 def __init__(self, xml_file_path, xml_node_list):
232 doc = xml.dom.minidom.Document()
233 for node in xml_node_list:
234 ele = self.__write_node(node, doc)
235 doc.appendChild(ele)
236
237 f = open(xml_file_path, 'w', encoding='utf-8')
238 doc.writexml(f, indent='\t', addindent='\t', encoding='utf-8', newl='\n')
239 f.close()
240
241 def __write_node(self, node, doc):
242 ele = doc.createElement(node.name)
243 for prop in node.properties:
244 ele.setAttribute(prop[0], prop[1])
245
246 for child in node.child:
247 ret = self.__write_node(child, doc)
248 ele.appendChild(ret)
249
250 return ele
251
252
253class TinyXMLWriter:
254 # is_convert_spec_character: 是否转换特殊字符
255 def __init__(self, xml_file_path, xml_node_list, is_convert_spec_character=True):
256 import pytinyxml2
257 doc = pytinyxml2.XMLDocument(is_convert_spec_character)
258 decl = doc.NewDeclaration()
259 doc.LinkEndChild(decl)
260 for node in xml_node_list:
261 ele = self.__write_node(node, doc)
262 doc.LinkEndChild(ele)
263
264 doc.SaveFile(xml_file_path)
265
266 def __write_node(self, node, doc):
267 ele = doc.NewElement(node.name)
268 for prop in node.properties:
269 ele.SetAttribute(prop[0], prop[1])
270
271 for child in node.child:
272 ret = self.__write_node(child, doc)
273 ele.LinkEndChild(ret)
274
275 return ele
276
277
278class XmlToXls:
279 def __init__(self):
280 self.__is_merge_head = False
281 self.__alignment_center = None
282 self.__border = None
283
284 # read_from_xml_file_path:XML源文件完整路径
285 # save_to_xls_file_path:保存转换后的Excel文件完整路径
286 # is_auto_convert2utf8:是否自动转换为UTF8编码
287 # is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件
288 # is_merge_head:如果表头有多行,是否合并层属关系的表头
289 # is_border_cell:是否添加单元格的边框
290 # is_use_tiny_xml:是否使用tinyXML
291 def convert(self, read_from_xml_file_path, save_to_xls_file_path,
292 is_auto_convert2utf8=True, is_convert_to_original_file=False,
293 is_merge_head=False, is_border_cell=False, is_use_tiny_xml=True):
294 try:
295 self.__is_merge_head = is_merge_head
296 self.__alignment_center = Alignment(horizontal='center', vertical='center')
297 if is_border_cell:
298 side = Side(border_style='thin', color='000000')
299 self.__border = Border(side, side, side, side, side)
300 else:
301 self.__border = None
302 wb = openpyxl.Workbook()
303 wb.encoding = 'utf-8'
304 for sh in wb.worksheets:
305 wb.remove_sheet(sh)
306
307 if is_use_tiny_xml:
308 reader = TinyXMLReader()
309 else:
310 reader = XMLReader()
311 root = reader.read(read_from_xml_file_path,
312 is_auto_convert2utf8,
313 is_convert_to_original_file)
314 self.__write(root, wb)
315 wb.save(save_to_xls_file_path)
316 wb.close()
317 except Exception as e:
318 print(e)
319
320 # src_path_dir:XML源目录完整路径
321 # dst_path_dir:保存转换后的Excel文件完整目录路径,如果为None或者为空,则直接转换在源文件应对目录下
322 # is_auto_convert2utf8:是否自动转换为UTF8编码
323 # is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件
324 # is_merge_head:如果表头有多行,是否合并层属关系的表头
325 # is_border_cell:是否添加单元格的边框
326 # is_use_tiny_xml:是否使用tinyXML
327 def convert_dirs(self, src_path_dir, dst_path_dir=None,
328 is_auto_convert2utf8=True, is_convert_to_original_file=False,
329 is_merge_head=False, is_border_cell=False, is_use_tiny_xml=True):
330 if dst_path_dir is not None and dst_path_dir != "":
331 if not os.path.exists(dst_path_dir):
332 os.mkdir(dst_path_dir)
333
334 for root, dirs, files in os.walk(src_path_dir):
335 for name in files:
336 basename, ext = os.path.splitext(name)
337 if ext != ".xml":
338 continue
339
340 src = os.path.join(root, name)
341 target = basename + ".xlsx"
342 print("处理%s" % src)
343 if dst_path_dir is None or dst_path_dir == "":
344 dst = os.path.join(root, target)
345 else:
346 dst = os.path.join(dst_path_dir, target)
347 self.convert(src, dst, is_auto_convert2utf8, is_convert_to_original_file,
348 is_merge_head, is_border_cell, is_use_tiny_xml)
349
350 def __write(self, xml_node, wb):
351 self.__map_field = {}
352 for node in xml_node:
353 if node.node_info.map_child.__len__() == 1:
354 self.__is_multi_sheet = False
355 self.__write_sheet(node, wb)
356 else:
357 self.__is_multi_sheet = True
358 for child in node.child:
359 self.__write_sheet(child, wb)
360
361 def __write_sheet(self, node, wb):
362 sh = wb.create_sheet(node.name)
363 self.__write_head(node, sh)
364 self.__write_data(node, sh)
365
366 def __write_head(self, node, sh):
367 node_info = node.node_info
368 if self.__is_multi_sheet:
369 self.__write_head_cell(sh, node_info.parent.layer, 1, node_info.parent.name)
370 self.__write_head_cell(sh, node_info.layer, 1, node_info.name)
371 comment_str = ""
372 for prop in node.properties:
373 comment_str += '%s="%s"\n' % (prop[0], prop[1])
374 if comment_str != "":
375 sh.cell(node_info.layer, 1).comment = Comment(comment_str, "", width=300)
376 elif not self.__is_multi_sheet:
377 self.__write_head_cell(sh, node_info.layer, 1, node_info.name)
378
379 child_name = None
380 for key in node_info.map_child:
381 col = 1
382 child = node_info.map_child[key]
383 child_name = child.name
384 for k in child.map_properties:
385 c = child.map_properties[k]
386 self.__write_head_cell(sh, child.layer, c + col, k)
387 col += child.map_properties.__len__()
388 self.__write_head_ext(child, sh, col)
389 break
390
391 if self.__is_multi_sheet:
392 row = 3
393 else:
394 row = 2
395
396 if child_name is not None:
397 sh.insert_rows(row)
398 self.__write_head_cell(sh, row, 1, child_name)
399
400 def __write_head_ext(self, node, sh, col):
401 for key in node.map_child:
402 child = node.map_child[key]
403 num = child.map_properties.__len__()
404 for i in range(0, child.max_index + 1):
405 if col > 16384:
406 raise Exception("超过EXCEL最大列数(16384列)限制,转换失败")
407
408 old_col = col
409 self.__write_head_cell(sh, child.layer - 1, col, child.name)
410 for k in child.map_properties:
411 c = child.map_properties[k]
412 self.__write_head_cell(sh, child.layer, c + col, k)
413 col += num
414 col = self.__write_head_ext(child, sh, col)
415 if self.__is_merge_head:
416 merge_row = child.layer - 1
417 sh.merge_cells(start_row=merge_row, end_row=merge_row,
418 start_column=old_col, end_column=col - 1)
419 return col
420
421 def __write_data(self, node, sh):
422 row = sh.max_row + 1
423 sh.freeze_panes = sh.cell(row, 1)
424 for child in node.child:
425 col = 1
426 self.__write_data_ext(child, sh, row, col)
427 row += 1
428
429 def __write_data_ext(self, node, sh, row, col):
430 m = node.node_info.map_properties
431 max_prop_num = m.__len__()
432 for prop in node.properties:
433 c = m[prop[0]]
434 self.__write_cell(sh, row, c + col, prop[1])
435 col += max_prop_num
436
437 map_child = {}
438 for child in node.child:
439 if map_child.get(child.name) is None:
440 map_child[child.name] = 1
441 else:
442 map_child[child.name] += 1
443 col = self.__write_data_ext(child, sh, row, col)
444
445 for key in node.node_info.map_child:
446 child = node.node_info.map_child[key]
447 all_count = child.max_index + 1
448 count = map_child.get(key)
449 if count is not None:
450 all_count -= count
451
452 for i in range(0, all_count):
453 col += child.map_properties.__len__()
454
455 return col
456
457 def __write_head_cell(self, sh, row, col, value):
458 cell = sh.cell(row, col, value)
459 cell.border = self.__border
460 cell.alignment = self.__alignment_center
461
462 def __write_cell(self, sh, row, col, value):
463 if value.isdigit():
464 cell = sh.cell(row, col, int(value))
465 else:
466 cell = sh.cell(row, col, value)
467 cell.border = self.__border
468
469
470class XlsToXml:
471 def __init__(self):
472 pass
473
474 @staticmethod
475 def __add_prop(map_field, col, value):
476 if map_field.__len__() == 0:
477 return
478
479 if map_field.get(col) is None:
480 # 找本节点
481 c = col - 1
482 while c >= 1:
483 if map_field.get(c) is not None:
484 node = map_field[c][0]
485 break
486 c -= 1
487 else:
488 node = map_field[col][0]
489 node.properties.append([value, col])
490
491 @staticmethod
492 def __add_node(map_field, row, col, value):
493 node = XMLNode()
494 node.name = value
495
496 if map_field.get(col) is not None:
497 node.parent = map_field[col][0]
498 else:
499 # 找父节点
500 c = col - 1
501 while c >= 1:
502 if map_field.get(c) is not None:
503 if row > map_field[c][1]:
504 node.parent = map_field[c][0]
505 break
506 c -= 1
507
508 if node.parent is not None:
509 node.parent.child.append(node)
510 node.layer = node.parent.layer + 1
511 else:
512 node.layer = 1
513 map_field[col] = [node, row]
514 return node
515
516 def __read_xls(self, file_path):
517 wb = openpyxl.load_workbook(file_path)
518 root_data = XMLNode()
519 is_multi_sheet = wb.worksheets.__len__() > 1
520 for sh in wb.worksheets:
521 max_row = sh.max_row + 1
522 max_col = sh.max_column + 1
523 if sh.freeze_panes is None:
524 raise Exception("文件[%s]表单[%s]的无冻结窗口,无法确定表头,转为XML失败" %
525 (os.path.basename(file_path), sh.title))
526
527 head_row, head_col = xls_util.coordinate_to_tuple(sh.freeze_panes)
528 if head_col != 1:
529 raise Exception("文件[%s]表单[%s]的冻结窗口列不为1,无法转为XML" % (os.path.basename(file_path), sh.title))
530
531 root = None
532
533 map_field = {}
534 for col in range(1, max_col):
535 for row in range(1, head_row):
536 cell = sh.cell(row, col)
537 value = cell.value
538 comment = cell.comment
539
540 if value is None:
541 continue
542
543 next_row = row + 1
544 if next_row >= head_row:
545 self.__add_prop(map_field, col, value)
546 continue
547
548 next_row_value = sh.cell(next_row, col).value
549 if next_row_value is None:
550 self.__add_prop(map_field, col, value)
551 continue
552
553 node = self.__add_node(map_field, row, col, value)
554 if root is None:
555 root = node
556
557 if comment is None:
558 continue
559
560 comment = comment.text
561 lines = comment.splitlines()
562 for line in lines:
563 props = line.split('=')
564 kv = []
565 for prop in props:
566 prop = prop.replace('"', '')
567 kv.append(prop)
568 if kv.__len__() == 2:
569 node.properties.append(kv)
570 kv = []
571
572 root_data.name = root.name
573 root_data.layer = root.layer
574
575 if is_multi_sheet and root.child.__len__() > 0:
576 child_list = copy.deepcopy(root.child[0].child)
577 root.child[0].child = []
578 root_data.child.append(root.child[0])
579 root_data_child = root.child[0].child
580 else:
581 child_list = copy.deepcopy(root.child)
582 root_data_child = root_data.child
583
584 for row in range(head_row, max_row):
585 clone = copy.deepcopy(child_list)
586 for child in clone:
587 self.__read_node_data(child, sh, row)
588 root_data_child.append(child)
589
590 return root_data
591
592 def __read_node_data(self, node, sh, row):
593 prop_list = []
594 for prop in node.properties:
595 col = prop[1]
596 value = sh.cell(row, col).value
597 if value is not None:
598 prop_list.append([prop[0], value])
599
600 child_list = []
601 for child in node.child:
602 self.__read_node_data(child, sh, row)
603 if child.properties.__len__() > 0 or child.child.__len__() > 0:
604 child_list.append(child)
605 copy.copy(child)
606
607 node.properties = prop_list
608 node.child = child_list
609
610 def convert(self, src_file_path, dst_file_path, is_use_tiny_xml=True, is_convert_spec_character=False):
611 root = self.__read_xls(src_file_path)
612 if is_use_tiny_xml:
613 TinyXMLWriter(dst_file_path, [root], is_convert_spec_character)
614 else:
615 XMLWriter(dst_file_path, [root])
616
617 def convert_dirs(self, src_path_dir, dst_path_dir=None, is_use_tiny_xml=True, is_convert_spec_character=False):
618 for root, dirs, files in os.walk(src_path_dir):
619 for name in files:
620 basename, ext = os.path.splitext(name)
621 if ext != ".xls" and ext != ".xlsx":
622 continue
623
624 src = os.path.join(root, name)
625 target = basename + "1" + ".xml"
626 print("处理%s" % src)
627 if dst_path_dir is None or dst_path_dir == "":
628 dst = os.path.join(root, target)
629 else:
630 dst = os.path.join(dst_path_dir, target)
631
632 try:
633 self.convert(src, dst, is_use_tiny_xml, is_convert_spec_character)
634 except Exception as e:
635 print(e)
有人在问如何使用,给一个XML转Excel的示例,假如有一个test.xml文件:
1<?xml version="1.0" encoding="UTF-8"?>
2<Test>
3 <Data Type="1001" Desc = "这是描述">
4 <Condition Value="10" Name="名字1">
5 <AwardItem ItemId = "5" SubId = "10" Count = "10" />
6 </Condition>
7 <Condition Value="20" Name="名字2">
8 <AwardItem ItemId = "5" SubId = "100" Count = "50" />
9 </Condition>
10 <Condition Value="30" Name="名字3">
11 <AwardItem ItemId = "5" SubId = "1000" Count = "100" />
12 </Condition>
13 </Data>
14 <Data Type="1002" Desc = "这是描述">
15 <Condition Value="100" Name="名字10">
16 <AwardItem ItemId = "5" SubId = "10" Count = "10" />
17 </Condition>
18 <Condition Value="200" Name="名字20">
19 <AwardItem ItemId = "5" SubId = "100" Count = "50" />
20 </Condition>
21 <Condition Value="300" Name="名字30">
22 <AwardItem ItemId = "5" SubId = "1000" Count = "100" />
23 </Condition>
24 <Condition Value="400" Name="名字40">
25 <AwardItem ItemId = "5" SubId = "5000" Count = "200" />
26 </Condition>
27 </Data>
28</Test>
我们写一个XML转XLSX的调用程序test.py
1import sys
2import xml2xls
3import traceback
4
5def main():
6 if sys.argv.__len__() < 3:
7 print("格式错误,格式:<命令> <XML源文件> <XLSX目标文件>")
8 return
9
10 c = xml2xls.XmlToXls()
11 try:
12 c.convert(sys.argv[1], sys.argv[2])
13 print("处理文件%s成功" % sys.argv[1])
14 except Exception as e:
15 print("处理文件%s失败, 异常:%s" % (sys.argv[1], e))
16 print(traceback.format_exc())
17
18if __name__ == '__main__':
19 main()
在命令行执行:
1python test.py test.xml test.xlsx
就会生成一个test.xlsx的Excel文件,使用Excel打开,结果如图:
从前面的XML可以看到,第一行只有3组Condition,第二行有4组Condition,所以最后按最大值4组来展示,第一行没第4组Condition,则为空。 同时,转换后的Excel表头是锁定状态,并且表头是有层次结构的,对应XML的层次。同样,如果要将一个Excel表转换成XML,也需要有锁定状态的且有层次结构的表头。
祝好!
- 原文作者:Witton
- 原文链接:https://wittonbell.github.io/posts/2019/2019-09-02-使用Python实现XML文件转为Excel文件/
- 版权声明:本作品采用知识共享署名-非商业性使用-禁止演绎 4.0 国际许可协议. 进行许可,非商业转载请注明出处(作者,原文链接),商业转载请联系作者获得授权。