Python处理xml文件_文件格式怎么转换
作者:快盘下载 人气:大家好,又见面了,我是你们的朋友全栈君。
由于项目组中原来的文件使用的XML格式作为配置,扩展性很好,但是阅读起来不是很直观,编辑起来也是不很方便,特别一些规则的二维表,个人觉得使用EXCEL文件会方便很多。所以为了方便阅读与编辑这些规则的二维表,花了一些时间写了一个Python脚本,以实现将XML文件转为Excel文件。 这里支持XML文件转为一个Sheet或者多个Sheet:
如果第二层所有标签都相同则会转为一个Sheet,所有第二层的标签都会作为行数据如果第二层的标签有多种,则会把第二层的不同标签作为不同的Sheet,第三层的数据作为行数据其它情况未作测试。
注意:脚本会自动根据所有同层标签计算所有字段以及所有子标签及其重复数,并作为列的扩展,如果XML的嵌套过深或者太复杂,可能会导致扩展后的列数超过EXCEL所支持的最大列数(16384列),导致失败。
附上源码以共享,当然,如果有更好的Idea进行改进欢迎留言。
# # XML文件转换成Excel文件 # # 版本:1.0 # # 作者:Witton Bell # E_Mail:witton@163.com # # # 功能描述: # # 自动检测源文件编码是否为UTF8,如果不是,则使用GB18030进行解码,并转换成UTF8编码的文件,再进行XML源文件的读取 # # 自动提取XML中的标签作为Excel的标题列,支持嵌套与重复,但是所有列不能超过EXCEL支持的最大列(16384列) # # 仅支持XML的第一层只有一个标签, # 第二层标签可以只有一个标签(允许重复),此时第一层标签作为Excel的WorkSheet,第二层标签及其嵌套作为一行数据) # 第二层标签也可以有多个标签(允许重复),此时第二层标签作为Excel的WorkSheet,第三层标签及其嵌套作为一行数据) # # 由于openpyxl默认为Excel的xlsx格式,所以本脚本的所有Excel文件格式都为xlsx # # import openpyxl from openpyxl.styles import Alignment import xml.dom.minidom from xml.dom.minidom import Document from openpyxl.styles import Border, Side import chardet import os class XMLnode: def __init__(self): self.name = "" self.properties = [] self.child = [] self.layer = 0 self.index = 0 self.parent = None self.node_info = None class XMLNodeInfo: def __init__(self, node): self.map_properties = { } self.map_child = { } self.max_index = 0 self.layer = node.layer self.name = node.name self.register(node) def register(self, node): for k in node.properties: if self.map_properties.get(k[0]) is None: self.map_properties[k[0]] = self.map_properties.__len__() for ch in node.child: v = self.map_child.get(ch.name) if v is None: self.map_child[ch.name] = ch.node_info if node.index > self.max_index: self.max_index = node.index class XMLReader: # is_auto_convert2utf8:是否自动转换为UTF8编码 # is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件 def __init__(self, file_path, is_auto_convert2utf8, is_convert_to_original_file): self.__root = [] self.__map_node_info = { } if is_auto_convert2utf8: is_tmp_file, tmp_file_path = self.__convert_to_utf8(file_path) fd = xml.dom.minidom.parse(tmp_file_path) if is_tmp_file: if is_convert_to_original_file: os.remove(file_path) os.rename(tmp_file_path, file_path) else: os.remove(tmp_file_path) else: fd = xml.dom.minidom.parse(file_path) index = 0 for child in fd.childNodes: if child.nodeType != xml.dom.Node.ELEMENT_NODE: continue self.__read_node(child, self.__root, index, None) index += 1 def get_root_node(self): return self.__root @staticmethod def __convert_to_utf8(file_path): fd = open(file_path, "rb") fd.seek(0, 2) size = fd.tell() if size > 1024 * 1024: size = 1024 * 1024 fd.seek(0, 0) text = fd.read(size) ret = chardet.detect(text) if ret['encoding'].lower().find("utf-8") != -1: return False, file_path tmp_file = file_path + ".tmp" file = open(tmp_file, "w", encoding="utf-8", newline=" ") fd.seek(0, 0) line = fd.readline() while line.__len__() > 0: file.write(line.decode("gb18030")) line = fd.readline() file.close() fd.close() return True, tmp_file @staticmethod def __get_attrib(node, rc): if node._attrs is None: return for key in node._attrs: v = node._attrs[key]._value rc.append([key, v]) def __read_node(self, node, root, index, parent, layer=1): xml_node = XMLNode() xml_node.name = node.nodeName xml_node.layer = layer xml_node.index = index xml_node.parent = parent self.__get_attrib(node, xml_node.properties) i = 0 for child in node.childNodes: if child.nodeType != xml.dom.Node.ELEMENT_NODE: continue self.__read_node(child, xml_node.child, i, xml_node, layer + 1) i += 1 root.append(xml_node) self.__register(xml_node) def __register(self, node): key = node.name + str(node.layer) nd = self.__map_node_info.get(key) if nd is None: nd = XMLNodeInfo(node) node.node_info = nd self.__map_node_info[key] = nd else: nd.register(node) node.node_info = nd class XMLWriter: def __init__(self, xml_file_path, xml_node_list): doc = Document() for node in xml_node_list: ele = self.__write_node(node, doc) doc.appendChild(ele) f = open(xml_file_path, 'w', encoding='utf-8') doc.writexml(f, indent=' ', addindent=' ', encoding='utf-8', newl=' ') def __write_node(self, node, doc): ele = doc.createElement(node.name) for prop in node.properties: ele.setAttribute(prop[0], prop[1]) for child in node.childs: ret = self.__write_node(child, doc) ele.appendChild(ret) return ele class XmlToXls: # read_from_xml_file_path:XML源文件完整路径 # save_to_xls_file_path:保存转换后的Excel文件完整路径 # is_auto_convert2utf8:是否自动转换为UTF8编码 # is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件 # is_merge_head:如果表头有多行,是否合并层属关系的表头 # is_border_cell:是否添加单元格的边框 # is_alignment_center:单元格是否居中对齐 def __init__(self, read_from_xml_file_path, save_to_xls_file_path, is_auto_convert2utf8=True, is_convert_to_original_file=False, is_merge_head=False, is_border_cell=False, is_alignment_center=True): try: self.__is_merge_head = is_merge_head if is_alignment_center: self.__alignment = Alignment(horizontal='center', vertical='center') else: self.__alignment = None if is_border_cell: side = Side(border_style='thin', color='000000') self.__border = Border(side, side, side, side, side) else: self.__border = None wb = openpyxl.Workbook() wb.encoding = 'utf-8' for sh in wb.worksheets: wb.remove_sheet(sh) reader = XMLReader(read_from_xml_file_path, is_auto_convert2utf8, is_convert_to_original_file) self.__write(reader.get_root_node(), wb) wb.save(save_to_xls_file_path) wb.close() except Exception as e: print(e) def __write(self, xml_node, wb): self.__map_field = { } for node in xml_node: if node.node_info.map_child.__len__() == 1: self.__start_layer = 1 self.__write_sheet(node, wb) else: self.__start_layer = 2 for child in node.child: self.__write_sheet(child, wb) def __write_sheet(self, node, wb): sh = wb.create_sheet(node.name) self.__write_head(node.node_info, sh) self.__write_data(node, sh) def __write_head(self, node, sh): for key in node.map_child: col = 1 child = node.map_child[key] for k in child.map_properties: c = child.map_properties[k] self.__write_cell(sh, child.layer - self.__start_layer, c + col, k) col += child.map_properties.__len__() self.__write_head_ext(child, sh, col) break def __write_head_ext(self, node, sh, col): for key in node.map_child: child = node.map_child[key] num = child.map_properties.__len__() for i in range(0, child.max_index + 1): if col > 16384: raise Exception("超过EXCEL最大列数(16384列)限制,转换失败") old_col = col self.__write_cell(sh, child.layer - self.__start_layer - 1, col, child.name) for k in child.map_properties: c = child.map_properties[k] self.__write_cell(sh, child.layer - self.__start_layer, c + col, k) col += num col = self.__write_head_ext(child, sh, col) if self.__is_merge_head: merge_row = child.layer - self.__start_layer - 1 sh.merge_cells(start_row=merge_row, end_row=merge_row, start_column=old_col, end_column=col - 1) return col def __write_data(self, node, sh): row = sh.max_row + 1 sh.freeze_panes = sh.cell(row, 1) for child in node.child: col = 1 self.__write_data_ext(child, sh, row, col) row += 1 def __write_data_ext(self, node, sh, row, col): m = node.node_info.map_properties max_prop_num = m.__len__() for prop in node.properties: c = m[prop[0]] self.__write_cell(sh, row, c + col, prop[1]) col += max_prop_num map_child = { } for child in node.child: if map_child.get(child.name) is None: map_child[child.name] = 1 else: map_child[child.name] += 1 col = self.__write_data_ext(child, sh, row, col) for key in node.node_info.map_child: child = node.node_info.map_child[key] all_count = child.max_index + 1 count = map_child.get(key) if count is not None: all_count -= count for i in range(0, all_count): col += child.map_properties.__len__() return col def __write_cell(self, sh, row, col, value): if value.isdigit(): cell = sh.cell(row, col, int(value)) else: cell = sh.cell(row, col, value) cell.alignment = self.__alignment cell.border = self.__border
由于使用的XML解析器不支持值中有大于(>),小于(<)这些特殊符号,所以如果值中有这些符号的XML文件会解析失败,报错:
not well-formed (invalid token)
比如下面的XML文件就会报上面的错:
<?xml version='1.0' encoding='UTF-8'?> <test> <testData value="<测试数据>"/> </test>
也不支持没有根节点的XML文件: 比如:
<?xml version='1.0' encoding='UTF-8'?> <A Value="A1"> <AA value="a"/> </A> <B Value="B1"> <BB value="b"/> </B> <C Value="C1"> <CC value="c"/> </C>
会报错:
junk after document element
C++使用的tinyxml是可以正常解析大于小于等特殊符号的,网上有一个pytinyxml2开源项目,让python可以使用tinyxml进行解析。
安装pytinyxml2之前需要先安装swig,Windows下可以下载:swigwin-4.0.1.zip,这是已经编译好的版本,解压出来后添加路径到PATH环境变量即可。 非Windows可以下载swig-4.0.1.tar.gz进行编译安装,也可以直接:
yum install swig
安装好swig后,在pytinyxml2源码目录中执行:
python setup.py install
不能使用
pip install pytinyxml2
进行安装,我遇到有报错:
running bdist_wheel running build running build_py creating build creating build/lib.linux-x86_64-3.5 copying pytinyxml2.py -> build/lib.linux-x86_64-3.5 running build_ext building '_pytinyxml2' extension swigging pytinyxml2.i to pytinyxml2_wrap.cpp swig -python -c++ -o pytinyxml2_wrap.cpp pytinyxml2.i pytinyxml2.i:5: Error: Unable to find 'tinyxml2.h' error: command 'swig' failed with exit status 1
查看pytinyxml2的包可以看到,里面缺少tinyxml2.h
项目中原来的程序只能读取XML格式的文档,我们为了方便编辑与查阅,所以把XML转为Excel,编辑完成后,还需要把Excel转为原来的XML,所以实现了XLS转XML,并对之前的代码作了部分修改,附上源码:
# -*- coding: UTF-8 -*- # # XML文件与Excel文件互转 # # 版本:1.1 # # 作者:Witton Bell # E_Mail:witton@163.com # # # 功能描述: # # XmlToXls: # # 自动检测源文件编码是否为UTF8,如果不是,则使用GB18030进行解码,并转换成UTF8编码的文件,再进行XML源文件的读取 # # 自动提取XML中的标签作为Excel的标题列,支持嵌套与重复,但是所有列不能超过EXCEL支持的最大列(16384列) # # 使用XMLReader仅支持XML的只有一个根标签(标准XML格式),使用TinyXMLReader支持有多个根标签 # 第二层标签可以只有一个标签(允许重复),此时第一层标签作为Excel的WorkSheet,第二层标签及其嵌套作为一行数据) # 第二层标签也可以有多个标签(允许重复),此时第二层标签作为Excel的WorkSheet,第三层标签及其嵌套作为一行数据) # # 由于openpyxl默认为Excel的xlsx格式,所以本脚本的所有Excel文件格式都为xlsx # # XlsToXml: # 同样XMLWriter仅支持XML只有一个根标签(标准XML格式),使用TinyXMLWriter支持有多个根标签 # Excel文件需要有层次分明的表头,并且需要冻结表头 # import openpyxl from openpyxl.styles import Alignment from openpyxl.styles import Border, Side from openpyxl.comments import Comment import openpyxl.utils as xls_util import chardet import os import xml.dom.minidom import copy class XMLNode: def __init__(self): self.name = "" self.properties = [] self.child = [] self.layer = 0 self.index = 0 self.parent = None self.node_info = None class XMLNodeInfo: def __init__(self, node): self.map_properties = { } self.map_child = { } self.max_index = 0 self.layer = node.layer self.name = node.name self.parent = node.parent self.register(node) def register(self, node): for k in node.properties: if self.map_properties.get(k[0]) is None: self.map_properties[k[0]] = self.map_properties.__len__() for ch in node.child: v = self.map_child.get(ch.name) if v is None: self.map_child[ch.name] = ch.node_info if node.index > self.max_index: self.max_index = node.index class _XMLReaderBase: def __init__(self): self._root = [] self._map_node_info = { } def __calc_node_key(self, node): key = "" if node.parent is not None: key = self.__calc_node_key(node.parent) return "%s_%s" % (key, node.name) def _register(self, node): key = self.__calc_node_key(node) nd = self._map_node_info.get(key) if nd is None: nd = XMLNodeInfo(node) node.node_info = nd self._map_node_info[key] = nd else: nd.register(node) node.node_info = nd @staticmethod def _convert_to_utf8(file_path): fd = open(file_path, "rb") fd.seek(0, 2) size = fd.tell() if size > 1024 * 1024: size = 1024 * 1024 fd.seek(0, 0) text = fd.read(size) ret = chardet.detect(text) if ret['encoding'].lower().find("utf-8") != -1: return False, file_path tmp_file = file_path + ".tmp" file = open(tmp_file, "w", encoding="utf-8", newline=" ") fd.seek(0, 0) line = fd.readline() while line.__len__() > 0: file.write(line.decode("gb18030")) line = fd.readline() file.close() fd.close() return True, tmp_file # 该类使用xml不能解析值中带特殊符号的文件,也不支持没有统一根节点的XML文件,建议使用TinyXMLReader class XMLReader(_XMLReaderBase): # is_auto_convert2utf8:是否自动转换为UTF8编码 # is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件 def read(self, file_path, is_auto_convert2utf8=True, is_convert_to_original_file=False): _XMLReaderBase.__init__(self) if is_auto_convert2utf8: is_tmp_file, tmp_file_path = self._convert_to_utf8(file_path) fd = xml.dom.minidom.parse(tmp_file_path) if is_tmp_file: if is_convert_to_original_file: os.remove(file_path) os.rename(tmp_file_path, file_path) else: os.remove(tmp_file_path) else: fd = xml.dom.minidom.parse(file_path) index = 0 for child in fd.childNodes: if child.nodeType != xml.dom.Node.ELEMENT_NODE: continue self.__read_node(child, self._root, index, None) index += 1 return self._root @staticmethod def __get_attrib(node, rc): if node._attrs is None: return for key in node._attrs: v = node._attrs[key]._value rc.append([key, v]) def __read_node(self, node, root, index, parent, layer=1): xml_node = XMLNode() xml_node.name = node.nodeName xml_node.layer = layer xml_node.index = index xml_node.parent = parent self.__get_attrib(node, xml_node.properties) i = 0 for child in node.childNodes: if child.nodeType != xml.dom.Node.ELEMENT_NODE: continue self.__read_node(child, xml_node.child, i, xml_node, layer + 1) i += 1 root.append(xml_node) self._register(xml_node) # 该类需要安装pytinyxml2,参见:https://blog.csdn.net/witton/article/details/100302498 class TinyXMLReader(_XMLReaderBase): # is_auto_convert2utf8:是否自动转换为UTF8编码 # is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件 def read(self, file_path, is_auto_convert2utf8=True, is_convert_to_original_file=False): _XMLReaderBase.__init__(self) import pytinyxml2 doc = pytinyxml2.XMLDocument() if is_auto_convert2utf8: is_tmp_file, tmp_file_path = self._convert_to_utf8(file_path) doc.LoadFile(tmp_file_path) if is_tmp_file: if is_convert_to_original_file: os.remove(file_path) os.rename(tmp_file_path, file_path) else: os.remove(tmp_file_path) else: doc.LoadFile(file_path) node = doc.RootElement() index = 0 while node is not None: self.__read_node(node, self._root, index, None) node = node.NextSiblingElement() index += 1 return self._root @staticmethod def __get_attrib(node, rc): attrib = node.FirstAttribute() while attrib is not None: key = attrib.Name() v = attrib.Value() rc.append([key, v]) attrib = attrib.Next() def __read_node(self, node, root, index, parent, layer=1): xml_node = XMLNode() xml_node.name = node.Value() xml_node.layer = layer xml_node.index = index xml_node.parent = parent self.__get_attrib(node, xml_node.properties) i = 0 child = node.FirstChildElement() while child is not None: self.__read_node(child, xml_node.child, i, xml_node, layer + 1) child = child.NextSiblingElement() i += 1 root.append(xml_node) self._register(xml_node) class XMLWriter: def __init__(self, xml_file_path, xml_node_list): doc = xml.dom.minidom.Document() for node in xml_node_list: ele = self.__write_node(node, doc) doc.appendChild(ele) f = open(xml_file_path, 'w', encoding='utf-8') doc.writexml(f, indent=' ', addindent=' ', encoding='utf-8', newl=' ') f.close() def __write_node(self, node, doc): ele = doc.createElement(node.name) for prop in node.properties: ele.setAttribute(prop[0], prop[1]) for child in node.child: ret = self.__write_node(child, doc) ele.appendChild(ret) return ele class TinyXMLWriter: # is_convert_spec_character: 是否转换特殊字符 def __init__(self, xml_file_path, xml_node_list, is_convert_spec_character=True): import pytinyxml2 doc = pytinyxml2.XMLDocument(is_convert_spec_character) decl = doc.NewDeclaration() doc.LinkEndChild(decl) for node in xml_node_list: ele = self.__write_node(node, doc) doc.LinkEndChild(ele) doc.SaveFile(xml_file_path) def __write_node(self, node, doc): ele = doc.NewElement(node.name) for prop in node.properties: ele.SetAttribute(prop[0], prop[1]) for child in node.child: ret = self.__write_node(child, doc) ele.LinkEndChild(ret) return ele class XmlToXls: def __init__(self): self.__is_merge_head = False self.__alignment_center = None self.__border = None # read_from_xml_file_path:XML源文件完整路径 # save_to_xls_file_path:保存转换后的Excel文件完整路径 # is_auto_convert2utf8:是否自动转换为UTF8编码 # is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件 # is_merge_head:如果表头有多行,是否合并层属关系的表头 # is_border_cell:是否添加单元格的边框 # is_use_tiny_xml:是否使用tinyXML def convert(self, read_from_xml_file_path, save_to_xls_file_path, is_auto_convert2utf8=True, is_convert_to_original_file=False, is_merge_head=False, is_border_cell=False, is_use_tiny_xml=True): try: self.__is_merge_head = is_merge_head self.__alignment_center = Alignment(horizontal='center', vertical='center') if is_border_cell: side = Side(border_style='thin', color='000000') self.__border = Border(side, side, side, side, side) else: self.__border = None wb = openpyxl.Workbook() wb.encoding = 'utf-8' for sh in wb.worksheets: wb.remove_sheet(sh) if is_use_tiny_xml: reader = TinyXMLReader() else: reader = XMLReader() root = reader.read(read_from_xml_file_path, is_auto_convert2utf8, is_convert_to_original_file) self.__write(root, wb) wb.save(save_to_xls_file_path) wb.close() except Exception as e: print(e) # src_path_dir:XML源目录完整路径 # dst_path_dir:保存转换后的Excel文件完整目录路径,如果为None或者为空,则直接转换在源文件应对目录下 # is_auto_convert2utf8:是否自动转换为UTF8编码 # is_convert_to_original_file:如果转换为UTF8,是否覆盖原文件 # is_merge_head:如果表头有多行,是否合并层属关系的表头 # is_border_cell:是否添加单元格的边框 # is_use_tiny_xml:是否使用tinyXML def convert_dirs(self, src_path_dir, dst_path_dir=None, is_auto_convert2utf8=True, is_convert_to_original_file=False, is_merge_head=False, is_border_cell=False, is_use_tiny_xml=True): if dst_path_dir is not None and dst_path_dir != "": if not os.path.exists(dst_path_dir): os.mkdir(dst_path_dir) for root, dirs, files in os.walk(src_path_dir): for name in files: basename, ext = os.path.splitext(name) if ext != ".xml": continue src = os.path.join(root, name) target = basename + ".xlsx" print("处理%s" % src) if dst_path_dir is None or dst_path_dir == "": dst = os.path.join(root, target) else: dst = os.path.join(dst_path_dir, target) self.convert(src, dst, is_auto_convert2utf8, is_convert_to_original_file, is_merge_head, is_border_cell, is_use_tiny_xml) def __write(self, xml_node, wb): self.__map_field = { } for node in xml_node: if node.node_info.map_child.__len__() == 1: self.__is_multi_sheet = False self.__write_sheet(node, wb) else: self.__is_multi_sheet = True for child in node.child: self.__write_sheet(child, wb) def __write_sheet(self, node, wb): sh = wb.create_sheet(node.name) self.__write_head(node, sh) self.__write_data(node, sh) def __write_head(self, node, sh): node_info = node.node_info if self.__is_multi_sheet: self.__write_head_cell(sh, node_info.parent.layer, 1, node_info.parent.name) self.__write_head_cell(sh, node_info.layer, 1, node_info.name) comment_str = "" for prop in node.properties: comment_str += '%s="%s" ' % (prop[0], prop[1]) if comment_str != "": sh.cell(node_info.layer, 1).comment = Comment(comment_str, "", width=300) elif not self.__is_multi_sheet: self.__write_head_cell(sh, node_info.layer, 1, node_info.name) child_name = None for key in node_info.map_child: col = 1 child = node_info.map_child[key] child_name = child.name for k in child.map_properties: c = child.map_properties[k] self.__write_head_cell(sh, child.layer, c + col, k) col += child.map_properties.__len__() self.__write_head_ext(child, sh, col) break if self.__is_multi_sheet: row = 3 else: row = 2 if child_name is not None: sh.insert_rows(row) self.__write_head_cell(sh, row, 1, child_name) def __write_head_ext(self, node, sh, col): for key in node.map_child: child = node.map_child[key] num = child.map_properties.__len__() for i in range(0, child.max_index + 1): if col > 16384: raise Exception("超过EXCEL最大列数(16384列)限制,转换失败") old_col = col self.__write_head_cell(sh, child.layer - 1, col, child.name) for k in child.map_properties: c = child.map_properties[k] self.__write_head_cell(sh, child.layer, c + col, k) col += num col = self.__write_head_ext(child, sh, col) if self.__is_merge_head: merge_row = child.layer - 1 sh.merge_cells(start_row=merge_row, end_row=merge_row, start_column=old_col, end_column=col - 1) return col def __write_data(self, node, sh): row = sh.max_row + 1 sh.freeze_panes = sh.cell(row, 1) for child in node.child: col = 1 self.__write_data_ext(child, sh, row, col) row += 1 def __write_data_ext(self, node, sh, row, col): m = node.node_info.map_properties max_prop_num = m.__len__() for prop in node.properties: c = m[prop[0]] self.__write_cell(sh, row, c + col, prop[1]) col += max_prop_num map_child = { } for child in node.child: if map_child.get(child.name) is None: map_child[child.name] = 1 else: map_child[child.name] += 1 col = self.__write_data_ext(child, sh, row, col) for key in node.node_info.map_child: child = node.node_info.map_child[key] all_count = child.max_index + 1 count = map_child.get(key) if count is not None: all_count -= count for i in range(0, all_count): col += child.map_properties.__len__() return col def __write_head_cell(self, sh, row, col, value): cell = sh.cell(row, col, value) cell.border = self.__border cell.alignment = self.__alignment_center def __write_cell(self, sh, row, col, value): if value.isdigit(): cell = sh.cell(row, col, int(value)) else: cell = sh.cell(row, col, value) cell.border = self.__border class XlsToXml: def __init__(self): pass @staticmethod def __add_prop(map_field, col, value): if map_field.__len__() == 0: return if map_field.get(col) is None: # 找本节点 c = col - 1 while c >= 1: if map_field.get(c) is not None: node = map_field[c][0] break c -= 1 else: node = map_field[col][0] node.properties.append([value, col]) @staticmethod def __add_node(map_field, row, col, value): node = XMLNode() node.name = value if map_field.get(col) is not None: node.parent = map_field[col][0] else: # 找父节点 c = col - 1 while c >= 1: if map_field.get(c) is not None: if row > map_field[c][1]: node.parent = map_field[c][0] break c -= 1 if node.parent is not None: node.parent.child.append(node) node.layer = node.parent.layer + 1 else: node.layer = 1 map_field[col] = [node, row] return node def __read_xls(self, file_path): wb = openpyxl.load_workbook(file_path) root_data = XMLNode() is_multi_sheet = wb.worksheets.__len__() > 1 for sh in wb.worksheets: max_row = sh.max_row + 1 max_col = sh.max_column + 1 if sh.freeze_panes is None: raise Exception("文件[%s]表单[%s]的无冻结窗口,无法确定表头,转为XML失败" % (os.path.basename(file_path), sh.title)) head_row, head_col = xls_util.coordinate_to_tuple(sh.freeze_panes) if head_col != 1: raise Exception("文件[%s]表单[%s]的冻结窗口列不为1,无法转为XML" % (os.path.basename(file_path), sh.title)) root = None map_field = { } for col in range(1, max_col): for row in range(1, head_row): cell = sh.cell(row, col) value = cell.value comment = cell.comment if value is None: continue next_row = row + 1 if next_row >= head_row: self.__add_prop(map_field, col, value) continue next_row_value = sh.cell(next_row, col).value if next_row_value is None: self.__add_prop(map_field, col, value) continue node = self.__add_node(map_field, row, col, value) if root is None: root = node if comment is None: continue comment = comment.text lines = comment.splitlines() for line in lines: props = line.split('=') kv = [] for prop in props: prop = prop.replace('"', '') kv.append(prop) if kv.__len__() == 2: node.properties.append(kv) kv = [] root_data.name = root.name root_data.layer = root.layer if is_multi_sheet and root.child.__len__() > 0: child_list = copy.deepcopy(root.child[0].child) root.child[0].child = [] root_data.child.append(root.child[0]) root_data_child = root.child[0].child else: child_list = copy.deepcopy(root.child) root_data_child = root_data.child for row in range(head_row, max_row): clone = copy.deepcopy(child_list) for child in clone: self.__read_node_data(child, sh, row) root_data_child.append(child) return root_data def __read_node_data(self, node, sh, row): prop_list = [] for prop in node.properties: col = prop[1] value = sh.cell(row, col).value if value is not None: prop_list.append([prop[0], value]) child_list = [] for child in node.child: self.__read_node_data(child, sh, row) if child.properties.__len__() > 0 or child.child.__len__() > 0: child_list.append(child) copy.copy(child) node.properties = prop_list node.child = child_list def convert(self, src_file_path, dst_file_path, is_use_tiny_xml=True, is_convert_spec_character=False): root = self.__read_xls(src_file_path) if is_use_tiny_xml: TinyXMLWriter(dst_file_path, [root], is_convert_spec_character) else: XMLWriter(dst_file_path, [root]) def convert_dirs(self, src_path_dir, dst_path_dir=None, is_use_tiny_xml=True, is_convert_spec_character=False): for root, dirs, files in os.walk(src_path_dir): for name in files: basename, ext = os.path.splitext(name) if ext != ".xls" and ext != ".xlsx": continue src = os.path.join(root, name) target = basename + "1" + ".xml" print("处理%s" % src) if dst_path_dir is None or dst_path_dir == "": dst = os.path.join(root, target) else: dst = os.path.join(dst_path_dir, target) try: self.convert(src, dst, is_use_tiny_xml, is_convert_spec_character) except Exception as e: print(e)
有人在问如何使用,给一个XML转Excel的示例,假如有一个test.xml文件:
<?xml version="1.0" encoding="UTF-8"?> <Test> <Data Type="1001" Desc = "这是描述"> <Condition Value="10" Name="名字1"> <AwardItem ItemId = "5" SubId = "10" Count = "10" /> </Condition> <Condition Value="20" Name="名字2"> <AwardItem ItemId = "5" SubId = "100" Count = "50" /> </Condition> <Condition Value="30" Name="名字3"> <AwardItem ItemId = "5" SubId = "1000" Count = "100" /> </Condition> </Data> <Data Type="1002" Desc = "这是描述"> <Condition Value="100" Name="名字10"> <AwardItem ItemId = "5" SubId = "10" Count = "10" /> </Condition> <Condition Value="200" Name="名字20"> <AwardItem ItemId = "5" SubId = "100" Count = "50" /> </Condition> <Condition Value="300" Name="名字30"> <AwardItem ItemId = "5" SubId = "1000" Count = "100" /> </Condition> <Condition Value="400" Name="名字40"> <AwardItem ItemId = "5" SubId = "5000" Count = "200" /> </Condition> </Data> </Test>
我们写一个XML转XLSX的调用程序test.py
import sys import xml2xls import traceback def main(): if sys.argv.__len__() < 3: print("格式错误,格式:<命令> <XML源文件> <XLSX目标文件>") return c = xml2xls.XmlToXls() try: c.convert(sys.argv[1], sys.argv[2]) print("处理文件%s成功" % sys.argv[1]) except Exception as e: print("处理文件%s失败, 异常:%s" % (sys.argv[1], e)) print(traceback.format_exc()) if __name__ == '__main__': main()
在命令行执行:
python test.py test.xml test.xlsx
就会生成一个test.xlsx的Excel文件,使用Excel打开,结果如图:
从前面的XML可以看到,第一行只有3组Condition,第二行有4组Condition,所以最后按最大值4组来展示,第一行没第4组Condition,则为空。 同时,转换后的Excel表头是锁定状态,并且表头是有层次结构的,对应XML的层次。同样,如果要将一个Excel表转换成XML,也需要有锁定状态的且有层次结构的表头。
祝好!
发布者:全栈程序员栈长,转载请注明出处:https://javaforall.cn/171942.html原文链接:https://javaforall.cn
加载全部内容