今天公司新需求,解析word习题集
使用python-docx这个库解析
python-docx的官方API文档
官方的比那些blog好多了:https://python-docx.readthedocs.io/en/latest/dev/analysis/features/shapes/shapes-inline.html
使用ElementTree解析Word文档的XML树
代码:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39
| import docx import json
import xml.etree.cElementTree as ET
def parsePic2(questionArray, bookNameWord):
    """Collect the relationship ids of VML images found in a .docx file.

    Every paragraph of the document is serialized to XML and re-parsed with
    ElementTree. Inside each run (``w:r``) the function looks for
    ``v:shape/v:imagedata`` elements one level below the run and records the
    value of their ``r:id`` attribute.

    Args:
        questionArray: Kept for interface compatibility; it is neither read
            nor modified here.
        bookNameWord: Path (or file-like object) handed to ``docx.Document``.

    Returns:
        A list of relationship-id strings in document order. The list is
        empty when no matching image is found.
    """
    run_tag = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}r"
    image_path = (
        "*/{urn:schemas-microsoft-com:vml}shape"
        "/{urn:schemas-microsoft-com:vml}imagedata"
    )
    rid_attr = (
        "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
    )

    document = docx.Document(bookNameWord)
    rIds = []
    for paragraph in document.paragraphs:
        root = ET.fromstring(paragraph._element.xml)
        for run in root.findall(run_tag):
            # Keep every image in the run, not only the first one.
            for imagedata in run.findall(image_path):
                rid = imagedata.get(rid_attr)
                # An imagedata without r:id is skipped instead of raising
                # KeyError.
                if rid is not None:
                    rIds.append(rid)
    return rIds
def parsePic(questionArray, bookNameWord):
    """Collect the relationship ids of inline pictures in a .docx file.

    Each inline shape of the document is serialized to XML and re-parsed with
    ElementTree. The ``r:embed`` attribute of every ``a:blip`` descendant is
    recorded.

    Args:
        questionArray: Kept for interface compatibility; it is neither read
            nor modified here.
        bookNameWord: Path (or file-like object) handed to ``docx.Document``.

    Returns:
        A list of relationship-id strings, in the order the inline shapes are
        reported by the document. The list is empty when there are none.
    """
    blip_path = ".//{http://schemas.openxmlformats.org/drawingml/2006/main}blip"
    embed_attr = (
        "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
    )

    document = docx.Document(bookNameWord)
    rIds = []
    for shape in document.inline_shapes:
        root = ET.fromstring(shape._inline.xml)
        # The original called findall('') here, which matches nothing; the
        # picture reference lives on the a:blip element.
        for blip in root.findall(blip_path):
            embed = blip.get(embed_attr)
            # NOTE(review): a blip without r:embed (presumably a linked
            # picture) is skipped -- confirm this is the desired handling.
            if embed is not None:
                rIds.append(embed)
    return rIds
# Script entry: call parsePic on ./test.docx with an empty question list.
questionArray = list()
parsePic(questionArray, './test.docx')
|