今天公司新需求,解析word习题集
使用python-docx这个库解析
python-docx的官方API文档
官方的比那些blog好多了:https://python-docx.readthedocs.io/en/latest/dev/analysis/features/shapes/shapes-inline.html
使用ElementTree解析Word文档内部的XML树
代码:
| 12
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 
 | import docx
 import json
 
 import xml.etree.cElementTree as ET
 
 
 def parsePic2(questionArray, bookNameWord):
     """Collect the relationship ids of VML images referenced by paragraph runs.

     Each paragraph of the document is serialized to XML and re-parsed with
     ElementTree. Every direct ``w:r`` (run) child of the paragraph is searched
     for ``v:shape/v:imagedata`` elements nested one level below the run, and
     the ``r:id`` attribute of each match is recorded.

     Args:
         questionArray: Kept for signature compatibility; it is neither read
             nor modified by this function.
         bookNameWord: Passed unchanged to ``docx.Document``; presumably the
             path of the .docx file to scan.

     Returns:
         A list of ``r:id`` attribute values in document order. Earlier
         versions built this list but discarded it, so callers that ignore
         the return value are unaffected.
     """
     # Clark-notation ({namespace}tag) names, hoisted out of the loops.
     run_tag = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}r"
     image_path = "*/{urn:schemas-microsoft-com:vml}shape/{urn:schemas-microsoft-com:vml}imagedata"
     rid_attr = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"

     rIds = []
     for paragraph in docx.Document(bookNameWord).paragraphs:
         root = ET.fromstring(paragraph._element.xml)
         for run in root.findall(run_tag):
             # Record every image in the run, not only the first match.
             for image in run.findall(image_path):
                 rid = image.get(rid_attr)
                 # An imagedata element without r:id has no relationship to
                 # resolve; skip it rather than fail with KeyError.
                 if rid is not None:
                     rIds.append(rid)
     return rIds
 
 def parsePic(questionArray, bookNameWord):
     """Collect the relationship ids of the pictures held by inline shapes.

     The XML of every inline shape in the document is parsed with ElementTree
     and searched for DrawingML ``a:blip`` elements; the ``r:embed`` attribute
     of each one is recorded.

     The previous body ended in ``root.findall('')``, an empty-path search
     whose result was thrown away, so the function did no useful work.

     Args:
         questionArray: Kept for signature compatibility; it is neither read
             nor modified by this function.
         bookNameWord: Passed unchanged to ``docx.Document``; the script at
             the bottom of this file passes a path string.

     Returns:
         A list of ``r:embed`` attribute values in document order. Blips that
         carry no ``r:embed`` attribute are skipped.
     """
     # Clark-notation ({namespace}tag) names for the blip element and its
     # embedded-image relationship attribute.
     blip_tag = "{http://schemas.openxmlformats.org/drawingml/2006/main}blip"
     embed_attr = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"

     rIds = []
     for shape in docx.Document(bookNameWord).inline_shapes:
         root = ET.fromstring(shape._inline.xml)
         # iter() walks the whole subtree, so the blip is found regardless of
         # how deeply it is nested under the inline element.
         for blip in root.iter(blip_tag):
             rid = blip.get(embed_attr)
             if rid is not None:
                 rIds.append(rid)
     return rIds
 
 
 
 
 # Script driver: start with an empty question list and scan the sample
 # document that sits next to this script.
 questionArray = list()
 parsePic(questionArray=questionArray, bookNameWord='./test.docx')
 
 |