0%

python解析word文档

今天公司新需求,解析word习题集

使用python-docx这个库解析

python-docx的官方API描述

官方文档比那些blog好多了:https://python-docx.readthedocs.io/en/latest/dev/analysis/features/shapes/shapes-inline.html

使用ElementTree解析word文档的XML树

代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39

import docx
import json

import xml.etree.cElementTree as ET


def parsePic2(questionArray, bookNameWord):
    """Collect the relationship ids of VML images found in a Word document.

    The XML of every paragraph is parsed separately.  For each ``<w:r>``
    element that is a direct child of the paragraph, the function looks for
    ``<v:imagedata>`` elements located at ``<w:r>/*/<v:shape>/<v:imagedata>``
    and records the ``r:id`` attribute of the first one found in that run.

    Args:
        questionArray: Not used by this function; kept so the signature
            matches ``parsePic``.
        bookNameWord: Passed straight to ``docx.Document``.

    Returns:
        A list of ``r:id`` attribute values in document order, at most one
        per run.  Runs without an image, and images without an ``r:id``
        attribute, contribute nothing.
    """
    run_tag = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}r"
    image_path = "*/{urn:schemas-microsoft-com:vml}shape/{urn:schemas-microsoft-com:vml}imagedata"
    rid_attr = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"

    fs = docx.Document(bookNameWord)
    rIds = []
    for paragraph in fs.paragraphs:
        # One root tree per paragraph.
        root = ET.fromstring(paragraph._element.xml)
        # Every <w:pict> tree is a child of a <w:r> tree, so search run by run.
        for run in root.findall(run_tag):
            images = run.findall(image_path)
            if not images:
                continue
            # Use .get() so an <v:imagedata> without r:id is skipped instead
            # of aborting the whole scan with a KeyError.
            rid = images[0].get(rid_attr)
            if rid is not None:
                rIds.append(rid)
    # The original built this list and then dropped it; hand it to the caller.
    return rIds

def parsePic(questionArray, bookNameWord):
    """Open a Word document and parse the XML of each of its inline shapes.

    This is an exploratory stub: every inline shape's XML is parsed into an
    ElementTree element and queried with an empty path, but the query result
    is discarded and the function returns nothing.

    Args:
        questionArray: Not used by this function.
        bookNameWord: Passed straight to ``docx.Document``.
    """
    document = docx.Document(bookNameWord)
    for inline_shape in document.inline_shapes:
        shape_root = ET.fromstring(inline_shape._inline.xml)
        # Placeholder query kept from the original; its result is not used.
        shape_root.findall('')

# Script driver: run the inline-shape parser against the sample document.
questionArray = list()
parsePic(questionArray=questionArray, bookNameWord='./test.docx')

Welcome to my other publishing channels