python 实现word文档内格式转换

Contents

1. 0x01问题描述
2. 0x02分析问题
3. 0x03解决问题
4. 0x04:程序代码
5. 0x05:致谢

打完了神码的国赛选拔赛，闲来无事去办公室溜达溜达，正好遇到老师在整理材料，其中最费时间的就是材料里面的格式转换，每个文档都需要转换一遍，大量的重复劳动，于是尝试用程序员的眼光去看待和解决问题，写个脚本去做。

0x01问题描述

文档是专升本的过程性考核资料，每个学生一份，一共是72份，需要将

大学英语（一）A      	4	89
计算机应用基础B	        4	97
生活小智慧与生命大健康	2	86
思想道德修养与法律基础	3	75
大学生职业生涯规划	   1.5	78
乒乓球	                1	91
程序设计技术	        4	93
高等数学B	            4	84
大学英语（二）B	        4	87
实用心理学	            2	87
网页制作	            4	74
毛泽东思想和中国特色社会主义理论体系概论	4	82
足球	                1	85
IT职业素养	            1	87
Java程序设计	        4	95
中国现当代文学作品欣赏	2	74
数据结构	            4	78
数据库原理与应用	    4	87
HTML5程序开发	        4	87
形势与政策	            1	99
中华食医	            2	92
操作系统	            4	81
微机原理与接口技术	    4	83
Android应用程序设计	    4	86
就业指导实务	       0.5	94
网络创业理论与实践	    2	98
互联网+时代创新创业	    1	94
软件创新创业项目实践	1	95
顶岗实习	           19	95
共29科，补考0科。

修改为

大学英语（一）A89 计算机应用基础B97 生活小智慧与生命大健康86 思想道德修养与法律基础75 大学生职业生涯规划78 乒乓球91 程序设计技术93 高等数学B84 大学英语（二）B87 实用心理学87 网页制作74 毛泽东思想和中国特色社会主义理论体系概论82 足球85 IT职业素养87 Java程序设计95 中国现当代文学作品欣赏74 数据结构78 数据库原理与应用87 HTML5程序开发87 形势与政策99 中华食医92 操作系统81 微机原理与接口技术83 Android应用程序设计86 就业指导实务94 网络创业理论与实践98 互联网+时代创新创业94 软件创新创业项目实践95 顶岗实习95

0x02分析问题

由上面的效果可以看出，其重复性的操作无非就是

删除学分这一项
去除所有的空格
将换行替换为空格

转换为程序可以划分为几个自定义函数

读文件
改扩展名(doc转docx)
读取文件内容保存退出
转换格式函数

文档模板如下
https://wenku.baidu.com/view/36dfebd577232f60dccca13f.html

文件保存目录
images

0x03解决问题

0x01 读取当前子目录下所有word格式文件

# coding=gb2312
import subprocess
# Module-level list of .docx file paths; dir_path() appends to it and msave() iterates over it.
docx = []


def dir_list(path):
    """Return Word-document and extension-less entries found under each item of *path*.

    For every element of *path* the Windows ``dir`` command is run through the
    shell and its console output is parsed line by line.

    Args:
        path: iterable of unicode directory (or file) names; an empty string
            makes ``dir`` list the current directory.

    Returns:
        A list of unicode names. When the listed entry name is longer than
        three characters the result is prefixed with that entry and a
        backslash, otherwise the bare name is returned.
    """
    dirlist=[]
    for pt in path:
        # NOTE(review): pt is concatenated unquoted into a shell command line;
        # names containing spaces or shell metacharacters will not be handled
        # correctly -- confirm the input directories never contain them.
        p = subprocess.Popen('dir '+pt.encode('gb2312'), shell=True, stdout=subprocess.PIPE)
        out, err = p.communicate()
        # Drop the last two lines of the listing (presumably the file/dir
        # summary footer -- TODO confirm for the target Windows locale).
        lista = out.splitlines()[:-2]
        # Skip the first seven lines (presumably the volume header plus the
        # "." and ".." entries -- TODO confirm).
        for line in lista[7:]:
            # Ignore anything that has an extension other than .doc/.docx;
            # lines without a dot are kept.
            if ".doc" not in line and ".docx" not in line and "." in line:
                continue
            # The name is taken from column 36 onwards of the decoded line
            # (assumed to be where `dir` starts the file-name column).
            if len(pt)>3:
                # Use a backslash so the result composes with the '\\' join
                # performed when the absolute path is built for conversion.
                dirlist.append(pt+"\\"+line.decode("gb2312")[36:])
            else:
                dirlist.append(line.decode("gb2312")[36:])
    return dirlist

调用函数将所有word文件的名称保存进了pathdir列表

遍历列表将doc和docx文件分类保存

def dir_path():
    """Sort the discovered file names into .docx and non-.docx groups.

    Names containing ".docx" are appended to the module-level ``docx`` list;
    every other name is collected locally and returned to the caller.
    """
    legacy_docs = []
    # Two passes of dir_list: the first lists the current directory, the
    # second lists each entry returned by the first.
    for entry in dir_list(dir_list([''])):
        bucket = docx if ".docx" in entry else legacy_docs
        bucket.append(entry)
    return legacy_docs

0x02转换文件格式(doc->docx)

这里使用win32com模块转换文件格式

安装win32com模块

pip install pypiwin32

转换文件格式

def convert(path):#转换单个文件
    word = wc.Dispatch("Word.Application")
    doc = word.Documents.Open(path)
    doc.SaveAs(path+"x",16)
    doc.Close()
    word.Quit()
    print "转换文件:".decode('gb2312'),path,"成功".decode('gb2312')
    os.remove(path) #转换完成后删除原有文件

def doc_convert():
    """Convert every non-.docx file reported by dir_path(), one at a time."""
    # Current working directory, decoded from GB2312 into unicode.
    base_dir = os.getcwd().decode("gb2312")
    for relative_name in dir_path():
        convert(base_dir + '\\' + relative_name)

调用

doc_convert()

0x03读取文件内容并修改

修改传入的列表内容并返回

def filter(textlist):# Filter function: clean up the entries of the given list and return them
    """Return a cleaned copy of *textlist* with credits and whitespace stripped.

    Entries that contain the make-up-exam marker, a full-width semicolon or a
    full-width period are dropped entirely; every other entry is edited in
    place by the fixed-offset rules below and appended to the result.

    NOTE(review): this name shadows the builtin ``filter`` inside the module.
    """
    newtextlist=[]
    for text in textlist:
        # Skip summary/remark lines (make-up exam count, or any entry with
        # a full-width semicolon / period).
        if "补考".decode('gb2312') in text or "；".decode("gb2312") in text or "。".decode("gb2312") in text:
            continue
        # For the internship entry, remove the substrings "17" and "19"
        # (presumably its two-digit credit value -- TODO confirm).
        if "顶岗实习".decode('gb2312') in text:
            text=text.replace("17","").replace("19","")
        # Strip spaces, tabs and the fractional values "0.5" / "1.5".
        text=text.replace(" ","").replace("	","").replace("0.5","").replace("1.5","")
        if "100" in text:
            # Replacing a slice with itself yields "", so this simply drops
            # the two characters at positions [-5:-3].
            # NOTE(review): assumes a fixed layout around a score of 100 -- confirm offsets.
            text=text[:-5]+text[-5:-3].replace(text[-5:-3],"")+text[-3:]
        for i in range(8):
            # When the entry contains a comma (full- or half-width), drop the
            # character at index -6 if it is the digit i.
            if "，".decode("gb2312") in text or "," in text:
               if text[-6:-5] == str(i):
                   text=text[:-6]+text[-6:-5].replace(text[-6:-5].decode("gb2312"),"")+text[-5:]
            # Drop the character at index -3 if it is the digit i
            # (presumably a single-digit credit in front of a two-digit
            # score -- TODO confirm).
            if text[-3:-2] == str(i):
                text=text[:-3]+text[-3:-2].replace(text[-3:-2].decode("gb2312"),"")+text[-2:]
        text=text.replace("\n"," ")
        newtextlist.append(text)
    return newtextlist

修改word文档内容并保存

def msave():#修改word文档内容并保存
    for fname in docx:
        document = Document(fname)
        tables = document.tables[0]
        textlist=tables.cell(5, 3).text.split("\n")
        text=" ".join(filter(textlist))
        tables.cell(5, 3).text=text
        document.save(fname)
        print "修改格式： ".decode("gb2312")+fname+" 成功！".decode("gb2312")

主函数调用程序

if __name__=="__main__":
    # Script entry point: print a banner, convert the .doc files, then
    # reformat every file collected in the docx list.
    print "======自动格式修改工具======"
    doc_convert()
    print "转换docx格式完成"
    msave()
    print "修改完成"

至此程序已经完全完成

0x04:程序代码

# coding=gb2312
import subprocess
import os
from win32com import client as wc
from docx import Document
# Module-level list of .docx file paths; dir_path() and convert() append to it and msave() iterates over it.
docx = []


def dir_list(path):
    """List the Word-document and extension-less names under each entry of *path*.

    Each entry is passed to the Windows ``dir`` command via the shell; the
    console output is trimmed (first seven and last two lines dropped) and
    the name is read from column 36 onwards of every remaining line.
    """
    found = []
    for folder in path:
        proc = subprocess.Popen('dir '+folder.encode('gb2312'), shell=True, stdout=subprocess.PIPE)
        raw_output = proc.communicate()[0]
        rows = raw_output.splitlines()[:-2][7:]
        for row in rows:
            looks_like_word = ".doc" in row or ".docx" in row
            # Entries with some other extension are ignored.
            if "." in row and not looks_like_word:
                continue
            name = row.decode("gb2312")[36:]
            # Longer entries are treated as sub-paths and used as a prefix.
            found.append(folder+"\\"+name if len(folder)>3 else name)
    return found

def dir_path():
    """Classify the listed file names into .docx and non-.docx groups.

    Names containing ".docx" go into the module-level ``docx`` list; the
    remaining names are gathered in a local list that is returned.
    """
    legacy_docs = []
    # dir_list is applied twice: once to the current directory and once to
    # each entry that the first pass returned.
    for entry in dir_list(dir_list([''])):
        bucket = docx if ".docx" in entry else legacy_docs
        bucket.append(entry)
    return legacy_docs

def convert(path):#转换单个文件
    word = wc.Dispatch("Word.Application")
    doc = word.Documents.Open(path)
    doc.SaveAs(path+"x",16)
    doc.Close()
    word.Quit()
    print "转换文件:".decode('gb2312'),path,"成功".decode('gb2312')
    os.remove(path) #转换完成后删除原有文件
    docx.append(path)
def doc_convert():
    """Convert every non-.docx file reported by dir_path(), one at a time."""
    # Current working directory, decoded from GB2312 into unicode.
    base_dir = os.getcwd().decode("gb2312")
    for relative_name in dir_path():
        convert(base_dir + '\\' + relative_name)
def filter(textlist):# Filter function: clean up the entries of the given list and return them
    """Return a cleaned copy of *textlist* with credits and whitespace stripped.

    Entries that contain the make-up-exam marker, a full-width semicolon or a
    full-width period are dropped entirely; every other entry is edited in
    place by the fixed-offset rules below and appended to the result.

    NOTE(review): this name shadows the builtin ``filter`` inside the module.
    """
    newtextlist=[]
    for text in textlist:
        # Skip summary/remark lines (make-up exam count, or any entry with
        # a full-width semicolon / period).
        if "补考".decode('gb2312') in text or "；".decode("gb2312") in text or "。".decode("gb2312") in text:
            continue
        # For the internship entry, remove the substrings "17" and "19"
        # (presumably its two-digit credit value -- TODO confirm).
        if "顶岗实习".decode('gb2312') in text:
            text=text.replace("17","").replace("19","")
        # Strip spaces, tabs and the fractional values "0.5" / "1.5".
        text=text.replace(" ","").replace("	","").replace("0.5","").replace("1.5","")
        if "100" in text:
            # Replacing a slice with itself yields "", so this simply drops
            # the two characters at positions [-5:-3].
            # NOTE(review): assumes a fixed layout around a score of 100 -- confirm offsets.
            text=text[:-5]+text[-5:-3].replace(text[-5:-3],"")+text[-3:]
        for i in range(8):
            # When the entry contains a comma (full- or half-width), drop the
            # character at index -6 if it is the digit i.
            if "，".decode("gb2312") in text or "," in text:
               if text[-6:-5] == str(i):
                   text=text[:-6]+text[-6:-5].replace(text[-6:-5].decode("gb2312"),"")+text[-5:]
            # Drop the character at index -3 if it is the digit i
            # (presumably a single-digit credit in front of a two-digit
            # score -- TODO confirm).
            if text[-3:-2] == str(i):
                text=text[:-3]+text[-3:-2].replace(text[-3:-2].decode("gb2312"),"")+text[-2:]
        text=text.replace("\n"," ")
        newtextlist.append(text)
    return newtextlist

def msave():#修改word文档内容并保存
    for fname in docx:
        document = Document(fname)
        tables = document.tables[0]
        textlist=tables.cell(5, 3).text.split("\n")
        text=" ".join(filter(textlist))
        tables.cell(5, 3).text=text
        document.save(fname)
        print "修改格式： ".decode("gb2312")+fname+" 成功！".decode("gb2312")

if __name__=="__main__":
    # Script entry point: print a banner, convert the .doc files, then
    # reformat every file collected in the docx list.
    print "======自动格式修改工具======"
    doc_convert()
    print "转换docx格式完成"
    msave()
    print "修改完成"

0x05:致谢

doc转docx
https://www.cnblogs.com/themost/p/7243511.html
python执行系统命令后获取返回值的几种方式
https://blog.csdn.net/nextdoor6/article/details/51283117
python的subprocess.Popen()用法
https://blog.csdn.net/sinat_36219858/article/details/70186649