Python: batch-convert Word documents to HTML (via WPS COM automation) and import them into MySQL

 代码:

  1 #coding=utf-8
  2 __author__ = 'zhm'
  3 from win32com import client as wc
  4 import os
  5 import time
  6 import random
  7 import MySQLdb
  8 import re
  9 def wordsToHtml(dir):
 10 #批量把文件夹的word文档转换成html文件
 11  #金山WPS调用,抢先版的用KWPS,正式版WPS
 12  word = wc.Dispatch('KWPS.Application')
 13  for path, subdirs, files in os.walk(dir):
 14   for wordFile in files:
 15    wordFullName = os.path.join(path, wordFile)
 16    #print "word:" + wordFullName
 17    doc = word.Documents.Open(wordFullName)
 18    wordFile2 = unicode(wordFile, "gbk")
 19    dotIndex = wordFile2.rfind(".")
 20    if(dotIndex == -1):
 21     print '********************ERROR: 未取得后缀名!'
 22    fileSuffix = wordFile2[(dotIndex + 1) : ]
 23    if(fileSuffix == "doc" or fileSuffix == "docx"):
 24     fileName = wordFile2[ : dotIndex]
 25     htmlName = fileName + ".html"
 26     htmlFullName = os.path.join(unicode(path, "gbk"), htmlName)
 27     # htmlFullName = unicode(path, "gbk") + "\" + htmlName
 28     print u'生成了html文件:' + htmlFullName
 29     doc.SaveAs(htmlFullName, 8)
 30     doc.Close()
 31  word.Quit()
 32  print ""
 33  print "Finished!"
 34 def html_add_to_db(dir):
 35 #将转换成功的html文件批量插入数据库中。
 36  conn = MySQLdb.connect(
 37   host='localhost',
 38   port=3306,
 39   user='root',
 40   passwd='root',
 41   db='test',
 42   charset='utf8'
 43   )
 44  cur = conn.cursor()
 45  for path, subdirs, files in os.walk(dir):
 46   for htmlFile in files:
 47    htmlFullName = os.path.join(path, htmlFile)
 48    title = os.path.splitext(htmlFile)[0]
 49    targetDir = 'D:/files/htmls/'
 50    #D:/files为web服务器配置的静态目录
 51    sconds = time.time()
 52    msconds = sconds * 1000
 53    targetFile = os.path.join(targetDir, str(int(msconds))+str(random.randint(100, 10000)) +'.html')
 54    htmlFile2 = unicode(htmlFile, "gbk")
 55    dotIndex = htmlFile2.rfind(".")
 56    if(dotIndex == -1):
 57     print '********************ERROR: 未取得后缀名!'
 58    fileSuffix = htmlFile2[(dotIndex + 1) : ]
 59    if(fileSuffix == "htm" or fileSuffix == "html"):
 60     if not os.path.exists(targetDir):
 61      os.makedirs(targetDir)
 62     htmlFullName = os.path.join(unicode(path, "gbk"), htmlFullName)
 63     htFile = open(htmlFullName,'rb')
 64     #获取网页内容
 65     htmStrCotent = htFile.read()
 66     #找出里面的图片
 67     img=re.compile(r"""<imgs.*?s?srcs*=s*['|"]?([^s'"]+).*?>""",re.I)
 68     m = img.findall(htmStrCotent)
 69     for tagContent in m:
 70      imgSrc = unicode(tagContent, "gbk")
 71      imgSrcFullName = os.path.join(path, imgSrc)
 72      #上传图片
 73      imgTarget = 'D:/files/images/whzx/'
 74      img_sconds = time.time()
 75      img_msconds = sconds * 1000
 76      targetImgFile = os.path.join(imgTarget, str(int(img_msconds))+str(random.randint(100, 10000)) +'.png')
 77      if not os.path.exists(imgTarget):
 78       os.makedirs(imgTarget)
 79      if not os.path.exists(targetImgFile) or(os.path.exists(targetImgFile) and (os.path.getsize(targetImgFile) != os.path.getsize(imgSrcFullName))):
 80       tmpImgFile = open(imgSrcFullName,'rb')
 81       tmpWriteImgFile = open(targetImgFile, "wb")
 82       tmpWriteImgFile.write(tmpImgFile.read())
 83       tmpImgFile.close()
 84       tmpWriteImgFile.close()
 85       htmStrCotent=htmStrCotent.replace(tagContent,targetImgFile.split(":")[1])
 86     if not os.path.exists(targetFile) or(os.path.exists(targetFile) and (os.path.getsize(targetFile) != os.path.getsize(htmlFullName))):
 87      #用iframe包装转换好的html文件。
 88      iframeHtml='''
 89      <script type="text/javascript" language="javascript">
 90       function iFrameHeight() {
 91        var ifm= document.getElementById("iframepage");
 92        var subWeb = document.frames ? document.frames["iframepage"].document:ifm.contentDocument;
 93        if(ifm != null && subWeb != null) {
 94         ifm.height = subWeb.body.scrollHeight;
 95        }
 96       }
 97      </script>
 98      <iframe src='''+targetFile.split(':')[1]+'''
 99       marginheight="0" marginwidth="0" frameborder="0" scrolling="no" width="765" height=100% id="iframepage" name="iframepage" onLoad="iFrameHeight()" ></iframe>
100      '''
101      tmpTargetFile = open(targetFile, "wb")
102      tmpTargetFile.write(htmStrCotent)
103      tmpTargetFile.close()
104      htFile.close()
105      try:
106       # 执行
107       sql = "insert into common_article(title,content) values(%s,%s)"
108       param = (unicode(title, "gbk"),iframeHtml)
109       cur.execute(sql,param)
110      except:
111       print "Error: unable to insert data"
112  cur.close()
113  conn.commit()
114  # 关闭数据库连接
115  conn.close()
if __name__ == '__main__':
    # Script entry point: convert the Word documents first, then import the
    # HTML files generated in the same directory tree.
    source_root = 'd:/word'
    wordsToHtml(source_root)
    html_add_to_db(source_root)
原文地址:https://www.cnblogs.com/kamil/p/5772903.html