软工博客归档工具(自用)

# -*- coding: utf-8 -*-
#@Time :2021/6/21 16:51
#@Author :Xxg
#@Site :
#@File :作业归档完善版.py
#@Software :PyCharm
import random
import requests
import pymysql
from lxml import etree
import docx
headers = {
    "User-Agent": ""
}
url = ''

# Seconds to wait on each HTTP request. requests applies no timeout by
# default, so without this a stalled connection would hang the script.
REQUEST_TIMEOUT = 10

# Folder the generated .docx files are written to. The backslash is escaped
# so that it does not swallow the closing quote of the literal.
OUTPUT_DIR = "D:\\"

# Replaces characters that Windows rejects in file names (plus line breaks
# and tabs that can appear inside scraped text nodes) with "_".
_FILENAME_TABLE = str.maketrans({ch: "_" for ch in '<>:"/\\|?*\r\n\t'})


def _fetch_tree(page_url):
    """Download page_url with the shared headers and return the parsed HTML tree."""
    reply = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    return etree.HTML(reply.text)


def _safe_filename(raw_title):
    """Return raw_title trimmed, with file-name-unsafe characters replaced."""
    return raw_title.strip().translate(_FILENAME_TABLE)


def _archive_post(post_url):
    """Fetch one post and save its date and paragraph texts as a .docx file.

    The title, paragraph texts and date found on the page are printed for
    inspection. A post whose title cannot be found (or is blank) is skipped,
    because the title is what the output file is named after.
    """
    post_tree = _fetch_tree(post_url)
    title = post_tree.xpath('//div[@class="post"]/h1[@class="postTitle"]/a/span/text()')
    print(title)
    content = post_tree.xpath('//div[@class="blogpost-body blogpost-body-html"]/p/text()')
    print(content)
    post_date = post_tree.xpath('//div[@class="postDesc"]/span[@id="post-date"]/text()')
    print(post_date)

    file_stem = _safe_filename(title[0]) if title else ""
    if not file_stem:
        print("skipped, no usable title:", post_url)
        return

    # Build the Word document: the date first, then one paragraph per text node.
    document = docx.Document()
    # xpath() returns a list of text nodes; add_paragraph() is documented to
    # take a string, so join the nodes explicitly.
    document.add_paragraph("".join(post_date))
    for paragraph_text in content:
        document.add_paragraph(paragraph_text)
    document.save(OUTPUT_DIR + file_stem + ".docx")


# Scrape the index page: day titles, post titles, summaries and post links.
html = _fetch_tree(url)
date = html.xpath('//div[@class="dayTitle"]/a/text()')
name = html.xpath('//div[@class="postTitle"]/a/span/text()')
zhaiyao = html.xpath('//div[@class="postCon"]/div[@class="c_b_p_desc"]/text()')
# Links to each post's own page
yueduquanwen = html.xpath('//div[@class="postCon"]/div[@class="c_b_p_desc"]/a/@href')

for post_url in yueduquanwen:
    _archive_post(post_url)
print(yueduquanwen)
# print(date[0])
# print(name[0].replace(" ","").replace("\n",""))
# print(zhaiyao[0].replace("\n",""))
# print(zhaiyao[0])

# 保存成word
# for n in range(len(date)):
#     file = docx.Document()
#     file.add_paragraph(date[n])
#     file.add_paragraph(zhaiyao[2*n].replace("\n",""))
#     # file.save("F:\\word\\"+name[n].replace(" ","").replace("\n","")+".docx")
#     print(date[n])
#     print(zhaiyao[2*n])
原文地址:https://www.cnblogs.com/sakura-xxg/p/14915406.html