【爬虫】爬取淮安信息职业技术学院(苏电院)的新闻网 python

爬取淮安信息职业技术学院所有的新闻内容;包含所有不同的新闻内容,本脚本会输出显示爬取到的新闻详细页URL、文件下载URL,同时提供了下载文件的功能,可以自行研究处理文件下载的!(暂只支持下载爬取到的DOC/xls/PDF文件)。

该脚本是一个框架,解决了爬取过程中的URL爬取,使用者可自行根据实际需要来改动代码。同时也为了方便用来存进数据库,这里也预留了news_*[],用来向数据库写入,大家可以基于本脚本自行扩展,欢迎留言你的github仓库。

作者在处理页面URL遍历的时候,用了一个比较牵强的方法来处理hcit这种逆向的新闻页面编号htm;如果大家有更高明的方法,欢迎与我讨论哈~~~

#! python3

import requests
from bs4 import BeautifulSoup


def reptiles(responses, addr):
    """Parse one news list page and crawl every article it links to.

    Args:
        responses: an already-fetched ``requests.Response`` for the list page.
        addr: base URL prefix used to resolve the relative article links.

    Returns:
        dict with parallel lists ``link``/``title``/``time``/``text`` (one
        entry per article) so callers can persist them, e.g. to a database.
        The original version built these lists but discarded them.
    """
    soup = BeautifulSoup(responses.text, "html.parser")
    news_link = []
    news_title = []
    news_time = []
    news_text = []
    k = 0
    while True:
        # List items carry sequential ids: line_u8_0, line_u8_1, ...
        # Checking the parsed node (not a raw substring of responses.text)
        # avoids false positives such as "line_u8_1" matching "line_u8_10".
        soup_li = soup.find(id="line_u8_%d" % k)
        if soup_li is None:  # no more items on this page
            break
        # Title: drop the 10-character leading prefix, as the site prepends
        # a fixed-width ornament before the real headline.
        title = soup_li.a.get_text()
        news_title.append(title[10:])
        # Publication time lives in the <span> of the list item.
        news_time.append(soup_li.span.get_text())
        # Fetch the article detail page.
        link = soup_li.a['href']
        news_link.append(link)
        newsText_url = addr + link
        res = requests.get(newsText_url, timeout=30)
        res.encoding = "utf-8"  # site serves UTF-8; force it to avoid mojibake
        print("[爬取正文]%s  [%d]" % (newsText_url, res.status_code))
        soup_news = BeautifulSoup(res.text, "html.parser")
        newsText = soup_news.find(id="vsb_content")
        news_text.append(newsText)
        # Report an attachment link (DOC/XLS/PDF) if the article has one.
        # newsText may be None and newsText.a may be absent — both surface
        # as AttributeError/TypeError here, which means "no attachment".
        try:
            newsText_a = newsText.a['href']
            newsText_link = "http://www.hcit.edu.cn" + newsText_a
            print("[附件文件]" + newsText_link)
            # Uncomment to actually download the attachment:
            # res_link = requests.get(newsText_link, timeout=30)
            # with open(newsText.a.string, 'wb') as code:
            #     code.write(res_link.content)
        except (AttributeError, TypeError, KeyError):
            print("[无附件文件]")
        k += 1
    return {
        "link": news_link,
        "title": news_title,
        "time": news_time,
        "text": news_text,
    }


def main():
    """Crawl every configured news section, page by page.

    Pages are numbered ``<addr>/1.htm``, ``<addr>/2.htm``, ... The site also
    serves a front page at ``<addr>.htm``; when a numbered page stops
    responding with 200 we crawl that front page once and move on — this is
    the workaround for hcit's reversed page-numbering scheme.
    """
    url_addr = [
        # "http://www.hcit.edu.cn/sdxw/xyyw",
        # "http://www.hcit.edu.cn/sdxw/ybdt",
        # "http://www.hcit.edu.cn/sdxw/mtjj",
        "http://www.hcit.edu.cn/sdxw/ggtz"
    ]

    for addr in url_addr:
        page = 1
        while True:
            addr_url = addr + "/" + str(page) + ".htm"
            responses = requests.get(addr_url, timeout=30)
            if responses.status_code != 200:
                # Numbered page missing: fall back to the section front page.
                # Fetch it exactly once (the original issued two separate
                # requests for the same URL) and set the encoding so the
                # fallback page is decoded the same way as the normal path.
                front = requests.get(addr + ".htm", timeout=30)
                front.encoding = "utf-8"
                print("[爬取页面] %s [%s]" % (str(addr + ".htm"), str(front.status_code)))
                reptiles(front, addr + "/../")
                print("+++++++++++++++++++++++++[None]++++++++++++++++++++++++++++")
                break
            responses.encoding = "utf-8"
            # Confirm the page fetched OK before parsing it.
            print("[爬取页面] %s [%s]" % (str(addr_url), str(responses.status_code)))
            reptiles(responses, addr + "/")
            page += 1
            print("=============================================")

# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
原文地址:https://www.cnblogs.com/wangyuyang1016/p/14125277.html