requests+BeautifulSoup | 爬取电影天堂全站电影资源

import requests
import urllib.request as ur
from bs4 import BeautifulSoup
import csv
import threading
class MovieHeven():
    """Crawl the dytt8.net "dyzz" movie listing and save it to ``movies.csv``.

    Every listing page contributes one CSV row per movie (title, detail-page
    URL). Pages are followed through their "下一页" (next page) link until a
    page without such a link is reached.
    """

    # Seconds to wait on the server before abandoning a request; without a
    # timeout a stalled connection would block the crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set the start URL and counters and open the output CSV for writing."""
        self.url = "https://www.dytt8.net/html/gndy/dyzz/index.html"
        self.page = 1  # number of the listing page currently being crawled
        self.No = 1    # running number of the next movie row
        # errors="replace": a title may contain characters GBK cannot encode
        # (for example U+FFFD left over from a lenient decode). Without it a
        # single such title raises UnicodeEncodeError and ends the whole crawl.
        self.fobj = open("movies.csv", "wt", encoding="gbk",
                         errors="replace", newline='')

    def spider(self):
        """Crawl from ``self.url`` to the last page, writing rows to ``self.fobj``.

        Pages are visited in a loop rather than by recursion, so the depth of
        the call stack does not grow with the number of pages. Any error is
        printed and ends the crawl; rows already written are kept.
        """
        writer = csv.writer(self.fobj)  # one writer for the whole crawl
        try:
            while True:
                print("正在爬取第{}页...".format(self.page))
                root = self._fetch_page()
                self._save_movies(root, writer)
                next_url = self._next_page_url(root)
                if next_url is None:  # no "next page" link: crawl finished
                    break
                self.url = next_url
                print(self.url)
                self.page += 1
        except Exception as err:
            # Best-effort boundary: report the problem and stop crawling.
            print(err)

    def _fetch_page(self):
        """Download ``self.url`` and return it parsed as a BeautifulSoup tree."""
        response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        response.encoding = "gbk"  # decode the body as GBK
        return BeautifulSoup(response.text, "lxml")

    @staticmethod
    def _content_div(root):
        """Return the ``div.co_content8`` container or raise ``ValueError``."""
        content = root.find("div", attrs={"class": "co_content8"})
        if content is None:
            raise ValueError("listing container 'co_content8' not found")
        return content

    def _save_movies(self, root, writer):
        """Write one ``[name, href]`` row for every movie table on the page."""
        listing = self._content_div(root).find("ul")
        tables = listing.find_all("table") if listing is not None else []
        for table in tables:
            link = table.find("a")
            if link is None or not link.get("href"):
                continue  # table without a usable link: nothing to record
            name = link.text
            href = "http://www.dytt8.net" + link["href"]
            writer.writerow([name, href])
            print("No:", self.No, name, href)
            self.No += 1

    def _next_page_url(self, root):
        """Return the absolute URL of the next listing page, or ``None``."""
        pager = self._content_div(root).find("div", attrs={"class": "x"})
        if pager is None:
            return None
        for link in pager.find_all("a"):
            if link.text == "下一页" and link.get("href"):
                return "https://www.dytt8.net/html/gndy/dyzz/" + link["href"]
        return None

    @staticmethod
    def _format_elapsed(seconds):
        """Format a duration in seconds as the ``用时:{m}min{s}s`` report line."""
        m, s = divmod(round(seconds), 60)
        return "用时:{}min{}s".format(m, s)

    def main(self):
        """Run the crawl, always close the CSV file, and print the elapsed time."""
        import time  # imported here because the module does not import it
        begin_time = time.time()
        try:
            self.spider()  # run the crawl
        finally:
            # Close (and flush) the CSV even if the crawl is interrupted.
            self.fobj.close()
        print(self._format_elapsed(time.time() - begin_time))

if __name__ == '__main__':
    # Script entry point: build the crawler and run the whole crawl.
    MovieHeven().main()

  

原文地址:https://www.cnblogs.com/billie52707/p/12113520.html