爬取电影网站

code

import os
import shutil
import sys
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys


def asleep(driver, implicit_wait=3.5, pause=2):
    """Set the driver's implicit wait, then pause the script.

    Args:
        driver: Object exposing ``implicitly_wait(seconds)``; in this script
            it is the Selenium WebDriver instance.
        implicit_wait: Seconds passed to ``driver.implicitly_wait``.
        pause: Seconds to block in ``time.sleep`` after configuring the wait.

    Returns:
        None.
    """
    driver.implicitly_wait(implicit_wait)
    time.sleep(pause)

# Listing pages are numbered 1..LAST_PAGE (719 pages).
LISTING_URL = "http://zimiyy.com/mov/0/0/all/{}.html"
LAST_PAGE = 719


def _inner_html(driver, by, locator):
    """Return the innerHTML of the first element matching ``locator``."""
    return driver.find_element(by, locator).get_attribute('innerHTML')


def _collect_play_urls(driver, detail_url, detail_html):
    """Visit each play page linked in ``detail_html``; map link label -> video URL.

    The video URL is None when the player info element cannot be read.
    """
    play_urls = {}
    for item in BeautifulSoup(detail_html, 'html.parser').find_all('li'):
        anchor = item.find("a")
        # Skip list items that carry no usable link instead of crashing.
        if anchor is None or not anchor.get("href"):
            continue
        # Open the play page; resolve the href in case it is site-relative.
        driver.get(urljoin(detail_url, anchor.get("href")))
        # Read the video link shown in the player's info panel.
        try:
            video_url = driver.find_element(
                By.XPATH, "//span[@class='dplayer-info-panel-item-data']"
            ).text
        except WebDriverException as e:
            print(e)
            video_url = None
        play_urls[anchor.string] = video_url
    return play_urls


def _scrape_movie(driver, movie_url):
    """Open a movie's detail page and print its description and play URLs."""
    driver.get(movie_url)
    # Grab both HTML fragments before navigating away to the play pages.
    description_html = _inner_html(driver, By.CLASS_NAME, "info")
    detail_html = _inner_html(driver, By.ID, "stab_1_71")
    print(description_html)
    print(_collect_play_urls(driver, movie_url, detail_html))


driver = webdriver.Chrome()
try:
    asleep(driver)

    for page in range(1, LAST_PAGE + 1):
        listing_url = LISTING_URL.format(page)
        driver.get(listing_url)

        # Parse a detached copy of the listing so the links stay usable
        # after the browser navigates to other pages.
        listing_html = _inner_html(
            driver, By.XPATH, "//div[@class='index-tj mb clearfix']/ul"
        )

        for link in BeautifulSoup(listing_html, 'html.parser').find_all('a'):
            href = link.get("href")
            if not href:
                continue
            movie_page_url = urljoin(listing_url, href)
            print(movie_page_url)
            print(link.get("title"))
            image = link.find("img")
            print(image.get("src") if image is not None else None)

            time.sleep(2)
            try:
                _scrape_movie(driver, movie_page_url)
            except WebDriverException as e:
                # One broken detail page should not abort the whole crawl.
                print(e)
            print("*" * 17)

        time.sleep(3)
finally:
    # Always close the browser, even when the crawl fails part-way.
    driver.quit()

原文地址:https://www.cnblogs.com/sea-stream/p/13851753.html