PYTHON爬取66影视的电影下载链接,有搜索功能

本篇代码的亮点在于使用BeautifulSoup的select功能(CSS选择器),可以直接根据数据在html页面中的层级标签来获取数据。

# -*- coding=gb18030 -*-

__author__ = 'vincent'

import sys
import urllib2
import urllib
import cookielib
from bs4 import BeautifulSoup


class Spider66ys:
    headers = None
    home_url = None

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:50.0) Gecko/20100101 Firefox/50.0'
        }
        self.home_url = "http://www.66ys.tv"

    #   获取网页信息
    def get_html(self, url):
        print "正在获取网页[", url, "]的信息..."
        if len(url) == 0:
            print "Input url is null!"
            sys.exit(0)

        request = urllib2.Request(url, headers=self.headers)
        response = urllib2.urlopen(request)
        html = response.read()
        #   print "获取首页信息(", url, ")完毕."
        return html

    # 在电影页面下获取电影的下载链接
    def get_download_url(self, film):
        print "开始从网页[", film[0], "]中获取电影[", film[1], "]的下载链接..."
        html = self.get_html(film[0])

        # fp = open("film.html", "w")
        # fp.write(html)
        # fp.close()

        soup = BeautifulSoup(html, "lxml", from_encoding="gb18030")
        # print soup.prettify()
        results = soup.select("html > body > div.wrap > div.mainleft 
            > div.contentinfo > div#text > table > tbody > tr > td > a")
        for result in results:
            film.append(result['href'])

    # 获取最新更新电影
    def get_new_update(self):
        new_film_list = []

        print "正在获取[", self.home_url, "]更新电影..."
        html = self.get_html(self.home_url)

        # fp = open("66ys.html", "w")
        # fp.write(html)
        # fp.close()

        soup = BeautifulSoup(html, "lxml", from_encoding="gb18030")
        results = soup.select("html > body > div.wrap > div.tnlist > ul > li > a")
        for result in results:
            film = []
            film.append(result['href'])
            film.append(result.getText().encode('gb18030').strip())
            self.get_download_url(film)
            new_film_list.append(film)

        return new_film_list

    # 根据关键字在66影视上搜索电影
    def search_film(self, content):
        search_film_list = []
        search_url = self.home_url + "/e/search/index.php"
        print "开始搜索电影[", content, "]..."
        # print search_url
        postDict = {
            "keyboard": content,
            "show": "title,smalltext",
            "submit": "",
            "tbname": "Article",
            "tempid": "1"
        }
        postData = urllib.urlencode(postDict)
        # print postData
        cookie_jar = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
        urllib2.install_opener(opener)
        request = urllib2.Request(search_url, postData, headers=self.headers)
        response = urllib2.urlopen(request)
        opener.open(request)
        html = response.read()
        # fp = open("search.html", "w")
        # fp.write(html)
        # fp.close()
        # print content
        soup = BeautifulSoup(html, "lxml", from_encoding="gb18030")
        results = soup.select("html > body > table.tableborder > tr > td > div > b")
        if len(results) == 1:
            print "没有搜索到相关的内容"
            return None

        results = soup.select("html > body > div > div.wrap > div.mainleft > div.channellist > div.listBox > ul > li 
                              div.listInfo > h3 > a")
        # print results
        for result in results:
            film = []
            film.append(result['href'])
            film.append(result.getText().encode('gb18030').strip())
            self.get_download_url(film)
            search_film_list.append(film)
        print "共搜索到[", len(results), "]部电影。"
        return search_film_list


if __name__ == "__main__":
    spider = Spider66ys()
    # new_film_list = spider.get_new_update()
    # for film in new_film_list:
    #     for info in film:
    #         print info, "	"
    #     print ""
    content = "冰与火之歌"
    search_film_list = spider.search_film(content)
    for film in search_film_list:
        print film[1], ":"
        for info in film[2:]:
            print info
        print "-"*200
原文地址:https://www.cnblogs.com/stupid-vincent/p/6279794.html