爬虫(urllib, BeautifulSoup的基本使用)

代码如下:

完成了从 HTML 代码中提取所需信息的操作

import ssl
from bs4 import BeautifulSoup
import re
import urllib.request, urllib.error
import xlwt
import sqlite3

def main():
    """Entry point: crawl the paginated Douban Top 250 listing."""
    # Douban is served over HTTPS; skip certificate verification for this
    # simple scraper so urlopen does not fail on unverified chains.
    ssl._create_default_https_context = ssl._create_unverified_context
    base_url = 'https://movie.douban.com/top250?start='
    getData(base_url)

# Pre-compiled extraction patterns for one <div class="item"> movie entry.
# re.S (DOTALL) lets ".*?" span line breaks inside the HTML snippet.
findLink = re.compile(r'<a href="(.*?)">', re.S)          # detail-page URL
findImg = re.compile(r'src="(.*?)"', re.S)                # poster image URL
findTitle = re.compile(r'<span class="title">(.*?)</span>', re.S)  # title(s); a movie may have 1 or 2
findIto = re.compile(r'<p class="">(.*?)</p>', re.S)      # director/cast/year info paragraph
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>', re.S)  # average score
findInq = re.compile(r'<span class="inq">(.*?)</span>', re.S)  # one-line quote (may be absent)
# Crawl all pages of the listing
def getData(baseUrl):
    """Scrape all 10 result pages (25 movies each) reachable from *baseUrl*.

    Returns a list of records, one per movie:
    [link, image_url, title, alt_title, info, rating, quote]
    where alt_title is ' ' when the movie has a single title and quote is
    '' when no one-line quote is present.
    """
    dlist = []
    for page in range(0, 10):
        url = baseUrl + str(page * 25)  # ?start=0, 25, 50, ...
        html = askUrl(url)
        # Parse one page of results
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            content = []
            item = str(item)
            # Detail-page link
            link = re.findall(findLink, item)[0]
            content.append(link)
            # Poster image URL (first src attribute within the item)
            pic = re.findall(findImg, item)
            content.append(pic[0] if pic else '')
            # Chinese title plus optional foreign title
            title = re.findall(findTitle, item)
            content.append(title[0])
            content.append(title[1] if len(title) > 1 else ' ')
            # Info paragraph: drop newlines, the literal "...<br/>" ellipsis
            # marker, and all spaces (dots escaped so they are not wildcards)
            ito = re.findall(findIto, item)[0]
            ito = re.sub(r'\n', '', ito)
            ito = re.sub(r'\.\.\.<br/>', '', ito)
            ito = re.sub(r' ', '', ito)
            content.append(ito)
            # Average rating
            rating = re.findall(findRating, item)[0]
            content.append(rating)
            # One-line quote; some movies have none
            inq = re.findall(findInq, item)
            content.append(inq[0] if inq else '')
            dlist.append(content)
    print(len(dlist))
    return dlist
def askUrl(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Returns an empty string on failure; the HTTP status code and/or the
    failure reason are printed for diagnosis.
    """
    head = {
        # Mimic a desktop browser so the server does not reject the request
        "User-Agent": "Mozilla / 5.0(Macintosh;IntelMacOSX10_15_3) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 81.0.4044.122Safari / 537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # Context manager closes the connection instead of leaking it
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# Run the scraper only when executed as a script, not on import
if __name__ == "__main__":
    main()





原文地址:https://www.cnblogs.com/jackson1/p/12776794.html