python爬虫-豆瓣电影top250

一.python爬虫简介
1.什么是爬虫:
网络爬虫,是一种按照一定规则,自动抓取互联网信息的程序或者脚本。由于互联网数据的多样性和资源的有限性,根据用户需求定向抓取相关网页并分析已成为如今主流的爬取策略。
2.爬虫的作用:
可用于抓取网络图片、爬取想看的视频等;只要是能通过浏览器访问到的数据,都可以通过爬虫获取。
3.爬虫的本质:
模拟浏览器打开网页,获取网页中我们想要的那部分数据

二.爬取数据
1.urllib模块使用

import socket
import urllib.error
import urllib.parse
import urllib.request

# GET request: fetch the Baidu home page and decode the body as UTF-8.
# The with-block closes the HTTP response once it has been read.
with urllib.request.urlopen("http://www.baidu.com") as response:
    print(response.read().decode("utf-8"))

# POST request: passing url-encoded bytes as `data` makes urlopen send a POST.
data = bytes(urllib.parse.urlencode({"hello":"world"}),encoding="utf-8")
with urllib.request.urlopen("http://httpbin.org/post",data=data) as response:
    print(response.read().decode("utf-8"))

# Timeout handling. A timeout while connecting is reported as URLError, but a
# timeout while waiting for / reading the response raises socket.timeout
# directly, so both are caught here.
try:
    with urllib.request.urlopen("http://httpbin.org/get",timeout=1) as response:
        print(response.read().decode("utf-8"))
except (urllib.error.URLError, socket.timeout):
    print("time out")

# Inspect the response status code and headers.
with urllib.request.urlopen("http://www.baidu.com") as response:
    print(response.status)
    print(response.getheaders())

# Request Douban while presenting a browser User-Agent header.
url = "http://www.douban.com"
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
}
#data = bytes(urllib.parse.urlencode({"name":"eric"}),encoding="utf-8")
req = urllib.request.Request(url=url,headers=headers,method="POST")
with urllib.request.urlopen(req) as response:
    print(response.read().decode("utf-8"))

2.实例-数据获取

# Fetch the content of the page at a given URL.
def askURl(url):
    """Return the body of ``url`` decoded as UTF-8, or "" if the request fails.

    The request carries a desktop-Chrome User-Agent header.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text. When urlopen raises URLError the error's
        ``code`` and/or ``reason`` (whichever exist) are printed and an empty
        string is returned.

    Raises:
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    head={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"}
    request = urllib.request.Request(url,headers=head)
    # Bind the result up front so the error path has something to return.
    html = ""
    try:
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e,"code"):
            print(e.code)
        if hasattr(e,"reason"):
            print(e.reason)
    return html

三.解析数据
1.BeautifulSoup模块

#!/usr/bin/python3
# @DESC: BeautifulSoup4 turns a complex HTML document into a tree of Python objects.
#        Every node is one of four kinds: Tag, NavigableString, BeautifulSoup, Comment.
import re
from bs4 import BeautifulSoup

# Read the saved page as bytes and decode it explicitly; the with-block
# closes the file as soon as the content has been read.
with open("./baidu.html","rb") as file:
    html = file.read().decode("utf-8")
bs = BeautifulSoup(html,"html.parser")
# 1. Tag: attribute access returns the first matching tag together with its content.
print(bs.title) # first <title> tag
print(bs.a)     # first <a> tag
print(bs.head)  # the <head> tag

# 2. NavigableString: the text held by a tag.
print(bs.title.string)  # string inside <title>
print(bs.a.attrs)  # all attributes of the first <a> tag
print(bs.a.string) # string inside the first <a> tag

# 3. BeautifulSoup: the object that represents the whole document.
print(bs.name)
print(bs.attrs)
print(bs)

# 4. Comment: a special NavigableString; its printed content omits the comment markers.

# 5. Walking the document.
print(bs.head.contents)
print(bs.head.contents[0])

# 6. Searching the document.
# 6.1 find_all() with a string filter: matches tags whose name equals the string exactly.
t_list = bs.find_all("a") # every <a> tag
print(t_list)

# 6.2 Regular-expression filter: tag names are tested with the pattern's search().
t_list = bs.find_all(re.compile("a"))
print(t_list)

# 6.3 Function filter: keep the tags for which the function returns True.
def name_is_exists(tag):
    """Return True when ``tag`` carries a ``name`` attribute."""
    return tag.has_attr("name")
t_list = bs.find_all(name_is_exists)
for item in t_list:
    print(item)

# 6.4 Keyword-argument filters.
t_list = bs.find_all(id="head",name=True,limit=3)
#t_list = bs.find_all(text="贴吧")
for item in t_list:
    print(item)

# 6.5 CSS selectors (only the result of the last select() is printed below).
t_list = bs.select('title')  # by tag name
t_list = bs.select('#u1')   # by id
t_list = bs.select(".mnav")   # by class name
t_list = bs.select("a[class]")   # <a> tags that have a class attribute
t_list = bs.select(".mnav ~ .bri")   # later siblings of .mnav that match .bri
for item in t_list:
    print(item)

2.re模块

import re
# Compiled pattern object + search().
pat = re.compile("AA")  # "AA" is the regular expression the other strings are checked against
m = pat.search("CBA")  # the argument of search() is the string being checked
m = pat.search("ABCAA")  # the argument of search() is the string being checked
m = pat.search("BAACABCAA")  # the argument of search() is the string being checked
print(m) # Match object for the first occurrence; its span is half-open: [start, end)
# search() without a precompiled pattern object.
m = re.search("asd","Aasd")  # first argument is the pattern, second the string being checked
#print(m)
# findall(): return every match as a list.
print(re.findall("a","ASDaDEFGAa")) # first argument is the pattern, second the string being checked
print(re.findall("[A-Z]","ASDaDEFGAa")) # each uppercase letter separately
print(re.findall("[A-Z]+","ASDaDEFGAa")) # runs of consecutive uppercase letters
# Capture the text between two markers with a non-greedy group (.*?).
print(re.findall("AS(.*?)Aa","ASDaDEFGAa"))
# sub(): regex replacement.
print(re.sub("a","A","abcdcasd"))  # replace every "a" with "A" in the third argument
print(re.sub("\n","","ab\ndca\nsd"))  # strip newlines
# Tip: write regex patterns as raw strings (r"...") so backslashes need no extra escaping.

3.实例-数据解析

# Compiled regular expressions describing the fragments pulled out of each movie item.
findLink = re.compile(r'<a href="(.*?)">')   # link to the movie's detail page
findImgSrc = re.compile(r'<img.*src="(.*?)"',re.S)   # poster image URL; re.S lets "." also match newlines
findTitle = re.compile(r'<span class=".*">(.*?)</span>') # movie titles
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>') # rating score
# The digit class must be written as \d; a bare "d" only matches the letter d,
# so the count was never found. The misspelled name ("fing") is kept because
# getData refers to this constant by that name.
fingCommentNum = re.compile(r'<span>(\d*)人评价</span>')  # number of people who rated
findInq = re.compile(r'<span class="inq">(.*?)</span>') # one-line summary quote
findBD = re.compile(r'<p class="">(.*?)</p>',re.S) # details paragraph

# Crawl the list pages and parse every movie entry.
def getData(baseurl):
    """Download the ten list pages and extract one record per movie.

    Args:
        baseurl: URL prefix; the start offsets 0, 25, ..., 225 are appended to it.

    Returns:
        A list of records. Each record is a list of nine strings:
        detail link, image link, three title fields (padded with "" when
        fewer than three title spans are found), rating, number of ratings,
        one-line summary ("" when absent) and the details paragraph.

    Raises:
        IndexError: If an item has no match for the link, image, rating,
            rating-count or details pattern.
    """
    datalist = []
    for page in range(10):      # fetch ten pages
        url = baseurl + str(page*25)
        html = askURl(url)
        soup = BeautifulSoup(html,"html.parser")
        for item in soup.find_all("div",class_="item"): # one <div class="item"> per movie
            data = [] # all fields of one movie
            item = str(item)
            # The fields are pulled out of the item's HTML text with the module-level patterns.
            link = re.findall(findLink,item)[0]  # detail-page link
            data.append(link)
            imgSrc = re.findall(findImgSrc,item)[0] # image URL
            data.append(imgSrc)
            titles = re.findall(findTitle,item) # title spans, possibly several
            # A separate index name keeps the page counter above untouched.
            for idx in range(3):
                if idx < len(titles):
                    # strip separators: "/", spaces and non-breaking spaces
                    res = titles[idx].replace("/","").replace(" ","").replace("\xa0","")
                else:
                    res = ""
                data.append(res)
            rating = re.findall(findRating, item)[0] # rating score
            data.append(rating)
            commentNum = re.findall(fingCommentNum, item)[0] # number of ratings
            data.append(commentNum)
            inq = re.findall(findInq, item) # one-line summary, may be missing
            if inq:
                data.append(inq[0].replace(".","").replace(" ",""))
            else:
                data.append("")
            bd = re.findall(findBD, item)[0] # details paragraph
            bd = re.sub(r'<br(\s+)?/>(\s+)?',"",bd)  # drop <br/> and the whitespace after it
            bd = re.sub('/',"",bd)
            bd = re.sub('\xa0',"",bd)  # drop non-breaking spaces
            bd = re.sub(' ',"",bd)
            data.append(bd.strip())

            datalist.append(data) # store the finished record
    return datalist

四.保存数据
1.xlwt模块

import xlwt
workbook = xlwt.Workbook(encoding="utf-8") # create the workbook object
worksheet = workbook.add_sheet('sheet1') # create a worksheet
worksheet.write(0,0,'hello') # write a cell: 1st argument is the row, 2nd the column, 3rd the content
workbook.save('student.xls') # save the workbook to a file

2.sqlite3模块

import sqlite3
# 1. Open the database file (it is created when it does not exist yet).
conn = sqlite3.connect("test.db")
# Author's IDE tip: install the "Database Navigator" plugin and restart PyCharm.
print("Opened database successfully")
c = conn.cursor()  # get a cursor

# 2. Create the table.
sql_creatTabel = '''
    create table if not exists company
        (id int primary key not null,
        name text not null,
        age int not null,
        address char(50),
        salary real);
'''
c.execute(sql_creatTabel)  # run the SQL statement
conn.commit()  # commit the change
#conn.close()   # close the connection
print("Creat table successfully")
# 3. Insert rows. "insert or replace" keeps the script re-runnable now that
#    id really is a primary key (a plain insert would fail on the second run).
sql_insertData1 = '''
    insert or replace into company(id,name,age,address,salary)
    values(1,'张三',35,'南京',10000);
'''
sql_insertData2 = '''
    insert or replace into company(id,name,age,address,salary)
    values(2,'李四',27,'北京',15000);
'''
c.execute(sql_insertData1)
c.execute(sql_insertData2)
conn.commit()  # commit the inserts
print("Insert Data successfully")

# 4. Query the rows.
sql_queryData = ' select * from company '
cursor = c.execute(sql_queryData)
for row in cursor:
    # Column order of the table: id, name, age, address, salary.
    print("id=",row[0],end="")
    print("name=",row[1],end="")
    print("address=",row[3],end="")
    print("salary=",row[4],end="\n")
print("Query Data successfully")
conn.close()

3.实例-数据xls

# Save the scraped records to an .xls workbook.
def saveData(datalist,savepath):
    """Write ``datalist`` to an Excel sheet and save it at ``savepath``.

    Row 0 holds the nine column headers; each record follows on its own row.
    Every record in ``datalist`` is written, however many there are, and at
    most nine values per record are used.

    Args:
        datalist: Sequence of records, each a sequence of cell values.
        savepath: Path of the .xls file to write.
    """
    print("save......")
    book = xlwt.Workbook(encoding="utf8",style_compression=0)
    sheet = book.add_sheet("豆瓣电影Top250",cell_overwrite_ok=True)
    col = ('电影详情链接',"图片链接","名片1","名片2","名片3","评分","评价数","概括","相关信息")
    for i, title in enumerate(col):
        sheet.write(0,i,title) # header row
    # Iterate over what was actually scraped instead of assuming exactly 250 rows.
    for i, data in enumerate(datalist):
        print("第%d条"%(i+1))
        for j, value in enumerate(data[:len(col)]):
            sheet.write(i+1,j,value)
    book.save(savepath)    # write the file

4.实例-数据保存DB

# Initialise the database.
def init_db(dbpath):
    """Create the ``movie250`` table in the SQLite file at ``dbpath`` if it is missing.

    Args:
        dbpath: Path of the SQLite database file; sqlite3.connect creates it
            when it does not exist.
    """
    sql = '''
        create table if not exists movie250(
        id integer primary key autoincrement,
        info_link text,
        pic_link text,
        name1 varchar,
        name2 varchar,
        name3 varchar,
        score numeric,
        rated numeric,
        instroduction text,
        info text
        )
    ''' # table definition
    conn = sqlite3.connect(dbpath)
    try:
        cursor = conn.cursor()
        cursor.execute(sql)
        conn.commit()
    finally:
        # Close the connection even when the statement fails.
        conn.close()

# Save the scraped records into the database.
def saveData2DB(datalist, dbpath):
    """Insert every record of ``datalist`` into the ``movie250`` table.

    The table is created first via init_db. Values are bound as SQL
    parameters, so quotes inside the scraped text cannot break or alter the
    statement, and the caller's records are left unmodified. All rows are
    committed together after the last insert.

    Args:
        datalist: Sequence of records; each record must supply nine values in
            the order info_link, pic_link, name1, name2, name3, score, rated,
            instroduction, info.
        dbpath: Path of the SQLite database file.

    Raises:
        sqlite3.ProgrammingError: If a record does not have exactly nine values.
    """
    init_db(dbpath)
    sql = '''
        insert into movie250(
        info_link,pic_link,name1,name2,name3,score,rated,instroduction,info)
        values(?,?,?,?,?,?,?,?,?)'''
    conn = sqlite3.connect(dbpath)
    try:
        cur = conn.cursor()
        cur.executemany(sql, datalist)
        conn.commit()
        cur.close()
    finally:
        # Release the connection even when an insert fails.
        conn.close()

五.完整源码

#!/usr/bin/python3
# -*- coding:utf-8 -*-
# @Time:2021/8/21 11:43
# @author: Mrwhite
# @File:spiderdouban250.py
# @DESC:

from bs4 import BeautifulSoup    #网页解析,获取数据
import re      #正则表达式 进行文字匹配
import urllib.request,urllib.error   #制定URL,获取网页数据
import xlwt    #进行excel操作
import sqlite3 #进行数据库操作

def main():
    """Scrape the Douban Top 250 list and store the records in SQLite."""
    # Output locations; the .xls path is only needed for the Excel export below.
    savepath = "豆瓣电影Top250.xls"
    dbpath = "movie.db"

    # Steps 1-2: download and parse the list pages. getData appends the
    # start offset to this base URL.
    baseurl = "https://movie.douban.com/top250?start="
    movies = getData(baseurl)

    # Step 3: persist the result. For an Excel export instead, call
    # saveData(movies, savepath).
    saveData2DB(movies, dbpath)

# Compiled regular expressions describing the fragments pulled out of each movie item.
findLink = re.compile(r'<a href="(.*?)">')   # link to the movie's detail page
findImgSrc = re.compile(r'<img.*src="(.*?)"',re.S)   # poster image URL; re.S lets "." also match newlines
findTitle = re.compile(r'<span class=".*">(.*?)</span>') # movie titles
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>') # rating score
# The digit class must be written as \d; a bare "d" only matches the letter d,
# so the count was never found. The misspelled name ("fing") is kept because
# getData refers to this constant by that name.
fingCommentNum = re.compile(r'<span>(\d*)人评价</span>')  # number of people who rated
findInq = re.compile(r'<span class="inq">(.*?)</span>') # one-line summary quote
findBD = re.compile(r'<p class="">(.*?)</p>',re.S) # details paragraph

# Crawl the list pages and parse every movie entry.
def getData(baseurl):
    """Download the ten list pages and extract one record per movie.

    Args:
        baseurl: URL prefix; the start offsets 0, 25, ..., 225 are appended to it.

    Returns:
        A list of records. Each record is a list of nine strings:
        detail link, image link, three title fields (padded with "" when
        fewer than three title spans are found), rating, number of ratings,
        one-line summary ("" when absent) and the details paragraph.

    Raises:
        IndexError: If an item has no match for the link, image, rating,
            rating-count or details pattern.
    """
    datalist = []
    for page in range(10):      # fetch ten pages
        url = baseurl + str(page*25)
        html = askURl(url)
        soup = BeautifulSoup(html,"html.parser")
        for item in soup.find_all("div",class_="item"): # one <div class="item"> per movie
            data = [] # all fields of one movie
            item = str(item)
            # The fields are pulled out of the item's HTML text with the module-level patterns.
            link = re.findall(findLink,item)[0]  # detail-page link
            data.append(link)
            imgSrc = re.findall(findImgSrc,item)[0] # image URL
            data.append(imgSrc)
            titles = re.findall(findTitle,item) # title spans, possibly several
            # A separate index name keeps the page counter above untouched.
            for idx in range(3):
                if idx < len(titles):
                    # strip separators: "/", spaces and non-breaking spaces
                    res = titles[idx].replace("/","").replace(" ","").replace("\xa0","")
                else:
                    res = ""
                data.append(res)
            rating = re.findall(findRating, item)[0] # rating score
            data.append(rating)
            commentNum = re.findall(fingCommentNum, item)[0] # number of ratings
            data.append(commentNum)
            inq = re.findall(findInq, item) # one-line summary, may be missing
            if inq:
                data.append(inq[0].replace(".","").replace(" ",""))
            else:
                data.append("")
            bd = re.findall(findBD, item)[0] # details paragraph
            bd = re.sub(r'<br(\s+)?/>(\s+)?',"",bd)  # drop <br/> and the whitespace after it
            bd = re.sub('/',"",bd)
            bd = re.sub('\xa0',"",bd)  # drop non-breaking spaces
            bd = re.sub(' ',"",bd)
            data.append(bd.strip())

            datalist.append(data) # store the finished record
    return datalist
# Fetch the content of the page at a given URL.
def askURl(url):
    """Return the body of ``url`` decoded as UTF-8, or "" if the request fails.

    The request carries a desktop-Chrome User-Agent header.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text. When urlopen raises URLError the error's
        ``code`` and/or ``reason`` (whichever exist) are printed and an empty
        string is returned.

    Raises:
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    head={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"}
    request = urllib.request.Request(url,headers=head)
    # Bind the result up front so the error path has something to return.
    html = ""
    try:
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e,"code"):
            print(e.code)
        if hasattr(e,"reason"):
            print(e.reason)
    return html

# Save the scraped records to an .xls workbook.
def saveData(datalist,savepath):
    """Write ``datalist`` to an Excel sheet and save it at ``savepath``.

    Row 0 holds the nine column headers; each record follows on its own row.
    Every record in ``datalist`` is written, however many there are, and at
    most nine values per record are used.

    Args:
        datalist: Sequence of records, each a sequence of cell values.
        savepath: Path of the .xls file to write.
    """
    print("save......")
    book = xlwt.Workbook(encoding="utf8",style_compression=0)
    sheet = book.add_sheet("豆瓣电影Top250",cell_overwrite_ok=True)
    col = ('电影详情链接',"图片链接","名片1","名片2","名片3","评分","评价数","概括","相关信息")
    for i, title in enumerate(col):
        sheet.write(0,i,title) # header row
    # Iterate over what was actually scraped instead of assuming exactly 250 rows.
    for i, data in enumerate(datalist):
        print("第%d条"%(i+1))
        for j, value in enumerate(data[:len(col)]):
            sheet.write(i+1,j,value)
    book.save(savepath)    # write the file

# Initialise the database.
def init_db(dbpath):
    """Create the ``movie250`` table in the SQLite file at ``dbpath`` if it is missing.

    Args:
        dbpath: Path of the SQLite database file; sqlite3.connect creates it
            when it does not exist.
    """
    sql = '''
        create table if not exists movie250(
        id integer primary key autoincrement,
        info_link text,
        pic_link text,
        name1 varchar,
        name2 varchar,
        name3 varchar,
        score numeric,
        rated numeric,
        instroduction text,
        info text
        )
    ''' # table definition
    conn = sqlite3.connect(dbpath)
    try:
        cursor = conn.cursor()
        cursor.execute(sql)
        conn.commit()
    finally:
        # Close the connection even when the statement fails.
        conn.close()

# Save the scraped records into the database.
def saveData2DB(datalist, dbpath):
    """Insert every record of ``datalist`` into the ``movie250`` table.

    The table is created first via init_db. Values are bound as SQL
    parameters, so quotes inside the scraped text cannot break or alter the
    statement, and the caller's records are left unmodified. All rows are
    committed together after the last insert.

    Args:
        datalist: Sequence of records; each record must supply nine values in
            the order info_link, pic_link, name1, name2, name3, score, rated,
            instroduction, info.
        dbpath: Path of the SQLite database file.

    Raises:
        sqlite3.ProgrammingError: If a record does not have exactly nine values.
    """
    init_db(dbpath)
    sql = '''
        insert into movie250(
        info_link,pic_link,name1,name2,name3,score,rated,instroduction,info)
        values(?,?,?,?,?,?,?,?,?)'''
    conn = sqlite3.connect(dbpath)
    try:
        cur = conn.cursor()
        cur.executemany(sql, datalist)
        conn.commit()
        cur.close()
    finally:
        # Release the connection even when an insert fails.
        conn.close()

if __name__ == "__main__":  # true only when this file is run as a script
    # run the crawler, then report completion
    main()
    print("爬取完毕")
原文地址:https://www.cnblogs.com/mrwhite2020/p/15169293.html