[Python3][Crawler] bilibili photography section

0x00 Prep

  1. The girls on bilibili are super pretty (runs). No really, this is a technical post.
  2. First time scraping a dynamic site: the idea is simply to drive a real browser and let it render the page (see the sketch right after this list).
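
A minimal sketch of the idea, assuming chromedriver is already installed (section 0x01 below covers that):

from selenium import webdriver
from bs4 import BeautifulSoup

browser = webdriver.Chrome()                # launches a real Chrome window
browser.get('https://h.bilibili.com/')      # Chrome executes the page's JS for us
html = browser.page_source                  # the rendered DOM, not the raw HTTP response
soup = BeautifulSoup(html, 'html.parser')   # from here on it parses like any static page
print(soup.title)
browser.quit()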

0x01 Environment

1. Python 3, nothing more to say.
2. Fixes for the errors you may hit with selenium:

Install:
pip install selenium

Error 1:
During handling of the above exception, another exception occurred:
Fixed by installing chromedriver.

Error 2:
FileNotFoundError: [WinError 2] The system cannot find the file specified.

from selenium import webdriver
#driver = webdriver.Chrome("C:\Development\&GwjEnvironment\chromedriver.exe")  # wrong path: Windows backslashes trip up the string
driver = webdriver.Chrome("C:/Development/&GwjEnvironment/chromedriver.exe") # correct path: use forward slashes
driver.get("http://www.baidu.com")
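
Note: passing the driver path positionally like this is the Selenium 3 API. On Selenium 4 the path moved into a Service object; a sketch, assuming Selenium 4 is what's installed:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4 style: wrap the driver path in a Service object
driver = webdriver.Chrome(service=Service("C:/Development/&GwjEnvironment/chromedriver.exe"))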

Fix:
In C:\Development\Python36\Lib\subprocess.py, find the shell arguments and change them all to shell=True.
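
Editing the standard library does work, but the next Python upgrade will silently undo it. An arguably cleaner workaround (untested here) is to put chromedriver's folder on PATH before creating the driver:

import os

# prepend the chromedriver folder so the subprocess machinery can find the binary itself
os.environ['PATH'] = 'C:/Development/&GwjEnvironment' + os.pathsep + os.environ['PATH']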

3. Install the driver tool that matches your browser; for my Chrome that's chromedriver_win32.
4. For the remaining libraries, see the imports in the code.

0x02 Drop the Pics and Run

Fine, this one really isn't a technical post.

Code 2 (via the JSON API):

# Scrapes the hot cosplay photography posts on bilibili
import os
from selenium import webdriver
from bs4 import BeautifulSoup
import ssl
from time import sleep
import requests
import random
import re
import json

UserAgent_List = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
headers = {'User-Agent': random.choice(UserAgent_List),
           'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
           'Accept-Encoding': 'gzip',
           }
base_url = 'https://h.bilibili.com/eden/picture_area#/cos/hot'
pic_save_path = "output/"

# Open a browser and simulate the visits
def browser_get(pageNum):
    browser = webdriver.Chrome()
    browser.get(base_url)
    h = int(int(pageNum)/20)  # one scroll per ~20 posts requested: each scroll batch loads about 20 more
    for i in range(h):
        browser.execute_script("window.scrollBy(0,3000)")
        sleep(2)
    html_text = browser.page_source
    soup = BeautifulSoup(html_text,'html.parser')
    urls = soup.find('div',{'class':'area-wrapper'}).findAll('h3',{'class':'article-title'})
    # print(len(urls))
    count = 1
    for url in urls:
        # build the detail-API URL: strip the non-digits out of the href to get the doc_id (first time using a regex)
        test = 'https://api.vc.bilibili.com/link_draw/v1/doc/detail?doc_id='+re.sub(r'\D', '', url.a['href'])
        browser.get(test)
        js = browser.page_source # how do I get the JSON directly??? (see the requests sketch after this listing)
        # print(js)
        sleep(2) # wait for the async load to finish
        get_meizi_url(js)
        count += 1
        if count>int(pageNum):break
    browser.quit()

# Grab the girls (the pictures) out of each post's JSON
def get_meizi_url(js):
    # print(js)
    soup = BeautifulSoup(js, 'html.parser')
    text = soup.find('pre').string
    hhh = json.loads(text)
    title = ""
    for i in hhh['data']['item']['title']:
        title = title+i
    if not os.path.exists(title):
        os.makedirs(title)
    else:
        return 
    print(title)
    count = 1
    for i in hhh['data']['item']['pictures']:
        print(i['img_src'])
        qaq = re.search(r'(jpg)|(webp)|(png)|(jpeg)',i['img_src'])  # pick the real file extension out of the URL
        filename = '%s/%s/%s.%s'%(os.path.abspath('.'),title,count,qaq.group())
        with open(filename,'wb+')as qwq:
            qwq.write(requests.get(i['img_src'],headers=headers).content)
        count += 1
    return 

if __name__ == '__main__':
    ssl._create_default_https_context = ssl._create_unverified_context # dodge HTTPS certificate errors
    pageNum = input('How many sets of girl pics would you like: ')
    #if not os.path.exists(pic_save_path):
    #    os.makedirs(pic_save_path)
    browser_get(pageNum)
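
To answer the "how do I get the JSON directly???" comment in the code: the detail endpoint is a plain API, so (assuming it doesn't insist on cookies) requests can fetch and parse it with no browser at all. A sketch, with a made-up doc_id:

import requests

# hypothetical doc_id; in the script it is stripped out of each post's href
api = 'https://api.vc.bilibili.com/link_draw/v1/doc/detail?doc_id=123456'
headers = {'User-Agent': 'Mozilla/5.0'}           # any plausible UA, as in UserAgent_List above
data = requests.get(api, headers=headers).json()  # parsed JSON straight away, no <pre> scraping needed
print(data['data']['item']['title'])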

Code 1 (pure HTML):

# Scrapes the hot cosplay photography posts on bilibili
# BUG: the filename line below hard-codes .jpg, but bilibili mixes formats at random; it needs a regex match like the one in Code 2. TODO.
import os
from selenium import webdriver
from bs4 import BeautifulSoup
import ssl
from time import sleep
import requests
import random

UserAgent_List = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
headers = {'User-Agent': random.choice(UserAgent_List),
           'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
           'Accept-Encoding': 'gzip',
           }
base_url = 'https://h.bilibili.com/eden/picture_area#/cos/hot'
pic_save_path = "output/"

# Open a browser and simulate the visits
def browser_get(pageNum):
    browser = webdriver.Chrome()
    browser.get(base_url)
    h = int(int(pageNum)/20)  # one scroll per ~20 posts requested: each scroll batch loads about 20 more
    for i in range(h):
        browser.execute_script("window.scrollBy(0,3000)")
        sleep(2)
    html_text = browser.page_source
    soup = BeautifulSoup(html_text,'html.parser')
    urls = soup.find('div',{'class':'area-wrapper'}).findAll('h3',{'class':'article-title'})
    # print(len(urls))
    count = 1
    for url in urls:
        browser.get('https://'+url.a['href'])
        sleep(2) # wait for the async load to finish
        html = browser.page_source
        get_meizi_url(html)
        count += 1
        if count>int(pageNum):break
    browser.quit()

# Grab the girls (the pictures) out of each post's page
def get_meizi_url(html):
    # print(html)
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.find('h1',attrs={'class':'article-title dp-i-block v-middle'}).string
    if not os.path.exists(title):
        os.makedirs(title)
    print(title)
    href = soup.find('div',attrs={'class':'images'}).findAll('img')
    count = 1
    for a in href:
        print(a['src'])
        filename = '%s/%s/%s.jpg'%(os.path.abspath('.'),title,count)  # BUG: hard-coded .jpg, see the note at the top
        with open(filename,'wb+')as qwq:
            qwq.write(requests.get(a['src'],headers=headers).content)
        count += 1

if __name__ == '__main__':
    ssl._create_default_https_context = ssl._create_unverified_context # dodge HTTPS certificate errors
    pageNum = input('How many sets of girl pics would you like: ')
    #if not os.path.exists(pic_save_path):
    #    os.makedirs(pic_save_path)
    browser_get(pageNum)    
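
For the TODO at the top of Code 1: the hard-coded .jpg breaks on webp/png posts. The regex trick from Code 2 plugs the hole; a minimal sketch on a made-up URL:

import re

src = 'https://i0.hdslb.com/bfs/article/example.webp'   # hypothetical image URL
qaq = re.search(r'(jpg)|(webp)|(png)|(jpeg)', src)
ext = qaq.group() if qaq else 'jpg'                     # fall back to jpg if nothing matches
print('%s.%s' % (1, ext))                               # prints: 1.webp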

ok

Anyway, I stole the pics and made my getaway. If the code breaks, contact the blogger; any experts willing to help me dig through the JS are welcome.

Original post: https://www.cnblogs.com/gwj1314/p/9444888.html