Web Scraping (Part 2)

Copy the request headers from the browser developer tools so the crawler can disguise itself as a browser and scrape data more reliably.

#!/usr/bin/env python
# -*- encoding:UTF-8 -*-

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
}
try:
    res = requests.get('http://bj.xiaozhu.com/', headers=headers)    # pass the request headers to get()
    print(res.text)
except requests.exceptions.ConnectionError:
    print('Connection refused')


# Parsing the response with BeautifulSoup gives a structured Soup document, which is easier to work with than the raw text above
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
}
try:
    res = requests.get('http://bj.xiaozhu.com/', headers=headers)    # pass the request headers to get()
    soup = BeautifulSoup(res.text, 'html.parser')
    print(soup.prettify())
except requests.exceptions.ConnectionError:
    print('Connection refused')

Updated: extract the price with a CSS selector copied from the browser. Note that the copied selector still contains li:nth-child(1), so it points only at the first listing:

price = soup.select('#page_list > ul > li:nth-child(1) > div.result_btm_con.lodgeunitname > div:nth-child(1) > '
                    'span.result_price > i')

Full code (with :nth-child(1) removed from the li so every listing on the page is matched):

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 '
                  'Safari/537.36 '
}
res = requests.get('http://bj.xiaozhu.com/', headers=headers)  # pass the request headers to get()

soup = BeautifulSoup(res.text, 'html.parser')
# Locate the elements and extract them with the selector
prices = soup.select(
    '#page_list > ul > li > div.result_btm_con.lodgeunitname > div:nth-child(1) > span.result_price > i')
for price in prices:
    print(price.get_text())
    # print(price) would print the element together with its tags

Scraping short-term rental listings for the Beijing area:

import random
import time

import requests
from bs4 import BeautifulSoup

# Add request headers to disguise the crawler as a browser
headers = {
    # User-Agent copied from the Chrome developer tools
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}


# Determine the host's gender from the CSS class of the avatar icon
def judgment_sex(class_name):
    if class_name == ['member_ico1']:
        # member_ico1 appears to be the class used for the female avatar icon
        return 'female'
    else:
        return 'male'


# Collect the detail-page URLs from one listing page
def get_links(url):
    try:
        wb_date = requests.get(url, headers=headers)    # headers must be passed as a keyword argument
    except requests.exceptions.ConnectionError:
        print('Connection refused')
        return
    soup = BeautifulSoup(wb_date.text, 'lxml')
    links = soup.select('#page_list > ul > li > a')
    for link in links:
        href = link.get("href")
        get_info(href)


# Scrape the details from one listing page
def get_info(url):
    wb_date = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_date.text, 'lxml')
    # Selectors copied from the browser ("Copy selector")
    titles = soup.select('div.pho_info > h4')
    addresses = soup.select('span.pr5')
    prices = soup.select('#pricePart > div.day_l > span')
    images = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
    names = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
    sexs = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')
    for title, address, price, image, name, sex in zip(titles, addresses, prices, images, names, sexs):
        data = {
            'title': title.get_text().strip(),
            'address': address.get_text().strip(),
            'price': price.get_text(),
            'image': image.get("src"),
            'name': name.get_text(),
            'sex': judgment_sex(sex.get("class"))
        }
        print(data)


if __name__ == '__main__':

    urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(number) for number in range(1, 14)]
    for single_url in urls:
        get_links(single_url)
        # sleep 10-13 seconds between pages to avoid getting the IP banned
        time.sleep(random.randint(10, 13))

# Drawback: there is no proxy/IP management; relying only on long sleeps is slow
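
A possible improvement on the sleep-only approach is to route requests through a small proxy pool. The sketch below is only an illustration: the proxy addresses in PROXY_POOL and the helper get_with_proxy are placeholders I made up, not part of the original script, and the addresses would have to be replaced with working proxies.

import random
import requests

# Hypothetical proxy pool -- replace these placeholder addresses with real proxies
PROXY_POOL = [
    'http://127.0.0.1:8001',
    'http://127.0.0.1:8002',
]

def get_with_proxy(url, headers):
    """Fetch a URL through a randomly chosen proxy from the pool."""
    proxy = random.choice(PROXY_POOL)
    # requests takes a proxies dict mapping the URL scheme to the proxy address
    return requests.get(url, headers=headers,
                        proxies={'http': proxy, 'https': proxy}, timeout=10)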

Scraping the KuGou TOP chart, version 1.0:

#!/usr/bin/env python
# -*- encoding:UTF-8 -*-

from bs4 import BeautifulSoup
import requests
import time,random

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
}

def get_info(url):
    """Fetch one chart page and print rank, singer, song and duration."""
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist>ul>li>a')
    times = soup.select('span.pc_temp_tips_r>span')
    # name the loop variable "duration" so it does not shadow the time module
    for rank, title, duration in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],
            'song': title.get_text().split('-')[1],
            'time': duration.get_text().strip()
        }
        print(data)



if __name__ == '__main__':
    """Main entry point"""
    urls = ['https://www.kugou.com/yy/rank/home/{}-8888.html'.format(i) for i in range(1, 25)]
    for url in urls:
        get_info(url)
        # sleep between pages so the requests are not sent too quickly
        time.sleep(random.randint(3, 5))

Scraping the KuGou TOP chart, version 1.1 (simpler class-based selectors for the song name and duration):

#!/usr/bin/env python
# -*- encoding:UTF-8 -*-

from bs4 import BeautifulSoup
import requests
import time,random

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
}

def get_info(url):
    """Fetch one chart page and print rank, singer, song and duration."""
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('a.pc_temp_songname')
    times = soup.select('span.pc_temp_time')
    # name the loop variable "duration" so it does not shadow the time module
    for rank, title, duration in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],
            'song': title.get_text().split('-')[1],
            'time': duration.get_text().strip()
        }
        print(data)



if __name__ == '__main__':
    """Main entry point"""
    urls = ['https://www.kugou.com/yy/rank/home/{}-8888.html'.format(i) for i in range(1, 25)]
    for url in urls:
        get_info(url)
        # sleep between pages so the requests are not sent too quickly
        time.sleep(random.randint(3, 5))

Scraping the prices with a regular expression:

import re
import requests
res = requests.get('http://bj.xiaozhu.com/')
prices = re.findall('<span class="result_price">&#165;<i>(.*?)</i>起/晚</span>', res.text)
for price in prices:
    print(price)

Note:

<span class="result_price">¥<i>488</i>起/晚</span>
In the rendered page, ¥ and &#165; are the same character, but the raw HTML returned by requests contains the numeric entity &#165;, so the regular expression must match &#165; rather than the literal ¥.
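
A quick check with the standard-library html module illustrates the difference, assuming res.text contains the snippet above:

import html

raw = '<span class="result_price">&#165;<i>488</i>起/晚</span>'  # what the raw HTML actually contains
print(html.unescape(raw))
# <span class="result_price">¥<i>488</i>起/晚</span>  -- the entity becomes ¥ only after unescaping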
Original post: https://www.cnblogs.com/King-boy/p/10901389.html