selenium + 抓包

携程列表页数据动态加载生成,接口参数加密,使用selenium + 抓包解决

初始化selenium:配置需要通过options参数传入,desired_capabilities参数已无法使用

def dirver_init():
    """Create a Chrome WebDriver configured for scraping with network capture.

    All configuration is carried by a single ``ChromeOptions`` object that is
    passed to ``webdriver.Chrome`` via ``options=``. Performance logging is
    enabled on that object so that ``driver.get_log('performance')`` can later
    be used to inspect the network requests made by the page.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    options = webdriver.ChromeOptions()

    # Optional: route traffic through a proxy, e.g.
    #   options.add_argument('--proxy-server=http://' + ip)

    # Drop the "enable-automation" switch. Per the original author's note,
    # sites with stronger anti-scraping measures check for automation markers.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])

    # Disable browser notification pop-ups (2 = block).
    prefs = {
        'profile.default_content_setting_values': {
            'notifications': 2
        }}
    options.add_experimental_option('prefs', prefs)

    # Override the User-Agent.
    options.add_argument(
        'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"')

    # Enable the ChromeDriver performance log; without it the 'performance'
    # log type is not available to driver.get_log().
    options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})

    # Pass the options to the driver; the previous code passed an undefined
    # ``caps`` object and silently discarded everything configured above.
    return webdriver.Chrome(options=options)

动态加载:不断将滚动条滑动到页面最下方,直到页面高度不再变化

def dynamicLoading(driver, pause=3, max_scrolls=None):
    """Scroll to the bottom of the page repeatedly until it stops growing.

    After each scroll the function waits ``pause`` seconds and re-reads
    ``document.body.scrollHeight``. When the height equals the previous
    reading the bottom is considered reached and the loop ends.

    Args:
        driver: Object exposing ``execute_script(js)`` (a Selenium WebDriver).
        pause: Seconds to wait after each scroll so new content can load.
        max_scrolls: Optional upper bound on the number of scrolls; ``None``
            (the default) keeps scrolling until the height stops changing.
    """
    # History of the page height observed after each scroll.
    heights = [driver.execute_script("return document.body.scrollHeight;")]
    scrolls = 0
    while max_scrolls is None or scrolls < max_scrolls:
        print(heights)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        scrolls += 1
        time.sleep(pause)
        current_height = driver.execute_script("return document.body.scrollHeight;")
        # Same height as last time: nothing new was loaded, so stop.
        if current_height == heights[-1]:
            break
        heights.append(current_height)

判断酒店数是否>10以确定是否需要动态加载,并获取接口的响应内容

def web_selen(driver, url):
    """Open a hotel list page and print the captured search-API responses.

    The page is loaded, the hotel count is parsed from the filter title, and
    when more than 10 hotels are reported the page is scrolled to the bottom
    via ``dynamicLoading``. The performance log is then scanned for requests
    whose URL contains ``HotelSearch?testab``; each matching response body is
    fetched through the ``Network.getResponseBody`` CDP command and printed.

    The driver is always shut down before returning, including when an
    exception is raised part-way through.

    Args:
        driver: Selenium Chrome WebDriver with performance logging enabled.
        url: Address of the list page to open.

    Raises:
        IndexError: If the hotel-count heading or its number cannot be found.
        ValueError: If the captured hotel count is not an integer.
    """
    try:
        driver.get(url)
        time.sleep(3)

        # Number of hotels, read from the filter title heading.
        page = etree.HTML(driver.page_source)
        nums_content = page.xpath("//div[contains(@class, 'filter-title')]/h3/text()")[0]
        print(nums_content)
        nums = int(re.findall(r'找到(.*)家酒店', nums_content)[0])
        print(nums)

        if nums <= 10:
            return

        # Scroll so the remaining entries are loaded.
        dynamicLoading(driver)
        time.sleep(10)

        # Walk the performance log looking for the search API requests.
        for entry in driver.get_log('performance'):
            params = json.loads(entry['message'])['message']['params']
            # .get() avoids an error for log events that carry no request.
            request = params.get('request')
            if request is None:
                continue
            # Distinct name so the ``url`` argument is not overwritten.
            request_url = request.get('url')
            if not request_url or 'HotelSearch?testab' not in request_url:
                continue
            print(request_url)
            print(params['requestId'])

            try:
                content = driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': params['requestId']})
            except Exception as exc:
                # Best effort: report the failure and keep scanning the log.
                print('Network.getResponseBody failed for %s: %s' % (params['requestId'], exc))
            else:
                print(content)
            print('-------------')
    finally:
        # quit() closes every window and ends the driver session.
        driver.quit()
原文地址:https://www.cnblogs.com/zwp-627/p/14086637.html