爬虫 selenium 基础操作

Driver下载地址

chromedriver

selenium

Driver 的常用 API

基本

from selenium import webdriver
executable_path = "chromedriver"
driver = webdriver.Chrome(executable_path=executable_path)
# List the handle of every open window/tab (these identify windows, not frames).
for window in driver.window_handles:
    print(window)
# Switch into a frame, addressed here by its index.
driver.switch_to.frame(1)
# Go one step forward in the browser history.
driver.forward()
# Go one step back in the browser history.
driver.back()
# Type text into an input box.
driver.find_element_by_id("kw").send_keys("new")
# Look up an element on the page.
element = driver.find_element_by_id("element")
# Print the text contained in the element.
print(element.text)
# Read one of the element's attributes.
element.get_attribute('id')

# Set the value of <input> elements through JavaScript.
# The values below are placeholders: never commit real account credentials
# to source. They are passed as script arguments instead of being spliced
# into the JavaScript text, so quotes in a value cannot break the script.
username = "your-username"
password = "your-password"
driver.execute_script("arguments[0].value = arguments[1];", driver.find_element_by_id("nameNormal"), username)
driver.execute_script("arguments[0].value = arguments[1];", driver.find_element_by_id("pwdNormal"), password)

Select

from selenium.webdriver.support.ui import Select
from selenium import webdriver
executable_path = "chromedriver"
driver = webdriver.Chrome(executable_path=executable_path)
# Wrap a <select> drop-down element so its options can be chosen.
select = Select(driver.find_element_by_id("select"))
# Choose the first option; option indexes are zero-based, so the first is 0.
select.select_by_index(0)
# Choose the option whose value attribute is "new".
select.select_by_value("new")
# Choose the option whose visible text is "new".
select.select_by_visible_text("new")
# Add a cookie.
# NOTE: the browser must already be on a page of the cookie's domain
# (call driver.get(...) first), otherwise the driver rejects the cookie.
driver.add_cookie({"name": "name", "value": "value", "path": "/"})
# Print every cookie visible to the current page.
for cookie in driver.get_cookies():
    print(cookie)

页面等待

显式等待

显式等待:指定某个条件并设置最长等待时间;如果超过这个时间条件仍未满足(例如没有找到该元素),便会抛出 TimeoutException 异常。

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from time import sleep

try:  # Marker that the form's AJAX load has finished: a <select> whose id contains "yui_3_16".
    # Wait up to 10 seconds for the marker element; a timeout raises and
    # control jumps straight to the finally clause.
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//select[contains(@id,"yui_3_16")]')))
    # Wrap the province drop-down in a Select and pick an option by its text.
    s1 = Select(driver.find_element(By.NAME, 'province'))
    s1.select_by_visible_text("山西")
    # Pause 5 seconds (presumably so the city list can reload after the
    # province changes; confirm against the target page).
    sleep(5)
    # Wrap the city drop-down in a Select and pick an option by its text.
    s2 = Select(driver.find_element(By.NAME, 'city'))
    s2.select_by_visible_text("大同")
    sleep(5)  # seconds
    # "btn btn-mini" is two class names. A class-name locator accepts only a
    # single class, so match both classes with a CSS selector instead.
    driver.find_element(By.CSS_SELECTOR, '.btn.btn-mini').submit()
finally:
    # Always close the browser, whether or not the steps above succeeded.
    print('end')
    driver.quit()

隐式等待

隐式等待的目的是让 WebDriver 在查找某个或某类元素时预留一定的时间进行检查。
在这个时间内,如果找到元素就立即返回;否则一直等到超过设置的时间,并抛出 NoSuchElementException 异常表示没有找到。

# Implicit wait: element lookups on this driver may take up to this many
# seconds before giving up.
implicit_wait_seconds = 30
driver.implicitly_wait(implicit_wait_seconds)
# Open the page, then look up an element by its id.
start_url = "https://www.google.co.in/"
driver.get(start_url)
driver.find_element_by_id("lst-ib")

设置User-Agent和Proxy

# Create the Chrome options object first; the arguments below are added to it.
option = webdriver.ChromeOptions()
# Set the User-Agent header.
option.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36')
# Route traffic through a proxy server.
option.add_argument("--proxy-server=http://localhost:8001")
# Start Chrome with the options applied.
driver = webdriver.Chrome(r"chromedriver.exe", options=option)

设置防止JS检测

# A fairly practical way to avoid being detected as an automated browser:
# make window.navigator.webdriver read as undefined.
hide_webdriver_js = """
    Object.defineProperty(navigator, 'webdriver', {
      get: () => undefined
    })
  """
# Register the script through the DevTools protocol command
# Page.addScriptToEvaluateOnNewDocument.
driver.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    {"source": hide_webdriver_js},
)
原文地址:https://www.cnblogs.com/iFanLiwei/p/12853187.html