案例
import logging
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys  # keyboard key constants

logger = logging.getLogger(__name__)

# Output template for one product; fields are name, link, price, comment count.
# NOTE(review): the line breaks inside this literal appear to have been lost
# when the file was flattened onto one line -- confirm the intended layout.
GOODS_TEMPLATE = ''' 商品 : %s 链接 : %s 价钱 :%s 评论 :%s '''

# Safety cap on the number of result pages walked by get_goods(). The earlier
# recursive implementation was implicitly bounded by the interpreter's
# recursion limit; this makes that bound explicit.
DEFAULT_MAX_PAGES = 1000


def _print_page_goods(driver):
    """Print every complete product card on the page currently loaded."""
    for good in driver.find_elements(By.CLASS_NAME, 'gl-item'):
        try:
            detail_url = good.find_element(By.TAG_NAME, 'a').get_attribute('href')
            p_name = good.find_element(By.CSS_SELECTOR, '.p-name em').text.replace(' ', '')
            price = good.find_element(By.CSS_SELECTOR, '.p-price i').text
            p_commit = good.find_element(By.CSS_SELECTOR, '.p-commit a').text
        except NoSuchElementException:
            # A card without one of the expected fields is skipped instead of
            # aborting the whole crawl.
            continue
        print(GOODS_TEMPLATE % (p_name, detail_url, price, p_commit), end=' ')


def get_goods(driver, max_pages=DEFAULT_MAX_PAGES):
    """Print the products of the current result page and of the following pages.

    After each page the link whose text contains '下一页' (next page) is
    clicked. Crawling stops when that link is missing, when a WebDriver error
    occurs, or after ``max_pages`` pages.

    Args:
        driver: A WebDriver instance already showing a search result page.
        max_pages: Maximum number of pages to print; ``None`` disables the cap.
    """
    pages_done = 0
    while True:
        try:
            _print_page_goods(driver)
            pages_done += 1
            if max_pages is not None and pages_done >= max_pages:
                return
            driver.find_element(By.PARTIAL_LINK_TEXT, '下一页').click()
        except NoSuchElementException:
            # No next-page link: this was the last page.
            return
        except WebDriverException as exc:
            # Stay best-effort (stop crawling), but report the reason instead
            # of swallowing it silently.
            logger.warning("Stopping after %d page(s): %s", pages_done, exc)
            return
        # Give the next page a moment to start rendering.
        time.sleep(1)


def spider(url, keyword):
    """Open ``url`` in headless Chrome, search for ``keyword`` and print results.

    Args:
        url: Page to open; it must contain a search input with id ``key``.
        keyword: Text typed into the search input before pressing ENTER.
    """
    chrome_options = webdriver.ChromeOptions()
    # Run Chrome in headless (no visible window) mode.
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        driver.implicitly_wait(3)  # implicit wait for element lookups
        input_tag = driver.find_element(By.ID, 'key')
        input_tag.send_keys(keyword)
        input_tag.send_keys(Keys.ENTER)
        get_goods(driver)
    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window.
        driver.quit()


if __name__ == '__main__':
    spider('https://www.jd.com/', keyword='java')
# Import libraries
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# Default location of the chromedriver executable.
# NOTE(review): "E:chromedriver.exe" has no backslash after the drive letter,
# which Windows resolves relative to the current directory on drive E: --
# confirm whether r"E:\chromedriver.exe" was intended.
DEFAULT_DRIVER_PATH = r"E:chromedriver.exe"


def jd_crawler(keyword='ps5国行', driver_path=DEFAULT_DRIVER_PATH):
    """Search jd.com for ``keyword`` and print each product's title and price.

    Args:
        keyword: Search text typed into the site's search box.
        driver_path: Path to the chromedriver executable.
    """
    # executable_path tells Selenium where the driver binary is stored.
    # NOTE(review): Selenium 4 deprecates this keyword in favour of a Service
    # object; kept here to match the Selenium version the script targets.
    browser = webdriver.Chrome(executable_path=driver_path)
    try:
        # Open the JD home page.
        browser.get('https://www.jd.com/')
        # Locate the search box and type the keyword.
        search = browser.find_element(By.XPATH, '//*[@id="key"]')
        search.send_keys(keyword)
        # Locate the search button and click it.
        browser.find_element(By.XPATH, '//*[@id="search"]/div/div[2]/button').click()
        # Scroll to the bottom of the page so that all items get loaded.
        javascript = "var q=document.documentElement.scrollTop=50000"
        browser.execute_script(javascript)
        # Wait 3 seconds; some asynchronously loaded data arrives slowly.
        time.sleep(3)
        # Read title and price per product entry rather than zipping two
        # page-wide lists: one product lacking a field would otherwise shift
        # every following title/price pair.
        for item in browser.find_elements(By.XPATH, '//*[@id="J_goodsList"]/ul/li'):
            names = item.find_elements(By.XPATH, './div/div[3]/a/em')
            prices = item.find_elements(By.XPATH, './div/div[2]/strong/i')
            # zip() yields nothing when either field is absent, so incomplete
            # entries are skipped.
            for name, price in zip(names, prices):
                print(name.text.replace(' ', ''), price.text)
    finally:
        # Always quit the browser, even if a lookup above raised.
        browser.quit()


if __name__ == '__main__':
    jd_crawler()
https://python-selenium-zh.readthedocs.io/zh_CN/latest/
https://www.cnblogs.com/0bug/p/12165370.html
https://www.cnblogs.com/TankXiao/p/5222238.html#step