selenium最初是一个自动化测试工具,而爬虫中使用它主要是为了解决requests无法直接执行JavaScript代码的问题。selenium本质是通过驱动浏览器,完全模拟浏览器的操作,比如跳转、输入、点击、下拉等,来拿到网页渲染之后的结果,可支持多种浏览器。如下:
from selenium import webdriver

# Selenium can drive several browsers; each line below is an alternative
# way to create the driver object -- pick the one matching your browser.
browser = webdriver.Chrome()  # opens a visible Google Chrome window
browser = webdriver.Firefox()
# Headless (no UI) browser.
# NOTE: PhantomJS support is deprecated in later Selenium releases.
browser = webdriver.PhantomJS()
browser = webdriver.Safari()
browser = webdriver.Edge()
一、使用安装
selenium+chromedriver安装:
#1、安装selenium:
pip3 install selenium

#2、下载chromedriver.exe
下载chromedriver.exe放到python安装路径的scripts目录中即可
镜像链接:http://npm.taobao.org/mirrors/chromedriver/2.29/
官网链接:https://sites.google.com/a/chromium.org/chromedriver/downloads

#验证安装
C:\Users\Administrator>python3
Python 3.6.1 (v3.6.1:69c0db5, Mar 21 2017, 18:41:36) [MSC v.1900 64 bit (AMD64)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> from selenium import webdriver
>>> driver=webdriver.Chrome() #弹出浏览器
>>> driver.get('https://www.baidu.com')
>>> driver.page_source

#注意:
selenium3默认支持的webdriver是Firefox,而Firefox需要安装geckodriver
下载链接:https://github.com/mozilla/geckodriver/releases
selenium+phantomjs安装:
#1、下载selenium
pip3 install selenium

#2、下载phantomjs
下载phantomjs,解压后把phantomjs.exe所在的bin目录放到环境变量
镜像链接:http://npm.taobao.org/dist/phantomjs/
下载链接:http://phantomjs.org/download.html

#验证安装
C:\Users\Administrator>phantomjs
phantomjs> console.log('egon gaga')
egon gaga
undefined
phantomjs> ^C
C:\Users\Administrator>python3
Python 3.6.1 (v3.6.1:69c0db5, Mar 21 2017, 18:41:36) [MSC v.1900 64 bit (AMD64)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> from selenium import webdriver
>>> driver=webdriver.PhantomJS() #无界面浏览器
>>> driver.get('https://www.baidu.com')
>>> driver.page_source
二、基本应用实例
from selenium import webdriver
from selenium.webdriver import ActionChains  # mouse actions
from selenium.webdriver.common.by import By  # lookup strategy, e.g. By.ID, By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC  # ready-made wait conditions
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to load
import time

# Basic example: search on Baidu, wait for the result container, then dump
# the rendered page, the current URL and the cookies.
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')

    input_tag = browser.find_element_by_id('kw')
    input_tag.send_keys('美女')  # on Python 2, prefix non-ASCII strings with u
    input_tag.send_keys(Keys.ENTER)  # press Enter to submit the search

    # Wait (at most 10 seconds) until the element with id "content_left"
    # is present in the DOM.
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))

    print(browser.page_source)  # rendered page content
    print(browser.current_url)  # URL of the current page
    print(browser.get_cookies())  # [{}, {}, {}...]

    time.sleep(10)
finally:
    # Always shut the browser down, otherwise it keeps running in the
    # background. quit() ends the whole driver session; close() would only
    # close the current window.
    browser.quit()
三、选择器介绍
1、基本选择器
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time

# Basic element locators demonstrated below:
#   1. find_element_by_id
#   2. find_element_by_link_text
#   3. find_element_by_partial_link_text
#   4. find_element_by_tag_name
#   5. find_element_by_class_name
#   6. find_element_by_name
#   7. find_element_by_css_selector
#   8. find_element_by_xpath
# NOTE: these helpers belong to the Selenium 3 API; Selenium 4.3+ removed
# them in favour of driver.find_element(By.ID, ...) and friends.

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 3)  # explicit wait helper
try:
    driver.get("https://www.baidu.com/")

    # 1. find_element_by_id: fetch the search input box by its id.
    input_tag = driver.find_element_by_id("kw")
    print(input_tag.tag_name)
    print(input_tag.get_attribute('name'))
    print(input_tag.text)

    # 2. find_element_by_link_text: locate a link by its full visible text.
    login_btn = driver.find_element_by_link_text("登录")
    login_btn.click()

    # 3. find_element_by_partial_link_text: locate a link by part of its text.
    login_btn = driver.find_element_by_partial_link_text("登")
    login_btn.click()

    # 4. find_element_by_class_name
    user_login = driver.find_element_by_class_name("tang-pass-footerBarULogin")
    # The click behaviour of this element is attached through an event
    # binding, so wait for it in one of the two ways below before clicking.
    # Option 1 (the locator is passed as a tuple):
    user_login = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, "tang-pass-footerBarULogin"))
    )
    # Option 2 (the locator is passed as a tuple):
    user_login = wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, "tang-pass-footerBarULogin"))
    )
    user_login.click()

    # 5. find_element_by_name
    user_input = driver.find_element_by_name("userName")
    pswd_input = driver.find_element_by_name("password")
    ensure_btn = driver.find_element_by_id("TANGRAM__PSP_10__submit")

    user_input.send_keys("骑猪走秀")
    pswd_input.send_keys("xxxxxxxxxxx")
    ensure_btn.click()

    time.sleep(10)
finally:
    # quit() ends the driver session; close() would only close the window.
    driver.quit()
2、xpath选择器
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # lookup strategy: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to load
import time

driver = webdriver.Chrome()
driver.implicitly_wait(3)  # implicit wait
try:
    driver.get("https://doc.scrapy.org/en/latest/_static/selectors-sample1.html")

    # 1. "//" versus "/"
    # Absolute path: html -> child body -> child div -> child a.
    tag = driver.find_element_by_xpath("/html/body/div/a")
    print(tag.text)
    print(tag.get_attribute("href"))
    print(tag.tag_name)

    tag = driver.find_elements_by_xpath("//a")  # every <a> in the document, as a list
    print(tag)

    tag = driver.find_element_by_xpath("//a")  # first <a> in the document
    print(tag.text)

    tag = driver.find_elements_by_xpath("//div//a")  # all <a> descendants of any <div>
    # The same lookup written as a CSS selector.
    tag = driver.find_elements_by_css_selector("div a")
    print(len(tag))

    # 2. Select by position
    tag = driver.find_elements_by_xpath("//div//a[5]")  # the 5th <a>
    print(tag[0].text)  # find_elements_* always returns a list, even for a single match

    tag = driver.find_element_by_xpath("//div//a[5]")  # same element, returned directly
    print(tag.text)

    # 3. Select by attribute: the three lookups below are meant to return
    #    the same link on this sample page.
    tag1 = driver.find_element_by_xpath('//a[@href="image4.html"]')
    tag2 = driver.find_element_by_xpath('//a[4]')
    tag3 = driver.find_element_by_xpath('//a[contains(@href,"image4")]')  # substring match
    print(tag1.text)
    print(tag2.text)
    print(tag3.text)

    # 4. Other forms
    # Any element whose class attribute is exactly "xxxxx".
    driver.find_elements_by_xpath('//*[@class="xxxxx"]')
    # <div> elements carrying both classes. The class attribute is a single
    # string, so chaining two equality tests ([@class="xxxxx"][@class="yyyyy"])
    # can never match; test for each class name with contains() instead.
    # (contains() is a substring test, so it also matches longer class names
    # that merely include the given text.)
    driver.find_elements_by_xpath(
        '//div[contains(@class,"xxxxx") and contains(@class,"yyyyy")]'
    )
    # The <a> whose child <img> has src="image2_thumb.jpg".
    print(driver.find_element_by_xpath('//a[img/@src="image2_thumb.jpg"]').text)
    # ".." steps up to the parent node.
    print(driver.find_element_by_xpath('//a//..').tag_name)
    print([tag.tag_name for tag in driver.find_elements_by_xpath('//img//..')])

    img = driver.find_element_by_xpath('//img')
    print(img.location)  # position, e.g. {'x': 8, 'y': 30}
    print(img.size)  # dimensions, e.g. {'height': 16, 'width': 16}

    time.sleep(5)
finally:
    # quit() ends the driver session; close() would only close the window.
    driver.quit()
3、获取标签属性
通过如上选择器拿到标签对象tag,可以通过如下形式获得标签的相关属性值,如下:
# `tag` is an element object obtained through one of the selectors above.
print(tag.get_attribute('src'))  # value of an HTML attribute of the element
# Selenium's internal reference ID for the element -- this is NOT the HTML
# id attribute; use tag.get_attribute('id') for that.
print(tag.id)
print(tag.location)  # position of the element on the page
print(tag.tag_name)  # tag name of the element
print(tag.size)  # size of the element
4、等待元素加载
selenium只是模拟浏览器的行为,而浏览器解析页面是需要时间的(执行css,js),一些元素可能需要过一段时间才能加载出来,为了保证能查找到元素,必须等待。在选择器的基本介绍中,我们分别用到了两种等待元素加载的方式,分别为隐式等待和显式等待。分别总结如下:
(1)隐式等待:在browser.get('xxx')前就设置,针对所有元素有效
from selenium import webdriver

browser = webdriver.Chrome()
# Implicit wait: set once, before loading the page. From then on every
# element lookup keeps retrying for up to 10 seconds if the element has
# not been loaded yet.
browser.implicitly_wait(10)
browser.get('https://www.baidu.com')
(2)显式等待:在browser.get('xxx')之后设置,只针对某个元素有效
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')

# Explicit wait: created after the page is requested and applied to one
# specific element -- block (up to 10 seconds) until it is present.
wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
四、元素交互操作
1、输入、点击、清空
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time

# Typing into an input, submitting with Enter, and clearing the input.
driver = webdriver.Chrome()
driver.implicitly_wait(3)
try:
    driver.get("https://www.jd.com/")

    input_tag = driver.find_element_by_id('key')
    input_tag.send_keys('情趣用品')  # type text into the search box
    input_tag.send_keys(Keys.ENTER)  # Keys.ENTER is the Enter key
    time.sleep(3)

    # Look the input up again after the search was submitted, then replace
    # its content.
    input_tag = driver.find_element_by_id('key')
    input_tag.clear()  # remove the current text
    input_tag.send_keys('苍进空')
    input_tag.send_keys(Keys.ENTER)
    time.sleep(5)
finally:
    # quit() ends the driver session; close() would only close the window.
    driver.quit()
2、ActionChains操作
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time

# Create the driver BEFORE entering try: if start-up fails there is nothing
# to clean up, and the finally clause must not hit an unbound `driver`
# (which would replace the real error with a NameError).
driver = webdriver.Chrome()
try:
    driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    driver.implicitly_wait(3)
    # The demo lives inside an iframe; elements of a nested page cannot be
    # found from the outer page, so switch into the frame first.
    driver.switch_to.frame('iframeResult')
    # driver.switch_to.parent_frame()  # switches back to the parent page

    # Approach 1: one action chain that drags from source to target in a
    # single move.
    source = driver.find_element_by_id('draggable')
    target = driver.find_element_by_id('droppable')
    actions = ActionChains(driver)
    actions.drag_and_drop(source, target).perform()  # nothing happens until perform()

    # Approach 2: separate ActionChains objects, which lets the horizontal
    # displacement be controlled step by step.
    source = driver.find_element_by_id('draggable')
    target = driver.find_element_by_id('droppable')
    distance = target.location['x'] - source.location['x']

    ActionChains(driver).click_and_hold(source).perform()  # press and hold
    # Single-step alternative to the loop below:
    # ActionChains(driver).move_by_offset(xoffset=distance,yoffset=0).perform()
    # ActionChains(driver).release().perform()
    moved = 0
    while moved < distance:
        ActionChains(driver).move_by_offset(xoffset=1, yoffset=0).perform()  # move in small steps
        moved += 1
    ActionChains(driver).release().perform()  # release the mouse once in place

    time.sleep(5)
finally:
    # quit() ends the driver session; close() would only close the window.
    driver.quit()
五、应用实例
爬取jd商品信息
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time


def _describe_good(good):
    """Return the summary text (name, link, price, comments) for one product element."""
    detail_url = good.find_element_by_tag_name('a').get_attribute('href')
    god_name = good.find_element_by_css_selector(".p-name em").text.replace(" ", "")
    god_price = good.find_element_by_css_selector(".p-price i").text
    comment_num = good.find_element_by_css_selector(".p-commit a").text
    return ' 商品 : %s 链接 : %s 价钱 :%s 评论 :%s ' % (
        god_name, detail_url, god_price, comment_num)


def get_gods(driver):
    """Scrape every result page reachable from the driver's current page.

    For each page, the summary of every ".gl-item" product is appended to
    the file "gods.text" and printed; then the current page number is
    printed and the "下一页" (next page) link is clicked.

    Pages are walked with a loop rather than one recursive call per page,
    so a long result list cannot exhaust the recursion limit. The walk ends
    when Selenium raises (for example a required element is not found), in
    which case "出错啦" is printed and the function returns. Errors that do
    not come from Selenium (such as a failure to write the file) propagate
    to the caller.

    Args:
        driver: a Selenium WebDriver already showing a search result page.
    """
    try:
        while True:
            goods = driver.find_elements_by_class_name("gl-item")
            # Open the output file once per page instead of once per product.
            with open("gods.text", 'a', encoding="utf-8") as f:
                for good in goods:
                    msg = _describe_good(good)
                    f.write(msg)
                    print(msg, end=' ')

            page_num = driver.find_element_by_css_selector("#J_bottomPage .curr").text
            print(page_num)

            next_page = driver.find_element_by_partial_link_text('下一页')
            next_page.click()
            time.sleep(10)  # give the next page time to load
    except WebDriverException:
        print("出错啦")


def main(keyword):
    """Open JD, search for *keyword* and scrape the result pages.

    Args:
        keyword: the search text typed into the site's search box.
    """
    driver = webdriver.Chrome()
    driver.implicitly_wait(3)
    try:
        driver.get("https://www.jd.com/")
        input_tag = driver.find_element_by_id("key")
        input_tag.send_keys(keyword)
        input_tag.send_keys(Keys.ENTER)
        get_gods(driver)
    finally:
        # quit() ends the driver session; close() would only close the window.
        driver.quit()


if __name__ == '__main__':
    main("振动棒")
163邮箱自动发送邮件:
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time

# Log in to 163 mail, open the compose view, fill in a message and send it.
browser = webdriver.Chrome()
browser.implicitly_wait(5)
try:
    browser.get('http://mail.163.com/')

    # The login form is inside an iframe, so switch into it first.
    frame = browser.find_element_by_id('x-URS-iframe')
    browser.switch_to.frame(frame)

    # Result unused: with the implicit wait set above, this lookup blocks
    # until the element is found (or the wait times out).
    browser.find_element_by_css_selector('.m-container')

    inp_user = browser.find_element_by_name('email')
    inp_pwd = browser.find_element_by_name('password')
    button = browser.find_element_by_id('dologin')
    inp_user.send_keys('用户名')
    time.sleep(2)
    inp_pwd.send_keys('密码')
    time.sleep(2)
    button.click()

    # If a captcha shows up, uncomment the lines below to leave time for
    # solving it by hand before clicking login again.
    # import time
    # time.sleep(10)
    # button = browser.find_element_by_id('dologin')
    # button.click()

    # Result unused: waits for the top navigation bar, as above.
    browser.find_element_by_id('dvNavTop')
    write_msg = browser.find_elements_by_css_selector('#dvNavTop li')[1]  # the "compose" button
    write_msg.click()

    # Result unused: waits for this element, as above.
    browser.find_element_by_class_name('tH0')
    recv_man = browser.find_element_by_class_name('nui-editableAddr-ipt')
    title = browser.find_element_by_css_selector('.dG0 .nui-ipt-input')
    recv_man.send_keys('收件地址')
    title.send_keys('爬虫练习')
    print(title.tag_name)

    # The message body editor is inside its own iframe.
    frame = browser.find_element_by_class_name('APP-editor-iframe')
    browser.switch_to.frame(frame)
    body = browser.find_element(By.CSS_SELECTOR, 'body')
    body.send_keys('我是机器,testing中。。。请勿回复')

    browser.switch_to.parent_frame()  # back to the parent frame
    send_button = browser.find_element_by_class_name('nui-toolbar-item')
    send_button.click()
    time.sleep(10)
except Exception as e:
    print(e)
finally:
    # quit() ends the driver session; close() would only close the window.
    browser.quit()
# NOTE(review): stray fragment -- `user_login` is not defined at this point in the document; presumably this line belongs to the login-button example in the basic selectors section (after one of the explicit waits) -- confirm placement.
user_login.click()