1.安装与入门
pip3 install selenium
将chromedriver放到一个没有权限要求的目录
# Install a driver, open a page, and dump its rendered HTML.
from selenium import webdriver

# Raw string keeps the Windows path backslashes literal (they were lost
# in the collapsed original "D:chromedriver...").
driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"

# Initialize a driver
driver = webdriver.Chrome(executable_path=driverpath)
# Request the page
driver.get("https://www.baidu.com/")
# Get the page's HTML via page_source
print(driver.page_source)
2.关闭页面
a. driver.close() #关闭当前页面
b. driver.quit() #退出整个浏览器
# Demonstrates driver.quit() (whole browser) vs driver.close() (current tab).
from selenium import webdriver
import time

driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driverpath)
driver.get("https://www.baidu.com/")
for i in range(5):
    time.sleep(1)
    print("沉睡%s秒" % i)
driver.quit()  # 或者driver.close()
3.定位元素
- find_element_by_id:通过ID进行匹配查找,只返回匹配到的一个元素
- find_element_by_name:通过name进行匹配查找,只返回匹配到的一个元素
- find_element_by_xpath:通过xpath进行匹配查找,只返回匹配到的一个元素
- find_element_by_link_text:通过链接内容进行匹配查找,只返回匹配到的一个元素
- find_element_by_partial_link_text:通过部分链接内容进行匹配查找,只返回匹配到的一个元素
- find_element_by_tag_name:通过标签名称进行匹配查找,只返回匹配到的一个元素
- find_element_by_class_name:通过class名称进行匹配查找,只返回匹配到的一个元素
- find_element_by_css_selector:通过CSS选择器进行匹配查找,只返回匹配到的一个元素
值得注意的是,上面的方法只会返回匹配到的第一个元素。除了上面这些查找单个元素的方法之外,Selenium还定义了查找多个元素的方法:
- find_elements_by_name:通过name进行匹配查找,返回所有匹配到的元素列表
- find_elements_by_xpath:通过xpath进行匹配查找,返回所有匹配到的元素列表
- find_elements_by_link_text:通过链接内容进行匹配查找,返回所有匹配到的元素列表
- find_elements_by_partial_link_text:通过部分链接内容进行匹配查找,返回所有匹配到的元素列表
- find_elements_by_tag_name:通过标签名称进行匹配查找,返回所有匹配到的元素列表
- find_elements_by_class_name:通过class名称进行匹配查找,返回所有匹配到的元素列表
- find_elements_by_css_selector:通过CSS选择器进行匹配查找,返回所有匹配到的元素列表
除了上面给出的公有方法之外,Selenium还提供了两种私有方法可能对页面对象中的定位器有用,这两个私有方法是:find_element
和find_elements
:
from selenium.webdriver.common.by import By

# Generic locator API: find_element returns the first match,
# find_elements returns a (possibly empty) list of all matches.
element = driver.find_element(By.XPATH, '//*[@id="su"]')
elements = driver.find_elements(By.XPATH, '//button')
下面是By
可用的属性:
# Actual values of the By class constants (the original text had Chinese
# placeholders; Selenium defines them as the strings below).
ID = 'id'
NAME = 'name'
XPATH = 'xpath'
LINK_TEXT = 'link text'
PARTIAL_LINK_TEXT = 'partial link text'
TAG_NAME = 'tag name'
CLASS_NAME = 'class name'
CSS_SELECTOR = 'css selector'
通过ID定位
当你知道元素的ID属性时,你可以使用它,使用此策略,将返回ID属性值与该匹配的第一个元素。如果没有元素匹配到此ID属性,将会引发NoSuchElementException错误,举个栗子,有这样一个数据源:
<input id="login_id" type="text"/>
我们知道了这个元素的ID,我们可以这样进行定位:
element = driver.find_element_by_id('login_id')
案例:
# Locate Baidu's search box by id and type into it.
from selenium import webdriver

driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driverpath)
driver.get("https://www.baidu.com/")
input_tag = driver.find_element_by_id("kw")
print(input_tag)
input_tag.send_keys("何足道")
通过name定位
当你知道元素的name属性时,你可以使用它,使用此策略,将返回name属性值与该匹配的第一个元素。如果没有元素匹配到此name属性,将会引发NoSuchElementException错误,举个栗子,有这样一个数据源:
<input name="login" type="text"/>
我们知道了这个元素的name,我们可以这样进行定位:
element = driver.find_element_by_name('login')
案例:
# Locate Baidu's search box by its name attribute.
from selenium import webdriver

driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driverpath)
driver.get("https://www.baidu.com")
input_tag = driver.find_element_by_name("wd")
print(input_tag)
input_tag.send_keys("python")
通过XPath定位
XPath是用于在XML文档中查找节点的语言,由于XML可以是HTML的实现,因此Selenium用户可以利用这种强大的语言来定位其Web应用程序中的元素。XPath扩展了通过id和name属性定位的简单方法,并打开了各种新的可能性,例如在页面中查找第三个复选框。举个栗子,有这样一个数据源:
<html> <body> <form id="loginForm"> <input name="username" type="text" /> <input name="password" type="password" /> <input name="continue" type="submit" value="Login" /> <input name="continue" type="button" value="Clear" /> </form> </body> </html>
我们可以这样进行定位:
user_element = driver.find_element_by_xpath("/html/body/form/input[@name='username']")
or
user_element = driver.find_element_by_xpath("/html/body/form/input")
or
user_element = driver.find_element_by_xpath('//input[1]')
or
user_element = driver.find_element_by_xpath("//input[@name='username'][@type='text']")
案例:
# Locate the search box with an XPath combining two attribute predicates.
from selenium import webdriver

driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driverpath)
driver.get("https://www.baidu.com/")
input_tag = driver.find_element_by_xpath('//input[@name="wd"][@id="kw"]')
print(input_tag)
input_tag.send_keys("python")
通过链接内容定位
当你知道链接元素的内容时,你可以使用它,使用此策略,将返回链接内容与该匹配到的第一个元素。如果没有元素匹配到此链接内容,将会引发NoSuchElementException错误,举个栗子,有这样一个数据源:
<html> <body> <p>Are you sure you want to do this?</p> <a href="continue.html" rel="external nofollow" rel="external nofollow" >Continue</a> <a href="cancel.html" rel="external nofollow" >Cancel</a> </body> </html>
其中<a href="continue.html" rel="external nofollow" rel="external nofollow" >Continue</a>
元素可以这样定位:
element = driver.find_element_by_link_text('Continue')
or
# The method is named "partial", not "partical".
element = driver.find_element_by_partial_link_text('Con')
通过标签名称定位
当你知道标签的名称时,你可以使用它,使用此策略,将返回标签名称与该匹配到的第一个元素。如果没有元素匹配到此标签名称时,将会引发NoSuchElementException错误,举个栗子,有这样一个数据源:
<html>
<body>
<h1>Welcome</h1>
<p>Site content goes here.</p>
</body>
</html>
我们可以这样对<p>
元素进行定位:
# This section locates by tag name; the sample HTML has no class "content",
# so use find_element_by_tag_name on the <p> element.
element = driver.find_element_by_tag_name('p')
案例:
# Locate the search box by its CSS class name.
from selenium import webdriver

driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driverpath)
driver.get("https://www.baidu.com/")
input_tag = driver.find_element_by_class_name("s_ipt")
print(input_tag)
input_tag.send_keys("python")
通过CSS选择器进行定位
当你想通过CSS选择器语法找到一个元素时,你可以使用它,使用此策略,将返回匹配CSS选择器的第一个元素。如果没有元素匹配到此CSS选择器时,将会引发NoSuchElementException错误,举个栗子,有这样一个数据源:
<html> <body> <p class="content">Site content goes here.</p> </body> </html>
我们可以这样对<p>
元素进行定位:
element = driver.find_element_by_css_selector('body p')
案例:
# Locate the search box with a CSS selector.
from selenium import webdriver

driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driverpath)
driver.get("https://www.baidu.com/")
input_tag = driver.find_element_by_css_selector(".quickdelete-wrap >input")
print(input_tag)
input_tag.send_keys("python")
4.selenium操作表单
a.操作input
# Operate an <input>: type a value, wait, then clear it.
from selenium import webdriver
import time

driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driverpath)
driver.get("https://www.baidu.com/")
input_tag = driver.find_element_by_id("kw")
input_tag.send_keys("python")  # type a value into the input
time.sleep(3)
input_tag.clear()  # clear the input
b.操作CheckBox,因为要选中checkbox标签,在网页中是通过鼠标点击的,因此想要选中checkbox,那么选中这个标签,然后执行click事件,示例代码如下:
# Tick a checkbox: locate it, then fire a click event.
from selenium import webdriver

driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driverpath)
driver.get("https://www.douban.com/")
checkbox_tag = driver.find_element_by_id("form_remember")
checkbox_tag.click()  # click the checkbox once it is found
c.选择select,select元素不能直接点击,因为点击后还需要选中元素,这时候selenium就专门为select标签提供了一个类,selenium.webdriver.support.ui.Select,将获取到的元素当成参数传到这个类中,创建这个对象,以后就可以使用这个对象进行选择了,示例代码如下:
# Drive a <select> element via the Select helper class.
from selenium import webdriver
from selenium.webdriver.support.ui import Select

driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driverpath)
driver.get("http://www.dobai.cn/")
selectBtn = Select(driver.find_element_by_name("jumpMenu"))
selectBtn.select_by_index(1)                       # select by index
selectBtn.select_by_value("http://m.95xiu.com")    # select by value attribute
selectBtn.select_by_visible_text("95秀客户端")      # select by visible text
d.操作按钮:操作按钮有很多种方式,比如单击,双击,右击,这里讲一个最常用的,就是点击,直接调用click函数就可以了,实例代码如下
# Click a button: fill the search box, then click the submit button.
from selenium import webdriver

driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driverpath)
driver.get("https://baidu.com/")
input_tag = driver.find_element_by_id("kw")
input_tag.send_keys("python")
submit_tag = driver.find_element_by_id("su")
submit_tag.click()
5.行为链
有时候在页面中的操作可能要有很多步,那么这时候可以使用鼠标行为链ActionChains来完成。比如现在要将鼠标移动到某个元素上并执行点击事件
# Queue several mouse/keyboard steps on an ActionChains object;
# nothing runs until perform() is called.
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains

driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driverpath)
driver.get('https://www.baidu.com')
input_tag = driver.find_element_by_id("kw")
submit_btn = driver.find_element_by_id("su")
actions = ActionChains(driver)
actions.move_to_element(input_tag)              # move to an element
actions.send_keys_to_element(input_tag, "python")  # type into the element
actions.move_to_element(submit_btn)
actions.click()                                 # click
actions.perform()                               # execute the queued actions
更多鼠标操作系统行为:
1。 click_and_hold(element) 点击但不松开
2. context_click(element) 右键点击
3. double_click(element) 双击
6.操作cookie
# Cookie API overview (assumes a live `driver` and a cookie name `key`):
# Get all cookies:
for cookie in driver.get_cookies():
    print(cookie)
# Get one cookie by its name:
value = driver.get_cookie(key)
# Delete all cookies:
driver.delete_all_cookies()
# Delete one cookie by name:
driver.delete_cookie(key)
# Enumerate, read, and clear cookies on baidu.com.
from selenium import webdriver

driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driverpath)
driver.get('https://www.baidu.com')
# Get all cookies
for cookie in driver.get_cookies():
    print(cookie)
# Get a single cookie by name
print("-" * 100)
print(driver.get_cookie("PSTM"))
driver.delete_all_cookies()
7.selenium的隐式等待和显式等待
a.显式等待 ,一个显式等待是你定义的一段代码,用于等待某个条件发生然后再继续执行后续代码。显式等待是等元素加载!!!
# Explicit wait: block up to 10s until a specific element is present.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driverpath)
driver.get("https://www.baidu.com")
try:
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "kw"))
    )
finally:
    driver.quit()
b.隐式等待,相当于设置全局的等待,在定位元素时,对所有元素设置超时时间。隐式等待是等页面加载,而不是元素加载!!!(隐式等待就是针对页面的,显式等待是针对元素的。)
# Implicit wait: a global timeout applied to every element lookup.
from selenium import webdriver

driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driverpath)
driver.get("https://www.baidu.com")
driver.implicitly_wait(10)
input_tag = driver.find_element_by_id("kw")
显式等待更节约时间,推荐使用
8.selenium打开多窗口和切换
# Open a second window via JS and switch the driver to it.
from selenium import webdriver

driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driverpath)
driver.get("https://www.baidu.com")
driver.execute_script("window.open('https://www.douban.com')")
print(driver.current_url)       # URL of the window the driver is on
print(driver.window_handles)    # handles of all open windows, in open order
# NOTE: switch_to_window is deprecated; newer code uses driver.switch_to.window
driver.switch_to_window(driver.window_handles[1])  # switch to the 2nd window
print(driver.current_url)       # now the new window's URL
注:虽然在窗口中切换到新的页面,但是driver中还没有切换,如果想要在代码中切换到新页面,应该使用driver.switch_to_window来切换到指定的窗口,driver.window_handles是一个列表,里面装的是窗口的句柄,它会按照打开页面的顺序来存储窗口的句柄
9.selenium通过代理IP
# Route Chrome's traffic through an HTTP proxy via a command-line switch.
from selenium import webdriver

driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"
options = webdriver.ChromeOptions()
options.add_argument("--proxy-server=http://140.143.164.107:8888")
driver = webdriver.Chrome(executable_path=driverpath, chrome_options=options)
driver.get("http://httpbin.org/ip")
10.WebElement元素
from selenium.webdriver.remote.webelement import WebElement类是每个获取出来的元素的所属类
一些常用的属性: 1.get_attribute:这个标签的某个属性的值 2.screenshot:获取当前页面的截图,这个方法只能在driver上使用,driver的对象类,也是继承自WebElement
实例:
# WebElement basics: read an attribute and take a page screenshot.
from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement

driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driverpath)
driver.get("https://www.baidu.com")
submit_tag = driver.find_element_by_id("su")
print(type(submit_tag))
print(submit_tag.get_attribute("value"))
driver.save_screenshot("baidu.png")
常用的接口方法
submit #提交表单,特别用于没有提交按钮的情况,例如,搜索框中输入内容后回车操作
# submit(): submits the enclosing form, e.g. search boxes without a button.
from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement

driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driverpath)
driver.get("http://www.youdao.com")
input_tag = driver.find_element_by_id("translateContent")
input_tag.send_keys("python")
input_tag.submit()  # acts like pressing Enter; submit() and click() are often interchangeable
注意:submit()和click()通常可以互换,submit()也可以点击按钮
size:返回元素尺寸
text :返回元素文本
get_attribute(name):获得属性值
is_displayed() :该元素是否用户可见
# Demonstrate size, text, and is_displayed() on WebElements.
from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement

driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driverpath)
driver.get("https://www.baidu.com")
# Size of the search button
input_tag = driver.find_element_by_id("su")
print(input_tag.size)
# Text of the page-footer element
result = driver.find_element_by_id("s_upfunc_menus")
print(result.text)
# Whether the element is visible to the user
result2 = driver.find_element_by_id("su")
print(result2.is_displayed())  # original printed .size, contradicting the comment
11.案例
a.传统方式,只打开一页
import requests
import time
from lxml import etree
import re
def get_content_page(url):
    """Fetch one Lagou job-detail page and print the parsed job fields.

    Indentation was restored (lost in extraction) and the cleanup regex
    fixed: r"[s/]" had lost the backslash of \s and deleted literal "s"
    characters instead of whitespace.
    """
    headers={
        "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
        "Referer":"https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=",
        "Cookie":"_ga=GA1.2.738947720.1535697456; user_trace_token=20180831143733-57623494-ace8-11e8-be67-525400f775ce; LGUID=20180831143733-5762374b-ace8-11e8-be67-525400f775ce; index_location_city=%E5%85%A8%E5%9B%BD; X_HTTP_TOKEN=ad6c0f5ffeca4f3239eb2f7b016e51ab; _gid=GA1.2.961140155.1535940535; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1535697456,1535940536; LGSID=20180903100855-4fb092a7-af1e-11e8-8548-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fpassport.lagou.com%2Flogin%2Flogin.html%3Fmsg%3Dvalidation%26uStatus%3D2%26clientIp%3D182.48.111.194; JSESSIONID=ABAAABAAADEAAFI2FAFBCB9D1D4A88B226483F57290DD96; TG-TRACK-CODE=index_search; SEARCH_ID=4cb60039551040dbba7154530b206fcf; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1535941478; LGRID=20180903102438-8185b0f6-af20-11e8-8548-525400f775ce"
    }
    response=requests.get(url,headers=headers)
    text=response.text
    html=etree.HTML(text)
    title=html.xpath('//div[@class="job-name"]/span/text()')[0]
    jobdetail=html.xpath('//dd[@class="job_request"]//span/text()')
    salary=jobdetail[0]
    # r"[\s/]" strips whitespace and "/" separators from each field
    location=re.sub(r"[\s/]","",jobdetail[1])
    experience=re.sub(r"[\s/]","",jobdetail[2])
    qualifications=re.sub(r"[\s/]","",jobdetail[3])
    is_fulltime=re.sub(r"[\s/]","",jobdetail[4])
    detailinfo=html.xpath('//dd[@class="job_bt"]//p/text()')
    detailinfo="".join(detailinfo).strip()
    print(title,salary,location,experience,qualifications,is_fulltime,detailinfo)
    time.sleep(2)  # be polite between detail-page requests
def get_page_id(): 
    """POST Lagou's position-list Ajax endpoint, then crawl each job's detail page.

    Indentation restored (it was lost in extraction); logic unchanged.
    """
    url="https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
    headers={
        "Referer":"https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
        "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
        "Cookie":"_ga=GA1.2.738947720.1535697456; user_trace_token=20180831143733-57623494-ace8-11e8-be67-525400f775ce; LGUID=20180831143733-5762374b-ace8-11e8-be67-525400f775ce; index_location_city=%E5%85%A8%E5%9B%BD; X_HTTP_TOKEN=ad6c0f5ffeca4f3239eb2f7b016e51ab; _gid=GA1.2.961140155.1535940535; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1535697456,1535940536; LGSID=20180903100855-4fb092a7-af1e-11e8-8548-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fpassport.lagou.com%2Flogin%2Flogin.html%3Fmsg%3Dvalidation%26uStatus%3D2%26clientIp%3D182.48.111.194; JSESSIONID=ABAAABAAADEAAFI2FAFBCB9D1D4A88B226483F57290DD96; TG-TRACK-CODE=index_search; SEARCH_ID=2753ed70c0374a4eb0436263dc6db38c; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1535941412; LGRID=20180903102332-5a63d0bf-af20-11e8-b4e7-5254005c3644"
    }
    data={
        'first':'true',
        'pn':1,
        'kd':'python'
    }
    # range(1,2) means only page 1 is crawled (this is the "one page" demo)
    for x in range(1,2):
        data["pn"]=x
        response=requests.post(url,headers=headers,data=data)
        json_text=response.json()
        time.sleep(2)
        # hrInfoMap's keys are the position ids used in detail-page URLs
        get_detail=json_text["content"]["hrInfoMap"]
        for detailtext in get_detail:
            detailurl="https://www.lagou.com/jobs/%s.html" %(detailtext)
            print(detailurl)
            get_content_page(detailurl)

if __name__=="__main__":
    get_page_id()
b.通过selenium方式,自动下一页
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
import time
import re


class LagouSpider():
    """Selenium spider that pages through Lagou python listings automatically."""

    # Raw string keeps the Windows path backslashes literal.
    driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=self.driverpath)
        self.url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
        self.postion = []

    def run(self):
        self.driver.get(self.url)
        while True:
            source = self.driver.page_source
            # Explicitly wait for the pager to appear on the list page.
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]"))
            )
            self.parse_list_page(source)
            # Find the "next page" control.
            next_page_bt = self.driver.find_element_by_xpath('//div[@class="pager_container"]/span[last()]')
            if "pager_next_disabled" in next_page_bt.get_attribute("class"):
                break  # last page reached
            else:
                next_page_bt.click()
            time.sleep(3)

    def parse_list_page(self, source):
        # Collect every detail-page URL from the list page.
        html = etree.HTML(source)
        links = html.xpath('//a[@class="position_link"]/@href')
        for link in links:
            self.request_detail_page(link)

    def request_detail_page(self, url):
        # Open the detail page in a new window and point the driver at it.
        self.driver.execute_script("window.open('%s')" % (url))
        # switch_to.window replaces the deprecated switch_to_window
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']/span[@class='name']"))
        )
        source = self.driver.page_source
        self.parse_detail_page(source)
        # Close the detail window and return to the list window.
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        html = etree.HTML(source)
        title = html.xpath('//div[@class="job-name"]/span/text()')[0]
        jobdetail = html.xpath('//dd[@class="job_request"]//span/text()')
        salary = jobdetail[0]
        # r"[\s/]" removes whitespace and "/" separators; the original
        # r"[s/]" had lost the backslash and deleted literal "s" characters.
        location = re.sub(r"[\s/]", "", jobdetail[1])
        experience = re.sub(r"[\s/]", "", jobdetail[2])
        qualifications = re.sub(r"[\s/]", "", jobdetail[3])
        is_fulltime = re.sub(r"[\s/]", "", jobdetail[4])
        detailinfo = html.xpath('//dd[@class="job_bt"]//p/text()')
        detailinfo = "".join(detailinfo).strip()
        postion = {
            "title": title,
            "salary": salary,
            "location": location,
            "experience": experience,
            "qualifications": qualifications,
            "is_fulltime": is_fulltime,
            "detailinfo": detailinfo,
        }
        print(postion)
        self.postion.append(postion)


if __name__ == "__main__":
    spider = LagouSpider()
    spider.run()
爬Boss直聘
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
import time
from selenium.webdriver.common.action_chains import ActionChains


class BossSpider():
    """Selenium spider that pages through Boss Zhipin python listings."""

    # Raw string keeps the Windows path backslashes literal.
    driverpath = r"D:\chromedriver\chromedriver_win32\chromedriver.exe"

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=self.driverpath)
        self.url = "https://www.zhipin.com/job_detail/?query=python&scity=101010100&industry=&position="

    def run(self):
        self.driver.get(self.url)
        while True:
            source = self.driver.page_source
            actions = ActionChains(self.driver)
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='page']/a[last()]"))
            )
            self.parse_detail_url(source)
            next_btn = self.driver.find_element_by_xpath("//div[@class='page']/a[last()]")
            if "disabled" in next_btn.get_attribute("class"):
                break  # last page reached
            else:
                actions.move_to_element(next_btn)
                actions.click()
                # perform() was missing in the original: without it the
                # queued move+click never executes and paging never happens.
                actions.perform()
            time.sleep(3)

    def parse_detail_url(self, source):
        # Collect every detail-page URL from the list page.
        html = etree.HTML(source)
        links = html.xpath('//div[@class="info-primary"]//a/@href')
        for link in links:
            self.request_detail_page(link)
            time.sleep(3)

    def request_detail_page(self, url):
        # Open the detail page in a new window and point the driver at it.
        self.driver.execute_script("window.open('%s')" % (url))
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='job-sec']/div[@class='text']"))
        )
        source = self.driver.page_source
        self.get_detail_info(source)
        # Close the detail window and return to the list window.
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def get_detail_info(self, source):
        html = etree.HTML(source)
        postion = html.xpath("//div[@class='info-primary']/div[@class='name']/h1/text()")[0]
        detailinfo1 = html.xpath("//div[@class='info-primary']")[0]
        strinfo = detailinfo1.xpath("./p/text()")
        # Fields look like "城市:北京"; keep only the part after the colon.
        city = strinfo[0].split(":")[-1]
        experience = strinfo[1].split(":")[-1]
        education = strinfo[2].split(":")[-1]
        job_description = html.xpath("//div[@class='job-sec']/div[@class='text']/text()")
        job_description = "".join(job_description).strip()
        print(city, experience, education, job_description)


if __name__ == "__main__":
    spider = BossSpider()
    spider.run()