A small click-simulation scraper case

I spent a day writing a small scraper with simple functionality, using Selenium.
The scraper mainly covers:
1. Simulating user login
2. Simulating clicks on drop-down menus
3. Locating text inside a drop-down menu and clicking to select it
4. Double-clicking text to turn hidden text into visible text
5. Switching to the first results page after running a query
6. Clicking "next page" to paginate, and checking whether the current page is the last one (see the sketch right after this list)
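Step 6 boils down to checking whether the pagination widget's "next page" button is still enabled; a minimal sketch of that check (the XPath is the one used in the full script below):

but = driver.find_element_by_xpath('//*[@id="app"]/div/section/div/section[2]/div/div[2]/div[3]/section/section[2]/div[2]/button[2]')
if but.is_enabled():
    but.click()          # more pages left, move on to the next one
else:
    print('This is the last page')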
from selenium import webdriver
from selenium.webdriver import ActionChains
import re
import time
import os
import random

'''
Scraper function: takes the driver, the province/city indices, the lists of drop-down items and the province name.
It passes the province and city selection through to the page:
driver.execute_script("arguments[0].scrollIntoView();", lis_p[p]) scrolls the drop-down menu to the province/city entry,
then the province and city drop-downs are clicked, "query" is clicked, and the page is given time to load.
It first jumps to the first page, then loops page by page reading the text content of the table.
'''
def paqu(driver,p,c,lis_p,lis_c,p_name):
    # open the province drop-down
    driver.find_element_by_xpath(
        '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[11]/div/div[2]/div[1]/span/span/i').click()

    time.sleep(5)

    # scroll the drop-down to the province entry and select it
    driver.execute_script("arguments[0].scrollIntoView();", lis_p[p])
    lis_p[p].click()
    time.sleep(5)

    # city drop-down
    driver.find_element_by_xpath(
        '//*[@id="app"]/div/section/div/div[2]/div/div/form/div[12]/div/div[2]/div[1]/span/span/i'.replace('/div[2]/div/div/form', '/section[2]/div/div[2]/div/div/form')).click() if False else driver.find_element_by_xpath(
        '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[12]/div/div[2]/div[1]/span/span/i').click()
    time.sleep(3)
    driver.execute_script("arguments[0].scrollIntoView();", lis_c[c])
    c_name = lis_c[c].text
    lis_c[c].click()
    time.sleep(3)

    # click the query button
    driver.find_element_by_xpath('//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[20]/div/button/span').click()
    time.sleep(5)
    try:
        # jump to the first page of results
        driver.find_element_by_xpath(
            '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div[3]/section/section[2]/span[1]').click()
        time.sleep(5)
        # read the total number of pages from the pagination bar
        ul_page = driver.find_element_by_xpath('//*[@id="app"]/div/section/div/section[2]/div/div[2]/div[3]/section/section[2]/div[2]/ul')
        li_page = ul_page.find_elements_by_xpath('li')
        pages = int(li_page[len(li_page)-1].text)
        for i in range(1,pages+1):
            print(i)
            # customer names on the current page
            try:
                v_num = driver.find_elements_by_xpath(
                    '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div[3]/section/section[1]/div/div[3]/table/tbody/tr/td[3]/div/span')
                list_num = []
                for n in range(len(v_num)):
                    list_num.append(v_num[n].text)

                spans = driver.find_elements_by_xpath(
                    '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div[3]/section/section[1]/div/div[3]/table/tbody/tr/td[2]/div/span')

                # loop variable renamed to j so it no longer shadows the page counter i
                for j in range(len(spans)):
                    # build a fresh ActionChains per double-click so earlier queued actions are not replayed
                    ActionChains(driver).double_click(spans[j]).perform()
                    new_time = random.randint(2,5)
                    time.sleep(new_time)
                    print(p_name+' '+c_name+' '+spans[j].text+' '+list_num[j])

                    # f is the global file handle opened in __main__
                    f.write(p_name+' '+c_name+' '+spans[j].text+' '+list_num[j])
                    f.write(' ')

                # "next page" button: enabled means there are more pages
                but = driver.find_element_by_xpath('//*[@id="app"]/div/section/div/section[2]/div/div[2]/div[3]/section/section[2]/div[2]/button[2]')
                flag = but.is_enabled()
                print(flag)
                if flag is True:
                    but.click()
                    time.sleep(5)
                else:
                    print('This is the last page')
            except:
                print('Query result is empty')
                driver.find_element_by_xpath(
                    '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div[3]/section/section[2]/span[1]').click()
                time.sleep(5)
    except:
        pass

'''
Click the drop-down menus and find out how many cities the selected province has.
'''
def get_citynum(driver,p):
    # open the province drop-down
    driver.find_element_by_xpath(
        '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[11]/div/div[2]/div[1]/span/span/i').click()
    time.sleep(5)
    ul_p = driver.find_element_by_xpath(
        '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[11]/div/div[2]/div[2]/div[1]/div[1]/ul')
    lis_p = ul_p.find_elements_by_xpath('li')
    driver.execute_script("arguments[0].scrollIntoView();", lis_p[p])
    p_name = lis_p[p].text
    lis_p[p].click()
    time.sleep(5)

    # city drop-down
    driver.find_element_by_xpath(
        '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[12]/div/div[2]/div[1]/span/span/i').click()
    time.sleep(5)
    ul_c = driver.find_element_by_xpath(
        '//*[@id="app"]/div/section/div/section[2]/div/div[2]/div/div/form/div[12]/div/div[2]/div[2]/div[1]/div[1]/ul')
    lis_c = ul_c.find_elements_by_xpath('li')
    num = len(lis_c)
    return num,lis_p,lis_c,p_name


if __name__ == "__main__":
    # open the Chrome browser and give pages time to load
    driver = webdriver.Chrome()
    # target page URL
    url = '******'
    driver.get(url)
    time.sleep(5)
    # enter the user name
    driver.find_element_by_xpath('//*[@id="app"]/div/div/form/div[1]/div/label/div/input').clear()
    driver.find_element_by_xpath('//*[@id="app"]/div/div/form/div[1]/div/label/div/input').send_keys('******')
    # enter the login password
    driver.find_element_by_xpath('//*[@id="app"]/div/div/form/div[2]/div/label/div/input').clear()
    time.sleep(2)
    driver.find_element_by_xpath('//*[@id="app"]/div/div/form/div[2]/div/label/div/input').send_keys('******')
    # enter the captcha
    driver.find_element_by_xpath('//*[@id="app"]/div/div/form/div[3]/div/div/label/div/input').send_keys(
        input("Enter the captcha: "))
    # click the login button
    driver.find_element_by_xpath('//*[@id="app"]/div/div/form/label[1]/button').click()
    time.sleep(5)
    # "Customer analysis" menu
    driver.find_element_by_xpath('//*[@id="app"]/div/header/nav/div[8]').click()
    time.sleep(5)
    # "Potential vehicle sales analysis" menu
    driver.find_element_by_xpath('//*[@id="app"]/div/section/div/section[1]/div/div[2]/div/ul/li[5]').click()
    time.sleep(5)
    for p in range(1,2):
        # global file handle, also written to inside paqu()
        f = open(r'E:\MyPythonProject\xiaoshou_kehu\Vehicle_distribution'+str(p+1)+'.txt', 'w')
        print('Start scraping province No. '+str(p+1))
        num,lis_p,lis_c,p_name = get_citynum(driver,p)
        for c in range(4,num):
            paqu(driver, p,c,lis_p,lis_c,p_name)
        print('Finished scraping province No. '+str(p+1))
        f.close()
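A note on the API used above: the find_element_by_xpath / find_elements_by_xpath helpers come from the Selenium 3 era and were later removed in Selenium 4. On Selenium 4 the same lookups would look roughly like this sketch (the second locator is just a placeholder, not one of the page's real XPaths):

from selenium.webdriver.common.by import By

# Selenium 4 style: pass a locator strategy instead of calling find_element_by_*
driver.find_element(By.XPATH, '//*[@id="app"]/div/div/form/label[1]/button').click()
rows = driver.find_elements(By.XPATH, '//table/tbody/tr')  # placeholder locator for table rows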

The scraper I wrote is too simple and still needs refinement. I didn't think deeply about whether the design is complete enough: things like the user agent and access frequency were not considered carefully.
For now this is just a record of this small scraping case; I will improve it when I have time.
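For instance, a rough sketch of what setting a custom user agent and spacing out the requests could look like (the UA string and the wait range below are placeholders, not values from this project):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import random
import time

options = Options()
# placeholder user-agent string; substitute whatever UA the scraper should present
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
driver = webdriver.Chrome(options=options)

def polite_sleep(low=2, high=6):
    # randomized pause between page actions to keep the access frequency down
    time.sleep(random.uniform(low, high))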
Original post: https://www.cnblogs.com/liuffblog/p/13152506.html