# 自动化抓取数据

import os
import random
import time
from shutil import copy
from urllib.parse import quote

from pymouse import PyMouse
from selenium import webdriver


class tietajuhe(object):
    """Open Baidu news results per category and file saved pages into folders.

    Constructing an instance runs the whole pipeline. For every category it
    starts Chrome, opens the first search results for each keyword, clicks a
    fixed sequence of screen positions, creates nine dated folders under the
    category's work directory, and moves files from ``files_path`` into them.
    """

    # Folder that copy_file() takes files from.
    # NOTE(review): the original literal was a raw string ending in a single
    # backslash, which is a syntax error. The visible characters are kept
    # as-is, but path separators (e.g. right after "D:") look like they were
    # lost when the code was pasted -- confirm the real location.
    FILES_PATH = 'D:html_dowload\\'

    # Work directory per category; "%s" receives the category name.
    # NOTE(review): same caveat as FILES_PATH about lost separators.
    PATH_TEMPLATE = 'D:铁塔聚合810\\%s全国百度资讯\\'

    # Chrome profile directory passed via --user-data-dir.
    USER_DATA_DIR = r"D:/Google2data/"

    # Search keywords per category, in processing order.
    CATEGORY_KEYWORDS = {
        '乡镇': ['视频监控', '智慧乡镇', '乡镇建设', '脱贫攻坚', '民生建设'],
        '市政': ['智能云广播', '智慧人社局', '智慧灯杆', '市政面貌', '智慧监管'],
        '园区': ['智慧园区', '现代化', '招商', '园区规划', '园区监控'],
        '广告': ['媒体融合', '媒体+', '人工智能', '5G', '智能互联'],
    }

    SEARCH_URL = (
        "https://www.baidu.com/s?ie=utf-8&cl=2&medium=0&rtt=1&bsst=1"
        "&rsv_dl=news_t_sk&tn=news&word=%s&rsv_sug3=5&rsv_sug4=284"
        "&rsv_sug1=5&rsv_sug2=0&inputT=1286"
    )

    # Number of result links opened for each keyword.
    RESULTS_PER_KEYWORD = 2

    def __init__(self):
        """Run the save / create-folders / move-files steps for every category."""
        self.files_path = self.FILES_PATH
        self.m = PyMouse()
        for category_name in self.CATEGORY_KEYWORDS:
            self.path = self.PATH_TEMPLATE % category_name
            try:
                self.option = webdriver.ChromeOptions()
                self.option.add_argument(
                    "--user-data-dir=" + self.USER_DATA_DIR)
                # `options=` is accepted by Selenium 3.8+ and 4.x; the old
                # `chrome_options=` keyword no longer exists in current 4.x.
                self.browser = webdriver.Chrome(options=self.option)
            except Exception as exc:
                # The original swallowed this silently and abandoned the
                # remaining categories; keep the abort but report the cause.
                print("启动浏览器出问题了:", exc)
                break

            # Each step is best-effort: a failure is reported and the next
            # step still runs, as in the original flow.
            try:
                self.Html_save(Cat=category_name)
            except Exception as exc:
                print("保存网页出问题了", exc)
            try:
                self.mkdir_and_file()
            except Exception as exc:
                print("创建文件夹出问题了", exc)
            try:
                self.copy_file()
            except Exception as exc:
                print("转移网页出问题了", exc)

    def mkdir_and_file(self):
        """Create nine dated folders under ``self.path``, each with a score file.

        Folder names are ``<month><day><i>`` for i in 1..9 (no zero padding).
        Each folder receives one empty ``<n>.score`` file, with n drawn at
        random from 1..100.

        Raises:
            OSError: if ``self.path`` is missing or a folder already exists.
        """
        now = time.localtime()
        today_time = str(now.tm_mon) + str(now.tm_mday)
        for i in range(1, 10):
            # Build absolute paths instead of os.chdir(): the working
            # directory is process-wide state and should not be changed here.
            folder = os.path.join(self.path, today_time + str(i))
            os.mkdir(folder)
            score_name = "%d.score" % random.randint(1, 100)
            with open(os.path.join(folder, score_name), 'w'):
                pass  # only the empty marker file is needed

    def copy_file(self):
        """Move files from ``self.files_path`` into the folders in ``self.path``.

        Folders and files are paired one-to-one in listing order; every paired
        file is copied into its folder and then removed from the source.
        Pairing stops when either side runs out, so surplus files stay where
        they are and surplus folders receive nothing.
        """
        target_root = self.path
        # Only the immediate sub-folders are targets. A missing target root
        # yields no folders instead of an IndexError.
        target_dirs = next(os.walk(target_root), (target_root, [], []))[1]

        # Only files directly inside files_path are considered: the original
        # joined every walked file name onto files_path, which produced
        # non-existent paths for files in sub-folders.
        file_names = next(
            os.walk(self.files_path), (self.files_path, [], []))[2]
        sources = [os.path.join(self.files_path, name) for name in file_names]
        print(len(sources))

        for dir_name, source in zip(target_dirs, sources):
            copy(source, os.path.join(target_root, dir_name))
            os.remove(source)  # drop the source once it has been copied

    def Html_save(self, Cat):
        """Open the first search results for every keyword of category ``Cat``.

        Each result link is opened in a new window, followed by three mouse
        clicks at fixed screen positions. The browser is always closed when
        the method ends, including on failure.

        Args:
            Cat: category name; must be a key of ``CATEGORY_KEYWORDS``.

        Raises:
            ValueError: if ``Cat`` is not a known category.
        """
        try:
            try:
                keywords = self.CATEGORY_KEYWORDS[Cat]
            except KeyError:
                raise ValueError("unknown category: %r" % (Cat,)) from None

            for key in keywords:
                # Percent-encode the keyword; a raw "+" (as in '媒体+') would
                # otherwise be decoded as a space by the server.
                self.browser.get(self.SEARCH_URL % quote(key))
                self.browser.maximize_window()
                # "xpath" is the value of selenium's By.XPATH; this call form
                # works on Selenium 3 and 4, unlike find_elements_by_xpath.
                links = self.browser.find_elements(
                    "xpath", '//*[contains(@id,"")]/h3/a')
                print(len(links))
                for position, link in enumerate(links, start=1):
                    if position > self.RESULTS_PER_KEYWORD:
                        print('*' * 99)
                        break
                    url = link.get_attribute("href")
                    if not url:
                        continue
                    # Pass the URL as a script argument rather than splicing
                    # it into JavaScript source, so quotes cannot break it.
                    self.browser.execute_script(
                        "window.open(arguments[0])", url)
                    # Hard-coded screen coordinates: they depend on the screen
                    # resolution and window layout.
                    # NOTE(review): presumably these drive a browser
                    # extension's "save page" UI -- confirm.
                    time.sleep(1.5)
                    self.m.click(1800, 50)
                    time.sleep(1.5)
                    self.m.click(1435, 151)
                    time.sleep(3)
                    self.m.click(20, 293)
        finally:
            self.browser.quit()


if __name__ == '__main__':
    # Constructing the object already runs the whole pipeline from __init__;
    # calling __init__() again afterwards repeated every step a second time.
    Titan = tietajuhe()


# 原文地址: https://www.cnblogs.com/blog0001/p/13471299.html