selenium模拟浏览器爬取淘宝产品信息

 1 from selenium import webdriver
 2 from selenium.webdriver.common.by import By
 3 from selenium.webdriver.support.ui import WebDriverWait
 4 from selenium.webdriver.support import expected_conditions as EC
 5 from selenium.common.exceptions import TimeoutException
 6 import re
 7 from pyquery import PyQuery
 8 from day01.config import *
 9 import pymongo
# Connect to MongoDB and select the database named in the project config.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# Start a Chrome session and build the explicit wait (10-second timeout)
# that every scraping function below shares.
browser = webdriver.Chrome()
wait = WebDriverWait(browser, timeout=10)
15 
def search(keyword="美食", max_retries=3):
    """Open Taobao, search for *keyword* and scrape the first result page.

    The flow is: load the home page, type the keyword, submit, log in if a
    login form is shown, wait for the pager, then hand the page over to
    ``get_products()``.

    Args:
        keyword: Text typed into the search box. Defaults to the query the
            script originally hard-coded.
        max_retries: Number of additional attempts made after a timeout
            before the error is propagated. Negative values are treated as 0.

    Returns:
        The text of the pager's "total pages" element.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempts = max(max_retries, 0) + 1
    for attempt in range(attempts):
        try:
            browser.get("https://www.taobao.com")
            # Search input box.
            input_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
            )
            submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")))
            input_box.send_keys(keyword)
            submit.click()
            _login_if_prompted()
            total_page = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")))
            get_products()
            return total_page.text
        except TimeoutException:
            # Retry from a fresh page load, but only a bounded number of
            # times: unbounded recursion here used to end in RecursionError.
            if attempt == attempts - 1:
                raise
    # The browser is deliberately left open: next_page() keeps driving it.


def _login_if_prompted():
    """Fill in and submit the login form when Taobao shows one.

    ``wait.until`` raises ``TimeoutException`` rather than returning ``None``
    when the element is missing, so the absence of the form is detected by
    catching that exception. In that case the function returns without doing
    anything (this costs one full wait timeout).
    """
    try:
        login = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#fm-login-id")))
    except TimeoutException:
        # No login form appeared; assume the session may proceed as-is.
        return
    # NOTE(review): placeholder credentials; supply real ones via config.
    login.send_keys("********")
    password = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#fm-login-password")))
    password.send_keys("*********")
    login_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#login-form > div.fm-btn > button")))
    login_button.click()
42 
def next_page(page_number, max_retries=3):
    """Jump to result page *page_number* and scrape it.

    Types the page number into the pager's input, confirms, waits until the
    pager highlights that number as the active page, then calls
    ``get_products()``.

    Args:
        page_number: Target page number.
        max_retries: Number of additional attempts made after a timeout
            before the error is propagated. Negative values are treated as 0.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempts = max(max_retries, 0) + 1
    for attempt in range(attempts):
        try:
            input_page = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
            confirm_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
            input_page.clear()
            input_page.send_keys(str(page_number))
            confirm_button.click()
            # The jump has succeeded once the highlighted pager item shows
            # the requested page number.
            wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active > span"), str(page_number)))
            get_products()
            return
        except TimeoutException:
            # Bounded retry: unbounded recursion here used to end in
            # RecursionError when the page never became ready.
            if attempt == attempts - 1:
                raise
56 
def get_products():
    """Parse the currently loaded result page and store each listed product.

    Waits until at least one item node is present, parses the page source
    with PyQuery, builds one dict per item and passes it to
    ``save_to_mongo()``.
    """
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")
        )
    )
    # page_source is the HTML of the page as the browser currently holds it.
    page = PyQuery(browser.page_source)
    for node in page('#mainsrp-itemlist .items .item').items():
        image_src = node.find('.pic .img').attr('src')
        price_text = node.find('.price').text()
        # Drop the last three characters of the deal-count label.
        deal_text = node.find('.deal-cnt').text()[:-3]
        title_text = node.find('.title').text()
        shop_text = node.find('.shop').text()
        location_text = node.find('.location').text()
        save_to_mongo({
            'image': image_src,
            'price': price_text,
            'deal': deal_text,
            'title': title_text,
            'shop': shop_text,
            'location': location_text,
        })
77 
def save_to_mongo(result):
    """Insert one product document into the configured MongoDB collection.

    Storage is best-effort: any failure is reported on stdout and swallowed
    so that one bad record does not abort the crawl.

    Args:
        result: The document (dict) to store. The driver adds an ``_id`` key
            to it on insertion.
    """
    try:
        # insert_one replaces Collection.insert, which was deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        db[MONGO_TABLE].insert_one(result)
        print("存储到mongodb成功")
    except Exception as e:  # deliberate catch-all, see docstring
        print("存储到mongodb异常,%s"%e)
84 
85 
def _parse_total_pages(pager_text):
    """Return the first integer found in the pager's "total pages" label."""
    # Raw string with \d: the previous pattern "(d+)" matched the letter "d".
    match = re.search(r"\d+", pager_text or "")
    if match is None:
        raise ValueError("no page count found in pager text: %r" % (pager_text,))
    return int(match.group())


def main():
    """Run the crawl: scrape page 1 via ``search()``, then every later page.

    Raises:
        ValueError: If the pager text returned by ``search()`` contains no
            number.
    """
    total = _parse_total_pages(search())
    # Page 1 was already scraped inside search(); continue from page 2.
    for page_number in range(2, total + 1):
        next_page(page_number)


if __name__ == '__main__':
    main()
原文地址:https://www.cnblogs.com/yzmPython/p/14184494.html