淘宝爬虫

 1 from selenium import webdriver
 2 from selenium.webdriver.common.by import By
 3 from selenium.webdriver.support import expected_conditions as EC
 4 from selenium.webdriver.support.wait import WebDriverWait
 5 from selenium.common.exceptions import TimeoutException
 6 from pyquery import PyQuery as pq
 7 import re
 8 from config import *
 9 import pymongo
10 
# MongoDB handle shared by the whole script; MONGO_URL and MONGO_DB come
# from config.py via the star import above.
client = pymongo.MongoClient(host=MONGO_URL)
db = client[MONGO_DB]

# A single Chrome session is created at import time and reused by every
# scraping function below.
browser = webdriver.Chrome()

# Explicit-wait helper bound to that session: each wait.until(...) call gives
# up with TimeoutException after 10 seconds.
wait = WebDriverWait(browser, timeout=10)
16 
17 
def search(keyword='xiaomi'):
    """Search Taobao for *keyword* and scrape the first result page.

    Loads the Taobao home page, types *keyword* into the search box, submits
    the form, waits for the pager's total-pages element and then calls
    ``get_products()`` for the first page of results.

    Args:
        keyword: Text typed into the search box. Defaults to ``'xiaomi'``,
            the value that used to be hard-coded.

    Returns:
        The text of the pager's ``div.total`` element (presumably something
        like "共 100 页，" -- verify against the live page).

    Any ``TimeoutException`` raised by one of the waits (including the one
    inside ``get_products()``) restarts the whole sequence from the home
    page. The retry is a loop rather than a recursive call so a page that
    keeps timing out cannot exhaust the interpreter's recursion limit.
    """
    while True:
        try:
            browser.get('https://www.taobao.com')
            input_ = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')
                )
            )

            input_.send_keys(keyword)
            submit.click()

            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')
                )
            )
            get_products()
            return total.text
        except TimeoutException:
            # Start over from a fresh page load.
            continue
38 
def next_page(page_num):
    """Jump to result page *page_num* through the pager form and scrape it.

    Types *page_num* into the pager's page-number input, clicks the submit
    button, waits until the highlighted pager item shows ``str(page_num)``
    and then calls ``get_products()``.

    Args:
        page_num: Page number to navigate to.

    Returns:
        None.

    Any ``TimeoutException`` raised along the way repeats the whole attempt.
    The retry is a loop rather than a recursive call so a page that keeps
    timing out cannot exhaust the interpreter's recursion limit.
    """
    while True:
        try:
            input_ = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input')
                )
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')
                )
            )
            # Clear whatever page number the box currently holds first.
            input_.clear()
            input_.send_keys(page_num)
            submit.click()
            # The active pager item showing the requested number is the
            # signal that the new page has been rendered.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                str(page_num)))
            get_products()
            return
        except TimeoutException:
            continue
55 
def get_products():
    """Parse the current result page and store every product found on it.

    Waits until at least one result item is present, parses the browser's
    page source with PyQuery, builds one dict per item, prints it and hands
    it to ``save_to_mongo()``. A ``TimeoutException`` from the wait
    propagates to the caller.
    """
    item_selector = '#mainsrp-itemlist .items .item'
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))

    page = pq(browser.page_source)
    for node in page(item_selector).items():
        image_src = node.find('.pic .img').attr('src')
        price_text = node.find('.price').text()
        # Drop the last three characters of the deal-count text
        # (presumably the "人付款" suffix -- verify against the live page).
        deal_text = node.find('.deal-cnt').text()[:-3]
        title_text = node.find('.title').text()
        shop_text = node.find('.shop').text()
        location_text = node.find('.location').text()

        product = {
            'image': image_src,
            'price': price_text,
            'deal': deal_text,
            'title': title_text,
            'shop': shop_text,
            'location': location_text,
        }
        print(product)
        save_to_mongo(product)
73 
def save_to_mongo(result):
    """Insert one product document into the ``MONGO_TABLE`` collection.

    Args:
        result: Document (dict) to store.

    Returns:
        None. The outcome is reported on stdout only.

    Uses ``insert_one`` because ``Collection.insert`` was removed in
    PyMongo 4; with the old call the resulting ``TypeError`` was swallowed
    by the handler below and no document was ever written.
    """
    try:
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        # Deliberately best-effort: one record that cannot be stored must not
        # abort the crawl, but the cause is printed instead of being hidden.
        print('error to mongo', exc)
    else:
        print('success save to mongodb', result)
80 
def main():
    """Scrape every result page of the search, then close the browser.

    ``search()`` scrapes page 1 and returns the pager's total-pages text; the
    first run of digits in that text is taken as the page count, and pages 2
    through that count (inclusive) are visited with ``next_page()``.

    Raises:
        ValueError: If the text returned by ``search()`` contains no digits.
    """
    try:
        total_text = search()
        # Raw string with an escaped \d: the previous pattern '(d+)' matched
        # the literal letter "d" and therefore never found the page count.
        match = re.search(r'(\d+)', total_text)
        if match is None:
            raise ValueError('no page count found in %r' % (total_text,))
        total = int(match.group(1))
        # range() excludes its stop value, so use total + 1 to reach the
        # last page as well.
        for page_num in range(2, total + 1):
            next_page(page_num)
    finally:
        # Close the browser window even when scraping fails part-way.
        browser.close()


if __name__ == '__main__':
    main()

 config.py

# MongoDB settings for the crawler, which pulls them in with
# `from config import *`.
MONGO_URL = 'localhost'  # host handed to pymongo.MongoClient
MONGO_DB = 'taobao'  # database name
MONGO_TABLE = 'product'  # collection that receives the scraped products

 

运行结果:

 数据库:

 

原文地址:https://www.cnblogs.com/MC-Curry/p/9338906.html