Selenium 爬取淘宝网

  1 import re
  2 from selenium import webdriver
  3 from selenium.common.exceptions import TimeoutException
  4 from selenium.webdriver.common.by import By
  5 from selenium.webdriver.support.ui import WebDriverWait
  6 from selenium.webdriver.support import expected_conditions as EC
  7 from pyquery import PyQuery as pq
  8 
  9 import pymongo
 10 
 11 MONGO_URL = 'localhost'
 12 MONGO_DB = 'taobao'
 13 MONGO_TABLE = 'product'
 14 
 15 SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
 16 
 17 KEYWORD = '美食'
 18 
 19 client = pymongo.MongoClient(MONGO_URL)
 20 db = client[MONGO_DB]
 21 
 22 browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
 23 wait = WebDriverWait(browser, 10)
 24 
 25 browser.set_window_size(1400, 900)
 26 
 27 def search():
 28     print('正在搜索')
 29     try:
 30         browser.get('https://www.taobao.com')
 31         input = wait.until(
 32             EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
 33         )
 34         submit = wait.until(
 35             EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
 36         input.send_keys(KEYWORD)
 37         submit.click()
 38         total = wait.until(
 39             EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
 40         get_products()
 41         return total.text
 42     except TimeoutException:
 43         return search()
 44 
 45 
 46 def next_page(page_number):
 47     print('正在翻页', page_number)
 48     try:
 49         input = wait.until(
 50             EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
 51         )
 52         submit = wait.until(EC.element_to_be_clickable(
 53             (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
 54         input.clear()
 55         input.send_keys(page_number)
 56         submit.click()
 57         wait.until(EC.text_to_be_present_in_element(
 58             (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)))
 59         get_products()
 60     except TimeoutException:
 61         next_page(page_number)
 62 
 63 
 64 def get_products():
 65     wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
 66     html = browser.page_source
 67     doc = pq(html)
 68     items = doc('#mainsrp-itemlist .items .item').items()
 69     for item in items:
 70         product = {
 71             'image': item.find('.pic .img').attr('src'),
 72             'price': item.find('.price').text(),
 73             'deal': item.find('.deal-cnt').text()[:-3],
 74             'title': item.find('.title').text(),
 75             'shop': item.find('.shop').text(),
 76             'location': item.find('.location').text()
 77         }
 78         print(product)
 79         save_to_mongo(product)
 80 
 81 
 82 def save_to_mongo(result):
 83     try:
 84         if db[MONGO_TABLE].insert(result):
 85             print('存储到MONGODB成功', result)
 86     except Exception:
 87         print('存储到MONGODB失败', result)
 88 
 89 
 90 def main():
 91     try:
 92         total = search()
 93         total = int(re.compile('(d+)').search(total).group(1))
 94         for i in range(2, total + 1):
 95             next_page(i)
 96     except Exception:
 97         print('出错啦')
 98     finally:
 99         browser.close()
100 
101 if __name__ == '__main__':
102     main()
原文地址:https://www.cnblogs.com/zhongshuiping/p/9714025.html