Taobao crawler (Selenium)

Overall, the code is still not perfect.

It demonstrates one approach to parsing JavaScript-rendered web pages.

The main trick is the scrolling: you cannot jump straight to the bottom of the page, because the listings are loaded in as you pass through the middle, so the page has to be scrolled down in steps.

The details of each step are explained in the comments.
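In outline, the approach looks like this (a minimal sketch; the URL and XPath are placeholders, and the 600-pixel step and seven iterations mirror what the full script below does):

from selenium import webdriver
import time
from lxml import etree

opt = webdriver.ChromeOptions()
opt.add_argument('--headless')
driver = webdriver.Chrome(options=opt)
driver.get('https://example.com/')  # placeholder URL

# Scroll down in steps so lazily-loaded content has time to come in
for _ in range(7):
    driver.execute_script('window.scrollBy(0, 600)')
    time.sleep(0.5)

# Once the browser has rendered everything, hand the HTML to lxml
element = etree.HTML(driver.page_source)
print(element.xpath('//title/text()'))  # placeholder XPath
driver.quit()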

from selenium import webdriver
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from lxml import etree
import re
import pymongo

# MongoDB connection: results go into the informations collection
# of the taobao_mark database
client = pymongo.MongoClient('127.0.0.1', port=27017)
db = client.taobao_mark
collection = db.informations


def search(url):

    # Set up a headless browser
    opt = webdriver.ChromeOptions()
    opt.add_argument('--headless')
    driver = webdriver.Chrome(options=opt)
    driver.get(url)

    # Explicitly wait for the search input box to load, then type in the
    # keyword; '轻薄本' (thin-and-light laptop) is used as the example
    input_key = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'q')))
    input_key.send_keys('轻薄本')

    # Explicitly wait for the search button to load, then click it
    search_btn = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'btn-search')))
    search_btn.click()

    # Only one row of the results page is actually related to the keyword;
    # everything else is ads. The real results sit behind the link in the
    # //span[@class="see-all"] tag, so grab that href with XPath and jump to it
    element = etree.HTML(driver.page_source)
    list_page_url = element.xpath('//span[@class="see-all"]/a/@href')[0]
    list_page_url = 'https://s.taobao.com/' + list_page_url
    driver.quit()
    return list_page_url


def scroll(driver):
    # Without triggering the lazy loading you only get about a quarter of the
    # items, so run a snippet of JavaScript to scroll the page down in steps
    for y in range(7):
        js = 'window.scrollBy(0, 600)'
        driver.execute_script(js)
        time.sleep(0.5)


def get_detail_url(url):

    # Holds the links of the items found on the listing pages
    detail_urls = []

    opt = webdriver.ChromeOptions()
    opt.add_argument('--headless')
    driver = webdriver.Chrome(options=opt)
    driver.get(url)

    scroll(driver)

    # With a bit of text processing, get the total number of pages
    max_page = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//div[@class="total"]'))
        )
    text = max_page.get_attribute('textContent').strip()
    max_page = re.findall(r'\d+', text, re.S)[0]
    # max_page = int(max_page)
    max_page = 1  # hard-coded to a single page for testing; restore the line above to crawl them all
    # Page through the results
    for i in range(1, max_page + 1):
        print('crawling page %s' % i)
        # Explicitly wait for the page-number input box
        next_page_btn = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//input[@class="input J_Input"]'))
            )

        # Get the confirm button; clicking it turns the page
        submit = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//span[@class="btn J_Submit"]'))
            )

        next_page_btn.clear()
        next_page_btn.send_keys(i)
        # Clicking right away raises an 'element not clickable' error: an
        # overlay covers the button, so sleep two seconds until it
        # disappears, after which the click goes through
        time.sleep(2)
        submit.click()
        scroll(driver)

        urls = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, '//a[@class="img-a"]'))
            )
        for url in urls:
            detail_urls.append(url.get_attribute('href'))
    driver.quit()
    # Return the list of all item links
    return detail_urls


def parse_detail(detail_urls):
    parameters = []
    opt = webdriver.ChromeOptions()
    opt.add_argument('--headless')
    driver = webdriver.Chrome(options=opt)
    for url in detail_urls:
        parameter = {}

        print('parsing %s' % url)
        driver.get(url)
        html = driver.page_source
        element = etree.HTML(html)

        # Product name
        name = element.xpath('//div[@class="spu-title"]/text()')
        name = name[0].strip()
        parameter['name'] = name

        # Price
        price = element.xpath('//span[@class="price g_price g_price-highlight"]/strong/text()')[0]
        price = str(price)
        parameter['price'] = price

        # Selling points
        b = element.xpath('//div[@class="item col"]/text()')
        specials = []
        for i in b:
            if re.match(r'\w', i, re.S):
                specials.append(i)
        parameter['specials'] = specials

        param = element.xpath('//span[@class="param"]/text()')
        # Screen size
        size = param[0]
        parameter['size'] = size
        # CPU model
        cpu = param[1]
        parameter['cpu'] = cpu
        # Graphics card
        graphics_card = param[2]
        parameter['graphics_card'] = graphics_card
        # Hard disk capacity
        hard_disk = param[3]
        parameter['hard_disk'] = hard_disk
        # Processor clock speed
        processor = param[4]
        parameter['processor'] = processor
        # Memory capacity
        memory = param[5]
        parameter['memory'] = memory
        parameter['url'] = url
        print(parameter)
        print('=' * 50)
        save_to_mongo(parameter)
        parameters.append(parameter)
    driver.quit()
    return parameters


def save_to_mongo(result):
    try:
        if collection.insert_one(result):
            print('successfully saved to MongoDB', result)
    except Exception:
        print('failed to save to MongoDB')


def main():
    url = 'https://www.taobao.com/'
    list_page_url = search(url)
    detail_urls = get_detail_url(list_page_url)
    parameters = parse_detail(detail_urls)


if __name__ == '__main__':
    main()
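A possible alternative to the fixed two-second sleep before clicking the page-turn button (an assumption of mine, not something verified against Taobao's current markup) is to dispatch the click through JavaScript, which an overlay cannot intercept:

def js_click(driver, element):
    # Click in the page's JavaScript context; unlike element.click(), this is
    # not blocked by another element sitting on top of the target
    # (assumed workaround, untested against Taobao)
    driver.execute_script('arguments[0].click();', element)

# In get_detail_url this would replace time.sleep(2); submit.click() with:
# js_click(driver, submit)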

Run result (screenshot in the original post)

Database (screenshot in the original post)
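To read the saved documents back out, a query along these lines works (a minimal sketch; the field names are the ones written by parse_detail above):

import pymongo

client = pymongo.MongoClient('127.0.0.1', port=27017)
collection = client.taobao_mark.informations

# Print the name and price of every stored item, hiding MongoDB's _id field
for doc in collection.find({}, {'name': 1, 'price': 1, '_id': 0}):
    print(doc)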

Original article: https://www.cnblogs.com/MC-Curry/p/9531908.html