selenium+PhantomJS

#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
    流程框架:
        1.搜索关键词,利用selenium驱动浏览器搜索关键词,查询得到商品列表
        2.分析页码并翻页,得到商品页码数,模拟翻页,得到后续页面的商品列表
        3.分析提取商品内容,利用PyQuery分析源码,解析得到商品列表
        4.存储至MongoDB,将商品列表信息存储到MongoDB数据库
"""
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
from pyquery import PyQuery as pq
import pymongo

# Host passed to pymongo.MongoClient below.
LOCATION = "localhost"
# Name of the database selected on the client.
MONGO_DB = 'taobao'
# Name of the collection that save_to_mongo writes to.
MONGO_TABLE = "taobao"
# Search term that main() submits ('零食' is Chinese for "snacks").
KEYWORD = '零食'


# Module-level MongoDB client and database handle; both are created as soon
# as this module is imported or run.
client = pymongo.MongoClient(LOCATION)
db = client[MONGO_DB]


# Shared PhantomJS browser instance; the service argument turns off image
# loading.
# NOTE(review): webdriver.PhantomJS is deprecated in later Selenium 3 releases
# and absent from Selenium 4 — confirm the Selenium version this script is
# pinned to.
driver = webdriver.PhantomJS(service_args=['--load-images=false', ])
# Shared explicit wait: each wait.until(...) call allows up to 20 seconds.
wait = WebDriverWait(driver, 20)
driver.set_window_size(1366, 768)


def search(keyword, max_retries=None):
    """Open the Taobao home page, search for *keyword* and scrape page one.

    The function types the keyword into the ``#q`` input, clicks the
    ``.btn-search`` button, waits for the ``.total`` element of the result
    page and then calls ``get_product()`` for the first page of results.

    Args:
        keyword: text to type into the search box.
        max_retries: how many times to retry after a ``TimeoutException``.
            ``None`` (the default) retries indefinitely, which matches the
            previous behaviour.

    Returns:
        The text of the ``.total`` element (the pager's page-count label).

    Raises:
        TimeoutException: only when ``max_retries`` is set and exhausted.
    """
    retries = 0
    # Retry with a loop instead of recursion: the recursive version dropped
    # the return value of the retry (so callers received None) and could
    # grow the call stack without bound.
    while True:
        print("正在搜索关键字:%s" % keyword)
        try:
            driver.get('http://www.taobao.com')
            element = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
            )
            submit = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search'))
            )
            element.clear()
            element.send_keys(keyword)
            submit.click()
            total = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '.total'))
            )
            # Read the label right away, before any further page work.
            total_text = total.text
            get_product()
            return total_text
        except TimeoutException:
            retries += 1
            if max_retries is not None and retries > max_retries:
                raise


def next_page(page_number, max_retries=None):
    """Jump to result page *page_number* and scrape its products.

    The page number is typed into the pager input, the pager's confirm
    button is clicked, and the function waits until ``span.num`` contains
    the requested number before calling ``get_product()``.

    Args:
        page_number: the page to jump to.
        max_retries: how many times to retry after a ``TimeoutException``.
            ``None`` (the default) retries indefinitely, which matches the
            previous behaviour.

    Returns:
        True when the page was reached and ``get_product()`` was called;
        False when a non-timeout error occurred or the retries ran out.
    """
    retries = 0
    # Retry with a loop instead of recursion: the recursive version ignored
    # the result of the retry, so a page that succeeded on a later attempt
    # was still reported as False.
    while True:
        print("正在翻页:%s" % page_number)
        try:
            # Use the module-level wait (same 20 s timeout) like the other
            # functions, rather than building a new WebDriverWait each time.
            element = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'input.input:nth-child(2)')))
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, 'span.btn:nth-child(4)')))
            element.clear()
            element.send_keys(str(page_number))
            submit.click()
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, 'span.num'), str(page_number)))
            get_product()
            return True
        except TimeoutException:
            retries += 1
            if max_retries is not None and retries > max_retries:
                return False
        except Exception as e:
            # Any other failure is reported and the page is given up on.
            print(e)
            return False


def get_product():
    """Parse the current result page and store every product found on it.

    Waits for at least one item node to be present, loads the page source
    into PyQuery, builds one dict per item (img, title, sales, shop,
    location, price) and hands each dict to ``save_to_mongo``. Any exception
    is printed and swallowed.
    """
    print("正在获取产品信息...")
    item_selector = '#mainsrp-itemlist .items .item'
    # (document key, CSS selector) pairs whose text content is stored;
    # the order here fixes the key order of the stored document.
    text_fields = (
        ('title', ".title a"),
        ("sales", ".deal-cnt"),
        ("shop", ".shopname"),
        ("location", ".location"),
        ("price", ".price strong"),
    )
    try:
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, item_selector))
        )
        document = pq(driver.page_source)
        for node in document(item_selector).items():
            # The image URL comes from an attribute, the rest from node text.
            record = {"img": node.find(".pic-box div a img").attr('src')}
            for field, css in text_fields:
                record[field] = node.find(css).text()
            save_to_mongo(record)
    except Exception as e:
        print(e)


def save_to_mongo(result):
    """Insert one product document into the ``MONGO_TABLE`` collection.

    Args:
        result: dict describing a single product. PyMongo adds an ``_id``
            key to it in place when the insert is performed.

    Errors are printed and swallowed so that one failed insert does not
    stop the crawl.
    """
    try:
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4; with the old call every save
        # would end up in the except branch below on a current driver.
        outcome = db[MONGO_TABLE].insert_one(result)
        if outcome.inserted_id is not None:
            print("产品信息成功保存到mongodb", result)
    except Exception as e:
        print("保存失败!", e)


def main():
    """Run the crawl: search for ``KEYWORD``, then visit every later page.

    The first page is scraped by ``search``; the number of pages is taken
    from the first run of digits in the text that ``search`` returns, and
    pages 2..N are visited with ``next_page``.

    Returns:
        The result of the last ``next_page`` call, or ``''`` when no further
        page was visited (single-page result or unparseable page count).
    """
    total_text = search(KEYWORD)
    # The pattern must be r"\d+"; the former "d+" matched literal letter
    # "d" characters, so it never found the page count and .group() failed
    # on None.
    match = re.search(r"\d+", total_text or "")
    if match is None:
        print("无法解析总页数: %r" % (total_text,))
        return ''
    flag = ''
    for page_number in range(2, int(match.group(0)) + 1):
        flag = next_page(page_number)
    return flag


if __name__ == "__main__":
    try:
        main()
    finally:
        # Always release the browser process and the MongoDB connection,
        # including when the crawl fails part-way; neither was closed before.
        try:
            driver.quit()
        finally:
            client.close()

  

原文地址:https://www.cnblogs.com/nixingguo/p/7266507.html