使用requests+pyquery爬取dd373地下城跨五最新商品信息

废话不多说直接上代码:

  可以使用openpyxl库将爬取的信息写入Excel表格中,这部分代码我就不上传了

import requests
from urllib.parse import urlencode
from requests import RequestException
from pyquery import PyQuery as pq

def open_sh():
    """Fetch the HTML of the first dd373 listing page.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` on any other status code or when the request fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    # Price filter sent as the query string of the listing URL.
    data = {
        "minPrice": 333,
        "maxPrice": ""
    }
    url = "https://www.dd373.com/s/rbg22w-x9kjbs-wwf11b-0-0-0-qquvn4-0-0-0-0-0-0-0-0.html?" + urlencode(data)
    try:
        # Without a timeout requests waits indefinitely on a stalled
        # connection; a timeout error is a RequestException and is handled
        # below like any other connection failure.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        print("链接错误", url)
        return None
    if response.status_code == 200:
        return response.text
    return None

def doc_page(html):
    """Print one dict per listing found in a result page.

    Each ``.box.money_ner`` element inside ``div.content`` is turned into a
    dict holding the link target, the title text, the price text with a
    trailing '元', and the text of the ``div.num.left`` element.

    Args:
        html: Markup of a listing page, parsed with PyQuery.
    """
    listings = pq(html)("div.content").find(".box.money_ner")
    for entry in listings.items():
        title_link = entry.find("a.titleText")
        price_text = entry.find("div.money_text strong span").text()
        print({
            "地址": title_link.attr("href"),
            "账号信息": title_link.text(),
            "价格": price_text + '元',
            "是否存在": entry.find("div.num.left").text(),
        })
def page_sh(pagebox):
    """Visit result pages 1..pagebox and print the listings of each one.

    Args:
        pagebox: Total number of result pages (inclusive upper bound).

    Raises:
        requests.RequestException: If fetching a page fails.
        ValueError: If the highlighted page number cannot be parsed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    data = {
        "minPrice": 333,
        "maxPrice": ""
    }
    # The query string is the same for every page, so build it once.
    query = urlencode(data)
    for page in range(1, pagebox + 1):
        url = "https://www.dd373.com/s/rbg22w-x9kjbs-wwf11b-0-0-0-qquvn4-0-0-0-0-0-0-0-%s.html?%s" % (page, query)
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            continue
        html = response.text
        # The highlighted page number must be read from the downloaded HTML;
        # the URL string itself was previously handed to the parser. Fetching
        # once here also means the page is parsed from the same response that
        # was requested with the custom headers.
        if page_currentpage(html) == page:
            doc_page(html)

def page_currentpage(html):
    """Return the text of the ``a.nb.currentpage`` element as an int.

    Used to check that the page that was loaded is the one that was asked for.

    Args:
        html: Markup of a listing page, parsed with PyQuery.

    Raises:
        ValueError: If the element's text is not a valid integer.
    """
    highlighted = pq(html)("a.nb.currentpage")
    return int(highlighted.text())

def page_box(html):
    """Return the total page count read from the pagination box.

    Args:
        html: Markup of a listing page, parsed with PyQuery.

    Raises:
        ValueError: If the sliced text is not a valid integer.
    """
    label = pq(html)(".pagebox.clear ul li.yeshu").text()
    # Drop the first nine characters and the last one; what remains is
    # parsed as the page count.
    # NOTE(review): the slice depends on the exact wording of the page label — confirm against the live page.
    digits = label[9:-1]
    return int(digits)

def main():
    """Fetch the first listing page, read the page count, then crawl every page."""
    html = open_sh()
    if html is None:
        # open_sh returns None on a non-200 status or a failed request;
        # there is nothing to parse in that case.
        print("获取页面失败")
        return
    page = page_box(html)
    page_sh(page)



# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

  

原文地址:https://www.cnblogs.com/zhmiao/p/10684570.html