Scraping 1药网 (111.com.cn) with pyspider (repost)

1. Scraping the products
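
The pyspider script below starts from the category index at https://www.111.com.cn/categories/, follows every category link into its paginated product listings, visits each product detail page, and upserts the parsed fields (price, category path, drug details, the instruction table, comment counts) into a local MongoDB database named drug.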

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-02-02 08:59:40
# Project: oneDrug

from pyspider.libs.base_handler import *
from pymongo import MongoClient
import re


class Handler(BaseHandler):
    crawl_config = {
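        # site-wide options for every self.crawl call (e.g. headers or a proxy) could go here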
    }

    def __init__(self):
        self.client = MongoClient('mongodb://localhost:27017')
        self.drug = self.client.drug

    def insert_goods(self, data):
        # upsert keyed on goods_id so re-crawls refresh a record instead of duplicating it
        collection = self.drug['goods']
        collection.replace_one({'goods_id': data['goods_id']}, data, upsert=True)

    def insert_comments(self, data):
        collection = self.drug['comments']
        collection.insert_one(data)

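    # @every re-runs on_start once a day; @config(age=...) keeps a fetched page
    # fresh for 10 days, and tasks with a higher @config(priority=...) are scheduled first.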
    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('https://www.111.com.cn/categories/', callback=self.categories_page, validate_cert=False,
                   fetch_type='js')

    @config(age=10 * 24 * 60 * 60)
    def categories_page(self, response):
        for each in response.doc('.allsort em > a').items():
            self.crawl(each.attr.href, callback=self.category_list_page, validate_cert=False, fetch_type='js')

    @config(priority=1)
    def category_list_page(self, response):
        for each in response.doc('#itemSearchList a[target="_blank"][class="product_pic pro_img"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False, fetch_type='js')
        # follow the "next page" link; it is absent on the last page, hence the guard
        next_page = response.doc('#search_table > div.turnPageBottom > a.page_next').attr.href
        if next_page:
            self.crawl(next_page, callback=self.category_list_page, validate_cert=False, fetch_type='js')

    @config(priority=2)
    def detail_page(self, response):
        goods_id = response.doc('#gallery_view > ul > li.item_number').text()
        category_one = response.doc('body > div.wrap.clearfix > div > span:nth-child(3) > a').text()
        category_two = response.doc('body > div.wrap.clearfix > div > span:nth-child(5) > a').text()
        category_three = response.doc('body > div.wrap.clearfix > div > span:nth-child(7) > a').text()
        merchants = response.doc('div.middle_property > span:nth-child(1)').text()
        goods_name = response.doc('div.middle_property > h1').text()
        goods_desc = response.doc('div.middle_property > span.red.giftRed').text()
        goods_price = response.doc(
            'div.middle_property > div.shangpin_info > dl:nth-child(2) > dd > span.good_price').text()

        brand = response.doc(
            '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(2) > td:nth-child(2)').text()
        spec = response.doc(
            '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(2) > td:nth-child(4)').text()
        weight = response.doc(
            '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(3) > td:nth-child(2)').text()
        manufacturers = response.doc(
            '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(3) > td:nth-child(4)').text()
        approval_number = response.doc(
            '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(4) > td:nth-child(2)').text()
        drug_type = response.doc(
            '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(4) > td:nth-child(4)').text()

        # rows 3..21 of the product instruction table hold key/value pairs
        instructions = {}
        if response.doc('#prodDetailCotentDiv > table > tbody > tr:nth-child(1) > th').text():
            for i in range(3, 22):
                instructions_key = response.doc(
                    '#prodDetailCotentDiv > table > tbody > tr:nth-child({}) > th'.format(i)).text().split(" ")[0]
                instructions_value = response.doc(
                    '#prodDetailCotentDiv > table > tbody > tr:nth-child({}) > td'.format(i)).text()
                instructions[instructions_key] = instructions_value

        total_comments = response.doc('#itemComments > span').text()
        good_comments = response.doc('#productExperience > div > ul > li:nth-child(2) > a > span').text()
        mid_comments = response.doc('#productExperience > div > ul > li:nth-child(3) > a > span').text()
        bad_comments = response.doc('#productExperience > div > ul > li:nth-child(4) > a > span').text()

        # the second run of digits in the URL is the product id (the first is the "111" in the domain)
        url_id = re.findall(r'\d+', response.url)[1]

        goods_data = {
            'url_id': url_id,
            'goods_id': goods_id,
            'goods_name': goods_name,
            'goods_desc': goods_desc,
            'goods_price': goods_price,
            'merchants': merchants,
            'category': {
                '1': category_one,
                '2': category_two,
                '3': category_three
            },
            },
            'drug_detail': {
                'brand': brand,
                'spec': spec,
                'weight': weight,
                'manufacturers': manufacturers,
                'approval_number': approval_number,
                'drug_type': drug_type
            },
            'instructions': instructions,
            'comments': {
                'total_comments': total_comments,
                'good_comments': good_comments,
                'mid_comments': mid_comments,
                'bad_comments': bad_comments
            }
        }
        self.insert_goods(goods_data)
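
Once the handler has run, a quick pymongo session can confirm what was stored. A minimal sketch, assuming the same local mongod and the drug.goods collection populated above:

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')
goods = client.drug['goods']

# how many products have been captured so far
print(goods.count_documents({}))

# peek at a few stored records
for doc in goods.find().limit(3):
    print(doc['goods_id'], doc['goods_name'], doc['goods_price'])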

2. Scraping the comments
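
The comments are not embedded in the product page; they are served as HTML fragments by the review endpoint https://www.111.com.cn/interfaces/review/list/html.action, parameterized by goodsId and pageIndex. The script below first asks that endpoint for the total page count, then walks every page with requests, parses the rows with BeautifulSoup, and stores them in the drug.comments collection, using a commspider flag on each goods document to track progress.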

from pymongo import MongoClient
import requests
from bs4 import BeautifulSoup
import re


class Drug:
    def __init__(self):
        self.client = MongoClient('mongodb://localhost:27017')
        self.drug = self.client.drug
        self.collection = self.drug['goods']
        self.comm_collection = self.drug['comments']

    def dbmodify(self):
        # strip the scraped goods_id / goods_price text down to the bare values;
        # the separators below are assumptions about the page text (e.g. "商品编号：123456", "¥25.00")
        for data in self.collection.find({}, {"goods_id": 1, "goods_price": 1}):
            try:
                _id = data['_id']
                id = data['goods_id'].split('：')[1]
                price = data['goods_price'].split('¥')[1]
                self.collection.update_one({'_id': _id}, {'$set': {'goods_id': id, 'goods_price': price}})
                print(_id, id, price)
            except IndexError:
                pass



    def getBaseArgument(self, goods_id):
        # fetch the first review page and return the total number of pages (or a marker string)
        base_url = 'https://www.111.com.cn/interfaces/review/list/html.action'
        data = {
            'goodsId': goods_id,
            'pageIndex': 1,
            'score': '1&_19020301'
        }
        try:
            # mark this product as visited before fetching its reviews
            self.collection.update_one({'url_id': goods_id}, {'$set': {'commspider': True}})
            requests.packages.urllib3.disable_warnings()
            requests.adapters.DEFAULT_RETRIES = 5
            # disable keep-alive so connections are not held open between requests
            s = requests.session()
            s.keep_alive = False
            r = s.get(base_url, params=data, timeout=5, verify=False)
            r.close()
            soup = BeautifulSoup(r.text, 'html.parser')
            if soup.find_all("div", class_="view_no_result"):
                return "No Comments!"
            else:
                total_page_text = soup.find_all(text=re.compile(r'共\d+页'))[0]
                pattern = re.compile(r'\d+')
                total_page = pattern.findall(total_page_text)
                return total_page[0]
        except requests.exceptions.RequestException as e:
            print(e)

    def getCommlist(self, goods_id, total_page):
        # walk every review page of one product and store each comment row
        base_url = 'https://www.111.com.cn/interfaces/review/list/html.action'
        try:
            # +1 so the last page is not skipped
            for i in range(1, int(total_page) + 1):
                data = {
                    'goodsId': goods_id,
                    'pageIndex': i,
                    'score': '1&_19020301'
                }
                try:
                    requests.packages.urllib3.disable_warnings()
                    requests.adapters.DEFAULT_RETRIES = 15
                    # disable keep-alive so connections are not held open between requests
                    s = requests.session()
                    s.keep_alive = False
                    r = s.get(base_url, params=data, timeout=5, verify=False)
                    r.close()
                    soup = BeautifulSoup(r.text, 'html.parser')
                    for tr in soup.find_all("tr"):
                        comments = {}
                        try:
                            comments['goodsId'] = goods_id
                            comments['content'] = tr.find('p').text.strip()
                            comments['date'] = tr.find('p', attrs={'class': 'eval_date'}).text.strip()
                            self.comm_collection.insert_one(comments)
                        except Exception:
                            # some <tr> rows are not comment rows and may lack the expected <p> tags
                            print(goods_id + " has a problem!\n")
                        print(comments)
                except requests.exceptions.RequestException as e:
                    print(e)
        except ValueError:
            return "No Comments! Try next!"

    def getComments(self):
        i = 0
        goods_list = []
        # pick up only the products not yet visited by getBaseArgument
        for data in self.collection.find({'commspider': False}, {"url_id": 1}):
            id = data['url_id']
            goods_list.append(id)
        length = len(goods_list)
        print("{} products in total".format(length))
        for good in goods_list:
            total_page = self.getBaseArgument(good)
            comments = self.getCommlist(good, total_page)
            i = i + 1
            print("{} products in total\ncurrently at #{}\nproduct id {}\n".format(length, i, good))
            print(comments)


if __name__ == '__main__':
    Drug().getComments()
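
One caveat: getComments only selects documents whose commspider field is already False, and neither script initializes that field. A one-off backfill along the following lines (an assumption on my part; the original post does not show this step) makes the untouched products visible to the comment crawler:

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')
goods = client.drug['goods']

# assumed backfill: flag every product that has never been visited as "not crawled"
goods.update_many({'commspider': {'$exists': False}}, {'$set': {'commspider': False}})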
Original post: https://www.cnblogs.com/tjp40922/p/10611624.html