Scraping company data for Anhui Province from the Qichacha website

The Qichacha website aggregates detailed information about registered companies. To make it easier to look up company data, I crawled the companies registered in Anhui Province. The problems I ran into and the techniques I used are described below:

1. Problems encountered:

  1> The PC version of Qichacha only exposes the first 500 pages of results. To maximize coverage, the crawl is split up by city, which yielded roughly 80,000 company records across the 16 prefecture-level cities of Anhui Province;

  2> If pages are requested too quickly, the site throws up a manual verification step. To get around it, and to avoid having the account blocked, every request goes through a randomly rotated proxy IP. Free proxies are available from the 89ip site (http://www.89ip.cn/), which hands out 30 proxy IPs at a time; if that is not enough, you can pull several batches and build a proxy pool (a minimal sketch of such a pool follows this list). In my experience the free proxies there are noticeably better than those from the Xici or Kuaidaili sites.

    
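The helper modules referenced later in the core code (ippools.ProxySpider and proxy_ip.IP_LIST) are not included in the post. The snippet below is a minimal sketch of what a proxy_ip-style module could look like, built from the 89ip free-proxy page; the page layout, the regular expression, and the helper names fetch_free_proxies / check_proxy are illustrative assumptions, not the code actually used.

import re
import requests


def fetch_free_proxies():
    """Scrape candidate ip:port pairs from the 89ip front page (table layout assumed)."""
    html = requests.get('http://www.89ip.cn/', timeout=5).text
    pairs = re.findall(r'(\d{1,3}(?:\.\d{1,3}){3})</td>\s*<td>\s*(\d{2,5})', html)
    return [{'http': 'http://{}:{}'.format(ip, port),
             'https': 'http://{}:{}'.format(ip, port)} for ip, port in pairs]


def check_proxy(proxy):
    """Keep only proxies that can actually reach the target site."""
    try:
        return requests.get('https://www.qichacha.com/', proxies=proxy,
                            timeout=3).status_code == 200
    except requests.RequestException:
        return False


# Pull a few batches (the site paginates) and keep only the proxies that pass the check.
IP_LIST = [p for p in fetch_free_proxies() if check_proxy(p)]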

2. Techniques used:

  1> Request layer: requests. To dodge the anti-crawling checks, each request uses a random proxy, and fake_useragent generates a random User-Agent;

  2> Parsing: XPath and regular expressions;

  3> Speed: multithreading, plus buffering the scraped records and writing them to disk in one pass to avoid frequent disk I/O (see the buffered-save sketch after the core code);

3. Core code:

  

import requests
from lxml import etree
from queue import Queue
from threading import Thread
from fake_useragent import UserAgent
import csv
import os
import re
import random
import time
from ippools import ProxySpider
from proxy_ip import IP_LIST


class QichachaSpider:
    def __init__(self):
        self.url = 'https://www.qichacha.com/gongsi_area.html?prov={}&city={}&p={}'
        self.q = Queue()
        self.company_info = []
        self.headers = {
            'Host': 'www.qichacha.com',
            'Referer': 'https://www.qichacha.com/',
            'X-Requested-With': 'XMLHttpRequest'
        }

    # Generate a random User-Agent
    def random_ua(self):
        ua = UserAgent()
        return ua.random

    # Pick a random proxy IP
    def random_proxy(self):
        proxy_list = ProxySpider().get_training_ip('https://www.qichacha.com/')
        return proxy_list

    # Build the listing URLs and put them into the queue
    def put_url(self):
        self.headers['User-Agent'] = self.random_ua()
        url = 'https://www.qichacha.com/'
        html = requests.get(url, headers=self.headers).content.decode('utf-8', 'ignore')
        parse_html = etree.HTML(html)
        r_list = parse_html.xpath('//div[@class="areacom"]/div[2]/div[2]/a/@href')
        for r in r_list:
            link = r.split('_')[1:]
            for i in range(1, 501):
                url = self.url.format(link[0], link[1], i)
                print(url)
                self.q.put(url)

    # Crawl the listing (first-level) pages
    def get_data(self):
        while True:
            if not self.q.empty():
                url = self.q.get()
                self.headers['User-Agent'] = self.random_ua()
                # proxies = self.random_proxy()
                proxies = random.choice(IP_LIST)
                try:
                    html = requests.get(url, headers=self.headers, proxies=proxies, timeout=3).content.decode('utf-8',
                                                                                                              'ignore')
                    # html = requests.get(url, headers=self.headers).content.decode('utf-8', 'ignore')
                    # time.sleep(random.uniform(0.5, 1.5))
                    parse_html = etree.HTML(html)
                    company_list = parse_html.xpath('//table[@class="m_srchList"]/tbody/tr')
                    if company_list is not None:
                        for company in company_list:
                            try:
                                company_name = company.xpath('./td[2]/a/text()')[0].strip()
                                company_link = 'https://www.qichacha.com' + company.xpath('./td[2]/a/@href')[0].strip()
                                company_type, company_industry, company_business_scope = self.get_company_info(
                                    company_link)
                                company_person = company.xpath('./td[2]/p[1]/a/text()')[0].strip()
                                # The split delimiter was lost in the original post; the listing text
                                # looks like "注册资本：5000万元", so the full-width colon is assumed here.
                                company_money = company.xpath('./td[2]/p[1]/span[1]/text()')[0].split('：')[-1].strip()
                                company_time = company.xpath('./td[2]/p[1]/span[2]/text()')[0].split('：')[-1].strip()
                                company_email = company.xpath('./td[2]/p[2]/text()')[0].split('：')[-1].strip()
                                company_phone = company.xpath('./td[2]/p[2]/span/text()')[0].split('：')[-1].strip()
                                company_address = company.xpath('./td[2]/p[3]/text()')[0].split('：')[-1].strip()
                                company_status = company.xpath('./td[3]/span/text()')[0].strip()
                                company_dict = {
                                    '公司名称': company_name,
                                    '公司链接': company_link,
                                    '公司类型': company_type,
                                    '所属行业': company_industry,
                                    '经营范围': company_business_scope,
                                    '公司法人': company_person,
                                    '注册资本': company_money,
                                    '注册时间': company_time,
                                    '邮箱': company_email,
                                    '电话': company_phone,
                                    '地址': company_address,
                                    '是否存续': company_status,
                                }
                                print(company_dict)
                                # self.company_info.append(
                                #     (company_name, company_link, company_type, company_industry, company_business_scope,
                                #      company_person, company_money, company_time, company_email, company_phone,
                                #      company_address, company_status))
                                info_list = [company_name, company_link, company_type, company_industry,
                                             company_business_scope, company_person, company_money, company_time,
                                             company_email, company_phone, company_address, company_status]
                                self.save_data(info_list)

                            except:
                                with open('./bad.csv', 'a', encoding='utf-8', newline='') as f:
                                    writer = csv.writer(f)
                                    # writerow() expects a sequence, so wrap the URL in a list
                                    writer.writerow([url])
                                continue
                except:
                    self.q.put(url)

            else:
                break

    # Crawl the company detail (second-level) page
    def get_company_info(self, company_link):
        headers = {'User-Agent': UserAgent().random}
        html = requests.get(company_link, headers=headers, proxies=random.choice(IP_LIST), timeout=3).content.decode(
            'utf-8', 'ignore')
        while True:
            if '企业类型' not in html:
                html = requests.get(company_link, headers=headers, proxies=random.choice(IP_LIST),
                                    timeout=3).content.decode(
                    'utf-8', 'ignore')
            else:
                break
        try:
            company_type = re.findall(r'企业类型</td> <td class="">(.*?)</td>', html, re.S)[0].strip()
            company_industry = re.findall(r'所属行业</td> <td class="">(.*?)</td>', html, re.S)[0].strip()
            company_business_scope = re.findall(r'经营范围.*?"3">(.*?)</td>', html, re.S)[0].strip()
            return company_type, company_industry, company_business_scope
        except:
            return '', '', ''

    # Save a single record to CSV
    def save_data(self, info):
        with open('./1111.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(info)

    def main(self):
        # Start from a fresh CSV file and always write the header row first
        if os.path.exists('./1111.csv'):
            os.remove('./1111.csv')
        with open('./1111.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(
                ['公司名称', '公司链接', '公司类型', '所属行业', '经营范围', '公司法人', '注册资本', '注册时间', '邮箱', '电话', '地址', '是否存续'])
        self.put_url()
        t_list = []
        for i in range(0, 10):
            t = Thread(target=self.get_data)
            t_list.append(t)
            t.start()

        for j in t_list:
            j.join()


if __name__ == "__main__":
    spider = QichachaSpider()
    spider.main()

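Although self.company_info is declared as a buffer (and the commented-out append shows the original intent), the code above actually writes every record to disk as soon as it is parsed. The one-pass saving described in section 2 would look roughly like the sketch below; collect, flush and the Lock are illustrative names added here, not part of the original script. Worker threads append to a shared list under a lock, and the list is written out once after all threads have joined.

import csv
from threading import Lock

write_lock = Lock()
buffered_rows = []  # rows accumulate here instead of hitting the disk one by one


def collect(row):
    """Called by each worker thread for every parsed record."""
    with write_lock:
        buffered_rows.append(row)


def flush(path='./1111.csv'):
    """Write the whole buffer in a single pass once all threads have joined."""
    with open(path, 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows(buffered_rows)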
To push the crawl rate further (and, frankly, to make the project look more polished), the same crawl is reimplemented below with the Scrapy framework. The code is as follows:

1. items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class QichachaItem(scrapy.Item):
    company_name = scrapy.Field()
    company_person = scrapy.Field()
    company_money = scrapy.Field()
    company_establish = scrapy.Field()
    company_email = scrapy.Field()
    company_phone = scrapy.Field()
    company_address = scrapy.Field()
    company_risk = scrapy.Field()
    company_status = scrapy.Field()
    company_type = scrapy.Field()
    company_trade = scrapy.Field()
    company_business_scope = scrapy.Field()
    company_link = scrapy.Field()
    company_city = scrapy.Field()

2. qichacha.py

  

# -*- coding: utf-8 -*-
import scrapy
import time
import random
import re
from ..items import QichachaItem


class QichachaSpider(scrapy.Spider):
    name = 'qichacha'
    allowed_domains = ['www.qichacha.com']
    base_url = 'https://www.qichacha.com/gongsi_area.html?prov=AH&city={}&p={}'
    city_code_list = [340100, 340200, 340300, 340400, 340500, 340600, 340700, 340800, 341000,
                      341100, 341200, 341300, 341500, 341600, 341700, 341800]
    city_name_list = ['合肥市', '芜湖市', '蚌埠市', '淮南市', '马鞍山市', '淮北市', '铜陵市', '安庆市', '黄山市', '滁州市', '阜阳市', '宿州市', '六安市', '亳州市',
                      '池州市', '宣城市']
    base_company_url = 'https://www.qichacha.com{}'

    def start_requests(self):
        for i in range(len(self.city_code_list)):
            for j in range(1, 501):
                item = QichachaItem()
                item['company_city'] = self.city_name_list[i]
                url = self.base_url.format(self.city_code_list[i], j)
                yield scrapy.Request(
                    url=url,
                    meta={'item': item},
                    callback=self.parse_page
                )
            time.sleep(random.randint(30, 60))

    def parse_page(self, response):
        item = response.meta['item']
        company_list = response.xpath('//*[@id="searchlist"]/table/tbody/tr')
        for company in company_list:
            item['company_name'] = company.xpath('td[2]/a/text()').extract_first()
            item['company_link'] = self.base_company_url.format(company.xpath('td[2]/a/@href').extract_first())
            item['company_person'] = company.xpath('td[2]/p[1]/a/text()').extract_first()
            # As in the threaded version, the split delimiter was lost in the original post;
            # the full-width colon is assumed here.
            item['company_money'] = company.xpath('td[2]/p[1]/span[1]/text()').extract_first().split('：')[-1]
            item['company_establish'] = company.xpath('td[2]/p[1]/span[2]/text()').extract_first().split('：')[-1]
            item['company_email'] = company.xpath('td[2]/p[2]/text()').extract_first().split('：')[-1].strip()
            item['company_phone'] = company.xpath('td[2]/p[2]/span/text()').extract_first().split('：')[-1]
            item['company_address'] = company.xpath('td[2]/p[3]/text()').extract_first().split('：')[-1].strip()
            item['company_status'] = company.xpath('td[3]/span/text()').extract_first().split('：')[-1]
            yield scrapy.Request(
                url=item['company_link'],
                meta={'item': item},
                callback=self.parse_company
            )
        time.sleep(random.randint(10, 20))

    def parse_company(self, response):
        item = response.meta['item']
        html = response.text
        if re.findall(r'<h2>经营风险.*?<span>(.*?)</span>', html, re.S):
            item['company_risk'] = re.findall(r'<h2>经营风险.*?<span>(.*?)</span>', html, re.S)[0].strip()
        else:
            item['company_risk'] = '-'
        if re.findall(r'企业类型</td> <td class="">(.*?)</td>', html, re.S):
            item['company_type'] = re.findall(r'企业类型</td> <td class="">(.*?)</td>', html, re.S)[0].strip()
        else:
            item['company_type'] = '-'
        if re.findall(r'所属行业</td> <td class="">(.*?)</td>', html, re.S):
            item['company_trade'] = re.findall(r'所属行业</td> <td class="">(.*?)</td>', html, re.S)[0].strip()
        else:
            item['company_trade'] = '-'
        if re.findall(r'经营范围</td> <td class="" colspan="3">(.*?)</td>', html, re.S):
            item['company_business_scope'] = re.findall(r'经营范围</td> <td class="" colspan="3">(.*?)</td>', html, re.S)[
                0].strip()
        else:
            item['company_business_scope'] = '-'

        yield item
        time.sleep(random.uniform(0.5, 1))

3. pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from .settings import *


class QichachaPipeline(object):
    def process_item(self, item, spider):
        print([item['company_name'], item['company_person'], item['company_money'], item['company_establish'],
               item['company_email'], item['company_phone'], item['company_address'], item['company_risk'],
               item['company_status'], item['company_type'], item['company_trade'], item['company_link'],
               item['company_city'], item['company_business_scope']])

        return item


class MysqlPipeline(object):
    def open_spider(self, spider):
        self.db = pymysql.connect(host=MYSQL_HOST, port=MYSQL_PORT, user=MYSQL_USER, password=MYSQL_PWD,
                                  database=MYSQL_DB, charset='utf8')
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        ins = 'insert into qichachatab values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        info_list = [item['company_name'], item['company_city'], item['company_person'], item['company_money'],
                     item['company_establish'], item['company_email'], item['company_phone'], item['company_address'],
                     item['company_risk'], item['company_status'], item['company_type'], item['company_trade'],
                     item['company_link'], item['company_business_scope']]
        self.cursor.execute(ins, info_list)
        self.db.commit()

        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()
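The qichachatab table itself is never defined in the post. The INSERT statement above expects 14 columns in the order built in info_list, so a schema along the following lines should work; the column names and types here are assumptions chosen to match that order.

import pymysql

create_sql = """
CREATE TABLE IF NOT EXISTS qichachatab (
    company_name VARCHAR(200),
    company_city VARCHAR(50),
    company_person VARCHAR(100),
    company_money VARCHAR(50),
    company_establish VARCHAR(50),
    company_email VARCHAR(100),
    company_phone VARCHAR(50),
    company_address VARCHAR(300),
    company_risk VARCHAR(50),
    company_status VARCHAR(50),
    company_type VARCHAR(100),
    company_trade VARCHAR(100),
    company_link VARCHAR(300),
    company_business_scope TEXT
) CHARACTER SET utf8;
"""

db = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='123456',
                     database='qichachadb', charset='utf8')
with db.cursor() as cursor:
    cursor.execute(create_sql)
db.commit()
db.close()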

4. middlewares.py

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class QichachaSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class QichachaDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


from fake_useragent import UserAgent


class RandomUserAgentDownloaderMiddleware(object):
    def process_request(self, request, spider):
        ua = UserAgent().random
        request.headers['User-Agent'] = ua


import random
import redis
from .settings import *
from .proxies import ProxypoolSpider


class RandomProxyDownloaderMiddleware(object):
    def __init__(self):
        self.db = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, password=REDIS_PWD)

    def process_request(self, request, spider):
        proxy_list = self.db.zrangebyscore(REDIS_PROXY_KEY, 90, 100, withscores=True)
        if len(proxy_list) == 0:
            # Refill the pool and re-read it, otherwise random.choice() fails on an empty list
            proxy_spider = ProxypoolSpider()
            proxy_spider.get_proxy()
            proxy_list = self.db.zrangebyscore(REDIS_PROXY_KEY, 90, 100, withscores=True)
        proxy = random.choice(proxy_list)[0].decode('utf-8')
        request.meta['proxy'] = proxy

    def process_response(self, request, response, spider):
        print(response.status, ": ", request.url)
        # The middleware must pass the response on; returning the request here would re-queue every page
        return response

    def process_exception(self, request, exception, spider):
        cur_proxy = request.meta['proxy']
        print('proxy request failed, lowering its score and switching proxy')
        self.db.zincrby(REDIS_PROXY_KEY, -1, cur_proxy)
        del request.meta['proxy']
        return request
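The proxies.py module with ProxypoolSpider is also not shown. The middleware above only assumes that get_proxy() refills the Redis sorted set named by REDIS_PROXY_KEY, where each member is a proxy URL and the score starts high and is decremented on failure. Below is a minimal sketch under those assumptions; the 89ip page layout and the regular expression are guesses for illustration.

import re
import redis
import requests
from .settings import REDIS_HOST, REDIS_PORT, REDIS_DB, REDIS_PWD, REDIS_PROXY_KEY


class ProxypoolSpider(object):
    """Fill the Redis sorted set consumed by RandomProxyDownloaderMiddleware."""

    def __init__(self):
        self.db = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, password=REDIS_PWD)

    def get_proxy(self):
        # The free-proxy page layout is assumed; adjust the regex if it changes.
        html = requests.get('http://www.89ip.cn/', timeout=5).text
        for ip, port in re.findall(r'(\d{1,3}(?:\.\d{1,3}){3})</td>\s*<td>\s*(\d{2,5})', html):
            proxy = 'http://{}:{}'.format(ip, port)
            # New proxies enter the pool with the top score of 100.
            self.db.zadd(REDIS_PROXY_KEY, {proxy: 100})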

5. settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for Qichacha project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import time
import random

BOT_NAME = 'Qichacha'

SPIDER_MODULES = ['Qichacha.spiders']
NEWSPIDER_MODULE = 'Qichacha.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 10

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# time.sleep() returns None (and blocks once at import time); a numeric value is what Scrapy expects
DOWNLOAD_DELAY = random.uniform(10, 20)
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'Cookie': 'UM_distinctid=16bb7adb9252d8-09b3389bed6ae2-3a65420e-1fa400-16bb7adb92636e; zg_did=%7B%22did%22%3A%20%2216bb7adbb84740-04a7e287a3fa12-3a65420e-1fa400-16bb7adbb85669%22%7D; _uab_collina=156215474498922246746771; zg_63e87cf22c3e4816a30bfbae9ded4af2=%7B%22sid%22%3A%201562193465906%2C%22updated%22%3A%201562193465917%2C%22info%22%3A%201562193465914%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%7D; QCCSESSID=lnr0huo5t5s058h9tmlso56nu1; acw_tc=65e21c2915648350628141700eeaf85114e84964375db3a9f1b718d751; CNZZDATA1254842228=845561946-1562153840-https%253A%252F%252Fwww.baidu.com%252F%7C1565064428; hasShow=1; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1565048078,1565048449,1565048590,1565067806; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201565067805845%2C%22updated%22%3A%201565069298085%2C%22info%22%3A%201564658796236%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22sp0.baidu.com%22%2C%22cuid%22%3A%20%22d48f6830513b318400fcc23636a23a7f%22%7D; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1565069298',
    'Referer': 'https://www.qichacha.com/',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'Qichacha.middlewares.QichachaSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'Qichacha.middlewares.QichachaDownloaderMiddleware': 543,
    # 'Qichacha.middlewares.RandomUserAgentDownloaderMiddleware': 200,
    # 'Qichacha.middlewares.RandomProxyDownloaderMiddleware': 250,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Qichacha.pipelines.QichachaPipeline': 300,
    'Qichacha.pipelines.MysqlPipeline': 100,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_DB = 'qichachadb'
MYSQL_USER = 'root'
MYSQL_PWD = '123456'

# Redis connection settings
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
REDIS_PWD = '123456'
REDIS_DB = 0
REDIS_PROXY_KEY = 'proxy'

# Logging
LOG_LEVEL = 'WARNING'
# LOG_FILE = 'qichacha.log'

# Export encoding
FEED_EXPORT_ENCODING = 'utf-8'
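Note that the random User-Agent and random proxy downloader middlewares from middlewares.py are commented out in DOWNLOADER_MIDDLEWARES above, so as posted the crawl relies on the static USER_AGENT and Cookie. To actually rotate User-Agents and proxies, enable them, for example:

DOWNLOADER_MIDDLEWARES = {
    'Qichacha.middlewares.RandomUserAgentDownloaderMiddleware': 200,
    'Qichacha.middlewares.RandomProxyDownloaderMiddleware': 250,
    'Qichacha.middlewares.QichachaDownloaderMiddleware': 543,
}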

6. run.py

from scrapy import cmdline

cmdline.execute('scrapy crawl qichacha'.split())

    

Original post: https://www.cnblogs.com/yuxiangyang/p/11255947.html