抓取国家地区基础数据

一、介绍

1.1 背景

项目中地区基础数据从上线后再也没有更新过,与目前的国家地区数据不一致,需要做同步操作。

1.2 解决方案

通过采用爬虫手段从 国家统计局官网 抓取数据做同步。

采用 WebMagic 框架实现操作

二、操作

2.1 创建本地临时表

CREATE TABLE `ba_region` (  
  `region_id` varchar(10) COLLATE utf8mb4_bin NOT NULL COMMENT '地区代码',
  `region_name` varchar(255) COLLATE utf8mb4_bin NOT NULL COMMENT '地区名称',
  `region_type` varchar(2) COLLATE utf8mb4_bin NOT NULL COMMENT '地区级别',
  `region_Parent` varchar(10) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '上级地区代码',
  `province_id` varchar(10) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '省代码',
  `province_name` varchar(255) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '省名称',
  `city_id` varchar(10) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '市代码',
  `city_name` varchar(255) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '市名称',
  `district_id` varchar(10) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '区/县代码',
  `district_name` varchar(255) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '区/县名称',
  `street_id` varchar(10) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '乡/镇/街代码',
  `street_name` varchar(255) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '乡/镇/街名称',
  `creator` varchar(30) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '创建人员',
  `create_time` datetime DEFAULT NULL COMMENT '创建时间',
  `modifier` varchar(30) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '修改人员',
  `modify_time` datetime DEFAULT NULL COMMENT '修改时间',
  `ip_address` varchar(30) COLLATE utf8mb4_bin DEFAULT NULL COMMENT 'IP地址',
  PRIMARY KEY (`id`),
  KEY `inx_region_parent` (`region_Parent`) USING BTREE,
  KEY `inx_region_type` (`region_type`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=47122 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin COMMENT='地区基础数据';

2.2 添加相关依赖和配置

<dependencies>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-core</artifactId>
        <version>0.7.3</version>
    </dependency>
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-extension</artifactId>
        <version>0.7.3</version>
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
        </exclusions>
    </dependency>

    <dependency>
        <groupId>org.mybatis.spring.boot</groupId>
        <artifactId>mybatis-spring-boot-starter</artifactId>
        <version>2.0.0</version>
    </dependency>

    <!--mysql 驱动依赖-->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <scope>runtime</scope>
    </dependency>
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <version>1.18.10</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
       <groupId>org.apache.httpcomponents</groupId>
       <artifactId>httpclient</artifactId>
       <version>4.5.2</version>
   </dependency>
   <dependency>
       <groupId>org.apache.httpcomponents</groupId>
       <artifactId>httpcore</artifactId>
       <version>4.4.4</version>
   </dependency>
</dependencies>

数据库配置:

spring:
  datasource:
    driver-class-name: com.mysql.cj.jdbc.Driver
    url: jdbc:mysql://localhost:3306/test?serverTimezone=UTC&useUnicode=true&useSSL=false&useUnicode=true&characterEncoding=utf8
    username: root
    password: sa000

2.3 实现抓取逻辑

  1. 新增自定义页面处理组件
public class RegionPageProcessor implements PageProcessor {

    private Site site = Site.me()
            .setCharset("GBK")
            // 设置置重试次数
            .setRetryTimes(3)
            // 设置循环重试次数
            .setCycleRetryTimes(3)
            // 设置抓取间隔
            .setSleepTime(100)
            // 设置超时时间,单位毫秒
            .setTimeOut(10000);

    private String SPIDER_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/";


    /**
     * 获取搜寻器配置
     *
     * @return
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * 处理
     *
     * @param page
     */
    @Override
    public void process(Page page) {
        page.setCharset("UTF-8");

        final Map<RegionType, List<Selectable>> map = groupingNode(page);
        for (RegionType regionType : map.keySet()) {
            final List<Selectable> selectables = map.get(regionType);
            if (selectables == null || selectables.size() <= 0) {
                continue;
            }

            List<BaRegion> regionList = conversion(page, regionType, selectables);
            if (regionList.size() <= 0) {
                continue;
            }
            page.putField("data", regionList);
        }

    }


    private Map<RegionType, List<Selectable>> groupingNode(Page page) {
        Map<RegionType, List<Selectable>> map = new HashMap<>();
        addNodeGroup(map, page, PROVINCE);
        addNodeGroup(map, page, RegionType.CITY);
        addNodeGroup(map, page, RegionType.COUNTY);
        addNodeGroup(map, page, RegionType.TOWN);
        return map;
    }

    private void addNodeGroup(Map<RegionType, List<Selectable>> map, Page page, RegionType regionType) {
        final List<Selectable> cityNodes = page.getHtml().xpath("//tr[@class='" + regionType.getClassName() + "']/td").nodes();
        if (cityNodes != null && cityNodes.size() > 0) {
            map.put(regionType, cityNodes);
        }
    }


    private List<BaRegion> conversion(Page page, RegionType regionType, List<Selectable> selectables) {
        List<BaRegion> result = new ArrayList<>();
        String code = "";
        String nodeCode = "";
        for (Selectable selectable : selectables) {
            final List<String> all = selectable.xpath("//*/text()").all();
            if (all == null || all.size() <= 0) {
                continue;
            }
            String nodeName = all.get(0);
            if (all.size() > 1) {
                nodeName = all.get(1);
            }

            if (StringUtils.isBlank(nodeName)) {
                continue;
            }
            if (compile("[0-9]*").matcher(nodeName).matches()) {
                code = nodeName;
                continue;
            }
            final List<String> links = selectable.links().all();
            // 部分市辖区下面没有区域
            if (links.size() <= 0) {
                if (StringUtils.isBlank(code)) {
                    continue;
                }
                if (RegionType.COUNTY.equals(regionType)) {
                    nodeCode = (code.substring(0, 2) + "/" + code.substring(2, 4) + "/" + code.substring(0, 6));
                }

            } else {
                // 获取下一级 URL
                String newUrl = links.get(0);
                boolean nextLevelSwitch = !RegionType.TOWN.equals(regionType) && StringUtils.isNotBlank(newUrl);
                if (nextLevelSwitch) {
                    page.addTargetRequest(newUrl);
                }
                nodeCode = newUrl.replace(SPIDER_URL, "").replace(".html", "");

            }
            if (StringUtils.isBlank(nodeCode)) {
                continue;
            }
            // 组装数据
            BaRegion baRegion = toBaRegion(regionType, nodeCode, nodeName);
            result.add(baRegion);
        }
        return result;
    }

    private BaRegion toBaRegion(RegionType regionType, String code, String nodeName) {
        BaRegion region = new BaRegion();
        region.setRegionType(regionType.getCode());
        region.setRegionName(nodeName);

        String[] split;
        String parentId;
        String regionId;

        switch (regionType) {
            case PROVINCE:
                region.setRegionId(code + "0000");
                region.setRegionParent("1000");
                break;
            case CITY:
                split = code.split("/");
                parentId = split[0] + "0000";
                regionId = split[split.length - 1] + "00";
                region.setRegionId(regionId);
                region.setRegionParent(parentId);
                break;
            case COUNTY:
                split = code.split("/");
                parentId = split[0] + split[1] + "00";
                regionId = split[2];
                region.setRegionId(regionId);
                region.setRegionParent(parentId);
                break;
            case TOWN:
                split = code.split("/");
                if (split.length == 3) {// 针对上一级直接是城市的地区
                    parentId = split[0] + split[1] + "00";
                    regionId = split[2];
                } else {
                    parentId = split[0] + split[1] + split[2];
                    regionId = split[3];
                }
                region.setRegionId(regionId);
                region.setRegionParent(parentId);
                break;
            default:
                break;
        }
        return region;
    }


}

涉及枚举类:

public enum RegionType {
    PROVINCE("10", "provincetr", "省份"),
    CITY("20", "citytr", "城市"),
    COUNTY("30", "countytr", "区/县"),
    TOWN("40", "towntr", "镇/街道"),
    VILLAGE("50", "villagetr", "居委会"),
    ;

    private String code;
    private String className;
    private String desc;

    RegionType(String code, String className, String desc) {
        this.code = code;
        this.className = className;
        this.desc = desc;
    }

    public String getCode() {
        return code;
    }

    public String getDesc() {
        return desc;
    }

    public String getClassName() {
        return className;
    }
}

  1. 实现结果持久化到数据库
@Component
public class SqlPipeline implements Pipeline {

    @Autowired
    private BaRegionMapper baRegionMapper;

    @Override
    public void process(ResultItems resultItems, Task task) {
        List<BaRegion> baRegionList = resultItems.get("data");
        if (baRegionList == null || baRegionList.size() <= 0) {
            System.out.println("抓取数据不存在");
            return;
        }
        for (BaRegion baRegion : baRegionList) {
            baRegionMapper.add(baRegion);
        }
    }
}

DAO 实现如下:

@Mapper
public interface BaRegionMapper {
    @Insert("INSERT INTO ba_region VALUES(#{regionId},#{regionName},#{regionType},#{regionParent}," +
            "#{provinceId},#{provinceName},#{cityId},#{cityName},#{districtId},#{districtName},#{streetId}," +
            "#{streetName},#{creator},#{createTime},#{modifier},#{modifyTime},#{ipAddress})")
    int add(BaRegion region);
}
  1. 默认的页面下载实现类 HttpClientDownloader 没有记录下载失败的URL ,我们继承 HttpClientDownloader 实现 onError 方法记录异常 URL 列表:
@ThreadSafe
@Component
public class HttpClientDownloader2 extends HttpClientDownloader {
    
    private static List<String> errorUrls = new ArrayList<>();

    @Override
    protected void onError(Request request) {
        super.onError(request);
        errorUrls.add(request.getUrl());
    }

    public List<String> getErrorUrls() {
        return errorUrls;
    }

}
  1. 执行抓取:
@SpringBootTest
class ReptileApplicationTests {

    @Autowired
    private SqlPipeline sqlPipeline;
    @Autowired
    private HttpClientDownloader2 httpClientDownloader2;

    @Test
    public void start() {
        exec("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html");
		// 异常URL 再次尝试抓取
        final List<String> errorUrls = httpClientDownloader2.getErrorUrls();
        if (errorUrls != null && errorUrls.size() > 0) {
            for (String errorUrl : errorUrls) {
                errorUrls.remove(errorUrl);
                exec(errorUrl);
            }
        }
        // 二次抓取失败的异常URL,打印输出
        final List<String> errorUrls2 = httpClientDownloader2.getErrorUrls();
        for (String errorUrl : errorUrls2) {
            System.out.println(errorUrl);
        }
    }

    private void exec(String url) {
        List<SpiderListener> spiderListeners = new ArrayList<>();
        // 记录异常的URL
        spiderListeners.add(new ErrorLogSpiderListener());
        Spider.create(new RegionPageProcessor())
                .addUrl(url)
                .setDownloader(httpClientDownloader2)
                .addPipeline(sqlPipeline).run();

    }
}

2.4 整理数据

  1. 去除重复数据
-- 创建自增ID 
ALTER TABLE `test`.`ba_region` 
ADD COLUMN `id` bigint(20) NOT NULL AUTO_INCREMENT FIRST,
ADD PRIMARY KEY (`id`);
-- 删除重复数据
delete from ba_region where id in(
    select id from (
        select max(id)as id from ba_region 
        	where region_id in(
                	select region_id from ba_region  
                		group by region_id HAVING count(1) > 1
            ) 
        group by region_id) a
)
  1. 手动添加国家信息
INSERT INTO `ba_region`(`region_id`, `region_name`, `region_type`, `region_Parent`, `province_id`, `province_name`, `city_id`, `city_name`, `district_id`, `district_name`, `street_id`, `street_name`, `creator`, `create_time`, `modifier`, `modify_time`, `ip_address`) VALUES ('1000', '中国', '00', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
  1. 更新省级数据
update ba_region set region_Parent='1000',province_id=region_id,province_name=  region_name  where region_type='10';
  1. 更新市级信息
-- 更新城市字段
update ba_region set city_id =region_id,city_name = region_name   where region_type='20';
-- 更新省份字段
update ba_region a,ba_region b set a.province_id=b.region_id,a.province_name= b.region_name
where a.region_type='20' and a.region_Parent = b.region_id   ;
  1. 更新区/县级信息
-- 更新区/县字段
update ba_region set district_id =region_id,district_name = region_name   where region_type='30';
-- 更新省份、城市字段
update ba_region a,ba_region b set  a.province_id=b.province_id,a.province_name= b.province_name, a.city_id=b.region_id,a.city_name= b.region_name
where a.region_type='30' and a.region_Parent = b.region_id;

6) 更新镇/街道信息

-- 更新镇/街道字段
update ba_region set street_id =region_id,street_name = region_name   where region_type='40';
-- 更新省份、城市、区/县字段
update ba_region a,ba_region b set  a.province_id=b.province_id,a.province_name= b.province_name, a.city_id=b.city_id,a.city_name= b.city_name,a.district_id=b.region_id,a.district_name= b.region_name
where a.region_type='40' and a.region_Parent = b.region_id;       
  1. 删除自增字段
ALTER TABLE `test`.`ba_region` 
DROP COLUMN `id`,
DROP PRIMARY KEY;
原文地址:https://www.cnblogs.com/markLogZhu/p/13557933.html