C#网页爬虫抓取行政区划

参考《C#网页爬虫抓取行政区划》一文,从国家统计局网站抓取了最新的行政区划数据。

以下为代码片段:

数据库类:

/// <summary>
/// Plain data holder for one administrative region; its properties correspond to
/// the columns written to the <c>base_city</c> table.
/// </summary>
public class City
{
    /// <summary>Numeric identifier of this region.</summary>
    public decimal ID { get; set; }

    /// <summary>Display name of the region.</summary>
    public string Name { get; set; }

    /// <summary>Administrative division code of the region.</summary>
    public string Code { get; set; }

    /// <summary>Hierarchy level: "1" country, "2" province, "3" city, "4" county.</summary>
    public string Org_Level { get; set; }

    /// <summary>Administrative division code of the parent region.</summary>
    public string ParentCode { get; set; }

    /// <summary>Identifier of the parent region.</summary>
    public decimal ParentID { get; set; }

    /// <summary>Country name (the spelling is kept because it matches the column name).</summary>
    public string Contry { get; set; }

    /// <summary>NOTE(review): presumably the X coordinate of the region; never assigned in the visible code.</summary>
    public string Loc_x { get; set; }

    /// <summary>NOTE(review): presumably the Y coordinate of the region; never assigned in the visible code.</summary>
    public string Loc_y { get; set; }
}

获取网页帮助类:

 /// <summary>
 /// Small helper for downloading the text of a web page.
 /// </summary>
 public class HttpHelper {
    private static readonly ILog log = log4net.LogManager.GetLogger(typeof(HttpHelper));

    /// <summary>
    /// Requests <paramref name="url"/> and decodes the response body with
    /// <paramref name="encod"/>.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="encod">Encoding used to decode the response stream.</param>
    /// <returns>
    /// The response text, or an empty string when the request fails, the status is
    /// not 200 OK, or the body cannot be read. Failures are logged, not thrown.
    /// </returns>
    public static string DownloadHtml(string url,Encoding encod) {
      string html = string.Empty;
      try {
        // A direct cast (instead of "as") makes a non-HTTP URL fail with a clear
        // InvalidCastException, which is logged below, rather than a null dereference.
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = 10 * 1000; // 10 second timeout
        request.ContentType = "text/html;charset=utf-8";
        request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36";

        using(HttpWebResponse resp = (HttpWebResponse)request.GetResponse()) {
          if(resp.StatusCode != HttpStatusCode.OK) {
            log.Fatal(string.Format("抓取{0}地址返回失败,response.StatusCode = {1}",url,resp.StatusCode));
            return html;
          }

          try {
            // "using" guarantees the stream and reader are released even if reading throws.
            using(Stream body = resp.GetResponseStream())
            using(StreamReader sr = new StreamReader(body,encod)) {
              html = sr.ReadToEnd();
            }
          } catch(Exception e) {
            log.Fatal(string.Format("DownLoadHtml抓取html{0}保存失败",url),e);
          }
        }
      } catch(Exception e) {
        // Best-effort download: log and fall through to return the empty string.
        log.Fatal(e);
      }
      return html;
    }
  }

数据库保存帮助类:

  /// <summary>
  /// Database helper that stores scraped <see cref="City"/> rows.
  /// </summary>
  public class SQLHelper {

    private const string InsertCitySql = "insert into base_city(ID,name,Code,Contry,Loc_x,Loc_y,Org_Level,ParentCode,ParentID,state) values(@ID,@name,@Code,@Contry,@Loc_x,@Loc_y,@Org_Level,@ParentCode,@ParentID,@state)";

    /// <summary>
    /// Inserts each entry of <paramref name="cityList"/> into the <c>base_city</c>
    /// table, one INSERT per entry, using the "DBContext" connection string from
    /// the application configuration.
    /// </summary>
    /// <param name="cityList">Rows to insert. A null or empty list executes nothing.</param>
    /// <returns>The sum of the row counts reported by the executed INSERT statements.</returns>
    /// <remarks>
    /// A failure on one row is written to the console and does not stop the
    /// remaining rows. Null string properties are stored as empty strings; the
    /// objects in <paramref name="cityList"/> are not modified.
    /// </remarks>
    public static int ExecuteNonQueryForCity(List<City> cityList) {
      if(cityList == null || cityList.Count == 0) {
        return 0;
      }

      int count = 0;
      var connectionString = System.Configuration.ConfigurationManager.ConnectionStrings["DBContext"].ConnectionString;
      using(SqlConnection connection = new SqlConnection(connectionString))
      using(SqlCommand cmd = new SqlCommand(InsertCitySql,connection)) {
        connection.Open();
        foreach(var city in cityList) {
          // The command is reused for every row, so drop the previous row's parameters first.
          cmd.Parameters.Clear();
          try {
            // A parameter whose value is null is not sent to the server, so every
            // string is normalised to "" here without touching the caller's object.
            cmd.Parameters.Add(new SqlParameter("@ID",city.ID));
            cmd.Parameters.Add(new SqlParameter("@name",city.Name ?? string.Empty));
            cmd.Parameters.Add(new SqlParameter("@Code",city.Code ?? string.Empty));
            cmd.Parameters.Add(new SqlParameter("@Contry",city.Contry ?? string.Empty));
            cmd.Parameters.Add(new SqlParameter("@Loc_x",city.Loc_x ?? string.Empty));
            cmd.Parameters.Add(new SqlParameter("@Loc_y",city.Loc_y ?? string.Empty));
            cmd.Parameters.Add(new SqlParameter("@Org_Level",city.Org_Level ?? string.Empty));
            cmd.Parameters.Add(new SqlParameter("@ParentCode",city.ParentCode ?? string.Empty));
            cmd.Parameters.Add(new SqlParameter("@ParentID",city.ParentID));
            cmd.Parameters.Add(new SqlParameter("@state","1"));

            int retval = cmd.ExecuteNonQuery();
            if(retval == 0) {
              // Name the row so a silent no-op insert can be tracked down.
              Console.WriteLine("插入错误:" + city.Code + " " + city.Name);
            }
            count += retval;
          } catch(Exception e) {
            // Best-effort bulk insert: report and continue with the next row.
            Console.WriteLine("插入错误:" + e.Message);
          }
        }
      }
      return count;
    }
  }

抓取数据:

 /// <summary>
 /// Downloads the administrative-division page at <see cref="UrlStr"/>, parses it into
 /// a province / city / county hierarchy and keeps the rows in <see cref="SaveList"/>.
 /// </summary>
 public class 省市县数据抓取 {
    private readonly ILog log = log4net.LogManager.GetLogger(typeof(省市县数据抓取));

    /// <summary>Address of the page that lists the administrative division codes.</summary>
    public const string UrlStr = "http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html";

    /// <summary>Parsed regions in page order; the first entry is the country-level root.</summary>
    public List<City> SaveList = new List<City>();

    /// <summary>
    /// Downloads and parses the page. Any failure is logged together with the code
    /// of the last region parsed so far, then rethrown.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The page contained no matching paragraphs, or a city/county row appeared
    /// before any row of its parent level.
    /// </exception>
    public 省市县数据抓取() {
      try {
        log.Info("抓取数据");
        string html = HttpHelper.DownloadHtml(UrlStr,Encoding.UTF8);
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(html);
        HtmlNodeCollection rows = doc.DocumentNode.SelectNodes("//p[@class='MsoNormal']");

        // Country-level root that every province hangs off.
        SaveList.Add(new City() {
          ID = 1,
          Name = "全国",
          Code = "100000",
          Contry = "China",
          Org_Level = "1"
        });

        // SelectNodes yields null (not an empty collection) when nothing matches,
        // e.g. when the download failed and the HTML is empty.
        if(rows == null) {
          throw new InvalidOperationException("未从页面解析到任何行政区划数据:" + UrlStr);
        }

        foreach(var item in rows) {
          var firstNode = item.FirstChild;
          if(firstNode == null) {
            continue; // empty paragraph, nothing to parse
          }

          // Rows are told apart by their first child: a bold node marks a province,
          // one leading blank a city, two leading blanks a county.
          // NOTE(review): the blank markers may be full-width spaces on the live page -
          // confirm the literals below against the downloaded HTML.
          if(firstNode.Name == "b") {
            GetProvince(item);
          } else if(firstNode.InnerText == " ") {
            GetCity(item);
          } else if(firstNode.InnerText == "  ") {
            GetCounty(item);
          }
        }
      } catch(Exception e) {
        // LastOrDefault keeps this diagnostic from throwing and hiding the real error.
        City last = SaveList.LastOrDefault();
        log.Info("last child code:" + (last == null ? string.Empty : last.Code));
        log.Error(e);
        throw; // rethrow without resetting the stack trace
      }
    }

    /// <summary>Parses a county row (level 4) and attaches it to the latest city.</summary>
    private void GetCounty(HtmlNode item) {
      // Placeholder rows named "市辖区" are kept on purpose.
      AddRegion(
        item.ChildNodes[1].InnerText.Replace(" ","").Trim(),
        item.ChildNodes[2].InnerText.Trim(),
        "4","3");
    }

    /// <summary>Parses a city row (level 3) and attaches it to the latest province.</summary>
    private void GetCity(HtmlNode item) {
      AddRegion(
        item.ChildNodes[1].InnerText.Replace(" ","").Trim(),
        item.ChildNodes[2].InnerText.Trim(),
        "3","2");
    }

    /// <summary>Parses a province row (level 2) and attaches it to the country root.</summary>
    private void GetProvince(HtmlNode item) {
      AddRegion(
        item.ChildNodes[0].FirstChild.InnerText.Replace(" ","").Trim(),
        item.ChildNodes[1].FirstChild.InnerText.Trim(),
        "2","1");
    }

    /// <summary>
    /// Appends a region to <see cref="SaveList"/> with the next sequential ID. Its
    /// parent is the most recently added region of <paramref name="parentLevel"/>,
    /// which works because the page lists regions in province, city, county order.
    /// </summary>
    private void AddRegion(string code,string name,string level,string parentLevel) {
      City c = new City();
      c.Code = code;
      c.Name = name;
      c.Org_Level = level;
      c.ID = SaveList.Last().ID + 1;
      City parent = SaveList.Last(i => i.Org_Level == parentLevel);
      c.ParentCode = parent.Code;
      c.ParentID = parent.ID;
      c.Contry = "China";
      SaveList.Add(c);
    }

    /// <summary>Writes every region in <see cref="SaveList"/> to the database.</summary>
    public void Save() {
      log.Info("保存数据");
      SQLHelper.ExecuteNonQueryForCity(SaveList);
    }
  }

全国 Org_Level =1

省 Org_Level =2

市 Org_Level =3

县 Org_Level =4

SaveList 首先添加一个代表“全国”的根节点,其 Org_Level = 1。

因为网页数据是按 省->市->县->省->市->县 的顺序依次读取的,所以在确定省、市、县的父级时,直接从 SaveList 中取最后一个上一级别的对象即可。

执行类:

// Constructing the scraper downloads and parses the page; Save() then writes the rows to the database.
var cityCatch = new 省市县数据抓取();
cityCatch.Save();

获取的数据如下:

 

原文地址:https://www.cnblogs.com/managersi/p/6941218.html