C#爬取国家统计局五级地址

// Source pages: http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html
// The crawl starts at one province and works downward; to fetch every province in a single run,
// it would have to start one level higher.
// Start address; ProcessRequest fills it from the posted "url" form field
public string url;
// Name of the table the crawled rows are stored in
public string dbname;
// Province code (written as Code and Path of the level-0 row)
public string code;
// Province name (written as Name and PathName of the level-0 row)
public string name;
// Database name, used in the "USE [...]" statement when the table is created
public static string database = "TEST";
// Copes with timeouts and other dropped connections: returnHtml increments it on every
// failed download and sets it back to 0 after a successful one
public int flag = 0;
/// <summary>
/// Handles one crawl request: reads the start address from the posted form, makes sure the
/// storage table exists, then runs the five crawl stages in order and reports completion.
/// </summary>
/// <param name="context">Current HTTP context; the completion message is written to its response.</param>
public void ProcessRequest(HttpContext context)
{
    // The posted value is HTML-decoded before it is used as the start address.
    string postedUrl = System.Web.HttpContext.Current.Request.Form["url"];
    url = System.Web.HttpUtility.HtmlDecode(postedUrl);

    // Storage first, so every stage below has a table to write to.
    TableExist(dbname);

    // Each stage reads the rows written by the previous one, so the order matters.
    Provincial();   // level 0
    City();         // level 1
    County();       // level 2
    Town();         // level 3
    Village();      // level 4

    context.Response.Write("爬取成功");
}
/// <summary>
/// Makes sure the storage table exists: looks the name up in information_schema.TABLES and,
/// when nothing comes back, creates the table after switching to <see cref="database"/>.
/// </summary>
/// <param name="dbname">
/// Name of the table to look for and create. It is treated as one identifier (it is placed
/// inside [brackets] under the dbo schema), not as a schema-qualified name.
/// </param>
public void TableExist(string dbname) {
    // The name is spliced into SQL text in two different roles, so it is escaped for each:
    // a doubled ' inside the string literal, a doubled ] inside [bracketed] identifiers.
    string rawName = dbname ?? string.Empty;
    string nameLiteral = rawName.Replace("'", "''");
    string nameIdent = rawName.Replace("]", "]]");
    string databaseIdent = (database ?? string.Empty).Replace("]", "]]");

    // NOTE(review): this lookup runs against whatever database the connection is currently in
    // and ignores the schema, while the CREATE below targets [database].[dbo] - confirm they agree.
    DataTable dt = bll.SelectbySql("SELECT table_name FROM information_schema.TABLES WHERE table_name = N'" + nameLiteral + "'");
    if (dt.Rows.Count > 0) {
        return;
    }

    // Line breaks are written as escapes because a regular C# string literal
    // cannot continue onto the next source line.
    string sql =
        "USE [" + databaseIdent + "]\r\n" +
        "SET ANSI_NULLS ON\r\n" +
        "SET QUOTED_IDENTIFIER ON\r\n" +
        "CREATE TABLE [dbo].[" + nameIdent + "](" +
            "[ID] [int] IDENTITY(1, 1) NOT NULL," +
            "[Code] [nvarchar] (20) NULL," +
            "[ParentCode] [nvarchar] (20) NULL," +
            "[Name] [nvarchar] (50) NULL," +
            "[Path] [nvarchar] (100) NULL," +
            "[PathName] [nvarchar] (200) NULL," +
            "[Levels] [int] NULL," +
            "[Urls] [nvarchar] (max) NULL," +
            "[DeleteMark] [bit] NULL," +
            "CONSTRAINT [PK_" + nameIdent + "] PRIMARY KEY CLUSTERED" +
        "(" +
            "[ID] ASC" +
        ")WITH(PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]" +
        ") ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]\r\n";
    bll.RunbySql(sql);
}
/// <summary>
/// Inserts the level-0 (province) row built from <see cref="code"/>, <see cref="name"/> and
/// <see cref="url"/>; the later crawl stages start from this row.
/// </summary>
public void Provincial()
{
    // Bracket-quote the table name the same way TableExist creates it.
    string table = "[" + (dbname ?? string.Empty).Replace("]", "]]") + "]";

    // Values are spliced into SQL text, so embedded quotes are doubled. The N prefix keeps
    // Chinese text intact regardless of the database's default code page.
    string codeSql = "N'" + (code ?? string.Empty).Replace("'", "''") + "'";
    string nameSql = "N'" + (name ?? string.Empty).Replace("'", "''") + "'";
    string urlSql = "N'" + (url ?? string.Empty).Replace("'", "''") + "'";

    // Value order follows the table definition in TableExist:
    // Code, ParentCode, Name, Path, PathName, Levels, Urls, DeleteMark.
    // A province has no parent ('0'), and its Path/PathName are just its own code/name.
    bll.RunbySql("insert into " + table + " values(" + codeSql + ",'0'," + nameSql + "," + codeSql + "," + nameSql + ",0," + urlSql + ",0)");
}
/// <summary>
/// Crawls the page of every stored level-0 row and stores each "citytr" table row as a level-1 row.
/// </summary>
public void City()
{
    CrawlLevel(0, "citytr", 1, true);
}

/// <summary>
/// Crawls the page of every stored level-1 row and stores each "countytr" table row as a level-2 row.
/// </summary>
public void County()
{
    CrawlLevel(1, "countytr", 1, true);
}

/// <summary>
/// Crawls the page of every stored level-2 row and stores each "towntr" table row as a level-3 row.
/// </summary>
public void Town()
{
    CrawlLevel(2, "towntr", 1, true);
}

/// <summary>
/// Crawls the page of every stored level-3 row and stores each "villagetr" table row as a level-4 row.
/// On these rows the name is read from the third cell, and no follow-up address is stored.
/// </summary>
public void Village()
{
    CrawlLevel(3, "villagetr", 2, false);
}

/// <summary>
/// Shared crawl step: for every stored row of <paramref name="parentLevel"/> that has an address,
/// downloads the page and inserts one child row per HTML element carrying <paramref name="rowClass"/>.
/// </summary>
/// <param name="parentLevel">Levels value of the rows whose pages are crawled; children get this value plus one.</param>
/// <param name="rowClass">CSS class of the HTML table rows that hold the child entries.</param>
/// <param name="nameIndex">Zero-based index of the cell holding the child's name (the code is always in cell 0).</param>
/// <param name="followLinks">
/// When true, a link inside the code cell is used for the child's text and its href becomes the
/// child's address; when false, the cells are read directly and the child gets an empty address.
/// </param>
private void CrawlLevel(int parentLevel, string rowClass, int nameIndex, bool followLinks)
{
    // Bracket-quote the table name the same way TableExist creates it.
    string table = "[" + (dbname ?? string.Empty).Replace("]", "]]") + "]";
    int childLevel = parentLevel + 1;

    DataTable parents = bll.SelectbySql("select * from " + table + " where DeleteMark=0 and Levels=" + parentLevel + " and Urls is not null and Urls<>''");

    foreach (DataRow parent in parents.Rows)
    {
        // Read the parent's values once per page rather than once per child row.
        string parentUrl = parent["Urls"].ToString();
        string parentCode = parent["Code"].ToString();
        string parentPath = parent["Path"].ToString();
        string parentPathName = parent["PathName"].ToString();

        // returnHtml yields an empty string when the download ultimately failed;
        // such a page contributes no rows.
        string html = returnHtml(parentUrl);
        if (string.IsNullOrEmpty(html))
        {
            continue;
        }

        // Links on the page are relative to the page's own directory.
        string baseUrl = parentUrl.Substring(0, parentUrl.LastIndexOf('/') + 1);

        NSoup.Nodes.Document doc = NSoup.NSoupClient.Parse(html);
        foreach (Element row in doc.GetElementsByClass(rowClass))
        {
            // Skip malformed rows that lack the expected cells instead of failing the whole crawl.
            if (row.Children.Count <= nameIndex)
            {
                continue;
            }

            Element codeNode = row.Children[0];
            Element nameNode = row.Children[nameIndex];
            string childUrl = "";

            if (followLinks)
            {
                // When the code cell wraps its text in a link, step into the link
                // (and into the name cell's link, when it has one).
                if (codeNode.Children.Count > 0)
                {
                    codeNode = codeNode.Children[0];
                    if (nameNode.Children.Count > 0)
                    {
                        nameNode = nameNode.Children[0];
                    }
                }

                if (codeNode.HasAttr("href"))
                {
                    childUrl = baseUrl + codeNode.Attr("href");
                }
            }

            string childCode = codeNode.Text();
            string childName = nameNode.Text();

            // Value order follows the table definition in TableExist:
            // Code, ParentCode, Name, Path, PathName, Levels, Urls, DeleteMark.
            bll.RunbySql(
                "insert into " + table + " values(" +
                QuoteSqlText(childCode) + "," +
                QuoteSqlText(parentCode) + "," +
                QuoteSqlText(childName) + "," +
                QuoteSqlText(parentPath + "/" + childCode) + "," +
                QuoteSqlText(parentPathName + "/" + childName) + "," +
                childLevel + "," +
                QuoteSqlText(childUrl) + ",0)");
        }
    }
}

/// <summary>
/// Formats a value as a Unicode T-SQL string literal: embedded single quotes are doubled so that
/// scraped text cannot terminate the literal, and null is written as an empty string.
/// </summary>
private static string QuoteSqlText(string value)
{
    return "N'" + (value ?? string.Empty).Replace("'", "''") + "'";
}
/// <summary>
/// Downloads a page and decodes it as gb2312, retrying when the download fails.
/// </summary>
/// <param name="Urls">Address of the page to download.</param>
/// <returns>
/// The page's HTML, or an empty string when the address is null/empty or the download still
/// fails after the initial attempt plus 10 retries.
/// </returns>
public string returnHtml(string Urls) {
    if (string.IsNullOrEmpty(Urls)) {
        return "";
    }

    const int maxRetries = 10;

    // Start every page with a fresh failure count; otherwise one page that exhausted its
    // retries would leave all following pages with a single attempt each.
    flag = 0;

    while (true) {
        try {
            // WebClient holds network resources, so release it after every attempt.
            using (WebClient webClient = new WebClient()) {
                string html = Encoding.GetEncoding("gb2312").GetString(webClient.DownloadData(Urls));
                flag = 0;
                return html;
            }
        }
        catch (WebException) {
            // Only download failures (timeouts, dropped connections, bad addresses) are retried.
            flag++;
            if (flag > maxRetries) {
                return "";
            }
        }
    }
}
原文地址:https://www.cnblogs.com/tenfly/p/14435772.html