// C# Html Agility Pack

using System;
using HtmlAgilityPack;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections.Generic;

namespace ConsoleApp
{
    /// <summary>
    /// Console sample: loads <c>index.html</c> from the application base directory, parses it
    /// with HtmlAgilityPack, extracts a few header fields and the rows of one nested table via
    /// XPath, prints them, and reports how long the parsing took.
    /// </summary>
    class Program
    {
        // Separator between a label and its value inside the header cells.
        // NOTE(review): the separator literal was lost from the original source (it read
        // IndexOf("") / Split('')). A full-width colon (U+FF1A) is assumed here; confirm
        // against the real index.html and adjust if the page uses a different character.
        const char LabelSeparator = '\uFF1A';

        // Characters stripped from cell text: line feed, carriage return, tab and space.
        // Written with escape sequences because a regular string literal cannot span lines.
        static readonly Regex WhitespacePattern = new Regex("[\n\r\t ]");

        // XPath of the cell that contains every table read below.
        const string ContentCell = "/html/body/div/div/table/tr[2]/td";

        // XPath selecting the rows (header row included) of the nested data table.
        const string RowsXPath = ContentCell + "/table[4]/tr[3]/td/table/tbody/tr/td/table/tbody/tr";

        /// <summary>
        /// Returns the inner text of a node with line breaks, tabs and spaces removed.
        /// </summary>
        /// <param name="_htmlnode">
        /// Node to read. May be null, which is what SelectSingleNode yields when the XPath
        /// matches nothing.
        /// </param>
        /// <param name="isSplit">
        /// When true and the cleaned text contains <see cref="LabelSeparator"/>, only the segment
        /// between the first and the second separator is returned (same as <c>Split(sep)[1]</c>).
        /// </param>
        /// <returns>The cleaned text, or an empty string when the node is null.</returns>
        static string goText(HtmlNode _htmlnode, bool isSplit = true)
        {
            // An explicit null check replaces the former empty catch block, which turned a
            // missing node into "" but also silently hid every other failure.
            if (_htmlnode == null) {
                return "";
            }

            string text = _htmlnode.InnerText ?? "";
            text = WhitespacePattern.Replace(text, "").Trim();

            if (isSplit) {
                string[] parts = text.Split(LabelSeparator);
                if (parts.Length > 1) {
                    return parts[1];
                }
            }

            return text;
        }

        /// <summary>
        /// Reads the node selected by <paramref name="xpath"/> and returns its value part
        /// (the text after the label separator); empty when the node does not exist.
        /// </summary>
        static string ReadField(HtmlDocument doc, string xpath)
        {
            return goText(doc.DocumentNode.SelectSingleNode(xpath));
        }

        /// <summary>
        /// Converts table rows into <see cref="Table"/> objects, one per element row, mapping
        /// the first four cells to a, b, c and d. The first element row is treated as the
        /// header and skipped.
        /// </summary>
        /// <param name="rows">Row nodes; null (no XPath match) yields an empty list.</param>
        static List<Table> ParseRows(IEnumerable<HtmlNode> rows)
        {
            List<Table> result = new List<Table>();
            if (rows == null) {
                return result;
            }

            bool headerPending = true;
            foreach (HtmlNode row in rows) {
                if (row.NodeType != HtmlNodeType.Element) {
                    continue;
                }

                // Skipping the header here avoids RemoveAt(0) on a possibly empty list.
                if (headerPending) {
                    headerPending = false;
                    continue;
                }

                Table tb = new Table();
                int column = 0;
                foreach (HtmlNode td in row.Elements("td")) {
                    string text = goText(td, false);
                    switch (column)
                    {
                        case 0:
                            tb.a = text;
                            break;
                        case 1:
                            tb.b = text;
                            break;
                        case 2:
                            tb.c = text;
                            break;
                        case 3:
                            tb.d = text;
                            break;
                    }
                    column++;
                }
                result.Add(tb);
            }

            return result;
        }

        /// <summary>
        /// Entry point: parses index.html, prints the extracted fields and table rows, then the
        /// elapsed parsing time, and waits for Enter before exiting.
        /// </summary>
        static void Main(string[] args)
        {
            // Path.Combine copes with BaseDirectory ending (or not ending) in a separator.
            string basePath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "index.html");
            string html = "";
            if (File.Exists(basePath)) {
                // NOTE(review): Encoding.Default depends on the platform/runtime; confirm it
                // matches the encoding index.html is actually saved in.
                html = File.ReadAllText(basePath, Encoding.Default);
            }
            else {
                Console.Error.WriteLine("index.html not found: " + basePath);
            }

            // Stopwatch is monotonic and higher resolution than subtracting DateTime.Now values.
            System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();

            var htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(html);

            // Header fields: report number, query time, report time.
            var report_number = ReadField(htmlDoc, ContentCell + "/table[1]/tbody/tr[2]/td[1]");
            var query_time = ReadField(htmlDoc, ContentCell + "/table[1]/tbody/tr[2]/td[2]");
            var report_time = ReadField(htmlDoc, ContentCell + "/table[1]/tbody/tr[2]/td[3]");

            // Identity fields: name, certificate type, certificate number, marital status.
            var report_name = ReadField(htmlDoc, ContentCell + "/table[2]/tbody/tr[1]/td[1]");
            var report_type = ReadField(htmlDoc, ContentCell + "/table[2]/tbody/tr[1]/td[2]");
            var report_id = ReadField(htmlDoc, ContentCell + "/table[2]/tbody/tr[1]/td[3]");
            var report_marriage = ReadField(htmlDoc, ContentCell + "/table[2]/tbody/tr[1]/td[4]");

            // SelectNodes returns null when nothing matches; ParseRows tolerates that.
            List<Table> list = ParseRows(htmlDoc.DocumentNode.SelectNodes(RowsXPath));

            timer.Stop();

            Console.WriteLine("report_number: " + report_number);
            Console.WriteLine("query_time: " + query_time);
            Console.WriteLine("report_time: " + report_time);
            Console.WriteLine("report_name: " + report_name);
            Console.WriteLine("report_type: " + report_type);
            Console.WriteLine("report_id: " + report_id);
            Console.WriteLine("report_marriage: " + report_marriage);

            // Print the row contents; writing the List itself only prints its type name.
            foreach (Table row in list) {
                Console.WriteLine(string.Join("\t", new string[] { row.a, row.b, row.c, row.d }));
            }

            Console.WriteLine("DateTime总共花费{0}ms.", timer.Elapsed.TotalMilliseconds);
            Console.ReadLine();
        }
    }

    /// <summary>
    /// Plain data holder for one table row: four string values, one per cell.
    /// </summary>
    public class Table
    {
        /// <summary>Text of the first cell.</summary>
        public string a { get; set; }

        /// <summary>Text of the second cell.</summary>
        public string b { get; set; }

        /// <summary>Text of the third cell.</summary>
        public string c { get; set; }

        /// <summary>Text of the fourth cell.</summary>
        public string d { get; set; }
    }
}
// Original article: https://www.cnblogs.com/CyLee/p/8029337.html