爬虫-HtmlAgilityPack

写了一个简单的小爬虫,使用 HtmlAgilityPack 抓取婴儿配方奶粉的产品注册信息和配方数据。
HtmlAgilityPack:https://html-agility-pack.net/

参考

HtmlAgilityPack - 详细简介和使用

using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Web;
using System.Web.Mvc;
using CrawlerForBaby.Models;
using CrawlerForBaby.Untity;
using HtmlAgilityPack;

namespace CrawlerForBaby.Controllers
{
    /// <summary>
    /// MVC controller for the infant-formula crawler: serves the index view, pages through the
    /// stored recipe rows, and imports a product together with its recipe table from a
    /// detail-page URL.
    /// </summary>
    public class HomeController : Controller
    {
        /// <summary>Only URLs that start with this detail-page address are fetched.</summary>
        private const string DetailPagePrefix = "http://tsspxx.gsxt.gov.cn/tyyp/detailyp.xhtml";

        /// <summary>
        /// Number of leading characters dropped from the registration text before it is
        /// appended to the recipe-page URL.
        /// </summary>
        private const int RegistrationPrefixLength = 4;

        /// <summary>Number of consecutive td cells that are read as one recipe row.</summary>
        private const int CellsPerRecipeRow = 5;

        /// <summary>Renders the home page.</summary>
        public ActionResult Index()
        {
            return View();
        }

        /// <summary>
        /// Returns one page of recipe rows joined with their product, ordered by recipe id.
        /// </summary>
        /// <param name="page">1-based page number; values below 1 are treated as 1.</param>
        /// <param name="limit">Page size; negative values are treated as 0.</param>
        /// <returns>A JSON <c>ResultModel</c> carrying the page rows and the total row count.</returns>
        public JsonResult GetTables(int page, int limit)
        {
            // A page below 1 or a negative limit would otherwise yield negative Skip/Take arguments.
            int pageIndex = Math.Max(page, 1) - 1;
            int pageSize = Math.Max(limit, 0);

            using (CrawlerForBabyEntities1 db = new CrawlerForBabyEntities1())
            {
                var query = from s in db.BabyRecipe
                            join p in db.Product on s.ProductId equals p.ProductId
                            select new DTOList
                            {
                                Id = s.Id,
                                SerialNum = s.SerialNum,
                                Project = s.Project,
                                Unit = s.Unit,
                                EveryHundredKJ = s.EveryHundredKJ,
                                EveryHundredG = s.EveryHundredG,
                                ProductId = p.ProductId,
                                RegistrationID = p.RegistrationID,
                                CommonName = p.CommonName,
                                ProductName = p.ProductName,
                                EngLishName = p.EngLishName,
                                Process = p.Process,
                                ProcessName = p.ProcessName,
                                IsRawMilkSkim = p.IsRawMilkSkim,
                                Type = p.Type
                            };

                int total = query.Count();
                var tables = query.OrderBy(s => s.Id).Skip(pageIndex * pageSize).Take(pageSize).ToList();

                return Json(new ResultModel<DTOList>() { success = true, code = 0, count = total, data = tables, msg = "" }, JsonRequestBehavior.AllowGet);
            }
        }

        /// <summary>
        /// Imports the product described by a detail-page URL together with its recipe rows.
        /// </summary>
        /// <param name="json">
        /// The detail-page URL (the form field is named "json"). It must start with
        /// <see cref="DetailPagePrefix"/>; surrounding whitespace is ignored.
        /// </param>
        /// <returns>
        /// A JSON <c>ResultModel</c>; <c>success</c> is false with a message when the URL is not
        /// accepted, the page cannot be parsed, or the product already exists.
        /// </returns>
        [HttpPost]
        public JsonResult AddTables(string json)
        {
            // The URL is fetched by the server, so require it to START with the allowed address;
            // a "contains" test would also accept foreign hosts that embed it in their query string.
            string url = (json ?? string.Empty).Trim();
            if (!url.StartsWith(DetailPagePrefix, StringComparison.Ordinal))
            {
                return Fail("链接不对");
            }

            HtmlDocument detail = new HtmlWeb().Load(url);

            string registrationId = ReadField(detail, "注册号");
            string commonNameText = ReadField(detail, "通用名称(产品)");
            string productNameText = ReadField(detail, "商品名称(产品)");
            string englishNameText = ReadField(detail, "英文名称(产品)");
            string processText = ReadField(detail, "生产工艺");

            bool fieldMissing = registrationId == null
                || commonNameText == null
                || productNameText == null
                || englishNameText == null
                || processText == null;
            if (fieldMissing || registrationId.Length < RegistrationPrefixLength)
            {
                return Fail("页面解析失败");
            }

            // The recipe URL already contains the (encoded) leading characters of the
            // registration text, so only the remainder is appended.
            string registNumber = registrationId.Substring(RegistrationPrefixLength);

            using (CrawlerForBabyEntities1 db = new CrawlerForBabyEntities1())
            {
                // Check for a duplicate before crawling so no remote requests are wasted.
                bool exists = db.Product.Any(s => s.CommonName == commonNameText && s.ProductName == productNameText);
                if (exists)
                {
                    return Fail("已经存在");
                }

                // Parse every recipe row before writing anything, so a crawl or parse failure
                // cannot leave a product with a partial recipe behind.
                List<BabyRecipe> recipes = CrawlRecipes(registNumber);

                Product product = new Product()
                {
                    RegistrationID = registrationId,
                    CommonName = commonNameText,
                    ProductName = productNameText,
                    EngLishName = englishNameText,
                    Process = processText,
                };

                db.Product.Add(product);
                // Saved first because product.ProductId is read right after this call.
                db.SaveChanges();

                foreach (BabyRecipe recipe in recipes)
                {
                    recipe.ProductId = product.ProductId;
                    db.BabyRecipe.Add(recipe);
                }

                // One save for all rows instead of one per row.
                db.SaveChanges();
            }

            return Json(new ResultModel<string>() { success = true, code = 0, msg = "" }, JsonRequestBehavior.AllowGet);
        }

        /// <summary>Builds the failure response shared by all error paths of <see cref="AddTables"/>.</summary>
        private JsonResult Fail(string message)
        {
            return Json(new ResultModel<string>() { success = false, msg = message });
        }

        /// <summary>
        /// Returns the text of the td cell in the first tr whose th text equals
        /// <paramref name="header"/>, or null when no such row or cell exists.
        /// </summary>
        private static string ReadField(HtmlDocument doc, string header)
        {
            foreach (HtmlNode row in doc.DocumentNode.Descendants("tr"))
            {
                // Rows without a th child yield null here and are skipped.
                HtmlNode th = row.ChildNodes["th"];
                if (th == null || th.InnerText != header)
                {
                    continue;
                }

                HtmlNode td = row.ChildNodes["td"];
                return td == null ? null : td.InnerText;
            }

            return null;
        }

        /// <summary>
        /// Fetches the recipe pages for a registration number and converts their td cells into
        /// recipe rows (ProductId is left for the caller to set).
        /// </summary>
        private static List<BabyRecipe> CrawlRecipes(string registNumber)
        {
            string recipeUrl = "http://tsspxx.gsxt.gov.cn:80//tyyp/detailPf.xhtml?COLUMN1667=%25E5%259B%25BD%25E9%25A3%259F%25E6%25B3%25A8%25E5%25AD%2597" + registNumber;

            // NOTE(review): the documents returned by this load and by the paged load below are
            // never read. The requests are kept in case the remote site relies on them being
            // made - confirm, then remove them if not.
            new HtmlWeb().Load(recipeUrl);
            var headers = HTTPHeader.GetHTTPResponseHeaders(recipeUrl);
            string cookie = headers["Set-Cookie"];

            HtmlWeb pagedWeb = new HtmlWeb();
            HtmlAgilityPack.HtmlWeb.PreRequestHandler handler = delegate (HttpWebRequest request)
            {
                request.Headers[HttpRequestHeader.AcceptEncoding] = "gzip, deflate";
                request.Headers[HttpRequestHeader.Cookie] = cookie;
                request.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
                return true;
            };
            pagedWeb.PreRequest += handler;
            pagedWeb.Load("http://tsspxx.gsxt.gov.cn:80//tyyp/yppfpage.xhtml?currentPage=6");

            var pages = GetPageData.GetData(cookie);

            List<BabyRecipe> recipes = new List<BabyRecipe>();
            foreach (var pageDoc in pages)
            {
                List<HtmlNode> cells = pageDoc.DocumentNode.Descendants("td").ToList();

                // Step through whole groups only; a trailing incomplete group is ignored
                // instead of indexing past the end of the list.
                for (int i = 0; i + CellsPerRecipeRow <= cells.Count; i += CellsPerRecipeRow)
                {
                    int serial;
                    if (!int.TryParse(cells[i].InnerText, out serial))
                    {
                        // Not a data row (no integer serial number in the first cell).
                        continue;
                    }

                    recipes.Add(new BabyRecipe()
                    {
                        SerialNum = serial,
                        Project = cells[i + 1].InnerText,
                        Unit = cells[i + 2].InnerText,
                        EveryHundredKJ = ParseAmount(cells[i + 3].InnerText),
                        EveryHundredG = ParseAmount(cells[i + 4].InnerText)
                    });
                }
            }

            return recipes;
        }

        /// <summary>
        /// Parses a numeric cell and rounds it to two decimals; text that is not a number yields 0.
        /// </summary>
        private static double ParseAmount(string text)
        {
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            double value;
            if (!double.TryParse(text, System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands, invariant, out value))
            {
                return 0;
            }

            // Keep the "0.00" format round-trip for rounding, but pin it to the invariant
            // culture so the result does not depend on the server's regional settings.
            return double.Parse(value.ToString("0.00", invariant), invariant);
        }
    }
}

前端

@{
    ViewBag.Title = "Home Page";
}
<!-- layui stylesheet and script; the inline script at the bottom of this view calls layui.use. -->
<link href="~/Content/layui/css/layui.css" rel="stylesheet" />
<script src="~/Content/layui/layui.js"></script>

<div class="jumbotron">
    <h1>ASP.NET</h1>
</div>


<!-- URL submission form. The "formDemo" submit filter is handled by the inline script,
     which sends the fields to Home/AddTables via AJAX and cancels the normal form post. -->
<form class="layui-form" action="">
    <div class="layui-form-item layui-form-text">
        <label class="layui-form-label">URL</label>
        <div class="layui-input-block">
            示例:<a href="http://tsspxx.gsxt.gov.cn/tyyp/detailyp.xhtml?id=A4FA632E8E15B4D8E055620810C6201A" target="_blank">http://tsspxx.gsxt.gov.cn/tyyp/detailyp.xhtml?id=A4FA632E8E15B4D8E055620810C6201A</a>
        </div>
        <div class="layui-input-block">
            <!-- Submitted as the "json" form field; cleared by the script after each submit. -->
            <textarea name="json" placeholder="请输入URL" class="layui-textarea" id="textjson" style="height: 50px;min-height:50px"></textarea>
        </div>
    </div>
    <div class="layui-form-item">
        <div class="layui-input-block">
            <button class="layui-btn" lay-submit lay-filter="formDemo">立即提交</button>
            <button type="reset" class="layui-btn layui-btn-primary">重置</button>
        </div>
    </div>
</form>

<!-- Target element for table.render (elem: '#demo') in the inline script. -->
<div class="row">
    <table id="demo" lay-filter="test"></table>
</div>

<style>
    /* Give every layui table cell a fixed 44px height with vertically centred text. */
    .layui-table-cell {
        height: 44px;
        line-height: 44px;
    }

    /* NOTE(review): empty rule with no declarations - has no effect as written. */
    .optherName:active {
    }

    /* White text on a green background while hovering.
       NOTE(review): no element in this view's markup uses the "optherName" class - confirm where it is applied. */
    .optherName:hover {
        color: #ffffff;
        background-color: #379736;
    }
</style>

<script>
    layui.use(['laydate','table', 'form'], function () {
        var table = layui.table;
        var form = layui.form;
        var laydate = layui.laydate;

                //监听提交
        form.on('submit(formDemo)',
            function (data) {
                $.ajax({
                    url: "Home/AddTables",
                    //dataType: 'text',
                    contentType: "application/x-www-form-urlencoded",
                    data: data.field, //请求的附加参数,用json对象
                    method: 'POST',
                    success: function (res) {
                        console.log(res);
                        if (res.success) {
                            layer.msg("新增成功!");
                            tableObj.reload(); //重载表格
                        } else {
                            layer.msg(res.msg);
                        }

                    }
                });
                $('#textjson').val('');
                return false;
            });

        //第一个实例
        var tableObj = table.render({
            elem: '#demo'
            , height: 312
            , url: '/home/GetTables/' //数据接口
            , page: true //开启分页
            , cols: [[ //表头
                { field: 'Id', title: 'ID',  80, sort: true, fixed: 'left' }
                , { field: 'SerialNum', title: '序号',  60 }
                , { field: 'CommonName', title: '通用名',  240 }
                , { field: 'ProductName', title: '产品名',  140 }
                , { field: 'EngLishName', title: '英文',  120 }
                , { field: 'Process', title: '工艺',  150 }
                , { field: 'Unit', title: '单位',  80 }
                , { field: 'Project', title: '项目',  250, sort: true }
                , { field: 'Unit', title: '单位',  80 }
                , { field: 'EveryHundredKJ', title: '每100kJ',  120 }
                , { field: 'EveryHundredG', title: '每100g',  120, sort: true }
            ]]
        });
    });
</script>
原文地址:https://www.cnblogs.com/tangge/p/12834083.html