.NET 爬虫

最近经常听说或者接触到关于网络爬虫的问题,只是一直在看别人写的代码,而没有真正做过实践。

昨天做了一下尝试,其中采用了网络上流行的扩展类库 Html Agility Pack(http://html-agility-pack.net/?z=codeplex)。

遇到的问题是:部分网站禁止爬虫,或者设有规则验证,无法通过模拟 HTTP 请求获取 HTML。

本测试案例通过模拟 HTTP 请求获取 HTML,再通过 Html Agility Pack 分析节点,获取对应节点的值。本案例采用的是赶集网的数据。

代码如下:

 /// <summary>
 /// Parses a listing page, builds one <c>Room</c> per listing node and inserts
 /// the rooms into the <c>room</c> table.
 /// </summary>
 /// <param name="html">Raw HTML of the listing page. Null or empty input is ignored.</param>
 private static void ClearnHtml(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return;
            }

            var htmlDoc = new HtmlAgilityPack.HtmlDocument();
            htmlDoc.LoadHtml(html);

            // SelectNodes yields null (not an empty collection) when nothing matches,
            // so guard before iterating.
            HtmlAgilityPack.HtmlNodeCollection htmlBody = htmlDoc.DocumentNode.SelectNodes("*//div[@class='f-list-item ershoufang-list']");
            if (htmlBody == null)
            {
                return;
            }

            var list = new List<Room>();
            foreach (HtmlAgilityPack.HtmlNode roomitem in htmlBody)
            {
                if (roomitem == null)
                {
                    continue;
                }

                // A missing node falls back to the default value instead of
                // discarding the whole listing.
                list.Add(new Room
                {
                    // The title was previously extracted into an unused local and never stored.
                    title = ReadNodeText(roomitem, "*//a[@class='js-title value title-font']", "0"),
                    Type = ReadNodeText(roomitem, "*//span[@class='first js-huxing']", "1"),
                    buju = ReadNodeText(roomitem, "*//dd[@class='dd-item size']/span[3]", "0"),
                    mianji = ReadNodeText(roomitem, "*//dd[@class='dd-item size']/span[5]", "0"),
                    Direction = ReadNodeText(roomitem, "*//dd[@class='dd-item size']/span[7]", "0"),
                    Floor = ReadNodeText(roomitem, "*//dd[@class='dd-item size']/span[9]", "0"),
                    zhuangxiu = ReadNodeText(roomitem, "*//span[@class='last']", "0"),
                    area = ReadNodeText(roomitem, "*//span[@class='area']", "0"),
                    feature = ReadNodeText(roomitem, "*//dd[@class='dd-item feature']", "0"),
                    // NOTE(review): the published listing applied a second, apparently identical
                    // space removal to the price; it may originally have targeted a non-breaking
                    // space or "&nbsp;" - confirm against the real page.
                    Price = ReadNodeText(roomitem, "*//div[@class='price']/span[1]", "0"),
                });
            }

            if (list.Count == 0)
            {
                // Nothing to insert; avoid sending an empty command to the server.
                return;
            }

            // Scraped text is untrusted, so values are bound as parameters rather than
            // spliced into the SQL text. Parameter names match the Room member names.
            const string insertSql =
                "insert into room(title,Type,buju,mianji,Direction,Floor,zhuangxiu,area,feature,Price)" +
                "values(@title,@Type,@buju,@mianji,@Direction,@Floor,@zhuangxiu,@area,@feature,@Price);";

            // NOTE(review): the connection string (including credentials) is hard-coded;
            // consider moving it to configuration.
            using (var connection = new MySqlConnection("Server=127.0.0.1;Database=personal;Uid=ken;Pwd=123456;"))
            {
                // Passing a sequence as the parameter object runs the statement once per room.
                connection.Execute(insertSql, list);
            }
        }

 /// <summary>
 /// Returns the cleaned inner text of the first node matching <paramref name="xpath"/>
 /// under <paramref name="item"/>, or <paramref name="fallback"/> when no node matches.
 /// </summary>
 /// <param name="item">Listing node to search in.</param>
 /// <param name="xpath">XPath expression evaluated relative to <paramref name="item"/>.</param>
 /// <param name="fallback">Value returned when the XPath matches nothing.</param>
 /// <returns>The node text with surrounding whitespace, line breaks and spaces removed.</returns>
 private static string ReadNodeText(HtmlAgilityPack.HtmlNode item, string xpath, string fallback)
        {
            HtmlAgilityPack.HtmlNode node = item.SelectSingleNode(xpath);
            if (node == null)
            {
                return fallback;
            }

            // The published listing had a raw line break inside the string literal (which does
            // not compile); it is restored as escape sequences. Both CR and LF are removed so
            // either line-ending style is handled.
            return node.InnerText.Trim().Replace("\r", "").Replace("\n", "").Replace(" ", "");
        }

  

原文地址:https://www.cnblogs.com/yanwuming/p/9606628.html