// Dooioo Deal

using AnfleCrawler.Common;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;

namespace AnfleCrawler.DataAnalyzer
{
    /// <summary>
    /// Analyzer for dooioo.com pages. Depending on the depth of the landed page it
    /// queues community pages found on a listing, stores a community ("houses") record,
    /// or stores the rows of a community's "#history-list" table.
    /// </summary>
    internal class Dooioo : AnalyzerBase
    {
        /// <summary>
        /// Minimum number of cells a "#history-list" row must have before it is read
        /// (the highest cell index used is 5).
        /// </summary>
        private const int HistoryCellCount = 6;

        /// <summary>
        /// Processes one landed page according to <c>current.Depth</c>:
        /// depth 0 queues every "#hlist a" link as a <c>DataDepth.Houses</c> page,
        /// <c>DataDepth.Houses</c> saves the community record and its first page of history rows,
        /// <c>DataDepth.Deal</c> saves the history rows of a further page.
        /// Pages with any other depth are ignored.
        /// </summary>
        /// <param name="current">The landed page; for <c>DataDepth.Deal</c> its <c>State</c> must hold the community's <see cref="Guid"/>.</param>
        protected override void AnalyzeInternal(PageLandEntity current)
        {
            var lander = Crawler.Lander;
            var pHandler = CreateContentHandler(current);
            switch (current.Depth)
            {
                case 0:
                    {
                        var dom = lander.GetDocument(pHandler);
                        // Paging is delegated to the base helper, driven by the last pagination link.
                        DoPerPaging(current, dom.DocumentNode, ".pagination a:last-child");

                        foreach (var node in QueryNodes(dom.DocumentNode, "#hlist a"))
                        {
                            var url = GetHref(node, current.Url);
                            Crawler.PushUrl(url, DataDepth.Houses);
                        }
                    }
                    break;
                case DataDepth.Houses:
                    {
                        var dom = lander.GetDocument(pHandler);
                        var attrs = new AttributeFiller();

                        // Every "#building-info li" becomes a "label:value" pair built from its first
                        // two spans. Items with fewer than two spans are skipped rather than raising
                        // IndexOutOfRangeException.
                        var infoPairs = QueryNodes(dom.DocumentNode, "#building-info li")
                            .Select(p => QueryTexts(p, "span").ToArray())
                            .Where(spans => spans.Length >= 2)
                            .Select(spans => string.Format("{0}:{1}", spans[0], spans[1]));
                        attrs.Append(infoPairs);

                        // The record is keyed by a hash of the page URL.
                        Guid hashKey = GenHashKey(current.Url.OriginalString);
                        var bo = Crawler.Repository.LoadHouses(hashKey);
                        bo.SiteID = current.Url.GetDomain();
                        bo.PageUrl = current.Url.OriginalString;
                        bo.CityName = Crawler.Config.CityName;
                        // Pairs of page label and entity name handed to FillEntity.
                        attrs.FillEntity(bo, new Dictionary<string, string>()
                        {
                            {"小区名", "小区名称"},
                            {"板块", "所属区域"},
                            {"建造年代", "竣工时间"},
                            {"地址", "小区地址"},
                            {"物业类型", "物业类别"},
                        });
                        MapMark(bo);
                        Repository.Save(bo);
                        Crawler.OutWrite("保存楼盘 {0}", bo.小区名称);

                        // Materialise once: the links are counted and indexed below.
                        var pageLinks = QueryNodes(dom.DocumentNode, ".pagination a", false).ToArray();
                        if (pageLinks.Length > 0)
                        {
                            // The second-to-last link carries the page count; with a single link
                            // that link itself is used (same result as Skip with a negative count).
                            string pageCount = pageLinks[Math.Max(pageLinks.Length - 2, 0)].InnerText;
                            // NOTE(review): the id "s117862" is hard-coded, so every community queues the
                            // same deal URL - confirm whether it should be derived from current.Url.
                            // The "[2-N]" part is presumably a page range expanded by the crawler - TODO confirm.
                            Crawler.PushUrl(new Uri(string.Format("http://www.dooioo.com/ershoufang/s117862?p=[2-{0}]", pageCount)), DataDepth.Deal, bo.RowID);
                        }
                        SaveHouselisting(bo.RowID, current, dom);
                    }
                    break;
                case DataDepth.Deal:
                    {
                        Guid housesID = (Guid)current.State;
                        pHandler.CrossLoad = (arg, xDom) =>
                        {
                            string pName = "p";
                            if (arg.IsRepost)
                            {
                                // Second pass after the click below: clear the flag and accept the document.
                                arg.IsRepost = false;
                                return;
                            }
                            var query = System.Web.HttpUtility.ParseQueryString(arg.RequestUrl.Query);
                            int pageIndex;
                            if (!int.TryParse(query[pName], out pageIndex))
                            {
                                // Missing or non-numeric "p" means the first page.
                                pageIndex = 1;
                            }

                            var pager = xDom.GetElementsByTagName("ul").Cast<System.Windows.Forms.HtmlElement>()
                                .Where(p => p.GetAttribute("class").Contains("pagination")).FirstOrDefault();
                            if (pager == null)
                            {
                                App.LogInfo("CrossLoad xPaing:{0} {1}", this.GetType().Name, xDom.Body.InnerHtml);
                                return;
                            }

                            // Hoisted so the number is formatted once instead of once per anchor.
                            string pageText = pageIndex.ToString();
                            // First() throws InvalidOperationException when no link shows the requested page.
                            var btn = pager.GetElementsByTagName("a").Cast<System.Windows.Forms.HtmlElement>()
                                .Where(p => p.InnerText == pageText).First();
                            btn.InvokeMember("click");
                            arg.IsRepost = true;
                        };
                        var dom = lander.GetDocument(pHandler);
                        SaveHouselisting(housesID, current, dom);
                    }
                    break;
            }
        }

        /// <summary>
        /// Saves one <c>HouselistingEntity</c> per usable row of the "#history-list" table.
        /// </summary>
        /// <param name="housesID">Identifier of the community the rows belong to.</param>
        /// <param name="current">The landed page (not read by this method).</param>
        /// <param name="dom">Parsed document containing the history table.</param>
        private void SaveHouselisting(Guid housesID, PageLandEntity current, HtmlAgilityPack.HtmlDocument dom)
        {
            foreach (var node in QueryNodes(dom.DocumentNode, "#history-list tr"))
            {
                var cells = QueryTexts(node, "td").ToArray();
                if (cells.Length < HistoryCellCount)
                {
                    // Rows without all six cells (for example a header row) cannot be mapped.
                    continue;
                }

                // A fresh filler per row, so nothing appended for an earlier row (notably a
                // TransactionDate when this row's date does not parse) leaks into this one.
                var attrs = new AttributeFiller();
                attrs.Append("HousesID:{0}", housesID);

                DateTime transactionDate;
                if (DateTime.TryParse(cells[4], out transactionDate))
                {
                    attrs.Append("TransactionDate:{0}", transactionDate);
                }

                attrs.Append("SoldPriceOrRent:{0}", cells[2]);
                attrs.Append("UnitPriceOrLease:{0}", cells[3]);
                attrs.Append("Apartment:{0}", cells[0]);
                attrs.Append("ServiceBroker:{0}", cells[5]);
                attrs.Append("Area:{0}", cells[1]);

                var bo = new HouselistingEntity();
                attrs.FillEntity(bo);
                Repository.SaveHouselisting(bo);
                Crawler.OutWrite("保存小区出售记录 {0}", housesID);
            }
        }
    }
}
// Original article: https://www.cnblogs.com/Googler/p/4241258.html