// 58.com qiyi (enterprise directory, qy.58.com) crawler analyzer

using AnfleCrawler.Common;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace AnfleCrawler.DataAnalyzer
{
    /// <summary>
    /// Analyzer for the 58.com enterprise directory (qy.58.com).
    /// Depth 0 pages are company listings; depth 1 pages are individual company
    /// detail pages, which are parsed into a <c>CompanyEntity</c> and saved.
    /// </summary>
    internal class Qy58 : AnalyzerBase
    {
        /// <summary>
        /// Initializes the analyzer and seeds the crawler with the first listing page at depth 0.
        /// </summary>
        /// <param name="crawler">Crawler that receives the seed URL.</param>
        public override void Init(PageCrawler crawler)
        {
            base.Init(crawler);

            // Seed URL: page 1 ("pn1") of the "caohejing" listing.
            var url = new Uri("http://qy.58.com/caohejing/pn1/?PGTID=14177711280840.45006677554920316&ClickID=1");
            // Sample of what appears to be a company detail page URL (kept for reference):
            //http://qy.58.com/19583455460359/?PGTID=14177659184690.5166369006238447&ClickID=4
            crawler.PushUrl(url, 0);
        }

        /// <summary>
        /// Processes one landed page according to its crawl depth:
        /// depth 0 enqueues the links found under <c>.compList a</c> at depth 1;
        /// depth 1 extracts the company attributes and saves them through the repository.
        /// </summary>
        /// <param name="current">The page being analyzed; its <c>Depth</c> selects the branch.</param>
        protected override void AnalyzeInternal(PageLandEntity current)
        {
            var lander = Crawler.Lander;
            var pHandler = CreateContentHandler(current);
            switch (current.Depth)
            {
                case 0:
                    {
                        // NOTE(review): HACK is defined outside this class; presumably it makes
                        // the lander handle an ajax-loaded part of the listing - confirm.
                        pHandler.AjaxBlocks.Add(HACK);
                        var dom = lander.GetDocument(pHandler);
                        // Paging is delegated to the base helper, driven by the ".next" selector.
                        DoPerPaging(current, dom.DocumentNode, ".next");

                        // Every link in the company list is queued as a depth-1 page.
                        foreach (var node in QueryNodes(dom.DocumentNode, ".compList a"))
                        {
                            var url = GetHref(node, current.Url);
                            Crawler.PushUrl(url, 1);
                        }
                    }
                    break;
                case 1:
                    {
                        var dom = lander.GetDocument(pHandler);
                        var attr = new AttributeFiller();

                        attr.Append("Name:{0}", QueryTexts(dom.DocumentNode, ".compT").First());

                        // The first <th> is skipped (presumably a header/caption cell - confirm).
                        foreach (var th in QueryNodes(dom.DocumentNode, ".basicMsg table th").Skip(1))
                        {
                            string sTh = th.InnerText, sTd;
                            switch (sTh)
                            {
                                case "联系电话":
                                case "邮箱":
                                    {
                                        // Phone and e-mail values are read from an <img> in the
                                        // sibling cell: download the image and run OCR on it.
                                        var iNode = QueryNode(th.NextSibling, "img");
                                        byte[] imgRaw;
                                        // Dispose the WebClient as soon as the download finishes.
                                        using (var client = new System.Net.WebClient())
                                        {
                                            imgRaw = client.DownloadData(GetHref(iNode, current.Url, attrName: "src"));
                                        }
                                        // The stream must stay open while the Bitmap is in use,
                                        // so both are disposed together after OCR completes.
                                        using (var imgStream = new System.IO.MemoryStream(imgRaw))
                                        using (var img = new System.Drawing.Bitmap(imgStream))
                                        {
                                            sTd = OCR(img);
                                        }
                                    }
                                    break;
                                case "公司地址":
                                    // The address is taken from the first <span> text of the sibling cell.
                                    sTd = QueryTexts(th.NextSibling, "span").First();
                                    break;
                                default:
                                    sTd = th.NextSibling.InnerText.HtmlTrim();
                                    break;
                            }
                            attr.Append("{0}:{1}", sTh, sTd);
                        }

                        var bo = new CompanyEntity();
                        bo.City = "上海";
                        bo.GroupName = "漕河泾企业";
                        bo.PageUrl = current.Url.OriginalString;
                        bo.UpdateDate = DateTime.Now;
                        // Map the page's Chinese field labels onto CompanyEntity property names.
                        attr.FillEntity(bo, new Dictionary<string, string>() 
                        {
                            {"公司性质", "Nature"},
                            {"公司行业", "Industry"},
                            {"公司规模", "Scale"},
                            {"联系人", "ContactPerson"},
                            {"企业网址", "Website"},

                            {"联系电话", "Tel"},
                            {"邮箱", "Email"},
                            {"公司地址", "Address"},
                        });
                        Repository.SaveCompany(bo);
                        Crawler.OutWrite("保存企业 {0}", bo.Name);
                    }
                    break;
            }
        }
    }
}
// Original article: https://www.cnblogs.com/Googler/p/4211492.html