Soufun_News

using AnfleCrawler.Common;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;

namespace AnfleCrawler.DataAnalyzer
{
    /// <summary>
    /// Analyzer for the news section at <c>http://news.sh.soufun.com/more/</c>.
    /// </summary>
    /// <remarks>
    /// Pages pushed at depth 0 are list pages, from which article links are collected and
    /// pushed at depth 1; depth 1 pages are articles, which are parsed and handed to the
    /// repository's <c>SaveNews</c>.
    /// </remarks>
    internal class Soufun_News : AnalyzerBase
    {
        /// <summary>
        /// News categories. The numeric value of each member is substituted into the
        /// list-page URL pattern built in <see cref="Init"/>.
        /// </summary>
        private enum Kind
        {
            // "Market"
            [Description("市场")]
            Market = 32,
            // "Policy"
            [Description("政策")]
            Policy = 35,
            // "Company"
            [Description("公司")]
            Company = 736,
        }

        /// <summary>
        /// Names of the tags whose nodes are removed from the article body before saving.
        /// </summary>
        private static readonly string[] FilterTags = new string[] { "script", "iframe" };

        /// <summary>
        /// Seeds the crawler with the list-page URL pattern (one entry per <see cref="Kind"/>
        /// value) at depth 0, then runs the base initialization.
        /// </summary>
        /// <param name="crawler">Crawler that receives the seed URLs.</param>
        public override void Init(PageCrawler crawler)
        {
            // NOTE(review): the "[a,b,c]" / "[1-50]" syntax is presumably expanded by
            // StringPatternGenerator into one URL per combination - confirm against that class.
            string kindIds = string.Join(",", Enum.GetValues(typeof(Kind)).Cast<int>());
            string exp = string.Format("http://news.sh.soufun.com/more/[{0}]/[1-50].html", kindIds);
            crawler.PushUrl(new StringPatternGenerator(exp), 0);
            base.Init(crawler);
        }

        /// <summary>
        /// Rewrites an article URL by inserting <c>_all</c> in front of the file extension of
        /// its last path segment (e.g. <c>.../123.htm</c> becomes <c>.../123_all.htm</c>).
        /// </summary>
        /// <param name="url">Absolute article URL.</param>
        /// <returns>
        /// The rewritten URL, or <paramref name="url"/> unchanged when the last '.' does not
        /// come after the last '/' (i.e. the last segment has no extension).
        /// </returns>
        private static string ToAllPagesUrl(string url)
        {
            // Char overloads are ordinal; the string overload is culture-sensitive.
            int dot = url.LastIndexOf('.');
            if (dot <= url.LastIndexOf('/'))
            {
                // No '.' at all (Insert(-1, ...) would throw) or the '.' belongs to an
                // earlier part of the URL such as the host name.
                return url;
            }
            return url.Insert(dot, "_all");
        }

        /// <summary>
        /// Processes one landed page according to its crawl depth: depth 0 collects article
        /// links, depth 1 extracts an article and saves it. Other depths are ignored.
        /// </summary>
        /// <param name="current">The landed page being analyzed.</param>
        protected override void AnalyzeInternal(PageLandEntity current)
        {
            var lander = Crawler.Lander;
            // Late-bound so SaveNews is resolved at run time on whatever Repository returns.
            dynamic repository = Repository;
            var pHandler = CreateContentHandler(current);
            switch (current.Depth)
            {
                case 0:
                    {
                        // List page: queue every linked article at depth 1.
                        var dom = lander.GetDocument(pHandler);
                        foreach (var node in QueryNodes(dom.DocumentNode, ".contenttext"))
                        {
                            var linkNode = QueryNode(node, "a.link_01");
                            string url = GetHref(linkNode, current.Url).OriginalString;
                            Crawler.PushUrl(new Uri(ToAllPagesUrl(url)), 1);
                        }
                    }
                    break;
                case 1:
                    {
                        // Article page: extract category, title, body, date and source.
                        var dom = lander.GetDocument(pHandler);
                        var hackNode = QueryNode(dom.DocumentNode, "#newxq_B01_26");
                        // The category name is the text of the last link inside that node.
                        string kind = QueryNodes(hackNode, "a").Last().InnerText;
                        string title = QueryNode(dom.DocumentNode, "h1").InnerText;
                        var contentNode = QueryNode(dom.DocumentNode, "#news_body");
                        foreach (string tag in FilterTags)
                        {
                            // Snapshot with ToArray() so nodes can be removed while iterating.
                            // NOTE(review): the meaning of the third argument (false) is defined
                            // by QueryNodes in the base class - not visible here.
                            foreach (var node in QueryNodes(contentNode, tag, false).ToArray())
                            {
                                node.Remove();
                            }
                        }
                        // First span holds the publish date, the optional second one the source.
                        var set = QueryNodes(dom.DocumentNode, "#newxq_B01_27 span").Take(2).ToArray();
                        // Stays at DateTime.MinValue when the span is missing or unparsable.
                        DateTime publishDate = default(DateTime);
                        if (set.Length > 0)
                        {
                            DateTime.TryParse(set[0].InnerText, out publishDate);
                        }
                        string source = set.Length == 2 ? set[1].InnerText : null;
                        repository.SaveNews(current.Url, kind, source, title, contentNode.InnerHtml, publishDate);
                        Crawler.OutWrite("保存新闻 {0}", title);
                    }
                    break;
            }
        }
    }
}
        /// <summary>
        /// Inserts or updates the news row for a page. The row key is the value returned by
        /// <c>CryptoManaged.MD5Hash</c> for the page URL's original string.
        /// </summary>
        /// <param name="pageUrl">URL of the article page; its authority is stored as the site id of a new row.</param>
        /// <param name="kind">Category name of the article.</param>
        /// <param name="source">Source of the article; may be null.</param>
        /// <param name="title">Article title.</param>
        /// <param name="content">Article body markup.</param>
        /// <param name="publishDate">Publish date of the article.</param>
        public void SaveNews(Uri pageUrl, string kind, string source, string title, string content, DateTime publishDate)
        {
            Guid key = CryptoManaged.MD5Hash(pageUrl.OriginalString);
            using (var context = Create())
            {
                // Look up an existing row for this page; at most one is expected.
                var entity = context.News.Where(row => row.RowID == key).SingleOrDefault();
                if (entity == null)
                {
                    // First time this page is seen: create the row with its identity fields.
                    entity = new News();
                    entity.RowID = key;
                    entity.SiteID = pageUrl.Authority;
                    context.News.Add(entity);
                }

                // Mutable fields are overwritten on both insert and update.
                entity.Kind = kind;
                entity.Source = source;
                entity.Title = title;
                entity.Content = content;
                entity.PublishDate = publishDate;

                context._SaveChanges();
            }
        }
原文地址:https://www.cnblogs.com/Googler/p/4181664.html