采集新闻

面向对象的方式做采集程序
采集新闻
方便扩展
存储到xml


步骤:
1、找对象 文章视为对象 每一个网站视为对象
2、为了方便扩展,采用类似于计算器案例的做法(抽象父类 + 工厂创建具体子类)
把采集的网站视为对象,所有的网站都能够采集 和保存成xml
所以抽象出父类WebSite 抽象类
实现具体的子类cnbeta sina等
3、WebSite 抽象类{ 属性:name(网站名字,只读)、Path(xml 保存路径)、Url(采集的 url);抽象方法:Load(采集新闻)、Save(把新闻保存到 xml 中)}
4、cnbeta、donews 等具体网站类继承 WebSite{ },各自实现 Load 和 Save
5、窗体加载时候根据反射读取每个继承自WebSite的子类的名字,添加到下拉框中
6、点采集按钮的时候,根据下拉框中的内容创建具体的子类,执行采集方法
7、点保存按钮的时候 把采集到的新闻集合,存储在xml中

cnBate
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Xml;

namespace 采集程序3
{
    /// <summary>
    /// <see cref="WebSite"/> implementation that downloads the page at <see cref="WebSite.Url"/>,
    /// extracts news entries from it with a regular expression, and stores them in the XML file
    /// at <see cref="WebSite.Path"/>.
    /// </summary>
    class cnBate : WebSite
    {
        /// <summary>Display name of this site.</summary>
        public override string name
        {
            get { return "cnBate"; }
        }

        // Articles produced by the most recent Load(); Save() writes exactly these.
        List<Article> articles = new List<Article>();

        // Matches one news entry and captures the named groups title, author, time and content.
        string regex = @"<div\s+class=""newslist"">\s+<dl>.+?<strong>(?<title>.+?)</strong></a>.+?<span>(?<author>.+?)发布于\s+(?<time>\d{4}\-\d{2}\-\d{2}\s+?\d{2}:\d{2}:\d{2}).+?</a>.+?<span>(?<content>.+?)</span></dd>";

        /// <summary>
        /// Downloads the page at <see cref="WebSite.Url"/> (decoded as gb2312) and parses every
        /// news entry found in it.
        /// </summary>
        /// <returns>The articles parsed by this call; the same list is later written by <see cref="Save"/>.</returns>
        public override List<Article> Load()
        {
            // Start from a fresh list so that calling Load() again does not pile up duplicates
            // (and does not mutate a list handed out by an earlier call).
            articles = new List<Article>();

            string html;
            // WebClient, the response stream and the reader are all disposable.
            using (WebClient wc = new WebClient())
            using (Stream stream = wc.OpenRead(base.Url))
            using (StreamReader sr = new StreamReader(stream, Encoding.GetEncoding("gb2312")))
            {
                // ReadToEnd consumes the whole response; no loop is needed.
                html = sr.ReadToEnd();
            }

            if (string.IsNullOrEmpty(html))
            {
                return articles;
            }

            // Every Match in a MatchCollection is a successful match, so no Success check is required.
            foreach (Match match in Regex.Matches(html, regex, RegexOptions.Singleline))
            {
                DateTime time;
                // The pattern fixes the timestamp to digits in yyyy-MM-dd HH:mm:ss order, so parse it
                // culture-independently. An out-of-range value (e.g. month 13) skips only that entry
                // instead of aborting the whole load.
                if (!DateTime.TryParse(
                        match.Groups["time"].Value,
                        System.Globalization.CultureInfo.InvariantCulture,
                        System.Globalization.DateTimeStyles.None,
                        out time))
                {
                    continue;
                }

                Article article = new Article();
                article.Title = match.Groups["title"].Value;
                article.Author = match.Groups["author"].Value;
                // Strip any markup left inside the captured summary.
                article.Content = Regex.Replace(match.Groups["content"].Value, "<.+?>", "");
                article.Time = time;
                articles.Add(article);
            }

            return articles;
        }

        /// <summary>
        /// Writes the loaded articles to <see cref="WebSite.Path"/>: creates the file when it does
        /// not exist yet, otherwise appends to the existing document.
        /// </summary>
        public override void Save()
        {
            if (!File.Exists(base.Path))
            {
                CreateXml();
            }
            else
            {
                AddXml();
            }
        }

        /// <summary>
        /// Creates a new XML document with a <c>News</c> root holding the loaded articles and
        /// saves it to <see cref="WebSite.Path"/>.
        /// </summary>
        public void CreateXml()
        {
            XmlDocument doc = new XmlDocument();
            doc.AppendChild(doc.CreateXmlDeclaration("1.0", "utf-8", null));
            XmlElement root = doc.CreateElement("News");
            doc.AppendChild(root);

            AppendArticles(doc, root);
            doc.Save(base.Path);
        }

        /// <summary>
        /// Loads the existing XML document at <see cref="WebSite.Path"/>, appends the loaded
        /// articles under its root element and saves it back.
        /// </summary>
        public void AddXml()
        {
            XmlDocument doc = new XmlDocument();
            doc.Load(base.Path);

            AppendArticles(doc, doc.DocumentElement);
            doc.Save(base.Path);
        }

        // Appends one <New> element (Title, Author, Content, Time) per loaded article to parent.
        private void AppendArticles(XmlDocument doc, XmlElement parent)
        {
            foreach (Article item in articles)
            {
                XmlElement child = doc.CreateElement("New");
                parent.AppendChild(child);
                CreateItems(doc, child, item.Title, "Title");
                CreateItems(doc, child, item.Author, "Author");
                CreateItems(doc, child, item.Content, "Content");
                CreateItems(doc, child, item.Time.ToString(), "Time");
            }
        }

        // Appends an element named str, with inner text item, to child.
        private static void CreateItems(XmlDocument doc, XmlElement child, string item, string str)
        {
            XmlElement element = doc.CreateElement(str);
            element.InnerText = item;
            child.AppendChild(element);
        }
    }
}
WebSite
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace 采集程序3
{
    /// <summary>
    /// Base type for a site that news can be collected from. Concrete sites supply a display
    /// name and implement how articles are loaded and saved.
    /// </summary>
    public abstract class WebSite
    {
        /// <summary>Read-only display name of the site.</summary>
        public abstract string name { get; }

        /// <summary>Address of the page to collect from.</summary>
        public string Url { get; set; }

        /// <summary>Location of the file that <see cref="Save"/> writes to.</summary>
        public string Path { get; set; }

        /// <summary>Collects the articles from the site.</summary>
        /// <returns>The collected articles.</returns>
        public abstract List<Article> Load();

        /// <summary>Persists the collected articles.</summary>
        public abstract void Save();
    }
}
Article
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace 采集程序3
{
    /// <summary>
    /// Plain data holder for one news article: title, author, content and timestamp.
    /// </summary>
    public class Article
    {
        /// <summary>Headline of the article.</summary>
        public string Title { get; set; }

        /// <summary>Author of the article.</summary>
        public string Author { get; set; }

        /// <summary>Body or summary text of the article.</summary>
        public string Content { get; set; }

        /// <summary>Timestamp associated with the article.</summary>
        public DateTime Time { get; set; }
    }
}
Fectory
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace 采集程序3
{
    /// <summary>
    /// Simple factory that maps a site name to a configured <see cref="WebSite"/> instance.
    /// </summary>
    class Fectory
    {
        /// <summary>
        /// Creates the site registered under <paramref name="type"/>, with its save path and
        /// source URL already set.
        /// </summary>
        /// <param name="type">Site name, e.g. "cnBate".</param>
        /// <returns>The configured site, or <c>null</c> when the name is not recognised.</returns>
        public static WebSite CreateObj(string type)
        {
            // Only one site is registered; anything else yields null.
            if (type != "cnBate")
            {
                return null;
            }

            return new cnBate
            {
                Path = "cnBate.xml",
                Url = @"http://www.cnbeta.com/"
            };
        }
    }
}
Form1
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Reflection;

namespace 采集程序3
{
    /// <summary>
    /// Main window: lists the available sites, collects articles from the selected one into the
    /// list view, and saves them through the site object.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Fills the combo box with the name of every concrete <see cref="WebSite"/> subclass
        /// found in this form's assembly.
        /// </summary>
        private void Form1_Load(object sender, EventArgs e)
        {
            // Discover the sites by reflection so that a new subclass shows up without touching this form.
            Assembly ass = GetType().Assembly;
            foreach (Type type in ass.GetTypes())
            {
                if (!typeof(WebSite).IsAssignableFrom(type) || type.IsAbstract)
                {
                    continue;
                }

                // Activator.CreateInstance(type) needs a public parameterless constructor;
                // skip types that cannot be created this way instead of throwing during form load.
                if (type.GetConstructor(Type.EmptyTypes) == null)
                {
                    continue;
                }

                WebSite site = (WebSite)Activator.CreateInstance(type);
                comboBox1.Items.Add(site.name);
            }
        }

        // Site used by the most recent load; null until a load has succeeded.
        WebSite ws;

        /// <summary>
        /// Creates the site selected in the combo box, loads its articles and shows them in the list view.
        /// </summary>
        private void btnLoad_Click(object sender, EventArgs e)
        {
            ws = Fectory.CreateObj(comboBox1.Text);
            if (ws == null)
            {
                MessageBox.Show("该选项不存在");
                return;
            }

            List<Article> articles;
            try
            {
                articles = ws.Load();
            }
            catch (System.Net.WebException ex)
            {
                // A network failure should not take the application down; forget the site so that
                // the save button does not act on a load that never completed.
                ws = null;
                MessageBox.Show("采集失败:" + ex.Message);
                return;
            }

            // Show only the current result; ws (and therefore Save) refers to this load alone.
            listView1.Items.Clear();
            foreach (Article item in articles)
            {
                ListViewItem lvi = new ListViewItem(item.Title);
                lvi.SubItems.Add(item.Author);
                lvi.SubItems.Add(item.Content);
                lvi.SubItems.Add(item.Time.ToString());
                listView1.Items.Add(lvi);
            }
        }

        /// <summary>
        /// Saves the articles of the most recently loaded site.
        /// </summary>
        private void btnSave_Click(object sender, EventArgs e)
        {
            // Nothing has been loaded yet (or the last load failed): there is no site to save.
            if (ws == null)
            {
                MessageBox.Show("请先采集新闻");
                return;
            }

            ws.Save();
            MessageBox.Show("保存成功");
        }

        /// <summary>
        /// Shows the first column (the title) of the selected row.
        /// </summary>
        private void listView1_DoubleClick(object sender, EventArgs e)
        {
            // Guard against the event arriving while no row is selected.
            if (listView1.SelectedItems.Count == 0)
            {
                return;
            }

            MessageBox.Show(listView1.SelectedItems[0].SubItems[0].Text);
        }
    }
}

原文地址:https://www.cnblogs.com/hejinyang/p/2818416.html