C#/winform采集百度hi文章

public partial class Form1 : Form
    {
        /// <summary>Timeout, in milliseconds, applied to every HTTP request.</summary>
        private const int RequestTimeoutMs = 3000;

        /// <summary>Name of the text encoding the scraped pages are decoded with.</summary>
        private const string PageEncodingName = "gb2312";

        /// <summary>Directory that receives the generated XML files.</summary>
        private const string OutputDirectory = "f:\\p\\";

        /// <summary>Opening tag that precedes the article body on an article page.</summary>
        private const string ArticleStartMarker = "<div id=\"blog_text\" class=\"cnt\">";

        /// <summary>Worker thread that runs <see cref="doit"/> so the UI thread stays responsive.</summary>
        Thread newth;

        /// <summary>
        /// Creates the form and its designer-generated controls.
        /// </summary>
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Starts the scrape on a background thread. Clicks that arrive while a
        /// previous scrape is still running are ignored.
        /// </summary>
        /// <param name="sender">The button that raised the event.</param>
        /// <param name="e">Event data (unused).</param>
        private void buttonGo_Click(object sender, EventArgs e)
        {
            // Two concurrent scrapes would write the same files and fight over the progress bar.
            if (newth != null && newth.IsAlive)
            {
                return;
            }

            newth = new Thread(new ThreadStart(doit));
            // A background thread does not keep the process alive after the form is closed.
            newth.IsBackground = true;
            newth.Start();
        }

        /// <summary>
        /// Thread entry point: downloads every list page of the blog, downloads each
        /// article found on it, writes one XML file per article plus a combined
        /// "Articel.xml", and finally binds the article list to the grid.
        /// All control access is marshaled to the UI thread.
        /// </summary>
        void doit()
        {
            try
            {
                DataTable dt = CreateArticleTable();  // every article found, across all list pages
                DataTable dt2 = CreateArticleTable(); // scratch table holding the single article being exported

                string url = "http://hi.baidu.com/306759613/blog/index/";   // list pages: .../index/0, .../index/1, ...
                string arcurl = "http://hi.baidu.com/306759613/blog/item/"; // article pages: .../item/<id>.html

                Directory.CreateDirectory(OutputDirectory);

                // The first list page carries the paging links used to discover the page count.
                string firstPageHtml = DownloadPage(url + 0);
                int pagecount = CountPages(new html(firstPageHtml));

                ReportProgress(0);

                for (int i = 0; i < pagecount; i++)
                {
                    // Page 0 has already been downloaded above; reuse it.
                    string listHtml = (i == 0) ? firstPageHtml : DownloadPage(url + i);
                    DataTable dti = new html(listHtml).getAritcleTable();

                    ReportProgress(i * 100 / pagecount);

                    for (int j = 0; j < dti.Rows.Count; j++)
                    {
                        DataRow article = dti.Rows[j];
                        dt.Rows.Add(article.ItemArray);

                        string articleId = article[1].ToString();
                        string body = ExtractArticleBody(DownloadPage(arcurl + articleId + ".html"));

                        // Columns: title, link, description (full body), pubDate, category.
                        dt2.Rows.Clear();
                        dt2.Rows.Add(new object[] { article[0], article[1], body, article[3], article[4] });
                        writeXML(dt2, OutputDirectory + ToSafeFileName(articleId) + ".xml");

                        // Overall progress = articles finished so far / estimated total articles.
                        ReportProgress((i * dti.Rows.Count + j + 1) * 100 / (pagecount * dti.Rows.Count));
                    }
                }

                // Write the combined file before the table is bound to the grid, so the
                // table is no longer modified from this thread once the UI owns it.
                writeXML(dt, OutputDirectory + "Articel.xml");

                RunOnUiThread(delegate
                {
                    progressBar1.Value = progressBar1.Maximum;
                    label1.Text = progressBar1.Value + "%";
                    dataGridView1.DataSource = dt;
                });

                AppendDebugLine("写入完毕");
            }
            catch (Exception ex)
            {
                // An unhandled exception on a worker thread terminates the whole process,
                // so failures (timeouts, I/O errors, unexpected markup) are reported instead.
                AppendDebugLine("采集失败: " + ex.Message);
            }
        }

        /// <summary>
        /// Creates an empty table with the five article columns.
        /// </summary>
        /// <returns>A table with columns title, link, description, pubDate and category.</returns>
        private static DataTable CreateArticleTable()
        {
            DataTable table = new DataTable();
            foreach (string column in new string[] { "title", "link", "description", "pubDate", "category" })
            {
                table.Columns.Add(new DataColumn(column));
            }

            return table;
        }

        /// <summary>
        /// Downloads a page and decodes it with <see cref="PageEncodingName"/>.
        /// The response and reader are always released, even when reading fails.
        /// </summary>
        /// <param name="address">Absolute URL of the page.</param>
        /// <returns>The complete response body.</returns>
        private static string DownloadPage(string address)
        {
            WebRequest request = WebRequest.Create(address);
            request.Timeout = RequestTimeoutMs;

            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(
                response.GetResponseStream(), System.Text.Encoding.GetEncoding(PageEncodingName)))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Derives the number of list pages from the paging links on the first page.
        /// Pages are numbered from 0, so the count is the highest index plus one.
        /// </summary>
        /// <param name="firstPage">Parsed first list page.</param>
        /// <returns>The page count; 1 when the page has no paging links.</returns>
        private static int CountPages(html firstPage)
        {
            int lastIndex = 0;
            foreach (string href in firstPage.getElementsByRegex(@"/blog/index/[\d]+"))
            {
                int index;
                if (int.TryParse(href.Replace("/blog/index/", ""), out index) && index > lastIndex)
                {
                    lastIndex = index;
                }
            }

            return lastIndex + 1;
        }

        /// <summary>
        /// Cuts the article body out of an article page.
        /// </summary>
        /// <param name="pageHtml">Complete HTML of the article page.</param>
        /// <returns>
        /// The markup between <see cref="ArticleStartMarker"/> and the next "&lt;/div",
        /// or an empty string when the start marker is missing.
        /// </returns>
        private static string ExtractArticleBody(string pageHtml)
        {
            int markerAt = pageHtml.IndexOf(ArticleStartMarker, StringComparison.Ordinal);
            if (markerAt < 0)
            {
                return string.Empty;
            }

            int start = markerAt + ArticleStartMarker.Length;
            // NOTE: the body ends at the first closing div, so a body that itself
            // contains nested divs is cut short at the first of them.
            int end = pageHtml.IndexOf("</div", start, StringComparison.Ordinal);
            if (end < 0)
            {
                end = pageHtml.Length;
            }

            return pageHtml.Substring(start, end - start);
        }

        /// <summary>
        /// Replaces characters that are not valid in a file name. The name originates
        /// from downloaded HTML, so path separators must not reach the file system.
        /// </summary>
        /// <param name="name">Raw file name without extension.</param>
        /// <returns>The name with every invalid character replaced by '_'.</returns>
        private static string ToSafeFileName(string name)
        {
            foreach (char invalid in Path.GetInvalidFileNameChars())
            {
                name = name.Replace(invalid, '_');
            }

            return name;
        }

        /// <summary>
        /// Runs an action on the UI thread and waits for it to finish.
        /// Does nothing when the form has no window handle (not shown yet or already closed).
        /// </summary>
        /// <param name="action">Code that touches controls.</param>
        private void RunOnUiThread(MethodInvoker action)
        {
            if (IsDisposed || !IsHandleCreated)
            {
                return;
            }

            if (!InvokeRequired)
            {
                action();
                return;
            }

            try
            {
                Invoke(action);
            }
            catch (InvalidOperationException)
            {
                // The form was closed between the check above and the call
                // (ObjectDisposedException derives from this type); there is no UI left to update.
            }
        }

        /// <summary>
        /// Shows a progress value on the progress bar and as text on the label.
        /// </summary>
        /// <param name="percent">Progress value; clamped to the progress bar's range.</param>
        private void ReportProgress(int percent)
        {
            RunOnUiThread(delegate
            {
                int clamped = Math.Max(progressBar1.Minimum, Math.Min(progressBar1.Maximum, percent));
                progressBar1.Value = clamped;
                label1.Text = clamped + "%";
            });
        }

        /// <summary>
        /// Appends one line to the debug text box.
        /// </summary>
        /// <param name="message">Text to append, without the line break.</param>
        private void AppendDebugLine(string message)
        {
            RunOnUiThread(delegate
            {
                textBoxDebug.Text = textBoxDebug.Text + message + "\r\n";
            });
        }

        /// <summary>
        /// Writes a data table to a UTF-8 XML file. Every row becomes an "articels"
        /// element and every column a child element whose value is wrapped in CDATA.
        /// As a side effect the table's TableName is set to "articels".
        /// </summary>
        /// <param name="dt">Table to export.</param>
        /// <param name="fileName">Path of the file to create or overwrite.</param>
        public void writeXML(DataTable dt, string fileName)
        {
            dt.TableName = "articels";

            System.Text.StringBuilder xml = new System.Text.StringBuilder();
            xml.Append("<?xml version=\"1.0\" encoding=\"utf-8\"?>\r\n");
            xml.Append("<?xml-stylesheet href=\"t.xsl\" type=\"text/xsl\"?>\r\n");
            xml.Append("<root>\r\n");

            foreach (DataRow row in dt.Rows)
            {
                xml.Append("<").Append(dt.TableName).Append(">\r\n");

                foreach (DataColumn column in dt.Columns)
                {
                    // "]]>" would terminate the CDATA section early, so it is split
                    // across two adjacent sections.
                    string value = row[column].ToString().Replace("]]>", "]]]]><![CDATA[>");

                    xml.Append("<").Append(column.ColumnName).Append(">\r\n<![CDATA[\r\n");
                    xml.Append(value).Append("\r\n");
                    xml.Append("]]>\r\n</").Append(column.ColumnName).Append(">\r\n");
                }

                xml.Append("</").Append(dt.TableName).Append(">\r\n");
            }

            xml.Append("</root>\r\n");

            using (StreamWriter writer = new StreamWriter(fileName, false, System.Text.Encoding.UTF8))
            {
                writer.Write(xml.ToString());
            }
        }

    }

    class html

    {

        string htmltext=string.Empty;

        /// <summary>

        /// 构造函数

        /// </summary>

        /// <param name="htmltext"></param>

        public html( string htmltext) {

            this.htmltext = htmltext;       

        }

        /// <summary>

        /// 获取文章列表

        /// </summary>

        /// <returns></returns>

        public DataTable getAritcleTable(){

            DataTable dt = new DataTable();

            int start = htmltext.IndexOf("div id=\"m_blog\" class=\"modbox\">");//起始位置

            int end = htmltext.IndexOf("<div id=\"mod_artclg\" class=\"mod\">");//结束位置

            string htm = htmltext.Substring(start-1, end - start -1 );

            dt.Columns.Add(new DataColumn("title"));

            dt.Columns.Add(new DataColumn("link"));

            dt.Columns.Add(new DataColumn("description"));

            dt.Columns.Add(new DataColumn("pubDate"));

            dt.Columns.Add(new DataColumn("category"));

            string title, link, description, pubDate, category,temp;

            int nstart, nend;//记录上次提取位置

            start = 0;

            do

            {//遍历html文档 提取文章信息

                nstart = htm.IndexOf("<div class=\"tit\">",start) + "<div class=\"tit\">".Length;

                if (nstart < start) break;

                start = nstart;

                nend = htm.IndexOf("</div>",start);

                start = nend + 5;

                temp = htm.Substring(nstart, nend - nstart );

                nstart = temp.IndexOf(">");

                nend =temp.IndexOf("</a>");

                title = temp.Substring(nstart + 1, nend-nstart-1 );//文章标题

                nstart = temp.IndexOf("\"");

                nend = temp.IndexOf("\"", nstart + 1);

                link = temp.Substring(nstart + 1, nend - nstart-1 );//链接

                nstart = link.IndexOf("item/")+"item/".Length;

                nend = link.IndexOf(".html");

                link = link.Substring(nstart, nend - nstart);//取文件名(去除扩展名)

                nstart = htm.IndexOf("<div class=\"date\">", start)+ "<div class=\"date\">".Length;

                start = nstart;

                nend = htm.IndexOf("</div>", start);

                pubDate = htm.Substring(nstart , nend - nstart);//发表日期

                start = nend + 5;

                nstart = htm.IndexOf("<div class=\"cnt\">", start) + "<div class=\"cnt\">".Length;

                start = nstart;

                nend = htm.IndexOf("</div>", start);

                start = nend + 5;

                description = htm.Substring(nstart, nend - nstart );//文章内容

                nstart = htm.IndexOf("<div class=\"opt\">", start) + "<div class=\"opt\">".Length;

                start = nstart;

                nend = htm.IndexOf("</div>", start);

                start = nend + 5;

                temp = htm.Substring(nstart, nend - nstart );

                nstart = temp.IndexOf("");

                nend =temp.IndexOf("</a>");

                category=temp.Substring(nstart + 1, nend - nstart - 1); //文章分类            

dt.Rows.Add(new string[] { title, link, description, pubDate, category });

            } while (nstart > 0);

         

            return dt;

        }

    }

原文地址:https://www.cnblogs.com/top5/p/1542578.html