Writing a Sina Blog Crawler

How this started: my wife said there is a Sina blog whose articles are very well written, and she wanted to print them out to read (she finds reading on a screen uncomfortable). When the wife asks, what can you do? Time to get to work.

Language: C#

Third-party libraries used:

HtmlAgilityPack and NPOI (both available on NuGet)
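
If it helps, both packages can be pulled in from the NuGet Package Manager Console (these are the package IDs as published on NuGet; pick whatever versions suit your project):

Install-Package HtmlAgilityPack
Install-Package NPOI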

Software UI: a plain WinForms window with a URL textbox (txt_url), a crawl button, and a status label (lb_tip).

The idea is simple: download the article-list page, follow each article link, pull out the title and body with XPath, save each article as a .docx with NPOI, then follow the "next page" link and repeat. It took about an hour to write, so here is the code:

using System;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using HtmlAgilityPack;
using NPOI.XWPF.UserModel;

namespace blogSpider
{
    public partial class Form1 : Form
    {
        private int _successNum;
        private static readonly string wordFilePath = Environment.CurrentDirectory + @"\"; //output root; later paths append "word\..."
        public Form1()
        {
            InitializeComponent();
        }


        /// <summary>
        /// Crawl button handler
        /// </summary>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        private void button1_Click(object sender, EventArgs e)
        {
            if (!txt_url.Text.Contains("http"))
            {
                MessageBox.Show(@"Please enter a valid URL");
                return;
            }

            //Run the crawl on a background task so the UI thread stays responsive
            Task.Run(() =>
            {
                if (!Directory.Exists(wordFilePath + "word"))
                {
                    Directory.CreateDirectory(wordFilePath + "word");
                }

                using (var webClient = new WebClient())
                {
                    GetBlogContentRecursion(txt_url.Text.Trim(), webClient);
                }

                Invoke((MethodInvoker) (() =>
                {
                    lb_tip.Text = @"Crawl finished";
                }));
            });
        }


        /// <summary>
        /// Recursively crawls a list page: saves every article on it, then follows the "next page" link
        /// </summary>
        /// <param name="url">URL of an article-list page</param>
        /// <param name="wb">shared WebClient instance</param>
        private void GetBlogContentRecursion(string url, WebClient wb)
        {
            wb.Encoding = Encoding.UTF8;
            var doc = new HtmlAgilityPack.HtmlDocument();
            string html = wb.DownloadString(url);
            doc.LoadHtml(html);
            HtmlNode parentNode = doc.DocumentNode;
            HtmlNodeCollection hrefNode = parentNode.SelectNodes("//span[@class='atc_title']/a");  //all article links on the current list page

            //Random rd = new Random();
            foreach (HtmlNode node in hrefNode)
            {
                //Fetch the article detail page (with a delay further down)
                Console.WriteLine(node.Attributes["href"].Value);

                var contentDoc = new HtmlAgilityPack.HtmlDocument();
                string contentHtml = wb.DownloadString(node.Attributes["href"].Value);
                contentDoc.LoadHtml(contentHtml);
                HtmlNode contentNode = contentDoc.DocumentNode;
                var titleNode = contentNode.SelectSingleNode("//h2[@class='titName SG_txta']");  //article title
                var articleNode = contentNode.SelectSingleNode("//div[@class='articalContent   ']");  //article body
                if (titleNode != null && articleNode != null)
                {
                    if (File.Exists($@"{wordFilePath}word\{titleNode.InnerText}.docx")) continue; //skip if a Word document with the same title already exists
                    GenerateWord(titleNode.InnerText.Replace("&nbsp;", ""), articleNode.InnerText.Replace("&nbsp;", ""));
                }

                Invoke((MethodInvoker)(() =>
                {
                    lb_tip.Text = $@"Crawled {++_successNum} articles so far. Title: {node.InnerText}";
                }));
                
                Thread.Sleep(1000);  //delay between requests so Sina doesn't block the IP
            }

            var nextPageNode = parentNode.SelectSingleNode("//li[@class='SG_pgnext']/a"); //link to the next list page
            if (nextPageNode != null)
            {
                GetBlogContentRecursion(nextPageNode.Attributes["href"].Value, wb);
            }
        }


        /// <summary>
        /// Generate a Word document from an article title and body
        /// </summary>
        /// <param name="title"></param>
        /// <param name="content"></param>
        private void GenerateWord(string title, string content)
        {
            var doc = new XWPFDocument();
            var p1 = doc.CreateParagraph();
            p1.Alignment = ParagraphAlignment.CENTER;
            var r1 = p1.CreateRun();
            r1.FontSize = 16;
            r1.IsBold = true;
            r1.SetText(title);

            var p2 = doc.CreateParagraph();
            p2.Alignment = ParagraphAlignment.LEFT;
            var r2 = p2.CreateRun();
            r2.SetText(content);

            //strip punctuation and other characters that are awkward in file names before using the title as the file name
            title = Regex.Replace(title, @"[\[\]\^\-_*×――(^)$%~!@#$…&%¥—+=<>《》!!???::•`·、。,;,.;""‘’“”-]", "");
            var crtFileName = $@"{wordFilePath}word\{title}.docx";

            var f = new FileStream(crtFileName, FileMode.Create); //the FileStream declaration was missing from the listing; reconstructed here
            doc.Write(f);
            f.Close();
        }
    }
}
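
The regex in GenerateWord is only there to strip punctuation so the title can safely be used as a file name. As a rough alternative sketch (not from the original post), .NET's Path.GetInvalidFileNameChars() can be used to remove just the characters the file system actually rejects:

using System.IO;
using System.Linq;

static class FileNameHelper
{
    //Remove only the characters the OS forbids in file names;
    //unlike the regex above, this keeps Chinese punctuation such as 《》 and 。 in the title
    public static string Sanitize(string name)
    {
        char[] invalid = Path.GetInvalidFileNameChars();
        return new string(name.Where(c => !invalid.Contains(c)).ToArray());
    }
}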

Original post: https://www.cnblogs.com/zhb7769/p/13650358.html