没事虫子爬个书...

x

看到程序猿写爬虫的故事……一个无聊的周末……也想用 Jumony 爬点书，囤起来……仓鼠症……

using Ivony.Html;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Windows.Forms;

namespace BookGet
{
    /// <summary>
    /// Main window of the scraper. Clicking <c>button1</c> walks the category
    /// listing pages, collects every book link found there and then downloads
    /// each book chapter by chapter into a UTF-8 text file.
    /// </summary>
    /// <remarks>
    /// All work runs synchronously inside the click handler, so the form does
    /// not repaint or react to input until the whole batch has finished.
    /// </remarks>
    public partial class Form1 : Form
    {
        // Root of the site; the "start reading" link of a book is resolved against it.
        private const string SiteRootUrl = "https://m.xxx.net/";

        // Directory of the category listing pages.
        private const string ListingBaseUrl = "https://m.xxx.net/wapsort/";

        // First listing page to visit, relative to ListingBaseUrl.
        private const string FirstListingPage = "11_1.html";

        // Upper bound on the number of listing pages fetched in one run.
        private const int MaxListingPages = 20;

        // Folder that receives one "<book name>.txt" file per book.
        // Verbatim string: a regular literal would treat "\y" as an (invalid) escape.
        private const string OutputDirectory = @"D:\yuzhaiwu";

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Collects the book list, downloads every book and reports the outcome.
        /// </summary>
        /// <param name="sender">The button that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            Dictionary<string, string> bookInfoDic = CollectBooks();

            int failedCount = 0;
            foreach (var item in bookInfoDic)
            {
                if (!DownloadBook(item.Key, item.Value))
                {
                    failedCount++;
                }
            }

            // Only claim full success when every book was actually written.
            if (failedCount == 0)
            {
                MessageBox.Show("全部成功!");
            }
            else
            {
                MessageBox.Show(string.Format("完成,但有{0}本书下载失败!", failedCount));
            }
        }

        /// <summary>
        /// Walks the listing pages starting at <see cref="FirstListingPage"/> and
        /// gathers the books linked from them.
        /// </summary>
        /// <returns>
        /// A map from book title to absolute book URL. When a title occurs more
        /// than once, the first URL seen is kept.
        /// </returns>
        private static Dictionary<string, string> CollectBooks()
        {
            Dictionary<string, string> books = new Dictionary<string, string>();
            string nextUrl = ListingBaseUrl + FirstListingPage;
            int pagesVisited = 0;

            while (!string.IsNullOrEmpty(nextUrl) && pagesVisited < MaxListingPages)
            {
                pagesVisited++;
                string url = nextUrl;
                try
                {
                    var doc = new Ivony.Html.Parser.JumonyParser().LoadDocument(url);

                    // Every book link on this listing page.
                    foreach (var bookItem in doc.Find("#nr_body div div.common-bookele h3 a"))
                    {
                        string bookName = bookItem.InnerText();
                        // Resolve against the listing page so relative links work too.
                        string bookUrl = ResolveUrl(url, bookItem.Attribute("href").Value());

                        if (!string.IsNullOrEmpty(bookName)
                            && !string.IsNullOrEmpty(bookUrl)
                            && !books.ContainsKey(bookName))
                        {
                            books.Add(bookName, bookUrl);
                        }
                    }

                    var domNext = doc.FindFirst("#nr_body div#page a.next");
                    nextUrl = ResolveUrl(url, domNext.Attribute("href").Value());

                    // NOTE(review): the class value "prev none" is taken to mean
                    // "no further page"; confirm against the site's markup.
                    if (domNext.Attribute("class").Value() == "prev none")
                    {
                        nextUrl = "";
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: stop paging but keep the books gathered so far.
                    Console.WriteLine(string.Format("{0}没有成功", url));
                    Console.WriteLine(ex.Message);
                    nextUrl = "";
                }
            }

            return books;
        }

        /// <summary>
        /// Downloads all chapters of one book and appends them to
        /// "<see cref="OutputDirectory"/>\&lt;book name&gt;.txt".
        /// </summary>
        /// <param name="bookName">Book title; used (sanitised) as the file name.</param>
        /// <param name="bookUrl">Absolute URL of the book's main page.</param>
        /// <returns>
        /// <c>true</c> when at least one chapter was fetched, no error occurred
        /// and the text was written; otherwise <c>false</c>.
        /// </returns>
        /// <remarks>
        /// Errors are shown in a message box and do not propagate. Chapters
        /// fetched before a download error are still written to the file.
        /// Nothing is written when no chapter could be fetched.
        /// </remarks>
        private static bool DownloadBook(string bookName, string bookUrl)
        {
            StringBuilder bookText = new StringBuilder();
            bool succeeded = false;

            try
            {
                // Book main page, which carries the "start reading" button.
                var mainPage = new Ivony.Html.Parser.JumonyParser().LoadDocument(bookUrl);
                var beginReadEle = mainPage.FindFirst("#novelMain a.btn");
                string nextTextPage = ResolveUrl(SiteRootUrl, beginReadEle.Attribute("href").Value());

                // Remember visited chapter URLs so a link cycle cannot loop forever.
                HashSet<string> visited = new HashSet<string>();

                while (!string.IsNullOrEmpty(nextTextPage) && visited.Add(nextTextPage))
                {
                    string currentUrl = nextTextPage;
                    var page = new Ivony.Html.Parser.JumonyParser().LoadDocument(currentUrl, Encoding.UTF8, true);

                    string chapterTitle = page.FindFirst("#nr_title").InnerText();

                    // Drop empty paragraphs first; once "<p>" is replaced the
                    // "<p></p>" pattern can no longer match.
                    string chapterText = page.FindFirst("#nr1").InnerHtml()
                        .Replace("<p></p>", "")
                        .Replace("<p>", Environment.NewLine)
                        .Replace("</p>", Environment.NewLine);

                    // Three blank lines, the chapter title on its own line, then the text.
                    bookText.AppendLine()
                            .AppendLine()
                            .AppendLine()
                            .AppendLine(chapterTitle)
                            .Append(chapterText);

                    // Link to the next chapter.
                    var nextPageEle = page.FindFirst("#nr_body a#pb_next");
                    nextTextPage = ResolveUrl(currentUrl, nextPageEle.Attribute("href").Value());

                    // On the last chapter the "next" link points back to the book page.
                    if (nextTextPage == bookUrl)
                    {
                        nextTextPage = "";
                    }
                }

                succeeded = bookText.Length > 0;
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.Message);
            }

            if (bookText.Length == 0)
            {
                return false;
            }

            try
            {
                Directory.CreateDirectory(OutputDirectory);
                string bookPath = Path.Combine(OutputDirectory, SanitizeFileName(bookName) + ".txt");

                // append: true creates the file when missing and appends otherwise.
                // The using block flushes and closes the file for every book.
                using (StreamWriter sw = new StreamWriter(bookPath, true, Encoding.UTF8))
                {
                    sw.WriteLine(bookText.ToString());
                }
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.Message);
                succeeded = false;
            }

            return succeeded;
        }

        /// <summary>
        /// Turns an <c>href</c> value into an absolute URL.
        /// </summary>
        /// <param name="baseUrl">Absolute URL the link is relative to.</param>
        /// <param name="href">Link target; may be absolute, relative, null or empty.</param>
        /// <returns>The absolute URL, or an empty string when <paramref name="href"/> is null or empty.</returns>
        /// <exception cref="UriFormatException">The base URL or the link is malformed.</exception>
        private static string ResolveUrl(string baseUrl, string href)
        {
            if (string.IsNullOrEmpty(href))
            {
                return "";
            }

            return new Uri(new Uri(baseUrl), href).AbsoluteUri;
        }

        /// <summary>
        /// Replaces every character that is not allowed in a file name with '_'.
        /// </summary>
        /// <param name="name">Raw name, e.g. a book title scraped from a page.</param>
        /// <returns>A name that can be used as a single path segment.</returns>
        private static string SanitizeFileName(string name)
        {
            StringBuilder safe = new StringBuilder(name);
            foreach (char invalid in Path.GetInvalidFileNameChars())
            {
                safe.Replace(invalid, '_');
            }

            return safe.ToString();
        }
    }
}

x

原文地址:https://www.cnblogs.com/love-zf/p/8612693.html