抓取网页文本内容

本例使用 WebRequest 类抓取网页的文本内容,以 http://novel.hongxiu.com/a/1036665/10425842.html 为例。

代码如下:

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Windows.Forms;

namespace 网页抓取
{
    /// <summary>
    /// Main form of the page-scraping sample. Downloads the page whose URL is held in
    /// <c>label1.Text</c>, strips the HTML tags, cuts out the text between two fixed
    /// markers, saves that text to disk and shows it in <c>richTextBox1</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>Text that marks the beginning of the wanted section.</summary>
        private const string StartMarker = "红|袖|言|情|小|说";

        /// <summary>Text that marks the end of the wanted section.</summary>
        private const string EndMarker = "但是什么?";

        /// <summary>
        /// Number of characters kept from the position of <see cref="EndMarker"/> onwards:
        /// the marker's five characters plus the single character that follows it.
        /// </summary>
        private const int EndMarkerSpan = 6;

        /// <summary>
        /// File the extracted text is written to. A verbatim string is required here:
        /// in a regular literal "\x1" is parsed as the hex escape for U+0001.
        /// </summary>
        private const string OutputPath = @"E:\x1.txt";

        /// <summary>
        /// Matches one HTML tag. "(.|\n)" lets a tag span several lines, because "."
        /// does not match a line feed by default.
        /// </summary>
        private static readonly Regex TagPattern = new Regex("<(.|\n)+?>");

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Fetches the page at <c>label1.Text</c>, extracts the section delimited by
        /// <see cref="StartMarker"/> and <see cref="EndMarker"/>, writes it to
        /// <see cref="OutputPath"/> and displays it in <c>richTextBox1</c>.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// One of the two markers does not occur in the downloaded text.
        /// </exception>
        public void zhuaqu()
        {
            string html;

            // Send the request and read the whole response body. The using blocks make
            // sure the connection and the reader are released even when reading fails.
            WebRequest request = WebRequest.Create(label1.Text);
            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
            {
                // NOTE(review): the body is always decoded as UTF-8; confirm that the
                // target site really serves UTF-8.
                html = reader.ReadToEnd();
            }

            // Remove the HTML tags from the page source.
            string text = thtxt(html);

            int start = text.IndexOf(StartMarker, StringComparison.Ordinal);
            if (start < 0)
            {
                throw new InvalidOperationException("未找到起始标记:" + StartMarker);
            }

            // Search from the start marker onwards so the end can never precede the start.
            int end = text.IndexOf(EndMarker, start, StringComparison.Ordinal);
            if (end < 0)
            {
                throw new InvalidOperationException("未找到结束标记:" + EndMarker);
            }

            // Clamp the length so a page that ends right after the marker does not
            // push Substring past the end of the text.
            int length = Math.Min(end - start + EndMarkerSpan, text.Length - start);
            string subtxt = text.Substring(start, length);

            // Save the extracted text.
            using (StreamWriter writer = new StreamWriter(OutputPath))
            {
                writer.WriteLine(subtxt);
            }

            richTextBox1.Text = subtxt;
        }

        /// <summary>
        /// Click handler of <c>button1</c>: runs <see cref="zhuaqu"/> and reports the
        /// expected failure kinds in a message box instead of letting them escape.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            try
            {
                zhuaqu();
            }
            catch (InvalidOperationException ex)
            {
                // Also covers WebException, which derives from InvalidOperationException.
                ReportFailure(ex);
            }
            catch (IOException ex)
            {
                ReportFailure(ex);
            }
            catch (FormatException ex)
            {
                // Also covers UriFormatException, thrown for a malformed URL.
                ReportFailure(ex);
            }
            catch (NotSupportedException ex)
            {
                ReportFailure(ex);
            }
            catch (UnauthorizedAccessException ex)
            {
                ReportFailure(ex);
            }
        }

        /// <summary>Shows the message of a failed scrape to the user.</summary>
        private void ReportFailure(Exception ex)
        {
            MessageBox.Show(this, "抓取失败:" + ex.Message, "网页抓取",
                MessageBoxButtons.OK, MessageBoxIcon.Error);
        }

        /// <summary>
        /// Removes every HTML tag from <paramref name="Html"/>, then removes any
        /// remaining '&gt;' characters.
        /// </summary>
        /// <param name="Html">HTML source; null is treated as empty.</param>
        /// <returns>The text with the tags removed.</returns>
        private string thtxt(string Html)
        {
            if (string.IsNullOrEmpty(Html))
            {
                return string.Empty;
            }

            string th = TagPattern.Replace(Html, "");

            // NOTE(review): the original also had th.Replace("<", "<"), which changes
            // nothing and was dropped. It was presumably meant to decode an HTML entity;
            // confirm the intended behaviour.
            th = th.Replace(">", "");
            return th;
        }
    }
}

运行效果

原文地址:https://www.cnblogs.com/happinesshappy/p/4579410.html