使用 C# 采集目标网页

经常做网站的朋友会用到一个叫做“火车头”的采集工具,它可以方便地把网页上的重复数据抓取下来。但有的时候我们需要自己去抓取页面数据,而火车头存在一些限制、无法完成任务,这时可以使用 C# 的正则表达式配合字符串处理来实现火车头的采集功能。

using System.IO;
using System.Net;
using System.Text;
using System;
using System.Text.RegularExpressions;
using System.Collections.Generic;


namespace testtaobao {
    /// <summary>
    /// Small web-scraping helper: downloads a page, normalizes its markup and
    /// extracts the text located between two delimiters using regular expressions.
    /// </summary>
    public class caiji
    {
        #region Fetch page content
        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and decodes the response body.
        /// </summary>
        /// <param name="url">Address of the page to fetch.</param>
        /// <param name="code">Name of the encoding used to decode the response, e.g. GB2312.</param>
        /// <returns>The page source as a string.</returns>
        /// <remarks>
        /// Exceptions raised by the request or by the encoding lookup propagate to the
        /// caller unchanged, with their original stack trace.
        /// </remarks>
        public string gethtml(string url, string code)
        {
            // Resolve the encoding before sending the request so an invalid name
            // fails without opening a connection.
            Encoding encoding = Encoding.GetEncoding(code);

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = 30000; // request timeout, in milliseconds
            request.Headers.Set("Pragma", "no-cache");

            // The response, its stream and the reader are all disposable; release them
            // deterministically so the underlying connection is returned promptly.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream streamReceive = response.GetResponseStream())
            using (StreamReader streamReader = new StreamReader(streamReceive, encoding))
            {
                return streamReader.ReadToEnd();
            }
        }
        #endregion

        #region Strip line breaks
        /// <summary>
        /// Removes double quotes and every line-break character (CR and LF) from the
        /// page source so that single-line regular expressions can match across what
        /// used to be separate lines.
        /// </summary>
        /// <param name="HtmlCode">HTML source; may be null or empty.</param>
        /// <returns>
        /// The source without double quotes and line breaks, or an empty string when
        /// <paramref name="HtmlCode"/> is null or empty.
        /// </returns>
        public string ReplaceEnter(string HtmlCode)
        {
            if (string.IsNullOrEmpty(HtmlCode))
            {
                return "";
            }

            // Removing CR and LF individually also covers the CRLF pair, plus pages
            // that use bare LF (or bare CR) line endings.
            return HtmlCode.Replace("\"", "")
                           .Replace("\r", "")
                           .Replace("\n", "");
        }
        #endregion

        #region Run a regular expression
        /// <summary>
        /// Runs a regular expression against the given text.
        /// </summary>
        /// <param name="RegexString">Regular expression pattern.</param>
        /// <param name="RemoteStr">Text to search, typically HTML source.</param>
        /// <returns>All matches of the pattern, evaluated with <see cref="RegexOptions.Multiline"/>.</returns>
        public MatchCollection GetRegValue(string RegexString, string RemoteStr)
        {
            return Regex.Matches(RemoteStr, RegexString, RegexOptions.Multiline);
        }
        #endregion


        #region Extract target strings
        /// <summary>
        /// Extracts every piece of text that sits between a leading and a trailing delimiter.
        /// </summary>
        /// <param name="fstr">Delimiter preceding the target text; interpreted as a regular expression.</param>
        /// <param name="estr">Delimiter following the target text; interpreted as a regular expression.</param>
        /// <param name="scstr">Source text to search.</param>
        /// <returns>The text found between each delimiter pair, in order of appearance.</returns>
        public List<string> getstr(string fstr, string estr, string scstr)
        {
            // Capture the text between the delimiters in a named group instead of
            // stripping the delimiters from the whole match afterwards: the delimiters
            // are regex patterns, so a literal string replace cannot remove them
            // reliably and could also delete matching text inside the target itself.
            const string targetGroup = "caijiTarget";
            string regstr = fstr + "(?<" + targetGroup + ">.*?)" + estr;

            List<string> rlist = new List<string>();
            foreach (Match found in GetRegValue(regstr, scstr))
            {
                rlist.Add(found.Groups[targetGroup].Value);
            }
            return rlist;
        }
        #endregion
    }
}
原文地址:https://www.cnblogs.com/lijurui/p/2703221.html