C# 使用http网址, 爬取该网页的html内容

主要的实现方式为:创建流对象,读取数据。

但是有的网页不是直接绑定数据的,而是js动态加载的数据。

所以下面给出两种方法,分别针对这两种不同的情况。

一,直接加载数据的网页

 /// <summary>
 /// Downloads the page at <paramref name="url"/> and returns its
 /// <c>&lt;title&gt;...&lt;/title&gt;</c> element (tags included) with any
 /// double-quote characters removed.
 /// </summary>
 /// <param name="url">Address of the page; surrounding whitespace is trimmed.</param>
 /// <returns>
 /// The matched title element, an empty string when the page has no title
 /// element, or a fixed error message when the request cannot be created or
 /// no response can be obtained.
 /// </returns>
 public String getTitle(String url)
        {
            WebResponse webRes = null;
            Stream webStream = null;
            try
            {
                // Creating the request is inside the try so that a null or malformed
                // URL yields the same error message as an unreachable one.
                System.Net.WebRequest wb = System.Net.WebRequest.Create(url.Trim());
                webRes = wb.GetResponse();
                webStream = webRes.GetResponseStream();
            }
            catch (Exception)
            {
                // Release the response if it was obtained before the failure.
                if (webRes != null)
                {
                    webRes.Close();
                }
                return "输入的网址不存在或非法...";
            }

            // Page text with line breaks dropped, so the title regex below can
            // match a title that spans several lines.
            StringBuilder sb = new StringBuilder();

            // Dispose the response, its stream and the reader even if reading fails.
            // If the text comes out garbled, change the encoding passed to the reader.
            using (webRes)
            using (webStream)
            using (StreamReader sr = new StreamReader(webStream, System.Text.Encoding.UTF8))
            {
                String str;
                while ((str = sr.ReadLine()) != null)
                {
                    sb.Append(str);
                }
            }

            // Non-greedy so the match stops at the first closing tag instead of
            // running to the last </title> in the document.
            String regex = @"<title>.+?</title>";

            String title = Regex.Match(sb.ToString(), regex).ToString();
            title = Regex.Replace(title, @"[""]+", "");
            return title;
        }

二,js动态加载数据的网页

/// <summary>
/// Issues a GET request for <paramref name="url"/> and returns the text
/// between the first <c>&lt;title&gt;</c> and the following
/// <c>&lt;/title&gt;</c> in the response body.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="post_parament">Not used by this method; kept so existing callers still compile.</param>
/// <returns>The title text, or an empty string when no title element is found.</returns>
public static string GetTitel(string url, string post_parament)
        {
            string html; // page source

            HttpWebRequest Web_Request = (HttpWebRequest)WebRequest.Create(url);
            Web_Request.Timeout = 30000;
            Web_Request.Method = "GET";
            Web_Request.UserAgent = "Mozilla/4.0";
            // Let the framework advertise and undo gzip/deflate itself. Previously
            // "gzip, deflate" was advertised but only gzip bodies were decompressed,
            // so a deflate response was read as garbage.
            Web_Request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
            //Web_Request.Credentials = CredentialCache.DefaultCredentials;

            // Optional proxy: assign it to the request before sending.
            //WebProxy proxy = new WebProxy("111.13.7.120", 80);
            //Web_Request.Proxy = proxy;

            // Dispose the response, its stream and the reader even if reading fails.
            using (HttpWebResponse Web_Response = (HttpWebResponse)Web_Request.GetResponse())
            using (Stream Stream_Receive = Web_Response.GetResponseStream())
            using (StreamReader Stream_Reader = new StreamReader(Stream_Receive, Encoding.UTF8))
            {
                html = Stream_Reader.ReadToEnd();
            }

            // Non-greedy so the match stops at the first closing tag; Singleline so a
            // title that spans several lines is still matched.
            Match m = Regex.Match(html, "<title>(.*?)</title>", RegexOptions.Singleline);
            if (m.Success)
            {
                return m.Groups[1].Value;
            }
            else
            {
                return "";
            }
        }
原文地址:https://www.cnblogs.com/liuzheng0612/p/11839807.html