抓取网页

C# 读取文本文件内容生成相应的文件,获取目录下所有文件名并保存为文本文件

最近因为经常用到2个功能:
1):以一个文件内容为名批量生成相应的文件
2):查找一个目录(包括子目录)下某扩展名的所有文件
所以写了一个小程序,方便多了。
先看效果图:


虽然很简单但须注意:
1. 
扩展名 区分大小写
if (Path.GetExtension(file).ToLower() == mask.ToLower())
一开始没注意这,害得找出的文件总是比正常的文件少
2.
去掉文件名中的非法字符
line = line.Replace("\\", string.Empty);
line = line.Replace("/", string.Empty);
line = line.Replace(":", string.Empty);
line = line.Replace("*", string.Empty);
line = line.Replace("?", string.Empty);
line = line.Replace("\"", string.Empty);
line = line.Replace("<", string.Empty);
line = line.Replace(">", string.Empty);
line = line.Replace("|", string.Empty);
//line = line.Replace(" ", string.Empty);
fs = new FileStream(fileSaveDir + "\\" + line + ext, FileMode.Create);
3.
注意各种细节,一些小问题不容忽视,现在这个程序的 exception 处理还有一些模糊!暂时就不改了。
4.主要代码
c#
//生成文件
//
private void btnCreate_Click(object sender, EventArgs e) // Create one file per line of the name-list file.
        {
            // Inputs come from the form: the list file, the target directory,
            // and the extension to append to every generated file.
            ext = comboBox1.Text;
            fileSaveDir = this.tbxSaveDir.Text;
            fileName = this.tbxFilename.Text;
            if (fileName == "")
            {
                MessageBox.Show("请选择文件名的存放文件。");
                return;
            }
            if (fileSaveDir == "")
            {
                // Default to the directory that contains the list file itself.
                fileSaveDir = new FileInfo(fileName).DirectoryName;
            }
            try
            {
                using (StreamReader sr = new StreamReader(fileName))
                {
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        // Strip characters that are illegal in Windows file names
                        // BEFORE the existence check, so the check and the creation
                        // agree on the name (the original checked the raw line but
                        // created the sanitized one).
                        foreach (char c in Path.GetInvalidFileNameChars())
                        {
                            line = line.Replace(c.ToString(), string.Empty);
                        }
                        //line = line.Replace(" ", string.Empty);
                        if (line.Length == 0)
                        {
                            continue; // nothing usable left after sanitizing
                        }

                        string file = Path.Combine(fileSaveDir, line + ext);
                        if (File.Exists(file))
                        {
                            // Yes = skip this entry and keep going; No = stop.
                            if (DialogResult.Yes == MessageBox.Show(
                                    "文件 \"" + line + ext + "\" 已经存在了!",
                                    "是否忽略已经存在的文件",
                                    MessageBoxButtons.YesNo, MessageBoxIcon.Warning))
                            {
                                continue;
                            }
                            MessageBox.Show("一共生成了" + count + " 个文件。");
                            return;
                        }

                        // using releases the handle; the original leaked every
                        // FileStream it created.
                        using (FileStream fs = new FileStream(file, FileMode.Create))
                        {
                        }
                        count++;
                    }
                }
            }
            catch (ArgumentException arge)
            {
                MessageBox.Show(arge.Message);
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.Message);
            }
            MessageBox.Show("一共生成了" + count + " 个文件。");
            count = 0;
        }

//获取文件名
private void btnGetFileName_Click(object sender, EventArgs e) // Collect matching file names and save them to a text file.
        {
            int fileCount = 0;
            bool fullname = checkBox1.Checked; // true = write full paths, false = names only
            if (this.tbxPath.Text == "" || this.tbxExten_tabPage2.Text == "" || this.tbxSavePath.Text == "")
            {
                MessageBox.Show("请选择目录及扩展名。");
                return;
            }
            String directory = this.tbxPath.Text;
            String mask = this.tbxExten_tabPage2.Text;
            String savepath = this.tbxSavePath.Text;
            findFiles(directory, mask, false, fullname, ref fileCount);
            // FileMode.Create truncates-or-creates in one step, replacing the
            // original Delete + CreateNew pair (which could race and leaked
            // both streams when an exception occurred).
            using (StreamWriter sw = new StreamWriter(new FileStream(savepath, FileMode.Create)))
            {
                foreach (string str in al)
                    sw.WriteLine(str);
            }
            MessageBox.Show("一共获取了" + fileCount + "个文件名。");
            al.Clear();
        }
public void findFiles(string directory, string mask, bool ignoreHidden, bool fullname, ref int fileCount) // Recursively collect file names matching `mask` into `al`.
        {
            // Files in the current directory first.
            // NOTE(review): Directory.GetFiles can throw UnauthorizedAccessException
            // (e.g. "System Volume Information"); as before, that propagates to the caller.
            foreach (string file in Directory.GetFiles(directory, "*.*"))
            {
                if (ignoreHidden && (File.GetAttributes(file) & FileAttributes.Hidden) == FileAttributes.Hidden)
                    continue;
                if (mask == "")
                    continue;
                // Extension comparison must be case-insensitive (".JPG" vs ".jpg");
                // OrdinalIgnoreCase replaces the ToLower()/ToLower() pair.
                if (Path.GetExtension(file).Equals(mask, StringComparison.OrdinalIgnoreCase))
                {
                    FileInfo fi = new FileInfo(file);
                    al.Add(fullname ? fi.FullName : fi.Name);
                    fileCount++;
                }
            }
            // Recurse into subdirectories. BUG FIX: the original passed a
            // hard-coded `false` here, so ignoreHidden was silently dropped
            // below the first level.
            foreach (string dir in Directory.GetDirectories(directory))
            {
                if (!(ignoreHidden && (File.GetAttributes(dir) & FileAttributes.Hidden) == FileAttributes.Hidden))
                {
                    findFiles(dir, mask, ignoreHidden, fullname, ref fileCount);
                }
            }
        }
//java code(查找一个目录(包括子目录)下的所有文件):
import java.io.*;

/**
 * Recursively lists all files under a directory (read from stdin) and writes
 * their absolute paths to a target file (also read from stdin).
 * The "\n" escapes below were lost in the published copy and are restored.
 */
public class ListFiles {
    private static String listFileStr = "";
    private static String dir;
    private static String savefile;
    private static int count = 0;

    private static FileWriter fw;
    private static File saveFile;

    public static void main(String[] args) {
        try {
            System.out.println("请输入查找文件的目录:(eg:d:\\music)");
            try {
                // Wrap stdin in a buffered reader and take one line as the directory.
                BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
                dir = in.readLine();
            } catch (IOException e) {
                System.out.println("请输入合法的路径名!");
            }
            System.out.println("请输入保存文件的位置:(eg:d:\\savename.txt)");
            try {
                BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
                savefile = in.readLine();
                fw = new FileWriter(savefile);
            } catch (IOException ex) {
                System.out.println("请输入合法的路径名!");
            }

            ListFiles lf = new ListFiles();
            lf.listFile(dir);
            fw.write(listFileStr);
            fw.close();
            System.out.println("\n一共找到" + count + "个文件!");
        } catch (ArrayIndexOutOfBoundsException ea) {
            // Usage hint for the command-line form.
            System.out.println("Usage: ListFiles <source dir> <target file>");
        } catch (IOException e) {
            System.out.println("IO error!\n" + e.toString());
        }
    }

    /** Recursively appends every file under `rp` to listFileStr. */
    public void listFile(String rp) {
        File file = new File(rp);
        File[] list = file.listFiles();
        if (list == null) {
            // listFiles() returns null for non-directories and access-denied
            // directories; the original dereferenced it and could NPE here.
            return;
        }
        for (int i = 0; i < list.length; i++) {
            try {
                if (list[i].isDirectory()) {
                    new ListFiles().listFile(list[i].toString());
                } else {
                    listFileStr += list[i].getAbsolutePath() + "\n";
                    System.out.println(list[i].getAbsolutePath());
                    count++;
                }
            } catch (Exception ex) {
                listFileStr += "Access deny:" + list[i].getAbsolutePath() + "\n";
                System.out.println("Access deny:" + list[i].getAbsolutePath());
            }
        }
    }
}
 
View Code

C# 抓取网页Html

C# 抓取网页的Html 及分析:
源码如下:
private void Search(string url)
{
    // Download the page, join it into one lower-cased string, cut out the
    // post list, then extract every href value inside it into lab[url].
    string rl;
    WebRequest request = WebRequest.Create(url.Trim());
    StringBuilder sb = new StringBuilder();
    // using disposes the response and reader (the original leaked both).
    using (WebResponse response = request.GetResponse())
    using (StreamReader sr = new StreamReader(response.GetResponseStream(), Encoding.Default))
    {
        while ((rl = sr.ReadLine()) != null)
        {
            sb.Append(rl);
        }
    }

    string str = sb.ToString().ToLower();

    // The published copy had unescaped quotes in these literals and did not compile.
    string str_get = mid(str, "<ul class=\"post_list\">", "</ul>");

    int start = 0;
    while (str_get != null)
    {
        string strResult = mid(str_get, "href=\"", "\"", out start);
        if (strResult == null)
            break;
        lab[url] += strResult;
        // `start` points just past the closing quote, so continue scanning there.
        str_get = str_get.Substring(start);
    }
}
 
 
 
 
/// <summary>
/// Returns the text of <paramref name="istr"/> lying between the first
/// occurrence of <paramref name="startString"/> and the next occurrence of
/// <paramref name="endString"/>, or null when either marker is missing.
/// </summary>
private static string mid(string istr, string startString, string endString)
{
    int iBodyStart = istr.IndexOf(startString, 0);              // start marker
    if (iBodyStart == -1)
        return null;
    iBodyStart += startString.Length;                           // content begins after the marker
    int iBodyEnd = istr.IndexOf(endString, iBodyStart);         // end marker
    if (iBodyEnd == -1)
        return null;
    // BUG FIX: the original added endString.Length and then subtracted 1,
    // which is only correct when endString is a single character; for longer
    // markers such as "</ul>" it returned part of the end marker itself.
    return istr.Substring(iBodyStart, iBodyEnd - iBodyStart);
}
 
 
/// <summary>
/// Overload of mid that also reports, via <paramref name="iBodyEnd"/>, the
/// index just past the end marker so callers can continue scanning with
/// istr.Substring(iBodyEnd). Returns null when either marker is missing.
/// </summary>
private static string mid(string istr, string startString, string endString, out int iBodyEnd)
{
    iBodyEnd = 0; // out parameters must be assigned on every return path

    int iBodyStart = istr.IndexOf(startString, 0);              // start marker
    if (iBodyStart == -1)
        return null;
    iBodyStart += startString.Length;                           // content begins after the marker
    int endIndex = istr.IndexOf(endString, iBodyStart);         // end marker
    if (endIndex == -1)
        return null;
    iBodyEnd = endIndex + endString.Length; // unchanged contract: position after the end marker
    // BUG FIX: as in the 3-argument overload, the original returned
    // (iBodyEnd - iBodyStart - 1) characters, which is only right for
    // single-character end markers.
    return istr.Substring(iBodyStart, endIndex - iBodyStart);
}
 
View Code

C# 抓取网页里面的所有链接

这几天偶尔看见了,C#抓取网页的链接。的代码。感觉当时做的很简单。呵呵。也没多考虑什么过程。先把简单的给大家拿出来看看。如果大家有什么意见或者有好的方法可以共同交流。谢谢!一下仅供参考:
 
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;

using System.Xml;
using System.Net;
using System.IO;
using System.Collections;
using System.Text.RegularExpressions;

namespace text
{
    /// <summary>
    /// Simple form that downloads a page, extracts its hyperlinks and writes
    /// them to HyperLinks.xml grouped by domain suffix.
    /// </summary>
    public partial class Form1 : Form
    {
        string strCode;      // downloaded HTML
        ArrayList alLinks;   // extracted, de-duplicated URLs

        public Form1()
        {
            InitializeComponent();
        }

        private void button1_Click(object sender, EventArgs e)
        {
            if (textBox1.Text == "")
            {
                MessageBox.Show("请输入网址");
                return;
            }
            string strURL = textBox1.Text.Trim();
            // StartsWith avoids the ArgumentOutOfRangeException the original
            // Substring(0, 7) threw on inputs shorter than 7 characters.
            if (!strURL.StartsWith(@"http://", StringComparison.OrdinalIgnoreCase))
            {
                strURL = @"http://" + strURL;
            }
            MessageBox.Show("正在获取页面代码,请稍后...");
            strCode = GetPageSource(strURL);
            MessageBox.Show("正在提取超链接,请稍侯...");
            alLinks = GetHyperLinks(strCode);
            MessageBox.Show("正在写入文件,请稍侯...");
            WriteToXml(strURL, alLinks);
        }

        // 获取指定网页的HTML代码
        public static string GetPageSource(string URL)
        {
            Uri uri = new Uri(URL);
            HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);
            // BUG FIX: request properties must be set BEFORE GetResponse();
            // the original assigned them afterwards, where they had no effect.
            hwReq.Method = "GET";
            hwReq.KeepAlive = false;
            HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse();
            using (StreamReader reader = new StreamReader(hwRes.GetResponseStream(), System.Text.Encoding.GetEncoding("GB2312")))
            {
                return reader.ReadToEnd();
            }
        }

        // 提取HTML代码中的网址
        public static ArrayList GetHyperLinks(string htmlCode)
        {
            ArrayList al = new ArrayList();
            // Escapes restored: the published pattern had lost its backslashes
            // ("[w-]+.") and matched the wrong thing.
            string strRegex = @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
            Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);
            foreach (Match m in r.Matches(htmlCode))
            {
                string strNew = m.ToString();
                if (!al.Contains(strNew)) // 过滤重复的URL
                    al.Add(strNew);
            }
            al.Sort();
            return al;
        }

        // 把网址写入xml文件
        static void WriteToXml(string strURL, ArrayList alHyperLinks)
        {
            // using guarantees the file handle is released even on error
            // (the original never closed the writer when an exception occurred).
            using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
            {
                writer.Formatting = Formatting.Indented;
                writer.WriteStartDocument(false);
                writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
                writer.WriteComment("提取自" + strURL + "的超链接");
                writer.WriteStartElement("HyperLinks");
                writer.WriteStartElement("HyperLinks", null);
                writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

                foreach (string str in alHyperLinks)
                {
                    // Element name = domain suffix, element text = the URL itself.
                    writer.WriteElementString(GetDomain(str), null, str);
                }
                writer.WriteEndElement();
                writer.WriteEndElement();
                writer.Flush();
            }
        }

        // 获取网址的域名后缀
        static string GetDomain(string strURL)
        {
            // Escapes restored: literal dots must be matched as "\.".
            string strRegex = @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)";
            Match m = new Regex(strRegex, RegexOptions.IgnoreCase).Match(strURL);
            string retVal = m.ToString();
            // Strip the dot and the trailing slash, e.g. ".com/" -> "com".
            retVal = Regex.Replace(retVal, @"\.|/$", "");
            if (retVal == "")
                retVal = "other";
            return retVal;
        }
    }
}
 
View Code

C# 抓取网页内容(转)

摘要: 1、抓取一般内容需要三个类:WebRequest、WebResponse、StreamReader所需命名空间:System.Net、System.IO核心代码:view plaincopy to clipboardprint?WebRequestrequest=WebRequest.Create("http://www.cftea. ...
1、抓取一般内容
需要三个类:WebRequest、WebResponse、StreamReader
所需命名空间:System.Net、System.IO
核心代码:
view plaincopy to clipboardprint?
WebRequest request = WebRequest.Create("http://www.cftea.com/");  
WebResponse response = request.GetResponse();  
StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gb2312"));  
      WebRequest 类的 Create 为静态方法,参数为要抓取的网页的网址;
      Encoding 指定编码,Encoding 中有属性 ASCII、UTF32、UTF8 等全球通用的编码,但没有 gb2312 这个编码属性,所以我们使用 GetEncoding 获得 gb2312 编码。
示例:
view plaincopy to clipboardprint?
<%@ Page Language="C#" %>  
<%@ Import Namespace="System.Net" %>  
<%@ Import Namespace="System.IO" %>  
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">  
<mce:script runat="server"><!--  
    void Page_Load(object sender, EventArgs e)  
    {  
        try  
        {  
            WebRequest request = WebRequest.Create("http://www.cftea.com/");  
            WebResponse response = request.GetResponse();  
            StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gb2312"));  
              
            tb.Text = reader.ReadToEnd();  
              
            reader.Close();  
            reader.Dispose();  
            response.Close();  
        }  
        catch (Exception ex)  
        {  
            tb.Text = ex.Message;  
        }  
    }  
// --></mce:script>   
<html xmlns="http://www.w3.org/1999/xhtml" >  
<head runat="server">  
    <title>抓取网页内容 - 千一网络</title>  
</head>  
<body>  
    <form id="form1" runat="server">  
    <div>  
    <asp:TextBox ID="tb" runat="server" Width="500" Height="300" TextMode="multiLine"></asp:TextBox>  
    </div>  
    </form>  
</body>  
</html>  
 
 2 抓取网页内容-图片
    需要四个类:WebRequest、WebResponse、Stream、FileStream。
   示例:
view plaincopy to clipboardprint?
<%@ Page Language="C#" %>  
<%@ Import Namespace="System.Net" %>  
<%@ Import Namespace="System.IO" %>  
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">  
<mce:script runat="server"><!--  
    void Page_Load(object sender, EventArgs e)  
    {  
        try  
        {  
            WebRequest request = WebRequest.Create("http://www.cftea.com/images/logo.gif");  
            WebResponse response = request.GetResponse();  
            Stream reader = response.GetResponseStream();  
              
            FileStream writer = new FileStream("D://logo.gif", FileMode.OpenOrCreate, FileAccess.Write);  
            byte[] buff = new byte[512];  
            int c = 0; //实际读取的字节数   
            while ((c=reader.Read(buff, 0, buff.Length)) > 0)  
            {  
                writer.Write(buff, 0, c);  
            }  
            writer.Close();  
            writer.Dispose();  
              
            reader.Close();  
            reader.Dispose();  
            response.Close();  
              
            tb.Text = "保存成功!";  
        }  
        catch (Exception ex)  
        {  
            tb.Text = ex.Message;  
        }  
    }  
// --></mce:script>   
<html xmlns="http://www.w3.org/1999/xhtml" >  
<head runat="server">  
    <title>抓取网页图片并保存 - 千一网络</title>  
</head>  
<body>  
    <form id="form1" runat="server">  
    <div>  
    <asp:TextBox ID="tb" runat="server" Width="500" Height="300" TextMode="multiLine"></asp:TextBox>  
    </div>  
    </form>  
</body>  
</html>  
 
3 抓取网页内容-Post 数据
   在抓取网页时,有时候,需要将某些数据通过 Post 的方式发送到服务器,将以下代码添加在网页抓取的程序中,以实现将用户名和密码 Post 到服务器
view plaincopy to clipboardprint?
string data = "userName=admin&passwd=admin888";  
byte[] requestBuffer = System.Text.Encoding.GetEncoding("gb2312").GetBytes(data);  
  
request.Method = "POST";  
request.ContentType = "application/x-www-form-urlencoded";  
request.ContentLength = requestBuffer.Length;  
using (Stream requestStream = request.GetRequestStream())  
{  
    requestStream.Write(requestBuffer, 0, requestBuffer.Length);  
    requestStream.Close();  
}  
  
using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gb2312")))  
{  
    string str = reader.ReadToEnd();  
    reader.Close();  
}  
 
4  抓取网页内容-防止重定向
在抓取网页时,成功登录服务器应用系统后,应用系统可能会通过 Response.Redirect 将网页进行重定向,如果不需要响应这个重定向,那么,我们就不要把 reader.ReadToEnd() 给 Response.Write 出来,就可以了。
5 抓取网页内容-保持登录状态
  
利用 Post 数据成功登录服务器应用系统后,就可以抓取需要登录的页面了,那么我们就可能需要在多个 Request 间保持登录状态。
首先,我们要使用 HttpWebRequest,而不是 WebRequest。
与 WebRequest 相比,变化的代码是:
view plaincopy to clipboardprint?
HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);  
 
注意:HttpWebRequest.Create 返回的类型仍是 WebRequest,所以要转化一下。
其次,使用 CookieContainer。
view plaincopy to clipboardprint?
System.Net.CookieContainer cc = new System.Net.CookieContainer();  
request.CookieContainer = cc;  
request2.CookieContainer = cc;   
 
这样 request 和 request2 之间就使用了相同的 Session,如果 request 登录了,那么 request2 也是登录状态。
最后,如何在不同的页面间使用同一个 CookieContainer。
要在不同的页面间使用同一个 CookieContainer,只有把 CookieContainer 加入 Session。
 
view plaincopy to clipboardprint?
Session.Add("ccc", cc); //
  
CookieContainer cc = (CookieContainer)Session["ccc"]; //
 
5 抓取网页内容-把当前会话带到 WebRequest 中
 
比如说浏览器 B1 去访问服务器端 S1,这会产生一个会话,而服务器端 S2 再用 WebRequest 去访问服务器端 S1,这又会产生一个会话。现在的需求是让 WebRequest 使用浏览器 B1 与 S1 之间的会话,也就是说要让 S1 认为是 B1 在访问 S1,而不是 S2 在访问 S1。
这就要利用 Cookie 了,先在 S1 中取得与 B1 的 SessionID 的 Cookie,再将这个 Cookie 告诉 S2,S2 再将 Cookie 写在 WebRequest 中。
view plaincopy to clipboardprint?
WebRequest request = WebRequest.Create("url");  
<SPAN class=key>request.Headers.Add(HttpRequestHeader.Cookie, "ASPSESSIONIDSCATBTAD=KNNDKCNBONBOOBIHHHHAOKDM;");</SPAN>  
WebResponse response = request.GetResponse();  
StreamReader reader = new StreamReader(response.GetResponseStream(), System.Text.Encoding.GetEncoding("gb2312"));  
Response.Write(reader.ReadToEnd());  
reader.Close();  
reader.Dispose();  
response.Close();  
 
要说明的是:
本文并不是 Cookie 欺骗,因为 SessionID 是 S1 告诉 S2 的,并不是 S2 窃取的,虽然有些古怪,但这可能在一些特定的应用系统中会有用。
S1 必须要向 B1 写 Session,这样 SessionID 才会保存到 Cookie 中,并且 SessionID 才会保持不变。
在 ASP.NET 中取 Cookie 用 Request.Cookies,本文假设 Cookie 已经取出来。
不同的服务器端语言,SessionID 在 Cookie 中上名称并不一样,本文是 ASP 的 SessionID。
S1 可能不仅仅依靠 SessionID 来判断当前登录,它可能还会辅助于 Referer、User-Agent 等,这取决于 S1 端程序的设计。
其实本文算是本连载中“保持登录状态”的另一种方法。
6 抓取网页内容-如何更改来源 Referer 和 UserAgent
view plaincopy to clipboardprint?
<SPAN class=caution>HttpWebRequest</SPAN> request = <SPAN class=caution>(HttpWebRequest)HttpWebRequest</SPAN>.Create("http://127.0.0.1/index.htm");  
//request.Headers.Add(HttpRequestHeader.Referer, "http://www.cftea.com/"); // 错误   
//request.Headers[HttpRequestHeader.Referer] = "http://www.cftea.com/"; // 错误   
<SPAN class=caution>request.Referer</SPAN> = "http://www.cftea.com/"; // 正确  
 
注释掉的两句是不对的,会发生错误:
view plaincopy to clipboardprint?
此标头必须使用适当的属性进行修改。  
参数名: name   
 
UserAgent 类似。
View Code

C#抓取和分析网页的类

抓取和分析网页的类。

主要功能有:

1、提取网页的纯文本,去所有html标签和javascript代码

2、提取网页的链接,包括href和frame及iframe

3、提取网页的title等(其它的标签可依此类推,正则是一样的)

4、可以实现简单的表单提交及cookie保存

 /*

*  Author:Sunjoy at CCNU

*  如果您改进了这个类请发一份代码给我(ccnusjy 在gmail.com)

*/



using System;

using System.Data;

using System.Configuration;

using System.Net;

using System.IO;

using System.Text;

using System.Collections.Generic;

using System.Text.RegularExpressions;

using System.Threading;

using System.Web;

/// <summary>

/// 网页类

/// </summary>

public class WebPage

{



    #region 私有成员

    private Uri m_uri;   //网址

    private List<Link> m_links;    //此网页上的链接

    private string m_title;        //此网页的标题

    private string m_html;         //此网页的HTML代码

    private string m_outstr;       //此网页可输出的纯文本

    private bool m_good;           //此网页是否可用

    private int m_pagesize;       //此网页的大小

    private static Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();//存放所有网页的Cookie

    private string m_post;  //此网页的登陆页需要的POST数据

    private string m_loginurl;  //此网页的登陆页

    #endregion





    #region 私有方法

    /// <summary>
    /// Parses the link information out of this page's HTML: anchor hrefs and
    /// frame/iframe srcs. Results are cached in m_links and parsed only once.
    /// </summary>
    /// <returns>List&lt;Link&gt;</returns>
    private List<Link> getLinks()
    {
        if (m_links.Count == 0)
        {
            // Patterns rewritten as verbatim strings; the published copy had
            // lost all quote/backslash escaping and did not compile.
            Regex[] regex = new Regex[2];
            regex[0] = new Regex(@"(?m)<a[^><]+href=(""|')?(?<url>([^>""'\s)])+)(""|')?[^>]*>(?<text>(\w|\W)*?)</", RegexOptions.Multiline | RegexOptions.IgnoreCase);
            regex[1] = new Regex(@"<[i]*frame[^><]+src=(""|')?(?<url>([^>""'\s)])+)(""|')?[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);

            for (int i = 0; i < 2; i++)
            {
                Match match = regex[i].Match(m_html);
                while (match.Success)
                {
                    try
                    {
                        // Resolve relative URLs against the page's own URI.
                        string url = new Uri(m_uri, match.Groups["url"].Value).AbsoluteUri;
                        string text = "";
                        // Anchor text only exists for <a> matches (i == 0);
                        // strip tags, whitespace, entities and quotes from it.
                        if (i == 0)
                            text = new Regex(@"(<[^>]+>)|(\s)|(&nbsp;)|&|""", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(match.Groups["text"].Value, "");
                        m_links.Add(new Link(url, text));
                    }
                    catch (Exception ex) { Console.WriteLine(ex.Message); }
                    match = match.NextMatch();
                }
            }
        }
        return m_links;
    }

   

    /// <summary>
    /// Extracts up to <paramref name="firstN"/> characters of plain text from
    /// an HTML fragment, caching the stripped text in m_outstr.
    /// </summary>
    /// <param name="instr">HTML source</param>
    /// <param name="firstN">maximum number of characters to return</param>
    /// <param name="withLink">whether to keep the text inside anchor tags</param>
    /// <returns>plain text</returns>
    private string getFirstNchar(string instr, int firstN, bool withLink)
    {
        if (m_outstr == "")
        {
            m_outstr = instr.Clone() as string;
            // Escapes restored: the published copy had "(w|W)" / "(s)+",
            // which match literal letters instead of "any character" / whitespace.
            m_outstr = new Regex(@"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
            m_outstr = new Regex(@"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
            m_outstr = new Regex(@"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
            // Optionally drop whole anchor elements, text included.
            if (!withLink)
                m_outstr = new Regex(@"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
            // Strip the remaining tags and &nbsp;, then collapse whitespace runs.
            m_outstr = new Regex("(<[^>]+?>)|&nbsp;", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
            m_outstr = new Regex(@"(\s)+", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, " ");
        }
        return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;
    }



    /// <summary>
    /// Converts an IPv4 address to its unsigned 32-bit value (first octet in
    /// the most significant byte), so addresses can be range-compared.
    /// </summary>
    /// <param name="x">the IP address</param>
    /// <returns>unsigned integer representation</returns>
    private uint getuintFromIP(IPAddress x)
    {
        Byte[] bt = x.GetAddressBytes();
        // Bit shifts replace the original 256-multiplications, which silently
        // relied on unchecked int overflow for first octets >= 128.
        return ((uint)bt[0] << 24) | ((uint)bt[1] << 16) | ((uint)bt[2] << 8) | bt[3];
    }



    #endregion





    #region 公有文法

    /// <summary>
    /// Returns up to <paramref name="firstN"/> characters of the page's plain
    /// text, including the text inside links.
    /// </summary>
    /// <param name="firstN">maximum number of characters</param>
    /// <returns>plain text</returns>
    public string getContext(int firstN)
    {
        // Delegate to the shared extractor; withLink = true keeps anchor text.
        return getFirstNchar(m_html, firstN, withLink: true);
    }



    /// <summary>
    /// Returns up to <paramref name="firstN"/> characters of the page's plain
    /// text, excluding the text inside links.
    /// </summary>
    /// <param name="firstN">maximum number of characters</param>
    /// <returns>plain text</returns>
    public string getContextWithOutLink(int firstN)
    {
        // Delegate to the shared extractor; withLink = false drops anchor text.
        return getFirstNchar(m_html, firstN, withLink: false);
    }



    /// <summary>
    /// Returns up to <paramref name="count"/> of this page's links whose URL
    /// matches the given regular expression.
    /// </summary>
    /// <param name="pattern">regular expression</param>
    /// <param name="count">maximum number of links to return</param>
    /// <returns>List&lt;Link&gt;</returns>
    public List<Link> getSpecialLinksByUrl(string pattern, int count)
    {
        if (m_links.Count == 0) getLinks();
        // Compile the pattern once; the original constructed a new Regex
        // object on every loop iteration.
        Regex re = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        List<Link> SpecialLinks = new List<Link>();
        foreach (Link link in m_links)
        {
            if (SpecialLinks.Count >= count)
                break;
            if (re.Match(link.url).Success)
                SpecialLinks.Add(link);
        }
        return SpecialLinks;
    }







    /// <summary>
    /// Returns up to <paramref name="count"/> of this page's links whose
    /// anchor text matches the given regular expression.
    /// </summary>
    /// <param name="pattern">regular expression</param>
    /// <param name="count">maximum number of links to return</param>
    /// <returns>List&lt;Link&gt;</returns>
    public List<Link> getSpecialLinksByText(string pattern, int count)
    {
        if (m_links.Count == 0) getLinks();
        // Compile the pattern once; the original constructed a new Regex
        // object on every loop iteration.
        Regex re = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        List<Link> SpecialLinks = new List<Link>();
        foreach (Link link in m_links)
        {
            if (SpecialLinks.Count >= count)
                break;
            if (re.Match(link.text).Success)
                SpecialLinks.Add(link);
        }
        return SpecialLinks;
    }

    /// <summary>
    /// Returns the links whose resolved host IP falls inside the given range
    /// (inclusive). Performs one DNS lookup per link; unresolvable hosts are
    /// skipped, as before.
    /// </summary>
    /// <param name="_ip_start">range start IP</param>
    /// <param name="_ip_end">range end IP</param>
    /// <returns>links inside [start, end]</returns>
    public List<Link> getSpecialLinksByIP(string _ip_start, string _ip_end)
    {
        if (m_links.Count == 0) getLinks();
        // Convert the range bounds once instead of on every comparison
        // (the original recomputed both bounds for each link, twice).
        uint lo = getuintFromIP(IPAddress.Parse(_ip_start));
        uint hi = getuintFromIP(IPAddress.Parse(_ip_end));
        List<Link> SpecialLinks = new List<Link>();
        foreach (Link link in m_links)
        {
            IPAddress ip;
            try
            {
                ip = Dns.GetHostEntry(new Uri(link.url).Host).AddressList[0];
            }
            catch { continue; } // DNS failure: skip this link
            uint v = getuintFromIP(ip);
            if (v >= lo && v <= hi)
            {
                SpecialLinks.Add(link);
            }
        }
        return SpecialLinks;
    }



    /// <summary>
    /// Searches this page's plain text with the given regular expression and
    /// returns the first capture group of the first match, or the empty
    /// string when nothing matches.
    /// </summary>
    /// <param name="pattern">regular expression</param>
    /// <returns>matched text, or string.Empty</returns>
    public string getSpecialWords(string pattern)
    {
        // Lazily build the plain-text cache before matching against it.
        if (m_outstr == "") getContext(Int16.MaxValue);
        Match mc = Regex.Match(m_outstr, pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        return mc.Success ? mc.Groups[1].Value : string.Empty;
    }

    #endregion





    #region 构造函数

    

    /// <summary>
    /// Downloads the page at _url, detects its encoding and fills m_html /
    /// m_pagesize / m_uri. Any failure sets m_good = false instead of throwing.
    /// </summary>
    private void Init(string _url)
    {
        try
        {
            m_uri = new Uri(_url);
            m_links = new List<Link>();
            m_html = "";
            m_outstr = "";
            m_title = "";
            m_good = true;
            // Skip obvious binary downloads outright.
            if (_url.EndsWith(".rar") || _url.EndsWith(".dat") || _url.EndsWith(".msi"))
            {
                m_good = false;
                return;
            }
            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
            rqst.AllowAutoRedirect = true;
            rqst.MaximumAutomaticRedirections = 3;
            rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
            rqst.KeepAlive = true;
            rqst.Timeout = 30000;
            // One shared CookieContainer per host, across all WebPage instances.
            lock (WebPage.webcookies)
            {
                if (WebPage.webcookies.ContainsKey(m_uri.Host))
                    rqst.CookieContainer = WebPage.webcookies[m_uri.Host];
                else
                {
                    CookieContainer cc = new CookieContainer();
                    WebPage.webcookies[m_uri.Host] = cc;
                    rqst.CookieContainer = cc;
                }
            }

            HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse();

            Stream sm = rsps.GetResponseStream();
            // Reject non-text and oversized (> 4 MB) responses.
            if (!rsps.ContentType.ToLower().StartsWith("text/") || rsps.ContentLength > 1 << 22)
            {
                rsps.Close();
                m_good = false;
                return;
            }
            Encoding cding = System.Text.Encoding.Default;
            string contenttype = rsps.ContentType.ToLower();
            int ix = contenttype.IndexOf("charset=");
            if (ix != -1)
            {
                // Charset declared in the HTTP Content-Type header.
                try
                {
                    cding = System.Text.Encoding.GetEncoding(rsps.ContentType.Substring(ix + "charset".Length + 1));
                }
                catch
                {
                    cding = Encoding.Default;
                }
                m_html = new StreamReader(sm, cding).ReadToEnd();
            }
            else
            {
                // No header charset: read with the default encoding, then look
                // for a charset declaration inside the document itself.
                // (The published pattern had broken quote escaping; rewritten —
                // TODO confirm it matches the originally intended meta forms.)
                m_html = new StreamReader(sm, cding).ReadToEnd();
                Regex regex = new Regex(@"charset\s*=\s*[""']?(?<cding>[^""'\s/>]+)", RegexOptions.IgnoreCase);
                string strcding = regex.Match(m_html).Groups["cding"].Value;
                try
                {
                    cding = Encoding.GetEncoding(strcding);
                }
                catch
                {
                    cding = Encoding.Default;
                }
                // Re-decode the raw bytes with the declared charset; if that
                // yields many '?' characters, fall back to the default decode.
                byte[] bytes = Encoding.Default.GetBytes(m_html.ToCharArray());
                m_html = cding.GetString(bytes);
                if (m_html.Split('?').Length > 100)
                {
                    m_html = Encoding.Default.GetString(bytes);
                }
            }

            m_pagesize = m_html.Length;
            m_uri = rsps.ResponseUri; // reflect any redirects that were followed
            rsps.Close();
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.Message + m_uri.ToString());
            m_good = false;
        }
    }



    /// <summary>
    /// Constructs a page object from a URL. Runs of non-ASCII (e.g. Chinese)
    /// characters are URL-encoded as GB2312 before fetching.
    /// </summary>
    /// <param name="_url">Page address, possibly containing non-ASCII characters.</param>
    public WebPage(string _url)
    {
        string uurl = "";
        try
        {
            // Normalize any pre-escaped characters first.
            uurl = Uri.UnescapeDataString(_url);
            _url = uurl;
        }
        catch { }
        // Match a run of non-ASCII (double-byte) characters.
        // FIX(review): the published listing lost the backslashes — the class
        // must be [^\x00-\xff], not the literal characters "x00-xff".
        Regex re = new Regex(@"(?<h>[^\x00-\xff]+)");
        Match mc = re.Match(_url);
        if (mc.Success)
        {
            string han = mc.Groups["h"].Value;
            _url = _url.Replace(han, System.Web.HttpUtility.UrlEncode(han, Encoding.GetEncoding("GB2312")));
        }

        Init(_url);
    }



    /// <summary>
    /// Constructs a page object behind a login: first POSTs the login form to
    /// obtain cookies, then fetches the target page with those cookies.
    /// Falls back to an anonymous fetch when login info is missing, cookies for
    /// the host already exist, or the login request fails.
    /// </summary>
    /// <param name="_url">Page to fetch.</param>
    /// <param name="_loginurl">Login page address; empty means fetch directly.</param>
    /// <param name="_post">POST body for the login form; empty means fetch directly.</param>
    public WebPage(string _url, string _loginurl, string _post)
    {
        string uurl = "";
        try
        {
            // Normalize any pre-escaped characters first.
            uurl = Uri.UnescapeDataString(_url);
            _url = uurl;
        }
        catch { }
        // FIX(review): the published listing lost the backslashes — the class
        // must be [^\x00-\xff] (run of non-ASCII characters), encoded as GB2312.
        Regex re = new Regex(@"(?<h>[^\x00-\xff]+)");
        Match mc = re.Match(_url);
        if (mc.Success)
        {
            string han = mc.Groups["h"].Value;
            _url = _url.Replace(han, System.Web.HttpUtility.UrlEncode(han, Encoding.GetEncoding("GB2312")));
        }
        // No login info, or we already hold cookies for this host: plain fetch.
        if (_loginurl.Trim() == "" || _post.Trim() == "" || WebPage.webcookies.ContainsKey(new Uri(_url).Host))
        {
            Init(_url);
        }
        else
        {
            #region 登陆
            string indata = _post;
            m_post = _post;
            m_loginurl = _loginurl;
            byte[] bytes = Encoding.Default.GetBytes(_post);
            // Container that will hold the cookies returned by the login POST.
            CookieContainer myCookieContainer = new CookieContainer();
            try
            {
                HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(_loginurl);
                myHttpWebRequest.ContentType = "application/x-www-form-urlencoded";
                myHttpWebRequest.AllowAutoRedirect = false;
                myHttpWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
                myHttpWebRequest.Timeout = 60000;
                myHttpWebRequest.KeepAlive = true;
                myHttpWebRequest.ContentLength = bytes.Length;
                myHttpWebRequest.Method = "POST";
                myHttpWebRequest.CookieContainer = myCookieContainer;
                Stream myRequestStream = myHttpWebRequest.GetRequestStream();
                myRequestStream.Write(bytes, 0, bytes.Length);
                myRequestStream.Close();
                HttpWebResponse myHttpWebResponse = (HttpWebResponse)myHttpWebRequest.GetResponse();

                // Collect session cookies for the follow-up request.
                foreach (Cookie ck in myHttpWebResponse.Cookies)
                {
                    myCookieContainer.Add(ck);
                }
                myHttpWebResponse.Close();
            }
            catch
            {
                // Login failed: fall back to an anonymous fetch.
                Init(_url);
                return;
            }
            #endregion

            #region 登陆后再访问页面
            try
            {
                m_uri = new Uri(_url);
                m_links = new List<Link>();
                m_html = "";
                m_outstr = "";
                m_title = "";
                m_good = true;
                // Skip obvious binary downloads.
                if (_url.EndsWith(".rar") || _url.EndsWith(".dat") || _url.EndsWith(".msi"))
                {
                    m_good = false;
                    return;
                }
                HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
                rqst.AllowAutoRedirect = true;
                rqst.MaximumAutomaticRedirections = 3;
                rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
                rqst.KeepAlive = true;
                rqst.Timeout = 30000;
                rqst.CookieContainer = myCookieContainer;
                // Publish the fresh cookies so later fetches for this host reuse them.
                lock (WebPage.webcookies)
                {
                    WebPage.webcookies[m_uri.Host] = myCookieContainer;
                }
                HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse();

                Stream sm = rsps.GetResponseStream();
                // Only accept textual content up to 4 MB (1 << 22 bytes).
                if (!rsps.ContentType.ToLower().StartsWith("text/") || rsps.ContentLength > 1 << 22)
                {
                    rsps.Close();
                    m_good = false;
                    return;
                }
                Encoding cding = System.Text.Encoding.Default;
                int ix = rsps.ContentType.ToLower().IndexOf("charset=");
                if (ix != -1)
                {
                    // Use the charset declared in the HTTP header when present.
                    try
                    {
                        cding = System.Text.Encoding.GetEncoding(rsps.ContentType.Substring(ix + "charset".Length + 1));
                    }
                    catch
                    {
                        cding = Encoding.Default;
                    }
                }

                m_html = new StreamReader(sm, cding).ReadToEnd();

                m_pagesize = m_html.Length;
                m_uri = rsps.ResponseUri; // record the final URI after redirects
                rsps.Close();
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message + m_uri.ToString());
                m_good = false;
            }
            #endregion
        }
    }



    #endregion





    #region 属性



    /// <summary>

    /// 通过此属性可获得本网页的网址,只读

    /// </summary>

    /// <summary>
    /// Absolute URL of this page (read-only; reflects the final URI after redirects).
    /// </summary>
    public string URL
    {
        get { return m_uri.AbsoluteUri; }
    }



    /// <summary>

    /// 通过此属性可获得本网页的标题,只读

    /// </summary>

    /// <summary>
    /// Title of this page (read-only). Parsed lazily from the HTML on first
    /// access and cached in m_title.
    /// </summary>
    public string Title
    {
        get
        {
            if (m_title == "")
            {
                // FIX(review): the published listing lost the backslashes in
                // (?:\w|\W); [\s\S]*? is the equivalent "any character,
                // including newlines, lazily" form.
                Regex reg = new Regex(@"(?m)<title[^>]*>(?<title>[\s\S]*?)</title[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
                Match mc = reg.Match(m_html);
                if (mc.Success)
                    m_title = mc.Groups["title"].Value.Trim();
            }
            return m_title;
        }
    }

  



    /// <summary>

    /// 此属性获得本网页的所有链接信息,只读

    /// </summary>

    /// <summary>
    /// All links found on this page (read-only). Parsed lazily on first access.
    /// </summary>
    public List<Link> Links
    {
        get
        {
            if (m_links.Count == 0)
            {
                getLinks();
            }
            return m_links;
        }
    }





    /// <summary>

    /// 此属性返回本网页的全部纯文本信息,只读

    /// </summary>

    /// <summary>
    /// Plain-text content of this page (read-only). Extracted lazily on first access.
    /// </summary>
    public string Context
    {
        get
        {
            if (m_outstr == "")
            {
                getContext(Int16.MaxValue);
            }
            return m_outstr;
        }
    }



    /// <summary>

    /// 此属性获得本网页的大小

    /// </summary>

    /// <summary>
    /// Size of this page, measured in characters of the decoded HTML (read-only).
    /// </summary>
    public int PageSize
    {
        get { return m_pagesize; }
    }

    /// <summary>

    /// 此属性获得本网页的所有站内链接

    /// </summary>

    /// <summary>
    /// All links on this page that point back into the same site (read-only).
    /// </summary>
    public List<Link> InsiteLinks
    {
        get { return getSpecialLinksByUrl("^http://" + m_uri.Host, Int16.MaxValue); }
    }



    /// <summary>

    /// 此属性表示本网页是否可用

    /// </summary>

    /// <summary>
    /// Whether this page was fetched and decoded successfully (read-only).
    /// </summary>
    public bool IsGood
    {
        get { return m_good; }
    }

    /// <summary>

    /// 此属性表示网页的所在的网站

    /// </summary>

    /// <summary>
    /// Host name of the site this page belongs to (read-only).
    /// </summary>
    public string Host
    {
        get { return m_uri.Host; }
    }

    



    /// <summary>

    /// 此网页的登陆页所需的POST数据

    /// </summary>

    /// <summary>
    /// POST data used against this page's login page (read-only).
    /// </summary>
    public string PostStr
    {
        get { return m_post; }
    }

    /// <summary>

    /// 此网页的登陆页

    /// </summary>

    /// <summary>
    /// Address of this page's login page (read-only).
    /// </summary>
    public string LoginURL
    {
        get { return m_loginurl; }
    }

    #endregion

}



/// <summary>

/// 链接类

/// </summary>

/// <summary>
/// A hyperlink found on a page: its target address and its anchor text.
/// </summary>
public class Link
{
    public string url;   // link target address
    public string text;  // link anchor text

    /// <summary>Creates a link with the given target and anchor text.</summary>
    public Link(string _url, string _text)
    {
        this.url = _url;
        this.text = _text;
    }
}
View Code

C#抓取网页信息

背景
  随着Internet的普及,网络信息正以极高的速度增长,在这么多数据中找到自己需要的信息是一件很繁琐的事情,找到需要的信息后如何获取也是件麻烦的事。这 就需要Internet信息抓取程序来代替人工的操作。
  所谓Internet信息抓取程序,就是程序会按照用户的关键词或关键网站来收集相应的信息,并提供给用户想要的信息格式 。
  信息量的增加会带来信息网站发布人员工作量的剧增,为实现信息发布系统实现信息自
  动发布、减少工作人员工作量、即时跟踪最新信息,就需要自动信息提供 程序,因此Internet信息抓取程序应运而生。
  目标
  实现自定义网站信息分类抓取,存入本地数据库、生成静态页面或其它用户定义的信息结构,并下载与信息相关 的多媒体文件。
  开发
  目标站点结构分析
  本步骤是准确抓取信息个关键。
  首先要选择更新频 率高的页面做为抓取地址,然后分析要抓取内容页面url特点。
  然后分析要抓取信息页面的元素特性,比如标题位置,内容位置 等,得到定位标记点。
  将以上信息 写成自己的配置文件或存到数据库中。
  每个网站都需要分析,写出单独的配置文件,供抓取程序使用。
  信息提取
  根据配置文件取得要抓取页面url,使用HttpWebRequest类获取内容:
双击代码全选
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
//获取http页面函数
        public string Get_Http(string a_strUrl,int timeout)
        {
            string strResult ;        
            try
            {
HttpWebRequest myReq = (HttpWebRequest) HttpWebRequest.Create(a_strUrl) ;
                myReq.Timeout = timeout;
                HttpWebResponse HttpWResp = (HttpWebResponse)myReq.GetResponse();
           
                Stream myStream = HttpWResp.GetResponseStream () ;
                StreamReader sr = new StreamReader (myStream , Encoding.Default);
                StringBuilder strBuilder = new StringBuilder();
                while (-1 != sr.Peek())
                {
                    strBuilder.Append (sr.ReadLine()+"\r\n");
                }
                strResult = strBuilder.ToString ();
            }
            catch(Exception exp)
            {
                strResult = "错误:" + exp.Message ;
            }
            return strResult ;
        }
  获取页面内容后,分析页面中连接地址取到要抓取的url:
双击代码全选
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
//处理页面标题和链接
        public string SniffWebUrl( string urlStr,string blockB,string blockE )
        {      
            string urlch1 = "";
            string urlch2 = "";                   
            int end_n1 = 0;
            int end_nums = 0;
            int end_nums1 = 0;
            int end_nums2 = 0;
            int end_nums3     = 0;           
            string reUTStr = "";
            string reTitle = "";
            string ret = "";          
            try
            {
                int pos01 = urlStr.IndexOf( "." );
                int pos02 = urlStr.LastIndexOf( "/" );
                if( pos01 < 0 )
                {
                    return "";
                }
                if( pos02 < 0 )
                {
                    return "";
                }
                int pos03 = urlStr.IndexOf( "/",pos01 );
                if ( pos03 < 0 )
                {
                    urlch1 = urlStr;
                    urlch2 = urlStr;
                }
                else
                {
                    urlch1 = urlStr.Substring( 0,pos03 );
                    urlch2 = urlStr.Substring( 0,pos02 );
                }
                string tmpAllStr = new PublicFun().Get_Http( urlStr ,time1);
                int pos1 = tmpAllStr.IndexOf( blockB );
                int pos2 = tmpAllStr.IndexOf( blockE,pos1 + blockB.Length );
                if ( pos1>0 && pos2>0 && pos2>pos1 )
                {
                    ret = tmpAllStr.Substring( pos1 + blockB.Length,pos2 - pos1 - blockB.Length );
                    ret = ret.Substring( ret.IndexOf( "<" ));
                    while( ret.IndexOf( "<A" ) >= 0 )
                    {
                      &nbs p; ret = ret.Substring( 0,ret.IndexOf( "<A" ) ) + "<a" + ret.Substring( ret.IndexOf( "<A" ) + 2 );
                    }
                    while( ret.IndexOf( "</A" ) >=0 )
                    {
                      &nbs p; ret = ret.Substring( 0,ret.IndexOf( "</A" ) ) + "</a" + ret.Substring( ret.IndexOf( "</A" ) + 3 );
                    }
                    while( ret.IndexOf( "Href=" ) >=0 )
                    {
                      &nbs p; ret = ret.Substring( 0,ret.IndexOf( "Href=" )) + "href=" + ret.Substring( ret.IndexOf( "Href=" ) + 5 );
                    }
                    while( ret.IndexOf( "HREF=" ) >=0 )
                    {
                      &nbs p; ret = ret.Substring( 0,ret.IndexOf( "HREF=" )) + "href=" + ret.Substring( ret.IndexOf( "HREF=" ) + 5 );
                    }
                    while( ret.IndexOf( "href='" ) >=0 )
                    {
                      &nbs p; ret = ret.Substring( 0,ret.IndexOf( "href='" )) + "href="" + ret.Substring( ret.IndexOf( "href='" ) + 6 );
                    }
                }      
                tmpAllStr = ret;     
                int begin_nums = tmpAllStr.IndexOf( "href=" );
                while ( begin_nums >= 0 )
                {              
                    string tmpStrA = "";
                    string tmpStrB = tmpAllStr.Substring( begin_nums + 5,1 );
                    if ( tmpStrB == """ )
                    {
                      &nbs p; end_n1 = begin_nums + 6;
                      &nb sp; if ( ( end_n1 + 1 ) > tmpAllStr.Length )
                        {
                      &nbs p;     return "";
                    &nbs p;   }
                        tmpStrA = tmpAllStr.Substring( begin_nums+6,1 );
                    }
                    else
                    {
                      &nbs p; end_n1 = begin_nums + 5;
                      &nb sp; tmpStrA = tmpStrB;
                    }
                    if ( tmpStrA == "#" )
                    {
                      &nbs p; tmpAllStr = tmpAllStr.Substring( end_n1 );
                      &nb sp; begin_nums = tmpAllStr.IndexOf( "href=" );
                    }
                    else
                    {                  
                        end_nums1 = tmpAllStr.IndexOf( " ",end_n1 );
                      &nb sp; end_nums2 = tmpAllStr.IndexOf( ">",end_n1 );
                      &nb sp; end_nums3 = tmpAllStr.IndexOf( "</a",end_nums2 );
                      &nb sp; if ( ( end_nums3 >= 0 ) && ( end_nums2 >= 0 ) )
                        {
                      &nbs p;     reTitle = tmpAllStr.Substring( end_nums2 + 1,end_nums3 - end_nums2 - 1 );
                      &nb sp;     if ( end_nums1 > end_nums2 )
                            {
                      &nbs p;         end_nums = end_nums2;
                     & nbsp;      }
                            else
                      & nbsp;     {
                      &nbs p;         if ( end_nums1 < 0 )
                                {
                      &nbs p;             end_nums = end_nums2;
                     & nbsp;          }
                                else
                      & nbsp;         {
                      &nbs p;             end_nums = end_nums1;
                     & nbsp;          }
                            }
                            string str4 = tmpAllStr.Substring( end_nums-1, end_nums - end_nums + 1 );
                      &nb sp;     if ( str4 =="""  || str4 == "'" )
                            {
                      &nbs p;         end_nums = end_nums - 1;
                      &nb sp;     }
                            string sTotalOne = tmpAllStr.Substring( end_n1,end_nums - end_n1 );
                      &nb sp;     if ( sTotalOne.IndexOf( "http://" ) <0 )
                            {
                      &nbs p;         if ( sTotalOne.IndexOf( "/" ) == 0 )
                                {
                      &nbs p;             sTotalOne = urlch1 + sTotalOne;
                     & nbsp;          }
                                else
                      & nbsp;         {                               
                                    int linshiIntNum = 0;
                      &nb sp;             int flags = 0;
                      &nb sp;             string urlChange = urlStr;;
                     &nb sp;              while( sTotalOne.IndexOf( "../" ) >= 0 )
                                    {
                      &nbs p;                 sTotalOne = sTotalOne.Substring( sTotalOne.IndexOf( "../" ) + 3 );
                      &nb sp;                 linshiIntNum = linshiIntNum + 1;
                      &nb sp;                 flags = flags +1;
                      &n bsp;             }
                                    while( ( urlChange.LastIndexOf( "/" ) >= 0 ) && ( linshiIntNum >= 0 ) )
                                    {
                      &nbs p;                 urlChange = urlChange.Substring( 0,urlChange.LastIndexOf( "/" ) );
                      &nb sp;                 linshiIntNum = linshiIntNum - 1;
                      &nb sp;             }
                                    if ( flags == 0 )
                                    {
                      &nbs p;                 sTotalOne = urlch2 + "/" + sTotalOne;
                     & nbsp;              }
                                    else
                      & nbsp;             {
                      &nbs p;                 sTotalOne = urlChange + "/" + sTotalOne;
                     & nbsp;              }
                                }
                            }
                            reUTStr = reUTStr + new PublicFun().RemoveHtmlCode( reTitle ) + sTotalOne;
                     & nbsp;      tmpAllStr = tmpAllStr.Substring( end_nums3 + 4 );
                      &nb sp;     begin_nums = tmpAllStr.IndexOf( "href=" );
                      &nb sp; }
                        else
                      & nbsp; {
                      &nbs p;     begin_nums = - 1;
                      &nb sp; }                   
                    }
                }
                return reUTStr;
            }
            catch( Exception e)
            {
                return "";
            }
        }


得到要抓取内容的url后,处理该页面:
双击代码全选
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
//获取链接内容并分类处理
        public string GetWebContent( string gatherUrl,string subUrl,string subTitle,string b_Content,string e_Content,string b_Filter,string e_Filter,string root )
        {
            string tmpAllStr = "";           
            string dfStrB = "";
            string dfStrE = "";               
            string rePicStr = "";//图片返回路径   
            string reContentStr = "";
            string picHtml = "images"; //本地图片路径
           
            string urlch1 ="";
            string urlch2 ="";
            int pos1 = gatherUrl.IndexOf( "." );
            int pos2 = gatherUrl.LastIndexOf( "/" );
            if( pos1 < 0 )
            {
                return "";
            }
            if( pos2 < 0 )
            {               
                return "";
            }
            int pos3 = gatherUrl.IndexOf( "/",pos1 );
            if ( pos3 < 0 )
            {
                urlch1 = gatherUrl;
                urlch2 = gatherUrl;
            }
            else
            {
                urlch1 = gatherUrl.Substring( 0,pos3 );
                urlch2 = gatherUrl.Substring( 0,pos2 );
            }   
           
            tmpAllStr = new PublicFun().Get_Http( subUrl,time1 );
            //取稿源
            string docFromStr = "";
            if ( dfStrB != "" && dfStrE != "" )
            {
                if ( tmpAllStr != "" )
                {
                    int b_docF = tmpAllStr.IndexOf( dfStrB );
                    if ( b_docF > 0 )
                    {
                      &nbs p; int e_docF = tmpAllStr.IndexOf( dfStrE,b_docF + dfStrB.Length );
                      &nb sp; if ( e_docF > 0 && e_docF > b_docF && e_docF - b_docF < 20 )
                        {
                      &nbs p;     docFromStr = tmpAllStr.Substring( b_docF + dfStrB.Length, e_docF - b_docF - dfStrB.Length );
                      &nb sp; }
                    }
                }
            }
            //取内容
            if ( tmpAllStr != "" )
            {               
                int begin_strnum = tmpAllStr.IndexOf( b_Content );
                if ( begin_strnum < 0 )
                {                  
                    return "";
                }
                int end_strnum = tmpAllStr.IndexOf( e_Content,begin_strnum + b_Content.Length );
                if ( end_strnum < 0 )
                {                  
                    return "";
                }
                string sTotalSubM = "";
                if ( end_strnum > begin_strnum )
                {
                    sTotalSubM = tmpAllStr.Substring ( begin_strnum,end_strnum - begin_strnum );
                }
               
                if ( sTotalSubM == "" )
                {                  
                    return "";
                }               
                //过滤无用信息
                int bfnum = sTotalSubM.IndexOf( b_Filter );
                if ( bfnum > -1 )
                {
                    int efnum = sTotalSubM.IndexOf( e_Filter,bfnum );
                    if ( efnum > -1 )
                    {
                      &nbs p; if ( efnum > bfnum )
                        {
                      &nbs p;     sTotalSubM = sTotalSubM.Substring( 0,bfnum ) + sTotalSubM.Substring( efnum + e_Filter.Length );
                      &nb sp; }
                    }
                }
                //格式化图片标记
               
                while( sTotalSubM.IndexOf( "Src=" ) >= 0 )
                {
                    sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "Src=" ) ) + "src=" + sTotalSubM.Substring( sTotalSubM.IndexOf( "Src=" ) + 4 );
                }
                while( sTotalSubM.IndexOf( "SRC=" ) >= 0 )
                {
                    sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "SRC=" ) ) + "src=" + sTotalSubM.Substring( sTotalSubM.IndexOf( "SRC=" ) + 4 );
                }
                while( sTotalSubM.IndexOf( "src='" ) >= 0 )
                {
                    sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "src='" ) ) + "src="" + sTotalSubM.Substring( sTotalSubM.IndexOf( "src='" ) + 5 );
                }
                //取图片地址
                int end_n12 = 0;
                int end_nums2 = 0;
                int begin_nums2 = sTotalSubM.IndexOf( "src=" );
                while( begin_nums2 >= 0 )
                {
                    String tmpStr = sTotalSubM.Substring( begin_nums2 + 4,1 );
                    if ( tmpStr == """ )
                    {
                      &nbs p; end_n12 = begin_nums2 + 5;
                    }
                    else
                    {
                      &nbs p; end_n12 = begin_nums2 + 4;
                    }
                    int end_nums2a = sTotalSubM.IndexOf( " ",end_n12 );
                    int end_nums2b = sTotalSubM.IndexOf( ">",end_n12 );
                    if ( end_nums2b < 0 )
                    {
                      &nbs p; break;
                    }
                    if ( end_nums2a > end_nums2b )
                    {
                      &nbs p; end_nums2 = end_nums2b;
                    }
                    else
                    {
                      &nbs p; if (end_nums2a<0)
                        {
                      &nbs p;     end_nums2 = end_nums2b;
                         }
                        else
                      & nbsp; {
                      &nbs p;     end_nums2 = end_nums2a;
                         }
                    }
                    tmpStr = sTotalSubM.Substring( end_nums2-1,1 );
                    if ( tmpStr == """ || tmpStr == "'" )
                    {
                      &nbs p; end_nums2 = end_nums2 - 1;
                    }
                    string tmpPicStr = sTotalSubM.Substring( end_n12,end_nums2 - end_n12 );
                    if ( tmpPicStr.IndexOf( "http://" ) < 0 )
                    {
                      &nbs p; if ( tmpPicStr.IndexOf( "/" ) == 0 )
                        {
                      &nbs p;     tmpPicStr = urlch1 + tmpPicStr;
                     & nbsp;  }
                        else
                      & nbsp; {                            
                            int linshiIntNum = 0;
                      &nb sp;     int flags = 0;
                      &nb sp;     string urlChange = subUrl;
                     &nbs p;      while( tmpPicStr.IndexOf( "../" ) >= 0 )
                            {
                      &nbs p;         tmpPicStr = tmpPicStr.Substring( tmpPicStr.IndexOf("../") + 3 );
                      &nb sp;         linshiIntNum = linshiIntNum + 1;
                      &nb sp;         flags = flags + 1;
                      &nb sp;     }
                            while( ( urlChange.LastIndexOf( "/" ) >= 0 ) && ( linshiIntNum >= 0 ) )
                            {
                      &nbs p;         urlChange = urlChange.Substring( 0,urlChange.LastIndexOf( "/" ) );
                      &nb sp;         linshiIntNum = linshiIntNum - 1;
                      &nb sp;     }
                            if ( flags == 0 )
                            {
                      &nbs p;         tmpPicStr = urlch2 + "/" + tmpPicStr;
                     & nbsp;      }
                            else
                      & nbsp;     {
                      &nbs p;         tmpPicStr = urlChange + "/" + tmpPicStr;
                     & nbsp;      }
                        }
                    }
                    //tmpPicStr = tmpPicStr.ToLower();
                    string tmpPicStrTmp = tmpPicStr.ToLower ();
                    //if ( tmpPicStr.IndexOf( ".jpg" ) > 0 || tmpPicStr.IndexOf( ".gif" ) > 0 || tmpPicStr.IndexOf( ".bmp" ) > 0 )
                    if ( tmpPicStrTmp.IndexOf( ".jpg" ) > 0 || tmpPicStrTmp.IndexOf( ".gif" ) > 0 || tmpPicStrTmp.IndexOf( ".bmp" ) > 0 )
                    {
                      &nbs p; rePicStr = rePicStr + "||" + tmpPicStr ;
                      &nbs p; int flagN2 = tmpPicStr.LastIndexOf( "/" );
                      &nb sp; string fileN2 = picHtml + tmpPicStr.Substring( flagN2 );
                      &nb sp; sTotalSubM = sTotalSubM.Substring( 0,end_nums2 ) + ">******" + fileN2 + "******<" + sTotalSubM.Substring( end_nums2 );
                      &nb sp; begin_nums2 = sTotalSubM.IndexOf( "src=", end_nums2 + fileN2.Length + 22 );
                    }
                    else
                    {
                      &nbs p; begin_nums2 = sTotalSubM.IndexOf( "src=", end_nums2 + 4 );                       
                    }                   
                }
                if ( rePicStr.Length > 2 ) 
                    rePicStr =  rePicStr.Substring(2);              
                //内容处理 格式化关键标记
                while( sTotalSubM.IndexOf( "<P" ) >= 0 )
                {
                    sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "<P" ) ) + "|****|<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "<P" ) + 2 );
                }
                while( sTotalSubM.IndexOf( "<p" ) >= 0 )
                {
                    sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "<p" ) ) + "|****|<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "<p" ) + 2 );
                }
                while( sTotalSubM.IndexOf( "</P" ) >= 0 )
                {
                    sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "</P" ) ) + "|****|<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "</P" ) + 3 );
                }
                while( sTotalSubM.IndexOf( "</p" ) >= 0 )
                {
                    sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "</p" ) ) + "|****|<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "</p" ) + 3 );
                }
                while( sTotalSubM.IndexOf( "<br" ) >=0 )
                {
                    sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "<br" ) ) + "+****+<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "<br" ) + 3 );
                }
                while( sTotalSubM.IndexOf( "<BR" ) >= 0 )
                {
                    sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "<BR" ) ) + "+****+<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "<BR" ) + 3 );
                }
                while( sTotalSubM.IndexOf( "<Br" ) >= 0 )
                {
                    sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "<Br" ) ) + "+****+<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "<Br" ) + 3 );
                }
                while( sTotalSubM.IndexOf( "<bR" ) >= 0 )
                {
                    sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "<bR" ) ) + "+****+<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "<bR" ) + 3 );
                }
                //去除html标记
                int linshiInt1 = sTotalSubM.IndexOf( "<" );
                int linshiInt2 = sTotalSubM.IndexOf( ">" );           
                if ( linshiInt2 < linshiInt1 )
                {
                    sTotalSubM = sTotalSubM.Substring( linshiInt2 + 1 );
                }
                int linshiInt11 = sTotalSubM.LastIndexOf( "<" );
                int linshiInt12 = sTotalSubM.LastIndexOf( ">" );
                if ( linshiInt12 < linshiInt11 )
                {
                    sTotalSubM = sTotalSubM.Substring( 0,linshiInt12 + 1 );
                }
                linshiInt1 = sTotalSubM.IndexOf( "<" );
                while ( linshiInt1 >= 0 )
                {
                    linshiInt2 = sTotalSubM.IndexOf( ">",linshiInt1 );
                    if ( linshiInt2 >= 0 )
                    {              
                        sTotalSubM = sTotalSubM.Substring( 0,linshiInt1 ) + sTotalSubM.Substring( linshiInt2 + 1 );
                    }
                    else
                    {
                      &nbs p; sTotalSubM = sTotalSubM.Substring( 0,linshiInt1 );
                    }
                    linshiInt1 = sTotalSubM.IndexOf("<");
                }
                //还原关键标记
                int linshiInt3 = 0;
                int linshiInt4 = 0;
                while( sTotalSubM.IndexOf( "+****+" ) >= 0 )
                {
                    sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "+****+" ) ) + "<br>
" + sTotalSubM.Substring( sTotalSubM.IndexOf( "+****+" ) + 9 );
                }
                while( sTotalSubM.IndexOf( "|****|" ) >= 0 )
                {
                    sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "|****|" ) ) + "<br>
" + sTotalSubM.Substring( sTotalSubM.IndexOf( "|****|" ) + 9 );
                }
                while( sTotalSubM.IndexOf( "******" ) >= 0 )
                {
                    linshiInt3 = sTotalSubM.IndexOf( "******" ) + 9;
                    linshiInt4 = sTotalSubM.IndexOf( "******",linshiInt3 );
                    if ( linshiInt4 >= 0 )
                    {
                      &nbs p; int tmpPos = sTotalSubM.IndexOf( "******" );
                      &nb sp; string tmpStr1 = sTotalSubM.Substring( 0,tmpPos );
                        string tmpStr2 = sTotalSubM.Substring( linshiInt3,linshiInt4 - linshiInt3 );
                      &nb sp; string tmpStr3 = sTotalSubM.Substring( linshiInt4 + 9 );
                      &nb sp; sTotalSubM = tmpStr1 + "<img src=" + tmpStr2 + ">" + tmpStr3;
                    }
                    else
                    {
                      &nbs p; break;
                    }
                }
                //去除内容中的标题
                if ( sTotalSubM.IndexOf( subTitle ) >= 0 )
                {
                    sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( subTitle ) ) + sTotalSubM.Substring( sTotalSubM.IndexOf( subTitle ) + subTitle.Length );
                }
                reContentStr = sTotalSubM;
                //调用下载图片功能
                //下载图片到指定目录
                string[] img_Url = new PublicFun().split( rePicStr,"||" );
                for ( int i=0;i<img_Url.Length;i++ )
                {
                    if ( img_Url[i] != "" )
                    {
                      &nbs p; new PublicFun().Get_Img( img_Url[i],10000,root + "images" + img_Url[i].Substring( img_Url[i].LastIndexOf("/")+1 ) );
                    }
                }
            }
            return reContentStr;
        }

以上方法返回要取得的信息,包括标题内容,图片地址等。
  下载页面中图片:
双击代码全选
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
//下载图片 — downloads the image at a_strUrl and saves it to filepath.
        // a_strUrl:  absolute URL of the image
        // timeout:   request timeout in milliseconds
        // filepath:  destination path; its directory is created if missing
        // Failures are logged to "error.log" in the destination directory.
        public void Get_Img(string a_strUrl, int timeout, string filepath)
        {
            try
            {
                HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(a_strUrl);
                myReq.Timeout = timeout;
                // Dispose response/stream/bitmap deterministically — the
                // original leaked all three and round-tripped the image
                // through an unnecessary PictureBox control.
                using (HttpWebResponse HttpWResp = (HttpWebResponse)myReq.GetResponse())
                using (Stream myStream = HttpWResp.GetResponseStream())
                using (Bitmap map = new Bitmap(myStream))
                {
                    // Directory portion of the target path (the scraped
                    // original lost the "\\" argument to LastIndexOf).
                    string path = Path.GetDirectoryName(filepath);
                    if (!Directory.Exists(path))
                    {
                        CreateDir(path);
                    }
                    map.Save(filepath);
                }
            }
            catch (Exception exp)
            {
                WriteLog(Path.Combine(Path.GetDirectoryName(filepath), "error.log"),
                         a_strUrl + "--" + exp.Message + "\r\n");
            }
        }
  保存文件或入库
  上面取得的信息可以按自己的要求保存。
  ****设计的时候没有使用url按层次循环抓取,这样定义抓取url效率更高,速度更快。
  注:此版本只提供静态文件存储功能,不提供数据库接口,不提供自定义网站功能。
  本程序运行需要先安装 .NET 框架 1.1
View Code

c# 抓取网页类(获取网页中所有信息)

c# 抓取网页类(获取网页中所有信息)
分类: c#程序设计2011-08-05 09:14 2362人阅读 评论(4) 收藏 举报
 
[csharp] view plaincopyprint?
1. using System;  
2. using System.Data;  
3. using System.Configuration;  
4. using System.Net;  
5. using System.IO;  
6. using System.Text;  
7. using System.Collections.Generic;  
8. using System.Text.RegularExpressions;  
9. using System.Threading;  
10. using System.Web;  
11. using System.Web.UI.MobileControls;  
/// <summary>
/// 网页类 — represents one fetched web page.  The constructor downloads the
/// page; properties then expose its URL, title, raw HTML, links, plain text
/// and size.  (Restored from a scraped listing: blog line-number prefixes
/// removed and backslashes the scrape stripped from regex/string literals
/// reconstructed.)
/// </summary>
public class WebPage
{
    #region 私有成员
    private Uri m_uri;             // page URL (updated to ResponseUri after redirects)
    private List<Link> m_links;    // 此网页上的链接 — links found on this page
    private string m_title;        // 标题 — page title (lazily extracted)
    private string m_html;         // HTML代码 — raw HTML
    private string m_outstr;       // 网页可输出的纯文本 — plain-text rendition (lazy)
    private bool m_good;           // 网页是否可用 — whether the download succeeded
    private int m_pagesize;        // 网页的大小 — length of the HTML in chars
    // 存放所有网页的Cookie — one CookieContainer per host, shared process-wide.
    private static Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();
    #endregion

    #region 属性

    /// <summary>
    /// 通过此属性可获得本网页的网址,只读 (absolute URL, read-only)
    /// </summary>
    public string URL
    {
        get { return m_uri.AbsoluteUri; }
    }

    /// <summary>
    /// 通过此属性可获得本网页的标题,只读 (page title, read-only, lazily parsed)
    /// </summary>
    public string Title
    {
        get
        {
            if (m_title == "")
            {
                // [\w\W]*? matches any char (incl. newlines) lazily; the
                // scraped original had lost the backslashes ("(?:w|W)").
                Regex reg = new Regex(@"(?m)<title[^>]*>(?<title>[\w\W]*?)</title[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
                Match mc = reg.Match(m_html);
                if (mc.Success)
                    m_title = mc.Groups["title"].Value.Trim();
            }
            return m_title;
        }
    }

    public string M_html
    {
        get
        {
            if (m_html == null)
            {
                m_html = "";
            }
            return m_html;
        }
    }

    /// <summary>
    /// 此属性获得本网页的所有链接信息,只读 (all links on the page, read-only)
    /// </summary>
    public List<Link> Links
    {
        get
        {
            if (m_links.Count == 0) getLinks();
            return m_links;
        }
    }

    /// <summary>
    /// 此属性返回本网页的全部纯文本信息,只读 (full plain text, read-only)
    /// </summary>
    public string Context
    {
        get
        {
            if (m_outstr == "") getContext(Int16.MaxValue);
            return m_outstr;
        }
    }

    /// <summary>
    /// 此属性获得本网页的大小 (length of the downloaded HTML)
    /// </summary>
    public int PageSize
    {
        get { return m_pagesize; }
    }

    /// <summary>
    /// 此属性获得本网页的所有站内链接 (links whose URL stays on this host)
    /// </summary>
    public List<Link> InsiteLinks
    {
        get { return getSpecialLinksByUrl("^http://" + m_uri.Host, Int16.MaxValue); }
    }

    /// <summary>
    /// 此属性表示本网页是否可用 (false when the download failed or was skipped)
    /// </summary>
    public bool IsGood
    {
        get { return m_good; }
    }

    /// <summary>
    /// 此属性表示网页的所在的网站 (host name of the page)
    /// </summary>
    public string Host
    {
        get { return m_uri.Host; }
    }
    #endregion

    /// <summary>
    /// 从HTML代码中分析出链接信息 — extracts &lt;a&gt; and &lt;frame&gt;/&lt;iframe&gt;
    /// links from the HTML into m_links (idempotent).
    /// </summary>
    /// <returns>List&lt;Link&gt;</returns>
    private List<Link> getLinks()
    {
        if (m_links.Count == 0)
        {
            Regex[] regex = new Regex[2];
            // Anchor tags.  BUG FIX: the original read a non-existent group
            // "text" while the pattern names the capture "title"; the read
            // below now uses "title".  (Backslashes restored as well.)
            regex[0] = new Regex("<a\\s+href\\s*=\"(?<URL>[^\"]*)\"[^>]*>(?<title>[^<]*)</a>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
            // frame / iframe src.  BUG FIX: .NET group names are
            // case-sensitive; the original declared (?<url>...) but read
            // Groups["URL"], so every frame link was dropped.
            regex[1] = new Regex("<[i]*frame[^><]+src=(\"|')?(?<URL>([^>\"'\\s)])+)(\"|')?[^>]*>", RegexOptions.IgnoreCase);

            for (int i = 0; i < 2; i++)
            {
                Match match = regex[i].Match(m_html);
                while (match.Success)
                {
                    try
                    {
                        // Resolve relative URLs against the page URI.
                        string url = HttpUtility.UrlDecode(new Uri(m_uri, match.Groups["URL"].Value).AbsoluteUri);

                        string text = "";
                        if (i == 0)
                            text = new Regex("(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(match.Groups["title"].Value, "");

                        Link link = new Link();
                        link.Text = text;
                        link.NavigateUrl = url;

                        m_links.Add(link);
                    }
                    catch (Exception ex) { Console.WriteLine(ex.Message); }
                    match = match.NextMatch();
                }
            }
        }
        return m_links;
    }

    /// <summary>
    /// 此私有方法从一段HTML文本中提取出一定字数的纯文本
    /// (strips script/style/select blocks, optionally link text, then all
    /// remaining tags, and collapses whitespace; result cached in m_outstr)
    /// </summary>
    /// <param name="instr">HTML代码</param>
    /// <param name="firstN">提取从头数多少个字 (max characters returned)</param>
    /// <param name="withLink">是否要链接里面的字 (keep anchor text?)</param>
    /// <returns>纯文本 (plain text)</returns>
    private string getFirstNchar(string instr, int firstN, bool withLink)
    {
        if (m_outstr == "")
        {
            m_outstr = instr.Clone() as string;
            m_outstr = new Regex(@"(?m)<script[^>]*>[\w\W]*?</script[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
            m_outstr = new Regex(@"(?m)<style[^>]*>[\w\W]*?</style[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
            m_outstr = new Regex(@"(?m)<select[^>]*>[\w\W]*?</select[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
            if (!withLink) m_outstr = new Regex(@"(?m)<a[^>]*>[\w\W]*?</a[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
            // Remove remaining tags and &nbsp; entities.
            Regex objReg = new Regex("(<[^>]+?>)|&nbsp;", RegexOptions.Multiline | RegexOptions.IgnoreCase);
            m_outstr = objReg.Replace(m_outstr, "");
            // Collapse runs of whitespace to a single space.
            Regex objReg2 = new Regex(@"(\s)+", RegexOptions.Multiline | RegexOptions.IgnoreCase);
            m_outstr = objReg2.Replace(m_outstr, " ");
        }
        return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;
    }

    #region 公有方法
    /// <summary>
    /// 此公有方法提取网页中一定字数的纯文本,包括链接文字
    /// </summary>
    /// <param name="firstN">字数 (max characters)</param>
    /// <returns>plain text of the page, truncated to firstN chars</returns>
    public string getContext(int firstN)
    {
        return getFirstNchar(m_html, firstN, true);
    }

    /// <summary>
    /// 此公有方法从本网页的链接中提取一定数量的链接,该链接的URL满足某正则式
    /// </summary>
    /// <param name="pattern">正则式 (regex the URL must match)</param>
    /// <param name="count">返回的链接的个数 (max links returned)</param>
    /// <returns>List&lt;Link&gt;</returns>
    public List<Link> getSpecialLinksByUrl(string pattern, int count)
    {
        if (m_links.Count == 0) getLinks();
        List<Link> SpecialLinks = new List<Link>();
        // Compile the pattern once, not once per link as the original did.
        Regex urlRegex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        int cnt = 0;
        foreach (Link link in m_links)
        {
            if (cnt >= count) break;
            if (urlRegex.Match(link.NavigateUrl).Success)
            {
                SpecialLinks.Add(link);
                cnt++;
            }
        }
        return SpecialLinks;
    }

    /// <summary>
    /// 此公有方法从本网页的链接中提取一定数量的链接,该链接的文字满足某正则式
    /// </summary>
    /// <param name="pattern">正则式 (regex the link text must match)</param>
    /// <param name="count">返回的链接的个数 (max links returned)</param>
    /// <returns>List&lt;Link&gt;</returns>
    public List<Link> getSpecialLinksByText(string pattern, int count)
    {
        if (m_links.Count == 0) getLinks();
        List<Link> SpecialLinks = new List<Link>();
        Regex textRegex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        int cnt = 0;
        foreach (Link link in m_links)
        {
            if (cnt >= count) break;
            if (textRegex.Match(link.Text).Success)
            {
                SpecialLinks.Add(link);
                cnt++;
            }
        }
        return SpecialLinks;
    }

    /// <summary>
    /// 这公有方法提取本网页的纯文本中满足某正则式的文字 — returns capture
    /// group 1 of the first match of the pattern against the page text.
    /// </summary>
    /// <param name="pattern">正则式 (must contain at least one group)</param>
    /// <returns>返回文字, or string.Empty when nothing matches</returns>
    public string getSpecialWords(string pattern)
    {
        if (m_outstr == "") getContext(Int16.MaxValue);
        Regex regex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        Match mc = regex.Match(m_outstr);
        if (mc.Success)
            return mc.Groups[1].Value;
        return string.Empty;
    }
    #endregion

    #region 构造函数

    /// <summary>
    /// Downloads _url.  Encoding is taken from the charset in the HTTP
    /// Content-Type header when present, otherwise sniffed from a charset=
    /// declaration inside the HTML and the body re-decoded.
    /// </summary>
    private void Init(string _url)
    {
        try
        {
            m_uri = new Uri(_url);
            m_links = new List<Link>();
            m_html = "";
            m_outstr = "";
            m_title = "";
            m_good = true;
            // Skip obvious binary downloads.
            if (_url.EndsWith(".rar") || _url.EndsWith(".dat") || _url.EndsWith(".msi"))
            {
                m_good = false;
                return;
            }
            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
            rqst.AllowAutoRedirect = true;
            rqst.MaximumAutomaticRedirections = 3;
            rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
            rqst.KeepAlive = true;
            rqst.Timeout = 10000;
            // Reuse one CookieContainer per host across all WebPage instances.
            lock (WebPage.webcookies)
            {
                if (WebPage.webcookies.ContainsKey(m_uri.Host))
                    rqst.CookieContainer = WebPage.webcookies[m_uri.Host];
                else
                {
                    CookieContainer cc = new CookieContainer();
                    WebPage.webcookies[m_uri.Host] = cc;
                    rqst.CookieContainer = cc;
                }
            }
            HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse();
            try
            {
                Stream sm = rsps.GetResponseStream();
                // Reject non-text content and pages larger than 4 MB.
                if (!rsps.ContentType.ToLower().StartsWith("text/") || rsps.ContentLength > 1 << 22)
                {
                    m_good = false;
                    return;
                }
                Encoding cding = System.Text.Encoding.Default;
                string contenttype = rsps.ContentType.ToLower();
                int ix = contenttype.IndexOf("charset=");
                if (ix != -1)
                {
                    try
                    {
                        cding = System.Text.Encoding.GetEncoding(rsps.ContentType.Substring(ix + "charset".Length + 1));
                    }
                    catch
                    {
                        cding = Encoding.Default;
                    }
                    //该处视情况而定 有的需要解码
                    //m_html = HttpUtility.HtmlDecode(new StreamReader(sm, cding).ReadToEnd());
                    m_html = new StreamReader(sm, cding).ReadToEnd();
                }
                else
                {
                    // No charset header: read with the default encoding,
                    // then sniff charset= from the HTML and re-decode.
                    m_html = new StreamReader(sm, cding).ReadToEnd();
                    Regex regex = new Regex("charset=[\"']?(?<cding>[\\w-]+)", RegexOptions.IgnoreCase);
                    string strcding = regex.Match(m_html).Groups["cding"].Value;
                    try
                    {
                        cding = Encoding.GetEncoding(strcding);
                    }
                    catch
                    {
                        cding = Encoding.Default;
                    }
                    byte[] bytes = Encoding.Default.GetBytes(m_html.ToCharArray());
                    m_html = cding.GetString(bytes);
                    // Heuristic: a flood of '?' means the re-decode mangled
                    // the text — fall back to the default-encoding version.
                    if (m_html.Split('?').Length > 100)
                    {
                        m_html = Encoding.Default.GetString(bytes);
                    }
                }
                m_pagesize = m_html.Length;
                m_uri = rsps.ResponseUri;
            }
            finally
            {
                // The original leaked the response on exception paths.
                rsps.Close();
            }
        }
        catch (Exception ex)
        {
            // The original swallowed everything silently, leaving IsGood true
            // on failure; mark the page unusable and surface the message.
            m_good = false;
            Console.WriteLine(ex.Message);
        }
    }

    public WebPage(string _url)
    {
        string uurl = "";
        try
        {
            uurl = Uri.UnescapeDataString(_url);
            _url = uurl;
        }
        catch { }
        Init(_url);
    }
    #endregion
}
 
View Code

使用 FtpWebRequest 获取完整文件列表时速度缓慢

我想得到的文件名,文件的大小和最后修改时间每个文件服务器上,然后在一个完整的它。

之前一直运行得很好,直到我更换了主机之后变得非常缓慢,尽管从客户端来看新主机的速度和原来一样快。

没有任何明显的理由为何?

此外,它是非常必要把登录凭据,每一次?

我使用的第一种方法得到一个字符串数组,然后遍历并使用另一个在每个项目得到文件的大小:
public static string[] GetFileList()
    {
        // Lists the names in the FTP root directory, one entry per element.
        // Returns null when the request fails (a message box reports why).
        StringBuilder result = new StringBuilder();
        try
        {
            FtpWebRequest request = (FtpWebRequest)WebRequest.Create(new Uri("ftp://mysite.se/"));
            request.UseBinary = true;
            request.Credentials = new NetworkCredential(settings.Username, settings.Password);
            request.Method = WebRequestMethods.Ftp.ListDirectory;

            // Dispose the response/reader even on exceptions — the original
            // leaked both, which keeps FTP control connections open and is a
            // likely cause of the slowdown described above.
            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream()))
            {
                string line = reader.ReadLine();
                while (line != null)
                {
                    result.Append(line);
                    result.Append("\n"); // the scrape had destroyed this '\n' literal
                    line = reader.ReadLine();
                }
            }
            // Remove the trailing '\n' (guard the empty-listing case, which
            // made the original throw on LastIndexOf == -1).
            if (result.Length > 0)
            {
                result.Remove(result.Length - 1, 1);
            }
            return result.ToString().Split('\n');
        }
        catch (Exception ex)
        {
            System.Windows.Forms.MessageBox.Show(ex.Message);
            return null;
        }
    }

    public static int GetFileSize(string file)
    {
        //MessageBox.Show("getting filesize...");

        StringBuilder result = new StringBuilder();
        FtpWebRequest request;
        try
        {
            request = (FtpWebRequest)FtpWebRequest.Create(new Uri("ftp://mysite.se/" + file));
            request.UseBinary = true;
            request.Credentials = new NetworkCredential(settings.Username, settings.Password);
            request.Method = WebRequestMethods.Ftp.GetFileSize;

            int dataLength = (int)request.GetResponse().ContentLength;

            return dataLength;
        }
        catch (Exception ex)
        {
            //System.Windows.Forms.MessageBox.Show(ex.Message);
            return 1337;
        }
    }
View Code
原文地址:https://www.cnblogs.com/blogpro/p/11458363.html