采集系统万能正则表达式

由于经常需要编写采集程序，这里整理了采集中很常用的三个函数，姑且称之为“采集系统万能正则表达式”。全部源码见
http://www.softbk.com/news.asp?id=3564 欢迎一起交流

/// <summary>
/// Downloads the page at <paramref name="Url"/> and returns its body decoded as text.
/// </summary>
/// <param name="Url">Address of the page to fetch.</param>
/// <param name="charset">
/// Name of the encoding used to decode the response body. When null or empty,
/// "gb2312" is used.
/// </param>
/// <returns>
/// The decoded response body, or an empty string if anything fails (malformed URL,
/// network error, unknown charset name, ...). This method never throws.
/// </returns>
public string GetHtmlSource(string Url, string charset)
{
    if (string.IsNullOrEmpty(charset))
    {
        charset = "gb2312";
    }

    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);

        // The using blocks release the connection, stream and reader even when
        // reading or decoding throws part-way through.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding(charset)))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort contract: callers receive "" on any failure.
        return "";
    }
}

/// <summary>
/// Extracts the text found between <paramref name="wordsBegin"/> and
/// <paramref name="wordsEnd"/> in <paramref name="code"/>. When the delimiters
/// occur more than once, the text of the LAST occurrence is returned.
/// </summary>
/// <param name="code">Text to search (for example, a page's HTML source).</param>
/// <param name="wordsBegin">
/// Leading delimiter. It is inserted into the regular expression verbatim, so
/// regex metacharacters in it are interpreted as pattern syntax.
/// </param>
/// <param name="wordsEnd">Trailing delimiter; inserted verbatim like <paramref name="wordsBegin"/>.</param>
/// <returns>
/// The shortest non-empty run of characters (newlines included) between the
/// delimiters for the last match, matched case-insensitively; an empty string
/// when there is no match or <paramref name="code"/> is null or empty.
/// </returns>
public string SniffwebCode(string code, string wordsBegin, string wordsEnd)
{
    if (string.IsNullOrEmpty(code))
    {
        return "";
    }

    // The pattern is rebuilt on every call, so RegexOptions.Compiled would pay the
    // compilation cost for a single use; the interpreted engine gives the same matches.
    Regex pattern = new Regex(wordsBegin + @"(?<title>[\s\S]+?)" + wordsEnd, RegexOptions.IgnoreCase);

    string lastCapture = "";
    foreach (Match hit in pattern.Matches(code))
    {
        lastCapture = hit.Groups["title"].Value;
    }

    return lastCapture;
}

/// <summary>
/// Collects every piece of text found between <paramref name="wordsBegin"/> and
/// <paramref name="wordsEnd"/> in <paramref name="code"/>, in order of appearance.
/// </summary>
/// <param name="code">Text to search (for example, a page's HTML source).</param>
/// <param name="wordsBegin">
/// Leading delimiter. It is inserted into the regular expression verbatim, so
/// regex metacharacters in it are interpreted as pattern syntax.
/// </param>
/// <param name="wordsEnd">Trailing delimiter; inserted verbatim like <paramref name="wordsBegin"/>.</param>
/// <returns>
/// An <see cref="ArrayList"/> of strings, one per match, each holding the shortest
/// non-empty run of characters (newlines included) between the delimiters, matched
/// case-insensitively. The list is empty when nothing matches or
/// <paramref name="code"/> is null or empty.
/// </returns>
public ArrayList SniffwebCodeReturnList(string code, string wordsBegin, string wordsEnd)
{
    ArrayList captures = new ArrayList();
    if (string.IsNullOrEmpty(code))
    {
        return captures;
    }

    // The pattern is rebuilt on every call, so RegexOptions.Compiled would pay the
    // compilation cost for a single use; the interpreted engine gives the same matches.
    Regex pattern = new Regex(wordsBegin + @"(?<title>[\s\S]+?)" + wordsEnd, RegexOptions.IgnoreCase);

    foreach (Match hit in pattern.Matches(code))
    {
        captures.Add(hit.Groups["title"].Value);
    }

    return captures;
}