功能用途
主要用于从 HTML 页面中提取文本内容。
示例代码
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.NetworkInformation;
using System.Net.Sockets;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;

namespace HtmlRegex
{
    /// <summary>
    /// Helpers for downloading an HTML page and reducing it to plain text
    /// with regular expressions, plus a small append-to-file debug logger.
    /// </summary>
    public class BaseRegex
    {
        // Every pattern lets '.' match newlines and ignores case, as the original code did.
        private const RegexOptions Options = RegexOptions.Singleline | RegexOptions.IgnoreCase;

        // The patterns are immutable, so they are built once instead of on every call.
        // ".*?" (not ".+?") is required for the element bodies: with ".+?" an empty
        // element such as <script src="a.js"></script> cannot match on its own, and the
        // match runs on to the NEXT closing tag, deleting the visible text in between.
        private static readonly Regex ScriptBlock = new Regex(@"<script\b[^>]*>.*?</script\s*>", Options);
        private static readonly Regex StyleBlock = new Regex(@"<style\b[^>]*>.*?</style\s*>", Options);
        private static readonly Regex CommentBlock = new Regex(@"<!--.*?-->", Options);
        private static readonly Regex AnyTag = new Regex(@"<.*?>", Options);

        // Named, decimal and hexadecimal character references (&nbsp; &#160; &#xA0;).
        // The previous pattern "&S{2,}?;" had lost its backslash and only matched
        // a literal run of the letter S, so no entity was ever removed.
        private static readonly Regex Entity = new Regex(@"&(?:#\d+|#x[0-9a-f]+|[a-z][a-z0-9]+);", Options);
        private static readonly Regex SpaceRun = new Regex(@"[ ]{2,}", Options);

        // Reused for every download made through this instance.
        private readonly WebClient web = new WebClient();

        /// <summary>
        /// Maps the integer encoding flag used by this class to an <see cref="Encoding"/>.
        /// </summary>
        /// <param name="encoding">1 selects UTF-8; any other value selects <see cref="Encoding.Default"/>.</param>
        /// <returns>The selected encoding.</returns>
        private static Encoding ResolveEncoding(int encoding)
        {
            return encoding == 1 ? Encoding.UTF8 : Encoding.Default;
        }

        /// <summary>
        /// Appends one line of text to a file.
        /// </summary>
        /// <param name="path">File to append to.</param>
        /// <param name="encoding">1 for UTF-8; any other value for <see cref="Encoding.Default"/>.</param>
        /// <param name="content">Text written, followed by a line terminator.</param>
        public void DeBug(string path, int encoding, string content)
        {
            // The using statement flushes and closes the writer even if WriteLine throws.
            using (StreamWriter writer = new StreamWriter(path, true, ResolveEncoding(encoding)))
            {
                writer.WriteLine(content);
            }
        }

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and decodes it as text.
        /// </summary>
        /// <param name="url">Address of the resource to download.</param>
        /// <param name="encoding">1 to decode as UTF-8; any other value to decode with <see cref="Encoding.Default"/>.</param>
        /// <returns>The decoded response body.</returns>
        public string getPageContent(string url, int encoding)
        {
            byte[] buffer = web.DownloadData(url);
            return ResolveEncoding(encoding).GetString(buffer);
        }

        /// <summary>
        /// Strips script and style elements, comments, tags, character entities
        /// and all space characters from an HTML fragment.
        /// </summary>
        /// <param name="html">HTML markup to clean.</param>
        /// <returns>The remaining text.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
        public string checkHtml(string html)
        {
            if (html == null)
            {
                throw new ArgumentNullException("html");
            }

            // Filter out JS and CSS first, so their bodies are not mistaken for page text.
            html = ScriptBlock.Replace(html, "");
            html = StyleBlock.Replace(html, "");

            // Comments must go before generic tags: otherwise "<.*?>" stops at the first
            // '>' inside a comment and leaves its tail ("... -->") in the output.
            html = CommentBlock.Replace(html, "");
            html = AnyTag.Replace(html, "");
            html = Entity.Replace(html, "");

            // Drop runs of spaces, then any remaining single spaces.
            html = SpaceRun.Replace(html, "");
            html = html.Replace(" ", "");
            return html;
        }
    }
}