(1)方法一
/// <summary>
/// Strips HTML markup from <paramref name="htmlStream"/> and returns the remaining text.
/// </summary>
/// <remarks>
/// Processing order:
/// 1. whole &lt;script&gt; and &lt;style&gt; elements (including their content) become one space;
/// 2. every remaining tag and a fixed list of character entities become one space;
/// 3. runs of blank lines collapse to a single CR LF;
/// 4. runs of tabs/spaces collapse to a single space;
/// 5. the result is trimmed.
/// A more complete approach would map every HTML entity to its Unicode character
/// through a lookup table and substitute them all in one pass.
/// </remarks>
/// <param name="htmlStream">HTML text to clean; must not be null.</param>
/// <returns>The text with tags and the listed entities removed and whitespace normalized.</returns>
/// <exception cref="ArgumentNullException"><paramref name="htmlStream"/> is null.</exception>
public string RemoveHTMLTags(string htmlStream)
{
    if (htmlStream == null)
    {
        // ArgumentNullException derives from Exception, so existing catch blocks still work.
        throw new ArgumentNullException("htmlStream", "Your input html stream is null!");
    }

    // Script and style elements must be handled on their own, before the generic tag
    // pattern, because their bodies may themselves contain '<' and '>'.
    // Verbatim string: the previous "[\s\S]" in a regular literal is an invalid escape.
    htmlStream = Regex.Replace(
        htmlStream,
        @"(<script)[\s\S]*?(</script>)|(<style)[\s\S]*?(</style>)",
        " ",
        RegexOptions.IgnoreCase);

    // Ordinary tags plus a handful of entities are replaced with a space.
    // &lt; and &gt; are blanked here rather than decoded to '<' and '>'.
    // NOTE(review): the entity names were reconstructed from the decoded characters
    // found in the source (nbsp, amp, shy, #160, #173, bull, lt, gt) - confirm the list.
    htmlStream = Regex.Replace(
        htmlStream,
        "<[^>]+>|&nbsp;|&amp;|&shy;|&#160;|&#173;|&bull;|&lt;|&gt;",
        " ",
        RegexOptions.IgnoreCase);

    // Collapse blank lines: a line break, any mix of whitespace, then another line break.
    // The character class no longer contains '|': inside [...] it is a literal pipe,
    // not alternation, so pipes in the text were being treated as whitespace.
    // NOTE(review): the replacement is assumed to be CR LF - confirm.
    htmlStream = Regex.Replace(
        htmlStream,
        "(\r\n[\r\n\t ]*\r\n)|(\n[\r\n\t ]*\n)",
        "\r\n");

    // Collapse runs of tabs and spaces.
    // NOTE(review): the replacement is assumed to be a single space - confirm.
    htmlStream = Regex.Replace(htmlStream, "[\t ]+", " ");

    return htmlStream.Trim();
}
(2)网上搜索到的方法
// Removes all tag markup from the HTML text.
/// <summary>
/// Deletes every substring of <paramref name="strhtml"/> that looks like an HTML tag
/// (a '&lt;', one or more characters other than '&gt;', then '&gt;').
/// </summary>
/// <param name="strhtml">Text that may contain HTML tags.</param>
/// <returns>The input with all matched tags removed; nothing is inserted in their place.</returns>
public static string striphtml(string strhtml)
{
    // The pattern is kept exactly as written; its second alternative only matches
    // closing tags, which the first alternative already covers.
    const string tagPattern = @"<[^>]+>|</[^>]+>";

    return Regex.Replace(strhtml, tagPattern, string.Empty);
}