清理网页中的HTML

 // Options for patterns that match tag names regardless of letter case.
 // CultureInvariant keeps case-insensitive matching independent of the current culture.
 private const RegexOptions TagNameOptions = RegexOptions.IgnoreCase | RegexOptions.CultureInvariant;

 // Whole <head>, <script> and <style> elements including their content, applied in this order.
 // Singleline lets '.' span line breaks; this replaces the "(?:.|[\r\n])*?" form, whose two
 // alternatives both match '\r' and can therefore backtrack exponentially on unterminated blocks.
 // "\b" stops "<head" from also matching "<header".
 private static readonly Regex[] ContentBlockPatterns =
 {
     new Regex(@"<head\b[^>]*>.*?</head>", TagNameOptions | RegexOptions.Singleline),
     new Regex(@"<script\b[^>]*>.*?</script>", TagNameOptions | RegexOptions.Singleline),
     new Regex(@"<style\b[^>]*>.*?</style>", TagNameOptions | RegexOptions.Singleline),
 };

 // Runs of <br>, <br/> tags, or any opening tag whose name starts with "p".
 private static readonly Regex BreakAndParagraphTags = new Regex(@"(?:<br/*>)+|<p[^>]*>", TagNameOptions);

 // Named character entities such as &amp; or &lt; (numeric entities like &#160; are not matched).
 private static readonly Regex NamedEntities = new Regex(@"&[a-zA-Z]{1,10};");

 // Any remaining tag: '<' up to the next '>'.
 private static readonly Regex AnyTag = new Regex(@"<[^>]*>");

 // &nbsp; entities (any letter case) together with the whitespace around them.
 private static readonly Regex NonBreakingSpaces = new Regex(@"(?:\s*&nbsp;\s*)+", TagNameOptions);

 // Two or more consecutive whitespace characters.
 private static readonly Regex WhitespaceRuns = new Regex(@"\s{2,}");

 /// <summary>
 /// Removes HTML markup, script and style content, and named character entities
 /// from <paramref name="text"/>, then normalizes whitespace.
 /// </summary>
 /// <param name="text">The HTML fragment or document to clean. May be null.</param>
 /// <returns>
 /// The cleaned text with runs of whitespace collapsed to one space, tab characters removed,
 /// every single quote doubled, and leading/trailing whitespace trimmed.
 /// Returns <see cref="string.Empty"/> when <paramref name="text"/> is null, empty or whitespace.
 /// </returns>
 public string ClearHtml(string text)
    {
        // Guard first: calling Trim() on a null reference would throw.
        if (text == null)
        {
            return string.Empty;
        }

        text = text.Trim();
        if (text.Length == 0)
        {
            return string.Empty;
        }

        // Drop elements whose content must not survive as text.
        foreach (Regex block in ContentBlockPatterns)
        {
            text = block.Replace(text, string.Empty);
        }

        text = BreakAndParagraphTags.Replace(text, string.Empty);

        // Named entities are deleted, not decoded.
        text = NamedEntities.Replace(text, string.Empty);

        // After this pass no '<' is followed by a '>' any more, so a second
        // "any other tag" pass could never match and is not needed.
        text = AnyTag.Replace(text, string.Empty);

        // Catches &nbsp; sequences that only became contiguous once tags were removed.
        text = NonBreakingSpaces.Replace(text, string.Empty);

        // Once runs are collapsed, "\r\n" and double spaces cannot occur, so only
        // single tab characters still need explicit removal below.
        text = WhitespaceRuns.Replace(text, " ");

        // NOTE(review): quote doubling looks intended for embedding the result in a SQL
        // string literal; kept for compatibility, but callers should use parameterized queries.
        text = text.Replace("'", "''");
        text = text.Replace("\t", string.Empty);
        return text.Trim();
    }

  

原文地址:https://www.cnblogs.com/babietongtianta/p/4796687.html