C#去掉HTML标记

(1)方法一

  public string RemoveHTMLTags(string htmlStream)
        {
            if (htmlStream == null)
            {
                throw new Exception("Your input html stream is null!");
                return null;
            }

            /*
             * 最好把所有的特殊HTML标记都找出来,然后把与其相对应的Unicode字符一起影射到Hash表内,最后一起都替换掉
             */

            //先单独测试,成功后,再把所有模式合并

            //注:这两个必须单独处理
            //去掉嵌套了HTML标记的JavaScript:(<script)[\s\S]*(</script>)
            //去掉css标记:(<style)[\s\S]*(</style>)
            //去掉css标记:\..*\{[\s\S]*\}
            htmlStream = Regex.Replace(htmlStream, "(<script)[\s\S]*?(</script>)|(<style)[\s\S]*?(</style>)", " ", RegexOptions.IgnoreCase);
            //htmlStream = RemoveTag(htmlStream, "script");
            //htmlStream = RemoveTag(htmlStream, "style");

            //去掉普通HTML标记:<[^>]+>
            //替换空格:&nbsp;|&amp;|&shy;|&#160;|&#173;
            htmlStream = Regex.Replace(htmlStream, "<[^>]+>|&nbsp;|&amp;|&shy;|&#160;|&#173;|&bull;|&lt;|&gt;", " ", RegexOptions.IgnoreCase);
            //htmlStream = RemoveTag(htmlStream);

            //替换左尖括号
            //htmlStream = Regex.Replace(htmlStream, "&lt;", "<");

            //替换右尖括号
            //htmlStream = Regex.Replace(htmlStream, "&gt;", ">");

            //替换空行
            //htmlStream = Regex.Replace(htmlStream, "[ | | ]", " ");//[ | ][ *| *]*[ | ]
            htmlStream = Regex.Replace(htmlStream, "( [ | | | ]* )|( [ | | | ]* )", " ");
            htmlStream = Regex.Replace(htmlStream, "[ | ]{1,}", " ");

            return htmlStream.Trim();
        }

(2)网上搜索到的方法

 // 除去所有在html元素中标记
    public static string striphtml(string strhtml)
    {
        string stroutput = strhtml;
        Regex regex = new Regex(@"<[^>]+>|</[^>]+>");

        stroutput = regex.Replace(stroutput, "");
        return stroutput;

    }

原文地址:https://www.cnblogs.com/sky-net/p/4442297.html