清除html中的标记,只留下文字

/// <summary>
/// Strips HTML markup from a string and returns the remaining plain text.
/// </summary>
/// <remarks>
/// Processing happens in this order:
/// 1. <c>&lt;script&gt;</c> elements are removed together with their content.
/// 2. HTML comments are removed (an unterminated comment runs to the end of the input).
/// 3. All remaining tags are removed; their inner text is kept.
/// 4. A small set of character references is decoded in a single pass.
/// Entities are decoded last, so escaped markup such as <c>&amp;lt;b&amp;gt;</c>
/// is returned as the literal text <c>&lt;b&gt;</c> and is not stripped.
/// This is a regex-based best-effort cleaner, not a full HTML parser.
/// </remarks>
/// <param name="HTML">The HTML fragment to clean. May be null or empty.</param>
/// <returns>
/// The text content of <paramref name="HTML"/>, or an empty string when the
/// input is null or empty.
/// </returns>
public string ClearHTMLTags(string HTML)
{
    if (string.IsNullOrEmpty(HTML))
    {
        return "";
    }

    // CultureInvariant keeps case-insensitive matching stable under cultures
    // such as tr-TR, where 'I' does not lower-case to 'i'.
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.CultureInvariant;

    // Singleline (not Multiline) is what lets '.' cross line breaks, so script
    // bodies and comments that span several lines are removed as a whole.
    const RegexOptions spanLines = options | RegexOptions.Singleline;

    string text = HTML;

    // Script elements: drop the tags and everything between them.
    text = Regex.Replace(text, @"<script\b[^>]*>.*?</script\s*>", "", spanLines);

    // Comments go before tag stripping so markup inside a comment disappears with it.
    text = Regex.Replace(text, @"<!--.*?(?:-->|\z)", "", spanLines);

    // Tags: '<', optional '/' or '!', a letter, then anything up to the closing '>'.
    // Quoted attribute values are consumed as a unit so a '>' inside them does not
    // end the tag early. The alternatives are mutually exclusive on their first
    // character, which avoids the exponential backtracking of nested quantifiers.
    text = Regex.Replace(text, @"<(?:/\s*)?!?[a-zA-Z](?:[^>""']|""[^""]*""|'[^']*')*>", "", options);

    // Entities are decoded in one pass so the output of one replacement is never
    // decoded again (e.g. "&amp;lt;" yields "&lt;", not "<").
    text = Regex.Replace(
        text,
        @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#[0-9]+);",
        new MatchEvaluator(DecodeHtmlEntity),
        options);

    return text;
}

/// <summary>
/// Maps one matched character reference (group 1 holds the name or "#digits")
/// to its replacement text.
/// </summary>
/// <param name="entity">A match produced by the entity pattern in <c>ClearHTMLTags</c>.</param>
/// <returns>
/// The decoded character(s); an empty string for numeric references that are
/// not valid Unicode scalar values.
/// </returns>
private static string DecodeHtmlEntity(Match entity)
{
    string name = entity.Groups[1].Value.ToLowerInvariant();
    switch (name)
    {
        case "quot": return "\"";
        case "amp": return "&";
        case "lt": return "<";
        case "gt": return ">";
        case "nbsp": return " ";
        case "iexcl": return "\u00A1";
        case "cent": return "\u00A2";
        case "pound": return "\u00A3";
        case "copy": return "\u00A9";
    }

    // Everything else matched by the pattern is a decimal reference: "#" + digits.
    int codePoint;
    bool parsed = int.TryParse(
        name.Substring(1),
        System.Globalization.NumberStyles.None,
        System.Globalization.CultureInfo.InvariantCulture,
        out codePoint);
    if (!parsed)
    {
        // Too many digits to fit an int; drop the reference.
        return "";
    }

    if (codePoint == 160)
    {
        // Non-breaking space is flattened to a plain space, same as "&nbsp;".
        return " ";
    }

    bool isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
    if (codePoint <= 0 || codePoint > 0x10FFFF || isSurrogate)
    {
        // Not a valid Unicode scalar value; drop the reference.
        return "";
    }

    return char.ConvertFromUtf32(codePoint);
}

上面的方式有时候去除得并不太准确。其实可以换种思路:若内容之前是用富文本框编辑的,则可以实例化一个富文本框(RichTextBox),把保存的富文本内容赋给 this.richTextBox.Rtf,然后再取出 this.richTextBox.Text。这种方式虽然笨,但比较准确。用此方法需注意一点:若是数据比较多,这个功能要单独做,不要对这条数据做这样的转换之后再去做其它的事,这样会大大增加循环的时间,从而导致出现一些莫名其妙的问题。
原文地址:https://www.cnblogs.com/pnljs/p/3169287.html