用Regex去除HTML标记

/// <summary>
/// Removes HTML markup from a string and decodes a small set of character
/// entities. The returned text never contains angle brackets.
/// </summary>
/// <param name="strHtml">Source text that may contain HTML markup.</param>
/// <returns>
/// The text with script blocks, comments and tags removed and the supported
/// entities decoded; an empty string when <paramref name="strHtml"/> is
/// <c>null</c> or empty.
/// </returns>
public static string StripHTML(string strHtml)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return string.Empty;
    }

    // CultureInvariant keeps the case-insensitive matching independent of
    // the current thread culture.
    RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.CultureInvariant;

    // Applied in order; every match is replaced by the empty string.
    string[] markupPatterns =
    {
        // Script blocks with their content; [\s\S] lets the match span line breaks.
        @"<script[^>]*?>[\s\S]*?</script>",

        // Comments, including those spanning several lines.
        @"<!--[\s\S]*?-->",

        // Opening, closing and self-closing tags with optional attributes.
        @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",

        // A line break together with the whitespace that follows it.
        @"([\r\n])[\s]+"
    };

    string text = strHtml;
    foreach (string pattern in markupPatterns)
    {
        text = Regex.Replace(text, pattern, string.Empty, options);
    }

    // All entities are decoded in one pass so that the output of one
    // replacement is never decoded again by a later one.
    text = Regex.Replace(
        text,
        @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
        DecodeEntity,
        options);

    // String.Replace returns a new string, so the results must be chained.
    // Running this after decoding also removes brackets that arrived
    // entity-encoded, so no markup can be reassembled in the output.
    return text.Replace("<", string.Empty)
               .Replace(">", string.Empty)
               .Replace("\r\n", string.Empty)
               .Replace("&nbsp;", string.Empty);
}

/// <summary>
/// Maps a single matched entity reference to its replacement text.
/// </summary>
/// <param name="entity">
/// A match whose first group holds the entity name, or '#' followed by digits.
/// </param>
/// <returns>
/// The decoded character for the supported entities; an empty string for
/// any other numeric reference.
/// </returns>
private static string DecodeEntity(Match entity)
{
    switch (entity.Groups[1].Value.ToLowerInvariant())
    {
        case "quot":
        case "#34":
            return "\"";
        case "amp":
        case "#38":
            return "&";
        case "lt":
        case "#60":
            return "<";
        case "gt":
        case "#62":
            return ">";
        case "nbsp":
        case "#160":
            return " ";
        case "iexcl":
        case "#161":
            return "\xa1";
        case "cent":
        case "#162":
            return "\xa2";
        case "pound":
        case "#163":
            return "\xa3";
        case "copy":
        case "#169":
            return "\xa9";
        default:
            // Every named alternative is handled above, so only other
            // numeric references reach this point; they are dropped.
            return string.Empty;
    }
}

原文地址:https://www.cnblogs.com/cuiwenke/p/1688407.html