提取HTML代码中文字的C#函数

/// <summary>
/// Strips HTML markup from a string and decodes a small, fixed set of character
/// entities, returning the remaining plain text.
/// </summary>
/// <param name="strHtml">Source text that may contain HTML markup.</param>
/// <returns>
/// The input with script blocks, tags and comments removed and the listed entities
/// decoded. Any angle bracket still present afterwards (including one produced by
/// decoding an lt/gt entity) and every CR LF pair is removed as well. Returns an
/// empty string when <paramref name="strHtml"/> is null or empty.
/// </returns>
public static string StripHTML(string strHtml)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return string.Empty;
    }

    // aryReg[i] is replaced by aryRep[i]. The passes run sequentially in array
    // order, so the order of the entries matters.
    string[] aryReg =
    {
        // Whole script elements; (?s) lets '.' cross line breaks so multi-line
        // script bodies are removed together with their tags.
        @"(?s)<script[^>]*?>.*?</script>",

        // Opening/closing tags with optional attributes.
        // NOTE(review): the nested quantifiers look prone to heavy backtracking on a
        // long unterminated tag (e.g. "<aaaa..."); consider a match timeout if the
        // input is untrusted.
        @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",

        // A CR or LF together with the whitespace that follows it.
        @"([\r\n])[\s]+",

        @"&(quot|#34);",
        @"&(lt|#60);",
        @"&(gt|#62);",
        @"&(nbsp|#160);",
        @"&(iexcl|#161);",
        @"&(cent|#162);",
        @"&(pound|#163);",
        @"&(copy|#169);",

        // Decoded after the other listed entities so that an escaped entity such as
        // "&amp;lt;" yields the text "&lt;" instead of being decoded a second time.
        // It must still run before the numeric catch-all, which would otherwise
        // delete "&#38;".
        @"&(amp|#38);",

        // Any remaining decimal numeric entity is dropped.
        @"&#(\d+);",

        // Comment handling in two passes: the closing marker becomes a line break,
        // then everything from the opening marker to the end of that line is removed.
        @"-->",
        @"<!--.*\n"
    };

    string[] aryRep =
    {
        "",        // script elements
        "",        // tags
        "",        // line break + following whitespace
        "\"",      // &quot;
        "<",       // &lt;
        ">",       // &gt;
        " ",       // &nbsp; (decoded to a regular space)
        "\u00A1",  // &iexcl;
        "\u00A2",  // &cent;
        "\u00A3",  // &pound;
        "\u00A9",  // &copy;
        "&",       // &amp;
        "",        // other numeric entities
        "\r\n",    // -->
        ""         // <!-- ... end of line
    };

    string strOutput = strHtml;
    for (int i = 0; i < aryReg.Length; i++)
    {
        strOutput = Regex.Replace(strOutput, aryReg[i], aryRep[i], RegexOptions.IgnoreCase);
    }

    // string.Replace returns a new string; the result has to be assigned back,
    // otherwise these clean-up steps have no effect.
    strOutput = strOutput.Replace("<", "");
    strOutput = strOutput.Replace(">", "");
    strOutput = strOutput.Replace("\r\n", "");

    return strOutput;
}

青苹果Web应用商店 https://webapp.taobao.com/

PHP/ASP.NET/ASP/UCHOME/DISCUZ! X系列网站开发,详细需求联系QQ:8511978

原文地址:https://www.cnblogs.com/Dicky/p/122372.html