C#正则提取html图片等

下面的代码用正则表达式去除 HTML 标记、提取纯文本以及提取图片地址,比较实用,分享给大家。

   ///   <summary>
   ///   去除HTML标记
   ///   </summary>
   ///   <param   name="Htmlstring">包括HTML的源码   </param>
   ///   <returns>已经去除后的文字</returns> 
   public   static   string   NoHTML(string   Htmlstring)
   {
    //删除脚本
   Htmlstring = Htmlstring.Replace(" ","");
   Htmlstring = Regex.Replace(Htmlstring,@"<script.*?</script>","",RegexOptions.IgnoreCase);
   Htmlstring = Regex.Replace(Htmlstring,@"<style.*?</style>","",RegexOptions.IgnoreCase);
   Htmlstring = Regex.Replace(Htmlstring,@"<.*?>","",RegexOptions.IgnoreCase);
   //删除HTML
   Htmlstring = Regex.Replace(Htmlstring,@"<(.[^>]*)>","",RegexOptions.IgnoreCase);
   Htmlstring = Regex.Replace(Htmlstring,@"([ ])[s]+","",RegexOptions.IgnoreCase);
   Htmlstring = Regex.Replace(Htmlstring,@"-->","",RegexOptions.IgnoreCase);
   Htmlstring = Regex.Replace(Htmlstring,@"<!--.*","",RegexOptions.IgnoreCase);
   Htmlstring = Regex.Replace(Htmlstring,@"&(quot|#34);",""",RegexOptions.IgnoreCase);
   Htmlstring = Regex.Replace(Htmlstring,@"&(amp|#38);","&",RegexOptions.IgnoreCase);
   Htmlstring = Regex.Replace(Htmlstring,@"&(lt|#60);","<",RegexOptions.IgnoreCase);
   Htmlstring = Regex.Replace(Htmlstring,@"&(gt|#62);",">",RegexOptions.IgnoreCase);
   Htmlstring = Regex.Replace(Htmlstring,@"&(nbsp|#160);","",RegexOptions.IgnoreCase);
   Htmlstring = Regex.Replace(Htmlstring,@"&(iexcl|#161);","xa1",RegexOptions.IgnoreCase);
   Htmlstring = Regex.Replace(Htmlstring,@"&(cent|#162);","xa2",RegexOptions.IgnoreCase);
   Htmlstring = Regex.Replace(Htmlstring,@"&(pound|#163);","xa3",RegexOptions.IgnoreCase);
   Htmlstring = Regex.Replace(Htmlstring,@"&(copy|#169);","xa9",RegexOptions.IgnoreCase);
   Htmlstring = Regex.Replace(Htmlstring,@"&#(d+);","",RegexOptions.IgnoreCase); 
   Htmlstring = Htmlstring.Replace("<","");
   Htmlstring = Htmlstring.Replace(">","");
   Htmlstring = Htmlstring.Replace(" ","");   
   Htmlstring=HttpContext.Current.Server.HtmlEncode(Htmlstring).Trim();
   return Htmlstring;
   }

   /// <summary>
   /// 提取HTML代码中文字的C#函数
   /// </summary>
   public   static   string   StripHTML(string   strHtml)
   {
    string   []   aryReg   ={
           @"<script[^>]*?>.*?</script>",
           @"<(/s*)?!?((w+:)?w+)(w+(s*=?s*(([""'])(\[""'tbnr]|[^7])*?7|w+)|.{0})|s)*?(/s*)?>",
           @"([ ])[s]+",
           @"&(quot|#34);",
           @"&(amp|#38);",
           @"&(lt|#60);",
           @"&(gt|#62);",
           @"&(nbsp|#160);",
           @"&(iexcl|#161);",
           @"&(cent|#162);",
           @"&(pound|#163);",
           @"&(copy|#169);",
           @"&#(d+);",
           @"-->",
           @"<!--.* "
          };
    string   []   aryRep   =   {
             "",
             "",
             "",
             """,
             "&",
             "<",
             ">",
             "   ",
             "xa1",//chr(161), 
             "xa2",//chr(162), 
             "xa3",//chr(163), 
             "xa9",//chr(169), 
             "",
             " ",
             ""
            };
    string   newReg   =aryReg[0];
    string   strOutput=strHtml;
    for(int   i   =   0;i<aryReg.Length;i++)
    {
     Regex   regex   =   new   Regex(aryReg[i],RegexOptions.IgnoreCase);
     strOutput   =   regex.Replace(strOutput,aryRep[i]);
    }
    strOutput.Replace("<","");
    strOutput.Replace(">","");
    strOutput.Replace(" ","");
    return   strOutput;
   }
  
  
    #region   取出文本中的图片地址 
    /**////   <summary>
    ///   取出文本中的图片地址
    ///   </summary>
    ///   <param   name="HTMLStr">HTMLStr</param> 
    public   static   string   GetImgUrl(string   HTMLStr)
    {
     string   str   =   string.Empty;
     string   sPattern   =   @"^<imgs+[^>]*>";
     Regex   r   =   new   Regex(@"<imgs+[^>]*s*srcs*=s*([']?)(?<url>S+)'?[^>]*>",
      RegexOptions.Compiled);
     Match   m   =   r.Match(HTMLStr.ToLower());
     if   (m.Success)
      str   =   m.Result("${url}");
     return   str;
    }
    #endregion

原文地址:https://www.cnblogs.com/easyteck/p/3481928.html