取得HTML中的img
// Matches an <img ...> tag and captures its src value in the "imgUrl" group.
// The value may be wrapped in double quotes, single quotes, or left unquoted;
// the capture stops at the first whitespace, quote or angle bracket.
private static readonly Regex ImgTagRegex = new Regex(
    @"<img\b[^<>]*?\bsrc\s*=\s*[""']?\s*(?<imgUrl>[^\s""'<>]*)[^<>]*?/?\s*>",
    RegexOptions.IgnoreCase);

/// <summary>
/// Collects the <c>src</c> URL of every <c>img</c> tag found in an HTML fragment.
/// </summary>
/// <param name="sHtmlText">HTML source to scan; <c>null</c> or empty yields an empty array.</param>
/// <returns>
/// The captured URLs in document order, one entry per matched tag. Duplicates are kept,
/// and an entry is an empty string when the tag's <c>src</c> value is empty.
/// </returns>
public static string[] GetHtmlImageUrlList(string sHtmlText)
{
    // Regex.Matches throws on null input; treat "no HTML" as "no images" instead.
    if (string.IsNullOrEmpty(sHtmlText))
    {
        return new string[0];
    }

    MatchCollection matches = ImgTagRegex.Matches(sHtmlText);
    string[] sUrlList = new string[matches.Count];
    for (int i = 0; i < matches.Count; i++)
    {
        sUrlList[i] = matches[i].Groups["imgUrl"].Value;
    }

    return sUrlList;
}
取得HTML中的文字
/// <summary>
/// Strips markup from an HTML fragment and returns the remaining text.
/// </summary>
/// <param name="htmlString">HTML to convert; may be <c>null</c> or empty.</param>
/// <returns>
/// The input with script blocks, tags and comment remnants removed and a small set of
/// character entities decoded; <see cref="string.Empty"/> when the input is <c>null</c> or empty.
/// </returns>
public static string NoHTML(string htmlString)
{
    if (string.IsNullOrEmpty(htmlString))
    {
        return string.Empty;
    }

    // Remove script blocks first so their source code does not survive as text.
    // Singleline lets '.' cross line breaks, which script bodies normally contain.
    htmlString = Regex.Replace(htmlString, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Remove every remaining tag.
    htmlString = Regex.Replace(htmlString, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

    // Remove each line break together with the whitespace that follows it.
    htmlString = Regex.Replace(htmlString, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

    // Remove comment leftovers: a dangling terminator, and an unterminated
    // comment opener up to the end of its line.
    htmlString = Regex.Replace(htmlString, @"-->", "", RegexOptions.IgnoreCase);
    htmlString = Regex.Replace(htmlString, @"<!--.*", "", RegexOptions.IgnoreCase);

    // Decode the supported named entities and their numeric equivalents.
    // NOTE: the replacements run in sequence and "&amp;" is decoded before "&lt;"/"&gt;",
    // so doubly escaped input such as "&amp;lt;" ends up as "<".
    htmlString = Regex.Replace(htmlString, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
    htmlString = Regex.Replace(htmlString, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
    htmlString = Regex.Replace(htmlString, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
    htmlString = Regex.Replace(htmlString, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
    htmlString = Regex.Replace(htmlString, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
    htmlString = Regex.Replace(htmlString, @"&(iexcl|#161);", "\u00A1", RegexOptions.IgnoreCase);
    htmlString = Regex.Replace(htmlString, @"&(cent|#162);", "\u00A2", RegexOptions.IgnoreCase);
    htmlString = Regex.Replace(htmlString, @"&(pound|#163);", "\u00A3", RegexOptions.IgnoreCase);
    htmlString = Regex.Replace(htmlString, @"&(copy|#169);", "\u00A9", RegexOptions.IgnoreCase);

    // Any other numeric entity is dropped rather than decoded.
    htmlString = Regex.Replace(htmlString, @"&#(\d+);", "", RegexOptions.IgnoreCase);

    // The result is decoded text and may contain '<', '>' or '&';
    // it must be HTML-encoded again before being written back into a page.
    return htmlString;
}