获取网页的Encoding

在下载Html页面的时候,WebClient得到的是字节数组;要把它正确地转换成String,就必须先知道页面的Encoding。得到Encoding的方法很简单,在这里写下自己用到的code,做一下笔记。

代码其实都很简单,里面还有些简单的注释,很容易理解!

得到Encoding部分的代码:

View Code
        /// <summary>
        /// Works out which <see cref="Encoding"/> should be used to decode a downloaded HTML page,
        /// given the response's content type and its raw bytes.
        /// </summary>
        /// <param name="contentType">The Content-Type value obtained from the WebClient response.</param>
        /// <param name="myData">The raw byte array obtained from the WebClient download.</param>
        /// <returns>
        /// The detected encoding. UTF-8 is returned when no step yields a result
        /// or when any step throws.
        /// </returns>
        public static Encoding GetHtmlEncoding(string contentType, byte[] myData)
        {
            try
            {
                // View the bytes as ASCII text so the markup can be searched for a charset label.
                string asciiMarkup = Encoding.ASCII.GetString(myData);

                // The steps run in order and the first non-null result wins:
                //   1. charset label found in the markup itself,
                //   2. charset parameter of the Content-Type value,
                //   3. byte-level detection,
                //   4. UTF-8 as the last resort.
                return GetHtmlEncodingFormString(asciiMarkup)
                    ?? GetEncodingFromContentType(contentType)
                    ?? GetEncodingFromBytes(myData)
                    ?? Encoding.UTF8;
            }
            catch
            {
                // Any failure (including a null byte array) falls back to UTF-8.
                return Encoding.UTF8;
            }
        }

/// <summary>
/// Matches a <c>charset=value</c> declaration in HTML text. The value may be unquoted,
/// double-quoted or single-quoted, so both the http-equiv form
/// (<c>content="text/html; charset=utf-8"</c>) and the short form (<c>charset="utf-8"</c>)
/// are recognised. The look-behind skips attributes such as <c>accept-charset</c>.
/// Built once and reused, because constructing a compiled regex on every call is costly.
/// </summary>
private static readonly Regex CharsetDeclarationRegex = new Regex(
    @"(?<![\w-])charset\s*=\s*[""']?\s*(?<charset>[\w.:-]+)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);

/// <summary>
/// Step 1: finds the first charset declaration in the page markup and resolves it
/// to an <see cref="Encoding"/>.
/// </summary>
/// <param name="htmlContent">The page bytes decoded as ASCII text.</param>
/// <returns>
/// The encoding named by the first charset declaration, or <c>null</c> when the text is
/// null/empty, contains no declaration, or names an encoding that cannot be resolved.
/// </returns>
private static Encoding GetHtmlEncodingFormString(string htmlContent)
{
    if (string.IsNullOrEmpty(htmlContent))
    {
        return null;
    }

    // A single Match call replaces the previous IsMatch + Match pair.
    Match declaration = CharsetDeclarationRegex.Match(htmlContent);
    if (!declaration.Success)
    {
        return null;
    }

    string encodingName = declaration.Groups["charset"].Value;
    try
    {
        return Encoding.GetEncoding(encodingName);
    }
    catch (ArgumentException)
    {
        // Unknown or unsupported encoding name: let the caller try its next step.
        return null;
    }
    catch (NotSupportedException)
    {
        // Defensive: some platforms report unsupported code pages this way.
        return null;
    }
}

/// <summary>
/// Step 2: extracts the <c>charset</c> parameter from a Content-Type value
/// (for example <c>text/html; charset=utf-8</c>) and resolves it to an <see cref="Encoding"/>.
/// </summary>
/// <param name="contentType">The Content-Type value; may be null or empty.</param>
/// <returns>
/// The encoding named by the charset parameter, or <c>null</c> when the value is null/empty,
/// has no charset parameter, or names an encoding that cannot be resolved.
/// </returns>
private static Encoding GetEncodingFromContentType(string contentType)
{
    if (string.IsNullOrEmpty(contentType))
    {
        return null;
    }

    // Empty tokens are dropped so that optional whitespace around ';' and '='
    // (e.g. "charset = utf-8") does not produce an empty encoding name.
    string[] tokens = contentType.Split(
        new char[] { ';', '=', ' ', '\t' },
        StringSplitOptions.RemoveEmptyEntries);

    for (int i = 0; i < tokens.Length - 1; i++)
    {
        if (!string.Equals(tokens[i], "charset", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // The value may be written as a quoted string: charset="utf-8".
        string encodingName = tokens[i + 1].Trim('"', '\'');
        if (encodingName.Length == 0)
        {
            return null;
        }

        try
        {
            return Encoding.GetEncoding(encodingName);
        }
        catch (ArgumentException)
        {
            // Unknown or unsupported encoding name: let the caller try its next step.
            return null;
        }
        catch (NotSupportedException)
        {
            // Defensive: some platforms report unsupported code pages this way.
            return null;
        }
    }

    return null;
}

/// <summary>
/// Step 3: feeds the raw bytes to a <c>UniversalDetector</c> and resolves the charset name
/// it reports to an <see cref="Encoding"/>.
/// </summary>
/// <param name="myData">The raw page bytes.</param>
/// <returns>
/// The detected encoding, or <c>null</c> when the detector reports no charset
/// or anything throws along the way.
/// </returns>
private static Encoding GetEncodingFromBytes(byte[] myData)
{
    try
    {
        UniversalDetector detector = new UniversalDetector(null);
        detector.HandleData(myData, 0, myData.Length);
        detector.DataEnd();

        // NOTE(review): assumes GetDetectedCharset() is a plain getter, so reading it once
        // is equivalent to the two reads it replaces — confirm against the library.
        string detectedCharset = detector.GetDetectedCharset();
        if (string.IsNullOrEmpty(detectedCharset))
        {
            return null;
        }

        return Encoding.GetEncoding(detectedCharset);
    }
    catch
    {
        // Best effort: any failure means "not detected".
        return null;
    }
}

测试的代码:

View Code
/// <summary>
/// Demo: downloads a page, detects its encoding with <c>GetHtmlEncoding</c>,
/// decodes the bytes with that encoding and writes the resulting HTML to the console.
/// </summary>
public static void TestEncoding()
{
    // WebClient is IDisposable; the using block releases it even if the download throws.
    using (WebClient client = new WebClient())
    {
        // Download the raw bytes of the page.
        byte[] bytes = client.DownloadData("http://www.baidu.com");

        // Read the Content-Type response header; the indexer yields null when it is absent.
        string contentType = client.ResponseHeaders["Content-Type"];

        // Detect the encoding, then decode the bytes into the HTML string.
        Encoding encoding = GetHtmlEncoding(contentType, bytes);
        string htmlString = encoding.GetString(bytes);

        Console.WriteLine(htmlString);
    }
}

需要用到的第三方类库为:NUniversalCharDet

引用:using Mozilla.NUniversalCharDet;

这个dll从网上download一个就可以了!


原文地址:https://www.cnblogs.com/pmars/p/2302272.html