HtmlAgilityPack中文乱码问题

  1. 打开HtmlAgilityPack.1.4.0.Source工程   
  2. 找到HtmlWeb.cs文件打开修改下面方法中的一小段代码:  
  3.     private HttpStatusCode Get(Uri uri, string method, string path, HtmlDocument doc, IWebProxy proxy,  
  4.                                    ICredentials creds)函数中的下方的代码  
  5.             Encoding respenc = !string.IsNullOrEmpty(resp.ContentEncoding)  
  6.                                   ? Encoding.GetEncoding(resp.ContentEncoding)  
  7.                                   : null;  
  8.             /*修改成下面的即可*/  
  9.             /*王..修改 中文乱码问题*/  
  10.              //Encoding respenc = !string.IsNullOrEmpty(resp.ContentEncoding)  
  11.             //                       ? Encoding.GetEncoding(resp.ContentEncoding)  
  12.             //                       : null;  
  13.             System.Text.Encoding respenc;  
  14.   
  15.             if ((resp.ContentEncoding != null) && (resp.ContentEncoding.Length > 0))  
  16.             {  
  17.                 respenc = System.Text.Encoding.GetEncoding(resp.ContentEncoding);  
  18.             }  
  19.             else if ((resp.CharacterSet != null) && (resp.CharacterSet.Length > 0))//根据Content-Type中获取的charset  
  20.             {  
  21.                 if (string.Compare(resp.CharacterSet, "ISO-8859-1", true, System.Globalization.CultureInfo.InvariantCulture) == 0)  
  22.                     respenc = System.Text.Encoding.GetEncoding("GB2312");  
  23.                 else  
  24.                     respenc = System.Text.Encoding.GetEncoding(resp.CharacterSet);  
  25.             }  
  26.             else  
  27.             {  
  28.                 respenc = System.Text.Encoding.GetEncoding("GB2312");  
  29.             }  
/// <summary>
/// Downloads the HTML source of the given URL.
/// </summary>
/// <param name="url">URL of the page to fetch; it must resolve to an HTTP(S) request.</param>
/// <param name="encoding">
/// Encoding used to decode the response body. When <c>null</c>, the encoding is
/// detected by <c>DecodeData</c>.
/// </param>
/// <returns>
/// The decoded page text on HTTP 200; <see cref="string.Empty"/> when obtaining the
/// response fails; <c>null</c> when the status is not 200 OK or any other error occurs.
/// </returns>
public static string GetWebHtml(string url, Encoding encoding)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        HttpWebResponse response;

        try
        {
            response = (HttpWebResponse)request.GetResponse();
        }
        catch
        {
            // Best-effort contract kept from the original: any failure while
            // obtaining the response is reported as an empty string.
            return string.Empty;
        }

        // Dispose the response on every path so the underlying connection is
        // released; the original never closed it.
        using (response)
        {
            if (response.StatusCode != HttpStatusCode.OK)
            {
                return null;
            }

            using (Stream body = response.GetResponseStream())
            {
                // No encoding supplied: let DecodeData work it out.
                if (encoding == null)
                {
                    return DecodeData(body, response);
                }

                // Caller supplied the encoding explicitly.
                using (StreamReader reader = new StreamReader(body, encoding))
                {
                    return reader.ReadToEnd();
                }
            }
        }
    }
    catch
    {
        // Best-effort contract kept from the original: every other failure
        // (bad URL, read error, decode error) is reported as null.
        return null;
    }
}


        
/// <summary>
/// Reads the whole response body and decodes it to text. The encoding is taken from
/// the charset of the "content-type" response header; if the header has none, from
/// the first "charset=" occurrence in the body; otherwise gb2312 is used.
/// </summary>
/// <param name="responseStream">Response body stream; it is read to the end and closed.</param>
/// <param name="response">Response whose "content-type" header is inspected.</param>
/// <returns>The decoded body text.</returns>
private static string DecodeData(Stream responseStream, HttpWebResponse response)
{
    // Charset declared by the server, if any.
    string name = ExtractCharset(response.Headers["content-type"]);

    // Buffer the whole body so it can be scanned for a charset and then decoded.
    byte[] body;
    using (MemoryStream buffered = new MemoryStream())
    {
        byte[] chunk = new byte[0x400];
        int read;
        while ((read = responseStream.Read(chunk, 0, chunk.Length)) > 0)
        {
            buffered.Write(chunk, 0, read);
        }
        body = buffered.ToArray();
    }
    responseStream.Close();

    if (name == null)
    {
        // No charset in the header: read the body as ASCII and look for a
        // "charset=" declaration (e.g. in a <meta> tag).
        using (StreamReader sniffer = new StreamReader(new MemoryStream(body), Encoding.ASCII))
        {
            name = ExtractCharset(sniffer.ReadToEnd());
        }
    }

    using (StreamReader reader = new StreamReader(new MemoryStream(body), ResolveEncoding(name)))
    {
        return reader.ReadToEnd();
    }
}

/// <summary>
/// Returns the charset name that follows the first "charset=" in <paramref name="text"/>
/// (case-insensitive, optional quotes skipped), or <c>null</c> when none is present.
/// </summary>
private static string ExtractCharset(string text)
{
    const string Marker = "charset=";

    if (string.IsNullOrEmpty(text))
    {
        return null;
    }

    int start = text.IndexOf(Marker, StringComparison.OrdinalIgnoreCase);
    if (start < 0)
    {
        return null;
    }
    start += Marker.Length;

    // Skip an opening quote or blanks, so both charset=utf-8 and charset="utf-8" work.
    while (start < text.Length
           && (text[start] == '"' || text[start] == '\'' || text[start] == ' '))
    {
        start++;
    }

    // The name ends at the first character that cannot be part of a charset
    // label (closing quote, ';', '>', '/', whitespace, ...).
    int end = start;
    while (end < text.Length
           && (char.IsLetterOrDigit(text[end]) || "-_.:+".IndexOf(text[end]) >= 0))
    {
        end++;
    }

    return end > start ? text.Substring(start, end - start) : null;
}

/// <summary>
/// Maps a charset name to an <see cref="Encoding"/>, falling back to gb2312 when the
/// name is <c>null</c> or not recognised.
/// </summary>
private static Encoding ResolveEncoding(string name)
{
    // NOTE(review): assumes the "gb2312" code page is available on the target
    // runtime, as the original code did - confirm for the platform in use.
    const string Fallback = "gb2312";

    Encoding resolved = null;
    if (name != null)
    {
        // Kept from the original: "GBK" is looked up as "GB2312".
        if (string.Equals(name, "GBK", StringComparison.OrdinalIgnoreCase))
        {
            name = "GB2312";
        }

        try
        {
            resolved = Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            // Unknown charset name: fall through to the default below.
        }
        catch (NotSupportedException)
        {
            // Charset not supported here: fall through to the default below.
        }
    }

    return resolved ?? Encoding.GetEncoding(Fallback);
}
 string Html = XINLG.Labs.Utils.NetUtil.GetWebHtml("http://www.cnblogs.com/pick/"null);
            HtmlDocument doc 
= new HtmlDocument();
            doc.LoadHtml(Html); 
原文地址:https://www.cnblogs.com/jes_shaw/p/2247632.html