HtmlParser使用心得

最近因工作需要，要检查html中哪些标签不合理或者没有闭合。在网上找了很久都没有找到比较合适的工具，于是就试着用HtmlParser来实现。

获取html的代码：

   /// <summary>
   /// Downloads the resource at <paramref name="url"/> with an HTTP GET and returns its body as text.
   /// </summary>
   /// <param name="url">Absolute URL of the page to download.</param>
   /// <returns>
   /// The decoded response body. If anything fails (bad URL, network error, HTTP error status),
   /// the exception message is returned instead; no exception escapes this method.
   /// </returns>
   string GetContentFromUrl(string url)
        {
            string content = string.Empty;
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Method = "GET";
                request.AllowAutoRedirect = true;

                // The using blocks release the connection even when reading throws.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream stream = response.GetResponseStream())
                using (MemoryStream body = new MemoryStream())
                {
                    byte[] buffer = new byte[4096];
                    int readLength;
                    while ((readLength = stream.Read(buffer, 0, buffer.Length)) > 0)
                    {
                        body.Write(buffer, 0, readLength);
                    }

                    // Fall back to UTF-8 when the server sends no charset or one that
                    // .NET does not recognise, instead of failing the whole download.
                    Encoding coding = Encoding.UTF8;
                    string charset = response.CharacterSet;
                    if (!string.IsNullOrEmpty(charset))
                    {
                        try
                        {
                            // Some servers wrap the charset value in quotes.
                            coding = Encoding.GetEncoding(charset.Trim(' ', '"', '\''));
                        }
                        catch (ArgumentException)
                        {
                            // Unknown charset name: keep the UTF-8 fallback.
                        }
                    }

                    // Decode the complete payload in one call. Decoding each 4 KB chunk
                    // separately corrupts multi-byte characters that straddle a chunk boundary.
                    content = coding.GetString(body.GetBuffer(), 0, (int)body.Length);
                }
            }
            catch (Exception ex)
            {
                // Callers receive the error text in place of the content (original contract).
                content = ex.Message;
            }
            return content;
        }

解析html代码，以下代码在网上都能找到

  /// <summary>
  /// Mirrors an HTML node tree into a <see cref="TreeNode"/> hierarchy. A start tag becomes a
  /// new tree node labelled with the tag name followed by its attributes; any other node adds
  /// no tree node of its own, and its children are attached to <paramref name="treeNode"/>.
  /// </summary>
  /// <param name="treeNode">Tree node under which the converted nodes are added.</param>
  /// <param name="htmlNode">HTML node to convert; its children are processed recursively.</param>
  /// <param name="siblingRequired">
  /// When true, every following sibling of <paramref name="htmlNode"/> is converted as well
  /// and added under the same <paramref name="treeNode"/>.
  /// </param>
  private void RecursionHtmlNode(TreeNode treeNode, INode htmlNode, bool siblingRequired)
        {
            if (htmlNode == null || treeNode == null)
            {
                return;
            }

            TreeNode current = treeNode;

            // Current node: only start tags get a tree node of their own.
            ITag tag = htmlNode as ITag;
            if (tag != null && !tag.IsEndTag())
            {
                StringBuilder label = new StringBuilder(tag.TagName + " ");
                if (tag.Attributes != null && tag.Attributes.Count > 0)
                {
                    bool firstAttribute = true;
                    foreach (string key in tag.Attributes.Keys)
                    {
                        // NOTE(review): presumably the parser's internal entry that stores
                        // the tag name rather than a real attribute - confirm against the library.
                        if (key.Contains("<TAGNAME>"))
                        {
                            continue;
                        }

                        object value = tag.Attributes[key];
                        if (value == null)
                        {
                            continue;
                        }

                        // Separate attributes with a space; without it the label reads
                        // like id="a"class="b".
                        if (!firstAttribute)
                        {
                            label.Append(' ');
                        }
                        label.Append(key).Append("=\"").Append(value.ToString()).Append("\"");
                        firstAttribute = false;
                    }
                }

                current = new TreeNode(label.ToString());
                treeNode.Nodes.Add(current);
            }

            // Children: start at the first child and let that call walk its siblings.
            if (htmlNode.Children != null && htmlNode.Children.Count > 0)
            {
                this.RecursionHtmlNode(current, htmlNode.FirstChild, true);
            }

            // Siblings: walked iteratively here, so each sibling is called with
            // siblingRequired = false to avoid visiting the same nodes again.
            if (siblingRequired)
            {
                INode sibling = htmlNode.NextSibling;
                while (sibling != null)
                {
                    this.RecursionHtmlNode(treeNode, sibling, false);
                    sibling = sibling.NextSibling;
                }
            }
        }

  /// <summary>
  /// Parses the HTML text held in <c>txtContent</c> and rebuilds <c>treeView1</c> so that it
  /// shows the parsed nodes beneath a single "root" node. Does nothing when the text is empty.
  /// </summary>
  void ParseHTml()
        {
            string html = this.txtContent.Text;
            if (!string.IsNullOrEmpty(html))
            {
                // Parse first; the tree view is cleared only after parsing has returned.
                Parser htmlParser = new Parser(new Lexer(html));
                NodeList parsedNodes = htmlParser.Parse(null);

                this.treeView1.Nodes.Clear();
                TreeNode rootNode = this.treeView1.Nodes.Add("root");

                // Each top-level node is converted on its own, so sibling traversal is off.
                int index = 0;
                while (index < parsedNodes.Count)
                {
                    this.RecursionHtmlNode(rootNode, parsedNodes[index], false);
                    index++;
                }
            }
        }

运行结果如图：

网上有关HtmlParser的源代码下载比较麻烦，我把该部分代码也放在此次demo中了，下载地址：http://download.csdn.net/detail/dz45693/4374572