ZH奶酪:PHP 使用DOMDocument抓取网页

原文链接:http://blog.csdn.net/xyzhaopeng/article/details/6626340

从一个HTML页面的一个表格中提取数据并且将这个数据整理出来加入到MySQL数据库中。

假设目标HTML中我感兴趣的Table有3列,分别是ID、Name、Content(内容)。

index.php

<pre class="php" name="code"><?php  
  
/* 
 * To change this template, choose Tools | Templates 
 * and open the template in the editor. 
 */  
    // Entry script: load the target page, find the table(s) of interest and
    // hand each one to ContentManager for parsing and persistence.
    $urlTarget = "http://www.xxxx.com/targethtmlpage.html";

    require_once('ContentManager.php');

    // Real-world HTML is rarely well-formed. Collect libxml parse errors
    // internally instead of emitting one PHP warning per malformed tag,
    // and restore the previous setting once loading is done.
    $previousLibxmlSetting = libxml_use_internal_errors(true);

    // Build the DOM object by parsing the HTML page.
    $htmDoc = new DOMDocument;
    $loaded = $htmDoc->loadHTMLFile($urlTarget);

    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlSetting);

    if ($loaded === false)
    {
        // Report the failure explicitly rather than silently walking an
        // empty document.
        trigger_error("Unable to load HTML from " . $urlTarget, E_USER_WARNING);
    }
    else
    {
        $htmDoc->normalizeDocument();

        // Visit every <table> element in the document.
        foreach ($htmDoc->getElementsByTagName('table') as $table)
        {
            // Only tables whose class attribute is exactly the target class
            // are processed.
            if ($table->getAttribute('class') === 'target_table_class')
            {
                $contentMgr = new ContentManager();
                $contentMgr->ParseFromDOMElement($table);

                // At this point $contentMgr has finished parsing; the data
                // can now be used as needed, e.g. written to MySQL.
                $contentMgr->SerializeToDB();
            }
        }
    }
?>  
</pre><br> 

ContentManager.php

    <?php  
      
    /* 
     * To change this template, choose Tools | Templates 
     * and open the template in the editor. 
     */  
      
    /** 
     * Description of ContentParser 
     * 
     * @author xxxxx 
     */  
    require_once('ContentInfo.php');  
    /**
     * Collects the ContentInfo records parsed from the rows of one HTML table.
     */
    class ContentManager {
        /** @var ArrayObject ContentInfo objects, one appended per <tr> visited. */
        public $ContentList;

        public function __construct() {
            $this->ContentList = new ArrayObject();
        }

        /**
         * Parses every <tr> found under the given table element into a
         * ContentInfo and appends it to $ContentList, then echoes the total
         * number of entries collected so far.
         *
         * Every <tr> yields an entry, so a row that contains no <td> cells
         * is still appended.
         *
         * @param DOMElement $table The <table> element to read rows from.
         */
        public function ParseFromDOMElement(DOMElement $table)
        {
            // Read rows from the $table parameter (the original referenced an
            // undefined $fundsTable, which fails with a call on null).
            $rows_list = $table->getElementsByTagName('tr');

            foreach ($rows_list as $row)
            {
                $contentInfo = new ContentInfo();
                $contentInfo->ParseFromDOMElement($row);
                $this->ContentList->append($contentInfo);
            }

            // Debug output: how many contents have been parsed. Counts the
            // real $ContentList (the original read an undefined property).
            echo $this->ContentList->count();
        }

        /**
         * Writes the collected contents to the database.
         * The implementation is omitted in the original article.
         */
        public function SerializeToDB()
        {
            // Database write: code omitted.
        }
    }
      
    ?>  

ContentInfo.php

    <?php  
      
    /* 
     * To change this template, choose Tools | Templates 
     * and open the template in the editor. 
     */  
      
    /** 
     * Description of ContentInfo 
     * 
     * @author xxxxx 
     */  
    /**
     * One table row: the text of its first three <td> cells.
     */
    class ContentInfo {
        /** @var string|null Text of the first <td> cell. */
        public $ID;
        /** @var string|null Text of the second <td> cell. */
        public $Name;
        /** @var string|null Text of the third <td> cell. */
        public $Content;

        /**
         * Fills ID, Name and Content from the first, second and third <td>
         * elements found under the given row. Cells beyond the third are
         * ignored; fields with no matching cell are left untouched.
         *
         * @param DOMElement $row The <tr> element to read cells from.
         */
        public function ParseFromDOMElement(DOMElement $row)
        {
            // The original also read $row->length here; DOMElement has no such
            // property (it raised an undefined-property notice) and the value
            // was never used, so it has been removed.
            $cells_list = $row->getElementsByTagName('td');

            $curCellIdx = 0;
            foreach ($cells_list as $cell)
            {
                // Map the cell position to the corresponding field.
                switch ($curCellIdx++)
                {
                    case 0:
                        $this->ID = $cell->nodeValue;
                        break;
                    case 1:
                        $this->Name = $cell->nodeValue;
                        break;
                    case 2:
                        $this->Content = $cell->nodeValue;
                        break;
                }
            }
        }
    }
      
    ?>  
原文地址:https://www.cnblogs.com/CheeseZH/p/4858293.html