PHP采集器 基于yii

<?php

class CjRenwuResultController extends Controller
{
	/**
	 * @var string the layout used for the views rendered by this controller.
	 * It is set to '//layouts/lv_column2'.
	 */
	public $layout='//layouts/lv_column2';

	/**
	 * Lists the filters applied to this controller's actions.
	 * @return array action filters
	 */
	public function filters()
	{
		$filters=array();
		$filters[]='accessControl'; // apply the rules returned by accessRules()
		$filters[]='postOnly + delete'; // the delete action only accepts POST requests
		return $filters;
	}

	/**
	 * Specifies the access control rules.
	 * This method is used by the 'accessControl' filter.
	 *
	 * 'index' and 'view' are granted to authenticated users together with the
	 * other actions: actionCreate() and actionUpdate() redirect to 'view' after a
	 * successful save, and with 'view' missing from every allow rule that redirect
	 * fell through to the final deny rule.
	 * @return array access control rules
	 */
	public function accessRules()
	{
		return array(
			array('allow', // authenticated users may perform every action of this controller
				'actions'=>array('index','view','create','update','admin','delete'),
				'users'=>array('@'),
			),
			array('deny',  // deny everything else to everyone
				'users'=>array('*'),
			),
		);
	}

	/**
	 * Renders the 'view' template for one record.
	 * @param integer $id the ID of the model to be displayed
	 */
	public function actionView($id)
	{
		$record=$this->loadModel($id);
		$viewData=array('model'=>$record);
		$this->render('view',$viewData);
	}

	/**
	 * Shows the creation form and stores a new record when the form is posted.
	 * After a successful save the browser is redirected to the 'view' page;
	 * otherwise the 'create' form is rendered with the current model.
	 */
	public function actionCreate()
	{
		$record=new CjRenwuResult;

		// AJAX validation is available through performAjaxValidation() but is not enabled here.

		$submitted=isset($_POST['CjRenwuResult']);
		if($submitted)
		{
			$record->attributes=$_POST['CjRenwuResult'];
			$saved=$record->save();
			if($saved)
				$this->redirect(array('view','id'=>$record->id));
		}

		$viewData=array('model'=>$record);
		$this->render('create',$viewData);
	}

	/**
	 * Shows the edit form for an existing record and stores posted changes.
	 * After a successful save the browser is redirected to the 'view' page;
	 * otherwise the 'update' form is rendered with the current model.
	 * @param integer $id the ID of the model to be updated
	 */
	public function actionUpdate($id)
	{
		$record=$this->loadModel($id);

		// AJAX validation is available through performAjaxValidation() but is not enabled here.

		$submitted=isset($_POST['CjRenwuResult']);
		if($submitted)
		{
			$record->attributes=$_POST['CjRenwuResult'];
			$saved=$record->save();
			if($saved)
				$this->redirect(array('view','id'=>$record->id));
		}

		$viewData=array('model'=>$record);
		$this->render('update',$viewData);
	}

	/**
	 * Deletes one record and, for non-AJAX requests, redirects afterwards.
	 * The redirect target is the posted 'returnUrl' when present, else the 'admin' page.
	 * @param integer $id the ID of the model to be deleted
	 */
	public function actionDelete($id)
	{
		$record=$this->loadModel($id);
		$record->delete();

		// Deletion from the admin grid view arrives as an AJAX request flagged with an
		// 'ajax' GET parameter; in that case the browser must not be redirected.
		if(isset($_GET['ajax']))
			return;

		if(isset($_POST['returnUrl']))
			$target=$_POST['returnUrl'];
		else
			$target=array('admin');

		$this->redirect($target);
	}

	/**
	 * Renders the 'index' view with a data provider over CjRenwuResult.
	 */
	public function actionIndex()
	{
		$viewData=array(
			'dataProvider'=>new CActiveDataProvider('CjRenwuResult'),
		);
		$this->render('index',$viewData);
	}

	/**
	 * Renders the 'admin' view with a search-scenario model carrying the
	 * filter values passed in the 'CjRenwuResult' GET parameter, if any.
	 */
	public function actionAdmin()
	{
		$filter=new CjRenwuResult('search');
		// start from a blank model so default values do not act as filters
		$filter->unsetAttributes();

		$hasFilter=isset($_GET['CjRenwuResult']);
		if($hasFilter)
			$filter->attributes=$_GET['CjRenwuResult'];

		$viewData=array('model'=>$filter);
		$this->render('admin',$viewData);
	}

	/**
	 * Looks up a CjRenwuResult by primary key.
	 * @param integer $id the ID of the model to be loaded
	 * @return CjRenwuResult the loaded model
	 * @throws CHttpException with status 404 when no record has that key
	 */
	public function loadModel($id)
	{
		$found=CjRenwuResult::model()->findByPk($id);
		if($found!==null)
			return $found;

		throw new CHttpException(404,'The requested page does not exist.');
	}

	/**
	 * Answers an AJAX validation request for the 'cj-renwu-result-form' form:
	 * echoes the validation result and ends the application. Any other request
	 * passes through untouched.
	 * @param CjRenwuResult $model the model to be validated
	 */
	protected function performAjaxValidation($model)
	{
		if(!isset($_POST['ajax']))
			return;
		if($_POST['ajax']!=='cj-renwu-result-form')
			return;

		echo CActiveForm::validate($model);
		Yii::app()->end();
	}
}

控制器分4个文件

CjRenwuController.php

<?php
class CjRenwuController extends Controller
{
     
    public $layout='//layouts/lv_column2';
 
    /**
     * Lists the filters applied to this controller's actions.
     * @return array action filters
     */
    public function filters()
    {
        $filters=array();
        $filters[]='accessControl'; // perform access control for the actions
        $filters[]='postOnly + delete'; // the delete action only accepts POST requests
        return $filters;
    }
 
    /**
     * Specifies the access control rules, listed in the order they are returned.
     * @return array access control rules
     */
    public function accessRules()
    {
        // every user, logged in or not, may perform 'index' and 'view'
        $publicRule=array('allow',
            'actions'=>array('index','view'),
            'users'=>array('*'),
        );
        // authenticated users may also create, update, manage and use the source-fetch test
        $memberRule=array('allow',
            'actions'=>array('create','update','admin','getyuanma'),
            'users'=>array('@'),
        );
        // only the user named 'admin' may delete
        $adminRule=array('allow',
            'actions'=>array('delete'),
            'users'=>array('admin'),
        );
        // everything else is denied to everyone
        $denyRule=array('deny',
            'users'=>array('*'),
        );

        return array($publicRule,$memberRule,$adminRule,$denyRule);
    }
     
    /**
     * Source-fetch test page: downloads the posted URL and passes its content to
     * the 'getyuanma' view as 'neirong'.
     *
     * Posted fields:
     *  - url:    address to download; only http:// and https:// addresses are fetched
     *  - bianma: source encoding; 'GB2312' (any letter case) is converted to UTF-8
     *
     * The view receives an empty string when no URL was posted, the URL is not an
     * http(s) address, or the download or conversion fails.
     */
    public function actionGetyuanma()
    {
        $this->layout='//layouts/lv_getyuanma';

        // Kept from the original: silence PHP notices/warnings and lift the time limit
        // for this request, since remote downloads may be slow or fail.
        error_reporting(0);
        set_time_limit(0);

        $neirong='';

        // isset($_POST) is always true, so the individual fields are checked instead.
        $url=(isset($_POST['url']) && is_string($_POST['url'])) ? trim($_POST['url']) : '';
        $bianma=(isset($_POST['bianma']) && is_string($_POST['bianma'])) ? $_POST['bianma'] : '';

        // file_get_contents() also opens local paths and other stream wrappers, so the
        // user-supplied address is limited to web URLs before it is fetched.
        if($url!=='' && preg_match('#^https?://#i',$url))
        {
            $raw=file_get_contents($url);
            if($raw!==false)
            {
                if(strcasecmp($bianma,'GB2312')===0)
                {
                    $converted=iconv("GB2312","UTF-8//IGNORE",$raw);
                    if($converted!==false)
                        $neirong=$converted;
                }
                else
                {
                    $neirong=$raw;
                }
            }
        }

        $this->render('getyuanma',array(
            'neirong'=>$neirong,
        ));
    }


    /**
     * Renders the 'view' template for one task.
     * @param integer $id the ID of the model to be displayed
     */
    public function actionView($id)
    {
        $task=$this->loadModel($id);
        $viewData=array('model'=>$task);
        $this->render('view',$viewData);
    }
 

    /**
     * Shows the creation form and stores a new task when the form is posted.
     * After a successful save the browser is redirected to the 'admin' page;
     * otherwise the 'create' form is rendered with the current model.
     */
    public function actionCreate()
    {
        $task=new CjRenwu;

        $submitted=isset($_POST['CjRenwu']);
        if($submitted)
        {
            $task->attributes=$_POST['CjRenwu'];
            $saved=$task->save();
            if($saved)
                $this->redirect(array('admin'));
        }

        $viewData=array('model'=>$task);
        $this->render('create',$viewData);
    }

  
 
    /**
     * Updates a task, or dry-runs its list-page rules without saving anything.
     *
     * Normal mode: the posted 'CjRenwu' attributes are saved and the browser is
     * redirected back to this update page; otherwise the 'update' form is rendered.
     *
     * Test mode (the form posts ceshi=1): the rule fields in $_POST['CjRenwu'] are
     * applied to every list page and the content-page URLs they extract are echoed.
     * Fields read in test mode:
     *  - url:                  list-page URLs, one per line
     *  - url2:                 paged URL template containing the placeholder {xxq}
     *  - url2_start, url2_end: first and last page number substituted for {xxq}
     *  - start, end:           regex fragments delimiting the region that holds the links
     *  - tiquguize:            regex matching one content link inside that region
     *  - dijige:               index of the capture group that holds the link URL
     *  - urlqianzhui:          prefix prepended to every extracted URL
     *  - bianma:               'gb2312' converts the downloaded page to UTF-8
     *
     * @param integer $id the ID of the task to be updated
     */
    public function actionUpdate($id)
    {
        $model=$this->loadModel($id);

        $testMode=isset($_POST["ceshi"]) && $_POST["ceshi"]==1;
        if(!$testMode)
        {
            if(isset($_POST['CjRenwu']))
            {
                $model->attributes=$_POST['CjRenwu'];
                if($model->save())
                    $this->redirect(array('update','id'=>$model->id));
            }

            $this->render('update',array(
                'model'=>$model,
            ));
            return;
        }

        // Read every rule field defensively: a missing or non-scalar field becomes ''.
        $post=(isset($_POST['CjRenwu']) && is_array($_POST['CjRenwu'])) ? $_POST['CjRenwu'] : array();
        $fieldNames=array('url','url2','url2_start','url2_end','start','end','tiquguize','dijige','urlqianzhui','bianma');
        $rule=array();
        foreach($fieldNames as $fieldName)
            $rule[$fieldName]=(isset($post[$fieldName]) && is_scalar($post[$fieldName])) ? (string)$post[$fieldName] : '';

        // Explicit list pages. Textarea input separates lines with "\r\n", so split on
        // any line ending and trim; blank lines are dropped.
        $listPages=array();
        foreach(preg_split('/\r\n|\r|\n/',$rule['url']) as $line)
        {
            $line=trim($line);
            if($line!=='')
                $listPages[]=$line;
        }

        // Paged list pages, generated only when a template and a numeric range are given.
        if($rule['url2']!=='' && is_numeric($rule['url2_start']) && is_numeric($rule['url2_end']))
        {
            $lastPage=(int)$rule['url2_end'];
            for($page=(int)$rule['url2_start'];$page<=$lastPage;$page++)
                $listPages[]=str_replace('{xxq}',$page,$rule['url2']);
        }

        echo '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';

        // NOTE(review): the list-page URLs are fetched as given, with no scheme restriction.
        foreach($listPages as $listPage)
        {
            echo "<br>";
            echo "<font color=red>正在分析列表页:</font>".htmlspecialchars($listPage,ENT_QUOTES,'UTF-8');
            echo "<br>";

            $contents=file_get_contents($listPage);
            if($contents===false)
                continue; // download failed: nothing to analyse for this page
            if($rule['bianma']=='gb2312')
                $contents=iconv("gb2312","utf-8//IGNORE",$contents);

            // Region that contains the links; only the first match is used.
            if(!preg_match_all('/'.$rule['start'].'(.*?)'.$rule['end'].'/is',$contents,$region))
                continue;

            // Content links inside that region.
            if(!preg_match_all('/'.$rule['tiquguize'].'/is',$region[0][0],$links))
                continue;

            // The configured capture group must exist in the extraction rule.
            if(!isset($links[$rule['dijige']]))
                continue;

            foreach($links[$rule['dijige']] as $link)
            {
                echo "<br>";
                // extracted text comes from a remote page, so escape it before output
                echo "得到的内容页地址是:".htmlspecialchars($rule['urlqianzhui'].$link,ENT_QUOTES,'UTF-8');
            }
        }
    }



    /**
     * Deletes one task and, for non-AJAX requests, redirects afterwards.
     * The redirect target is the posted 'returnUrl' when present, else the 'admin' page.
     * @param integer $id the ID of the model to be deleted
     */
    public function actionDelete($id)
    {
        $task=$this->loadModel($id);
        $task->delete();

        // Deletion from the admin grid view arrives as an AJAX request flagged with an
        // 'ajax' GET parameter; in that case the browser must not be redirected.
        if(isset($_GET['ajax']))
            return;

        if(isset($_POST['returnUrl']))
            $target=$_POST['returnUrl'];
        else
            $target=array('admin');

        $this->redirect($target);
    }

    /**
     * Renders the 'index' view with a data provider over CjRenwu.
     */
    public function actionIndex()
    {
        $viewData=array(
            'dataProvider'=>new CActiveDataProvider('CjRenwu'),
        );
        $this->render('index',$viewData);
    }

    /**
     * Renders the 'admin' view with a search-scenario model carrying the
     * filter values passed in the 'CjRenwu' GET parameter, if any.
     */
    public function actionAdmin()
    {
        $filter=new CjRenwu('search');
        // start from a blank model so default values do not act as filters
        $filter->unsetAttributes();

        $hasFilter=isset($_GET['CjRenwu']);
        if($hasFilter)
            $filter->attributes=$_GET['CjRenwu'];

        $viewData=array('model'=>$filter);
        $this->render('admin',$viewData);
    }

    /**
     * Looks up a CjRenwu by primary key.
     * @param integer $id the ID of the model to be loaded
     * @return CjRenwu the loaded model
     * @throws CHttpException with status 404 when no record has that key
     */
    public function loadModel($id)
    {
        $found=CjRenwu::model()->findByPk($id);
        if($found!==null)
            return $found;

        throw new CHttpException(404,'The requested page does not exist.');
    }

    /**
     * Answers an AJAX validation request for the 'cj-renwu-form' form: echoes the
     * validation result and ends the application. Any other request passes
     * through untouched.
     * @param CjRenwu $model the model to be validated
     */
    protected function performAjaxValidation($model)
    {
        if(!isset($_POST['ajax']))
            return;
        if($_POST['ajax']!=='cj-renwu-form')
            return;

        echo CActiveForm::validate($model);
        Yii::app()->end();
    }
}

CjRenwuResultController.php

核心文件CjCenterController.php

<?php
class CjCenterController extends Controller
{
    public $layout='//layouts/lv_column2';    

    /**
     * Lists the filters applied to this controller's actions.
     * @return array action filters
     */
    public function filters()
    {
        $filters=array();
        $filters[]='accessControl'; // perform access control for the actions
        $filters[]='postOnly + delete'; // a 'delete' action would only accept POST requests
        return $filters;
    }

    /**
     * Specifies the access control rules: authenticated users may perform the
     * listed actions, everything else is denied.
     * @return array access control rules
     */
    public function accessRules()
    {
        $memberActions=array('index','geturl','tiquruku','deleteurl','deleteku','ceshi');

        $rules=array();
        $rules[]=array('allow',
            'actions'=>$memberActions,
            'users'=>array('@'),
        );
        $rules[]=array('deny',  // deny all users
            'users'=>array('*'),
        );
        return $rules;
    }

    /**
     * Renders the 'index' view with every CjRenwu task, ordered by id descending.
     */
    public function actionIndex()
    {
        $newestFirst=new CDbCriteria();
        $newestFirst->order='id desc';

        $finder=new CjRenwu;
        $viewData=array(
            'data_renwu'=>$finder->findAll($newestFirst),
        );
        $this->render('index',$viewData);
    }

  

    


    /**
     * Collects the content-page URLs of a task and stores each one as a CjRenwuUrl row.
     *
     * The list pages are the lines of the task's 'url' attribute plus, when 'url2' is
     * set, the pages generated by substituting url2_start..url2_end for {xxq} in it.
     * On every list page the region between the 'start' and 'end' patterns is located,
     * the 'tiquguize' pattern is matched inside it, and capture group 'dijige' of each
     * match, prefixed with 'urlqianzhui', is saved. List pages that cannot be
     * downloaded or that do not match are skipped. Finally the browser is redirected
     * to CjCenter/index.
     *
     * @param integer $renwu_id the ID of the CjRenwu task to run
     * @throws CHttpException 404 when the task does not exist, 400 when the task
     * yields no list-page URL at all
     */
    public function actionGeturl($renwu_id)
    {
        $model=$this->loadModel_renwu($renwu_id);

        $start=$model->start;             // start marker of the link region
        $end=$model->end;                 // end marker of the link region
        $tiquguize=$model->tiquguize;     // pattern matching one content link
        $dijige=$model->dijige;           // which capture group holds the link URL
        $urlqianzhui=$model->urlqianzhui; // URL prefix
        $bianma=$model->bianma;           // page encoding

        // Explicit list pages: split on any line ending (stored textarea input uses
        // "\r\n"), trim, and drop blank lines so a trailing newline cannot abort the run.
        $listPages=array();
        foreach(preg_split('/\r\n|\r|\n/',(string)$model->url) as $line)
        {
            $line=trim($line);
            if($line!=='')
                $listPages[]=$line;
        }

        // Paged list pages, generated only when a template and a numeric range are given.
        $url2=(string)$model->url2;
        if($url2!=='' && is_numeric($model->url2_start) && is_numeric($model->url2_end))
        {
            $lastPage=(int)$model->url2_end;
            for($page=(int)$model->url2_start;$page<=$lastPage;$page++)
                $listPages[]=str_replace('{xxq}',$page,$url2);
        }

        if(count($listPages)===0)
            throw new CHttpException(400,'采集地址为空!');

        // NOTE(review): the list-page URLs are fetched as stored, with no scheme restriction.
        foreach($listPages as $url_now)
        {
            $contents=file_get_contents($url_now);
            if($contents===false)
                continue; // download failed: skip this list page
            if($bianma=='gb2312')
                $contents=iconv("gb2312","utf-8//IGNORE",$contents);

            // Region that contains the links; only the first match is used.
            if(!preg_match_all('/'.$start.'(.*?)'.$end.'/is',$contents,$region))
                continue;

            // Content links inside that region.
            if(!preg_match_all('/'.$tiquguize.'/is',$region[0][0],$links))
                continue;

            // The configured capture group must exist in the extraction rule.
            if(!isset($links[$dijige]))
                continue;

            foreach($links[$dijige] as $link)
            {
                $now=time(); // one timestamp so both time columns agree

                $model2=new CjRenwuUrl;
                $model2->url=$urlqianzhui.$link;
                $model2->renwu_id=$renwu_id;
                $model2->bianma=$bianma;
                $model2->inputtime=$now;
                $model2->inputtime2=date('H:i Y-m-d',$now);
                $model2->save();
            }
        }

        $this->redirect(array('CjCenter/index'));
    }

    /**
     * Deletes every CjRenwuUrl row of a task, then redirects to CjCenter/index.
     *
     * The task ID comes from the request, so it is passed as a bound parameter
     * instead of being concatenated into the SQL condition.
     *
     * @param integer $renwu_id the ID of the task whose collected URLs are removed
     */
    public function actionDeleteurl($renwu_id)
    {
        $criteria=new CDbCriteria();
        $criteria->condition='t.renwu_id=:renwu_id';
        $criteria->params=array(':renwu_id'=>$renwu_id);
        $criteria->order='id desc';

        // The rows returned by findAll() are already loaded models, so each one is
        // deleted directly rather than being fetched again by primary key.
        foreach(CjRenwuUrl::model()->findAll($criteria) as $row)
            $row->delete();

        $this->redirect(array('CjCenter/index'));
    }

    /**
     * Extracts the tagged fields from every not-yet-collected URL of a task and
     * stores one CjRenwuResult row per URL.
     *
     * For each pending CjRenwuUrl row (caileme=0) the page is downloaded once. For each
     * CjRenwuBiaoqian tag of the task, the region between bq_start and bq_end is
     * located, bq_guize is matched inside it, and the first match of capture group
     * bq_dijige is taken (empty when nothing matches). The replacements in bq_tihuan
     * (one "search|replace" pair per line) are then applied. The fields are stored as
     * "name:::value" joined with "^^^", and the URL row is marked collected (caileme=1).
     * A URL whose page cannot be downloaded is skipped and stays pending. Finally the
     * browser is redirected to CjCenter/index, also when there was nothing to collect.
     *
     * @param integer $renwu_id the ID of the CjRenwu task to process
     * @throws CHttpException 404 when the task does not exist
     */
    public function actionTiquruku($renwu_id)
    {
        $model=$this->loadModel_renwu($renwu_id);
        $bianma=$model->bianma;

        // Tags of the task. The task ID comes from the request, so it is bound as a
        // parameter instead of being concatenated into the SQL condition.
        $CjRenwuBiaoqian_model=new CjRenwuBiaoqian;
        $criteria=new CDbCriteria();
        $criteria->condition='t.renwu_id=:renwu_id';
        $criteria->params=array(':renwu_id'=>$renwu_id);
        $criteria->order='id desc';
        $tags=$CjRenwuBiaoqian_model->findAll($criteria);

        // URLs of the task that have not been collected yet.
        $CjRenwuUrl_model=new CjRenwuUrl;
        $criteria=new CDbCriteria();
        $criteria->condition='t.renwu_id=:renwu_id and t.caileme=0';
        $criteria->params=array(':renwu_id'=>$renwu_id);
        $criteria->order='id desc';
        $pendingUrls=$CjRenwuUrl_model->findAll($criteria);

        foreach($pendingUrls as $urlRow)
        {
            $url_now=$urlRow->url;

            // Download once per URL; the page does not change between tags.
            $contents=file_get_contents($url_now);
            if($contents===false)
                continue; // leave the URL pending so a later run can retry it
            if($bianma=='gb2312')
                $contents=iconv("gb2312","utf-8//IGNORE",$contents);

            $fields=array(); // "name:::value" entries extracted from this URL
            foreach($tags as $tag)
            {
                $str_r='';
                if(preg_match_all('/'.$tag->bq_start.'(.*?)'.$tag->bq_end.'/is',$contents,$region)
                    && preg_match_all('/'.$tag->bq_guize.'/is',$region[0][0],$hits)
                    && isset($hits[$tag->bq_dijige][0]))
                {
                    $str_r=$hits[$tag->bq_dijige][0];
                }

                if($tag->bq_tihuan!='')
                {
                    // Split on any line ending so "\r" never ends up in a replacement.
                    foreach(preg_split('/\r\n|\r|\n/',$tag->bq_tihuan) as $pairLine)
                    {
                        $pair=explode('|',$pairLine);
                        // a line without '|' removes the search text
                        $replacement=isset($pair[1]) ? $pair[1] : '';
                        $str_r=str_replace($pair[0],$replacement,$str_r);
                    }
                }

                $fields[]=$tag->bq_name.':::'.trim($str_r);
            }

            $now=time(); // one timestamp so both time columns agree

            // Store the extracted fields.
            $model2=new CjRenwuResult;
            $model2->content=implode('^^^',$fields);
            $model2->renwu_id=$renwu_id;
            $model2->url=$url_now;
            $model2->inputtime=$now;
            $model2->inputtime2=date('H:i Y-m-d',$now);
            $model2->save();

            // Mark the URL as collected.
            $urlRow->caileme=1;
            $urlRow->save();
        }

        $this->redirect(array('CjCenter/index'));
    }

    /**
     * Deletes every CjRenwuResult row of a task, then redirects to CjCenter/index.
     *
     * The task ID comes from the request, so it is passed as a bound parameter
     * instead of being concatenated into the SQL condition.
     *
     * @param integer $renwu_id the ID of the task whose stored results are removed
     */
    public function actionDeleteku($renwu_id)
    {
        $criteria=new CDbCriteria();
        $criteria->condition='t.renwu_id=:renwu_id';
        $criteria->params=array(':renwu_id'=>$renwu_id);
        $criteria->order='id desc';

        // The rows returned by findAll() are already loaded models, so each one is
        // deleted directly rather than being fetched again by primary key.
        foreach(CjRenwuResult::model()->findAll($criteria) as $row)
            $row->delete();

        $this->redirect(array('CjCenter/index'));
    }

 
 
    /**
     * Looks up a CjRenwu task by primary key.
     * @param integer $id the ID of the task to be loaded
     * @return CjRenwu the loaded model
     * @throws CHttpException with status 404 when no record has that key
     */
    public function loadModel_renwu($id)
    {
        $found=CjRenwu::model()->findByPk($id);
        if($found!==null)
            return $found;

        throw new CHttpException(404,'The requested page does not exist.');
    }


    /**
     * Looks up a CjRenwuUrl row by primary key.
     * @param integer $id the ID of the row to be loaded
     * @return CjRenwuUrl the loaded model
     * @throws CHttpException with status 404 when no record has that key
     */
    public function loadModel_renwuurl($id)
    {
        $found=CjRenwuUrl::model()->findByPk($id);
        if($found!==null)
            return $found;

        throw new CHttpException(404,'The requested page does not exist.');
    }

    /**
     * Looks up a CjRenwuResult row by primary key.
     * @param integer $id the ID of the row to be loaded
     * @return CjRenwuResult the loaded model
     * @throws CHttpException with status 404 when no record has that key
     */
    public function loadModel_renwuresult($id)
    {
        $found=CjRenwuResult::model()->findByPk($id);
        if($found!==null)
            return $found;

        throw new CHttpException(404,'The requested page does not exist.');
    }

    /**
     * Renders the 'ceshi' view; no data is passed to it.
     */
    public function actionCeshi()
    {
        $viewName='ceshi';
        $this->render($viewName);
    }

    /**
     * Debug action: splits a hard-coded "<from>到<to>" route string and echoes the
     * province/city names, then the province/city IDs, that XxqArea resolves for the
     * departure and destination sides.
     */
    public function actionCeshi2()
    {
        $bq_content='山东全省到青海全省';
        $arr_city=explode('到',$bq_content);

        $from=$this->ceshi2ResolveArea($arr_city[0]);                        // departure
        $to=$this->ceshi2ResolveArea(isset($arr_city[1]) ? $arr_city[1] : ''); // destination

        echo $from['sheng_name'].'_'.$from['shi_name'].'>>'.$to['sheng_name'].'_'.$to['shi_name'];

        echo '<br>';

        echo $from['sheng_id'].'_'.$from['shi_id'].'>>'.$to['sheng_id'].'_'.$to['shi_id'];
    }

    /**
     * Resolves one place name to its province and city through XxqArea.
     *
     * A name containing '全省' (whole province) is looked up as a province and its
     * city fields stay empty. Any other name is looked up as a city, and its province
     * is then found through the code formed by the first two characters of the city's
     * 'daima' followed by '0000'. Fields that cannot be resolved stay empty strings.
     *
     * @param string $place the place name taken from the route string
     * @return array with keys sheng_name, sheng_id (province) and shi_name, shi_id (city)
     */
    private function ceshi2ResolveArea($place)
    {
        $resolved=array('sheng_name'=>'','sheng_id'=>'','shi_name'=>'','shi_id'=>'');

        $wholeProvince=(strstr($place,'全省')!==false);
        if($wholeProvince)
            $place=str_replace('全省','',$place);

        // NOTE(review): the city lookup uses the same "daima like '%00%'" filter as the
        // province lookup, exactly as the original did — confirm this is intended.
        $XxqArea_model=new XxqArea;
        $criteria=new CDbCriteria;
        $criteria->condition="t.areaname like :areaname and t.daima like '%00%'";
        $criteria->params=array(':areaname'=>'%'.$place.'%');
        $area=$XxqArea_model->find($criteria);
        if($area===null)
            return $resolved; // no matching area: leave every field empty

        if($wholeProvince)
        {
            $resolved['sheng_name']=$area->areaname;
            $resolved['sheng_id']=$area->areaid;
            return $resolved;
        }

        $resolved['shi_name']=$area->areaname;
        $resolved['shi_id']=$area->areaid;

        // Province code: first two characters of the city code followed by '0000'.
        $sheng_daima=substr($area->daima,0,2).'0000';

        $XxqArea_model=new XxqArea;
        $criteria=new CDbCriteria;
        $criteria->condition='t.daima=:daima';
        $criteria->params=array(':daima'=>$sheng_daima);
        $province=$XxqArea_model->find($criteria);
        if($province!==null)
        {
            $resolved['sheng_name']=$province->areaname;
            $resolved['sheng_id']=$province->areaid;
        }

        return $resolved;
    }
    


}

www.zgline.com

原文地址:https://www.cnblogs.com/zhaoguoliang/p/3415613.html