PHP 抓取网页 body 内容,并过滤网页标签

  使用 PHP 只抓取网页中的文字内容,并过滤掉其中的 HTML 标签。说干就干,开始!

  

<?php
 function curl_request ( $url , $post = '' , $cookie = '' ,  $returnCookie = 0 ) {
	 $ua = $ua==''?$_SERVER ['HTTP_USER_AGENT']:'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)' ;
			$curl  =  curl_init ( ) ;
			curl_setopt ( $curl , CURLOPT_URL ,  $url ) ;
			curl_setopt ( $curl , CURLOPT_USERAGENT , $ua ) ;
			curl_setopt ( $curl , CURLOPT_FOLLOWLOCATION ,  1 ) ;
			curl_setopt ( $curl , CURLOPT_AUTOREFERER ,  1 ) ;
			curl_setopt ( $curl , CURLOPT_REFERER ,  "https://www.baidu.com" ) ;
			if ( $post )  {
				 curl_setopt ( $curl , CURLOPT_POST ,  1 ) ;
				 curl_setopt ( $curl , CURLOPT_POSTFIELDS ,  http_build_query ( $post ) ) ;
			}
			if ( $cookie )  {
				 curl_setopt ( $curl , CURLOPT_COOKIE ,  $cookie ) ;
			}
			curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
			curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
			curl_setopt ( $curl , CURLOPT_HEADER ,  $returnCookie ) ;
			curl_setopt ( $curl , CURLOPT_TIMEOUT ,  10 ) ;
			curl_setopt ( $curl , CURLOPT_RETURNTRANSFER ,  1 ) ;
			$data  =  curl_exec ( $curl ) ;
			if  ( curl_errno ( $curl ) )  {
				 return  curl_error ( $curl ) ;
			}
			curl_close ( $curl ) ;
			if ( $returnCookie ) {
				 list ( $header ,  $body )  =  explode ( "

" ,  $data ,  2 ) ;
				 preg_match_all ( "/Set-Cookie:([^;]*);/" ,  $header ,  $matches ) ;
				 $info [ 'cookie' ]   =  substr ( $matches [ 1 ] [ 0 ] ,  1 ) ;
				 $info [ 'content' ]  =  $body ;
				 return  $info ;
			} else {
				 //return  $data ;
				 $data=mb_convert_encoding($data, 'UTF-8', 'UTF-8,GBK,GB2312,BIG5');
				preg_match("/<body.*?>(.*?)</body>/is",$data,$match);
				$str= trim($match[1]);
      $html = strip_tags($str);
	$html_len = mb_strlen($html,'UTF-8');
	$html = mb_substr($html, 0, strlen($html), 'UTF-8');
	$search = array(" "," ","
","
","	");
    $replace = array("","","","","");
	echo str_replace($search, $replace, $html);
			}
}
curl_request ( $url, $post = '' , $cookie = '' ,  $returnCookie = 0 );
?>

  演示地址:http://www.myjiancai.net/so/?a=url&u=aHR0cDovL3d3dy5iYWlkdS5jb20vbGluaz91cmw9cVktZmFNMG4wS1lzRVdoNDJCZlVPZUVsbnpPcTEzN3ZhZGRvSDN1RUswcUZtLS1jenZxNy1wVjM1aWJzczRFa3R6ejVWd0toWllRMVJvLTU1UU01Zl8=&t=5paw5L2Z5pa55p!x5omjLeW7uuetkeeZvuenkS3mlK@mqKHnvZE=&s=5paw5L2Z5pa55p!x5omj

千行代码,Bug何处藏。 纵使上线又怎样,朝令改,夕断肠。
原文地址:https://www.cnblogs.com/68xi/p/14983693.html