curl采集初历

学了两天采集,在这里记录一下。

目标网站 www.cupshe.com

需求分析  采集目标站点商品主图,名称,价格,以及采集时间

先建立cupshe的库

目标站点全部商品的网址  www.cupshe.com/collections/all?page=1&sort_by=best-selling        //sort_by=best-selling的意思是按照销量排序

分析:列表页里每个商品 a 标签的 href 形如 /products/xxx,把它和站点域名拼接起来,就是该商品详情页的地址。

那我们就先采集href中的商品详情地址

先建立表

表名product_urls

-- One row per product link collected from the listing pages.
-- `url` holds the part of the link that follows /products/.
-- No AUTO_INCREMENT seed is given, so ids in a fresh table start at 1.
CREATE TABLE IF NOT EXISTS `product_urls` (
  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  `url` varchar(355) NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

<?php 

    /**
     * Download a page with cURL and hand back the response body.
     *
     * @param string $url Address to fetch.
     * @return string|false Response body, or false when the transfer fails.
     */
    function geturl($url){
        $curl = curl_init();
        curl_setopt_array($curl, array(
            CURLOPT_URL            => $url,   // address to fetch
            CURLOPT_RETURNTRANSFER => true,   // return the body from curl_exec() instead of printing it
            CURLOPT_USERAGENT      => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            CURLOPT_CONNECTTIMEOUT => 10,     // seconds; do not hang forever on an unreachable host
            CURLOPT_TIMEOUT        => 30,     // seconds; upper bound for the whole transfer
            // NOTE(review): certificate checks are switched off, as in the original
            // ("use when SSL errors occur"). Prefer installing a CA bundle and
            // re-enabling both options.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => 0,
        ));

        $res = curl_exec($curl);
        if ($res === false) {
            // Make the failure visible; the caller still receives false.
            error_log("geturl: request for {$url} failed: " . curl_error($curl));
        }
        curl_close($curl);

        return $res;
    }
    // Crawl every page of the best-selling product listing and store the
    // product slug found in each /products/... link in product_urls.

    // Report mysqli failures through return values on every PHP version
    // (PHP 8.1+ would otherwise throw exceptions by default).
    mysqli_report(MYSQLI_REPORT_OFF);
    $conn = new mysqli("localhost","root","","cupshe");
    // `new mysqli` always yields an object, so the failure signal is connect_error.
    if ($conn->connect_error) {
        die("Connection failed: " . $conn->connect_error);
    }

    // Prepared statement: scraped text is bound as data, never spliced into the SQL.
    $insert = $conn->prepare("INSERT INTO product_urls (`url`) VALUES (?)");
    if (!$insert) {
        die("Prepare failed: " . $conn->error);
    }
    $slug = '';
    $insert->bind_param('s', $slug);   // bound by reference: assign $slug before each execute()

    $base_url = "https://www.cupshe.com";
    $page_max = 35;      // number of listing pages to walk
    $seen = array();     // slugs already stored, so a product listed on several pages is saved once

    for ($i = 1; $i <= $page_max; $i++) {
        // sort_by=best-selling orders the listing by sales.
        // The separator must be a plain '&', not the HTML entity '&amp;'.
        $url = $base_url."/collections/all?page={$i}&sort_by=best-selling";
        $html = geturl($url);
        if (!is_string($html) || $html === '') {
            error_log("Page {$i}: no content returned, skipping");
            continue;
        }

        // '#' delimiters let the slashes inside the pattern stay unescaped.
        if (!preg_match_all('#href="/products/(.+?)"#i', $html, $m)) {
            continue;   // no product links on this page (or the match failed)
        }

        foreach (array_unique($m[1]) as $v) {
            if (isset($seen[$v])) {
                continue;
            }
            $seen[$v] = true;
            $slug = $v;
            if (!$insert->execute()) {
                error_log("Insert failed for {$v}: " . $insert->error);
            }
        }
    }

    $insert->close();
    mysqli_close($conn);
    
?>


把商品地址存入数据库之后,再新建一个 PHP 文件,去采集商品详情页面的 HTML 文档,并用正则匹配出需要的字段。
还是先按需求建表:
-- One row per scraped product detail page.
-- No AUTO_INCREMENT seed is given, so ids in a fresh table start at 1.
CREATE TABLE IF NOT EXISTS `product_variants` (
  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  `name` text,
  `price` decimal(10,2) DEFAULT NULL,    -- exact decimal; FLOAT stores money values approximately
  `image_src` text,
  `updated_at` datetime DEFAULT NULL,    -- real date type instead of free-form text
  `create_time` datetime DEFAULT NULL,   -- scrape time; expects 'YYYY-MM-DD HH:MM:SS' values
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
<?php
// 'PRC' is a legacy alias; Asia/Shanghai is the canonical identifier for the same zone.
date_default_timezone_set('Asia/Shanghai');

/**
 * Download a page with cURL and hand back the response body.
 *
 * @param string $url Address to fetch.
 * @return string|false Response body, or false when the transfer fails.
 */
function geturl($url){
    $curl = curl_init();
    curl_setopt_array($curl, array(
        CURLOPT_URL            => $url,   // address to fetch
        CURLOPT_RETURNTRANSFER => true,   // return the body from curl_exec() instead of printing it
        CURLOPT_USERAGENT      => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        CURLOPT_CONNECTTIMEOUT => 10,     // seconds; do not hang forever on an unreachable host
        CURLOPT_TIMEOUT        => 30,     // seconds; upper bound for the whole transfer
        // NOTE(review): certificate checks are switched off, as in the original
        // ("use when SSL errors occur"). Prefer installing a CA bundle and
        // re-enabling both options.
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_SSL_VERIFYHOST => 0,
    ));

    $res = curl_exec($curl);
    if ($res === false) {
        // Make the failure visible; the caller still receives false.
        error_log("geturl: request for {$url} failed: " . curl_error($curl));
    }
    curl_close($curl);

    return $res;
}
// Scrape every stored product page and save its name, price, main image and
// scrape time in product_variants.

// Connect to the database. Failures are reported through return values on
// every PHP version (PHP 8.1+ would otherwise throw exceptions by default).
mysqli_report(MYSQLI_REPORT_OFF);
$conn = new mysqli("localhost","root","","cupshe");
// `new mysqli` always yields an object, so the failure signal is connect_error.
if ($conn->connect_error) {
    die("Connection failed: " . $conn->connect_error);
}

// Load every product slug collected by the listing crawler.
$result = mysqli_query($conn, "SELECT url FROM product_urls");
if (!$result) {
    die("Query failed: " . $conn->error);
}
$rows = mysqli_fetch_all($result, MYSQLI_ASSOC);
mysqli_free_result($result);

// Prepared statement: scraped text is bound as data, never spliced into the SQL.
$insert = $conn->prepare("INSERT INTO product_variants (name,price,image_src,create_time) VALUES (?,?,?,?)");
if (!$insert) {
    die("Prepare failed: " . $conn->error);
}
$name = $price = $image = $time = '';
$insert->bind_param('ssss', $name, $price, $image, $time);   // bound by reference

foreach ($rows as $val) {
    $urls = "https://www.cupshe.com/products/".$val['url'];
    $html = geturl($urls);
    if (!is_string($html) || $html === '') {
        error_log("Skipping {$urls}: no content returned");
        continue;
    }

    // Markup being matched, e.g.:
    //   <span class="last-crumb">Cupshe  Amuse Society Halter Bikini Set</span>
    //   <span class="money" id="ProductPrice" itemprop="price">21.80</span>
    // '#' delimiters let the slashes inside the patterns stay unescaped.
    // Image: first cdn.shopify.com src whose value contains no "js" and ends in
    // a digit. NOTE(review): the published pattern ended in `d"`; this assumes
    // it was `\d"` with the backslash lost - confirm against a real page.
    $found = preg_match('#src="(//cdn\.shopify\.com/(?:(?!js)[^"])*\d)"#i', $html, $images)
          && preg_match('#<span class="last-crumb">(.*?)</span>#is', $html, $names)
          && preg_match('#<span class="money" id="ProductPrice" itemprop="price">(.*?)</span>#is', $html, $prices);
    if (!$found) {
        error_log("Skipping {$urls}: expected markup not found");
        continue;
    }

    // The capture group already excludes the src=" prefix and the closing quote.
    $image = $images[1];
    $name  = trim($names[1]);
    $price = trim($prices[1]);
    $time  = date("Y-m-d H:i:s");   // space between date and time

    if (!$insert->execute()) {
        error_log("Insert failed for {$urls}: " . $insert->error);
    }
}

$insert->close();
mysqli_close($conn);


?>

以上是单线程的写法。完成之后,又开始尝试写多线程(并发)采集。

多线程这里不详细讲解,因为目前我自己也是半懂不懂。

参考地址http://www.cnblogs.com/loveyouyou616/p/5624139.html

附上代码

<?php
// Concurrent version: product pages are downloaded in parallel batches with
// the curl_multi API, then parsed and stored one by one.
$start = microtime(true);

header('Content-type:text/html;charset=utf-8');

// Connect to the database. Failures are reported through return values on
// every PHP version (PHP 8.1+ would otherwise throw exceptions by default).
mysqli_report(MYSQLI_REPORT_OFF);
$conn = new mysqli("localhost","root","","cupshe");
// `new mysqli` always yields an object, so the failure signal is connect_error.
if ($conn->connect_error) {
    die("Connection failed: " . $conn->connect_error);
}

$result = mysqli_query($conn, "SELECT url FROM product_urls");
if (!$result) {
    die("Query failed: " . $conn->error);
}
$rows = mysqli_fetch_all($result, MYSQLI_ASSOC);
mysqli_free_result($result);

// Build the full product URL for every stored slug.
$url_arr = array();
foreach ($rows as $val) {
    $url_arr[] = "https://www.cupshe.com/products/".$val['url'];
}

// Prepared statement: scraped text is bound as data, never spliced into the SQL.
$insert = $conn->prepare("INSERT INTO product_variants (name,price,image_src,create_time) VALUES (?,?,?,?)");
if (!$insert) {
    die("Prepare failed: " . $conn->error);
}
$name = $price = $image = $time = '';
$insert->bind_param('ssss', $name, $price, $image, $time);   // bound by reference

$batch_size = 10;   // simultaneous downloads per batch; keeps load on the site bounded
$mh = curl_multi_init();

foreach (array_chunk($url_arr, $batch_size) as $batch) {
    // Queue one easy handle per URL in this batch.
    $jobs = array();
    foreach ($batch as $url) {
        $curl = curl_init();
        curl_setopt_array($curl, array(
            CURLOPT_URL            => $url,
            CURLOPT_RETURNTRANSFER => true,   // needed so curl_multi_getcontent() returns the body
            CURLOPT_USERAGENT      => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_TIMEOUT        => 30,
            // NOTE(review): certificate checks are switched off, as in the original
            // ("use when SSL errors occur"). Prefer re-enabling both options.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => 0,
        ));
        curl_multi_add_handle($mh, $curl);
        $jobs[] = array($url, $curl);
    }

    // Drive all transfers of the batch until none is still running.
    do {
        $status = curl_multi_exec($mh, $running);
        if ($running && curl_multi_select($mh, 1.0) === -1) {
            usleep(100000);   // select could not wait; avoid a busy loop
        }
    } while ($running && $status === CURLM_OK);

    // Collect, parse and store each finished page.
    foreach ($jobs as $job) {
        list($url, $curl) = $job;
        $html = curl_multi_getcontent($curl);
        curl_multi_remove_handle($mh, $curl);
        curl_close($curl);

        if (!is_string($html) || $html === '') {
            error_log("Skipping {$url}: no content returned");
            continue;
        }

        // '#' delimiters let the slashes inside the patterns stay unescaped.
        // Image: first cdn.shopify.com src whose value contains no "js" and ends
        // in a digit. NOTE(review): the published pattern ended in `d"`; this
        // assumes it was `\d"` with the backslash lost - confirm against a real page.
        $found = preg_match('#src="(//cdn\.shopify\.com/(?:(?!js)[^"])*\d)"#i', $html, $images)
              && preg_match('#<span class="last-crumb">(.*?)</span>#is', $html, $names)
              && preg_match('#<span class="money" id="ProductPrice" itemprop="price">(.*?)</span>#is', $html, $prices);
        if (!$found) {
            error_log("Skipping {$url}: expected markup not found");
            continue;
        }

        // The capture group already excludes the src=" prefix and the closing quote.
        $image = $images[1];
        $name  = trim($names[1]);
        $price = trim($prices[1]);
        $time  = date("Y-m-d H:i:s");   // space between date and time

        if (!$insert->execute()) {
            error_log("Insert failed for {$url}: " . $insert->error);
        }
    }
}

curl_multi_close($mh);
$insert->close();
mysqli_close($conn);

$end = microtime(true) - $start;

echo '<br/>';
echo $end;  // elapsed seconds; the author's one-request-at-a-time loop averaged 19.002983093262s
?>

再说一下:不知道为什么这台机器上 PDO 用不了,只能用 mysqli 了。第一次用,可能比较菜吧,慢慢学习吧,学习使我快乐。(补充:一个常见原因是把 `new PDO(...)` 误写成了 `new PDOException(...)`——后者是异常类,不会建立数据库连接。)

原文地址:https://www.cnblogs.com/kimc1112/p/7156493.html