PHP cURL爬虫基础学习

最近打算做一个 TikTok UID 爬虫，所以特意搜集并整理了 PHP cURL 爬虫的相关基础技术。

1.使用方式:http://api.98lm.com/debug.php?url=www.98lm.com

2.使用curl爬虫采集

3.使用json封装返回结果

<?php
//header('content-type:text/html;charset=utf-8');

/**
 * Perform an HTTP request with cURL.
 *
 * @param string       $url    Address to request.
 * @param array|string $data   Request body; only attached when $method is "POST".
 * @param string       $method HTTP verb handed to CURLOPT_CUSTOMREQUEST ("GET", "POST", ...).
 *
 * @return string|bool For "GET" the response body. For any other method the
 *                     body is written straight to the output by cURL and the
 *                     boolean result of curl_exec() is returned. If cURL
 *                     reports an error, the cURL error message is returned.
 */
function curlPost($url,$data,$method){
  $ch = curl_init();   // 1. create the handle
  curl_setopt($ch, CURLOPT_URL, $url); // 2. target address
  curl_setopt($ch, CURLOPT_CUSTOMREQUEST, $method); // 3. request method

  // 4. general options
  curl_setopt($ch, CURLOPT_HEADER, 0); // do not include response headers in the body
  // Only GET returns the body to the caller; every other method lets cURL
  // print it directly. NOTE(review): this mirrors the historical behaviour
  // and looks unintentional for POST - confirm before changing.
  curl_setopt($ch, CURLOPT_RETURNTRANSFER, $method === "GET");
  // NOTE(review): certificate and host verification are switched off, so
  // HTTPS responses are not authenticated. Kept for compatibility.
  curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
  curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
  curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 5.01; Windows NT 5.0)'); // browser-like user agent
  curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); // follow redirects
  curl_setopt($ch, CURLOPT_AUTOREFERER, 1);    // set Referer automatically on redirects
  curl_setopt($ch, CURLOPT_HTTPHEADER, array('Accept-Encoding: gzip, deflate'));
  curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate'); // let cURL decompress the response

  // 5. attach the request body for POST only
  if ($method === "POST") {
    curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
  }

  // 6. execute
  $tmpInfo = curl_exec($ch);

  // 7. read the error state BEFORE closing, then always release the handle
  //    (the early return used to skip curl_close() on failure).
  $failed = curl_errno($ch) ? true : false;
  $errorMessage = $failed ? curl_error($ch) : '';
  curl_close($ch); // 8. close

  if ($failed) {
    return $errorMessage;
  }
  return $tmpInfo;
}
/**
 * Send a JSON response to the client and terminate the script.
 *
 * The payload has the keys "code" and "msg", plus "data" when $data is not
 * null. If encoding fails, the name of the JSON error constant is printed
 * instead and the script terminates.
 *
 * @param int    $code Status code placed in the "code" field.
 * @param string $msg  Human-readable message; a falsy value becomes ''.
 * @param mixed  $data Optional payload; omitted from the response when null.
 *
 * @return void This function never returns; it always calls exit().
 */
function jsonReturn($code = 200,$msg='',$data = null)
{
    $message = $msg ? (string) $msg : '';

    // Transcode from GB2312 only when the message is not already valid UTF-8.
    // Converting a UTF-8 string as if it were GB2312 corrupts it or makes
    // iconv() fail. preg_match with the /u flag returns 1 only for valid UTF-8.
    if (preg_match('//u', $message) !== 1) {
        $converted = iconv('GB2312', 'UTF-8', $message);
        // iconv() returns false on illegal input; fall back to an empty message.
        $message = ($converted === false) ? '' : $converted;
    }

    $payload = array(
        'code' => $code,
        'msg'  => $message,
    );
    if ($data !== null) {
        $payload['data'] = $data;
    }

    $json = json_encode($payload, JSON_UNESCAPED_UNICODE);
    if ($json === false) {
        // Map the error code to the constant's name so the client sees why
        // encoding failed.
        $errorNames = array(
            JSON_ERROR_NONE                  => 'JSON_ERROR_NONE',
            JSON_ERROR_DEPTH                 => 'JSON_ERROR_DEPTH',
            JSON_ERROR_STATE_MISMATCH        => 'JSON_ERROR_STATE_MISMATCH',
            JSON_ERROR_CTRL_CHAR             => 'JSON_ERROR_CTRL_CHAR',
            JSON_ERROR_SYNTAX                => 'JSON_ERROR_SYNTAX',
            JSON_ERROR_UTF8                  => 'JSON_ERROR_UTF8',
            JSON_ERROR_RECURSION             => 'JSON_ERROR_RECURSION',
            JSON_ERROR_INF_OR_NAN            => 'JSON_ERROR_INF_OR_NAN',
            JSON_ERROR_UNSUPPORTED_TYPE      => 'JSON_ERROR_UNSUPPORTED_TYPE',
            JSON_ERROR_INVALID_PROPERTY_NAME => 'JSON_ERROR_INVALID_PROPERTY_NAME',
            JSON_ERROR_UTF16                 => 'JSON_ERROR_UTF16',
        );
        $errorCode = json_last_error();
        exit(isset($errorNames[$errorCode]) ? $errorNames[$errorCode] : 'JSON_ERROR_UNKNOWN');
    }

    // Tell the client the body is UTF-8 encoded JSON.
    header('Content-Type:application/json; charset=utf-8');
    // To allow cross-origin requests, additionally send:
    //   header('Access-Control-Allow-Origin:*');
    exit($json);
}
 
// Entry point: fetch the page named by the "url" request parameter, extract
// the text that follows the navigation icon, and answer with JSON.

// Read the parameter defensively: it may be missing or submitted as an array.
$tk_url = (isset($_REQUEST['url']) && is_string($_REQUEST['url'])) ? $_REQUEST['url'] : '';
if ($tk_url === '') {
    jsonReturn(-1,'No url 获取数据失败'); // jsonReturn() terminates the script
}

// The URL comes from the client, so refuse explicit non-HTTP schemes such as
// file:// before handing it to cURL. Scheme-less input (e.g. "www.98lm.com")
// is still accepted. This is a first line of defence only.
if (preg_match('#^[a-z][a-z0-9+.\-]*:/#i', $tk_url) === 1
    && preg_match('#^https?://#i', $tk_url) !== 1) {
    jsonReturn(-1,'Unsupported url scheme 获取数据失败');
}

// To persist cookies, define a jar path and enable CURLOPT_COOKIEJAR in curlPost():
//   $cookie = dirname(__FILE__) . '/cookie_oschina.txt';
$data = array('name' => '1234'); // only sent by curlPost() for POST requests
$method = "GET";
$file = curlPost($tk_url,$data,$method);
// If the remote page is GBK-encoded, convert it first:
//   $file = mb_convert_encoding($file,'UTF-8','GBK');

// Only report success when the marker was actually found; otherwise
// $match[1] does not exist and the request failed or the markup differs.
$match = array();
if (!is_string($file)
    || preg_match('/<i class="iconfont icon-daohang2"><\/i>(.*?)<\/a><i class="line">/',$file, $match) !== 1) {
    jsonReturn(-1,'No match 获取数据失败');
}
jsonReturn(1,'获取数据成功',$match[1]);

 

赞 (0)