PHP 常用的几种采集方法

/**
 * Fetch a URL with cURL and return the raw response.
 *
 * Because CURLOPT_HEADER is enabled, the returned string contains the
 * response headers followed by the body.
 *
 * @param string $url Address to request.
 * @return string|null The response text, or null when the transfer failed,
 *                     the response was empty, or the status code was 404.
 */
function curl_get($url) {
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    // Return the transfer as a string instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // Keep the response headers in the returned string.
    curl_setopt($ch, CURLOPT_HEADER, 1);

    $result = curl_exec($ch);
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);

    // Close the handle before any return; the original only reached
    // curl_close() on the failure path.
    curl_close($ch);

    if ((int) $code !== 404 && $result) {
        return $result;
    }

    return null;
}

 

/**
 * Extract the href targets of all anchor tags found in a page.
 *
 * The pattern matches `<a ... href=VALUE ...>` case-insensitively, where
 * VALUE may be wrapped in single quotes, double quotes, or left unquoted.
 * Links are returned exactly as written in the markup (relative links are
 * not resolved).
 *
 * @param string $spider_page_result HTML source to scan.
 * @param string $base_url           Currently unused; kept so existing
 *                                   callers keep working.
 * @return array|null List of href values in document order, or null when
 *                    no anchor with an href was found.
 */
function get_page_urls($spider_page_result, $base_url) {
    // Group 1 captures the href value up to the first quote, whitespace or '>'.
    $pattern = '/<a\b[^>]*?\bhref\s*=\s*[\'"]?([^>\'"\s]*)[^>]*>/i';

    $get_url_result = preg_match_all($pattern, $spider_page_result, $out);

    if ($get_url_result) {
        return $out[1];
    }

    return null;
}

 

/**
 * Fetch a page while presenting the request as Baiduspider.
 *
 * Sends a Baiduspider User-Agent plus X-FORWARDED-FOR / CLIENT-IP headers
 * carrying a fixed IP address. On failure the error text is echoed (as the
 * original did) and null is returned.
 *
 * @param string $url Address to request.
 * @return string|null Response body on success, null on failure.
 */
function _GetContent( $url ){
    $ip = '220.181.108.91';  // IP the request claims to originate from (labelled as a Baidu spider address by the author)
    $timeout = 15;           // connection timeout in seconds

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    // 0 = no limit on total transfer time; only the connect phase is bounded below.
    curl_setopt($ch, CURLOPT_TIMEOUT, 0);
    // Forged client-IP headers.
    curl_setopt($ch, CURLOPT_HTTPHEADER, array('X-FORWARDED-FOR:' . $ip, 'CLIENT-IP:' . $ip));
    // Forged Baiduspider User-Agent.
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);

    $content = curl_exec($ch);

    if ($content === false) {
        // cURL error 28 is the operation-timeout code; give it a friendlier message.
        $error = (curl_errno($ch) === 28) ? '访问目标地址超时' : curl_error($ch);
        curl_close($ch);
        echo $error;
        return null;
    }

    // Release the handle; the original never closed it.
    curl_close($ch);
    return $content;
}

 

/**
 * Send a POST request and return the (optionally rewritten) response body.
 *
 * The response's content type is forwarded to the client via header().
 * Unless the content type matches the global $nochange list, the body is
 * passed through change_link() and regstr().
 *
 * NOTE(review): TLS peer/host verification is disabled and the User-Agent
 * is set to the target URL itself; both are kept as in the original.
 * NOTE(review): keys and values are concatenated without URL-encoding, so
 * callers must pass pre-encoded data if it can contain '&' or '='.
 *
 * @param string $url  Address to post to.
 * @param array  $data Form fields as key => value pairs.
 * @return mixed Response body (false if the transfer failed), after any
 *               rewriting by change_link()/regstr().
 */
function post($url, $data = array())
{
    global $nochange;

    // Build "k=v&k=v"; implode() also yields '' for empty input instead of
    // relying on substr() of an empty string.
    $pairs = array();
    foreach ($data as $k => $v) {
        $pairs[] = "$k=" . $v;
    }
    $payload = implode('&', $pairs);

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');
    curl_setopt($ch, CURLOPT_REFERER, $url);
    curl_setopt($ch, CURLOPT_USERAGENT, $url);
    curl_setopt($ch, CURLOPT_HEADER, false);
    // Client-IP headers filled from get_rand_ip() (presumably a random address; defined elsewhere).
    curl_setopt($ch, CURLOPT_HTTPHEADER, array('CLIENT-IP:' . get_rand_ip(), 'X-FORWARDED-FOR:' . get_rand_ip()));
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);

    $rs = curl_exec($ch);
    $info = curl_getinfo($ch);
    curl_close($ch);

    $content_type = isset($info['content_type']) ? $info['content_type'] : null;

    // Only forward a content type when cURL actually reported one.
    if ($content_type !== null && $content_type !== '') {
        header('content-type:' . $content_type);
    }

    // The original tested the undefined $nochange_url here; the declared
    // global is $nochange.
    if (nochange_url($content_type, $nochange) < 1) {
        $rs = change_link($rs);
        $rs = regstr($rs);
    }

    return $rs;
}

 

/**
 * Fetch a URL (through the cache when possible) while posing as a search
 * engine crawler, and return the (optionally rewritten) body.
 *
 * Globals read:
 *  - $user_curl   "1" to use cURL when available, otherwise file_get_contents().
 *  - $user_agent  crawler to imitate: 'baidu', 'google', 'yahoo' or 'bing'
 *                 (anything else falls back to Baidu).
 *  - $user_client 'mobile', 'pc' or 'auto' ('auto' consults is_mobile()).
 *  - $nochange    content-type pattern list whose matches are not rewritten.
 *
 * The response content type is forwarded via header(). Non-image responses
 * are stored in the cache together with their content type.
 *
 * NOTE(review): TLS peer/host verification is disabled, as in the original.
 *
 * @param string $url Address to fetch.
 * @return mixed Response body, after change_link()/regstr() when applicable.
 */
function get($url)
{
    global $user_curl, $user_agent, $user_client, $nochange;

    // Serve from cache when an entry exists.
    $cache = cache('r', $url);
    if ($cache) {
        // Read the two stored keys explicitly instead of extract()-ing
        // arbitrary cache contents into the local scope.
        $content_type = isset($cache['content_type']) ? $cache['content_type'] : '';
        $rs = isset($cache['rs']) ? $cache['rs'] : null;
        if ($content_type !== null && $content_type !== '') {
            header('content-type:' . $content_type);
        }
        return $rs;
    }

    // Defined up front so the file_get_contents() fallback never reads an
    // undefined variable.
    $content_type = '';

    if (function_exists('curl_init') && $user_curl == "1") {
        // Crawler User-Agent strings, per engine and client type.
        $spider_agents = array(
            'baidu' => array(
                'mobile' => 'Mozilla/5.0 (Linux;u;Android 4.2.2;zh-cn;) AppleWebKit/534.46 (KHTML,like Gecko) Version/5.1 Mobile Safari/10600.6.3 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
                'pc' => 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
            ),
            'google' => array(
                'mobile' => 'Mozilla/5.0 (iPhone; CPU iPhone OS 8_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12F70 Safari/600.1.4 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
                'pc' => 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
            ),
            'yahoo' => array(
                'mobile' => 'Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)',
                'pc' => 'Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)',
            ),
            'bing' => array(
                'mobile' => 'Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53 (compatible; bingbot/2.0; http://www.bing.com/bingbot.htm)',
                'pc' => 'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)',
            ),
        );

        // Unknown engines fall back to Baidu, as in the original.
        $engine = (is_string($user_agent) && isset($spider_agents[$user_agent])) ? $user_agent : 'baidu';
        // Mobile UA when forced, or when 'auto' and the visitor is mobile.
        $wants_mobile = ($user_client == "mobile") || ($user_client == "auto" && is_mobile());
        // Kept in a local: the original overwrote the global $user_agent with
        // the full UA string, so every later call fell back to Baidu.
        $agent_string = $spider_agents[$engine][$wants_mobile ? 'mobile' : 'pc'];

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');
        curl_setopt($ch, CURLOPT_REFERER, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, $agent_string);
        curl_setopt($ch, CURLOPT_HEADER, false);
        // Client-IP headers filled from get_rand_ip() (presumably a random address; defined elsewhere).
        curl_setopt($ch, CURLOPT_HTTPHEADER, array('CLIENT-IP:' . get_rand_ip(), 'X-FORWARDED-FOR:' . get_rand_ip()));
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        $rs = curl_exec($ch);
        $info = curl_getinfo($ch);
        curl_close($ch);
        $content_type = isset($info['content_type']) ? $info['content_type'] : '';
    } else {
        $rs = file_get_contents($url);
        // The HTTP stream wrapper fills $http_response_header in this scope;
        // take the Content-Type of the final response (last one wins after redirects).
        if (isset($http_response_header) && is_array($http_response_header)) {
            foreach ($http_response_header as $header_line) {
                if (stripos($header_line, 'content-type:') === 0) {
                    $content_type = trim(substr($header_line, 13));
                }
            }
        }
    }

    // Rewrite links only for content that is neither excluded nor an image.
    if (nochange_url($content_type, $nochange) < 1 && nochange_url($content_type, "jpg|jpeg|gif|png|bmp") < 1) {
        $rs = change_link($rs);
        $rs = regstr($rs);
    }

    // Images are never cached.
    if (nochange_url($content_type, "jpg|jpeg|gif|png|bmp") < 1) {
        cache('w', $url, array('content_type' => $content_type, 'rs' => $rs));
    }

    if ($content_type !== null && $content_type !== '') {
        header('content-type:' . $content_type);
    }

    return $rs;
}

 

赞 (0)