/**
 * Fetch a URL with cURL and return the raw response.
 *
 * CURLOPT_HEADER is enabled, so the returned string contains the response
 * headers followed by the body.
 *
 * @param string $url Address to request.
 * @return string|null The response text, or null when the transfer failed,
 *                     produced an empty result, or the server answered 404.
 */
function curl_get($url)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HEADER, 1);

    $result = curl_exec($ch);
    $code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);

    // Release the handle before returning; previously the close call sat
    // after the return statement and was skipped on every successful request.
    curl_close($ch);

    if ($code !== 404 && $result) {
        return $result;
    }

    return null;
}
/**
 * Extract the href targets of all anchor tags found in a page.
 *
 * The href value may be double-quoted, single-quoted or unquoted; it is read
 * up to the first quote, space or '>' character. Values are returned exactly
 * as written in the markup (relative links are not resolved).
 *
 * @param string $spider_page_result HTML text to scan.
 * @param string $base_url           Not used by this function; kept so existing
 *                                   callers keep working.
 * @return array|null List of href values in document order, or null when the
 *                    page contains no matching anchor.
 */
function get_page_urls($spider_page_result, $base_url)
{
    // The previous pattern had lost its "<a ... href=" prefix and no longer
    // compiled (unmatched ")"), so no link was ever returned.
    $pattern = '/<a\b[^>]*?\shref\s*=\s*[\'"]?([^>\'" ]*)[^>]*>/i';

    $found = preg_match_all($pattern, $spider_page_result, $out);
    if ($found) {
        return $out[1];
    }

    return null;
}
/**
 * Fetch a page while presenting the request as coming from the Baidu spider.
 *
 * The request carries a Baiduspider User-Agent plus X-FORWARDED-FOR and
 * CLIENT-IP headers set to a fixed IP address.
 *
 * On failure the error text is echoed to the output and nothing is returned.
 *
 * @param string $url Address to request.
 * @return string|null Response body (headers excluded), or null on failure.
 */
function _GetContent($url)
{
    $ip = '220.181.108.91'; // IP address sent in the spoofed headers
    $timeout = 15;          // connection timeout, in seconds

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    // 0 means no limit on the total transfer time; only the connect phase
    // below is bounded.
    curl_setopt($ch, CURLOPT_TIMEOUT, 0);
    // Spoof the client IP as the Baidu spider's.
    curl_setopt($ch, CURLOPT_HTTPHEADER, array('X-FORWARDED-FOR:' . $ip, 'CLIENT-IP:' . $ip));
    // Spoof the Baidu spider User-Agent.
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);

    $content = curl_exec($ch);

    if ($content === false) {
        // Read the error details before the handle is released.
        // cURL error 28 is the operation-timeout code.
        if (curl_errno($ch) === 28) {
            $error = '访问目标地址超时';
        } else {
            $error = curl_error($ch);
        }
        curl_close($ch);

        // Print the error message.
        echo $error;
        return null;
    }

    // The handle was previously never closed on either path.
    curl_close($ch);

    return $content;
}
/**
 * Send a POST request with cURL, forward the upstream content type to the
 * client, and return the (possibly rewritten) response body.
 *
 * Side effect: emits a "content-type" response header via header(), so it must
 * be called before any output has been sent.
 *
 * @param string $url  Address to post to; also sent as the Referer and as the
 *                     User-Agent string.
 * @param array  $data Form fields as key => value. Values are concatenated as
 *                     given, without URL-encoding.
 * @return string|false Response body, passed through change_link() and
 *                      regstr() unless the content type is excluded by
 *                      $nochange; whatever curl_exec() returned otherwise.
 */
function post($url, $data = array())
{
    global $nochange;

    // Build "k=v&k=v". Joining with implode avoids trimming a trailing "&"
    // and yields an empty string (never false) when $data is empty.
    $pairs = array();
    foreach ($data as $k => $v) {
        $pairs[] = "$k=" . $v;
    }
    $body = implode('&', $pairs);

    $ch = curl_init();
    // Certificate and host verification are disabled for this request.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');
    curl_setopt($ch, CURLOPT_REFERER, $url);
    // NOTE(review): the User-Agent is set to the URL itself; this looks
    // unintended but is kept as-is - confirm against callers.
    curl_setopt($ch, CURLOPT_USERAGENT, $url);
    curl_setopt($ch, CURLOPT_HEADER, false);
    curl_setopt($ch, CURLOPT_HTTPHEADER, array('CLIENT-IP:' . get_rand_ip(), 'X-FORWARDED-FOR:' . get_rand_ip()));
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $body);

    $rs = curl_exec($ch);
    $info = curl_getinfo($ch);
    curl_close($ch);

    $content_type = $info['content_type'];
    header('content-type:' . $content_type);

    // Use the $nochange global declared above; the previous code passed an
    // undefined $nochange_url variable here. Presumably nochange_url() reports
    // whether the content type is on the exclusion list - verify in its definition.
    if (nochange_url($content_type, $nochange) < 1) {
        $rs = change_link($rs);
        $rs = regstr($rs);
    }

    return $rs;
}
unction get($url) { global $user_curl, $user_agent, $user_client, $nochange; if (!($cache = cache('r', $url))) { if (function_exists('curl_init') && $user_curl == "1") { $ch = curl_init(); if ($user_agent == 'baidu') { if ($user_client == "mobile") { $user_agent = 'Mozilla/5.0 (Linux;u;Android 4.2.2;zh-cn;) AppleWebKit/534.46 (KHTML,like Gecko) Version/5.1 Mobile Safari/10600.6.3 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'; } elseif (is_mobile() && $user_client == "auto") { $user_agent = 'Mozilla/5.0 (Linux;u;Android 4.2.2;zh-cn;) AppleWebKit/534.46 (KHTML,like Gecko) Version/5.1 Mobile Safari/10600.6.3 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'; } elseif ($user_client == "pc" || !is_mobile()) { $user_agent = 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'; } else { $user_agent = 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'; } } elseif ($user_agent == 'google') { if ($user_client == "mobile") { $user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 8_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12F70 Safari/600.1.4 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'; } elseif (is_mobile() && $user_client == "auto") { $user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 8_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12F70 Safari/600.1.4 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'; } elseif ($user_client == "pc" || !is_mobile()) { $user_agent = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'; } else { $user_agent = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'; } } elseif ($user_agent == 'yahoo') { $user_agent = 'Mozilla/5.0 (compatible; Yahoo! 
Slurp; http://help.yahoo.com/help/us/ysearch/slurp)'; } elseif ($user_agent == 'bing') { if ($user_client == "mobile") { $user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53 (compatible; bingbot/2.0; http://www.bing.com/bingbot.htm)'; } elseif (is_mobile() && $user_client == "auto") { $user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53 (compatible; bingbot/2.0; http://www.bing.com/bingbot.htm)'; } elseif ($user_client == "pc" || !is_mobile()) { $user_agent = 'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)'; } else { $user_agent = 'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)'; } } else { $user_agent = 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'; } curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE); curl_setopt($ch, CURLOPT_URL, $url); curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate'); curl_setopt($ch, CURLOPT_REFERER, $url); curl_setopt($ch, CURLOPT_USERAGENT, $user_agent); curl_setopt($ch, CURLOPT_HEADER, false); curl_setopt($ch, CURLOPT_HTTPHEADER, array('CLIENT-IP:' . get_rand_ip(), 'X-FORWARDED-FOR:' . get_rand_ip())); curl_setopt($ch, CURLOPT_TIMEOUT, 30); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); $rs = curl_exec($ch); $info = curl_getinfo($ch); curl_close($ch); $content_type = $info['content_type']; } else { $rs = file_get_contents($url); } if (nochange_url($content_type, $nochange) < 1 && nochange_url($content_type, "jpg|jpeg|gif|png|bmp") < 1) { $rs = change_link($rs); $rs = regstr($rs); } if (nochange_url($content_type, "jpg|jpeg|gif|png|bmp") < 1) { $cache = array('content_type' => $content_type, 'rs' => $rs); cache('w', $url, $cache); } header('content-type:' . 
$content_type); return $rs; } else { extract($cache); header('content-type:' . $content_type); return $rs; }