封装了一个curl_http.php类,代码如下:
<?php
/**
 * Curl HTTP client: a thin wrapper around one reusable cURL handle.
 *
 * Usage example:
 *   $curl = new Curl_HTTP_Client();
 *   $useragent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
 *   $curl->set_user_agent($useragent);
 *   $curl->store_cookies("/tmp/cookies.txt");
 *   $post_data = array('login' => 'pera', 'password' => 'joe');
 *   $html_data = $curl->send_post_data("http://example.com/login.php", $post_data);
 *
 * Options set on the handle (proxy, cookies, bound interface, headers, ...)
 * persist across requests because every request reuses the same handle.
 */
class Curl_HTTP_Client
{
    /** cURL handle shared by all requests of this instance. */
    public $ch;

    /** When true, diagnostic messages are echoed to the output. */
    public $debug = true;

    /** Error message of the most recent request ("" when it succeeded). */
    public $error_msg;

    /** cURL error number of the most recent request (0 when it succeeded). */
    public $error_no = "";

    /** Whether TLS peer and host verification are enabled. */
    public $verify_ssl = false;

    /**
     * @param bool $debug      echo diagnostic output when true
     * @param bool $verify_ssl verify TLS certificates and host names. Defaults
     *                         to false to keep the historical behaviour, which
     *                         leaves connections open to man-in-the-middle
     *                         attacks; pass true for anything sensitive.
     */
    public function __construct($debug = false, $verify_ssl = false)
    {
        $this->debug = $debug;
        $this->verify_ssl = (bool) $verify_ssl;
        $this->init();
    }

    /**
     * Create the cURL handle and apply the default options.
     */
    public function init()
    {
        $this->ch = curl_init();

        // Treat HTTP status codes >= 400 as a transfer error.
        curl_setopt($this->ch, CURLOPT_FAILONERROR, true);

        // Redirects are NOT followed (the option is explicitly switched off).
        curl_setopt($this->ch, CURLOPT_FOLLOWLOCATION, 0);

        // Advertise gzip/deflate and let cURL decompress the response.
        curl_setopt($this->ch, CURLOPT_ENCODING, 'gzip, deflate');

        // Certificate checks are off unless the caller opted in; 2 is the
        // only "enabled" value CURLOPT_SSL_VERIFYHOST accepts.
        curl_setopt($this->ch, CURLOPT_SSL_VERIFYPEER, $this->verify_ssl);
        curl_setopt($this->ch, CURLOPT_SSL_VERIFYHOST, $this->verify_ssl ? 2 : 0);
    }

    /**
     * Set the user name and password used for HTTP authentication.
     */
    public function set_credentials($username, $password)
    {
        curl_setopt($this->ch, CURLOPT_USERPWD, "$username:$password");
    }

    /**
     * Set the value of the "Referer:" request header.
     */
    public function set_referrer($referrer_url)
    {
        curl_setopt($this->ch, CURLOPT_REFERER, $referrer_url);
    }

    /**
     * Set the value of the "User-Agent:" request header.
     */
    public function set_user_agent($useragent)
    {
        curl_setopt($this->ch, CURLOPT_USERAGENT, $useragent);
    }

    /**
     * Include (true) or omit (false) the response headers in the returned data.
     */
    public function include_response_headers($value)
    {
        curl_setopt($this->ch, CURLOPT_HEADER, $value);
    }

    /**
     * Route requests through the given HTTP proxy.
     */
    public function set_proxy($proxy)
    {
        curl_setopt($this->ch, CURLOPT_PROXY, $proxy);
    }

    /**
     * Send a POST request and return the response body.
     *
     * Array data is joined as key=value pairs WITHOUT URL-encoding, so keys
     * and values must already be encoded by the caller; a string is sent
     * exactly as given.
     *
     * @param string       $url      target URL
     * @param array|string $postdata assoc array of fields, or a raw body
     * @param string|null  $ip       interface name, IP or host name to bind to
     * @param int          $timeout  maximum seconds for the whole transfer
     * @return string|false response data, or false on error
     */
    public function send_post_data($url, $postdata, $ip = null, $timeout = 10)
    {
        curl_setopt($this->ch, CURLOPT_URL, $url);
        curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, true);
        $this->bind_interface($ip);
        curl_setopt($this->ch, CURLOPT_TIMEOUT, $timeout);
        curl_setopt($this->ch, CURLOPT_POST, true);

        if (is_array($postdata)) {
            $pairs = array();
            foreach ($postdata as $key => $value) {
                $pairs[] = $key . "=" . $value;
            }
            $post_string = implode("&", $pairs);
            if ($this->debug) {
                echo "Url: $url\nPost String: $post_string\n";
            }
        } else {
            $post_string = $postdata;
        }

        curl_setopt($this->ch, CURLOPT_POSTFIELDS, $post_string);

        return $this->execute();
    }

    /**
     * Fetch a URL with GET and return the response body.
     *
     * @param string      $url     target URL
     * @param string|null $ip      interface name, IP or host name to bind to
     * @param int         $timeout maximum seconds for the whole transfer
     * @return string|false response data, or false on error
     */
    public function fetch_url($url, $ip = null, $timeout = 20)
    {
        curl_setopt($this->ch, CURLOPT_URL, $url);
        // Reset the method to GET in case an earlier request switched to POST.
        curl_setopt($this->ch, CURLOPT_HTTPGET, true);
        curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, true);
        $this->bind_interface($ip);
        curl_setopt($this->ch, CURLOPT_TIMEOUT, $timeout);

        return $this->execute();
    }

    /**
     * Fetch a URL with GET and write the response directly into a stream.
     *
     * @param string      $url     target URL
     * @param resource    $fp      writable stream resource (e.g. from fopen)
     * @param string|null $ip      interface name, IP or host name to bind to
     * @param int         $timeout maximum seconds for the whole transfer
     * @return bool true on success, false otherwise
     */
    public function fetch_into_file($url, $fp, $ip = null, $timeout = 5)
    {
        curl_setopt($this->ch, CURLOPT_URL, $url);
        // Reset the method to GET in case an earlier request switched to POST.
        curl_setopt($this->ch, CURLOPT_HTTPGET, true);
        curl_setopt($this->ch, CURLOPT_FILE, $fp);
        $this->bind_interface($ip);
        curl_setopt($this->ch, CURLOPT_TIMEOUT, $timeout);

        $this->execute();

        return $this->error_no === 0;
    }

    /**
     * Send a multipart/form-data POST, optionally with file uploads.
     * (contribution by vule nikolic, vule@dinke.net)
     *
     * @param string      $url              target URL
     * @param array       $postdata         assoc array of regular fields
     * @param array       $file_field_array assoc array: field name => file path
     * @param string|null $ip               interface name, IP or host name to bind to
     * @param int         $timeout          maximum seconds for the whole transfer
     * @return string|false response data, or false on error or when
     *                      $postdata is not an array
     */
    public function send_multipart_post_data($url, $postdata, $file_field_array = array(), $ip = null, $timeout = 30)
    {
        curl_setopt($this->ch, CURLOPT_URL, $url);
        curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, true);
        $this->bind_interface($ip);
        curl_setopt($this->ch, CURLOPT_TIMEOUT, $timeout);
        curl_setopt($this->ch, CURLOPT_POST, true);

        // An empty "Expect:" header suppresses cURL's "Expect: 100-continue".
        curl_setopt($this->ch, CURLOPT_HTTPHEADER, array("Expect: "));

        if (!is_array($postdata)) {
            return false;
        }

        if ($this->debug) {
            $debug_pairs = array();
            foreach ($postdata as $key => $value) {
                $debug_pairs[] = urlencode($key) . "=" . urlencode($value);
            }
            echo "Post String: " . implode("&", $debug_pairs) . "\n";
        }

        $file_fields = array();
        foreach ($file_field_array as $var_name => $var_value) {
            if (strpos(PHP_OS, "WIN") !== false) {
                $var_value = str_replace("/", "\\", $var_value); // win hack
            }
            // The "@path" upload syntax is ignored by default since PHP 5.6
            // and was removed in PHP 7; CURLFile (PHP 5.5+) replaces it.
            // The "@" form is kept only as a fallback for older runtimes.
            if (class_exists('CURLFile')) {
                $file_fields[$var_name] = new CURLFile($var_value);
            } else {
                $file_fields[$var_name] = "@" . $var_value;
            }
        }

        // Passing an array makes cURL encode the body as multipart/form-data.
        curl_setopt($this->ch, CURLOPT_POSTFIELDS, array_merge($postdata, $file_fields));

        return $this->execute();
    }

    /**
     * Store received cookies in a file and send them on subsequent requests.
     *
     * @param string $cookie_file absolute path to the cookie file
     *                            (must be in a writable directory)
     */
    public function store_cookies($cookie_file)
    {
        // File the cookies are written to when the handle is closed.
        curl_setopt($this->ch, CURLOPT_COOKIEJAR, $cookie_file);
        // File the cookies are read from (Netscape format or raw headers).
        curl_setopt($this->ch, CURLOPT_COOKIEFILE, $cookie_file);
    }

    /**
     * Set the contents of the "Cookie:" request header.
     *
     * @param string $cookie cookie string, e.g. "name=value; other=value"
     */
    public function set_cookie($cookie)
    {
        curl_setopt($this->ch, CURLOPT_COOKIE, $cookie);
    }

    /**
     * Return the last effective URL used by the handle.
     *
     * @return string url
     */
    public function get_effective_url()
    {
        return curl_getinfo($this->ch, CURLINFO_EFFECTIVE_URL);
    }

    /**
     * Return the HTTP status code of the last response.
     *
     * @return int
     */
    public function get_http_response_code()
    {
        return curl_getinfo($this->ch, CURLINFO_HTTP_CODE);
    }

    /**
     * Return the error number and message of the last request as text.
     *
     * The text is also echoed when debug mode is on.
     *
     * @return string "Error: <number>\nMessage: <text>\n"
     */
    public function get_error_msg()
    {
        $err = "Error: " . curl_errno($this->ch) . "\n"
            . "Message: " . curl_error($this->ch) . "\n";

        if ($this->debug) {
            echo $err;
        }

        return $err;
    }

    /**
     * Escape HTML source and wrap tags, comments and template markers in
     * colouring markup so the source can be displayed on a page.
     *
     * @param string $code        raw HTML source
     * @param bool   $line_number when true, render as an ordered list with
     *                            one item per line
     * @return string HTML fragment wrapped in a PRE element
     */
    public function highlightHtml($code, $line_number = false)
    {
        // Flags are pinned so the result does not depend on the PHP version:
        // single quotes stay literal (two patterns below match them) and
        // invalid byte sequences are substituted instead of blanking the
        // whole string.
        $code = htmlspecialchars($code, ENT_COMPAT | ENT_SUBSTITUTE);

        // After escaping, "<" and ">" only exist as "&lt;" and "&gt;", so
        // every pattern below has to match the escaped form.

        // Opening tag names, closing tags and self-closing tag ends.
        $code = preg_replace('/(&lt;[a-zA-Z0-9]+)/', '<font color="#0000FF">$1</font>', $code);
        $code = preg_replace('/(&lt;\/[a-zA-Z0-9]+&gt;)/', '<font color="#0000FF">$1</font>', $code);
        $code = preg_replace('/(\/&gt;)/', '<font color="#0000FF">${1}</font>', $code);
        $code = preg_replace('/&lt;!DOCTYPE\s+.+?&gt;/', '<font color="#3300FF">${0}</font>', $code);

        // Comments.
        $code = str_replace('&lt;!--', '<font color="#666666"><em>&lt;!--', $code);
        $code = str_replace('--&gt;', '--&gt;</em></font>', $code);

        // ---- Highlighting specific to visualTpl template files ----
        // block : begin|end
        $code = preg_replace('/(&lt;!--\s*)(begin|end)(\s+)([a-z_\x7f-\xfe]+)/i', '${1}<font size="" color="#0000FF"><b>${2}</b></font>${3}<b><font color="#FF0000">${4}</font></b>', $code);
        $code = preg_replace('/(\$[a-z0-9_]+)\s*=\s*(per|on)\(([0-9]+),(\'.*?\'),(\'.*?\')\)/i', '<font color="#009900"><b>${1}</b></font>=<font color="#0000FF">${2}</font>(${3},<font color="#FF9999">${4}</font>,<font color="#FF9999">${5}</font>)', $code);
        // vip : vip|endvip
        $code = preg_replace('/<font color="#666666"><em>&lt;!--\s*vip/i', '<span style="display:block;border:1px dashed #696969;padding:3px" >${0}', $code);
        $code = preg_replace('/&lt;!--\s*endvip\s*--&gt;<\/em><\/font>/i', '${0}</span>', $code);
        // ssi : #include
        $code = preg_replace('/&lt;!--\s*#include\s+file.+?--&gt;/i', '<span style="background-color:#FFFF66; font-weight:bold; font-style:normal;padding:3px">${0}</span>', $code);
        // Plain variables: {$name}
        $code = preg_replace('/(\{\$[a-zA-Z0-9_\x7f-\xfe]+\})/', '<font style="background-color:#D7FED1;padding:1px" color="#009900">${1}</font>', $code);
        // Formatted variables: {$name;option='value'}
        $code = preg_replace('/(\{\$[a-zA-Z0-9_\x7f-\xfe]+;)([a-zA-Z]+)=\'([^\']+?)\'\}/', '<font style="background-color:#D7FED1;padding:1px" color="#009900">${1}<font color="#CC0000">${2}</font>=<font color="#FF33CC">\'${3}\'</font>}</font>', $code);
        // ---- End of template-specific highlighting ----

        if (!$line_number) {
            return '<PRE>' . $code . '</PRE>';
        }

        return '<pre><ol><li>' . str_replace("\n", '</li><li>', $code) . '</li></ol></pre>';
    }

    /**
     * Close the cURL session.
     */
    public function close()
    {
        curl_close($this->ch);
    }

    /**
     * Bind outgoing requests to a network interface, IP or host name.
     * Does nothing when $ip is empty; a binding made earlier stays in effect.
     */
    private function bind_interface($ip)
    {
        if (!$ip) {
            return;
        }
        if ($this->debug) {
            echo "Binding to ip $ip\n";
        }
        curl_setopt($this->ch, CURLOPT_INTERFACE, $ip);
    }

    /**
     * Run the prepared request, record the error state and report failures.
     *
     * @return mixed result of curl_exec(), or false when cURL reports an error
     */
    private function execute()
    {
        $result = curl_exec($this->ch);

        $this->error_no = curl_errno($this->ch);
        $this->error_msg = curl_error($this->ch);

        if ($this->error_no) {
            if ($this->debug) {
                echo "Error Occured in Curl\n";
                echo "Error number: " . $this->error_no . "\n";
                echo "Error message: " . $this->error_msg . "\n";
            }
            return false;
        }

        return $result;
    }
}
?>
调用示例:
<?php
// Usage example: fetch a page with Curl_HTTP_Client and print its source.

// NOTE(review): the fetched document is HTML but is served as text/xml;
// kept as in the original example. Confirm this content type is intended.
header('Content-Type: text/xml;charset=utf-8');

// 1. Load the Curl_HTTP_Client class.
require("curl_http.php");

// 2. Create the client and set the request headers.
$curl = new Curl_HTTP_Client();
$curl->set_referrer("");
$curl->set_user_agent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101");

// 3. URL to fetch. The original example omitted the terminating semicolon
//    here, which made the whole script a parse error.
$url = "https://www.98lm.com/";

$html = $curl->fetch_url($url);

// fetch_url() returns false on failure; say so instead of printing nothing.
if ($html === false) {
    die("Failed to fetch $url\n");
}

echo $html;
die;
?>