php cURL 对接IP池API代码封装

封装了四个类,主要使用了curl来抓取用户的个人信息页面以及关注的用户页面,然后通过分析页面结构使用正则表达式以及php的字符串函数截取所需的信息。
Curd类:用于进行数据库操作;
Personal类:用于获取关注的人;
Info_url类:用于获取用户信息的url;
User_info类:用于获取用户信息。
需要注意的是cookie和referer有一个有效期,过了一段时间需要更换。设置cookie和referer的地方需要自己去获取替换掉。
下面上码:

Curd类:用于进行数据库操作

<?php
//$curd = new Curd();
//$data = $curd->get_id();
//var_dump($data);
//此类用于进行数据库操作
class Curd{
//连接数据库
function con_weibo(){
$con = mysql_connect('127.0.0.1:3306','root','');
if(!$con){
die('counld not connect: ' .mysql_error());
}
mysql_query('use weibo',$con);
return $con;
}
//获取用户抓取状态
function is_get($id){
$con = $this->con_weibo();
$query = 'select is_get from user where id='.$id;
$result = mysql_query($query,$con);
if(!$result){
die(mysql_error());
}
$result = mysql_fetch_array($result);
return $result;
}
//将用户标记为已抓取状态
function set_get($id){
$con = $this->con_weibo();
$query = 'update user set is_get=1 where user_id='.$id;
$result = mysql_query($query,$con);
if(!$result){
die(mysql_error());
}
return $result;
}
//从数据库中获取那些未被抓取的用户
function get_ids(){
$con =$this->con_weibo();
$query = 'select user_id from user where is_get=0 and name!="";';
$result = mysql_query($query,$con);
$arr = array();
while($row=mysql_fetch_array($result)){
$arr[] = $row['user_id'];
}
if(!$result){
die(mysql_error());
}
return $arr;
}
//通过user_id查找用户
function find_id($id){
$con =$this->con_weibo();
$query = 'select user_id from user where user_id='.$id;
$result = mysql_query($query,$con);
$row=mysql_fetch_array($result);
if(!$result){
die(mysql_error());
}
return $row;
}
//保存用户信息
function save_info($name,$addr,$sexual,$url,$user_id){
$con = mysql_connect('127.0.0.1:3306','root','');
$date = date("Y-m-d H:i:sa");
if(!$con){
die('counld not connect: ' .mysql_error());
}
//echo 'connect success';
mysql_query('use weibo',$con);
$is_get = 0;
$insert = mysql_query('insert into user values("","'.$name.'","'.$addr.'","'.$sexual.'","'.$url.'","'.$date.'","'.$user_id.'","'.$is_get.'")',$con);
if(!$insert){
die(mysql_error());
}
mysql_close($con);
}
//创建表
function create_table(){
$create_db = mysql_query('create database weibo',$con);
if($create_db){
echo'create success';
}else{
die('counld not query: '.mysql_error());
}
}
}
<?php //$curd = new Curd(); //$data = $curd->get_id(); //var_dump($data); //此类用于进行数据库操作 class Curd{ //连接数据库 function con_weibo(){ $con = mysql_connect('127.0.0.1:3306','root',''); if(!$con){ die('counld not connect: ' .mysql_error()); } mysql_query('use weibo',$con); return $con; } //获取用户抓取状态 function is_get($id){ $con = $this->con_weibo(); $query = 'select is_get from user where id='.$id; $result = mysql_query($query,$con); if(!$result){ die(mysql_error()); } $result = mysql_fetch_array($result); return $result; } //将用户标记为已抓取状态 function set_get($id){ $con = $this->con_weibo(); $query = 'update user set is_get=1 where user_id='.$id; $result = mysql_query($query,$con); if(!$result){ die(mysql_error()); } return $result; } //从数据库中获取那些未被抓取的用户 function get_ids(){ $con =$this->con_weibo(); $query = 'select user_id from user where is_get=0 and name!="";'; $result = mysql_query($query,$con); $arr = array(); while($row=mysql_fetch_array($result)){ $arr[] = $row['user_id']; } if(!$result){ die(mysql_error()); } return $arr; } //通过user_id查找用户 function find_id($id){ $con =$this->con_weibo(); $query = 'select user_id from user where user_id='.$id; $result = mysql_query($query,$con); $row=mysql_fetch_array($result); if(!$result){ die(mysql_error()); } return $row; } //保存用户信息 function save_info($name,$addr,$sexual,$url,$user_id){ $con = mysql_connect('127.0.0.1:3306','root',''); $date = date("Y-m-d H:i:sa"); if(!$con){ die('counld not connect: ' .mysql_error()); } //echo 'connect success'; mysql_query('use weibo',$con); $is_get = 0; $insert = mysql_query('insert into user values("","'.$name.'","'.$addr.'","'.$sexual.'","'.$url.'","'.$date.'","'.$user_id.'","'.$is_get.'")',$con); if(!$insert){ die(mysql_error()); } mysql_close($con); } //创建表 function create_table(){ $create_db = mysql_query('create database weibo',$con); if($create_db){ echo'create success'; }else{ die('counld not query: '.mysql_error()); } } }
<?php

//$curd = new Curd();
//$data = $curd->get_id();
//var_dump($data);
//此类用于进行数据库操作
/**
 * Data-access helper for the `user` table of the local `weibo` MySQL database.
 *
 * Notes for maintainers:
 *  - Built on mysqli: the mysql_* functions this class originally used were
 *    removed in PHP 7.0.
 *  - Caller-supplied values are always bound as prepared-statement parameters,
 *    never concatenated into the SQL text.
 *  - One connection is opened lazily and reused by every method.
 *  - Any database error ends the script through die(), so callers never see a
 *    failed result.
 *  - Reading rows relies on mysqli_stmt::get_result(), which needs the mysqlnd
 *    driver.
 */
class Curd{
    /** @var mysqli|null Connection to the `weibo` database, opened on first use. */
    private $con = null;

    /**
     * Opens a new connection to the MySQL server at 127.0.0.1:3306 as `root`.
     *
     * @param string $database Database to select, or '' to select none.
     * @return mysqli
     */
    private function connect($database){
        try {
            $con = mysqli_connect('127.0.0.1', 'root', '', $database, 3306);
        } catch (mysqli_sql_exception $e) {
            // PHP >= 8.1 reports connection failures as exceptions by default.
            die('counld not connect: ' . $e->getMessage());
        }
        if(!$con){
            die('counld not connect: ' . mysqli_connect_error());
        }
        return $con;
    }

    /**
     * Prepares and executes one statement on the shared connection.
     *
     * @param string $sql       SQL text with `?` placeholders.
     * @param string $types     mysqli type string, one character per placeholder ('' for none).
     * @param array  $params    Values for the placeholders, in order.
     * @param bool   $fetchRows True to read the result set, false for write statements.
     * @return array Rows indexed both by position and by column name; empty for writes.
     */
    private function run_query($sql, $types, $params, $fetchRows){
        $con = $this->con_weibo();
        try {
            $stmt = $con->prepare($sql);
            if(!$stmt){
                die($con->error);
            }
            if($types !== ''){
                $stmt->bind_param($types, ...$params);
            }
            if(!$stmt->execute()){
                die($stmt->error);
            }
            $rows = array();
            if($fetchRows){
                $result = $stmt->get_result();
                if(!$result){
                    die($stmt->error);
                }
                while($row = $result->fetch_array(MYSQLI_BOTH)){
                    // Prepared statements return native ints/floats; cast back to
                    // strings so callers keep seeing the values the old
                    // mysql_fetch_array() produced.
                    $rows[] = array_map(
                        static function ($value) {
                            return $value === null ? null : (string) $value;
                        },
                        $row
                    );
                }
                $result->free();
            }
            $stmt->close();
            return $rows;
        } catch (mysqli_sql_exception $e) {
            die($e->getMessage());
        }
    }

    /**
     * Returns the connection to the `weibo` database, opening it on first use.
     *
     * NOTE(review): no connection charset is set, matching the original code —
     * confirm the server default suits the stored Chinese text.
     *
     * @return mysqli
     */
    public function con_weibo(){
        if($this->con === null){
            $this->con = $this->connect('weibo');
        }
        return $this->con;
    }

    /**
     * Reads the crawl flag of one row.
     *
     * NOTE(review): this filters on the `id` column while every other method
     * filters on `user_id`; kept as in the original — confirm which is intended.
     *
     * @param mixed $id Value matched against `user`.`id`.
     * @return array|false Row containing `is_get`, or false when no row matches.
     */
    public function is_get($id){
        $rows = $this->run_query('select is_get from user where id=?', 's', array($id), true);
        return $rows ? $rows[0] : false;
    }

    /**
     * Marks a user as crawled (is_get = 1).
     *
     * @param mixed $id Value matched against `user`.`user_id`.
     * @return bool Always true; a failed update terminates the script.
     */
    public function set_get($id){
        $this->run_query('update user set is_get=1 where user_id=?', 's', array($id), false);
        return true;
    }

    /**
     * Lists the user_ids that still have to be crawled: rows with is_get = 0
     * and a non-empty name.
     *
     * @return array List of user_id values (strings).
     */
    public function get_ids(){
        $rows = $this->run_query("select user_id from user where is_get=0 and name!=''", '', array(), true);
        $ids = array();
        foreach($rows as $row){
            $ids[] = $row['user_id'];
        }
        return $ids;
    }

    /**
     * Looks a user up by user_id.
     *
     * @param mixed $id Value matched against `user`.`user_id`.
     * @return array|false The matching row, or false when the user is not stored yet.
     */
    public function find_id($id){
        $rows = $this->run_query('select user_id from user where user_id=?', 's', array($id), true);
        return $rows ? $rows[0] : false;
    }

    /**
     * Inserts one scraped profile with is_get = 0.
     *
     * The insert is positional, in the order the original statement used:
     * an empty first column, then name, address, gender, profile url,
     * timestamp, user_id and is_get.
     * NOTE(review): the empty first value presumably targets an auto-increment
     * key and is rejected by MySQL strict mode — confirm against the schema.
     *
     * @param string $name    Nickname.
     * @param string $addr    Location.
     * @param mixed  $sexual  Gender text (or 0 when the scraper discarded it).
     * @param string $url     Profile info url the data was scraped from.
     * @param mixed  $user_id Weibo user id.
     * @return void
     */
    public function save_info($name,$addr,$sexual,$url,$user_id){
        // 24-hour timestamp; the former "H:i:sa" format appended am/pm to it.
        $date = date('Y-m-d H:i:s');
        $is_get = '0';
        $this->run_query(
            "insert into user values('',?,?,?,?,?,?,?)",
            'sssssss',
            array($name, $addr, $sexual, $url, $date, $user_id, $is_get),
            false
        );
    }

    /**
     * Creates the `weibo` database (despite the method name, no table is created).
     * Prints 'create success' on success and terminates the script otherwise.
     *
     * @return void
     */
    public function create_table(){
        // Separate connection with no database selected: `weibo` may not exist yet.
        $con = $this->connect('');
        try {
            $create_db = $con->query('create database weibo');
        } catch (mysqli_sql_exception $e) {
            die('counld not query: ' . $e->getMessage());
        }
        if($create_db){
            echo'create success';
        }else{
            die('counld not query: ' . $con->error);
        }
        $con->close();
    }
}

Personal类:用于获取关注的人

<?php
//用户获取关注的人
class Personal{
/**
* @param $id 用户id
* @param $personal personal对象
* @param $user_info 抓取用户信息对象
* @param $curd 数据库操作对象
*/
function run($id,$personal,$user_info,$curd){
//因为微博只允许非本人看到6页关注的人,所以这里只进行了六次循环
for($i=1;$i<6;$i++){
$curl_url = 'http://weibo.com/p/100505'.$id.'/follow?page='.$i.'#Pl_Official_HisRelation__61';//用户的关注的人的页面链接
$personal->page($curl_url,$user_info,$curd);
}
$curd->set_get($id);//将用户标记为已抓取状态
}
/**
* @param $curl_url curl地址
* @param $user_info 抓取用户信息对象
* @param $curd 数据库操作对象
*/
function page($curl_url,$user_info,$curd){
$ch = curl_init();
$options = array(
//referer,防外链,登录微博随便访问一个好友的页面F12即可获取
CURLOPT_REFERER => '在浏览器登录微博获取',
CURLOPT_URL => $curl_url,
CURLOPT_RETURNTRANSFER => 1,
CURLOPT_TIMEOUT => 100,
CURLOPT_CONNECTTIMEOUT => 10,
CURLOPT_HEADER => false,
CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
//页面cookie,登录微博随便访问一个好友的页面F12即可获取,过期需要更新
CURLOPT_COOKIE => '在浏览器登录微博获取',
);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
curl_setopt($ch, CURLOPT_ENCODING, "gzip, deflate, sdch");
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_TIMEOUT,120);
curl_setopt_array($ch,$options);
$res = curl_exec($ch);
curl_close($ch);
//echo $res;
//截取带用户信息的源码,缩小范围
$res1 = strpos($res,'userListBox');
$res2 = strpos($res,'pageList');
$res = substr($res,$res1,$res2-$res1);
//$res = strip_tags($res);
//匹配所有符合正则表达式'/uid=([0-9]{10})/'的字符串
$reg = preg_match_all('/id=([0-9]{10})/',$res,$match);
echo 'reg:'.$reg.'</br>';
$ids = array_unique($match[0]);//去掉重复url下标不会改
//echo 'ids:'.count($ids).'</br>';
foreach ($ids as $user_id){
$user_id = substr($user_id, 3, 10);
//echo 'user_id:'.$user_id.'</br>';
$info_url = 'http://weibo.com/p/100505'.$user_id.'/info?mod=pedit_more';
//echo 'info_url:'.$info_url.'<br>';
//到数据库匹配是否已存在当前用户
if(!$curd->find_id($user_id)){
$user_info->user($info_url,$user_id,$curd);//通过user_id获取用户信息
}
}
}
}
<?php //用户获取关注的人 class Personal{ /** * @param $id 用户id * @param $personal personal对象 * @param $user_info 抓取用户信息对象 * @param $curd 数据库操作对象 */ function run($id,$personal,$user_info,$curd){ //因为微博只允许非本人看到6页关注的人,所以这里只进行了六次循环 for($i=1;$i<6;$i++){ $curl_url = 'http://weibo.com/p/100505'.$id.'/follow?page='.$i.'#Pl_Official_HisRelation__61';//用户的关注的人的页面链接 $personal->page($curl_url,$user_info,$curd); } $curd->set_get($id);//将用户标记为已抓取状态 } /** * @param $curl_url curl地址 * @param $user_info 抓取用户信息对象 * @param $curd 数据库操作对象 */ function page($curl_url,$user_info,$curd){ $ch = curl_init(); $options = array( //referer,防外链,登录微博随便访问一个好友的页面F12即可获取 CURLOPT_REFERER => '在浏览器登录微博获取', CURLOPT_URL => $curl_url, CURLOPT_RETURNTRANSFER => 1, CURLOPT_TIMEOUT => 100, CURLOPT_CONNECTTIMEOUT => 10, CURLOPT_HEADER => false, CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36', //页面cookie,登录微博随便访问一个好友的页面F12即可获取,过期需要更新 CURLOPT_COOKIE => '在浏览器登录微博获取', ); curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE); curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2); curl_setopt($ch, CURLOPT_ENCODING, "gzip, deflate, sdch"); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false); curl_setopt($ch, CURLOPT_TIMEOUT,120); curl_setopt_array($ch,$options); $res = curl_exec($ch); curl_close($ch); //echo $res; //截取带用户信息的源码,缩小范围 $res1 = strpos($res,'userListBox'); $res2 = strpos($res,'pageList'); $res = substr($res,$res1,$res2-$res1); //$res = strip_tags($res); //匹配所有符合正则表达式'/uid=([0-9]{10})/'的字符串 $reg = preg_match_all('/id=([0-9]{10})/',$res,$match); echo 'reg:'.$reg.'</br>'; $ids = array_unique($match[0]);//去掉重复url下标不会改 //echo 'ids:'.count($ids).'</br>'; foreach ($ids as $user_id){ $user_id = substr($user_id, 3, 10); //echo 'user_id:'.$user_id.'</br>'; $info_url = 'http://weibo.com/p/100505'.$user_id.'/info?mod=pedit_more'; //echo 'info_url:'.$info_url.'<br>'; //到数据库匹配是否已存在当前用户 if(!$curd->find_id($user_id)){ 
$user_info->user($info_url,$user_id,$curd);//通过user_id获取用户信息 } } } }
<?php
//用户获取关注的人
/**
 * Crawls the "following" list pages of a Weibo user and passes every user id
 * found there, if not stored yet, to the profile scraper.
 */
class Personal{
    /**
     * Requests follow-list pages 1 to 5 of one user, then marks that user as crawled.
     *
     * NOTE(review): the original comment said six pages are visible to
     * non-owners, but the loop has always requested five — confirm the limit.
     *
     * @param $id           id of the user whose follow list is crawled
     * @param $personal     Personal object whose page() method is called
     * @param $user_info    profile scraper object (User_info)
     * @param $curd         database access object (Curd)
     */
    function run($id,$personal,$user_info,$curd){
        for($page=1;$page<6;$page++){
            // Follow-list page url of the user.
            $curl_url = 'http://weibo.com/p/100505'.$id.'/follow?page='.$page.'#Pl_Official_HisRelation__61';
            $personal->page($curl_url,$user_info,$curd);
        }
        // Mark the user as crawled.
        $curd->set_get($id);
    }

    /**
     * Downloads one follow-list page, extracts the 10-digit user ids listed
     * between the 'userListBox' and 'pageList' markers and scrapes the profile
     * of every id that is not in the database yet.
     *
     * When the download fails or the markers are missing or out of order, the
     * page yields no ids. This is typically a sign of an expired cookie.
     *
     * @param $curl_url     url of the follow-list page
     * @param $user_info    profile scraper object (User_info)
     * @param $curd         database access object (Curd)
     */
    function page($curl_url,$user_info,$curd){
        $ch = curl_init();
        curl_setopt_array($ch, array(
            // NOTE(review): peer verification is disabled as in the original;
            // it has no effect on the plain-http urls requested here.
            CURLOPT_SSL_VERIFYPEER => FALSE,
            CURLOPT_SSL_VERIFYHOST => 2,
            CURLOPT_ENCODING => "gzip, deflate, sdch",
            CURLOPT_FOLLOWLOCATION => false,
            // Referer (anti-hotlinking): copy it from the browser dev tools
            // while logged in to Weibo and viewing a friend's page.
            CURLOPT_REFERER => '在浏览器登录微博获取',
            CURLOPT_URL => $curl_url,
            CURLOPT_RETURNTRANSFER => 1,
            // The original set 120 first and then overrode it with 100.
            CURLOPT_TIMEOUT => 100,
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_HEADER => false,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
            // Session cookie, obtained the same way; must be refreshed once it expires.
            CURLOPT_COOKIE => '在浏览器登录微博获取',
        ));
        $html = curl_exec($ch);
        curl_close($ch);
        if(!is_string($html)){
            // curl_exec() returns false when the transfer fails.
            $html = '';
        }

        // Narrow the markup down to the section that lists the followed users.
        $start = strpos($html,'userListBox');
        $end = strpos($html,'pageList');
        $section = '';
        if($start !== false && $end !== false && $end > $start){
            $section = substr($html,$start,$end-$start);
        }

        // Collect every "id=<10 digits>" occurrence in the section.
        $reg = preg_match_all('/id=([0-9]{10})/',$section,$match);
        echo 'reg:'.$reg.'</br>';
        foreach (array_unique($match[1]) as $user_id){
            $info_url = 'http://weibo.com/p/100505'.$user_id.'/info?mod=pedit_more';
            // Scrape the profile only when the user is not stored yet.
            if(!$curd->find_id($user_id)){
                $user_info->user($info_url,$user_id,$curd);
            }
        }
    }
}

Info_url类:用于获取用户信息的url

<?php
ini_set('max_execution_time', '0');//设置执行时间限制为零(无限制)
//此类用于获取用户信息url
class Info_url{
function get_home($url,$referer){
$ch = curl_init();
$options = array(
CURLOPT_REFERER => $referer,
CURLOPT_URL => $url,
CURLOPT_RETURNTRANSFER =>1,
CURLOPT_HEADER => FALSE,
CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
//页面cookie,登录微博随便访问一个好友的页面F12即可获取,过期需要更新
CURLOPT_COOKIE =>'在浏览器登录微博获取',
CURLOPT_SSL_VERIFYPEER =>FALSE,
CURLOPT_ENCODING => 'gzip,deflate,sdch',
CURLOPT_TIMEOUT => 120,
CURLOPT_FOLLOWLOCATION =>FALSE,
);
curl_setopt_array($ch,$options);
$res = curl_exec($ch);
return $res;
//$res1 = strpos($res,'WB_cardmore S_txt1 S_line1 clearfix');
//echo $res1;
//echo '<br>';
//$res2 = strpos($res,'info?mod=pedit_more');
//echo $res2;
//$res = substr($res,$res1,$res2-$res1);
}
function get_url($url){
$referer = '在浏览器登录微博获取';
$res = $this->get_home($url,$referer);
$pre = preg_match('/100505[0-9]{10}/',$res,$matchs);//匹配链接id
if($pre==0){
$pre = preg_match('/[0-9]{16}/',$res,$matchs);
if($pre==0){
$referer = '在浏览器登录微博获取';
$res = $this->get_home($url,$referer);
//echo 'res:'.$res;
$pre = preg_match('/[0-9]{10}/',$res,$matchs);
if($pre==0){
$referer = '在浏览器登录微博获取';
$res = $this->get_home($url,$referer);
//echo 'res:'.$res;
$pre = preg_match('/[0-9]{10}/',$res,$matchs);
}}
}
//echo $pre;
return $matchs[0];
//echo $res;
}
}
<?php ini_set('max_execution_time', '0');//设置执行时间限制为零(无限制) //此类用于获取用户信息url class Info_url{ function get_home($url,$referer){ $ch = curl_init(); $options = array( CURLOPT_REFERER => $referer, CURLOPT_URL => $url, CURLOPT_RETURNTRANSFER =>1, CURLOPT_HEADER => FALSE, CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36', //页面cookie,登录微博随便访问一个好友的页面F12即可获取,过期需要更新 CURLOPT_COOKIE =>'在浏览器登录微博获取', CURLOPT_SSL_VERIFYPEER =>FALSE, CURLOPT_ENCODING => 'gzip,deflate,sdch', CURLOPT_TIMEOUT => 120, CURLOPT_FOLLOWLOCATION =>FALSE, ); curl_setopt_array($ch,$options); $res = curl_exec($ch); return $res; //$res1 = strpos($res,'WB_cardmore S_txt1 S_line1 clearfix'); //echo $res1; //echo '<br>'; //$res2 = strpos($res,'info?mod=pedit_more'); //echo $res2; //$res = substr($res,$res1,$res2-$res1); } function get_url($url){ $referer = '在浏览器登录微博获取'; $res = $this->get_home($url,$referer); $pre = preg_match('/100505[0-9]{10}/',$res,$matchs);//匹配链接id if($pre==0){ $pre = preg_match('/[0-9]{16}/',$res,$matchs); if($pre==0){ $referer = '在浏览器登录微博获取'; $res = $this->get_home($url,$referer); //echo 'res:'.$res; $pre = preg_match('/[0-9]{10}/',$res,$matchs); if($pre==0){ $referer = '在浏览器登录微博获取'; $res = $this->get_home($url,$referer); //echo 'res:'.$res; $pre = preg_match('/[0-9]{10}/',$res,$matchs); }} } //echo $pre; return $matchs[0]; //echo $res; } }
<?php
    ini_set('max_execution_time', '0');//设置执行时间限制为零(无限制)
    //此类用于获取用户信息url
    /**
     * Resolves the numeric id embedded in a Weibo home page, from which the
     * url of the user's info page can be built.
     */
    class Info_url{
        /**
         * Downloads a page with the crawler's browser-like headers.
         *
         * @param $url      page to download
         * @param $referer  value sent as the Referer header
         * @return string|false page body, or false when the transfer fails
         */
        function get_home($url,$referer){
            $ch = curl_init();
            curl_setopt_array($ch, array(
                CURLOPT_REFERER => $referer,
                CURLOPT_URL => $url,
                CURLOPT_RETURNTRANSFER =>1,
                CURLOPT_HEADER => FALSE,
                CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
                // Session cookie: copy it from the browser dev tools while logged
                // in to Weibo; must be refreshed once it expires.
                CURLOPT_COOKIE =>'在浏览器登录微博获取',
                CURLOPT_SSL_VERIFYPEER =>FALSE,
                CURLOPT_ENCODING => 'gzip,deflate,sdch',
                CURLOPT_TIMEOUT => 120,
                CURLOPT_FOLLOWLOCATION =>FALSE,
            ));
            $res = curl_exec($ch);
            // Release the handle; the original never closed it.
            curl_close($ch);
            return $res;
        }

        /**
         * Extracts the page id from the page at $url.
         *
         * The first download is searched for "100505" followed by ten digits,
         * then for any 16-digit run. If neither is present the page is
         * downloaded again, up to two more times, and searched for any
         * 10-digit run.
         *
         * @param $url  home page url of the user
         * @return string|null the matched digits, or null when nothing matched
         */
        function get_url($url){
            $referer = '在浏览器登录微博获取';

            // A failed download (false) is searched as an empty page.
            $page = (string)$this->get_home($url,$referer);
            foreach (array('/100505[0-9]{10}/', '/[0-9]{16}/') as $pattern){
                if(preg_match($pattern,$page,$matchs) === 1){
                    return $matchs[0];
                }
            }

            for($attempt=0;$attempt<2;$attempt++){
                $page = (string)$this->get_home($url,$referer);
                if(preg_match('/[0-9]{10}/',$page,$matchs) === 1){
                    return $matchs[0];
                }
            }

            // Nothing matched. The original read $matchs[0] here, which raised
            // an undefined-index warning and evaluated to null.
            return null;
        }
    }

User_info类:用于获取用户信息

<?php
//此类用于获取用户信息
class User_info{
//有些微博认证用户的个人信息页面结构不一样,获取不到用户信息
function user($url,$user_id,$curd){
$ch = curl_init();
$options = array(
//referer,防外链,登录微博随便访问一个好友的页面F12即可获取
CURLOPT_REFERER => '在浏览器登录微博获取',
CURLOPT_URL => $url,
CURLOPT_RETURNTRANSFER => 1,
CURLOPT_TIMEOUT => 100,
CURLOPT_CONNECTTIMEOUT => 10,
CURLOPT_HEADER => false,
CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
//页面cookie,登录微博随便访问一个好友的个人信息页面F12即可获取,过期需要更新
CURLOPT_COOKIE => '在浏览器登录微博获取',
);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
curl_setopt($ch, CURLOPT_ENCODING, "gzip, deflate, sdch");
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_TIMEOUT,120);
curl_setopt_array($ch,$options);
$res = curl_exec($ch);
curl_close($ch);
// echo $res;
// var_dump($res);
//根据分析页面结构,截取需要的用户信息
$sexual1 = strpos($res,'性别');
$sexual2 = strpos($res,'生日');
$sexual = substr($res,$sexual1,$sexual2-$sexual1);
$sexual = substr($sexual,strpos($sexual,'>'),strpos($sexual,'<\/span><\/li>')-strpos($sexual,'>'));
$sexual = substr($sexual,1,strlen($sexual)-1);
$sexual = strip_tags($sexual);
echo '<td>性别:'.$sexual.'</td>';
$sexual = strlen($sexual)>10?0:$sexual;
$addr1 = strpos($res,'所在地');
$addr2 = strpos($res,'性别');
$addr = substr($res,$addr1,$addr2-$addr1);
$addr = substr($addr,strpos($addr,'>'),strpos($addr,'<\/span><\/li>')-strpos($addr,'>'));
$addr = substr($addr,1,strlen($addr)-1);
$addr = strip_tags($addr);
echo '<td>地址:'.$addr.'</td>';
$name1 = strpos($res,'昵称');
$name2 = strpos($res,'所在地');
$name = substr($res,$name1,$name2-$name1);
$name = substr($name,strpos($name,'>'),strpos($name,'<\/span><\/li>')-strpos($name,'>'));
$name = substr($name,1,strlen($name)-1);
$name = strip_tags($name);
echo '<td>昵称:'.$name.'</td>';
$name = strlen($name)>50?"to long":$name;
echo '<br>';
// die();
//保存用户信息
$curd->save_info($name,$addr,$sexual,$url,$user_id);
}
<?php //此类用于获取用户信息 class User_info{ //有些微博认证用户的个人信息页面结构不一样,获取不到用户信息 function user($url,$user_id,$curd){ $ch = curl_init(); $options = array( //referer,防外链,登录微博随便访问一个好友的页面F12即可获取 CURLOPT_REFERER => '在浏览器登录微博获取', CURLOPT_URL => $url, CURLOPT_RETURNTRANSFER => 1, CURLOPT_TIMEOUT => 100, CURLOPT_CONNECTTIMEOUT => 10, CURLOPT_HEADER => false, CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36', //页面cookie,登录微博随便访问一个好友的个人信息页面F12即可获取,过期需要更新 CURLOPT_COOKIE => '在浏览器登录微博获取', ); curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE); curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2); curl_setopt($ch, CURLOPT_ENCODING, "gzip, deflate, sdch"); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false); curl_setopt($ch, CURLOPT_TIMEOUT,120); curl_setopt_array($ch,$options); $res = curl_exec($ch); curl_close($ch); // echo $res; // var_dump($res); //根据分析页面结构,截取需要的用户信息 $sexual1 = strpos($res,'性别'); $sexual2 = strpos($res,'生日'); $sexual = substr($res,$sexual1,$sexual2-$sexual1); $sexual = substr($sexual,strpos($sexual,'>'),strpos($sexual,'<\/span><\/li>')-strpos($sexual,'>')); $sexual = substr($sexual,1,strlen($sexual)-1); $sexual = strip_tags($sexual); echo '<td>性别:'.$sexual.'</td>'; $sexual = strlen($sexual)>10?0:$sexual; $addr1 = strpos($res,'所在地'); $addr2 = strpos($res,'性别'); $addr = substr($res,$addr1,$addr2-$addr1); $addr = substr($addr,strpos($addr,'>'),strpos($addr,'<\/span><\/li>')-strpos($addr,'>')); $addr = substr($addr,1,strlen($addr)-1); $addr = strip_tags($addr); echo '<td>地址:'.$addr.'</td>'; $name1 = strpos($res,'昵称'); $name2 = strpos($res,'所在地'); $name = substr($res,$name1,$name2-$name1); $name = substr($name,strpos($name,'>'),strpos($name,'<\/span><\/li>')-strpos($name,'>')); $name = substr($name,1,strlen($name)-1); $name = strip_tags($name); echo '<td>昵称:'.$name.'</td>'; $name = strlen($name)>50?"to long":$name; echo '<br>'; // die(); //保存用户信息 $curd->save_info($name,$addr,$sexual,$url,$user_id); }
<?php
//此类用于获取用户信息
/**
 * Scrapes gender, location and nickname from a Weibo profile info page and
 * stores them through the database object.
 */
class User_info{
    /**
     * Downloads the info page at $url, prints the extracted fields as HTML
     * table cells and saves them with $curd->save_info().
     *
     * Some verified accounts use a different page layout; their fields cannot
     * be located and are saved as empty strings.
     *
     * @param $url      url of the user's info page
     * @param $user_id  Weibo id of the user
     * @param $curd     database access object (Curd)
     */
    function user($url,$user_id,$curd){
        $res = $this->fetch_page($url);

        // Each field sits between its own label and the label of the next field.
        $sexual = $this->extract_field($res,'性别','生日');
        echo '<td>性别:'.$sexual.'</td>';
        // Discard values longer than 10 bytes (strlen counts bytes, not characters).
        $sexual = strlen($sexual)>10?0:$sexual;

        $addr = $this->extract_field($res,'所在地','性别');
        echo '<td>地址:'.$addr.'</td>';

        $name = $this->extract_field($res,'昵称','所在地');
        echo '<td>昵称:'.$name.'</td>';
        // Replace names longer than 50 bytes with a marker.
        $name = strlen($name)>50?"to long":$name;
        echo '<br>';

        // Persist the profile.
        $curd->save_info($name,$addr,$sexual,$url,$user_id);
    }

    /**
     * Downloads a page with the crawler's browser-like headers.
     *
     * @param $url  page to download
     * @return string page body, or '' when the transfer fails
     */
    private function fetch_page($url){
        $ch = curl_init();
        curl_setopt_array($ch, array(
            // NOTE(review): peer verification is disabled as in the original.
            CURLOPT_SSL_VERIFYPEER => FALSE,
            CURLOPT_SSL_VERIFYHOST => 2,
            CURLOPT_ENCODING => "gzip, deflate, sdch",
            CURLOPT_FOLLOWLOCATION => false,
            // Referer (anti-hotlinking): copy it from the browser dev tools
            // while logged in to Weibo and viewing a friend's page.
            CURLOPT_REFERER => '在浏览器登录微博获取',
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => 1,
            // The original set 120 first and then overrode it with 100.
            CURLOPT_TIMEOUT => 100,
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_HEADER => false,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
            // Session cookie, taken from a friend's info page the same way;
            // must be refreshed once it expires.
            CURLOPT_COOKIE => '在浏览器登录微博获取',
        ));
        $res = curl_exec($ch);
        curl_close($ch);
        // curl_exec() returns false when the transfer fails.
        return is_string($res) ? $res : '';
    }

    /**
     * Returns the tag-free text of one profile field.
     *
     * The field is searched between the first occurrence of $label and the
     * first occurrence of $next_label. Inside that slice the value is whatever
     * lies between the first '>' and the first '<\/span><\/li>'. The
     * backslashes are literal: that is the form in which the closing tags
     * occur in the downloaded page.
     *
     * Returns '' when a label or delimiter is missing or out of order. The
     * original passed the failed strpos() results straight to substr() and
     * extracted unrelated text in that case.
     *
     * @param $html        downloaded page
     * @param $label       label that starts the field
     * @param $next_label  label of the following field, which ends the slice
     * @return string
     */
    private function extract_field($html,$label,$next_label){
        $from = strpos($html,$label);
        $to = strpos($html,$next_label);
        if($from === false || $to === false || $to <= $from){
            return '';
        }
        $slice = substr($html,$from,$to-$from);

        $open = strpos($slice,'>');
        $close = strpos($slice,'<\/span><\/li>');
        if($open === false || $close === false || $close <= $open){
            return '';
        }
        // Skip the '>' itself and stop right before the closing marker.
        return strip_tags(substr($slice,$open+1,$close-$open-1));
    }
}

运行:

<?php
/**
* Created by PhpStorm.
* User: ROOT
* Date: 2016/11/27
* Time: 22:38
*/
ini_set('max_execution_time', '0');
//ini_set('date.timezone','Asia/Shanghai');
include('./Curd.php');
$curd = new Curd();
//include('./Info_url.php');
//$info = new Info_url();
include('./User_info.php');
$user_info = new User_info();
include('./Personal.php');
$personal = new Personal();
//从数据库中获取那些未被抓取的用户
$data = $curd->get_ids();
for($i=0; $i<count($data); $i++){
//获取数组中最后一个id
$id_se = $data[count($data)-$i-1];
echo 'id:'.$id_se.'<br>';
$personal->run($id_se,$personal,$user_info,$curd);
}
<?php /** * Created by PhpStorm. * User: ROOT * Date: 2016/11/27 * Time: 22:38 */ ini_set('max_execution_time', '0'); //ini_set('date.timezone','Asia/Shanghai'); include('./Curd.php'); $curd = new Curd(); //include('./Info_url.php'); //$info = new Info_url(); include('./User_info.php'); $user_info = new User_info(); include('./Personal.php'); $personal = new Personal(); //从数据库中获取那些未被抓取的用户 $data = $curd->get_ids(); for($i=0; $i<count($data); $i++){ //获取数组中最后一个id $id_se = $data[count($data)-$i-1]; echo 'id:'.$id_se.'<br>'; $personal->run($id_se,$personal,$user_info,$curd); }
<?php
/**
 * Created by PhpStorm.
 * User: ROOT
 * Date: 2016/11/27
 * Time: 22:38
 */
// The crawl can run for a long time: lift the execution time limit.
ini_set('max_execution_time', '0');
//ini_set('date.timezone','Asia/Shanghai');

// Load the class files; Info_url is not needed by this script.
include('./Curd.php');
include('./User_info.php');
include('./Personal.php');
//include('./Info_url.php');
//$info = new Info_url();

// Collaborators shared by every crawl step.
$curd = new Curd();
$user_info = new User_info();
$personal = new Personal();

// Users that have not been crawled yet, processed from the last id to the first.
$data = $curd->get_ids();
foreach (array_reverse($data) as $id_se) {
    echo 'id:'.$id_se.'<br>';
    $personal->run($id_se,$personal,$user_info,$curd);
}

 

赞 (0)