php cURL 对接IP池API代码封装

封装了四个类,主要使用了curl来抓取用户的个人信息页面以及关注的用户页面,然后通过分析页面结构使用正则表达式以及php的字符串函数截取所需的信息。
Curd类:用于进行数据库操作;
Personal类:用于获取关注的人;
Info_url类:用于获取用户信息的url;
User_info类:用于获取用户信息。
需要注意的是cookie和referer有一个有效期,过了一段时间需要更换。设置cookie和referer的地方需要自己去获取替换掉。
下面上码:

Curd类:用于进行数据库操作

<?php

//$curd = new Curd();
//$data = $curd->get_id();
//var_dump($data);
//此类用于进行数据库操作
/**
 * Data-access helper for the `user` table of the local `weibo` MySQL database.
 *
 * The class talks to MySQL through the legacy ext/mysql functions (mysql_*),
 * exactly like the original code, so con_weibo() keeps returning the same kind
 * of link identifier to its callers.
 * NOTE(review): ext/mysql was removed in PHP 7; migrating to mysqli/PDO would
 * change what con_weibo() returns and is therefore left as a follow-up.
 *
 * Every value interpolated into SQL goes through quote(), because the names and
 * addresses stored here are scraped from remote pages and cannot be trusted.
 */
class Curd{
    /**
     * Connect to the MySQL server without selecting a database.
     *
     * Terminates the script with an error message when the connection fails.
     *
     * @return resource MySQL link identifier
     */
    private function open_link(){
        $con = mysql_connect('127.0.0.1:3306','root','');
        if(!$con){
            die('counld not connect: ' .mysql_error());
        }
        return $con;
    }

    /**
     * Escape a value and wrap it in double quotes for use as an SQL string literal.
     *
     * @param mixed    $value value to embed in a statement
     * @param resource $con   link whose character set drives the escaping
     * @return string quoted literal, including the surrounding quotes
     */
    private function quote($value,$con){
        return '"'.mysql_real_escape_string((string)$value,$con).'"';
    }

    /**
     * Connect to the server and switch to the `weibo` database.
     *
     * @return resource MySQL link identifier
     */
    function con_weibo(){
        $con = $this->open_link();
        mysql_query('use weibo',$con);
        return $con;
    }

    /**
     * Read the crawl flag (`is_get`) of one row.
     *
     * NOTE(review): this filters on the `id` column while set_get() and
     * find_id() filter on `user_id`; confirm which column callers mean.
     *
     * @param mixed $id value compared against the `id` column
     * @return array|false fetched row, or false when no row matches
     */
    function is_get($id){
        $con = $this->con_weibo();
        $query = 'select is_get from user where id='.$this->quote($id,$con);
        $result = mysql_query($query,$con);
        if(!$result){
            die(mysql_error());
        }
        return mysql_fetch_array($result);
    }

    /**
     * Mark a user as crawled (`is_get=1`).
     *
     * @param mixed $id value compared against the `user_id` column
     * @return bool result of the UPDATE statement
     */
    function set_get($id){
        $con = $this->con_weibo();
        $query = 'update user set is_get=1 where user_id='.$this->quote($id,$con);
        $result = mysql_query($query,$con);
        if(!$result){
            die(mysql_error());
        }
        return $result;
    }

    /**
     * List the `user_id` of every user that has a non-empty name and has not
     * been crawled yet (`is_get=0`).
     *
     * @return array list of user_id values, in the order MySQL returns them
     */
    function get_ids(){
        $con = $this->con_weibo();
        $query = 'select user_id from user where is_get=0 and name!="";';
        $result = mysql_query($query,$con);
        // Check for failure before iterating: fetching from a failed query is invalid.
        if(!$result){
            die(mysql_error());
        }
        $arr = array();
        while($row = mysql_fetch_array($result)){
            $arr[] = $row['user_id'];
        }
        return $arr;
    }

    /**
     * Look a user up by `user_id`.
     *
     * @param mixed $id value compared against the `user_id` column
     * @return array|false fetched row, or false when the user is not stored yet
     */
    function find_id($id){
        $con = $this->con_weibo();
        $query = 'select user_id from user where user_id='.$this->quote($id,$con);
        $result = mysql_query($query,$con);
        // Check for failure before fetching, for the same reason as in get_ids().
        if(!$result){
            die(mysql_error());
        }
        return mysql_fetch_array($result);
    }

    /**
     * Insert one user row and close the connection afterwards.
     *
     * The INSERT is positional: an empty first column, then name, address,
     * gender, profile URL, timestamp, user_id and is_get (always 0 for new rows).
     * NOTE(review): the first column is presumably an auto-increment key - confirm.
     *
     * @param mixed $name    nickname
     * @param mixed $addr    location
     * @param mixed $sexual  gender value as produced by the scraper
     * @param mixed $url     profile info URL the data came from
     * @param mixed $user_id Weibo user id
     * @return void
     */
    function save_info($name,$addr,$sexual,$url,$user_id){
        $con = $this->con_weibo();
        // NOTE(review): the trailing "a" appends am/pm to a 24-hour time; kept
        // as-is because the stored format may be relied upon - confirm.
        $date = date("Y-m-d H:i:sa");
        $is_get = 0;
        $values = array(
            '""',
            $this->quote($name,$con),
            $this->quote($addr,$con),
            $this->quote($sexual,$con),
            $this->quote($url,$con),
            $this->quote($date,$con),
            $this->quote($user_id,$con),
            $this->quote($is_get,$con),
        );
        $insert = mysql_query('insert into user values('.implode(',',$values).')',$con);
        if(!$insert){
            die(mysql_error());
        }
        mysql_close($con);
    }

    /**
     * Create the `weibo` database (despite the method name, no table is created).
     *
     * Uses a connection without a selected database, since `weibo` does not
     * exist yet when this runs.
     *
     * @return void
     */
    function create_table(){
        $con = $this->open_link();
        $create_db = mysql_query('create database weibo',$con);
        if($create_db){
            echo'create success';
        }else{
            die('counld not query: '.mysql_error());
        }
    }
}

Personal类:用于获取关注的人

<?php
//用户获取关注的人
/**
 * Crawls the "following" list of a Weibo user and hands every newly seen
 * user id to the profile scraper.
 */
class Personal{
    /**
     * Number of follow-list pages requested per user. The original author notes
     * that Weibo only shows six pages of someone else's follow list.
     */
    const MAX_PAGES = 6;

    /**
     * Crawl all follow-list pages of one user, then mark the user as crawled.
     *
     * @param mixed $id        Weibo user id whose follow list is crawled
     * @param mixed $personal  Personal instance whose page() is invoked for each page
     * @param mixed $user_info scraper object exposing user($url,$user_id,$curd)
     * @param mixed $curd      database object exposing find_id() and set_get()
     * @return void
     */
    function run($id,$personal,$user_info,$curd){
        // Pages are 1-based; the bound is inclusive so that all six pages are requested.
        for($page=1;$page<=self::MAX_PAGES;$page++){
            $curl_url = 'http://weibo.com/p/100505'.$id.'/follow?page='.$page.'#Pl_Official_HisRelation__61';
            $personal->page($curl_url,$user_info,$curd);
        }
        $curd->set_get($id);
    }

    /**
     * Download one follow-list page and scrape every user on it that is not
     * stored in the database yet.
     *
     * @param string $curl_url  follow-list page URL
     * @param mixed  $user_info scraper object exposing user($url,$user_id,$curd)
     * @param mixed  $curd      database object exposing find_id()
     * @return void
     */
    function page($curl_url,$user_info,$curd){
        $html = $this->fetch($curl_url);
        foreach ($this->extract_user_ids($html) as $user_id){
            $info_url = 'http://weibo.com/p/100505'.$user_id.'/info?mod=pedit_more';
            // Only scrape users that are not in the database yet.
            if(!$curd->find_id($user_id)){
                $user_info->user($info_url,$user_id,$curd);
            }
        }
    }

    /**
     * Fetch a page with the browser-like headers Weibo expects.
     *
     * The referer and cookie values are placeholders: copy real ones from a
     * logged-in browser session (F12) and refresh them when they expire.
     *
     * @param string $url page to download
     * @return string|false response body, or false when the transfer failed
     */
    private function fetch($url){
        $ch = curl_init();
        curl_setopt_array($ch,array(
            CURLOPT_REFERER => '在浏览器登录微博获取',
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => 1,
            // The original set 120 first and then overrode it with 100; 100 is the effective value.
            CURLOPT_TIMEOUT => 100,
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_HEADER => false,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
            CURLOPT_COOKIE => '在浏览器登录微博获取',
            CURLOPT_SSL_VERIFYPEER => FALSE,
            CURLOPT_SSL_VERIFYHOST => 2,
            CURLOPT_ENCODING => "gzip, deflate, sdch",
            CURLOPT_FOLLOWLOCATION => false,
        ));
        $res = curl_exec($ch);
        curl_close($ch);
        return $res;
    }

    /**
     * Pull the distinct 10-digit user ids out of the user-list part of a page.
     *
     * Only the text between the 'userListBox' and 'pageList' markers is scanned.
     * When the start marker is missing (or the download failed) no ids are
     * returned; when the end marker is missing the scan runs to the end of
     * the document.
     *
     * @param string|false $html page source as returned by fetch()
     * @return array distinct user ids (first-occurrence keys are preserved)
     */
    private function extract_user_ids($html){
        $start = is_string($html) ? strpos($html,'userListBox') : false;
        if($start===false){
            echo 'reg:0</br>';
            return array();
        }
        $end = strpos($html,'pageList');
        if($end!==false && $end>$start){
            $segment = substr($html,$start,$end-$start);
        }else{
            $segment = substr($html,$start);
        }
        $reg = preg_match_all('/id=([0-9]{10})/',$segment,$match);
        echo 'reg:'.$reg.'</br>';
        if(!$reg){
            return array();
        }
        // The capture group already holds the bare id, so no re-slicing is needed.
        return array_unique($match[1]);
    }
}

Info_url类:用于获取用户信息的url

<?php
    ini_set('max_execution_time', '0');// Lift the execution time limit (0 = unlimited)
    /**
     * Resolves the numeric id used in a Weibo user's info URL by downloading
     * the user's page and searching it for id-like digit runs.
     */
    class Info_url{
        /**
         * Download a page with browser-like headers.
         *
         * The cookie is a placeholder: copy a real one from a logged-in browser
         * session (F12) and refresh it when it expires.
         *
         * @param string $url     page to download
         * @param string $referer value sent as the Referer header
         * @return string|false response body, or false when the transfer failed
         */
        function get_home($url,$referer){
            $ch = curl_init();
            $options = array(
                CURLOPT_REFERER => $referer,
                CURLOPT_URL => $url,
                CURLOPT_RETURNTRANSFER =>1,
                CURLOPT_HEADER => FALSE,
                CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
                CURLOPT_COOKIE =>'在浏览器登录微博获取',
                CURLOPT_SSL_VERIFYPEER =>FALSE,
                CURLOPT_ENCODING => 'gzip,deflate,sdch',
                CURLOPT_TIMEOUT => 120,
                CURLOPT_FOLLOWLOCATION =>FALSE,
            );
            curl_setopt_array($ch,$options);
            $res = curl_exec($ch);
            // Release the handle; the original leaked one handle per request.
            curl_close($ch);
            return $res;
        }

        /**
         * Find the id embedded in a user's page.
         *
         * The first download is searched for "100505" followed by ten digits,
         * then for any 16-digit run. If neither is found the page is downloaded
         * again, up to two more times, and searched for any 10-digit run.
         *
         * @param string $url user page to inspect
         * @return string|null the matched digits, or null when nothing matched
         */
        function get_url($url){
            // Placeholder referer: replace with a value taken from a logged-in browser session.
            $referer = '在浏览器登录微博获取';
            $matchs = array();

            $res = (string)$this->get_home($url,$referer);
            if(preg_match('/100505[0-9]{10}/',$res,$matchs)===1
                || preg_match('/[0-9]{16}/',$res,$matchs)===1){
                return $matchs[0];
            }

            // Same fallback sequence as the original nested ifs: two further downloads.
            for($attempt=0;$attempt<2;$attempt++){
                $res = (string)$this->get_home($url,$referer);
                if(preg_match('/[0-9]{10}/',$res,$matchs)===1){
                    return $matchs[0];
                }
            }

            // Nothing matched: return null explicitly instead of reading an undefined offset.
            return null;
        }
    }

User_info类:用于获取用户信息

<?php
//此类用于获取用户信息
/**
 * Scrapes nickname, location and gender from a Weibo profile info page and
 * stores them through the database object.
 *
 * Some verified accounts use a different page layout; for those the fields
 * cannot be located and are stored as empty strings.
 */
class User_info{
    /**
     * Download a profile info page, extract the user's details and save them.
     *
     * Echoes the extracted values as HTML table cells for progress output.
     * A row is saved even when the download fails or fields are missing, so
     * that the same user is not requested again.
     *
     * @param string $url     profile info URL
     * @param mixed  $user_id Weibo user id the page belongs to
     * @param mixed  $curd    database object exposing save_info()
     * @return void
     */
    function user($url,$user_id,$curd){
        $res = $this->fetch($url);
        // A failed transfer yields false; treat it as an empty document.
        $res = is_string($res) ? $res : '';

        // The labels are the Chinese field captions on the page; each field is
        // read from the text between its own caption and the next one.
        $sexual = $this->extract_field($res,'性别','生日');   // gender .. birthday
        echo '<td>性别:'.$sexual.'</td>';
        // An over-long value means the extraction went wrong; store 0 instead.
        $sexual = strlen($sexual)>10?0:$sexual;

        $addr = $this->extract_field($res,'所在地','性别');   // location .. gender
        echo '<td>地址:'.$addr.'</td>';

        $name = $this->extract_field($res,'昵称','所在地');   // nickname .. location
        echo '<td>昵称:'.$name.'</td>';
        $name = strlen($name)>50?"to long":$name;
        echo '<br>';

        $curd->save_info($name,$addr,$sexual,$url,$user_id);
    }

    /**
     * Fetch a page with the browser-like headers Weibo expects.
     *
     * The referer and cookie values are placeholders: copy real ones from a
     * logged-in browser session (F12) and refresh them when they expire.
     *
     * @param string $url page to download
     * @return string|false response body, or false when the transfer failed
     */
    private function fetch($url){
        $ch = curl_init();
        curl_setopt_array($ch,array(
            CURLOPT_REFERER => '在浏览器登录微博获取',
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => 1,
            // The original set 120 first and then overrode it with 100; 100 is the effective value.
            CURLOPT_TIMEOUT => 100,
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_HEADER => false,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
            CURLOPT_COOKIE => '在浏览器登录微博获取',
            CURLOPT_SSL_VERIFYPEER => FALSE,
            CURLOPT_SSL_VERIFYHOST => 2,
            CURLOPT_ENCODING => "gzip, deflate, sdch",
            CURLOPT_FOLLOWLOCATION => false,
        ));
        $res = curl_exec($ch);
        curl_close($ch);
        return $res;
    }

    /**
     * Extract the text of one profile field.
     *
     * Takes the part of the page from $start_label up to $end_label (or to the
     * end of the page when $end_label does not follow $start_label), then
     * returns the tag-stripped text between the first '>' and the first
     * escaped closing sequence '<\/span><\/li>' inside that part.
     *
     * @param string $html        page source
     * @param string $start_label caption that precedes the wanted value
     * @param string $end_label   caption of the following field
     * @return string field text, or '' when the field cannot be located
     */
    private function extract_field($html,$start_label,$end_label){
        $from = strpos($html,$start_label);
        if($from===false){
            return '';
        }
        $to = strpos($html,$end_label);
        if($to!==false && $to>$from){
            $segment = substr($html,$from,$to-$from);
        }else{
            $segment = substr($html,$from);
        }

        $open = strpos($segment,'>');
        $close = strpos($segment,'<\/span><\/li>');
        if($open===false || $close===false || $close<=$open){
            return '';
        }
        // Skip the '>' itself and stop right before the closing sequence.
        return strip_tags(substr($segment,$open+1,$close-$open-1));
    }
}

运行:

<?php
/**
 * Created by PhpStorm.
 * User: ROOT
 * Date: 2016/11/27
 * Time: 22:38
 */
// Let the crawl run without PHP's execution time limit.
ini_set('max_execution_time', '0');

// Load the collaborators (Info_url is not needed for this run).
include('./Curd.php');
include('./User_info.php');
include('./Personal.php');

$curd = new Curd();
$user_info = new User_info();
$personal = new Personal();

// Users that have not been crawled yet, processed from the last one to the first.
$data = $curd->get_ids();
foreach (array_reverse($data) as $id_se) {
    echo 'id:'.$id_se.'<br>';
    $personal->run($id_se,$personal,$user_info,$curd);
}

 

赞 (0)