封装了四个类,主要使用了curl来抓取用户的个人信息页面以及关注的用户页面,然后通过分析页面结构使用正则表达式以及php的字符串函数截取所需的信息。
Curd类:用于进行数据库操作;
Personal类:用于获取关注的人;
Info_url类:用于获取用户信息的url;
User_info类:用于获取用户信息。
需要注意的是cookie和referer有一个有效期,过了一段时间需要更换。设置cookie和referer的地方需要自己去获取替换掉。
下面上代码:
Curd类:用于进行数据库操作
<?php
// Usage sketch:
//   $curd = new Curd();
//   $data = $curd->get_ids();
//   var_dump($data);

/**
 * Database access for the crawler: reads and writes rows of the `user`
 * table in the `weibo` MySQL database through the legacy mysql_* extension.
 *
 * Every value placed into a SQL statement is escaped first, because names
 * and addresses come from scraped pages and may contain quote characters.
 */
class Curd
{
    /**
     * Connects to the MySQL server without selecting a database.
     *
     * @return resource MySQL link identifier; the script dies on failure.
     */
    private function connect()
    {
        $con = mysql_connect('127.0.0.1:3306', 'root', '');
        if (!$con) {
            die('counld not connect: ' . mysql_error());
        }
        return $con;
    }

    /**
     * Wraps a value in double quotes after escaping it for use in SQL.
     *
     * @param mixed    $value value to embed in a statement
     * @param resource $con   link whose character set drives the escaping
     * @return string quoted SQL literal
     */
    private function quote($value, $con)
    {
        return '"' . mysql_real_escape_string((string) $value, $con) . '"';
    }

    /**
     * Connects to MySQL and selects the `weibo` database.
     *
     * @return resource MySQL link identifier; the script dies on failure.
     */
    public function con_weibo()
    {
        $con = $this->connect();
        // Fail here rather than on the first real query when the database is missing.
        if (!mysql_query('use weibo', $con)) {
            die(mysql_error($con));
        }
        return $con;
    }

    /**
     * Reads the crawl state of a user.
     *
     * NOTE(review): the original filtered on `id`; this now filters on
     * `user_id` to agree with set_get() and find_id() - confirm against the schema.
     *
     * @param string|int $id user id
     * @return array|false row containing `is_get`, or false when no row matches
     */
    public function is_get($id)
    {
        $con = $this->con_weibo();
        $query = 'select is_get from user where user_id=' . $this->quote($id, $con);
        $result = mysql_query($query, $con);
        if (!$result) {
            die(mysql_error($con));
        }
        return mysql_fetch_array($result);
    }

    /**
     * Marks a user as crawled (is_get = 1).
     *
     * @param string|int $id user id
     * @return bool true when the update statement succeeded
     */
    public function set_get($id)
    {
        $con = $this->con_weibo();
        $query = 'update user set is_get=1 where user_id=' . $this->quote($id, $con);
        $result = mysql_query($query, $con);
        if (!$result) {
            die(mysql_error($con));
        }
        return $result;
    }

    /**
     * Lists the ids of users that have a non-empty name and are not crawled yet.
     *
     * @return array list of user_id values
     */
    public function get_ids()
    {
        $con = $this->con_weibo();
        $query = 'select user_id from user where is_get=0 and name!="";';
        $result = mysql_query($query, $con);
        // Check for failure before fetching; fetching from a failed result is invalid.
        if (!$result) {
            die(mysql_error($con));
        }
        $arr = array();
        while ($row = mysql_fetch_array($result)) {
            $arr[] = $row['user_id'];
        }
        return $arr;
    }

    /**
     * Looks a user up by user_id.
     *
     * @param string|int $id user id
     * @return array|false matching row, or false when the user is not stored
     */
    public function find_id($id)
    {
        $con = $this->con_weibo();
        $query = 'select user_id from user where user_id=' . $this->quote($id, $con);
        $result = mysql_query($query, $con);
        if (!$result) {
            die(mysql_error($con));
        }
        return mysql_fetch_array($result);
    }

    /**
     * Inserts a newly discovered user with is_get = 0.
     *
     * @param string     $name    nickname
     * @param string     $addr    location
     * @param string|int $sexual  gender text as scraped
     * @param string     $url     profile info URL the data was read from
     * @param string|int $user_id user id
     * @return void
     */
    public function save_info($name, $addr, $sexual, $url, $user_id)
    {
        $con = $this->con_weibo();
        // 24-hour timestamp; the former trailing "a" appended am/pm to it.
        $date = date("Y-m-d H:i:s");
        $is_get = 0;

        $values = array(
            '""', // first column is left empty, as in the original statement
            $this->quote($name, $con),
            $this->quote($addr, $con),
            $this->quote($sexual, $con),
            $this->quote($url, $con),
            $this->quote($date, $con),
            $this->quote($user_id, $con),
            $this->quote($is_get, $con),
        );
        $insert = mysql_query('insert into user values(' . implode(',', $values) . ')', $con);
        if (!$insert) {
            die(mysql_error($con));
        }
        mysql_close($con);
    }

    /**
     * Creates the `weibo` database (despite the name, no table is created here).
     *
     * Connects without selecting a database, since `weibo` does not exist yet.
     *
     * @return void
     */
    public function create_table()
    {
        $con = $this->connect();
        $create_db = mysql_query('create database weibo', $con);
        if ($create_db) {
            echo 'create success';
        } else {
            die('counld not query: ' . mysql_error($con));
        }
    }
}
Personal类:用于获取关注的人
<?php
/**
 * Walks the "following" list pages of a user and hands every user id found
 * there that is not stored yet to the user-info scraper.
 */
class Personal
{
    /**
     * Crawls the following-list pages of one user, then marks that user as crawled.
     *
     * @param string|int $id        user id
     * @param Personal   $personal  object whose page() method fetches each list page
     * @param object     $user_info scraper exposing user($url, $user_id, $curd)
     * @param object     $curd      database object exposing find_id() and set_get()
     * @return void
     */
    public function run($id, $personal, $user_info, $curd)
    {
        // Only six pages of another user's following list are visible, so
        // pages 1 through 6 are requested (the old bound stopped at page 5).
        for ($i = 1; $i <= 6; $i++) {
            $curl_url = 'http://weibo.com/p/100505' . $id . '/follow?page=' . $i . '#Pl_Official_HisRelation__61';
            $personal->page($curl_url, $user_info, $curd);
        }
        $curd->set_get($id);
    }

    /**
     * Downloads one following-list page and scrapes every new user on it.
     *
     * @param string $curl_url  URL of the list page
     * @param object $user_info scraper exposing user($url, $user_id, $curd)
     * @param object $curd      database object exposing find_id()
     * @return void
     */
    public function page($curl_url, $user_info, $curd)
    {
        $ch = curl_init();
        curl_setopt_array($ch, array(
            // Referer and cookie must be copied from a logged-in browser
            // session (F12 on any friend's page) and refreshed when they expire.
            CURLOPT_REFERER => '在浏览器登录微博获取',
            CURLOPT_COOKIE => '在浏览器登录微博获取',
            CURLOPT_URL => $curl_url,
            CURLOPT_RETURNTRANSFER => 1,
            // The original set 120 and then overrode it with 100; 100 is what took effect.
            CURLOPT_TIMEOUT => 100,
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_HEADER => false,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => 2,
            CURLOPT_ENCODING => "gzip, deflate, sdch",
            CURLOPT_FOLLOWLOCATION => false,
        ));
        $res = curl_exec($ch);
        curl_close($ch);

        // Narrow the page to the markup between the two markers; a failed
        // request or a page without both markers yields an empty section.
        $section = '';
        if (is_string($res)) {
            $from = strpos($res, 'userListBox');
            $to = strpos($res, 'pageList');
            if ($from !== false && $to !== false && $to > $from) {
                $section = substr($res, $from, $to - $from);
            }
        }

        $match = array();
        $reg = preg_match_all('/id=([0-9]{10})/', $section, $match);
        echo 'reg:' . $reg . '</br>';

        // Capture group 1 already holds the bare ten-digit ids.
        $ids = empty($match[1]) ? array() : array_unique($match[1]);
        foreach ($ids as $user_id) {
            $info_url = 'http://weibo.com/p/100505' . $user_id . '/info?mod=pedit_more';
            // Skip users that are already stored.
            if (!$curd->find_id($user_id)) {
                $user_info->user($info_url, $user_id, $curd);
            }
        }
    }
}
Info_url类:用于获取用户信息的url
<?php
ini_set('max_execution_time', '0'); // 0 removes the execution time limit

/**
 * Resolves the numeric id used in a user's info-page URL by downloading a
 * page and searching the response for id-shaped digit runs.
 */
class Info_url
{
    /**
     * Downloads a page with the given referer.
     *
     * @param string $url     page to fetch
     * @param string $referer Referer header value
     * @return string|false response body, or false when the request failed
     */
    public function get_home($url, $referer)
    {
        $ch = curl_init();
        curl_setopt_array($ch, array(
            CURLOPT_REFERER => $referer,
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_HEADER => FALSE,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
            // Cookie must be copied from a logged-in browser session and
            // refreshed when it expires.
            CURLOPT_COOKIE => '在浏览器登录微博获取',
            CURLOPT_SSL_VERIFYPEER => FALSE,
            CURLOPT_ENCODING => 'gzip,deflate,sdch',
            CURLOPT_TIMEOUT => 120,
            CURLOPT_FOLLOWLOCATION => FALSE,
        ));
        $res = curl_exec($ch);
        // Release the handle; the original left it open on every call.
        curl_close($ch);
        return $res;
    }

    /**
     * Finds the id contained in the page at $url.
     *
     * The first response is searched for "100505" followed by ten digits,
     * then for any sixteen-digit run. If neither is present the page is
     * fetched again, up to twice, and searched for a ten-digit run.
     *
     * @param string $url page to inspect
     * @return string|null matched digit string, or null when nothing matched
     */
    public function get_url($url)
    {
        $matchs = array();

        $referer = '在浏览器登录微博获取';
        $res = (string) $this->get_home($url, $referer);
        if (preg_match('/100505[0-9]{10}/', $res, $matchs) || preg_match('/[0-9]{16}/', $res, $matchs)) {
            return $matchs[0];
        }

        // One referer per retry; each placeholder is meant to be replaced
        // with a value taken from a logged-in browser session.
        $retry_referers = array(
            '在浏览器登录微博获取',
            '在浏览器登录微博获取',
        );
        foreach ($retry_referers as $referer) {
            $res = (string) $this->get_home($url, $referer);
            if (preg_match('/[0-9]{10}/', $res, $matchs)) {
                return $matchs[0];
            }
        }

        // Nothing matched: return null explicitly instead of reading a missing index.
        return null;
    }
}
User_info类:用于获取用户信息
<?php
/**
 * Scrapes nickname, location and gender from a user's info page and stores
 * them through the database object.
 *
 * Per the original author's note, info pages of some verified accounts are
 * structured differently, so their fields cannot be extracted.
 */
class User_info
{
    /**
     * Downloads the info page, extracts the three fields, echoes them and saves the user.
     *
     * @param string     $url     info page URL
     * @param string|int $user_id user id
     * @param object     $curd    database object exposing save_info()
     * @return void
     */
    public function user($url, $user_id, $curd)
    {
        $ch = curl_init();
        curl_setopt_array($ch, array(
            // Referer and cookie must be copied from a logged-in browser
            // session (F12 on a friend's info page) and refreshed when they expire.
            CURLOPT_REFERER => '在浏览器登录微博获取',
            CURLOPT_COOKIE => '在浏览器登录微博获取',
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => 1,
            // The original set 120 and then overrode it with 100; 100 is what took effect.
            CURLOPT_TIMEOUT => 100,
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_HEADER => false,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36',
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => 2,
            CURLOPT_ENCODING => "gzip, deflate, sdch",
            CURLOPT_FOLLOWLOCATION => false,
        ));
        $res = curl_exec($ch);
        curl_close($ch);

        // A failed request is treated as an empty page, so all fields come
        // out empty and the row is still written, as before.
        if (!is_string($res)) {
            $res = '';
        }

        // The page lists nickname, location, gender, birthday in that order;
        // each field is read from the text between its label and the next one.
        $sexual = $this->extract_field($res, '性别', '生日');
        echo '<td>性别:' . $sexual . '</td>';
        $sexual = strlen($sexual) > 10 ? 0 : $sexual;

        $addr = $this->extract_field($res, '所在地', '性别');
        echo '<td>地址:' . $addr . '</td>';

        $name = $this->extract_field($res, '昵称', '所在地');
        echo '<td>昵称:' . $name . '</td>';
        $name = strlen($name) > 50 ? "to long" : $name;
        echo '<br>';

        $curd->save_info($name, $addr, $sexual, $url, $user_id);
    }

    /**
     * Returns the tag-stripped text of the field that starts at $label and ends before $next_label.
     *
     * @param string $html       page source
     * @param string $label      label that starts the field
     * @param string $next_label label of the following field, used as the end marker
     * @return string field text, or '' when any expected marker is missing
     */
    private function extract_field($html, $label, $next_label)
    {
        $from = strpos($html, $label);
        $to = strpos($html, $next_label);
        if ($from === false || $to === false || $to <= $from) {
            return '';
        }
        $segment = substr($html, $from, $to - $from);

        // The value sits between the first '>' and the closing span/li pair,
        // which is searched for with literal backslashes before each slash.
        $open = strpos($segment, '>');
        $close = strpos($segment, '<\/span><\/li>');
        if ($open === false || $close === false || $close <= $open) {
            return '';
        }
        return strip_tags((string) substr($segment, $open + 1, $close - $open - 1));
    }
}
运行:
<?php
/**
 * Created by PhpStorm.
 * User: ROOT
 * Date: 2016/11/27
 * Time: 22:38
 *
 * Entry script: loads the crawler classes and crawls the following list of
 * every user that is stored but not crawled yet.
 */
ini_set('max_execution_time', '0');
//ini_set('date.timezone','Asia/Shanghai');

include('./Curd.php');
$curd = new Curd();

// Info_url is not needed for this run and stays disabled.
//include('./Info_url.php');
//$info = new Info_url();

include('./User_info.php');
$user_info = new User_info();

include('./Personal.php');
$personal = new Personal();

// Ids of stored users that have not been crawled yet.
$data = $curd->get_ids();

// Work through the list from its last entry back to its first.
foreach (array_reverse($data) as $id_se) {
    echo 'id:' . $id_se . '<br>';
    $personal->run($id_se, $personal, $user_info, $curd);
}