1.先来一个简单的案例,请求http协议的网站

// Create a new cURL handle.
$curl = curl_init();

// URL to fetch.
curl_setopt($curl, CURLOPT_URL, 'http://www.hao123.com');

// CURLOPT_HEADER: 0 leaves the response headers out of the output,
// any non-zero value includes them.
curl_setopt($curl, CURLOPT_HEADER, 0);

// CURLOPT_RETURNTRANSFER: a non-zero value makes curl_exec() return the
// response as a string; 0 prints the response directly instead.
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);

// Run the request. With RETURNTRANSFER enabled the result is the response
// body on success, or false on failure.
$data = curl_exec($curl);

// Read the error text while the handle is still open.
$error = ($data === false) ? curl_error($curl) : '';

// Release the handle.
curl_close($curl);

// Show the fetched data, or explain why the request failed instead of
// silently dumping bool(false).
if ($data === false) {
    echo 'cURL request failed: ' . $error . PHP_EOL;
} else {
    var_dump($data);
}

 

2.请求https协议网站,并发送数据(get)

// NOTE(review): the appid/secret query parameters are hard-coded here exactly
// as in the original listing. Real credentials should not live in source code;
// load them from configuration and rotate any value that has been published.
$url = 'https://api.weixin.qq.com/cgi-bin/token?grant_type=client_credential&appid=wxfefd7eaa357a57cf&secret=e061b4df1183fb203e2dc38d35b6a633';
// Alternative local endpoint, kept for testing:
//$url = 'http://localhost/wx/xx.php';
$curl = curl_init($url);

// Verify the server certificate against the trusted CA store. The original
// listing set this to 0, which disables verification and exposes the request
// to man-in-the-middle attacks. If verification fails because no CA bundle is
// configured, set CURLOPT_CAINFO (or curl.cainfo in php.ini) rather than
// turning the check off.
curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, 1);

// 2 = require that the certificate names a host matching the one requested.
curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 2);

// Follow "Location:" redirects and return the content of the final URL.
curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);

// Give up after 30 seconds so a stalled server cannot hang the script.
curl_setopt($curl, CURLOPT_TIMEOUT, 30);

// Do not include the response headers in the result.
curl_setopt($curl, CURLOPT_HEADER, 0);

// CURLOPT_RETURNTRANSFER
//   1: curl_exec() returns the response on success and prints nothing;
//      it returns false on failure.
//   0 or unset: curl_exec() prints the response and returns true on success,
//      false on failure.
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);

$result = curl_exec($curl);

// Read the error text while the handle is still open.
$error = ($result === false) ? curl_error($curl) : '';

// Release the handle.
curl_close($curl);

if ($result === false) {
    // Transport-level failure: there is no JSON to decode.
    $res = null;
    echo 'cURL request failed: ' . $error . PHP_EOL;
} else {
    // Decode the JSON response into an associative array (null if the body is not valid JSON).
    $res = json_decode($result, true);
    print_r($res);
}

 

 // 3. Simulated login to the "LAMP Brothers" forum (bbs.lampbrother.net)
$url = 'http://bbs.lampbrother.net/login.php';

// Form fields posted to login.php. The two placeholder strings mean
// "your email" and "your password" and must be replaced with real values.
$arr = array(
    'step'=>2,
    'lgt'=>2,
    'pwuser'=>'你的邮箱',
    'pwpwd'=>'你的密码',
    'question'=>0,
    'hideid'=>0
);

/***** Approach 1: keep the cookies in a file *****/
/*
// Save the cookies to cookie.txt
curl_setopt($ch, CURLOPT_COOKIEFILE, 'cookie.txt');
curl_setopt($ch, CURLOPT_COOKIEJAR, 'cookie.txt');
Writing the cookies to a file and reading them back on the next request
costs two I/O operations, which is inefficient.
*/

/*
$cookie_file = tempnam('./temp','cookie');
// First request: log in and store the cookies in the file
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($arr));
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_file);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
$data = curl_exec($ch);
curl_close($ch);
//echo $data;

// Second request: load the home page using the cookies stored in the file
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, 'http://bbs.lampbrother.net/');
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 0);
curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_file);
curl_exec($ch);
curl_close($ch);
*/


/***** Approach 2: read the cookies from the response headers *****/
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
// Include the response headers in the result so the cookies can be read from them.
curl_setopt($ch, CURLOPT_HEADER, 1);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($arr));
$content = curl_exec($ch);
// Read the error text while the handle is still open.
$error = ($content === false) ? curl_error($ch) : '';
curl_close($ch);

if ($content === false) {
    echo 'cURL login request failed: ' . $error . PHP_EOL;
    $content = '';
}

// Split the raw response into headers and body at the first blank line.
// The limit of 2 keeps the body intact even when it contains blank lines of
// its own, and array_pad() guarantees both variables exist for a response
// without a body.
list($header, $body) = array_pad(explode("\r\n\r\n", $content, 2), 2, '');
print_r($header);

// Extract the cookies. Only the leading "name=value" pair of each Set-Cookie
// header is captured; attributes such as path, expires or HttpOnly belong to
// the response and must not be sent back in the Cookie request header.
preg_match_all('/^set-cookie:\s*([^;\r\n]*)/mi', $header, $matches);
//print_r($matches);
$cookies = implode('; ', array_filter(array_map('trim', $matches[1]), 'strlen'));
print_r($cookies);


// Later requests can send the collected cookies via CURLOPT_COOKIE.
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, 'http://bbs.lampbrother.net/');
curl_setopt($ch, CURLOPT_HEADER, 0);
// RETURNTRANSFER is 0, so curl_exec() prints the page directly.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 0);
curl_setopt($ch, CURLOPT_COOKIE, $cookies);
curl_exec($ch);
curl_close($ch);

 

<?php
// 4. Example: scraping inbox messages from oschina.net (Open Source China)

header('Content-type:text/html;charset=utf-8');
$url = 'https://www.oschina.net/action/user/hash_login';

// Login form fields. The placeholder strings mean "your email" and
// "your password". The password is sent as a SHA-1 hex digest —
// presumably what the hash_login endpoint expects; confirm against the site.
$data = array(
    'email'=>'你的邮箱',
    'pwd'=>sha1('你的密码'),
    'save_login'=>1,
);

// Request headers sent with the login request (browser-like User-Agent and Referer).
$headers = array(
    'User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36',
    'Referer:https://www.oschina.net/home/login?goto_page=http%3A%2F%2Fwww.oschina.net%2Fcode%2Fsnippet_47318_27221',
);

// Step 1: log in and collect the cookies from the response headers.
$curl = curl_init($url);
// Verify the server certificate. The original listing disabled this check,
// which exposes the login credentials to man-in-the-middle attacks. If
// verification fails for lack of a CA bundle, configure CURLOPT_CAINFO
// (or curl.cainfo in php.ini) instead of turning it off.
curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, 1);
// 2 = require that the certificate names a host matching the one requested.
curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 2);
// Follow redirects; with CURLOPT_HEADER on, the headers of every hop end up in the result.
curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);
// Give up after 30 seconds.
curl_setopt($curl, CURLOPT_TIMEOUT, 30);
// Include response headers in the result so Set-Cookie can be parsed.
curl_setopt($curl, CURLOPT_HEADER, 1);
curl_setopt($curl, CURLOPT_HTTPHEADER, $headers);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($curl, CURLOPT_POST, 1);
curl_setopt($curl, CURLOPT_POSTFIELDS, http_build_query($data));
$result = curl_exec($curl);
// Read the error text while the handle is still open.
$error = ($result === false) ? curl_error($curl) : '';
curl_close($curl);

if ($result === false) {
    // Without a login response there are no cookies and nothing to scrape.
    exit('Login request failed: ' . $error);
}
//print_r($result);

// Capture only the leading "name=value" pair of each Set-Cookie header;
// attributes (path, expires, HttpOnly, ...) must not be echoed back in the
// Cookie request header.
preg_match_all('/^set-cookie:\s*([^;\r\n]*)/mi', $result, $matches);
//print_r($matches);
$cookies = implode('; ', array_filter(array_map('trim', $matches[1]), 'strlen'));

// Step 2: fetch the inbox page using the login cookies.
$url = 'http://my.oschina.net/xxxxx/admin/inbox';
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_COOKIE, $cookies);
$res = curl_exec($ch);
$error = ($res === false) ? curl_error($ch) : '';
curl_close($ch);

if ($res === false) {
    exit('Inbox request failed: ' . $error);
}

require './simple_html_dom.php';

// Step 3: parse the page with the simple_html_dom library.

$html1 = new simple_html_dom();
$html1->load($res);
// Every <li> that has an id attribute inside <ul class="Msgs">.
$r = $html1->find('ul.Msgs li[id]');

$html2 = new simple_html_dom();
foreach($r as $k=>$v){
    // load() takes an HTML string, so convert the node to its markup explicitly
    // instead of relying on implicit object-to-string conversion.
    $html2->load((string) $v);
    $t = $html2->find('.msg');
    foreach($t as $key=>$value){
        // Print each matched element's markup followed by a separator.
        echo $value.'<hr/>';
    }
}

// Release both parsers; the original only cleared the second one.
$html2->clear();
$html1->clear();
?>

 

posted on 2014-02-10 10:31  睡着的糖葫芦  阅读(2616)  评论(0)  编辑  收藏  举报