汪1234

  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅  :: 管理

爬虫基础概念

什么是爬虫?

-----请求网站并提取数据的自动化程序------

urllib模块

关于py2与py3的调用

复制代码
#################################### in py2:

'''
import urllib2

data=urllib2.urlopen("http://www.baidu.com")
f=open("baidu.html","w")
f.write(data.read())

'''

#################################### in py3:

import urllib.request

# Download the page and save the raw response body to baidu.html.
# The file is opened in binary mode ("wb") because data.read() is written
# as-is, without decoding. Both the response and the file are closed by
# the ``with`` blocks, even if the write fails.
with urllib.request.urlopen("http://www.baidu.com") as data:
    with open("baidu.html", "wb") as f:
        f.write(data.read())

复制代码

 urllib模块介绍

 

 

requests模块

 

requests模块与正则表达式的简单应用

复制代码
import requests

import re
import json

def getPage(url):
    """Fetch *url* with an HTTP GET request and return the response body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The ``text`` attribute of the response returned by ``requests.get``.
    """
    response = requests.get(url)
    return response.text

def parsePage(s):
    """Yield one dict per movie entry matched in the HTML string *s*.

    Each entry is expected to start at ``<div class="item">`` and to contain,
    in order, an ``<em>`` holding the numeric id, a ``<span class="title">``,
    a ``<span class="rating_num">`` and a ``<span>`` whose text ends with
    "评价".

    Args:
        s: HTML source to scan.

    Yields:
        Dicts with the string keys ``id``, ``title``, ``rating_num`` and
        ``comment_num``; every value is the raw text captured by the
        corresponding named group (``comment_num`` is the text that precedes
        "评价" inside its span).
    """
    # Raw strings keep ``\d`` from being treated as a string escape.
    # re.S lets ``.`` match newlines, since one entry spans many lines.
    com = re.compile(
        r'<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>\d+).*?<span class="title">(?P<title>.*?)</span>'
        r'.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?<span>(?P<comment_num>.*?)评价</span>',
        re.S,
    )

    for i in com.finditer(s):
        yield {
            "id": i.group("id"),
            "title": i.group("title"),
            "rating_num": i.group("rating_num"),
            "comment_num": i.group("comment_num"),
        }

def main(num):
    """Download one listing page and append its parsed entries to ``move_info7``.

    Each entry produced by ``parsePage`` is printed and written to the file
    as one JSON object per line.

    Args:
        num: Value substituted into the ``start`` query parameter of the URL.
    """
    url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    response_html = getPage(url)
    ret = parsePage(response_html)
    # NOTE: ret is a generator, so this prints the generator object itself,
    # not the parsed entries; the entries are printed in the loop below.
    print(ret)

    # Append mode keeps the results of earlier calls; the ``with`` block
    # guarantees the file is flushed and closed even if parsing fails.
    with open("move_info7", "a", encoding="utf8") as f:
        for obj in ret:
            print(obj)
            # ensure_ascii=False writes non-ASCII text as-is instead of \uXXXX.
            data = json.dumps(obj, ensure_ascii=False)
            f.write(data + "\n")

if __name__ == '__main__':
    # Ten pages in total: start offsets 0, 25, ..., 225.
    for count in range(0, 250, 25):
        main(count)

复制代码

 

posted on 2020-03-27 14:00  梦一水知音~  阅读(90)  评论(0)  编辑  收藏  举报