爬虫基础概念
什么是爬虫?
-----请求网站并提取数据的自动化程序-------
urllib模块
关于py2与py3的调用
#################################### in py2:
'''
import urllib2
data=urllib2.urlopen("http://www.baidu.com")
f=open("baidu.html","w")
f.write(data.read())
'''
#################################### in py3:
import urllib.request
# Download the Baidu home page and save the raw response bytes to
# baidu.html. The `with` blocks close both the HTTP response and the
# output file, which the original snippet left open.
with urllib.request.urlopen("http://www.baidu.com") as data:
    with open("baidu.html", "wb") as f:
        f.write(data.read())
urllib模块介绍
requests模块
requests模块与正则表达式的简单应用
import json
import re

import requests
def getPage(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error. Without a timeout the call can block forever.

    Returns:
        The decoded response body (``response.text``).
    """
    response = requests.get(url, timeout=timeout)
    return response.text
# Compiled once at import time. Raw strings keep ``\d`` a regex escape
# instead of an invalid string escape; re.S lets ``.`` span newlines so one
# match can cover a multi-line block of markup.
_MOVIE_ITEM_RE = re.compile(
    r'<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>\d+).*?<span class="title">(?P<title>.*?)</span>'
    r'.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?<span>(?P<comment_num>.*?)评价</span>',
    re.S,
)


def parsePage(s):
    """Yield one dict per item matched in the HTML text *s*.

    Args:
        s: HTML source to scan.

    Yields:
        A dict with the string values captured by the pattern's named
        groups: ``"id"``, ``"title"``, ``"rating_num"`` and
        ``"comment_num"``. Nothing is yielded when *s* has no match.
    """
    for match in _MOVIE_ITEM_RE.finditer(s):
        # groupdict() returns exactly the four named groups, in pattern order.
        yield match.groupdict()
def main(num):
    """Download one listing page and append its parsed items to a file.

    Builds the Douban Top 250 URL with *num* as the ``start`` query value,
    fetches it with ``getPage``, parses it with ``parsePage``, and appends
    each resulting dict to the file ``move_info7`` as one JSON document per
    line. Each dict is also printed to stdout.

    Args:
        num: Value substituted into the ``start`` query parameter.
    """
    url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    response_html = getPage(url)
    ret = parsePage(response_html)
    # parsePage returns a generator, so this prints its repr, not the items.
    print(ret)
    # The context manager flushes and closes the file even if parsing or
    # serialisation raises part-way through.
    with open("move_info7", "a", encoding="utf8") as f:
        for obj in ret:
            print(obj)
            # ensure_ascii=False writes non-ASCII text as-is, not as \u escapes.
            data = json.dumps(obj, ensure_ascii=False)
            f.write(data + "\n")
# Script entry point. The dunder names are required: a bare `name` is
# undefined and would raise NameError at import time.
if __name__ == '__main__':
    # Call main() ten times with offsets 0, 25, ..., 225 as the `start` value.
    for start in range(0, 250, 25):
        main(start)