# -*- coding: utf-8 -*-
# 爬虫小练习:堆糖图片抓取 -- 爬虫正式学习 day1
# (practice crawler: scrape images from duitang.com, day 1)
# 菠萝tang
import urllib2
import urllib
import os
import time
import json
import jsonpath
def handle_request(url, sort, page):
    """Build a urllib2.Request for one page of Duitang's search API.

    Args:
        url:  base API url ending with '?kw=' (keyword parameter).
        sort: search keyword, appended directly after 'kw='.
        page: zero-based page index; the API pages in chunks of 24
              items, so the 'start' offset is 24 * page.

    Returns:
        A urllib2.Request carrying a desktop-browser User-Agent header
        (Duitang rejects the default urllib2 agent).
    """
    # Fixed field list the API expects; only 'start' varies per call.
    query_string = '&type=feed&include_fields=top_comments%2Cis_root%2Csource_link%2Citem%2Cbuyable%2Croot_id%2Cstatus%2Clike_count%2Clike_id%2Csender%2Calbum%2Creply_count%2Cfavorite_blog_id&_type=&'
    url_use = url + sort + query_string + 'start=' + str(24 * page)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }
    request = urllib2.Request(url=url_use, headers=headers)
    return request
def download_image(content):
    """Parse one JSON API response and download every image it lists.

    Args:
        content: raw JSON bytes/str as returned by the Duitang API.

    Side effects:
        Creates a local 'DuiTang' directory on first use and writes one
        file per image, named after the last path segment of its URL.
        Sleeps 1s between downloads to throttle requests.
    """
    data = json.loads(content)
    url_list = jsonpath.jsonpath(data, "$..path")
    # jsonpath.jsonpath returns False (not []) when nothing matches;
    # without this guard the for-loop below would raise TypeError.
    if not url_list:
        return
    dirname = 'DuiTang'
    # Create the target directory once, outside the loop.
    if not os.path.exists(dirname):
        os.mkdir(dirname)
    for image_url in url_list:
        filename = image_url.split('/')[-1]
        filepath = dirname + '/' + filename
        urllib.urlretrieve(image_url, filepath)
        time.sleep(1)
def main():
    """Interactively fetch Duitang search results, one 24-item page at a time.

    Prompts for a start/end page range and a search keyword, then for
    each page builds the request, fetches the JSON, and hands it to
    download_image(). Sleeps 2s between pages to stay polite.
    """
    url = 'https://www.duitang.com/napi/blog/list/by_search/?kw='
    # raw_input + int(): Python 2's input() would eval() whatever the
    # user types, which is unsafe and inconsistent with the raw_input
    # call used for the keyword below.
    start_page = int(raw_input("请输入起始抓取位置(24个图为一个部分):"))
    end_page = int(raw_input("请输入终止抓取位置:"))
    sort = raw_input("请输入查询的种类:")
    for page in range(start_page - 1, end_page):
        print('第%s部分开始下载......' % (page + 1))
        request = handle_request(url, sort, page)
        content = urllib2.urlopen(request).read()
        # Parse the response, extract all image links, download them.
        download_image(content)
        print('第%s部分下载完成' % (page + 1))
        time.sleep(2)


if __name__ == '__main__':
    main()
# 使用 python2.7 (requires Python 2.7: urllib2 / raw_input)
# 堆糖的图片显示是按照json来的,分页只是障眼法,主要参数为:kw 和 start 位置!
# (Duitang serves its images as JSON; the visible pagination is cosmetic --
#  the parameters that matter are 'kw' and the 'start' offset.)
# 获取json数据需要努力学习!!!
# JSON extraction pattern used above:
#   unicodestr = json.loads(content)
#   url_list = jsonpath.jsonpath(unicodestr, "$..path")