1-爬取汽车之家
依赖
爬取汽车之家用到了Python的两个库:
- requests:模拟浏览器发送请求
- BeautifulSoup4:解析爬取的数据
这两个库都不是 Python 标准库,需要我们用 pip 手动安装:
pip install requests
pip install BeautifulSoup4
简单爬取汽车之家新闻页首页
import os
import requests
from bs4 import BeautifulSoup
# Absolute directory of this script. abspath() keeps the path stable even when
# __file__ is relative (or bare), so the image folder does not depend on the
# current working directory.
base_dir = os.path.dirname(os.path.abspath(__file__))
def spider():
    """Crawl the Autohome news front page and download article thumbnails.

    Fetches https://www.autohome.com.cn/news/, locates the
    ``auto-channel-lazyload-article`` container and, for every ``<li>`` that
    holds an ``<h3>`` title, extracts the title, summary, article link and
    thumbnail link.  Each thumbnail is fetched with a second request and
    written to ``<base_dir>/img/<file name>``.

    The function returns quietly when the container is absent, and an entry
    without a usable ``<img src>`` is skipped instead of raising.  The
    ``img`` directory is created on demand.
    """
    response = requests.get(url='https://www.autohome.com.cn/news/')
    # The original author notes the page declares charset=gb2312; decoding as
    # gbk avoids garbled Chinese text in response.text.
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Container that holds the whole article list.
    container = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
    if container is None:
        return

    # open(..., 'wb') does not create parent directories, so make sure it exists.
    img_dir = os.path.join(base_dir, 'img')
    os.makedirs(img_dir, exist_ok=True)

    for item in container.find_all(name='li'):
        # Title: list items without an <h3> are not articles.
        title_tag = item.find(name='h3')
        if title_tag is None:
            continue
        title = title_tag.text

        # Summary paragraph (empty string when the entry has none).
        summary_tag = item.find(name='p')
        introduction = summary_tag.text if summary_tag is not None else ''

        # Article link; the original code prefixes the href with the scheme.
        link_tag = item.find(name='a')
        href = link_tag.get('href') if link_tag is not None else None
        url = 'https:' + href if href else None

        # Thumbnail link; this is only the address, so downloading the picture
        # needs one more request.
        img_tag = item.find(name='img')
        src = img_tag.get('src') if img_tag is not None else None
        if not src:
            continue
        img = 'https:' + src

        img_content = requests.get(url=img)
        img_name = img.rsplit('/', 1)[-1]
        file_path = os.path.join(img_dir, img_name)
        with open(file_path, 'wb') as f:
            f.write(img_content.content)
# Script entry point: crawl the news front page once.
if __name__ == "__main__":
    spider()
爬取新闻页前一百页
import os
import time
import requests
from bs4 import BeautifulSoup
# Directory part of this script's path; in this section it is referenced only
# by the commented-out image-download code inside spider().
base_dir = os.path.dirname(__file__)
def spider(page):
    """Crawl one page of the Autohome news list and print each article title.

    Args:
        page: Page number substituted into
            ``https://www.autohome.com.cn/news/<page>/#liststart``.

    For every ``<li>`` with an ``<h3>`` inside the
    ``auto-channel-lazyload-article`` container, the title tag and its text
    are printed and the summary, article URL and thumbnail URL are extracted.
    A page without the container is skipped, and missing elements yield
    empty/``None`` values instead of raising.
    """
    response = requests.get(url='https://www.autohome.com.cn/news/%s/#liststart' % page)
    # The original author notes the page declares charset=gb2312; decoding as
    # gbk avoids garbled Chinese text.
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Container that holds the whole article list.
    container = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
    if container is None:
        return

    for item in container.find_all(name='li'):
        # Title: list items without an <h3> are not articles.
        title_tag = item.find(name='h3')
        if title_tag is None:
            continue
        print(title_tag, title_tag.text)
        title = title_tag.text

        # Summary paragraph.
        summary_tag = item.find(name='p')
        introduction = summary_tag.text if summary_tag is not None else ''

        # Article link; the original code prefixes the href with the scheme.
        link_tag = item.find(name='a')
        href = link_tag.get('href') if link_tag is not None else None
        url = 'https:' + href if href else None

        # Thumbnail link (address only).
        img_tag = item.find(name='img')
        src = img_tag.get('src') if img_tag is not None else None
        img = 'https:' + src if src else None

        # Thumbnail download is disabled in this version:
        # img_content = requests.get(url=img)
        # img_name = img.rsplit('/', 1)[-1]
        # file_path = os.path.join(base_dir, 'img', img_name)
        # with open(file_path, 'wb') as f:
        #     f.write(img_content.content)
# Script entry point: crawl pages 1..100 one after another and time the run.
if __name__ == '__main__':
    began = time.time()
    for offset in range(100):
        spider(offset + 1)
    elapsed = time.time() - began
    print('顺序爬取100页共耗时', elapsed)  # author's measured run: ~99.59 s
多线程爬取汽车之家新闻页前100页
import os
import time
import requests
from threading import Thread
from bs4 import BeautifulSoup
# Directory part of this script's path; in this section it is referenced only
# by the commented-out image-download code inside spider().
base_dir = os.path.dirname(__file__)
def spider(page):
    """Crawl one page of the Autohome news list and print each article title.

    This function is the thread target of the multi-threaded example.

    Args:
        page: Page number substituted into
            ``https://www.autohome.com.cn/news/<page>/#liststart``.

    For every ``<li>`` with an ``<h3>`` inside the
    ``auto-channel-lazyload-article`` container, the title tag and its text
    are printed and the summary, article URL and thumbnail URL are extracted.
    A page without the container is skipped, and missing elements yield
    empty/``None`` values instead of raising inside the worker thread.
    """
    response = requests.get(url='https://www.autohome.com.cn/news/%s/#liststart' % page)
    # The original author notes the page declares charset=gb2312; decoding as
    # gbk avoids garbled Chinese text.
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Container that holds the whole article list.
    container = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
    if container is None:
        return

    for item in container.find_all(name='li'):
        # Title: list items without an <h3> are not articles.
        title_tag = item.find(name='h3')
        if title_tag is None:
            continue
        print(title_tag, title_tag.text)
        title = title_tag.text

        # Summary paragraph.
        summary_tag = item.find(name='p')
        introduction = summary_tag.text if summary_tag is not None else ''

        # Article link; the original code prefixes the href with the scheme.
        link_tag = item.find(name='a')
        href = link_tag.get('href') if link_tag is not None else None
        url = 'https:' + href if href else None

        # Thumbnail link (address only).
        img_tag = item.find(name='img')
        src = img_tag.get('src') if img_tag is not None else None
        img = 'https:' + src if src else None

        # Thumbnail download is disabled in this version:
        # img_content = requests.get(url=img)
        # img_name = img.rsplit('/', 1)[-1]
        # file_path = os.path.join(base_dir, 'img', img_name)
        # with open(file_path, 'wb') as f:
        #     f.write(img_content.content)
# Script entry point: crawl pages 1..100 with one thread per page and time it.
if __name__ == '__main__':
    start_time = time.time()
    threads = []
    for i in range(1, 101):
        t = Thread(target=spider, args=(i, ))
        t.start()
        threads.append(t)
    # Wait for every worker before reading the clock.  Without join() the
    # timer stops as soon as the threads have been started (the ~0.17 s the
    # original version reported), not when the crawl has finished.
    for t in threads:
        t.join()
    print('多线程爬取100页共耗时', time.time() - start_time)
线程池爬取汽车之家新闻页前100页
import os
import time
import requests
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import cpu_count
from bs4 import BeautifulSoup
# Directory part of this script's path; in this section it is referenced only
# by the commented-out image-download code inside spider().
base_dir = os.path.dirname(__file__)
def spider(page):
    """Crawl one page of the Autohome news list and print each article title.

    This function is the task submitted to the thread pool.

    Args:
        page: Page number substituted into
            ``https://www.autohome.com.cn/news/<page>/#liststart``.

    For every ``<li>`` with an ``<h3>`` inside the
    ``auto-channel-lazyload-article`` container, the title tag and its text
    are printed and the summary, article URL and thumbnail URL are extracted.
    A page without the container is skipped, and missing elements yield
    empty/``None`` values instead of raising (the caller never inspects the
    returned futures, so an exception here would go unnoticed).
    """
    response = requests.get(url='https://www.autohome.com.cn/news/%s/#liststart' % page)
    # The original author notes the page declares charset=gb2312; decoding as
    # gbk avoids garbled Chinese text.
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Container that holds the whole article list.
    container = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
    if container is None:
        return

    for item in container.find_all(name='li'):
        # Title: list items without an <h3> are not articles.
        title_tag = item.find(name='h3')
        if title_tag is None:
            continue
        print(title_tag, title_tag.text)
        title = title_tag.text

        # Summary paragraph.
        summary_tag = item.find(name='p')
        introduction = summary_tag.text if summary_tag is not None else ''

        # Article link; the original code prefixes the href with the scheme.
        link_tag = item.find(name='a')
        href = link_tag.get('href') if link_tag is not None else None
        url = 'https:' + href if href else None

        # Thumbnail link (address only).
        img_tag = item.find(name='img')
        src = img_tag.get('src') if img_tag is not None else None
        img = 'https:' + src if src else None

        # Thumbnail download is disabled in this version:
        # img_content = requests.get(url=img)
        # img_name = img.rsplit('/', 1)[-1]
        # file_path = os.path.join(base_dir, 'img', img_name)
        # with open(file_path, 'wb') as f:
        #     f.write(img_content.content)
# Script entry point: crawl pages 1..100 through a thread pool and time it.
if __name__ == '__main__':
    began = time.time()
    # Pool of cpu_count() * 5 worker threads; leaving the with-block shuts
    # the pool down and waits for every submitted task.
    with ThreadPoolExecutor(cpu_count() * 5) as pool:
        for page_no in range(1, 101):
            pool.submit(spider, page_no)
    print('线程池爬取100页共耗时', time.time() - began)  # author's measured run: ~36.48 s
进程池爬取汽车之家新闻页前100页
import os
import time
import requests
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import cpu_count
from bs4 import BeautifulSoup
# Directory part of this script's path; in this section it is referenced only
# by the commented-out image-download code inside spider().
base_dir = os.path.dirname(__file__)
def spider(page):
    """Crawl one page of the Autohome news list and print each article title.

    This function is the task submitted to the process pool.

    Args:
        page: Page number substituted into
            ``https://www.autohome.com.cn/news/<page>/#liststart``.

    For every ``<li>`` with an ``<h3>`` inside the
    ``auto-channel-lazyload-article`` container, the title tag and its text
    are printed and the summary, article URL and thumbnail URL are extracted.
    A page without the container is skipped, and missing elements yield
    empty/``None`` values instead of raising (the caller never inspects the
    returned futures, so an exception here would go unnoticed).
    """
    response = requests.get(url='https://www.autohome.com.cn/news/%s/#liststart' % page)
    # The original author notes the page declares charset=gb2312; decoding as
    # gbk avoids garbled Chinese text.
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Container that holds the whole article list.
    container = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
    if container is None:
        return

    for item in container.find_all(name='li'):
        # Title: list items without an <h3> are not articles.
        title_tag = item.find(name='h3')
        if title_tag is None:
            continue
        print(title_tag, title_tag.text)
        title = title_tag.text

        # Summary paragraph.
        summary_tag = item.find(name='p')
        introduction = summary_tag.text if summary_tag is not None else ''

        # Article link; the original code prefixes the href with the scheme.
        link_tag = item.find(name='a')
        href = link_tag.get('href') if link_tag is not None else None
        url = 'https:' + href if href else None

        # Thumbnail link (address only).
        img_tag = item.find(name='img')
        src = img_tag.get('src') if img_tag is not None else None
        img = 'https:' + src if src else None

        # Thumbnail download is disabled in this version:
        # img_content = requests.get(url=img)
        # img_name = img.rsplit('/', 1)[-1]
        # file_path = os.path.join(base_dir, 'img', img_name)
        # with open(file_path, 'wb') as f:
        #     f.write(img_content.content)
# Script entry point: crawl pages 1..100 through a process pool and time it.
if __name__ == '__main__':
    began = time.time()
    # Pool of cpu_count() * 2 worker processes; leaving the with-block shuts
    # the pool down and waits for every submitted task.
    with ProcessPoolExecutor(cpu_count() * 2) as pool:
        for page_no in range(1, 101):
            pool.submit(spider, page_no)
    print('进程池爬取100页共耗时', time.time() - began)  # author's measured run: ~32.67 s
进程池和线程池在合理的并发数设置下,爬取速度差别不大:爬虫属于 I/O 密集型任务,大部分时间都在等待网络响应,因此线程池通常已经足够,有时甚至更快一些(进程的创建和通信开销更大)。上例最后打印的耗时差距可以忽略不计,并且结果会受到网速影响。
混合爬取汽车之家多个频道的前100页
import os
import time
import requests
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import cpu_count
from bs4 import BeautifulSoup
# Directory part of this script's path; in this section it is referenced only
# by the commented-out image-download code inside spider().
base_dir = os.path.dirname(__file__)
def spider(page):
    """Crawl one list page of an Autohome channel and print each article title.

    Args:
        page: A ``(page_number, channel)`` pair.  ``page[1]`` is the channel
            path segment (e.g. ``'news'``) and ``page[0]`` the page number;
            together they form
            ``https://www.autohome.com.cn/<channel>/<page_number>/#liststart``.

    For every ``<li>`` with an ``<h3>`` inside the
    ``auto-channel-lazyload-article`` container, the title tag and its text
    are printed and the summary, article URL and thumbnail URL are extracted.
    A page without the container is skipped, and missing elements yield
    empty/``None`` values instead of raising (the caller never inspects the
    returned futures, so an exception here would go unnoticed).
    """
    page_no, channel = page[0], page[1]
    response = requests.get(url='https://www.autohome.com.cn/%s/%s/#liststart' % (channel, page_no))
    # The original author notes the page declares charset=gb2312; decoding as
    # gbk avoids garbled Chinese text.
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Container that holds the whole article list.
    container = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})
    if container is None:
        return

    for item in container.find_all(name='li'):
        # Title: list items without an <h3> are not articles.
        title_tag = item.find(name='h3')
        if title_tag is None:
            continue
        print(title_tag, title_tag.text)
        title = title_tag.text

        # Summary paragraph.
        summary_tag = item.find(name='p')
        introduction = summary_tag.text if summary_tag is not None else ''

        # Article link; the original code prefixes the href with the scheme.
        link_tag = item.find(name='a')
        href = link_tag.get('href') if link_tag is not None else None
        url = 'https:' + href if href else None

        # Thumbnail link (address only).
        img_tag = item.find(name='img')
        src = img_tag.get('src') if img_tag is not None else None
        img = 'https:' + src if src else None

        # Thumbnail download is disabled in this version:
        # img_content = requests.get(url=img)
        # img_name = img.rsplit('/', 1)[-1]
        # file_path = os.path.join(base_dir, 'img', img_name)
        # with open(file_path, 'wb') as f:
        #     f.write(img_content.content)
# Script entry point: crawl pages 1..100 of nine channels through a process
# pool and time the whole run.
if __name__ == '__main__':
    began = time.time()
    channels = ['news', 'advice', 'drive', 'use', 'culture', 'travels', 'tech', 'tuning', 'ev']
    # Leaving the with-block shuts the pool down and waits for every task.
    with ProcessPoolExecutor(cpu_count() * 2) as pool:
        for channel in channels:
            for page_no in range(1, 101):
                # spider() expects a (page_number, channel) tuple.
                pool.submit(spider, (page_no, channel))
    print('共耗时', time.time() - began)  # author's measured run: ~418.43 s, rather slow
以上就是全部内容,欢迎指正。