Python 3 Web Scraping (1)
A simple crawler: fetch all the HTML of the Baidu homepage
```python
from urllib import request

url = "http://www.baidu.com"
# Send a browser-like User-Agent so the request is less likely to be rejected
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
page = request.Request(url, headers=headers)
# Open the Request object (not the bare URL), otherwise the headers are never sent
page_info = request.urlopen(page).read().decode("utf-8")
print(page_info)
```
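Network requests can fail, so in practice it helps to wrap the call in error handling. Below is a minimal sketch (not part of the original example) that adds a timeout and catches urllib's HTTPError and URLError; the shortened User-Agent string is just for brevity.

```python
from urllib import request, error

url = "http://www.baidu.com"
headers = {'User-Agent': 'Mozilla/5.0'}  # shortened UA string for brevity

try:
    page = request.Request(url, headers=headers)
    # A timeout keeps the script from hanging on a slow server
    page_info = request.urlopen(page, timeout=10).read().decode("utf-8")
    print(page_info)
except error.HTTPError as e:
    # The server answered with an error status code (e.g. 403, 404)
    print("HTTP error:", e.code)
except error.URLError as e:
    # The request never reached the server (DNS failure, refused connection, ...)
    print("URL error:", e.reason)
```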
Importing the BeautifulSoup module: scrape the post titles from the cnblogs homepage. The code below filters `<a>` tags by their CSS class; a regex-based variant follows it.
```python
from urllib import request
from bs4 import BeautifulSoup

url = r'https://www.cnblogs.com/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
page = request.Request(url, headers=headers)
page_info = request.urlopen(page).read().decode("utf-8")
soup = BeautifulSoup(page_info, "html.parser")
# Find every <a> tag whose CSS class is "titlelnk" (the post-title links)
titles = soup.find_all("a", "titlelnk")
for title in titles:
    print(title.string)
```
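For an actual regular-expression version: BeautifulSoup accepts a compiled regex as a filter, so the same titles can be matched by pattern rather than by exact class name. A minimal sketch, assuming the cnblogs page still uses class names containing "titlelnk":

```python
import re
from urllib import request
from bs4 import BeautifulSoup

url = r'https://www.cnblogs.com/'
headers = {'User-Agent': 'Mozilla/5.0'}  # shortened UA string for brevity
page = request.Request(url, headers=headers)
soup = BeautifulSoup(request.urlopen(page).read().decode("utf-8"), "html.parser")

# class_ accepts a compiled regex, so class names are matched by pattern
titles = soup.find_all("a", class_=re.compile(r"titlelnk"))
for title in titles:
    print(title.string)
```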
Get the image URLs under a Zhihu question and download the images
```python
import re
import time
from urllib import request
from bs4 import BeautifulSoup

url = r'https://www.zhihu.com/question/22918070'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
page = request.Request(url, headers=headers)
page_info = request.urlopen(page).read().decode("utf-8")
soup = BeautifulSoup(page_info, "html.parser")
# Match <img> tags with the full-size-image classes whose src ends in ".jpg"
# (note the escaped dot; an unescaped "." would match any character)
links = soup.find_all("img", "origin_image zh-lightbox-thumb", src=re.compile(r'\.jpg$'))
local_path = r'E:\pic'  # this directory must already exist
for link in links:
    print(link.attrs["src"])
    # Name each file with the current timestamp to avoid collisions
    request.urlretrieve(link.attrs["src"], local_path + r'\%s.jpg' % time.time())
```
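urlretrieve fails if the target directory does not exist, and one bad image URL aborts the whole loop. Below is a hedged sketch of a more defensive download loop (not from the original example): os.makedirs and the sequential filenames are my additions, and `links` is assumed to be the find_all(...) result from the script above.

```python
import os
from urllib import request, error

local_path = r'E:\pic'
os.makedirs(local_path, exist_ok=True)  # create the folder if it is missing

# "links" is assumed to be the find_all(...) result from the script above
for i, link in enumerate(links):
    src = link.attrs["src"]
    try:
        # Sequential numbering gives stable, collision-free filenames
        request.urlretrieve(src, os.path.join(local_path, '%d.jpg' % i))
    except error.URLError as e:
        # Skip a broken image URL instead of aborting the whole run
        print("failed to download %s: %s" % (src, e))
```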