Python Crawler Study 04 - Scraping Tieba
Scraping Baidu Tieba web pages
The full code is below:
from urllib.request import urlopen, Request
from urllib.parse import urlencode
from fake_useragent import UserAgent


def get_html(url):
    # Pick a random User-Agent so requests look less like a bot
    headers = {
        "User-Agent": UserAgent().random
    }
    # Build the request with the custom headers
    request = Request(url, headers=headers)
    # urlopen() returns the page as bytes; decode() would turn it into str,
    # but here we keep the raw bytes so they can be written to disk as-is
    response = urlopen(request)
    return response.read()


def save_html(filename, html_bytes):
    with open(filename, "wb") as f:
        f.write(html_bytes)


def main():
    content = input("Enter the keyword you want to download: ")
    num = input("Enter how many pages to download: ")
    # The urlencoded query (kw and pn) is appended after ie=utf-8
    base_url = "https://tieba.baidu.com/f?ie=utf-8&{}"
    for pn in range(int(num)):
        # Tieba lists 50 posts per page, so pn is the offset in steps of 50
        args = {
            "pn": pn * 50,
            "kw": content
        }
        filename = "page_" + str(pn + 1) + ".html"
        args = urlencode(args)
        print("Downloading " + filename)
        html_bytes = get_html(base_url.format(args))
        save_html(filename, html_bytes)


if __name__ == '__main__':
    main()
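To see what the request URL actually looks like, here is a minimal sketch of the query construction on its own, using the same urlencode-plus-format pattern as above. The keyword "python" and the offset 50 (i.e. the second page) are just illustrative values, not part of the original program.

from urllib.parse import urlencode

# Illustrative values only: keyword "python", second page (offset 50)
args = urlencode({"pn": 50, "kw": "python"})
url = "https://tieba.baidu.com/f?ie=utf-8&{}".format(args)
print(url)  # https://tieba.baidu.com/f?ie=utf-8&pn=50&kw=python

Note that without the "&" before the {} placeholder, the query string would be glued onto "utf-8" and Tieba would ignore the keyword, which is why base_url includes it.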