Crawling Baidu Tieba
# Crawl Tieba listing pages in bulk. The pn parameter is the post offset, stepping by 50 per page:
# Page 1: https://tieba.baidu.com/f?kw=%E6%97%85%E8%A1%8C%E9%9D%92%E8%9B%99&ie=utf-8&pn=0
# Page 2: https://tieba.baidu.com/f?kw=%E6%97%85%E8%A1%8C%E9%9D%92%E8%9B%99&ie=utf-8&pn=50
# Page 3: https://tieba.baidu.com/f?kw=%E6%97%85%E8%A1%8C%E9%9D%92%E8%9B%99&ie=utf-8&pn=100
# Page 4: pn=150

# Basic version: build and print the URL for each page from the pn offset.
# base_url = "https://tieba.baidu.com/f?kw=%E6%97%85%E8%A1%8C%E9%9D%92%E8%9B%99&ie=utf-8&pn="
# for page in range(10):
#     new_url = base_url + str(page * 50)
#     print(new_url)

# Improved version: read the forum name and page count from the keyboard,
# then fetch the requested pages and save each one as an HTML file.
from urllib import request, parse

base_url = "https://tieba.baidu.com/f?"
name = input("Enter the Tieba forum name: ")
page = input("Enter the number of pages: ")

for i in range(int(page)):
    # Build the query string; each page is offset by 50 posts.
    qs = {
        "kw": name,
        "pn": i * 50
    }
    qs_data = parse.urlencode(qs)
    url = base_url + qs_data
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
    }
    # Request the full URL (base_url plus the query string), not just base_url.
    req = request.Request(url, headers=headers)
    response = request.urlopen(req)
    html = response.read().decode("utf-8")
    # Save each page as "<forum name>_page_<i>.html".
    with open(name + "_page_" + str(i) + ".html", "w", encoding="utf-8") as f:
        f.write(html)
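Since urlopen can fail on a network error and rapid repeated requests may get throttled, a minimal defensive variant of the download step is sketched below. The fetch_page helper, the try/except around urlopen, the timeout, and the time.sleep pause are assumptions added here for illustration and are not part of the original notes; the URL layout and headers follow the code above.

import time
from urllib import request, parse, error

def fetch_page(base_url, name, page_index, headers):
    # Fetch one Tieba listing page; return the HTML text, or None on failure.
    qs_data = parse.urlencode({"kw": name, "pn": page_index * 50})
    req = request.Request(base_url + qs_data, headers=headers)
    try:
        with request.urlopen(req, timeout=10) as response:
            return response.read().decode("utf-8")
    except error.URLError as e:
        print("Failed to fetch page", page_index, ":", e)
        return None

# Usage sketch: fetch pages with a short pause between requests.
# headers = {"User-Agent": "Mozilla/5.0 ..."}
# for i in range(int(page)):
#     html = fetch_page("https://tieba.baidu.com/f?", name, i, headers)
#     if html is not None:
#         with open(name + "_page_" + str(i) + ".html", "w", encoding="utf-8") as f:
#             f.write(html)
#     time.sleep(1)  # be polite to the server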