Crawling Baidu Tieba

# Batch-crawl Tieba list pages
# Page 1: https://tieba.baidu.com/f?kw=%E6%97%85%E8%A1%8C%E9%9D%92%E8%9B%99&ie=utf-8&pn=0
# Page 2: https://tieba.baidu.com/f?kw=%E6%97%85%E8%A1%8C%E9%9D%92%E8%9B%99&ie=utf-8&pn=50
# Page 3: https://tieba.baidu.com/f?kw=%E6%97%85%E8%A1%8C%E9%9D%92%E8%9B%99&ie=utf-8&pn=100
# Page 4: https://tieba.baidu.com/f?kw=%E6%97%85%E8%A1%8C%E9%9D%92%E8%9B%99&ie=utf-8&pn=150
# i.e. page n uses pn = (n - 1) * 50

# Basic version: hard-code the forum and just print the URL of each page
# base_url = "https://tieba.baidu.com/f?kw=%E6%97%85%E8%A1%8C%E9%9D%92%E8%9B%99&ie=utf-8&pn="
# for page in range(10):
#     new_url = base_url + str(page*50)
#     print(new_url)

# Advanced version: read the forum name and number of pages from the keyboard,
# then crawl and save each of the requested pages
from urllib import request, parse

base_url = "https://tieba.baidu.com/f?"

name = input("Enter the Tieba forum name: ")
page = input("Enter the number of pages: ")

# send a browser-like User-Agent so Tieba does not reject the request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
}

for i in range(int(page)):
    # pn grows by 50 per page: page 1 -> 0, page 2 -> 50, ...
    qs = {
        "kw": name,
        "pn": i * 50
    }
    qs_data = parse.urlencode(qs)
    url = base_url + qs_data

    req = request.Request(url, headers=headers)
    response = request.urlopen(req)
    html = response.read().decode("utf-8")

    # save each page as <forum name>_page<N>.html
    with open(name + "_page" + str(i + 1) + ".html", "w", encoding="utf-8") as f:
        f.write(html)
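
As a quick sanity check (my addition, not from the original post), parse.urlencode reproduces the percent-encoded kw value seen in the hand-written URLs at the top; the ie=utf-8 parameter is simply omitted here, as in the script above:

from urllib import parse

# "旅行青蛙" percent-encodes to %E6%97%85%E8%A1%8C%E9%9D%92%E8%9B%99, matching the example URLs
demo = parse.urlencode({"kw": "旅行青蛙", "pn": 50})
print("https://tieba.baidu.com/f?" + demo)
# https://tieba.baidu.com/f?kw=%E6%97%85%E8%A1%8C%E9%9D%92%E8%9B%99&pn=50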

 
