百度贴吧爬虫小程序源码

# -*- coding: utf-8 -*-
import urllib.request
import urllib.parse
import os
# Base address of the Tieba forum listing; the query string is appended per page.
BASE_URL = 'http://tieba.baidu.com/f?'
# Root folder under which one sub-folder per forum name is created.
SAVE_ROOT = r'E:\python'
# The `pn` query parameter is sent as (page - 1) * 50, i.e. an offset rather
# than a page number (the original script computed this value but never used it).
POSTS_PER_PAGE = 50
# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the script forever.
REQUEST_TIMEOUT = 30
# Browser-like User-Agent sent with every request; built once, not per page.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}


def build_page_url(name, page):
    """Return the listing URL for 1-based page number *page* of forum *name*.

    *name* is passed raw: ``urlencode`` percent-encodes it exactly once.
    (Quoting it beforehand, as the script used to, double-encodes ``%``.)
    The URL is always built from ``BASE_URL`` so nothing accumulates between
    calls.
    """
    query = urllib.parse.urlencode({
        'kw': name,
        'ie': 'utf-8',
        'pn': (page - 1) * POSTS_PER_PAGE,
    })
    return BASE_URL + query


def download_page(name, page, save_dir):
    """Fetch one listing page and write the raw bytes into *save_dir*.

    The file is named ``<name>_<page>.html``. Returns the path written.
    Network and file-system errors (``urllib.error.URLError``, ``OSError``)
    propagate to the caller.
    """
    request = urllib.request.Request(url=build_page_url(name, page), headers=HEADERS)
    filepath = os.path.join(save_dir, name + '_' + str(page) + ".html")
    # The context manager closes the HTTP response even if writing fails.
    with urllib.request.urlopen(request, timeout=REQUEST_TIMEOUT) as response:
        with open(filepath, 'wb') as fp:
            fp.write(response.read())
    return filepath


def main():
    """Prompt for a page range and forum name, then download every page."""
    start = int(input("请输入开始页码:"))
    end = int(input("请输入结束页码:"))
    name = input("请输入搜索贴吧的名字:")
    if start < 1 or end < start:
        raise SystemExit("页码无效: 开始页码必须>=1且不大于结束页码")

    # Always define the target folder (the old code only set it when the
    # folder was missing) and tolerate it already existing from a prior run.
    save_dir = os.path.join(SAVE_ROOT, name)
    os.makedirs(save_dir, exist_ok=True)

    # end + 1: the end page entered by the user is downloaded too.
    for page in range(start, end + 1):
        download_page(name, page, save_dir)
        print("下载完成第{n}页".format(n=page))


if __name__ == "__main__":
    main()

 

posted on 2019-09-29 19:27  ybl20000418  阅读(166)  评论(0)  编辑  收藏  举报

导航