Example: crawling a Baidu Tieba forum (excerpt)
import os

import requests
class Spider:
    """Download listing pages of a Baidu Tieba forum and save them under /tmp/tieba.

    The forum name is embedded in both the request URL and the output
    file names.
    """

    def __init__(self, name):
        """Initialize the spider.

        Args:
            name: the Tieba forum name to crawl (interpolated into the
                `kw=` query parameter and the saved file names).
        """
        self.name = name
        # `pn` is Tieba's pagination offset: 50 posts per page.
        self.url_temp = "https://tieba.baidu.com/f?kw=" + name + "&ie=utf-8&pn={}"
        # Browser-like UA so the server returns the normal desktop page.
        # (Original source had this literal broken across two lines; rejoined.)
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}

    def get_url_list(self, pages=3):
        """Return the listing-page URLs to fetch.

        Args:
            pages: how many pages to generate (default 3, matching the
                original hard-coded behavior).
        """
        return [self.url_temp.format(i * 50) for i in range(pages)]

    def parse_url(self, url):
        """GET `url` with the browser headers and return the raw Response."""
        response = requests.get(url, headers=self.headers)
        return response

    def save_html_str(self, html_str, page_num):
        """Write one page's HTML to /tmp/tieba/<name>吧_第<page_num>页."""
        file_path = "/tmp/tieba/{}吧_第{}页".format(self.name, page_num)
        # Fix: the original crashed with FileNotFoundError when the
        # target directory did not exist yet.
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(html_str)

    def run(self):
        """Fetch every listing page and save it to disk."""
        # Fix: the original used url_list.index(url) inside the loop —
        # O(n^2) and wrong if a URL ever repeats; enumerate is both.
        for page_num, url in enumerate(self.get_url_list(), start=1):
            html_str = self.parse_url(url).content.decode()
            self.save_html_str(html_str, page_num)
def main():
    """Entry point: ask which forum to crawl, then run the spider on it."""
    forum_name = input("请输入要爬取的贴吧:")
    Spider(forum_name).run()


if __name__ == "__main__":
    main()
Assumption is a kind of superpower.