dayawanCrawler
import requests
from bs4 import BeautifulSoup


class GetWebData:
    def __init__(self):
        # Request headers captured from a browser session; the Cookie and Host
        # values are specific to the target server
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cookie': 'JSESSIONID=59D7AE73DA0256B8DACA9712795B8EB5',
            'Host': '61.142.120.214:9000',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        }
        # The listing URL is split around the page number
        self.urlHead = 'http://61.142.120.214:9000/web/salepermit.jsp?page='
        self.urlTail = '&&projectname=&&code=&&compname=&&address=&&date1=&&date2='

    def collectDataFromURL(self):
        # Fetch listing pages 1 through 10
        for i in range(1, 11):
            url = "{}{}{}".format(self.urlHead, i, self.urlTail)
            try:
                response = requests.get(url, headers=self.headers, timeout=20)
            except requests.exceptions.RequestException:
                print("Request {} failed or timed out".format(url))
                continue
            if response.status_code == 200:
                print("{} fetched successfully".format(url))
                # The page is served in GBK rather than UTF-8
                response.encoding = 'GBK'
                soup = BeautifulSoup(response.text, 'lxml')
                self.getOnePage(soup)
            else:
                print("{} request failed with status code {}".format(url, response.status_code))

    def getOnePage(self, soup):
        # Print the cells of every table row on the page
        for tr in soup.find_all('tr'):
            tds = tr.find_all('td')
            print(tds)


crawler = GetWebData()
crawler.collectDataFromURL()
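As written, getOnePage prints the raw td tags. A minimal refinement, sketched below, would pull out just the cell text for each row; since the column layout of the permit table isn't visible from this script, the sketch makes no assumption about what each cell means and simply collects the stripped text, skipping rows with no data cells.

    def getOnePage(self, soup):
        # Collect the visible text of each data cell in every table row
        for tr in soup.find_all('tr'):
            cells = [td.get_text(strip=True) for td in tr.find_all('td')]
            if cells:  # skip header-only or empty rows
                print(cells)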