Python 3 second-level (detail) page crawler
The script walks every list page of the target site, follows each entry to its second-level (detail) page, pulls the link field with a CSS selector copied from the browser, and appends the non-Chinese results to out.txt.

from bs4 import BeautifulSoup
import requests

# title:  文娱数据库 (entertainment database)
# target: http://wydb.leshanvc.com/
# author: 不想长大a

website = 'http://wydb.leshanvc.com/'    # target site
url_file = 'out.txt'                     # output file for the scraped links
headers = {'user-agent': 'Mozilla/5.0'}  # disguise as a regular browser
count = 0                                # number of links written

for page in range(1, 1388):
    # list page, e.g. http://wydb.leshanvc.com/page-1/
    basic_url = website + 'page-' + str(page) + '/'
    html = requests.get(basic_url, headers=headers)  # headers must be passed as a keyword argument
    soup = BeautifulSoup(html.text, 'html.parser')   # parse the list page

    for item in soup.find_all('div', 'list'):
        # link to the second-level (detail) page
        link = item.find('div', 'info').find('div').find('span', 'companyname').find('a')['href']
        xq_html = requests.get(website + link, headers=headers)  # fetch the detail page
        xq_soup = BeautifulSoup(xq_html.text, 'html.parser')     # parse the detail page

        # CSS selector obtained via the browser's "Copy selector"; select() returns an
        # empty list when nothing matches, so no try/except is needed here
        url_list = xq_soup.select(
            '#wydb > div.right > div.bor.con > div:nth-child(7) > span:nth-child(2) > a')

        for tag in url_list:
            text = tag.get_text()
            # skip entries that contain Chinese characters, keep the plain URLs
            if any('\u4e00' <= ch <= '\u9fa5' for ch in text):
                continue
            with open(url_file, 'a+', encoding='utf-8') as f:
                f.write(text)   # write the link to out.txt
                f.write('\n')   # newline
            print(text)
            count += 1

print('crawled a total of', count)
print('done')
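Hitting 1,387 list pages plus one detail page per entry in a tight loop can get the client rate-limited or stuck on a slow response. Below is a minimal sketch of a polite fetch helper that the requests.get calls above could be routed through; the helper name and the delay, timeout, and retry values are illustrative assumptions, not part of the original script.

import time
import requests

HEADERS = {'user-agent': 'Mozilla/5.0'}  # same disguise header as the main script

def polite_get(url, retries=3, delay=1.0, timeout=10):
    """Fetch a URL with a timeout, a short pause, and simple retries.

    The retry/delay/timeout defaults are illustrative, not taken from the
    original crawler.
    """
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=timeout)
            resp.raise_for_status()            # treat 4xx/5xx responses as failures
            time.sleep(delay)                  # pause between requests to stay polite
            return resp
        except requests.RequestException:
            time.sleep(delay * (attempt + 1))  # back off a little before retrying
    return None                                # caller decides how to handle a dead page

# usage: html = polite_get('http://wydb.leshanvc.com/page-1/')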