Web Scraping Homework
1. Scrape Romance of the Three Kingdoms: http://www.shicimingju.com/book/sanguoyanyi.html
Code:
import requests
from bs4 import BeautifulSoup

res = requests.get('https://www.shicimingju.com/book/sanguoyanyi.html')
soup = BeautifulSoup(res.text, 'lxml')
# First, extract the book title
book_name = soup.find(class_='bookmark-list').find(name='h1').text
# Collect the URL of every chapter
url_list = soup.select('.book-mulu ul li a')
for line in url_list:
    url = 'https://www.shicimingju.com' + line.attrs.get('href')
    # Fetch each chapter page in turn
    res1 = requests.get(url)
    soup1 = BeautifulSoup(res1.text, 'lxml')
    # Extract the chapter title
    title = soup1.select('.bookmark-list h1')[0].text
    # Extract the chapter body
    content = soup1.find(class_='chapter_content').text
    with open('%s.txt' % book_name, 'a', encoding='utf-8') as f:
        # Append the chapter title
        f.write(title)
        # Append the chapter body
        f.write(content)
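The loop above fires one request per chapter with no pause or error handling, so a single failed response kills the whole run. A minimal sketch of a retrying fetch helper (the helper name, retry count, and the explicit UTF-8 setting are my assumptions, not part of the original script):

import time
import requests

def fetch(url, retries=3, delay=1.0):
    # Hypothetical helper: retry a GET a few times, pausing between attempts
    for attempt in range(retries):
        try:
            res = requests.get(url, timeout=5)
            res.raise_for_status()      # raise on 4xx/5xx status codes
            res.encoding = 'utf-8'      # assumption: the site serves UTF-8 text
            return res
        except requests.RequestException:
            if attempt == retries - 1:  # out of retries: re-raise the error
                raise
            time.sleep(delay)           # back off before the next attempt

With this in place, each requests.get(...) call in the script above could become fetch(...).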
2. Scrape KFC store information: http://www.kfc.com.cn/kfccda/storelist/index.aspx
import requests
import json

data = {
    'cname': '上海',
    'pid': '',
    'keyword': '',
    'pageIndex': 1,
    'pageSize': 1000
}
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36',
    'Referer': 'http://www.kfc.com.cn/kfccda/storelist/index.aspx',
}
res = requests.post('http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx',
                    params={'op': 'cname'}, data=data, headers=header)
# The endpoint returns JSON, so parse it directly (no HTML parsing needed)
kfc_info = json.loads(res.text).get('Table1')
kfc_list = [
    {
        'storeName': kfc.get('storeName') + '餐厅',
        'addressDetail': kfc.get('addressDetail'),
        'pro': kfc.get('pro'),
    }
    for kfc in kfc_info
]
print(kfc_list)
print(len(kfc_list))  # 455
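pageSize=1000 works here because Shanghai has fewer stores than that, but it silently caps the result. A sketch that pages through the endpoint instead, assuming (unverified) that a page past the end comes back with an empty or missing Table1:

import requests

HEADERS = {
    'User-Agent': 'Mozilla/5.0',  # any reasonable UA; the full one above also works
    'Referer': 'http://www.kfc.com.cn/kfccda/storelist/index.aspx',
}

def all_stores(city, page_size=100):
    # Hypothetical pager: advance pageIndex until no rows come back
    stores, page = [], 1
    while True:
        data = {'cname': city, 'pid': '', 'keyword': '',
                'pageIndex': page, 'pageSize': page_size}
        res = requests.post('http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx',
                            params={'op': 'cname'}, data=data, headers=HEADERS)
        batch = res.json().get('Table1') or []  # assumption: exhausted pages return no Table1
        if not batch:
            break
        stores.extend(batch)
        page += 1
    return stores

print(len(all_stores('上海')))  # should also print 455 if the assumption holds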
3. Scrape Lagou job listings
import requests

# The Ajax endpoint that actually returns the job data
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
payload = {
    'first': 'true',
    'pn': '1',
    'kd': 'python',
}
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    'Accept': 'application/json, text/javascript, */*; q=0.01'
}
# The original search page the browser visits first
urls = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
# Create a session
s = requests.Session()
# Visit the search page first to pick up the cookies Lagou checks for
s.get(urls, headers=header, timeout=3)
# The cookies obtained by that request (the session also resends them automatically)
cookie = s.cookies
# Post to the Ajax endpoint with those cookies and read the response body
response = s.post(url, data=payload, headers=header, cookies=cookie,
                  params={'city': '上海'}, timeout=5).text
print(response)
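print(response) dumps the raw JSON string. A short sketch of pulling out a few fields, assuming the payload nests the job list under content -> positionResult -> result (the field names positionName, companyFullName, and salary are likewise assumptions to verify against the actual response):

import json

data = json.loads(response)
# Assumed structure: content -> positionResult -> result holds the job entries
jobs = ((data.get('content') or {}).get('positionResult') or {}).get('result') or []
for job in jobs:
    # positionName / companyFullName / salary are assumed field names
    print(job.get('positionName'), job.get('companyFullName'), job.get('salary'))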