本文主要介绍如何用爬虫抓取研招网上的招生信息。请使用者不要频繁运行代码,以免对服务器造成压力,否则后果自负。同时希望该代码可以帮助正在考研的学子们筛选出自己喜欢的学校。
1.首先登录研招网,查询目标专业在研招网上的专业代码。
2.将代码末尾 YanZhaoWang('1055') 括号内的专业代码改为上一步查到的专业代码。
3.运行程序,代码附在下方。运行结束后会在当前目录下自动生成一个文本文件 text.txt,你想要的信息就在里面。
import requests
from lxml import etree
import re
# Template of query fields; the names mirror the query-string parameters of
# the queryAction.do URL used by the scraper. Only "mldm" has a preset value.
# NOTE(review): the meaning of each abbreviated key is not shown in this
# file -- confirm against the site's search form before relying on them.
data = {
    "ssdm": "",
    "dwmc": "",
    "mldm": "zyxw",
    "mlmc": "",
    "yjxkdm": "",
    "zymc": "",
    "xxfs": "",
    "pageno": "",
}
class UniversityInfo:
    """Plain record holding the scraped details of one programme entry.

    Every attribute starts as an empty string; the scraper assigns the real
    values after parsing a detail page.
    """

    def __init__(self):
        self.EnrollmentUnit = ""       # admitting institution
        self.ExaminationMethod = ""    # examination method
        self.CollegesDepartments = ""  # school / department
        self.Major = ""                # major
        self.learningStyle = ""        # study mode
        self.ResearchDirection = ""    # research direction
        self.Instructor = ""           # supervisor
        self.Number = ""               # planned number of admissions
        self.Remarks = ""              # remarks
        self.Politics = ""             # exam subject: politics
        self.English = ""              # exam subject: foreign language
        self.BusinessClass1 = ""       # exam subject: specialised course 1
        self.BusinessClass2 = ""       # exam subject: specialised course 2
class YanZhaoWang:
    """Scrape programme listings from yz.chsi.com.cn for one discipline code.

    The work is done in four steps (see ``run``): collect the per-school
    result links from the paginated search results, collect the per-programme
    detail links from each school page, parse every detail page into
    ``UniversityInfo`` records, and append the records to ``./text.txt`` as
    tab-separated lines.
    """

    BASE_URL = "https://yz.chsi.com.cn"
    # Seconds to wait on a request; without a timeout a stalled connection
    # would block the scraper forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, code):
        """Prepare a scraper for the discipline code ``code`` (e.g. ``'1055'``)."""
        self.url = "https://yz.chsi.com.cn/zsml/queryAction.do?ssdm&dwmc&mldm=zyxw&mlmc&yjxkdm={}&zymc&xxfs&pageno={}"
        # Work on a private copy so that creating a scraper does not mutate
        # the module-level template shared by every instance.
        self.data = dict(data)
        self.data["yjxkdm"] = code
        self.page = 1
        self.schoolUrl = []
        self.facultyUrl = []
        self.schoolInfo = []

    def _absolute(self, href):
        """Return ``href`` as an absolute URL on ``BASE_URL``."""
        if href.startswith(("http://", "https://")):
            return href
        # Normalise the separator so hrefs with or without a leading slash
        # both yield exactly one '/' after the host.
        return self.BASE_URL + "/" + href.lstrip("/")

    def _fetch_tree(self, url):
        """POST to ``url`` and return the response body parsed by lxml."""
        html = requests.post(url, timeout=self.REQUEST_TIMEOUT).text
        return etree.HTML(html)

    @staticmethod
    def _first(nodes):
        """Return the first xpath result, or ``""`` when nothing matched."""
        return nodes[0] if nodes else ""

    def GetFacultyUrl(self):
        """Visit every collected school link and gather its detail-page links.

        Each href found in the eighth cell of a table row is printed and
        appended to ``self.facultyUrl``.
        """
        for school_href in self.schoolUrl:
            tree = self._fetch_tree(self._absolute(school_href))
            for faculty_href in tree.xpath('//table/tbody//td[8]/a/@href'):
                print(faculty_href)
                self.facultyUrl.append(faculty_href)

    def GetSchoolInfo(self):
        """Parse every detail page into ``UniversityInfo`` records.

        The general fields come from the ``zsml-condition`` table. One record
        is appended to ``self.schoolInfo`` per ``zsml-res-items`` group found
        on the page, each a distinct object so that a later group cannot
        overwrite an earlier one; a page without any such group still yields
        one record with empty exam-subject fields. Cells that are missing
        from the page are stored as ``""`` instead of raising ``IndexError``.
        """
        cell = '//table[@class="zsml-condition"]/tbody/tr[{}]/td[{}]/text()'
        for faculty_href in self.facultyUrl:
            tree = self._fetch_tree(self._absolute(faculty_href))
            first = self._first
            general = {
                "EnrollmentUnit": first(tree.xpath(cell.format(1, 2))),
                "ExaminationMethod": first(tree.xpath(cell.format(1, 4))),
                "CollegesDepartments": first(tree.xpath(cell.format(2, 2))),
                "Major": first(tree.xpath(cell.format(2, 4))),
                "learningStyle": first(tree.xpath(cell.format(3, 2))),
                "ResearchDirection": first(tree.xpath(cell.format(3, 4))),
                # Kept as the full list of text nodes; Tostring joins it
                # when the record is written out.
                "Instructor": tree.xpath(cell.format(4, 2)),
                "Number": first(tree.xpath(cell.format(4, 4))),
                "Remarks": first(tree.xpath('//table[@class="zsml-condition"]/tbody/tr[5]/text()')),
            }

            exam_groups = tree.xpath('//tbody[@class="zsml-res-items"]')
            if not exam_groups:
                exam_groups = [None]

            for group in exam_groups:
                record = UniversityInfo()
                for attr, value in general.items():
                    setattr(record, attr, value)
                if group is not None:
                    record.Politics = first(group.xpath('tr/td[1]/text()'))
                    record.English = first(group.xpath('tr/td[2]/text()'))
                    record.BusinessClass1 = first(group.xpath('tr/td[3]/text()'))
                    # The fourth cell belongs to the second specialised
                    # course, not to BusinessClass1 again.
                    record.BusinessClass2 = first(group.xpath('tr/td[4]/text()'))
                self.schoolInfo.append(record)

    def GetSchoolUrl(self, url):
        """Collect school links starting at ``url`` and following pagination.

        Links under the element with id ``form3`` are appended to
        ``self.schoolUrl``. While the ``lip lip-last`` list item carries an
        ``onclick`` link (presumably the "next page" control -- confirm
        against the live page), ``self.page`` is incremented and the next
        result page is fetched.
        """
        while True:
            tree = self._fetch_tree(url)
            next_page = tree.xpath('//li[@class="lip lip-last"]/a/@onclick')
            self.schoolUrl.extend(tree.xpath('//*[@id="form3"]/a/@href'))
            print(next_page)
            if not next_page:
                break
            self.page += 1
            url = self.url.format(self.data["yjxkdm"], self.page)

    def WriteSchool(self):
        """Append every collected record to ``./text.txt``, one line each.

        Fields are cleaned with ``Tostring`` and separated by tabs; the
        admitting institution of each record is also printed.
        """
        columns = (
            "EnrollmentUnit", "ExaminationMethod", "CollegesDepartments",
            "Major", "learningStyle", "ResearchDirection", "Instructor",
            "Number", "Remarks", "Politics", "English", "BusinessClass1",
            "BusinessClass2",
        )
        with open("./text.txt", 'a', encoding="utf-8") as f:
            for record in self.schoolInfo:
                print(record.EnrollmentUnit)
                fields = (self.Tostring(getattr(record, name)) for name in columns)
                f.write("\t".join(fields) + "\n")

    def Tostring(self, str):
        """Join ``str`` (a string or a list of strings) and strip tabs/newlines.

        The parameter name shadows the builtin but is kept so existing
        keyword callers continue to work.
        """
        joined = "".join(str)
        return re.sub("[\t\r\n]+", "", joined)

    def run(self):
        """Run the full pipeline: search pages, school pages, details, output."""
        self.GetSchoolUrl(self.url.format(self.data["yjxkdm"], self.page))
        self.GetFacultyUrl()
        self.GetSchoolInfo()
        self.WriteSchool()
if __name__ == '__main__':
    # Put the discipline (major) code to query inside the parentheses.
    yzw = YanZhaoWang('1055')
    yzw.run()
代码详细解释
代码运行过程中需要第三方库requests、lxml,以及Python标准库re
学校介绍是由UniversityInfo类的实例以属性形式储存的
YanZhaoWang类主要负责实现代码的主要逻辑
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· DeepSeek 开源周回顾「GitHub 热点速览」
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· AI与.NET技术实操系列(二):开始使用ML.NET
· 单线程的Redis速度为什么快?