python之简单爬虫
# Simple 51job scraper.
# Flow: 1) send HTTP requests mimicking a browser, 2) download the page HTML,
#       3) extract the useful fields with regexes, 4) store them in an Excel file.
import re

import requests
import xlwt

# Search-result URL template; the page number is substituted into {page}.
# BUGFIX: the query string must read '&degreefrom=99' — in the original the
# '&deg' fragment had been rendered as the '°' character, corrupting the URL.
BASE_URL = (
    'https://search.51job.com/list/080200,000000,0000,00,9,07,'
    '%25E8%25BD%25AF%25E4%25BB%25B6%25E6%25B5%258B%25E8%25AF%2595,2,{page}.html'
    '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99'
    '&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
)

# ---------------------------------------------------------------------------
# Excel initialisation: create the workbook (held in memory until save()),
# one sheet, and a header row.  BUGFIX: encoding was misspelled 'uft-8'.
workBook = xlwt.Workbook(encoding='utf-8')
workSheet = workBook.add_sheet('51job')
# Header columns: job title, company, location, salary, posting date.
colName = ['岗位名称', '公司名称', '地点', '薪资', '发布时间']
for col, title in enumerate(colName):
    workSheet.write(0, col, title)  # (row, column, value)


def get_pagenum():
    """Return the total number of result pages, parsed from page 1.

    The count appears in the markup as
    ``<span class="td">共N页,到第</span>``.
    """
    resp = requests.get(BASE_URL.format(page=1))
    resp.encoding = 'gbk'  # site serves GBK; prevents mojibake
    return int(
        re.findall('<span class="td">共(.*?)页,到第</span>', resp.text, re.S)[0]
    )


# ---------------------------------------------------------------------------
# Scrape every page and write one row per job posting.
row = 1  # row 0 holds the header
for page in range(1, get_pagenum() + 1):
    resp = requests.get(BASE_URL.format(page=page))
    resp.encoding = 'gbk'  # same GBK decoding fix as above
    # Each posting lives inside a <div class="el">...</div> block; re.S lets
    # '.' match the newlines inside the block.
    for item in re.findall('<div class="el">(.*?)</div>', resp.text, re.S):
        # The title-bearing anchors appear twice per block: the job link
        # first, then the company link (see the sample HTML at the bottom).
        titles = re.findall('<a target="_blank" title="(.*?)" href', item, re.S)
        if len(titles) < 2:
            # Robustness fix: skip blocks that lack the two anchors instead
            # of crashing with IndexError on temp[1].
            continue
        jobName = titles[0].strip()
        company = titles[1].strip()
        # re.findall returns a list, so index [0] extracts the single match.
        address = re.findall('<span class="t3">(.*?)</span>', item, re.S)[0]
        salary = re.findall('<span class="t4">(.*?)</span>', item, re.S)[0]
        jobTime = re.findall('<span class="t5">(.*?)</span>', item, re.S)[0]
        for col, value in enumerate((jobName, company, address, salary, jobTime)):
            workSheet.write(row, col, value)
        row += 1
        print(jobName, company, address, salary, jobTime)

# ---------------------------------------------------------------------------
# Persist the workbook to disk (nothing is written until save()).
workBook.save('D:\\51.job.xls')

# Sample of the HTML structure the regexes above are written against:
'''
<div class="el">
    <p class="t1 ">
        <em class="check" name="delivery_em" onclick="checkboxClick(this)"></em>
        <input class="checkbox" type="checkbox" name="delivery_jobid" value="85295957" jt="0" style="display:none">
        <span>
            <a target="_blank" title="软件测试工程师" href="https://jobs.51job.com/hangzhou-yhq/85295957.html?s=01&t=0" onmousedown="">
                软件测试工程师 </a>
        </span>
    </p>
    <span class="t2"><a target="_blank" title="杭州老板电器股份有限公司" href="https://jobs.51job.com/all/co2322348.html">杭州老板电器股份有限公司</a></span>
    <span class="t3">杭州-余杭区</span>
    <span class="t4">10-15万/年</span>
    <span class="t5">03-21</span>
</div>
'''