Scraping Dynamically Loaded Page Data

Pages that load their content on the fly usually fetch it from a JSON endpoint behind the scenes, so instead of rendering the page you can request that endpoint directly. The script below pulls the Douban movie chart API, extracts the fields of interest, and writes them to an Excel file with xlwt.
# _*_ coding:utf-8 _*_
'''
@author:zl
@contact:
@site: https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,2.html
'''
import requests
import xlwt

headers = {
    'user-agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
    'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    'accept-language': "zh-CN,zh;q=0.9",
    'Connection': 'keep-alive',
}

# Fetch the raw JSON from the chart endpoint
def get_content():
    params = {'action': '', 'start': 0, 'limit': 300}
    resp = requests.get(
        "https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90",
        params=params, headers=headers)
    return resp.json()  # the endpoint returns a JSON list, no manual decoding needed

# Pull the wanted fields out of each record
def parse_fields(jsondata):
    items = []
    for i in jsondata:
        items.append({
            'rank': i['rank'],
            'cover_url': i['cover_url'],
            'id': i['id'],
            'types': i['types'],
            'regions': i['regions'],
            'title': i['title'],
            'url': i['url'],
            'release_date': i['release_date'],
            'actor_count': i['actor_count'],
            'vote_count': i['vote_count'],
            'score': i['score'],
            'actors': i['actors'],
        })
    return items

# Write the scraped records to Excel
def excel_write(items):
    for item in items:
        row = item['rank']  # rank starts at 1, so row 0 stays free for the header
        for col, key in enumerate(item):  # dicts keep insertion order (Python 3.7+)
            value = item[key]
            if isinstance(value, list):   # xlwt cannot write lists (types, regions, actors)
                value = ', '.join(value)
            ws.write(row, col, value)     # row, column, data

if __name__ == '__main__':
    newTable = "test2.xls"  # output file name
    wb = xlwt.Workbook(encoding='utf-8')  # create the workbook, declare the encoding
    ws = wb.add_sheet('sheet1', cell_overwrite_ok=True)  # create the sheet
    headData = ['rank', 'cover_url', 'id', 'types', 'regions', 'title', 'url',
                'release_date', 'actor_count', 'vote_count', 'score', 'actors']  # header row
    for colnum in range(len(headData)):
        ws.write(0, colnum, headData[colnum], xlwt.easyxf('font: bold on'))
    excel_write(parse_fields(get_content()))
    wb.save(newTable)
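The one-shot request above asks for up to 300 rows at once via the start/limit parameters. If the chart is longer than a single request allows, the same two parameters can drive simple pagination. Below is a minimal sketch, assuming the endpoint keeps honouring start/limit as offsets into the ranked list; the page size of 20 is an arbitrary choice, not anything documented in this post.

# Hypothetical pagination sketch over the same chart endpoint
import requests

URL = "https://movie.douban.com/j/chart/top_list"
params = {"type": 5, "interval_id": "100:90", "action": "", "start": 0, "limit": 20}

all_rows = []
while True:
    page = requests.get(URL, params=params).json()
    if not page:                          # assumption: an empty list marks the end of the chart
        break
    all_rows.extend(page)
    params["start"] += params["limit"]    # advance the offset by one page

print(len(all_rows), "rows fetched in total")

Treating an empty list as the end-of-data signal matches how list-style JSON endpoints commonly behave, but it is an assumption worth verifying against the actual API before relying on it.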
# Parsing JSON
import requests
import json

response = requests.get('http://httpbin.org/get')
res1 = json.loads(response.text)  # decoding by hand: too cumbersome
res2 = response.json()            # requests' built-in shortcut: get the JSON directly
print(res1 == res2)  # True
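One caveat with response.json(): it raises an exception when the body is not valid JSON (a ValueError subclass across requests versions), for example when an endpoint returns an HTML error page instead of data. A small defensive sketch, using httpbin's HTML endpoint to provoke the failure:

# Guarding response.json() against non-JSON bodies
import requests

response = requests.get('http://httpbin.org/html')  # returns HTML, not JSON
try:
    data = response.json()
except ValueError:
    data = None
    print("Response was not JSON; falling back to raw text:", response.text[:50])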