Python学习日记——小作业之抓取新氧数据
# Scrape post-operative care information from SoYoung (新氧, soyoung.com)
# and store it in an .xls workbook.
import json

import requests
import xlwt
from bs4 import BeautifulSoup

# Candidate HTTP(S) proxies.
# NOTE: nothing routes through these by default; pass them explicitly, e.g.
# ``scra_data(proxies=proxies)``, to send the requests through them.
proxies = {
    "http": "http://49.70.64.155:9999",
    "https": "http://59.57.148.70:9999",
}

# Index of the next spreadsheet row to write; shared by the functions below.
row = 0

# CSS selector of the nodes that are extracted from every item page.
_CARE_SELECTOR = "#surgery_after > div > div"
# File every workbook in this script is saved to.
_OUTPUT_FILE = "xinyanginfo.xls"
# Seconds to wait for a response, so a stalled connection cannot hang the run.
_REQUEST_TIMEOUT = 10


def _select_care_nodes(page_url, headers, proxies=None):
    """Download *page_url* and return the nodes matching ``_CARE_SELECTOR``."""
    response = requests.get(
        page_url, headers=headers, proxies=proxies, timeout=_REQUEST_TIMEOUT
    )
    soup = BeautifulSoup(response.text, 'lxml')
    return soup.select(_CARE_SELECTOR)


def get_shuhouhuli(url_diclist, proxies=None):
    """Write the selected text of every page in *url_diclist* to the workbook.

    Args:
        url_diclist: list of ``{name: page_url}`` dicts, as returned by
            ``get_finalurl``.
        proxies: optional ``requests``-style proxy mapping; ``None`` sends the
            requests without an explicit proxy mapping.

    Each page takes one row (continuing from the module-level ``row``
    counter), one matched node per column starting at column 0. The workbook
    is saved to ``xinyanginfo.xls`` once all pages are written. Network errors
    from ``requests`` propagate to the caller.
    """
    global row
    if not url_diclist:
        # Nothing to scrape: leave any existing output file untouched.
        return

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
    }
    # One workbook for the whole run: creating and saving a new one per dict
    # would overwrite the file each time and keep only the last dict's rows.
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = workbook.add_sheet('doctorinfo', cell_overwrite_ok=True)
    for url_dic in url_diclist:
        for page_url in url_dic.values():
            nodes = _select_care_nodes(page_url, headers, proxies)
            for col, node in enumerate(nodes):
                print(node.text)
                sheet.write(row, col, node.text)
            row += 1
    workbook.save(_OUTPUT_FILE)


def get_finalurl(preurl, proxies=None):
    """Return the item-page URLs listed by the JSON endpoint *preurl*.

    Args:
        preurl: URL of an item-list endpoint that returns a JSON array.
        proxies: optional ``requests``-style proxy mapping.

    Returns:
        A list of single-entry dicts ``{name: item_page_url}``. Entries that
        lack ``name`` or ``seo.pinyin`` are printed and skipped. If the
        endpoint cannot be fetched or parsed, a message is printed and the
        list collected so far (normally empty) is returned.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
    }
    finalurl = []
    try:
        reply = requests.get(
            preurl, headers=headers, proxies=proxies, timeout=_REQUEST_TIMEOUT
        )
        entries = json.loads(reply.text)
        for info in entries:
            try:
                pinyin = info["seo"]["pinyin"]
                finalurl.append({info["name"]: "https://www.soyoung.com/itemk/" + pinyin + "/"})
            except (KeyError, TypeError, IndexError):
                # Malformed entry (missing key, wrong type, null pinyin):
                # show it and carry on with the rest.
                print(info)
    except (requests.RequestException, ValueError, TypeError):
        # Request failed, the body was not valid JSON, or the decoded JSON
        # was not iterable.
        print(preurl + "不可用")
    return finalurl


def scra_data(proxies=None):
    """Scrape every item under menu ids 20155..20243 into ``xinyanginfo.xls``.

    Args:
        proxies: optional ``requests``-style proxy mapping used for all
            requests; ``None`` sends them without an explicit proxy mapping.

    Each item takes one row (continuing from the module-level ``row``
    counter): column 0 is the item name, column 1 its page URL, and columns
    2+ hold the text of the nodes selected from the page. On the first
    failure the current list URL and the error are printed and the scrape
    stops. The workbook is always saved, so partial results survive a failure
    or a Ctrl-C.
    """
    global row
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = workbook.add_sheet('xinyanginfo', cell_overwrite_ok=True)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
    }
    url = ""
    try:
        for menu_id in range(20155, 20244):
            # First-level URL: the JSON item list of one menu.
            url = "https://www.soyoung.com/items/itemList?_json=1&menu_id=" + str(menu_id)
            # Second-level URLs: one {name: item_page_url} dict per item.
            for url_dic in get_finalurl(url, proxies):
                for name, page_url in url_dic.items():
                    nodes = _select_care_nodes(page_url, headers, proxies)
                    sheet.write(row, 0, name)
                    sheet.write(row, 1, page_url)
                    for col, node in enumerate(nodes, start=2):
                        sheet.write(row, col, node.text)
                    row += 1
    except Exception as exc:
        # Report which list URL was being processed and why it failed.
        print(url)
        print(repr(exc))
    finally:
        # Runs on success, on failure and on KeyboardInterrupt alike.
        workbook.save(_OUTPUT_FILE)


# Kept as an unguarded call so the script behaves as before when executed.
scra_data()
记录一下抓取的代码。由于新氧的安全策略,代理需要频繁更换;估计运行四次左右即可抓全数据。