Week 1 Learning Summary
This week I studied some basic Python syntax and functions, and learned how to scrape different kinds of websites.
I scraped epidemic data from 丁香园, as well as job listings from 拉勾网 and movie information from 猫眼.
I picked up two ways of extracting the data: XPath and regular expressions.
XPath
import requests
from lxml import etree

url = 'http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId=AH20021900306'
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
}
response = requests.post(url, headers=headers)
res_html = etree.HTML(response.text)  # parse the fetched page into a tree that XPath can query
dd_list = res_html.xpath('//*[@id="f_baner"]/div[1]/div[1]/div[2]/strong/text()')[0].strip()
print(dd_list)
This extracts the selected text node; .strip() trims the whitespace off both ends.
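To see why the .strip() matters, here is a minimal sketch on a toy HTML snippet (the markup below is invented for illustration, not the real page structure):

from lxml import etree

# toy HTML, invented for illustration -- not the real beijing.gov.cn page
html = etree.HTML('<div id="f_baner"><strong>  疫情咨询  </strong></div>')
texts = html.xpath('//*[@id="f_baner"]/strong/text()')
print(texts)             # ['  疫情咨询  '] -- the raw text node keeps its whitespace
print(texts[0].strip())  # '疫情咨询'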
When the data you want is right there in the page source, use a regular expression:
import requests
import re
import xlwt

url = 'https://maoyan.com/board/4?'
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
}

def get_page(url):
    try:
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        if response.status_code == 200:
            return response.text
        else:
            print('Failed to fetch the page')
    except Exception as e:
        print(e)

def get_info(page):
    # each (.*?) group captures one piece of content you want to scrape
    items = re.findall('.*?class="name"><a href=".*?" title="(.*?)".*?">.*?</a></p>.*?<p class="releasetime">(.*?)</p>', page, re.S)
    for item in items:
        data = {}
        data['name'] = item[0]
        data['time'] = item[1]
        print(data)
        yield data

urls = ['https://maoyan.com/board/4?offset={}'.format(i * 10) for i in range(10)]
DATA = []
for url in urls:
    print(url)
    page = get_page(url)
    datas = get_info(page)
    for data in datas:
        DATA.append(data)  # collect every record into DATA

f = xlwt.Workbook(encoding='utf-8')
sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
sheet01.write(0, 0, 'name')  # row 1, column 1
sheet01.write(0, 1, 'time')  # row 1, column 2
# write the rows
for i in range(len(DATA)):
    sheet01.write(i + 1, 0, DATA[i]['name'])
    sheet01.write(i + 1, 1, DATA[i]['time'])
    print('p', end='')
f.save('F:\\作业.xls')
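As a quick illustration of how the capture groups behave, here is a toy run on an invented string that mirrors Maoyan's markup (the movie data below is made up, not fetched):

import re

sample = '''<p class="name"><a href="/films/1" title="霸王别姬">霸王别姬</a></p>
<p class="releasetime">上映时间:1993-01-01</p>'''
# each (.*?) group becomes one element of the returned tuple;
# re.S lets . match newlines so the pattern can span lines
items = re.findall('class="name"><a href=".*?" title="(.*?)".*?</a></p>.*?<p class="releasetime">(.*?)</p>', sample, re.S)
print(items)  # [('霸王别姬', '上映时间:1993-01-01')]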
There is also data that lives in JSON and is fetched via AJAX:
import requests
import time
import xlwt

urls = 'https://www.lagou.com/jobs/list_%E7%88%AC%E8%99%AB/p-city_0?&cl=false&fromSearch=true&labelWords=&suginput='
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
headers = {
    'Host': 'www.lagou.com',
    'Origin': 'https://www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_%E7%88%AC%E8%99%AB/p-city_0?&cl=false&fromSearch=true&labelWords=&suginput=',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest'
}

# Find the XHR request under Network that actually contains the data; its form
# parameters are listed at the bottom of that request's headers.
# The page URL never changes -- the form parameters control the search keyword and page number.
def get_page(url, i):
    data = {
        'first': 'false',
        'kd': '爬虫',  # search keyword; when you change it, update the Referer in headers too
        'pn': i  # page number
    }
    try:
        s = requests.Session()
        s.get(urls, headers=headers, timeout=3)  # request the listing page first to obtain cookies
        cookie = s.cookies  # the cookies from that request
        response = s.post(url, data=data, headers=headers, cookies=cookie, timeout=3)  # fetch the JSON
        time.sleep(5)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return response.json()
    except requests.ConnectionError as e:
        print('Error:', e.args)

def get_info(resp_json):
    results = resp_json['content']['positionResult']['result']
    for item in results:
        data = {}
        data['companyFullName'] = item['companyFullName']
        data['city'] = item['city']
        data['district'] = item['district']
        data['education'] = item['education']
        data['financeStage'] = item['financeStage']
        data['industryField'] = item['industryField']
        data['companySize'] = item['companySize']
        data['positionName'] = item['positionName']
        data['salary'] = item['salary']
        data['workYear'] = item['workYear']
        yield data

DATA = []
for i in range(1, 3):
    resp_json = get_page(url, i)
    print('Request ' + str(i) + ': ' + url)
    time.sleep(2)
    datas = get_info(resp_json)  # if this step fails, the problem is probably in the previous step
    for data in datas:
        DATA.append(data)

'''
with open('./拉勾网.text', 'w', encoding='utf-8') as fp:
    fp.write(str(DATA))
'''

'''
f = xlwt.Workbook(encoding='utf-8')
sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
sheet01.write(0, 0, 'companyFullName')  # row 1, column 1
sheet01.write(0, 1, 'city')
sheet01.write(0, 2, 'district')
sheet01.write(0, 3, 'education')
sheet01.write(0, 4, 'financeStage')
sheet01.write(0, 5, 'positionName')
sheet01.write(0, 6, 'salary')
sheet01.write(0, 7, 'workYear')
sheet01.write(0, 8, 'companySize')
sheet01.write(0, 9, 'industryField')
# write the rows
for i in range(len(DATA)):
    sheet01.write(i + 1, 0, DATA[i]['companyFullName'])
    sheet01.write(i + 1, 1, DATA[i]['city'])
    sheet01.write(i + 1, 2, DATA[i]['district'])
    sheet01.write(i + 1, 3, DATA[i]['education'])
    sheet01.write(i + 1, 4, DATA[i]['financeStage'])
    sheet01.write(i + 1, 5, DATA[i]['positionName'])
    sheet01.write(i + 1, 6, DATA[i]['salary'])
    sheet01.write(i + 1, 7, DATA[i]['workYear'])
    sheet01.write(i + 1, 8, DATA[i]['companySize'])
    sheet01.write(i + 1, 9, DATA[i]['industryField'])
    print('p', end='')
f.save(u'F:\\爬虫.xls')
'''
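Since the whole trick here is knowing the shape of the JSON, here is a minimal sketch with a hand-written dict that mimics the content -> positionResult -> result nesting the code above walks through (the field values below are invented, not real Lagou data):

# invented sample mimicking the nesting of positionAjax.json;
# a real record carries many more fields
fake_json = {
    'content': {
        'positionResult': {
            'result': [
                {'companyFullName': '某某科技有限公司', 'city': '北京', 'salary': '15k-25k'},
            ]
        }
    }
}
for item in fake_json['content']['positionResult']['result']:
    print(item['companyFullName'], item['city'], item['salary'])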