# 爬取空气质量1 (Scraping air quality data, part 1)
"""Scrape per-station air-quality tables from air-level.com into a CSV file.

The script downloads the site's index page, follows every same-site link
found on it, and appends the rows of each page's station table to
``OUTPUT_PATH``.
"""
import csv
import re
import urllib.parse
import urllib.request

import requests
from lxml import etree

BASE_URL = 'http://www.air-level.com'

REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}

# Column titles written once per run, before the first data row.
CSV_HEADER = ['监测站', 'AQL', '空气质量等级', 'PM2.5', 'PM10', '主要污染物']

OUTPUT_PATH = 'C:\\Users\\lenovo\\Desktop\\ookk.csv'

# Absolute location of the station table's rows.
# NOTE(review): tied to the page layout; yields no rows if the layout differs.
ROWS_XPATH = '/html/body/div[1]/div[3]/div[1]/div[2]/table/tr'

# XPath of each column relative to one table row, in CSV_HEADER order.
CELL_XPATHS = (
    'td[1]/text()',
    'td[2]/text()',
    'td[3]/span/text()',
    'td[4]/text()',
    'td[5]/text()',
    'td[6]/text()',
)

# Seconds to wait on any single HTTP request before giving up.
TIMEOUT = 10


def _page_urls(index_html):
    """Return the same-site URLs linked from *index_html*, in document order."""
    base = BASE_URL + '/'
    host = urllib.parse.urlsplit(base).netloc
    urls = []
    for href in re.findall(r'<a href="(.*?)">', index_html):
        # urljoin resolves relative hrefs and leaves absolute ones intact,
        # so plain concatenation can no longer build a malformed URL.
        absolute = urllib.parse.urljoin(base, href)
        if urllib.parse.urlsplit(absolute).netloc == host:
            urls.append(absolute)
    return urls


def _row_cells(row):
    """Return the six column texts of one table row ('' for a cell without text)."""
    # Reading cells per row keeps the columns aligned even when a cell is
    # empty; page-wide column lists would shift later values up by one.
    return [''.join(row.xpath(path)) for path in CELL_XPATHS]


def _csv_row(cells):
    """Shape one station's cells into the row that is written to the CSV."""
    name, aql, rank, pm2, pm10, _pollutant = cells
    if rank == '优':
        # The pollutant column is left out for rows rated '优'.
        return [name, aql, rank, pm2, pm10]
    if aql == '-':
        # Only the station name is written when the AQL column is '-'.
        return [name]
    return cells


def main():
    """Download every linked page and append its station rows to the CSV.

    Pages that cannot be fetched, do not answer with HTTP 200, or cannot be
    parsed are skipped. The header row is written once per run, immediately
    before the first data row.
    """
    with urllib.request.urlopen(BASE_URL + '/', timeout=TIMEOUT) as index_response:
        index_html = index_response.read().decode()

    header_written = False
    with open(OUTPUT_PATH, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for page_url in _page_urls(index_html):
            try:
                page = requests.get(
                    page_url, headers=REQUEST_HEADERS, timeout=TIMEOUT)
            except requests.RequestException as exc:
                # One unreachable page must not abort the whole run.
                print('skipped {}: {}'.format(page_url, exc))
                continue
            if page.status_code != 200:
                continue

            tree = etree.HTML(page.text)
            if tree is None:
                # etree.HTML returns None when the body has nothing to parse.
                continue

            for row in tree.xpath(ROWS_XPATH):
                cells = _row_cells(row)
                if not cells[0]:
                    # Rows without a station name (e.g. no <td> cells) carry no data.
                    continue
                if not header_written:
                    writer.writerow(CSV_HEADER)
                    header_written = True
                writer.writerow(_csv_row(cells))


if __name__ == '__main__':
    main()