爬虫实践03 | xpath爬取通州区人民政府网站招聘信息

完整源代码：

#2022-03-02 xpath爬取通州区人民政府网站招聘信息
import requests
from lxml import etree
import time
from urllib.parse import urljoin  # stdlib; imported here so this script stays self-contained

# Directory that holds the paginated recruitment listing.
LIST_BASE = 'http://www.bjtzh.gov.cn/bjtz/home/zpxx/'
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) '
        'AppleWebKit/605.1.15 (KHTML, like Gecko) '
        'Version/13.0.3 Mobile/15E148 Safari/604.1'
    ),
}
PAGE_COUNT = 5  # the listing has 13 pages in total; raise this to fetch all of them
OUTPUT_FILE = r"通州news.txt"
ITEM_XPATH = '//*[@id="channelNames"]/li'


def page_url(page):
    """Return the listing URL for a 1-based page number.

    Page 1 is ``index.shtml``; every later page is ``index_<page>.shtml``.
    NOTE(review): this mapping is carried over from the original script's
    ``if i==1`` branch and has not been checked against the live site.
    """
    if page == 1:
        return LIST_BASE + 'index.shtml'
    return LIST_BASE + 'index_{}.shtml'.format(page)


def absolute_href(href, base=LIST_BASE):
    """Resolve *href* against *base*.

    Links that are already absolute come back unchanged; relative links are
    completed with the scheme/host (and directory, when needed) of *base*.
    """
    return urljoin(base, href)


def first_match(node, path):
    """Return the first XPath result for *path* under *node*, or None if there is none."""
    matches = node.xpath(path)
    return matches[0] if matches else None


def main():
    """Scrape PAGE_COUNT listing pages, print each item and append it to OUTPUT_FILE.

    Each output line is ``title,date,link``. Pages that do not answer with
    HTTP 200 and list items missing any of the three fields are skipped.
    """
    # Open the output once in append mode instead of reopening it per item.
    with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
        for page in range(1, PAGE_COUNT + 1):
            url = page_url(page)
            # A timeout keeps a stalled connection from hanging the script.
            response = requests.get(url=url, headers=HEADERS, timeout=10)
            if response.status_code == 200:
                # The text decodes to garbage unless UTF-8 is forced.
                response.encoding = 'utf-8'
                html = etree.HTML(response.text)
                for li in html.xpath(ITEM_XPATH):
                    title = first_match(li, './a/text()')
                    dt = first_match(li, './span/text()')
                    href = first_match(li, './a/@href')
                    if title is None or dt is None or href is None:
                        continue  # malformed entry; nothing useful to record
                    href = absolute_href(href, url)
                    print(title, dt, href)
                    f.write("{},{},{}\n".format(title, dt, href))
            else:
                print("skipped {} (HTTP {})".format(url, response.status_code))
            # Pause between requests, not between printed items.
            time.sleep(3)


if __name__ == "__main__":
    main()

分析：

1、访问网站信息

url=http://www.bjtzh.gov.cn/bjtz/home/zpxx/index.shtml

import requests
# Step 1: download the first listing page and dump its raw HTML.
url = 'http://www.bjtzh.gov.cn/bjtz/home/zpxx/index.shtml'
headers = {
    'User-Agent': (
        'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) '
        'AppleWebKit/605.1.15 (KHTML, like Gecko) '
        'Version/13.0.3 Mobile/15E148 Safari/604.1'
    ),
}
response = requests.get(url, headers=headers)
# Without forcing UTF-8 here, the printed text comes out garbled.
response.encoding = 'utf-8'
print(response.text)

如果不指定编码，打印出来的内容是乱码，所以要在请求之后加一句代码：response.encoding = 'utf-8'

2、解析数据，拿到所有li标签

import requests
from lxml import etree
# Step 2: parse the page and collect every <li> under the element with id "channelNames".
url = 'http://www.bjtzh.gov.cn/bjtz/home/zpxx/index.shtml'
headers = {
    'User-Agent': (
        'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) '
        'AppleWebKit/605.1.15 (KHTML, like Gecko) '
        'Version/13.0.3 Mobile/15E148 Safari/604.1'
    ),
}
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'
html = etree.HTML(response.text)
# No positional index on the final step, so all <li> nodes are selected.
lis = html.xpath('//*[@id="channelNames"]/li')
print(lis)

copy得到的xpath是：//*[@id="channelNames"]/li[1]，需要的是所有的li，所以把[1]去掉

3、找到单个li，并查找所有需要的字段

import requests
from lxml import etree
import time
# Step 3: pull the title, date and link out of each <li>.
url = 'http://www.bjtzh.gov.cn/bjtz/home/zpxx/index.shtml'
headers = {
    'User-Agent': (
        'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) '
        'AppleWebKit/605.1.15 (KHTML, like Gecko) '
        'Version/13.0.3 Mobile/15E148 Safari/604.1'
    ),
}
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'
html = etree.HTML(response.text)
lis = html.xpath('//*[@id="channelNames"]/li')

# XPath expressions, relative to one <li>, for: title, date, link.
FIELD_PATHS = ('./a/text()', './span/text()', './a/@href')

for li in lis:
    # Take the first match of each expression, in title/date/link order.
    title, date, link = (li.xpath(path)[0] for path in FIELD_PATHS)
    print(title, date, link)
    time.sleep(1)

输出结果为：

由输出结果可以看出，部分href链接异常，格式不统一，所以要加一个if判断条件

import requests
from lxml import etree
import time
# Step 3 (continued): same extraction, plus a prefix for links that end in "shtml".
url = 'http://www.bjtzh.gov.cn/bjtz/home/zpxx/index.shtml'
headers = {
    'User-Agent': (
        'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) '
        'AppleWebKit/605.1.15 (KHTML, like Gecko) '
        'Version/13.0.3 Mobile/15E148 Safari/604.1'
    ),
}
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'
html = etree.HTML(response.text)
lis = html.xpath('//*[@id="channelNames"]/li')

# XPath expressions, relative to one <li>, for: title, date, link.
FIELD_PATHS = ('./a/text()', './span/text()', './a/@href')

for li in lis:
    title, date, link = (li.xpath(path)[0] for path in FIELD_PATHS)
    # Whatever follows the last dot decides whether the site prefix is added.
    suffix = link.rsplit(".", 1)[-1]
    if suffix == "shtml":
        link = "http://www.bjtzh.gov.cn" + link
    print(title, date, link)
    time.sleep(1)

其他字段也是一样的方法：点击开发者工具左上角的元素选择按钮，在网页中点击你想查找的部分，然后在Elements面板对应的代码上点击右键，选择Copy->Copy XPath

4、保存数据

import requests
from lxml import etree
import time
from urllib.parse import urljoin  # stdlib; imported here so this script stays self-contained

# Directory that holds the paginated recruitment listing.
LIST_BASE = 'http://www.bjtzh.gov.cn/bjtz/home/zpxx/'
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) '
        'AppleWebKit/605.1.15 (KHTML, like Gecko) '
        'Version/13.0.3 Mobile/15E148 Safari/604.1'
    ),
}
PAGE_COUNT = 5  # the listing has 13 pages in total; set this to 13 to fetch all of them
OUTPUT_FILE = r"通州news.txt"
ITEM_XPATH = '//*[@id="channelNames"]/li'


def page_url(page):
    """Return the listing URL for a 1-based page number.

    Page 1 is ``index.shtml``; every later page is ``index_<page>.shtml``.
    NOTE(review): this mapping is carried over from the original script's
    ``if i==1`` branch and has not been checked against the live site.
    """
    if page == 1:
        return LIST_BASE + 'index.shtml'
    return LIST_BASE + 'index_{}.shtml'.format(page)


def absolute_href(href, base=LIST_BASE):
    """Resolve *href* against *base*.

    Links that are already absolute come back unchanged; relative links are
    completed with the scheme/host (and directory, when needed) of *base*.
    """
    return urljoin(base, href)


def first_match(node, path):
    """Return the first XPath result for *path* under *node*, or None if there is none."""
    matches = node.xpath(path)
    return matches[0] if matches else None


def main():
    """Scrape PAGE_COUNT listing pages, print each item and append it to OUTPUT_FILE.

    Each output line is ``title,date,link``. Pages that do not answer with
    HTTP 200 and list items missing any of the three fields are skipped.
    """
    # Open the output once in append mode instead of reopening it per item.
    with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
        for page in range(1, PAGE_COUNT + 1):
            url = page_url(page)
            # A timeout keeps a stalled connection from hanging the script.
            response = requests.get(url=url, headers=HEADERS, timeout=10)
            if response.status_code == 200:
                # The text decodes to garbage unless UTF-8 is forced.
                response.encoding = 'utf-8'
                html = etree.HTML(response.text)
                for li in html.xpath(ITEM_XPATH):
                    title = first_match(li, './a/text()')
                    dt = first_match(li, './span/text()')
                    href = first_match(li, './a/@href')
                    if title is None or dt is None or href is None:
                        continue  # malformed entry; nothing useful to record
                    # urljoin copes with both absolute and relative hrefs,
                    # so no file-suffix test is needed before prefixing.
                    href = absolute_href(href, url)
                    print(title, dt, href)
                    f.write("{},{},{}\n".format(title, dt, href))
            else:
                print("skipped {} (HTTP {})".format(url, response.status_code))
            # Pause between requests, not between printed items.
            time.sleep(3)


if __name__ == "__main__":
    main()