# Scrape the names and links of Python's built-in functions from the Runoob tutorial


import os
from urllib.parse import urljoin

import requests
from lxml import etree
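'''
1. Scrape the names and links of Python's built-in functions from the
   Runoob tutorial and save them to a file.
2. Scrape the descriptive text from each linked page.
   Status: step 2 failed (not working yet).

'''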
# Fetch the Runoob index page of Python built-in functions, extract every
# function name together with its link, and save them as tab-separated
# "name<TAB>absolute-url" lines in a UTF-8 text file.
url = "https://www.runoob.com/python/python-built-in-functions.html"
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops us from parsing an HTTP error page.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()
page_1 = response.text
tree = etree.HTML(page_1)

# Select the <a> elements themselves so that each name is read from the same
# element as its href. Two independent text()/@href queries can drift out of
# step when an anchor has no direct text node or no href attribute.
anchors = tree.xpath('//div[@id="content"]/table/tbody/tr/td/a')
links = [
    # string(.) yields the anchor's full text content; strip() keeps stray
    # whitespace from breaking the one-entry-per-line output format.
    (anchor.xpath('string(.)').strip(), anchor.get('href'))
    for anchor in anchors
    if anchor.get('href')  # attributes are read with get(); skip anchors without a link
]
a_list = [name for name, _ in links]
a_herf = [href for _, href in links]
print(a_list)
print(a_herf)

# The context manager closes the file even if a write fails part-way.
with open('python内置函数.txt', 'w', encoding='utf-8') as fp:
    for name, href in links:
        # urljoin resolves page-relative, root-relative and absolute hrefs
        # against the index page URL, avoiding "//" or wrong-directory links.
        url1 = urljoin(url, href)
        fp.write(name + '\t' + url1 + '\n')

# NOTE: a second step - fetching every linked page and extracting its
# descriptive text - was attempted here and abandoned because the XPath
# queries returned empty lists. Visible problems in that attempt:
#   * it parsed page_1 (the index page) again instead of the text of the
#     newly fetched page;
#   * the queries ended in "/text" (a child element named "text") rather
#     than "/text()" (text nodes).
# The queries tried were:
#   //div[@class="article-intro"]/p/text
#   //div[@id="content"]/blockquote/text    (the description block)




# posted @ 2021-07-06 16:20  索匣  views (96)  comments (0)  edit  bookmark  report