爬取python网站下载地址,并下载最新文件

1. 从 https://www.python.org/ftp/python/ 下载最新版本的 Python 文件
   该地址是 Python 的下载目录:先列出其中所有的版本号,找出最新的版本,再下载对应版本的文件(例如 https://www.python.org/ftp/python/3.5.2/Python-3.5.2.tar.xz)。

  代码如下:

import requests
from lxml import etree
import time
import random
from fake_useragent import UserAgent
# Directory index of the python.org FTP area; its links include one entry per
# version directory (see the sample output below: '..', '2.0/', '2.0.1/', ...).
url = 'https://www.python.org/ftp/python/'
# Send a User-Agent string chosen by fake_useragent with the request.
headers = {'User-Agent': UserAgent().random}
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() stops early on an HTTP error instead of parsing an error page.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()
# Decode the body as UTF-8, silently dropping any bytes that cannot be decoded.
html = response.content.decode('utf-8', 'ignore')
parse = etree.HTML(html)
# Text of every <a> element on the page.
table_list = parse.xpath('//a/text()')
# The loop body must be indented under a `for` that starts at column 0;
# the original layout raised an IndentationError.
for i in table_list:
    print(i)

####打印如下:
..
2.0/
2.0.1/
2.1/
2.1.1/
2.1.2/
2.1.3/

2.代码优化,获取最新下载地址,如https://www.python.org/ftp/python/3.12.2/Python-3.12.2.tgz

import requests
from lxml import etree
import time
import random
import re
from distutils.version import LooseVersion
from fake_useragent import UserAgent

# Directory names made only of dot-separated numbers, e.g. '2.0' or '3.12.2'.
VERSION_RE = re.compile(r'^\d+(\.\d+)*$')


def version_key(name):
    """Return a dotted version string such as '3.12.2' as the tuple (3, 12, 2).

    Comparing tuples of ints orders versions numerically, so '3.10' sorts
    after '3.9'. This replaces distutils' LooseVersion as the sort key;
    distutils was removed from the standard library in Python 3.12, so the
    LooseVersion import is no longer needed by this code.
    """
    return tuple(int(part) for part in name.split('.'))


def latest_version(link_texts):
    """Return the highest numeric version found in *link_texts*.

    link_texts: the link texts scraped from the index page, e.g.
        ['..', '2.0/', '3.12.2/']. Slashes are removed, and entries that are
        not purely numeric dotted versions are ignored.

    Raises ValueError if no entry looks like a version, which would otherwise
    surface as an unexplained IndexError.
    """
    versions = []
    for text in link_texts:
        name = text.replace('/', '')
        if VERSION_RE.match(name):
            versions.append(name)
    if not versions:
        raise ValueError('no version directories found in the index page')
    return max(versions, key=version_key)


def main():
    """Fetch the index page and print the .tgz URL of the highest version."""
    url = 'https://www.python.org/ftp/python/'
    headers = {'User-Agent': UserAgent().random}
    # Fail fast on a stalled connection or an HTTP error response.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()
    html = response.content.decode('utf-8', 'ignore')
    table_list = etree.HTML(html).xpath('//a/text()')

    x = latest_version(table_list)
    # NOTE(review): the highest-numbered directory may not contain a final
    # release tarball yet -- confirm the URL exists before downloading it.
    print('https://www.python.org/ftp/python/' + x + "/Python-" + x + ".tgz")


if __name__ == '__main__':
    main()

打印输出:https://www.python.org/ftp/python/3.12.2/Python-3.12.2.tgz

posted on 2023-11-18 18:53  枫飘过的天1  阅读(104)  评论(0)  编辑  收藏  举报