python 爬取获得github项目代码
# -*- coding:utf-8 -*-
# @Time   : 2021/7/22 22:04
# @Author : 周博
# @File   : test_1.py
# @Blog   : https://www.cnblogs.com/smartisn/
#
# Scrapes the GitHub search results for starred C# repositories and downloads
# each repository's zip archive, either via a Selenium-driven browser click
# (main script) or via URLs previously stored in MySQL (DownLoad_mysql_).
import requests
from lxml import etree
import sys
from urllib import request
import zipfile
import os
import time
import Download_mysql_zip.mysql.SQL as MYSQL
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options


def Get_whole_file(file):
    """Recursively collect the full paths of every file under a directory.

    :param file: root directory to walk
    :return: list of absolute/relative paths (one per regular file found)
    """
    file_paths = []
    for root, dirs, files in os.walk(file):
        # root is the directory currently visited; files are the plain
        # files directly inside it (sub-directories are walked implicitly).
        for f in files:
            file_paths.append(os.path.join(root, f))
    return file_paths


def un_zip(zip_filename, des_dir):
    """Extract a zip archive into the given directory.

    Corrupt archives are reported on stdout instead of raising, so batch
    extraction can continue past a bad download.

    :param zip_filename: path of the archive, e.g. ``a.zip``
    :param des_dir: destination directory, e.g. ``./data/``
    :return: None
    """
    # BUG FIX: zipfile.BadZipFile is raised by the ZipFile() constructor
    # itself when the archive header is corrupt. The original opened the
    # archive outside the try block, so the except clause could never
    # catch the very error it was written for.
    try:
        with zipfile.ZipFile(zip_filename, 'r') as archive:
            archive.extractall(des_dir)
        print(zip_filename, "解压成功")
    except zipfile.BadZipFile:
        print("Error: 压缩文件不完整:", zip_filename)


def DownLoad_mysql_(start, end):
    """Download repository zip archives for URLs stored in MySQL.

    Fetches each repository page, extracts the zip-download link from the
    "Code" dropdown via XPath, and saves it under ``./data/``.  Failures on
    individual URLs are printed and skipped (best-effort batch download).

    :param start: first row index passed to MYSQL.select_url_html
    :param end: last row index passed to MYSQL.select_url_html
    :return: None
    """
    URLS = MYSQL.select_url_html(start, end)
    for url_ in URLS:
        print("*******************")
        url = url_[0]
        print(url)
        # Use the last URL path segment as the local archive name.
        file_name = url.split("/")[-1]
        try:
            strhtml = requests.get(url, timeout=7)  # fetch the repo page
            tree = etree.HTML(strhtml.text)
            # Debug: show what li[3] (the alternative link) resolves to.
            print(tree.xpath('//*[@id="repo-content-pjax-container"]/div/div[2]/div[1]/div[1]/span/get-repo/details/div/div/div[1]/ul/li[3]/a//@href'))
            # li[2] holds the "Download ZIP" href (site-relative).
            href_down = tree.xpath('//*[@id="repo-content-pjax-container"]/div/div[2]/div[1]/div[1]/span/get-repo/details/div/div/div[1]/ul/li[2]/a//@href')[0]
            print("55555555555555555555555555555555555555555")
            print(href_down)
            href_down = "https://github.com" + href_down
            print(href_down)
            print("./data/" + file_name + '.zip')
            request.urlretrieve(href_down, "./data/" + file_name + '.zip')
            print("下载成功")
        except Exception as e:
            # Best-effort: log and move on to the next URL.
            print(e)
            continue


if __name__ == "__main__":
    options = Options()
    # options.headless = True  # uncomment to run without opening a window
    # Raw string: the original relied on invalid escape sequences
    # (\P, \G, \d, ...) that only work by accident and warn on newer Pythons.
    driver = webdriver.Chrome(r'D:\Program Apps\Google\Chrome\driver\chromedriver.exe', options=options)
    # Walk the GitHub search result pages (currently just page 0).
    for page in range(0, 1):
        url = 'https://github.com/search?l=C%23&o=desc&p=' + str(page) + '&q=C%23&s=stars&type=Repositories'
        print("*******************")
        print(url)
        strhtml = requests.get(url, timeout=7)
        tree = etree.HTML(strhtml.text)
        # Repository links on the search-result page.
        hreff = tree.xpath('//*[@id="js-pjax-container"]/div/div[3]/div/ul//div[@class="f4 text-normal"]//a//@href')
        for hh in hreff:
            try:
                file_name = hh.replace("/", "_")
                hh = "https://github.com" + hh
                driver.get(hh)
                time.sleep(2)  # let the page render before interacting
                wait = WebDriverWait(driver, 20)
                # Open the "Code" dropdown…
                button1 = wait.until(EC.element_to_be_clickable((By.XPATH,
                    '//*[@id="repo-content-pjax-container"]/div/div[2]/div[1]/div[1]/span/get-repo/details/summary')))
                button1.click()
                # …then click "Download ZIP" (li[3] inside the dropdown).
                button2 = wait.until(EC.element_to_be_clickable((By.XPATH,
                    '//*[@id="repo-content-pjax-container"]/div/div[2]/div[1]/div[1]/span/get-repo/details/div/div/div[1]/ul/li[3]/a')))
                button2.click()
                print(hh, "——————下载成功")
            except Exception as e:
                # Best-effort: log the failure and continue with the next repo.
                print(e)
                continue
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· Manus的开源复刻OpenManus初探
· AI 智能体引爆开源社区「GitHub 热点速览」
· C#/.NET/.NET Core技术前沿周刊 | 第 29 期(2025年3.1-3.9)
· 从HTTP原因短语缺失研究HTTP/2和HTTP/3的设计差异