A Crawler for Backing Up Blog Code

Introduction

Some of the code posted on my blog needs to be backed up, so I built a crawler tool on top of Python's requests, BeautifulSoup, and selenium libraries that saves the code portions of blog posts to files.
Due to limited ability, only a single-threaded implementation exists for now; multithreading and other optimizations are planned to improve efficiency.
The complete code has been uploaded to GitHub: cnblogspider

The Code

The idea is to first collect the URLs of every post whose code needs to be saved, then fetch the code from each link and write it to a file. The code is split into three parts.

mainway.py

Collects all the links and stores the code files.

from getAllUrls import GetAllUrl
import time
from getCodeFromOnePage import getCode
import os

#Some constants: request headers, start URL, output directory
header={
"Accept": "application/json,text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "zh-cn",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Safari/605.1.15"
}

start_url = "https://www.cnblogs.com/ghosteq/category/2100349.html?page=1"
out_dir = 'AllPATCPPCode'

#Write a list of strings to a file
def store_lists_to_a_file(lists,file_names = "lists.txt"):
    with open(file_names,"w",encoding = "utf8",newline="") as fo:
        for i in lists:
            fo.write(str(i))
    return 0

#Create the output directory if it does not exist
if not os.path.exists(out_dir):
    os.mkdir(out_dir)

#Collect the post links
t1 = time.time()
ga = GetAllUrl(header)
urldic = ga.getUrlsDic(start_url)
t2 = time.time()
print('Collecting URLs took {} seconds'.format(t2-t1))

#Fetch and store the code on each page
for k,v in urldic.items():
    code = getCode(v)
    #k is the post title; assumed here to be a valid file name (see the sketch below)
    filespath = os.path.join(out_dir, k + ".cpp")
    store_lists_to_a_file(code,filespath)
t1 = time.time()
print('Fetching code took {} seconds'.format(t1-t2))
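One caveat with the loop above: the key k is the post title, which can contain characters that are illegal in file names on Windows (: ? " and so on). The original project does not guard against this; a minimal sanitizing helper, purely my own illustrative addition, could look like:

import re

def sanitize_filename(title):
    #Replace characters that Windows forbids in file names with '_'
    #(hypothetical helper, not part of the original project)
    return re.sub(r'[\\/:*?"<>|]', '_', title)

#usage in the loop above:
#filespath = os.path.join(out_dir, sanitize_filename(k) + ".cpp")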

getAllUrls.py

This part uses the requests and BeautifulSoup libraries to collect the URLs of all posts whose code needs to be backed up.

import requests
from bs4 import BeautifulSoup as BS

class GetAllUrl(object):
    
    header={
    "Accept": "application/json,text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-cn",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Safari/605.1.15"
    }
    def __init__(self,h=None):
        if(h is not None):
            self.header=h
               
    def getAllPagesUrl(self,pagesUrlsList,bs_s, header=header):
        #For multi-page categories, collect the url of every page
        bs_s_url = bs_s.find(class_="pager").find_all("a")
        for u in bs_s_url:
            if u["href"] not in pagesUrlsList:
                pagesUrlsList.append(u["href"])
        return pagesUrlsList
    
    def getAllUrlListInPages(self,pagesUrlsList, header=header):
        #Collect every post link on the listed pages; returns a list
        contextUrlsList = []
        for pages in pagesUrlsList:
            r_s = requests.get(pages, headers=header)
            bs_s = BS(r_s.text,"lxml")
            urls = bs_s.find_all(class_="entrylistItemTitle")
            
            for u in urls:
                if u["href"] not in contextUrlsList:
                    contextUrlsList.append(u["href"])
        return contextUrlsList
         
    def getAllUrlDicInPages(self,pagesUrlsList, header=header):
        #Collect every post link on the listed pages; returns a dict keyed by post title with the link as value
        contextUrlsDic = {}
        for pages in pagesUrlsList:
            r_s = requests.get(pages, headers=header)
            bs_s = BS(r_s.text,"lxml")
            urls = bs_s.find_all(class_="entrylistItemTitle")
            
            for u in urls:
                if u.find("span").string not in contextUrlsDic:
                    contextUrlsDic[u.find("span").string]=u["href"]
        return contextUrlsDic
    
    def getUrlsList(self,url,header=header):
        #Collect all post links for the paginated category at url; returns a list
        pagesUrlsList = []
        pagesUrlsList.append(url)
        r_s = requests.get(url, headers=header)
        bs_s = BS(r_s.text,"lxml")
        pagesUrlsList = self.getAllPagesUrl(pagesUrlsList,bs_s, header)
        contextUrlsList = self.getAllUrlListInPages(pagesUrlsList,header)
        return contextUrlsList
    
    def getUrlsDic(self,url,header=header):
        #Collect all post links for the paginated category at url; returns a dict keyed by post title with the link as value
        pagesUrlsList = []
        pagesUrlsList.append(url)
        r_s = requests.get(url, headers=header)
        bs_s = BS(r_s.text,"lxml")
        pagesUrlsList = self.getAllPagesUrl(pagesUrlsList,bs_s, header)
        contextUrlsDic = self.getAllUrlDicInPages(pagesUrlsList,header)
        return contextUrlsDic
		
if __name__ == '__main__': 
    url = "https://www.cnblogs.com/ghosteq/category/2100349.html?page=1"
    header={
    "Accept": "application/json,text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-cn",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Safari/605.1.15"
    }
    ga = GetAllUrl(header)
    contextUrlsList = ga.getUrlsList(url)
    contextUrlsDic = ga.getUrlsDic(url)
    print(contextUrlsList)
    print(len(contextUrlsList))
    print(contextUrlsDic)
    print(len(contextUrlsDic))
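As noted in the introduction, everything above is single-threaded. The per-page requests in getAllUrlDicInPages are independent of each other, so that stage is a natural candidate for the planned multithreading. A minimal sketch using concurrent.futures (my own addition, assuming the same page markup as above):

from concurrent.futures import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup as BS

def fetch_page_links(page_url, header):
    #Fetch one listing page and return {post title: link}
    r = requests.get(page_url, headers=header)
    bs = BS(r.text, "lxml")
    return {u.find("span").string: u["href"]
            for u in bs.find_all(class_="entrylistItemTitle")}

def get_urls_dic_parallel(pagesUrlsList, header, workers=4):
    #Fetch all listing pages concurrently and merge the results
    contextUrlsDic = {}
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for d in pool.map(lambda p: fetch_page_links(p, header), pagesUrlsList):
            contextUrlsDic.update(d)
    return contextUrlsDic

Note that the code-fetching stage below cannot be parallelized the same way: getCode goes through the system clipboard, which is a single shared resource.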

getCodeFromOnePage.py

This part uses the selenium and pyperclip libraries to grab the code on a page.
During development, win32clipboard did not work as expected: it raised "Specified clipboard format is not available" and the error proved hard to resolve, so I switched to pyperclip. However, pyperclip cannot copy when the screen is off or when Chrome is set not to display. The problematic code is kept (commented out) for later debugging.
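A clipboard-free alternative would be to skip the browser entirely and read the code straight out of the post HTML. This is not what the project currently does, and it assumes cnblogs renders the code inside <pre> tags, but as a sketch:

import requests
from bs4 import BeautifulSoup as BS

def getCodeNoBrowser(url, header=None):
    #Fetch the post and return the text of its last <pre> block
    #(assumes the code to back up sits in <pre> tags; unverified)
    r = requests.get(url, headers=header)
    bs = BS(r.text, "lxml")
    pres = bs.find_all("pre")
    return pres[-1].get_text() if pres else ''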

import win32clipboard
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
import time
import pyperclip


header={
"Accept": "application/json,text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "zh-cn",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Safari/605.1.15"
}
executable_path='driver\\chromedriver.exe'
#chrome_options = Options()
#chrome_options.add_argument('--no-sandbox')
#chrome_options.add_argument('--disable-dev-shm-usage')
#chrome_options.add_argument('--headless')

def getCode(url):
    #Grab the code on the page at url by clicking its "copy" button and reading the clipboard
    codetexts = ''
    browser = webdriver.Chrome(executable_path=executable_path)
    browser.get(url)
    time.sleep(1)
    #The last copy button on the page belongs to the code block to back up
    copybutton = browser.find_elements_by_css_selector("[class='clipboard code-copay-btn hljs-comment']")[-1]
    ActionChains(browser).move_to_element(copybutton).click().click().perform()
    time.sleep(1)
    codetexts = pyperclip.paste()
    browser.close()
    return codetexts

#During implementation, win32clipboard did not work as expected ("Specified clipboard format is not available") and the error was hard to resolve,
#so pyperclip is used instead; however, it cannot copy when the screen is off or Chrome is set not to display

#    win32clipboard.OpenClipboard()
#    win32clipboard.EmptyClipboard()
#    browser = webdriver.Chrome(chrome_options=chrome_options, executable_path=executable_path)

#    ActionChains(browser).key_down(Keys.CONTROL).send_keys("c").key_up(Keys.CONTROL).perform()
#    win32clipboard.CloseClipboard()
#    win32clipboard.OpenClipboard()
#    data=win32clipboard.GetClipboardData(win32clipboard.CF_UNICODETEXT)
#    win32clipboard.CloseClipboard()
    #print the clipboard contents
#    data = data.decode('utf-8')

if __name__ == '__main__': 
    url = "https://www.cnblogs.com/ghosteq/p/16691733.html"
    print(getCode(url))
#    win32clipboard.CloseClipboard()
#    contextUrlsList = getUrlsList(url,header)
#    print(contextUrlsList)
#    print(len(contextUrlsList))
#    win32clipboard.OpenClipboard()
#    #clear the clipboard
##    win32clipboard.EmptyClipboard()
#    data=win32clipboard.GetClipboardData(win32clipboard.CF_TEXT)
#    win32clipboard.CloseClipboard()
    #print the clipboard contents
#    data = data.decode('utf-8')
#    print(data)
#    print(type(data))
    