A Crawler for Backing Up Blog Code
Introduction
Since some of the code in my blog posts needs backing up, I built a crawler based on Python's requests, BeautifulSoup, and selenium libraries that saves the code sections of posts to local files.
For now it is a single-threaded implementation; multithreading and other optimizations are planned later to improve throughput (a sketch of one possible approach follows the mainway.py code below).
The complete code has been uploaded to GitHub: cnblogspider
The Code
The approach is to first collect the URLs of every post whose code needs saving, then fetch the code from each URL one by one and write it to a file. The code is split into three parts.
mainway.py
Collects all the links and writes the code files.
from getAllUrls import GetAllUrl
import time
from getCodeFromOnePage import getCode
import os
# set up some constants
header = {
    "Accept": "application/json,text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-cn",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Safari/605.1.15"
}
start_url = "https://www.cnblogs.com/ghosteq/category/2100349.html?page=1"
out_dir = 'AllPATCPPCode'
# write a string (or any iterable of strings) to a file
def store_lists_to_a_file(lists, file_names="lists.txt"):
    with open(file_names, "w", encoding="utf8", newline="") as fo:
        for i in lists:
            fo.write(str(i))
    return 0
# create the output directory if it does not exist yet
if not os.path.exists(out_dir):
    os.mkdir(out_dir)
# collect the page information
t1 = time.time()
ga = GetAllUrl()
urldic = ga.getUrlsDic(start_url)
t2 = time.time()
print('Collecting the URLs took {} seconds'.format(t2 - t1))
# fetch and store the code on each page
for k, v in urldic.items():
    code = getCode(v)
    # k is the post title; os.path.join keeps the path portable, but note
    # that titles may contain characters that are invalid in file names
    filespath = os.path.join(out_dir, k + ".cpp")
    store_lists_to_a_file(code, filespath)
t1 = time.time()
print('Fetching the code took {} seconds'.format(t1 - t2))
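
The introduction mentions multithreading as future work. Since every getCode call starts its own Chrome instance, the download loop parallelizes naturally; the following is a minimal sketch using concurrent.futures (fetch_and_store and max_workers=4 are my own choices, not part of the current tool). Note that the system clipboard is shared across processes, so this is only safe with a clipboard-free getCode such as the one sketched at the end of this post, not with the pyperclip version.

from concurrent.futures import ThreadPoolExecutor

def fetch_and_store(item):
    # download one post's code and write it to the output directory
    k, v = item
    store_lists_to_a_file(getCode(v), os.path.join(out_dir, k + ".cpp"))

# one Chrome instance per worker; four workers keeps memory use reasonable
with ThreadPoolExecutor(max_workers=4) as pool:
    list(pool.map(fetch_and_store, urldic.items()))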
getAllUrls.py
This part uses the requests and BeautifulSoup libraries to collect the URLs of all posts whose code needs backing up.
import requests
from bs4 import BeautifulSoup as BS
class GetAllUrl(object):
    header = {
        "Accept": "application/json,text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-cn",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Safari/605.1.15"
    }
    def __init__(self, h=None):
        if h is not None:
            self.header = h
    def getAllPagesUrl(self, pagesUrlsList, bs_s, header=header):
        # for a multi-page category, collect the URL of every page
        bs_s_url = bs_s.find(class_="pager").find_all("a")
        for u in bs_s_url:
            if u["href"] not in pagesUrlsList:
                pagesUrlsList.append(u["href"])
        return pagesUrlsList
    def getAllUrlListInPages(self, pagesUrlsList, header=header):
        # collect every post link listed on the pages, returned as a list
        contextUrlsList = []
        for pages in pagesUrlsList:
            r_s = requests.get(pages, headers=header)
            bs_s = BS(r_s.text, "lxml")
            urls = bs_s.find_all(class_="entrylistItemTitle")
            for u in urls:
                if u["href"] not in contextUrlsList:
                    contextUrlsList.append(u["href"])
        return contextUrlsList
    def getAllUrlDicInPages(self, pagesUrlsList, header=header):
        # collect every post link listed on the pages, returned as a dict
        # mapping post title -> post URL
        contextUrlsDic = {}
        for pages in pagesUrlsList:
            r_s = requests.get(pages, headers=header)
            bs_s = BS(r_s.text, "lxml")
            urls = bs_s.find_all(class_="entrylistItemTitle")
            for u in urls:
                if u.find("span").string not in contextUrlsDic:
                    contextUrlsDic[u.find("span").string] = u["href"]
        return contextUrlsDic
    def getUrlsList(self, url, header=header):
        # collect all post links in the category starting at url, as a list
        pagesUrlsList = [url]
        r_s = requests.get(url, headers=header)
        bs_s = BS(r_s.text, "lxml")
        pagesUrlsList = self.getAllPagesUrl(pagesUrlsList, bs_s, header)
        return self.getAllUrlListInPages(pagesUrlsList, header)
    def getUrlsDic(self, url, header=header):
        # collect all post links in the category starting at url, as a dict
        # mapping post title -> post URL
        pagesUrlsList = [url]
        r_s = requests.get(url, headers=header)
        bs_s = BS(r_s.text, "lxml")
        pagesUrlsList = self.getAllPagesUrl(pagesUrlsList, bs_s, header)
        return self.getAllUrlDicInPages(pagesUrlsList, header)
if __name__ == '__main__':
    url = "https://www.cnblogs.com/ghosteq/category/2100349.html?page=1"
    header = {
        "Accept": "application/json,text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-cn",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Safari/605.1.15"
    }
    ga = GetAllUrl(header)
    contextUrlsList = ga.getUrlsList(url)
    contextUrlsDic = ga.getUrlsDic(url)
    print(contextUrlsList)
    print(len(contextUrlsList))
    print(contextUrlsDic)
    print(len(contextUrlsDic))
getCodeFromOnePage.py
This part uses the selenium and pyperclip libraries to grab the code on a page.
During implementation, win32clipboard did not behave as expected: it raised the error "Specified clipboard format is not available", which proved hard to resolve, so I switched to pyperclip. However, pyperclip cannot copy when the screen is off or when Chrome is set to run invisibly. The problematic code is kept, commented out, so it can be revisited later; a clipboard-free alternative is sketched after the code below.
import win32clipboard
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
import time
import pyperclip
header = {
    "Accept": "application/json,text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-cn",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Safari/605.1.15"
}
executable_path = 'driver\\chromedriver.exe'
#chrome_options = Options()
#chrome_options.add_argument('--no-sandbox')
#chrome_options.add_argument('--disable-dev-shm-usage')
#chrome_options.add_argument('--headless')
def getCode(url):
    # fetch the code on the page at url by clicking the page's copy
    # button and then reading the system clipboard
    codetexts = ''
    browser = webdriver.Chrome(executable_path=executable_path)
    browser.get(url)
    time.sleep(1)
    # the last element with this class is the code block's copy button
    copybutton = browser.find_elements_by_css_selector("[class='clipboard code-copay-btn hljs-comment']")[-1]
    ActionChains(browser).move_to_element(copybutton).click().click().perform()
    time.sleep(1)
    codetexts = pyperclip.paste()
    browser.close()
    return codetexts
# win32clipboard did not behave as expected: it raised "Specified clipboard
# format is not available" and the error was hard to resolve, so pyperclip is
# used instead; pyperclip, however, cannot copy when the screen is off or
# Chrome is set to run invisibly.
# win32clipboard.OpenClipboard()
# win32clipboard.EmptyClipboard()
# browser = webdriver.Chrome(chrome_options=chrome_options, executable_path=executable_path)
# ActionChains(browser).key_down(Keys.CONTROL).send_keys("c").key_up(Keys.CONTROL).perform()
# win32clipboard.CloseClipboard()
# win32clipboard.OpenClipboard()
# data = win32clipboard.GetClipboardData(win32clipboard.CF_UNICODETEXT)
# win32clipboard.CloseClipboard()
# print the clipboard contents:
# data = data.decode('utf-8')
if __name__ == '__main__':
    url = "https://www.cnblogs.com/ghosteq/p/16691733.html"
    print(getCode(url))
    # win32clipboard.CloseClipboard()
    # contextUrlsList = getUrlsList(url, header)
    # print(contextUrlsList)
    # print(len(contextUrlsList))
    # win32clipboard.OpenClipboard()
    # # clear the clipboard
    ## win32clipboard.EmptyClipboard()
    # data = win32clipboard.GetClipboardData(win32clipboard.CF_TEXT)
    # win32clipboard.CloseClipboard()
    # print the clipboard contents
    # data = data.decode('utf-8')
    # print(data)
    # print(type(data))
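
As a clipboard-free alternative (my own sketch, not part of the original tool): the code text can be read straight from the post's HTML, which sidesteps the screen-off and hidden-Chrome limitation entirely and needs no browser at all. This assumes the code is rendered inside <pre> elements on the post page; the exact markup on cnblogs should be checked against the page source before relying on it.

import requests
from bs4 import BeautifulSoup

def get_code_without_clipboard(url, headers=None):
    # read the code blocks directly out of the post's HTML,
    # with no browser or clipboard involved
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "lxml")
    # assumption: each code block sits inside a <pre> element
    blocks = soup.find_all("pre")
    return "\n".join(b.get_text() for b in blocks)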