爬虫1
1. 将文件以xlsx格式存储并打开,里面只有一个sheet命名为‘工作表1’,内部格式为:
address_1 | Marketing | 链接_1 |
password: xxx | Pricelist | 链接_2 |
address_2 | Marketing | 链接_3 |
password: yyy | Pricelist | 链接_4 |
需求:将所有Pricelist对应的链接以pdf格式存储,存储名称为address
2. 步骤一:读取xlsx
import csv
import xlrd

# Load the price workbook and select the sheet named '工作表1'.
# NOTE(review): xlrd 2.0+ reads only .xls files, so opening this .xlsx
# presumably requires xlrd < 2.0 — confirm the installed version.
wb = xlrd.open_workbook(filename='C:/Users/Eleni/Downloads/price.xlsx')
sh = wb.sheet_by_name('工作表1')
步骤二:计算address数量
# Number of address entries: one per group of three sheet rows, computed as
# (row count + 1) divided by three, rounded down.
n = (sh.nrows + 1) // 3
步骤三:在for循环中利用PhantomJS逐一打开链接,截图然后转存到指定文件夹中
参考链接:https://www.cnblogs.com/hong-fithing/p/9656221.html
首先自动创建一个临时文件夹,代码如下:
import os

# Create a 'picture' folder under the current working directory.
# os.makedirs raises if the folder already exists.
current_dir = os.getcwd()
print(current_dir)
File_Path = f"{current_dir}\\picture\\"
os.makedirs(File_Path)
优化后加上目录创建时间,以及容错处理:
import os
import time
from selenium import webdriver

# The output folder is named after today's date (YYYY-MM-DD).
directory_time = time.strftime("%Y-%m-%d", time.localtime(time.time()))
try:
    File_Path = os.getcwd() + '\\' + directory_time + '\\'
    if not os.path.exists(File_Path):
        os.makedirs(File_Path)
        print("目录新建成功:%s" % File_Path)
    else:
        print("目录存在")
except OSError as msg:
    # Report filesystem failures only; catching BaseException here would
    # also swallow KeyboardInterrupt and SystemExit.
    print("新建目录失败:%s" % msg)
然后添加截图功能,将截图存入新建文件夹中:
# Open the link stored in column 2 of every (i*3+1)-th row and save a
# screenshot named <i>.png inside the dated folder.
# NOTE(review): PhantomJS support was dropped from recent Selenium releases;
# this presumably needs an older Selenium — confirm.
driver1 = webdriver.PhantomJS()
try:
    for i in range(n):
        driver1.get(sh.cell(i * 3 + 1, 2).value)
        driver1.save_screenshot('.\\' + directory_time + '\\' + str(i) + '.png')
finally:
    # Always shut the browser process down, even if a page fails to load.
    driver1.quit()
步骤四:读取图片转化成pdf格式
报错:
(unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape
原因为path中的‘\’没有转义(例如 'C:\Users' 里的 \U 会被当作Unicode转义序列的开头),应写成‘\\’或‘/’,或者使用原始字符串 r'...'
# Turn each screenshot <i>.png into a one-page PDF whose page size equals the
# image size; the PDF is named after the value in column 0 of row i*3.
base_dir = '\\'.join([os.getcwd(), 'Desktop', 'demo', directory_time])
for i in range(n):
    a = sh.cell(i * 3, 0).value
    png_path = base_dir + '\\' + str(i) + '.png'
    # str() keeps numeric cell values (read as float) from breaking the
    # string concatenation.
    pdf_path = base_dir + '\\' + str(a) + '.pdf'
    # Only the dimensions are needed here; close the file handle right away.
    with Image.open(png_path) as img:
        maxw, maxh = img.size
    c = canvas.Canvas(pdf_path, pagesize=portrait((maxw, maxh)))
    c.drawImage(png_path, 0, 0, maxw, maxh)
    c.showPage()
    c.save()
总结:跟人工在打印功能内转存pdf相比清晰度有所下降但不影响使用
测试中出现的问题:截图部分完全没有问题,但是转pdf的时候,有些过长的图片会只能读取一部分图片,解决方法是更换了一种转pdf方法,如下:
出自:windows下用Python把png图片转化为pdf文件_apollo_miracle的博客-CSDN博客_python 图片转pdf
import glob
import fitz
import os


def pic2pdf(pdf_name, pic_floder):
    """Merge every PNG in *pic_floder* into a single PDF saved in that folder.

    Args:
        pdf_name: Output file name; ".pdf" is appended when it is missing.
        pic_floder: Folder that holds the PNG images and receives the PDF.

    An existing file with the same name in the folder is replaced.
    """
    # NOTE(review): newer PyMuPDF releases name these methods convert_to_pdf /
    # insert_pdf; the camelCase names below presumably need an older release
    # — confirm against the installed version.
    doc = fitz.open()
    # Sort so that pages follow file-name order.
    for img in sorted(glob.glob(os.path.join(pic_floder, "*.png"))):
        print(img)
        imgdoc = fitz.open(img)  # open the image as a document
        pdfbytes = imgdoc.convertToPDF()  # one-page PDF built from the image
        imgdoc.close()
        imgpdf = fitz.open("pdf", pdfbytes)
        doc.insertPDF(imgpdf)  # append that page to the output document

    # Add the extension only when it is missing (the original check was
    # inverted and produced names ending in ".pdf.pdf").
    if not pdf_name.endswith(".pdf"):
        pdf_name += ".pdf"

    # Save next to the images, replacing any previous output.
    save_pdf_path = os.path.join(pic_floder, pdf_name)
    if os.path.exists(save_pdf_path):
        os.remove(save_pdf_path)

    doc.save(save_pdf_path)
    doc.close()


if __name__ == '__main__':
    pic2pdf("软件需求分析报告模板(完整版).pdf", "pic")
出现的问题:pip install fitz之后运行上段代码时出现错误:
No module named 'frontend'
解决方法为 pip install PyMuPDF
但是效果是所有图片被放置到一个pdf中,不符合要求
目前的解决方法参照:https://www.codespeedy.com/how-to-convert-image-to-pdf-in-python/
出现的问题:can only concatenate str (not "float") to str。原因是用‘+’拼接字符串时各部分必须都是字符串,从Excel读出的数值(float)需要先用str()转换后再拼接
步骤五:API
import dropbox


class TransferData:
    """Upload local files to Dropbox with a given access token."""

    def __init__(self, access_token):
        self.access_token = access_token

    def upload_file(self, file_from, file_to):
        """Upload the local file *file_from* to the Dropbox path *file_to*.

        Returns the value returned by ``files_upload`` so that callers which
        print the result see the upload outcome instead of ``None``.
        """
        dbx = dropbox.Dropbox(self.access_token)

        with open(file_from, 'rb') as f:
            return dbx.files_upload(f.read(), file_to)


def main():
    """Upload one sample workbook to Dropbox and print the upload result."""
    access_token = 'xxxxxxxx'
    transferData = TransferData(access_token)

    # Local file to be uploaded.
    file_from = 'C:/Users/Eleni/Desktop/demo/aaa .xlsx'
    # Full destination path in Dropbox, including the uploaded file's name.
    file_to = '/Test/aaa .xlsx'

    # API v2
    print(transferData.upload_file(file_from, file_to))


if __name__ == '__main__':
    main()
出现问题的解决方法:Error in path with django-storages with dropbox - Stack Overflow
步骤六:PDF根据文件名分类
参考:python 将指定文件夹中的指定文件放入指定文件夹中 - 秋华 - 博客园 (cnblogs.com)
实现了步骤六之后,发现每个文件存到的文件夹都是不一样的。。。所以其实是不需要进行分类的,直接在excel中标记好路径就可以,所以这部分就只供学习使用
过程中同时参考了Python startswith()方法 | 菜鸟教程 (runoob.com)
步骤七:补充下在转化完pdf之后批量删除图片
Python批量删除指定文件夹下某一格式的文件,如.png图片_me凡的博客-CSDN博客
步骤八:删除临时文件夹
Python简单删除目录下文件以及文件夹_Cls的博客-CSDN博客_python 删除文件夹
步骤九:写入exe(还没有完成)
[272]如何把Python脚本导出为exe程序_周小董-CSDN博客_python 生成exe
步骤十:配置文件
python中配置文件的使用方法_python之战-CSDN博客_python 配置文件的使用
area.ini文件包含
[mysql]
file_path = C:\Users\Eleni\Desktop\Bathla portal passcode .xlsx
download_path = C:\Users\Eleni\Desktop\
最终的代码实现
# Pipeline: read the workbook named in area.ini, screenshot each link, save
# every screenshot as a PDF, upload the PDFs to Dropbox, then delete the
# temporary folder and the source workbook.
import csv
import xlrd
import os
import time
import sys
import glob
import fitz
import shutil
import re
import configparser
import dropbox
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from PIL import Image
from selenium import webdriver

# The workbook path and the download folder come from the [mysql] section of
# area.ini.
config = configparser.ConfigParser()
config.read('area.ini', encoding='utf-8')
wb = xlrd.open_workbook(config.get('mysql', 'file_path'))
sh = wb.sheet_by_name('工作表1')
# One entry per group of three sheet rows.
n = int((sh.nrows + 1) / 3)

# The temporary working folder is named after today's date (YYYY-MM-DD).
directory_time = time.strftime("%Y-%m-%d", time.localtime(time.time()))
File_Path = config.get('mysql', 'download_path') + directory_time
try:
    if not os.path.exists(File_Path):
        os.makedirs(File_Path)
        print("目录新建成功:%s" % File_Path)
    else:
        print("目录存在")
except OSError as msg:
    # Every later step writes into this folder, so stop here instead of
    # failing later with a less helpful error.
    print("新建目录失败:%s" % msg)
    raise

# Screenshot the link in column 2 of every (i*3+1)-th row as <i>.png.
driver1 = webdriver.PhantomJS()
try:
    for i in range(n):
        driver1.get(sh.cell(i * 3 + 1, 2).value)
        driver1.save_screenshot(File_Path + '\\' + str(i) + '.png')
finally:
    # Always shut the browser process down, even if a page fails to load.
    driver1.quit()

# Convert each screenshot to <column-5 value>_<date>.pdf.
for i in range(n):
    a = sh.cell(i * 3, 5).value
    with Image.open(File_Path + '\\' + str(i) + '.png') as ImgFile:
        # The screenshot is converted to RGB first when it has an alpha channel.
        pdf_image = ImgFile.convert("RGB") if ImgFile.mode == 'RGBA' else ImgFile
        pdf_image.save(File_Path + '\\' + str(a) + '_' + directory_time + '.pdf', "PDF")


def del_files(path):
    """Delete every .png file found under *path*, including subfolders."""
    for root, dirs, files in os.walk(path):
        for name in files:
            if name.endswith(".png"):
                os.remove(os.path.join(root, name))
                print("Delete File: " + os.path.join(root, name))


if __name__ == "__main__":
    path = File_Path
    del_files(path)


class TransferData:
    """Upload local files to Dropbox with a given access token."""

    def __init__(self, access_token):
        self.access_token = access_token

    def upload_file(self, file_from, file_to):
        """Upload *file_from* to the Dropbox path *file_to*.

        Returns the value returned by ``files_upload`` so that the caller's
        print shows the upload outcome instead of ``None``.
        """
        dbx = dropbox.Dropbox(self.access_token)

        with open(file_from, 'rb') as f:
            return dbx.files_upload(f.read(), file_to)


def main():
    """Upload every generated PDF to the Dropbox folder given in column 6."""
    access_token = 'xxxxxx'
    transferData = TransferData(access_token)

    for i in range(n):
        a = sh.cell(i * 3, 5).value
        # Local PDF to be uploaded.
        file_from = File_Path + '\\' + str(a) + '_' + directory_time + '.pdf'
        # Full destination path in Dropbox, including the uploaded file's name.
        file_to = sh.cell(i * 3, 6).value + str(a) + '_' + directory_time + '.pdf'
        print(transferData.upload_file(file_from, file_to))


if __name__ == '__main__':
    main()

# Empty the temporary folder entry by entry (logging each removal), then
# remove the folder itself.
filelist = os.listdir(File_Path)
for f in filelist:
    filepath = os.path.join(File_Path, f)
    if os.path.isfile(filepath):
        os.remove(filepath)
        print(str(filepath) + " removed!")
    elif os.path.isdir(filepath):
        # Sub-folders are removed together with everything inside them.
        shutil.rmtree(filepath, True)
        print("dir " + str(filepath) + " removed!")
shutil.rmtree(File_Path, True)
print("删除成功")

# Finally delete the source workbook.
path = config.get('mysql', 'file_path')
if os.path.exists(path):
    os.remove(path)
else:
    # The original referenced an undefined name (my_file) here.
    print('no such file:%s' % path)