Python百度文库爬虫之ppt文件
Python百度文库爬虫之ppt文件
对于文件的所有类型,我都会用一篇文章进行说明,链接:
- Python百度文库爬虫之txt文件
- Python百度文库爬虫之doc文件
- Python百度文库爬虫之pdf文件
- Python百度文库爬虫之ppt文件
- Python百度文库爬虫之xls文件
- Python百度文库爬虫终极版
一.网页分析
PPT文件的内容实际上是图片,我们只需要把这些图片下载并保存即可。
# Display the local screenshot ./Images/ppt_0.png; the notebook renders the
# value of the cell's final bare expression, so the call must stay last.
from IPython.display import Image
Image("./Images/ppt_0.png",width="600px",height="400px")
二.数据链接
# Display the local screenshot ./Images/ppt_1.png (relies on Image imported
# from IPython.display in an earlier cell).
Image("./Images/ppt_1.png",width="600px",height="400px")
查看该链接返回的内容,与我们在网页中看到的图片数据一致:
# Display the local screenshot ./Images/ppt_3.png (relies on Image imported
# from IPython.display in an earlier cell).
Image("./Images/ppt_3.png",width="600px",height="400px")
三.程序调试
import requests
import json
import re
import json
# One shared session is reused for every request made by the following cells.
session = requests.session()
url = input("请输入下载的文件URL地址:")
response = session.get(url)
# Decode with the charset the page declares: UTF-8 when it says so, GBK for
# anything else (including a missing declaration). A hard-coded GBK decode
# raises UnicodeDecodeError on UTF-8 pages; main() applies the same rule.
charsets = re.findall('charset=(.*?)"', response.text)
code = 'utf-8' if charsets and charsets[0].lower() == 'utf-8' else 'gbk'
content = response.content.decode(code)
# The document id is the URL segment between "view/" and ".html"
# (the dot is escaped so it only matches a literal ".").
doc_id = re.findall(r'view/(.*?)\.html', url)[0]
# docType and title are read from single-quoted values in the page source.
types = re.findall(r"docType.*?:.*?'(.*?)'", content)[0]
title = re.findall(r"title.*?:.*?'(.*?)'", content)[0]
请输入下载的文件URL地址: https://wenku.baidu.com/view/b906673ed1d233d4b14e852458fb770bf68a3b18.html?fr=search
doc_id
'b906673ed1d233d4b14e852458fb770bf68a3b18'
types
'ppt'
title
'精品课件-爬虫技术'
# Query the getbcsurl endpoint for this document's slide images
# (pn=1&rn=9999 presumably requests every page in one call — confirm).
content_url = (
    'https://wenku.baidu.com/browse/getbcsurl?doc_id='
    + doc_id
    + '&pn=1&rn=9999&type=ppt'
)
content = session.get(content_url).content.decode('gbk')
# Capture each "zoom" value and remove every backslash from it to obtain a
# plain image URL.
url_list = [
    raw.replace('\\', '')
    for raw in re.findall('{"zoom":"(.*?)","page"', content)
]
url_list
['https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=1&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=0-487135&jpg=0-133194',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=2&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=487136-641849&jpg=133195-323627',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=3&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=641850-727977&jpg=323628-515968',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=4&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=727978-856818&jpg=515969-615553',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=5&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=856819-942946&jpg=615554-777861',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=6&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=942947-1029074&jpg=777862-946638',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=7&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=1029075-1125666&jpg=946639-1030299',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=8&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=1125667-1245334&jpg=1030300-1167061',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=9&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=1245335-1331462&jpg=1167062-1272951',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=10&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=1331463-1417590&jpg=1272952-1420059',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=11&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=1417591-1503718&jpg=1420060-1609566',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=12&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=1503719-1589846&jpg=1609567-1829614',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=13&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=1589847-1675974&jpg=1829615-1961046',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=14&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=1675975-1762102&jpg=1961047-2057806',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=15&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=1762103-1848230&jpg=2057807-2244791',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=16&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=1848231-1934358&jpg=2244792-2465213',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=17&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=1934359-2020486&jpg=2465214-2668167',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=18&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=2020487-2106614&jpg=2668168-2791205',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=19&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=2106615-2192742&jpg=2791206-2932355',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=20&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=2192743-2278870&jpg=2932356-3128420',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=21&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=2278871-2364998&jpg=3128421-3316449',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=22&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=2364999-2451126&jpg=3316450-3622711',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=23&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=2451127-2537254&jpg=3622712-3795370',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=24&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=2537255-2623382&jpg=3795371-4062675',
'https://wkretype.bdimg.com/retype/zoom/0836f08558f5f61fb636661b?pn=25&o=jpg_6&md5sum=f2be3d1fed17d9e67fa325fbdfbafa6c&sign=c05e1cdb4f&png=2623383-&jpg=4062676-']
import os

# Output folder for this document: a hard-coded Windows location plus the title.
path = "F:\\桌面\\Files" + "\\" + title
# makedirs also creates missing parent folders and accepts an existing folder;
# os.mkdir raises FileNotFoundError when the parent does not exist yet.
os.makedirs(path, exist_ok=True)
# Download every image and store it as <index>.jpg, numbered from 0.
for index, url in enumerate(url_list):
    content = session.get(url).content
    paths = os.path.join(path, str(index) + '.jpg')
    with open(paths, 'wb') as f:
        f.write(content)
print("图片保存在" + title + "文件夹")
图片保存在精品课件-爬虫技术文件夹
# Display the local screenshot ./Images/ppt_4.png (relies on Image imported
# from IPython.display in an earlier cell).
Image("./Images/ppt_4.png",width="600px",height="400px")
四.函数编程
import requests
import json
import re
import os
# Shared HTTP session used by all parse_* functions and by main().
session = requests.session()
# Folder that receives the downloaded files (hard-coded Windows location).
path = "F:\\桌面\\Files"
# makedirs creates missing parent folders too and does nothing when the folder
# already exists; os.mkdir raises FileNotFoundError if the parent is missing.
os.makedirs(path, exist_ok=True)
def parse_txt1(code, doc_id):
    """Fetch the text of a txt document through the getdocinfo API.

    Args:
        code: Encoding used to decode the getdocinfo response.
        doc_id: Document id taken from the Wenku URL.

    Returns:
        The "c" value of every entry in each item's "parags" list from the
        text response, concatenated in order.

    Raises:
        IndexError: If "md5sum", "rsign" or "totalPageNum" cannot be found
            in the getdocinfo response.
    """
    info_url = 'https://wenku.baidu.com/api/doc/getdocinfo?callback=cb&doc_id=' + doc_id
    info = session.get(info_url).content.decode(code)

    md5sum = re.findall('"md5sum":"(.*?)",', info)[0]
    rsign = re.findall('"rsign":"(.*?)"', info)[0]
    pn = re.findall('"totalPageNum":"(.*?)"', info)[0]

    # md5sum is appended with no separator of its own; presumably the value
    # already starts with its "&..." query prefix — confirm against the API.
    text_url = (
        'https://wkretype.bdimg.com/retype/text/' + doc_id
        + '?rn=' + pn
        + '&type=txt' + md5sum
        + '&rsign=' + rsign
    )
    # The text response is always decoded as GBK, independent of `code`.
    pages = json.loads(session.get(text_url).content.decode('gbk'))
    return ''.join(parag['c'] for page in pages for parag in page['parags'])
def parse_txt2(content, code, doc_id):
    """Fetch the text of a txt document using fields embedded in its page.

    Args:
        content: Decoded source of the document page; must contain the
            "md5sum", "rsign" and "show_page" fields.
        code: Not used by this function; kept so existing callers still work.
        doc_id: Document id taken from the Wenku URL.

    Returns:
        The "c" value of every entry in each item's "parags" list from the
        text response, concatenated in order.

    Raises:
        IndexError: If one of the three fields is missing from ``content``.
    """
    md5sum = re.findall('"md5sum":"(.*?)",', content)[0]
    rsign = re.findall('"rsign":"(.*?)"', content)[0]
    pn = re.findall('"show_page":"(.*?)"', content)[0]

    # md5sum is appended with no separator of its own; presumably the value
    # already starts with its "&..." query prefix — confirm against the API.
    text_url = (
        'https://wkretype.bdimg.com/retype/text/' + doc_id
        + '?rn=' + pn
        + '&type=txt' + md5sum
        + '&rsign=' + rsign
    )
    # This variant decodes the text response as UTF-8.
    pages = json.loads(session.get(text_url).content.decode('utf-8'))
    return ''.join(parag['c'] for page in pages for parag in page['parags'])
def parse_doc(content):
    """Extract the text of a doc document from its page source.

    The page source embeds escaped links to "0.json" resources. Each distinct
    link is downloaded once, in the order it first appears in the page, and
    the text fragments found in it are joined into one string.

    Args:
        content: Decoded source of the document page.

    Returns:
        The collected text; an empty string when the page contains no
        matching links.
    """
    url_list = re.findall(r'(https.*?0.json.*?)\\x22}', content)
    url_list = [addr.replace("\\\\\\/", "/") for addr in url_list]
    pieces = []
    # dict.fromkeys removes duplicate links while keeping first-seen order.
    # Iterating a set() here would visit the pages in arbitrary order and
    # scramble the resulting text.
    for url in dict.fromkeys(url_list):
        page = session.get(url).content.decode('gbk')
        # y starts as the int 0 and the captured values are strings, so the
        # first fragment of every page always begins on a new line.
        y = 0
        for text, y_value in re.findall(r'"c":"(.*?)".*?"y":(.*?),', page):
            # A "y" value different from the previous fragment's means the
            # fragment starts a new line.
            if y != y_value:
                y = y_value
                pieces.append('\n')
            # Turn the \uXXXX escape sequences in the fragment into characters.
            pieces.append(text.encode('utf-8').decode('unicode_escape', 'ignore'))
    return "".join(pieces)
def parse_pdf(content):
    """Extract the text of a pdf document from its page source.

    Works like the doc parser: every distinct escaped "0.json" link in the
    page source is downloaded once, in first-appearance order, and the text
    fragments it contains are joined into one string.

    Args:
        content: Decoded source of the document page.

    Returns:
        The collected text; an empty string when the page contains no
        matching links.
    """
    url_list = re.findall(r'(https.*?0.json.*?)\\x22}', content)
    url_list = [addr.replace("\\\\\\/", "/") for addr in url_list]
    pieces = []
    # De-duplicate with dict.fromkeys so the links keep their original order;
    # a set() has no defined iteration order and would shuffle the pages.
    for url in dict.fromkeys(url_list):
        page = session.get(url).content.decode('gbk')
        # y starts as the int 0 and the captured values are strings, so the
        # first fragment of every page always begins on a new line.
        y = 0
        for text, y_value in re.findall(r'"c":"(.*?)".*?"y":(.*?),', page):
            # A "y" value different from the previous fragment's means the
            # fragment starts a new line.
            if y != y_value:
                y = y_value
                pieces.append('\n')
            # Turn the \uXXXX escape sequences in the fragment into characters.
            pieces.append(text.encode('utf-8').decode('unicode_escape', 'ignore'))
    return "".join(pieces)
def parse_ppt(doc_id, title):
    """Download the slide images of a ppt document into a folder named after it.

    The image URLs are read from the getbcsurl endpoint and each image is
    saved as ``<index>.jpg`` (numbered from 0) inside ``F:\\桌面\\Files\\<title>``.
    A confirmation line is printed once all images are written.

    Args:
        doc_id: Document id taken from the Wenku URL.
        title: Document title; used as the output folder name and in the
            confirmation message.
    """
    content_url = 'https://wenku.baidu.com/browse/getbcsurl?doc_id=' + doc_id + '&pn=1&rn=9999&type=ppt'
    content = session.get(content_url).content.decode('gbk')
    # Capture each "zoom" value and remove every backslash to get a plain URL.
    url_list = re.findall('{"zoom":"(.*?)","page"', content)
    url_list = [addr.replace('\\', '') for addr in url_list]

    path = "F:\\桌面\\Files" + "\\" + title
    # makedirs creates missing parent folders and accepts an existing folder;
    # os.mkdir raises FileNotFoundError when the parent does not exist yet.
    os.makedirs(path, exist_ok=True)

    for index, url in enumerate(url_list):
        image = session.get(url).content
        with open(os.path.join(path, str(index) + '.jpg'), 'wb') as f:
            f.write(image)
    print("图片保存在" + title + "文件夹")
def save_file(title, filename, content):
    """Write text to a UTF-8 encoded file and print a confirmation.

    Args:
        title: Name shown in the confirmation message.
        filename: Path of the file to create or overwrite.
        content: Text to write.
    """
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)
    print("文件" + title + "保存成功")
def main():
    """Run the interactive download loop for Baidu Wenku documents.

    Each round asks for a document format (typed by name, e.g. ``ppt``) and a
    document URL, fetches the page and hands it to the matching ``parse_*``
    function. Text results are written with ``save_file`` into the
    module-level ``path`` folder; ``parse_ppt`` stores its images itself.
    Entering ``0`` as the format ends the loop. Any exception raised while
    handling a request is printed and the loop continues with a new prompt.
    """
    print("欢迎来到百度文库文件下载:")
    print("-----------------------\r\n")
    while True:
        try:
            # The fifth entry used to repeat "ppt" although pdf is the fifth
            # supported format. xls is listed but falls through to the
            # "not yet developed" message below.
            print("1.doc \n 2.txt \n 3.ppt \n 4.xls\n 5.pdf\n")
            # Tolerate surrounding whitespace and upper-case input.
            types = input("请输入需要下载文件的格式(0退出):").strip().lower()
            if types == "0":
                break
            if types not in ('txt', 'doc', 'pdf', 'ppt'):
                print("抱歉功能尚未开发")
                continue
            url = input("请输入下载的文库URL地址:").strip()

            # Page content: decode as UTF-8 only when the page declares it;
            # any other or missing declaration falls back to GBK instead of
            # failing with an IndexError.
            response = session.get(url)
            charsets = re.findall('charset=(.*?)"', response.text)
            if charsets and charsets[0].lower() == 'utf-8':
                code = 'utf-8'
            else:
                code = 'gbk'
            content = response.content.decode(code)

            # Document id: the URL segment between "view/" and ".html".
            doc_id = re.findall(r'view/(.*?)\.html', url)[0]

            if types == 'txt':
                # Pages that embed an md5sum can be parsed directly; the
                # others need the extra getdocinfo request in parse_txt1.
                if re.findall('"md5sum":"(.*?)",', content):
                    result = parse_txt2(content, code, doc_id)
                    title = re.findall(r'<title>(.*?). ', content)[0]
                else:
                    result = parse_txt1(code, doc_id)
                    title = re.findall(r"title.*?:.*?'(.*?)'", content)[0]
                save_file(title, os.path.join(path, title + ".txt"), result)
            elif types == 'doc':
                title = re.findall(r"title.*?:.*?'(.*?)'", content)[0]
                result = parse_doc(content)
                save_file(title, os.path.join(path, title + ".doc"), result)
            elif types == 'pdf':
                title = re.findall(r"title.*?:.*?'(.*?)'", content)[0]
                result = parse_pdf(content)
                # The extracted pdf text is stored as a plain .txt file.
                save_file(title, os.path.join(path, title + ".txt"), result)
            elif types == 'ppt':
                title = re.findall(r"title.*?:.*?'(.*?)'", content)[0]
                parse_ppt(doc_id, title)
        except Exception as e:
            # Top-level boundary: report the failure and keep prompting.
            print(e)


if __name__ == '__main__':
    main()
欢迎来到百度文库文件下载:
-----------------------
1.doc
2.txt
3.ppt
4.xls
5.ppt
请输入需要下载文件的格式(0退出): ppt
请输入下载的文库URL地址: https://wenku.baidu.com/view/b906673ed1d233d4b14e852458fb770bf68a3b18.html?fr=search
图片保存在精品课件-爬虫技术文件夹
1.doc
2.txt
3.ppt
4.xls
5.ppt
请输入需要下载文件的格式(0退出): ppt
请输入下载的文库URL地址: https://wenku.baidu.com/view/26f8556af021dd36a32d7375a417866fb94ac0e3.html?fr=search
图片保存在网络爬虫基本原理文件夹
1.doc
2.txt
3.ppt
4.xls
5.ppt
请输入需要下载文件的格式(0退出): 0
# Display the local screenshots ./Images/ppt_5.png and ./Images/ppt_6.png.
# NOTE(review): a notebook cell renders only its final bare expression, so
# these two calls presumably belong to separate cells — confirm.
Image("./Images/ppt_5.png",width="600px",height="400px")
Image("./Images/ppt_6.png",width="600px",height="400px")