Today I learned the basics of writing a web crawler in Python.

1. Creating a directory and writing a file:

import os

def mkdir(path):
    if os.path.exists(path):  # check whether the directory already exists; otherwise makedirs may raise an error
        print("The path already exists.")
    else:
        os.makedirs(path)  # create the directory
        print("Done.")

def write(path, text):  # renamed the parameter so it does not shadow the built-in str
    with open(path, "w") as file:  # write the file
        file.write(text)

def main():
    mkdir("test")
    write("test/test.txt", "hello world")

if __name__ == "__main__":
    main()
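Incidentally, since Python 3.2 os.makedirs accepts an exist_ok flag, so the existence check can be collapsed into a single call. A minimal sketch:

import os

def mkdir(path):
    os.makedirs(path, exist_ok=True)  # no FileExistsError if the directory already exists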

2. Fetching the HTML source of a site (if it is reachable):

from bs4 import BeautifulSoup
import requests

def main():
    html = requests.get("https://www.baidu.com")  # request the page
    html.encoding = "utf-8"  # the page is in Chinese, so set the character set explicitly
    soup = BeautifulSoup(html.text, "lxml")  # parse the page with BeautifulSoup (an HTML parser; no regular expressions needed)
    print(soup.prettify())  # pretty-print the source (only reformats it, no data is removed)

if __name__ == "__main__":
    main()
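In practice it also pays to check that the request actually succeeded, and to send a browser-like User-Agent, since some sites reject the default one. A minimal sketch using the same requests API:

import requests

headers = {"user-agent": "Mozilla/5.0"}  # a browser-like User-Agent, shortened here for illustration
response = requests.get("https://www.baidu.com", headers=headers, timeout=10)
response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx status codes
print(response.status_code)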

3. Extracting specific tag elements from a site's source (if it is reachable):

import requests
from bs4 import BeautifulSoup

def write_to_file(content):
    with open("save.txt", "a", encoding="utf-8") as f:
        f.write(content)

def get_blog_info(url):
    html = requests.get(url)
    soup = BeautifulSoup(html.text, "lxml")
    print(soup.title)  # all sorts of elements are reachable as attributes of the parsed tree
    print("=" * 100)
    print(type(soup.title))
    print("=" * 100)
    print(type(soup.title.string))
    print("=" * 100)
    print(soup.title.string)
    print("=" * 100)
    print(soup.head)
    print("=" * 100)
    print(soup.p)

def main():
    blog_url = "https://www.cnblogs.com/sgh1023"
    get_blog_info(blog_url)

if __name__ == "__main__":
    main()
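Attribute access such as soup.title only ever returns the first matching element. To collect all matches, find_all (or select with a CSS selector) is the usual tool; a minimal sketch, reusing the soup object from above:

for link in soup.find_all("a", href=True):  # every <a> tag that carries an href attribute
    print(link["href"], link.get_text(strip=True))

headings = soup.select("h2 a")  # CSS selectors work too; this particular selector is only an illustration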

4. Downloading a single image (if it is reachable):

import requests
import os

tot = 0
path = "save"

def mkdir(path):
    if os.path.exists(path):
        return
    else:
        os.makedirs(path)

def save(content):
    global tot, path
    mkdir(path)
    with open(path + "/" + str(tot) + ".png", "wb") as file:  # the with block closes the file automatically
        file.write(content)
        tot = tot + 1

def download_image(url):  # download the image; success is not guaranteed
    print("Now downloading...", tot)
    response = requests.get(url)
    save(response.content)
    print("Done!")

def main():
    download_image("https://www.baidu.com/img/pc_1c6e30772d5e4103103bd460913332f9.png")

if __name__ == "__main__":
    main()
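For large files it can be better to stream the body in chunks rather than hold it all in memory, which requests supports directly. A minimal sketch (the chunk size is an arbitrary choice):

import requests

def download_image_streamed(url, filename):
    with requests.get(url, stream=True, timeout=10) as response:
        response.raise_for_status()
        with open(filename, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):  # read the body piece by piece
                file.write(chunk)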

5. Downloading all the images on a page:

import requests
import urllib.request  # the bare "import urllib" is not enough to use urllib.request
import os
from urllib.parse import urljoin
from bs4 import BeautifulSoup

tot = 0
path = "save"

def mkdir(path):
    if os.path.exists(path):
        return
    else:
        os.makedirs(path)

def save(content):
    global tot, path
    mkdir(path)
    with open(path + "/" + str(tot) + ".png", "wb") as file:  # every file is saved with a .png suffix regardless of its real type
        file.write(content)
        tot = tot + 1
######################################################################
def get_html_content(url):  # fetch the page source
    req = urllib.request.Request(url)  # add a header to masquerade as a Google Chrome browser (this snippet is copied from elsewhere)
    req.add_header('user-agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
    response = urllib.request.urlopen(req)
    html = response.read()
    return html

def url_exist(url):  # check whether the URL is reachable
    try:
        response = requests.get(url)
        return response.ok  # treat 4xx/5xx responses as unavailable too
    except requests.RequestException:
        return False

def download_image(url):  # download one image
    print("Now downloading...", tot, url)
    if url_exist(url):  # check whether the URL is reachable
        response = requests.get(url)
        save(response.content)
        print("Done!")
    else:
        print("Unavailable!")
######################################################################
def process(base, src):  # normalize the URL: urljoin resolves relative and protocol-relative src values against the page URL
    return urljoin(base, src)

def get_image(url):
    soup = BeautifulSoup(get_html_content(url), "lxml")
    items = soup.find_all("img", {"src": True})
    for i in items:
        download_image(process(url, i["src"]))

def main():
    url = "https://www.bilibili.com"
    get_image(url)

if __name__ == "__main__":
    main()

Of course, the arguments to find_all depend on the page you are scraping.
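For example, some pages lazy-load images and keep the real address in a data-src attribute rather than src, in which case only the attribute filter changes. A sketch reusing the functions above (the data-src name is an assumption about the target page, not something every site uses):

items = soup.find_all("img", {"data-src": True})  # lazy-loaded images; attribute name assumed
for i in items:
    download_image(process(url, i["data-src"]))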

6. Downloading .jpg files with a regular expression:

import requests
import urllib.request
import os
import re

tot = 0
path = "save"

def mkdir(path):
    if os.path.exists(path):
        return
    else:
        os.makedirs(path)

def save(content):
    global tot, path
    mkdir(path)
    with open(path + "/" + str(tot) + ".jpg", "wb") as file:
        file.write(content)
        tot = tot + 1
################################################################################
def get_html_content(url):  # fetch the page source
    req = urllib.request.Request(url)  # add a header to masquerade as a Google Chrome browser (this snippet is copied from elsewhere)
    req.add_header('user-agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
    response = urllib.request.urlopen(req)
    html = response.read().decode("utf-8", errors="ignore")  # decode the bytes so the regex runs on text
    return html

def url_exist(url):  # check whether the URL is reachable
    try:
        response = requests.get(url)
        return response.ok
    except requests.RequestException:
        return False

def download_image(url):  # download one image
    print("Now downloading...", tot, url)
    if url_exist(url):  # check whether the URL is reachable
        response = requests.get(url)
        save(response.content)
        print("Done!")
    else:
        print("Unavailable!")
################################################################################
def get_image(url):
    html = get_html_content(url)
    items = re.findall(r'https://.*?\.jpg', html, re.S)  # regular expression: non-greedy match up to the next ".jpg", so the matches are already absolute URLs
    for i in items:
        download_image(i)
        if tot == 100:
            print("Too many images! Stop what you are doing!")
            break

def main():
    url = "https://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%D1%BC%D7%D3"
    get_image(url)

if __name__ == "__main__":
    main()
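The pattern above is deliberately loose: with re.S, the non-greedy .*? can still swallow quotes and markup on messy pages. A slightly tighter sketch excludes characters that cannot appear in a URL and drops duplicates with a set; it is still a heuristic, not a full URL parser:

import re

pattern = r'https?://[^\s"\'<>]+?\.jpg'  # stop at whitespace, quotes, and angle brackets
for image_url in set(re.findall(pattern, html)):  # set() removes duplicate matches
    download_image(image_url)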

 
