Python3 爬虫:爬取古诗文(来源:古诗文网)

 1 # -*- coding: utf-8 -*-
 2 #author:zxy
 3 #Date:2018-10-19
 4 
 5 
 6 import requests
 7 import re
 8 HEADERS={
 9     "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
10                  "Chrome/69.0.3497.100 Safari/537.36"
11 }
12 
13 
def _clean_content(fragment):
    """Turn the inner HTML of one poem body into plain text.

    All HTML tags are removed, then a line break is placed after every
    Chinese full stop so each sentence sits on its own line. Whitespace
    that already followed a full stop is folded into that single line
    break, and the result is stripped at both ends.
    """
    plain = re.sub(r'<.*?>', '', fragment)
    # The original pattern here was the empty string, which matches at every
    # position and therefore inserted "。\n" between all characters.
    return re.sub(r'。\s*', '。\n', plain).strip()


def _extract_poems(text):
    """Pull title/dynasty/author/content records out of a listing page.

    Each field is collected with its own regular expression and the four
    resulting lists are paired up by position, so the number of poems
    returned equals the length of the shortest list.

    NOTE(review): because the lists are matched independently, a page entry
    that lacks one of the fields would shift every later record; the
    patterns assume each entry carries all four -- confirm against the
    current page markup.
    """
    # Title: first <b>...</b> following a <div class="cont">.
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    # Dynasty: text of the first link inside <p class="source">.
    dynasties = re.findall(r'<p\sclass="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    # Author: text of the second link inside <p class="source">.
    authors = re.findall(r'<p\sclass="source">.*?<a.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    # Body: inner HTML of <div class="contson" ...>.
    content_tags = re.findall(r'<div\sclass="contson".*?>(.*?)</div>', text, re.DOTALL)
    contents = [_clean_content(tag) for tag in content_tags]

    return [
        {
            "title": title,
            "dynasty": dynasty,
            "author": author,
            "content": content,
        }
        for title, dynasty, author, content in zip(titles, dynasties, authors, contents)
    ]


def parse_url(url):
    """Download one listing page and save the poems found on it.

    The page at *url* is fetched with the module-level ``HEADERS``, parsed
    with regular expressions, and written to ``poems.txt`` in the current
    working directory (overwriting any existing file, UTF-8 encoded). Each
    record is written as the title, then the dynasty and the author on
    tab-indented lines, then the content followed by two blank lines. The
    content of every poem is also echoed to standard output.

    Args:
        url: Address of the listing page to scrape.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, headers=HEADERS, timeout=10)
    # Fail loudly instead of parsing an error page into an empty file.
    response.raise_for_status()
    poems = _extract_poems(response.text)

    with open('poems.txt', 'w', encoding="utf-8") as f:
        for poem in poems:
            print(poem["content"])
            f.write("{}\n".format(poem["title"]))
            f.write("\t{}\n".format(poem["dynasty"]))
            f.write("\t{}\n".format(poem["author"]))
            f.write("{}\n\n\n".format(poem["content"]))
51 
if __name__ == "__main__":
    # When run as a script, scrape the first listing page of the site.
    start_url = "https://www.gushiwen.org/default_1.aspx"
    parse_url(start_url)

 

posted @ 2018-10-19 11:06  浅忆~  阅读(1965)  评论(0)  编辑  收藏  举报