爬虫实例:当当网书籍介绍
import json import re import requests from requests import RequestException def get_one_page(url): try: response = requests.get(url) if response.status_code == 200: # print('2sas') print(response.text) return response.text return None except RequestException: return None def parse_one_page(html): # 加上re.S后, .将会匹配换行符 pattern = re.compile('<li>.*?list_num.*?>(.*?)</div>.*?pic.*?src="(.*?)".*?/></a>.*?name"><a.*?title="(.*?)">.*?tuijian">(.*?)</span>.*?publisher_info.*?title="(.*?)".*?biaosheng.*?<span>(.*?)</span>.*?</li>',re.S) items = re.findall(pattern,html) for item in items: yield { 'index':item[0], 'iamge':item[1], 'title':item[2], 'tuijian':item[3], 'author':item[4], 'times':item[5], } def write_content_to_file(content): with open('book.txt', 'a', encoding='UTF-8') as f: f.write(json.dumps(content, ensure_ascii=False) + '\n') f.close() def main(page): url = "http://bang.dangdang.com/" \ "books/fivestars/01.00.00.00.00.00-recent30-0-0-1-" + str(page) html = get_one_page(url) parse_one_page(html) for item in parse_one_page(html): print(item) write_content_to_file(item) if __name__ == "__main__": for i in range(1,2): main(i)