# Web crawler (爬虫)
# -*- coding: utf-8 -*-
# @Time : 2019/5/31 19:33
# @Author : zejin
# @File : pachong.py
from urllib import request
import re
class Analysis():
    """Download one web page and pull book-cover entries out of it with regexes.

    The page address and the three regular expressions are class attributes.
    ``go`` is the only public entry point: it fetches ``url``, extracts the
    entries and prints them.
    """

    # Page downloaded by ``go``.
    url = 'https://book.douban.com/'
    # Raw strings keep ``\s``/``\S`` as regex escapes rather than (invalid)
    # string escapes; the resulting pattern text is unchanged.
    # Captures the inner HTML of each ``<div class="cover">`` element.
    root_patten = r'<div class="cover">([\s\S]*?)</div>'
    # Captures the value of an ``alt="..."`` attribute that closes its tag.
    name_patten = r'alt="([\s\S]*?)">'
    # Captures the value of an ``href="..."`` that is followed by `` title``.
    adress_patten = r'href="([\s\S]*?)" title'

    def __face_connect(self):
        """Fetch ``self.url`` and return the response body decoded as UTF-8.

        Errors raised by ``urlopen`` or by the strict UTF-8 decode are not
        caught here and propagate to the caller.
        """
        # The context manager closes the HTTP response even if reading fails.
        with request.urlopen(self.url) as response:
            return response.read().decode('utf-8')

    def __analysis(self, htmls):
        """Extract one entry per ``root_patten`` match found in ``htmls``.

        Args:
            htmls: The page source as a ``str``.

        Returns:
            A list of dicts, one per matched cover fragment, in page order.
            ``"name"`` and ``"adress"`` each hold the *list* of all matches
            of ``name_patten`` / ``adress_patten`` inside that fragment
            (possibly empty). An input with no cover fragment yields ``[]``.
        """
        return [
            {
                "name": re.findall(self.name_patten, fragment),
                "adress": re.findall(self.adress_patten, fragment),
            }
            for fragment in re.findall(self.root_patten, htmls)
        ]

    def __refine(self, ancors):
        """Placeholder for post-processing; does nothing and returns None."""
        pass

    def go(self):
        """Fetch the page, extract the entries and print them to stdout."""
        htmls = self.__face_connect()
        ancors = self.__analysis(htmls)
        # ``__refine`` is still a stub, so it is intentionally not applied.
        print(ancors)
# Script entry: create the scraper and run one fetch / extract / print pass.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so these
# statements (including the network request made by `go`) also run on import.
analysis = Analysis()
analysis.go()
# 窈窕包包,君子好逑