爬虫

# -*- coding: utf-8 -*-
# @Time : 2019/5/31 19:33
# @Author : zejin
# @File : pachong.py

from urllib import request
import re

class Analysis:
    """Fetch a web page and extract name/address pairs from its cover blocks.

    The page at ``url`` is downloaded, every ``<div class="cover">`` element
    is located with ``root_patten``, and the ``alt`` text and ``href`` target
    inside each element are collected.
    """

    # Page that is downloaded by go().
    url = 'https://book.douban.com/'
    # Raw strings keep the regex escapes intact without triggering Python's
    # "invalid escape sequence" warning; the pattern values are unchanged.
    # Captures the inner HTML of each <div class="cover"> element (lazy match).
    root_patten = r'<div class="cover">([\s\S]*?)</div>'
    # Captures the value of an alt="..." attribute that closes the tag.
    name_patten = r'alt="([\s\S]*?)">'
    # Captures the value of an href="..." attribute followed by a title attribute.
    adress_patten = r'href="([\s\S]*?)" title'

    def __face_connect(self):
        """Download ``self.url`` and return the response body as text.

        Returns:
            The response body decoded as UTF-8.
        """
        # The context manager closes the HTTP response even if reading or
        # decoding raises.
        with request.urlopen(self.url) as response:
            return str(response.read(), encoding='utf-8')

    def __analysis(self, htmls):
        """Extract name/address matches from every cover block in ``htmls``.

        Args:
            htmls: HTML text to scan.

        Returns:
            A list with one dict per cover block. Each dict has the keys
            ``"name"`` and ``"adress"``, and each value is the list of all
            matches found inside that block (possibly empty).
        """
        return [
            {
                "name": re.findall(self.name_patten, fragment),
                "adress": re.findall(self.adress_patten, fragment),
            }
            for fragment in re.findall(self.root_patten, htmls)
        ]

    def __refine(self, ancors):
        """Placeholder for post-processing of the extracted entries.

        Not implemented: does nothing and returns ``None``.
        """
        pass

    def go(self):
        """Download the page, extract the entries and print them."""
        htmls = self.__face_connect()
        ancors = self.__analysis(htmls)
        # __refine() is intentionally not applied; it is still a stub.
        print(ancors)

# Script driver: build the scraper and run it once. These are top-level
# statements, so they also execute whenever this module is imported.
analysis = Analysis()
analysis.go()
posted @ 2019-06-01 14:48  菜菜_包包  阅读(192)  评论(0)  编辑  收藏  举报