Scraping the h1 tag info from the fanli.com deals pages, following 锁薇姐姐's tutorial ~ - 《狗嗨默示录》

# -*- coding: UTF-8 -*-

import urllib.request
import urllib.error
import re
from bs4 import BeautifulSoup
import time
import socket

# URLs of the pages to scrape
fanly_url = "http://zhide.fanli.com/p"           # paginated listing pages: p1, p2, ...
format_url = "http://zhide.fanli.com/detail/1-"  # product detail link prefix; a six-digit id is appended
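
For orientation, here is how the two templates above expand; a quick sketch using the constants just defined (the six-digit id is made up):

print(fanly_url + "2")        # -> http://zhide.fanli.com/p2 (the second listing page)
print(format_url + "123456")  # -> http://zhide.fanli.com/detail/1-123456 (a product detail page)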

class Faly():  # class names are capitalized
    def __init__(self):  # constructor
        self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'  # browser User-Agent string
        self.html_data = []  # list holding the fetched page sources

    # fetch the source of each listing page
    def get_html(self, start_page=1, end_page=7):
        socket.setdefaulttimeout(15)  # set the timeout once, before the first request
        for i in range(start_page, end_page + 1):
            rt = urllib.request.Request(fanly_url + str(i))  # build a request object for this page
            rt.add_header('User-Agent', self.user_agent)  # header name is 'User-Agent', not 'User_Agent'
            try:
                my_data = urllib.request.urlopen(rt).read().decode('UTF-8')  # open the page and read its source
                # print(my_data)
                self.html_data.append(my_data)
                time.sleep(2)  # pause between requests
            except urllib.error.URLError as e:
                if hasattr(e, 'reason'):  # check whether the error carries a reason
                    print("Connection failed:", e.reason)
        return ''.join(self.html_data)  # one string for the regex stage downstream

#html = Faly().get_html()
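
For reference, the same request pattern in isolation; a minimal sketch, assuming the site is reachable, and using urlopen's per-request timeout as an alternative to socket.setdefaulttimeout:

import urllib.request

req = urllib.request.Request("http://zhide.fanli.com/p1")
req.add_header('User-Agent', 'Mozilla/5.0')  # the default Python UA is often rejected, so a browser UA is sent
html = urllib.request.urlopen(req, timeout=15).read().decode('UTF-8')
print(len(html))  # rough size of the fetched source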

# collect the product links
class GetData():
    def __init__(self):
        self.html = Faly().get_html()  # the fetched page sources
        self.href = []  # raw data-id="xxxxxx" matches
        self.ls = []    # de-duplicated six-digit ids
        self.url = []   # full product links

    # build the product detail URLs
    def get_hrefurl(self):
        reg = r'data-id="\d{6}"'  # six-digit product id attribute
        result = re.compile(reg)  # pre-compile for efficiency
        tag = result.findall(self.html)
        # tag = re.findall(result, self.html)
        # print(tag)
        for i in tag:
            self.href.append(i)
        # print(self.href)

        # de-duplicate
        reg2 = r"\d{6}"
        result2 = re.findall(reg2, str(self.href))
        if result2:
            for data in result2:
                if data not in self.ls:
                    self.ls.append(data)
                    url = format_url + str(data)  # full product link
                    self.url.append(url)
                    # print(self.url[-1])
        return self.url
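
The extraction and de-duplication steps can be tried offline on a hard-coded snippet (a minimal sketch; the markup and ids are fabricated). A capture group pulls out the digits in a single pass instead of the two passes used above:

import re

sample = '<div data-id="123456"></div><div data-id="654321"></div><div data-id="123456"></div>'
ids = re.findall(r'data-id="(\d{6})"', sample)  # the capture group returns just the digits
unique = list(dict.fromkeys(ids))               # order-preserving de-duplication (Python 3.7+)
print(["http://zhide.fanli.com/detail/1-" + i for i in unique])
# ['http://zhide.fanli.com/detail/1-123456', 'http://zhide.fanli.com/detail/1-654321']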

# fetch the product info
class Href_mg():
    def __init__(self):
        self.list = GetData().get_hrefurl()
        self.txt_list = []  # product info

    def show_mg(self):
        for url in self.list:  # iterate over the product links directly
            mg = urllib.request.Request(url)
            try:
                req = urllib.request.urlopen(mg).read()
                soup = BeautifulSoup(req, "html.parser")
                txt = soup.find_all('h1')  # grab the <h1> tags
                self.txt_list.append(txt)
                print(self.txt_list[-1])  # print the latest entry
            except urllib.error.URLError as e:
                print(e.reason)
        return str(self.txt_list)
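
What find_all('h1') returns can also be checked offline (a minimal sketch with fabricated markup). Note that get_text() would strip the tags right here, which would make the string cleanup in the main block unnecessary; this script instead keeps the Tag objects and cleans them up at the end:

from bs4 import BeautifulSoup

page = '<html><body><h1 class="title">Sample product</h1></body></html>'
soup = BeautifulSoup(page, "html.parser")
for h1 in soup.find_all('h1'):
    print(h1.get_text(strip=True))  # -> Sample product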

if __name__ == "__main__":  # script entry point
    path = "yaozhi.txt"
    with open(path, 'a', encoding='utf-8') as file:  # utf-8 so Chinese titles write cleanly
        data = Href_mg().show_mg()  # fetch the product info
        data = re.sub(r'<[^>]*>', '', data)  # strip the leftover <h1> tags
        data_s = data.replace(u'\xa0', u'').replace(',', '\n').replace('全网最低', '').replace('[', '').replace(']', '').replace(' ', '').strip()
        file.write(data_s)
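
The final cleanup can be verified in isolation (a minimal sketch; the input string mimics what str(self.txt_list) looks like for two fabricated products):

import re

raw = '[[<h1>商品A\xa0全网最低</h1>], [<h1>商品B</h1>]]'  # shaped like str(self.txt_list)
no_tags = re.sub(r'<[^>]*>', '', raw)  # drop the <h1> wrappers
cleaned = no_tags.replace('\xa0', '').replace('全网最低', '').replace('[', '').replace(']', '').replace(' ', '')
print(cleaned.replace(',', '\n'))
# 商品A
# 商品B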

posted @ 2017-08-04 12:12  李·狗嗨