大华产品信息爬虫
大华产品信息爬虫
语言环境:Python 3.7
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Crawler for Dahua product pages (written for Python 3.7).

Flow: ``product()`` reads the product overview page and collects the
product-list links, ``dhinfo()`` collects product ids from each list page,
``h2class()`` reads each product page for its ``<h2>`` headings, and
``whtml()`` saves the matching AJAX page into a per-day output directory.
"""
import os
import re
import time
import urllib.request
from urllib import request

import requests
from bs4 import BeautifulSoup

# Output directory name, e.g. "DH20240131".  Defined at module level (not only
# under ``__main__``) so the functions also work when this file is imported.
now = 'DH' + time.strftime("%Y%m%d", time.localtime(time.time()))

# Seconds to wait on any single HTTP request before giving up; without a
# timeout one stalled connection would hang the whole crawl forever.
REQUEST_TIMEOUT = 30

# Browser-like User-Agent sent when fetching a product page in h2class().
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

# Characters replaced by "_" in file names: the original set (/ ! @ # $) plus
# the characters Windows reserves (\ : * ? " < > |) and control characters
# (e.g. newlines coming from the heading text).
_UNSAFE_TITLE_CHARS = re.compile(r'[/!@#$\\:*?"<>|\x00-\x1f]')

# Product page links; the captured group is the numeric product id.
_PRODUCT_INFO_RE = re.compile(
    r"https://www\.dahuatech\.com/product/info/(\d+)\.html")

# Product list links.  ``[1-9]`` is a real digit range here; the previous
# ``[1,9]`` only matched "1", "," or "9" and skipped ids starting with 2-8.
_PRODUCT_LIST_RE = re.compile(
    r"https://www\.dahuatech\.com/product/lists/[1-9]\d*\.html\?area=[1-9]\d*")


def _unique_in_order(items):
    """Return *items* without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(items))


def _safe_title(title):
    """Return *title* with characters unsuitable for a file name set to "_"."""
    return _UNSAFE_TITLE_CHARS.sub('_', title)


def whtml(title, ajax_url):
    """Download *ajax_url* and save the raw bytes as ``<now>/<title>.html``.

    Args:
        title: File name (without extension) for the saved page.
        ajax_url: URL whose response body is written to disk unchanged.

    Raises:
        urllib.error.URLError: If the download fails.
        OSError: If the directory or file cannot be written, or the
            download times out.
    """
    print(title, ajax_url)
    with request.urlopen(ajax_url, timeout=REQUEST_TIMEOUT) as response:
        content = response.read()
    os.makedirs(now, exist_ok=True)
    # os.path.join instead of a hard-coded "\\" so the file lands inside the
    # directory on every platform, not only on Windows.
    path = os.path.join(now, title + ".html")
    with open(path, "wb") as fp:
        fp.write(content)


def h2class(url_info, ajax_url):
    """Save *ajax_url* once per ``<h2>`` heading found on a product page.

    The product page *url_info* is fetched with a browser User-Agent and
    parsed; each ``<h2>`` inside ``<div class="info-font fr">`` supplies a
    file title (sanitised, suffixed with ``--HHMMSS``) that is passed to
    ``whtml()`` together with *ajax_url*.  Pages without that ``<div>`` are
    skipped with a notice instead of aborting the crawl.

    Args:
        url_info: URL of the product detail page.
        ajax_url: URL of the page to save for that product.
    """
    req = urllib.request.Request(url=url_info, headers=_HEADERS)
    with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT) as res:
        html = res.read().decode('utf-8')
    soup = BeautifulSoup(html, "html.parser")
    container = soup.find('div', class_='info-font fr')
    if container is None:
        print('no product headings found, skipped:', url_info)
        return
    suffix = '--' + time.strftime("%H%M%S", time.localtime(time.time()))
    for h2 in container.find_all('h2'):
        whtml(_safe_title(h2.get_text()) + suffix, ajax_url)


def dhinfo(url):
    """Process every product linked from the product list page *url*.

    Product ids are taken from ``/product/info/<id>.html`` links in the page
    text, de-duplicated in first-seen order, and each one is handed to
    ``h2class()`` with a one-second pause before it.

    Args:
        url: URL of a product list page.
    """
    data = requests.get(url, timeout=REQUEST_TIMEOUT).text
    for product_id in _unique_in_order(_PRODUCT_INFO_RE.findall(data)):
        time.sleep(1)  # pause between products to go easy on the server
        url_info = "https://www.dahuatech.com/product/info/%s.html" % product_id
        ajax_url = "https://www.dahuatech.com/ajax/product/%s/2" % product_id
        h2class(url_info, ajax_url)


def product():
    """Crawl every product list linked from the product overview page.

    List links of the form ``/product/lists/<n>.html?area=<m>`` are collected
    from ``https://www.dahuatech.com/product.html``, de-duplicated in
    first-seen order, printed, and passed to ``dhinfo()`` with a one-second
    pause before each.
    """
    data = requests.get('https://www.dahuatech.com/product.html',
                        timeout=REQUEST_TIMEOUT).text
    for list_url in _unique_in_order(_PRODUCT_LIST_RE.findall(data)):
        print(list_url)
        time.sleep(1)  # pause between list pages to go easy on the server
        dhinfo(list_url)


if __name__ == '__main__':
    product()
--... ...--
iaoexl at outlook dot com
-------------------------------------------------------------
鱼跃此时海,花开彼岸天。只缘有余庆,翩翩在此间。