大华产品信息爬虫

大华产品信息爬虫

语言环境：Python 3.7

 1 #!/usr/bin/env python
 2 # -*- coding: utf-8 -*-
 3 import os, re, time, requests
 4 import urllib.request
 5 from bs4 import BeautifulSoup
 6 from urllib import request
 7 
 8 
 9 def whtml(title, ajax_url):
10     print(title, ajax_url)
11     wp = request.urlopen(ajax_url)
12     content = wp.read()
13     if not os.path.exists(now):
14         os.mkdir(now)
15     name = now + "\\" + title + ".html"
16     fp = open(name, "w+b")
17     fp.write(content)
18     fp.close()
19 
20 
def h2class(url_info, ajax_url):
    """Save the AJAX page once per ``<h2>`` heading on a product page.

    Fetches ``url_info`` with a browser-like User-Agent, parses it, and
    looks up the ``<div class="info-font fr">`` element. For every ``<h2>``
    inside that div, the heading text is turned into a file title (the
    characters ``/!@#$`` are replaced by ``_`` and a ``--HHMMSS`` suffix is
    appended) and ``whtml(title, ajax_url)`` is called.

    Args:
        url_info: URL of the product info page to read the headings from.
        ajax_url: URL passed through to ``whtml`` for downloading.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    req = urllib.request.Request(url=url_info, headers=headers)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as res:
        html = res.read().decode('utf-8')
    soup = BeautifulSoup(html, "html.parser")
    info_div = soup.find('div', class_='info-font fr')
    if info_div is None:
        # find() returns None when the div is absent; skip this page rather
        # than aborting the whole crawl with an AttributeError.
        print('no product info block found, skipped:', url_info)
        return
    now2 = '--' + time.strftime("%H%M%S", time.localtime(time.time()))
    for h2 in info_div.find_all('h2'):
        title = h2.get_text()
        title = re.sub('[/!@#$]', '_', title) + now2
        whtml(title, ajax_url)
34 
35 
def dhinfo(url):
    """Process every product info page linked from a product list page.

    Downloads ``url``, extracts the numeric ids of all
    ``https://www.dahuatech.com/product/info/<id>.html`` links, and for each
    distinct id (in order of first appearance) calls ``h2class`` with the
    product info URL and the matching AJAX URL, sleeping one second before
    each call.

    Args:
        url: URL of the product list page to scan.
    """
    data = requests.get(url).text
    # Dots are escaped so they match a literal "." rather than any character.
    product_ids = re.findall(
        r"https://www\.dahuatech\.com/product/info/(\d+)\.html", data)
    # dict.fromkeys drops duplicates while keeping first-seen order.
    for product_id in dict.fromkeys(product_ids):
        time.sleep(1)
        url_info = "https://www.dahuatech.com/product/info/%s.html" % product_id
        ajax_url = "https://www.dahuatech.com/ajax/product/%s/2" % product_id
        h2class(url_info, ajax_url)
45 
46 
def product():
    """Crawl every product list page linked from the product overview page.

    Downloads ``https://www.dahuatech.com/product.html``, extracts all
    ``.../product/lists/<id>.html?area=<id>`` links, and for each distinct
    link (in order of first appearance) prints it, sleeps one second, and
    calls ``dhinfo`` on it.
    """
    data = requests.get('https://www.dahuatech.com/product.html').text
    # "[1-9]" is a digit range; the previous "[1,9]" only matched "1", ","
    # or "9", so ids starting with 2-8 were never found. Dots are escaped so
    # they match a literal "." only.
    list_urls = re.findall(
        r"https://www\.dahuatech\.com/product/lists/[1-9]\d*\.html\?area=[1-9]\d*",
        data)
    # dict.fromkeys drops duplicates while keeping first-seen order.
    for url in dict.fromkeys(list_urls):
        print(url)
        time.sleep(1)
        dhinfo(url)
55 
56 
if __name__ == '__main__':
    # Module-level output directory name ("DH" + current local date as
    # YYYYMMDD); whtml() reads this global when saving files.
    today = time.localtime()
    now = 'DH' + time.strftime("%Y%m%d", today)
    product()

 

posted @ 2019-02-19 09:47  睡到自然醒的猪  阅读(213)  评论(0)    收藏  举报

iaoexl at outlook dot com, 返回顶部 →友情链接: 信息港 同城信息