食品伙伴网 (foodmate.net) Crawler
A conventional crawler: it walks the standards listing and downloads the linked PDF files.
Gitee repository: https://gitee.com/MarkPolaris/food_partnership_network/tree/master
Overview page: collects the detail-page links from the standards list and stores them in MySQL.
import requests
import re
import pymysql
import hashlib
import datetime


class GLY(object):
    def __init__(self):
        self.url = 'http://down.foodmate.net/special/standard/8.html'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
        }
        self.host = '127.0.0.1'
        self.db = 'app_mark'
        self.user = 'root'
        self.passwd = '123456'
        self.charset = 'utf8mb4'

    def get_url(self):
        response = requests.get(self.url, headers=self.headers)
        response.encoding = response.apparent_encoding
        html = response.text
        urls = re.findall('<A title=.*?href="(.*?)"', html)
        # deduplicate
        urls = set(urls)
        for url in urls:
            # md5 of the link doubles as a dedup key
            hkey = hashlib.md5(url.encode(encoding='utf-8')).hexdigest()
            tag = '0'  # 0 = not yet crawled by the detail-page spider
            channel = '食品添加剂标准'
            sitename = '食品伙伴网'
            lasttime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            list_data = [url, hkey, tag, channel, sitename, lasttime]
            self.save_url(list_data)
        print(len(urls))

    def save_url(self, list_data):
        con = pymysql.connect(host=self.host, db=self.db, user=self.user,
                              passwd=self.passwd, charset=self.charset)
        cur = con.cursor()
        sql = 'insert into gly(link, hkey, tag, channel, sitename, lasttime) values (%s, %s, %s, %s, %s, %s)'
        try:
            cur.execute(sql, list_data)
            print('insert success')
        except Exception as e:
            con.rollback()
            print('error~', e)
        else:
            con.commit()
        cur.close()
        con.close()


if __name__ == '__main__':
    gly = GLY()
    gly.get_url()
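The spider above assumes a gly table already exists in the app_mark database; the post never shows its schema. Below is a minimal sketch of what it might look like, derived only from the column names in the INSERT statement. The column types and the unique index on hkey are my assumptions, not part of the original; with the unique key in place, a duplicate insert raises an error that save_url's try/except simply reports and skips.

import pymysql

# Assumed DDL for the gly table; only the column names come from the post.
DDL = '''
create table if not exists gly (
    id       int auto_increment primary key,
    link     varchar(512) not null,
    hkey     char(32)     not null,              -- md5 of link, used for dedup
    tag      char(1)      not null default '0',  -- 0 = pending, 1 = crawled
    channel  varchar(64),
    sitename varchar(64),
    lasttime datetime,
    unique key uk_hkey (hkey)   -- duplicate inserts fail and are skipped
) default charset = utf8mb4
'''

con = pymysql.connect(host='127.0.0.1', db='app_mark', user='root',
                      passwd='123456', charset='utf8mb4')
with con.cursor() as cur:
    cur.execute(DDL)
con.commit()
con.close()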
Detail page: reads the pending links from MySQL, extracts each PDF download link, and saves the files with a thread pool.
import pymysql
import re
import datetime
import requests
from multiprocessing.dummy import Pool as ThreadPool


class XLY(object):
    def __init__(self):
        self.host = '127.0.0.1'
        self.db = 'app_mark'
        self.user = 'root'
        self.passwd = '123456'
        self.charset = 'utf8mb4'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
        }
        self.start = datetime.datetime.now()

    def get_urls(self):
        con = pymysql.connect(host=self.host, db=self.db, user=self.user,
                              passwd=self.passwd, charset=self.charset)
        cur = con.cursor()
        sql = 'select link from gly where tag = "0" and sitename = "食品伙伴网"'
        # mark the fetched rows as taken so a rerun does not crawl them again;
        # the update needs the same WHERE clause as the select, otherwise it
        # would flag every row in the table
        after_sql = 'update gly set tag = "1" where tag = "0" and sitename = "食品伙伴网"'
        try:
            cur.execute(sql)
            results = cur.fetchall()
            cur.execute(after_sql)
        except Exception as e:
            con.rollback()
            print('error~', e)
            results = None
        else:
            con.commit()
        cur.close()
        con.close()
        return results

    def download(self, url):
        url = url[0]  # fetchall() yields one-element tuples
        response = requests.get(url, headers=self.headers)
        response.encoding = response.apparent_encoding
        html = response.text
        down_url = re.findall('<a class="telecom" href="(.*?)">', html, re.S)
        try:
            down_url = down_url[0]
            # stream the PDF to disk instead of buffering it whole in memory
            r = requests.get(down_url, headers=self.headers, stream=True)
            file_name = 'D:/1_work/python采集/PDF/' + down_url.split('auth=')[-1] + '.pdf'
            with open(file_name, 'wb') as pdf:
                for content in r.iter_content(chunk_size=1024):
                    pdf.write(content)
        except Exception as e:
            print('error_url:{}; exception: {}'.format(url, e))
        print(down_url)


if __name__ == '__main__':
    xly = XLY()
    urls = xly.get_urls()
    if urls:
        # download with a 20-worker thread pool
        pool = ThreadPool(20)
        pool.map(xly.download, urls)
        pool.close()
        pool.join()
        end = datetime.datetime.now()
        print('elapsed: {}'.format(end - xly.start))
    # single-threaded test run:
    # for url in urls:
    #     xly.download(url)
    #     break
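One weakness of the bulk after_sql update is that rows are flagged as crawled the moment they are read, so a crash mid-run loses whatever had not downloaded yet. A variant worth considering (not in the original) flags each row only after its PDF is safely on disk; mark_done below is a hypothetical helper keyed on the stored link, opening its own connection per call so it is safe to use from the thread pool.

import pymysql

def mark_done(link,
              host='127.0.0.1', db='app_mark', user='root',
              passwd='123456', charset='utf8mb4'):
    """Hypothetical helper: flag one gly row as crawled (tag = '1').

    Intended to be called from XLY.download() after the PDF has been
    written, instead of bulk-updating every pending row in get_urls().
    """
    con = pymysql.connect(host=host, db=db, user=user,
                          passwd=passwd, charset=charset)
    try:
        with con.cursor() as cur:
            cur.execute("update gly set tag = '1' where link = %s", (link,))
        con.commit()
    finally:
        con.close()

With this in place the after_sql update in get_urls can be dropped, and a rerun after a crash retries only the URLs whose PDFs never finished downloading.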