python爬虫-网页url采集爬虫
# coding:utf-8
# Web page URL collector: given a start URL and an output file, collect every
# hyperlink (<a href>) found on that page and store it, one URL per line.
# The file open mode is caller-selectable ("w" overwrite / "a" append).
import time
from urllib.parse import urljoin

import requests
from lxml import etree

# Browser-like headers so the target site does not reject the scripted request.
# Module-level constant: built once instead of on every call.
HEADERS = {
    'pragma': "no-cache",
    'accept-encoding': "gzip, deflate, br",
    'accept-language': "zh-CN,zh;q=0.8",
    'upgrade-insecure-requests': "1",
    'user-agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,"
              "image/webp,image/apng,*/*;q=0.8",
    'cache-control': "no-cache",
    'connection': "keep-alive",
}


def Redirect(url):
    """Follow redirects for *url* and return the final resolved URL.

    On a network error the original *url* is returned unchanged after a
    short back-off sleep (best-effort behavior kept from the original).
    """
    try:
        res = requests.get(url, timeout=10)
        url = res.url
    except requests.RequestException as e:
        print("4", e)
        time.sleep(1)
    return url


def requests_for_url(url, save_file_name, file_model):
    """Fetch *url*, extract every ``<a href>`` link and write them to a file.

    Parameters
    ----------
    url : str
        Page to scrape; also the base used to resolve relative links.
    save_file_name : str
        Path of the output file (one URL per line).
    file_model : str
        Open mode for the output file, e.g. ``"w"`` or ``"a"``.

    Returns
    -------
    set
        The unique URLs collected from the page. Empty on fetch failure
        (the original code raised NameError on an undefined global here).
    """
    return_set = set()
    try:
        response = requests.request("GET", url, headers=HEADERS, timeout=10)
        selector = etree.HTML(response.text,
                              parser=etree.HTMLParser(encoding='utf-8'))
    except Exception as e:
        print("页面加载失败", e)
        return return_set
    if selector is None:
        # etree.HTML returns None for empty/unparseable documents.
        return return_set

    with open(save_file_name, file_model, encoding='utf-8') as f:
        for href in selector.xpath('//a/@href'):
            # Skip javascript: pseudo-links (original tested i[0] == "j",
            # which crashed on empty hrefs; startswith is safe).
            if href.startswith("j"):
                continue
            # Resolve relative links against the page URL. The original
            # did url + href.replace("/", ""), which stripped every slash
            # from the path and corrupted the link.
            absolute = urljoin(url, href)
            if absolute not in return_set:
                f.write(absolute)
                f.write("\n")
                return_set.add(absolute)
                print(len(return_set), absolute)
    return return_set


if __name__ == '__main__':
    # Collect all URLs from the given page into url.txt (append mode).
    url = "https://www.ak47s.cn/"
    save_file_name = "url.txt"
    return_set = requests_for_url(url, save_file_name, "a")  # "a": append
    print(len(return_set))
本文来自博客园,作者:关于段主任的一切,转载请注明原文链接:https://www.cnblogs.com/fairylandfuture/p/14915209.html