反爬机制:
robots.txt协议
UA检测、referer检测
数据加密
图片懒加载
ip检测
验证码识别
字体反爬虫
可以使用Python的urllib模块查看robots.txt
from urllib import robotparser

# Check whether the 'Python' user agent is allowed to fetch a URL
# according to the site's robots.txt.
rp = robotparser.RobotFileParser()
rp.set_url('https://www.cnblogs.com/robots.txt')
# Bug fix: read() must be called to actually download and parse robots.txt;
# without it, can_fetch() always returns False regardless of the rules.
rp.read()
url = 'https://www.cnblogs.com/'
print(rp.can_fetch('Python', url))
破解数据加密
https://github.com/EnjoyScraping/JSpider
注:图片懒加载解决办法:直接通过img标签的自定义的属性名如src2(代替src的属性名)获取图片链接
代理IP:
代理池示例:https://github.com/losenine/proxy_pool
import random
from time import sleep

import requests
from selenium import webdriver
from lxml import etree

# Pool of desktop User-Agent strings; one is picked at random per request
# to make UA-based detection harder.
ua_list = [
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/"
    "535.11"
]

# Path of the chromedriver binary.
chrome_driver = r'E:\chromedriver\chromedriver.exe'

options = webdriver.ChromeOptions()
# Fix the browser resolution so the page renders a predictable layout.
options.add_argument('window-size=1920x3000')
# Headless mode: do not open a visible browser window.
options.add_argument("--headless")
# Hide the "Chrome is being controlled by automated test software" banner.
options.add_experimental_option('useAutomationExtension', False)
options.add_experimental_option("excludeSwitches", ['enable-automation'])
browser = webdriver.Chrome(options=options, executable_path=chrome_driver)


def get_headers():
    """Build request headers with a randomly chosen User-Agent."""
    headers = {
        'User-Agent': random.choice(ua_list),
        'connection': 'close',
    }
    return headers


def verify_proxies(proxies):
    """Return True if the proxy can reach httpbin.org within 3 seconds.

    :param proxies: requests-style proxy mapping, e.g. {'http': 'http://ip:port'}
    """
    try:
        resp = requests.get(url='http://httpbin.org/ip', headers=get_headers(),
                            proxies=proxies, timeout=3)
        # Fix: a proxy that answers with an HTTP error (e.g. 503) is not
        # usable; the original counted any response as success.
        resp.raise_for_status()
        print(f'proxies {proxies} can use')
        return True
    # Fix: narrow the original bare `except:` to network-level errors so
    # programming mistakes (NameError, KeyboardInterrupt, ...) are not hidden.
    except requests.RequestException:
        print(f'proxies {proxies} cant use')
        return False


def get_quanwang_ip():
    """Scrape goubanjia.com and return (http_proxies, https_proxies),
    two lists of verified requests-style proxy dicts."""
    http_proxies = []
    https_proxies = []
    url = 'http://www.goubanjia.com/'
    browser.get(url)
    # Maximize the window, then scroll so lazily rendered rows appear
    # in the page source.
    browser.maximize_window()
    browser.execute_script("scroll(0,1500)")
    sleep(1)
    res = browser.page_source
    tree = etree.HTML(res)
    tr_list = tree.xpath('//div[@class="span12"]/table/tbody/tr')
    for tr in tr_list:
        # Anti-scraping: the IP digits are split across <div>/<span>
        # children; the final text node is the port.
        ip_port = tr.xpath('./td[@class="ip"]/div/text() | ./td[@class="ip"]/span/text()')
        # Fix: skip malformed/empty rows instead of raising IndexError.
        if len(ip_port) < 2:
            continue
        ip = ''.join(ip_port[:-1])
        port = ip_port[-1]
        ip_port = f'{ip}:{port}'
        type_nodes = tr.xpath('./td[3]/a/text()')
        if not type_nodes:
            continue
        ip_type = type_nodes[0]
        # The proxy scheme must match the scheme of the target URL.
        if ip_type == 'http':
            proxies = {'http': f'http://{ip_port}'}
            if verify_proxies(proxies):
                http_proxies.append(proxies)
        elif ip_type == 'https':
            proxies = {'https': f'https://{ip_port}'}
            if verify_proxies(proxies):
                https_proxies.append(proxies)
    print('over')
    # print(http_proxies)
    # print(https_proxies)
    return http_proxies, https_proxies


if __name__ == '__main__':
    get_quanwang_ip()
可访问 http://httpbin.org/ip 来查看当前自己的IP,使用代理IP后访问该URL来查看IP是否变了
常用代理IP网址:
全网代理IP:http://www.goubanjia.com/
快代理:https://www.kuaidaili.com/?utm_source=bdtg&utm_campaign=a10a1&utm_medium=a10
西祠代理:https://www.xicidaili.com/nn/
注:设置请求的代理ip: www.goubanjia.com 快代理 西祠代理
代理ip的类型必须和请求url的协议头保持一致
使用方法: 在requests请求中加上proxies={'传输协议':'IP:端口'},如 proxies={'https':'https://115.216.40.78:9999'}
例:page_text = requests.get(url=url,headers=headers,proxies={'https':'https://115.216.40.78:9999'}).text
# Set a request proxy IP (sources: www.goubanjia.com, kuaidaili, xicidaili).
# The proxy type must match the scheme of the requested URL.
url = 'https://www.baidu.com/s?wd=ip'
# Bug fix: the proxy value must include its scheme ('https://host:port');
# the original bare 'host:port' contradicts the stated rule and is treated
# by requests as an http:// proxy.
page_text = requests.get(url=url, headers=headers,
                         proxies={'https': 'https://115.216.40.78:9999'}).text
# Save the response so the reported IP can be inspected in a browser.
with open('./ip.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)