爬虫基本工具：urllib、requests、selenium、pytesseract
urllib来实现cookie和ip代理
# Demonstrates GET/POST requests, User-Agent spoofing, cookie persistence and
# HTTP proxies using only urllib (plus fake_useragent for browser headers).
import ssl  # only needed for the unverified-context variant described in get_html()
# Cookie jars; the file-backed ones can persist cookies to disk.
# FileCookieJar extends CookieJar; MozillaCookieJar and LWPCookieJar extend FileCookieJar.
from http.cookiejar import CookieJar, FileCookieJar, LWPCookieJar, MozillaCookieJar
from random import choice
from urllib.parse import quote, urlencode
from urllib.request import HTTPCookieProcessor
from urllib.request import ProxyHandler
from urllib.request import Request, build_opener, urlopen

from fake_useragent import UserAgent


def get_html(aurl):
    """Fetch ``aurl`` with a GET request and return the raw response body.

    Args:
        aurl: URL to request.

    Returns:
        The response body as ``bytes``.
    """
    headers = {
        'User-Agent': UserAgent().chrome
    }
    # Request sends a POST when ``data`` is supplied and a GET otherwise.
    # To POST a form: urlencode() the dict, then .encode() the resulting str
    # to bytes and pass it as ``data``.
    request = Request(url=aurl, headers=headers)
    # To skip certificate verification, pass
    # ``context=ssl._create_unverified_context()`` to urlopen().
    # The with-block closes the connection once the body has been read.
    with urlopen(request) as response:
        return response.read()


def user_agent_test():
    """Print sample User-Agent strings and URL-encoded query parameters."""
    ua = UserAgent()
    url = 'http://www.baidu.com'
    headers = {
        # Pretend to be a Chrome browser.
        'User-Agent': ua.chrome
    }
    request = Request(url, headers=headers)
    # Request stores header names capitalized, hence 'User-agent'.
    print(request.get_header('User-agent'))
    # Pick one browser family at random, or let the library choose.
    print(choice([ua.chrome, ua.safari, ua.firefox]))
    print(ua.random)

    # Percent-encode non-ASCII query parameters.
    print(quote('中国'))
    adata = {
        'wd': '看一百次夜空里的深蓝',
        'ie': 'utf-8'
    }
    url = 'https://www.baidu.com/s?{}'.format(urlencode(adata))
    print(url)


def get_cookie():
    """Log in with a POST request and save the received cookies to cookie.txt."""
    url = 'https://support.highgo.com/highgo_api/login'
    headers = {
        'User-Agent': UserAgent().chrome,
        # A 'Cookie' header could be set here directly; instead the cookies
        # returned by the POST are captured through HTTPCookieProcessor.
    }
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from the environment or a config file instead.
    form_data = {
        'userName': '773254968@qq.com',
        'password': '039ac48bbf1bdb15e52eb8eb635dc13d'
    }
    fdata = urlencode(form_data).encode()
    request = Request(url, headers=headers, data=fdata)
    # Plain urlopen() has no cookie handling attached, so the request goes
    # through an opener built with an HTTPCookieProcessor.
    mcj = MozillaCookieJar()
    opener = build_opener(HTTPCookieProcessor(mcj))
    with opener.open(request) as response:
        body = response.read().decode()
    mcj.save('cookie.txt', ignore_expires=True, ignore_discard=True)
    print(body)


def use_cookie():
    """Load cookies from cookie.txt and request the index page with them."""
    index_url = 'https://support.highgo.com/#/index'
    headers = {
        'User-Agent': UserAgent().chrome,
        # A 'Cookie' header could be set here directly; instead the cookies
        # saved by get_cookie() are loaded from file.
    }
    mcj = MozillaCookieJar()
    mcj.load('cookie.txt', ignore_discard=True, ignore_expires=True)
    opener = build_opener(HTTPCookieProcessor(mcj))
    request = Request(index_url, headers=headers)
    with opener.open(request) as response:
        print(response.read().decode())


def opener_test():
    """Request a page through an HTTP proxy and print the response body."""
    url = 'http://www.baidu.com'
    headers = {
        'User-Agent': UserAgent().chrome
    }
    # Free proxy list: https://www.89ip.cn/
    # Accepted forms: {"http": "username:password@ip:port"} or {"http": "ip:port"}
    handler = ProxyHandler({"http": "101.43.93.67:7890"})
    opener = build_opener(handler)

    request = Request(url, headers=headers)
    with opener.open(request) as response:
        print(response.read().decode())


if __name__ == '__main__':
    use_cookie()
requests来实现cookie和ip代理
# Demonstrates GET/POST requests, proxies, disabled certificate verification
# and cookie-carrying sessions using the requests library.
import requests
from fake_useragent import UserAgent


def requests_get():
    """GET the support-site index page and print the response text."""
    url = 'https://support.highgo.com/#/index'
    response = requests.get(url)
    print(response.text)


def requests_post():
    """POST the login form and print the response text."""
    url = 'https://support.highgo.com/highgo_api/login'
    headers = {
        'User-agent': UserAgent().chrome
    }
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from the environment or a config file instead.
    form_data = {
        'userName': '773254968@qq.com',
        'password': '039ac48bbf1bdb15e52eb8eb635dc13d'
    }
    response = requests.post(url, headers=headers, data=form_data)
    print(response.text)


def requets_proxy():
    """GET the index page through a proxy and print the response text.

    The (misspelled) function name is kept so existing callers keep working.
    """
    url = 'https://support.highgo.com/#/index'
    # requests selects the proxy by the scheme of the target URL. The target
    # is https, so an "http"-only mapping would bypass the proxy entirely;
    # both schemes are therefore mapped to the same proxy.
    proxy_url = 'http://8.219.125.46:80'
    proxy = {
        "http": proxy_url,
        "https": proxy_url,
    }
    headers = {
        'User-agent': UserAgent().chrome
    }
    response = requests.get(url, headers=headers, proxies=proxy)
    print(response.text)


def requests_ssl():
    """GET a page with certificate verification disabled and print it."""
    url = 'https://www.12306.cn/mormhweb/'
    headers = {
        'User-agent': UserAgent().chrome
    }
    # Silence the warnings urllib3 emits for unverified HTTPS requests.
    requests.packages.urllib3.disable_warnings()
    response = requests.get(url, verify=False, headers=headers)
    response.encoding = 'utf-8'
    print(response.text)


def requests_cookies():
    """Log in through a session, then fetch the index page with its cookies."""
    url = 'https://support.highgo.com/highgo_api/login'
    headers = {
        'User-agent': UserAgent().chrome
    }
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from the environment or a config file instead.
    form_data = {
        'userName': '773254968@qq.com',
        'password': '039ac48bbf1bdb15e52eb8eb635dc13d'
    }
    # The session keeps cookies between requests; the with-block closes its
    # pooled connections when done.
    with requests.Session() as session:
        response = session.post(url, headers=headers, data=form_data)
        print(response.text)
        response = session.get('https://support.highgo.com/#/index')
        print(response.text)


if __name__ == '__main__':
    # Other demos available: requests_post(), requests_get(),
    # requets_proxy(), requests_ssl().
    requests_cookies()
Selenium模拟浏览器
# Installation:
#   pip3 install selenium
# Purpose: drive a real browser to visit a site.
# Chrome must be paired with chromedriver:
#   https://blog.csdn.net/weixin_45109684/article/details/117650036
#
# PhantomJS
#
# Installing chromedriver:
#   1. Install Chrome and look up its version under [Help].
#   2. Download the chromedriver matching that version
#      (https://registry.npmmirror.com/binary.html?path=chromedriver/).
#   3. Unpack the download and copy chromedriver into /usr/bin/.

from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    # By.XPATH locates elements with an XPath expression (the same syntax
    # used with etree.xpath), e.g.:
    #   browser.find_element(By.XPATH, r"//input[@id='kw']")
    # find_elements returns every match; find_element raises an error when
    # nothing matches.
    browser.find_element(By.ID, 'kw').send_keys('看一百次夜空里的深蓝')
    browser.find_element(By.ID, 'su').click()
    # Run JavaScript to move the scroll bar. Alternatives:
    #   "var q = document.getElementById('id').scrollTop = 0"
    #   "document.body.scrollTop=0"
    js = r"var q = document.documentElement.scrollTop = 1000"
    browser.execute_script(js)
    print(browser.page_source)
finally:
    # quit() ends the whole driver session, even if a step above failed;
    # close() would only close the current window.
    browser.quit()
pytesseract验证码识别
from PIL import Image
import pytesseract

# Installing Tesseract on Ubuntu: https://tesseract-ocr.github.io/tessdoc/Installation.html
#   sudo apt install tesseract-ocr
#   sudo apt install libtesseract-dev
# Project repository: https://github.com/tesseract-ocr/tesseract

# Open the captcha image in a with-block so its file handle is released once
# the OCR result has been printed.
with Image.open('./lll.png') as imageObject:
    print(imageObject)
    print(pytesseract.image_to_string(imageObject))