Python基础(四)爬虫

python爬虫

系列文章：网上搜索到的系列文章，记录一下，后续可能会用到：https://mp.weixin.qq.com/mp/appmsgalbum?__biz=MzI3NzI1MzY4Mw==&action=getalbum&album_id=1786298272630816773#wechat_redirect

1. requests请求
2. User-Agent伪装
import requests
from selenium import webdriver
# Request headers sent with the call below. The User-Agent value mimics a
# desktop Chrome browser so the request does not advertise itself as a script.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Cookie': '',  # left empty in this example
}
# Endpoint the POST request is sent to.
url = 'http://www.aaa.com/admin/'
# Send the POST with the `requests` library (selenium's `webdriver` module has
# no `request` function). `data` is a mapping so it is form-encoded as
# param1=value1 in the request body.
response = requests.request('POST', url, data={"param1": "value1"}, headers=headers)
# Printing the Response object shows its status, e.g. "<Response [200]>".
print(response)
3. re/xpath解析数据
import re
import requests
# Download the page and pull out the first <title> and first <p> with regexes.
response = requests.get("https://fanyi.baidu.com/")
html = response.text

# re.search returns the first match (the same text findall(...)[0] would give)
# or None, so a page without the tag yields an empty string instead of raising
# IndexError.
title_match = re.search(r"<title>(.*?)</title>", html)
content_match = re.search(r"<p>(.*?)</p>", html)
title = title_match.group(1) if title_match else ""
content = content_match.group(1) if content_match else ""

print("标题:", title)
print("正文:", content)
4. Selenium+Chrome/Firefox
from selenium import webdriver
from selenium.webdriver.common.by import By

# Start Chrome through the local chromedriver executable.
# NOTE(review): passing the driver path positionally is the older Selenium
# calling convention; newer releases expect a Service object. Confirm against
# the installed Selenium version.
browser = webdriver.Chrome("E:\\googleDriver\\chromedriver.exe")
# Let element lookups wait up to 10 seconds before giving up.
browser.implicitly_wait(10)
# "IP" looks like a placeholder for the real page address. TODO confirm.
browser.get("IP")

# Collect every element carrying the "el-input__inner" class; the first two
# are used as the username and password inputs.
input_fields = browser.find_elements(By.CLASS_NAME, "el-input__inner")
username, password = input_fields[0], input_fields[1]

# Type the credentials into the two fields.
username.send_keys('username')
password.send_keys('password')
5. 多进程/多线程
6. Proxies代理
import requests
# Proxy mapping for requests: the key is the URL scheme of the outgoing
# request, the value is the proxy URL (with credentials) to route it through.
proxies = {
    "http": "http://user:password@proxy-ip:proxy-port",  # proxy for HTTP requests
    "https": "http://user:password@proxy-ip:proxy-port",  # proxy for HTTPS requests
}
# Fetch the page, sending the request through the configured proxies.
response = requests.get(url="https://www.example.com", proxies=proxies)
7. ajax请求破解
import requests
# Ticket-query endpoint, with the travel date, origin/destination station codes
# and passenger type passed as query-string parameters.
url = 'https://kyfw.12306.cn/otn/leftTicket/query?leftTicketDTO.train_date=2022-06-29&leftTicketDTO.from_station=HZH&leftTicketDTO.to_station=SHH&purpose_codes=ADULT'

# Headers for the AJAX call, written as a proper name -> value mapping.
# NOTE(review): the Cookie value looks like a captured browser session and has
# presumably expired; replace it with a fresh one before running.
# NOTE(review): "br" in Accept-Encoding lets the server answer with Brotli;
# confirm the local requests install can decode it, otherwise drop "br".
headers = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Cookie": "_uab_collina=165606030144749982200458; JSESSIONID=E6E0AEED78C2D7C1F570B546D4EF1E54; highContrastMode=defaltMode; guidesStatus=off; cursorStatus=off; _jc_save_wfdc_flag=dc; RAIL_EXPIRATION=1656691421557; RAIL_DEVICEID=ri6nXn_Z4JvuTfJ_dKkesj62yt7o45BG6BTx7xmjwqzCkpc2n9XwDN03Jwe1zmbFvGtn3wq4kpkyCcfk8ffhwOZHh7Fj9QQZRXxt-3Wd54OC-InIOkYoe06yk8pAKK0LLBcbzOKj8MPwB51_xh8WDHQO09qjmooQ; BIGipServerpassport=770179338.50215.0000; route=6f50b51faa11b987e576cdb301e545c4; BIGipServerotn=1089470986.24610.0000; _jc_save_toStation=%u4E0A%u6D77%2CSHH; _jc_save_toDate=2022-06-29; BIGipServerpool_passport=182714890.50215.0000; _jc_save_fromDate=2022-06-29; _jc_save_fromStation=%u676D%u5DDE%2CHZH",
    "Host": "kyfw.12306.cn",
    "If-Modified-Since": "0",
    "Referer": "https://kyfw.12306.cn/otn/leftTicket/init?linktypeid=dc&fs=%E6%88%90%E9%83%BD,CDW&ts=%E4%B8%8A%E6%B5%B7,SHH&date=2022-06-29&flag=N,N,Y",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36",
    # Marks the request as an XMLHttpRequest (AJAX) call.
    "X-Requested-With": "XMLHttpRequest",
    "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
}
# Issue the query with the prepared URL and headers.
res = requests.get(url, headers=headers)
# Show the HTTP status code and the Response object, one per line.
print(res.status_code, res, sep="\n")
# Decode the response body as JSON and print the resulting object.
print(res.json())
posted @ 2023-04-05 21:25  Kotlin  阅读(83)  评论(0)  编辑  收藏  举报
Live2D