1. Sending requests with requests
import requests

s = requests.Session()  # a Session keeps cookies and reuses the TCP connection across requests

url = 'https://httpbin.org/get'  # placeholder URL for illustration
payload = {'key1': 'value1', 'key2': 'value2'}
proxies = {'http': 'http://47.98.163.18:8080', 'https': 'http://47.98.163.18:8080'}
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

# headers         request headers
# params          query-string parameters
# data            POST form data
# verify          SSL certificate verification
# allow_redirects whether to follow redirects
# proxies         proxy settings
requests.get(url, headers=headers, verify=False, params=payload,
             allow_redirects=False, proxies=proxies).content.decode('utf-8')

data = {'key1': 'value1'}  # placeholder form data
requests.post(url, headers=headers, data=data, verify=False,
              allow_redirects=False).content.decode('utf-8')

# get the cookies as a plain dict
re = requests.post(url, headers=headers, data=data, verify=False)
requests.utils.dict_from_cookiejar(re.cookies)

# suppress the InsecureRequestWarning triggered by verify=False
requests.packages.urllib3.disable_warnings()
# or equivalently:
import urllib3
urllib3.disable_warnings()

# a single timeout value is used for both the connect and the read timeout
requests.get('https://github.com', timeout=2)
# to set them separately, pass a (connect, read) tuple
requests.get('https://github.com', timeout=(3.05, 27))
# pass None to make the request wait forever
requests.get('https://github.com', timeout=None)
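The Session created above is otherwise unused in this snippet; a minimal sketch of what it buys you (httpbin.org is only a stand-in target): headers set once on the session apply to every request it makes, and the underlying connection is reused between them.

# minimal Session sketch (httpbin.org is a stand-in target)
s.headers.update(headers)  # set once, sent on every request through this session
print(s.get('https://httpbin.org/headers', timeout=5).json())
print(s.get('https://httpbin.org/get', timeout=5).status_code)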
2. requests GET and response attributes
import requests

kw = {'wd': '长城'}
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

# params takes a dict or a query string; a dict is URL-encoded automatically,
# so there is no need to call urlencode() yourself
response = requests.get("http://www.baidu.com/s?", params=kw, headers=headers)

# response.text returns the body decoded to str (Unicode)
print(response.text)

# response.content returns the raw bytes
print(response.content)

# the final, fully qualified URL
print(response.url)

# the encoding requests inferred for the response body
print(response.encoding)

# the HTTP status code
print(response.status_code)
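One more accessor worth knowing: when the response body is JSON, response.json() parses it in one step (httpbin.org below is a stand-in target):

r = requests.get('https://httpbin.org/get', params=kw, headers=headers)
print(r.json()['args'])  # httpbin echoes the query parameters back as JSON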
3. POSTing a JSON request payload from a Python crawler
import requests
import json

payloadHeader = {
    'Host': 'sellercentral.amazon.com',
    'Content-Type': 'application/json',
}
postUrl = 'https://sellercentral.amazon.com/...'  # placeholder endpoint
payloadData = {'key': 'value'}                    # placeholder payload

requests.post(postUrl, data=json.dumps(payloadData), headers=payloadHeader)
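Note that requests can do the serialization itself: passing json= sets the application/json Content-Type header automatically, so the call above can be written without json.dumps:

# equivalent call: requests serializes payloadData and sets the JSON Content-Type
requests.post(postUrl, json=payloadData, headers={'Host': 'sellercentral.amazon.com'})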
4. Storing a dict into the database (table columns must match the dict keys)
import pymysql


class MogujiePipeline(object):
    def __init__(self):
        # open the database connection
        # (an RDS/remote connection would only differ in host/user/password)
        self.db = pymysql.connect(host='localhost', port=3306, database='cfda',
                                  user='root', password='root', charset='utf8')
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        # skip items whose clientUrl is already in the table
        num = self.cursor.execute('select id from mogujie where clientUrl=%s',
                                  (item['clientUrl'],))
        if not num:
            keys = list(item.keys())
            values = [str(v) for v in item.values()]
            # build the INSERT from the dict keys; %s placeholders let pymysql
            # escape the values, so quotes in the data need no manual handling
            insert_sql = 'insert into mogujie({}) values({})'.format(
                ', '.join(keys), ', '.join(['%s'] * len(keys)))
            print('insert_sql:', insert_sql)
            self.cursor.execute(insert_sql, values)
            self.db.commit()
        return item

    def close_spider(self, spider):
        # close the database connection
        self.cursor.close()
        self.db.close()
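For the pipeline to run, it has to be enabled in the Scrapy project's settings.py; a sketch, assuming the package is named mogujie (adjust the dotted path to your project layout):

# settings.py -- the dotted path below is hypothetical
ITEM_PIPELINES = {
    'mogujie.pipelines.MogujiePipeline': 300,  # lower number = earlier in the pipeline chain
}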
5. Crawling JSON data
import requests
import json
import pymysql
import logging

logging.basicConfig(
    level=logging.INFO,                 # messages at this level and above are written out
    format='%(asctime)s %(filename)s %(levelname)s : %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',        # timestamp format
    filename='yibao.log',               # log file name
    filemode='a')                       # write mode: 'w' truncates, 'a' appends


class yibao(object):
    def __init__(self):
        self.db = pymysql.connect(host='localhost', port=3306, database='cfda',
                                  user='root', password='root', charset='utf8')
        self.cursor = self.db.cursor()
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        }
        self.url = 'http://code.nhsa.gov.cn:8000/jbzd/public/toStdOperationTreeList.html'
        self.parse_page()

    def parse_page(self):
        data = {
            'operationId': 'icdIds',
        }
        html = requests.post(url=self.url, headers=self.headers, data=data).content.decode('utf-8')
        data_json = json.loads(html)
        # 'id' and 'name' below are stand-in field names; substitute the real
        # keys of the JSON records and the matching table columns
        for record in data_json:
            num = self.cursor.execute('select id from catalogue where id=%s', (record['id'],))
            if not num:
                # insert
                self.cursor.execute('insert into catalogue(id, name) values(%s, %s)',
                                    (record['id'], record['name']))
                self.db.commit()

        # query
        self.cursor.execute('select * from catalogue')
        row = self.cursor.fetchone()    # one row
        rows = self.cursor.fetchall()   # all remaining rows

        # update ('new name' and 1 are stand-in values)
        self.cursor.execute('update catalogue set name=%s where id=%s', ('new name', 1))
        self.db.commit()

        # delete
        self.cursor.execute('delete from catalogue where id=%s', (1,))
        self.db.commit()


if __name__ == '__main__':
    yibao()
6. Crawling HTML data
import requests
import pymysql
import logging
from lxml import etree

logging.basicConfig(
    level=logging.INFO,                 # messages at this level and above are written out
    format='%(asctime)s %(filename)s %(levelname)s : %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',        # timestamp format
    filename='yibao.log',               # log file name
    filemode='a')                       # write mode: 'w' truncates, 'a' appends


class yibao(object):
    def __init__(self):
        self.db = pymysql.connect(host='localhost', port=3306, database='cfda',
                                  user='root', password='root', charset='utf8')
        self.cursor = self.db.cursor()
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        }
        self.url = 'http://code.nhsa.gov.cn:8000/jbzd/public/toStdOperationTreeList.html'
        self.parse_page()

    def parse_page(self):
        data = {
            'operationId': 'icdIds',
        }
        html = requests.post(url=self.url, headers=self.headers, data=data).content.decode('utf-8')
        etree_html = etree.HTML(html)
        names = etree_html.xpath(
            '//*[@id="classicont"]/div[@class="els-doc-h4"]/a//text() | //div[@class="els-doc-con-left"]/a//text()')
        codes = etree_html.xpath(
            '//*[@id="classicont"]/div[@class="els-doc-h4"]/span//text() | //div[@class="els-doc-con-left"]/span//text()')
        # the two XPath result lists line up pairwise; 'id' and 'name' are
        # stand-ins for the real table columns
        for name, code in zip(names, codes):
            num = self.cursor.execute('select id from catalogue where id=%s', (code,))
            if not num:
                self.cursor.execute('insert into catalogue(id, name) values(%s, %s)',
                                    (code, name))
                self.db.commit()


if __name__ == '__main__':
    yibao()
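Zipping two parallel text lists breaks the pairing if any row lacks an a or a span element; a more robust variant (assuming each row is one els-doc-con-left div, same stand-in XPaths as above) iterates the row elements and reads both children relative to each row:

# iterate row elements so name and code stay paired even when one is missing
rows = etree_html.xpath('//*[@id="classicont"]//div[@class="els-doc-con-left"]')
for row in rows:
    name = ''.join(row.xpath('./a//text()'))
    code = ''.join(row.xpath('./span//text()'))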
7. Using proxies
import requests

# url is whatever target you are requesting; "ip" and "port" are placeholders

# without authentication
proxies = {
    "http": "http://ip:port",
    "https": "https://ip:port",
}
requests.get(url, proxies=proxies)

# with username/password authentication
proxies = {
    "http": "http://username:password@ip:port",
    "https": "https://username:password@ip:port",
}
requests.get(url, proxies=proxies)
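requests also honors the standard proxy environment variables whenever no proxies= argument is passed, so the mapping can live outside the code (values below are placeholders):

import os

os.environ['HTTP_PROXY'] = 'http://ip:port'   # placeholder proxy address
os.environ['HTTPS_PROXY'] = 'http://ip:port'
requests.get('http://example.com')  # picked up automatically from the environment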
8. A simple requests wrapper: class version
import json
import os
import time
import traceback
from urllib.parse import unquote

import requests

# Logger and get_proxies are project-specific helpers in the original code;
# the minimal stand-ins below just make the snippet self-contained.
import logging as _logging


class Logger(object):
    @staticmethod
    def config(level="debug", processname=""):
        _logging.basicConfig(level=getattr(_logging, level.upper(), _logging.DEBUG))
    info = staticmethod(_logging.info)
    error = staticmethod(_logging.error)


def get_proxies():
    return {}


class yoyo_requests(object):
    def __init__(self):
        Logger.config(level="debug", processname=os.path.splitext(os.path.basename(__file__))[0])
        self.logger = Logger
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}

    def reque_url(self, url: str, method="get", data={}, params={}, headers={}):
        '''
        :param url: target URL
        :param method: "get", "json" (POST with a JSON body), anything else = form POST
        :param data: POST body
        :param params: query-string parameters
        :param headers: extra request headers
        :return: response (a stub with status_code 500 if every attempt fails)
        '''
        # fallback object returned when the request never succeeds
        class yo_response(object):
            status_code = 500
            apparent_encoding = "utf-8"
            encoding = "utf-8"
            content = b""
            text = ""
            headers = {}
            url = ''

        response = yo_response()
        if not url:
            return response
        # remember the previously requested URL and send it as the Referer
        if not hasattr(self, "Referer"):
            self.Referer = url
        self.headers["Referer"] = self.Referer
        headers = {**headers, **self.headers}
        for i in range(2):  # at most one retry
            self.logger.info("requesting: {}".format(url))
            try:
                time.sleep(0.01)
                # "%7B%7D" is the URL-encoded form of "{}"; unescape it before sending
                if isinstance(data, str) and '=' in data and method != "get":
                    data = data.replace('"%7B%7D"', "{}").replace("'%7B%7D'", "{}")
                if isinstance(data, dict):
                    for key, value in data.items():
                        if isinstance(value, str):
                            if "%7B%7D" in value:
                                value = value.replace('"%7B%7D"', "{}").replace("'%7B%7D'", "{}")
                            data[key] = unquote(value)
                        if isinstance(value, (list, dict)):
                            if "%7B%7D" in json.dumps(value, ensure_ascii=False):
                                data[key] = json.loads(json.dumps(value, ensure_ascii=False).replace('"%7B%7D"', "{}").replace("'%7B%7D'", "{}"))
                if isinstance(params, str) and '=' in params and method != "get":
                    params = dict([j.split('=') for j in params.split('&')])
                if isinstance(params, dict):
                    for key, value in params.items():
                        if isinstance(value, str):
                            if "%7B%7D" in value:
                                value = value.replace('"%7B%7D"', "{}").replace("'%7B%7D'", "{}")
                            params[key] = unquote(value)
                        if isinstance(value, (list, dict)):
                            if "%7B%7D" in json.dumps(value, ensure_ascii=False):
                                params[key] = json.loads(json.dumps(value, ensure_ascii=False).replace('"%7B%7D"', "{}").replace("'%7B%7D'", "{}"))
                if data and method != "get":
                    self.logger.info("request data: {}".format(data))
                if params:
                    self.logger.info("request params: {}".format(params))
                proxies = get_proxies()
                proxies = {}  # proxy pool disabled here
                url = unquote(url)
                if method == "get":
                    response = requests.get(url=url, verify=False, timeout=60, params=params, headers=headers, proxies=proxies)
                elif method == "json":
                    headers['Content-Type'] = "application/json;charset=UTF-8"
                    response = requests.post(url=url, verify=False, timeout=60, params=params, headers=headers, json=data, proxies=proxies)
                else:
                    headers['Content-Type'] = "application/x-www-form-urlencoded; charset=UTF-8"
                    response = requests.post(url=url, verify=False, timeout=60, params=params, headers=headers, data=data, proxies=proxies)
                status_code = response.status_code
                self.logger.info("status_code: {}".format(status_code))
                self.Referer = url.encode("utf-8").decode("latin1")
                if status_code == 200:
                    # normalize the response encoding to something UTF-8-like
                    if not response.encoding or "utf" not in response.encoding.lower():
                        response.encoding = response.apparent_encoding
                    if not response.encoding or "iso" in response.encoding.lower() or "indows" in response.encoding.lower():
                        response.encoding = "utf-8"
                    break
            except Exception as e:
                traceback.print_exc()
                self.logger.error("request failed: {}-{}".format(url, e)[:1023])
        return response


if __name__ == "__main__":
    yoyo = yoyo_requests()
    response = yoyo.reque_url("http://jypt.bzggzyjy.cn/bzweb/jyxx/012001/list1.html")
    print(response)
9. A simple requests wrapper: function version
import json
import os
import time
import traceback
from urllib.parse import unquote

import requests

# assumes the same Logger and get_proxies helpers (or stand-ins) as in the
# class version above
Logger.config(level="debug", processname=os.path.splitext(os.path.basename(__file__))[0])


def reque_url(url: str, method="get", data={}, params={}, headers={}):
    '''
    :param url: target URL
    :param method: "get", "json" (POST with a JSON body), anything else = form POST
    :param data: POST body
    :param params: query-string parameters
    :param headers: extra request headers
    :return: response (a stub with status_code 500 if every attempt fails)
    '''
    # fallback object returned when the request never succeeds
    class yo_response(object):
        status_code = 500
        apparent_encoding = "utf-8"
        encoding = "utf-8"
        content = b""
        text = ""
        headers = {}
        url = ''

    response = yo_response()
    if not url:
        return response
    # note: unlike the class version, Referer is a local variable here, so it
    # does not persist between calls
    if 'Referer' not in locals() and 'Referer' not in globals():
        Referer = url
    headers = {**headers, **{'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36', "Referer": Referer}}
    for i in range(2):  # at most one retry
        Logger.info("requesting: {}".format(url))
        try:
            time.sleep(0.01)
            # "%7B%7D" is the URL-encoded form of "{}"; unescape it before sending
            if isinstance(data, str) and '=' in data and method != "get":
                data = data.replace('"%7B%7D"', "{}").replace("'%7B%7D'", "{}")
            if isinstance(data, dict):
                for key, value in data.items():
                    if isinstance(value, str):
                        if "%7B%7D" in value:
                            value = value.replace('"%7B%7D"', "{}").replace("'%7B%7D'", "{}")
                        data[key] = unquote(value)
                    if isinstance(value, (list, dict)):
                        if "%7B%7D" in json.dumps(value, ensure_ascii=False):
                            data[key] = json.loads(json.dumps(value, ensure_ascii=False).replace('"%7B%7D"', "{}").replace("'%7B%7D'", "{}"))
            if isinstance(params, str) and '=' in params and method != "get":
                params = dict([j.split('=') for j in params.split('&')])
            if isinstance(params, dict):
                for key, value in params.items():
                    if isinstance(value, str):
                        if "%7B%7D" in value:
                            value = value.replace('"%7B%7D"', "{}").replace("'%7B%7D'", "{}")
                        params[key] = unquote(value)
                    if isinstance(value, (list, dict)):
                        if "%7B%7D" in json.dumps(value, ensure_ascii=False):
                            params[key] = json.loads(json.dumps(value, ensure_ascii=False).replace('"%7B%7D"', "{}").replace("'%7B%7D'", "{}"))
            if data and method != "get":
                Logger.info("request data: {}".format(data))
            if params:
                Logger.info("request params: {}".format(params))
            proxies = get_proxies()
            proxies = {}  # proxy pool disabled here
            url = unquote(url)
            if method == "get":
                response = requests.get(url=url, verify=False, timeout=60, params=params, headers=headers, proxies=proxies)
            elif method == "json":
                headers['Content-Type'] = "application/json;charset=UTF-8"
                response = requests.post(url=url, verify=False, timeout=60, params=params, headers=headers, json=data, proxies=proxies)
            else:
                headers['Content-Type'] = "application/x-www-form-urlencoded; charset=UTF-8"
                response = requests.post(url=url, verify=False, timeout=60, params=params, headers=headers, data=data, proxies=proxies)
            status_code = response.status_code
            Logger.info("status_code: {}".format(status_code))
            Referer = url.encode("utf-8").decode("latin1")
            if status_code == 200:
                # normalize the response encoding to something UTF-8-like
                if not response.encoding or "utf" not in response.encoding.lower():
                    response.encoding = response.apparent_encoding
                if not response.encoding or "iso" in response.encoding.lower() or "indows" in response.encoding.lower():
                    response.encoding = "utf-8"
                break
        except Exception as e:
            traceback.print_exc()
            Logger.error("request failed: {}-{}".format(url, e)[:1023])
    return response


if __name__ == "__main__":
    response = reque_url("http://jypt.bzggzyjy.cn/bzweb/jyxx/012001/list1.html")
    print(response)
10. Getting cookies from a response
# reque_url is the wrapper defined above; image_url is whatever URL was just fetched
response = self.reque_url(url=image_url)
headers = {'Cookie': '; '.join([f'{key}={value}' for key, value in response.cookies.items()])}
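If the follow-up requests go through a requests.Session, assembling a Cookie header by hand is unnecessary: the session stores each response's cookies and sends them on later requests automatically (httpbin.org below is a stand-in target):

import requests

s = requests.Session()
s.get('https://httpbin.org/cookies/set?token=abc')  # the server sets a cookie
print(s.get('https://httpbin.org/cookies').json())  # the cookie is sent back automatically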