instructor 进行PII 数据处理
内容来自官方文档，记录下来以供参考使用。
- 代码
import os
import re
from typing import List

import instructor
from openai import OpenAI
from pydantic import BaseModel
class Data(BaseModel):
    # One PII item extracted from a document.
    # Numeric index of the item; not read by any other code in this file.
    index: int
    # Category label of the value; becomes part of the scrub placeholder.
    data_type: str
    # Literal PII text as it appears in the document; this is what gets replaced.
    pii_value: str
class PIIDataExtraction(BaseModel):
    """
    Extracted PII data from a document, all data_types should try to have consistent property names
    """

    # NOTE(review): the class docstring above is presumably forwarded to the
    # model as the schema description -- keep its wording stable.
    private_data: List[Data]

    def scrub_data(self, content: str) -> str:
        """
        Return ``content`` with every extracted PII value replaced by a
        placeholder in the form of <{data_type}_{i}>, where ``i`` is the
        position of the item in ``private_data``.

        All values are substituted in one pass over the text, preferring the
        longest value at each position, so a value that is a substring of
        another value (or of an already inserted placeholder) cannot corrupt
        the result. Items with an empty ``pii_value`` are ignored; when the
        same value is listed more than once, the first item's placeholder is
        used.

        Args:
            content: The text to scrub.

        Returns:
            The scrubbed text, or ``content`` unchanged when there is nothing
            to replace.
        """
        placeholders = {}
        for i, data in enumerate(self.private_data):
            # An empty search string matches at every position and would
            # scatter the placeholder between all characters, so skip it.
            if data.pii_value:
                placeholders.setdefault(data.pii_value, f"<{data.data_type}_{i}>")
        if not placeholders:
            return content

        # Longest alternatives first so "John Smith" wins over "John".
        longest_first = sorted(placeholders, key=len, reverse=True)
        pattern = re.compile("|".join(re.escape(value) for value in longest_first))
        return pattern.sub(lambda match: placeholders[match.group(0)], content)
# Endpoint and key default to the local demo proxy used in this note; set
# PII_DEMO_BASE_URL / PII_DEMO_API_KEY to point the example elsewhere.
# NOTE(review): the fallback key is a literal committed to source -- rotate it
# if it was ever valid outside the local proxy, and never commit real keys.
client = instructor.from_openai(
    OpenAI(
        base_url=os.environ.get("PII_DEMO_BASE_URL", "http://localhost:4000"),
        api_key=os.environ.get("PII_DEMO_API_KEY", "sk-ZTp5zuetNQoJNgG4xHgGzw"),
    )
)

# Sample input (Chinese) with name, address, phone, ID number and e-mail
# already masked as "xxx".
EXAMPLE_DOCUMENT = """
我叫xxx,家住xxxxx,我的电话号码是xxxxxx,我的身份证号码是xxxxx,我的邮箱是xxxx
"""

# response_model asks instructor for a PIIDataExtraction result; the result is
# used below as a pydantic model (model_dump_json).
pii_data = client.chat.completions.create(
    model="dalongdemov3",
    response_model=PIIDataExtraction,
    messages=[
        {
            "role": "system",
            "content": "You are a world class PII scrubbing model, Extract the PII data from the following document",
        },
        {
            "role": "user",
            "content": EXAMPLE_DOCUMENT,
        },
    ],
)

print("Extracted PII Data:")
print(pii_data.model_dump_json())
- 效果
说明
利用 LLM 的结构化输出，可以解决不少以前感觉比较费事的问题。
参考资料
https://python.useinstructor.com/examples/pii/#defining-the-structures