pyinstaller 打包presidio、spacy出错
版本:
python3.11.5
主要代码:
import re
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
# Configure the spaCy-backed NLP engine with the Chinese transformer model.
provider = NlpEngineProvider(nlp_configuration={
    "nlp_engine_name": "spacy",
    "models": [{"lang_code": "zh", "model_name": "zh_core_web_trf"}],
})
# Build the engines once at import time; DataMasker below reuses them.
nlp_engine = provider.create_engine()
analyzer = AnalyzerEngine(supported_languages=["zh"], nlp_engine=nlp_engine)
anonymizer = AnonymizerEngine()
# Entity types requested from the analyzer.
# Fixed typo: the entity name is "CREDIT_CARD"; the misspelled "CREADIT_CARD"
# can never match a recognizer.
# NOTE(review): confirm a CREDIT_CARD recognizer is registered for "zh".
entities_to_analyze = ["PERSON", "LOCATION", "NRP", "CREDIT_CARD"]
class DataMasker:
    """Mask personal data in (mostly Chinese) free text.

    Mobile numbers, e-mail addresses and ID-card numbers are masked with
    regular expressions. The result is then passed to the module-level
    ``analyzer`` / ``anonymizer`` engines, which rewrite PERSON and LOCATION
    entities with the custom operators defined on this class.
    """

    # Number of leading characters of a regex match that stay visible.
    _VISIBLE_PREFIX = 5

    # re.ASCII restricts \b and \d to ASCII. With the default Unicode rules a
    # CJK character counts as a word character, so "电话13812345678" has no
    # word boundary before the number and would never be masked.
    _PATTERNS = {
        # Mobile number, optionally prefixed with 86 / +86.
        'phone': re.compile(r'\b(\+?86[-\s]?)?1[3-9]\d{9}\b', re.ASCII),
        # E-mail address ("[A-Za-z]" instead of "[A-Z|a-z]", which also
        # accepted a literal "|").
        'email': re.compile(
            r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', re.ASCII),
        # ID-card number: 18 characters (last one may be X) or 15 digits.
        # The alternatives are grouped so both \b anchors apply to each of
        # them, and the longer form is tried first; otherwise an 18-digit
        # number is matched only on its first 15 digits.
        'id_card': re.compile(r'\b(?:\d{17}[\dXx]|\d{15})\b', re.ASCII),
    }

    # Address fragments: a run of word characters ending in a division suffix.
    _ADDRESS_PATTERN = re.compile(r'(\w+省|\w+市|\w+区|\w+镇|\w+小区|\w+路)')
    # '小区' comes first so it wins over the shorter '区'.
    _ADDRESS_SUFFIXES = ('小区', '省', '市', '区', '镇', '路')

    @staticmethod
    def _mask_patterns(text: str) -> str:
        """Mask phone numbers, e-mail addresses and ID-card numbers in *text*.

        Each match keeps its first ``_VISIBLE_PREFIX`` characters; the rest is
        replaced by ``*`` so the overall length is unchanged.
        """
        def keep_prefix(match):
            value = match.group(0)
            visible = DataMasker._VISIBLE_PREFIX
            return value[:visible] + '*' * (len(value) - visible)

        for pattern in DataMasker._PATTERNS.values():
            text = pattern.sub(keep_prefix, text)
        return text

    @staticmethod
    def run(text: str) -> str:
        """Return *text* with structured identifiers, names and places masked."""
        # Regex masking runs first, then entity-based masking on the result.
        text = DataMasker._mask_patterns(text)
        recognizer_result = DataMasker.analyze(text)
        return DataMasker.anonymize(text, recognizer_result)

    @staticmethod
    def process_person(person):
        """Mask a person name, keeping only its first character.

        A one-character name is padded with two ``*``; longer names keep their
        length. An empty string is returned unchanged.
        """
        if not person:
            return person
        if len(person) == 1:
            return person + '**'
        return person[0] + '*' * (len(person) - 1)

    @staticmethod
    def process_address(address):
        """Mask an address while keeping its administrative suffixes.

        Every fragment ending in 省/市/区/镇/小区/路 is replaced by ``*`` plus
        that suffix, e.g. ``福建省厦门市`` -> ``**省**市``. If no such fragment
        is found, only the first character stays visible. An empty string is
        returned unchanged.
        """
        if not address:
            return address

        def replace_address(match):
            fragment = match.group()
            # Use the suffix the fragment actually ends with; a plain
            # "'市' in fragment" test picks the wrong one for e.g. "市中心区".
            for suffix in DataMasker._ADDRESS_SUFFIXES:
                if fragment.endswith(suffix):
                    return '*' * (len(fragment) - len(suffix)) + suffix
            return fragment

        masked, count = DataMasker._ADDRESS_PATTERN.subn(replace_address, address)
        if count:
            return masked
        return address[0] + '*' * (len(address) - 1)

    @staticmethod
    def analyze(text: str) -> "list[RecognizerResult]":
        """Run the module-level analyzer on *text* for ``entities_to_analyze``."""
        return analyzer.analyze(text=text, entities=entities_to_analyze, language='zh')

    @staticmethod
    def anonymize(text: str, recognizer_result: "list[RecognizerResult]") -> str:
        """Apply the custom PERSON / LOCATION operators and return the new text."""
        # Only PERSON and LOCATION get a custom masking function here.
        operators = {
            "PERSON": OperatorConfig("custom", {'lambda': DataMasker.process_person}),
            "LOCATION": OperatorConfig("custom", {'lambda': DataMasker.process_address}),
        }
        anonymized = anonymizer.anonymize(
            text=text,
            analyzer_results=recognizer_result,
            operators=operators,
        )
        return anonymized.text
main.py
from data_masking import DataMasker
# Sample sentence used when no text is entered.
# Other sample: '张三 ,350582277707175720 我在***路这里'
SAMPLE_TEXT = '那后面的地址也要加后溪镇?'


def main() -> None:
    """Read a line of text, mask it with DataMasker and print the result.

    The original script read the user's input and then unconditionally
    overwrote it with a hard-coded sample; the sample is now only a fallback
    for empty input.
    """
    import sys  # local import: only needed for the stdin check below

    # NOTE(review): a build made with --windowed has no console, so sys.stdin
    # may be None and input() would raise; fall back to the sample then.
    text = input('输入文本') if sys.stdin is not None else ''
    print(DataMasker.run(text or SAMPLE_TEXT))


if __name__ == '__main__':
    main()
1.使用命令
pyinstaller --onefile --windowed main.py
打包后报错
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='raw.githubuse....
2.另起一个项目测试了一下,没有激活当前虚拟环境,用其他python版本运行就会报这个错误
当前虚拟环境3.11.5下执行
pip install spacy
python -m spacy download zh_core_web_trf
pip install presidio_analyzer
pip install presidio_anonymizer
再去运行是不会报错的
在测试项目里继续执行打包
pyinstaller --onefile --windowed main.py
还是出现一样的错误
Traceback (most recent call last):
File "urllib3\connection.py", line 199, in _new_conn
File "urllib3\util\connection.py", line 60, in create_connection
File "socket.py", line 962, in getaddrinfo
socket.gaierror: [Errno 11004] getaddrinfo failed
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "urllib3\connectionpool.py", line 789, in urlopen
File "urllib3\connectionpool.py", line 490, in _make_request
File "urllib3\connectionpool.py", line 466, in _make_request
File "urllib3\connectionpool.py", line 1095, in _validate_conn
File "urllib3\connection.py", line 693, in connect
File "urllib3\connection.py", line 206, in _new_conn
urllib3.exceptions.NameResolutionError: <urllib3.connection.HTTPSConnection object at 0x000001F908268B50>: Failed to resolve 'raw.githubusercontent.com' ([Errno 11004] getaddrinfo failed)
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "requests\adapters.py", line 667, in send
File "urllib3\connectionpool.py", line 843, in urlopen
File "urllib3\util\retry.py", line 519, in increment
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /explosion/spacy-models/master/compatibility.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001F908268B50>: Failed to resolve 'raw.githubusercontent.com' ([Errno 11004] getaddrinfo failed)"))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "main.py", line 1, in <module>
File "<frozen importlib._bootstrap>", line 1176, in _find_and_load
File "<frozen importlib._bootstrap>", line 1147, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 690, in _load_unlocked
File "PyInstaller\loader\pyimod02_importers.py", line 378, in exec_module
File "data_masking.py", line 12, in <module>
File "presidio_analyzer\nlp_engine\nlp_engine_provider.py", line 100, in create_engine
File "presidio_analyzer\nlp_engine\spacy_nlp_engine.py", line 57, in load
File "presidio_analyzer\nlp_engine\spacy_nlp_engine.py", line 64, in _download_spacy_model_if_needed
File "spacy\cli\download.py", line 85, in download
File "spacy\cli\download.py", line 130, in get_compatibility
File "requests\api.py", line 73, in get
File "requests\api.py", line 59, in request
File "requests\sessions.py", line 589, in request
File "requests\sessions.py", line 703, in send
File "requests\adapters.py", line 700, in send
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /explosion/spacy-models/master/compatibility.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001F908268B50>: Failed to resolve 'raw.githubusercontent.com' ([Errno 11004] getaddrinfo failed)"))
3.开启VPN再启动没有反应,启动不了,可能是在下载模型
4.再次尝试在spec中添加data
a = Analysis(
    ['main.py'],
    pathex=[],
    binaries=[],
    datas=[
        # Each entry is (source, destination directory inside the bundle).
        # The source is a raw string so the backslashes are not read as
        # (invalid) escape sequences. The destination is the package folder
        # itself: with '.' the contents of zh_core_web_trf are spilled into
        # the bundle root and the package directory does not exist there.
        (r'.\.venv\Lib\site-packages\zh_core_web_trf', 'zh_core_web_trf'),
    ],
    hiddenimports=[],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    noarchive=False,
    optimize=0,
)
pyinstaller main.spec 运行后一直出现下面的提示(没有尝试多等一会,看看模型会不会下载下来):
Skipping pipeline package dependencies and setting `--no-deps`. You
don't seem to have the spaCy package itself installed (maybe because you've
built from source?), so installing the package dependencies would cause spaCy to
be downloaded, which probably isn't what you want. If the pipeline package has
other dependencies, you'll have to install them manually.
5.尝试取消使用--onefile和顯示控制臺
将代码改成
pyinstaller main.py
class LoadedSpacyNlpEngine(SpacyNlpEngine):
    """SpacyNlpEngine that wraps an already-loaded spaCy pipeline.

    The pipeline is stored directly in ``self.nlp``, keyed by language code,
    instead of being resolved from a model name by the engine.

    Args:
        loaded_spacy_model: A loaded spaCy pipeline object.
        lang_code: Language code the pipeline is registered under. Defaults
            to ``"zh"``, the value that was previously hard-coded.
    """

    def __init__(self, loaded_spacy_model, lang_code="zh"):
        super().__init__()
        self.nlp = {lang_code: loaded_spacy_model}


# Import the model as a regular Python package and load it from there.
import zh_core_web_trf

nlp = zh_core_web_trf.load()
nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model=nlp)
analyzer = AnalyzerEngine(supported_languages=["zh"], nlp_engine=nlp_engine)
https://stackoverflow.com/questions/66495437/cant-find-spacy-model-when-packaging-with-pyinstaller
出现错误
相同错误:https://github.com/explosion/spaCy/discussions/13423
- 创建hook-spacy.py
参考 https://www.airchip.org.cn/index.php/2024/04/17/spacy-to-pyinstaller/
尝试创建hook-spacy.py
from PyInstaller.utils.hooks import collect_data_files, collect_submodules, copy_metadata
# Collect spaCy itself: data files (including .py sources) and all submodules.
datas = collect_data_files('spacy', include_py_files=True)
hiddenimports = collect_submodules('spacy')
# Copy spaCy's distribution metadata; it matters for locating model packages.
datas += copy_metadata('spacy')
# Collect the data files of the pkuseg segmenter.
datas += collect_data_files('spacy_pkuseg')
# The model and the curated-transformers plugin need more than their data
# files: collect_data_files() skips .py files by default, and without the
# distribution metadata the packages look "not installed" at runtime.
# NOTE(review): the "curated_transformer" lookup failure presumably comes from
# the plugin's entry-point metadata being missing in the bundle - confirm.
for _package in ('zh_core_web_trf', 'spacy_curated_transformers'):
    datas += collect_data_files(_package, include_py_files=True)
    datas += copy_metadata(_package)
    hiddenimports += collect_submodules(_package)
pyinstaller main.py 打包还是不行,错误一样:先报错在 _internal 下找不到 zh_core_web_trf 文件,手动放进去后又报错找不到 curated_transformer
pyinstaller --onefile --additional-hooks-dir=. main.py 继续b
分类:
踩坑集合
Buy me a cup of coffee ☕.
【推荐】还在用 ECharts 开发大屏?试试这款永久免费的开源 BI 工具!
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 互联网不景气了那就玩玩嵌入式吧,用纯.NET开发并制作一个智能桌面机器人(四):结合BotSharp
· 一个基于 .NET 开源免费的异地组网和内网穿透工具
· 《HelloGitHub》第 108 期
· Windows桌面应用自动更新解决方案SharpUpdater5发布
· 我的家庭实验室服务器集群硬件清单