pyinstaller 打包presidio、spacy出错

版本:
python3.11.5

主要代码:

import re
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig

# Select the NER model: spaCy's Chinese transformer pipeline.
provider = NlpEngineProvider(nlp_configuration={"nlp_engine_name": "spacy",
                                                "models": [{"lang_code": "zh", "model_name": "zh_core_web_trf"}]})

# Build the engines. NOTE: this runs at import time; the traceback in this
# post shows create_engine() -> load() -> _download_spacy_model_if_needed(),
# i.e. presidio tries to download the model when it cannot find it installed.
nlp_engine = provider.create_engine()
analyzer = AnalyzerEngine(supported_languages=["zh"], nlp_engine=nlp_engine)
anonymizer = AnonymizerEngine()

# Entity types to detect. The credit-card entity is spelled CREDIT_CARD;
# the previous 'CREADIT_CARD' spelling matched no recognizer.
entities_to_analyze = ["PERSON", "LOCATION", "NRP", "CREDIT_CARD"]


class DataMasker:
    """Mask personal data in (mostly Chinese) free text.

    Masking happens in two passes:

    1. Regex pass: phone numbers, e-mail addresses and ID-card numbers keep
       their first five characters; the rest is replaced by ``*``.
    2. NER pass: the module-level ``analyzer`` / ``anonymizer`` engines detect
       entities; PERSON and LOCATION spans get custom masking, other entity
       types are handled by the anonymizer's default operator.
    """

    # How many leading characters of a regex match stay visible.
    _VISIBLE_PREFIX = 5

    # re.ASCII makes \b and \d ASCII-only. In the default Unicode mode CJK
    # characters count as word characters, so "电话13812345678" has no word
    # boundary before the digits and would never match.
    _PATTERNS = {
        # Mobile number, optionally prefixed with 86 / +86.
        'phone': re.compile(r'\b(\+?86[-\s]?)?1[3-9]\d{9}\b', re.ASCII),
        # E-mail address ("[A-Za-z]" instead of "[A-Z|a-z]", which also
        # accepted a literal "|" in the top-level domain).
        'email': re.compile(
            r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', re.ASCII),
        # ID-card number: 18 characters (last may be X) or 15 digits. The
        # alternatives are grouped so both \b anchors apply to each of them,
        # and the 18-character form is tried first so it is not cut to 15.
        'id_card': re.compile(r'\b(?:\d{17}[\dXx]|\d{15})\b', re.ASCII),
    }

    # Administrative suffixes kept visible in addresses. '小区' must precede
    # '区' so the longer suffix wins.
    _ADDRESS_SUFFIXES = ('小区', '省', '市', '区', '镇', '路')
    _ADDRESS_PATTERN = re.compile(r'(\w+省|\w+市|\w+区|\w+镇|\w+小区|\w+路)')

    @staticmethod
    def _mask_structured(text: str) -> str:
        """Return *text* with phone numbers, e-mails and ID numbers masked."""
        keep = DataMasker._VISIBLE_PREFIX

        def mask(match):
            value = match.group(0)
            # A negative repeat count yields '', so short matches are
            # returned unchanged.
            return value[:keep] + (len(value) - keep) * '*'

        for pattern in DataMasker._PATTERNS.values():
            text = pattern.sub(mask, text)
        return text

    @staticmethod
    def run(text: str) -> str:
        """Mask *text* with the regex pass followed by the NER pass."""
        text = DataMasker._mask_structured(text)
        recognizer_result = DataMasker.analyze(text)
        return DataMasker.anonymize(text, recognizer_result)

    @staticmethod
    def process_person(person):
        """Mask a person name, keeping only its first character.

        A single-character name is padded with ``**``; an empty string is
        returned unchanged instead of raising ``IndexError``.
        """
        if not person:
            return person
        if len(person) == 1:
            return person + '**'
        return person[0] + (len(person) - 1) * '*'

    @staticmethod
    def process_address(address):
        """Mask an address while keeping administrative suffixes readable.

        Every segment ending in one of ``_ADDRESS_SUFFIXES`` is replaced by
        ``*`` characters followed by that suffix. If no such segment exists,
        only the first character of *address* is kept. An empty string is
        returned unchanged.
        """
        if not address:
            return address

        def replace_address(match):
            segment = match.group()
            # Decide by the segment's ending, not by containment: "省道路"
            # contains '省' but is a road ('路').
            for suffix in DataMasker._ADDRESS_SUFFIXES:
                if segment.endswith(suffix):
                    return (len(segment) - len(suffix)) * '*' + suffix
            return segment

        masked, replaced = DataMasker._ADDRESS_PATTERN.subn(replace_address, address)
        if replaced:
            return masked
        return address[0] + (len(address) - 1) * '*'

    @staticmethod
    def analyze(text: str) -> "list[RecognizerResult]":
        """Run the module-level analyzer on *text* for the configured entities."""
        return analyzer.analyze(text=text, entities=entities_to_analyze, language='zh')

    @staticmethod
    def anonymize(text: str, recognizer_result: "list[RecognizerResult]") -> str:
        """Apply the custom PERSON / LOCATION masking to the analyzer results.

        Returns the anonymized text (the ``.text`` of the anonymizer result).
        """
        operators = {
            "PERSON": OperatorConfig("custom", {'lambda': DataMasker.process_person}),
            "LOCATION": OperatorConfig("custom", {'lambda': DataMasker.process_address}),
        }

        anonymized_text = anonymizer.anonymize(text=text, analyzer_results=recognizer_result,
                                               operators=operators)
        return anonymized_text.text

main.py

from data_masking import DataMasker

# Sample inputs used while reproducing the packaging problem, e.g.
#   '张三 ,350582277707175720 我在***路这里'
DEFAULT_TEXT = '那后面的地址也要加后溪镇?'

# Use the sample only when nothing is entered. Previously the hard-coded
# sample unconditionally overwrote whatever the user had typed.
# NOTE(review): input() needs a console; a --windowed build presumably has
# no stdin — confirm before shipping such a build.
text = input('输入文本') or DEFAULT_TEXT
text = DataMasker.run(text)
print(text)

1.使用命令

pyinstaller --onefile --windowed main.py

打包后报错

requests.exceptions.ConnectionError: HTTPSConnectionPool(host='raw.githubuse....

2.另起一个项目测试了一下,没有激活当前虚拟环境,用其他python版本运行就会报这个错误
当前虚拟环境3.11.5下执行

pip install spacy 
python -m spacy download zh_core_web_trf
pip install presidio_analyzer
pip install presidio_anonymizer

再去运行是不会报错的

在测试项目里继续执行打包

pyinstaller --onefile --windowed main.py

还是出现一样的错误

Traceback (most recent call last):
  File "urllib3\connection.py", line 199, in _new_conn
  File "urllib3\util\connection.py", line 60, in create_connection
  File "socket.py", line 962, in getaddrinfo
socket.gaierror: [Errno 11004] getaddrinfo failed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "urllib3\connectionpool.py", line 789, in urlopen
  File "urllib3\connectionpool.py", line 490, in _make_request
  File "urllib3\connectionpool.py", line 466, in _make_request
  File "urllib3\connectionpool.py", line 1095, in _validate_conn
  File "urllib3\connection.py", line 693, in connect
  File "urllib3\connection.py", line 206, in _new_conn
urllib3.exceptions.NameResolutionError: <urllib3.connection.HTTPSConnection object at 0x000001F908268B50>: Failed to resolve 'raw.githubusercontent.com' ([Errno 11004] getaddrinfo failed)

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "requests\adapters.py", line 667, in send
  File "urllib3\connectionpool.py", line 843, in urlopen
  File "urllib3\util\retry.py", line 519, in increment
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /explosion/spacy-models/master/compatibility.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001F908268B50>: Failed to resolve 'raw.githubusercontent.com' ([Errno 11004] getaddrinfo failed)"))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "main.py", line 1, in <module>
  File "<frozen importlib._bootstrap>", line 1176, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1147, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 690, in _load_unlocked
  File "PyInstaller\loader\pyimod02_importers.py", line 378, in exec_module
  File "data_masking.py", line 12, in <module>
  File "presidio_analyzer\nlp_engine\nlp_engine_provider.py", line 100, in create_engine
  File "presidio_analyzer\nlp_engine\spacy_nlp_engine.py", line 57, in load
  File "presidio_analyzer\nlp_engine\spacy_nlp_engine.py", line 64, in _download_spacy_model_if_needed
  File "spacy\cli\download.py", line 85, in download
  File "spacy\cli\download.py", line 130, in get_compatibility
  File "requests\api.py", line 73, in get
  File "requests\api.py", line 59, in request
  File "requests\sessions.py", line 589, in request
  File "requests\sessions.py", line 703, in send
  File "requests\adapters.py", line 700, in send
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /explosion/spacy-models/master/compatibility.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001F908268B50>: Failed to resolve 'raw.githubusercontent.com' ([Errno 11004] getaddrinfo failed)"))

3.开启VPN再启动没有反应,启动不了,可能是在下载模型

4.再次尝试在spec中添加data


a = Analysis(
    ['main.py'],
    pathex=[],
    binaries=[],
    datas=[
        # (source, destination directory inside the bundle).
        # Raw string: the Windows path contains backslashes that are not
        # valid escape sequences. The destination names the package folder
        # so the model's files are not spilled into the bundle root.
        (r'.\.venv\Lib\site-packages\zh_core_web_trf', 'zh_core_web_trf'),
    ],
    hiddenimports=[],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    noarchive=False,
    optimize=0,
)

使用 pyinstaller main.spec 打包后运行,一直出现下面的提示(没有继续等待,不确定模型最终会不会下载下来):

 Skipping pipeline package dependencies and setting `--no-deps`. You
don't seem to have the spaCy package itself installed (maybe because you've
built from source?), so installing the package dependencies would cause spaCy to
be downloaded, which probably isn't what you want. If the pipeline package has
other dependencies, you'll have to install them manually.

5.尝试取消使用 --onefile,并显示控制台(去掉 --windowed)
将代码改成

pyinstaller main.py
import zh_core_web_trf
from presidio_analyzer.nlp_engine import SpacyNlpEngine


class LoadedSpacyNlpEngine(SpacyNlpEngine):
    """SpacyNlpEngine that wraps an already-loaded spaCy pipeline.

    The pipeline is stored in ``self.nlp`` under *lang_code*, so the model is
    taken from the imported package object rather than looked up by name.
    """

    def __init__(self, loaded_spacy_model, lang_code="zh"):
        super().__init__()
        self.nlp = {lang_code: loaded_spacy_model}


# Importing the model package explicitly also lets PyInstaller see the
# dependency during analysis.
nlp = zh_core_web_trf.load()
nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model=nlp)
analyzer = AnalyzerEngine(supported_languages=["zh"], nlp_engine=nlp_engine)

https://stackoverflow.com/questions/66495437/cant-find-spacy-model-when-packaging-with-pyinstaller
image
出现错误
image
相同错误:https://github.com/explosion/spaCy/discussions/13423

出现原因是pyinstaller没有将这个打包进去
image
image

  1. 创建hook-spacy.py
    参考 https://www.airchip.org.cn/index.php/2024/04/17/spacy-to-pyinstaller/
    尝试创建hook-spacy.py
from PyInstaller.utils.hooks import collect_data_files, collect_submodules, copy_metadata

# Packages that are loaded dynamically and therefore need their submodules,
# data files and distribution metadata bundled explicitly.
# NOTE(review): spaCy resolves installed pipelines and plugin registrations
# through package metadata — confirm each name below is installed in the
# build environment, otherwise the corresponding call fails at build time.
_PACKAGES = ('spacy', 'spacy_curated_transformers', 'zh_core_web_trf')

# spaCy itself: data files (including .py files) and all submodules.
datas = collect_data_files('spacy', include_py_files=True)
hiddenimports = collect_submodules('spacy')

# Tokenizer data for the pkuseg segmenter.
datas += collect_data_files('spacy_pkuseg')

# Model weights/config and the curated-transformers plugin data.
datas += collect_data_files('zh_core_web_trf')
datas += collect_data_files('spacy_curated_transformers')

# The model and the plugin are imported by name at runtime, so their modules
# must be listed as hidden imports too (data files alone are not enough).
hiddenimports += collect_submodules('zh_core_web_trf')
hiddenimports += collect_submodules('spacy_curated_transformers')

# Distribution metadata for spaCy, the plugin and the model; without the
# model's metadata it does not look "installed" inside the bundle.
for _package in _PACKAGES:
    datas += copy_metadata(_package)

使用 pyinstaller main.py 打包还是不行,结果一样:先报错 _internal 目录下找不到 zh_core_web_trf 文件,手动放进去后又报错找不到 curated_transformer

pyinstaller --onefile --additional-hooks-dir=. main.py 继续报错

参考:https://blog.csdn.net/chenhao0515/article/details/141931419?spm=1001.2101.3001.6650.2&utm_medium=distribute.pc_relevant.none-task-blog-2~default~YuanLiJiHua~Position-2-141931419-blog-105755049.235^v43^pc_blog_bottom_relevance_base5&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2~default~YuanLiJiHua~Position-2-141931419-blog-105755049.235^v43^pc_blog_bottom_relevance_base5&utm_relevant_index=5

https://blog.csdn.net/lichaobxd/article/details/105404527

https://cloud.tencent.com/developer/ask/sof/108553619

作者:Gim

出处:https://www.cnblogs.com/Gimm/p/18457473

版权:本作品采用「署名-非商业性使用-相同方式共享 4.0 国际」许可协议进行许可。

posted @   踩坑大王  阅读(7)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· 互联网不景气了那就玩玩嵌入式吧,用纯.NET开发并制作一个智能桌面机器人(四):结合BotSharp
· 一个基于 .NET 开源免费的异地组网和内网穿透工具
· 《HelloGitHub》第 108 期
· Windows桌面应用自动更新解决方案SharpUpdater5发布
· 我的家庭实验室服务器集群硬件清单
more_horiz
keyboard_arrow_up dark_mode palette
选择主题
点击右上角即可分享
微信分享提示