纠错api

文章目录

django_实现朴素/基本模糊拼写候选/纠错
- 项目地址

django_实现朴素/基本模糊拼写候选/纠错

这只是一个粗糙的玩具,不具备智能性

使用到的拼写数据库支持(一角)

在这里插入图片描述

数据库模型

 from django.db.models.functions import Length
CharField.register_lookup(Length)
 
class WordMatcher(models.Model):
    """词典升级的时候,模糊匹配的词典也需要一并升级!!!!"""
    spelling = models.CharField(max_length=255)
    char_set = models.CharField(max_length=26)
    # word_length = models.IntegerField(default=0)
    #    django提供了类似的Length的数据库函数,长度子字段可以不需要存储(本身也不是一个好主意)
    def __str__(self):
        return str([self.spelling,self.char_set])

Words词典

在这里插入图片描述

char_set字段的计算(数据库的产生)

 wob = Word.objects
class UpdateWordMatcher:
	
    # 填充单词模糊匹配数据库支持(模糊推荐算法)
    # 测试两个例子
    def update(self):
        # sub_dict_set = wob.all()
        sub_dict_set = wob.all()[:2]
        for item in sub_dict_set:
            # print(item)
            char_set = set(item.spelling)
            chars = list(char_set)
            chars.sort()
            chars_str = "".join(chars)
            # return chars_str
            print(chars_str)
            d = {"spelling": item.spelling, "char_set": chars_str}
            wmob.create(**d)
            # chars=list(char_set).sort()
            # print(chars)

序列化器

 class WordMatcherModelSerializer(ModelSerializer):
    class Meta:
        model=WordMatcher
        fields = "__all__"

参考代码

Serialzier部分是使用了Django_DRF框架的序列化器

 class WordMatcherViewSet(ModelViewSet):
    """ 模糊匹配数据库"""
    wmob = WordMatcher.objects
    queryset = wmob.all()
    serializer_class = WordMatcherModelSerializer
    #filter_fields = ['spelling', 'char_set']
    def fuzzy_match(self, req, spelling, start_with=0):
        """
 
        :param req:
        :type req:
        :param spelling:
        :type spelling:
        :param start_with:匹配开头的字符串长度 (default: {0},表示没有被强制规定)
        :type start_with:
        :return:
        :rtype: Response
        """
 
        spelling_len = len(spelling)
        # 根据模糊拼写的长度,选择一个较为合适的start_with
        if(start_with==0):
            #没有被强制设值
            # 启用内部判断取值
            if(spelling_len>4):
                start_with=2
            else:
                start_with=1
        # 获得模糊拼写的字母集合
        spelling_char_set = set(spelling)
        # 获取模糊拼写的字母集列表(字母集合去重后转换为列表)
        spelling_char_list = list(spelling_char_set)
        # 对字母集合转成的列表进行排序
        spelling_char_list.sort()
        spelling_char_set_str = "".join(spelling_char_set)
        spelling_char_set_len = len(spelling_char_set)
        # print("@spelling_chars:", spelling_char_set_str)
        # 定义匹配的单词的长度范围
        left_len = spelling_len * 0.70
        # right_len = spelling_len * 1.25
        right_len = spelling_len * 1.4
        if spelling_len >= 4:
            right_len = spelling_len * 2
 
        # 模糊匹配
        # queryset = self.get_queryset().filter(spelling__length__gte=left_len) & self.get_queryset().filter(
        #     spelling__length__lte=right_len) & self.queryset.filter(char_set__contains=spelling_chars)
       
        """限制单词长度"""
        queryset = self.queryset.filter(spelling__length__gte=left_len) & self.queryset.filter(
            spelling__length__lte=right_len)
        # 匹配开头(严格模式)(可以额外设置变量,追加if)
        # print(queryset)
        # 匹配发音:mysql中有一个soundx函数,
        queryset = queryset.filter(spelling__startswith=spelling[:start_with])
        """限制单词字符集规模的差异"""
        # 10:14(5:7);
        # 10:16(5:8);
        queryset = queryset.filter(char_set__length__lte=1.25 * spelling_char_set_len)
        # 3:5;
        # 4:5
        queryset = queryset.filter(char_set__length__gte=0.6 * spelling_char_set_len)
 
        """匹配字符组成(最后一步)"""
        # 方案0:使用icontains函数来匹配(无法匹配到替换了个别字母的形近词!
        ## 可以引入随机剔除字符操作
        # 方案1:双向包含(或运算)(比上衣种情况稍好,但还是无法满足需求)
        # 方案2:差集(限制差集中的元素个数)(容错个别字母(种类)的不同)
        # queryin = queryset.filter(char_set__in=spelling_chars)
        # print("@queryin:", queryin)
        # queryset = queryset.filter(Q(char_set__contains=spelling_chars) | Q(char_set__in=spelling_chars))
        items = []
        for item in queryset:
            item_char_set_len = len(item.char_set)
            item_spelling_len = len(item.spelling)
            intersection = set(item.char_set) & set(spelling_char_set)
            intersection_len = len(intersection)
            # if (item.spelling == "dad"):
            #     print("dad", diff, len(diff), item, spelling_chars_len * 0.5)
            #     # print()
            if (spelling_len >= 5):
                if (intersection_len >= spelling_char_set_len * 0.8 and intersection_len >= item_char_set_len * 0.8):
                    # if (item.spelling == "dad"):
                    # print(item, diff)
                    items.append(item)
            elif (intersection == spelling_char_set):
                #     长度小于5的输入,我们只需要检查是否包含全部字符即可
                # else:
                print("@intersection", intersection)
                print("@spelling_char_set", spelling_char_set)
                print(item, intersection, spelling_char_set_len)
                if (item_spelling_len == spelling_len):
                    items.append(item)
 
        # print(queryset)
        items.sort(key=lambda x:x.spelling)
        print(len(items))
        return Res(self.serializer_class(instance=items, many=True).data)
        # return Res(self.serializer_class(instance=queryset, many=True).data)
 
    def fuzzy_match_simple(self, req, spelling):
        return self.fuzzy_match(req, spelling)

路由

     path('fuzzy/<str:spelling>/', views.WordMatcherViewSet.as_view({
        "get": "fuzzy_match_simple"
    })),
    path('fuzzy/<str:spelling>/<int:start_with>/', views.WordMatcherViewSet.as_view({
        "get": "fuzzy_match"
    })),

api基本效果

eg0:

在这里插入图片描述

eg1:

在这里插入图片描述

eg2

GEThttp://127.0.0.1:8000/word/fuzzy/fhather/1
在这里插入图片描述

项目地址

posted @ 2024-09-23 14:06 xuchaoxin1375 阅读(8) 评论(0) 编辑收藏举报来源

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

· django_路由方式以及路由规则的编写routing approach&RESTful api development

· Django

· django基础部分

· django组件

阅读排行：
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码，我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了，比商业数据库还牛
· 记一次.NET内存居高不下排查解决与启示
· 白话解读 Dapr 1.15：你的「微服务管家」又秀新绝活了

公告

昵称： xuchaoxin1375
园龄： 4年10个月
粉丝： 1
关注： 0

+加关注

2025年3月

日

一

二

三

四

五

六

xuchaoxin1375

django_实现朴素/基本模糊单词拼写候选/纠错api

文章目录

django_实现朴素/基本模糊拼写候选/纠错

使用到的拼写数据库支持(一角)

数据库模型

Words词典

char_set字段的计算(数据库的产生)

序列化器

参考代码

路由

api基本效果

eg0:

eg1:

eg2

项目地址

公告

搜索

常用链接

随笔档案

阅读排行榜

推荐排行榜

	from django.db.models.functions import Length
	CharField.register_lookup(Length)

	class WordMatcher(models.Model):
	"""词典升级的时候,模糊匹配的词典也需要一并升级!!!!"""
	spelling = models.CharField(max_length=255)
	char_set = models.CharField(max_length=26)
	# word_length = models.IntegerField(default=0)
	# django提供了类似的Length的数据库函数,长度子字段可以不需要存储(本身也不是一个好主意)
	def __str__(self):
	return str([self.spelling,self.char_set])

	wob = Word.objects
	class UpdateWordMatcher:

	# 填充单词模糊匹配数据库支持(模糊推荐算法)
	# 测试两个例子
	def update(self):
	# sub_dict_set = wob.all()
	sub_dict_set = wob.all()[:2]
	for item in sub_dict_set:
	# print(item)
	char_set = set(item.spelling)
	chars = list(char_set)
	chars.sort()
	chars_str = "".join(chars)
	# return chars_str
	print(chars_str)
	d = {"spelling": item.spelling, "char_set": chars_str}
	wmob.create(**d)
	# chars=list(char_set).sort()
	# print(chars)

	class WordMatcherModelSerializer(ModelSerializer):
	class Meta:
	model=WordMatcher
	fields = "__all__"

	class WordMatcherViewSet(ModelViewSet):
	""" 模糊匹配数据库"""
	wmob = WordMatcher.objects
	queryset = wmob.all()
	serializer_class = WordMatcherModelSerializer
	#filter_fields = ['spelling', 'char_set']
	def fuzzy_match(self, req, spelling, start_with=0):
	"""

	:param req:
	:type req:
	:param spelling:
	:type spelling:
	:param start_with:匹配开头的字符串长度 (default: {0},表示没有被强制规定)
	:type start_with:
	:return:
	:rtype: Response
	"""

	spelling_len = len(spelling)
	# 根据模糊拼写的长度,选择一个较为合适的start_with
	if(start_with==0):
	#没有被强制设值
	# 启用内部判断取值
	if(spelling_len>4):
	start_with=2
	else:
	start_with=1
	# 获得模糊拼写的字母集合
	spelling_char_set = set(spelling)
	# 获取模糊拼写的字母集列表(字母集合去重后转换为列表)
	spelling_char_list = list(spelling_char_set)
	# 对字母集合转成的列表进行排序
	spelling_char_list.sort()
	spelling_char_set_str = "".join(spelling_char_set)
	spelling_char_set_len = len(spelling_char_set)
	# print("@spelling_chars:", spelling_char_set_str)
	# 定义匹配的单词的长度范围
	left_len = spelling_len * 0.70
	# right_len = spelling_len * 1.25
	right_len = spelling_len * 1.4
	if spelling_len >= 4:
	right_len = spelling_len * 2

	# 模糊匹配
	# queryset = self.get_queryset().filter(spelling__length__gte=left_len) & self.get_queryset().filter(
	# spelling__length__lte=right_len) & self.queryset.filter(char_set__contains=spelling_chars)

	"""限制单词长度"""
	queryset = self.queryset.filter(spelling__length__gte=left_len) & self.queryset.filter(
	spelling__length__lte=right_len)
	# 匹配开头(严格模式)(可以额外设置变量,追加if)
	# print(queryset)
	# 匹配发音:mysql中有一个soundx函数,
	queryset = queryset.filter(spelling__startswith=spelling[:start_with])
	"""限制单词字符集规模的差异"""
	# 10:14(5:7);
	# 10:16(5:8);
	queryset = queryset.filter(char_set__length__lte=1.25 * spelling_char_set_len)
	# 3:5;
	# 4:5
	queryset = queryset.filter(char_set__length__gte=0.6 * spelling_char_set_len)

	"""匹配字符组成(最后一步)"""
	# 方案0:使用icontains函数来匹配(无法匹配到替换了个别字母的形近词!
	## 可以引入随机剔除字符操作
	# 方案1:双向包含(或运算)(比上衣种情况稍好,但还是无法满足需求)
	# 方案2:差集(限制差集中的元素个数)(容错个别字母(种类)的不同)
	# queryin = queryset.filter(char_set__in=spelling_chars)
	# print("@queryin:", queryin)
	# queryset = queryset.filter(Q(char_set__contains=spelling_chars) \| Q(char_set__in=spelling_chars))
	items = []
	for item in queryset:
	item_char_set_len = len(item.char_set)
	item_spelling_len = len(item.spelling)
	intersection = set(item.char_set) & set(spelling_char_set)
	intersection_len = len(intersection)
	# if (item.spelling == "dad"):
	# print("dad", diff, len(diff), item, spelling_chars_len * 0.5)
	# # print()
	if (spelling_len >= 5):
	if (intersection_len >= spelling_char_set_len * 0.8 and intersection_len >= item_char_set_len * 0.8):
	# if (item.spelling == "dad"):
	# print(item, diff)
	items.append(item)
	elif (intersection == spelling_char_set):
	# 长度小于5的输入,我们只需要检查是否包含全部字符即可
	# else:
	print("@intersection", intersection)
	print("@spelling_char_set", spelling_char_set)
	print(item, intersection, spelling_char_set_len)
	if (item_spelling_len == spelling_len):
	items.append(item)

	# print(queryset)
	items.sort(key=lambda x:x.spelling)
	print(len(items))
	return Res(self.serializer_class(instance=items, many=True).data)
	# return Res(self.serializer_class(instance=queryset, many=True).data)

	def fuzzy_match_simple(self, req, spelling):
	return self.fuzzy_match(req, spelling)

	path('fuzzy/<str:spelling>/', views.WordMatcherViewSet.as_view({
	"get": "fuzzy_match_simple"
	})),
	path('fuzzy/<str:spelling>/<int:start_with>/', views.WordMatcherViewSet.as_view({
	"get": "fuzzy_match"
	})),