django_实现朴素/基本模糊拼写候选/纠错
这只是一个粗糙的玩具,不具备智能性
使用到的拼写数据库支持(一角)

数据库模型
from django.db.models.functions import Length

# Register the Length transform on CharField so that lookups such as
# `spelling__length__gte` / `char_set__length__lte` can be used in filters.
CharField.register_lookup(Length)


class WordMatcher(models.Model):
    """Lookup table used for fuzzy spelling matching.

    When the word dictionary is upgraded, this fuzzy-matching table must be
    upgraded along with it.
    """

    # The word itself.
    spelling = models.CharField(max_length=255)
    # The characters that occur in `spelling`, stored as a string.
    char_set = models.CharField(max_length=26)

    def __str__(self):
        """Return the list-style text ``[spelling, char_set]`` for this row."""
        return f"[{self.spelling!r}, {self.char_set!r}]"
Words词典

char_set字段的计算(数据库的产生)
wob = Word.objects


class UpdateWordMatcher:
    """Builds fuzzy-matching rows (spelling + character set) from the Word table."""

    def update(self, limit=2):
        """Create one matcher row per word taken from the Word table.

        For every selected word the distinct characters of its spelling are
        sorted and joined into a string, which is stored as ``char_set``
        next to the spelling.

        :param limit: maximum number of words to process. The default of 2
            keeps the original behaviour (only the first two words); pass
            ``None`` to process every word.
        """
        words = wob.all()
        if limit is not None:
            words = words[:limit]

        for word in words:
            # Sorted, de-duplicated characters of the spelling, e.g. "apple" -> "aelp".
            chars_str = "".join(sorted(set(word.spelling)))
            print(chars_str)
            # NOTE(review): `wmob` is not defined in this snippet; presumably it
            # is `WordMatcher.objects` defined elsewhere in the module - confirm.
            wmob.create(spelling=word.spelling, char_set=chars_str)
| |
| |
序列化器
class WordMatcherModelSerializer(ModelSerializer):
    """Serializer exposing every field of the WordMatcher model."""

    class Meta:
        model = WordMatcher
        fields = "__all__"
参考代码
Serializer部分使用了Django REST framework(DRF)的序列化器
class WordMatcherViewSet(ModelViewSet):
    """Endpoints for the fuzzy-matching table, including naive fuzzy lookup."""

    wmob = WordMatcher.objects
    queryset = wmob.all()
    serializer_class = WordMatcherModelSerializer

    @staticmethod
    def _is_candidate(spelling, spelling_char_set, item):
        """Return True when `item` is an acceptable fuzzy match for `spelling`.

        Inputs of 5+ characters need the shared characters to cover at least
        80% of both character sets; shorter inputs need the candidate to
        contain every input character and to have the same length.
        """
        shared = set(item.char_set) & spelling_char_set
        if len(spelling) >= 5:
            shared_len = len(shared)
            return (
                shared_len >= len(spelling_char_set) * 0.8
                and shared_len >= len(item.char_set) * 0.8
            )
        return shared == spelling_char_set and len(item.spelling) == len(spelling)

    def fuzzy_match(self, req, spelling, start_with=0):
        """Return stored words that look like plausible corrections of `spelling`.

        The candidates are narrowed in the database by spelling length, by a
        shared prefix and by character-set size, and are then checked one by
        one for character overlap in Python.

        :param req: the incoming request (not used by the matching logic)
        :param spelling: the possibly misspelled word to look up
        :type spelling: str
        :param start_with: number of leading characters a candidate must share
            with `spelling`; 0 (the default) means "not forced" and is replaced
            by 2 for inputs longer than 4 characters, otherwise by 1
        :type start_with: int
        :return: serialized matching rows, sorted by spelling
        :rtype: Response
        """
        spelling_len = len(spelling)

        if start_with == 0:
            start_with = 2 if spelling_len > 4 else 1

        spelling_char_set = set(spelling)
        spelling_char_set_len = len(spelling_char_set)

        # Accepted candidate length range, relative to the input length.
        left_len = spelling_len * 0.70
        right_len = spelling_len * 2 if spelling_len >= 4 else spelling_len * 1.4

        # get_queryset() is used instead of the class-level `queryset`
        # attribute, as DRF recommends for code inside view methods.
        queryset = self.get_queryset().filter(
            # limit the word length
            spelling__length__gte=left_len,
            spelling__length__lte=right_len,
            # require the common prefix
            spelling__startswith=spelling[:start_with],
            # limit the difference in character-set size
            char_set__length__lte=1.25 * spelling_char_set_len,
            char_set__length__gte=0.6 * spelling_char_set_len,
        )

        # Final step: compare the actual character composition.
        items = [
            item
            for item in queryset
            if self._is_candidate(spelling, spelling_char_set, item)
        ]
        items.sort(key=lambda x: x.spelling)
        return Res(self.serializer_class(instance=items, many=True).data)

    def fuzzy_match_simple(self, req, spelling):
        """Fuzzy-match `spelling` with the prefix length chosen automatically."""
        return self.fuzzy_match(req, spelling)
| |
路由
# Fuzzy-match routes: the first lets the view pick the prefix length, the
# second takes it from the URL as `start_with`.
path(
    "fuzzy/<str:spelling>/",
    views.WordMatcherViewSet.as_view({"get": "fuzzy_match_simple"}),
),
path(
    "fuzzy/<str:spelling>/<int:start_with>/",
    views.WordMatcherViewSet.as_view({"get": "fuzzy_match"}),
),
api基本效果
eg0:

eg1:

eg2:
GET http://127.0.0.1:8000/word/fuzzy/fhather/1

项目地址
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码,我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 记一次.NET内存居高不下排查解决与启示
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了