
Simple Language Detection with Naive Bayes

1. Preparing the Data

Machine learning algorithms can't achieve good results without data, so the first step is to pull some text from the web for testing, starting with English.
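The post doesn't show the download step itself; as a rough sketch, any plain-text English source will do. The URL below points to an illustrative public-domain novel, not necessarily the text actually used:

import requests  # assumed dependency, only for this download sketch

# Illustrative only: fetch a plain-text book and save it as en_data.txt
resp = requests.get('https://www.gutenberg.org/files/1342/1342-0.txt')
resp.raise_for_status()
with open('en_data.txt', 'w', encoding='utf8') as f:
    f.write(resp.text)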

I pulled English text from an online novel site; the code below cleans the novel's sentences into the form we want:

import re

with open('en_data.txt', 'r', encoding='utf8') as f:  # en_data.txt holds the raw novel text
    data = f.read().split('\n')
    lists = []
    p = re.compile(r"[\w -]+")  # keep runs of word characters, spaces and hyphens
    for i in data:
        l = p.findall(i)
        lists.append(l)

with open('data.csv', 'w', encoding='utf8') as f:
    for i in lists:
        if i:
            for b in i:
                if len(b) > 20:  # skip fragments that are too short
                    e = str(b).strip()
                    e = e.strip(',.')
                    d = e + ',en\n'  # label each sample with "en"
                    f.write(d)
                    print(d)
                    print(d)

Next, the Chinese data: I collected a large number of Chinese articles from a news site and processed them the same way:

with open('ch_data.txt', 'r', encoding='utf8') as f:  # ch_data.txt holds the raw news articles
    data = f.read().split('\n')
    lists = []
    p = re.compile(r"[\w]+[。,?!]")  # grab clauses that end in Chinese punctuation
    for i in data:
        l = p.findall(i)
        lists.append(l)

with open('data.csv', 'a', encoding='utf8') as f:  # append after the English rows
    for i in lists:
        if i:
            for b in i:
                if len(b) > 10:  # skip fragments that are too short
                    e = str(b).strip()
                    e = e.strip(',。')
                    d = e + ',ch\n'  # label each sample with "ch"
                    f.write(d)
                    print(d)

After both passes we have a labelled data.csv. It is best to keep the amount of Chinese data roughly equal to the amount of English data so the classes stay balanced.
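Each line of data.csv is one sample: the cleaned text, a comma, and a two-letter language label. The two rows below are made up purely to illustrate the format; the real contents depend on your source texts:

It was a bright cold day in April and the clocks were striking,en
今天上午全市多条地铁线路已经恢复正常运营,ch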

Now create a language detection class:

import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


class LanguageDetector():
    def __init__(self, classifier=MultinomialNB()):
        self.classifier = classifier
        # Extract useful features from the de-noised text: 1-gram and 2-gram count statistics
        self.vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=1000, preprocessor=self._remove_noise)
        """
        Full parameter set, for reference:
        vec = CountVectorizer(
            lowercase=True,     # lowercase the text
            analyzer='char_wb', # tokenise by character ngrams
            ngram_range=(1,2),  # use ngrams of size 1 and 2
            max_features=1000,  # keep the most common 1000 ngrams
            preprocessor=remove_noise
        )
        """

    def _remove_noise(self, document):  # strip noise such as URLs, @mentions and #hashtags
        noise_pattern = re.compile("|".join([r"http\S+", r"@\w+", r"#\w+"]))
        clean_text = re.sub(noise_pattern, "", document)
        return clean_text

    def features(self, X):
        return self.vectorizer.transform(X)

    def fit(self, X, y):
        self.vectorizer.fit(X)
        self.classifier.fit(self.features(X), y)  # train the classifier

    def predict(self, x):  # predict the language of a single string
        return self.classifier.predict(self.features([x]))

    def score(self, X, y):  # accuracy on a labelled test set
        return self.classifier.score(self.features(X), y)
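Note that the commented-out parameter list above mentions analyzer='char_wb', while the constructor actually uses the default word-level analyzer. For language identification, character n-grams often work better on short inputs. A minimal sketch of swapping in a character-level vectorizer (the analyzer and feature count are illustrative choices, not settings from the original post):

# Sketch: a character-level vectorizer; swap it in before calling fit().
char_detector = LanguageDetector()
char_detector.vectorizer = CountVectorizer(
    lowercase=True,       # lowercase the text
    analyzer='char_wb',   # character n-grams within word boundaries
    ngram_range=(1, 2),   # character unigrams and bigrams
    max_features=1000,    # keep the 1000 most frequent n-grams
    preprocessor=char_detector._remove_noise,  # keep the same noise stripping
)
# char_detector can then be fitted and scored exactly like the default detector below.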

Testing it out:

in_f = open('data.csv', 'r', encoding='utf8')
lines = in_f.readlines()
in_f.close()
dataset = [(line.strip()[:-3], line.strip()[-2:]) for line in lines]  # split each line into a (text, label) tuple
# print(dataset)
# split the dataset into a training set and a test set
x, y = zip(*dataset)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

language_detector = LanguageDetector()
language_detector.fit(x_train, y_train)
print(language_detector.predict(
    'Life is a journey. What we should care about is not where its headed but what we see and how we feel. '))
print(language_detector.score(x_test, y_test))

 # ['en']  predicted as English
 # 0.9849048348282652  accuracy on the test set
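As a sanity check in the other direction, one can also feed the detector a Chinese sentence; the sentence below is made up, and the expected ['ch'] output assumes a model trained on the data.csv built above:

print(language_detector.predict('今天上午我们一起去公园散步'))
 # ['ch']  predicted as Chinese (expected, given the training data above)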

The full code:

# -*- coding: utf-8 -*-
# @Author  : FELIX
# @Date    : 2018/3/28 11:05

import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


class LanguageDetector():
    def __init__(self, classifier=MultinomialNB()):
        self.classifier = classifier
        # Extract useful features from the de-noised text: 1-gram and 2-gram count statistics
        self.vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=1000, preprocessor=self._remove_noise)
        """
        Full parameter set, for reference:
        vec = CountVectorizer(
            lowercase=True,     # lowercase the text
            analyzer='char_wb', # tokenise by character ngrams
            ngram_range=(1,2),  # use ngrams of size 1 and 2
            max_features=1000,  # keep the most common 1000 ngrams
            preprocessor=remove_noise
        )
        """

    def _remove_noise(self, document):  # strip noise such as URLs, @mentions and #hashtags
        noise_pattern = re.compile("|".join([r"http\S+", r"@\w+", r"#\w+"]))
        clean_text = re.sub(noise_pattern, "", document)
        return clean_text

    def features(self, X):
        return self.vectorizer.transform(X)

    def fit(self, X, y):
        self.vectorizer.fit(X)
        self.classifier.fit(self.features(X), y)  # train the classifier

    def predict(self, x):  # predict the language of a single string
        return self.classifier.predict(self.features([x]))

    def score(self, X, y):  # accuracy on a labelled test set
        return self.classifier.score(self.features(X), y)


in_f = open('data.csv', 'r', encoding='utf8')
lines = in_f.readlines()
in_f.close()
dataset = [(line.strip()[:-3], line.strip()[-2:]) for line in lines]  # split each line into a (text, label) tuple
# print(dataset)
# split the dataset into a training set and a test set
x, y = zip(*dataset)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

language_detector = LanguageDetector()
language_detector.fit(x_train, y_train)
print(language_detector.predict(
    'Life is a journey. What we should care about is not where its headed but what we see and how we feel. '))
print(language_detector.score(x_test, y_test))

 
