
Simple Language Detection with Naive Bayes

1. Preparing the Data

Machine learning algorithms can't achieve good results without data, so the first step is to pull some text from the web for testing, starting with English.
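The post doesn't show the download step itself; as a rough sketch, any plain-text English source will do. The URL below points to an illustrative public-domain novel, not necessarily the text actually used:

import requests  # assumed dependency, only for this download sketch

# Illustrative only: fetch a plain-text book and save it as en_data.txt
resp = requests.get('https://www.gutenberg.org/files/1342/1342-0.txt')
resp.raise_for_status()
with open('en_data.txt', 'w', encoding='utf8') as f:
    f.write(resp.text)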

I pulled English text from an online novel site; the code below cleans the novel's sentences into the form we want:

import re

with open('en_data.txt', 'r', encoding='utf8') as f:  # en_data.txt holds the raw novel text
    data = f.read().split('\n')
    lists = []
    p = re.compile(r"[\w -]+")  # keep runs of word characters, spaces and hyphens
    for i in data:
        l = p.findall(i)
        lists.append(l)

with open('data.csv', 'w', encoding='utf8') as f:
    for i in lists:
        if i:
            for b in i:
                if len(b) > 20:  # skip fragments that are too short
                    e = str(b).strip()
                    e = e.strip(',.')
                    d = e + ',en\n'  # label each sample with "en"
                    f.write(d)
                    print(d)
                    print(d)

Next, the Chinese data: I collected a large number of Chinese articles from a news site and processed them the same way:

with open('ch_data.txt', 'r', encoding='utf8') as f:  # ch_data.txt holds the raw news articles
    data = f.read().split('\n')
    lists = []
    p = re.compile(r"[\w]+[。,?!]")  # grab clauses that end in Chinese punctuation
    for i in data:
        l = p.findall(i)
        lists.append(l)

with open('data.csv', 'a', encoding='utf8') as f:  # append after the English rows
    for i in lists:
        if i:
            for b in i:
                if len(b) > 10:  # skip fragments that are too short
                    e = str(b).strip()
                    e = e.strip(',。')
                    d = e + ',ch\n'  # label each sample with "ch"
                    f.write(d)
                    print(d)

After both passes we have a labelled data.csv. It is best to keep the amount of Chinese data roughly equal to the amount of English data so the classes stay balanced.
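Each line of data.csv is one sample: the cleaned text, a comma, and a two-letter language label. The two rows below are made up purely to illustrate the format; the real contents depend on your source texts:

It was a bright cold day in April and the clocks were striking,en
今天上午全市多条地铁线路已经恢复正常运营,ch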

Now create a language detection class:

import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


class LanguageDetector():
    def __init__(self, classifier=MultinomialNB()):
        self.classifier = classifier
        # Extract useful features from the de-noised text: 1-gram and 2-gram count statistics
        self.vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=1000, preprocessor=self._remove_noise)
        """
        Full parameter set, for reference:
        vec = CountVectorizer(
            lowercase=True,     # lowercase the text
            analyzer='char_wb', # tokenise by character ngrams
            ngram_range=(1,2),  # use ngrams of size 1 and 2
            max_features=1000,  # keep the most common 1000 ngrams
            preprocessor=remove_noise
        )
        """

    def _remove_noise(self, document):  # strip noise such as URLs, @mentions and #hashtags
        noise_pattern = re.compile("|".join([r"http\S+", r"@\w+", r"#\w+"]))
        clean_text = re.sub(noise_pattern, "", document)
        return clean_text

    def features(self, X):
        return self.vectorizer.transform(X)

    def fit(self, X, y):
        self.vectorizer.fit(X)
        self.classifier.fit(self.features(X), y)  # train the classifier

    def predict(self, x):  # predict the language of a single string
        return self.classifier.predict(self.features([x]))

    def score(self, X, y):  # accuracy on a labelled test set
        return self.classifier.score(self.features(X), y)
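Note that the commented-out parameter list above mentions analyzer='char_wb', while the constructor actually uses the default word-level analyzer. For language identification, character n-grams often work better on short inputs. A minimal sketch of swapping in a character-level vectorizer (the analyzer and feature count are illustrative choices, not settings from the original post):

# Sketch: a character-level vectorizer; swap it in before calling fit().
char_detector = LanguageDetector()
char_detector.vectorizer = CountVectorizer(
    lowercase=True,       # lowercase the text
    analyzer='char_wb',   # character n-grams within word boundaries
    ngram_range=(1, 2),   # character unigrams and bigrams
    max_features=1000,    # keep the 1000 most frequent n-grams
    preprocessor=char_detector._remove_noise,  # keep the same noise stripping
)
# char_detector can then be fitted and scored exactly like the default detector below.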

Testing it out:

in_f = open('data.csv', 'r', encoding='utf8')
lines = in_f.readlines()
in_f.close()
dataset = [(line.strip()[:-3], line.strip()[-2:]) for line in lines]  # split each line into a (text, label) tuple
# print(dataset)
# split the dataset into a training set and a test set
x, y = zip(*dataset)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

language_detector = LanguageDetector()
language_detector.fit(x_train, y_train)
print(language_detector.predict(
    'Life is a journey. What we should care about is not where its headed but what we see and how we feel. '))
print(language_detector.score(x_test, y_test))

 # ['en']  predicted as English
 # 0.9849048348282652  accuracy on the test set
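As a sanity check in the other direction, one can also feed the detector a Chinese sentence; the sentence below is made up, and the expected ['ch'] output assumes a model trained on the data.csv built above:

print(language_detector.predict('今天上午我们一起去公园散步'))
 # ['ch']  predicted as Chinese (expected, given the training data above)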

The full code:

# -*- coding: utf-8 -*-
# @Author  : FELIX
# @Date    : 2018/3/28 11:05

import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


class LanguageDetector():
    def __init__(self, classifier=MultinomialNB()):
        self.classifier = classifier
        # Extract useful features from the de-noised text: 1-gram and 2-gram count statistics
        self.vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=1000, preprocessor=self._remove_noise)
        """
        Full parameter set, for reference:
        vec = CountVectorizer(
            lowercase=True,     # lowercase the text
            analyzer='char_wb', # tokenise by character ngrams
            ngram_range=(1,2),  # use ngrams of size 1 and 2
            max_features=1000,  # keep the most common 1000 ngrams
            preprocessor=remove_noise
        )
        """

    def _remove_noise(self, document):  # strip noise such as URLs, @mentions and #hashtags
        noise_pattern = re.compile("|".join([r"http\S+", r"@\w+", r"#\w+"]))
        clean_text = re.sub(noise_pattern, "", document)
        return clean_text

    def features(self, X):
        return self.vectorizer.transform(X)

    def fit(self, X, y):
        self.vectorizer.fit(X)
        self.classifier.fit(self.features(X), y)  # train the classifier

    def predict(self, x):  # predict the language of a single string
        return self.classifier.predict(self.features([x]))

    def score(self, X, y):  # accuracy on a labelled test set
        return self.classifier.score(self.features(X), y)


in_f = open('data.csv', 'r', encoding='utf8')
lines = in_f.readlines()
in_f.close()
dataset = [(line.strip()[:-3], line.strip()[-2:]) for line in lines]  # split each line into a (text, label) tuple
# print(dataset)
# split the dataset into a training set and a test set
x, y = zip(*dataset)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

language_detector = LanguageDetector()
language_detector.fit(x_train, y_train)
print(language_detector.predict(
    'Life is a journey. What we should care about is not where its headed but what we see and how we feel. '))
print(language_detector.score(x_test, y_test))

 
