团队项目冲刺第三天数据清洗一

数据清洗一

数据已经取出，然后是对数据的一个清洗

其中中文文本的分类

需要分词

还需要进行停用词的去除

以防对特征值的抽取造成过大影响

分词的作用是把文本切分成一个个词语，以便后续进行特征抽取

# encoding=utf-8                         #遍历文件，用ProsessofWords处理文件
from imp import reload
import jieba
import os
import numpy as np
import sys

# NOTE(review): `reload(sys)` looks like a Python 2 leftover (there it was
# typically followed by sys.setdefaultencoding). The visible code never uses
# `sys` afterwards and every open() passes an explicit encoding — confirm
# this call (and the `imp` import it needs) can be dropped.
reload(sys)


def EnumPathFiles(path, callback, stop_words_list):
    """Walk ``path`` recursively and invoke ``callback`` once per file.

    Args:
        path: Root directory to traverse.
        callback: Callable invoked as
            ``callback(root, filename, stop_words_list)`` for every file
            found under ``path``, where ``root`` is the directory that
            contains the file.
        stop_words_list: Passed through to ``callback`` unchanged.

    If ``path`` is not an existing directory, an error message is printed
    and the function returns without calling ``callback``.
    """
    if not os.path.isdir(path):
        print('Error:"', path, '" is not a directory or does not exist.')
        return

    # os.walk already descends into every subdirectory, so no explicit
    # recursion is needed. Recursing here as well would hand files in nested
    # directories to the callback more than once.
    for root, dirs, files in os.walk(path):
        for d in dirs:
            print(d)
        for f in files:
            callback(root, f, stop_words_list)

def ProsessofWords(textpath, stop_words_list):
    """Segment a text file with jieba and drop stop words, in place.

    The file at ``textpath`` is read as UTF-8, tokenized with
    ``jieba.cut(text, cut_all=False)``, and then overwritten with the
    remaining tokens, each followed by a single space.

    Args:
        textpath: Path of the UTF-8 text file to read and overwrite.
        stop_words_list: Collection of words to remove from the output.
    """
    with open(textpath, 'r', encoding='utf-8') as src:
        text = src.read()

    # The stop words are consulted once per token, so use a set for O(1)
    # membership tests instead of scanning a list each time.
    if isinstance(stop_words_list, (set, frozenset)):
        stop_words = stop_words_list
    else:
        stop_words = set(stop_words_list)

    # Tab tokens are dropped in addition to the stop words.
    kept = [
        word
        for word in jieba.cut(text, cut_all=False)
        if word != '\t' and word not in stop_words
    ]

    # Every kept token is followed by one space, including the last one.
    with open(textpath, 'w', encoding='utf-8') as dst:
        dst.write(''.join(word + ' ' for word in kept))


def callback1(path, filename, stop_words_list):
    """Per-file callback: print the file's full path and clean it in place.

    Args:
        path: Directory that contains the file.
        filename: Name of the file inside ``path``.
        stop_words_list: Stop words forwarded to ``ProsessofWords``.
    """
    # os.path.join picks the separator for the current platform; a
    # hard-coded backslash only forms a valid path on Windows.
    textpath = os.path.join(path, filename)
    print(textpath)
    ProsessofWords(textpath, stop_words_list)


if __name__ == '__main__':
    stopwords_file = "../stopword/stopword.txt"

    # Load the stop words: each line is stripped of surrounding whitespace
    # and blank lines are skipped. The `with` block closes the file even if
    # reading fails.
    with open(stopwords_file, "r", encoding='utf-8') as stop_f:
        stripped_lines = (line.strip() for line in stop_f)
        stop_words = [word for word in stripped_lines if word]
    print(len(stop_words))

    # Clean every file under the article directory in place.
    EnumPathFiles(r'../article', callback1, stop_words)

posted @ 2021-05-04 10:08 黄某人233 阅读(248) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

团队项目冲刺第三天 数据清洗 一

数据清洗 一

公告

团队项目冲刺第三天数据清洗一

数据清洗一