Text preprocessing: removing emoji and filtering against a specified text type

Batch-deleting the old keyword files

The keyword files from the previous run are no longer needed, so I wrote a script to batch-delete the files whose names end in keyword.txt.

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import glob
def read_from_file(directions):
    # Utility: read a text file, trying several candidate encodings in turn.
    # 'Error' is a sentinel marking the end of the list; reaching it means
    # every real encoding failed.
    decode_set = ['utf-8', 'gb18030', 'ISO-8859-2', 'gb2312', 'gbk', 'Error']
    for k in decode_set:
        try:
            file = open(directions, "r", encoding=k)
            readfile = file.read()  # raises UnicodeDecodeError on a wrong guess
            file.close()
            break  # decoded successfully, stop trying encodings
        except (UnicodeDecodeError, LookupError):
            if k == "Error":  # the sentinel was reached: give up
                raise Exception("%s had no way to decode" % directions)
            continue
    return readfile

filenames = glob.glob(r"D:/TXTandTXTkeyword/TXT/*keyword.txt")  # only files ending in keyword.txt
filenameslen = len(filenames)
count = 0
countprint = 0
for filename in filenames:
    countprint += 1
    if countprint == 10:  # print progress every 10 files
        print("\r%d : %d" % (count, filenameslen), end='')
        countprint = 0
    os.remove(filename)
    count += 1
print("%d : %d" % (count, filenameslen))
print("finished")

Removing emoji

In my earlier text-preprocessing work I only ran a simple jieba segmentation, and the results were poor: the output was full of garbled tokens. After some digging, the garbage turned out to come from two sources:

emoji on one hand, and characters outside the GBK encoding on the other.
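
As an aside (my addition, not part of the original pipeline): non-GBK characters, emoji included, can also be stripped directly with an encode/decode round trip. A minimal sketch:

# Hypothetical helper, not in the original scripts: drop every character
# that GBK cannot represent -- this removes emoji as a side effect while
# keeping Chinese and ASCII text intact.
def strip_non_gbk(text):
    return text.encode('gbk', errors='ignore').decode('gbk')

print(strip_non_gbk('hello \U0001F600 world'))  # -> 'hello  world'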

Since the text we are processing is English, both the emoji and the non-GBK characters have to go. One approach is to use wordnet from nltk.corpus to test whether each extracted keyword is a standard English word. The code is below:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from nltk.corpus import wordnet
import os
import glob
import jieba.analyse
def read_from_file(directions):
    # Utility: read a text file, trying several candidate encodings in turn.
    # 'Error' is a sentinel marking the end of the list; reaching it means
    # every real encoding failed.
    decode_set = ['utf-8', 'gb18030', 'ISO-8859-2', 'gb2312', 'gbk', 'Error']
    for k in decode_set:
        try:
            file = open(directions, "r", encoding=k)
            readfile = file.read()  # raises UnicodeDecodeError on a wrong guess
            file.close()
            break  # decoded successfully, stop trying encodings
        except (UnicodeDecodeError, LookupError):
            if k == "Error":  # the sentinel was reached: give up
                raise Exception("%s had no way to decode" % directions)
            continue
    return readfile

filenames = glob.glob(r"D:/allkeyword/TXT/*.txt")
filenameslen = len(filenames)
count = 0
countprint = 0
for filename in filenames:
    countprint += 1
    if countprint == 10:  # print progress every 10 files
        print("\r%d : %d" % (count, filenameslen), end='')
        countprint = 0
    basename = os.path.splitext(os.path.basename(filename))[0]
    # Read raw bytes and join the printable repr of each line, so emoji and
    # other undecodable bytes become escape sequences instead of raising
    # decode errors.
    with open(filename, "rb") as f:
        content = " ".join('%s' % line for line in f.readlines())
    # The description sits between the 'description' and 'comments' markers;
    # +15 skips past the marker itself and the surrounding punctuation.
    start = content.find('description') + 15
    overflow = content.find('comments')
    # Copy the description while skipping every '#hashtag' token
    # (everything from '#' up to the next space).
    contentfinal = ""
    i = start
    j = start
    while i < overflow:
        if content[i] == "#":
            contentfinal += content[j:i]
            nxt = content.find(" ", i)
            if nxt == -1 or nxt > overflow:
                nxt = overflow
            i = nxt
            j = nxt
        else:
            i += 1
    contentfinal += content[j:overflow]
    # Keyword extraction with jieba's TF-IDF ranking.
    keywords = jieba.analyse.extract_tags(contentfinal)
    if not keywords:
        print("error, please check your input")
    file = open(r"D:/allkeyword/TXT/" + basename + 'keyword' + '.txt', 'w')
    for keyword in keywords:
        # Keep a keyword only if wordnet recognises it as an English word.
        if wordnet.synsets(keyword):
            file.write(keyword + '\n')
    file.close()
    count += 1
print("%d : %d" % (count, filenameslen))
print("finished")

Filtering keywords against a social-media corpus

For this I used NLTK's corpora. NLTK ships with webtext, a corpus of casual web text (Firefox support threads, film scripts, overheard conversations, personal ads, wine reviews), and I filtered the keyword files against it:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from nltk.corpus import webtext
import os
import glob

# Raw text of every file in the webtext corpus.
corpora = [webtext.raw(fid) for fid in
           ['firefox.txt', 'grail.txt', 'overheard.txt',
            'pirates.txt', 'singles.txt', 'wine.txt']]

def read_from_file(directions):
    # Utility: read a text file, trying several candidate encodings in turn.
    # 'Error' is a sentinel marking the end of the list; reaching it means
    # every real encoding failed.
    decode_set = ['utf-8', 'gb18030', 'ISO-8859-2', 'gb2312', 'gbk', 'Error']
    for k in decode_set:
        try:
            file = open(directions, "r", encoding=k)
            readfile = file.read()  # raises UnicodeDecodeError on a wrong guess
            file.close()
            break  # decoded successfully, stop trying encodings
        except (UnicodeDecodeError, LookupError):
            if k == "Error":  # the sentinel was reached: give up
                raise Exception("%s had no way to decode" % directions)
            continue
    return readfile

filenames = glob.glob(r"D:/allkeyword/TXT/*keyword.txt")
filenameslen = len(filenames)
count = 0
countprint = 0
for filename in filenames:
    countprint += 1
    if countprint == 10:  # print progress every 10 files
        print("\r%d : %d" % (count, filenameslen), end='')
        countprint = 0
    basename = os.path.splitext(os.path.basename(filename))[0]
    with open(filename, 'r') as f:
        lines = f.readlines()
    countoffile0 = len(lines)  # keyword count before filtering
    countoffile1 = 0           # keyword count after filtering
    file = open(r"D:/allkeyword/dealwithSocialMedia/" + basename + 'new' + '.txt', 'w')
    for line in lines:
        text = line.rstrip('\n')
        # Keep the keyword only if it occurs in at least one webtext file.
        if any(corpus.find(text) != -1 for corpus in corpora):
            file.write(text + '\n')
            countoffile1 += 1
    file.close()
    # Record how many keywords survived the filter.
    file1 = open(r"D:/allkeyword/stat/" + basename + 'stat' + '.txt', 'w')
    file1.write("%d to %d" % (countoffile0, countoffile1))
    file1.close()
    count += 1
print("%d : %d" % (count, filenameslen))
print("finished")