Dataset processing
pheme9 data cleaning
def clean_english_text(text):
    tweet = text.strip()
    # remove links
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet)
    # remove <@user>
    # tweet = re.sub(r'<@user>', '', tweet)
    # remove <url>
    # tweet = re.sub(r'<url>', '', tweet)
    # remove @user mentions
    # tweet = re.sub(r'@\w+ ', '', tweet)
    tweet = re.sub(r'\.@\w+\: ', '', tweet)
    tweet = re.sub(r'\.@\w+ ', '', tweet)
    tweet = re.sub(r'@\w+\. ', '', tweet)
    tweet = re.sub(r'@\w+\: ', '', tweet)
    tweet = re.sub(r'@\w+ ', '', tweet)
    tweet = re.sub(r'\s+', ' ', tweet)
    # match and drop hashtags with a regex
    # tweet = re.sub(r'#\w+', '', tweet)
    # remove punctuation and special characters
    # tweet = re.sub(r'[^\w\s]', '', tweet)
    # collapse extra whitespace
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    # lowercase
    tweet = tweet.lower()
    return tweet
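A quick sanity check of what this cleaner does; the sample tweet below is invented for illustration, not taken from PHEME. With the function above in scope:

sample = ".@BBCBreaking: Police confirm the incident http://t.co/abc123 #breaking"
print(clean_english_text(sample))
# expected output: police confirm the incident #breaking

The link and the quoting ".@user:" prefix are stripped, while the hashtag survives because hashtag removal is commented out above.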
twitter dataset cleaning
def clean_english_text(text):
    tweet = text.strip()
    # remove links
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet)
    tweet = tweet.lower()
    # drop leftover 'url' placeholder tokens
    tweet = re.sub(r' url\b', '', tweet)
    tweet = re.sub(r': url\b', '', tweet)
    tweet = re.sub(r'\burl\b', '', tweet)
    # collapse extra whitespace
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    return tweet
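For this variant the raw texts apparently carry a literal "url" placeholder instead of full links. A small sketch with a made-up input:

sample = "Shots reported near the parliament URL"
print(clean_english_text(sample))
# expected output: shots reported near the parliament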
twitter15 dataset cleaning
def clean_english_text(text):
    tweet = text.strip()
    tweet = tweet.lower()
    # drop <url> placeholders
    tweet = re.sub(r'<url>', '', tweet)
    # drop <@user> placeholders (with an optional leading '.' or trailing ':')
    tweet = re.sub(r'\.<@user>\:', ' ', tweet)
    tweet = re.sub(r'<@user>\:', ' ', tweet)
    tweet = re.sub(r'<@user>', ' ', tweet)
    # drop standalone 'rt' (retweet) tokens
    tweet = re.sub(r'\brt\b', ' ', tweet)
    # collapse extra whitespace
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    return tweet
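twitter15 texts use <@user> and <url> placeholders. A usage sketch with an invented example:

sample = "RT <@user>: breaking : explosion reported <url>"
print(clean_english_text(sample))
# expected output: breaking : explosion reported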
twitter16 dataset cleaning
def clean_english_text(text):
    tweet = text.strip()
    tweet = tweet.lower()
    # drop <url> placeholders
    tweet = re.sub(r'<url>', '', tweet)
    # drop <@user> placeholders (with an optional leading '.' or trailing ':')
    tweet = re.sub(r'\.<@user>\:', ' ', tweet)
    tweet = re.sub(r'<@user>\:', ' ', tweet)
    tweet = re.sub(r'<@user>', ' ', tweet)
    # drop standalone 'rt' (retweet) tokens
    tweet = re.sub(r'\brt\b', ' ', tweet)
    # collapse extra whitespace
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    return tweet
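The twitter16 cleaner is identical to the twitter15 one; for completeness, a made-up example that exercises the ".<@user>:" branch:

sample = ".<@user>: so sad to hear this <url> rt please"
print(clean_english_text(sample))
# expected output: so sad to hear this please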
twitter_covid19 data cleaning
def clean_english_text(text):
    tweet = text.strip()
    tweet = tweet.lower()
    # drop the literal 'url' placeholder (note: this pattern removes 'url' anywhere, even inside longer words)
    tweet = re.sub(r'url', '', tweet)
    # collapse extra whitespace
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    return tweet
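A minimal check with an invented tweet:

sample = "Official COVID-19 update url stay safe"
print(clean_english_text(sample))
# expected output: official covid-19 update stay safe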
weibo dataset cleaning
def clean_chinese_text(text):
    tweet = text.strip()
    tweet = tweet.lower()
    # remove links
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet)
    # remove "转发微博" (repost) markers
    tweet = re.sub(r'转发微博。', '', tweet)
    tweet = re.sub(r'转发微博', '', tweet)
    tweet = re.sub(r'轉發微博', '', tweet)
    tweet = re.sub(r'轉發微博。', '', tweet)
    tweet = re.sub(r'转发微博', '', tweet)
    # remove "//@user:" repost chains and "回复@user:" reply prefixes
    tweet = re.sub(r'// @[a-zA-Z0-9\u4e00-\u9fa5_-]+:', ' ', tweet)
    tweet = re.sub(r'// @[a-zA-Z0-9\u4e00-\u9fa5_-]+:', ' ', tweet)
    tweet = re.sub(r'//@[a-zA-Z0-9\u4e00-\u9fa5_-]+:', ' ', tweet)
    tweet = re.sub(r'//@ [a-zA-Z0-9\u4e00-\u9fa5_-]+:', ' ', tweet)
    tweet = re.sub(r'转// @[a-zA-Z0-9\u4e00-\u9fa5_-]+ :', ' ', tweet)
    tweet = re.sub(r'// @[a-zA-Z0-9\u4e00-\u9fa5_-]+ :', ' ', tweet)
    tweet = re.sub(r'//@[a-zA-Z0-9\u4e00-\u9fa5_-]+', ' ', tweet)
    tweet = re.sub(r'回复@[a-zA-Z0-9\u4e00-\u9fa5_-]+:', ' ', tweet)
    tweet = re.sub(r'回复@[a-zA-Z0-9\u4e00-\u9fa5_-]+:', ' ', tweet)
    # remove remaining @user mentions
    tweet = re.sub(r'@[a-zA-Z0-9\u4e00-\u9fa5_-]+', ' ', tweet)
    tweet = re.sub(r'@[a-zA-Z0-9\u4e00-\u9fa5_-]+ :', ' ', tweet)
    tweet = re.sub(r'http[s]?://\S+', '', tweet)
    # collapse extra whitespace
    tweet = re.sub(r' \s+', ' ', tweet).strip()
    return tweet
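A usage sketch on a made-up Weibo-style post (repost chain plus a link; the ASCII colon matches the pattern exactly as written above):

sample = "太可怕了！//@新闻小助手: 转发微博 http://t.cn/abc123"
print(clean_chinese_text(sample))
# expected output: 太可怕了！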
weibo_covid19 data cleaning
def clean_chinese_text(text):
    tweet = text.strip()
    tweet = tweet.lower()
    # remove "转发微博" (repost) markers and Weibo UI text
    tweet = re.sub(r'转发微博。', '', tweet)
    tweet = re.sub(r'转发微博', '', tweet)
    tweet = re.sub(r'轉發微博', '', tweet)
    tweet = re.sub(r'轉發微博。', '', tweet)
    tweet = re.sub(r'查看图片', '', tweet)
    tweet = re.sub(r'网页链接', '', tweet)
    tweet = re.sub(r'图片评论', '', tweet)
    # remove links
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet)
    # remove <'username'> markers, repost chains and reply prefixes
    tweet = re.sub(r"<'([a-zA-Z0-9\u4e00-\u9fa5_-]+)'>@\1", ' ', tweet)
    tweet = re.sub(r"@<'([a-zA-Z0-9\u4e00-\u9fa5_-]+)'>@\1", ' ', tweet)
    tweet = re.sub(r"//<'[a-zA-Z0-9\u4e00-\u9fa5_-]+'>", ' ', tweet)    # e.g. //<'强大大丨'>转发微博
    tweet = re.sub(r"回复<'[a-zA-Z0-9\u4e00-\u9fa5_-]+'>", ' ', tweet)  # e.g. 回复<'美少女陈翠花'>哦
    tweet = re.sub(r"<'[a-zA-Z0-9\u4e00-\u9fa5_-]+'>", ' ', tweet)      # any remaining <'user'> tag
    # collapse extra whitespace
    tweet = re.sub(r' \s+', ' ', tweet).strip()
    return tweet
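The two examples from the inline comments above can be checked directly:

print(clean_chinese_text("回复<'美少女陈翠花'>哦"))   # expected output: 哦
print(clean_chinese_text("//<'强大大丨'>转发微博"))   # expected output: '' (empty string)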
import copy
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import os
import re
import json
# nltk.download('stopwords')
from tqdm import tqdm

# create the tokenizer
tokenizer = TweetTokenizer()

# def clean_english_text(text):
#     tweet = text.strip()
#     tweet = tweet.lower()
#     tweet = re.sub(r'http\S+|www\S+|https\S+', '[URL]', tweet)
#     tweet = re.sub(r' url\b', '[URL]', tweet)
#     tweet = re.sub(r'\burl\b', '[URL]', tweet)
#     # collapse extra whitespace
#     tweet = re.sub(r'\s+', ' ', tweet).strip()
#     return tweet

# def clean_english_text(text):
#     tweet = text.strip()
#     # remove links
#     tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet)
#     # remove <@user>
#     # tweet = re.sub(r'<@user>', '', tweet)
#     # remove <url>
#     # tweet = re.sub(r'<url>', '', tweet)
#     # remove @user mentions
#     # tweet = re.sub(r'@\w+ ', '', tweet)
#     # tweet = re.sub(r'\.@\w+\: ', '', tweet)
#     tweet = re.sub(r'\.@\w+ ', '', tweet)
#     tweet = re.sub(r'@\w+\. ', '', tweet)
#     tweet = re.sub(r'@\w+\: ', '', tweet)
#     tweet = re.sub(r'@\w+ ', '', tweet)
#     tweet = re.sub(r'\s+', ' ', tweet)
#     # collapse extra whitespace
#     tweet = re.sub(r'\s+', ' ', tweet).strip()
#     # lowercase
#     tweet = tweet.lower()
#     return tweet

def clean_chinese_text(text):
    tweet = text.strip()
    tweet = tweet.lower()
    tweet = re.sub(r'转发微博。', '', tweet)
    tweet = re.sub(r'转发微博', '', tweet)
    tweet = re.sub(r'轉發微博', '', tweet)
    tweet = re.sub(r'轉發微博。', '', tweet)
    tweet = re.sub(r'查看图片', '', tweet)
    tweet = re.sub(r'网页链接', '', tweet)
    tweet = re.sub(r'图片评论', '', tweet)
    # remove links
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet)
    # remove <'username'> markers, repost chains and reply prefixes
    tweet = re.sub(r"<'([a-zA-Z0-9\u4e00-\u9fa5_-]+)'>@\1", ' ', tweet)
    tweet = re.sub(r"@<'([a-zA-Z0-9\u4e00-\u9fa5_-]+)'>@\1", ' ', tweet)
    tweet = re.sub(r"//<'[a-zA-Z0-9\u4e00-\u9fa5_-]+'>", ' ', tweet)    # e.g. //<'强大大丨'>转发微博
    tweet = re.sub(r"回复<'[a-zA-Z0-9\u4e00-\u9fa5_-]+'>", ' ', tweet)  # e.g. 回复<'美少女陈翠花'>哦
    tweet = re.sub(r"<'[a-zA-Z0-9\u4e00-\u9fa5_-]+'>", ' ', tweet)      # any remaining <'user'> tag
    # collapse extra whitespace
    tweet = re.sub(r' \s+', ' ', tweet).strip()
    return tweet

if __name__ == '__main__':
    # English datasets: uncomment one of the clean_english_text variants above
    # (or paste in the per-dataset version) before enabling this branch.
    if False:
        dataset_list = ["terrorist", 'gossip', 'twitter15', 'twitter16', 'twitter_covid19', 'twitter']
        d_name = dataset_list[0]
        print("dataset name = ", d_name)
        d_path = os.path.join(os.getcwd(), 'dataset', d_name, 'data')
        for file_name in tqdm(os.listdir(d_path)):
            # print(file_name)
            file_path = os.path.join(d_path, file_name)
            data = json.load(open(file_path, mode='r', encoding='utf-8'))
            source_text = copy.copy(data['source']['text'])
            data['source']['content'] = clean_english_text(source_text)
            for com_data in data['comment']:
                text = copy.copy(com_data['text'])
                com_data['content'] = clean_english_text(text)
            with open(file_path, 'w', encoding='utf-8') as file_obj:
                json.dump(data, file_obj, indent=4, ensure_ascii=False)

    # Chinese datasets
    if True:
        dataset_list = ["weibo", 'weibo_covid19']
        d_name = dataset_list[1]
        print("dataset name = ", d_name)
        d_path = os.path.join(os.getcwd(), 'dataset', d_name, 'data')
        for file_name in tqdm(os.listdir(d_path)):
            print(file_name)
            file_path = os.path.join(d_path, file_name)
            data = json.load(open(file_path, mode='r', encoding='utf-8'))
            source_text = copy.copy(data['source']['text'])
            data['source']['content'] = clean_chinese_text(source_text)
            for com_data in data['comment']:
                text = copy.copy(com_data['text'])
                com_data['content'] = clean_chinese_text(text)
            with open(file_path, 'w', encoding='utf-8') as file_obj:
                json.dump(data, file_obj, indent=4, ensure_ascii=False)
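From the field accesses in the script, each file under dataset/<name>/data is a JSON object with a 'source' entry and a list of 'comment' entries, each carrying a raw 'text' field; the cleaned string is written back into a new 'content' field of the same file. A minimal weibo_covid19-style file after cleaning might look like this (all values invented for illustration):

{
    "source": {
        "text": "新增确诊病例通报 网页链接",
        "content": "新增确诊病例通报"
    },
    "comment": [
        {
            "text": "回复<'美少女陈翠花'>哦",
            "content": "哦"
        }
    ]
}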
Author: 图神经网络. When reposting, please cite the original link: https://www.cnblogs.com/BlairGrowing/articles/17681570.html