import pandas as pd
import re
# Collect every user message from the two input CSV files, de-duplicate them,
# and keep only the messages that pass the filters below.

# NOTE(review): both input paths are empty strings in this file; they must be
# filled in before the script can run.
SOURCE_CSV_PATHS = ("", "")
OUTPUT_CSV_PATH = "new_clean.csv"
MESSAGE_COLUMN = "usermsg"

# An 11-digit number starting with 1 that is not part of a longer digit run.
# Compiled once (raw string) instead of on every loop iteration.
PHONE_PATTERN = re.compile(r"(?<!\d)(1\d{10})(?!\d)")

# A message containing any of these substrings is discarded.
# NOTE(review): the original chain tested ":" twice; the second test could
# never fire. One of them was possibly meant to be the full-width colon --
# confirm with the author before adding it.
REJECTED_SUBSTRINGS = (
    "http",  # web links
    ".",     # decimal points
    "/:",    # emoticon codes
    "[",     # emoticon codes
    "【",    # special symbols
    ":",     # special symbols
    "_",
    "vx",
    "wx",
    "-",
)


def normalize_messages(messages: pd.Series) -> pd.Series:
    """Return *messages* stripped of surrounding whitespace and de-duplicated.

    Missing values are dropped first and the remaining cells are converted to
    ``str``, so NaN or numeric cells no longer crash the ``strip`` step.
    Duplicates are removed after stripping so that values differing only in
    surrounding whitespace collapse into one. First occurrences are kept, in
    their original order.
    """
    stripped = messages.dropna().astype(str).str.strip()
    return stripped.drop_duplicates()


def is_clean_message(text: str) -> bool:
    """Return True when *text* passes every filter and should be kept.

    A message is rejected when it is empty, contains a phone number matching
    ``PHONE_PATTERN``, consists only of digits, consists only of ASCII letters
    and/or digits, or contains any entry of ``REJECTED_SUBSTRINGS``.
    """
    if not text:
        # Whitespace-only messages become "" after stripping.
        return False
    if PHONE_PATTERN.search(text):
        return False
    if text.isdigit():
        return False
    # Same result as the original ``encode("UTF-8").isalpha()/.isalnum()``
    # pair: pure-ASCII letters and/or digits (isalpha is a subset of isalnum).
    if text.isascii() and text.isalnum():
        return False
    return not any(token in text for token in REJECTED_SUBSTRINGS)


if __name__ == "__main__":
    frames = [pd.read_csv(path)[[MESSAGE_COLUMN]] for path in SOURCE_CSV_PATHS]
    col = normalize_messages(pd.concat(frames)[MESSAGE_COLUMN])
    new_data = [text for text in col if is_clean_message(text)]
    # Missing values were already removed in normalize_messages, so no
    # trailing dropna() is needed. Output layout is unchanged: default index
    # plus a single column named 0.
    new_df = pd.DataFrame(data=new_data)
    new_df.to_csv(OUTPUT_CSV_PATH)