import re

# Regex practice script: three small exercises built on re.findall.
# All patterns are raw strings so that backslash escapes reach the regex
# engine untouched (non-raw '\d' / '\.' trigger invalid-escape warnings).

# Text of every <div class ="..."> element. [^"]* (instead of a greedy .*)
# keeps the attribute match inside one pair of quotes, so several <div>
# elements in the same string are matched one by one rather than merged.
DIV_TEXT_RE = re.compile(r'<div class ="[^"]*">(.*?)</div>')

# A single Chinese character in the range U+4E00..U+9FA5.
HANZI_CHAR_RE = re.compile(r'[\u4e00-\u9fa5]')

# A maximal run of consecutive Chinese characters (one "word").
HANZI_WORD_RE = re.compile(r'[\u4e00-\u9fa5]+')

# Exercise 0: extract "中国"; the class attribute is not always "name".
html = '<div class ="name">中国</div>'
resu = DIV_TEXT_RE.findall(html)
print(resu)

# Exercise 1: keep only the Chinese text, printing each run as a whole word.
str1 = "not 404 6.775 found 魏无羡 888 蓝湛"
print("\n练习:过滤出“", str1, "”中的中文,汉字整体打印")
# Match the Chinese runs directly instead of splitting on spaces and then
# removing every number/letter match from the token list: that approach
# raises ValueError as soon as a match is not a whole token (e.g. "abc123").
words = HANZI_WORD_RE.findall(str1)  # ['魏无羡', '蓝湛']
str1_resu = ' '.join(words)
print(str1_resu)

# Exercise 2: keep only the Chinese text, printing one character at a time.
str1 = "not 404 6.775 found 魏无羡 888 蓝湛"
print("\n练习2:过滤出", str1, "中的中文,单个汉字打印")
tokens = str1.split(" ")  # shown only for comparison with the regex result
print(tokens)
resu1 = HANZI_CHAR_RE.findall(str1)  # ['魏', '无', '羡', '蓝', '湛']
print(resu1)
结果:
['中国']

练习:过滤出“ not 404 6.775 found 魏无羡 888 蓝湛 ”中的中文,汉字整体打印
魏无羡 蓝湛

练习2:过滤出 not 404 6.775 found 魏无羡 888 蓝湛 中的中文,单个汉字打印
['not', '404', '6.775', 'found', '', '魏无羡', '', '888', '', '蓝湛']
['魏', '无', '羡', '蓝', '湛']

Process finished with exit code 0