# Validate the format of an e-mail address and a telephone number.
# `re` is imported here so that this snippet runs on its own.
import re

# Numeric mailbox name (6-12 digits), "@", a domain label, a literal dot and a
# suffix.  The dot is escaped: a bare "." would accept any character there.
MAIL_PATTERN = re.compile(r'\d{6,12}@[a-zA-Z0-9]+\.[a-zA-Z0-9]+')
# 3-4 digit area code, a hyphen, then a 6-8 digit subscriber number.
TELE_PATTERN = re.compile(r'\d{3,4}-\d{6,8}')

a = "1207384251@qq.com"
b = '020-12345678'

# Both samples are known to match, so calling .group(0) directly is safe here;
# for arbitrary input, check the result of search() for None first.
mail = MAIL_PATTERN.search(a).group(0)
tele_num = TELE_PATTERN.search(b).group(0)
print(mail + '\n' + tele_num)
# Find every telephone number in a piece of text.
# `re` is imported here so that this snippet runs on its own.
import re

# Area code starting with 0 (3-4 digits in total), written either fully
# parenthesised -- "(021)" -- or bare with an optional space/hyphen separator,
# followed by a 7-8 digit subscriber number.  Using two alternatives keeps the
# parentheses balanced; an optional "(" plus an optional ")" would also accept
# inputs such as "021)88776543".
PHONE_PATTERN = re.compile(r"(?:\(0\d{2,3}\)|0\d{2,3}[ -]?)\d{7,8}")

text = "(021)88776543 010-55667890 02584533622 057184720483 837922740"
m = PHONE_PATTERN.findall(text)
if m:
    print(m)
else:
    print('not match')
# Tokenising English text by splitting on runs of digits.
# `re` is imported here so that this snippet runs on its own.
import re

# Without a capturing group the matched separators are dropped.
# Raw strings keep "\d" from being treated as a string escape sequence.
m = re.split(r'\d+', '123abc321cba')
print(m)
print("\n")
# With a capturing group the matched separators are kept in the result.
m = re.split(r'(\d+)', '123abc321cba')
print(m)
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

# Seconds to wait for a server response; without a timeout requests.get()
# may block indefinitely.
REQUEST_TIMEOUT = 10

res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/',
                   timeout=REQUEST_TIMEOUT)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')


def getNewsId(url):
    """Return the click count of the news article at *url*.

    Despite its name, this function does not return the ID: it extracts the
    numeric ID from the URL, queries the click-count endpoint with it and
    returns the count as an int.

    Raises:
        ValueError: if no ID is found in *url*, or the endpoint's response
            does not contain a click count (or it is not an integer).
    """
    # The ID is the run of digits right before ".html".  Taking the whole run
    # avoids truncating IDs that are not exactly four characters long.
    id_match = re.search(r'(\d+)\.html', url)
    if id_match is None:
        raise ValueError('no news id found in url {!r}'.format(url))
    newsId = id_match.group(1)

    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    clickRes = requests.get(clickUrl, timeout=REQUEST_TIMEOUT)

    # Pull the number out of the "...hits').html('<count>');" fragment.
    hits_match = re.search(r"hits'\)\.html\('(.*)'\);", clickRes.text)
    if hits_match is None:
        raise ValueError('no click count in response from {!r}'.format(clickUrl))
    return int(hits_match.group(1))


def _info_field(pattern, info):
    """Return group 1 of *pattern* in *info*; raise ValueError if it is absent."""
    found = re.search(pattern, info)
    if found is None:
        raise ValueError('pattern {!r} not found in article info'.format(pattern))
    return found.group(1)


def getNewDetail(newsUrl, title=None, description=None):
    """Download one news article and print its details.

    Args:
        newsUrl: URL of the article page.
        title: headline to print first; omitted from the output when None.
        description: summary to print; omitted from the output when None.

    Prints the optional title and description, the link, the body text and
    the fields parsed from the page's ".show-info" line (publication time,
    author, reviewer, source, photographer) together with the click count.

    Raises:
        IndexError: if the page lacks ".show-content" or ".show-info".
        ValueError: if a field is missing from the info line or the
            publication time is not in "%Y-%m-%d %H:%M:%S" format.
    """
    page = requests.get(newsUrl, timeout=REQUEST_TIMEOUT)
    page.encoding = "utf-8"
    page_soup = BeautifulSoup(page.text, 'html.parser')
    content = page_soup.select(".show-content")[0].text  # article body
    info = page_soup.select(".show-info")[0].text  # metadata line

    # The fields could also be separated with info.split(); regular
    # expressions are used here so each value is captured by its label.
    published = _info_field("发布时间:(.*) \xa0\xa0 \xa0\xa0作者:", info)
    author = _info_field("作者:(.*)\xa0\xa0审核:", info)
    right = _info_field("审核:(.*)\xa0\xa0来源:", info)
    resource = _info_field('来源:(.*)\xa0\xa0\xa0\xa0摄影:', info)
    video = _info_field("摄影:(.*)\xa0\xa0\xa0\xa0点击:", info)
    count = getNewsId(newsUrl)
    dateTime = datetime.strptime(published, '%Y-%m-%d %H:%M:%S')

    if title is not None:
        print('标题' + ': ' + title)
    if description is not None:
        print('概要' + ': ' + description)
    print('链接' + ': ' + newsUrl)
    print('正文' + ' :' + content)
    print('发布时间:{0}\n作者:{1}\n审核:{2}\n来源:{3}\n摄影:{4}\n点击次数:{5}'.format(
        dateTime, author, right, resource, video, count))
    print("\n")


# Walk the list items of the index page and show the first news entry only.
for s in soup.select("li"):
    title_nodes = s.select(".news-list-title")
    if not title_nodes:
        continue  # not a news entry
    title = title_nodes[0].text  # headline
    description = s.select(".news-list-description")[0].text  # summary
    a = s.a.attrs["href"]  # link to the article page
    getNewDetail(a, title=title, description=description)
    break
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步