re 正则表达式：爬取网站标题

import requests
import re
 
# Page whose <title> text is extracted. To try the other example site,
# comment out the first assignment and enable the second one.
url = 'http://www.jd.com/'
# url = 'http://www.eastmoney.com/'
# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(url, timeout=10)
# Decode the response body as UTF-8 when r.text is read.
r.encoding = 'utf-8'
# Non-greedy capture of the text between the <title> tags; re.S lets "."
# match newlines too, so a title spread over several lines is still found.
data = re.findall('<title>(.*?)</title>', r.text, re.S)
print(data)

['京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物！']

['东方财富网：财经门户，提供专业的财经、股票、行情、证券、基金、理财、银行、保险、信托、期货、黄金、股吧、博客等各类财经资讯及数据']

import re
 
# Extract the literal substring "python".
key = 'javapythonc++php'
re.findall('python', key)[0]
# Result:
#   python
 
# Extract the <h1> element wrapping "hello world" (the tags are part of the match).
key = '<html><h1>hello world</h1></html>'
re.findall("<h1>hello world</h1>", key)
# Result:
#   ['<h1>hello world</h1>']
 
# Extract the number 170 from the sentence.
string = "我喜欢身高为170的女孩"
# Matching the literal text would also work: re.findall("170", string)[0]
# The pattern is a raw string: "\d" is not a valid Python string escape, so a
# plain literal makes Python warn (a SyntaxWarning since Python 3.12).
re.findall(r'\d+', string)   # \d+ matches a run of one or more digits
# Result:
#   ['170']
 
# Extract the scheme ("http" / "https") of each URL in the text.
key = "http://www.baidu.com and https://boob.com"
re.findall("https{0,1}", key)  # {0,1}: the character before it ("s") occurs zero or one time
# Result:
#   ['http', 'https']
 
# Extract "hit." from the e-mail address.
key = "bobo@hit.edu.com"
# "." matches any character except "\n"; "*" repeats it zero or more times;
# "\." matches a literal dot. The patterns are raw strings so that Python does
# not treat "\." as an (invalid) string escape.
re.findall(r"h.*\.", key)
# Result:
#   ['hit.edu.']

# Greedy mode: the quantifier consumes as much text as the pattern allows.
# Appending "?" to the quantifier switches it to non-greedy mode.
re.findall(r"h.*?\.", key)
# Result:
#   ['hit.']
 
# Match "sas" and "saas" (but not "saaas").
key = 'saas and sas and saaas'
re.findall("sa{1,2}s", key)  # {1,2}: the preceding "a" repeated one to two times
# Result:
#   ['saas', 'sas']
 
# Match every line that starts with "i".
# re.S: "." also matches newlines, so the text is treated as one line.
# re.M: "^" anchors at the start of every line, not only the start of the text.
string = """fall in love with you
i love you very much
i love she
i love her"""
re.findall('^i.*', string, flags=re.M)
# Result:
#   ['i love you very much', 'i love she', 'i love her']
 
# Match the whole <div> element, including every line inside it.
string = '''<div>静夜思
床前明月光
疑是地上霜
举头望明月
低头思故乡
</div>'''
# re.S lets "." cross line breaks, so one match spans the entire block.
re.findall("<div>.*</div>", string, flags=re.S)
# Result:
#   ['<div>静夜思\n床前明月光\n疑是地上霜\n举头望明月\n低头思故乡\n</div>']
 

posted @ 2020-02-18 17:27 徐海建 阅读(931) 评论(0) 编辑 收藏 举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

阅读排行：
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码，我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了，比商业数据库还牛
· 白话解读 Dapr 1.15：你的「微服务管家」又秀新绝活了
· 上周热点回顾（2.24-3.2）

公告

昵称：徐海建
园龄： 6年2个月
粉丝： 34
关注： 2

+加关注

2025年3月

日

一

二

三

四

五

六

徐海建的自学笔记

自学笔记

re 正则表达式爬取网站标题

公告

搜索

常用链接

我的标签

随笔档案

阅读排行榜

评论排行榜

推荐排行榜

最新评论

徐海建的自学笔记

自学笔记

re 正则表达式 爬取网站标题

公告

搜索

常用链接

我的标签

随笔档案

阅读排行榜

评论排行榜

推荐排行榜

最新评论

re 正则表达式爬取网站标题