RE模块正则表达式

行走的路上，似乎会有几分落寞，无奈。但只要你行走在自己心的路途中，一切便无悔。

day 23
# # ------------------------------------------------------------------------------------------------------------#


今日主要内容
    1. 正则表达式
        元字符
        . 除了换行符外任意字符。
        \w 数字、字母、下划线
        \s 空白符
        \b 单词边界（单词的开头或末尾）
        \d 数字
        \W 除了数字、字母、下划线以外的字符
        \D 除了数字以外的字符
        \S 除了空白符
        ^  开头
        $ 结尾
        [] 字符组
        [^] 除了字符组内的元素外

        量词 限定符
        * 重复0次或更多次，等价于 {0,}
        + 重复1次或更多次，等价于 {1,}
        ? 重复0次或1次，等价于 {0,1}
        {p} p次
        {p,} p次或更多次
        {p,q} 从p到q次

        .*
        .+

        .*?

        分组 ()
        \n 换行
        \\n \n
        \.
        \/
        \?

    2. re模块
        findall() 获取到匹配的所有内容
        finditer() 匹配到所有内容。 返回迭代器
        search() 搜索。查到了就返回
        match() 匹配. 从头开始匹配

        r"(?P<name>正则)"

import re

# res = re.search("e", "alex and exp") # 搜索. 搜到结果就返回
# print(res.group())

# res = re.match("\w+", "alex is not a good man") #  从头匹配.  如果匹配到了。 就返回
# print(res.group())

# lst = re.findall("\w+", "alex and exo")
# print(lst)

# it = re.finditer("\w+", "mai le fo leng")
# for el in it:
#     print(el.group())
#
# # 这个分组是优先级
# lst = re.findall(r"www\.(baidu|oldboy)\.com", "www.oldboy.com")
# print(lst)
#
# # (?: )  去掉优先级
# lst = re.findall(r"www\.(?:baidu|oldboy)\.com", "www.oldboy.com")
# print(lst)

# Wrapping the pattern in a capture group makes split() keep the delimiters it cut on.
lst = re.compile("([ab])").split("alex is not a sb, no he is a big sb")  # split the text on the regex
print(lst)
#
# # 替换
# res = re.sub(r"\d+", "_sb_", "alex333wusir666taibai789ritian020feng")
# print(res)
#
# # 替换。 返回的结果带有次数
# res = re.subn(r"\d+", "_sb_", "alex333wusir666taibai789ritian020feng")
# print(res)

# a = eval("1+3+5+6")
# print(a)

# code = "for i in range(10):print(i)"
# c = compile(code, "", "exec") # 编译代码
# exec(c)

# obj = re.compile(r"alex(?P<name>\d+)and") # 把正则表达式预加载
# res = obj.search("alex250andwusir38ritian2")
# print(res.group())
# print(res.group("name"))

# # ------------------------------------------------------------------------------------------------------------#

from urllib.request import urlopen
import re

# url = "https://www.dytt8.net/html/gndy/dyzz/20181114/57791.html"
# url2 = "https://movie.douban.com/top250?start=50&filter="
# content = urlopen(url).read().decode("gbk")
# content = urlopen(url).read().decode("utf-8")  #url2
# print(content)
#
# obj = re.compile(r'<div id="Zoom">.*?译　　名(?P<yiming>.*?)<br />◎片　　名(?P<pianming>.*?)<br />◎年　　'
#                  r'代(?P<nian>.*?)<br />.*?<td style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<url>.*?)">', re.S) # re.S 让 . 也能匹配换行符
#
# res = obj.search(content)
# print(res.group("yiming"))
# print(res.group("pianming"))
# print(res.group("nian"))
# print(res.group("url"))



# Pre-compiled pattern for one '<div class="item">' movie entry.
# Named groups: name (text of the title span), daoyan (text after "导演: "),
# fen (text of the rating_num span), ren (text before "人评价").
# The title tag must read '<span class="title">' with a space after "span";
# written as '<spanclass=' the pattern cannot match such markup.
# re.S lets "." match newlines, so ".*?" can skip across line breaks.
obj = re.compile(
    r'<div class="item">.*?<span class="title">(?P<name>.*?)</span>'
    r'.*?导演: (?P<daoyan>.*?)&nbsp;&nbsp;&nbsp;'
    r'.*?<span class="rating_num" property="v:average">(?P<fen>.*?)</span>'
    r'.*?<span>(?P<ren>.*?)人评价</span>',
    re.S,
)


def getContent(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Address passed straight to ``urlopen``.

    Returns:
        The full response body as a ``str``.

    Raises:
        Whatever ``urlopen`` raises when the request fails, and
        ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    # The context manager closes the response even if read()/decode() raises,
    # instead of leaving the connection open until garbage collection.
    with urlopen(url) as response:
        return response.read().decode("utf-8")

def parseContent(content):
    """Lazily yield one dict per match of the module-level ``obj`` pattern.

    Each dict holds the captured text of the named groups under the keys
    "name", "daoyan", "ren" and "fen", in that order.
    """
    wanted = ("name", "daoyan", "ren", "fen")
    # finditer walks every match in the page text and hands them back one by one.
    for match in obj.finditer(content):
        yield {key: match.group(key) for key in wanted}



# Fetch 10 result pages, advancing the "start" offset by 25 per page, and
# append every parsed entry to movie.txt, one dict repr per line.
for i in range(10):
    # The offset must be parenthesised: "%" and "*" have equal precedence and
    # group left to right, so `"..." % i*25` formats with i and then repeats
    # the whole URL string 25 times.
    url = "https://movie.douban.com/top250?start=%s&filter=" % (i * 25)
    g = parseContent(getContent(url))
    # "with" closes the file even if parsing or writing raises part-way through.
    with open("movie.txt", mode="a", encoding="utf-8") as f:
        for el in g:
            f.write(str(el) + "\n")

posted @ 2018-11-18 15:17 黄豆豆丨Dealdwong 阅读(150) 评论(1) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

黄豆豆丨Dealdwong

RE模块 正则表达式

公告

RE模块正则表达式