Python: Douban Top 250

Douban movies

import re
import requests
headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:94.0) Gecko/20100101 Firefox/94.0"}
url='https://movie.douban.com/top250'
proxies={
    'http':'http://123.207.96.189:80'
}           # go through a proxy to avoid having the IP blocked
word=0
while True:
    if word == 250:          # stop after 10 pages (250 movies)
        break
    p={'start':word}
    movie=requests.get(url,proxies=proxies,headers=headers,params=p)
    word=word+25
    abc=movie.text
    pattern=re.compile('<div class="item">.*?<em class="">(.*?)</em>.*? <a href="(.*?)">.*?<span class="title">(.*?)'
                    '</span>.*?<span class="title">&nbsp;/&nbsp;(.*?)</span>.*?<span class="other">&nbsp;/&nbsp;'
                    '(.*?)</span>.*?<p class="">.*?                           (.*?)&nbsp;&nbsp;&nbsp;(.*?)<br>'
                    '.*? property=(.*?)</span>.*?<span class="inq">(.*?)</span>',re.S)   # regex; re.S lets . match newlines, re.I would ignore case
    #print(pattern.findall(abc))
    for i in pattern.findall(abc):
        print(i)
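
The regex above is tightly bound to Douban's current HTML, so it breaks silently when the markup changes. As a rough point of comparison (my own sketch, not from the original post), the same pagination can be written with a for loop over range(0, 250, 25) plus a status check, using a much smaller regex that only grabs the rank and the first title of each item:

import re
import requests

HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:94.0) Gecko/20100101 Firefox/94.0"}
PROXIES = {'http': 'http://123.207.96.189:80'}     # same sample proxy as above; swap in a live one
RANK_TITLE = re.compile(r'<em class="">(\d+)</em>.*?<span class="title">([^<]+)</span>', re.S)

for start in range(0, 250, 25):                    # 10 pages, 25 movies each
    resp = requests.get('https://movie.douban.com/top250',
                        headers=HEADERS, proxies=PROXIES, params={'start': start})
    resp.raise_for_status()                        # fail fast if Douban refuses the request
    for rank, title in RANK_TITLE.findall(resp.text):
        print(rank, title)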

Douban example 2

import requests
import re
import json
def param_html(url):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; "
                            "Win64; x64; rv:94.0) Gecko/20100101 Firefox/94.0"}
    proxies = {
        'http': 'http://123.207.96.189:80'
    }
    response=requests.get(url,headers=headers,proxies=proxies)
    text=response.text
    pattern = re.compile('<div class="item">.*?<em class="">(.*?)</em>.*?src="(.*?)" class=".*?<a href="(.*?)">.*?'
                        '<span class="title">(.*?)'
                        '</span>.*?<span class="title">&nbsp;/&nbsp;(.*?)</span>.*?<span class="other">&nbsp;/&nbsp;'
                        '(.*?)</span>.*?<p class="">.*?                           (.*?)&nbsp;&nbsp;&nbsp;(.*?)<br>'
                        '.*? property=(.*?)</span>.*?<span class="inq">(.*?)</span>', re.S)  # regex
    # re.S lets . match newlines; re.I would ignore case
    # print(pattern.findall(text))
    results=pattern.findall(text)     # findall returns a list of tuples, one tuple per movie
    for i in results:        # print every tuple
        print(i)
    for i in results:
        down_jpg(i[1])    # i[1] is the poster image URL; pass it to down_jpg
        yield {
            '排名':i[0],
            '电影名':i[3],
            '英文名':i[4],
            '港台地区':i[5]
        }         # one call to this generator yields one dict per movie on the page (25 per page)

def down_jpg(url):
    r=requests.get(url)
    regex=re.compile('/public/(.*?)$',re.S)         # the filename sits after /public/ at the end of the URL, hence the $ anchor
    filename=regex.search(url).group(1)
    with open(filename,'wb') as f:
        f.write(r.content)

def write_txt(str_obj):
    with open('film.txt','a',encoding='utf-8') as f:
        f.write(json.dumps(str_obj,ensure_ascii=False))
            # open in append mode and serialize with json.dumps

def main():
    for i in range(0,26,25):      # only start=0 and start=25, i.e. the first 50 movies
        url='https://movie.douban.com/top250?start='+str(i)
        for j in param_html(url):     # write each yielded dict to the file
            write_txt(j)



main()
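
One caveat with write_txt: it appends the JSON objects back to back with no separator, so film.txt ends up as one long run of dicts that json.loads cannot parse in one go. A common workaround (a sketch, not part of the original post) is to write one object per line and read the file back line by line:

import json

def write_jsonl(record, path='film.txt'):
    # one JSON object per line ("JSON Lines"), so the file stays machine-readable
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')

# reading the records back:
# with open('film.txt', encoding='utf-8') as f:
#     movies = [json.loads(line) for line in f]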

Example 3

import re
import requests
import json
headers = {"User-Agent": "Mozilla/5.0"
" (Windows NT 10.0; Win64; x64; rv:94.0) Gecko/20100101 Firefox/94.0"}
proxies = {
    'http': 'http://123.207.96.189:80'
}

def get_html(proxies,headers):
    url='https://movie.douban.com/top250'
    word=0
    while True:
        if word == 50:            # stop after the first two pages (50 movies)
            break
        p={'start':word}
        movie=requests.get(url,proxies=proxies,headers=headers,params=p)
        word=word+25
        text=movie.text
        pattern=re.compile('<div class="item">.*?<em class="">(.*?)</em>.*?src="(.*?)" class="".*?<a href="(.*?)">.*?'
                        '<span class="title">(.*?)'
                        '</span>.*?<span class="title">&nbsp;/&nbsp;(.*?)</span>.*?<span class="other">&nbsp;/&nbsp;'
                        '(.*?)</span>.*?<p class="">.*?                           (.*?)&nbsp;&nbsp;&nbsp;(.*?)<br>'
                        '.*? property=(.*?)</span>.*?<span class="inq">(.*?)</span>',re.S)   # regex (re.S lets . match newlines)

        for i in pattern.findall(text):
            yield {
                    '排名':i[0],
                    '电影名':i[3],
                    '英文名':i[4],
                    '港台地区':i[5],
                    '图片链接':i[1],
            }

def get_movie_brief():
    with open('movie.txt','a',encoding='utf-8') as f:
        for i in get_html(proxies,headers):
            f.write(json.dumps(i,ensure_ascii=False))

def get_movie_pic(proxies,headers):
    for i in get_html(proxies,headers):
        url=i['图片链接']
        resp=requests.get(url,proxies=proxies,headers=headers)    # renamed from re so the re module is not shadowed
        pic=resp.content
        filename=str(i['排名'])+'.jpg'     # Douban posters are served as JPEG images
        with open(filename,"wb") as f:
            f.write(pic)


get_movie_brief()
get_movie_pic(proxies,headers)
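
Because get_movie_brief() and get_movie_pic() each call get_html(), every page is requested twice. A single-pass variant (a sketch that reuses get_html, proxies and headers from the code above) writes the brief and downloads the poster in the same loop:

def scrape_once(proxies, headers):
    # iterate the generator once: write one JSON record per line and fetch the poster right away
    with open('movie.txt', 'a', encoding='utf-8') as f:
        for item in get_html(proxies, headers):
            f.write(json.dumps(item, ensure_ascii=False) + '\n')
            resp = requests.get(item['图片链接'], proxies=proxies, headers=headers)
            with open(str(item['排名']) + '.jpg', 'wb') as pic:
                pic.write(resp.content)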