python3.7 爬虫豆瓣电影排行榜

#-*-coding:utf-8-*-
from  urllib import request,parse
import requests
import importlib,sys,re
importlib.reload(sys)
import os
import pdb
# sys.setdefaultencoding('utf8')

# Scrape the Douban "Top 250" movie chart page by page and write the movie
# titles, numbered in chart order, to a UTF-8 text file (one title per line).
file_name=(r'E:\YS\practice\movie'+os.sep+'豆瓣电影排行250'+'.txt')

# The chart is paginated by entry offset: 10 pages x 25 entries = 250 movies.
# The `start` query parameter is the offset of the first entry on the page,
# so page i must request start = i * page_size (not start = i).
page_size=25
page_count=10

# Browser-like User-Agent sent with every request.
header={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'}

# Non-greedy match so each <span class="title"> element is captured on its own.
pattern = r'<span class="title">.*?</span>'

# Running rank across all pages (1-based).
number=1
with open(file_name,"w",encoding='utf-8') as f:
    for i in range(page_count):
        print('正在爬去第%d页'%(i+1))
        data={'start':i*page_size}
        url='https://movie.douban.com/top250?'+parse.urlencode(data)+'&filter='
        request_data=request.Request(url=url,headers=header)
        # Close the HTTP response as soon as the body has been read.
        with request.urlopen(request_data) as response:
            content=response.read().decode('utf-8')
        # re.findall always returns a list (possibly empty), never None.
        m=re.findall(pattern,content)
        print (m)
        for item in m:
            # Spans containing "&nbsp" are skipped; presumably these are the
            # alternate-title spans of a movie - verify against the page HTML.
            if '&nbsp' in item:
                continue
            # Keep only the text between the opening tag's ">" and the
            # closing tag's "<".
            title=item.split(">")[1].split("<")[0]
            f.write(u'第%d名'%number+'-------'+title+'\n')
            number+=1

        print('第%d页爬出成功'%(i+1))
posted @ 2019-10-17 11:45  Littlefish-  阅读(155)  评论(0)  编辑  收藏  举报
Document