python 爬虫爬取豆瓣Top250榜单

python 爬虫爬取豆瓣Top250榜单

这是一个小作业。


requests 模块

使用requests.get(url)可以爬取一个网址的信息

 # 构造合理的HTTP请求头, 伪装成浏览器, 绕过反爬虫机制,否则会被反爬虫机制拒绝(418)。 https://www.kesci.com/home/project/5dd6003700b0b900365feaeb
 user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.44 Safari/537.36"
r = requests.get('http://movie.douban.com/top250?start=225', headers={'User-Agent': user_agent})
print(r.status_code) # 418 表示返回失败, 200表示返回成功

f = open("1.txt", "w", encoding='utf-8')
html = r.text # 打印页面信息
print(html, file = f)

BeautifulSoup 模块

详细参考官方文档
1、安装

pip install beautifulsoup4
pip list // 查看安装的python 模块

2、建立BeautifulSoup4对象

bs = BeautifulSoup(html, "html.parser") # build a BeautifulSoup parse tree from the HTML text
print(bs.prettify()) # print the whole tree with readable indentation

3、访问一个标签内容

# Sample HTML document (a Baidu homepage skeleton) used by the tag-access
# examples below. NOTE(review): this rebinds `html`; the `bs` object built
# earlier is unaffected unless BeautifulSoup is run again on this string.
html = '''
<!DOCTYPE html>
<!--STATUS OK-->
<html>
 <head>
  <meta content="text/html;charset=utf-8" http-equiv="content-type"/>
  <meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
  <meta content="always" name="referrer"/>
  <link href="https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/bdorz/baidu.min.css" rel="stylesheet" type="text/css"/>
  <title>
   百度一下,你就知道 </title>
 </head>
 <body link="#0000cc">
  <div id="wrapper">
   <div id="head">
    <div class="head_wrapper">
     <div id="u1">
      <a class="mnav" href="http://news.baidu.com" name="tj_trnews">
       新闻 </a>
      <a class="mnav" href="https://www.hao123.com" name="tj_trhao123">
       hao123 </a>
      <a class="mnav" href="http://map.baidu.com" name="tj_trmap">
       地图 </a>
      <a class="mnav" href="http://v.baidu.com" name="tj_trvideo">
       视频 </a>
      <a class="mnav" href="http://tieba.baidu.com" name="tj_trtieba">
       贴吧 </a>
      <a class="bri" href="//www.baidu.com/more/" name="tj_briicon" style="display: block;">
       更多产品 </a>
     </div>
    </div>
   </div>
  </div>
 </body>
</html>
'''
# 以以上html页面为例

bs.title # the first <title> tag (the tag object itself, including its markup)
bs.div   # the first <div> tag in the document
bs.a     # the first <a> tag
bs.head  # the <head> tag and everything inside it


bs.title.name   # tag name as a string, i.e. 'title'
bs.title.string # the single text child of <title>; None when the tag has several children
bs.a.string     # fixed typo: original read bs.a.stirng
bs.get_text()   # all text of a tag and its descendants, concatenated

bs.find_all("a")           # every <a> tag, returned as a list
bs.find_all("div")         # every <div> tag, returned as a list
bs.find_all(id='u1')       # fixed typo: original called bs.find_add; tags whose id is 'u1'
bs.find_all(id=True)       # tags that carry any id attribute
bs.find_all(class_='mnav') # tags whose class is 'mnav' (class_ avoids the keyword clash)

bs.find_all(attrs={"class":"mnav"}) # same query via attrs; inside attrs a plain "class" key is fine
bs.find_all(attrs={"name":"tj_trnews"})
# tags whose name attribute is 'tj_trnews'
# find_all accepts several filters at once

# Accessing a child tag
tmp = bs.find_all("head")
print(tmp[0].a) # fixed: original printed t[0].a but the variable is tmp; find() returns one tag, find_all() a list

csv 模块

1、读入

import csv
# Read a CSV file row by row (dedented: the original's stray leading
# spaces were an IndentationError at top level).
with open('a.csv', 'r') as myFile:
    lines = csv.reader(myFile)
    for line in lines:
        print(line)
# Equivalent form without the context manager (caller must close f):
# f = open("a.csv", "r")
# lines = csv.reader(f)
# NOTE: csv.reader needs a file object (or any iterable of lines), not a
# path -- csv.reader('a.csv', 'r') would iterate the characters of the string.

2、写入

# Column headers followed by one record per student.
headers = ['class','name','sex','height','year']
rows = [
    [1, 'xiaoming', 'male', 168, 23],
    [1, 'xiaohong', 'female', 162, 22],
    [2, 'xiaozhang', 'female', 163, 21],
    [2, 'xiaoli', 'male', 158, 21],
]
# newline='' keeps the csv module from emitting a blank line between rows on Windows.
with open('test.csv', 'w', newline='') as out_file:
    writer = csv.writer(out_file)
    writer.writerow(headers) # one list -> one header row
    writer.writerows(rows)   # all data rows in a single call

完整代码

最后写入csv存在乱码的情况,原因为Excel默认按本地ANSI编码打开csv,而程序输出为utf-8。解决方式一:使用记事本打开csv,点击另存为,选择编码为ANSI(记事本既支持utf-8也支持ANSI);解决方式二:写入时直接使用 encoding='utf-8-sig',带BOM的utf-8可被Excel正确识别,无需手工转存。

# coding=utf-8
# html = '''
# <ol class="grid_view">
#         <li>
#             <div class="item">
#                 <div class="pic">
#                     <em class="">1</em>
#                     <a href="https://movie.douban.com/subject/1292052/">
#                         <img width="100" alt="肖申克的救赎" src="https://img2.doubanio.com/view/photo/s_ratio_poster/public/p480747492.webp" class="">
#                     </a>
#                 </div>
#                 <div class="info">
#                     <div class="hd">
#                         <a href="https://movie.douban.com/subject/1292052/" class="">
#                             <span class="title">肖申克的救赎</span>
#                                     <span class="title">&nbsp;/&nbsp;The Shawshank Redemption</span>
#                                 <span class="other">&nbsp;/&nbsp;月黑高飞(港)  /  刺激1995(台)</span>
#                         </a>


#                             <span class="playable">[可播放]</span>
#                     </div>
#                     <div class="bd">
#                         <p class="">
#                             导演: 弗兰克·德拉邦特 Frank Darabont&nbsp;&nbsp;&nbsp;主演: 蒂姆·罗宾斯 Tim Robbins /...<br>
#                             1994&nbsp;/&nbsp;美国&nbsp;/&nbsp;犯罪 剧情
#                         </p>

                        
#                         <div class="star">
#                                 <span class="rating5-t"></span>
#                                 <span class="rating_num" property="v:average">9.7</span>
#                                 <span property="v:best" content="10.0"></span>
#                                 <span>2476527人评价</span>
#                         </div>

#                             <p class="quote">
#                                 <span class="inq">希望让人自由。</span>
#                             </p>
#                     </div>
#                 </div>
#             </div>
#         </li>
       
# </ol>
# ```

import requests
from bs4 import BeautifulSoup
import csv

# Module-level accumulators, one parallel list per output column:
# getHtml() appends one entry per movie, printCsv() reads them back out.
Name, Name2, Url = [], [], []
Actor, Score, Number = [], [], []

def getHtml(html):
    """Parse one Top250 listing page and append each movie's fields to the
    module-level lists (Name, Name2, Url, Actor, Score, Number).

    html -- raw HTML text of one https://movie.douban.com/top250 page.
    Returns None; results accumulate in the global lists.
    """
    bs = BeautifulSoup(html, "html.parser")
    totlist = bs.find_all('ol', class_='grid_view')
    if not totlist:
        # Expected layout missing (e.g. the request was answered with a 418
        # error page) -- bail out instead of crashing on totlist[0].
        return
    for nowMovie in totlist[0].find_all('li'):
        # The first <span class="title"> is the Chinese title; a second one,
        # when present, is "&nbsp;/&nbsp;<original title>".
        tmp = nowMovie.find_all('span', class_='title')
        movie_name = tmp[0].string
        if len(tmp) > 1:
            # Drop the leading "\xa0/\xa0" separator (3 characters).
            movie_name2 = tmp[1].string[3:]
        else:
            movie_name2 = ''
        tmp = nowMovie.find('div', class_='hd')
        movie_url = tmp.a.get('href')  # attribute access works like a dict lookup
        tmp = nowMovie.find('div', attrs={'class': 'bd'})
        movie_actor = tmp.p.getText()
        tmp = nowMovie.find('span', attrs={'class': 'rating_num'})
        movie_score = tmp.string
        # The rating count is the second class-less <span> ("NNNN人评价");
        # remove the literal suffix rather than strip("人评价"), which strips
        # any of those characters from both ends.
        tmp = nowMovie.find_all('span', class_=False)
        tmpstr = tmp[1].string
        if tmpstr.endswith('人评价'):
            movie_number = tmpstr[:-len('人评价')]
        else:
            movie_number = tmpstr

        Url.append(movie_url)
        Name.append(movie_name)
        Name2.append(movie_name2)
        Actor.append(movie_actor)
        Score.append(movie_score)
        Number.append(movie_number)

def printCsv():
    """Write the scraped movie data to a.csv, one row per movie.

    Writes every column announced in the header: the original wrote a
    6-column header but only 4 values per row, producing a misaligned CSV.
    utf-8-sig prepends a BOM so Excel decodes the Chinese/Japanese/Korean
    titles correctly instead of showing mojibake (see the note above the
    code in the original post).
    """
    # Context manager closes the file (the original leaked the handle).
    with open('a.csv', 'w', newline='', encoding='utf-8-sig') as out:
        writer = csv.writer(out)
        writer.writerow(['电影名', '英文名', '评分', '评价人数', '演员', '电影链接'])
        # zip iterates only over however many movies were actually scraped,
        # instead of a hard-coded 250 (which raised IndexError on a partial run).
        for row in zip(Name, Name2, Score, Number, Actor, Url):
            writer.writerow(row)
    
        
    

def main():
    """Fetch all 10 pages of the Douban Top250 chart, parse each one, and
    dump the accumulated results to a.csv."""
    # Browser-like User-Agent: without it Douban's anti-scraping check
    # answers 418. https://www.kesci.com/home/project/5dd6003700b0b900365feaeb
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.44 Safari/537.36"
    targetUrl = 'https://movie.douban.com/top250?start='
    for start in range(0, 250, 25):  # 10 pages, 25 movies each (start = 0..225)
        url = targetUrl + str(start)
        print(url)
        # timeout keeps a stalled connection from hanging the whole run.
        r = requests.get(url, headers={'User-Agent': user_agent}, timeout=10)
        print(r.status_code)  # 200 = success, 418 = rejected by anti-scraping
        if r.status_code == 200:
            getHtml(r.text)   # only parse pages that actually came back OK
    printCsv()


if __name__ == '__main__':
    main()
posted @ 2021-11-09 10:40  MJT12044  阅读(226)  评论(0编辑  收藏  举报