伯乐在线 news URLs

  • 伯乐在线 news URLs
  • # encoding: utf-8
    import csv
    import time

    import requests
    from bs4 import BeautifulSoup


    base_url = 'http://top.jobbole.com/page/'

    session = requests.Session()
    inum = 0


    def zhuqu(page):
        """Fetch one listing page and return its post URLs as one-column CSV rows."""
        url_list = []
        url = base_url + str(page) + "/"
        res = session.get(url=url)
        soup = BeautifulSoup(res.text, 'html.parser')

        # Each post title inside .list-posts links to the article page.
        post_nodes = soup.select(".list-posts .media .media-body h3 a")

        for post_node in post_nodes:
            post_url = post_node.get("href")
            url_list.append([post_url])  # one URL per CSV row
        print(url_list)
        return url_list


    with open("伯乐资讯", 'w', newline="", encoding='utf-8') as csv_out:
        writer = csv.writer(csv_out)
        # Listing pages are assumed to be numbered from 1 up to 355.
        for i in range(1, 356):
            if i % 10 == 0:
                time.sleep(1)  # pause briefly every 10 pages
            rows = zhuqu(i)
            if not rows:
                print("error on page", i)
                continue
            writer.writerows(rows)
            print(inum, "ok")
            inum += 1

    The code is shown above.
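
    For reference, here is a minimal sketch of reading the saved URLs back in, assuming the script above has already written the single-column file 伯乐资讯 into the current directory:

    # encoding: utf-8
    import csv

    # Load the URLs written by the scraper above (one URL per row).
    with open("伯乐资讯", newline="", encoding='utf-8') as csv_in:
        urls = [row[0] for row in csv.reader(csv_in) if row]

    print(len(urls), "URLs loaded")
    print(urls[:5])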

  • Problem: ads are being scraped along with the articles, and I don't know how to write the CSS selector to avoid this. If anyone knows how, please give me a hint. (A rough, unverified idea is sketched below.)
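
    One idea, not verified against the live markup of top.jobbole.com: filter the extracted links by domain, assuming the ad entries point to external sites, or exclude the ad items in the selector itself if they carry a distinguishing class (the class name ".ad" below is purely hypothetical). A rough sketch:

    from urllib.parse import urlparse


    def filter_post_urls(post_nodes):
        # Keep only links that stay on jobbole.com. Assumption (unverified):
        # ad entries link out to external domains, real posts do not.
        kept = []
        for node in post_nodes:
            href = node.get("href")
            if not href:
                continue
            host = urlparse(href).netloc
            if host == "top.jobbole.com" or host.endswith(".jobbole.com"):
                kept.append([href])
        return kept

    # Alternatively, bs4's select() (soupsieve, bs4 >= 4.7) supports :not(),
    # so a distinguishing ad class could be excluded directly, e.g.:
    # soup.select(".list-posts .media:not(.ad) .media-body h3 a")

    In zhuqu(), url_list would then be built from filter_post_urls(post_nodes) instead of the raw post_nodes.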
posted @ 2018-06-18 18:18  大长胡子