爬虫获取新浪国内热点新闻(初学)——python2.7

# -*- coding:utf-8 -*-
import io
import os
import re
import sys
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC

# Python 2 only: `reload` is a builtin there, and `sys.setdefaultencoding` is
# removed from `sys` at interpreter start-up, so the module has to be reloaded
# before the function can be called. The call makes implicit str <-> unicode
# conversions use UTF-8 instead of ASCII for the whole process.
# NOTE(review): this is a process-wide hack; prefer explicit encode/decode
# (or io.open with an encoding) where text is written.
reload(sys)
sys.setdefaultencoding('utf-8')


def get_driver(url, chrome_driver=r"C:\Python27\chromedriver.exe", wait_seconds=2):
    """Start a headless Chrome session and load *url* in it.

    Args:
        url: Address of the page to open.
        chrome_driver: Path to the chromedriver executable. Defaults to the
            location the script was originally written for.
        wait_seconds: Fixed pause after navigation, giving the page time to
            finish rendering before callers query it.

    Returns:
        The WebDriver instance with *url* loaded. The caller owns it and is
        responsible for calling ``quit()`` when done.

    Raises:
        Whatever ``webdriver.Chrome`` or ``driver.get`` raises; if navigation
        fails, the browser is shut down before the exception propagates.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options, executable_path=chrome_driver)

    try:
        driver.get(url)
    except Exception:
        # Do not leave an orphaned Chrome process behind when the page
        # cannot be loaded; the original error is re-raised unchanged.
        driver.quit()
        raise

    time.sleep(wait_seconds)
    return driver


def get_url(driver):
    """Collect the article links from the news list page loaded in *driver*.

    Args:
        driver: WebDriver whose current page is the news listing.

    Returns:
        A list with the ``href`` of every matched anchor, in page order.
        Anchors without a usable ``href`` are skipped, so every entry can be
        handed straight to ``driver.get``. The list is empty when nothing
        matches.
    """
    # NOTE(review): absolute XPath tied to the page layout at the time of
    # writing -- it will silently match nothing if the site's markup changes.
    anchors = driver.find_elements_by_xpath(
        "/html/body/div[7]/div[2]/div[1]/div[2]/div/ul[1]/li/span[2]/a")
    # get_attribute yields None when the anchor carries no href attribute.
    hrefs = (anchor.get_attribute("href") for anchor in anchors)
    return [href for href in hrefs if href]


def _safe_filename(title):
    """Turn an article title into a file name that Windows and POSIX accept."""
    # Replace the characters Windows forbids in file names, plus line breaks
    # and tabs that may appear in scraped text.
    cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', "_", title)
    # Windows also rejects names that end with a space or a dot.
    cleaned = cleaned.strip(" .")
    return cleaned or "untitled"


def get_text(driver, out_dir="sina_new"):
    """Save the article currently loaded in *driver* as a UTF-8 text file.

    The file is named after the element with class ``main-title`` and holds
    the text of the element with class ``article``. It is written with a
    UTF-8 byte-order mark at the start.

    Args:
        driver: WebDriver whose current page is a single article.
        out_dir: Directory that receives the ``.txt`` file; created when it
            does not exist yet.

    Raises:
        Whatever the element lookups raise when the page has no
        ``main-title`` or ``article`` element, and IOError/OSError when the
        file cannot be written.
    """
    res_title = driver.find_element_by_class_name("main-title")
    res_data = driver.find_element_by_class_name("article")

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    path = os.path.join(out_dir, _safe_filename(res_title.text) + ".txt")
    # io.open takes an encoding on Python 2.7 as well as 3.x; "utf-8-sig"
    # emits the BOM itself, so no manual marker write and no reliance on the
    # process default encoding is needed.
    with io.open(path, "w", encoding="utf-8-sig") as f:
        f.write(res_data.text)


if __name__ == '__main__':
    # Script entry point: open the Sina domestic news listing, collect the
    # article links, then visit each link and save its text to disk.
    url = "https://news.sina.com.cn/china/"
    driver = get_driver(url)
    try:
        url_list = get_url(driver)
    finally:
        # The links are plain strings by now, so the listing browser can be
        # closed before the articles are fetched.
        driver.quit()

    for url in url_list:
        # get_driver starts a fresh browser per article; quit it even when
        # saving fails so Chrome processes do not pile up.
        driver = get_driver(url)
        try:
            get_text(driver)
        finally:
            driver.quit()

posted @   vetra  阅读(57)  评论(0)  编辑  收藏  举报
(评论功能已被禁用)
相关博文:
阅读排行:
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· winform 绘制太阳,地球,月球 运作规律
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
· 上周热点回顾(3.3-3.9)

阅读目录(Content)

此页目录为空

点击右上角即可分享
微信分享提示