import io
import os
import re
import sys
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Python 2 only: force the interpreter-wide default string encoding to UTF-8
# so implicit str/unicode conversions do not fall back to ASCII.
# `sys.setdefaultencoding` is removed from the `sys` namespace at interpreter
# start-up on Python 2, and `reload(sys)` brings it back, so the order of the
# two calls matters. Neither name exists on Python 3 (whose default is
# already UTF-8), hence the version guard.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding('utf-8')
def get_driver(url, chrome_driver=r"C:\Python27\chromedriver.exe", settle_seconds=2):
    """Start a headless Chrome session and load *url* in it.

    :param url: address of the page to open.
    :param chrome_driver: path to the chromedriver executable. Defaults to
        the location this script has always used.
    :param settle_seconds: fixed pause, in seconds, applied after the page
        load call returns and before the driver is handed back.
    :returns: the running driver with *url* loaded. The caller owns it and
        must call ``quit()`` on it when done.
    :raises: anything raised while loading the page; the browser is shut
        down before the exception propagates.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options, executable_path=chrome_driver)
    try:
        driver.get(url)
        time.sleep(settle_seconds)
    except Exception:
        # The browser process is already running at this point; without
        # this it would be orphaned because the caller never receives the
        # driver object needed to close it.
        driver.quit()
        raise
    return driver
def get_url(driver):
    """Collect the link targets found on the page currently loaded in *driver*.

    :param driver: browser session showing the index page; it must provide
        ``find_elements_by_xpath``.
    :returns: list of ``href`` values in document order. Anchors without an
        ``href`` are skipped so callers never receive ``None`` or an empty
        string as a URL.
    """
    # NOTE(review): absolute, position-based XPath - presumably tied to the
    # page layout at the time of writing; verify it still matches.
    anchors = driver.find_elements_by_xpath(
        "/html/body/div[7]/div[2]/div[1]/div[2]/div/ul[1]/li/span[2]/a"
    )
    hrefs = (anchor.get_attribute("href") for anchor in anchors)
    return [href for href in hrefs if href]
def get_text(driver):
    """Save the article currently loaded in *driver* as a UTF-8 text file.

    The headline is read from the element with class ``main-title`` and the
    body from the element with class ``article``. The body is written to
    ``sina_new/<headline>.txt`` (relative to the current working directory),
    preceded by a UTF-8 byte-order mark. The directory is created if needed.

    :param driver: browser session showing an article page; it must provide
        ``find_element_by_class_name``.
    :raises: whatever the driver raises when either element is missing, and
        ``IOError``/``OSError`` when the file cannot be written.
    """
    res_title = driver.find_element_by_class_name("main-title")
    res_data = driver.find_element_by_class_name("article")

    # Headlines can contain characters that are not allowed in Windows file
    # names (and path separators on any platform); swap them for "_" so
    # opening the file does not fail on an otherwise valid article.
    file_name = re.sub(r'[\\/:*?"<>|\r\n\t]', "_", res_title.text).strip(" .")
    if not file_name:
        file_name = "untitled"

    out_dir = "sina_new"
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # "utf-8-sig" writes the BOM itself and encodes explicitly, so the result
    # no longer depends on the process-wide default encoding.
    out_path = os.path.join(out_dir, file_name + ".txt")
    with io.open(out_path, "w", encoding="utf-8-sig") as f:
        f.write(res_data.text)
def main():
    """Save every article linked from the Sina China-news index page.

    Loads the index, collects the article links with ``get_url``, then opens
    each link in its own browser session and stores it with ``get_text``.
    Every browser session is shut down as soon as it is no longer needed,
    including when an error interrupts the run.
    """
    index_url = "https://news.sina.com.cn/china/"
    index_driver = get_driver(index_url)
    try:
        url_list = get_url(index_driver)
    finally:
        # The collected links are plain strings, so the index session can
        # be closed before the articles are fetched.
        index_driver.quit()

    for article_url in url_list:
        article_driver = get_driver(article_url)
        try:
            get_text(article_driver)
        finally:
            # Without this, one headless Chrome process is left running
            # for every article visited.
            article_driver.quit()


if __name__ == '__main__':
    main()