展开阅读全文 js 爬虫操作
"""Expand every '展开阅读全文 ∨' (expand full text) section on a gushiwen.org
poem page by scrolling and clicking, then snapshot the page HTML to disk.

Version 1: pure Selenium approach — scroll in 100px steps and click each
matching link until all of them have been expanded (or 100 steps elapse).
"""
from selenium import webdriver
import time
import random
from bs4 import BeautifulSoup  # was a wildcard import; only BeautifulSoup is used

browser = webdriver.Chrome()
url = 'https://so.gushiwen.org/shiwenv_ee16df5673bc.aspx'
browser.get(url)

# Total number of "expand full text" links present; we stop once we have
# successfully clicked that many.
ck_l_ori_len = len(browser.find_elements_by_link_text('展开阅读全文 ∨'))
ck_l_ori_ok = 0
try:
    for isc in range(100):
        if ck_l_ori_ok == ck_l_ori_len:
            break  # every link clicked — done
        time.sleep(1)
        # Scroll down incrementally so lazily-rendered links become
        # visible/clickable.  (The original also assigned a
        # scroll-to-bottom script here, but it was dead code — it was
        # immediately overwritten by this line.)
        browser.execute_script('window.scrollTo(0,100*{})'.format(isc))
        for link in browser.find_elements_by_link_text('展开阅读全文 ∨'):
            try:
                link.click()
                ck_l_ori_ok += 1
            except Exception as e:
                # A link may be off-screen or already expanded; log and
                # keep going rather than aborting the pass.
                print(e)
except Exception as e:
    print('window.scrollTo-->', e)

# XPath candidates for follow-up extraction (unused so far in this version).
xp_l = ['//*[@id="fanyi967"]/div/div[3]/a', ]

# Snapshot the fully-expanded page to a temp file under D:\myhtml.
# NOTE(review): hard-coded Windows path — the directory must already exist.
myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
with open(myhtml, 'w', encoding='utf-8') as fw:
    fw.write(browser.page_source)

sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
    bs = BeautifulSoup(myhtml_o, 'html.parser')
dd = 9  # presumably a debugger breakpoint anchor — harmless
// Click every '展开阅读全文 ∨' ("expand full text") anchor on the page.
// Fixes over the original one-liner: loop state is declared with `let`
// instead of leaking as implicit globals, and each click is wrapped in
// try/catch so one failing anchor (hidden, detached, intercepted) no
// longer aborts the whole loop.
let anchors = document.getElementsByTagName('a');
for (let i = 0; i < anchors.length; i++) {
    if (anchors[i].text === '展开阅读全文 ∨') {
        try {
            anchors[i].click();
        } catch (err) {
            console.log(err);
        }
    }
}
// Click every '展开阅读全文 ∨' ("expand full text") anchor, logging and
// skipping any anchor whose click() throws.
// Fix over the original: `a_`, `le` and `i` were implicit globals that
// leaked onto `window`; they are now properly declared, and the string
// comparison uses strict equality.
let links = document.getElementsByTagName('a');
let total = links.length;
for (let i = 0; i < total; i++) {
    if (links[i].text === '展开阅读全文 ∨') {
        try {
            links[i].click();
        } catch (err) {
            console.log(err);
        }
    }
}
"""Expand every '展开阅读全文 ∨' (expand full text) section on a gushiwen.org
poem page.

Version 2: first run a single in-page JavaScript pass that clicks every
matching anchor directly (fast, no scrolling needed), then fall back to
the version-1 scroll-and-click loop for any links the JS pass missed.
"""
from selenium import webdriver
import time
import random
from bs4 import BeautifulSoup  # was a wildcard import; kept for parity with v1

browser = webdriver.Chrome()
url = 'https://so.gushiwen.org/shiwenv_ee16df5673bc.aspx'
browser.get(url)

# One-shot JS pass: click every matching anchor inside the page itself;
# each click is guarded by try/catch inside the script.
js = "a_=document.getElementsByTagName('a');le=a_.length;for(i=0;i<le;i++){if(a_[i].text=='展开阅读全文 ∨'){try{a_[i].click()}catch(err){console.log(err)}}}"
try:
    browser.execute_script(js)
except Exception as e:
    print(e)

# Fallback: any links still reporting as unexpanded get the slower
# scroll-and-click treatment from version 1.
ck_l_ori_len = len(browser.find_elements_by_link_text('展开阅读全文 ∨'))
ck_l_ori_ok = 0
try:
    for isc in range(100):
        if ck_l_ori_ok == ck_l_ori_len:
            break  # everything expanded — done
        time.sleep(1)
        # Scroll down in 100px steps so lazily-rendered links become
        # clickable.  (A scroll-to-bottom assignment that was immediately
        # overwritten — dead code — has been removed, along with a large
        # commented-out copy of the v1 script.)
        browser.execute_script('window.scrollTo(0,100*{})'.format(isc))
        for link in browser.find_elements_by_link_text('展开阅读全文 ∨'):
            try:
                link.click()
                ck_l_ori_ok += 1
            except Exception as e:
                print(e)  # log and continue; one bad link must not abort the pass
except Exception as e:
    print('window.scrollTo-->', e)
"""Scrape a gushiwen.org poem page.

Version 3: expand all hidden '展开阅读全文 ∨' sections (JS pass plus a
scroll-and-click fallback), then parse the title, author, body text,
translation, annotations, references and author image URL with PyQuery.
"""
from selenium import webdriver
import time
import random
from bs4 import BeautifulSoup  # was a wildcard import; unused here but kept
from pyquery import PyQuery as pq

browser = webdriver.Chrome()
url = 'https://so.gushiwen.org/shiwenv_ee16df5673bc.aspx'
browser.get(url)

# One-shot JS pass: click every matching anchor inside the page itself.
js = "a_=document.getElementsByTagName('a');le=a_.length;for(i=0;i<le;i++){if(a_[i].text=='展开阅读全文 ∨'){try{a_[i].click()}catch(err){console.log(err)}}}"
try:
    browser.execute_script(js)
except Exception as e:
    print(e)

# Fallback: scroll-and-click any links the JS pass could not expand.
ck_l_ori_len = len(browser.find_elements_by_link_text('展开阅读全文 ∨'))
ck_l_ori_ok = 0
try:
    for isc in range(100):
        if ck_l_ori_ok == ck_l_ori_len:
            break  # everything expanded — done
        time.sleep(1)
        # Scroll in 100px steps so lazily-rendered links become clickable.
        # (The dead scroll-to-bottom assignment from earlier versions is
        # removed — it was immediately overwritten.)
        browser.execute_script('window.scrollTo(0,100*{})'.format(isc))
        for link in browser.find_elements_by_link_text('展开阅读全文 ∨'):
            try:
                link.click()
                ck_l_ori_ok += 1
            except Exception as e:
                print(e)  # one bad link must not abort the pass
except Exception as e:
    print('window.scrollTo-->', e)

doc = pq(browser.page_source)

# PyQuery serialises fragments with an xhtml namespace attribute; strip it
# once via a helper instead of repeating .replace(...) on every field.
# (The unused pq_r_d dict from the original has been dropped.)
XMLNS_ATTR = 'xmlns="http://www.w3.org/1999/xhtml"'


def _clean(fragment):
    """Return *fragment* with the xhtml namespace attribute removed."""
    return fragment.replace(XMLNS_ATTR, '')


# NOTE(review): the :nth-child selectors below encode the current page
# layout of so.gushiwen.org — verify them if the site markup changes.
article_ = _clean(doc('.left>:nth-child(2).sons>.cont>.contson').html())
title_d = {'h1': _clean(doc('.left>:nth-child(2).sons>.cont>:nth-child(2)').html())}
author_d = {'h3': doc('.left>:nth-child(2).sons>.cont>:nth-child(3)').text()}
translation_ = _clean(doc('.left>:nth-child(4)>.contyishang>:nth-child(2)').html())
explanation_ = _clean(doc('.left>:nth-child(4)>.contyishang>:nth-child(3)').html())
refer_ = _clean(doc('.left>:nth-child(4)>.cankao').html())
# Pull the src attribute value out of the serialised <img> markup.
author_img_url = (doc('.left>.sonspic>.cont>.divimg>:nth-child(1)')
                  .html().split('src="')[-1].split('"')[0])