展开阅读全文 js 爬虫操作

 

from selenium import webdriver
import time
import random
from bs4 import *

# Scrape one gushiwen.org poem page: click every "expand full text" link while
# scrolling down the page, then save the rendered HTML to disk and parse it
# with BeautifulSoup.
from selenium.webdriver.common.by import By  # locator strategy constants

# Visible text of the links that expand a collapsed section.
EXPAND_LINK_TEXT = '展开阅读全文 ∨'
# Upper bound on scroll-and-click passes (each pass sleeps one second).
MAX_SCROLL_PASSES = 100

# NOTE(review): the browser is never quit in this script; call browser.quit()
# once the session is no longer needed to avoid leaving a driver process behind.
browser = webdriver.Chrome()
url = 'https://so.gushiwen.org/shiwenv_ee16df5673bc.aspx'
browser.get(url)

# find_elements(By.LINK_TEXT, ...) is used instead of
# find_elements_by_link_text(), which was deprecated and then removed in
# Selenium 4; the generic form works on older releases as well.
ck_l_ori_len = len(browser.find_elements(By.LINK_TEXT, EXPAND_LINK_TEXT))
ck_l_ori_ok = 0  # number of successful clicks so far
try:
    for isc in range(MAX_SCROLL_PASSES):
        # ">=" rather than "==": a link that is still present after being
        # clicked is found and counted again on the next pass, so the counter
        # can jump past the initial total and "==" would never become true.
        if ck_l_ori_ok >= ck_l_ori_len:
            break
        time.sleep(1)
        # Scroll 100 px further down on every pass.
        js = 'window.scrollTo(0,100*{})'.format(isc)
        browser.execute_script(js)
        ck_l = browser.find_elements(By.LINK_TEXT, EXPAND_LINK_TEXT)
        for i in ck_l:
            try:
                i.click()
            except Exception as click_err:
                # Best effort: report the failed click and try the next link.
                print(click_err)
            else:
                ck_l_ori_ok += 1
except Exception as scroll_err:
    print('window.scrollTo-->', scroll_err)

# XPath candidates; not referenced by the rest of the visible script.
xp_l = ['//*[@id="fanyi967"]/div/div[3]/a', ]

# Snapshot the rendered page to a randomly numbered temporary file.
myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
with open(myhtml, 'w', encoding='utf-8') as fw:
    fw.write(browser.page_source)

# Prefix of an INSERT statement; no VALUES are appended in the visible script.
sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '

# Re-read the snapshot and parse it.
with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
    bs = BeautifulSoup(myhtml_o, 'html.parser')

dd = 9  # presumably a debugger breakpoint anchor

 

// Click every "展开阅读全文 ∨" (expand full text) link on the current page.
// The IIFE keeps the helper variables out of the page's global scope, and the
// slice() copy snapshots the live HTMLCollection so that DOM changes caused by
// a click cannot shift or remove entries while the loop is still running.
(function () {
    var links = Array.prototype.slice.call(document.getElementsByTagName('a'));
    for (var idx = 0; idx < links.length; idx++) {
        if (links[idx].text == '展开阅读全文 ∨') {
            links[idx].click();
        }
    }
})();

  

 

// Click every "展开阅读全文 ∨" (expand full text) link on the current page,
// logging (instead of aborting on) any error raised by an individual click.
// The IIFE keeps the helper variables out of the page's global scope, and the
// slice() copy snapshots the live HTMLCollection so that DOM changes caused by
// a click cannot shift or remove entries while the loop is still running.
(function () {
    var links = Array.prototype.slice.call(document.getElementsByTagName('a'));
    for (var idx = 0; idx < links.length; idx++) {
        if (links[idx].text == '展开阅读全文 ∨') {
            try {
                links[idx].click();
            } catch (err) {
                console.log(err);
            }
        }
    }
})();

  

from selenium import webdriver
import time
import random
from bs4 import *

# Expand every collapsed section of one gushiwen.org poem page by running a
# single JavaScript snippet inside the page. If the execute_script() call
# itself raises, fall back to scrolling down the page and clicking the links
# through Selenium.
from selenium.webdriver.common.by import By  # locator strategy constants

# Visible text of the links that expand a collapsed section.
EXPAND_LINK_TEXT = '展开阅读全文 ∨'
# Upper bound on fallback scroll-and-click passes (each sleeps one second).
MAX_SCROLL_PASSES = 100

# NOTE(review): the browser is never quit in this script; call browser.quit()
# once the session is no longer needed to avoid leaving a driver process behind.
browser = webdriver.Chrome()
url = 'https://so.gushiwen.org/shiwenv_ee16df5673bc.aspx'
browser.get(url)

# In-page snippet: walk all <a> elements and click those whose text matches,
# logging click errors to the browser console.
js = "a_=document.getElementsByTagName('a');le=a_.length;for(i=0;i<le;i++){if(a_[i].text=='展开阅读全文 ∨'){try{a_[i].click()}catch(err){console.log(err)}}}"
try:
    browser.execute_script(js)
except Exception as js_err:
    print(js_err)
    # Fallback path. find_elements(By.LINK_TEXT, ...) is used instead of
    # find_elements_by_link_text(), which was deprecated and then removed in
    # Selenium 4; the generic form works on older releases as well.
    ck_l_ori_len = len(browser.find_elements(By.LINK_TEXT, EXPAND_LINK_TEXT))
    ck_l_ori_ok = 0  # number of successful clicks so far
    try:
        for isc in range(MAX_SCROLL_PASSES):
            # ">=" rather than "==": a link that is still present after being
            # clicked is counted again on the next pass, so the counter can
            # jump past the initial total and "==" would never become true.
            if ck_l_ori_ok >= ck_l_ori_len:
                break
            time.sleep(1)
            # Scroll 100 px further down on every pass.
            js = 'window.scrollTo(0,100*{})'.format(isc)
            browser.execute_script(js)
            ck_l = browser.find_elements(By.LINK_TEXT, EXPAND_LINK_TEXT)
            for i in ck_l:
                try:
                    i.click()
                except Exception as click_err:
                    # Best effort: report the failed click and move on.
                    print(click_err)
                else:
                    ck_l_ori_ok += 1
    except Exception as scroll_err:
        print('window.scrollTo-->', scroll_err)

 

 

from selenium import webdriver
import time
import random
from bs4 import *
from pyquery import PyQuery as pq

# Expand every collapsed section of one gushiwen.org poem page (JavaScript
# first, Selenium scroll-and-click as a fallback), then pull the article,
# title, author, translation, explanation, references and author image URL
# out of the rendered HTML with PyQuery.
from selenium.webdriver.common.by import By  # locator strategy constants

# Visible text of the links that expand a collapsed section.
EXPAND_LINK_TEXT = '展开阅读全文 ∨'
# Upper bound on fallback scroll-and-click passes (each sleeps one second).
MAX_SCROLL_PASSES = 100

# NOTE(review): the browser is never quit in this script; call browser.quit()
# once the session is no longer needed to avoid leaving a driver process behind.
browser = webdriver.Chrome()
url = 'https://so.gushiwen.org/shiwenv_ee16df5673bc.aspx'
browser.get(url)

# In-page snippet: walk all <a> elements and click those whose text matches,
# logging click errors to the browser console.
js = "a_=document.getElementsByTagName('a');le=a_.length;for(i=0;i<le;i++){if(a_[i].text=='展开阅读全文 ∨'){try{a_[i].click()}catch(err){console.log(err)}}}"
try:
    browser.execute_script(js)
except Exception as js_err:
    print(js_err)
    # Fallback path. find_elements(By.LINK_TEXT, ...) is used instead of
    # find_elements_by_link_text(), which was deprecated and then removed in
    # Selenium 4; the generic form works on older releases as well.
    ck_l_ori_len = len(browser.find_elements(By.LINK_TEXT, EXPAND_LINK_TEXT))
    ck_l_ori_ok = 0  # number of successful clicks so far
    try:
        for isc in range(MAX_SCROLL_PASSES):
            # ">=" rather than "==": a link that is still present after being
            # clicked is counted again on the next pass, so the counter can
            # jump past the initial total and "==" would never become true.
            if ck_l_ori_ok >= ck_l_ori_len:
                break
            time.sleep(1)
            # Scroll 100 px further down on every pass.
            js = 'window.scrollTo(0,100*{})'.format(isc)
            browser.execute_script(js)
            ck_l = browser.find_elements(By.LINK_TEXT, EXPAND_LINK_TEXT)
            for i in ck_l:
                try:
                    i.click()
                except Exception as click_err:
                    # Best effort: report the failed click and move on.
                    print(click_err)
                else:
                    ck_l_ori_ok += 1
    except Exception as scroll_err:
        print('window.scrollTo-->', scroll_err)

doc = pq(browser.page_source)
# Replacement table; not referenced by the rest of the visible script.
pq_r_d = {'xmlns="http://www.w3.org/1999/xhtml"': ''}
# Namespace attribute stripped from every extracted HTML fragment
# (presumably injected when the fragments are serialized -- TODO confirm).
r_k, r_v = 'xmlns="http://www.w3.org/1999/xhtml"', ''


def _section_html(selector):
    """Return the inner HTML for *selector* with the xmlns attribute removed.

    Returns None when nothing on the page matches, instead of raising
    AttributeError on the None that ``.html()`` yields for an empty selection.
    """
    fragment = doc(selector).html()
    if fragment is None:
        return None
    return fragment.replace(r_k, r_v)


article_ = _section_html('.left>:nth-child(2).sons>.cont>.contson')
title_d = {'h1': _section_html('.left>:nth-child(2).sons>.cont>:nth-child(2)')}
author_d = {'h3': doc('.left>:nth-child(2).sons>.cont>:nth-child(3)').text()}
translation_ = _section_html('.left>:nth-child(4)>.contyishang>:nth-child(2)')
explanation_ = _section_html('.left>:nth-child(4)>.contyishang>:nth-child(3)')
refer_ = _section_html('.left>:nth-child(4)>.cankao')

# The image URL is the value of the last src="..." attribute inside the first
# child of .divimg; it is None when that markup is missing, rather than a
# crash or an arbitrary slice of unrelated HTML.
author_img_html = doc('.left>.sonspic>.cont>.divimg>:nth-child(1)').html()
if author_img_html and 'src="' in author_img_html:
    author_img_url = author_img_html.split('src="')[-1].split('"')[0]
else:
    author_img_url = None

d = 4  # presumably a debugger breakpoint anchor

  

 

posted @ 2018-04-24 20:43  papering  阅读(1533)  评论(0)  编辑  收藏  举报