未通过原因:近期在全网出现过高度相似的文章,因此被认为是旧闻。

 

 

 

 

from bs4 import *
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import requests
import time
import threading
import logging
import random

# Launch Chrome and load the source page that will be scraped.
browser = webdriver.Chrome()
url = 'https://so.gushiwen.org/shiwenv_ee16df5673bc.aspx'
browser.get(url)

# In-page script: walk every <a> element and click those whose text equals the
# site's "expand full text" label. A failing click is caught inside the page
# and only written to the browser console.
js = "a_=document.getElementsByTagName('a');le=a_.length;for(i=0;i<le;i++){if(a_[i].text=='展开阅读全文 ∨'){try{a_[i].click()}catch(err){console.log(err)}}}"
try:
    browser.execute_script(js)
except Exception as e:
    # Fallback path, reached only when the in-page script itself raised:
    # locate the same links through WebDriver and click them one by one.
    print(e)
    # Number of expand links present before any clicking starts.
    ck_l_ori_len = len(browser.find_elements_by_link_text('展开阅读全文 ∨'))
    # Number of clicks that completed without raising.
    ck_l_ori_ok = 0
    try:
        # At most 100 rounds, one second apart; round N scrolls to y = 100 * N.
        for isc in range(100):
            # Stop once as many clicks succeeded as links were found initially.
            if ck_l_ori_ok == ck_l_ori_len:
                break
            time.sleep(1)
            # This assignment is overwritten by the next line and never executed.
            js = 'window.scrollTo(0,document.body.scrollHeight)'
            js = 'window.scrollTo(0,100*{})'.format(isc)
            browser.execute_script(js)
            # Re-query on every round and try every link that still matches.
            ck_l = browser.find_elements_by_link_text('展开阅读全文 ∨')
            for i in ck_l:
                try:
                    i.click()
                    ck_l_ori_ok += 1
                except Exception as e:
                    # presumably the link is not clickable at the current
                    # scroll position yet - TODO confirm
                    print(e)
    except Exception as e:
        print('window.scrollTo-->', e)

# Parse the rendered page (taken after the expand clicks) with pyquery.
doc = pq(browser.page_source)
# Not referenced again in the visible script; r_k / r_v below hold the same pair.
pq_r_d = {'xmlns="http://www.w3.org/1999/xhtml"': ''}
# The XHTML namespace attribute is removed from every extracted HTML fragment.
r_k, r_v = 'xmlns="http://www.w3.org/1999/xhtml"', ''
# NOTE(review): each .html() result is used without a None check; if a selector
# matches nothing, .replace presumably raises AttributeError - confirm against
# pyquery's behaviour for empty selections.
article_ = doc('.left>:nth-child(2).sons>.cont>.contson').html().replace(r_k, r_v)
title_d = {'h1': doc('.left>:nth-child(2).sons>.cont>:nth-child(2)').html().replace(r_k, r_v)}
author_d = {'h3': doc('.left>:nth-child(2).sons>.cont>:nth-child(3)').text()}
translation_ = doc('.left>:nth-child(4)>.contyishang>:nth-child(2)').html().replace(r_k, r_v)
explanation_ = doc('.left>:nth-child(4)>.contyishang>:nth-child(3)').html().replace(r_k, r_v)
refer_ = doc('.left>:nth-child(4)>.cankao').html().replace(r_k, r_v)

# Take the text between the last 'src="' and the following '"' in the
# first child of .divimg, i.e. the image URL.
author_img_url = doc('.left>.sonspic>.cont>.divimg>:nth-child(1)').html().split('src="')[-1].split('"')[0]

# Header fragment: <h1>title</h1><h3>author line</h3> followed by the image.
k = 'h1'
v = title_d[k]
db_html = '<{}>{}</{}>'.format(k, v, k)
k = 'h3'
v = author_d[k]
db_html = '{}<{}>{}</{}>'.format(db_html, k, v, k)
db_html = '{}{}'.format(db_html, '<br><img src="{}" ><br>'.format(author_img_url))
# Final body: header, article, then explanation_, translation_ and refer_,
# separated by double line breaks.
l = [db_html, article_, explanation_, translation_, refer_]
db_html = '<br><br>'.join(l)

# Unwrap hyperlinks so that only their inner text stays in the article.
# Both the opening tag (whatever attribute comes first) and the closing
# '</a>' are removed. The match runs through the tag's own '>' so that no
# stray '>' or orphaned '</a>' is left behind in the published text.
import re

db_html = re.sub(r'</?a(?:\s[^>]*)?>', '', db_html)

# Toutiao article pages; one of them is opened before the login page.
f_url_l = ['https://www.toutiao.com/a6514526304476332552/', 'https://www.toutiao.com/a6514778729951003150/']
f_url_l += ['https://www.toutiao.com/a6514216125151052291/',
            'https://www.toutiao.com/a6512315164463727111/']
# Pick an entry using the current epoch second modulo the list length.
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
# browser = webdriver.Chrome()

# Navigate the existing session by assigning window.location from script.
js = 'window.location.href="{}";'.format(f_url_l_a)
browser.execute_script(js)
# browser.get(f_url_l_a)
# Random 10-20 second pause (presumably to resemble a human visitor - TODO confirm).
time.sleep(random.randint(10, 20))

# The plain login URL is overwritten by the next line; only the URL carrying
# the service / redirect_url query is actually loaded.
js = 'window.location.href="https://sso.toutiao.com/login/";'
js = 'window.location.href="https://sso.toutiao.com/login/?service=https://mp.toutiao.com/sso_confirm/?redirect_url=/";'
browser.execute_script(js)
time.sleep(random.randint(10, 20))

# Timestamp taken when this section runs; not referenced again in the visible script.
start_time = time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time()))
os_sep = os.sep
# Directory containing this script, and the script's own file name.
this_file_abspath, this_file_name = os.path.dirname(os.path.abspath(__file__)), os.path.abspath(__file__).split(os_sep)[
    -1]
# Log file name is "<script name>.log"; it is a bare file name, so it is
# resolved against the current working directory.
logf = this_file_name + '.log'
try:
    # Append INFO-and-above records, including thread and process ids.
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s[thread:%(thread)d][process:%(process)d]',
                        datefmt='%a, %d %b %Y %H:%M:%S',
                        filename=logf,
                        filemode='a')
except Exception as e:
    # Logging could not be configured: write the reason straight to the log
    # file, echo it, and terminate the process.
    s = '%s%s%s' % ('logging.basicConfig EXCEPTION ', time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), e)
    with open(logf, 'a') as fo:
        fo.write(s)
        print(s)
        # NOTE(review): os._exit terminates without normal interpreter cleanup
        # and is called while the file is still open, so the buffered write
        # above may never reach disk - confirm.
        os._exit(4002)

logging.info('START')

# Module-level image URL; the name is rebound later by the image upload loop.
img_url = 'https://s3.pstatp.com/toutiao/static/img/logo.201f80d.png'
# Local directory that receives downloaded images; the trailing separator is
# required because file names are appended by plain string concatenation.
img_dir = 'C:\\Users\\sas\\PycharmProjects\\py_win_to_unix\\crontab_chk_url\\personas\\trunk\\plugins\\spider\\dl_img_tmp\\'
# Repeats the import already made at the top of the script.
import random


def spider_webimg_dl_return_local_img_path(img_dir, img_url, local_default='default.DONOT_REMOVE.png', timeout=30):
    """Download ``img_url`` into ``img_dir`` and return the local file path.

    The file name is built from the current timestamp, the calling thread's
    id and a random number, always with a ``.png`` suffix.

    Args:
        img_dir: Target directory. It is concatenated with the file name
            as-is, so it must already end with a path separator.
        img_url: URL of the image to fetch.
        local_default: File name (inside ``img_dir``) returned whenever the
            download cannot be completed.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        Path of the freshly written file, or ``img_dir + local_default`` when
        the request fails, the server answers with an error status, the body
        is empty, or the file cannot be written.
    """
    fallback = '%s%s' % (img_dir, local_default)
    try:
        response = requests.get(img_url, timeout=timeout)
        # Do not store an HTML error page under a .png name.
        response.raise_for_status()
        content = response.content
        if not content:
            return fallback
        target = '%s%s%s%s%s' % (
            img_dir, time.strftime('%Y%m%d%H%M%S', time.localtime(time.time())), str(threading.get_ident()),
            random.randint(1234, 9876), '.png')
        with open(target, 'wb') as f:
            f.write(content)
    except Exception as e:
        # Deliberate best effort: any failure (network, HTTP status, disk)
        # falls back to the default image so the caller always gets the
        # fallback path instead of a path that was never written.
        print(e)
        return fallback
    return target


# Title and body of the article that will be published.
d = {}
# The h1 text in Chinese book-title brackets followed by the h3 text,
# cut to at most 30 characters.
d['title'] = '《{}》{}'.format(title_d['h1'], author_d['h3'])[0:30]
d['content'] = db_html

# Login method and account.
# NOTE(review): the account name and password are hard-coded in the script.
ac_type, ac = 'qq', {}
ac['u'], ac['p'] = '2783', 'w3q'
if ac_type == 'qq':
    myid, mypwd = ac['u'], ac['p']
    # The li[3] path is overwritten by the next line; li[2] is the one clicked.
    xp = '/html/body/div/div/div[2]/div/div/div/ul/li[3]'
    xp = '/html/body/div/div/div[2]/div/div/div/ul/li[2]'
    # presumably the QQ entry in the list of login providers - TODO confirm
    browser.find_element_by_xpath(xp).click()
    time.sleep(10)
    # Set the values of the elements with ids "u" and "p" from script.
    # NOTE(review): the values are spliced into the JS source unescaped, so a
    # double quote or backslash in either value would break the statement.
    js = '%s%s%s' % ('document.getElementById("u").value="', myid, '"')
    browser.execute_script(js)
    js = '%s%s%s' % ('document.getElementById("p").value="', mypwd, '"')
    browser.execute_script(js)
    time.sleep(random.randint(5, 15))
    # Click the element with id "go" (presumably the submit button - TODO confirm).
    xp_newpage = '//*[@id="go"]'
    browser.find_element_by_xpath(xp_newpage).click()
    time.sleep(random.randint(10, 20))
browser.refresh()
# Visit the Toutiao home page, then the publishing page.
js = 'window.location.href="https://www.toutiao.com/";'
browser.execute_script(js)
browser.refresh()
time.sleep(6)
# The profile_v2 URL is overwritten by the next line; only profile_v3 is loaded.
js = 'window.location.href="https://mp.toutiao.com/profile_v2/publish/";'
js = 'window.location.href="https://mp.toutiao.com/profile_v3/graphic/publish";'
browser.execute_script(js)
time.sleep(6)
dbhtml_str, pgc_img_url_l = d['content'], []
# Write the article HTML to a randomly numbered temp file and read it back to
# collect the src of every <img>. The D:\myhtml directory must already exist.
myhtml = 'D:\\myhtml\\{}tmp.html'.format(random.randint(123, 999))
with open(myhtml, 'w', encoding='utf-8') as fw:
    fw.write(dbhtml_str)
with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
    bs = BeautifulSoup(myhtml_o, 'html.parser')
    # NOTE(review): an <img> without a src attribute raises KeyError here.
    pgc_img_url_l = [i.attrs['src'] for i in bs.find_all('img')]
# Open the resource manager page and locate its file <input>.
js = 'window.location.href="https://mp.toutiao.com/profile_v3/graphic/resource-manager";'
browser.execute_script(js)
time.sleep(2)
xp = '//*[@id="graphic"]/div/div/div[3]/div/div[1]/div[3]/div/input'
upload = browser.find_element_by_xpath(xp)
# Download each image locally and hand the local path to the file input.
for img_url in pgc_img_url_l:
    local_img_path = spider_webimg_dl_return_local_img_path(img_dir, img_url)
    upload.send_keys(local_img_path)
# Allow ten seconds per image for the uploads to finish.
time.sleep(10 * len(pgc_img_url_l))
# Read back the src of the <img> inside the first len(pgc_img_url_l) elements
# with class "pic" (presumably the newest uploads come first - TODO confirm).
pgc_img_url_l_toutiao = [i.find_element_by_tag_name('img').get_attribute('src') for i in
                         browser.find_elements_by_class_name('pic')][0:len(pgc_img_url_l)]
# Sort the collected URLs in descending string order.
pgc_img_url_l_toutiao = [i for i in sorted(pgc_img_url_l_toutiao, reverse=True)]

# Bare string literal with no runtime effect: a sample of the <img> markup
# that the template below reproduces.
'''
<img onload="editor.fireEvent('contentchange')" src="https://p1.pstatp.com/large/pgc-image/15238623686755f9e3c409a" _src="https://p1.pstatp.com/large/pgc-image/15238623686755f9e3c409a" alt="pgc-image/15238623686755f9e3c409a" buttonadded="true"> 
'''
dbhtml_str_ = dbhtml_str
img_n = dbhtml_str_.count('<img')
# Template for a hosted image; 'TTimgCode' is replaced with the last path
# segment of the uploaded image URL. The assignment is written twice with the
# same value.
s = '<img onload="editor.fireEvent(\'contentchange\')" src="https://p1.pstatp.com/large/pgc-image/TTimgCode" _src="https://p1.pstatp.com/large/pgc-image/TTimgCode" alt="pgc-image/TTimgCode" buttonadded="true">'
s = '<img onload="editor.fireEvent(\'contentchange\')" src="https://p1.pstatp.com/large/pgc-image/TTimgCode" _src="https://p1.pstatp.com/large/pgc-image/TTimgCode" alt="pgc-image/TTimgCode" buttonadded="true">'
#     s = "<img onload='editor.fireEvent(\'contentchange\')' src='https://p1.pstatp.com/large/pgc-image/TTimgCode' _src='https://p1.pstatp.com/large/pgc-image/TTimgCode' alt='pgc-image/TTimgCode' buttonadded='true'>"
ss = ''
# Alias, not a copy: the del below also shrinks pgc_img_url_l_toutiao.
l = pgc_img_url_l_toutiao
for i in range(img_n):
    # Find the next '<img ' tag; after the first one, resume just past the
    # start of the tag that was replaced in the previous iteration.
    if i == 0:
        p1 = dbhtml_str.index('<img ', 0)
    else:
        p1 = dbhtml_str.index('<img ', p1 + 3)

    # Mask every '>' before p1 so that index() returns the '>' closing this tag.
    tmp = '{}{}'.format(dbhtml_str[0:p1].replace('>', 'X'), dbhtml_str[p1:])
    p2 = tmp.index('>')
    # NOTE(review): l[0] raises IndexError when fewer uploaded URLs were
    # collected than there are <img> tags.
    ss = s.replace('TTimgCode', l[0].split('/')[-1])
    # Replace the whole original tag with the hosted-image markup.
    dbhtml_str = '{}{}{}'.format(dbhtml_str[0:p1], ss, dbhtml_str[p2 + 1:])
    del l[0]
print('-----------------')
print(dbhtml_str)
# Open the article editor, type the title, then load the prepared HTML into
# the body of the rich-text editor's iframe.
time.sleep(2)
js = 'window.location.href="https://mp.toutiao.com/profile_v3/graphic/publish";'
browser.execute_script(js)
time.sleep(6)
xp_newpage = '//*[@id="title"]'
mytxt = d['title']
# A single space is sent before the real title
# (presumably to focus/activate the field - TODO confirm).
browser.find_element_by_xpath(xp_newpage).send_keys(Keys.SPACE)
browser.find_element_by_xpath(xp_newpage).send_keys(mytxt)
time.sleep(2)
xp = '//*[@id="edui18_body"]/div[1]'
browser.find_element_by_xpath(xp).click()
time.sleep(2)
xp = '//*[@id="images"]/div[1]/div/span'
browser.find_element_by_xpath(xp).click()
time.sleep(3)

# Normalise the HTML before handing it to the editor. The pairs are applied
# in this exact order: the onload attribute has to be removed while it still
# contains double quotes, i.e. before quotes are converted.
r_d = (
    ('onload="editor.fireEvent(\'contentchange\')"', ''),
    ('"', "'"),
    ('\n', ''),
)
dbhtml_str_py_js = dbhtml_str
for k, v in r_d:
    dbhtml_str_py_js = dbhtml_str_py_js.replace(k, v)
# NOTE(review): this leaves the '&' of every '&nbsp;' in place - confirm intent.
dbhtml_str_py_js = dbhtml_str_py_js.replace('nbsp;', ' ')
# Pass the HTML as a script argument instead of splicing it into the JS
# source, so backslashes or stray line terminators in the content can neither
# corrupt the text nor produce a syntax error.
js = 'document.getElementById("ueditor_0").contentWindow.document.getElementsByTagName("body")[0].innerHTML=arguments[0];'
browser.execute_script(js, dbhtml_str_py_js)

# Automatic
# Message: unknown error: Element is not clickable at point (589, 952)
# Scroll to the bottom of the page twice, one second apart, before clicking
# the controls below (the message above is the error this works around).
try:
    for isc in range(2):
        time.sleep(1)
        js = 'window.scrollTo(0,document.body.scrollHeight)'
        browser.execute_script(js)
except Exception as e:
    print('window.scrollTo-->', e)
time.sleep(10)

# Best-effort click on the third label of an option group; which option this
# is cannot be told from the script. A failure is printed and ignored.
try:
    xp = '//*[@id="graphic"]/div/div/div[2]/div[2]/div[1]/div[2]/div/div/div[1]/div/label[3]/span'
    browser.find_element_by_xpath(xp).click()
    time.sleep(1)
except Exception as e:
    print(e)

time.sleep(2)
# Do not run ads (overwritten by the next assignment, so never clicked)
xp = '//*[@id="graphic"]/div/div/div[2]/div[2]/div[2]/div[2]/div[1]/label[2]/span'
# Run ads
xp = '//*[@id="graphic"]/div/div/div[2]/div[2]/div[2]/div[2]/div[1]/label[1]/span'
browser.find_element_by_xpath(xp).click()
time.sleep(1)
# Save as draft (overwritten by the next assignment, so never clicked)
xp = '//*[@id="graphic"]/div/div/div[2]/div[3]/div[2]/div[2]'
# Publish
xp = '//*[@id="graphic"]/div/div/div[2]/div[3]/div[2]/div[1]'

browser.find_element_by_xpath(xp).click()

# Wait 20-30 seconds after clicking publish, then go to the home page.
time.sleep(random.randint(20, 30))
js = 'window.location.href="https://www.toutiao.com/"'
browser.execute_script(js)
# Close the browser; a failure to quit is printed and logged with traceback.
try:
    browser.quit()
except Exception as e1:
    print(e1)
    logging.exception(e1)

  

<h1>秋夜读书每以二鼓尽为节</h1><h3>宋代
:
陆游</h3><br><img onload="editor.fireEvent('contentchange')" src="https://p1.pstatp.com/large/pgc-image/15246284315719a928e33a1" _src="https://p1.pstatp.com/large/pgc-image/15246284315719a928e33a1" alt="pgc-image/15246284315719a928e33a1" buttonadded="true"><br><br><br>
腐儒碌碌叹无奇,独喜遗编不我欺。<br  />白发无情侵老境,青灯有味似儿时。<br  />高梧策策传寒意,叠鼓冬冬迫睡期。<br  />秋夜渐长饥作祟,一杯山药进琼糜。
<br><br><strong >注释<br /></strong>以二鼓尽为节:指读书读到二更天才停止。二鼓,指更鼓报过二更。<br  />腐儒:作者自称。<br  />碌碌:平庸,无所作为。<br  />遗编:遗留后世的著作,泛指古代典籍。<br  />不我欺:并不欺骗我。<br  />策策:象声词,指风摇动树叶发出的响声。<br  />叠鼓:轻轻击鼓,指更鼓。<br  />冬冬:象声词,指鼓声。<br  />迫睡期:催人睡觉。<br  />作祟:暗中捣鬼,形容夜深了还没有睡觉,肚子饿了。<br  />琼糜:像琼浆一样甘美的粥。糜,粥。>▲</a><br><br><strong >译文<br /></strong>我这个迂腐的儒生,可叹一生碌碌无奇,却只爱前人留下来的著作,从不将我欺骗。<br  />白发无情地爬上头顶,渐渐地进入老年,读书的青灯却依旧像儿时那样亲切有味。<br  />高大的梧桐策策作响,传来一阵阵寒意,读书兴致正浓,忽听更鼓冬冬催人入睡。<br  />秋夜漫漫,饥肠辘辘,再也难以读下去,喝杯山药煮成的薯粥,胜过那佳肴美味。<br><br>
<p  style=" color:#999999;margin:0px; font-size:12px;line-height:160%;">参考资料:</p>
<div  style="clear:both; float:left;color:#999999; font-size:12px; width:630px; margin-top:4px;">
<span style="width:20px; float:left;">1、</span>
<span style="width:610px; float:left;">刘扬忠注评.陆游诗词选评:三秦出版社,2008.2:9-10</span>
</div>
<div  style="clear:both; float:left;color:#999999; font-size:12px; width:630px; margin-top:4px;">
<span style="width:20px; float:left;">2、</span>
<span style="width:610px; float:left;">(宋)陆游著,王水照,高克勤选注.陆游选集:人民文学出版社,1997年11月:16</span>
</div>
<div  style="clear:both; float:left;color:#999999; font-size:12px; width:630px; margin-top:4px;">
<span style="width:20px; float:left;">3、</span>
<span style="width:610px; float:left;">邓建烈主编.高中文言文精译精讲精练 高二:上海交通大学出版社,2006.06:35</span>
</div>

 

 

from bs4 import *
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import requests
import time
import threading
import logging
import random

# Timestamp taken when this section runs; not referenced again in the visible script.
start_time = time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time()))
os_sep = os.sep
# Directory containing this script, and the script's own file name.
this_file_abspath, this_file_name = os.path.dirname(os.path.abspath(__file__)), os.path.abspath(__file__).split(os_sep)[
    -1]
# Log file name is "<script name>.log"; it is a bare file name, so it is
# resolved against the current working directory.
logf = this_file_name + '.log'
try:
    # Append INFO-and-above records, including thread and process ids.
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s[thread:%(thread)d][process:%(process)d]',
                        datefmt='%a, %d %b %Y %H:%M:%S',
                        filename=logf,
                        filemode='a')
except Exception as e:
    # Logging could not be configured: write the reason straight to the log
    # file, echo it, and terminate the process.
    s = '%s%s%s' % ('logging.basicConfig EXCEPTION ', time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), e)
    with open(logf, 'a') as fo:
        fo.write(s)
        print(s)
        # NOTE(review): os._exit terminates without normal interpreter cleanup
        # and is called while the file is still open, so the buffered write
        # above may never reach disk - confirm.
        os._exit(4002)

logging.info('START')

# Module-level image URL; the name is rebound later by the image upload loop.
img_url = 'https://s3.pstatp.com/toutiao/static/img/logo.201f80d.png'
# Local directory that receives downloaded images; the trailing separator is
# required because file names are appended by plain string concatenation.
img_dir = 'C:\\Users\\sas\\PycharmProjects\\py_win_to_unix\\crontab_chk_url\\personas\\trunk\\plugins\\spider\\dl_img_tmp\\'
# Repeats the import already made at the top of the script.
import random


def spider_webimg_dl_return_local_img_path(img_dir, img_url, local_default='default.DONOT_REMOVE.png', timeout=30):
    """Download ``img_url`` into ``img_dir`` and return the local file path.

    The file name is built from the current timestamp, the calling thread's
    id and a random number, always with a ``.png`` suffix.

    Args:
        img_dir: Target directory. It is concatenated with the file name
            as-is, so it must already end with a path separator.
        img_url: URL of the image to fetch.
        local_default: File name (inside ``img_dir``) returned whenever the
            download cannot be completed.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        Path of the freshly written file, or ``img_dir + local_default`` when
        the request fails, the server answers with an error status, the body
        is empty, or the file cannot be written.
    """
    fallback = '%s%s' % (img_dir, local_default)
    try:
        response = requests.get(img_url, timeout=timeout)
        # Do not store an HTML error page under a .png name.
        response.raise_for_status()
        content = response.content
        if not content:
            return fallback
        target = '%s%s%s%s%s' % (
            img_dir, time.strftime('%Y%m%d%H%M%S', time.localtime(time.time())), str(threading.get_ident()),
            random.randint(1234, 9876), '.png')
        with open(target, 'wb') as f:
            f.write(content)
    except Exception as e:
        # Deliberate best effort: any failure (network, HTTP status, disk)
        # falls back to the default image so the caller always gets the
        # fallback path instead of a path that was never written.
        print(e)
        return fallback
    return target


# Launch Chrome and load a source page once before logging in.
browser = webdriver.Chrome()
url = 'https://so.gushiwen.org/shiwenv_ee16df5673bc.aspx'
browser.get(url)
# Toutiao article pages; one of them is opened before the login page.
f_url_l = ['https://www.toutiao.com/a6514526304476332552/', 'https://www.toutiao.com/a6514778729951003150/']
f_url_l += ['https://www.toutiao.com/a6514216125151052291/',
            'https://www.toutiao.com/a6512315164463727111/']
# Pick an entry using the current epoch second modulo the list length.
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
# browser = webdriver.Chrome()

# Navigate the existing session by assigning window.location from script.
js = 'window.location.href="{}";'.format(f_url_l_a)
browser.execute_script(js)
# browser.get(f_url_l_a)
# Random 10-20 second pause (presumably to resemble a human visitor - TODO confirm).
time.sleep(random.randint(10, 20))

# The plain login URL is overwritten by the next line; only the URL carrying
# the service / redirect_url query is actually loaded.
js = 'window.location.href="https://sso.toutiao.com/login/";'
js = 'window.location.href="https://sso.toutiao.com/login/?service=https://mp.toutiao.com/sso_confirm/?redirect_url=/";'
browser.execute_script(js)
time.sleep(random.randint(10, 20))
# Login method and account.
# NOTE(review): the account name and password are hard-coded in the script.
ac_type, ac = 'qq', {}
ac['u'], ac['p'] = '344', 'gregr'
if ac_type == 'qq':
    myid, mypwd = ac['u'], ac['p']
    # The li[3] path is overwritten by the next line; li[2] is the one clicked.
    xp = '/html/body/div/div/div[2]/div/div/div/ul/li[3]'
    xp = '/html/body/div/div/div[2]/div/div/div/ul/li[2]'
    # presumably the QQ entry in the list of login providers - TODO confirm
    browser.find_element_by_xpath(xp).click()
    time.sleep(10)
    # Set the values of the elements with ids "u" and "p" from script.
    # NOTE(review): the values are spliced into the JS source unescaped, so a
    # double quote or backslash in either value would break the statement.
    js = '%s%s%s' % ('document.getElementById("u").value="', myid, '"')
    browser.execute_script(js)
    js = '%s%s%s' % ('document.getElementById("p").value="', mypwd, '"')
    browser.execute_script(js)
    time.sleep(random.randint(5, 15))
    # Click the element with id "go" (presumably the submit button - TODO confirm).
    xp_newpage = '//*[@id="go"]'
    browser.find_element_by_xpath(xp_newpage).click()
    time.sleep(random.randint(10, 20))
browser.refresh()


# Publish one article per URL listed in myurl.txt, reusing the browser session
# that is already logged in. A failure while handling one URL is printed and
# logged with its traceback, then the loop continues with the next URL.
import re

with open('myurl.txt', 'r') as fr:
    for url in fr:
        # Drop the line ending (including '\r' from CRLF files) and skip blank
        # lines so an empty line never triggers a navigation to "".
        url = url.strip()
        if not url:
            continue
        try:
            # --- 1. Load the source page and expand its collapsed sections ---
            js = 'window.location.href="{}";'.format(url)
            browser.execute_script(js)
            # In-page script: click every <a> whose text equals the site's
            # "expand full text" label; click errors are caught inside the
            # page and only written to the browser console.
            js = "a_=document.getElementsByTagName('a');le=a_.length;for(i=0;i<le;i++){if(a_[i].text=='展开阅读全文 ∨'){try{a_[i].click()}catch(err){console.log(err)}}}"
            try:
                browser.execute_script(js)
            except Exception as e:
                # Fallback, reached only when the in-page script itself
                # raised: click the same links through WebDriver while
                # scrolling down 100 px per round, for at most 100 rounds.
                print(e)
                ck_l_ori_len = len(browser.find_elements_by_link_text('展开阅读全文 ∨'))
                ck_l_ori_ok = 0
                try:
                    for isc in range(100):
                        # Stop once as many clicks succeeded as links were
                        # found initially.
                        if ck_l_ori_ok == ck_l_ori_len:
                            break
                        time.sleep(1)
                        js = 'window.scrollTo(0,100*{})'.format(isc)
                        browser.execute_script(js)
                        ck_l = browser.find_elements_by_link_text('展开阅读全文 ∨')
                        for i in ck_l:
                            try:
                                i.click()
                                ck_l_ori_ok += 1
                            except Exception as e:
                                print(e)
                except Exception as e:
                    print('window.scrollTo-->', e)

            # --- 2. Extract the article parts from the rendered page ---
            doc = pq(browser.page_source)
            # The XHTML namespace attribute is removed from every fragment.
            r_k, r_v = 'xmlns="http://www.w3.org/1999/xhtml"', ''
            article_ = doc('.left>:nth-child(2).sons>.cont>.contson').html().replace(r_k, r_v)
            title_d = {'h1': doc('.left>:nth-child(2).sons>.cont>:nth-child(2)').html().replace(r_k, r_v)}
            author_d = {'h3': doc('.left>:nth-child(2).sons>.cont>:nth-child(3)').text()}
            translation_ = doc('.left>:nth-child(4)>.contyishang>:nth-child(2)').html().replace(r_k, r_v)
            explanation_ = doc('.left>:nth-child(4)>.contyishang>:nth-child(3)').html().replace(r_k, r_v)
            refer_ = doc('.left>:nth-child(4)>.cankao').html().replace(r_k, r_v)
            # Text between the last 'src="' and the following '"'.
            author_img_url = doc('.left>.sonspic>.cont>.divimg>:nth-child(1)').html().split('src="')[-1].split('"')[0]

            # --- 3. Assemble the article HTML ---
            db_html = '<h1>{}</h1><h3>{}</h3>'.format(title_d['h1'], author_d['h3'])
            db_html = '{}{}'.format(db_html, '<br><img src="{}" ><br>'.format(author_img_url))
            l = [db_html, article_, explanation_, translation_, refer_]
            db_html = '<br><br>'.join(l)
            # Unwrap hyperlinks, keeping only their inner text. Opening tags
            # are removed through their own '>' and closing '</a>' tags are
            # removed too, so no stray '>' or orphaned '</a>' is published.
            db_html = re.sub(r'</?a(?:\s[^>]*)?>', '', db_html)
            d = {}
            # Title: h1 text in book-title brackets plus the h3 text, cut to
            # at most 30 characters.
            d['title'] = '《{}》{}'.format(title_d['h1'], author_d['h3'])[0:30]
            d['content'] = db_html

            # --- 4. Upload the article's images to the resource manager ---
            js = 'window.location.href="https://mp.toutiao.com/profile_v3/graphic/publish";'
            browser.execute_script(js)
            time.sleep(6)
            dbhtml_str, pgc_img_url_l = d['content'], []
            # The HTML is written to a randomly numbered temp file and read
            # back to collect every <img> src; D:\myhtml must already exist.
            myhtml = 'D:\\myhtml\\{}tmp.html'.format(random.randint(123, 999))
            with open(myhtml, 'w', encoding='utf-8') as fw:
                fw.write(dbhtml_str)
            with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
                bs = BeautifulSoup(myhtml_o, 'html.parser')
                pgc_img_url_l = [i.attrs['src'] for i in bs.find_all('img')]
            js = 'window.location.href="https://mp.toutiao.com/profile_v3/graphic/resource-manager";'
            browser.execute_script(js)
            time.sleep(2)
            xp = '//*[@id="graphic"]/div/div/div[3]/div/div[1]/div[3]/div/input'
            upload = browser.find_element_by_xpath(xp)
            for img_url in pgc_img_url_l:
                local_img_path = spider_webimg_dl_return_local_img_path(img_dir, img_url)
                upload.send_keys(local_img_path)
            # Allow ten seconds per image for the uploads to finish.
            time.sleep(10 * len(pgc_img_url_l))
            # src of the <img> inside the first len(pgc_img_url_l) elements
            # with class "pic", sorted in descending string order.
            pgc_img_url_l_toutiao = [i.find_element_by_tag_name('img').get_attribute('src') for i in
                                     browser.find_elements_by_class_name('pic')][0:len(pgc_img_url_l)]
            pgc_img_url_l_toutiao = sorted(pgc_img_url_l_toutiao, reverse=True)

            # --- 5. Point every <img> at its hosted copy ---
            # Template mirroring the markup the editor itself produces;
            # 'TTimgCode' is replaced with the last path segment of the
            # uploaded image URL.
            img_n = dbhtml_str.count('<img')
            s = '<img onload="editor.fireEvent(\'contentchange\')" src="https://p1.pstatp.com/large/pgc-image/TTimgCode" _src="https://p1.pstatp.com/large/pgc-image/TTimgCode" alt="pgc-image/TTimgCode" buttonadded="true">'
            # Alias, not a copy: uploaded URLs are consumed from the front.
            l = pgc_img_url_l_toutiao
            search_from = 0
            for _ in range(img_n):
                # Start and closing '>' of the next original <img ...> tag.
                p1 = dbhtml_str.index('<img ', search_from)
                p2 = dbhtml_str.index('>', p1)
                # Running out of uploaded URLs raises IndexError, which the
                # per-URL handler below reports before moving on.
                ss = s.replace('TTimgCode', l[0].split('/')[-1])
                dbhtml_str = '{}{}{}'.format(dbhtml_str[0:p1], ss, dbhtml_str[p2 + 1:])
                del l[0]
                # Resume just past the start of the tag that was inserted.
                search_from = p1 + 3
            print('-----------------')
            print(dbhtml_str)

            # --- 6. Fill in the title and the editor body ---
            time.sleep(2)
            js = 'window.location.href="https://mp.toutiao.com/profile_v3/graphic/publish";'
            browser.execute_script(js)
            time.sleep(6)
            xp_newpage = '//*[@id="title"]'
            mytxt = d['title']
            # A single space is sent before the real title
            # (presumably to focus/activate the field - TODO confirm).
            browser.find_element_by_xpath(xp_newpage).send_keys(Keys.SPACE)
            browser.find_element_by_xpath(xp_newpage).send_keys(mytxt)
            time.sleep(2)
            xp = '//*[@id="edui18_body"]/div[1]'
            browser.find_element_by_xpath(xp).click()
            time.sleep(2)
            xp = '//*[@id="images"]/div[1]/div/span'
            browser.find_element_by_xpath(xp).click()
            time.sleep(3)
            # Normalise the HTML before handing it to the editor. The pairs
            # are applied in this exact order: the onload attribute has to be
            # removed while it still contains double quotes.
            r_d = (
                ('onload="editor.fireEvent(\'contentchange\')"', ''),
                ('"', "'"),
                ('\n', ''),
            )
            dbhtml_str_py_js = dbhtml_str
            for k, v in r_d:
                dbhtml_str_py_js = dbhtml_str_py_js.replace(k, v)
            # NOTE(review): this leaves the '&' of every '&nbsp;' in place -
            # confirm intent.
            dbhtml_str_py_js = dbhtml_str_py_js.replace('nbsp;', ' ')
            # Pass the HTML as a script argument instead of splicing it into
            # the JS source, so backslashes or stray line terminators in the
            # content can neither corrupt the text nor cause a syntax error.
            js = 'document.getElementById("ueditor_0").contentWindow.document.getElementsByTagName("body")[0].innerHTML=arguments[0];'
            browser.execute_script(js, dbhtml_str_py_js)

            # --- 7. Choose options and publish ---
            # Scroll to the bottom first; works around
            # "Element is not clickable at point (589, 952)".
            try:
                for isc in range(2):
                    time.sleep(1)
                    js = 'window.scrollTo(0,document.body.scrollHeight)'
                    browser.execute_script(js)
            except Exception as e:
                print('window.scrollTo-->', e)
            time.sleep(10)
            # Best-effort click on the third label of an option group; a
            # failure is printed and ignored.
            try:
                xp = '//*[@id="graphic"]/div/div/div[2]/div[2]/div[1]/div[2]/div/div/div[1]/div/label[3]/span'
                browser.find_element_by_xpath(xp).click()
                time.sleep(1)
            except Exception as e:
                print(e)
            time.sleep(2)
            # Run ads: label[1]. ("Do not run ads" is label[2] of the same group.)
            xp = '//*[@id="graphic"]/div/div/div[2]/div[2]/div[2]/div[2]/div[1]/label[1]/span'
            browser.find_element_by_xpath(xp).click()
            time.sleep(1)
            # Publish: div[1]. ("Save as draft" is the sibling div[2].)
            xp = '//*[@id="graphic"]/div/div/div[2]/div[3]/div[2]/div[1]'
            browser.find_element_by_xpath(xp).click()
            # Wait 10-20 seconds after clicking publish, then go home.
            time.sleep(random.randint(10, 20))
            js = 'window.location.href="https://www.toutiao.com/"'
            browser.execute_script(js)
        except Exception as e:
            # Per-URL boundary: report and continue with the next article.
            print(e)
            logging.exception(e)
# Close the browser; a failure to quit is printed and logged with traceback.
try:
    browser.quit()
except Exception as e1:
    print(e1)
    logging.exception(e1)

 

 重复校验的实时性 

 

 

 

posted @ 2018-04-25 11:20  papering  阅读(651)  评论(0编辑  收藏  举报