获取已发布头条的 URL,并避免点击后在新页面中打开:
https://www.toutiao.com/
document.getElementsByClassName("ugc-mode-content")[0].getElementsByTagName("a")[0].target='_self'
淘宝广告位置
写入应用
让代码在短期内产生价值
from selenium import webdriver
import os
import random
import time
import pymysql
from bs4 import BeautifulSoup
import requests
import threading
from selenium.webdriver.common.keys import Keys

# MySQL connection settings shared by mysql_fetch() and mysql_write().
h, pt, u, p, db = 'localhost', 3306, 'root', '', 'qqzone'


def mysql_fetch(sql, res_type='tuple', args=None):
    """Run a query and return all rows.

    :param sql: SQL statement; may contain ``%s`` placeholders when *args* is given.
    :param res_type: ``'dic'`` returns rows as dicts, anything else as tuples.
    :param args: optional parameters passed to ``cursor.execute`` for escaping.
    :return: the fetched rows, or ``()`` when the connection cannot be opened.
    :raises: any error raised while executing the statement (after cleanup).
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8mb4')
    except Exception as e:
        print(e)
        return ()
    try:
        if res_type == 'dic':
            cursor = conn.cursor(pymysql.cursors.DictCursor)
        else:
            cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
            conn.commit()
            return cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the statement fails.
        conn.close()


def mysql_write(sql, args=None):
    """Execute a write statement and commit it.

    :param sql: SQL statement; may contain ``%s`` placeholders when *args* is given.
    :param args: optional parameters passed to ``cursor.execute`` for escaping.
    :return: ``0`` on success, ``1`` when the connection cannot be opened.
    :raises: any error raised while executing the statement (after cleanup).
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8mb4')
    except Exception as e:
        print(e)
        return 1
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
    return 0


# Directory where downloaded media is stored before being uploaded.
# (Previously 'C:/Users/Administrator/Desktop/1/toutiao_team/dl_img/'.)
img_dir = 'D:/pyaction/toutiao_team_win/dl_img/'


def spider_webimg_dl_return_local_img_path(img_dir, img_url, media_type='img',
                                           local_default='default.DONOT_REMOVE.png'):
    """Download *img_url* into *img_dir* and return the local file path.

    :param img_dir: target directory, including the trailing separator.
    :param img_url: URL of the image or video to download.
    :param media_type: ``'img'`` or ``'mp4'``; any other value downloads nothing.
    :param local_default: file name (inside *img_dir*) used as the fallback image.
    :return: path of the written file. For ``'img'`` the default image path is
        returned when the download fails; for ``'mp4'`` an empty string is
        returned on failure. Unknown media types return the default image path.

    The ``time.sleep`` calls are deliberate throttling between requests.
    """
    default_path = '%s%s' % (img_dir, local_default)
    r = default_path
    stamp = time.strftime('%Y%m%d%H%M%S')
    thread_id = str(threading.get_ident())
    if media_type == 'img':
        try:
            req = requests.get(img_url, timeout=60)
            time.sleep(3)
            if req.status_code != 200:
                print('-!=200')
                return default_path
            time.sleep(30)
            print(img_url)
            data = req.content
            if not data:
                return default_path
            # The name is built from digits only (timestamp, thread id, random
            # number), so no URL characters need to be filtered out of it.
            target = '%s%s%s%s%s' % (img_dir, stamp, thread_id, str(random.randrange(1000, 9999)), '.png')
            print(target)
            with open(target, 'wb') as f:
                f.write(data)
            r = target
        except Exception as e:
            # Fall back to the default image instead of a path that was never written.
            r = default_path
            print(e)
    elif media_type == 'mp4':
        try:
            time.sleep(30)
            print(img_url)
            r = '%s%s%s%s%s' % (
                img_dir, stamp, thread_id,
                img_url.split('.mp4?')[0].split('/')[-1].replace('*', '_'), '.mp4')
            print(r)
            req = requests.get(img_url, timeout=60)
            time.sleep(3)
            if req.status_code != 200:
                print('-!=200')
                return ''
            data = req.content
            time.sleep(210)
            if not data:
                return ''
            with open(r, 'wb') as f:
                f.write(data)
        except Exception as e:
            r = ''
            print(e)
    return r


driver = webdriver.Chrome()
myurl = 'https://weibo.com/login.php'
driver.get(myurl)
# Needs spare memory and CPU here so the browser can parse the DOM of this
# JavaScript-heavy page.
time.sleep(10)
driver.refresh()
time.sleep(60)

# Credentials can be overridden through the environment; the defaults keep the
# values this script has always used.
weibo_user = os.environ.get('WEIBO_USER', 'leo201008@sina.cn')
weibo_password = os.environ.get('WEIBO_PASSWORD', 'welcome')
# Values are passed as script arguments so they never have to be escaped into
# the JavaScript source.
js = 'document.getElementById("loginname").value=arguments[0];' \
     'document.getElementsByName("password")[0].value=arguments[1];' \
     'document.getElementsByClassName("W_btn_a btn_32px")[0].click();'
try:
    driver.execute_script(js, weibo_user, weibo_password)
    time.sleep(30)
except Exception as e:
    print(e)
    os._exit(1024)

# The posting-time filter below is switched off (the original guard was the
# always-false ``1 > 13``).
# NOTE(review): the extent of the disabled section is reconstructed - confirm.
apply_time_filter = False
# Closes the upload layer and clicks the publish button.
publish_js = 'document.getElementsByClassName("W_layer_close")[0].click();document.getElementsByClassName("func")[0].childNodes[3].click();'

while True:
    sql = 'SELECT id, words,imgurls,time_site FROM qqzoneshuoshuo WHERE lefttimes_weibo>0 AND INSTR(imgurls,".mp4")=0 AND id IN ( SELECT MAX(id) FROM qqzoneshuoshuo GROUP BY id_site) ORDER BY time_script DESC,id ASC ;'
    res_content = mysql_fetch(sql, 'dic')
    print(res_content)
    if len(res_content) == 0:
        # Nothing to post: wait before polling again instead of spinning on the database.
        time.sleep(60)
        continue
    for i in res_content:
        # Columns: id, words, imgurls, time_site
        dbid, content, img_list, time_site = i['id'], i['words'] or '', i['imgurls'] or '', i['time_site']
        if apply_time_filter:
            if '天' in time_site or '月' in time_site:
                continue
            lh = int(time.strftime("%H", time.localtime()))
            if lh - int(time_site.split(':')[0]) >= 24:
                continue
            if '早安' in content and lh >= 11:
                continue
            elif '晚安' in content and lh <= 20:
                continue
        js = 'document.getElementsByClassName("gn_set_list")[2].childNodes[0].click();'
        try:
            driver.execute_script(js)
        except Exception as e:
            # Printed label means "login captcha".
            print('登录验证码', e)
            try:
                driver.quit()
            except Exception as quit_error:
                print(quit_error)
            # NOTE(review): this only leaves the inner loop; the outer loop keeps
            # running against a closed browser - confirm the intended handling.
            break
        time.sleep(10)
        # Wait above is needed because another element <div> obscures the target.
        content = content.split('展开全文')[0].split('上传')[0].split('浏览')[0]
        content = content.replace('"', ' ').replace("'", ' ').replace('\n', ' ')
        filter_l = ['密龄素材空间', '评论']
        for fi in filter_l:
            content = content.replace(fi, ' ')
        print(content)
        ad_url = 'https://item.taobao.com/item.htm?id=565875313425'
        post_text = '{}南京同仁堂密龄白藜芦醇-燕窝美妆-DOAEZ朵韵诗-阿静@ {}!!'.format(content, ad_url)
        # The text is handed over as a script argument so backslashes or line
        # breaks in it cannot break the JavaScript statement.
        js = 'document.getElementsByTagName("textarea")[0].value=arguments[0];'
        print(js, post_text)
        try:
            # The page only reacts after real keyboard events.
            driver.find_element_by_tag_name("textarea").send_keys(Keys.SPACE)
            time.sleep(2)
            driver.find_element_by_tag_name("textarea").send_keys(Keys.BACK_SPACE)
            driver.execute_script(js, post_text)
            time.sleep(2)
        except Exception as e:
            print(e)
            continue
        # Fill in the text first: the DOM is built dynamically.
        # NOTE(review): the loop is assumed to cover only the icon click - confirm.
        for iimg in range(2):
            js = 'document.getElementsByClassName("ficon_image")[0].click();'
            driver.execute_script(js)
            time.sleep(2)
        upload = driver.find_element_by_id('pic_upload').find_element_by_tag_name('input')
        img_url_list = img_list.split(',')
        try:
            # MAX=8
            for img_url in img_url_list:
                if '.gif' in img_url or 'qzonestyle' in img_url:
                    continue
                local_img_path = spider_webimg_dl_return_local_img_path(
                    img_dir, img_url, local_default='default.DONOT_REMOVE.png')
                print(local_img_path)
                time.sleep(2)
                upload.send_keys(local_img_path)
        except Exception as e:
            print(e)
            try:
                driver.execute_script(publish_js)
                time.sleep(10)
                driver.refresh()
            except Exception as recover_error:
                print(recover_error)
            continue
        time.sleep(5)
        driver.execute_script(publish_js)
        time.sleep(10)
        sql = 'UPDATE qqzoneshuoshuo SET lefttimes_weibo=lefttimes_weibo-1 WHERE id=%s'
        print(sql, dbid)
        try:
            mysql_write(sql, (dbid,))
        except Exception as e:
            print(e)
        driver.refresh()
        time.sleep(random.randint(60 * 2, 60 * 5))
    # Refresh roughly once a minute for about 15 minutes to keep the page
    # alive before querying for new data again.
    for si in range(15):
        try:
            driver.refresh()
            time.sleep(60)
            time.sleep(random.randint(0, 10))
            print(si)
        except Exception as e:
            print(145, e)

# Imports and logging setup of the Toutiao script that follows.
from selenium import webdriver
from time import sleep
import time
from selenium.webdriver.common.keys import Keys
import os
import requests
import time
import threading
import logging

start_time = time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time()))
os_sep = os.sep
this_file_abspath, this_file_name = os.path.dirname(os.path.abspath(__file__)), os.path.abspath(__file__).split(os_sep)[-1]
# The log file is named after this script.
logf = this_file_name + '.log'
try:
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s[thread:%(thread)d][process:%(process)d]',
                        datefmt='%a, %d %b %Y %H:%M:%S',
                        filename=logf,
                        filemode='a')
except Exception as e:
    # Logging is unavailable, so record the failure directly and stop.
    s = '%s%s%s' % ('logging.basicConfig EXCEPTION ', time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), e)
    with open(logf, 'a') as fo:
        fo.write(s)
    print(s)
    os._exit(4002)
import pymysql
import random

logging.info('START')

img_url = 'https://s3.pstatp.com/toutiao/static/img/logo.201f80d.png'
# Directory where downloaded images are stored before being uploaded.
# (Previously 'C:/Users/Administrator/Desktop/1/toutiao_team/dl_img/'.)
img_dir = 'D:/pyaction/toutiao_team_win/dl_img/'


def spider_webimg_dl_return_local_img_path(img_dir, img_url, local_default='default.DONOT_REMOVE.png'):
    """Download *img_url* into *img_dir* and return the local file path.

    :param img_dir: target directory, including the trailing separator.
    :param img_url: URL of the image to download.
    :param local_default: file name (inside *img_dir*) used as the fallback image.
    :return: path of the written file, or the default image path when the
        request fails, answers with a non-200 status or returns no data.

    The 30 second sleep is deliberate throttling between requests.
    """
    r = '%s%s' % (img_dir, local_default)
    try:
        time.sleep(30)
        resp = requests.get(img_url, timeout=60)
        if resp.status_code != 200:
            # Do not store an error page as if it were an image.
            return r
        data = resp.content
        if data:
            target = '%s%s%s%s%s' % (
                img_dir, time.strftime('%Y%m%d%H%M%S'), str(threading.get_ident()), 'TOUTIAO0412', '.png')
            with open(target, 'wb') as f:
                f.write(data)
            r = target
    except Exception as e:
        print(e)
    return r


# MySQL connection settings shared by mysql_fetch() and mysql_write().
h, pt, u, p, db = 'localhost', 3306, 'root', '', 'qqzone'


def mysql_fetch(sql, res_type='tuple', args=None):
    """Run a query and return all rows.

    :param sql: SQL statement; may contain ``%s`` placeholders when *args* is given.
    :param res_type: ``'dic'`` returns rows as dicts, anything else as tuples.
    :param args: optional parameters passed to ``cursor.execute`` for escaping.
    :return: the fetched rows, or ``()`` when the connection cannot be opened.
    :raises: any error raised while executing the statement (after cleanup).
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    try:
        if res_type == 'dic':
            cursor = conn.cursor(pymysql.cursors.DictCursor)
        else:
            cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
            conn.commit()
            return cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the statement fails.
        conn.close()


def mysql_write(sql, args=None):
    """Execute a write statement and commit it.

    :param sql: SQL statement; may contain ``%s`` placeholders when *args* is given.
    :param args: optional parameters passed to ``cursor.execute`` for escaping.
    :return: ``0`` on success, ``1`` when the connection cannot be opened.
    :raises: any error raised while executing the statement (after cleanup).
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return 1
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
    return 0


def _set_input_value(browser, element_id, value):
    """Set the value of the input with *element_id* without building JavaScript from data."""
    browser.execute_script('document.getElementById(arguments[0]).value=arguments[1];', element_id, value)


# Maximum length of one post (the original note reads "2000 toutiao").
MAX_POST_LENGTH = 2000
# The posting-time filter is switched off (the original guard was the
# always-false ``1 > 13``).
# NOTE(review): the extent of the disabled section is reconstructed - confirm.
apply_time_filter = False

while True:
    logging.info('LOOP----')
    # Accounts used to be read from xmt_star_helper_namepwd; a single account is
    # used now. It can be overridden through the environment, the defaults keep
    # the values this script has always used.
    ac_l = [{'u': os.environ.get('WEIBO_USER', 'leo201008@sina.cn'),
             'p': os.environ.get('WEIBO_PASSWORD', 'welcome')}]
    for ac in ac_l:
        myid, mypwd = ac['u'], ac['p']
        sql = 'SELECT id, words,imgurls,time_site FROM qqzoneshuoshuo WHERE lefttimes_toutiao>0 AND INSTR(imgurls,".mp4")=0 AND 1 ORDER BY time_script DESC,id ASC ;'
        logging.info(sql)
        res_content = mysql_fetch(sql, 'dic')
        if len(res_content) == 0:
            # Nothing to post: wait before polling again instead of spinning on the database.
            time.sleep(60)
            continue
        # Example: https://www.wukong.com/question/6498933499305656590/
        fake_question_l = ['6481471694505509134', '6514816760909152519', '6498933499305656590']
        browser = webdriver.Chrome()
        try:
            # Visit a question page and the hot-news channel before opening the login page.
            fake_question_url = 'https://www.wukong.com/question/123/'.replace('123', random.choice(fake_question_l))
            browser.get(fake_question_url)
            time.sleep(random.randrange(2, 5))
            js = 'window.location.href="https://www.toutiao.com/ch/news_hot/"'
            browser.execute_script(js)
            time.sleep(random.randrange(2, 5))
            js = 'window.location.href="https://sso.toutiao.com/login/"'
            browser.execute_script(js)
            time.sleep(random.randrange(2, 5))

            # Login method; the alternatives are 'qq' and 'mail_qq'.
            ac_type = 'sinawb'
            if ac_type == 'sinawb':
                xp_newpage = '/html/body/div/div/div[2]/div/div/div/ul/li[2]'
                browser.find_element_by_xpath(xp_newpage).click()
                _set_input_value(browser, 'userId', myid)
                _set_input_value(browser, 'passwd', mypwd)
                xp_newpage = '//*[@id="outer"]/div/div[2]/form/div/div[2]/div/p/a[1]'
                browser.find_element_by_xpath(xp_newpage).click()
                time.sleep(random.randrange(60, 90))
            elif ac_type == 'qq':
                xp_newpage = '/html/body/div/div/div[2]/div/div/div/ul/li[3]'
                browser.find_element_by_xpath(xp_newpage).click()
                # Original note: QQ account registered the same day.
                if myid == '2766907843':
                    myid, mypwd = '2564649479', 'wanfan123qq'
                _set_input_value(browser, 'u', myid)
                _set_input_value(browser, 'p', mypwd)
                time.sleep(random.randint(10, 20))
                xp_newpage = '//*[@id="go"]'
                browser.find_element_by_xpath(xp_newpage).click()
                time.sleep(random.randint(20, 30))
            elif ac_type == 'mail_qq':
                xp_newpage = '/html/body/div/div/div[2]/div/div/div/ul/li[1]'
                browser.find_element_by_xpath(xp_newpage).click()
                myid, mypwd = 'wanf', '123332018'
                _set_input_value(browser, 'account', myid)
                sleep(15)
                _set_input_value(browser, 'password', mypwd)
                sleep(23)
                # NOTE(review): these retries are assumed to belong to this
                # branch only - confirm.
                for attempt in range(3):
                    try:
                        xp_newpage = '/html/body/div/div/div[2]/div/div/div/form/input'
                        browser.find_element_by_xpath(xp_newpage).click()
                    except Exception as e:
                        print(e)

            time.sleep(random.randrange(1, 3))
            time.sleep(random.randrange(1, 3))
            js = 'window.location.href="https://mp.toutiao.com/profile_v3/weitoutiao"'
            browser.execute_script(js)
            time.sleep(random.randrange(5, 8))

            # The rows selected above (lefttimes_toutiao>0) are posted; querying
            # again without that condition would ignore the remaining-posts
            # counter that is decremented below.
            for i in res_content:
                dbid, content, img_list, time_site = i['id'], i['words'] or '', i['imgurls'] or '', i['time_site']
                if apply_time_filter:
                    if '天' in time_site or '月' in time_site:
                        continue
                    lh = int(time.strftime("%H", time.localtime()))
                    if lh - int(time_site.split(':')[0]) >= 24:
                        continue
                    if '早安' in content and lh >= 11:
                        continue
                    elif '晚安' in content and lh <= 20:
                        continue
                xp_textarea = '//*[@id="weitoutiao"]/div/div/div[1]/div[1]/textarea'
                # Real keyboard events are required (anti-crawler measure).
                browser.find_element_by_xpath(xp_textarea).send_keys(Keys.SPACE)
                browser.find_element_by_xpath(xp_textarea).click()
                # Line breaks are flattened and ASCII quotes replaced with typographic ones.
                mytxt = content.replace('\n', ' ').replace('\r', ' ').replace('\\br', ' ').replace('"', '“').replace("'", '‘')
                # Cut off the trailing page text captured from the source site.
                mytxt = mytxt.split('展开全文')[0].split('上传')[0].split('浏览')[0]
                filter_l = ['密龄素材空间', '评论']
                for fi in filter_l:
                    mytxt = mytxt.replace(fi, ' ')
                ad_url = 'https://item.taobao.com/item.htm?id=565875313425'
                suffix = '{} {}'.format('DOAEZ朵韵诗燕窝美妆 南京同仁堂密龄白藜芦醇 阿静 ', ad_url)
                # Shorten the body, not the suffix, so the link is never cut off.
                mytxt = mytxt[0:max(0, MAX_POST_LENGTH - len(suffix))] + suffix
                # Works around a submit problem.
                browser.find_element_by_xpath(xp_textarea).send_keys(Keys.UP)
                # Real keyboard events are required (anti-crawler measure).
                browser.find_element_by_xpath(xp_textarea).send_keys(mytxt)
                time.sleep(random.randint(2, 5))

                xp_newpage = '//*[@id="weitoutiao"]/div/div/div[1]/div[2]/div/div[1]/div/i'
                browser.find_element_by_xpath(xp_newpage).click()
                xp_newpage = '//*[@id="weitoutiao"]/div/div/div[1]/div[2]/div/div[2]/div/div[2]/div/input'
                upload = browser.find_element_by_xpath(xp_newpage)
                try:
                    for post_img_url in img_list.split(','):
                        if '.gif' in post_img_url or 'qzonestyle' in post_img_url:
                            continue
                        local_img_path = spider_webimg_dl_return_local_img_path(
                            img_dir, post_img_url, local_default='default.DONOT_REMOVE.png')
                        print(local_img_path)
                        time.sleep(random.randint(2, 4))
                        upload.send_keys(local_img_path)
                        time.sleep(random.randint(3, 7))
                except Exception as ee:
                    logging.exception(ee)
                    # Fall back to the default image; a second failure must not stop the script.
                    try:
                        sleep(2)
                        upload.send_keys('%s%s' % (img_dir, 'default.DONOT_REMOVE.png'))
                    except Exception as fallback_error:
                        logging.exception(fallback_error)

                xp_newpage = '//*[@id="weitoutiao"]/div/div/div[1]/button'
                try:
                    browser.find_element_by_xpath(xp_newpage).click()
                except Exception as e:
                    print(e)
                sql = 'UPDATE qqzoneshuoshuo SET lefttimes_toutiao=lefttimes_toutiao-1 WHERE id=%s'
                try:
                    mysql_write(sql, (dbid,))
                    print(sql, dbid)
                except Exception as e:
                    print(e)
                time.sleep(random.randint(120, 300))
        finally:
            # A new browser is started on every pass, so always close this one.
            try:
                browser.quit()
            except Exception as e:
                print(e)