反抗分析

 

玩淘宝要做访问意图分析,玩今日头条要做访问路径、意图的反抗分析:在生态里边,没有上下班的概念,这才是all in

 

 

 

from selenium import webdriver
from  time import sleep
import time
from selenium.webdriver.common.keys import Keys
import os

import requests
import time
import threading
import logging
import random

# Start-up timestamp in the form YYYYmmdd_HHMMSS.
start_time = time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time()))
os_sep = os.sep
# Directory that contains this script, and the script's bare file name.
this_file_abspath, this_file_name = os.path.split(os.path.abspath(__file__))
# The log file is named after the script and opened relative to the working directory.
logf = this_file_name + '.log'
try:
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s[thread:%(thread)d][process:%(process)d]',
                        datefmt='%a, %d %b %Y %H:%M:%S',
                        filename=logf,
                        filemode='a')
except Exception as e:
    s = '%s%s%s' % ('logging.basicConfig EXCEPTION ', time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), e)
    # os._exit() terminates without flushing buffers, so the file must be
    # closed (leaving the with-block) and stdout flushed *before* exiting,
    # otherwise the diagnostic message can be lost.
    with open(logf, 'a') as fo:
        fo.write(s)
    print(s, flush=True)
    os._exit(4002)

logging.info('START')

# URL of a logo PNG served from a Toutiao static host.
img_url = 'https://s3.pstatp.com/toutiao/static/img/logo.201f80d.png'
# Absolute Windows directory for downloaded images; it ends with a path
# separator so that a file name can be appended by plain concatenation.
img_dir = 'C:\\Users\\sas\\PycharmProjects\\py_win_to_unix\\crontab_chk_url\\personas\\trunk\\plugins\\spider\\dl_img_tmp\\'


def spider_webimg_dl_return_local_img_path(img_dir, img_url, local_default='default.DONOT_REMOVE.png', timeout=30):
    """Download ``img_url`` into ``img_dir`` and return the local file path.

    The file name is built from a timestamp, the current thread id and the
    URL with ``/ : ? = &`` replaced by marker tokens, plus a ``.png`` suffix.

    Args:
        img_dir: Target directory as a string that already ends with a path
            separator (the file name is appended by concatenation).
        img_url: URL of the image to fetch.
        local_default: File name inside ``img_dir`` returned when the
            download cannot be completed.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        Path of the freshly written file, or ``img_dir + local_default`` when
        the request fails, returns an HTTP error status, yields an empty
        body, or the file cannot be written.
    """
    fallback_path = '%s%s' % (img_dir, local_default)
    try:
        response = requests.get(img_url, timeout=timeout)
        # Do not store an HTTP error page under a .png name.
        response.raise_for_status()
        payload = response.content
        if not payload:
            return fallback_path
        url_token = img_url
        for char, marker in (('/', '_xl_'), (':', '_fxl_'), ('?', '_fxlquestion_'),
                             ('=', '_fxlequal_'), ('&', '_fxland_')):
            url_token = url_token.replace(char, marker)
        target_path = '%s%s%s%s%s' % (
            img_dir, time.strftime('%Y%m%d%H%M%S', time.localtime(time.time())),
            str(threading.get_ident()), url_token, '.png')
        with open(target_path, 'wb') as f:
            f.write(payload)
    except Exception as e:
        # Best-effort download: report the problem and hand back the default
        # image instead of a path that was never written.
        print(e)
        logging.exception(e)
        return fallback_path
    return target_path


import pymysql

# MySQL connection settings: host, port, user, password and schema name.
# Each value may be overridden through an environment variable so that
# credentials do not have to live in the source; the literals below are the
# original hard-coded values and remain the defaults.
h = os.environ.get('JOKE_DB_HOST', '192.168.22.21')
pt = int(os.environ.get('JOKE_DB_PORT', '3306'))
u = os.environ.get('JOKE_DB_USER', 'root')
p = os.environ.get('JOKE_DB_PASSWORD', 'mp')
db = os.environ.get('JOKE_DB_NAME', 'tab_media_joke')


def mysql_fetch(sql, res_type='tuple', args=None):
    """Execute a query and return every row of its result set.

    Args:
        sql: SQL statement to execute.
        res_type: ``'dic'`` to get each row as a dict (``DictCursor``); any
            other value uses the connection's default cursor.
        args: Optional parameters bound by the driver to ``%s`` placeholders
            in ``sql``. Leave as ``None`` to run ``sql`` verbatim.

    Returns:
        The rows returned by ``cursor.fetchall()``, or ``()`` when the
        connection cannot be established.

    Raises:
        Any exception raised while executing the statement; the cursor and
        connection are closed before it propagates.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    try:
        if res_type == 'dic':
            cursor = conn.cursor(pymysql.cursors.DictCursor)
        else:
            cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
            # Read the rows while the cursor and connection are still open.
            rows = cursor.fetchall()
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
    return rows


def mysql_write(sql, args=None):
    """Execute a data-modifying statement and commit it.

    Args:
        sql: SQL statement to execute.
        args: Optional parameters bound by the driver to ``%s`` placeholders
            in ``sql``. Leave as ``None`` to run ``sql`` verbatim.

    Returns:
        ``0`` after a successful commit, ``1`` when the connection cannot be
        established.

    Raises:
        Any exception raised while executing or committing; the cursor and
        connection are closed before it propagates.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return 1
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
            conn.commit()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the statement failed.
        conn.close()
    return 0


import random

def _quit_browser(browser):
    """Close a Selenium browser; a failure is printed and logged, never raised."""
    try:
        browser.quit()
    except Exception as e1:
        print(e1)
        logging.exception(e1)


def _run_account_session(browser, myid, mypwd, toutiao_uid, articles, ac_type='qq'):
    """Log one account in, submit a generated question and record its URL.

    Args:
        browser: A started Selenium WebDriver instance.
        myid: Login name typed into the form field with id ``u``.
        mypwd: Password typed into the form field with id ``p``.
        toutiao_uid: Account uid, used to open the account's question list
            and stored with the result row.
        articles: Non-empty sequence of row dicts; only the ``id`` of the
            first row is used (stored as ``article_id``).
        ac_type: Login flavour. Only ``'qq'`` is implemented; ``'mail_qq'``
            returns without doing anything.

    NOTE(review): the ``find_element_by_*`` helpers used here are Selenium 3
    API and were removed in Selenium 4.3; confirm the installed selenium
    version before upgrading.
    """
    f_url_l = ['https://www.toutiao.com/a6514526304476332552/', 'https://www.toutiao.com/a6514661446876398088/',
               'https://www.toutiao.com/a6514778729951003150/',
               'https://www.toutiao.com/a6514216125151052291/', 'https://www.toutiao.com/a6512315164463727111/',
               'https://www.toutiao.com/a6513334304318161411/']
    # Open one of the article pages first, chosen by the current second.
    browser.get(f_url_l[int(time.time()) % len(f_url_l)])
    time.sleep(random.randint(10, 20))

    js = 'window.location.href="https://sso.toutiao.com/login/?service=https://mp.toutiao.com/sso_confirm/?redirect_url=/";'
    browser.execute_script(js)
    time.sleep(random.randint(10, 20))
    # The original script issues the same navigation a second time; kept
    # unchanged. NOTE(review): possibly a leftover, confirm before removing.
    browser.execute_script(js)

    if ac_type == 'qq':
        xp = '/html/body/div/div/div[2]/div/div/div/ul/li[3]'
        browser.find_element_by_xpath(xp).click()
        time.sleep(10)
        # Pass the credentials as script arguments instead of splicing them
        # into the JavaScript text, so quotes or backslashes in them cannot
        # break the statement.
        browser.execute_script('document.getElementById("u").value=arguments[0]', myid)
        browser.execute_script('document.getElementById("p").value=arguments[0]', mypwd)
        time.sleep(random.randint(5, 15))
        browser.find_element_by_xpath('//*[@id="go"]').click()
        time.sleep(random.randint(10, 20))
    elif ac_type == 'mail_qq':
        return

    time.sleep(5)

    browser.refresh()
    browser.execute_script('window.location.href="https://www.toutiao.com/";')
    time.sleep(6)

    browser.execute_script('window.location.href="https://mp.toutiao.com/profile_v3/graphic/publish";')
    time.sleep(6)

    browser.execute_script('document.getElementsByClassName("ask")[0].click();')
    time.sleep(12)

    time.sleep(random.randint(10, 20))
    # A real keyboard event is required on this input (anti-crawler check),
    # so keep a handle on it for send_keys below.
    tmp_target = browser.find_element_by_class_name('input-box').find_element_by_tag_name('input')

    dbid = articles[0]['id']

    tmp_l = ['口红', '指甲油', '护发素', '沐浴露', '洗手液', '洗发水', '牙膏']
    tmp_l_1 = ['老人', '小孩', '白领', '前台妹子', '行政妹子', '大学生', '高中生']
    tmp_l_2 = ['类型', '特质', '种类', '价位', '原材料', '主要成分', '价格']

    # The question text is assembled from random picks out of the lists above.
    s = '{}{}{}{}{}{}{}'.format(str(random.randint(1, 12)), '月份,', random.choice(tmp_l_1), '适合使用什么',
                                random.choice(tmp_l_2), '', random.choice(tmp_l))
    js = 'document.getElementsByClassName("input-box")[0].childNodes[0].value="{}";'.format(s)
    browser.execute_script(js)
    time.sleep(12)
    tmp_target.send_keys(Keys.SPACE)

    browser.execute_script('document.getElementsByClassName("step-btn next")[0].click();')
    browser.execute_script('document.getElementsByClassName("step-btn submit")[0].click();')
    time.sleep(12)

    # Open the account's question list and take the first question link as
    # the URL of what was just submitted.
    js = 'window.location.href="https://www.wukong.com/user/?uid={}&type=1";'.format(toutiao_uid)
    browser.execute_script(js)
    time.sleep(12)
    res_url = browser.find_element_by_class_name('question-title').find_elements_by_tag_name('a')[
        0].get_attribute('href')

    # NOTE(review): res_url comes from the page and is spliced into the SQL
    # text; a double quote in it would break the statement.
    sql = 'INSERT INTO  joke_tab_joke_toutiaouser_action_wukong_question_action (article_id,article_url,time_script,toutiao_uid) VALUE("%s","%s","%s","%s");' % (
        dbid, res_url, int(time.time()), toutiao_uid)
    mysql_write(sql)
    print(sql)
    time.sleep(random.randint(20, 30))
    browser.execute_script('window.location.href="https://www.wukong.com/"')


while True:
    logging.info('LOOP----')
    # Currently pinned to a single account id.
    sql = 'SELECT username,password,toutiaoid  FROM joke_tab_joke_namepwd WHERE status=1 AND category=1 AND id=7856582 AND NOT  (toutiaoid IS NULL OR toutiaoid="" )'
    res = mysql_fetch(sql)
    ac_l = [{'u': i[0], 'p': i[1], 'toutiao_uid': i[2]} for i in res]
    for ac in ac_l:
        myid, mypwd, toutiao_uid = ac['u'], ac['p'], ac['toutiao_uid']
        # Publishing precondition: publish rows that list this uid and whose
        # time_effective is not in the future.
        sql = "SELECT * FROM joke_joke_article_publish  WHERE  INSTR(CONCAT(',',id_toutiao_uid_list,','),CONCAT(',','{}',',')) AND time_effective<={}  ORDER BY id DESC; ".format(
            toutiao_uid, int(time.time()))
        print(sql)
        logging.info(sql)
        res_content = mysql_fetch(sql, 'dic')
        if not res_content:
            continue
        # presumably each id_article_list is a comma-separated id string; verify against the table
        id_article_list = [i['id_article_list'] for i in res_content]

        # NOTE(review): this filter reads joke_joke_article_publish_result,
        # while the session writes its result row to a different table;
        # confirm that already-handled articles are really excluded.
        sql = 'SELECT * FROM joke_joke_article WHERE id IN ({}) AND id  NOT IN (SELECT article_id FROM  joke_joke_article_publish_result WHERE 1 AND toutiao_uid="{}" ) LIMIT 2; '.format(
            ','.join(id_article_list), toutiao_uid)
        logging.info(sql)
        res_content = mysql_fetch(sql, 'dic')
        if not res_content:
            continue

        browser = webdriver.Chrome()
        try:
            _run_account_session(browser, myid, mypwd, toutiao_uid, res_content)
        finally:
            # Close every browser that was opened, including when the session
            # raises, so Chrome processes do not pile up across accounts.
            _quit_browser(browser)

    # Pause between polling rounds; without it the loop re-queries the
    # database continuously whenever there is nothing to publish.
    time.sleep(random.randint(120, 300))

 

 

 

 

 

 
 
 
posted @ 2018-04-16 11:33  papering  阅读(304)  评论(0)  编辑  收藏  举报