spider_action
spider: from a mobile Sogou search page to mobile Baidu Map result pages
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.firefox.options import Options
import time
from time import sleep
import math
import random
import sys

# Separators that mark the start of a suffix/decoration in a place name.
tag_jmtool_list = ['(', '(', '-']

# Pool of mobile User-Agent strings; one is picked at random per session.
ua_list = []
with open('mobile_ua.txt', 'r', encoding='utf-8') as uafile:
    for i in uafile:
        if i.find('Mozilla') > -1:
            ua_list.append(i.replace('\n', '').strip())
ua_list_len_ = len(ua_list) - 1


def extract_name(name_):
    """Return name_ truncated at the first occurrence of any separator
    in tag_jmtool_list (keeps only the base place name)."""
    for i in tag_jmtool_list:
        name_ = name_.split(i)[0]
    return name_


target_type_list = ['住宅小区', '写字楼']
target_type_list = ['住宅小区']

# target_dic[city][district][type_][name_reduction] ->
#   {'name_reduction_list': [raw names], 'history_list': [raw CSV rows]}
target_dic = {}
with open('JMTool0819am/任务JMTool.csv', 'r', encoding='utf-8') as csvfile:
    for i in csvfile:
        l = i.replace(' ', '').replace('\n', '').split('";"')
        if l[0].replace('"', '') in target_type_list:
            type_, city, district, addr, name_ = l
            type_, name_ = type_.replace('"', ''), name_.replace('"', '')
            name_reduction = extract_name(name_)
            if city not in target_dic:
                target_dic[city] = {}
            if district not in target_dic[city]:
                target_dic[city][district] = {}
            if type_ not in target_dic[city][district]:
                target_dic[city][district][type_] = {}
            # BUG FIX: the original tested membership on the district-level
            # dict (`name_reduction not in target_dic[city][district]`), so
            # the per-name sub-dict was only initialised when the name
            # happened to collide with a type key; test the type_ level dict.
            if name_reduction not in target_dic[city][district][type_]:
                target_dic[city][district][type_][name_reduction] = {}
                target_dic[city][district][type_][name_reduction]['name_reduction_list'] = []
                target_dic[city][district][type_][name_reduction]['history_list'] = []
            target_dic[city][district][type_][name_reduction]['name_reduction_list'].append(name_)
            target_dic[city][district][type_][name_reduction]['history_list'].append(l)


def write_res_html(browser, dir_='baidu_map_html/'):
    """Dump the current page source to dir_, prefixed with the query and the
    current URL inside an HTML comment so the origin can be recovered later.

    NOTE: reads the module-global `input_` set by the driver loop below.
    """
    current_url_ = '%s%s%s%s' % ('<!--', input_, browser.current_url, '-->')
    page_source = '%s%s' % (current_url_, browser.page_source)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s%s' % (dir_, input_, localtime_, '.html')
    # BUG FIX: original ended with `fo.closed` (an attribute read), so the
    # handle was never closed; a with-block guarantees the close.
    with open(file_name, 'w', encoding='utf-8') as fo:
        fo.write(page_source)


def gen_random_letter():
    """Return one random lowercase ASCII letter."""
    return chr(random.randint(97, 122))


def gen_random_num():
    """Return one random decimal digit (0-9)."""
    # BUG FIX: randint(0, 10) is inclusive and could yield the two-character
    # string "10", corrupting the fixed-width pid built in gen_sougo_pid().
    return random.randint(0, 9)


def gen_sougo_pid():
    """Build a 16-character pseudo pid: letters at positions 1,3,4,15,
    digits everywhere else."""
    res_ = ''
    for i in range(1, 17, 1):
        if i in [1, 3, 4, 15]:
            res_ = '%s%s' % (res_, gen_random_letter())
        else:
            res_ = '%s%s' % (res_, gen_random_num())
    return res_


def close_alert(browser, attitude='accept'):
    """Accept or dismiss a JS alert if one is present; swallow the error
    (with a line-numbered log) when no alert exists."""
    try:
        sleep(2)
        # BUG FIX: switch_to.alert is a property, not a method; calling it
        # raised TypeError so the alert was never actually handled.
        al = browser.switch_to.alert
        sleep(1)
        if attitude == 'accept':
            al.accept()
        elif attitude == 'dismiss':
            al.dismiss()
        print(sys._getframe().f_lineno, 'alert-closed-ok')
    except Exception as e:
        # BUG FIX: the original printed the Exception *class*; print the
        # caught instance instead.
        print(sys._getframe().f_lineno, e, 'no-alert')


# input_ = '深圳市南山区荟芳园'
def mobile_mobile_pages_html(input_):
    """Search `input_` on mobile Sogou, jump to the Baidu Map vertical, and
    dump every result page to baidu_map_html/ via write_res_html()."""
    # mobile_emulation = {
    #     "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
    #     "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
    ua_list_index = random.randint(0, ua_list_len_)
    mobile_emulation = {
        "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0}}
    mobile_emulation['userAgent'] = ua_list[ua_list_index]
    chrome_options = Options()
    chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
    browser = webdriver.Chrome(chrome_options=chrome_options)
    url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-%s-0007&keyword=百度地图' % (gen_sougo_pid())
    print(url_seed)
    browser.get(url_seed)
    # Fill the search box via JS, then follow the first vertical result.
    js = '%s%s%s' % ('document.getElementsByClassName("input-default js_input")[0].value="', input_, '"')
    browser.execute_script(js)
    xp_newpage = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
    browser.find_element_by_xpath(xp_newpage).click()
    sleep(2)
    close_alert(browser)
    try:
        # "Show all N results" widget; absent when there is no place list.
        xp = '//*[@id="place-widget-placenewlist-showall"]/span[1]'
        sleep(2)
        close_alert(browser)
        browser.find_element_by_xpath(xp)
    except Exception as e:
        print(sys._getframe().f_lineno, e)
        return
    close_alert(browser)
    if browser.find_element_by_xpath(xp).text.find('全部') == -1:
        return
    # Parse the total result count out of "全部N条".
    res_num = browser.find_element_by_xpath(xp).text.split('全部')[1].split('条')[0]
    res_num = int(res_num)
    page_num = 10  # results per page
    loop_breaker = math.ceil(res_num / page_num)
    close_alert(browser)
    if res_num <= page_num:
        # Single page: dump it and stop.
        write_res_html(browser)
        browser.quit()
        return
    close_alert(browser)
    xp = '//*[@id="place-widget-placenewlist-showall"]'
    browser.find_element_by_xpath(xp).click()
    write_res_html(browser)
    close_alert(browser)
    js = "window.scrollTo(0,document.body.scrollHeight)"
    browser.execute_script(js)
    sleep(1)
    try:
        xp_newpage = '//*[@id="fis_elm__7"]/div/div[2]/span[2]'
        browser.find_element_by_xpath(xp_newpage).click()
        sleep(1)
    except Exception as e:
        print(sys._getframe().f_lineno, e)
        write_res_html(browser)
        browser.quit()
        return
    # Page through the remaining result pages, dumping each one.
    for i in range(1, loop_breaker, 1):
        sleep(1)
        try:
            xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
            sleep(3)
            browser.find_element_by_xpath(xp).click()
        except Exception as e:
            print(sys._getframe().f_lineno, e)
            sleep(10)
            break
        try:
            js = "window.scrollTo(0,document.body.scrollHeight)"
            browser.execute_script(js)
            sleep(1)
        except Exception as e:
            print(sys._getframe().f_lineno, e)
            sleep(10)
        try:
            xp_newpage = '//*[@id="fis_elm_pager__qk_7"]/div/div/span[2]'
            sleep(1)
            print(input_, i)
            browser.find_element_by_xpath(xp_newpage).click()
            write_res_html(browser)
        except Exception as e:
            print(sys._getframe().f_lineno, e)
            sleep(10)
    sleep(2)
    browser.quit()


# Drive one crawl per (city, district, type, reduced name) combination.
for city in target_dic:
    for district in target_dic[city]:
        for type_ in target_dic[city][district]:
            for name_reduction in target_dic[city][district][type_]:
                input_ = '%s%s%s' % (city, district, name_reduction)
                mobile_mobile_pages_html(input_)
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from time import sleep
import math

# Earlier flat-script version of the mobile Sogou -> Baidu Map crawl: open a
# mobile-emulated Chrome, search one fixed place name, then page through the
# place-list results, dumping each page to baidu_map_html/.

url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-123asd-0007&keyword=百度地图'
mobile_emulation = {
    "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
    "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
chrome_options = Options()
chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
browser = webdriver.Chrome(chrome_options=chrome_options)
browser.get(url_seed)
input_ = '深圳市南山区荟芳园'
# Fill the search box via JS, then follow the first vertical result.
js = '%s%s%s' % ('document.getElementsByClassName("input-default js_input")[0].value="', input_, '"')
browser.execute_script(js)
xp_newpage = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
browser.find_element_by_xpath(xp_newpage).click()
sleep(1)
xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
browser.find_element_by_xpath(xp).click()
xp = '//*[@id="place-widget-placenewlist-showall"]/span[1]'
browser.find_element_by_xpath(xp)
# Parse the total result count out of "全部N条".
res_num = browser.find_element_by_xpath(xp).text.split('全部')[1].split('条')[0]
res_num = int(res_num)
page_num = 10  # results per page
loop_breaker = math.ceil(res_num / page_num)


def write_res_html(browser, dir_='baidu_map_html/'):
    """Dump the current page source to dir_, prefixed with the query and the
    current URL inside an HTML comment so the origin can be recovered later."""
    current_url_ = '%s%s%s%s' % ('<!--', input_, browser.current_url, '-->')
    page_source = '%s%s' % (current_url_, browser.page_source)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s%s' % (dir_, input_, localtime_, '.html')
    # BUG FIX: the original ended with `fo.closed` (an attribute read), so the
    # file handle was never closed; a with-block guarantees the close.
    with open(file_name, 'w', encoding='utf-8') as fo:
        fo.write(page_source)


xp = '//*[@id="place-widget-placenewlist-showall"]'
browser.find_element_by_xpath(xp).click()
write_res_html(browser)
js = "window.scrollTo(0,document.body.scrollHeight)"
browser.execute_script(js)
sleep(1)
xp_newpage = '//*[@id="fis_elm__7"]/div/div[2]/span[2]'
browser.find_element_by_xpath(xp_newpage).click()
sleep(1)
# Page through the remaining result pages, dumping each one.
for i in range(1, loop_breaker, 1):
    sleep(1)
    xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
    browser.find_element_by_xpath(xp).click()
    js = "window.scrollTo(0,document.body.scrollHeight)"
    browser.execute_script(js)
    sleep(1)
    xp_newpage = '//*[@id="fis_elm_pager__qk_7"]/div/div/span[2]'
    browser.find_element_by_xpath(xp_newpage).click()
    write_res_html(browser)
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Minimal prototype: open a mobile-emulated Chrome on a Sogou search page,
# type a place name into the search box, and click through to the Baidu Map
# vertical result.

url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-123asd-0007&keyword=百度地图'

mobile_emulation = {}
mobile_emulation["deviceMetrics"] = {"width": 360, "height": 640, "pixelRatio": 3.0}
mobile_emulation["userAgent"] = "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"

chrome_options = Options()
chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
browser = webdriver.Chrome(chrome_options=chrome_options)
browser.get(url_seed)

# Fill the search input via JS, then follow the first vertical result link.
js = 'document.getElementsByClassName("input-default js_input")[0].value="深圳市南山区海岸城"'
browser.execute_script(js)
xp = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
browser.find_element_by_xpath(xp).click()
ua
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6
Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
Opera/9.25 (Windows NT 5.1; U; en)
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)
Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6
Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0
# URL availability checker: pulls not-yet-expired order URLs from MySQL,
# verifies (via requests, falling back to PhantomJS rendering) that the
# company marker code appears in each page, and records failures in
# test_error. Runs in batches of threads; restarts itself on init failure.
import os, sys
import time
import logging
import requests
import threading
from random import choice
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

"""
全局约定,便于后期做日志分析
os._exit(INT)
4001
4002
4003
4004
"""
# Path bookkeeping: this file's directory/name, project base dir, log dir.
os_sep = os.sep
this_file_abspath, this_file_name = os.path.dirname(os.path.abspath(__file__)), os.path.abspath(__file__).split(os_sep)[
    -1]
base_dir = os.path.dirname(os_sep.join(os.path.abspath(__file__).split(os_sep)[0:-2]))
log_abspath = '%s%s%s' % (base_dir, os_sep, 'log')
"""
日志的记录不能依赖于日志类
"""
# Bootstrap log line written by hand (logging is not configured yet).
now_, e = time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), '启动脚本'
logf, s = '%s%s%s%s' % (log_abspath, os_sep, this_file_name, now_), '%s%s%s%s' % (__file__, now_, os.getcwd(), e)
with open(logf, 'a') as fo:
    fo.write(s)
print(s)
# Import the project DB helper; exit code 4001 when it cannot be imported.
try:
    sys.path.append(base_dir)
    from core.utils import MysqlHelper
except Exception as e:
    s = '%s%s%s' % (
        'from core.utils import MysqlHelper EXCEPTION ', time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())),
        e)
    with open(logf, 'a') as fo:
        fo.write(s)
    print(s)
    os._exit(4001)
# Configure file logging; exit code 4002 on failure.
try:
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s[thread:%(thread)d][process:%(process)d]',
                        datefmt='%a, %d %b %Y %H:%M:%S', filename=logf, filemode='a')
except Exception as e:
    s = '%s%s%s' % ('logging.basicConfig EXCEPTION ', time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), e)
    with open(logf, 'a') as fo:
        fo.write(s)
    print(s)
    os._exit(4002)
# Load the User-Agent pool from ua_list.txt; exit code 4003 on failure.
try:
    fua, lua = '%s%s%s' % (this_file_abspath, os_sep, 'ua_list.txt'), []
    with open(fua, 'r') as fo:
        for i in fo:
            lua.append(i.replace('\n', ''))
except Exception as e:
    s = '%s%s' % ('打开文件 EXCEPTION ua文件路径: ', fua)
    logging.error(s)
    print(s)
    os._exit(4003)
# PhantomJS capabilities: random UA, blank browserName/platform.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = choice(lua)
dcap['browserName'], dcap['platform'] = '', ''


class MyThread(threading.Thread):
    # Thin Thread wrapper that calls func(args) in run().
    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func, self.args, self.name = func, args, name

    def run(self):
        self.func(self.args)


# Wall-clock budget for the whole script: 4 hours from ctrl_start.
ctrl_start, max_script_time = time.time(), 3600 * 4


def ctrl_runtime(exit_type=''):
    # Terminate the process once the runtime budget is exceeded.
    # exit_type selects the termination mechanism: '' -> exit(), 'sys' ->
    # sys.exit(), 'os' -> os._exit(4004) (immediate, no cleanup).
    if time.time() - ctrl_start >= max_script_time:
        s = '%s%s%s%s%s%s%s%s%s' % (
            '程序开始执行时间', ctrl_start, '执行时间阈值', max_script_time, '终止执行', ' exit_type =', exit_type,
            ' threadID ', threading.get_ident())
        logging.info(s)
        if exit_type == '':
            exit(s)
        elif exit_type == 'sys':
            sys.exit(s)
        elif exit_type == 'os':
            # an integer is required
            # Required argument 'status' (pos 1) not found
            os._exit(4004)


# Shared progress counter, incremented in chk_exception_url (global).
url_counter = 0


def main():
    """
    对异常无限重启
    """
    # Fetch already-recorded error ids (to skip) and the candidate URL list;
    # on any DB exception the script re-launches itself and exits 1024.
    try:
        mysql_obj = MysqlHelper()
        q = 'SELECT direct_order_id FROM test_error;'
        tuple_l = mysql_obj.select(q)
        pass_id_l = [i[0] for i in tuple_l]
        pass_id_l = [str(i) for i in pass_id_l]
        pass_id_l_s = ','.join(pass_id_l)
        del mysql_obj, tuple_l
        # Business rule: a currently-active url has exactly one row in test_order.
        #
        """
        后期任务: test_error积累一定数据后对url重新检测
        #3个功能点:当前半个小时、当前未失效的url test_order内url的异常情况(当前的2个功能点)、(后期任务:test_error积累一定数据后对url重新检测)
        q = 'SELECT url,id FROM test_order WHERE unix_timestamp(now()) - create_time<=3600*48 AND id NOT in ( %s ) ORDER BY id DESC ;' % (
            pass_id_l_s)
        q = 'SELECT url,id FROM test_order WHERE unix_timestamp(now()) < expire_time AND id NOT in ( %s ) ORDER BY id DESC ;' % (
            pass_id_l_s)
        """
        mysql_obj = MysqlHelper()
        q = 'SELECT url,id FROM test_order WHERE unix_timestamp(now()) < expire_time AND id NOT in ( %s ) ORDER BY id DESC ;' % (
            pass_id_l_s)
        tuple_l = mysql_obj.select(q)
        del mysql_obj
        if len(tuple_l) == 0:
            # NOTE(review): logs "nothing to check, exiting" but does not
            # actually return/exit here — confirm whether that is intended.
            s = '无待检测url,程序退出'
            print(s)
            logging.info(s)
    except Exception as e:
        s = '%s%s%s' % ('初始数据,查询数据库异常,无限次重启该脚本', e,
                        time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())))
        print(s)
        logging.warning(s)
        cmd = 'python %s' % (__file__)
        os.system(cmd)
        os._exit(1024)
    # Retry policy: if the first request succeeds, stop; otherwise retry up to
    # repeat_times with repeat_sleep_times seconds between attempts.
    # mycode_l holds the marker strings expected to appear in each page.
    sleep_counter, sleep_step, sleep_seconds, mycode_l, repeat_times, repeat_sleep_times = 0, 20, 1, [
        'g3user.com',
        '51g3.com.cn'], 4, 10

    # TODO: refactor into a base-class "where list" helper.
    # f_l currently tailored to the fields needed by the caller below.
    # NOTE(review): mutable default argument f_l=[...] — harmless here since
    # it is never mutated, but fragile.
    def get_onerow(url, f_l=['title', 'uid', 'money_total'], tab='test_order'):
        # Return the newest (f_l...) row for url from tab, or -1 on failure.
        t = -1
        try:
            mysql_obj = MysqlHelper()
            f_s = ','.join(f_l)
            q = 'SELECT %s FROM %s WHERE url="%s" ORDER BY id DESC LIMIT 1' % (f_s, tab, url)
            s = '%s%s' % (' DB ', q)
            logging.info(s)
            t = mysql_obj.select(q)
            if t != -1:
                t = t[0]
            del mysql_obj
        except Exception as e:
            s = '%s%s' % (' DB ', e)
            logging.info(s)
            return t
        return t

    def chk_exception_url(url, sleep_seconds=0, http_tag='http://'):
        # Check one url: GET it with requests; on HTTP 200, look for the
        # marker strings, falling back to a PhantomJS render when the raw
        # HTML lacks them. Returns {'ok': 1|0|-1, 'status_code': int, ...}.
        time.sleep(sleep_seconds)
        global url_counter
        ret = {}
        # db url status values: 0 = unreachable, 1 = open but no ad, 2 = handled
        ret['ok'], ret['status_code'], s = -1, -1, '%s%s%s%s' % (
            time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())), ' threadID ', threading.get_ident(), url)
        try:
            if url.find('http') == -1:
                url = '%s%s' % (http_tag, url)
            r = requests.get(url)
            ret['status_code'], txt_pos = int(r.status_code), -1
            s = '%s,%s,%s,%s,%s' % (s, ret['status_code'], url, r, r.reason)
        except Exception as e:
            ret['ok'] = 0
            s = '%s %s %s' % (s, ' SPIDER ', e)
            logging.error(s)
            print(e, url)
        # Currently only HTTP 200 responses are inspected for the marker.
        if ret['status_code'] == 200:
            for ii in mycode_l:
                if r.text.find(ii) > -1:
                    ret['ok'], txt_pos = 1, 1
                    break
            if txt_pos == -1:
                # Marker absent from raw HTML: render with PhantomJS in case
                # the marker is injected by JavaScript.
                try:
                    driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                                 executable_path='/usr/local/phantomjs/bin/phantomjs')
                    driver.get(url)
                    time.sleep(1)
                    page_source = driver.page_source
                    driver.quit()
                    for ii in mycode_l:
                        if page_source.find(ii) > -1:
                            ret['ok'] = 1
                            break
                    if ret['ok'] == -1:
                        s = '%s%s' % (s, '返回200,但是在html中未检测到我公司代码。')
                        ret['ok'], ret['info'] = 0, s
                except Exception as e:
                    s = '%s %s %s' % (s, ' SPIDER ', e)
                    logging.error(s)
                    print(e, url)
        # elif ret['status_code'] == 403:
        # www.hsdcw.com/fenlei/41668214.html
        elif ret['status_code'] == 403:
            # 403 deliberately ignored (see example url above).
            pass
        else:
            ret['ok'], ret['info'] = 0, s
        url_counter += 1
        s = '%s/%s%s%s' % (url_counter, len(tuple_l), 'chk-ret', s)
        print(s)
        if ret['ok'] == 0:
            logging.warning(s)
        else:
            logging.info(s)
        return ret

    # Thread partitioning: each thread handles a slice of tstep urls.
    tn, tl, tstep = len(tuple_l), [], 4000

    def tf(ts):
        # Worker: process tuple_l[ts:ts+tstep]; write failures to test_error.
        te = ts + tstep
        te = min(te, tn)
        for i in tuple_l[ts:te]:
            ctrl_runtime(exit_type='os')
            url, chk_id = i
            s = '%s%s%s%s' % (
                time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())), ' threadID ', threading.get_ident(),
                url)
            if chk_id in pass_id_l:
                # NOTE(review): only logs the skip — it does not `continue`,
                # so the url is still processed; confirm intent.
                s = '%s%s' % (s, ' 跳过,之前test_error已写入该url ')
                logging.info(s)
                print(s)
            """
            针对新浪爱问的规则: 不检测
            """
            if url.find('iask.sina.com') > -1:
                continue
            write_db_flag = 1
            for t in range(0, repeat_times, 1):
                ret = chk_exception_url(url, repeat_sleep_times)
                if ret['ok'] == 1:
                    write_db_flag = 0
                    break
            if write_db_flag == 1:
                try:
                    title, uid, money_total = get_onerow(url)
                except Exception as e:
                    s = '%s%s%s' % (s, ' DB Exception-去test_order查', e)
                    logging.info(s)
                    print(s)
                    break
                # Multithreading: due to the underlying package's DB limits,
                # instantiate the DB class per use and delete it afterwards.
                try:
                    # Could be wrapped into the class constructor.
                    mysql_obj = MysqlHelper()
                except Exception as e:
                    s = '%s%s%s' % (s, ' DB Exception- ', e)
                    logging.error(s)
                    print(s)
                    break
                """
                多进程、线程并发 待优化,比如队列
                """
                q = 'SELECT id FROM test_error WHERE url="%s" LIMIT 1' % (url)
                try:
                    r = mysql_obj.select(q)
                    s = '%s%s%s' % (s, ' -SQL- ', q)
                    logging.info(s)
                    print(q)
                except Exception as e:
                    s = '%s %s %s %s' % (s, ' DB Exception-', q, e)
                    logging.info(s)
                    print(s)
                    break
                ctime = int(time.time())
                # Suggest revisiting this DB design.
                db_status = 1 if ret['status_code'] == 200 else 0
                if len(r) == 0:
                    # NOTE(review): string-built SQL with external data —
                    # parameterized queries would be safer.
                    q = 'INSERT INTO test_error (title,url,status,remarks,update_time,create_time,uid,money,direct_order_id) VALUES ("%s","%s","%s","%s","%s","%s","%s","%s","%s")' % (
                        title, url, db_status, ret['info'], ctime, ctime, uid, money_total, chk_id)
                    try:
                        mysql_obj.execute(q)
                        mysql_obj.commit()
                        del mysql_obj
                        s = '%s%s%s' % (s, ' DB SQL ok ', q)
                        logging.info(s)
                        print(s)
                    except Exception as e:
                        s = '%s%s%s%s' % (s, ' DB Exception- ', q, e)
                        logging.error(s)
                        print(s)
                elif len(r) == 1:
                    continue

    # Spawn one thread per tstep-sized slice, start them all, then join.
    for i in range(0, tn, tstep):
        if i >= tn:
            break
        thread_instance = MyThread(tf, (i), tf.__name__)
        tl.append(thread_instance)
    for t in tl:
        # NOTE(review): this assigns over the setDaemon *method* instead of
        # calling t.setDaemon(False) / setting t.daemon — confirm intent.
        t.setDaemon = False
        t.start()
    for t in tl:
        t.join()


if __name__ == '__main__':
    main()
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.firefox.options import Options
import time
from time import sleep
import math
import random
import sys
import threading
from random import choice
# import urllib.parse
from bs4 import BeautifulSoup
# Load the mobile User-Agent pool: keep every non-empty 'Mozilla' line,
# stripped of its trailing newline and surrounding whitespace.
with open('mobile_ua.txt', 'r', encoding='utf-8') as ua_file:
    ua_list = [line.replace('\n', '').strip() for line in ua_file if line.find('Mozilla') > -1]
# Highest valid index, used as the upper bound for random.randint below.
ua_list_len_ = len(ua_list) - 1
def close_alert(browser, attitude='accept'):
    """Intentional no-op stub: alert handling is disabled in this script.

    The signature is kept compatible with the other scripts' close_alert
    (browser + 'accept'/'dismiss' attitude); the commented lines below are
    earlier attempts to suppress alerts by overriding window.alert via JS.
    """
    # js='alert(window.alert=function(str){return;}'
    # browser.execute_script(js)
    # js= 'window.alert = function(str){return ;}'
    # browser.execute_script(js)
    return
# Prototype: open Baidu mobile search for a seed keyword and scrape the
# "related words" suggestion links (anchors with class 'rw-item').
# Mobile emulation is currently disabled — a plain Chrome session is used;
# the random UA index is still drawn so the RNG sequence is unchanged.
ua_list_index = random.randint(0, ua_list_len_)

browser = webdriver.Chrome()

# Build the search URL by substituting the seed keyword into the template.
s_wd = '长尾'
url_seed = 'https://m.baidu.com/s?word=s_wd'.replace('s_wd', s_wd)
print(url_seed)
browser.get(url_seed)

# Collect each suggestion's display contents and target href.
rd = BeautifulSoup(browser.page_source, 'html.parser').find_all('a', class_='rw-item')
res_d_l = []
for item in rd:
    res_d_l.append({'contents': item.contents, 'href': item.attrs['href']})
browser.quit()
d = 3