spider_action
spider: from mobile search to mobile result pages
# Mobile-emulation spider: for each target place name parsed from the JMTool
# task CSV, search "Baidu Map" through Sogou mobile search and save every
# result page as a timestamped HTML file.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.firefox.options import Options
import time
from time import sleep
import math
import random
import sys

# Separators used to strip decorations/suffixes from place names.
tag_jmtool_list = ['(', '(', '-']

# Pool of mobile user-agent strings, one per line in mobile_ua.txt.
ua_list = []
with open('mobile_ua.txt', 'r', encoding='utf-8') as uafile:
    for i in uafile:
        if i.find('Mozilla') > -1:
            ua_list.append(i.replace('\n', '').strip())
ua_list_len_ = len(ua_list) - 1  # highest valid index for random.randint


def extract_name(name_):
    """Return name_ truncated at the first occurrence of any separator in
    tag_jmtool_list (keeps only the leading, canonical part of the name)."""
    for sep in tag_jmtool_list:
        name_ = name_.split(sep)[0]
    return name_


target_type_list = ['住宅小区', '写字楼']
target_type_list = ['住宅小区']  # NOTE: deliberately narrowed to one type

# target_dic[city][district][type_][name_reduction] -> {'name_reduction_list': [...],
#                                                       'history_list': [...]}
target_dic = {}
with open('JMTool0819am/任务JMTool.csv', 'r', encoding='utf-8') as csvfile:
    for i in csvfile:
        # NOTE(review): the quote/space stripping below was reconstructed from
        # garbled source text — verify against the actual CSV format.
        l = i.replace(' ', '').replace('\n', '').split('";"')
        if l[0].replace('"', '') in target_type_list:
            type_, city, district, addr, name_ = l
            type_, name_ = type_.replace('"', ''), name_.replace('"', '')
            name_reduction = extract_name(name_)
            if city not in target_dic:
                target_dic[city] = {}
            if district not in target_dic[city]:
                target_dic[city][district] = {}
            if type_ not in target_dic[city][district]:
                target_dic[city][district][type_] = {}
            # BUG FIX: the membership test previously checked
            # target_dic[city][district] instead of ...[type_], so the lists
            # were re-created (and earlier rows lost) on every iteration.
            if name_reduction not in target_dic[city][district][type_]:
                target_dic[city][district][type_][name_reduction] = {}
                target_dic[city][district][type_][name_reduction]['name_reduction_list'] = []
                target_dic[city][district][type_][name_reduction]['history_list'] = []
            target_dic[city][district][type_][name_reduction]['name_reduction_list'].append(name_)
            target_dic[city][district][type_][name_reduction]['history_list'].append(l)


def write_res_html(browser, dir_='baidu_map_html/'):
    """Dump browser.page_source — prefixed with an HTML comment holding the
    query string and the current URL — to a timestamped file under dir_.

    Relies on the module-global ``input_`` set by the driver loop below.
    """
    current_url_ = '%s%s%s%s' % ('<!--', input_, browser.current_url, '-->')
    page_source = '%s%s' % (current_url_, browser.page_source)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s%s' % (dir_, input_, localtime_, '.html')
    # BUG FIX: was `fo.closed` — an attribute read, not a call — so the file
    # handle leaked; a context manager always closes it.
    with open(file_name, 'w', encoding='utf-8') as fo:
        fo.write(page_source)


def gen_random_letter():
    """One random lowercase ASCII letter ('a'..'z')."""
    return chr(random.randint(97, 122))


def gen_random_num():
    """Random int 0..10 inclusive (NOTE: 11 possible values — kept as-is)."""
    return random.randint(0, 10)


def gen_sougo_pid():
    """Build a 16-char pseudo pid: letters at positions 1, 3, 4, 15 and
    digits everywhere else."""
    res_ = ''
    for i in range(1, 17, 1):
        if i in [1, 3, 4, 15]:
            res_ = '%s%s' % (res_, gen_random_letter())
        else:
            res_ = '%s%s' % (res_, gen_random_num())
    return res_


def close_alert(browser, attitude='accept'):
    """Accept or dismiss a JS alert if one is open; swallow the error if not."""
    try:
        sleep(2)
        # BUG FIX: switch_to.alert is a property, not a callable — calling it
        # raised TypeError, so the alert was never actually handled.
        al = browser.switch_to.alert
        sleep(1)
        if attitude == 'accept':
            al.accept()
        elif attitude == 'dismiss':
            al.dismiss()
        print(sys._getframe().f_lineno, 'alert-closed-ok')
    except Exception:
        print(sys._getframe().f_lineno, Exception, 'no-alert')


# input_ = '深圳市南山区荟芳园'
def mobile_mobile_pages_html(input_):
    """Search Baidu Map via Sogou mobile for input_ and save every result
    page as HTML. The browser is quit on every exit path."""
    ua_list_index = random.randint(0, ua_list_len_)
    mobile_emulation = {
        "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0}}
    mobile_emulation['userAgent'] = ua_list[ua_list_index]
    chrome_options = Options()
    chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
    browser = webdriver.Chrome(chrome_options=chrome_options)
    url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-%s-0007&keyword=百度地图' % (gen_sougo_pid())
    print(url_seed)
    browser.get(url_seed)
    # Fill the search box via JS, then click through to the map vertical.
    js = '%s%s%s' % ('document.getElementsByClassName("input-default js_input")[0].value="', input_, '"')
    browser.execute_script(js)
    xp_newpage = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
    browser.find_element_by_xpath(xp_newpage).click()
    sleep(2)
    close_alert(browser)
    try:
        xp = '//*[@id="place-widget-placenewlist-showall"]/span[1]'
        sleep(2)
        close_alert(browser)
        browser.find_element_by_xpath(xp)
    except Exception:
        print(sys._getframe().f_lineno, Exception)
        # BUG FIX (resource leak): early returns previously abandoned the
        # Chrome process; quit before giving up on this query.
        browser.quit()
        return
    close_alert(browser)
    if browser.find_element_by_xpath(xp).text.find('全部') == -1:
        browser.quit()  # BUG FIX: same leak as above
        return
    # "全部N条" -> N = total result count; 10 results per page.
    res_num = browser.find_element_by_xpath(xp).text.split('全部')[1].split('条')[0]
    res_num = int(res_num)
    page_num = 10
    loop_breaker = math.ceil(res_num / page_num)
    close_alert(browser)
    if res_num <= page_num:
        # Single page of results: save it and stop.
        write_res_html(browser)
        browser.quit()
        return
    close_alert(browser)
    xp = '//*[@id="place-widget-placenewlist-showall"]'
    browser.find_element_by_xpath(xp).click()
    write_res_html(browser)
    close_alert(browser)
    js = "window.scrollTo(0,document.body.scrollHeight)"
    browser.execute_script(js)
    sleep(1)
    try:
        xp_newpage = '//*[@id="fis_elm__7"]/div/div[2]/span[2]'
        browser.find_element_by_xpath(xp_newpage).click()
        sleep(1)
    except Exception:
        print(sys._getframe().f_lineno, Exception)
        write_res_html(browser)
        browser.quit()
        return
    # Page through the remaining result pages, saving each one.
    for i in range(1, loop_breaker, 1):
        sleep(1)
        try:
            xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
            sleep(3)
            browser.find_element_by_xpath(xp).click()
        except Exception:
            print(sys._getframe().f_lineno, Exception)
            sleep(10)
            break
        try:
            js = "window.scrollTo(0,document.body.scrollHeight)"
            browser.execute_script(js)
            sleep(1)
        except Exception:
            print(sys._getframe().f_lineno, Exception)
            sleep(10)
        try:
            xp_newpage = '//*[@id="fis_elm_pager__qk_7"]/div/div/span[2]'
            sleep(1)
            print(input_, i)
            browser.find_element_by_xpath(xp_newpage).click()
            write_res_html(browser)
        except Exception:
            print(sys._getframe().f_lineno, Exception)
            sleep(10)
    sleep(2)
    browser.quit()


# Driver loop: one search per (city, district, reduced-name) combination.
for city in target_dic:
    for district in target_dic[city]:
        for type_ in target_dic[city][district]:
            for name_reduction in target_dic[city][district][type_]:
                input_ = '%s%s%s' % (city, district, name_reduction)
                mobile_mobile_pages_html(input_)
# Earlier single-query version: search one hard-coded place via Sogou mobile
# search, then page through the Baidu Map results, saving each page as HTML.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from time import sleep
import math

url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-123asd-0007&keyword=百度地图'
mobile_emulation = {
    "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
    "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
chrome_options = Options()
chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
browser = webdriver.Chrome(chrome_options=chrome_options)
browser.get(url_seed)
input_ = '深圳市南山区荟芳园'
# Fill the search box via injected JS, then click the Baidu Map vertical link.
js = '%s%s%s' % ('document.getElementsByClassName("input-default js_input")[0].value="', input_, '"')
browser.execute_script(js)
xp_newpage = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
browser.find_element_by_xpath(xp_newpage).click()
sleep(1)
xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
browser.find_element_by_xpath(xp).click()
xp = '//*[@id="place-widget-placenewlist-showall"]/span[1]'
browser.find_element_by_xpath(xp)
# "全部N条" -> N = total result count; 10 results per page.
res_num = browser.find_element_by_xpath(xp).text.split('全部')[1].split('条')[0]
res_num = int(res_num)
page_num = 10
loop_breaker = math.ceil(res_num / page_num)


def write_res_html(browser, dir_='baidu_map_html/'):
    """Save the current page source — prefixed with an HTML comment holding
    the query and URL — to a timestamped file under dir_.

    Relies on the module-global ``input_`` defined above.
    """
    current_url_ = '%s%s%s%s' % ('<!--', input_, browser.current_url, '-->')
    page_source = '%s%s' % (current_url_, browser.page_source)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s%s' % (dir_, input_, localtime_, '.html')
    # BUG FIX: was `fo.closed` (a no-op attribute read) — the handle leaked;
    # a context manager always closes it.
    with open(file_name, 'w', encoding='utf-8') as fo:
        fo.write(page_source)


xp = '//*[@id="place-widget-placenewlist-showall"]'
browser.find_element_by_xpath(xp).click()
write_res_html(browser)
js = "window.scrollTo(0,document.body.scrollHeight)"
browser.execute_script(js)
sleep(1)
xp_newpage = '//*[@id="fis_elm__7"]/div/div[2]/span[2]'
browser.find_element_by_xpath(xp_newpage).click()
sleep(1)
# Page through the remaining result pages, saving each one.
for i in range(1, loop_breaker, 1):
    sleep(1)
    xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
    browser.find_element_by_xpath(xp).click()
    js = "window.scrollTo(0,document.body.scrollHeight)"
    browser.execute_script(js)
    sleep(1)
    xp_newpage = '//*[@id="fis_elm_pager__qk_7"]/div/div/span[2]'
    browser.find_element_by_xpath(xp_newpage).click()
    write_res_html(browser)
# Minimal proof-of-concept: open Sogou mobile search under Chrome mobile
# emulation, fill the search box via injected JS, and click through to the
# Baidu Map vertical result.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-123asd-0007&keyword=百度地图'
emulation_cfg = {
    "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
    "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
opts = Options()
opts.add_experimental_option("mobileEmulation", emulation_cfg)
browser = webdriver.Chrome(chrome_options=opts)
browser.get(url_seed)
# Type the query into the search box, then follow the map result link.
js = 'document.getElementsByClassName("input-default js_input")[0].value="深圳市南山区海岸城"'
browser.execute_script(js)
xp = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
browser.find_element_by_xpath(xp).click()
ua
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6
Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
Opera/9.25 (Windows NT 5.1; U; en)
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)
Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6
Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0
import os, sys
import time
import logging
import requests
import threading
from random import choice
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

"""
Global convention (kept for later log analysis): the script aborts with
os._exit(INT) using these codes:
4001 - project import (core.utils.MysqlHelper) failed
4002 - logging setup failed
4003 - ua_list.txt missing or unreadable
4004 - runtime limit reached
"""

os_sep = os.sep
this_file_abspath, this_file_name = os.path.dirname(os.path.abspath(__file__)), \
    os.path.abspath(__file__).split(os_sep)[-1]
base_dir = os.path.dirname(os_sep.join(os.path.abspath(__file__).split(os_sep)[0:-2]))
log_abspath = '%s%s%s' % (base_dir, os_sep, 'log')

# Bootstrap logging must not depend on the logging module being configured.
now_, e = time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), '启动脚本'
logf, s = '%s%s%s%s' % (log_abspath, os_sep, this_file_name, now_), \
    '%s%s%s%s' % (__file__, now_, os.getcwd(), e)
with open(logf, 'a') as fo:
    fo.write(s)
print(s)

try:
    sys.path.append(base_dir)
    from core.utils import MysqlHelper
except Exception as e:
    s = '%s%s%s' % ('from core.utils import MysqlHelper EXCEPTION ',
                    time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), e)
    with open(logf, 'a') as fo:
        fo.write(s)
    print(s)
    os._exit(4001)

try:
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s[thread:%(thread)d][process:%(process)d]',
        datefmt='%a, %d %b %Y %H:%M:%S',
        filename=logf,
        filemode='a')
except Exception as e:
    s = '%s%s%s' % ('logging.basicConfig EXCEPTION ',
                    time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), e)
    with open(logf, 'a') as fo:
        fo.write(s)
    print(s)
    os._exit(4002)

# Load the PhantomJS user-agent pool; abort if the file is unusable.
try:
    fua, lua = '%s%s%s' % (this_file_abspath, os_sep, 'ua_list.txt'), []
    with open(fua, 'r') as fo:
        for i in fo:
            lua.append(i.replace('\n', ''))
except Exception as e:
    s = '%s%s' % ('打开文件 EXCEPTION ua文件路径: ', fua)
    logging.error(s)
    print(s)
    os._exit(4003)

dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = choice(lua)
# NOTE(review): source text was garbled here — reconstructed as empty strings;
# confirm the intended values.
dcap['browserName'], dcap['platform'] = '', ''


class MyThread(threading.Thread):
    """Thread that calls func(args) exactly once; name is informational."""

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func, self.args, self.name = func, args, name

    def run(self):
        self.func(self.args)


ctrl_start, max_script_time = time.time(), 3600 * 4


def ctrl_runtime(exit_type=''):
    """Abort once the script has run longer than max_script_time seconds.

    exit_type selects the abort mechanism: '' -> exit(), 'sys' -> sys.exit(),
    'os' -> os._exit(4004) (os._exit takes an int status, hence the code).
    """
    if time.time() - ctrl_start >= max_script_time:
        s = '%s%s%s%s%s%s%s%s%s' % ('程序开始执行时间', ctrl_start, '执行时间阈值',
                                    max_script_time, '终止执行', ' exit_type =',
                                    exit_type, ' threadID ', threading.get_ident())
        logging.info(s)
        if exit_type == '':
            exit(s)
        elif exit_type == 'sys':
            sys.exit(s)
        elif exit_type == 'os':
            os._exit(4004)


url_counter = 0


def main():
    """Fetch unexpired, not-yet-failed urls from the DB, check each for our
    ad code (requests first, PhantomJS fallback), and record failures in
    test_error. On a DB error during initialisation the script re-launches
    itself and exits.
    """
    try:
        mysql_obj = MysqlHelper()
        q = 'SELECT direct_order_id FROM test_error;'
        tuple_l = mysql_obj.select(q)
        pass_id_l = [i[0] for i in tuple_l]
        pass_id_l = [str(i) for i in pass_id_l]
        pass_id_l_s = ','.join(pass_id_l)
        del mysql_obj, tuple_l
        # Business rule: a currently-live url has exactly one row in test_order.
        # (Later task: re-check urls already accumulated in test_error.)
        q = 'SELECT url,id FROM test_order WHERE unix_timestamp(now()) - create_time<=3600*48 AND id NOT in ( %s ) ORDER BY id DESC ;' % (
            pass_id_l_s)
        q = 'SELECT url,id FROM test_order WHERE unix_timestamp(now()) < expire_time AND id NOT in ( %s ) ORDER BY id DESC ;' % (
            pass_id_l_s)
        mysql_obj = MysqlHelper()
        tuple_l = mysql_obj.select(q)
        del mysql_obj
        if len(tuple_l) == 0:
            s = '无待检测url,程序退出'
            print(s)
            logging.info(s)
            # BUG FIX: the message says "program exits" but execution fell
            # through; return so an empty batch does not spawn threads.
            return
    except Exception as e:
        s = '%s%s%s' % ('初始数据,查询数据库异常,无限次重启该脚本', e,
                        time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())))
        print(s)
        logging.warning(s)
        cmd = 'python %s' % (__file__)
        os.system(cmd)
        os._exit(1024)

    # Per-url policy (script runs roughly hourly): stop on the first request
    # that matches expectations; otherwise retry up to repeat_times with
    # repeat_sleep_times seconds between attempts.
    sleep_counter, sleep_step, sleep_seconds, mycode_l, repeat_times, repeat_sleep_times = \
        0, 20, 1, ['g3user.com', '51g3.com.cn'], 4, 10

    # TODO: fold into a DB base class with a generic field-list WHERE helper.
    def get_onerow(url, f_l=['title', 'uid', 'money_total'], tab='test_order'):
        """Return the newest (f_l) row for url from tab, or -1 on DB error.

        NOTE(review): the mutable default f_l is never mutated here, kept as-is.
        """
        t = -1
        try:
            mysql_obj = MysqlHelper()
            f_s = ','.join(f_l)
            q = 'SELECT %s FROM %s WHERE url="%s" ORDER BY id DESC LIMIT 1' % (f_s, tab, url)
            s = '%s%s' % (' DB ', q)
            logging.info(s)
            t = mysql_obj.select(q)
            if t != -1:
                t = t[0]
            del mysql_obj
        except Exception as e:
            s = '%s%s' % (' DB ', e)
            logging.info(s)
            return t
        return t

    def chk_exception_url(url, sleep_seconds=0, http_tag='http://'):
        """Check whether url still serves our ad code.

        Returns a dict ret:
          ok          : 1 code found / 0 missing or error / -1 undetermined
          status_code : HTTP status (or -1 if the request itself failed)
          info        : diagnostic string (set on failure paths only)
        """
        time.sleep(sleep_seconds)
        global url_counter
        ret = {}
        # DB url status values: 0 unreachable, 1 opens but no ad, 2 handled.
        ret['ok'], ret['status_code'], s = -1, -1, '%s%s%s%s' % (
            time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())),
            ' threadID ', threading.get_ident(), url)
        try:
            if url.find('http') == -1:
                url = '%s%s' % (http_tag, url)
            r = requests.get(url)
            ret['status_code'], txt_pos = int(r.status_code), -1
            s = '%s,%s,%s,%s,%s' % (s, ret['status_code'], url, r, r.reason)
        except Exception as e:
            ret['ok'] = 0
            s = '%s %s %s' % (s, ' SPIDER ', e)
            logging.error(s)
            print(e, url)
        # Currently only a 200 from the target site is inspected further.
        if ret['status_code'] == 200:
            for ii in mycode_l:
                if r.text.find(ii) > -1:
                    ret['ok'], txt_pos = 1, 1
                    break
            if txt_pos == -1:
                # Static HTML missed our code — render with PhantomJS in case
                # the snippet is injected by JavaScript.
                try:
                    driver = webdriver.PhantomJS(
                        desired_capabilities=dcap,
                        executable_path='/usr/local/phantomjs/bin/phantomjs')
                    driver.get(url)
                    time.sleep(1)
                    page_source = driver.page_source
                    driver.quit()
                    for ii in mycode_l:
                        if page_source.find(ii) > -1:
                            ret['ok'] = 1
                            break
                    if ret['ok'] == -1:
                        s = '%s%s' % (s, '返回200,但是在html中未检测到我公司代码。')
                        ret['ok'], ret['info'] = 0, s
                except Exception as e:
                    s = '%s %s %s' % (s, ' SPIDER ', e)
                    logging.error(s)
                    print(e, url)
        elif ret['status_code'] == 403:
            # e.g. www.hsdcw.com/fenlei/41668214.html blocks bots — skip.
            pass
        else:
            ret['ok'], ret['info'] = 0, s
        url_counter += 1
        s = '%s/%s%s%s' % (url_counter, len(tuple_l), 'chk-ret', s)
        print(s)
        if ret['ok'] == 0:
            logging.warning(s)
        else:
            logging.info(s)
        return ret

    tn, tl, tstep = len(tuple_l), [], 4000

    def tf(ts):
        """Worker: check every url in tuple_l[ts:ts+tstep]."""
        te = ts + tstep
        te = min(te, tn)
        for i in tuple_l[ts:te]:
            ctrl_runtime(exit_type='os')
            url, chk_id = i
            s = '%s%s%s%s' % (time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())),
                              ' threadID ', threading.get_ident(), url)
            # BUG FIX: chk_id is an int while pass_id_l holds strings, so the
            # membership test never matched; compare as str and actually skip
            # (the original logged "skip" but fell through).
            if str(chk_id) in pass_id_l:
                s = '%s%s' % (s, ' 跳过,之前test_error已写入该url ')
                logging.info(s)
                print(s)
                continue
            # Rule for Sina iask pages: never check.
            if url.find('iask.sina.com') > -1:
                continue
            write_db_flag = 1
            for t in range(0, repeat_times, 1):
                ret = chk_exception_url(url, repeat_sleep_times)
                if ret['ok'] == 1:
                    write_db_flag = 0
                    break
            if write_db_flag == 1:
                try:
                    title, uid, money_total = get_onerow(url)
                except Exception as e:
                    s = '%s%s%s' % (s, ' DB Exception-去test_order查', e)
                    logging.info(s)
                    print(s)
                    break
                # One MysqlHelper per use — the wrapped connection is not
                # shared safely across threads.
                try:
                    mysql_obj = MysqlHelper()
                except Exception as e:
                    s = '%s%s%s' % (s, ' DB Exception- ', e)
                    logging.error(s)
                    print(s)
                    break
                # TODO: move concurrency to a queue. SECURITY: the SQL below is
                # string-built (injection-prone) — parameterize when the DB
                # helper supports it.
                q = 'SELECT id FROM test_error WHERE url="%s" LIMIT 1' % (url)
                try:
                    r = mysql_obj.select(q)
                    s = '%s%s%s' % (s, ' -SQL- ', q)
                    logging.info(s)
                    print(q)
                except Exception as e:
                    s = '%s %s %s %s' % (s, ' DB Exception-', q, e)
                    logging.info(s)
                    print(s)
                    break
                ctime = int(time.time())
                # TODO: revisit this status encoding in the schema.
                db_status = 1 if ret['status_code'] == 200 else 0
                if len(r) == 0:
                    # BUG FIX: ret['info'] is unset on some failure paths
                    # (e.g. 403) and raised KeyError, killing the worker.
                    q = 'INSERT INTO test_error (title,url,status,remarks,update_time,create_time,uid,money,direct_order_id) VALUES ("%s","%s","%s","%s","%s","%s","%s","%s","%s")' % (
                        title, url, db_status, ret.get('info', ''), ctime, ctime, uid, money_total, chk_id)
                    try:
                        mysql_obj.execute(q)
                        mysql_obj.commit()
                        del mysql_obj
                        s = '%s%s%s' % (s, ' DB SQL ok ', q)
                        logging.info(s)
                        print(s)
                    except Exception as e:
                        s = '%s%s%s%s' % (s, ' DB Exception- ', q, e)
                        logging.error(s)
                        print(s)
                elif len(r) == 1:
                    continue

    # Fan out one thread per tstep-sized slice of tuple_l.
    for i in range(0, tn, tstep):
        if i >= tn:
            break
        thread_instance = MyThread(tf, (i), tf.__name__)
        tl.append(thread_instance)
    for t in tl:
        # BUG FIX: `t.setDaemon = False` overwrote the method with a bool and
        # never set the flag; assign the daemon attribute instead.
        t.daemon = False
        t.start()
    for t in tl:
        t.join()


if __name__ == '__main__':
    main()
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.firefox.options import Options
import time
from time import sleep
import math
import random
import sys
import threading
from random import choice
# import urllib.parse
from bs4 import BeautifulSoup
# Load candidate mobile user-agent strings: keep only lines mentioning
# "Mozilla", with the trailing newline and surrounding whitespace stripped.
with open('mobile_ua.txt', 'r', encoding='utf-8') as uafile:
    ua_list = [line.replace('\n', '').strip()
               for line in uafile
               if line.find('Mozilla') > -1]
ua_list_len_ = len(ua_list) - 1  # highest valid index for random.randint
def close_alert(browser, attitude='accept'):
    """No-op placeholder kept for interface compatibility.

    Earlier revisions tried to neutralise window.alert with injected JS
    (commented attempts below); this version intentionally does nothing.
    """
    # js='alert(window.alert=function(str){return;}'
    # browser.execute_script(js)
    # js= 'window.alert = function(str){return ;}'
    # browser.execute_script(js)
    return None
# Previous approach (kept for reference): Chrome mobile emulation with a
# fixed Android user agent.
# mobile_emulation = {
# "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
# "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
ua_list_index = random.randint(0, ua_list_len_)  # NOTE(review): computed but unused below
# Alternative (disabled): mobile emulation with a random UA from ua_list.
# mobile_emulation = {
# "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0}}
#
# mobile_emulation['userAgent'] = choice(ua_list)
# chrome_options = Options()
# chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
# browser = webdriver.Chrome(chrome_options=chrome_options)
browser = webdriver.Chrome()  # plain desktop Chrome, no emulation
# Seed query substituted into the Baidu mobile search URL template.
s_wd = '长尾'
url_seed = 'https://m.baidu.com/s?word=s_wd'
url_seed = url_seed.replace('s_wd', s_wd)
print(url_seed)
browser.get(url_seed)
# Collect Baidu's related-search suggestion anchors (class "rw-item").
rd = BeautifulSoup(browser.page_source, 'html.parser').find_all('a', class_='rw-item')
# One dict per suggestion: the anchor's child nodes and its target href.
res_d_l = [{'contents': d.contents, 'href': d.attrs['href']} for d in rd]
browser.quit()
d = 3  # NOTE(review): leftover debug assignment; purpose unclear — confirm before removing
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步