spider_action

 

 

spider from mobile to mobile to mobile

 

 

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.firefox.options import Options
import time
from time import sleep
import math
import random
import sys
 
# Delimiters that mark the start of a suffix to strip from a place name
# (full-width paren, half-width paren, dash).
tag_jmtool_list = ['(', '(', '-']

# Load mobile user-agent strings; keep only lines that look like real UAs.
with open('mobile_ua.txt', 'r', encoding='utf-8') as uafile:
    ua_list = [line.replace('\n', '').strip()
               for line in uafile if line.find('Mozilla') > -1]

# Highest valid index into ua_list (used with random.randint later).
ua_list_len_ = len(ua_list) - 1
 
 
def extract_name(name_, separators=None):
    """Return the base place name with any tagged suffix removed.

    Splits on each separator in turn and keeps the part before it, e.g.
    'XX小区(一期' -> 'XX小区'.

    :param name_: raw place name, possibly carrying a suffix tag.
    :param separators: iterable of delimiter strings; defaults to the
        module-level ``tag_jmtool_list`` (generalized from the previous
        hard-coded global so the function is reusable and testable).
    :return: the name truncated at the first occurrence of each separator.
    """
    if separators is None:
        separators = tag_jmtool_list
    for sep in separators:
        name_ = name_.split(sep)[0]
    return name_
 
 
# Place types we want to crawl; the second assignment deliberately narrows
# this run to residential estates only.
target_type_list = ['住宅小区', '写字楼']
target_type_list = ['住宅小区']

# target_dic[city][district][type_][name_reduction] -> {
#     'name_reduction_list': raw names that reduce to this base name,
#     'history_list': raw CSV rows for those names }
target_dic = {}
with open('JMTool0819am/任务JMTool.csv', 'r', encoding='utf-8') as csvfile:
    for i in csvfile:
        # Rows look like: "type";"city";"district";"addr";"name"
        l = i.replace(' ', '').replace('\n', '').split('";"')
        if l[0].replace('"', '') in target_type_list:
            type_, city, district, addr, name_ = l
            type_, name_ = type_.replace('"', ''), name_.replace('"', '')
            name_reduction = extract_name(name_)

            if city not in target_dic:
                target_dic[city] = {}
            if district not in target_dic[city]:
                target_dic[city][district] = {}
            if type_ not in target_dic[city][district]:
                target_dic[city][district][type_] = {}
            # BUG FIX: this membership test used to check
            # target_dic[city][district] (whose keys are type_ values), so a
            # repeated name_reduction re-initialized the entry and wiped the
            # lists accumulated so far. Check the type_-level dict instead.
            if name_reduction not in target_dic[city][district][type_]:
                target_dic[city][district][type_][name_reduction] = {
                    'name_reduction_list': [],
                    'history_list': [],
                }

            target_dic[city][district][type_][name_reduction]['name_reduction_list'].append(name_)
            target_dic[city][district][type_][name_reduction]['history_list'].append(l)
 
 
def write_res_html(browser, dir_='baidu_map_html/'):
    """Dump the browser's current page source to a timestamped HTML file.

    The current URL plus the module-global ``input_`` search string is
    embedded as a leading HTML comment so the origin of each dump can be
    recovered later.

    :param browser: a live selenium WebDriver instance.
    :param dir_: output directory (must exist; trailing slash expected).

    NOTE(review): depends on the module-global ``input_`` set by the caller.
    """
    current_url_ = '%s%s%s%s' % ('<!--', input_, browser.current_url, '-->')
    page_source = '%s%s' % (current_url_, browser.page_source)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s%s' % (dir_, input_, localtime_, '.html')
    # BUG FIX: the original ended with ``fo.closed`` — a no-op attribute
    # access, not a call — so the handle was never explicitly closed. A
    # context manager guarantees flush + close even on write errors.
    with open(file_name, 'w', encoding='utf-8') as fo:
        fo.write(page_source)
 
 
def gen_random_letter():
    """Return one random lowercase ASCII letter ('a'-'z')."""
    return chr(random.randint(ord('a'), ord('z')))
 
 
def gen_random_num():
    """Return one random decimal digit as an int (0-9).

    BUG FIX: ``random.randint`` is inclusive at both ends, so the original
    ``randint(0, 10)`` could return 10; interpolated into the 16-slot pid
    built by ``gen_sougo_pid`` that produced an occasional 17-character pid.
    """
    return random.randint(0, 9)
 
 
def gen_sougo_pid():
    """Build a pseudo-random 16-character sogou pid.

    Positions 1, 3, 4 and 15 (1-based) are lowercase letters; every other
    position is a random digit.
    """
    letter_positions = {1, 3, 4, 15}
    chars = []
    for pos in range(1, 17):
        if pos in letter_positions:
            chars.append(str(gen_random_letter()))
        else:
            chars.append(str(gen_random_num()))
    return ''.join(chars)
 
 
def close_alert(browser, attitude='accept'):
    """Accept or dismiss a JavaScript alert if one is currently displayed.

    :param browser: a live selenium WebDriver instance.
    :param attitude: 'accept' to confirm the alert, 'dismiss' to cancel it.

    Silently prints and returns if no alert is present.
    """
    try:
        sleep(2)
        # BUG FIX: ``switch_to.alert`` is a property, not a method; the
        # original ``switch_to.alert()`` tried to call the Alert object,
        # raised TypeError on every invocation, and so never closed anything.
        al = browser.switch_to.alert
        sleep(1)
        if attitude == 'accept':
            al.accept()
        elif attitude == 'dismiss':
            al.dismiss()
        print(sys._getframe().f_lineno, 'alert-closed-ok')
    except Exception as e:
        # BUG FIX: the original printed the ``Exception`` class object
        # instead of the caught instance, hiding the actual error.
        print(sys._getframe().f_lineno, e, 'no-alert')
 
 
# input_ = '深圳市南山区荟芳园'
 
def mobile_mobile_pages_html(input_):
    """Search sogou mobile web for Baidu Maps, query ``input_`` inside the
    map result, and save every result page's HTML via write_res_html().

    A fresh Chrome session with a randomly chosen mobile user agent is used
    per call; the browser is quit (or the function returns early) when done.
    """
    # mobile_emulation = {
    #     "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
    #     "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
    # Pick a random mobile UA for this session.
    ua_list_index = random.randint(0, ua_list_len_)
    mobile_emulation = {
        "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0}}

    mobile_emulation['userAgent'] = ua_list[ua_list_index]
    chrome_options = Options()
    chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
    browser = webdriver.Chrome(chrome_options=chrome_options)

    # Seed search with a randomized pid so each request looks distinct.
    url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-%s-0007&keyword=百度地图' % (gen_sougo_pid())
    print(url_seed)
    browser.get(url_seed)
    # Type the target place into the search box and open the result page.
    js = '%s%s%s' % ('document.getElementsByClassName("input-default js_input")[0].value="', input_, '"')
    browser.execute_script(js)
    xp_newpage = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
    browser.find_element_by_xpath(xp_newpage).click()
    sleep(2)

    # xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
    # sleep(1)
    # browser.find_element_by_xpath(xp).click()
    close_alert(browser)
    # Locate the "show all N results" element; bail out if it never appears.
    try:
        xp = '//*[@id="place-widget-placenewlist-showall"]/span[1]'
        sleep(2)
        close_alert(browser)
        browser.find_element_by_xpath(xp)
    except Exception:
        print(sys._getframe().f_lineno, Exception)
        return
    close_alert(browser)
    if browser.find_element_by_xpath(xp).text.find('全部') == -1:
        return
    # Total result count parsed from text like '全部N条'; 10 results per page.
    res_num = browser.find_element_by_xpath(xp).text.split('全部')[1].split('条')[0]
    res_num = int(res_num)
    page_num = 10
    loop_breaker = math.ceil(res_num / page_num)

    close_alert(browser)
    # Single page of results: save it and stop.
    if res_num <= page_num:
        write_res_html(browser)
        browser.quit()
        return
    close_alert(browser)
    # Expand the full list, save page 1, then scroll so the pager is clickable.
    xp = '//*[@id="place-widget-placenewlist-showall"]'
    browser.find_element_by_xpath(xp).click()
    write_res_html(browser)
    close_alert(browser)
    js = "window.scrollTo(0,document.body.scrollHeight)"
    browser.execute_script(js)
    sleep(1)
    # Click through to page 2; if the pager is missing, save what we have.
    try:
        xp_newpage = '//*[@id="fis_elm__7"]/div/div[2]/span[2]'
        browser.find_element_by_xpath(xp_newpage).click()
        sleep(1)
    except Exception:
        print(sys._getframe().f_lineno, Exception)
        write_res_html(browser)
        browser.quit()
        return

    # Walk the remaining pages, saving each page's HTML as we go.
    for i in range(1, loop_breaker, 1):
        sleep(1)
        try:
            xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
            sleep(3)
            browser.find_element_by_xpath(xp).click()
        except Exception:
            print(sys._getframe().f_lineno, Exception)
            sleep(10)
            break
        try:
            js = "window.scrollTo(0,document.body.scrollHeight)"
            browser.execute_script(js)
            sleep(1)
        except Exception:
            print(sys._getframe().f_lineno, Exception)
            sleep(10)
        try:
            xp_newpage = '//*[@id="fis_elm_pager__qk_7"]/div/div/span[2]'
            sleep(1)
            print(input_, i)
            browser.find_element_by_xpath(xp_newpage).click()
            write_res_html(browser)
        except Exception:
            print(sys._getframe().f_lineno, Exception)
            sleep(10)
    sleep(2)
    browser.quit()
 
 
# Crawl every (city, district, type, base-name) combination collected above;
# input_ stays module-global because write_res_html() reads it.
for city, districts in target_dic.items():
    for district, types in districts.items():
        for type_, names in types.items():
            for name_reduction in names:
                input_ = ''.join((city, district, name_reduction))
                mobile_mobile_pages_html(input_)

  

 

 

 

 

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from time import sleep
import math
 
# Seed search: query sogou mobile web for "百度地图" (Baidu Maps).
url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-123asd-0007&keyword=百度地图'
# Emulate a mobile device so the mobile versions of the pages are served.
mobile_emulation = {
    "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
    "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
chrome_options = Options()
chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
browser = webdriver.Chrome(chrome_options=chrome_options)

browser.get(url_seed)
# Place to search for inside Baidu Maps.
input_ = '深圳市南山区荟芳园'

# Type the query into the search box and open the Baidu Maps result.
js = '%s%s%s' % ('document.getElementsByClassName("input-default js_input")[0].value="', input_, '"')
browser.execute_script(js)
xp_newpage = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
browser.find_element_by_xpath(xp_newpage).click()
sleep(1)

# Dismiss the bottom banner widget.
xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
browser.find_element_by_xpath(xp).click()

# Read the total result count from the "show all" element (text '全部N条');
# results come 10 per page, so loop_breaker is the page count.
xp = '//*[@id="place-widget-placenewlist-showall"]/span[1]'
browser.find_element_by_xpath(xp)
res_num = browser.find_element_by_xpath(xp).text.split('全部')[1].split('条')[0]
res_num = int(res_num)
page_num = 10
loop_breaker = math.ceil(res_num / page_num)
 
 
def write_res_html(browser, dir_='baidu_map_html/'):
    """Dump the browser's current page source to a timestamped HTML file.

    The current URL plus the module-global ``input_`` search string is
    embedded as a leading HTML comment so the origin of each dump can be
    recovered later.

    :param browser: a live selenium WebDriver instance.
    :param dir_: output directory (must exist; trailing slash expected).

    NOTE(review): depends on the module-global ``input_`` set above.
    """
    current_url_ = '%s%s%s%s' % ('<!--', input_, browser.current_url, '-->')
    page_source = '%s%s' % (current_url_, browser.page_source)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s%s' % (dir_, input_, localtime_, '.html')
    # BUG FIX: the original ended with ``fo.closed`` — a no-op attribute
    # access, not a call — so the handle was never explicitly closed. A
    # context manager guarantees flush + close even on write errors.
    with open(file_name, 'w', encoding='utf-8') as fo:
        fo.write(page_source)
 
 
# Expand the full result list and save the first page.
xp = '//*[@id="place-widget-placenewlist-showall"]'
browser.find_element_by_xpath(xp).click()
write_res_html(browser)

# Scroll to the bottom so the pager becomes reachable, then open page 2.
js = "window.scrollTo(0,document.body.scrollHeight)"
browser.execute_script(js)
sleep(1)
xp_newpage = '//*[@id="fis_elm__7"]/div/div[2]/span[2]'
browser.find_element_by_xpath(xp_newpage).click()
sleep(1)

# Walk the remaining pages, saving each one's HTML.
for i in range(1, loop_breaker, 1):
    sleep(1)
    xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
    browser.find_element_by_xpath(xp).click()
    js = "window.scrollTo(0,document.body.scrollHeight)"
    browser.execute_script(js)
    sleep(1)
    xp_newpage = '//*[@id="fis_elm_pager__qk_7"]/div/div/span[2]'
    browser.find_element_by_xpath(xp_newpage).click()
    write_res_html(browser)

  

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
 
# Minimal proof-of-concept: open sogou mobile search for Baidu Maps with a
# mobile emulation profile, type a hard-coded place query into the embedded
# search box, and click through to the map result.
url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-123asd-0007&keyword=百度地图'
mobile_emulation = {
    "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
    "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
chrome_options = Options()
chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
browser = webdriver.Chrome(chrome_options=chrome_options)

browser.get(url_seed)
js = 'document.getElementsByClassName("input-default js_input")[0].value="深圳市南山区海岸城"'
browser.execute_script(js)
xp = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
browser.find_element_by_xpath(xp).click()

  

ua 

Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6
Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
Opera/9.25 (Windows NT 5.1; U; en)
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)
Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6
Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0

 

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
import os, sys
import time
import logging
import requests
import threading
 
from random import choice
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 
"""
全局约定,便于后期做日志分析
os._exit(INT)
4001 4002 4003 4004
"""
os_sep = os.sep
this_file_abspath, this_file_name = os.path.dirname(os.path.abspath(__file__)), os.path.abspath(__file__).split(os_sep)[
    -1]
base_dir = os.path.dirname(os_sep.join(os.path.abspath(__file__).split(os_sep)[0:-2]))
log_abspath = '%s%s%s' % (base_dir, os_sep, 'log')
 
"""
日志的记录不能依赖于日志类
"""
now_, e = time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), '启动脚本'
logf, s = '%s%s%s%s' % (log_abspath, os_sep, this_file_name, now_), '%s%s%s%s' % (__file__, now_, os.getcwd(), e)
with open(logf, 'a') as fo:
    fo.write(s)
    print(s)
 
try:
    sys.path.append(base_dir)
    from core.utils import MysqlHelper
except Exception as e:
    s = '%s%s%s' % (
        'from core.utils import MysqlHelper EXCEPTION ', time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())),
        e)
    with open(logf, 'a') as fo:
        fo.write(s)
        print(s)
        os._exit(4001)
 
try:
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s[thread:%(thread)d][process:%(process)d]',
                        datefmt='%a, %d %b %Y %H:%M:%S',
                        filename=logf,
                        filemode='a')
except Exception as e:
    s = '%s%s%s' % ('logging.basicConfig EXCEPTION ', time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), e)
    with open(logf, 'a') as fo:
        fo.write(s)
        print(s)
        os._exit(4002)
 
try:
 
    fua, lua = '%s%s%s' % (this_file_abspath, os_sep,
                           'ua_list.txt'), []
    with open(fua, 'r') as fo:
        for i in fo:
            lua.append(i.replace('\n', ''))
except Exception as e:
    s = '%s%s' % ('打开文件 EXCEPTION  ua文件路径: ', fua)
    logging.error(s)
    print(s)
    os._exit(4003)
 
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = choice(lua)
dcap['browserName'], dcap['platform'] = '', ''
 
 
class MyThread(threading.Thread):
    """Thread wrapper that calls ``func(args)`` when started."""

    def __init__(self, func, args, name):
        """Store the callable, its single argument bundle, and a thread name."""
        super().__init__()
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        # Invoked by Thread.start(); the stored args object is passed as one
        # positional argument (not unpacked).
        self.func(self.args)
 
 
# Wall-clock start of the run and the hard runtime budget (4 hours).
ctrl_start, max_script_time = time.time(), 3600 * 4


def ctrl_runtime(exit_type=''):
    """Terminate the process once the runtime budget is exhausted.

    ``exit_type`` selects the termination mechanism: '' -> builtins.exit,
    'sys' -> sys.exit, 'os' -> os._exit(4004) (immediate, no cleanup —
    the only variant that reliably kills the process from a worker thread).
    No-op while within budget.
    """
    if time.time() - ctrl_start >= max_script_time:
        s = '%s%s%s%s%s%s%s%s%s' % (
            '程序开始执行时间', ctrl_start, '执行时间阈值', max_script_time, '终止执行', ' exit_type =', exit_type, ' threadID ',
            threading.get_ident())
        logging.info(s)
        if exit_type == '':
            exit(s)
        elif exit_type == 'sys':
            sys.exit(s)
        elif exit_type == 'os':
            # an integer is required
            # Required argument 'status' (pos 1) not found
            os._exit(4004)
 
 
url_counter = 0
 
 
def main():
    """Check every active order URL for our ad code; on bootstrap failure,
    re-launch this script via ``os.system`` and hard-exit (i.e. restart
    indefinitely on exception).
    """

    try:
        # Collect the direct_order_ids already recorded in test_error so
        # those orders can be skipped in this run.
        mysql_obj = MysqlHelper()
        q = 'SELECT direct_order_id FROM test_error;'
        tuple_l = mysql_obj.select(q)
        pass_id_l = [i[0] for i in tuple_l]
        pass_id_l = [str(i) for i in pass_id_l]
        pass_id_l_s = ','.join(pass_id_l)
        del mysql_obj, tuple_l

        # A currently-active business URL has exactly one row in test_order.
        #
        """
        后期任务:
        test_error积累一定数据后对url重新检测
        #3个功能点:当前半个小时、当前未失效的url test_order内url的异常情况(当前的2个功能点)、(后期任务:test_error积累一定数据后对url重新检测)

        q = 'SELECT  url,id FROM test_order WHERE  unix_timestamp(now()) - create_time<=3600*48 AND id NOT in  ( %s )  ORDER BY id DESC ;' % (
            pass_id_l_s)

        q = 'SELECT url,id FROM test_order WHERE unix_timestamp(now()) < expire_time AND id NOT in  ( %s )  ORDER BY id DESC ;' % (
            pass_id_l_s)

        """

        # Fetch all not-yet-expired URLs that are not already in test_error.
        mysql_obj = MysqlHelper()
        q = 'SELECT url,id FROM test_order WHERE unix_timestamp(now()) < expire_time AND id NOT in  ( %s )  ORDER BY id DESC ;' % (
            pass_id_l_s)
        tuple_l = mysql_obj.select(q)
        del mysql_obj
        if len(tuple_l) == 0:
            # NOTE(review): despite the message, execution continues past
            # this point even with an empty work list.
            s = '无待检测url,程序退出'
            print(s)
            logging.info(s)
    except Exception as e:
        # On any DB/bootstrap failure: log, spawn a fresh copy of this
        # script, then hard-exit the current process with code 1024.
        s = '%s%s%s' % ('初始数据,查询数据库异常,无限次重启该脚本', e, time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())))
        print(s)
        logging.warning(s)
        cmd = 'python %s' % (__file__)
        os.system(cmd)
        os._exit(1024)

    # The script runs roughly hourly; per-URL failure handling: if the first
    # request looks OK stop immediately, otherwise retry up to repeat_times
    # attempts with repeat_sleep_times seconds between them.
    sleep_counter, sleep_step, sleep_seconds, mycode_l, repeat_times, repeat_sleep_times = 0, 20, 1, [
        'g3user.com', '51g3.com.cn'], 4, 10

    # TODO: refactor into a base-class helper taking a WHERE list;
    # currently shaped around the need for the f_l field list.
    def get_onerow(url, f_l=['title', 'uid', 'money_total'], tab='test_order'):
        # Fetch one row of the requested fields for this url; returns the
        # row tuple, or -1 on any DB error.
        t = -1
        try:
            mysql_obj = MysqlHelper()
            f_s = ','.join(f_l)
            q = 'SELECT %s FROM %s WHERE url="%s" ORDER BY id DESC LIMIT 1' % (f_s, tab, url)
            s = '%s%s' % (' DB ', q)
            logging.info(s)
            t = mysql_obj.select(q)
            if t != -1:
                t = t[0]
            del mysql_obj
        except Exception as e:
            s = '%s%s' % (' DB ', e)
            logging.info(s)
            return t
        return t

    def chk_exception_url(url, sleep_seconds=0, http_tag='http://'):
        # Probe one url: first via requests; if our ad code is absent from
        # the raw HTML, re-check with PhantomJS to allow for JS rendering.
        time.sleep(sleep_seconds)
        global url_counter

        ret = {}
        # DB url status values: 0 = unreachable, 1 = opens but no ad, 2 = handled.
        ret['ok'], ret['status_code'], s = -1, -1, '%s%s%s%s' % (
            time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())), ' threadID ', threading.get_ident(), url)
        try:
            if url.find('http') == -1:
                url = '%s%s' % (http_tag, url)
            r = requests.get(url)
            ret['status_code'], txt_pos = int(r.status_code), -1
            s = '%s,%s,%s,%s,%s' % (s, ret['status_code'], url, r, r.reason)
        except Exception as e:
            ret['ok'] = 0
            s = '%s %s %s' % (s, ' SPIDER ', e)
            logging.error(s)
            print(e, url)

        # For now only HTTP 200 from the target site is treated as "reachable".
        if ret['status_code'] == 200:
            for ii in mycode_l:
                if r.text.find(ii) > -1:
                    ret['ok'], txt_pos = 1, 1
                    break
            if txt_pos == -1:
                try:
                    driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                                 executable_path='/usr/local/phantomjs/bin/phantomjs')
                    driver.get(url)
                    time.sleep(1)
                    page_source = driver.page_source
                    driver.quit()
                    for ii in mycode_l:
                        if page_source.find(ii) > -1:
                            ret['ok'] = 1
                            break
                    if ret['ok'] == -1:
                        s = '%s%s' % (s, '返回200,但是在html中未检测到我公司代码。')
                        ret['ok'], ret['info'] = 0, s
                except Exception as e:
                    s = '%s %s %s' % (s, ' SPIDER ', e)
                    logging.error(s)
                    print(e, url)

        # elif ret['status_code'] == 403:
        # www.hsdcw.com/fenlei/41668214.html
        elif ret['status_code'] == 403:
            pass
        else:
            ret['ok'], ret['info'] = 0, s

        url_counter += 1
        s = '%s/%s%s%s' % (url_counter, len(tuple_l), 'chk-ret', s)
        print(s)
        if ret['ok'] == 0:
            logging.warning(s)
        else:
            logging.info(s)
        return ret

    # tn = total URLs, tl = thread list, tstep = URLs handled per thread.
    tn, tl, tstep = len(tuple_l), [], 4000

    def tf(ts):
        # Worker: process the slice tuple_l[ts:ts+tstep].

        te = ts + tstep
        te = min(te, tn)
        for i in tuple_l[ts:te]:
            ctrl_runtime(exit_type='os')
            url, chk_id = i
            s = '%s%s%s%s' % (
                time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())), ' threadID ', threading.get_ident(), url)
            if chk_id in pass_id_l:
                s = '%s%s' % (s, ' 跳过,之前test_error已写入该url ')
                logging.info(s)
                print(s)
            """
          针对新浪爱问的规则:  不检测
          """
            if url.find('iask.sina.com') > -1:
                continue
            write_db_flag = 1
            for t in range(0, repeat_times, 1):
                ret = chk_exception_url(url, repeat_sleep_times)
                if ret['ok'] == 1:
                    write_db_flag = 0
                    break

            if write_db_flag == 1:
                try:
                    title, uid, money_total = get_onerow(url)
                except Exception as e:
                    s = '%s%s%s' % (s, ' DB Exception-去test_order查', e)
                    logging.info(s)
                    print(s)
                    break

                # Threading note: because of the underlying package's DB
                # limits, a fresh DB helper is instantiated per use and
                # deleted afterwards.
                try:
                    # Could be wrapped into the class constructor.
                    mysql_obj = MysqlHelper()
                except Exception as e:
                    s = '%s%s%s' % (s, ' DB Exception- ', e)
                    logging.error(s)
                    print(s)
                    break

                """
                多进程、线程并发
                待优化,比如队列
              """
                q = 'SELECT id FROM test_error WHERE url="%s" LIMIT 1' % (url)
                try:
                    r = mysql_obj.select(q)
                    s = '%s%s%s' % (s, ' -SQL- ', q)
                    logging.info(s)
                    print(q)
                except Exception as e:
                    s = '%s %s %s %s' % (s, ' DB Exception-', q, e)
                    logging.info(s)
                    print(s)
                    break

                ctime = int(time.time())
                # NOTE: this table's design deserves revisiting.
                db_status = 1 if ret['status_code'] == 200 else 0
                if len(r) == 0:
                    q = 'INSERT INTO test_error (title,url,status,remarks,update_time,create_time,uid,money,direct_order_id) VALUES ("%s","%s","%s","%s","%s","%s","%s","%s","%s")' % (
                        title, url, db_status, ret['info'], ctime, ctime, uid, money_total, chk_id)
                    try:
                        mysql_obj.execute(q)
                        mysql_obj.commit()
                        del mysql_obj
                        s = '%s%s%s' % (s, ' DB SQL ok ', q)
                        logging.info(s)
                        print(s)
                    except Exception as e:
                        s = '%s%s%s%s' % (s, ' DB Exception- ', q, e)
                        logging.error(s)
                        print(s)

                elif len(r) == 1:
                    continue

    # Partition the work list into tstep-sized slices, one thread per slice.
    for i in range(0, tn, tstep):
        if i >= tn:
            break
        thread_instance = MyThread(tf, (i), tf.__name__)
        tl.append(thread_instance)

    for t in tl:
        t.setDaemon = False
        t.start()
    for t in tl:
        t.join()
 
 
# Script entry point: kick off the URL health-check run.
if __name__ == '__main__':
    main()

  

 

 

 

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.firefox.options import Options
import time
from time import sleep
import math
import random
import sys
import threading
from random import choice
# import urllib.parse
from bs4 import BeautifulSoup

# BUG FIX: the pasted original lost all indentation inside the ``with``
# block, which is a SyntaxError; structure restored.
# Load mobile user-agent strings; keep only lines that look like real UAs.
ua_list = []
with open('mobile_ua.txt', 'r', encoding='utf-8') as uafile:
    for i in uafile:
        if i.find('Mozilla') > -1:
            ua_list.append(i.replace('\n', '').strip())

# Highest valid index into ua_list (used with random.randint later).
ua_list_len_ = len(ua_list) - 1


def close_alert(browser, attitude='accept'):
    """Deliberate no-op stub kept for interface compatibility.

    BUG FIX: the pasted original lost the function-body indentation
    (SyntaxError); structure restored. The commented-out attempts below
    tried to neutralize window.alert via injected JS.

    :param browser: a selenium WebDriver instance (unused).
    :param attitude: kept for signature compatibility (unused).
    :return: None.
    """
    # js='alert(window.alert=function(str){return;}'
    # browser.execute_script(js)

    # js= 'window.alert = function(str){return ;}'
    # browser.execute_script(js)
    return


# Earlier mobile-emulation setup kept for reference:
# mobile_emulation = {
# "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
# "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
ua_list_index = random.randint(0, ua_list_len_)  # NOTE(review): computed but never used below
# mobile_emulation = {
# "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0}}
#
# mobile_emulation['userAgent'] = choice(ua_list)
# chrome_options = Options()
# chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
# browser = webdriver.Chrome(chrome_options=chrome_options)
browser = webdriver.Chrome()
# Query Baidu mobile search for the seed word and scrape the related-search
# ("rw-item") suggestion links from the result page.
s_wd = '长尾'
url_seed = 'https://m.baidu.com/s?word=s_wd'

url_seed = url_seed.replace('s_wd', s_wd)
print(url_seed)
browser.get(url_seed)

# Each suggestion anchor's text contents and href, as a list of dicts.
rd = BeautifulSoup(browser.page_source, 'html.parser').find_all('a', class_='rw-item')
res_d_l = [{'contents': d.contents, 'href': d.attrs['href']} for d in rd]
browser.quit()
d = 3  # NOTE(review): leftover debugging assignment

 

posted @   papering  阅读(936)  评论(0编辑  收藏  举报
点击右上角即可分享
微信分享提示