spider_action
spider: from mobile search to mobile result pages
# Mobile-emulation spider: for each target place name parsed from the JMTool
# task CSV, search "Baidu Map" through Sogou mobile search and save every
# result page as a timestamped HTML file.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.firefox.options import Options
import time
from time import sleep
import math
import random
import sys

# Separators used to strip decorations/suffixes from place names.
tag_jmtool_list = ['(', '(', '-']

# Pool of mobile user-agent strings, one per line in mobile_ua.txt.
ua_list = []
with open('mobile_ua.txt', 'r', encoding='utf-8') as uafile:
    for i in uafile:
        if i.find('Mozilla') > -1:
            ua_list.append(i.replace('\n', '').strip())
ua_list_len_ = len(ua_list) - 1  # highest valid index for random.randint


def extract_name(name_):
    """Return name_ truncated at the first occurrence of any separator in
    tag_jmtool_list (keeps only the leading, canonical part of the name)."""
    for sep in tag_jmtool_list:
        name_ = name_.split(sep)[0]
    return name_


target_type_list = ['住宅小区', '写字楼']
target_type_list = ['住宅小区']  # NOTE: deliberately narrowed to one type

# target_dic[city][district][type_][name_reduction] -> {'name_reduction_list': [...],
#                                                       'history_list': [...]}
target_dic = {}
with open('JMTool0819am/任务JMTool.csv', 'r', encoding='utf-8') as csvfile:
    for i in csvfile:
        # NOTE(review): the quote/space stripping below was reconstructed from
        # garbled source text — verify against the actual CSV format.
        l = i.replace(' ', '').replace('\n', '').split('";"')
        if l[0].replace('"', '') in target_type_list:
            type_, city, district, addr, name_ = l
            type_, name_ = type_.replace('"', ''), name_.replace('"', '')
            name_reduction = extract_name(name_)
            if city not in target_dic:
                target_dic[city] = {}
            if district not in target_dic[city]:
                target_dic[city][district] = {}
            if type_ not in target_dic[city][district]:
                target_dic[city][district][type_] = {}
            # BUG FIX: the membership test previously checked
            # target_dic[city][district] instead of ...[type_], so the lists
            # were re-created (and earlier rows lost) on every iteration.
            if name_reduction not in target_dic[city][district][type_]:
                target_dic[city][district][type_][name_reduction] = {}
                target_dic[city][district][type_][name_reduction]['name_reduction_list'] = []
                target_dic[city][district][type_][name_reduction]['history_list'] = []
            target_dic[city][district][type_][name_reduction]['name_reduction_list'].append(name_)
            target_dic[city][district][type_][name_reduction]['history_list'].append(l)


def write_res_html(browser, dir_='baidu_map_html/'):
    """Dump browser.page_source — prefixed with an HTML comment holding the
    query string and the current URL — to a timestamped file under dir_.

    Relies on the module-global ``input_`` set by the driver loop below.
    """
    current_url_ = '%s%s%s%s' % ('<!--', input_, browser.current_url, '-->')
    page_source = '%s%s' % (current_url_, browser.page_source)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s%s' % (dir_, input_, localtime_, '.html')
    # BUG FIX: was `fo.closed` — an attribute read, not a call — so the file
    # handle leaked; a context manager always closes it.
    with open(file_name, 'w', encoding='utf-8') as fo:
        fo.write(page_source)


def gen_random_letter():
    """One random lowercase ASCII letter ('a'..'z')."""
    return chr(random.randint(97, 122))


def gen_random_num():
    """Random int 0..10 inclusive (NOTE: 11 possible values — kept as-is)."""
    return random.randint(0, 10)


def gen_sougo_pid():
    """Build a 16-char pseudo pid: letters at positions 1, 3, 4, 15 and
    digits everywhere else."""
    res_ = ''
    for i in range(1, 17, 1):
        if i in [1, 3, 4, 15]:
            res_ = '%s%s' % (res_, gen_random_letter())
        else:
            res_ = '%s%s' % (res_, gen_random_num())
    return res_


def close_alert(browser, attitude='accept'):
    """Accept or dismiss a JS alert if one is open; swallow the error if not."""
    try:
        sleep(2)
        # BUG FIX: switch_to.alert is a property, not a callable — calling it
        # raised TypeError, so the alert was never actually handled.
        al = browser.switch_to.alert
        sleep(1)
        if attitude == 'accept':
            al.accept()
        elif attitude == 'dismiss':
            al.dismiss()
        print(sys._getframe().f_lineno, 'alert-closed-ok')
    except Exception:
        print(sys._getframe().f_lineno, Exception, 'no-alert')


# input_ = '深圳市南山区荟芳园'
def mobile_mobile_pages_html(input_):
    """Search Baidu Map via Sogou mobile for input_ and save every result
    page as HTML. The browser is quit on every exit path."""
    ua_list_index = random.randint(0, ua_list_len_)
    mobile_emulation = {
        "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0}}
    mobile_emulation['userAgent'] = ua_list[ua_list_index]
    chrome_options = Options()
    chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
    browser = webdriver.Chrome(chrome_options=chrome_options)
    url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-%s-0007&keyword=百度地图' % (gen_sougo_pid())
    print(url_seed)
    browser.get(url_seed)
    # Fill the search box via JS, then click through to the map vertical.
    js = '%s%s%s' % ('document.getElementsByClassName("input-default js_input")[0].value="', input_, '"')
    browser.execute_script(js)
    xp_newpage = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
    browser.find_element_by_xpath(xp_newpage).click()
    sleep(2)
    close_alert(browser)
    try:
        xp = '//*[@id="place-widget-placenewlist-showall"]/span[1]'
        sleep(2)
        close_alert(browser)
        browser.find_element_by_xpath(xp)
    except Exception:
        print(sys._getframe().f_lineno, Exception)
        # BUG FIX (resource leak): early returns previously abandoned the
        # Chrome process; quit before giving up on this query.
        browser.quit()
        return
    close_alert(browser)
    if browser.find_element_by_xpath(xp).text.find('全部') == -1:
        browser.quit()  # BUG FIX: same leak as above
        return
    # "全部N条" -> N = total result count; 10 results per page.
    res_num = browser.find_element_by_xpath(xp).text.split('全部')[1].split('条')[0]
    res_num = int(res_num)
    page_num = 10
    loop_breaker = math.ceil(res_num / page_num)
    close_alert(browser)
    if res_num <= page_num:
        # Single page of results: save it and stop.
        write_res_html(browser)
        browser.quit()
        return
    close_alert(browser)
    xp = '//*[@id="place-widget-placenewlist-showall"]'
    browser.find_element_by_xpath(xp).click()
    write_res_html(browser)
    close_alert(browser)
    js = "window.scrollTo(0,document.body.scrollHeight)"
    browser.execute_script(js)
    sleep(1)
    try:
        xp_newpage = '//*[@id="fis_elm__7"]/div/div[2]/span[2]'
        browser.find_element_by_xpath(xp_newpage).click()
        sleep(1)
    except Exception:
        print(sys._getframe().f_lineno, Exception)
        write_res_html(browser)
        browser.quit()
        return
    # Page through the remaining result pages, saving each one.
    for i in range(1, loop_breaker, 1):
        sleep(1)
        try:
            xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
            sleep(3)
            browser.find_element_by_xpath(xp).click()
        except Exception:
            print(sys._getframe().f_lineno, Exception)
            sleep(10)
            break
        try:
            js = "window.scrollTo(0,document.body.scrollHeight)"
            browser.execute_script(js)
            sleep(1)
        except Exception:
            print(sys._getframe().f_lineno, Exception)
            sleep(10)
        try:
            xp_newpage = '//*[@id="fis_elm_pager__qk_7"]/div/div/span[2]'
            sleep(1)
            print(input_, i)
            browser.find_element_by_xpath(xp_newpage).click()
            write_res_html(browser)
        except Exception:
            print(sys._getframe().f_lineno, Exception)
            sleep(10)
    sleep(2)
    browser.quit()


# Driver loop: one search per (city, district, reduced-name) combination.
for city in target_dic:
    for district in target_dic[city]:
        for type_ in target_dic[city][district]:
            for name_reduction in target_dic[city][district][type_]:
                input_ = '%s%s%s' % (city, district, name_reduction)
                mobile_mobile_pages_html(input_)
# Earlier single-query version: search one hard-coded place via Sogou mobile
# search, then page through the Baidu Map results, saving each page as HTML.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from time import sleep
import math

url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-123asd-0007&keyword=百度地图'
mobile_emulation = {
    "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
    "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
chrome_options = Options()
chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
browser = webdriver.Chrome(chrome_options=chrome_options)
browser.get(url_seed)
input_ = '深圳市南山区荟芳园'
# Fill the search box via injected JS, then click the Baidu Map vertical link.
js = '%s%s%s' % ('document.getElementsByClassName("input-default js_input")[0].value="', input_, '"')
browser.execute_script(js)
xp_newpage = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
browser.find_element_by_xpath(xp_newpage).click()
sleep(1)
xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
browser.find_element_by_xpath(xp).click()
xp = '//*[@id="place-widget-placenewlist-showall"]/span[1]'
browser.find_element_by_xpath(xp)
# "全部N条" -> N = total result count; 10 results per page.
res_num = browser.find_element_by_xpath(xp).text.split('全部')[1].split('条')[0]
res_num = int(res_num)
page_num = 10
loop_breaker = math.ceil(res_num / page_num)


def write_res_html(browser, dir_='baidu_map_html/'):
    """Save the current page source — prefixed with an HTML comment holding
    the query and URL — to a timestamped file under dir_.

    Relies on the module-global ``input_`` defined above.
    """
    current_url_ = '%s%s%s%s' % ('<!--', input_, browser.current_url, '-->')
    page_source = '%s%s' % (current_url_, browser.page_source)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s%s' % (dir_, input_, localtime_, '.html')
    # BUG FIX: was `fo.closed` (a no-op attribute read) — the handle leaked;
    # a context manager always closes it.
    with open(file_name, 'w', encoding='utf-8') as fo:
        fo.write(page_source)


xp = '//*[@id="place-widget-placenewlist-showall"]'
browser.find_element_by_xpath(xp).click()
write_res_html(browser)
js = "window.scrollTo(0,document.body.scrollHeight)"
browser.execute_script(js)
sleep(1)
xp_newpage = '//*[@id="fis_elm__7"]/div/div[2]/span[2]'
browser.find_element_by_xpath(xp_newpage).click()
sleep(1)
# Page through the remaining result pages, saving each one.
for i in range(1, loop_breaker, 1):
    sleep(1)
    xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
    browser.find_element_by_xpath(xp).click()
    js = "window.scrollTo(0,document.body.scrollHeight)"
    browser.execute_script(js)
    sleep(1)
    xp_newpage = '//*[@id="fis_elm_pager__qk_7"]/div/div/span[2]'
    browser.find_element_by_xpath(xp_newpage).click()
    write_res_html(browser)
# Minimal proof-of-concept: open Sogou mobile search under Chrome mobile
# emulation, fill the search box via injected JS, and click through to the
# Baidu Map vertical result.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-123asd-0007&keyword=百度地图'
emulation_cfg = {
    "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
    "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
opts = Options()
opts.add_experimental_option("mobileEmulation", emulation_cfg)
browser = webdriver.Chrome(chrome_options=opts)
browser.get(url_seed)
# Type the query into the search box, then follow the map result link.
js = 'document.getElementsByClassName("input-default js_input")[0].value="深圳市南山区海岸城"'
browser.execute_script(js)
xp = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
browser.find_element_by_xpath(xp).click()
ua
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6
Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
Opera/9.25 (Windows NT 5.1; U; en)
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)
Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6
Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0
import os, sys
import time
import logging
import requests
import threading
from random import choice
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

"""
Global convention (kept for later log analysis): the script aborts with
os._exit(INT) using these codes:
4001 - project import (core.utils.MysqlHelper) failed
4002 - logging setup failed
4003 - ua_list.txt missing or unreadable
4004 - runtime limit reached
"""

os_sep = os.sep
this_file_abspath, this_file_name = os.path.dirname(os.path.abspath(__file__)), \
    os.path.abspath(__file__).split(os_sep)[-1]
base_dir = os.path.dirname(os_sep.join(os.path.abspath(__file__).split(os_sep)[0:-2]))
log_abspath = '%s%s%s' % (base_dir, os_sep, 'log')

# Bootstrap logging must not depend on the logging module being configured.
now_, e = time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), '启动脚本'
logf, s = '%s%s%s%s' % (log_abspath, os_sep, this_file_name, now_), \
    '%s%s%s%s' % (__file__, now_, os.getcwd(), e)
with open(logf, 'a') as fo:
    fo.write(s)
print(s)

try:
    sys.path.append(base_dir)
    from core.utils import MysqlHelper
except Exception as e:
    s = '%s%s%s' % ('from core.utils import MysqlHelper EXCEPTION ',
                    time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), e)
    with open(logf, 'a') as fo:
        fo.write(s)
    print(s)
    os._exit(4001)

try:
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s[thread:%(thread)d][process:%(process)d]',
        datefmt='%a, %d %b %Y %H:%M:%S',
        filename=logf,
        filemode='a')
except Exception as e:
    s = '%s%s%s' % ('logging.basicConfig EXCEPTION ',
                    time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), e)
    with open(logf, 'a') as fo:
        fo.write(s)
    print(s)
    os._exit(4002)

# Load the PhantomJS user-agent pool; abort if the file is unusable.
try:
    fua, lua = '%s%s%s' % (this_file_abspath, os_sep, 'ua_list.txt'), []
    with open(fua, 'r') as fo:
        for i in fo:
            lua.append(i.replace('\n', ''))
except Exception as e:
    s = '%s%s' % ('打开文件 EXCEPTION ua文件路径: ', fua)
    logging.error(s)
    print(s)
    os._exit(4003)

dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = choice(lua)
# NOTE(review): source text was garbled here — reconstructed as empty strings;
# confirm the intended values.
dcap['browserName'], dcap['platform'] = '', ''


class MyThread(threading.Thread):
    """Thread that calls func(args) exactly once; name is informational."""

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func, self.args, self.name = func, args, name

    def run(self):
        self.func(self.args)


ctrl_start, max_script_time = time.time(), 3600 * 4


def ctrl_runtime(exit_type=''):
    """Abort once the script has run longer than max_script_time seconds.

    exit_type selects the abort mechanism: '' -> exit(), 'sys' -> sys.exit(),
    'os' -> os._exit(4004) (os._exit takes an int status, hence the code).
    """
    if time.time() - ctrl_start >= max_script_time:
        s = '%s%s%s%s%s%s%s%s%s' % ('程序开始执行时间', ctrl_start, '执行时间阈值',
                                    max_script_time, '终止执行', ' exit_type =',
                                    exit_type, ' threadID ', threading.get_ident())
        logging.info(s)
        if exit_type == '':
            exit(s)
        elif exit_type == 'sys':
            sys.exit(s)
        elif exit_type == 'os':
            os._exit(4004)


url_counter = 0


def main():
    """Fetch unexpired, not-yet-failed urls from the DB, check each for our
    ad code (requests first, PhantomJS fallback), and record failures in
    test_error. On a DB error during initialisation the script re-launches
    itself and exits.
    """
    try:
        mysql_obj = MysqlHelper()
        q = 'SELECT direct_order_id FROM test_error;'
        tuple_l = mysql_obj.select(q)
        pass_id_l = [i[0] for i in tuple_l]
        pass_id_l = [str(i) for i in pass_id_l]
        pass_id_l_s = ','.join(pass_id_l)
        del mysql_obj, tuple_l
        # Business rule: a currently-live url has exactly one row in test_order.
        # (Later task: re-check urls already accumulated in test_error.)
        q = 'SELECT url,id FROM test_order WHERE unix_timestamp(now()) - create_time<=3600*48 AND id NOT in ( %s ) ORDER BY id DESC ;' % (
            pass_id_l_s)
        q = 'SELECT url,id FROM test_order WHERE unix_timestamp(now()) < expire_time AND id NOT in ( %s ) ORDER BY id DESC ;' % (
            pass_id_l_s)
        mysql_obj = MysqlHelper()
        tuple_l = mysql_obj.select(q)
        del mysql_obj
        if len(tuple_l) == 0:
            s = '无待检测url,程序退出'
            print(s)
            logging.info(s)
            # BUG FIX: the message says "program exits" but execution fell
            # through; return so an empty batch does not spawn threads.
            return
    except Exception as e:
        s = '%s%s%s' % ('初始数据,查询数据库异常,无限次重启该脚本', e,
                        time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())))
        print(s)
        logging.warning(s)
        cmd = 'python %s' % (__file__)
        os.system(cmd)
        os._exit(1024)

    # Per-url policy (script runs roughly hourly): stop on the first request
    # that matches expectations; otherwise retry up to repeat_times with
    # repeat_sleep_times seconds between attempts.
    sleep_counter, sleep_step, sleep_seconds, mycode_l, repeat_times, repeat_sleep_times = \
        0, 20, 1, ['g3user.com', '51g3.com.cn'], 4, 10

    # TODO: fold into a DB base class with a generic field-list WHERE helper.
    def get_onerow(url, f_l=['title', 'uid', 'money_total'], tab='test_order'):
        """Return the newest (f_l) row for url from tab, or -1 on DB error.

        NOTE(review): the mutable default f_l is never mutated here, kept as-is.
        """
        t = -1
        try:
            mysql_obj = MysqlHelper()
            f_s = ','.join(f_l)
            q = 'SELECT %s FROM %s WHERE url="%s" ORDER BY id DESC LIMIT 1' % (f_s, tab, url)
            s = '%s%s' % (' DB ', q)
            logging.info(s)
            t = mysql_obj.select(q)
            if t != -1:
                t = t[0]
            del mysql_obj
        except Exception as e:
            s = '%s%s' % (' DB ', e)
            logging.info(s)
            return t
        return t

    def chk_exception_url(url, sleep_seconds=0, http_tag='http://'):
        """Check whether url still serves our ad code.

        Returns a dict ret:
          ok          : 1 code found / 0 missing or error / -1 undetermined
          status_code : HTTP status (or -1 if the request itself failed)
          info        : diagnostic string (set on failure paths only)
        """
        time.sleep(sleep_seconds)
        global url_counter
        ret = {}
        # DB url status values: 0 unreachable, 1 opens but no ad, 2 handled.
        ret['ok'], ret['status_code'], s = -1, -1, '%s%s%s%s' % (
            time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())),
            ' threadID ', threading.get_ident(), url)
        try:
            if url.find('http') == -1:
                url = '%s%s' % (http_tag, url)
            r = requests.get(url)
            ret['status_code'], txt_pos = int(r.status_code), -1
            s = '%s,%s,%s,%s,%s' % (s, ret['status_code'], url, r, r.reason)
        except Exception as e:
            ret['ok'] = 0
            s = '%s %s %s' % (s, ' SPIDER ', e)
            logging.error(s)
            print(e, url)
        # Currently only a 200 from the target site is inspected further.
        if ret['status_code'] == 200:
            for ii in mycode_l:
                if r.text.find(ii) > -1:
                    ret['ok'], txt_pos = 1, 1
                    break
            if txt_pos == -1:
                # Static HTML missed our code — render with PhantomJS in case
                # the snippet is injected by JavaScript.
                try:
                    driver = webdriver.PhantomJS(
                        desired_capabilities=dcap,
                        executable_path='/usr/local/phantomjs/bin/phantomjs')
                    driver.get(url)
                    time.sleep(1)
                    page_source = driver.page_source
                    driver.quit()
                    for ii in mycode_l:
                        if page_source.find(ii) > -1:
                            ret['ok'] = 1
                            break
                    if ret['ok'] == -1:
                        s = '%s%s' % (s, '返回200,但是在html中未检测到我公司代码。')
                        ret['ok'], ret['info'] = 0, s
                except Exception as e:
                    s = '%s %s %s' % (s, ' SPIDER ', e)
                    logging.error(s)
                    print(e, url)
        elif ret['status_code'] == 403:
            # e.g. www.hsdcw.com/fenlei/41668214.html blocks bots — skip.
            pass
        else:
            ret['ok'], ret['info'] = 0, s
        url_counter += 1
        s = '%s/%s%s%s' % (url_counter, len(tuple_l), 'chk-ret', s)
        print(s)
        if ret['ok'] == 0:
            logging.warning(s)
        else:
            logging.info(s)
        return ret

    tn, tl, tstep = len(tuple_l), [], 4000

    def tf(ts):
        """Worker: check every url in tuple_l[ts:ts+tstep]."""
        te = ts + tstep
        te = min(te, tn)
        for i in tuple_l[ts:te]:
            ctrl_runtime(exit_type='os')
            url, chk_id = i
            s = '%s%s%s%s' % (time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())),
                              ' threadID ', threading.get_ident(), url)
            # BUG FIX: chk_id is an int while pass_id_l holds strings, so the
            # membership test never matched; compare as str and actually skip
            # (the original logged "skip" but fell through).
            if str(chk_id) in pass_id_l:
                s = '%s%s' % (s, ' 跳过,之前test_error已写入该url ')
                logging.info(s)
                print(s)
                continue
            # Rule for Sina iask pages: never check.
            if url.find('iask.sina.com') > -1:
                continue
            write_db_flag = 1
            for t in range(0, repeat_times, 1):
                ret = chk_exception_url(url, repeat_sleep_times)
                if ret['ok'] == 1:
                    write_db_flag = 0
                    break
            if write_db_flag == 1:
                try:
                    title, uid, money_total = get_onerow(url)
                except Exception as e:
                    s = '%s%s%s' % (s, ' DB Exception-去test_order查', e)
                    logging.info(s)
                    print(s)
                    break
                # One MysqlHelper per use — the wrapped connection is not
                # shared safely across threads.
                try:
                    mysql_obj = MysqlHelper()
                except Exception as e:
                    s = '%s%s%s' % (s, ' DB Exception- ', e)
                    logging.error(s)
                    print(s)
                    break
                # TODO: move concurrency to a queue. SECURITY: the SQL below is
                # string-built (injection-prone) — parameterize when the DB
                # helper supports it.
                q = 'SELECT id FROM test_error WHERE url="%s" LIMIT 1' % (url)
                try:
                    r = mysql_obj.select(q)
                    s = '%s%s%s' % (s, ' -SQL- ', q)
                    logging.info(s)
                    print(q)
                except Exception as e:
                    s = '%s %s %s %s' % (s, ' DB Exception-', q, e)
                    logging.info(s)
                    print(s)
                    break
                ctime = int(time.time())
                # TODO: revisit this status encoding in the schema.
                db_status = 1 if ret['status_code'] == 200 else 0
                if len(r) == 0:
                    # BUG FIX: ret['info'] is unset on some failure paths
                    # (e.g. 403) and raised KeyError, killing the worker.
                    q = 'INSERT INTO test_error (title,url,status,remarks,update_time,create_time,uid,money,direct_order_id) VALUES ("%s","%s","%s","%s","%s","%s","%s","%s","%s")' % (
                        title, url, db_status, ret.get('info', ''), ctime, ctime, uid, money_total, chk_id)
                    try:
                        mysql_obj.execute(q)
                        mysql_obj.commit()
                        del mysql_obj
                        s = '%s%s%s' % (s, ' DB SQL ok ', q)
                        logging.info(s)
                        print(s)
                    except Exception as e:
                        s = '%s%s%s%s' % (s, ' DB Exception- ', q, e)
                        logging.error(s)
                        print(s)
                elif len(r) == 1:
                    continue

    # Fan out one thread per tstep-sized slice of tuple_l.
    for i in range(0, tn, tstep):
        if i >= tn:
            break
        thread_instance = MyThread(tf, (i), tf.__name__)
        tl.append(thread_instance)
    for t in tl:
        # BUG FIX: `t.setDaemon = False` overwrote the method with a bool and
        # never set the flag; assign the daemon attribute instead.
        t.daemon = False
        t.start()
    for t in tl:
        t.join()


if __name__ == '__main__':
    main()
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.firefox.options import Options
import time
from time import sleep
import math
import random
import sys
import threading
from random import choice
# import urllib.parse
from bs4 import BeautifulSoup
# Load candidate mobile user-agent strings: keep only lines mentioning
# "Mozilla", with the trailing newline and surrounding whitespace stripped.
with open('mobile_ua.txt', 'r', encoding='utf-8') as uafile:
    ua_list = [line.replace('\n', '').strip()
               for line in uafile
               if line.find('Mozilla') > -1]
ua_list_len_ = len(ua_list) - 1  # highest valid index for random.randint
def close_alert(browser, attitude='accept'):
    """No-op placeholder kept for interface compatibility.

    Earlier revisions tried to neutralise window.alert with injected JS
    (commented attempts below); this version intentionally does nothing.
    """
    # js='alert(window.alert=function(str){return;}'
    # browser.execute_script(js)
    # js= 'window.alert = function(str){return ;}'
    # browser.execute_script(js)
    return None
# Previous approach (kept for reference): Chrome mobile emulation with a
# fixed Android user agent.
# mobile_emulation = {
# "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
# "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
ua_list_index = random.randint(0, ua_list_len_)  # NOTE(review): computed but unused below
# Alternative (disabled): mobile emulation with a random UA from ua_list.
# mobile_emulation = {
# "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0}}
#
# mobile_emulation['userAgent'] = choice(ua_list)
# chrome_options = Options()
# chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
# browser = webdriver.Chrome(chrome_options=chrome_options)
browser = webdriver.Chrome()  # plain desktop Chrome, no emulation
# Seed query substituted into the Baidu mobile search URL template.
s_wd = '长尾'
url_seed = 'https://m.baidu.com/s?word=s_wd'
url_seed = url_seed.replace('s_wd', s_wd)
print(url_seed)
browser.get(url_seed)
# Collect Baidu's related-search suggestion anchors (class "rw-item").
rd = BeautifulSoup(browser.page_source, 'html.parser').find_all('a', class_='rw-item')
# One dict per suggestion: the anchor's child nodes and its target href.
res_d_l = [{'contents': d.contents, 'href': d.attrs['href']} for d in rd]
browser.quit()
d = 3  # NOTE(review): leftover debug assignment; purpose unclear — confirm before removing
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步