"""Beginner Selenium test, originally written to scrape Baidu search results.

In its current form the script:

1. starts a PhantomJS browser that reports a mobile Chrome user agent,
2. opens the ``dir.html`` page on ``app1.nmpa.gov.cn`` and prints its HTML,
3. replays the cookies stored in ``cookies.txt`` into the browser session,
4. opens the ``search.jsp`` page and prints its HTML,
5. prints the text of every ``<h3 class="t"><a ...>...</a>`` link in it.

NOTE(review): the ``<h3 class="t">`` pattern looks like a leftover from the
Baidu version of this script -- confirm it is still the markup to look for.
"""
import json
import re
import sys
from typing import Dict, List

from selenium import webdriver
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as expected
from selenium.webdriver.support.wait import WebDriverWait

# User agent reported by the PhantomJS session.
MOBILE_USER_AGENT = (
    "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36"
)

# First page visited; it must be loaded before cookies can be added, because
# WebDriver only accepts cookies for the domain of the current document.
DIRECTORY_URL = 'http://app1.nmpa.gov.cn/data_nmpa/face3/dir.html?type=yp'

# Page whose HTML is searched for result titles.
# NOTE(review): the query string looks like a captured, session-specific
# token -- confirm whether it has to be refreshed for each run.
SEARCH_URL = 'http://app1.nmpa.gov.cn/data_nmpa/face3/search.jsp?6SQk6G2z=GBK-5RWnNqwnGCEoyB6.X6qnkNUjJ44QBnuyOPTxHTrYkEVvJ_zldCQbi6OTK9gkK9QsBjidwgOSqy8a.aQYrg5SizKfHWyPoUF_u4uGfeAMDaoMNmkHbMzfgDMwYcj3fFjXQoiewH_.zQW53CWqKVDHO27YoNfVLGVqanx73YBQK_MGhAGWgCM1PFK7Fz0LvBQe6QURlVuhdpVNmN7wR4MUcec6UwQW4eAq4K5dIQY9Hj76NcKe5yxyb9GJqCDZ70c.D5fLtmNvOyIKSW08REmwXuR_xWJpSqLa9.sZFs3DpZ8913WU1ccwv.a1aNtJDeMQ14S8R.JOOSj2P5zhjENRj43LqrbMZIzs53f4S_mZbLV7&c1SoYK0a=GBK-4fzZ4ejgwRW3SCbDGETEb9bW8e_EQpv8bHkTV0LSyoMbKIL7lpMe7MKFCg_vcVd1P5rVJQNaT8WNG7XYltPd0db7VSRRcUegLXEpKYnPt1t.oVEvxl5ICYo7rsOrufJj6isZrBY25E2UCx2UFW8UfieSqYjda9fAMWsC2oDK4FjTxvgDF8gw1MnNGSVybtCXd'

# JSON file holding the list of cookie dicts to replay into the session.
COOKIE_FILE = "cookies.txt"

# Captures the text between an <h3 class="t"> anchor's opening and closing
# tags. The tag's closing ">" is matched outside the group so the captured
# title does not start with a stray ">".
RESULT_TITLE_RE = re.compile(r'<h3 class="t"><a[^>]*>(.*?)</a>')

# Seconds the driver keeps retrying element lookups before giving up.
IMPLICIT_WAIT_SECONDS = 10


def _create_driver():
    """Return a PhantomJS driver configured with the mobile user agent.

    NOTE: ``webdriver.PhantomJS`` only exists in Selenium 3.x; it was removed
    in Selenium 4.
    """
    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    capabilities["phantomjs.page.settings.userAgent"] = MOBILE_USER_AGENT
    return webdriver.PhantomJS(desired_capabilities=capabilities)


def _load_cookies(path: str) -> List[Dict]:
    """Read the JSON document at *path* and return it.

    The file is expected to contain a list of cookie dicts.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as cookie_file:
        return json.load(cookie_file)


def _restore_cookies(driver, cookies: List[Dict]) -> None:
    """Add every cookie in *cookies* to the current *driver* session.

    The ``expiry`` key is dropped from each cookie first; the original author
    notes that ``add_cookie`` did not accept it.
    """
    for cookie in cookies:
        cookie.pop("expiry", None)
        driver.add_cookie(cookie)


def _extract_result_titles(html: str) -> List[str]:
    """Return the captured text of every ``RESULT_TITLE_RE`` match in *html*."""
    return RESULT_TITLE_RE.findall(html)


def main() -> None:
    """Run the scrape and print the pages and extracted titles to stdout."""
    driver = _create_driver()
    try:
        # The implicit wait is a session-wide setting that applies to element
        # lookups, so it is configured once instead of after every get().
        driver.implicitly_wait(IMPLICIT_WAIT_SECONDS)

        driver.get(DIRECTORY_URL)
        print(driver.page_source)

        # Looks up a cookie literally named 'domain'; prints None when the
        # session has no such cookie.
        # NOTE(review): presumably a debugging aid -- confirm it is needed.
        print(driver.get_cookie('domain'))

        _restore_cookies(driver, _load_cookies(COOKIE_FILE))

        driver.get(SEARCH_URL)
        # Fetch the page source once so the HTML printed is the HTML parsed.
        html = driver.page_source
        print(html)

        print('result compile=')
        for index, title in enumerate(_extract_result_titles(html), start=1):
            print(index)
            print(title)

        print('result count=')
        print('s1 end')
    finally:
        # Always shut down the browser and its driver process, even when a
        # step above raises; otherwise the PhantomJS process is left running.
        driver.quit()
    print('hello')


if __name__ == "__main__":
    main()