先使用 PhantomJS(Selenium 以后将不再支持 PhantomJS,可改用无头 Chrome 或火狐插件)模拟用户登录,拿到登录后页面的 cookie,再用该 cookie 访问本网站,拿到想要的搜索结果。
# Obtain the site's cookies by logging in through a headless browser.
def obCookie():
    """Log in to juming.com with PhantomJS and return the browser's cookies.

    The function opens the home page, fills in the ``UserName`` and
    ``Password`` fields, clicks the button inside ``loginBox`` and then reads
    the cookies held by the browser.

    Returns:
        dict: mapping of cookie name to cookie value, built from
        ``driver.get_cookies()``.
    """
    url = "http://www.juming.com/"
    service_args = [
        '--load-images=no',  # turn off image loading
        '--ignore-ssl-errors=true',  # ignore HTTPS errors
    ]
    # Windows binary name; on Linux use executable_path="phantomjs" instead.
    # service_args must be handed to the driver, otherwise the flags above
    # are never applied.
    driver = webdriver.PhantomJS(
        executable_path="phantomjs.exe",
        service_args=service_args,
    )
    try:
        print("获取网站cookie。。")
        driver.get(url)
        # Fixed wait, presumably to let the page finish loading.
        time.sleep(5)
        # "账号" / "密码" are placeholder credentials to be substituted.
        driver.find_element_by_xpath('//*[@id="UserName"]').send_keys("账号")
        driver.find_element_by_xpath('//*[@id="Password"]').send_keys("密码")
        driver.find_element_by_xpath('//*[@id="loginBox"]/button').click()
        # Fixed wait, presumably to let the login request complete.
        time.sleep(2)
        cookie_list = driver.get_cookies()
    finally:
        # quit() ends the whole browser session even when a step above
        # fails; close() would only close the current window.
        driver.quit()
    return {cookie["name"]: cookie["value"] for cookie in cookie_list}
# Cookies of the logged-in session, fetched once and reused by later requests.
cookie_dict = obCookie()
# Read the number of result pages from the search page.
def obtainPage():
    """Query the search page and derive how many result pages exist.

    Posts the module-level ``payload`` with ``headers`` and ``cookie_dict``
    to the search URL, takes the first element matching the ``.reg`` CSS
    selector and reads the text captured immediately before ``</span>`` as
    an integer count. The count is divided by 50 and rounded up
    (presumably the site lists 50 results per page -- confirm).

    Returns:
        int or None: the page count, or ``None`` when no ``.reg`` element is
        found, the pattern does not match, the captured text is not an
        integer, or the resulting page count is zero.
    """
    print("-----------------------")
    print("获取网站页数方法")
    url = "http://www.juming.com/newcha/index.htm?cha=1"
    # Alternative URL with an explicit page number (unused here):
    #   http://www.juming.com/newcha/index.htm?cha=1&page=1
    res = requests.post(url, headers=headers, data=payload, cookies=cookie_dict)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    matches = soup.select('.reg')
    if not matches:
        return None

    result = re.findall('.*">(.*)</span>.*', str(matches[0]))
    if not result:
        return None

    # Parse once; text that is not an integer is treated like a missing
    # count instead of raising ValueError.
    try:
        total = int(result[0])
    except ValueError:
        return None

    # Ceiling division: same value as "n // 50, plus 1 if there is a remainder".
    pages = -(-total // 50)
    if pages == 0:
        return None
    return pages
# Number of result pages reported by the site (None when it cannot be read).
page = obtainPage()
先模拟登录获取 cookie,再拿着 cookie 访问网站,获取想要的搜索结果。
# Date a fixed number of days after today (two by default).
def obTime(days=2):
    """Return the local date ``days`` days from today as ``YYYY-MM-DD``.

    Args:
        days: offset in days added to today's date. Defaults to 2, the
            offset this script has always used.

    Returns:
        str: the target date in ISO format, e.g. ``"2018-05-03"``.
    """
    target = datetime.date.today() + datetime.timedelta(days=days)
    return target.isoformat()


data = obTime()

# Form fields posted to the search endpoint.
# NOTE(review): field meanings are not visible in this file; the keys look
# like pinyin abbreviations -- confirm against the site's search form.
payload = {
    "ymhz": "com,cn",
    "sfba_1": "1",
    "sclx": "2",
    "scsj": data,  # the date computed above
}

# Request headers: a desktop Chrome User-Agent string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36",
}