day02(百度自动登录、京东自动搜索、豆瓣电影top250信息的获取)
1.豆瓣电影top250的信息获取
"""Scrape the Douban Top 250 movie list and append every entry to a text file."""
import re
from html import unescape

import requests

# Browser-like User-Agent. Douban reportedly answers the default
# python-requests agent with an error status instead of the list page.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/120.0 Safari/537.36'
    ),
}

# Every movie entry on an index page starts with this tag.
ITEM_MARKER = '<div class="item">'

# Groups: rank, url, title, director, cast (optional), year/genre,
# rating, number of ratings.  The cast label is optional because the
# director/cast line can be truncated before "主演: " appears.
ITEM_RE = re.compile(
    r'<em class="">(.*?)</em>'
    r'.*?<a href="(.*?)">'
    r'.*?<span class="title">(.*?)</span>'
    r'.*?导演: (.*?)(?:主演: (.*?))?<br>(.*?)</p>'
    r'.*?<span class="rating_num" .*?>(.*?)</span>'
    r'.*?<span>(.*?)人评价</span>',
    re.S,
)

# One-line quote of a movie; not every entry has one.
QUOTE_RE = re.compile(r'<span class="inq">(.*?)</span>', re.S)


def get_page(url):
    """Fetch *url* and return the ``requests`` response.

    Raises:
        requests.HTTPError: if the server answers with an error status,
            so an error page is never parsed as if it were the movie list.
    """
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()
    return response


def _clean(text):
    """Decode HTML entities (e.g. ``&nbsp;``) and trim surrounding whitespace."""
    return unescape(text).replace('\xa0', ' ').strip()


def parse_index(html):
    """Extract the movies from one index page.

    Returns a list of 9-tuples of strings:
    (rank, url, title, director, cast, year/genre, rating,
    number of ratings, quote).  Cast and quote are ``''`` when the
    entry does not provide them.
    """
    movie_list = []
    # Match inside one entry at a time: a single page-wide pattern lets the
    # lazy ``.*?`` run into the following entry whenever a field is missing,
    # which merges two movies into one record.
    # The text before the first marker is page chrome, not a movie.
    for chunk in html.split(ITEM_MARKER)[1:]:
        found = ITEM_RE.search(chunk)
        if found is None:
            continue
        quote = QUOTE_RE.search(chunk)
        fields = found.groups(default='') + (quote.group(1) if quote else '',)
        movie_list.append(tuple(_clean(field) for field in fields))
    return movie_list


def save_data(movie):
    """Print one movie record and append it to the file ``douban_top250``.

    *movie* is a 9-item sequence in the order returned by ``parse_index``.
    """
    (top, m_url, name, director, actor, year_type,
     point, comment_count, quote) = movie
    year_type = year_type.strip()

    data = (
        '\n'
        '=======欢迎观赏=======\n'
        f'电影排名:{top}\n'
        f'电影url:{m_url}\n'
        f'电影名称:{name}\n'
        f'电影导演:{director}\n'
        f'电影主演:{actor}\n'
        f'年份类型:{year_type}\n'
        f'电影评分:{point}\n'
        f'电影评论:{comment_count}\n'
        f'电影简介:{quote}\n'
        '======================\n'
        '\n\n'
    )
    print(data)

    with open('douban_top250', 'a', encoding='utf-8') as f:
        f.write(data)


def main():
    """Walk the ten index pages (25 movies each) and save every movie."""
    for start in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={start}&filter='
        index_res = get_page(url)
        # Parse the index page to get the movie records.
        for movie in parse_index(index_res.text):
            save_data(movie)


if __name__ == '__main__':
    main()
运行结果部分展示:
2.在京东上自动搜索所需要的信息
"""Open JD in Chrome and run a search through the page's input box."""
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Approach 1: open the browser through its driver.
drive = webdriver.Chrome()
try:
    drive.get('http://www.jd.com/')

    # Explicit wait: poll for up to 10 seconds until the element with
    # id "key" (the input this script types into) is present in the DOM.
    wait = WebDriverWait(drive, 10)
    input_tag = wait.until(EC.presence_of_element_located((By.ID, 'key')))

    # Fixed pause before typing.
    time.sleep(5)
    input_tag.send_keys('公仔')
    input_tag.send_keys(Keys.ENTER)

    # Keep the window open for a while so the result page can be seen.
    time.sleep(20)
finally:
    # quit() ends the whole WebDriver session; close() would only close
    # the current window and can leave the driver process running.
    drive.quit()
3.自动登录百度
"""Open Baidu in Chrome and walk through the username/password login form."""
import time

from selenium import webdriver  # browser driver
from selenium.webdriver.common.by import By  # locator strategies
from selenium.webdriver.common.keys import Keys  # keyboard keys

drive = webdriver.Chrome()
try:
    # Implicit wait: configured before get(); it applies to the element
    # lookups that follow.
    drive.implicitly_wait(5)
    drive.get('https://www.baidu.com/')

    # Fixed pause after get().  NOTE: the original note calls this an
    # explicit wait, but time.sleep() is an unconditional delay; a real
    # explicit wait uses WebDriverWait with an expected condition.
    time.sleep(5)

    # The find_element_by_* helpers were removed in Selenium 4.3;
    # find_element(By.<strategy>, value) works on old and new versions.

    # Locate by link text.
    login_link = drive.find_element(By.LINK_TEXT, '登录')
    login_link.click()
    time.sleep(1)

    # Locate by id.
    user_login = drive.find_element(By.ID, 'TANGRAM__PSP_10__footerULoginBtn')
    user_login.click()
    time.sleep(1)

    # Locate by class name.
    user = drive.find_element(By.CLASS_NAME, 'pass-text-input-userName')
    user.send_keys('43434')

    # Locate by name attribute.
    pwd = drive.find_element(By.NAME, 'password')
    pwd.send_keys('3424324')

    # Locate by id.
    submit = drive.find_element(By.ID, 'TANGRAM__PSP_10__submit')
    submit.click()

    # Other locator strategies, kept as reference examples:
    #
    # 5. Partial link text
    # login_link = drive.find_element(By.PARTIAL_LINK_TEXT, '登')
    # login_link.click()
    #
    # 6. CSS selector ("." selects by class, "#" selects by id)
    # login2_link = drive.find_element(By.CSS_SELECTOR, '.tang-pass-footerBarULogin')
    # login2_link.click()
    #
    # 7. Tag name (find_elements returns a list of all matches)
    # div = drive.find_elements(By.TAG_NAME, 'div')
    # print(div)

    # Keep the window open for a while so the result can be seen.
    time.sleep(20)
finally:
    # quit() ends the whole WebDriver session; close() would only close
    # the current window and can leave the driver process running.
    drive.quit()
总结:
selenium请求库优点:
------执行js代码
------不需要分析复杂的通信流程
------对浏览器做下拉、弹窗等操作
------获取动态数据
------破解登录验证
安装selenium请求库:
在cmd窗口中输入: pip3 install selenium
安装浏览器驱动(驱动版本需要与本机安装的Chrome浏览器版本对应):
http://npm.taobao.org/mirrors/chromedriver/2.38/