Homework 6

6.18 - Python study

 

I. Today's content:

1. Using Selenium's XPath selectors:

from selenium import webdriver
 
driver = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')
 
 
try:
    # Implicit wait: declared before the get() request
    driver.implicitly_wait(5)
 
    driver.get('https://doc.scrapy.org/en/latest/_static/selectors-sample1.html')
 
    # Explicit wait: declared after the get() request (see the WebDriverWait sketch after this block)
    # wait.until(...)
 
    '''
     
<html>
 <head>
  <base href='http://example.com/' />
  <title>Example website</title>
 </head>
 <body>
  <div id='images'>
   <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
   <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
   <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
   <a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
  </div>
 </body>
</html>
    '''
    # Locate elements with XPath syntax
    # /  selects the first match starting from the root node
    html = driver.find_element_by_xpath('/html')
    # html = driver.find_element_by_xpath('/head')  # raises an error: '/' starts at the document root, whose only child is html
    print(html.tag_name)
 
    # //  selects matching nodes anywhere in the document
    div = driver.find_element_by_xpath('//div')
    print(div.tag_name)
 
    # @  matches on an attribute value
    # Find the div node whose id is "images"
    div = driver.find_element_by_xpath('//div[@id="images"]')
    print(div.tag_name)
    print(div.text)
 
    # Find the first a node
    a = driver.find_element_by_xpath('//a')
    print(a.tag_name)
 
    # Find all a nodes
    a_s = driver.find_elements_by_xpath('//a')
    print(a_s)
 
    # Get the href attribute of the first a node
    # get_attribute: read an attribute from a node
    a = driver.find_element_by_xpath('//a').get_attribute('href')
    print(a)
 
finally:
    driver.close()
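The comment above only hints at an explicit wait with wait.until(...). Below is a minimal sketch of what that could look like, using WebDriverWait and expected_conditions from Selenium; the 5-second timeout and the images id are taken from the example page above, the rest is illustrative.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')

try:
    driver.get('https://doc.scrapy.org/en/latest/_static/selectors-sample1.html')

    # Explicit wait: block for at most 5 seconds until the div with id="images" is present
    wait = WebDriverWait(driver, 5)
    div = wait.until(EC.presence_of_element_located((By.XPATH, '//div[@id="images"]')))
    print(div.tag_name)
finally:
    driver.close()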

2. More Selenium operations:

'''
Click and clear operations
'''
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
 
driver = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')
 
try:
    driver.implicitly_wait(10)
    # 1. Send a request to jd.com
    driver.get('https://www.jd.com/')
    # Find the search box and type 围城
    input_tag = driver.find_element_by_id('key')
    input_tag.send_keys('围城')
    # Press Enter
    input_tag.send_keys(Keys.ENTER)
    time.sleep(2)
    # Find the search box again, clear it, and type 墨菲定律
    input_tag = driver.find_element_by_id('key')
    input_tag.clear()
    input_tag.send_keys('墨菲定律')
    # Find the search button and click it
    button = driver.find_element_by_class_name('button')
    button.click()
    time.sleep(10)
 
finally:
    driver.close()
 
 
'''
Getting cookies (for reference)
'''
from selenium import webdriver
import time
 
driver = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')
 
try:
    driver.implicitly_wait(10)
    driver.get('https://www.zhihu.com/explore')
    print(driver.get_cookies())
 
    time.sleep(10)
finally:
    driver.close()
 
'''
Tabs
'''
# Tab management: you can switch tabs via JS (window.open) or keyboard
# shortcuts such as Ctrl+T; the JS approach is the most portable
import time
from selenium import webdriver
 
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
 
    # execute_script: run JavaScript code
    # Trigger an alert popup (see the alert-handling sketch after this block)
    # browser.execute_script('alert("tank")')
    # Open a new browser window
    browser.execute_script(
        '''
        window.open();
        '''
    )
    time.sleep(1)
    print(browser.window_handles)  # get all tab handles
    # Switch to the second window
    # new API:
    browser.switch_to.window(browser.window_handles[1])
    # old (deprecated) API:
    # browser.switch_to_window(browser.window_handles[1])
 
    # In the second window, visit taobao.com
    browser.get('https://www.taobao.com')
    time.sleep(5)
 
    # Switch back to the first window
    browser.switch_to.window(browser.window_handles[0])
    browser.get('https://www.sina.com.cn')
 
    time.sleep(10)
finally:
    browser.close()
 
 
'''
ActionChains (action chains)
'''
from selenium import webdriver
from selenium.webdriver import ActionChains
import time
 
driver = webdriver.Chrome()
driver.implicitly_wait(10)
driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
 
try:
 
    # driver.switch_to_frame('iframeResult')  # old (deprecated) API
    # Switch into the frame whose id is iframeResult
    driver.switch_to.frame('iframeResult')
 
    # Source element
    draggable = driver.find_element_by_id('draggable')
 
    # Target element
    droppable = driver.find_element_by_id('droppable')
 
    # ActionChains must be given the driver object;
    # it returns an action-chain object, which we assign to a variable
    actions = ActionChains(driver)
 
    # Option 1: robot-style
    # Instantly teleport the source element onto the target element
    # actions.drag_and_drop(draggable, droppable)  # queue an action
    # actions.perform()  # execute the queued actions
 
 
    # Option 2: simulate human-like dragging
    source = draggable.location['x']
    target = droppable.location['x']
    print(source, target)
 
    distance = target - source
    print(distance)
 
    # perform: every queued action must be executed with perform()
 
    # Click and hold the source element
    ActionChains(driver).click_and_hold(draggable).perform()
 
    s = 0
    while s < distance:
        # Move by a small horizontal offset
        ActionChains(driver).move_by_offset(xoffset=2, yoffset=0).perform()
        s += 2
 
    # Release the mouse button, ending the drag
    ActionChains(driver).release().perform()
 
    time.sleep(10)
 
 
finally:
    driver.close()
 
 
'''
Forward and back
'''
from selenium import webdriver
import time
 
driver = webdriver.Chrome()
 
try:
    driver.implicitly_wait(10)
    driver.get('https://www.jd.com/')
    driver.get('https://www.baidu.com/')
    driver.get('https://www.cnblogs.com/')
 
    time.sleep(2)
 
    # Go back
    driver.back()
    time.sleep(1)
    # Go forward
    driver.forward()
    time.sleep(1)
    driver.back()
    time.sleep(10)
 
finally:
    driver.close()
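The tab example above triggers an alert with execute_script only in a commented-out line and never handles it. Here is a minimal sketch of how that popup could be triggered and then dismissed through Selenium's switch_to.alert; the "tank" text is taken from the commented line above, everything else is illustrative.

from selenium import webdriver
import time

browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')

    # Trigger a JS alert, same as the commented line in the tab example
    browser.execute_script('alert("tank")')
    time.sleep(2)

    # Switch to the alert, read its text, then accept (dismiss) it
    alert = browser.switch_to.alert
    print(alert.text)
    alert.accept()

    time.sleep(2)
finally:
    browser.close()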

3. Bypassing login (reusing browser cookies):

from selenium import webdriver
from selenium.webdriver import ChromeOptions
import time
r'''
Steps:
    1. In File Explorer, enable showing hidden files
    2. Go to C:\Users\administortra\AppData\Local\Google\Chrome\User Data
        and delete the Default folder
    3. Reopen the browser and log in to the Baidu account
        - this creates a fresh Default cache folder
    4. Add the cookies
    5. Close Chrome, then run this script
'''
# Create an options object to hold the driver parameters
options = ChromeOptions()
 
# Path where Chrome stores the user profile (cookies)
# 'C:\Users\administortra\AppData\Local\Google\Chrome\User Data'
profile_directory = r'--user-data-dir=C:\Users\administortra\AppData\Local\Google\Chrome\User Data'
 
# Add the user-data directory argument
options.add_argument(profile_directory)
 
# Load the options into the driver; the chrome_options parameter receives the options object
driver = webdriver.Chrome(chrome_options=options)
 
try:
    driver.implicitly_wait(10)
    driver.get('https://www.baidu.com/')
    '''
    BDUSS:*****
    '''
    # Add the user's cookie
    # the keys "name" and "value" must be lowercase
    driver.add_cookie({"name": "BDUSS", "value": "user session string goes here"})
 
    # Refresh the page
    driver.refresh()
 
    time.sleep(10)
 
finally:
    driver.close()
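As an alternative to reusing the whole Chrome profile, the cookies printed by get_cookies() in section 2 can be saved to a file and loaded back into a later session. A minimal sketch under that assumption; the cookies.pkl file name is just an example, and in practice the dump and the load would happen in two separate runs.

from selenium import webdriver
import pickle
import time

driver = webdriver.Chrome()
try:
    driver.implicitly_wait(10)
    driver.get('https://www.baidu.com/')

    # Run 1: after logging in manually in this window, dump the cookies to disk
    with open('cookies.pkl', 'wb') as f:
        pickle.dump(driver.get_cookies(), f)

    # Run 2: load the saved cookies back into a fresh session on the same domain
    with open('cookies.pkl', 'rb') as f:
        for cookie in pickle.load(f):
            driver.add_cookie(cookie)

    # Refresh so the page picks up the restored login state
    driver.refresh()
    time.sleep(10)
finally:
    driver.close()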

4. Scraping JD product information:

# '''
# Scrape JD product info:
#     Request URL:
#         https://www.jd.com/
#     Fields to extract:
#         1. Product detail page link
#         2. Product name
#         3. Product price
#         4. Number of reviews
#         5. Seller
# '''
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
 
driver = webdriver.Chrome()
 
try:
    driver.implicitly_wait(10)
    # 1. Visit the JD home page
    driver.get('https://www.jd.com/')
 
    # 2. Type the product name and press Enter to search
    input_tag = driver.find_element_by_id('key')
    input_tag.send_keys('macbook')
    input_tag.send_keys(Keys.ENTER)
    time.sleep(2)
 
    # Scroll the page via JS so that more products load
    js_code = '''
        window.scrollTo(0,5000);
    '''
    driver.execute_script(js_code)  # run the JS code
 
    # Wait for the data to load
    time.sleep(2)
 
    # 3. Find all product items
    # good_div = driver.find_element_by_id('J_goodsList')
    good_list = driver.find_elements_by_class_name('gl-item')
    for good in good_list:
        # Locate by CSS selector
        # Product link
        good_url = good.find_element_by_css_selector(
            '.p-img a').get_attribute('href')
 
        # Product name
        good_name = good.find_element_by_css_selector(
            '.p-name em').text.replace("\n", "--")
 
        # Product price
        good_price = good.find_element_by_class_name(
            'p-price').text.replace("\n", ":")
 
        # Number of reviews
        good_commit = good.find_element_by_class_name(
            'p-commit').text.replace("\n", " ")
 
        # Seller
        good_from = good.find_element_by_class_name(
            'J_im_icon').text.replace("\n", " ")
 
        good_content = f'''
                    商品链接: {good_url}
                    商品名称: {good_name}
                    商品价格: {good_price}
                    评价人数: {good_commit}
                    商品商家: {good_from}
                    \n
                    '''
        print(good_content)
        with open('jd.txt', 'a', encoding='utf-8') as f:
            f.write(good_content)
 
    next_tag = driver.find_element_by_link_text('下一页')  # the "next page" link
 
    next_tag.click()
 
    time.sleep(10)
 
 
finally:
    driver.close()
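window.scrollTo(0, 5000) above scrolls only once, so products further down the page may never load. Below is a minimal sketch of scrolling step by step until the page height stops growing, using document.body.scrollHeight; the step size and pause are arbitrary choices.

import time

def scroll_to_bottom(driver, step=1000, pause=0.5):
    # Scroll down `step` pixels at a time until we pass the current page height
    offset = 0
    while True:
        driver.execute_script(f'window.scrollTo(0, {offset});')
        time.sleep(pause)
        offset += step
        height = driver.execute_script('return document.body.scrollHeight;')
        if offset >= height:
            break

This helper could replace the single scrollTo call before collecting the gl-item elements.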

II. Homework:

1. Scrape JD product information:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time


def get_good(driver):
    # Scroll down via JS so that more products load on the page
    js_code = '''
        window.scrollTo(0, 5000);
    '''
    driver.execute_script(js_code)
    time.sleep(2)

    # Collect every product item on the current page
    good_list = driver.find_elements_by_class_name('gl-item')
    for good in good_list:
        # Product link
        good_url = good.find_element_by_css_selector('.p-img a').get_attribute('href')
        # Product name
        good_name = good.find_element_by_css_selector('.p-name em').text.replace("\n", "--")
        # Product price
        good_price = good.find_element_by_class_name('p-price').text.replace("\n", ":")
        # Number of reviews
        good_commit = good.find_element_by_class_name('p-commit').text.replace("\n", " ")
        # Seller
        good_from = good.find_element_by_class_name('J_im_icon').text.replace("\n", " ")

        good_content = f'''
                        商品链接:{good_url}
                        商品名称:{good_name}
                        商品价格:{good_price}
                        评价人数:{good_commit}
                        商品商家:{good_from}
                        \n
                        '''
        print(good_content)
        # with open('jd.txt', 'a', encoding='utf-8') as f:
        #     f.write(good_content)

    # Click the "next page" button and recurse to scrape the next page
    next_tag = driver.find_element_by_class_name('pn-next')
    next_tag.click()
    time.sleep(2)
    get_good(driver)


if __name__ == '__main__':
    good_name = input('请输入商品名:').strip()

    driver = webdriver.Chrome()
    try:
        driver.implicitly_wait(10)
        driver.get("https://www.jd.com/")
        input_tag = driver.find_element_by_id('key')
        input_tag.send_keys(good_name)
        input_tag.send_keys(Keys.ENTER)
        time.sleep(2)
        get_good(driver)
    finally:
        # Close the browser once here, instead of inside the recursive function,
        # so it is not closed repeatedly as the recursion unwinds
        driver.close()
