Crawling news
import re
import requests
from bs4 import BeautifulSoup
import pymysql
# Create the database connection
conn = pymysql.connect(
    user='root',
    password="123456",
    host='127.0.0.1',
    database='news'
)
# Fetch a proxy (assumes a local proxy-pool service, e.g. jhao104/proxy_pool, listening on port 5010)
def get_ip():
    proxies = {
        'https': requests.get('http://127.0.0.1:5010/get?type=https').json()["proxy"]
    }
    return proxies
cursor = conn.cursor()
# Send the request through the proxy
res = requests.get("https://www.autohome.com.cn/all/1/#liststart", proxies=get_ip())
# Parse the response
soup = BeautifulSoup(res.text, 'lxml')  # arg 1: markup, arg 2: parser
ul_list = soup.find_all(name='ul', class_='article')  # ul tags whose class contains "article"
print(len(ul_list))
for ul in ul_list:
    # Find every li under this ul
    li_list = ul.find_all(name="li")
    # Each li carries a title, an image, and so on
    for li in li_list:
        title_tag = li.find(name='h3')  # some li elements are ad slots with no h3
        if title_tag:
            title = title_tag.text
            article_url = "https:" + li.find(name='a').attrs.get("href")
            introduce = li.find(name='p')  # summary paragraph
            img = li.find(name='img').attrs.get("src")
            # eye = li.find(class_='icon12 icon12-eye').text  # view count
            # infor = li.find(class_='icon12 icon12-infor').text  # comment count
            cursor.execute('insert into new (title,img,article_url) values (%s,%s,%s)',
                           args=[title, img, article_url])
            print("success")
conn.commit()
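The insert above assumes the news database already holds a table named new. A minimal sketch of a schema that would satisfy it; the column types and lengths are my assumption, not from the source:

import pymysql

# Hypothetical schema for the `new` table the crawler writes into
conn = pymysql.connect(user='root', password="123456", host='127.0.0.1', database='news')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS new (
            id INT PRIMARY KEY AUTO_INCREMENT,
            title VARCHAR(255),
            img VARCHAR(512),
            article_url VARCHAR(512)
        )
    """)
conn.commit()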
bs4: traversing the document tree
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story<span>lqz</span></b><b>adfasdf<b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')
# 1 Usage: traverse with the . operator
# print(soup.html.body.a)
# print(soup.a)
# 2 Get the tag name
# print(soup.a.name)
# 3 Get the tag's attributes
# print(soup.a.attrs)
# print(soup.a.attrs.get("id"))
# 4 Get the tag's text
# print(soup.p.text)  # all text inside the tag, descendants included
# print(soup.p.string)  # only when the tag has a single child and no nested tags; returns its text
# print(soup.p.strings)  # generator over the text of all descendants
# 5 Nested selection: chain the dots
# print(soup.head.title.text)
# 6 Child nodes
# print(soup.p.contents)  # list of p's direct children
# print(soup.p.children)  # iterator over p's direct children
# 7 Parent nodes
# print(soup.p.parent)  # p's direct parent
# print(list(soup.p.parents))  # all of p's ancestors, up to the document itself
# 8 Sibling nodes
# print(soup.a.next_sibling)  # the next sibling
# print(soup.a.previous_sibling)  # the previous sibling
#
# print(soup.a.next_siblings)  # generator over all following siblings
# print(soup.a.previous_siblings)  # generator over all preceding siblings
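Worth noting when using the sibling calls above: the whitespace between tags counts as a text-node sibling, so next_sibling frequently returns a string rather than the next tag. A quick check against html_doc:

print(repr(soup.a.next_sibling))         # '\n' -- the newline between the two links
print(soup.a.next_sibling.next_sibling)  # <a ... id="link2">Lacie</a>
print(soup.a.find_next_sibling('a'))     # skips text nodes straight to the next a tag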
bs4: searching the document tree
# Five kinds of filters: string, regex, list, True, and function (function sketch below)
# 1 String
# soup.find(name='a')
# soup.find(string='Elsie').parent  # match by text, then step up to the enclosing tag
# # find() accepts name, id, class_, href, string -- any attribute
# # 2 Regex
# soup.find(class_=re.compile('^b'))
# 3 List
# soup.find_all(name=["b", "body"])
# 4 True
# print(soup.find_all(href=True))  # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
# print(soup.find_all(src=True, name='img'))
# Other arguments
# recursive=True: recurse into descendants (the default); recursive=False searches direct children only
# limit=None: cap how many matches are returned
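The fifth filter, a function, takes each tag and keeps those for which it returns True. A minimal sketch against the same html_doc, together with limit and recursive:

# 5 Function filter: bs4 calls it once per tag
def has_class_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')

print(soup.find_all(has_class_no_id))            # the three p tags (the a tags also carry an id)
print(soup.find_all('a', limit=2))               # stop after the first two matches
print(soup.body.find_all('a', recursive=False))  # [] -- the a tags sit inside p, not directly under body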
CSS selectors
'''
div      tag name
.cls     class name
#id      id selector
div a    a anywhere among div's descendants
div>a    a that is a direct child of div
'''
# print(soup.select('.sister'))  # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
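A few more select() variants on the same html_doc; select_one returns the first match directly instead of a list:

print(soup.select('#link1'))            # by id
print(soup.select('p.story > a'))       # direct a children of p.story
print(soup.select('a[href]'))           # a tags that have an href attribute
print(soup.select_one('.sister').text)  # Elsie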
selenium: basic usage
# requests only fetches raw HTTP responses; it cannot run the JavaScript behind ajax-loaded content
# selenium drives a real browser: instead of sending HTTP requests directly, code simulates a person operating the browser, so js loads automatically
# appium does the same for mobile devices
# 1 Download the matching chromedriver
# -https://registry.npmmirror.com/binary.html?path=chromedriver/
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
bro = webdriver.Chrome(executable_path='./chromedriver.exe')  # open the browser (Selenium 3 style; see the Selenium 4 note below)
bro.get('https://www.baidu.com/')
input_name = bro.find_element(By.ID, value='kw')
input_name.send_keys('小兔子')
submit = bro.find_element(By.ID, 'su')
submit.click()
time.sleep(20)
bro.close()
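One caveat: the executable_path argument was removed in Selenium 4.10, so on a current install the same setup goes through a Service object. A minimal sketch:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4 style: pass the driver path via Service...
bro = webdriver.Chrome(service=Service('./chromedriver.exe'))
# ...or omit it entirely and let Selenium Manager (4.6+) fetch a matching driver
# bro = webdriver.Chrome()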
Simulating a Baidu login
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get("https://www.baidu.com/")
bro.implicitly_wait(10)
login_name = bro.find_element(By.LINK_TEXT, '登录')
login_name.click()
# switch to the SMS-code login panel first, then over to username/password login
sms_tab = bro.find_element(By.ID, 'TANGRAM__PSP_11__changeSmsCodeItem')
sms_tab.click()
pwd_tab = bro.find_element(By.ID, 'TANGRAM__PSP_11__changePwdCodeItem')
pwd_tab.click()
username_input = bro.find_element(By.ID, 'TANGRAM__PSP_11__userName')
username_input.send_keys('')  # fill in your username
password_input = bro.find_element(By.ID, 'TANGRAM__PSP_11__password')
password_input.send_keys('')  # fill in your password
login_btn = bro.find_element(By.ID, 'TANGRAM__PSP_11__submit')
login_btn.click()
# a captcha appears at this point; it has to be solved before the login completes
input_name = bro.find_element(By.ID, value='kw')
input_name.send_keys('小兔子')
submit = bro.find_element(By.ID, 'su')
submit.click()
time.sleep(20)
bro.close()
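implicitly_wait sets one global timeout for every lookup; for a step like clicking the login link, an explicit wait on a concrete condition tends to be more reliable. A sketch of that step rewritten with the same locator as above:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block up to 10 seconds until the login link is clickable, then click it
wait = WebDriverWait(bro, 10)
login_name = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, '登录')))
login_name.click()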
selenium: other usage
# Headless mode (sketch after the locator list below)
# Locating elements
# from selenium import webdriver
# from selenium.webdriver.common.by import By
#
# bro = webdriver.Chrome(executable_path='./chromedriver.exe')
# bro.get('https://www.cnblogs.com/')
# bro.implicitly_wait(10)
# # find_element and find_elements also accept css selectors and xpath
# bro.find_element(By.ID)  # by id
# bro.find_element(By.NAME)  # by the name attribute
# bro.find_element(By.TAG_NAME)  # by tag name
# bro.find_element(By.LINK_TEXT)  # by exact link text
# # bro.find_element(By.PARTIAL_LINK_TEXT)  # by partial link text
# bro.find_element(By.CLASS_NAME)  # by class name
# bro.find_element(By.CSS_SELECTOR)  # by css selector
# bro.find_element(By.XPATH)  # by xpath
#
# bro.find_elements()  # return every match as a list
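Headless mode, mentioned above, is just a Chrome option. A minimal sketch; the --headless=new flag assumes a recent Chrome, older versions use plain --headless:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless=new')  # run without a visible browser window
bro = webdriver.Chrome(options=options)
bro.get('https://www.cnblogs.com/')
print(bro.title)
bro.close()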
# Cropping the captcha out of a screenshot
from selenium import webdriver
from selenium.webdriver.common.by import By
from PIL import Image
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('https://www.chaojiying.com/apiuser/login/')
bro.implicitly_wait(10)
bro.maximize_window()
tag = bro.find_element(By.CSS_SELECTOR,
'body > div.wrapper_danye > div > div.content_login > div.login_form > form > div > img')
src = tag.get_attribute('src')
size = tag.size
print(size)
location = tag.location
print(location)
bro.save_screenshot('my.png')  # full-page screenshot; returns True on success
img = Image.open('./my.png')
# The 1.5 multiplier is the OS display scale factor (Windows at 150% zoom here);
# it must match your own screen scaling or the crop will be offset
crop_box = (int(location['x']) * 1.5, int(location['y']) * 1.5,
            int(location['x'] + size['width']) * 1.5,
            int(location['y'] + size['height']) * 1.5)
print(crop_box)
captcha = img.crop(crop_box)
captcha.save('se.png')
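The hard-coded 1.5 only works when the display scale happens to be 150%; the browser can report its actual ratio, so a more portable variant (using the same tag location and screenshot as above) reads it at runtime:

# Ask the browser for the real pixel ratio instead of hard-coding 1.5
ratio = bro.execute_script('return window.devicePixelRatio')
crop_box = (location['x'] * ratio, location['y'] * ratio,
            (location['x'] + size['width']) * ratio,
            (location['y'] + size['height']) * ratio)
img.crop(crop_box).save('se.png')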
bro.close()