Scraping news
import requests
from bs4 import BeautifulSoup
import pymysql

# MySQL connection for storing the scraped news
conn = pymysql.connect(
    user='root',
    password="123456",
    host='127.0.0.1',
    database='news'
)

def get_ip():
    # Fetch a fresh HTTPS proxy from a local proxy pool
    # (e.g. the jhao104/proxy_pool service listening on 127.0.0.1:5010)
    proxies = {
        'https': requests.get('http://127.0.0.1:5010/get?type=https').json()["proxy"]
    }
    return proxies

cursor = conn.cursor()

res = requests.get("https://www.autohome.com.cn/all/1/#liststart", proxies=get_ip())
soup = BeautifulSoup(res.text, 'lxml')
# Each <ul class="article"> holds one column of news items
ul_list = soup.find_all(name='ul', class_='article')
print(len(ul_list))
for ul in ul_list:
    li_list = ul.find_all(name="li")
    for li in li_list:
        # Some <li> tags are ad placeholders without an <h3> title; skip them
        title_tag = li.find(name='h3')
        if title_tag:
            title = title_tag.text
            article_url = "https:" + li.find(name='a').attrs.get("href")
            introduce = li.find(name='p')  # summary paragraph (not stored below)
            img = li.find(name='img').attrs.get("src")
            cursor.execute('insert into new (title,img,article_url) values (%s,%s,%s)',
                           args=[title, img, article_url])
            print("inserted")
conn.commit()
bs4: traversing the document tree
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story<span>lqz</span></b><b>adfasdf<b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')
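The heading above promises tree traversal, but the note stops at building the soup. A minimal sketch of the common traversal attributes, run against the html_doc sample above (all standard bs4 API):

# Walking the parsed tree via attribute access
print(soup.title)                # <title>The Dormouse's story</title>
print(soup.title.text)           # The Dormouse's story
print(soup.p.b.text)             # text of the first <b> inside the first <p>
print(soup.a.attrs)              # attributes of the first <a>: class, href, id
print(soup.a.attrs.get('href'))  # http://example.com/elsie

# Children / parents / siblings
print(list(soup.p.children))     # direct children of the first <p>
print(soup.a.parent.name)        # p
print(soup.a.next_sibling)       # the text node right after the first <a>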
bs4: searching the document tree
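Only the heading survives in the note; a minimal sketch of find()/find_all() with the usual filter types (string, regex, list, True, function), using the same soup:

import re

# String filter: exact tag name / attribute match
print(soup.find_all(name='a'))
print(soup.find_all(attrs={'class': 'sister'}))
print(soup.find_all(id='link2'))

# Regex filter: any tag whose name starts with 'b' (body, b)
print(soup.find_all(name=re.compile('^b')))

# List filter: tags named either 'a' or 'b'
print(soup.find_all(name=['a', 'b']))

# True filter: every tag that has an href attribute
print(soup.find_all(href=True))

# Function filter: tags that have a class but no id
print(soup.find_all(lambda tag: tag.has_attr('class') and not tag.has_attr('id')))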
CSS selectors
'''
div       # by tag name
.class    # by class name
#id       # by id
div a     # any <a> among div's descendants
div>a     # only <a> tags that are direct children of div
'''
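These selectors plug into soup.select(), which returns a list of matches (soup.select_one() returns the first); a quick sketch against the html_doc sample:

print(soup.select('.sister'))    # all tags with class="sister"
print(soup.select('#link1'))     # the tag with id="link1"
print(soup.select('p a'))        # <a> descendants of <p>
print(soup.select('p > a'))      # <a> direct children of <p>
print(soup.select_one('#link1').attrs.get('href'))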
selenium: basic usage
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# executable_path is the Selenium 3 style; Selenium 4+ expects
# webdriver.Chrome(service=Service('./chromedriver.exe')) instead
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('https://www.baidu.com/')

input_name = bro.find_element(By.ID, value='kw')  # the Baidu search box
input_name.send_keys('小兔子')                     # type the query ("little rabbit")
submit = bro.find_element(By.ID, 'su')            # the search button
submit.click()

time.sleep(20)  # keep the browser open long enough to see the result
bro.close()
Simulated Baidu login
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get("https://www.baidu.com/")
bro.implicitly_wait(10)  # implicit wait: retry element lookups for up to 10s

# Open the login dialog ('登录' is the login link text)
login_name = bro.find_element(By.LINK_TEXT, '登录')
login_name.click()

# Toggle through the SMS-code tab and back so the form
# ends up in username/password mode
seed_name = bro.find_element(By.ID, 'TANGRAM__PSP_11__changeSmsCodeItem')
seed_name.click()
tangram_name = bro.find_element(By.ID, 'TANGRAM__PSP_11__changePwdCodeItem')
tangram_name.click()

# Fill in the credentials (left blank here on purpose)
username_name = bro.find_element(By.ID, 'TANGRAM__PSP_11__userName')
username_name.send_keys('')
password_name = bro.find_element(By.ID, 'TANGRAM__PSP_11__password')
password_name.send_keys('')
log_name = bro.find_element(By.ID, 'TANGRAM__PSP_11__submit')
log_name.click()

# After logging in, run a search as in the basic example
input_name = bro.find_element(By.ID, value='kw')
input_name.send_keys('小兔子')
submit = bro.find_element(By.ID, 'su')
submit.click()

time.sleep(20)
bro.close()
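implicitly_wait() applies one global timeout to every lookup; for a single slow element an explicit wait is often the better tool. A minimal sketch using WebDriverWait (standard Selenium API; the locator is the same 'kw' search box):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block for at most 10s until the search box is present, then return it
wait = WebDriverWait(bro, 10)
input_name = wait.until(EC.presence_of_element_located((By.ID, 'kw')))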
selenium: other usage
# Crop the captcha image out of a full-page screenshot
from selenium import webdriver
from selenium.webdriver.common.by import By
from PIL import Image

bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('https://www.chaojiying.com/apiuser/login/')
bro.implicitly_wait(10)
bro.maximize_window()

# Locate the captcha <img> on the login form
tag = bro.find_element(By.CSS_SELECTOR,
                       'body > div.wrapper_danye > div > div.content_login > div.login_form > form > div > img')
src = tag.get_attribute('src')
size = tag.size          # width/height of the element in CSS pixels
print(size)
location = tag.location  # x/y of the element's top-left corner
print(location)

# Screenshot the whole page, then cut the captcha out of it
bro.save_screenshot('my.png')
img = Image.open('./my.png')
# The 1.5 factor compensates for 150% display scaling (device pixel ratio);
# adjust it to your own screen, or drop it entirely at 100% scaling
rect = (int(location['x']) * 1.5, int(location['y']) * 1.5,
        int(location['x'] + size['width']) * 1.5,
        int(location['y'] + size['height']) * 1.5)
print(rect)
see = img.crop(rect)
see.save('se.png')
bro.close()
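Selenium can also screenshot a single element directly, which sidesteps the scale-factor math entirely; a one-line sketch (WebElement.screenshot() is standard API, and tag is the captcha element located above):

# Save just the captcha element, no full-page crop required
tag.screenshot('captcha.png')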