1 爬取新闻
---xml格式,用了re匹配的[xml与html结构类似,严格符合xml规范的是xhtml,html本身并不是xml]
---html,bs4,lxml。。。
---json:
-python :内置的
-java : fastjson---》漏洞
-java: 谷歌 Gson
-go :内置 基于反射,效率不高
import requests
from bs4 import BeautifulSoup
import pymysql

# Crawl the news list page of autohome.com.cn and persist each article's
# title / image / url / summary into the MySQL table `cars`.`news`,
# then print every stored row.
conn = pymysql.connect(
    user='root',
    password="123",
    host='127.0.0.1',
    database='cars'
    # NOTE(review): consider charset='utf8mb4' so Chinese titles insert
    # cleanly — confirm against the table's collation.
)
# DictCursor makes fetchall() return dicts keyed by column name.
cursor = conn.cursor(cursor=pymysql.cursors.DictCursor)

res = requests.get('https://www.autohome.com.cn/news/1/#liststart')
soup = BeautifulSoup(res.text, 'lxml')
# Every news entry lives in <ul class="article"><li>...</li></ul>.
ul_list = soup.find_all(name='ul', class_='article')
print(len(ul_list))
for ul in ul_list:
    li_list = ul.find_all(name='li')
    print(len(li_list))
    for li in li_list:
        h3 = li.find(name='h3')
        # Some <li> are ads/placeholders without an <h3>; skip them.
        if h3:
            title = h3.text
            # hrefs are protocol-relative ("//www..."), so prepend the scheme.
            url = 'https:' + li.find(name='a').attrs.get('href')
            desc = li.find(name='p').text
            img = li.find(name='img').attrs.get('src')
            print('''
            新闻标题:%s
            新闻图片:%s
            新闻地址;%s
            新闻摘要:%s
            ''' % (title, img, url, desc))
            try:
                # `desc` is a MySQL reserved word, hence the backticks.
                cursor.execute('insert into news (title,img,url,`desc`) values (%s,%s,%s,%s)', args=[title, img, url, desc])
                conn.commit()
            except Exception as e:
                # Roll back only the failed insert; keep crawling.
                conn.rollback()
                print("An error occurred:", str(e))

# BUG FIX: the original called fetchall() without ever executing a SELECT,
# so `results` was always empty and nothing was printed. Query first.
cursor.execute('select * from news')
results = cursor.fetchall()
for row in results:
    print(row)
cursor.close()
conn.close()
2 bs4介绍遍历文档树
BeautifulSoup(markup, "html.parser" )
BeautifulSoup(markup, "lxml" )
from bs4 import BeautifulSoup

# Demo: navigating ("traversing") the parse tree via attribute access.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story<span>lqz</span></b><b>adfasdf<b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')
print(soup.prettify())   # pretty-printed document

# Attribute navigation: soup.html.body.a walks down the tree;
# soup.a is the shortcut for the FIRST <a> anywhere in the document.
tag = soup.html.body.a
tag = soup.a
print(tag)

tag = soup.a.name                 # the tag's name, 'a'
print(tag)
tag = soup.a.attrs.get('id')      # attribute lookup via the attrs dict
print(tag)
tag = soup.a.attrs.get('href')
print(tag)

text_joined = soup.p.text             # all descendant text, concatenated
text_single = soup.p.string           # None when the tag has several children
text_single = list(soup.p.strings)    # every text fragment, one by one

print(soup.p.contents)                # direct children as a list
print(list(soup.p.children))          # direct children (iterator)
print(list(soup.p.descendants))       # every descendant, depth-first
print(soup.a.parent)                  # immediate parent
print(list(soup.a.parents))           # ancestors up to the document root
print(soup.a.next_sibling)            # may be a whitespace text node
print(soup.a.previous_sibling)
print(list(soup.a.next_siblings))
print(list(soup.a.previous_siblings))
3 bs4搜索文档树
五种过滤器: 字符串、正则表达式、列表、True 、方法
from bs4 import BeautifulSoup

# Demo: searching the parse tree with the five filter kinds —
# string, regular expression, list, True, and a callable.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b class ='baby'>The Dormouse's story<span>lqz</span></b><b>adfasdf<b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1" xx="xx">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3" name="zzz">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')

# --- string filters --------------------------------------------------
found = soup.find(name='a')
found = soup.find_all(name='a', class_='sister')
found = soup.find_all(name='a', id='link1')
found = soup.find(text='Elsie').parent            # text match returns the string; .parent is its <a>
found = soup.find(href='http://example.com/elsie')
found = soup.find(xx='xx')                        # any attribute works as a keyword
found = soup.find(attrs={'class': 'sister'})      # attrs-dict form
found = soup.find(attrs={'name': 'zzz'})          # required here: name= would mean the tag name
print(found)

# --- regular-expression filters --------------------------------------
import re
found = soup.find_all(class_=re.compile('^b'))    # classes starting with 'b'
found = soup.find_all(href=re.compile('^http'))
found = soup.find_all(name=re.compile('^b'))      # tag names starting with 'b'
print(found)

# --- list filters ----------------------------------------------------
found = soup.find_all(name=['b', 'body', 'span'])
found = soup.find_all(class_=['sister', 'title'])
print(found)

# --- True filters (attribute presence) -------------------------------
found = soup.find_all(href=True)                  # any tag with an href
found = soup.find_all(src=True, name='img')       # <img> tags with a src
print(found)

# --- callable filter -------------------------------------------------
def has_class_but_no_id(tag):
    # Keep tags that carry a class attribute but no id.
    return tag.has_attr('class') and not tag.has_attr('id')

print(soup.find_all(name=has_class_but_no_id))
3.2 其他用法
# recursive=False restricts the search to DIRECT children only.
result = soup.find_all(name='html', recursive=False)
result = soup.html.p.find(name='b', recursive=False)
print(result)
# limit=N stops after the first N matches.
result = soup.find_all(name='a', limit=1)
print(result)
4 css选择器
from bs4 import BeautifulSoup

# Demo: CSS-selector based search with soup.select().
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b class ='baby'>The Dormouse's story<span>lqz</span></b><b>adfasdf<b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1" xx="xx">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3" name="zzz">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')

# Selector syntax reminder:
#   div       -> by tag name
#   .name     -> by class
#   #id       -> by id
#   div a     -> any descendant <a> of a div
#   div>a     -> only direct child <a> of a div
matches = soup.select('.sister')
matches = soup.select('#link1')
matches = soup.p.find(name='b').select('span')   # select() works on any tag, not just soup
print(matches)

# Real page: a selector copied verbatim from the browser's dev tools.
import requests
resp = requests.get('http://it028.com/css-selectors.html')
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, 'lxml')
matches = soup.select('#content > table > tbody > tr:nth-child(14) > td:nth-child(3)')
print(matches)
5 selenium基本使用
1 下载谷歌浏览器驱动(跟浏览器版本一致)
-https://registry.npmmirror.com/binary.html?path=chromedriver/
-浏览器版本:114.0.5735.198
-驱动版本对应
-放到项目路径下
2 写代码
# Selenium basics: drive a real Chrome browser, type a query into Baidu's
# search box and submit it.
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# NOTE(review): executable_path was deprecated in Selenium 4 and removed in
# 4.10+ — newer versions need Service('./chromedriver.exe'); confirm the
# installed selenium version matches this API.
bro = webdriver.Chrome(executable_path='./chromedriver.exe' )
bro.get('https://www.baidu.com' )
time.sleep(1 )
# Baidu's search input has id="kw", the submit button id="su".
input_name = bro.find_element(by=By.ID, value='kw' )
input_name.send_keys('性感美女诱惑' )
button=bro.find_element(By.ID,'su' )
button.click()
# Pressing ENTER in the input submits the same query again.
input_name.send_keys(Keys.ENTER)
time.sleep(2 )
bro.close()
5.1 模拟登录百度
# Simulated Baidu login: click through the login dialog and fill in the
# username/password form.
# NOTE(review): credentials are hard-coded in plain text, and the
# TANGRAM__PSP_11__* element ids are Baidu-internal and may change at any
# time — this script is fragile by design (tutorial code).
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
bro = webdriver.Chrome(executable_path='./chromedriver.exe' )
bro.get('https://www.baidu.com' )
# Implicit wait: every find_element retries for up to 10s.
bro.implicitly_wait(10 )
bro.maximize_window()
# Open the login dialog via the "登录" link.
submit_login = bro.find_element(By.LINK_TEXT, '登录' )
submit_login.click()
sms_login = bro.find_element(By.ID, 'TANGRAM__PSP_11__headerLoginTab' )
sms_login.click()
time.sleep(1 )
# Switch to username/password login.
username_login = bro.find_element(By.ID, 'TANGRAM__PSP_11__changePwdCodeItem' )
username_login.click()
time.sleep(1 )
username=bro.find_element(By.ID,'TANGRAM__PSP_11__userName' )
username.send_keys('306334678@qq.com' )
password=bro.find_element(By.ID,'TANGRAM__PSP_11__password' )
password.send_keys('asdfasdfasdfasfds' )
login=bro.find_element(By.ID,'TANGRAM__PSP_11__submit' )
time.sleep(1 )
login.click()
time.sleep(3 )
bro.close()
6 selenium其他用法
6.1 无头浏览器
# Headless Chrome: render pages without opening a visible browser window.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000')   # virtual viewport size
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--hide-scrollbars')
# Skip image downloads to speed up page loads.
chrome_options.add_argument('blink-settings=imagesEnabled=false')
chrome_options.add_argument('--headless')              # no visible window

# BUG FIX: the `chrome_options=` keyword is deprecated and removed in
# Selenium 4; `options=` works on both Selenium 3.8+ and 4.
driver = webdriver.Chrome(options=chrome_options)
driver.get('https://www.baidu.com')
# BUG FIX: original read `driver.page_sourec` (typo), which raises
# AttributeError at runtime.
print(driver.page_source)
print('hao123' in driver.page_source)
driver.close()
6.2 搜索标签
# Collect every <div> element on the cnblogs homepage with find_elements.
from selenium import webdriver
import time
from selenium.webdriver.common.by import By

browser = webdriver.Chrome(executable_path='./chromedriver.exe')
browser.get('https://www.cnblogs.com/')
browser.implicitly_wait(10)   # retry element lookups for up to 10s

div_tags = browser.find_elements(by=By.TAG_NAME, value='div')
print(div_tags)
browser.close()
6.3 等待元素被加载
隐式等待:在browser.get('xxx' )前就设置,针对所有元素有效
显式等待:在browser.get('xxx' )之后设置,只针对某个元素有效
# Implicit wait: set once on the driver, applies to every lookup.
browser.implicitly_wait(10 )
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Explicit wait: poll for up to 10s until an element with class "tab"
# is present in the DOM, then return it.
# NOTE(review): this is a fragment — `browser` and `By` must already be
# defined/imported by the surrounding script.
tab_btn = WebDriverWait(browser, 10 ).until(
EC.presence_of_element_located((By.CLASS_NAME, 'tab' ))
)
tab_btn.click()
6.4 获取位置属性大小、文本
# Demo: reading an element's position, size, attributes and text.
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
browser=webdriver.Chrome()
browser.get('https://www.amazon.cn/' )
# Wait (up to 10s) for the banner container, then re-locate the <img>
# inside it — the second assignment overwrites the first `tag`.
tag = WebDriverWait(browser,10 ).until(EC.presence_of_element_located((By.ID,'cc-lm-tcgShowImgContainer' )))
tag=browser.find_element(By.CSS_SELECTOR,'#cc-lm-tcgShowImgContainer img' )
print (tag.get_attribute('src' ))  # value of the src attribute
print (tag.text)  # visible text of the element
# NOTE(review): tag.id is WebDriver's internal element reference, NOT the
# HTML id attribute — use tag.get_attribute('id') for the latter.
print (tag.id )
print (tag.location)  # position, e.g. {'x': ..., 'y': ...}
print (tag.tag_name)
print (tag.size)  # dimensions, e.g. {'height': ..., 'width': ...}
browser.close()
作业
https://www.chaojiying.com/apiuser/login/
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· Manus重磅发布:全球首款通用AI代理技术深度解析与实战指南
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· 没有Manus邀请码?试试免邀请码的MGX或者开源的OpenManus吧
· 园子的第一款AI主题卫衣上架——"HELLO! HOW CAN I ASSIST YOU TODAY
· 【自荐】一款简洁、开源的在线白板工具 Drawnix