1 爬取新闻

数据格式与对应的解析方式:
- xml 格式: 用 re 正则匹配
- html 格式: bs4、lxml 等解析库
- json 格式:
  - python: 内置 json 模块
  - java: fastjson --> 曾曝出安全漏洞
  - java: 谷歌 Gson
  - go: 内置, 基于反射, 效率不高
# Crawl the first news page of Autohome and persist each article to MySQL.
# Flow: fetch page -> parse with bs4/lxml -> extract title/url/desc/img -> INSERT.
import requests
from bs4 import BeautifulSoup
import pymysql

# NOTE(review): hard-coded DB credentials — move to config/env for real use.
conn = pymysql.connect(
    user='root',
    password="123",
    host='127.0.0.1',
    database='cars'
)
cursor = conn.cursor()
try:
    # timeout added so a hung server cannot block the script forever
    res = requests.get('https://www.autohome.com.cn/news/1/#liststart', timeout=10)

    soup = BeautifulSoup(res.text, 'lxml')

    # Each news list on the page is a <ul class="article">.
    ul_list = soup.find_all(name='ul', class_='article')
    print(len(ul_list))
    for ul in ul_list:
        li_list = ul.find_all(name='li')
        print(len(li_list))
        for li in li_list:
            h3 = li.find(name='h3')
            # Some <li> are ads/placeholders without an <h3>; skip them.
            if not h3:
                continue
            a_tag = li.find(name='a')
            p_tag = li.find(name='p')
            img_tag = li.find(name='img')
            # Guard against partially-filled items: the original crashed with
            # AttributeError whenever <a>/<p>/<img> was missing from a <li>.
            if not (a_tag and p_tag and img_tag):
                continue
            title = h3.text
            url = 'https:' + a_tag.attrs.get('href')
            desc = p_tag.text
            img = img_tag.attrs.get('src')

            print('''
            新闻标题:%s
            新闻图片:%s
            新闻地址;%s
            新闻摘要:%s
            ''' % (title, img, url, desc))

            # `desc` is a reserved word in MySQL, hence the backticks.
            # Parameterized query — safe against SQL injection.
            cursor.execute(
                'insert into news (title,img,url,`desc`) values (%s,%s,%s,%s)',
                args=[title, img, url, desc],
            )

    # Single commit after the whole page is processed (one transaction).
    conn.commit()
finally:
    # Original leaked the connection; always release DB resources.
    cursor.close()
    conn.close()
| |
2 bs4介绍遍历文档树
# Sample document used to demonstrate bs4 document-tree traversal.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story<span>lqz</span></b><b>adfasdf<b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Parse with the fast, lenient lxml backend.
soup = BeautifulSoup(markup=html_doc, features='lxml')
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
3 bs4搜索文档树
| |
| 五种过滤器: 字符串、正则表达式、列表、True、方法 |
# Sample document used to demonstrate bs4 searching
# (five filter kinds: string, regex, list, True, callable).
from bs4 import BeautifulSoup
import re  # used by the regex-filter examples

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b class ='baby'>The Dormouse's story<span>lqz</span></b><b>adfasdf<b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1" xx="xx">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3" name="zzz">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Build the parse tree with the lxml backend.
soup = BeautifulSoup(markup=html_doc, features='lxml')
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
3.2 其他用法
4 css选择器
# Demonstrate CSS-selector queries with bs4, first on a local fixture,
# then on a live page.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b class ='baby'>The Dormouse's story<span>lqz</span></b><b>adfasdf<b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1" xx="xx">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3" name="zzz">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

# CSS selector cheat-sheet:
#   div     any <div> tag
#   .cls    by class name
#   #id     by id
#   div a   <a> anywhere among div's descendants
#   div>a   <a> that is a direct child of div

import requests

# Fixed shadowing from the original: `res` was reused for both the HTTP
# response and the select() result, and `soup` was silently rebound.
resp = requests.get('http://it028.com/css-selectors.html', timeout=10)

# The page is served without a charset header; force utf-8 before decoding.
resp.encoding = 'utf-8'

page_soup = BeautifulSoup(resp.text, 'lxml')
# Selector copied from the browser dev-tools ("Copy selector").
cells = page_soup.select('#content > table > tbody > tr:nth-child(14) > td:nth-child(3)')

print(cells)
| |
5 selenium基本使用
| |
| |
| |
| |
| |
| |
| 1 下载谷歌浏览器驱动(跟浏览器版本一致) |
| -https://registry.npmmirror.com/binary.html?path=chromedriver/ |
| -浏览器版本:114.0.5735.199 |
| -驱动版本对应 |
| -放到项目路径下 |
| |
| 2 写代码 |
| |
# Minimal Selenium demo: open Baidu, type a query, click search, close.
from selenium import webdriver
import time
from selenium.webdriver.common.by import By

# Driver binary sits in the project directory (Selenium 3 style API).
driver = webdriver.Chrome(executable_path='./chromedriver.exe')

driver.get('https://www.baidu.com')

time.sleep(1)

# Locate the search box and the submit button by their well-known ids.
search_box = driver.find_element(By.ID, 'kw')

search_box.send_keys('性感美女诱惑')
submit_btn = driver.find_element(by=By.ID, value='su')
submit_btn.click()
time.sleep(2)
driver.close()
| |
5.1 模拟登录百度
| |
| |
# Simulate a username/password login on Baidu's login dialog.
from selenium import webdriver
import time
from selenium.webdriver.common.by import By

driver = webdriver.Chrome(executable_path='./chromedriver.exe')
driver.get('https://www.baidu.com')

# Implicit wait: every find_element retries for up to 10s before failing.
driver.implicitly_wait(10)
driver.maximize_window()

# Open the login dialog via the "登录" link text.
login_link = driver.find_element(By.LINK_TEXT, '登录')
login_link.click()

# The dialog defaults to SMS login; switch tabs to username/password.
tab_switch = driver.find_element(By.ID, 'TANGRAM__PSP_11__headerLoginTab')
tab_switch.click()
time.sleep(1)
pwd_mode = driver.find_element(By.ID, 'TANGRAM__PSP_11__changePwdCodeItem')

pwd_mode.click()
time.sleep(1)

# NOTE(review): placeholder credentials — this login is expected to fail.
user_input = driver.find_element(By.ID, 'TANGRAM__PSP_11__userName')
user_input.send_keys('306334678@qq.com')
pwd_input = driver.find_element(By.ID, 'TANGRAM__PSP_11__password')
pwd_input.send_keys('asdfasdfasdfasfds')

submit_btn = driver.find_element(By.ID, 'TANGRAM__PSP_11__submit')
time.sleep(1)
submit_btn.submit()

time.sleep(3)
driver.close()
| |
6 selenium其他用法
6.1 无头
6.2 搜索标签
| |
| |
# Selenium element-search demo: collect every <div> on the cnblogs homepage.
from selenium import webdriver
import time
from selenium.webdriver.common.by import By

driver = webdriver.Chrome(executable_path='./chromedriver.exe')
driver.get('https://www.cnblogs.com/')
driver.implicitly_wait(10)

# find_elements (plural) returns a list of all matches.
divs = driver.find_elements(By.TAG_NAME, 'div')

print(divs)

driver.close()
本文作者:Python学习之旅
本文链接:https://www.cnblogs.com/yuezongke/p/17545788.html
版权声明:本作品采用知识共享署名-非商业性使用-禁止演绎 2.5 中国大陆许可协议进行许可。
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步