Crawling news
import re
import requests
from bs4 import BeautifulSoup
import pymysql
# Create the database connection
conn = pymysql.connect(
    user='root',
    password="123456",
    host='127.0.0.1',
    database='news'
)
# Fetch a proxy (assumes a local proxy-pool service, e.g. jhao104/proxy_pool, listening on port 5010)
def get_ip():
    proxies = {
        'https': requests.get('http://127.0.0.1:5010/get?type=https').json()["proxy"]
    }
    return proxies
cursor = conn.cursor()
# Send the request through the proxy
res = requests.get("https://www.autohome.com.cn/all/1/#liststart", proxies=get_ip())
# Parse the response
soup = BeautifulSoup(res.text, 'lxml')  # arg 1: markup, arg 2: parser
ul_list = soup.find_all(name='ul', class_='article')  # ul tags whose class contains "article"
print(len(ul_list))
for ul in ul_list:
    # Find every li under this ul
    li_list = ul.find_all(name="li")
    # Each li carries a title, an image, and so on
    for li in li_list:
        title_tag = li.find(name='h3')  # some li elements are ad slots with no h3
        if title_tag:
            title = title_tag.text
            article_url = "https:" + li.find(name='a').attrs.get("href")
            introduce = li.find(name='p')  # summary paragraph
            img = li.find(name='img').attrs.get("src")
            # eye = li.find(class_='icon12 icon12-eye').text  # view count
            # infor = li.find(class_='icon12 icon12-infor').text  # comment count
            cursor.execute('insert into new (title,img,article_url) values (%s,%s,%s)',
                           args=[title, img, article_url])
            print("success")
conn.commit()
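The insert above assumes the news database already holds a table named new. A minimal sketch of a schema that would satisfy it; the column types and lengths are my assumption, not from the source:

import pymysql

# Hypothetical schema for the `new` table the crawler writes into
conn = pymysql.connect(user='root', password="123456", host='127.0.0.1', database='news')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS new (
            id INT PRIMARY KEY AUTO_INCREMENT,
            title VARCHAR(255),
            img VARCHAR(512),
            article_url VARCHAR(512)
        )
    """)
conn.commit()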
bs4: traversing the document tree
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story<span>lqz</span></b><b>adfasdf<b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')
# 1 Usage: traverse with the . operator
# print(soup.html.body.a)
# print(soup.a)
# 2 Get the tag name
# print(soup.a.name)
# 3 Get the tag's attributes
# print(soup.a.attrs)
# print(soup.a.attrs.get("id"))
# 4 Get the tag's text
# print(soup.p.text)  # all text inside the tag, descendants included
# print(soup.p.string)  # only when the tag has a single child and no nested tags; returns its text
# print(soup.p.strings)  # generator over the text of all descendants
# 5 Nested selection: chain the dots
# print(soup.head.title.text)
# 6 Child nodes
# print(soup.p.contents)  # list of p's direct children
# print(soup.p.children)  # iterator over p's direct children
# 7 Parent nodes
# print(soup.p.parent)  # p's direct parent
# print(list(soup.p.parents))  # all of p's ancestors, up to the document itself
# 8 Sibling nodes
# print(soup.a.next_sibling)  # the next sibling
# print(soup.a.previous_sibling)  # the previous sibling
#
# print(soup.a.next_siblings)  # generator over all following siblings
# print(soup.a.previous_siblings)  # generator over all preceding siblings
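Worth noting when using the sibling calls above: the whitespace between tags counts as a text-node sibling, so next_sibling frequently returns a string rather than the next tag. A quick check against html_doc:

print(repr(soup.a.next_sibling))         # '\n' -- the newline between the two links
print(soup.a.next_sibling.next_sibling)  # <a ... id="link2">Lacie</a>
print(soup.a.find_next_sibling('a'))     # skips text nodes straight to the next a tag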
bs4: searching the document tree
# Five kinds of filters: string, regex, list, True, and function (function sketch below)
# 1 String
# soup.find(name='a')
# soup.find(string='Elsie').parent  # match by text, then step up to the enclosing tag
# # find() accepts name, id, class_, href, string -- any attribute
# # 2 Regex
# soup.find(class_=re.compile('^b'))
# 3 List
# soup.find_all(name=["b", "body"])
# 4 True
# print(soup.find_all(href=True))  # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
# print(soup.find_all(src=True, name='img'))
# Other arguments
# recursive=True: recurse into descendants (the default); recursive=False searches direct children only
# limit=None: cap how many matches are returned
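The fifth filter, a function, takes each tag and keeps those for which it returns True. A minimal sketch against the same html_doc, together with limit and recursive:

# 5 Function filter: bs4 calls it once per tag
def has_class_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')

print(soup.find_all(has_class_no_id))            # the three p tags (the a tags also carry an id)
print(soup.find_all('a', limit=2))               # stop after the first two matches
print(soup.body.find_all('a', recursive=False))  # [] -- the a tags sit inside p, not directly under body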
CSS selectors
'''
div      tag name
.cls     class name
#id      id selector
div a    a anywhere among div's descendants
div>a    a that is a direct child of div
'''
# print(soup.select('.sister'))  # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
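A few more select() variants on the same html_doc; select_one returns the first match directly instead of a list:

print(soup.select('#link1'))            # by id
print(soup.select('p.story > a'))       # direct a children of p.story
print(soup.select('a[href]'))           # a tags that have an href attribute
print(soup.select_one('.sister').text)  # Elsie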
selenium: basic usage
# requests only fetches raw HTTP responses; it cannot run the JavaScript behind ajax-loaded content
# selenium drives a real browser: instead of sending HTTP requests directly, code simulates a person operating the browser, so js loads automatically
# appium does the same for mobile devices
# 1 Download the matching chromedriver
# -https://registry.npmmirror.com/binary.html?path=chromedriver/
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
bro = webdriver.Chrome(executable_path='./chromedriver.exe')  # open the browser (Selenium 3 style; see the Selenium 4 note below)
bro.get('https://www.baidu.com/')
input_name = bro.find_element(By.ID, value='kw')
input_name.send_keys('小兔子')
submit = bro.find_element(By.ID, 'su')
submit.click()
time.sleep(20)
bro.close()
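One caveat: the executable_path argument was removed in Selenium 4.10, so on a current install the same setup goes through a Service object. A minimal sketch:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4 style: pass the driver path via Service...
bro = webdriver.Chrome(service=Service('./chromedriver.exe'))
# ...or omit it entirely and let Selenium Manager (4.6+) fetch a matching driver
# bro = webdriver.Chrome()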
Simulating a Baidu login
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get("https://www.baidu.com/")
bro.implicitly_wait(10)
login_name = bro.find_element(By.LINK_TEXT, '登录')
login_name.click()
# switch to the SMS-code login panel first, then over to username/password login
sms_tab = bro.find_element(By.ID, 'TANGRAM__PSP_11__changeSmsCodeItem')
sms_tab.click()
pwd_tab = bro.find_element(By.ID, 'TANGRAM__PSP_11__changePwdCodeItem')
pwd_tab.click()
username_input = bro.find_element(By.ID, 'TANGRAM__PSP_11__userName')
username_input.send_keys('')  # fill in your username
password_input = bro.find_element(By.ID, 'TANGRAM__PSP_11__password')
password_input.send_keys('')  # fill in your password
login_btn = bro.find_element(By.ID, 'TANGRAM__PSP_11__submit')
login_btn.click()
# a captcha appears at this point; it has to be solved before the login completes
input_name = bro.find_element(By.ID, value='kw')
input_name.send_keys('小兔子')
submit = bro.find_element(By.ID, 'su')
submit.click()
time.sleep(20)
bro.close()
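implicitly_wait sets one global timeout for every lookup; for a step like clicking the login link, an explicit wait on a concrete condition tends to be more reliable. A sketch of that step rewritten with the same locator as above:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block up to 10 seconds until the login link is clickable, then click it
wait = WebDriverWait(bro, 10)
login_name = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, '登录')))
login_name.click()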
selenium: other usage
# Headless mode (sketch after the locator list below)
# Locating elements
# from selenium import webdriver
# from selenium.webdriver.common.by import By
#
# bro = webdriver.Chrome(executable_path='./chromedriver.exe')
# bro.get('https://www.cnblogs.com/')
# bro.implicitly_wait(10)
# # find_element and find_elements also accept css selectors and xpath
# bro.find_element(By.ID)  # by id
# bro.find_element(By.NAME)  # by the name attribute
# bro.find_element(By.TAG_NAME)  # by tag name
# bro.find_element(By.LINK_TEXT)  # by exact link text
# # bro.find_element(By.PARTIAL_LINK_TEXT)  # by partial link text
# bro.find_element(By.CLASS_NAME)  # by class name
# bro.find_element(By.CSS_SELECTOR)  # by css selector
# bro.find_element(By.XPATH)  # by xpath
#
# bro.find_elements()  # return every match as a list
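Headless mode, mentioned above, is just a Chrome option. A minimal sketch; the --headless=new flag assumes a recent Chrome, older versions use plain --headless:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless=new')  # run without a visible browser window
bro = webdriver.Chrome(options=options)
bro.get('https://www.cnblogs.com/')
print(bro.title)
bro.close()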
# Cropping the captcha out of a screenshot
from selenium import webdriver
from selenium.webdriver.common.by import By
from PIL import Image
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.get('https://www.chaojiying.com/apiuser/login/')
bro.implicitly_wait(10)
bro.maximize_window()
tag = bro.find_element(By.CSS_SELECTOR,
'body > div.wrapper_danye > div > div.content_login > div.login_form > form > div > img')
src = tag.get_attribute('src')
size = tag.size
print(size)
location = tag.location
print(location)
bro.save_screenshot('my.png')  # full-page screenshot; returns True on success
img = Image.open('./my.png')
# The 1.5 multiplier is the OS display scale factor (Windows at 150% zoom here);
# it must match your own screen scaling or the crop will be offset
crop_box = (int(location['x']) * 1.5, int(location['y']) * 1.5,
            int(location['x'] + size['width']) * 1.5,
            int(location['y'] + size['height']) * 1.5)
print(crop_box)
captcha = img.crop(crop_box)
captcha.save('se.png')
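The hard-coded 1.5 only works when the display scale happens to be 150%; the browser can report its actual ratio, so a more portable variant (using the same tag location and screenshot as above) reads it at runtime:

# Ask the browser for the real pixel ratio instead of hard-coding 1.5
ratio = bro.execute_script('return window.devicePixelRatio')
crop_box = (location['x'] * ratio, location['y'] * ratio,
            (location['x'] + size['width']) * ratio,
            (location['y'] + size['height']) * ratio)
img.crop(crop_box).save('se.png')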
bro.close()