A collection of Python scripts
Sending email
from email.mime.text import MIMEText
from email.header import Header
from smtplib import SMTP_SSL
# QQ Mail SMTP server
host_server = 'smtp.qq.com'
# sender_qq is the sender's QQ number
sender_qq = ''
# pwd is the QQ Mail authorization code (not the account password)
pwd = ''
# Sender's email address
sender_qq_mail = ''
# Recipient's email address
receiver = ''
# Body of the email
mail_content = 'Hello, I am from Zhihu, and this is a test of logging in to QQ Mail and sending an email with Python'
# Email subject
mail_title = 'Test mail'
# Log in over SSL
smtp = SMTP_SSL(host_server)
# set_debuglevel() is for debugging: 1 turns debug output on, 0 turns it off
smtp.set_debuglevel(1)
smtp.ehlo(host_server)
smtp.login(sender_qq, pwd)
msg = MIMEText(mail_content, "plain", 'utf-8')
msg["Subject"] = Header(mail_title, 'utf-8')
msg["From"] = sender_qq_mail
msg["To"] = receiver
smtp.sendmail(sender_qq_mail, receiver, msg.as_string())
smtp.quit()
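For reuse in other scripts, the same SMTP_SSL flow can be wrapped in a small helper. This is a minimal sketch rather than part of the original script (it relies on the imports above); the function name and parameters are made up, and login still expects the QQ Mail authorization code, not the account password.
def send_qq_mail(sender_qq, auth_code, sender_mail, receiver, title, content, host='smtp.qq.com'):
    # build a plain-text message
    msg = MIMEText(content, 'plain', 'utf-8')
    msg['Subject'] = Header(title, 'utf-8')
    msg['From'] = sender_mail
    msg['To'] = receiver
    # SMTP_SSL works as a context manager, so quit() is called automatically
    with SMTP_SSL(host) as smtp:
        smtp.login(sender_qq, auth_code)  # auth_code: the QQ Mail authorization code (assumed)
        smtp.sendmail(sender_mail, receiver, msg.as_string())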
Converting COCO data to YOLO
# Convert a COCO-format dataset to YOLO format
# --json_path  path of the input json file
# --save_path  output folder name; defaults to labels under the current directory
import os
import json
from tqdm import tqdm
import argparse
parser = argparse.ArgumentParser()
# Change this to the location of your own json file
parser.add_argument('--json_path', default='wildlife_instance_test2017.json',type=str, help="input: coco format(json)")
# Where the .txt label files are saved
parser.add_argument('--save_path', default='Lable/test', type=str, help="specify where to save the output dir of labels")
arg = parser.parse_args()
def convert(size, box):
dw = 1. / (size[0])
dh = 1. / (size[1])
x = box[0] + box[2] / 2.0
y = box[1] + box[3] / 2.0
w = box[2]
h = box[3]
    # round() controls how many decimal places (x, y, w, h) keep
x = round(x * dw, 6)
w = round(w * dw, 6)
y = round(y * dh, 6)
h = round(h * dh, 6)
return (x, y, w, h)
def check_charset(file_path):
import chardet
with open(file_path, "rb") as f:
data = f.read(4)
charset = chardet.detect(data)['encoding']
return charset
if __name__ == '__main__':
    json_file = arg.json_path  # COCO Object Instance style annotation file
    ana_txt_save_path = arg.save_path  # output directory
data = json.load(open(file=json_file,encoding='ISO-8859-1', mode='r'))
if not os.path.exists(ana_txt_save_path):
os.makedirs(ana_txt_save_path)
    id_map = {}  # COCO category ids are not contiguous, so remap them before writing
with open(os.path.join(ana_txt_save_path, 'classes.txt'), 'w') as f:
        # write classes.txt
for i, category in enumerate(data['categories']):
f.write(f"{category['name']}\n")
id_map[category['id']] = i
# print(id_map)
    # Change the file that the image relative paths are written to, as needed.
list_file = open(os.path.join(ana_txt_save_path, 'test2017.txt'), 'w')
for img in tqdm(data['images']):
filename = img["file_name"]
img_width = img["width"]
img_height = img["height"]
        img_id = img["id"]
head, tail = os.path.splitext(filename)
        ana_txt_name = head + ".txt"  # matching txt file name, same stem as the jpg
f_txt = open(os.path.join(ana_txt_save_path, ana_txt_name), 'w')
for ann in data['annotations']:
if ann['image_id'] == img_id:
box = convert((img_width, img_height), ann["bbox"])
f_txt.write("%s %s %s %s %s\n" % (id_map[ann["category_id"]], box[0], box[1], box[2], box[3]))
f_txt.close()
        # write the image's relative path into the train2017/val2017/test2017 list file
list_file.write('./images/test2017/%s.jpg\n' %(head))
list_file.close()
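For reference, COCO stores a bbox as [x_min, y_min, width, height] in pixels, while a YOLO label line holds normalized [x_center, y_center, width, height]. A quick sanity check of convert() with made-up numbers (illustration only):
# a 200x100 box whose top-left corner is at (100, 50) inside a 640x480 image
print(convert((640, 480), [100, 50, 200, 100]))
# -> (0.3125, 0.208333, 0.3125, 0.208333)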
Detecting a simple object and cropping it
import cv2
import numpy as np
image = cv2.imread("1 (1).JPG")
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
gradX = cv2.Sobel(gray, ddepth=cv2.CV_32F, dx=1, dy=0, ksize=-1)
gradY = cv2.Sobel(gray, ddepth=cv2.CV_32F, dx=0, dy=1, ksize=-1)
# subtract the y-gradient from the x-gradient
gradient = cv2.subtract(gradX, gradY)
gradient = cv2.convertScaleAbs(gradient)
# blur and threshold the image
blurred = cv2.blur(gradient, (9, 9))
(_, thresh) = cv2.threshold(blurred, 90, 255, cv2.THRESH_BINARY)
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 25))
closed = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
# perform a series of erosions and dilations
closed = cv2.erode(closed, None, iterations=4)
closed = cv2.dilate(closed, None, iterations=4)
(cnts, _) = cv2.findContours(closed.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
c = sorted(cnts, key=cv2.contourArea, reverse=True)[0]
# compute the rotated bounding box of the largest contour
rect = cv2.minAreaRect(c)
box = np.int0(cv2.boxPoints(rect))
# draw a bounding box around the detected region and display the image
# cv2.drawContours(image, [box], -1, (0, 255, 0), 3)
# cv2.imshow("Image", image)
# cv2.imwrite("contoursImage2.jpg", image)
# cv2.waitKey(0)
Xs = [i[0] for i in box]
Ys = [i[1] for i in box]
x1 = min(Xs)
x2 = max(Xs)
y1 = min(Ys)
y2 = max(Ys)
hight = y2 - y1
width = x2 - x1
cropImg = image[y1-10:y1+hight+10, x1-10:x1+width+10]
cv2.imwrite("contoursImage3.jpg", cropImg)
Getting the files in a folder
import os
def get_file_name(path,filetype):
pathList=[]
for root,dirs,files in os.walk(path):
for file in files:
if file.endswith(filetype):
pathList.append(file)
return pathList
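Note that get_file_name() returns bare file names, not full paths, so callers have to prepend the folder themselves. A small variant (a sketch, not part of the original) that returns full paths instead:
def get_file_path(path, filetype):
    path_list = []
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith(filetype):
                path_list.append(os.path.join(root, file))  # keep the full path
    return path_list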
Data acquisition
import cv2
import numpy as np
import os
def get_file_name(path,filetype):
pathList=[]
for root,dirs,files in os.walk(path):
for file in files:
if file.endswith(filetype):
pathList.append(file)
return pathList
def control_photo(ori_path,save_path):
image = cv2.imread(ori_path)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
gradX = cv2.Sobel(gray, ddepth=cv2.CV_32F, dx=1, dy=0, ksize=-1)
gradY = cv2.Sobel(gray, ddepth=cv2.CV_32F, dx=0, dy=1, ksize=-1)
# subtract the y-gradient from the x-gradient
gradient = cv2.subtract(gradX, gradY)
gradient = cv2.convertScaleAbs(gradient)
# blur and threshold the image
blurred = cv2.blur(gradient, (9, 9))
(_, thresh) = cv2.threshold(blurred, 90, 255, cv2.THRESH_BINARY)
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 25))
closed = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
# perform a series of erosions and dilations
closed = cv2.erode(closed, None, iterations=4)
closed = cv2.dilate(closed, None, iterations=4)
(cnts, _) = cv2.findContours(closed.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
c = sorted(cnts, key=cv2.contourArea, reverse=True)[0]
# compute the rotated bounding box of the largest contour
rect = cv2.minAreaRect(c)
box = np.int0(cv2.boxPoints(rect))
    # draw a bounding box around the detected region and display the image
# cv2.drawContours(image, [box], -1, (0, 255, 0), 3)
# cv2.imshow("Image", image)
# cv2.imwrite("contoursImage2.jpg", image)
# cv2.waitKey(0)
Xs = [i[0] for i in box]
Ys = [i[1] for i in box]
x1 = min(Xs)
x2 = max(Xs)
y1 = min(Ys)
y2 = max(Ys)
hight = y2 - y1
width = x2 - x1
cropImg = image[y1-10:y1+hight+10, x1-10:x1+width+10]
cv2.imwrite(save_path, cropImg)
if __name__ == '__main__':
for img_name in get_file_name("2A",".JPG"):
control_photo("2A/"+img_name,"cut_"+img_name)
# dataset config, adapted from the COCO 2017 template (http://cocodataset.org)
# download command/URL (optional)
download: bash ./scripts/get_coco.sh
# train and val data as 1) directory: path/images/, 2) file: path/images.txt, or 3) list: [path1/images/, path2/images/]
train: ./data/lote/images/train
val: ./data/lote/images/val
test: ./data/lote/images/test
# number of classes
nc: 11
# class names
names: ['Pandas大熊猫','RedPandas小熊猫','MartesFlavigula黄喉鸬','MacacaThibetana藏猕猴','RhinopithecusRoxellana金丝猴','HystrixBrachyura豪猪','SusScrofa野猪','RusaUnicolor水鹿','ElaphodusCephalophus簇绒鹿','CapricornisMilneedwardsii中华鬣羚','PseudoisNayaur青羊']
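The YAML above is a YOLOv5/YOLOv7-style dataset config for a custom 11-class wildlife dataset. A tiny sanity check before training (a sketch; the file name data/lote.yaml is assumed) that nc matches the number of class names:
import yaml
with open('data/lote.yaml', encoding='utf-8') as f:
    cfg = yaml.safe_load(f)
assert cfg['nc'] == len(cfg['names']), 'nc does not match the number of class names'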
Saving evaluation results to a txt file
import torch
import os
from model.model import parsingNet
from utils.common import merge_config
from utils.dist_utils import dist_print
from evaluation.eval_wrapper import eval_lane
def get_file_name(path,filetype):
pathList=[]
for root,dirs,files in os.walk(path):
for file in files:
if file.endswith(filetype):
pathList.append(file)
return pathList
def save_txt(save_dir,str_info):
path=save_dir+"/tusimple_eval.txt"
os.makedirs(os.path.dirname(path),exist_ok=True)
with open(path,"a") as f:
f.write(str_info)
if __name__ == "__main__":
torch.backends.cudnn.benchmark = True
cfg_test_model_path= "1115_1422_b32_zbt_mas_cssc_b_test_2"
cfg_backbone='18p'
cfg_dataset = 'Tusimple'
cls_num_per_lane = 56
cfg_griding_num=100
cfg_num_lanes=4
cfg_test_work_dir="./tmp"
    cfg_data_root = r"D:\pythoncode\dataset\Tusimple"
distributed = False
if 'WORLD_SIZE' in os.environ:
distributed = int(os.environ['WORLD_SIZE']) > 1
net = parsingNet(pretrained = False, backbone=cfg_backbone,cls_dim = (cfg_griding_num+1,cls_num_per_lane, cfg_num_lanes),
                    use_aux=False).cuda()  # we don't need auxiliary segmentation in testing
model_strlist=get_file_name(cfg_test_model_path,"pth")
for model_str in model_strlist:
        cfg_test_model_temp = cfg_test_model_path + "/" + model_str
        # evaluate each checkpoint in this loop
state_dict = torch.load(cfg_test_model_temp, map_location = 'cpu')['model']
compatible_state_dict = {}
for k, v in state_dict.items():
if 'module.' in k:
compatible_state_dict[k[7:]] = v
else:
compatible_state_dict[k] = v
net.load_state_dict(compatible_state_dict, strict = False)
if not os.path.exists(cfg_test_work_dir):
os.mkdir(cfg_test_work_dir)
strlist=eval_lane(net, cfg_dataset, cfg_data_root, cfg_test_work_dir, cfg_griding_num, False, distributed)
model_savestr=model_str+"\n"+strlist
save_txt(cfg_test_model_path,model_savestr)
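Minor note: get_file_name() yields checkpoints in whatever order the filesystem returns them, so results are appended to tusimple_eval.txt in that order. Sorting the list first (a one-line sketch) keeps the output deterministic:
model_strlist = sorted(get_file_name(cfg_test_model_path, "pth"))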
Getting your own QQ Zone posts (shuoshuo) and album photos
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from urllib import request
def login(login_qq,password, business_qq):
    '''
    Log in
    :param login_qq: QQ number used to log in
    :param password: password of the login QQ
    :param business_qq: target (business) QQ number whose Qzone is visited
    :return: driver
    '''
driver = webdriver.Edge()
driver.get('https://user.qzone.qq.com/{}/311'.format(business_qq)) # URL
    driver.implicitly_wait(10)  # implicit wait, giving the page time to load fully
    driver.find_element_by_id('login_div')
    driver.switch_to.frame('login_frame')  # switch to the frame with the account/password inputs
    driver.find_element_by_id('switcher_plogin').click()  # click 'log in with account and password'
    driver.find_element_by_id('u').clear()  # clear the account field
    driver.find_element_by_id('u').send_keys(login_qq)  # type the account
    driver.find_element_by_id('p').clear()  # clear the password field
    driver.find_element_by_id('p').send_keys(password)  # type the password
    driver.find_element_by_id('login_button').click()  # click 'Log in'
driver.switch_to.default_content()
driver.implicitly_wait(10)
time.sleep(5)
try:
driver.find_element_by_id('QM_OwnerInfo_Icon')
return driver
except:
        print('Cannot access ' + business_qq)
return None
def get_shuoshuo(driver):
page = 1
while True:
        # scroll the page down
for j in range(1, 5):
driver.execute_script("window.scrollBy(0,5000)")
time.sleep(2)
        # switch into the content frame
        driver.switch_to.frame('app_canvas_frame')
        # build the BeautifulSoup object
        bs = BeautifulSoup(driver.page_source.encode('GBK', 'ignore').decode('gbk'), 'html.parser')
        # find all posts (shuoshuo) on the page
pres = bs.find_all('pre', class_='content')
for pre in pres:
shuoshuo = pre.text
tx = pre.parent.parent.find('a', class_="c_tx c_tx3 goDetail")['title']
print(tx + ":" + shuoshuo)
        # page count check
page = page + 1
maxPage = bs.find('a', title='末页').text
if int(maxPage) < page:
break
driver.find_element_by_link_text(u'下一页').click()
        # back to the main document
        driver.switch_to.default_content()
        # wait for the page to load
time.sleep(3)
def get_photo(driver):
    # download path for photos
    photo_path = "C:/Users/xxx/Desktop/photo/{}/{}.jpg"
    # album index
    photoIndex = 1
while True:
        # back to the main document
        driver.switch_to.default_content()
        # driver.switch_to.parent_frame()
        # click the albums button in the page header
        driver.find_element_by_xpath('//*[@id="menuContainer"]/div/ul/li[3]/a').click()
        # wait for loading
        driver.implicitly_wait(10)
        time.sleep(3)
        # switch frame
        driver.switch_to.frame('app_canvas_frame')
        # links to each album
        a = driver.find_elements_by_class_name('album-cover')
        # open a single album
a[photoIndex].click()
driver.implicitly_wait(10)
time.sleep(3)
        # first photo in the album
        p = driver.find_elements_by_class_name('item-cover')[0]
        p.click()
        time.sleep(3)
        # the full-size photo is in the parent frame, so switch to it
        driver.switch_to.parent_frame()
        # loop over the photos in the album
while True:
            # photo url and name
            img = driver.find_element_by_id('js-img-disp')
            src = img.get_attribute('src').replace('&t=5', '')
            name = driver.find_element_by_id("js-photo-name").text
            # download
            request.urlretrieve(src, photo_path.format(qq, name))
            # read the 'current photo / total photos' counter under the image
            counts = driver.find_element_by_xpath('//*[@id="js-ctn-infoBar"]/div/div[1]/span').text
            counts = counts.split('/')
            # exit the photo viewer at the last photo
            if int(counts[0]) == int(counts[1]):
                # the X button in the top-right corner
                driver.find_element_by_xpath('//*[@id="js-viewer-main"]/div[1]/a').click()
                break
            # click 'next photo'; the page loads slowly, so retry up to 10 times
            for i in range(1, 10):
if driver.find_element_by_id('js-btn-nextPhoto'):
n = driver.find_element_by_id('js-btn-nextPhoto')
ActionChains(driver).click(n).perform()
break
else:
time.sleep(5)
        # check whether all albums have been downloaded
photoIndex = photoIndex + 1
if len(a) <= photoIndex:
break
if __name__ == '__main__':
login_qq="xxxx"
password="xxx"
business_qq="xxx"
driver=login(login_qq,password,business_qq )
get_shuoshuo(driver)
print("完成")
## Version 2
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
"""A crawler that downloads every photo in your QQ Zone albums"""
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import WebDriverException
import os
import re
import sys
import time
import logging
import requests
from json import loads
class qqzone(object):
"""QQ空间相册爬虫"""
def __init__(self, user):
self.username = user['username']
self.password = user['password']
@staticmethod
def get_path(album_name):
home_path = os.path.expanduser('~')
path = os.path.join(home_path, 'Pictures/python/qqzone', album_name)
if not os.path.isdir(path):
os.makedirs(path)
return path
def _login_and_get_args(self):
"""登录QQ,获取Cookies和g_tk"""
opt = webdriver.EdgeOptions()
# opt.set_headless()
driver = webdriver.Edge()
driver.get('https://i.qq.com/')
logging.info('User {} login...'.format(self.username))
driver.switch_to.frame('login_frame')
driver.find_element_by_id('switcher_plogin').click()
driver.find_element_by_id('u').clear()
driver.find_element_by_id('u').send_keys(self.username)
driver.find_element_by_id('p').clear()
driver.find_element_by_id('p').send_keys(self.password)
driver.find_element_by_id('login_button').click()
time.sleep(1)
driver.get('https://user.qzone.qq.com/{}'.format(self.username))
try:
logging.info('Getting g_tk...')
self.g_tk = driver.execute_script(
'return QZONE.FP.getACSRFToken()')
logging.debug('g_tk: {}'.format(self.g_tk))
except WebDriverException:
logging.error(
'Getting g_tk failed, please check your QQ number and password')
driver.close()
driver.quit()
sys.exit(1)
logging.info('Getting Cookies...')
self.cookies = driver.get_cookies()
driver.close()
driver.quit()
def _init_session(self):
self.session = requests.Session()
for cookie in self.cookies:
self.session.cookies.set(cookie['name'], cookie['value'])
self.session.headers = {
'Referer': 'https://qzs.qq.com/qzone/photo/v7/page/photo.html?init=photo.v7/module/albumList/index&navBar=1',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'
}
def _get_query_for_request(self, topicId=None, pageStart=0, pageNum=100):
"""获取请求相册信息或照片信息所需的参数
Args:
topicId: 每个相册对应的唯一标识符
pageStart: 请求某个相册的照片列表信息所需的起始页码
pageNum: 单次请求某个相册的照片数量
Returns:
一个组合好所有请求参数的字符串
"""
query = {
'g_tk': self.g_tk,
'hostUin': self.username,
'uin': self.username,
'appid': 4,
'inCharset': 'utf-8',
'outCharset': 'utf-8',
'source': 'qzone',
'plat': 'qzone',
'format': 'jsonp'
}
if topicId:
query['topicId'] = topicId
query['pageStart'] = pageStart
query['pageNum'] = pageNum
return '&'.join('{}={}'.format(key, val) for key, val in query.items())
def _load_callback_data(self, resp):
"""以json格式解析返回的jsonp数据"""
try:
resp.encoding = 'utf-8'
data = loads(re.search(r'.*?\(({.*}).*?\).*', resp.text, re.S)[1])
return data
except ValueError:
logging.error('Invalid input')
def _get_ablum_list(self):
"""获取相册的列表信息"""
album_url = 'https://h5.qzone.qq.com/proxy/domain/photo.qzone.qq.com/fcgi-bin/fcg_list_album_v3?' + \
self._get_query_for_request()
        logging.info('Getting album list id...')
resp = self.session.get(album_url)
data = self._load_callback_data(resp)
album_list = {}
for item in data['data']['albumListModeSort']:
album_list[item['name']] = item['id']
return album_list
def _get_photo(self, album_name, album_id):
"""获取单个相册的照片列表信息,并下载该相册所有照片"""
photo_list_url = 'https://h5.qzone.qq.com/proxy/domain/photo.qzone.qq.com/fcgi-bin/cgi_list_photo?' + \
self._get_query_for_request(topicId=album_id)
logging.info('Getting photo list for album {}...'.format(album_name))
resp = self.session.get(photo_list_url)
data = self._load_callback_data(resp)
if data['data']['totalInPage'] == 0:
return None
file_dir = self.get_path(album_name)
for item in data['data']['photoList']:
path = '{}/{}.jpg'.format(file_dir, item['name'])
logging.info('Downloading {}-{}'.format(album_name, item['name']))
self._download_image(item['url'], path)
def _download_image(self, url, path):
"""下载单张照片"""
try:
print(path)
resp = self.session.get(url, timeout=15)
if resp.status_code == 200:
open(path, 'wb').write(resp.content)
except requests.exceptions.Timeout:
logging.warning('get {} timeout'.format(url))
except requests.exceptions.ConnectionError as e:
            logging.error(str(e))
finally:
pass
def start(self):
"""爬虫的入口函数"""
self._login_and_get_args()
self._init_session()
album_list = self._get_ablum_list()
for name, id in album_list.items():
self._get_photo(name, id)
def get_user():
"""从终端获取用户输入的QQ号及密码"""
username = input('please input QQ number: ').strip()
if not re.match(r'^[1-9][0-9]{4,9}$', username):
logging.error('\033[31mQQ number is wrong!\033[0m')
sys.exit(1)
import getpass
password = getpass.getpass('password: ')
return {
'username': username,
'password': password
}
def main():
FORMAT = '%(asctime)s [%(levelname)s] %(message)s'
logging.basicConfig(format=FORMAT, level=logging.INFO)
    # default QQ account info
user = {
'username': 'xxx',
'password': 'xxx'
}
    # pass -d to use the default account above; edit the defaults as needed
if not (len(sys.argv) > 1 and sys.argv[1] == '-d'):
user = get_user()
qz = qqzone(user)
qz.start()
if __name__ == '__main__':
main()
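For reference, _load_callback_data unwraps the jsonp responses returned by the Qzone endpoints. A quick illustration with an invented payload (the callback name and content are made up):
import re
from json import loads
sample = 'shine0_Callback({"code": 0, "data": {"albumListModeSort": []}});'
payload = loads(re.search(r'.*?\(({.*}).*?\).*', sample, re.S)[1])
print(payload['data'])  # -> {'albumListModeSort': []}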