python脚本收集

发送邮件

from email.mime.text import MIMEText
from email.header import Header
from smtplib import SMTP_SSL


# Send one plain-text e-mail through the QQ Mail SMTP server over SSL.

# QQ Mail SMTP server.
host_server = 'smtp.qq.com'
# QQ number of the sender; used as the SMTP login name.
sender_qq = ''
# Authorization code of the QQ mailbox; used as the SMTP login password.
pwd = ''
# Sender's e-mail address.
sender_qq_mail = ''
# Recipient's e-mail address.
receiver = ''
# Body text of the message.
mail_content = '你好,我是来自知乎的 ,现在在进行一项用python登录qq邮箱发邮件的测试'
# Subject of the message.
mail_title = '邮件'

# Build the message before connecting so the connection is only held open
# for the actual send.
msg = MIMEText(mail_content, "plain", 'utf-8')
msg["Subject"] = Header(mail_title, 'utf-8')
msg["From"] = sender_qq_mail
msg["To"] = receiver

# Log in over SSL. The context manager sends QUIT and closes the socket even
# when login() or sendmail() raises; the original only called quit() on the
# success path and leaked the connection otherwise.
with SMTP_SSL(host_server) as smtp:
    # set_debuglevel(1) turns debug output on; 0 turns it off.
    smtp.set_debuglevel(1)
    smtp.ehlo(host_server)
    smtp.login(sender_qq, pwd)
    smtp.sendmail(sender_qq_mail, receiver, msg.as_string())

coco数据提取yolo

#COCO 格式的数据集转化为 YOLO 格式的数据集
#--json_path 输入的json文件路径
#--save_path 保存的文件夹名字,默认为当前目录下的labels。

import os
import json
from tqdm import tqdm
import argparse

# Command-line interface of the COCO -> YOLO converter.
parser = argparse.ArgumentParser()
# Input annotation file; change the default to the location of your own JSON.
parser.add_argument(
    '--json_path',
    type=str,
    default='wildlife_instance_test2017.json',
    help="input: coco format(json)",
)
# Directory that receives the generated .txt files.
parser.add_argument(
    '--save_path',
    type=str,
    default='Lable/test',
    help="specify where to save the output dir of labels",
)
arg = parser.parse_args()

def convert(size, box):
    """Normalize a box by the image size and express it by its center.

    Args:
        size: ``(width, height)`` of the image.
        box: four numbers; the arithmetic treats them as
            ``(x_min, y_min, box_width, box_height)``.

    Returns:
        ``(x_center, y_center, width, height)``, each divided by the matching
        image dimension and rounded to 6 decimal places.
    """
    scale_x = 1. / size[0]
    scale_y = 1. / size[1]
    # Move from the top-left corner to the box center.
    center_x = box[0] + box[2] / 2.0
    center_y = box[1] + box[3] / 2.0
    # round() fixes the number of decimals written to the label file.
    return (
        round(center_x * scale_x, 6),
        round(center_y * scale_y, 6),
        round(box[2] * scale_x, 6),
        round(box[3] * scale_y, 6),
    )

# coding=utf-8
def check_charset(file_path):
    """Guess the encoding of a file from its first four bytes.

    Args:
        file_path: path of the file to inspect.

    Returns:
        The ``'encoding'`` entry of ``chardet.detect`` for those bytes.
    """
    import chardet
    with open(file_path, "rb") as handle:
        head = handle.read(4)
    return chardet.detect(head)['encoding']
 
if __name__ == '__main__':
    # Convert a COCO instance-annotation JSON into one YOLO label file per
    # image, plus classes.txt and a list of image paths.
    from collections import defaultdict

    json_file = arg.json_path  # COCO "Object Instance" annotation file
    ana_txt_save_path = arg.save_path  # output directory

    # NOTE(review): the ISO-8859-1 encoding is kept from the original; if the
    # JSON is really UTF-8, non-ASCII category names will presumably come out
    # garbled - confirm against the actual file.
    with open(file=json_file, encoding='ISO-8859-1', mode='r') as json_fp:
        data = json.load(json_fp)
    os.makedirs(ana_txt_save_path, exist_ok=True)

    # COCO category ids are not contiguous, so remap them to 0..n-1 in the
    # order the categories are listed.
    id_map = {}
    with open(os.path.join(ana_txt_save_path, 'classes.txt'), 'w') as f:
        for i, category in enumerate(data['categories']):
            f.write(f"{category['name']}\n")
            id_map[category['id']] = i

    # Group the annotations by image once instead of rescanning the whole
    # annotation list for every image.
    anns_by_image = defaultdict(list)
    for ann in data['annotations']:
        anns_by_image[ann['image_id']].append(ann)

    # Adjust the name of this image-list file to your needs.
    with open(os.path.join(ana_txt_save_path, 'test2017.txt'), 'w') as list_file:
        for img in tqdm(data['images']):
            img_size = (img["width"], img["height"])
            # The label file shares the image's name, with a .txt extension.
            head = os.path.splitext(img["file_name"])[0]
            label_path = os.path.join(ana_txt_save_path, head + ".txt")
            with open(label_path, 'w') as f_txt:
                for ann in anns_by_image.get(img["id"], ()):
                    box = convert(img_size, ann["bbox"])
                    f_txt.write("%s %s %s %s %s\n" % (id_map[ann["category_id"]], box[0], box[1], box[2], box[3]))
            # Record the relative path of the image.
            list_file.write('./images/test2017/%s.jpg\n' % (head))

检测简单物体并裁剪

import cv2
import numpy as np

# Find the largest high-gradient region of one image and save a crop of it.
image = cv2.imread("1 (1).JPG")
if image is None:
    # cv2.imread reports failure by returning None instead of raising.
    raise OSError('cannot read image: 1 (1).JPG')
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
gradX = cv2.Sobel(gray, ddepth=cv2.CV_32F, dx=1, dy=0, ksize=-1)
gradY = cv2.Sobel(gray, ddepth=cv2.CV_32F, dx=0, dy=1, ksize=-1)
# subtract the y-gradient from the x-gradient
gradient = cv2.subtract(gradX, gradY)
gradient = cv2.convertScaleAbs(gradient)
# blur and threshold the image
blurred = cv2.blur(gradient, (9, 9))
(_, thresh) = cv2.threshold(blurred, 90, 255, cv2.THRESH_BINARY)
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 25))
closed = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
# perform a series of erosions and dilations
closed = cv2.erode(closed, None, iterations=4)
closed = cv2.dilate(closed, None, iterations=4)
# findContours returns 3 values on OpenCV 3.x and 2 on 4.x; the contour list
# is the second-to-last item in both cases.
cnts = cv2.findContours(closed.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
if len(cnts) == 0:
    raise ValueError('no contour found in image: 1 (1).JPG')
c = max(cnts, key=cv2.contourArea)
# compute the rotated bounding box of the largest contour
rect = cv2.minAreaRect(c)
# np.int0 was removed in NumPy 2.0; it was an alias of np.intp.
box = cv2.boxPoints(rect).astype(np.intp)
Xs = [i[0] for i in box]
Ys = [i[1] for i in box]
x1 = min(Xs)
x2 = max(Xs)
y1 = min(Ys)
y2 = max(Ys)
hight = y2 - y1
width = x2 - x1
# Clamp the start of each slice at 0: a negative start would count from the
# far edge of the image and produce a wrong or empty crop.
cropImg = image[max(y1 - 10, 0):y1 + hight + 10, max(x1 - 10, 0):x1 + width + 10]
cv2.imwrite("contoursImage3.jpg", cropImg)

获取文件夹下文件

def get_file_name(path,filetype):
    """List the names of files under *path* whose name ends with *filetype*.

    The directory tree is walked with ``os.walk``; only the bare file names
    are returned (no directory part), in the order they are visited.
    """
    return [
        name
        for _root, _dirs, names in os.walk(path)
        for name in names
        if name.endswith(filetype)
    ]

数据获取

import cv2
import numpy as np

import os

def get_file_name(path,filetype):
    """Return the names of all files below *path* that end with *filetype*.

    Sub-directories are included via ``os.walk``; the result holds file names
    only, without their directory.
    """
    matched = []
    for _, _, names in os.walk(path):
        matched.extend(name for name in names if name.endswith(filetype))
    return matched

def control_photo(ori_path,save_path,margin=10):
    """Crop an image to its largest high-gradient region and save the crop.

    Args:
        ori_path: path of the image to read.
        save_path: path the cropped image is written to.
        margin: number of pixels added on every side of the detected box
            (default 10, the value that used to be hard-coded).

    Raises:
        OSError: if the image cannot be read.
        ValueError: if no contour is found in the processed image.
    """
    image = cv2.imread(ori_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise OSError('cannot read image: {}'.format(ori_path))
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    gradX = cv2.Sobel(gray, ddepth=cv2.CV_32F, dx=1, dy=0, ksize=-1)
    gradY = cv2.Sobel(gray, ddepth=cv2.CV_32F, dx=0, dy=1, ksize=-1)
    # subtract the y-gradient from the x-gradient
    gradient = cv2.subtract(gradX, gradY)
    gradient = cv2.convertScaleAbs(gradient)
    # blur and threshold the image
    blurred = cv2.blur(gradient, (9, 9))
    (_, thresh) = cv2.threshold(blurred, 90, 255, cv2.THRESH_BINARY)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 25))
    closed = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
    # perform a series of erosions and dilations
    closed = cv2.erode(closed, None, iterations=4)
    closed = cv2.dilate(closed, None, iterations=4)
    # findContours returns 3 values on OpenCV 3.x and 2 on 4.x; the contour
    # list is the second-to-last item in both cases.
    cnts = cv2.findContours(closed.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    if len(cnts) == 0:
        raise ValueError('no contour found in image: {}'.format(ori_path))
    c = max(cnts, key=cv2.contourArea)
    # compute the rotated bounding box of the largest contour
    rect = cv2.minAreaRect(c)
    # np.int0 was removed in NumPy 2.0; it was an alias of np.intp.
    box = cv2.boxPoints(rect).astype(np.intp)
    Xs = [i[0] for i in box]
    Ys = [i[1] for i in box]
    x1 = min(Xs)
    x2 = max(Xs)
    y1 = min(Ys)
    y2 = max(Ys)
    # Clamp the start of each slice at 0: a negative start would count from
    # the far edge of the image and produce a wrong or empty crop.
    top = max(y1 - margin, 0)
    left = max(x1 - margin, 0)
    cropImg = image[top:y2 + margin, left:x2 + margin]
    cv2.imwrite(save_path, cropImg)


if __name__ == '__main__':
    # Crop every ".JPG" file found under "2A" and write the result to the
    # current directory with a "cut_" prefix.
    jpg_names = get_file_name("2A", ".JPG")
    for img_name in jpg_names:
        source_file = "2A/" + img_name
        cropped_file = "cut_" + img_name
        control_photo(source_file, cropped_file)

# Dataset configuration. The header comments and image counts appear to be
# carried over from a COCO 2017 template (http://cocodataset.org).
# NOTE(review): the paths below point at ./data/lote, so the COCO image counts
# and the download script presumably do not describe this dataset - confirm.

# download command/URL (optional)
download: bash ./scripts/get_coco.sh

# train and val data as 1) directory: path/images/, 2) file: path/images.txt, or 3) list: [path1/images/, path2/images/]
train: ./data/lote/images/train  # 118287 images
val: ./data/lote/images/val  # 5000 images
test: ./data/lote/images/test  # 20288 of 40670 images, submit to https://competitions.codalab.org/competitions/20794

# number of classes (the `names` list below has 11 entries)
nc: 11

# class names, in class-index order
names: ['Pandas大熊猫','RedPandas小熊猫','MartesFlavigula黄喉鸬','MacacaThibetana藏猕猴','RhinopithecusRoxellana金丝猴','HystrixBrachyura豪猪','SusScrofa野猪','RusaUnicolor水鹿','ElaphodusCephalophus簇绒鹿','CapricornisMilneedwardsii中华鬣羚','PseudoisNayaur青羊']

存储txt信息

import torch, os
from model.model import parsingNet
from utils.common import merge_config
from utils.dist_utils import dist_print
from evaluation.eval_wrapper import eval_lane
import torch
import os

def get_file_name(path,filetype):
    """Gather the names of files under *path* that end with *filetype*.

    Walks the tree with ``os.walk`` and returns bare file names (directory
    part dropped) in visiting order.
    """
    return [
        entry
        for _, _, entries in os.walk(path)
        for entry in entries
        if entry.endswith(filetype)
    ]

def save_txt(save_dir,str_info):
    """Append *str_info* to ``<save_dir>/tusimple_eval.txt``.

    The directory is created first when it does not exist yet.
    """
    target = save_dir + "/tusimple_eval.txt"
    os.makedirs(os.path.dirname(target), exist_ok=True)
    handle = open(target, "a")
    try:
        handle.write(str_info)
    finally:
        handle.close()
        

if __name__ == "__main__":
    # Evaluate every checkpoint found in cfg_test_model_path and append each
    # result to tusimple_eval.txt in that same directory.
    torch.backends.cudnn.benchmark = True

    cfg_test_model_path = "1115_1422_b32_zbt_mas_cssc_b_test_2"
    cfg_backbone = '18p'
    cfg_dataset = 'Tusimple'
    cls_num_per_lane = 56
    cfg_griding_num = 100
    cfg_num_lanes = 4
    cfg_test_work_dir = "./tmp"
    # Raw string: "\p", "\d" and "\T" are invalid escape sequences in a normal
    # literal. The resulting value is unchanged.
    cfg_data_root = r"D:\pythoncode\dataset\Tusimple"

    distributed = False
    if 'WORLD_SIZE' in os.environ:
        distributed = int(os.environ['WORLD_SIZE']) > 1

    net = parsingNet(pretrained = False, backbone=cfg_backbone,cls_dim = (cfg_griding_num+1,cls_num_per_lane, cfg_num_lanes),
                    use_aux=False).cuda() # we dont need auxiliary segmentation in testing

    # The work directory does not depend on the checkpoint, so create it once.
    os.makedirs(cfg_test_work_dir, exist_ok=True)

    prefix = 'module.'
    for model_str in get_file_name(cfg_test_model_path, "pth"):
        cfg_test_model_temp = cfg_test_model_path + "/" + model_str

        state_dict = torch.load(cfg_test_model_temp, map_location = 'cpu')['model']
        # Strip a leading "module." from parameter names. Only a true prefix
        # is removed; the original cut 7 characters off any key that merely
        # contained "module." somewhere.
        compatible_state_dict = {}
        for k, v in state_dict.items():
            if k.startswith(prefix):
                compatible_state_dict[k[len(prefix):]] = v
            else:
                compatible_state_dict[k] = v

        net.load_state_dict(compatible_state_dict, strict = False)

        strlist = eval_lane(net, cfg_dataset, cfg_data_root, cfg_test_work_dir, cfg_griding_num, False, distributed)
        model_savestr = model_str + "\n" + strlist
        save_txt(cfg_test_model_path, model_savestr)


获取自己的QQ空间说说以及相册信息

import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from urllib import request

def login(login_qq,password, business_qq):
    '''
    Log in to QQ Zone with account and password in an Edge browser.

    :param login_qq: QQ number used to log in
    :param password: password of the login QQ
    :param business_qq: QQ number whose space page is opened
    :return: the logged-in driver, or None (with the browser closed) when the
        owner-info icon cannot be found after logging in
    '''
    # find_element_by_* helpers were removed in Selenium 4.3;
    # find_element(By.X, ...) works on both old and new releases.
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    driver = webdriver.Edge()
    try:
        driver.get('https://user.qzone.qq.com/{}/311'.format(business_qq))  # URL
        driver.implicitly_wait(10)  # implicit wait so the page can finish loading
        driver.find_element(By.ID, 'login_div')
        driver.switch_to.frame('login_frame')  # frame holding the account/password form
        driver.find_element(By.ID, 'switcher_plogin').click()  # choose account/password login
        account_box = driver.find_element(By.ID, 'u')
        account_box.clear()
        account_box.send_keys(login_qq)
        password_box = driver.find_element(By.ID, 'p')
        password_box.clear()
        password_box.send_keys(password)
        driver.find_element(By.ID, 'login_button').click()
        driver.switch_to.default_content()

        driver.implicitly_wait(10)
        time.sleep(5)
    except BaseException:
        # Do not leave an orphaned browser behind when a login step fails.
        driver.quit()
        raise

    try:
        driver.find_element(By.ID, 'QM_OwnerInfo_Icon')
    except WebDriverException:
        # str() keeps the message working when business_qq is passed as an int.
        print('不能访问' + str(business_qq))
        # The caller only receives None, so the browser must be closed here.
        driver.quit()
        return None
    return driver
    

def get_shuoshuo(driver):
    """Print every post ("shuoshuo") on the currently opened QQ Zone page.

    Each page is scrolled, parsed from the 'app_canvas_frame' frame and
    printed as ``<title>:<text>``; paging stops once the page counter exceeds
    the number in the "末页" (last page) link, or when that link is missing.

    Args:
        driver: a logged-in Selenium WebDriver.
    """
    # find_element_by_* helpers were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    page = 1
    while True:
        # Scroll down four times.
        for _ in range(4):
            driver.execute_script("window.scrollBy(0,5000)")
            time.sleep(2)

        # The posts are inside this frame.
        driver.switch_to.frame('app_canvas_frame')
        # Characters that cannot be encoded as GBK are dropped from the source.
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever is
        # installed - consider pinning one.
        bs = BeautifulSoup(driver.page_source.encode('GBK', 'ignore').decode('gbk'))

        for pre in bs.find_all('pre', class_='content'):
            shuoshuo = pre.text
            link = pre.parent.parent.find('a', class_="c_tx c_tx3 goDetail")
            # A post without the detail link used to crash on None['title'].
            tx = link.get('title', '') if link is not None else ''
            print(tx + ":" + shuoshuo)

        # Decide whether another page exists.
        page = page + 1
        last_page_link = bs.find('a', title='末页')
        # With no "末页" link there is nothing to page through; the original
        # raised AttributeError on None.text here.
        if last_page_link is None or int(last_page_link.text) < page:
            # Leave the driver on the main document, as after a page change.
            driver.switch_to.default_content()
            break

        driver.find_element(By.LINK_TEXT, u'下一页').click()
        # Back to the main document.
        driver.switch_to.default_content()
        # Wait for the next page to load.
        time.sleep(3)

def get_photo(driver, qq='unknown'):
    """Download the photos of the albums shown on the QQ Zone album page.

    Args:
        driver: a logged-in Selenium WebDriver.
        qq: folder name placed into the download path. The original body read
            an undefined global ``qq`` and failed with NameError on the first
            download; it is now a parameter with a default.

    Raises:
        RuntimeError: if the next-photo button does not appear after 10 tries.
    """
    import os
    # find_element(s)_by_* helpers were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    # Download path template: (folder, photo name).
    photo_path = "C:/Users/xxx/Desktop/photo/{}/{}.jpg"

    # Album index.
    # NOTE(review): starts at 1, so the cover at index 0 is never opened -
    # confirm this is intentional.
    photoIndex = 1

    while True:
        # Back to the main document.
        driver.switch_to.default_content()
        # Click the album entry in the header menu.
        driver.find_element(By.XPATH, '//*[@id="menuContainer"]/div/ul/li[3]/a').click()
        # Wait for loading.
        driver.implicitly_wait(10)
        time.sleep(3)
        driver.switch_to.frame('app_canvas_frame')
        # Links of the individual albums.
        a = driver.find_elements(By.CLASS_NAME, 'album-cover')
        if len(a) <= photoIndex:
            # Fewer albums than the index: nothing to open (was IndexError).
            break
        a[photoIndex].click()

        driver.implicitly_wait(10)
        time.sleep(3)
        covers = driver.find_elements(By.CLASS_NAME, 'item-cover')
        if covers:  # an album without photos used to raise IndexError
            # Open the first photo of the album.
            covers[0].click()
            time.sleep(3)

            # The large-photo viewer lives in the parent frame.
            driver.switch_to.parent_frame()
            while True:
                # URL and name of the current photo.
                img = driver.find_element(By.ID, 'js-img-disp')
                src = img.get_attribute('src').replace('&t=5', '')
                name = driver.find_element(By.ID, "js-photo-name").text

                # Download; urlretrieve does not create missing directories.
                target = photo_path.format(qq, name)
                os.makedirs(os.path.dirname(target), exist_ok=True)
                request.urlretrieve(src, target)

                # "current/total" counter below the photo.
                counts = driver.find_element(By.XPATH, '//*[@id="js-ctn-infoBar"]/div/div[1]/span').text
                counts = counts.split('/')
                # Leave the viewer after the last photo.
                if int(counts[0]) == int(counts[1]):
                    # The X button in the top-right corner.
                    driver.find_element(By.XPATH, '//*[@id="js-viewer-main"]/div[1]/a').click()
                    break

                # The page loads slowly, so try up to 10 times. The original
                # `for i in (1, 10)` ran only twice, and find_element raises
                # rather than returning a falsy value; find_elements returns
                # an empty list when nothing matches.
                for _ in range(10):
                    next_buttons = driver.find_elements(By.ID, 'js-btn-nextPhoto')
                    if next_buttons:
                        ActionChains(driver).click(next_buttons[0]).perform()
                        break
                    time.sleep(5)
                else:
                    raise RuntimeError('next-photo button not found')

        # Stop once every album has been visited.
        photoIndex = photoIndex + 1
        if len(a) <= photoIndex:
            break
 

if __name__ == '__main__':
    # Log in and print all posts of the target QQ Zone.
    login_qq="xxxx"
    password="xxx"
    business_qq="xxx"
    driver=login(login_qq,password,business_qq )
    if driver is None:
        # login() returns None after printing its failure message; the
        # original went on and crashed inside get_shuoshuo(None).
        raise SystemExit(1)
    try:
        get_shuoshuo(driver)
    finally:
        # Always close the browser, including when scraping fails.
        driver.quit()
    print("完成")
    
    
## 版本二
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
"""一个用于下载QQ空间相册内所有照片的爬虫"""

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import WebDriverException

import os
import re
import sys
import time
import logging
import requests
from json import loads


class qqzone(object):
    """Crawler that downloads the photos of a user's QQ Zone albums."""

    def __init__(self, user):
        """Store the credentials.

        Args:
            user: mapping with the keys ``'username'`` and ``'password'``.
        """
        self.username = user['username']
        self.password = user['password']

    @staticmethod
    def get_path(album_name):
        """Return ``~/Pictures/python/qqzone/<album_name>``, creating it if needed."""
        home_path = os.path.expanduser('~')
        path = os.path.join(home_path, 'Pictures/python/qqzone', album_name)
        os.makedirs(path, exist_ok=True)
        return path

    def _login_and_get_args(self):
        """Log in through an Edge browser and collect the cookies and g_tk.

        Sets ``self.g_tk`` and ``self.cookies``. Exits the process with status
        1 when g_tk cannot be read. The browser is closed in every case.
        """
        # find_element_by_* helpers were removed in Selenium 4.3;
        # find_element(By.X, ...) works on both old and new releases.
        from selenium.webdriver.common.by import By

        driver = webdriver.Edge()
        try:
            driver.get('https://i.qq.com/')

            logging.info('User {} login...'.format(self.username))
            driver.switch_to.frame('login_frame')
            driver.find_element(By.ID, 'switcher_plogin').click()
            account_box = driver.find_element(By.ID, 'u')
            account_box.clear()
            account_box.send_keys(self.username)
            password_box = driver.find_element(By.ID, 'p')
            password_box.clear()
            password_box.send_keys(self.password)
            driver.find_element(By.ID, 'login_button').click()

            time.sleep(1)
            driver.get('https://user.qzone.qq.com/{}'.format(self.username))

            try:
                logging.info('Getting g_tk...')
                self.g_tk = driver.execute_script(
                    'return QZONE.FP.getACSRFToken()')
                logging.debug('g_tk: {}'.format(self.g_tk))
            except WebDriverException:
                logging.error(
                    'Getting g_tk failed, please check your QQ number and password')
                sys.exit(1)

            logging.info('Getting Cookies...')
            self.cookies = driver.get_cookies()
        finally:
            # quit() ends the whole session, so a separate close() is not
            # needed; the finally also covers failures during the login
            # steps, which used to leak the browser.
            driver.quit()

    def _init_session(self):
        """Create ``self.session`` carrying the browser cookies and fixed headers."""
        self.session = requests.Session()
        for cookie in self.cookies:
            self.session.cookies.set(cookie['name'], cookie['value'])
        self.session.headers = {
            'Referer': 'https://qzs.qq.com/qzone/photo/v7/page/photo.html?init=photo.v7/module/albumList/index&navBar=1',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'
        }

    def _get_query_for_request(self, topicId=None, pageStart=0, pageNum=100):
        """Build the query string for an album-list or photo-list request.

        Args:
            topicId: identifier of an album; when falsy, the three paging
                parameters are left out.
            pageStart: value sent as ``pageStart``.
            pageNum: value sent as ``pageNum``.

        Returns:
            All parameters joined as ``key=value`` pairs with ``&``. The
            values are not URL-encoded.
        """
        query = {
            'g_tk': self.g_tk,
            'hostUin': self.username,
            'uin': self.username,
            'appid': 4,
            'inCharset': 'utf-8',
            'outCharset': 'utf-8',
            'source': 'qzone',
            'plat': 'qzone',
            'format': 'jsonp'
        }
        if topicId:
            query['topicId'] = topicId
            query['pageStart'] = pageStart
            query['pageNum'] = pageNum
        return '&'.join('{}={}'.format(key, val) for key, val in query.items())

    def _load_callback_data(self, resp):
        """Parse a JSONP response into a Python object.

        Args:
            resp: response object; its ``encoding`` is set to utf-8 and its
                ``text`` is searched for ``callback({...})``.

        Returns:
            The decoded JSON payload, or None (after logging an error) when no
            payload is found or it is not valid JSON.
        """
        resp.encoding = 'utf-8'
        match = re.search(r'.*?\(({.*}).*?\).*', resp.text, re.S)
        if match is None:
            # The original indexed the None result and raised TypeError,
            # which its `except ValueError` did not catch.
            logging.error('Invalid input')
            return None
        try:
            return loads(match[1])
        except ValueError:
            logging.error('Invalid input')
            return None

    def _get_ablum_list(self):
        """Fetch the album list.

        Returns:
            Dict mapping album name to album id; empty when the response
            could not be parsed.
        """
        album_url = 'https://h5.qzone.qq.com/proxy/domain/photo.qzone.qq.com/fcgi-bin/fcg_list_album_v3?' + \
            self._get_query_for_request()

        logging.info('Getting ablum list id...')
        resp = self.session.get(album_url)
        data = self._load_callback_data(resp)
        if data is None:
            return {}

        # `or []` guards against a null list in the response.
        albums = data['data']['albumListModeSort'] or []
        return {item['name']: item['id'] for item in albums}

    def _get_photo(self, album_name, album_id):
        """Fetch the photo list of one album and download every listed photo.

        NOTE(review): only a single request with the default pageNum=100 is
        made, so larger albums are presumably truncated - confirm against the
        API before adding paging.
        """
        photo_list_url = 'https://h5.qzone.qq.com/proxy/domain/photo.qzone.qq.com/fcgi-bin/cgi_list_photo?' + \
            self._get_query_for_request(topicId=album_id)

        logging.info('Getting photo list for album {}...'.format(album_name))
        resp = self.session.get(photo_list_url)
        data = self._load_callback_data(resp)
        if data is None or data['data']['totalInPage'] == 0:
            return None

        file_dir = self.get_path(album_name)
        for item in data['data']['photoList']:
            path = '{}/{}.jpg'.format(file_dir, item['name'])
            logging.info('Downloading {}-{}'.format(album_name, item['name']))
            self._download_image(item['url'], path)

    def _download_image(self, url, path):
        """Download one photo to *path*.

        Timeouts and connection errors are logged and swallowed so that one
        bad photo does not stop the whole run.
        """
        try:
            print(path)
            resp = self.session.get(url, timeout=15)
            if resp.status_code == 200:
                with open(path, 'wb') as fp:
                    fp.write(resp.content)
            else:
                # Used to be skipped silently.
                logging.warning('get {} returned HTTP {}'.format(url, resp.status_code))
        except requests.exceptions.Timeout:
            logging.warning('get {} timeout'.format(url))
        except requests.exceptions.ConnectionError as e:
            # Log the exception itself; `e.__str__` without a call logged the
            # bound method's repr instead of the message.
            logging.error(e)

    def start(self):
        """Entry point: log in, build the session and download every album."""
        self._login_and_get_args()
        self._init_session()
        album_list = self._get_ablum_list()
        for album_name, album_id in album_list.items():
            self._get_photo(album_name, album_id)


def get_user():
    """Ask on the terminal for a QQ number and password.

    The number must match ``[1-9][0-9]{4,9}``; otherwise an error is logged
    and the process exits with status 1.

    Returns:
        Dict with the keys ``'username'`` and ``'password'``.
    """
    import getpass

    qq_number = input('please input QQ number: ').strip()
    if re.match(r'^[1-9][0-9]{4,9}$', qq_number) is None:
        logging.error('\033[31mQQ number is wrong!\033[0m')
        sys.exit(1)

    secret = getpass.getpass('password: ')
    return dict(username=qq_number, password=secret)


def main():
    """Configure logging, pick the account and run the crawler.

    With ``-d`` as the first command-line argument the built-in default
    account is used (edit it below); otherwise the user is prompted.
    """
    logging.basicConfig(
        format='%(asctime)s [%(levelname)s] %(message)s',
        level=logging.INFO,
    )

    use_default_account = len(sys.argv) > 1 and sys.argv[1] == '-d'
    if use_default_account:
        # Default QQ account.
        user = {'username': 'xxx', 'password': 'xxx'}
    else:
        user = get_user()

    crawler = qqzone(user)
    crawler.start()


# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()
posted @   东血  阅读(13)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· 上周热点回顾(3.3-3.9)
· winform 绘制太阳,地球,月球 运作规律

本站勉强运行 1780 天 21 小时 48 分 53 秒

目录导航
目录导航
python脚本收集
发送邮件
coco数据提取yolo
检测简单物体并裁剪
获取文件夹下文件
数据获取
存储txt信息
获取自己的QQ空间说说以及相册信息
发布于 2024-01-14 16:26
点击右上角即可分享
微信分享提示