python写的地震前兆数据自动下载归档软件

地震台站系统要处理各种前兆数据，每天都要下载各种前兆数据文件，日志文件等，耗时费力。为了解决这一问题。本人最近利用python3.8软件，编写了一个基于python爬虫的，地震台站系统前兆数据自动下载归档软件，实现了对形变（水管仪、伸缩仪、垂直摆）、水温、重力、气象三要素等前兆数据文件和观测日志自动保存归档的功能。
利用pyinstaller打包后放在工作电脑上。值班人员只需要双击这个软件，程序就自动完成全部前兆数据下载归档的工作，减轻了工作人员的负担。
"""
# 本程序为长白山火山监测数据下载专用软件。可实现火山形变、重力、深层水温等监测数据自动下载到指定目录
# 重要提示：在前兆文件下载页要求输入用户名和密码页，用谷歌浏览器按F12，出现“开发者工具”，按Ctrl+R，
# 选择network标签，filter标签选择all,可以看见download..点一下，可以看见requestURL,这时候输入用户名和密码，提交
# 又出现一个requestURL,点下面的，这时候右侧的url中包含了用户输入的用户名和密码信息，这个url就是实际文件的下载页
# 复制这个url，通过编辑下载这个网页文件就行了。
"""


import os
import re
import requests
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from tqdm import tqdm
from urllib.request import urlopen

def getGravityCookie():
    """
    本方法的关键是要先下载一个chromedriver.exe程序，软件在运行的时候通过代码打开谷歌浏览器，并通过代码完成输入用户名密码，完成
    完全由代码探友的模拟登陆过程。从而获取网站的正确cookie.这个解决方案非常适合于破解cookie经常动态变动的网站
    在获取正确的cookie后，用户
    """

    # 一下三行为无头模式运行，无头模式不开启浏览器，也就是在程序里面运行的
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    # browser = webdriver.Chrome(executable_path=(r'C:\Users\Administrator\Desktop\chrome_beta\ChromePortable\App\Chrome\chrome.exe'), options=chrome_options)
    # #如果不用上面三行，那么就用下面这一行。运行的时候回自动的开启浏览器，并在浏览器中自动运行，你可以看到自动运行的过程
    browser = webdriver.Chrome(
        # 下行中需要根据程序运行环境输入实际的chromedriver.exe文件路径
        executable_path=(r'D:\chrome_beta\ChromePortable\App\Chrome\chromedriver.exe'))
    # 设置访问链接
    browser.get("http://10.22.114.37")
    # 点击登录按钮
    # browser.find_element_by_name("submit").click()
    # 输入用户名
    browser.find_element_by_id("userName").send_keys("debug")
    # 点击“下一步”
    # browser.find_element_by_id("login-signin").click()
    # 等待10秒，以防读取不到（#login-passwd）元素
    # sleep(10)
    # 输入密码
    browser.find_element_by_id("pwd").send_keys("01234567")
    # 点击signin按钮
    browser.find_element_by_name("submit").click()
    # 获取cookie
    cookie_items = browser.get_cookies()
    cookie_str = ""
    # 组装cookie字符串
    for item_cookie in cookie_items:
        item_str = item_cookie["name"] + "=" + item_cookie["value"] + "; "
        cookie_str += item_str
    cookie = cookie_str[0:-2]
    # cookie_str的值就是正确的重力仪cookie
    return cookie


def welcome(i):
    print('-' * 80)
    print('欢迎使用：\033[1;31m[火山前兆监测数据自动下载归档软件 V2.0]\033[0m')
    print('软件开发者：\033[1;31m吉林省长白山天池火山监测站\033[0m\n')
    # print('发行时间："\033[1;31m2020－02－06\033[0m"')
    print('\033[1;31mCopyright © 2020 All Rights Reserved 版权所有\033[0m')
    print('现在开始下载 ' + i)
    print('-' * 80)

def create_dir(base_url, target_url, file_name, gravity_cookie, num_saved):
    """实现分析目标url，创建文件保存路径，保存文件等功能"""

    count = 0
    for i in target_url:
        # 定义tempname,临时保存当前的file_name, 分析文件名，定向到指定目录
        if base_url == 'http://10.22.114.37/':
            year = file_name[count][0:4]
        elif base_url == 'http://10.22.104.44/':
            year = file_name[count][17:21]
        elif base_url == 'http://10.22.114.42/':
            year = file_name[count][-13:-9]
        else:
            year = '20' + file_name[count][9:11]
        if base_url == 'http://10.22.114.31/' or base_url == 'http://10.22.114.32/':
            month = file_name[count][11:13]
        elif base_url == 'http://10.22.114.34/' and i[-3:] == 'epd':
            month = file_name[count][13:15]
        elif base_url == 'http://10.22.114.34/' and i[-3:] == 'sec':
            month = file_name[count][13:15]
        elif base_url == 'http://10.22.114.37/':
            month = file_name[count][4:6]
        elif base_url == 'http://10.22.104.44/':
            month = file_name[count][21:23]
        elif base_url == 'http://10.22.114.42/':
            month = file_name[count][-8:-6]

        if base_url == 'http://10.22.114.31/':
            dir = 'd:/观测数据下载/形变数据/水管仪/'
        elif base_url == 'http://10.22.114.32/':
            dir = 'd:/观测数据下载/形变数据/伸缩仪/'
        elif base_url == 'http://10.22.114.34/' and i[-3:] == 'epd':
            dir = 'd:/观测数据下载/形变数据/垂直摆/分数据/'
        elif base_url == 'http://10.22.114.34/' and i[-3:] == 'sec':
            dir = 'd:/观测数据下载/形变数据/垂直摆/秒数据/'
        elif base_url == 'http://10.22.114.37/':
            dir = 'd:/观测数据下载/重力数据/'
        elif base_url == 'http://10.22.104.44/':
            dir = 'd:/观测数据下载/气象三要素数据/'
        elif base_url == 'http://10.22.114.42/':
            dir = 'd:/观测数据下载/深层水温数据/'
        if base_url == 'http://10.22.114.37/':  # 重力数据下又分成“数据”和“日志”两个子目录分别保存*.dat和*.log文件
            newdir = dir + year + '/' + month + '/' + '数据' + '/'
            if not os.path.isdir(newdir):
                os.makedirs(newdir)
            logdir = dir + year + '/' + month + '/' + '日志' + '/'
            if not os.path.isdir(logdir):
                os.makedirs(logdir)
        else:
            newdir = dir + year + '/' + month + '/'
            if not os.path.isdir(newdir):
                os.makedirs(newdir)

        # 重力文件名是特殊的，在正则匹配出来的文件名前面还要加上仪器序列号
        if base_url == 'http://10.22.114.37/':
            tempname = newdir + '22004X212MPET0046' + file_name[count]
            # 再匹配生成日志文件名logname
            logname = tempname.replace('/数据/', '/日志/').replace('dat', 'log')
        else:
            tempname = newdir + file_name[count]
        # 因为重力手段要下载的数据分为data和log两类，与其它前兆手段数据都不同，所以下面检测文件是否存在和写入文件都分开写代码
        if base_url == 'http://10.22.114.37/':
            if os.path.isfile(tempname) and os.path.isfile(logname):
                print('文件%s和日志%s已经保存过了.....跳过！' % (file_name[count], file_name[count][:-4] + '.log'))
                count += 1
                continue
        else:
            if os.path.isfile(tempname):
                print('文件%s已经保存过了.....跳过！' % file_name[count])
                count += 1
                continue

        # headers是关键，如果requests.get()内不加header，则下载来的文件只是网页上的文字
        # 水管和伸缩仪数据不加headers,垂直摆和重力必须加相应的headers
        # TODO 重力数据自动保存目前还有一些问题，主要问题就是重力服务器的Cookie总变化，如果它变化 了，就需要重新输入正确的Cookie,才能正常运行
        # print('正在保存文件' + file_name[count] + '.....', end='')
        print(file_name[count], end = '')
        if base_url == 'http://10.22.114.34/':
            headers = {
                'User-Agent': 't: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
                'Connection': 'keep-alive',
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',

                'Cookie': 'htvpcookie=01234567'
            }
            content = requests.get(i, headers = headers, stream = True)
        elif base_url == 'http://10.22.114.37/':
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
                'Connection': 'keep-alive',
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',

                 'Cookie': gravity_cookie
            }
            content = requests.get(i, headers = headers, stream = True)
        elif base_url == 'http://10.22.104.44/':
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
                'Connection': 'keep-alive',
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',

                'Cookie': 'UserName=administrator; Auth=111'
            }
            content = requests.get(i, headers = headers, stream = True)
        else:
            content = requests.get(i, stream = True)
        # 将requests.get()获得的数据内容写入对应目录的，存成文件，注意重力需要分别保存在‘数据’和‘日志’两个文件夹中

        # 保存重力之外的其它前兆手段的数据文件，并显示进度条
        if base_url != 'http://10.22.114.37/':
            chunk_size = 1024
            size = 0
            content_size = len(content.content)
            with open(tempname, 'wb') as f:
                for data in content.iter_content(chunk_size = chunk_size):
                    f.write(data)
                    size += len(data)
                    print('\r' + file_name[count] +':%s%.2f%%' % ('>' * int(size*50/content_size),
                                                                 float(size/content_size*100)),end='')
                else:
                    num_saved += 1
            print('')

            count += 1
        else:
            # 保存重力数据文件并显示进度条
            chunk_size = 1024
            size = 0
            # print('hello hello hello')
            content_size = len(content.content)
            with open(tempname, 'wb') as f:
                for data in content.iter_content(chunk_size=chunk_size):
                    f.write(data)
                    size += len(data)
                    print('\r' + file_name[count] + ':%s%.2f%%' % ('>' * int(size * 50 / content_size),
                                                                   float(size / content_size * 100)), end='')
                else:
                    num_saved += 1
            print('')
            # 保存重力日志文件并显示进度条
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
                'Connection': 'keep-alive',
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',

                 'Cookie': 'PHPSESSID=7sgmd7aanp68tvudsu01nhld13; path=/'
            }
            i_log = i[0:44] + 'log' + i[-13:-4] + '.log'
            content_log = requests.get(i_log, headers = headers, stream = True)
            chunk_size = 1024
            size = 0
            content_size = len(content_log.content)
            with open(logname, 'wb') as f:
                for data in content_log.iter_content(chunk_size=chunk_size):
                    f.write(data)
                    size += len(data)
                    print('\r' + file_name[count][:-4] + '.log' + ':%s%.2f%%' % ('>' * int(size * 50 / content_size),
                                                                   float(size / content_size * 100)), end='')
                else:
                    num_saved += 1
            print('')
            count += 1
    return num_saved


def getHTMLText(url, gravity_cookie):
    if url == 'http://10.22.114.37/Download.php?page=1' or url == 'http://10.22.114.37/Download.php?page=2' or url == 'http://10.22.114.37/Download.php?page=3':
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
            'Connection': 'keep-alive',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',

            'Cookie': gravity_cookie
        }
        response = requests.get(url, headers=headers).content
        web_content = response.decode('utf8', 'ignore')
        return web_content
    else:
        try:
            r=requests.get(url)
            r.raise_for_status()
            r.encoding=r.apparent_encoding
            return r.text
        except:
            print("request failed")

def data_download(url, web_data, gravity_cookie, num_saved):
    target_url = []
    log_url = []

    if url == 'http://10.22.114.31/download.htm' or url == 'http://10.22.114.32/download.htm':
        res = re.compile('([0-9]+X[0-9]+[A-Z].EPD)')
    elif url == 'http://10.22.114.34/cgi-bin/getdownload.cgi?id=0':
        res = re.compile('([0-9]+X[0-9]+\.epd)')
    elif url == 'http://10.22.114.34/cgi-bin/getdownload.cgi?id=2':
        res = re.compile('([0-9]+X[0-9]+\.sec)')
    elif url == 'http://10.22.114.37/Download.php?page=1' or url == 'http://10.22.114.37/Download.php?page=2' or url == 'http://10.22.114.37/Download.php?page=3':
        res = re.compile('(\d+.dat)')
    elif url == 'http://10.22.104.44/Main/DownloadFiles.asp':
        res = re.compile('([0-9a-zA-Z]+.EPMS)')
    elif url == 'http://10.22.114.42/download.asp':
        res = re.compile('(X431DQYQ1455_[\d]{4}-[\d]{2}-[\d]{2}.sw)')

    file_name = re.findall(res, web_data)
    file_name = file_name[0::2]
    file_name.sort()
    if url == 'http://10.22.114.42/download.asp' or url == 'http://10.22.114.37/Download.php?page=2' or url == 'http://10.22.114.37/Download.php?page=3' or url == 'http://10.22.104.44/Main/DownloadFiles.asp':
        pass
    else:
        file_name = file_name[0:-1]
    # 获取传入url的基url
    base_url = url[:20]

    # 伸缩仪数据下载网址 http://10.22.114.32/22004231X200131A.EPD?u=administrator&p=01234567&B1=%CC%E1+%BD%BB
    # 根据url的值判断是要下载哪个监测手段的数据，不同手段数据下载目标target_url具体设置
    if url == 'http://10.22.114.31/download.htm' or url == 'http://10.22.114.32/download.htm':
        for i in file_name:
            target_url.append(base_url + i + '?u=administrator&p=01234567&B1=%CC%E1+%BD%BB')
        num_saved = create_dir(base_url, target_url, file_name, gravity_cookie, num_saved)
    elif url == 'http://10.22.114.34/cgi-bin/getdownload.cgi?id=0':
        for i in file_name:
            target_url.append(base_url + 'cgi-bin/download.cgi?id=0&filename=' + i)
        num_saved = create_dir(base_url, target_url, file_name, gravity_cookie, num_saved)
    elif url == 'http://10.22.114.34/cgi-bin/getdownload.cgi?id=2':
        for i in file_name:
            target_url.append(base_url + 'cgi-bin/download.cgi?id=2&filename=' + i)
        num_saved = create_dir(base_url, target_url, file_name, gravity_cookie, num_saved)
    elif url == 'http://10.22.114.37/Download.php?page=1' or url == 'http://10.22.114.37/Download.php?page=2' or url == 'http://10.22.114.37/Download.php?page=3':
        for i in file_name:
            target_url.append(base_url + 'infoDown.php?id=/mnt/sd/dat/' + i)
        num_saved = create_dir(base_url, target_url, file_name, gravity_cookie, num_saved)
    elif url == 'http://10.22.104.44/Main/DownloadFiles.asp':
        for i in file_name:
            target_url.append(base_url + 'Main/download.asp?path=\SDMEM\DataFiles\&file=' + i)
        num_saved = create_dir(base_url, target_url, file_name, gravity_cookie, num_saved)
    elif url == 'http://10.22.114.42/download.asp':
        for i in file_name:
            target_url.append(base_url + 'shiwu/' + i)
        num_saved = create_dir(base_url, target_url, file_name, gravity_cookie, num_saved)
    return num_saved

def main():
    num_saved = 0
    begin = time.time()
    gravity_cookie = getGravityCookie()
    url1 = 'http://10.22.114.31/download.htm'
    url2 = 'http://10.22.114.32/download.htm'
    url3 = 'http://10.22.114.34/cgi-bin/getdownload.cgi?id=0' # 垂直摆分数据
    url4 = 'http://10.22.114.34/cgi-bin/getdownload.cgi?id=2' # 垂直摆秒数据
    url5 = 'http://10.22.114.37/Download.php?page=1' #重力数据下载第1页
    url6 = 'http://10.22.114.37/Download.php?page=2' #重力数据下载第1页
    url7 = 'http://10.22.114.37/Download.php?page=3' #重力数据下载第1页
    url8 = 'http://10.22.104.44/Main/DownloadFiles.asp'
    url9 = 'http://10.22.114.42/download.asp'

    url = [url1, url2, url3, url4, url5, url6, url7, url8, url9]
    for i in url:
        welcome(i)
        web_data = getHTMLText(i, gravity_cookie)
        num_saved = data_download(i, web_data, gravity_cookie, num_saved)

    end = time.time()
    diff = (end - begin) / 60
    print('本次数据下载共运行%.2f分钟, 下载数据和日志文件%d个' % (diff, num_saved))
    print("*************************\033[1;31m软件将于1分钟后关闭\033[0m*************************")
    time.sleep(60)
main()
程序运行截图
posted @ 2020-02-05 20:14 Iceberg_710815 阅读(508) 评论(0) 编辑收藏举报
刷新页面返回顶部
python写的地震前兆数据自动下载归档软件

公告