# 中台办 (国台办) — Taiwan Affairs Office news crawler

import pymysql


def get_md5(parmStr):
    """Return the 32-char hex MD5 digest of *parmStr*.

    A ``str`` argument is UTF-8 encoded first (Python 3 strings are
    Unicode); ``bytes`` are hashed as-is.
    """
    data = parmStr.encode("utf-8") if isinstance(parmStr, str) else parmStr
    return hashlib.md5(data).hexdigest()


# Module-level MySQL connection shared by the crawler functions below.
# NOTE(review): credentials are hard-coded in source — move them to
# environment variables or a config file before sharing this script.
connect = pymysql.Connect(
    host='140.210.4.73',
    port=3306,
    user='twipad_cj',
    passwd='bj@#twipad_cj',
    db='tw_ipaddb',
    charset='utf8mb4'  # full Unicode support for stored article text
)


def main4():
    """Crawl the 政务要闻 (government news) listing of gwytb.gov.cn and
    insert previously unseen articles into ``tw_gtbhistory_abroaddataall``.

    Side effects only (HTTP requests, DB inserts, prints); returns None.
    """
    url = 'http://www.gwytb.gov.cn/xwdt/zwyw/'

    headers = {
        'cookie': 'Hm_lvt_eb0ec21879d705b54e0b3bbc2b4ce670=1631690955; Hm_lpvt_eb0ec21879d705b54e0b3bbc2b4ce670=1631691679',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4557.4 Safari/537.36'
    }
    # The site serves GBK-encoded pages.
    html = requests.get(url=url, headers=headers).content.decode('gbk')
    tree = etree.HTML(html)
    # Article links on the listing page (loop variable no longer shadows the list).
    second_urls = tree.xpath('//ul[@class="scdList wrapList"]/li/a//@href')
    for second_url in second_urls:
        time.sleep(random.randint(3, 5))
        print(second_url)
        ir_mediasourceid = 4        # channel id for this listing
        ir_url = second_url
        ir_mediasource = '国台办官网'
        if_vcj = 0                  # not a video item
        ir_videourl = ''
        time.sleep(random.randint(3, 8))  # polite crawl delay
        html = requests.get(url=second_url, headers=headers).content.decode('gbk')
        tree = etree.HTML(html)

        # Title
        ir_title = tree.xpath('//div[@class="area wrapList"]/h1//text()')[0]
        # Publication time, e.g. "2021-09-15 10:30"
        ir_urltime = tree.xpath('//div[@class="area wrapList"]/div[@class="info"]/span//text()')[0]
        # "YYYYMM" fragment used to build image URLs.
        url_date = ''.join(ir_urltime.split('-')[:2])
        # BUG FIX: the original tested len(':') — always 1 — so every
        # timestamp got ':00' appended.  Count the colons in the timestamp
        # itself to normalise it to '%Y-%m-%d %H:%M:%S'.
        colons = ir_urltime.count(':')
        if colons == 0:
            ir_urltime = ir_urltime.strip() + ' 00:00:00'
        elif colons == 1:
            ir_urltime = ir_urltime + ':00'
        dt = datetime.datetime.strptime(ir_urltime, '%Y-%m-%d %H:%M:%S')
        ir_urltime = int(time.mktime(dt.timetuple()))
        try:
            ir_picture = tree.xpath('//div[@class="TRS_Editor"]//img//@src')[0].replace('./', '')
            ir_picture = 'http://www.gwytb.gov.cn/xwdt/zwyw/' + url_date + '/' + ir_picture
        except IndexError:
            # Article has no inline image.
            ir_picture = ''
        # Body paragraphs; fall back to the table-based layout if empty.
        ir_content = tree.xpath('//p[@align="justify"]//text()')
        if not ir_content:
            ir_content = tree.xpath('//td/p/span//text()')
        # Join cleaned paragraphs, each prefixed with full-width indent spaces.
        neirong = '\u3000\u3000'
        for piece in ir_content:
            cleaned = piece.strip().replace(' ', '').replace('\n', '').replace('\r', '')
            if cleaned:
                neirong += cleaned + '\n\u3000\u3000'
        neirong = neirong.replace('"', '')
        ir_urldate = int(time.time())
        ir_md5 = get_md5(second_url)
        cursor = connect.cursor()  # single cursor (original created it twice)
        connect.ping(reconnect=True)
        # Duplicate check by key instead of fetching the whole md5 column.
        cursor.execute(
            'select 1 from tw_gtbhistory_abroaddataall where ir_md5 = %s limit 1',
            (ir_md5,))
        if cursor.fetchone():
            print('数据已存在')
        else:
            # Parameterised INSERT (original interpolated values into the SQL
            # string, which is injection-prone and required quote stripping).
            sql = ('INSERT IGNORE INTO tw_gtbhistory_abroaddataall '
                   '(ir_urltime,ir_urldate,ir_content,ir_mediasourceid,ir_mediasource,'
                   'ir_imgurl,ir_title,ir_md5,ir_url,ir_videourl,if_vcj)'
                   'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
            connect.ping(reconnect=True)
            cursor.execute(sql, (ir_urltime, ir_urldate, neirong, ir_mediasourceid,
                                 ir_mediasource, ir_picture, ir_title,
                                 ir_md5, ir_url, ir_videourl, if_vcj))
            ir_idd = int(connect.insert_id())
            print('数据库自增id', ir_idd, '数据')
            # Commit the insert.
            connect.commit()
            print("tw_webhistory_abroaddataall表数据存储成功!", )

            # Notify the downstream API about the new row.
            urrl = f'http://twipad.hnxinxiudata.top/api/data/gtb_data?ir_id={ir_idd}'
            r = requests.get(urrl)
            print(r)

            print('提交成功')

    connect.close()


import pymysql


def get_md5(parmStr):
    """Hex MD5 digest of a str (UTF-8 encoded first) or bytes value."""
    if isinstance(parmStr, str):
        # Python 3 str is Unicode; the hash needs bytes.
        payload = parmStr.encode("utf-8")
    else:
        payload = parmStr
    digest = hashlib.md5()
    digest.update(payload)
    return digest.hexdigest()


# MySQL connection used by the crawlers (module-level, shared).
# NOTE(review): hard-coded credentials — externalise before publishing.
connect = pymysql.Connect(
    host='140.210.4.73',
    port=3306,
    user='twipad_cj',
    passwd='bj@#twipad_cj',
    db='tw_ipaddb',
    charset='utf8mb4'  # full Unicode support
)


def main3():
    """Crawl the 新闻发布会 (press conference) listing of gwytb.gov.cn and
    insert previously unseen articles into ``tw_gtbhistory_abroaddataall``.

    Side effects only (HTTP requests, DB inserts, prints); returns None.
    """
    url = 'http://www.gwytb.gov.cn/xwdt/xwfb/xwfbh/'
    headers = {
        'cookie': 'Hm_lvt_eb0ec21879d705b54e0b3bbc2b4ce670=1631690955; Hm_lpvt_eb0ec21879d705b54e0b3bbc2b4ce670=1631691679',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4557.4 Safari/537.36'
    }
    # Pages are GBK encoded.
    html = requests.get(url=url, headers=headers).content.decode('gbk')
    tree = etree.HTML(html)
    # Article links (loop variable no longer shadows the list).
    second_urls = tree.xpath('//ul[@class="scdList wrapList"]/li/a//@href')
    for second_url in second_urls:
        ir_url = second_url
        ir_mediasourceid = 3   # channel id for this listing
        if_vcj = 0             # not a video item
        ir_videourl = ''
        ir_mediasource = '国台办官网'
        time.sleep(random.randint(3, 8))  # polite crawl delay
        print(second_url)
        html = requests.get(url=second_url, headers=headers).content.decode('gbk')
        tree = etree.HTML(html)
        ir_urldate = int(time.time())
        # Title
        ir_title = tree.xpath('//div[@class="area wrapList"]/h1//text()')[0]
        # Publication time, e.g. "2021-09-15 10:30"
        ir_urltime = tree.xpath('//div[@class="area wrapList"]/div[@class="info"]/span//text()')[0]
        # Body paragraphs; fall back to the table-based layout if empty.
        ir_content = tree.xpath('//p[@align="justify"]//text()')
        if not ir_content:
            ir_content = tree.xpath('//td/p/span//text()')
        # Join cleaned paragraphs, each prefixed with full-width indent spaces.
        neirong = '\u3000\u3000'
        for piece in ir_content:
            cleaned = piece.strip().replace(' ', '').replace('\n', '').replace('\r', '')
            if cleaned:
                neirong += cleaned + '\n\u3000\u3000'
        neirong = neirong.replace('"', '')
        # "YYYYMM" fragment used to build image URLs.
        url_date = ''.join(ir_urltime.split('-')[:2])
        # BUG FIX: the original tested len(':') — always 1 — so every
        # timestamp got ':00' appended.  Count the colons in the timestamp
        # itself to normalise it to '%Y-%m-%d %H:%M:%S'.
        colons = ir_urltime.count(':')
        if colons == 0:
            ir_urltime = ir_urltime.strip() + ' 00:00:00'
        elif colons == 1:
            ir_urltime = ir_urltime + ':00'
        dt = datetime.datetime.strptime(ir_urltime, '%Y-%m-%d %H:%M:%S')
        ir_urltime = int(time.mktime(dt.timetuple()))
        try:
            ir_picture = tree.xpath('//div[@class="TRS_Editor"]//img//@src')[0].replace('./', '')
            ir_picture = 'http://www.gwytb.gov.cn/xwdt/zwyw/' + url_date + '/' + ir_picture
        except IndexError:
            # Article has no inline image.
            ir_picture = ''

        ir_md5 = get_md5(second_url)
        cursor = connect.cursor()
        connect.ping(reconnect=True)
        # Duplicate check by key instead of fetching the whole md5 column.
        cursor.execute(
            'select 1 from tw_gtbhistory_abroaddataall where ir_md5 = %s limit 1',
            (ir_md5,))
        if cursor.fetchone():
            print('数据已存在')
        else:
            # Parameterised INSERT (original interpolated values into SQL).
            sql = ('INSERT IGNORE INTO tw_gtbhistory_abroaddataall '
                   '(ir_urltime,ir_urldate,ir_content,ir_mediasourceid,ir_mediasource,'
                   'ir_imgurl,ir_title,ir_md5,ir_url,ir_videourl,if_vcj)'
                   'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
            connect.ping(reconnect=True)
            cursor.execute(sql, (ir_urltime, ir_urldate, neirong, ir_mediasourceid,
                                 ir_mediasource, ir_picture, ir_title,
                                 ir_md5, ir_url, ir_videourl, if_vcj))
            ir_idd = int(connect.insert_id())
            print('数据库自增id', ir_idd, '数据')
            # Commit the insert.
            connect.commit()
            print("tw_webhistory_abroaddataall表数据存储成功!", )

            # Notify the downstream API about the new row.
            urrl = f'http://twipad.hnxinxiudata.top/api/data/gtb_data?ir_id={ir_idd}'
            r = requests.get(urrl)
            print(r)

            print('提交成功')

    connect.close()


import pymysql


def get_md5(parmStr):
    """Compute the hexadecimal MD5 of *parmStr* (str is UTF-8 encoded)."""
    raw = parmStr
    if isinstance(raw, str):
        # Hashing requires bytes; encode Unicode text as UTF-8.
        raw = raw.encode("utf-8")
    return hashlib.md5(raw).hexdigest()


# Shared MySQL connection for the crawler functions.
# NOTE(review): credentials are committed in plain text — move to config.
connect = pymysql.Connect(
    host='140.210.4.73',
    port=3306,
    user='twipad_cj',
    passwd='bj@#twipad_cj',
    db='tw_ipaddb',
    charset='utf8mb4'  # full Unicode support
)


def main1():
    """Crawl the 外媒连线 listing of gwytb.gov.cn and insert previously
    unseen articles into ``tw_gtbhistory_abroaddataall``.

    Side effects only (HTTP requests, DB inserts, prints); returns None.
    """
    url = 'http://www.gwytb.gov.cn/xwdt/xwfb/wyly/'
    headers = {
        'cookie': 'Hm_lvt_eb0ec21879d705b54e0b3bbc2b4ce670=1631690955; Hm_lpvt_eb0ec21879d705b54e0b3bbc2b4ce670=1631691679',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4557.4 Safari/537.36'
    }
    # Pages are GBK encoded.
    html = requests.get(url=url, headers=headers).content.decode('gbk')
    tree = etree.HTML(html)
    # Article links (loop variable no longer shadows the list).
    second_urls = tree.xpath('//ul[@class="scdList wrapList"]/li/a//@href')
    for second_url in second_urls:
        ir_videourl = ''
        if_vcj = 0            # not a video item
        print(second_url)
        ir_mediasourceid = 1  # channel id for this listing
        ir_mediasource = '国台办官网'
        ir_url = second_url
        time.sleep(random.randint(3, 8))  # polite crawl delay
        html = requests.get(url=second_url, headers=headers).content.decode('gbk')
        tree = etree.HTML(html)
        # Title
        ir_title = tree.xpath('//div[@class="area wrapList"]/h1//text()')[0]
        # Publication time, e.g. "2021-09-15 10:30"
        ir_urltime = tree.xpath('//div[@class="area wrapList"]/div[@class="info"]/span//text()')[0]
        # Body paragraphs; fall back to the table-based layout if empty.
        ir_content = tree.xpath('//p[@align="justify"]//text()')
        if not ir_content:
            ir_content = tree.xpath('//td/p/span//text()')
        # Join cleaned paragraphs, each prefixed with full-width indent spaces.
        neirong = '\u3000\u3000'
        for piece in ir_content:
            cleaned = piece.strip().replace(' ', '').replace('\n', '').replace('\r', '')
            if cleaned:
                neirong += cleaned + '\n\u3000\u3000'
        neirong = neirong.replace('"', '')
        ir_urldate = int(time.time())
        # "YYYYMM" fragment used to build image URLs.
        url_date = ''.join(ir_urltime.split('-')[:2])
        # BUG FIX: the original tested len(':') — always 1 — so every
        # timestamp got ':00' appended.  Count the colons in the timestamp
        # itself to normalise it to '%Y-%m-%d %H:%M:%S'.
        colons = ir_urltime.count(':')
        if colons == 0:
            ir_urltime = ir_urltime.strip() + ' 00:00:00'
        elif colons == 1:
            ir_urltime = ir_urltime + ':00'
        dt = datetime.datetime.strptime(ir_urltime, '%Y-%m-%d %H:%M:%S')
        ir_urltime = int(time.mktime(dt.timetuple()))
        try:
            ir_picture = tree.xpath('//div[@class="TRS_Editor"]//img//@src')[0].replace('./', '')
            ir_picture = 'http://www.gwytb.gov.cn/xwdt/zwyw/' + url_date + '/' + ir_picture
        except IndexError:
            # Article has no inline image.
            ir_picture = ''

        ir_md5 = get_md5(second_url)
        cursor = connect.cursor()
        connect.ping(reconnect=True)
        # Duplicate check by key instead of fetching the whole md5 column.
        cursor.execute(
            'select 1 from tw_gtbhistory_abroaddataall where ir_md5 = %s limit 1',
            (ir_md5,))
        if cursor.fetchone():
            print('数据已存在')
        else:
            # Parameterised INSERT (original interpolated values into SQL).
            sql = ('INSERT IGNORE INTO tw_gtbhistory_abroaddataall '
                   '(ir_urltime,ir_urldate,ir_content,ir_mediasourceid,ir_mediasource,'
                   'ir_imgurl,ir_title,ir_md5,ir_url,ir_videourl,if_vcj)'
                   'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
            connect.ping(reconnect=True)
            cursor.execute(sql, (ir_urltime, ir_urldate, neirong, ir_mediasourceid,
                                 ir_mediasource, ir_picture, ir_title,
                                 ir_md5, ir_url, ir_videourl, if_vcj))
            ir_idd = int(connect.insert_id())
            print('数据库自增id', ir_idd, '数据')
            # Commit the insert.
            connect.commit()
            print("tw_webhistory_abroaddataall表数据存储成功!", )

            # Notify the downstream API about the new row.
            urrl = f'http://twipad.hnxinxiudata.top/api/data/gtb_data?ir_id={ir_idd}'
            r = requests.get(urrl)
            print(r)

            print('提交成功')

    connect.close()


import pymysql

import re


def get_md5(parmStr):
    """MD5 hex digest; text input is UTF-8 encoded before hashing."""
    if isinstance(parmStr, str):
        # str is Unicode in Python 3; hashing needs bytes.
        parmStr = parmStr.encode("utf-8")
    hasher = hashlib.new("md5")
    hasher.update(parmStr)
    return hasher.hexdigest()


def today_start():
    """Return the epoch timestamp (seconds) of today's local midnight."""
    midnight = datetime.date.today().timetuple()
    return int(time.mktime(midnight))


def time_turn(time1):
    """Convert a Chinese relative/absolute time string to a Unix timestamp.

    Handles 'N小时'/'N小時' (N hours ago), 'N分钟'/'N分鐘' (N minutes ago),
    'M月D日[ HH:MM]', '昨天HH:MM' (yesterday), '刚刚' (just now), 'N天前'
    (N days ago) and '2021年M月D日 HH:MM'.  Returns an int epoch timestamp,
    0 for data judged not from this year, or None when no branch matches.

    NOTE(review): the year 2021 is hard-coded throughout — must be updated
    (or parameterised) for any other year.
    """
    try:
        try:
            # "N hours ago": subtract N*3600 seconds from now.
            TTime = time.time()
            try:
                xs = int(time1.split('小时')[0])
            except:
                xs = int(time1.split('小時')[0])  # traditional-Chinese variant
            sjc = xs * 60 * 60
            time1 = int(TTime - sjc)
            # print(time1)
            return time1
        except:
            # "N minutes ago": subtract N*60 seconds from now.
            TTime = time.time()
            try:
                xs = int(time1.split('分钟')[0])
            except:
                xs = int(time1.split('分鐘')[0])  # traditional-Chinese variant
            sjc = xs * 60
            time1 = int(TTime - sjc)
            # print(time1)
            return time1
    except:
        # Absolute / day-relative formats.
        if time1[1] == '月' or time1[2] == '月':
            # "M月D日[ HH:MM]": month/day within the hard-coded year 2021.
            if len(re.findall('(.*?)月', time1)) == 1:

                time1 = time1.replace('月', '-').replace('日', ' ')
                if ':' in time1:
                    try:
                        # NOTE(review): string concatenation cannot raise, so
                        # the except branches below appear unreachable — confirm.
                        time1 = '2021-' + time1 + ':00'
                        time1 = time1.replace(' :', ':')
                    except:
                        time1 = '2021-0' + time1 + ':00'
                        time1 = time1.replace(' :', ':')
                else:
                    try:
                        time1 = '2021-' + time1 + '00:00:00'
                        time1 = time1.replace(' :', ':')
                    except:
                        time1 = '2021-0' + time1 + '00:00:00'
                        time1 = time1.replace(' :', ':')
            else:
                time1 = time1.replace('月', '-').replace('日', ' ')
                time1 = '2021-' + time1 + '00:00:00'
                time1 = time1.replace(' :', ':')
            dt = datetime.datetime.strptime(time1, '%Y-%m-%d %H:%M:%S')
            # Convert the parsed datetime to a 10-digit epoch timestamp
            # (local time via time.mktime).
            time1 = int(str(int(time.mktime(dt.timetuple()))))
            # print(time1)
            return time1

        elif time1[0] == '昨':
            # "昨天HH:MM": yesterday's midnight plus the given hour/minute.
            time1 = (time1.split('天')[-1])
            time1 = (today_start() - 24 * 3600) + int(time1.split(':')[0]) * 3600 + int(time1.split(':')[1]) * 60
            # print(time1)
            return time1
        elif time1 == '刚刚':
            # "just now" -> current time.
            time1 = int(time.time())
            return time1
        elif '天前' in time1:
            # "N days ago".
            time1 = int(time.time()) - (int(time1.split('天')[0]) * 3600 * 24)
            return time1
        elif '2021年' in time1:
            # Full date "2021年M月D日 HH:MM".
            time1 = time1.replace('年', '-').replace('月', '-').replace('日', ' ')
            time1 = time1 + ':00'
            time1 = time1.replace(' :', ':')
            dt = datetime.datetime.strptime(time1, '%Y-%m-%d %H:%M:%S')
            time1 = int(str(int(time.mktime(dt.timetuple()))))
            return time1
        elif time1.split('年')[0] != 2021:
            # NOTE(review): compares a str against the int 2021, so this
            # condition is always True — anything reaching here is treated
            # as "not this year" and dropped.  Confirm whether the string
            # '2021' was intended.
            time1 = 0
            print('不是今年的数据,不采集')
            return time1


# Shared MySQL connection used by main2 below.
# NOTE(review): plain-text credentials in source — externalise them.
connect = pymysql.Connect(
    host='140.210.4.73',
    port=3306,
    user='twipad_cj',
    passwd='bj@#twipad_cj',
    db='tw_ipaddb',
    charset='utf8mb4'  # full Unicode support
)


def main2():
    """Crawl CCTV search results for 国台办发布会 press-conference videos
    and insert previously unseen items into ``tw_gtbhistory_abroaddataall``.

    Side effects only (HTTP requests, DB inserts, prints); returns None.
    """
    headers = {
        'cookie': 'cna=BpzIGUmpdmICAd6ABp5x12Vl; sca=6805c192; atpsida=bc71ef32008ac447bd14bc0a_1631869012_1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4557.4 Safari/537.36'
    }
    url = 'https://search.cctv.com/ifsearch.php?page=1&qtext=%E5%9B%BD%E5%8F%B0%E5%8A%9E%E5%8F%91%E5%B8%83%E4%BC%9A&sort=relevance&pageSize=20&type=video&vtime=-1&datepid=1&channel=&pageflag=0&qtext_str=%E5%9B%BD%E5%8F%B0%E5%8A%9E%E5%8F%91%E5%B8%83%E4%BC%9A'
    listing = requests.get(url=url, headers=headers).content.decode()
    # Video detail URLs embedded in the JSON-ish response body.
    video_urls = re.findall('urllink":"(.*?)","imglink":', listing)

    for second_url in video_urls:
        time.sleep(random.randint(3, 5))  # polite crawl delay

        # Unescape JSON-style "\/" (the original spelled the pattern with an
        # invalid '\/' escape and a no-op ''.join on a string).
        second_url = second_url.replace('\\/', '/')
        ir_url = second_url
        ir_mediasourceid = 2  # channel id for CCTV video results
        if_vcj = 2            # video item
        print(second_url)
        page = requests.get(url=second_url, headers=headers).content.decode()
        tree = etree.HTML(page)
        ir_urldate = int(time.time())
        ir_picture = 'https://' + tree.xpath('//meta[@property="og:image"]//@content')[0]
        # Title: two page layouts are in use.
        ir_title = tree.xpath('//div[@class="tit"]//text()')
        if not ir_title:
            ir_title = tree.xpath('//div[@class="cnt_nav"]/h3//text()')
        ir_title = ir_title[0]
        # Source channel; again two layouts.
        ir_mediasource = tree.xpath('//p[@class="info"]//i//text()')
        if not ir_mediasource:
            ir_mediasource = tree.xpath('//span[@class="laiyuan"]//text()')
        ir_mediasource = ''.join(ir_mediasource).replace('来源 : ', '')
        # Publication time: a list from the first layout, or a plain string
        # from the fallback layout.
        ir_urltime = tree.xpath('//span[@class="time"]//text()')
        if not ir_urltime:
            ir_urltime = tree.xpath('//div[@class="text_box_02"]//p//text()')[3]
        print(ir_urltime)
        if '-' in ir_urltime[0]:
            print('****************方法一**********************')
            # "YYYY-MM-DD HH:MM" -> "YYYY年MM月DD日HH:MM" for time_turn().
            date_parts = ''.join(ir_urltime).split('-')
            day_and_clock = ''.join(date_parts[-1]).split(' ')
            ir_urltime = date_parts[0] + '年' + date_parts[1] + '月' + day_and_clock[0] + '日' + day_and_clock[1]
            ir_urltime = ir_urltime.replace('2021年0', '')
            ir_urltime = time_turn(ir_urltime)
            print(ir_urltime)
        else:
            print('****************方法二**********************')
            ir_urltime = ''.join(ir_urltime).replace('2021年0', '')
            ir_urltime = time_turn(ir_urltime)
            print(ir_urltime)
        # Video pages have no article body; reuse the title as content.
        ir_content = ir_title.replace('"', '')

        ir_md5 = get_md5(second_url)

        cursor = connect.cursor()
        connect.ping(reconnect=True)
        # BUG FIX: the original selected ir_url but compared MD5 values, so
        # the duplicate check could never match.  Query by ir_md5 directly.
        cursor.execute(
            'select 1 from tw_gtbhistory_abroaddataall where ir_md5 = %s limit 1',
            (ir_md5,))
        if cursor.fetchone():
            print('数据已存在')
        else:
            # Parameterised INSERT (original interpolated values into SQL).
            sql = ('INSERT  ignore INTO tw_gtbhistory_abroaddataall '
                   '(ir_urltime,ir_urldate,ir_content,ir_mediasourceid,ir_mediasource,'
                   'ir_imgurl,ir_title,ir_md5,ir_url,ir_videourl,if_vcj)'
                   'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
            connect.ping(reconnect=True)
            cursor.execute(sql, (ir_urltime, ir_urldate, ir_content, ir_mediasourceid,
                                 ir_mediasource, ir_picture, ir_title,
                                 ir_md5, ir_url, second_url, if_vcj))
            ir_idd = int(connect.insert_id())
            if ir_idd == 0:
                # INSERT IGNORE skipped a duplicate key.
                print('数据已存在')
            else:
                print('数据库自增id', ir_idd, '数据')
                # Commit the insert.
                connect.commit()

                print("tw_gtbhistory_abroaddataall表数据存储成功!", )

                # Notify the downstream API about the new row.
                urrl = f'http://twipad.hnxinxiudata.top/api/data/gtb_data?ir_id={ir_idd}'
                r = requests.get(urrl)
                print(r)

                print('********************************************')
                print(ir_urltime, ir_urldate, ir_content, ir_mediasourceid, ir_mediasource, ir_picture, ir_title,
                      ir_md5, ir_url, second_url, if_vcj)
                print('**********************************')
    connect.close()


import datetime
import random
import time

import pymysql
import requests
from lxml import etree
import hashlib


def get_md5(parmStr):
    """Return the hex MD5 of *parmStr*; str input is UTF-8 encoded."""
    if not isinstance(parmStr, bytes):
        # Python 3 str holds Unicode text; hash over its UTF-8 bytes.
        parmStr = parmStr.encode("utf-8")
    return hashlib.md5(parmStr).hexdigest()


# Shared MySQL connection used by main5 below.
# NOTE(review): hard-coded credentials — move to configuration.
connect = pymysql.Connect(
    host='140.210.4.73',
    port=3306,
    user='twipad_cj',
    passwd='bj@#twipad_cj',
    db='tw_ipaddb',
    charset='utf8mb4'  # full Unicode support
)


def main5():
    """Crawl the 部门动态 (department updates) listing of gwytb.gov.cn and
    insert previously unseen articles into ``tw_gtbhistory_abroaddataall``.

    Side effects only (HTTP requests, DB inserts, prints); returns None.
    """
    url = 'http://www.gwytb.gov.cn/bmst'
    headers = {
        'cookie': 'Hm_lvt_eb0ec21879d705b54e0b3bbc2b4ce670=1631690955; Hm_lpvt_eb0ec21879d705b54e0b3bbc2b4ce670=1631691679',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4557.4 Safari/537.36'
    }
    # Pages are GBK encoded.
    html = requests.get(url=url, headers=headers).content.decode('gbk')
    tree = etree.HTML(html)
    # Article links (loop variable no longer shadows the list).
    second_urls = tree.xpath('//ul[@class="scdList wrapList"]/li/a//@href')
    for second_url in second_urls:
        ir_url = second_url
        ir_mediasourceid = 5  # channel id for this listing
        ir_videourl = ''
        if_vcj = 0            # not a video item
        ir_mediasource = '国台办官网'
        time.sleep(random.randint(3, 8))  # polite crawl delay
        print(second_url)
        html = requests.get(url=second_url, headers=headers).content.decode('gbk')
        tree = etree.HTML(html)
        # Title
        ir_title = tree.xpath('//div[@class="area wrapList"]/h1//text()')[0]
        # Publication time, e.g. "2021-09-15 10:30"
        ir_urltime = tree.xpath('//div[@class="area wrapList"]/div[@class="info"]/span//text()')[0]
        # Body paragraphs; fall back to the table-based layout if empty.
        ir_content = tree.xpath('//p[@align="justify"]//text()')
        if not ir_content:
            ir_content = tree.xpath('//td/p/span//text()')
        # Join cleaned paragraphs, each prefixed with full-width indent spaces.
        neirong = '\u3000\u3000'
        for piece in ir_content:
            cleaned = piece.strip().replace(' ', '').replace('\n', '').replace('\r', '')
            if cleaned:
                neirong += cleaned + '\n\u3000\u3000'
        neirong = neirong.replace('"', '')

        # "YYYYMM" fragment used to build image URLs.
        url_date = ''.join(ir_urltime.split('-')[:2])
        ir_urldate = int(time.time())
        # BUG FIX: the original tested len(':') — always 1 — so every
        # timestamp got ':00' appended.  Count the colons in the timestamp
        # itself to normalise it to '%Y-%m-%d %H:%M:%S'.
        colons = ir_urltime.count(':')
        if colons == 0:
            ir_urltime = ir_urltime.strip() + ' 00:00:00'
        elif colons == 1:
            ir_urltime = ir_urltime + ':00'
        dt = datetime.datetime.strptime(ir_urltime, '%Y-%m-%d %H:%M:%S')
        ir_urltime = int(time.mktime(dt.timetuple()))
        try:
            ir_picture = tree.xpath('//div[@class="TRS_Editor"]//img//@src')[0].replace('./', '')
            ir_picture = 'http://www.gwytb.gov.cn/xwdt/zwyw/' + url_date + '/' + ir_picture
        except IndexError:
            # Article has no inline image.
            ir_picture = ''

        ir_md5 = get_md5(second_url)

        cursor = connect.cursor()
        connect.ping(reconnect=True)
        # Duplicate check by key instead of fetching the whole md5 column.
        cursor.execute(
            'select 1 from tw_gtbhistory_abroaddataall where ir_md5 = %s limit 1',
            (ir_md5,))
        if cursor.fetchone():
            print('数据已存在')
        else:
            # Parameterised INSERT (original interpolated values into SQL).
            sql = ('INSERT IGNORE INTO tw_gtbhistory_abroaddataall '
                   '(ir_urltime,ir_urldate,ir_content,ir_mediasourceid,ir_mediasource,'
                   'ir_imgurl,ir_title,ir_md5,ir_url,ir_videourl,if_vcj)'
                   'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
            connect.ping(reconnect=True)
            cursor.execute(sql, (ir_urltime, ir_urldate, neirong, ir_mediasourceid,
                                 ir_mediasource, ir_picture, ir_title,
                                 ir_md5, ir_url, ir_videourl, if_vcj))
            ir_idd = int(connect.insert_id())
            print('数据库自增id', ir_idd, '数据')
            # Commit the insert.
            connect.commit()
            print("tw_webhistory_abroaddataall表数据存储成功!", )

            # Notify the downstream API about the new row.
            urrl = f'http://twipad.hnxinxiudata.top/api/data/gtb_data?ir_id={ir_idd}'
            r = requests.get(urrl)
            print(r)

            print('提交成功')

    connect.close()
# Entry point: run every crawler once per day.
# BUG FIX: the original wrote `while 1:` OUTSIDE the __main__ guard, so
# importing this module spun forever in an empty busy loop.  The guard now
# encloses the daily loop.
if __name__ == '__main__':
    while True:
        main1()
        main2()
        main3()
        main4()
        main5()
        time.sleep(24 * 3600)  # wait one day before the next round

  

# posted @ 2021-11-21 13:21  布都御魂  阅读(59)  评论(0编辑  收藏  举报