# 中台办
import pymysql
def get_md5(parmStr):
    """Return the hexadecimal MD5 digest of *parmStr*.

    A str argument is UTF-8 encoded first (Python 3 str is unicode);
    bytes input is hashed as-is.
    """
    data = parmStr.encode("utf-8") if isinstance(parmStr, str) else parmStr
    return hashlib.md5(data).hexdigest()
# Shared module-level MySQL connection used by the crawler functions below.
# NOTE(review): credentials are hard-coded in source — move them to
# environment variables or a config file. This connect block is repeated
# several times in this file; each repetition rebinds the same name.
connect = pymysql.Connect(
    host='140.210.4.73',
    port=3306,
    user='twipad_cj',
    passwd='bj@#twipad_cj',
    db='tw_ipaddb',
    charset='utf8mb4'
)
def main4():
    """Crawl the zwyw column of gwytb.gov.cn and store new articles.

    Fetches the list page, then every linked article page; extracts the
    title, publish time, first inline image and body text; deduplicates by
    the MD5 of the article URL against tw_gtbhistory_abroaddataall; inserts
    new rows and notifies the downstream API with the auto-increment id.
    """
    url = 'http://www.gwytb.gov.cn/xwdt/zwyw/'
    headers = {
        'cookie': 'Hm_lvt_eb0ec21879d705b54e0b3bbc2b4ce670=1631690955; Hm_lpvt_eb0ec21879d705b54e0b3bbc2b4ce670=1631691679',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4557.4 Safari/537.36'
    }
    # The site serves GBK-encoded pages.
    html = requests.get(url=url, headers=headers).content.decode('gbk')
    tree = etree.HTML(html)
    second_urls = tree.xpath('//ul[@class="scdList wrapList"]/li/a//@href')
    for second_url in second_urls:
        time.sleep(random.randint(3, 5))
        print(second_url)
        ir_mediasourceid = 4  # channel id used for the zwyw column
        ir_url = second_url
        ir_mediasource = '国台办官网'
        if_vcj = 0
        ir_videourl = ''
        time.sleep(random.randint(3, 8))  # polite crawl delay
        html = requests.get(url=second_url, headers=headers).content.decode('gbk')
        tree = etree.HTML(html)
        # Title
        ir_title = tree.xpath('//div[@class="area wrapList"]/h1//text()')[0]
        # Publish time, e.g. "2021-09-15 10:30"
        ir_urltime = tree.xpath('//div[@class="area wrapList"]/div[@class="info"]/span//text()')[0]
        date_parts = ir_urltime.split('-')
        url_date = date_parts[0] + date_parts[1]  # "YYYYMM", used to build image URLs
        # Normalise the timestamp to "%Y-%m-%d %H:%M:%S".
        # BUG FIX: the original tested len(':') (always 1); count the colons
        # actually present in the timestamp instead.
        colons = ir_urltime.count(':')
        if colons == 1:
            ir_urltime = ir_urltime + ':00'
        elif colons == 0:
            ir_urltime = ir_urltime + ' 00:00:00'
        dt = datetime.datetime.strptime(ir_urltime, '%Y-%m-%d %H:%M:%S')
        ir_urltime = int(time.mktime(dt.timetuple()))
        try:
            ir_picture = tree.xpath('//div[@class="TRS_Editor"]//img//@src')[0].replace('./', '')
            ir_picture = 'http://www.gwytb.gov.cn/xwdt/zwyw/' + url_date + '/' + ir_picture
        except IndexError:
            ir_picture = ''  # article has no inline image
        # Source label (parsed for parity with the page layout; not stored)
        refrome = tree.xpath('//div[@class="area wrapList"]/div[@class="info"]/span//text()')[1]
        # Body paragraphs
        ir_content = tree.xpath('//p[@align="justify"]//text()')
        if not ir_content:
            ir_content = tree.xpath('//td/p/span//text()')
        neirong = '\u3000\u3000'  # full-width-space indent per paragraph
        for piece in ir_content:
            da = piece.strip().replace(' ', '').replace('\n', '').replace('\r', '')
            if da:
                neirong += da + '\n' + '\u3000\u3000'
        neirong = neirong.replace('"', '')
        ir_urldate = int(time.time())
        ir_md5 = get_md5(second_url)
        cursor = connect.cursor()
        connect.ping(reconnect=True)
        # Dedup: query for this URL's md5 directly instead of fetching every row.
        cursor.execute('select ir_md5 from tw_gtbhistory_abroaddataall where ir_md5=%s', (ir_md5,))
        if cursor.fetchone():
            print('数据已存在')
        else:
            # Parameterised insert — the original interpolated values into the
            # SQL string, which is injection-prone and breaks on quotes.
            sql = ('INSERT IGNORE INTO tw_gtbhistory_abroaddataall '
                   '(ir_urltime,ir_urldate,ir_content,ir_mediasourceid,ir_mediasource,'
                   'ir_imgurl,ir_title,ir_md5,ir_url,ir_videourl,if_vcj)'
                   'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
            connect.ping(reconnect=True)
            cursor.execute(sql, (ir_urltime, ir_urldate, neirong, ir_mediasourceid,
                                 ir_mediasource, ir_picture, ir_title, ir_md5,
                                 ir_url, ir_videourl, if_vcj))
            ir_idd = int(connect.insert_id())
            print('数据库自增id', ir_idd, '数据')
            connect.commit()
            print("tw_webhistory_abroaddataall表数据存储成功!")
            # Notify the downstream API about the new row.
            urrl = f'http://twipad.hnxinxiudata.top/api/data/gtb_data?ir_id={ir_idd}'
            r = requests.get(urrl)
            print(r)
            print('提交成功')
    connect.close()
import pymysql
def get_md5(parmStr):
    """Hex MD5 digest of a str (UTF-8 encoded first) or bytes value."""
    if isinstance(parmStr, str):
        # md5 only accepts bytes, so encode str input first.
        parmStr = parmStr.encode("utf-8")
    hasher = hashlib.md5()
    hasher.update(parmStr)
    return hasher.hexdigest()
# Shared module-level MySQL connection used by the crawler functions.
# NOTE(review): credentials are hard-coded in source — move them to
# environment variables or a config file. This connect block is one of
# several duplicates in this file; each rebinds the same name.
connect = pymysql.Connect(
    host='140.210.4.73',
    port=3306,
    user='twipad_cj',
    passwd='bj@#twipad_cj',
    db='tw_ipaddb',
    charset='utf8mb4'
)
def main3():
    """Crawl the xwfbh column of gwytb.gov.cn and store new articles.

    Same pipeline as the sibling crawlers: fetch the list page, fetch each
    article, extract title/time/image/body, dedup by the URL's MD5, insert
    into tw_gtbhistory_abroaddataall and notify the downstream API.
    """
    url = 'http://www.gwytb.gov.cn/xwdt/xwfb/xwfbh/'
    headers = {
        'cookie': 'Hm_lvt_eb0ec21879d705b54e0b3bbc2b4ce670=1631690955; Hm_lpvt_eb0ec21879d705b54e0b3bbc2b4ce670=1631691679',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4557.4 Safari/537.36'
    }
    # The site serves GBK-encoded pages.
    html = requests.get(url=url, headers=headers).content.decode('gbk')
    tree = etree.HTML(html)
    second_urls = tree.xpath('//ul[@class="scdList wrapList"]/li/a//@href')
    for second_url in second_urls:
        ir_url = second_url
        ir_mediasourceid = 3  # channel id used for the xwfbh column
        if_vcj = 0
        ir_videourl = ''
        ir_mediasource = '国台办官网'
        time.sleep(random.randint(3, 8))  # polite crawl delay
        print(second_url)
        html = requests.get(url=second_url, headers=headers).content.decode('gbk')
        tree = etree.HTML(html)
        ir_urldate = int(time.time())
        # Title
        ir_title = tree.xpath('//div[@class="area wrapList"]/h1//text()')[0]
        # Publish time, e.g. "2021-09-15 10:30"
        ir_urltime = tree.xpath('//div[@class="area wrapList"]/div[@class="info"]/span//text()')[0]
        # Source label (parsed for parity with the page layout; not stored)
        refrome = tree.xpath('//div[@class="area wrapList"]/div[@class="info"]/span//text()')[1]
        # Body paragraphs
        ir_content = tree.xpath('//p[@align="justify"]//text()')
        if not ir_content:
            ir_content = tree.xpath('//td/p/span//text()')
        neirong = '\u3000\u3000'  # full-width-space indent per paragraph
        for piece in ir_content:
            da = piece.strip().replace(' ', '').replace('\n', '').replace('\r', '')
            if da:
                neirong += da + '\n' + '\u3000\u3000'
        neirong = neirong.replace('"', '')
        date_parts = ir_urltime.split('-')
        url_date = date_parts[0] + date_parts[1]  # "YYYYMM", used in image URLs
        # Normalise the timestamp to "%Y-%m-%d %H:%M:%S".
        # BUG FIX: the original tested len(':') (always 1); count the colons
        # actually present in the timestamp instead.
        colons = ir_urltime.count(':')
        if colons == 1:
            ir_urltime = ir_urltime + ':00'
        elif colons == 0:
            ir_urltime = ir_urltime + ' 00:00:00'
        dt = datetime.datetime.strptime(ir_urltime, '%Y-%m-%d %H:%M:%S')
        ir_urltime = int(time.mktime(dt.timetuple()))
        try:
            ir_picture = tree.xpath('//div[@class="TRS_Editor"]//img//@src')[0].replace('./', '')
            ir_picture = 'http://www.gwytb.gov.cn/xwdt/zwyw/' + url_date + '/' + ir_picture
        except IndexError:
            ir_picture = ''  # article has no inline image
        ir_md5 = get_md5(second_url)
        cursor = connect.cursor()
        connect.ping(reconnect=True)
        # Dedup: query for this URL's md5 directly instead of fetching every row.
        cursor.execute('select ir_md5 from tw_gtbhistory_abroaddataall where ir_md5=%s', (ir_md5,))
        if cursor.fetchone():
            print('数据已存在')
        else:
            # Parameterised insert — the original interpolated values into the
            # SQL string, which is injection-prone and breaks on quotes.
            sql = ('INSERT IGNORE INTO tw_gtbhistory_abroaddataall '
                   '(ir_urltime,ir_urldate,ir_content,ir_mediasourceid,ir_mediasource,'
                   'ir_imgurl,ir_title,ir_md5,ir_url,ir_videourl,if_vcj)'
                   'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
            connect.ping(reconnect=True)
            cursor.execute(sql, (ir_urltime, ir_urldate, neirong, ir_mediasourceid,
                                 ir_mediasource, ir_picture, ir_title, ir_md5,
                                 ir_url, ir_videourl, if_vcj))
            ir_idd = int(connect.insert_id())
            print('数据库自增id', ir_idd, '数据')
            connect.commit()
            print("tw_webhistory_abroaddataall表数据存储成功!")
            # Notify the downstream API about the new row.
            urrl = f'http://twipad.hnxinxiudata.top/api/data/gtb_data?ir_id={ir_idd}'
            r = requests.get(urrl)
            print(r)
            print('提交成功')
    connect.close()
import pymysql
def get_md5(parmStr):
    """Compute the MD5 of *parmStr* and return it as a hex string.

    str input is UTF-8 encoded before hashing; bytes are hashed directly.
    """
    raw = parmStr.encode("utf-8") if isinstance(parmStr, str) else parmStr
    digest = hashlib.md5()
    digest.update(raw)
    return digest.hexdigest()
# Shared module-level MySQL connection used by the crawler functions.
# NOTE(review): credentials are hard-coded in source — move them to
# environment variables or a config file. This connect block is one of
# several duplicates in this file; each rebinds the same name.
connect = pymysql.Connect(
    host='140.210.4.73',
    port=3306,
    user='twipad_cj',
    passwd='bj@#twipad_cj',
    db='tw_ipaddb',
    charset='utf8mb4'
)
def main1():
    """Crawl the wyly column of gwytb.gov.cn and store new articles.

    Same pipeline as the sibling crawlers: fetch the list page, fetch each
    article, extract title/time/image/body, dedup by the URL's MD5, insert
    into tw_gtbhistory_abroaddataall and notify the downstream API.
    """
    url = 'http://www.gwytb.gov.cn/xwdt/xwfb/wyly/'
    headers = {
        'cookie': 'Hm_lvt_eb0ec21879d705b54e0b3bbc2b4ce670=1631690955; Hm_lpvt_eb0ec21879d705b54e0b3bbc2b4ce670=1631691679',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4557.4 Safari/537.36'
    }
    # The site serves GBK-encoded pages.
    html = requests.get(url=url, headers=headers).content.decode('gbk')
    tree = etree.HTML(html)
    second_urls = tree.xpath('//ul[@class="scdList wrapList"]/li/a//@href')
    for second_url in second_urls:
        ir_videourl = ''
        if_vcj = 0
        print(second_url)
        ir_mediasourceid = 1  # channel id used for the wyly column
        ir_mediasource = '国台办官网'
        ir_url = second_url
        time.sleep(random.randint(3, 8))  # polite crawl delay
        html = requests.get(url=second_url, headers=headers).content.decode('gbk')
        tree = etree.HTML(html)
        # Title
        ir_title = tree.xpath('//div[@class="area wrapList"]/h1//text()')[0]
        # Publish time, e.g. "2021-09-15 10:30"
        ir_urltime = tree.xpath('//div[@class="area wrapList"]/div[@class="info"]/span//text()')[0]
        # Source label (parsed for parity with the page layout; not stored)
        refrome = tree.xpath('//div[@class="area wrapList"]/div[@class="info"]/span//text()')[1]
        # Body paragraphs
        ir_content = tree.xpath('//p[@align="justify"]//text()')
        if not ir_content:
            ir_content = tree.xpath('//td/p/span//text()')
        neirong = '\u3000\u3000'  # full-width-space indent per paragraph
        for piece in ir_content:
            da = piece.strip().replace(' ', '').replace('\n', '').replace('\r', '')
            if da:
                neirong += da + '\n' + '\u3000\u3000'
        neirong = neirong.replace('"', '')
        ir_urldate = int(time.time())
        date_parts = ir_urltime.split('-')
        url_date = date_parts[0] + date_parts[1]  # "YYYYMM", used in image URLs
        # Normalise the timestamp to "%Y-%m-%d %H:%M:%S".
        # BUG FIX: the original tested len(':') (always 1); count the colons
        # actually present in the timestamp instead.
        colons = ir_urltime.count(':')
        if colons == 1:
            ir_urltime = ir_urltime + ':00'
        elif colons == 0:
            ir_urltime = ir_urltime + ' 00:00:00'
        dt = datetime.datetime.strptime(ir_urltime, '%Y-%m-%d %H:%M:%S')
        ir_urltime = int(time.mktime(dt.timetuple()))
        try:
            ir_picture = tree.xpath('//div[@class="TRS_Editor"]//img//@src')[0].replace('./', '')
            ir_picture = 'http://www.gwytb.gov.cn/xwdt/zwyw/' + url_date + '/' + ir_picture
        except IndexError:
            ir_picture = ''  # article has no inline image
        ir_md5 = get_md5(second_url)
        cursor = connect.cursor()
        connect.ping(reconnect=True)
        # Dedup: query for this URL's md5 directly instead of fetching every row.
        cursor.execute('select ir_md5 from tw_gtbhistory_abroaddataall where ir_md5=%s', (ir_md5,))
        if cursor.fetchone():
            print('数据已存在')
        else:
            # Parameterised insert — the original interpolated values into the
            # SQL string, which is injection-prone and breaks on quotes.
            sql = ('INSERT IGNORE INTO tw_gtbhistory_abroaddataall '
                   '(ir_urltime,ir_urldate,ir_content,ir_mediasourceid,ir_mediasource,'
                   'ir_imgurl,ir_title,ir_md5,ir_url,ir_videourl,if_vcj)'
                   'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
            connect.ping(reconnect=True)
            cursor.execute(sql, (ir_urltime, ir_urldate, neirong, ir_mediasourceid,
                                 ir_mediasource, ir_picture, ir_title, ir_md5,
                                 ir_url, ir_videourl, if_vcj))
            ir_idd = int(connect.insert_id())
            print('数据库自增id', ir_idd, '数据')
            connect.commit()
            print("tw_webhistory_abroaddataall表数据存储成功!")
            # Notify the downstream API about the new row.
            urrl = f'http://twipad.hnxinxiudata.top/api/data/gtb_data?ir_id={ir_idd}'
            r = requests.get(urrl)
            print(r)
            print('提交成功')
    connect.close()
import pymysql
import re
def get_md5(parmStr):
    """Return the lowercase hex MD5 of *parmStr* (str is encoded as UTF-8)."""
    payload = parmStr
    if isinstance(payload, str):
        # Hash the UTF-8 bytes of str input.
        payload = payload.encode("utf-8")
    md5_obj = hashlib.md5()
    md5_obj.update(payload)
    return md5_obj.hexdigest()
def today_start():
    """Return today's midnight (local time) as an integer Unix timestamp."""
    midnight = datetime.date.today().timetuple()
    return int(time.mktime(midnight))
def time_turn(time1):
    """Convert a Chinese relative/absolute time string to a Unix timestamp.

    Handles inputs such as "N小时"/"N小時" (N hours ago), "N分钟"/"N分鐘"
    (N minutes ago), "M月D日[ HH:MM]" (year assumed to be 2021),
    "昨天HH:MM" (yesterday), "刚刚" (just now), "N天前" (N days ago) and
    "2021年M月D日 HH:MM".

    NOTE(review): the year 2021 is hard-coded throughout; inputs judged not
    to be from 2021 return 0. Strings matching none of the branches fall
    through and return None (or may raise IndexError for very short
    strings) — confirm callers tolerate that.
    """
    try:
        try:
            # "N小时/小時" -> now minus N hours.
            TTime = time.time()
            try:
                xs = int(time1.split('小时')[0])
            except:
                xs = int(time1.split('小時')[0])
            sjc = xs * 60 * 60
            time1 = int(TTime - sjc)
            return time1
        except:
            # "N分钟/分鐘" -> now minus N minutes.
            TTime = time.time()
            try:
                xs = int(time1.split('分钟')[0])
            except:
                xs = int(time1.split('分鐘')[0])
            sjc = xs * 60
            time1 = int(TTime - sjc)
            return time1
    except:
        # Not a relative time — try the Chinese date formats below.
        if time1[1] == '月' or time1[2] == '月':
            # "M月D日[ HH:MM]" with a 1- or 2-digit month; assume year 2021.
            if len(re.findall('(.*?)月', time1)) == 1:
                time1 = time1.replace('月', '-').replace('日', ' ')
                if ':' in time1:
                    try:
                        time1 = '2021-' + time1 + ':00'
                        time1 = time1.replace(' :', ':')
                    except:
                        # NOTE(review): string concatenation cannot raise, so
                        # this zero-padding fallback looks unreachable.
                        time1 = '2021-0' + time1 + ':00'
                        time1 = time1.replace(' :', ':')
                else:
                    try:
                        time1 = '2021-' + time1 + '00:00:00'
                        time1 = time1.replace(' :', ':')
                    except:
                        # NOTE(review): unreachable for the same reason.
                        time1 = '2021-0' + time1 + '00:00:00'
                        time1 = time1.replace(' :', ':')
            else:
                time1 = time1.replace('月', '-').replace('日', ' ')
                time1 = '2021-' + time1 + '00:00:00'
                time1 = time1.replace(' :', ':')
            # Standard "%Y-%m-%d %H:%M:%S" string -> epoch seconds.
            dt = datetime.datetime.strptime(time1, '%Y-%m-%d %H:%M:%S')
            time1 = int(str(int(time.mktime(dt.timetuple()))))
            return time1
        elif time1[0] == '昨':
            # "昨天HH:MM" -> yesterday's midnight plus the clock time.
            time1 = (time1.split('天')[-1])
            time1 = (today_start() - 24 * 3600) + int(time1.split(':')[0]) * 3600 + int(time1.split(':')[1]) * 60
            return time1
        elif time1 == '刚刚':
            # "just now" -> current time.
            time1 = int(time.time())
            return time1
        elif '天前' in time1:
            # "N天前" -> now minus N days.
            time1 = int(time.time()) - (int(time1.split('天')[0]) * 3600 * 24)
            return time1
        elif '2021年' in time1:
            # "2021年M月D日 HH:MM" -> epoch seconds.
            time1 = time1.replace('年', '-').replace('月', '-').replace('日', ' ')
            time1 = time1 + ':00'
            time1 = time1.replace(' :', ':')
            dt = datetime.datetime.strptime(time1, '%Y-%m-%d %H:%M:%S')
            time1 = int(str(int(time.mktime(dt.timetuple()))))
            return time1
        elif time1.split('年')[0] != 2021:
            # NOTE(review): compares a str against the int 2021, so this
            # branch is always taken when reached — any remaining input is
            # treated as "not from this year" and dropped.
            time1 = 0
            print('不是今年的数据,不采集')
            return time1
# Shared module-level MySQL connection used by the crawler functions.
# NOTE(review): credentials are hard-coded in source — move them to
# environment variables or a config file. This connect block is one of
# several duplicates in this file; each rebinds the same name.
connect = pymysql.Connect(
    host='140.210.4.73',
    port=3306,
    user='twipad_cj',
    passwd='bj@#twipad_cj',
    db='tw_ipaddb',
    charset='utf8mb4'
)
def main2():
    """Crawl CCTV search results for press-conference videos and store them.

    Parses video page links out of the search API response, visits each
    page, extracts title / source / publish time / cover image, converts
    the Chinese time string with time_turn(), dedupes by the URL's MD5 and
    inserts rows flagged as video (if_vcj=2), notifying the downstream API.
    """
    headers = {
        'cookie': 'cna=BpzIGUmpdmICAd6ABp5x12Vl; sca=6805c192; atpsida=bc71ef32008ac447bd14bc0a_1631869012_1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4557.4 Safari/537.36'
    }
    url = 'https://search.cctv.com/ifsearch.php?page=1&qtext=%E5%9B%BD%E5%8F%B0%E5%8A%9E%E5%8F%91%E5%B8%83%E4%BC%9A&sort=relevance&pageSize=20&type=video&vtime=-1&datepid=1&channel=&pageflag=0&qtext_str=%E5%9B%BD%E5%8F%B0%E5%8A%9E%E5%8F%91%E5%B8%83%E4%BC%9A'
    html2 = requests.get(url=url, headers=headers).content.decode()
    # Pull the (JSON-escaped) video page links out of the response body.
    video_links = re.findall('urllink":"(.*?)","imglink":', html2)
    for second_url in video_links:
        time.sleep(random.randint(3, 5))
        second_url = second_url.replace('\\/', '/')  # un-escape JSON slashes
        ir_url = second_url
        ir_mediasourceid = 2  # channel id used for CCTV videos
        if_vcj = 2  # marks the row as a video entry
        print(second_url)
        html = requests.get(url=second_url, headers=headers).content.decode()
        tree = etree.HTML(html)
        ir_urldate = int(time.time())
        ir_picture = 'https://' + tree.xpath('//meta[@property="og:image"]//@content')[0]
        ir_title = tree.xpath('//div[@class="tit"]//text()')
        if not ir_title:
            ir_title = tree.xpath('//div[@class="cnt_nav"]/h3//text()')
        ir_title = ir_title[0]
        ir_mediasource = tree.xpath('//p[@class="info"]//i//text()')
        if not ir_mediasource:
            ir_mediasource = tree.xpath('//span[@class="laiyuan"]//text()')
        ir_mediasource = ''.join(ir_mediasource).replace('来源 : ', '')
        ir_urltime = tree.xpath('//span[@class="time"]//text()')
        if not ir_urltime:
            ir_urltime = tree.xpath('//div[@class="text_box_02"]//p//text()')[3]
        print(ir_urltime)
        if '-' in ir_urltime[0]:
            # "YYYY-MM-DD HH:MM" style -> rebuild as a Chinese date string
            # that time_turn() understands.
            print('****************方法一**********************')
            date_parts = ''.join(ir_urltime).split('-')
            day_and_clock = ''.join(date_parts[-1]).split(' ')
            ir_urltime = date_parts[0] + '年' + date_parts[1] + '月' + day_and_clock[0] + '日' + day_and_clock[1]
            ir_urltime = ir_urltime.replace('2021年0', '')
            ir_urltime = time_turn(ir_urltime)
            print(ir_urltime)
        else:
            print('****************方法二**********************')
            ir_urltime = ''.join(ir_urltime).replace('2021年0', '')
            ir_urltime = time_turn(ir_urltime)
            print(ir_urltime)
        # Video rows store the title as content.
        ir_content = ir_title.replace('"', '')
        ir_md5 = get_md5(second_url)
        cursor = connect.cursor()
        connect.ping(reconnect=True)
        # BUG FIX: the original selected ir_url but compared it against the
        # md5 tuple, so the dedup check never matched. Query by md5 directly.
        cursor.execute('select ir_md5 from tw_gtbhistory_abroaddataall where ir_md5=%s', (ir_md5,))
        if cursor.fetchone():
            print('数据已存在')
        else:
            # Parameterised insert — the original interpolated values into the
            # SQL string, which is injection-prone and breaks on quotes.
            sql = ('INSERT ignore INTO tw_gtbhistory_abroaddataall '
                   '(ir_urltime,ir_urldate,ir_content,ir_mediasourceid,ir_mediasource,'
                   'ir_imgurl,ir_title,ir_md5,ir_url,ir_videourl,if_vcj)'
                   'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
            connect.ping(reconnect=True)
            cursor.execute(sql, (ir_urltime, ir_urldate, ir_content, ir_mediasourceid,
                                 ir_mediasource, ir_picture, ir_title, ir_md5,
                                 ir_url, second_url, if_vcj))
            ir_idd = int(connect.insert_id())
            if ir_idd == 0:
                # INSERT IGNORE skipped a duplicate row.
                print('数据已存在')
            else:
                print('数据库自增id', ir_idd, '数据')
                connect.commit()
                print("tw_gtbhistory_abroaddataall表数据存储成功!")
                # Notify the downstream API about the new row.
                urrl = f'http://twipad.hnxinxiudata.top/api/data/gtb_data?ir_id={ir_idd}'
                r = requests.get(urrl)
                print(r)
                print('********************************************')
                print(ir_urltime, ir_urldate, ir_content, ir_mediasourceid, ir_mediasource, ir_picture, ir_title,
                      ir_md5, ir_url, second_url, if_vcj)
                print('**********************************')
    connect.close()
import datetime
import random
import time
import pymysql
import requests
from lxml import etree
import hashlib
def get_md5(parmStr):
    """MD5 hex digest helper: UTF-8-encodes str input, hashes bytes as-is."""
    encoded = parmStr.encode("utf-8") if isinstance(parmStr, str) else parmStr
    return hashlib.md5(encoded).hexdigest()
# Shared module-level MySQL connection used by the crawler functions.
# NOTE(review): credentials are hard-coded in source — move them to
# environment variables or a config file. This connect block is one of
# several duplicates in this file; each rebinds the same name.
connect = pymysql.Connect(
    host='140.210.4.73',
    port=3306,
    user='twipad_cj',
    passwd='bj@#twipad_cj',
    db='tw_ipaddb',
    charset='utf8mb4'
)
def main5():
    """Crawl the bmst column of gwytb.gov.cn and store new articles.

    Same pipeline as the sibling crawlers: fetch the list page, fetch each
    article, extract title/time/image/body, dedup by the URL's MD5, insert
    into tw_gtbhistory_abroaddataall and notify the downstream API.
    """
    url = 'http://www.gwytb.gov.cn/bmst'
    headers = {
        'cookie': 'Hm_lvt_eb0ec21879d705b54e0b3bbc2b4ce670=1631690955; Hm_lpvt_eb0ec21879d705b54e0b3bbc2b4ce670=1631691679',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4557.4 Safari/537.36'
    }
    # The site serves GBK-encoded pages.
    html = requests.get(url=url, headers=headers).content.decode('gbk')
    tree = etree.HTML(html)
    second_urls = tree.xpath('//ul[@class="scdList wrapList"]/li/a//@href')
    for second_url in second_urls:
        ir_url = second_url
        ir_mediasourceid = 5  # channel id used for the bmst column
        ir_videourl = ''
        if_vcj = 0
        ir_mediasource = '国台办官网'
        time.sleep(random.randint(3, 8))  # polite crawl delay
        print(second_url)
        html = requests.get(url=second_url, headers=headers).content.decode('gbk')
        tree = etree.HTML(html)
        # Title
        ir_title = tree.xpath('//div[@class="area wrapList"]/h1//text()')[0]
        # Publish time, e.g. "2021-09-15 10:30"
        ir_urltime = tree.xpath('//div[@class="area wrapList"]/div[@class="info"]/span//text()')[0]
        # Source label (parsed for parity with the page layout; not stored)
        refrome = tree.xpath('//div[@class="area wrapList"]/div[@class="info"]/span//text()')[1]
        # Body paragraphs
        ir_content = tree.xpath('//p[@align="justify"]//text()')
        if not ir_content:
            ir_content = tree.xpath('//td/p/span//text()')
        neirong = '\u3000\u3000'  # full-width-space indent per paragraph
        for piece in ir_content:
            da = piece.strip().replace(' ', '').replace('\n', '').replace('\r', '')
            if da:
                neirong += da + '\n' + '\u3000\u3000'
        neirong = neirong.replace('"', '')
        date_parts = ir_urltime.split('-')
        url_date = date_parts[0] + date_parts[1]  # "YYYYMM", used in image URLs
        ir_urldate = int(time.time())
        # Normalise the timestamp to "%Y-%m-%d %H:%M:%S".
        # BUG FIX: the original tested len(':') (always 1); count the colons
        # actually present in the timestamp instead.
        colons = ir_urltime.count(':')
        if colons == 1:
            ir_urltime = ir_urltime + ':00'
        elif colons == 0:
            ir_urltime = ir_urltime + ' 00:00:00'
        dt = datetime.datetime.strptime(ir_urltime, '%Y-%m-%d %H:%M:%S')
        ir_urltime = int(time.mktime(dt.timetuple()))
        try:
            ir_picture = tree.xpath('//div[@class="TRS_Editor"]//img//@src')[0].replace('./', '')
            # NOTE(review): image URLs are built under /xwdt/zwyw/ even though
            # this crawler targets /bmst — looks copy-pasted; verify against
            # the actual article markup before changing.
            ir_picture = 'http://www.gwytb.gov.cn/xwdt/zwyw/' + url_date + '/' + ir_picture
        except IndexError:
            ir_picture = ''  # article has no inline image
        ir_md5 = get_md5(second_url)
        cursor = connect.cursor()
        connect.ping(reconnect=True)
        # Dedup: query for this URL's md5 directly instead of fetching every row.
        cursor.execute('select ir_md5 from tw_gtbhistory_abroaddataall where ir_md5=%s', (ir_md5,))
        if cursor.fetchone():
            print('数据已存在')
        else:
            # Parameterised insert — the original interpolated values into the
            # SQL string, which is injection-prone and breaks on quotes.
            sql = ('INSERT IGNORE INTO tw_gtbhistory_abroaddataall '
                   '(ir_urltime,ir_urldate,ir_content,ir_mediasourceid,ir_mediasource,'
                   'ir_imgurl,ir_title,ir_md5,ir_url,ir_videourl,if_vcj)'
                   'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
            connect.ping(reconnect=True)
            cursor.execute(sql, (ir_urltime, ir_urldate, neirong, ir_mediasourceid,
                                 ir_mediasource, ir_picture, ir_title, ir_md5,
                                 ir_url, ir_videourl, if_vcj))
            ir_idd = int(connect.insert_id())
            print('数据库自增id', ir_idd, '数据')
            connect.commit()
            print("tw_webhistory_abroaddataall表数据存储成功!")
            # Notify the downstream API about the new row.
            urrl = f'http://twipad.hnxinxiudata.top/api/data/gtb_data?ir_id={ir_idd}'
            r = requests.get(urrl)
            print(r)
            print('提交成功')
    connect.close()
# BUG FIX: the original nested the __name__ guard *inside* `while 1`, so
# importing this module spun forever doing nothing. Guard first, loop after.
if __name__ == '__main__':
    # Run the whole crawl suite once per day.
    while True:
        main1()
        main2()
        main3()
        main4()
        main5()
        time.sleep(24 * 3600)