tobbc.py
#导入数据库驱动
import mysql.connector
import sqlite3
import pymysql
import re
'''
配置mysql数据库
'''
SqlitePath = "C:/Users/kang9/Desktop/BBC/bin/Debug/Ted.sqlite"
replacement_patterns = [
(r'won\'t', 'will not'),
(r'can\'t', 'cannot'),
(r'i\'m', 'i am'),
(r'ain\'t', 'is not'),
(r'(\w+)\'ll', '\g<1> will'),
(r'(\w+)n\'t', '\g<1> not'),
(r'(\w+)\'ve', '\g<1> have'),
(r'(\w+)\'s', '\g<1> is'),
(r'(\w+)\'re', '\g<1> are'),
(r'(\w+)\'d', '\g<1> would'),
(r"s\'",'s'),
(r"\'", ''),
(r"O\'", ''),
(r"o\'", '')]
class RegexpReplacer(object):
def __init__(self, patterns=replacement_patterns):
self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]
def replace(self, text):
s = text
for (pattern, repl) in self.patterns:
(s, count) = re.subn(pattern, repl, s)
return s
mydb = mysql.connector.connect(
host="localhost", # 数据库主机地址
user="root", # 数据库用户名
passwd="iyu123456", # 数据库密码
database = "newstest"
)
#创建mysql游标
mycursor = mydb.cursor()
#创建sqlite游标
#一定要填写绝对路径
conn = sqlite3.connect(SqlitePath)
cursor = conn.cursor()
#插入一条记录
mycursor.execute("select * from newstitle")
myresult = mycursor.fetchall()
replacer = RegexpReplacer()
for x in myresult:
pic = replacer.replace(x[10])
title = replacer.replace(x[1])
# title_cn = replacer.replace(x[5])
creatTime = replacer.replace(x[12])
DescCn = replacer.replace(x[2])
CategoryName = replacer.replace(x[6])
# print("x[0]:",x[0])
# print("pic:",pic)
# print("title:",title)
# # print("title_cn:",title_cn)
# print("x[8]:",x[8])
# print("creatTime:",creatTime)
# print("x[16]:",x[16])
# print("DescCn:",DescCn)
# print("CategoryName:",CategoryName)
print("ReadCount:",x[14])
print("ReadCount:",x[25])
sql = "INSERT INTO bbc(Id,Pic,Title,Title_cn,Type, Category, CreateTime,Flag,DescCn,CategoryName,Sound,ReadCount) " \
"VALUES (%s,'%s','%s', '%s' , '%s', '%s' ,'%s' ,%s , '%s', '%s' ,'%s','%s')" %\
(x[0],pic,title,x[5],x[25],CategoryName,creatTime,x[16],DescCn,CategoryName,x[8],x[14])
try:
cursor.execute(sql)
print("\033[0;36;40m", "插入新闻内容成功", "\033[0m")
conn.commit()
except Exception as e:
print("\033[0;31;40m", "插入失败:", e, "\033[0m","打印:::::", sql)
conn.rollback()
sql = "select * from bbc"
cursor.execute(sql)
values = cursor.fetchall()
for value in values:
print(value)
conn.close()
# cursor.close()
mycursor.close()
tobbcdetail.py
# coding: utf-8
import sqlite3
import mysql.connector
import mysql
import pymysql
import time
import json
import re
SqlitePath = "C:/Users/kang9/Desktop/BBC/bin/Debug/Ted.sqlite"
replacement_patterns = [
(r'won\'t', 'will not'),
(r'can\'t', 'cannot'),
(r'i\'m', 'i am'),
(r'ain\'t', 'is not'),
(r'(\w+)\'ll', '\g<1> will'),
(r'(\w+)n\'t', '\g<1> not'),
(r'(\w+)\'ve', '\g<1> have'),
(r'(\w+)\'s', '\g<1> is'),
(r'(\w+)\'re', '\g<1> are'),
(r'(\w+)\'d', '\g<1> would'),
(r"s\'",'s'),
(r"\'", ''),
(r"O\'", ''),
(r"o\'", '')]
class RegexpReplacer(object):
def __init__(self, patterns=replacement_patterns):
self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]
def replace(self, text):
s = text
for (pattern, repl) in self.patterns:
(s, count) = re.subn(pattern, repl, s)
return s
replacer = RegexpReplacer()
conn_from = pymysql.connect("localhost", "root", "iyu123456", "newstest", charset='utf8')
print("ok")
_from = conn_from.cursor()
conn_to = sqlite3.connect(SqlitePath)
print("连接sqlite3成功")
_to = conn_to.cursor()
# 处理detail表格
count3 = 0
sql_4 = 'SELECT DISTINCT NewsID FROM newstext'
_from.execute(sql_4)
idlist = _from.fetchall()
for id in idlist:
sql_5 = 'select Id from bbcdetail where Id = ?'
params_5 = (id)
_to.execute(sql_5, params_5)
if not _to.fetchall():
itemid = id[0] # 所有在newstitle 里面的newsId
sql_3 = 'SELECT EndTiming, ParaId,IdIndex, Sentence_cn,Timing,Sentence,NewsID,AudioSrc FROM newstext WHERE NewsID = %s'
params_3 = (itemid)
_from.execute(sql_3, params_3)
d = _from.fetchall() #d此时是每一整篇新闻分成的句子及其后面数据为一个元组构成的大元组
d = list(d)
paranum = 0
list1 = list(range(len(d)))
for p in d:
# 条件表达式
a = ' ' if p[0] is None else p[0]
b = ' ' if p[4] is None else p[4]
list1[paranum] = {"EndTiming":a,"ParaId": str(p[1]), "IdIndex":str(p[2]), "Sentence_cn": p[3],
"Timing":b,"Sentence": p[5],}
paranum += 1
data = [json.dumps(w, ensure_ascii=False) for w in list1]
print("id is the ",itemid)
sql_33 = 'SELECT Category,CreatTime,Title,Sound,Pic,Flag,DescCn,Title_cn,ReadCount FROM newstitle WHERE NewsID = %s'
params_33 = (itemid)
try:
_from.execute(sql_33, params_33)
dd = _from.fetchall() # d此时是每一整篇新闻分成的句子及其后面数据为一个元组构成的大元组
# print("Flag",dd[0][5],",DescCn",dd[0][6],",Title_cn",dd[0][7],",ReadCount",dd[0][8])
except Exception as e:
print("错误是",e)
aha = ' ' if dd[0][7] is None else dd[0][7]
descontent = dd[0][6]
descontent = str(descontent).replace('"',"'")
# result = '{"result": 1,"data": [' + ','.join(data) + '],"id":' + str(itemid)+',"title":{'+ '"Category":'+dd[0][0]+',"CreateTime":'+dd[0][1]+',"Title":'+dd[0][2]+',"Sound":'+ dd[0][3]+'"Pic":'+dd[0][4]+',"Type":'+dd[0][0]+',"DescCn":'+(dd[0][5])+'"Title_cn:"'+dd[0][6]+',"ReadCount":'+str(dd[0][7])+'}}'
result = '{"result": 1,"data": [' + ','.join(data) + '],"id":"' + str(itemid)+'","title":{'+ '"Category":"'+dd[0][0]+'","CreateTime":"'+dd[0][1]+'","Title":"'+dd[0][2]+'","Sound":"'+ dd[0][3] +'","Pic":"'+dd[0][4]+'","Flag":"'+str(dd[0][5])+'","Type":"'+dd[0][0]+'","DescCn":"'+descontent+'","Title_cn":"'+aha+'","series":"0","CategoryName":"'+dd[0][0]+'","id":"'+str(itemid)+'","ReadCount":"'+str(dd[0][8])+'"},"message": "success"}'
pic = replacer.replace(result)
sql_6 = "insert into bbcdetail (Id, detail) values ('%s', '%s')" % (id[0], pic)
try:
_to.execute(sql_6)
conn_to.commit()
except Exception as e:
print("这又是一次错误",e)
count3 += 1
print("插入detail成功")
print("今天插入detail内容{}条".format(count3))