phpBB3导入帖子的Python脚本
关联的数据表
在phpBB3中导入用户时, 需要处理的有两张表, 一个是 topics, 一个是 posts.
为了方便与原数据关联, 需要在这两个表上新增一个字段并建立唯一索引
ALTER TABLE `topics` ADD COLUMN `ori_id` VARCHAR(32) NOT NULL DEFAULT '' AFTER `poll_vote_change`; ALTER TABLE `posts` ADD COLUMN `ori_id` VARCHAR(32) NOT NULL DEFAULT '' AFTER `post_edit_locked`; ALTER TABLE `topics` ADD UNIQUE INDEX `ori_id` USING BTREE (`ori_id`); ALTER TABLE `posts` ADD UNIQUE INDEX `ori_id` USING BTREE (`ori_id`);
如果是新安装的论坛, 在每次导入之前, 用以下语句初始化:
TRUNCATE `topics`; TRUNCATE `posts`;
需要的最小数据集
posts表, 需要的最小字段集为 `topic_id`, `forum_id`, `poster_id`, `poster_ip`, `post_time`, `enable_sig`, `post_username`, `post_subject`, `post_text`, `post_checksum`, `post_attachment`, `ori_id`
topics表, 需要的最小字段集为 `forum_id`, `topic_title`, `topic_time`, `ori_id`
导入topic和post时的处理逻辑
按创建时间将post正序排列后, 依次处理: 检查是否是topic的第一篇, 如果是则创建topic, 如果不是则去查找(目标db, 源db, 如果找不的第一篇时, 需要将自己设为第一篇), 这样topicId确定后就可以写入post了, 然后再拿写入产生的postId, 去更新topic的信息
Python代码
子函数
import rbcommon import re def getTopic(cursor, articleId): sql = 'SELECT topic_id FROM phpbb_topics WHERE ori_id = %s' cursor.execute(sql, (articleId)) row = cursor.fetchone() if (row is not None): return row['topic_id'] else: return 0 def getAuthor(cursor, name): sql = 'SELECT user_id FROM phpbb_users WHERE username_clean = %s' cursor.execute(sql, (name.lower())) row = cursor.fetchone() if (row is not None): return row['user_id'] else: print('Not exists:{}'.format(name)) exit() def insertTopic(cursor, forum_id, topic_title, topic_time, ori_id): sql = 'INSERT IGNORE INTO `phpbb_topics` (`forum_id`, `topic_title`, `topic_time`, `ori_id`) ' \ 'VALUES (%s, %s, %s, %s)' cursor.execute(sql, (forum_id, topic_title, topic_time, ori_id)) lastId = cursor.lastrowid return lastId def insertPost(cursor, topic_id, forum_id, poster_id, poster_ip, post_time, post_username, post_subject, post_text, post_attachment, ori_id): sql = 'INSERT IGNORE INTO `phpbb_posts` (`topic_id`, `forum_id`, `poster_id`, `poster_ip`, `post_time`, `enable_sig`, ' \ '`post_username`, `post_subject`, `post_text`, `post_checksum`, `post_attachment`, `ori_id`) ' \ 'VALUES (%s, %s, %s, %s, %s, 0, %s, %s, %s, \'\', %s, %s)' cursor.execute(sql, (topic_id, forum_id, poster_id, poster_ip, post_time, post_username, post_subject, post_text, post_attachment, ori_id)) lastId = cursor.lastrowid if (lastId == 0): print('Duplicate ID:>{}<'.format(ori_id)) return lastId def updateTopicFirst(cursor, authorId, postId, author, title, createdAt, topicId): sql = 'UPDATE phpbb_topics SET ' \ 'topic_poster=%s, topic_first_post_id=%s, topic_first_poster_name=%s, ' \ 'topic_last_poster_id=%s, topic_last_post_id=%s, topic_last_poster_name=%s, topic_last_post_subject=%s, topic_last_post_time=%s WHERE `topic_id`=%s' cursor.execute(sql, (authorId, postId, author, authorId, postId, author, title, createdAt, topicId)) def updateTopic(cursor, authorId, postId, author, title, createdAt, topicId): sql = 'UPDATE phpbb_topics SET topic_replies=topic_replies+1, topic_replies_real=topic_replies_real+1, ' \ 'topic_last_poster_id=%s, topic_last_post_id=%s, topic_last_poster_name=%s, topic_last_post_subject=%s, topic_last_post_time=%s WHERE `topic_id`=%s' cursor.execute(sql, (authorId, postId, author, title, createdAt, topicId))
主方法
tb_article_all = rbcommon.db['article_all'] limit = 1000 total = tb_article_all.estimated_document_count() for i in range(0, total, limit): print("\n" + '######## Start:' + str(i) + ', limit:' + str(limit) + ' ########') articles = tb_article_all.find().sort('createdAt', 1).limit(limit).skip(i) for article in articles: # extract the forumId, author, etc pos = article['_id'].find('.') forumId = article['_id'][0:pos] author = article['author'].strip() posterIp = '' if (not 'ip' in article) else article['ip'] attachments = 0 if (len(article['attachments']) == 0) else 1 # content = article['content'].replace('\\n', '\n') content = re.sub(r'\\n', '\n', article['content']) content = re.sub(r'\\r\[[;\d]{0,8}m', '', content) content = re.sub(r'\\(/|"|\')', r'\1', content) with rbcommon.mysqlclient.cursor() as cursor: # get author Id authorId = getAuthor(cursor, author) # Check if it is a topic firstPostFlag = False if (article['_id'] == article['parentId']): firstPostFlag = True # if yes, check if it exists topicId = getTopic(cursor, article['_id']) if (topicId == 0): # if not, insert a topic, get the topicId topicId = insertTopic(cursor, forumId, article['title'], article['createdAt'], article['_id']) else: # if not a topic, get the topic ID topicId = getTopic(cursor, article['parentId']) if (topicId == 0): # if not exists, find it and insert it(topic record), and get the topic ID dummy = tb_article_all.find_one({'_id': article['parentId']}) # if dummy not exists, make this post the first post if (dummy is None): dummy_title = article['title'] dummy_createdAt = article['createdAt'] dummy_author = article['author'].strip() dummy_ori_id = article['parentId'] firstPostFlag = True else: dummy_title = dummy['title'] dummy_createdAt = dummy['createdAt'] dummy_author = dummy['author'].strip() dummy_ori_id = dummy['_id'] topicId = insertTopic(cursor, forumId, dummy_title, dummy_createdAt, dummy_ori_id) # should not be 0 at this point if (topicId == 0): print('Failed to get topicId for {}'.format(article['_id'])) exit() # perform the actual post insert postId = insertPost(cursor, topicId, forumId, authorId, posterIp, article['createdAt'], author, article['title'], content, attachments, article['_id']) if (postId == 0): print('Post already exists: {}'.format(article['_id'])) rbcommon.mysqlclient.rollback() continue # update the topic if (firstPostFlag): updateTopicFirst(cursor, authorId, postId, author, article['title'], article['createdAt'], topicId) else: updateTopic(cursor, authorId, postId, author, article['title'], article['createdAt'], topicId) # commit all changes at last rbcommon.mysqlclient.commit()
同步版面数据
导入结束后, 即使同步后台数据和清空缓存, 在前台也是看不到版面文章的, 显示都是空. 可以在后台的版面管理中, 点击版面右侧的同步图标, 对每个版面进行手动同步. 如果版面较多不合适手工处理, 则可以在论坛的根目录下, 创建下面这个脚本 tmp.php:
<?php define('IN_PHPBB', true); $phpbb_root_path = (defined('PHPBB_ROOT_PATH')) ? PHPBB_ROOT_PATH : './'; $phpEx = substr(strrchr(__FILE__, '.'), 1); include($phpbb_root_path . 'common.' . $phpEx); include($phpbb_root_path . 'includes/functions_display.' . $phpEx); require($phpbb_root_path . 'includes/functions_admin.' . $phpEx); echo 'Start'; sync('forum', '', '', false, true); echo 'Done'; $cache->destroy('sql', FORUMS_TABLE); ?>
在命令行下执行命令 php tmp.php 版面的数据就全部同步了.
.