sphinx测试数据生成
import json from random import sample, randint from uuid import uuid4 def gen_random_words(): with open("D:\\exp\\test_data\\dictionary.txt") as f: words = [word.strip() for word in f] f.close() # print "OK. words length:", len(words) return sample(words, 3000) return [] total_words = 0 def sample_words(search_words, random_words): global total_words sample_cnt = 1000 for word in random_words: total_words += 1 if len(search_words) < sample_cnt: search_words.append(word) else: if randint(1, total_words) <= sample_cnt: kick_off = randint(0, sample_cnt-1) search_words[kick_off] = word def gen_an_event(words, search_words): query_words = sample(words, randint(1, 10)) sample_words(search_words,query_words) title = " ".join(query_words) query_words = sample(words, randint(1, 100)) sample_words(search_words,query_words) content = " ".join(query_words) event_data = {"title": title, "content": content} return event_data if __name__ == "__main__": search_words = [] for i in range(1): words = gen_random_words() lines_cnt = 500000 es_out_put = [""]*lines_cnt for i in range(0, lines_cnt): event = gen_an_event(words, search_words) es_out_put[i] = " (%d, 2, 9, NOW(), '%s', '%s'), \n" % (i+5, event["title"], event["content"]) # print es_out_put # print splunk_out_put out_puts = [es_out_put] file_name = str(uuid4()) + ".txt" for i,dir_name in enumerate(["Sphinx"]): outfile = "D:\\test_data\\%s\\%s" % (dir_name, file_name) f = open(outfile, "w") for j in range(0, lines_cnt): f.write(out_puts[i][j]) f.close() print outfile outfile = "D:\\test_data\\search_words2.txt" f = open(outfile, "w") f.write(json.dumps(search_words)) f.close() sql = ''' DROP TABLE IF EXISTS test.documents; CREATE TABLE test.documents ( id INTEGER PRIMARY KEY NOT NULL AUTO_INCREMENT, group_id INTEGER NOT NULL, group_id2 INTEGER NOT NULL, date_added DATETIME NOT NULL, title VARCHAR(255) NOT NULL, content TEXT NOT NULL ); REPLACE INTO test.documents ( id, group_id, group_id2, date_added, title, content ) VALUES ( 1, 1, 5, NOW(), 'test one', 'this is my test document number one. also checking search within phrases.' ), ( 2, 1, 6, NOW(), 'test two', 'this is my test document number two' ), ( 3, 2, 7, NOW(), 'another doc', 'this is another group' ), ( 4, 2, 8, NOW(), 'doc number four', 'this is to test groups' ); DROP TABLE IF EXISTS test.tags; CREATE TABLE test.tags ( docid INTEGER NOT NULL, tagid INTEGER NOT NULL, UNIQUE(docid,tagid) ); INSERT INTO test.tags VALUES (1,1), (1,3), (1,5), (1,7), (2,6), (2,4), (2,2), (3,15), (4,7), (4,40); '''