Sphinx 测试数据生成

import json
from random import sample, randint
from uuid import uuid4

def gen_random_words(path="D:\\exp\\test_data\\dictionary.txt", count=3000):
    """Return ``count`` words sampled at random from a word-list file.

    Args:
        path: Text file holding one word per line. Defaults to the
            dictionary location this script has always used.
        count: How many words to draw.

    Returns:
        A list of ``count`` words, drawn without replacement from the
        non-blank, whitespace-stripped lines of the file.

    Raises:
        IOError / OSError: If ``path`` cannot be opened.
        ValueError: If the file has fewer than ``count`` non-blank lines
            (raised by ``random.sample``).
    """
    # The with-statement closes the file, so no explicit close() is needed.
    with open(path) as f:
        stripped = (line.strip() for line in f)
        # Skip blank lines: an empty string is not a usable word and would
        # otherwise end up inside generated titles and contents.
        words = [word for word in stripped if word]
    return sample(words, count)


# Running count of every word ever passed through sample_words().
total_words = 0


def sample_words(search_words, random_words):
    """Fold ``random_words`` into the bounded ``search_words`` sample.

    ``search_words`` is modified in place and never grows beyond 1000
    entries. While it has room, each incoming word is appended. Once it is
    full, an incoming word replaces a randomly chosen entry with
    probability ``1000 / total_words``, where ``total_words`` is the
    module-level count of all words seen so far across calls.

    Args:
        search_words: List holding the current sample; updated in place.
        random_words: Iterable of words to offer to the sample.
    """
    global total_words
    capacity = 1000
    for candidate in random_words:
        total_words += 1
        if len(search_words) < capacity:
            # Still filling up: every word is kept.
            search_words.append(candidate)
            continue
        if randint(1, total_words) > capacity:
            # Lost the draw: the sample stays as it is.
            continue
        # Won the draw: overwrite one uniformly chosen slot.
        victim = randint(0, capacity - 1)
        search_words[victim] = candidate


def gen_an_event(words, search_words):
    """Build one random event with a ``title`` and a ``content`` string.

    The title is 1-10 words and the content 1-100 words, each drawn
    without replacement from ``words`` and joined with single spaces.
    Every drawn word is also offered to ``sample_words`` so that
    ``search_words`` keeps tracking words that occur in generated events.

    Args:
        words: Sequence of candidate words; needs at least as many entries
            as the largest draw, otherwise ``random.sample`` raises
            ``ValueError``.
        search_words: Running sample list, updated in place.

    Returns:
        A dict with the keys ``"title"`` and ``"content"``.
    """
    event_data = {}
    # The title is generated before the content so the random draws happen
    # in a fixed order.
    for field, max_words in (("title", 10), ("content", 100)):
        picked = sample(words, randint(1, max_words))
        sample_words(search_words, picked)
        event_data[field] = " ".join(picked)
    return event_data


def _sql_escape(text):
    """Escape ``text`` for use inside a single-quoted MySQL string literal."""
    # Backslashes are doubled first so the second step cannot be re-escaped.
    return text.replace("\\", "\\\\").replace("'", "''")


if __name__ == "__main__":
    # Script entry point:
    #   1. write `lines_cnt` random documents as SQL VALUES tuples to a
    #      uniquely named file under D:\test_data\Sphinx\, and
    #   2. dump the sampled search words as JSON for later query tests.
    search_words = []
    words = gen_random_words()
    lines_cnt = 500000
    # Generated ids start at 5; presumably this leaves ids 1-4 free for the
    # hand-written seed rows in the `sql` text kept in this file.
    first_id = 5

    file_name = str(uuid4()) + ".txt"
    outfile = "D:\\test_data\\%s\\%s" % ("Sphinx", file_name)
    # Rows are streamed straight to disk instead of being collected in a
    # 500000-element list first; the with-block closes the file on error too.
    with open(outfile, "w") as f:
        for offset in range(lines_cnt):
            event = gen_an_event(words, search_words)
            # Dictionary words may contain quotes or backslashes, which would
            # otherwise end the SQL string literal early.
            f.write("        (%d, 2, 9, NOW(), '%s', '%s'), \n" % (
                offset + first_id,
                _sql_escape(event["title"]),
                _sql_escape(event["content"]),
            ))
    # Parenthesised single-argument form prints the same text on Python 2
    # and is valid syntax on Python 3.
    print(outfile)

    outfile = "D:\\test_data\\search_words2.txt"
    with open(outfile, "w") as f:
        f.write(json.dumps(search_words))


# SQL text that (re)creates the test.documents and test.tags tables and
# inserts four seed documents (ids 1-4) plus their tag rows.
# Nothing in the visible code reads this variable; it appears to be kept as
# reference for the statement the generated "(id, 2, 9, NOW(), ...)" rows
# are meant to extend -- TODO confirm.
sql = '''
DROP TABLE IF EXISTS test.documents;
CREATE TABLE test.documents
(
        id                      INTEGER PRIMARY KEY NOT NULL AUTO_INCREMENT,
        group_id        INTEGER NOT NULL,
        group_id2       INTEGER NOT NULL,
        date_added      DATETIME NOT NULL,
        title           VARCHAR(255) NOT NULL,
        content         TEXT NOT NULL
);

REPLACE INTO test.documents ( id, group_id, group_id2, date_added, title, content ) VALUES
        ( 1, 1, 5, NOW(), 'test one', 'this is my test document number one. also checking search within phrases.' ),
        ( 2, 1, 6, NOW(), 'test two', 'this is my test document number two' ),
        ( 3, 2, 7, NOW(), 'another doc', 'this is another group' ),
        ( 4, 2, 8, NOW(), 'doc number four', 'this is to test groups' );

DROP TABLE IF EXISTS test.tags;
CREATE TABLE test.tags
(
        docid INTEGER NOT NULL,
        tagid INTEGER NOT NULL,
        UNIQUE(docid,tagid)
);

INSERT INTO test.tags VALUES
        (1,1), (1,3), (1,5), (1,7),
        (2,6), (2,4), (2,2),
        (3,15),
        (4,7), (4,40);
'''
posted @ 2017-03-31 14:59 bonelee 阅读(395) 评论(1) 收藏举报
刷新页面返回顶部
将者，智、信、仁、勇、严也。

Hi，我是李智华，华为-安全AI算法专家，欢迎来到安全攻防对抗的有趣世界。

Sphinx 测试数据生成

公告