Using HBase from Python

The script below checks per-sample SNP chip files against genotype values stored in an HBase table (via happybase), fanning the per-file comparisons out over a multiprocessing pool and printing a match rate for each sample barcode.

# coding: utf-8
__author__ = 'similarface'
from multiprocessing import Process
import happybase
import os
import re
import hashlib
import multiprocessing
from multiprocessing import Queue

# Directory holding the per-sample *.snp chip exports.
basedir = "/tmp/t8"
# Reference SNP list used to filter which sites are checked.
filterpath = "/Users/similarface/Documents/20170303Morgene999ProductFullSNP.txt"
# Sample barcodes look like 123-4567-8901 and are embedded in the file names.
pattern_barcode = re.compile(r'[0-9]{3}[-][0-9]{4}[-][0-9]{4}')
# Split fields on any run of whitespace.
pattern_ls = re.compile(r'\s+')
def func(filepath, snpkey):
    """Compare one sample's SNP calls against the values stored in HBase."""
    conn = happybase.Connection(host='192.168.30.250')
    table = conn.table('chipdata')
    # The sample barcode is embedded in the file name.
    barcode = pattern_barcode.findall(filepath)[0]
    matched = 0
    total = 0
    with open(filepath, 'rb') as foper:
        for line in foper:
            try:
                fields = pattern_ls.split(line.strip())
                chrom = fields[1]
                pos = fields[2]
                key = chrom + ":" + pos
                if key in snpkey:
                    total += 1
                    # Row key: md5 of the position, then ":", then the chromosome.
                    m = hashlib.md5()
                    m.update(pos.strip())
                    rowkey = m.hexdigest() + ":" + chrom.upper()
                    dictkey = 'd:' + barcode
                    row = table.row(rowkey, columns=[dictkey])
                    if row[dictkey] == fields[3]:
                        matched += 1
            except Exception:
                # Skip malformed lines and rows missing this barcode's column.
                pass
    if total:
        print barcode + ": " + format((matched + 0.0) / total, '0.1%') + " match " + str(matched)
        # q.put(barcode + ": " + format((matched + 0.0) / total, '0.1%'))
    conn.close()
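
# For reference, a minimal sketch of how the 'chipdata' table read above
# could be created. This is an assumption, not part of the original script:
# the post only shows reads from column family 'd' (one column per barcode).
#
#   conn = happybase.Connection(host='192.168.30.250')
#   conn.create_table('chipdata', {'d': dict()})
#   conn.close()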

def read(q):
    """Optional reporter: drain results from the queue (see the commented-out Process below)."""
    while True:
        value = q.get(True)
        print 'Get %s from queue.' % value
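
# Note: a plain multiprocessing.Queue cannot be handed to Pool workers via
# apply_async (pickling it raises "Queue objects should only be shared
# between processes through inheritance"). If the commented-out reader
# Process is revived together with q.put() calls in func, a managed queue is
# one workaround -- a sketch under that assumption, not the original code:
#
#   from multiprocessing import Manager
#   manager = Manager()
#   q = manager.Queue()
#   pool.apply_async(func, args=(filepath, snpkey, q))  # func would need a q parameter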



if __name__ == "__main__":
    pool = multiprocessing.Pool(processes=3)
    snpkey = {}
    q = Queue()  # only used if the reader Process below is revived
    # Load the reference SNP list: the first two whitespace-separated
    # columns form the "chr:pos" lookup key.
    with open(filterpath, 'rb') as oper:
        for line in oper:
            if line.strip() != "":
                fields = pattern_ls.split(line.strip())
                snpkey[':'.join(fields[0:2])] = ""

    # pr = Process(target=read, args=(q,))
    # pr.start()

    for filename in os.listdir(basedir):
        if filename.endswith("snp"):
            filepath = os.path.join(basedir, filename)
            # The pool keeps at most `processes` workers running; as one
            # finishes, the next queued task starts.
            pool.apply_async(func, args=(filepath, snpkey))

    print "Mark~ Mark~ Mark~~~~~~~~~~~~~~~~~~~~~~"
    # close() must be called before join(), otherwise join() raises an error;
    # after close() no new tasks can be submitted, and join() waits for all
    # workers to finish.
    pool.close()
    pool.join()
    print "Sub-process(es) done."
    # pr.terminate()
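
# Because apply_async reports worker exceptions only through its AsyncResult,
# errors inside func are otherwise silently lost. One variant (a sketch, not
# the original code) keeps the handles and calls get() after the pool joins:
#
#   results = []
#   for filename in os.listdir(basedir):
#       if filename.endswith("snp"):
#           results.append(pool.apply_async(func, args=(os.path.join(basedir, filename), snpkey)))
#   pool.close()
#   pool.join()
#   for r in results:
#       r.get()   # re-raises any exception raised in the worker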

 
