Loading

miniblast_hash算法c语言实现

对于一组基因文件中的基因序列,选取一段基因片段,作为索引,利用hash表,查找固定的基因片段。有一定的并且容忍错误。

简单讲就是自己实现一个hashtable,将选出特定字符串建立索引,便于查询。输出时可容忍一定数量的错误。

贴上代码

HashTable.h

#include <iostream>
using namespace std;

typedef int KeyType;

#define NULLKEY -1

struct Entry {
    KeyType _key;
    string    _value;
    Entry(KeyType key = NULLKEY, string value = "") :_key(key), _value(value) {}
};

class hashTable {
public:
    hashTable();
    hashTable::hashTable(int table_size, double occupancy_get);
    //hashTable(int tableSize);
    ~hashTable();

    bool find(const Entry&  e);
    string hashTable::getValue(const KeyType& k);
    bool insert(const Entry& e);
    bool remove(const Entry& e);
    void clear();
    Entry& operator[](KeyType key);//重载下标操作;当找不到key对应的Entry时,插入Entry(key,0)
    int size();
    void display();

protected:
    int hashFunction(KeyType key);//将键值映射到对应地址
    void rehash();//调整hashTable大小
    bool find(const KeyType& k);//按键值查找
    int nextPrime();//p(n) = n^2 - n + 41, n<41, p<1681

private:
    Entry *_pTable;
    int _pos;//当前访问元素的位置
    int _size;
    int _capacity;
    int primeIndex;
};

HashTable.cpp

#include "HashTable.h"
static double _occupancy = 0.5;
//开放定址法
hashTable::hashTable()
{
    _capacity = 100;//初始化hashTable容量为100,便于观察rehash过程
    _pTable = new Entry[_capacity];
    _size = 0;
    primeIndex = 1;
}
hashTable::hashTable(int table_size,double occupancy_get)
{
    _capacity = table_size;
    _occupancy = occupancy_get;
    _pTable = new Entry[_capacity];
    _size = 0;
    primeIndex = 1;
}


hashTable::~hashTable()
{
    clear();
}

int hashTable::nextPrime()
{
    int p = std::pow(static_cast<float>(primeIndex), 2) - primeIndex +3777;
    primeIndex = primeIndex << 1;
    if (primeIndex >= 3777) {
        cout << "Max capacity reached. exit!" << endl;
        exit(-1);
    }
    return p;
}

bool hashTable::find(const Entry&  e)
{
    return(find(e._key));
}

bool hashTable::find(const KeyType& k)
{
    _pos = hashFunction(k);
    if (_pTable[_pos]._key == NULLKEY)
        return false;
    int lastPos = _pos;
    while (_pTable[_pos]._key != k) {
        if (++_pos%_capacity == lastPos)
            return false;
    }
    return true;
}

string hashTable::getValue(const KeyType& k)
{
    _pos = hashFunction(k);
    if (_pTable[_pos]._key == NULLKEY)
        return "";
    int lastPos = _pos;
    while (_pTable[_pos]._key != k) {
        if (++_pos%_capacity == lastPos)
            return "";
    }
    return _pTable[_pos]._value;
}

bool hashTable::insert(const Entry& e)
{
    if ((_size*1.0) / _capacity>_occupancy) //通过occupancy判断是否需要扩容
        rehash();//[OK]插入操作前,需要判断hash table是否需要扩容
    if (find(e))
        return false;
    _pTable[_pos] = e;
    ++_size;
    return true;
}

bool hashTable::remove(const Entry& e)
{
    if (!find(e))
        return false;
    _pTable[_pos]._key = NULLKEY;
    --_size;
    //rehash();//移除操作后,需要判断hash table是否需要缩容
    return true;
}

void hashTable::clear()
{
    delete[]_pTable;
    _size = _capacity = 0;
}

Entry& hashTable::operator[](KeyType key)
{
    if (!find(key))
        insert(Entry(key, 0));
    return _pTable[_pos];
}

int hashTable::size()
{
    return _size;
}

int hashTable::hashFunction(KeyType key)
{
    return key%_capacity;
}

void hashTable::display()
{
    cout << "capacity = " << _capacity << ", size = " << _size << endl;
    for (int i = 0; i<_capacity; i++) {
        if (_pTable[i]._key != NULLKEY);
//            cout << "key=" << _pTable[i]._key << ",value=" << _pTable[i]._value << endl;
    }
}


void hashTable::rehash()
{
    //cout << "begin rehash..." << endl;
    Entry *p = new Entry[_size];//用来暂存原哈希表
    for (int i = 0; i<_capacity; i++) {//i<_size不对;元素散列在容量为_capacity的hashTable中
        if (_pTable[i]._key != NULLKEY)
            *(p + i) = _pTable[i];
    }
    delete[]_pTable;
    int lastSize = _size;
    _size = 0;

    _capacity = nextPrime();
    _pTable = new Entry[_capacity];
    for (int i = 0; i<lastSize; i++)
        insert(*(p + i));
    delete[]p;
}

miniblast_hash.cpp

#include <iostream>
#include <string>
#include <fstream>
#include "HashTable.h"

using namespace std;
const int KMER_MAX = 100;

//得到基因组文件中,position开始,长度为len的字符串
string getStringByPos(string file_name, long position,int str_len) {
    //读入数据
    std::ifstream myfile(file_name);
    if (!myfile.is_open())
    {
        cout << "未成功打开文件" << endl;
    }
    string s = "";
    //移动文件指针到基因指定位置,因为有无用字符,需要边移动边判断
    for (long i = 0; i < position; i++)
    {
        char c;
        myfile.get(c);
        //移动指针的过程中要忽略无意义字符
        if (c=='/0'||c=='\n')
            i--;
    }
    //截取指定长度的字符串
    for (long i = 0; i < str_len; i++)
    {
        char c;
        myfile.get(c);
        //忽略无意义字符
        if (c == '/0' || c == '\n')
            i--;
        else
        {
            switch (c)
            {
            case 'A':
                s.append("A");
                break;
            case 'G':
                s.append("G");
                break;
            case 'C':
                s.append("C");
                break;
            case 'T':
                s.append("T");
                break;
            default:
                break;
            }
        }
    }
    return s;
}
//比较检索基因片段与原始片段相差个数
int compareString(string s1, string s2, int len) {
    int num = 0;
    for (int i = 0; i < len; i++)
    {
        if (s1[i] != s2[i])
            num++;
    }
    return num;
}
//搜索指定基因片段
void searchGenome(string file_name,hashTable *pTable,int mistakeNum,int kmer,string query_string,string out_name) {
    char *kmer_cut = new char[kmer];
    int length = query_string.length();
    int findNum = 0;
    int table_size = pTable->size();
    //std::ofstream out(out_name, ios::app);//输出结果至文件
    for (int i = 0; i <kmer; i++)
    {
        kmer_cut[i] = query_string[i];
    }
    string kmer_string(&kmer_cut[0],&kmer_cut[kmer]);
    for (int i = 0; i <table_size; i++) {
        
        string get = pTable->getValue(i);
        if (kmer_string == get)
        {
            
            if (i + length > table_size)
                continue;
            string ss = getStringByPos(file_name, i, length);
            int num = compareString(ss, query_string,length);
            if (num<=mistakeNum)
            {
                findNum++;
                cout <<i<<" "<<num<<" "<<ss << endl;
            //    out << i <<" "<< num <<" " << ss << endl;
            }
        }
    }
    
    if (findNum==0)
    {
        cout<< "No Match" << endl;
    //    out << "No Match" << endl;
    }
    //out.close();
}
//构建基因搜索用的hashtable索引
void GenomeIndex(string genome_file_name,int kmer, hashTable *pTable) {
    //读入数据,建立索引
    std::ifstream myfile(genome_file_name);
    if (!myfile.is_open())
    {
        cout << "未成功打开文件" << endl;
    }
    char *c = new char[KMER_MAX];
    myfile.get(c, kmer + 1);
    string add(&c[0], &c[kmer]);
    int geno_loc = 0;
    pTable->insert(Entry(0, add));
    //cout << add << endl;
    while (!myfile.eof())
    {
        char next_char;
        myfile.get(next_char);
        //去除文件中无用字符
        if (next_char != '/0'&&next_char != '\n')
        {
            for (int i = 0; i < kmer; i++) 
                c[i] = c[i + 1];
                
            c[kmer-1] = next_char;
            string add(&c[0], &c[kmer]);
            geno_loc++;
            pTable->insert(Entry(geno_loc, add));
            //cout << add <<geno_loc<< endl;
        }
    }
    myfile.close();
}

int main(int   argc, char*   argv[])
{
    
    /*string input_file_name ="";
    string out_file_name = "";
    if (argc>1)
    {
        input_file_name = string(argv[1]);
        out_file_name = string(argv[2]);
    }*/
    //如果不需要命令行参数执行文件,直接赋值input_file_name就好
    //input_file_name = "input_small.txt";
    std::freopen(argv[1], "r", stdin);
    std::freopen(argv[2], "w", stdout);
    //std::ifstream inputfile(input_file_name);
    
    
    hashTable *pTable=new hashTable();
    /*if (!inputfile.is_open())
    {
        cout << "未成功打开文件" << endl;
    }*/
    //先读取命令文件,设置相关参数,初始值为随意指定,无意义
    string str_order = "";
    string genome_file_name = "";
    int table_size =20;
    double occupancy =0.4;
    int kmer = 4;
    int mistakeNum = 0;
    string query_string;
    bool isIndex = false;
    while (str_order!="quit")
    {
        cin >> str_order;
        if (str_order=="genome")
        {
            cin >> genome_file_name;
        }
        else if (str_order == "table_size") {
            cin >> table_size;
        }
        else if (str_order == "occupancy") {
            cin >> occupancy;
            pTable = new hashTable(table_size, occupancy);
        }
        else if (str_order == "kmer") {
            cin >> kmer;
        }
        else if (str_order == "query") {
            cin >> mistakeNum >> query_string;
            //首次进行查询前,要建立索引hashTable
            if (!isIndex)
            {
                isIndex = true;
                GenomeIndex(genome_file_name, kmer, pTable);
            }
            //std::ofstream out(out_file_name,ios::app);//输出结果至文件
            /*if (out.is_open())
            {
                out << "Query: " << query_string <<endl;
                
            }*/
            cout << "Query: " << query_string << endl;
            //out.close();
            searchGenome(genome_file_name, pTable, mistakeNum, kmer, query_string, "");
        }
    }
    
    delete pTable;//释放指针,防止内存泄露
    //getchar();//暂停程序,观察结果
    return 0;
}

  因为需要,代码上有详细注解,欢迎交流学习。

posted @ 2018-05-16 11:18  CodeMonkey404  阅读(522)  评论(0编辑  收藏  举报