一种最坏情况线性运行时间的选择算法 - The missing worst-case linear-time Select algorithm in CLRS.

选择算法也就是求一个无序数组中第K大（小）的元素的值的算法，同通常的Top K等算法密切相关。

在CLRS中提到了一种最坏情况线性运行时间的选择算法，在书中给出了如下的文字描述（没有直接给出伪代码）。

1.Divide n elements into groups of 5
2.Find median of each group (How?  How long?)
3.Use Select() recursively to find median x of the n/5 medians
4.Partition the n elements around x.  Let k = rank(x)
5.if (i == k) then return x
  if (i < k) then use Select() recursively to find ith smallest element in first partition
    else (i > k) use Select() recursively to find (i-k)th smallest element in last partition

后来在MIT公开课网站上找到了这个课件。给出了几段伪代码，但是比上面的文字描述强不了多少；

得了还是自己动手丰衣足食吧；

代码中给出了同STL标准Select算法、一般SELECT和随机化的SELECT算法的运行效率比较分析；

现以C++实现了该选择算法，以供参考。

//determines the ith smallest element of an input array.
#include <iostream>
#include <algorithm>
#include <ctime>
#include <cassert>

using namespace std;

#include "randomized_select.cpp"

size_t my_ceil(size_t m, size_t n)
{
    size_t mul, rem;
    mul = m / n;
    rem = m % n;
    if (rem > 0)
        mul++;
    return mul;
}

size_t my_floor(size_t m, size_t n)
{
    return m / n;
}

template <class T>
void exchange(T& x, T& y)
{
    T t = x; x = y; y = t;
}

template <class T>
void insertion_sort(T *a, size_t n)
{
    int i, j;
    for (j = 1; j < (int) n; ++j) {
        T key = a[j];
        // insert a[j] into the sorted sequence a[0..j-1].
        i = j - 1;
        while (i >= 0 && a[i] > key) { //>= or >
            a[i + 1] = a[i];
            i--;
        }
        a[i + 1] = key;
    }
}

template <class T>
size_t partition(T *a, size_t p, size_t r)
{
    T x = a[r];
    int i = p - 1, j; //XXX in case of negative.
    for (j = p; j < (int) r; ++j)
    if (a[j] <= x) //XXX <= or <
        exchange(a[++i], a[j]);
    exchange(a[i + 1], a[r]);
    return i + 1;
}

// find index of element x in array a of size n.
template <class T>
size_t index_of(T *a, size_t n, T x)
{
    size_t i;
    for (i = 0; i < n; ++i)
    if (a[i] == x)
        return i;
    return i;
}

// partition around pivot on index i
template <class T>
size_t partition_x(T *a, size_t n, T x)
{
    if (n == 1) //XXX zero based index.
        return 0;
    size_t p, r, i; // [p, r]
    p = 0;
    r = n - 1;
    i = index_of(a, n, x);
    if (i >= n)
        cout << "i=" << i << "; n=" << n << endl;
    assert(i < n); //XXX not found!
    exchange(a[i], a[r]);
    return partition(a, p, r);
}

#ifndef NDEBUG
#define NDEBUG
#endif

// select from [p,r]
template <class T>
T select(T *a, size_t n, size_t i)
{
    if (n < 2) // 0,1
        return a[0]; //XXX

    // 1: divide n elements into groups of 5.
    size_t ng = my_floor(n, 5); // ng group with 5 elements.
    size_t ng1 = my_ceil(n, 5); // ng1 >= ng.
    size_t rem = n - ng * 5; // elements in last group.

#ifndef NDEBUG
    cout << "i=" << i << "; n=" << n << "; ng=" << ng << "; ng1=" << ng1 << endl;
#endif

    // 2: find median of each group.
    // sort each group.
    for (size_t j = 0; j < ng; ++j)
        insertion_sort(&a[5 * j], 5);
    //   sort last group(or only group).
    if (rem > 0)
        insertion_sort(&a[5 * ng], rem);

    // picking medians
    T *medians = new T[ng1];
    for (size_t j = 0; j < ng; ++j)
        medians[j] = a[5 * j + 2];
    //   picking the last median.
    if (rem > 0) {
        size_t last_mi = ng * 5 - 1 + my_ceil(rem, 2); // (start+end)/2 floor.
        medians[ng] = a[last_mi];
    }

    // 3: use select recursively to find median x of the medians.
    // partition on median-of-medians.
    size_t mmi = my_ceil(n, 10) - 1;
    T x = select(medians, ng1, mmi);
    delete [] medians; //XXX break;

#ifndef NDEBUG
    cout << "(before partition); x=" << x.value << ";" << endl;
    for (size_t y = 0; y < 5; y++) {
        for (size_t x = 0; x < ng; x++)
        if (gp[x].e[y].is_infinite)
            cout << "*" << "\t";
        else
            cout << gp[x].e[y].value << "\t";
        cout << endl;
    }
#endif

    // 4: partition input array around pivot x.
    size_t k = partition_x(a, n, x);

#ifndef NDEBUG
    cout << "(after partition); k=" << k << endl;
    for (size_t y = 0; y < 5; y++) {
        for (size_t x = 0; x < ng; x++)
        if (gp[x].e[y].is_infinite)
            cout << "*" << "\t";
        else
            cout << gp[x].e[y].value << "\t";
        cout << endl;
    }
    cout << endl;
#endif

    if (i == k) // DONE!
        return a[k]; // value of pivot.
    else if (i < k) // L array.
        return select(a, k, i);
    else // if (i > k) // G array,
        return select(a + k + 1, n - k - 1, i - k - 1);
}

// ith from 0.
template <class T>
T try_select(T *a, size_t n, size_t ith)
{
    clock_t beg = clock();

    T x = select(a, n, ith); // [0,n-1]

    clock_t end = clock();
    cout << "Select take " << (double) (end - beg) / CLOCKS_PER_SEC << " sec." << endl;

    return x;
}

// ith start from 0.
int try_std_select(int *a, size_t n, size_t ith)
{
    clock_t beg = clock();
    nth_element(a, a + ith + 1, a + n);
    clock_t end = clock();
    cout << "STD take " << (double) (end - beg) / CLOCKS_PER_SEC << " sec." << endl;
    return a[ith];
}

// ith start from 1.
int try_rand_select(int *a, size_t n, size_t ith)
{
    clock_t beg = clock();
    int ie = randsel::randomized_select(a, 0, n - 1, ith + 1);
    clock_t end = clock();
    cout << "random select take " << (double) (end - beg) / CLOCKS_PER_SEC << " sec." << endl;
    return ie;
}

void select_test()
{
    cout << "please choose the select algorithm: " << endl
        << "1: my select(default); 2: std nth_element; 3: random select;" << endl;
    int choice;
    cin >> choice;

#define _1K (1024)
#define _1M (_1K*_1K)
#define _1G (_1K*_1M)
#define _256M (256*_1M) //2^26

    // 1k, 1m, 256m
    for (size_t n = _1K; n <= _256M; ) {

        int *a = new int[n];

        for (size_t j = 0; j < n; ++j)
            a[j] = j * 10;
        random_shuffle(a, a + n);
        int ith = n / 2;

        cout << endl << "=============== n=" << n << ", i=" << ith << " ==============" << endl;
        int ie;

        switch (choice)
        {
        case 1:
            ie = try_select(a, n, ith); // zero based index.
            break;
        case 2:
            ie = try_std_select(a, n, ith);
            break;
        case 3:
            ie = try_rand_select(a, n, ith);
            break;
        default:
            ie = try_select(a, n, ith); // zero based index.
            break;
        }

        cout << ith << "-th:" << ie << endl;
        assert(ie == ith * 10);

        delete [] a;

        // level.
        if (n == _1K)
            n = _1M;
        else if (n == _1M)
            n = _256M;
        else if (n == _256M)
            n = _1G;
        else
            n = _1G;
    }
}

posted @ 2013-10-23 18:14 wilem 阅读(1272) 评论(0) 编辑收藏举报

刷新页面返回顶部

一种最坏情况线性运行时间的选择算法 - The missing worst-case linear-time Select algorithm in CLRS.

公告