// High-performance computing course, assignment 4, problem 2: source code.
#include <math.h>

#include <cstdlib>
#include <iostream>
#include <limits>
#include <optional>
#include <string>

#include "dpc_common.hpp"
using namespace sycl;
using namespace std;
#define DEBUG 0
/**
 * Sorts 2^n integers in ascending order on the device with a bitonic sorting
 * network, operating directly on a raw pointer.
 *
 * The kernels dereference `data_gpu`, so the memory must be accessible from
 * the device behind `q` (the caller in this file passes a `malloc_shared`
 * allocation).
 *
 * @param data_gpu Pointer to the 2^n elements to sort in place.
 * @param n        Exponent of the array size; the array holds 2^n elements.
 * @param q        Queue every pass is submitted to; the function returns only
 *                 after `q.wait()` has completed.
 */
void ParallelBitonicSort(int data_gpu[], int n, queue &q) {
  // Fewer than two elements: no pass is needed, but still return only once the
  // queue is idle, as the normal path does. This also keeps the shift below
  // well defined.
  if (n <= 0) {
    q.wait();
    return;
  }

  // Exact integer power of two (no floating-point pow() round trip).
  const int size = 1 << n;
  int *a = data_gpu;

  // Event of the most recently submitted pass; empty until the first submit.
  std::optional<event> last_event;

  for (int step = 0; step < n; step++) {
    for (int stage = step; stage >= 0; stage--) {
      // Length of the sequences processed in this pass.
      const int seq_len = 1 << (stage + 1);
      // Number of consecutive sequences that share one sort direction.
      const int two_power = 1 << (step - stage);

      last_event = q.submit([&](auto &h) {
        // Every pass reads what the previous one wrote, so chain the passes
        // explicitly through their events.
        if (last_event.has_value()) h.depends_on(last_event.value());

        h.parallel_for(range<1>(size), [=](id<1> i) {
          const int idx = static_cast<int>(i[0]);
          const int seq_num = idx / seq_len;
          const int h_len = seq_len / 2;

          // Only work-items in the lower half of a sequence perform the
          // compare-exchange; the upper half is handled by its partner.
          if (idx >= (seq_len * seq_num) + h_len) return;

          const int swapped_ele = idx + h_len;
          // Even-numbered groups sort ascending, odd-numbered descending.
          const bool increasing = ((seq_num / two_power) % 2) == 0;

          const int left = a[idx];
          const int right = a[swapped_ele];
          const bool out_of_order = increasing ? (left > right) : (left < right);
          if (out_of_order) {
            a[idx] = right;
            a[swapped_ele] = left;
          }
        });
      });
    }  // end stage
  }    // end step

  q.wait();
}
void ParallelBitonicSortBuffer(int data_gpu[], int n, queue &q) {
int size = pow(2, n);
buffer input(data_gpu, range(size));
for (int step = 0; step < n; step++) {
for (int stage = step; stage >= 0; stage--) {
int seq_len = pow(2, stage + 1);
int two_power = 1 << (step - stage);
q.submit([&](auto &h) {
accessor a(input, h);
h.parallel_for(size, [=](id<1> i) {
int seq_num = i / seq_len;
int swapped_ele = -1;
int h_len = seq_len / 2;
if (i < (seq_len * seq_num) + h_len) swapped_ele = i + h_len;
int odd = seq_num / two_power;
bool increasing = ((odd % 2) == 0);
if (swapped_ele != -1) {
if (((a[i] > a[swapped_ele]) && increasing) ||
((a[i] < a[swapped_ele]) && !increasing)) {
int temp = a[i];
a[i] = a[swapped_ele];
a[swapped_ele] = temp;
}
}
});
});
} // end stage
} // end step
}
/**
 * Performs one serial compare-exchange pass of the bitonic sorting network.
 *
 * The array is viewed as `num_sequence` consecutive sequences of `seq_len`
 * elements. Within each sequence, element i of the lower half is compared
 * with element i + seq_len / 2 and the two are swapped when they violate the
 * sequence's sort direction.
 *
 * @param step         Current outer step of the sort.
 * @param stage        Current stage within the step; must satisfy
 *                     0 <= stage <= step.
 * @param num_sequence Number of sequences to process.
 * @param seq_len      Number of elements per sequence.
 * @param array        Array of at least num_sequence * seq_len elements,
 *                     modified in place.
 */
void SwapElements(int step, int stage, int num_sequence, int seq_len,
                  int *array) {
  // Number of consecutive sequences sharing one direction. Computed once with
  // exact integer arithmetic instead of calling pow() for every sequence.
  const int two_power = 1 << (step - stage);
  const int h_len = seq_len / 2;

  for (int seq_num = 0; seq_num < num_sequence; seq_num++) {
    // Even-numbered groups sort ascending, odd-numbered descending.
    const bool increasing = ((seq_num / two_power) % 2) == 0;
    const int first = seq_num * seq_len;

    // For all elements in the lower half of the sequence, swap with the
    // partner in the upper half if the pair is out of order.
    for (int i = first; i < first + h_len; i++) {
      const int swapped_ele = i + h_len;
      const bool out_of_order = increasing
                                    ? (array[i] > array[swapped_ele])
                                    : (array[i] < array[swapped_ele]);
      if (out_of_order) {
        const int temp = array[i];
        array[i] = array[swapped_ele];
        array[swapped_ele] = temp;
      }
    }  // end for all elements in a sequence
  }    // end all sequences
}
/**
 * Serial bitonic sort of 2^n integers into ascending order.
 *
 * Runs n steps; step s consists of stages s, s-1, ..., 0, and every stage is
 * one compare-exchange pass over the whole array done by SwapElements.
 *
 * @param a Array of 2^n elements, sorted in place.
 * @param n Exponent of the array size; nothing is done when n <= 0.
 */
inline void BitonicSort(int a[], int n) {
  for (int step = 0; step < n; step++) {
    for (int stage = step; stage >= 0; stage--) {
      // Exact integer powers of two; stage <= step < n keeps both shift
      // amounts non-negative.
      const int num_sequence = 1 << (n - stage - 1);
      const int sequence_len = 1 << (stage + 1);
      SwapElements(step, stage, num_sequence, sequence_len, a);
    }
  }
}
/**
 * Prints the first `array_size` elements of `a` to standard output on one
 * line, each followed by a single space, and ends the line with a newline.
 *
 * @param a          Array to print.
 * @param array_size Number of elements to print.
 */
void DisplayArray(int a[], int array_size) {
  int index = 0;
  while (index < array_size) {
    std::cout << a[index] << " ";
    ++index;
  }
  std::cout << "\n";
}
/**
 * Prints the command-line help text to standard output.
 *
 * @param prog_name Program name shown in the usage line.
 * @param exponent  Exclusive upper bound for the exponent argument; the text
 *                  advertises the accepted range as 0 .. exponent - 1.
 */
void Usage(std::string prog_name, int exponent) {
  std::cout << " Incorrect parameters\n";
  std::cout << " Usage: " << prog_name << " n k \n\n";
  std::cout << " n: Integer exponent representing the size of the input array. "
               "The number of elements in\n";
  std::cout << " the array must be power of 2 (e.g., 1, 2, 4, ...). Please "
               "enter the corresponding\n";
  std::cout << " exponent between 0 and " << exponent - 1 << ".\n";
  std::cout << " k: Seed used to generate a random sequence.\n";
}
/**
 * Program entry point.
 *
 * Command line: `<prog> n k`, where n is the exponent of the array size
 * (the array holds 2^n elements) and k seeds the random input. The same random
 * data is sorted three times - on the device through a buffer, on the device
 * through USM, and serially on the host - each run is timed, and the two
 * device results are checked against the host result.
 *
 * @return  0 when both device results are sorted and match the host result,
 *         -1 when the arguments are missing or invalid,
 *         -2 when verification fails,
 *         -3 when a working array could not be allocated.
 */
int main(int argc, char *argv[]) {
  // Exclusive upper bound for the exponent so that 2^n stays a positive int
  // (30 when int has 31 value bits).
  const int exp_max = numeric_limits<int>::digits - 1;

  // argv[0] is not guaranteed to be present; fall back to a fixed name so the
  // usage text can always be printed.
  const string prog_name =
      (argc > 0 && argv[0] != nullptr) ? argv[0] : "bitonic-sort";

  // Both positional arguments are required; never read past argc.
  if (argc < 3) {
    Usage(prog_name, exp_max);
    return -1;
  }

  int n = 0;
  int seed = 0;
  try {
    n = stoi(argv[1]);
    seed = stoi(argv[2]);
  } catch (...) {
    // stoi throws on non-numeric or out-of-range text; treat both as bad input.
    Usage(prog_name, exp_max);
    return -1;
  }
  if (n < 0 || n >= exp_max) {
    Usage(prog_name, exp_max);
    return -1;
  }

  // Exact integer power of two; n < exp_max guarantees it fits in an int.
  const int size = 1 << n;

  cout << "\nArray size: " << size << ", seed: " << seed << "\n";

  queue q;
  cout << "Device: " << q.get_device().get_info<info::device::name>() << "\n";

  // Three copies of the same input: host reference, USM and buffer variants.
  const size_t bytes = static_cast<size_t>(size) * sizeof(int);
  int *data_cpu = static_cast<int *>(malloc(bytes));
  int *data_usm = malloc_shared<int>(size, q);
  int *data_gpu = static_cast<int *>(malloc(bytes));

  if (data_cpu == nullptr || data_usm == nullptr || data_gpu == nullptr) {
    cerr << "\nFailed to allocate memory for " << size << " elements\n";
    free(data_cpu);
    if (data_usm != nullptr) free(data_usm, q);
    free(data_gpu);
    return -3;
  }

  srand(seed);
  for (int i = 0; i < size; i++)
    data_usm[i] = data_gpu[i] = data_cpu[i] = rand() % 1000;

#if DEBUG
  cout << "\ndata before:\n";
  DisplayArray(data_usm, size);
#endif

  dpc_common::TimeInterval t_par1;
  ParallelBitonicSortBuffer(data_gpu, n, q);
  cout << "Kernel time using buffer allocation: " << t_par1.Elapsed()
       << " sec\n";

#if DEBUG
  cout << "\ndata_gpu after sorting using parallel bitonic sort:\n";
  DisplayArray(data_gpu, size);
#endif

  dpc_common::TimeInterval t_par2;
  ParallelBitonicSort(data_usm, n, q);
  cout << "Kernel time using USM: " << t_par2.Elapsed() << " sec\n";

#if DEBUG
  cout << "\ndata_usm after sorting using parallel bitonic sort:\n";
  DisplayArray(data_usm, size);
#endif

  dpc_common::TimeInterval t_ser;
  BitonicSort(data_cpu, n);
  cout << "CPU serial time: " << t_ser.Elapsed() << " sec\n";

  // Verify every element (including the last one) against the host result and
  // check that each adjacent pair is in non-decreasing order.
  bool pass = true;
  for (int i = 0; i < size && pass; i++) {
    const bool matches_cpu =
        (data_usm[i] == data_cpu[i]) && (data_gpu[i] == data_cpu[i]);
    const bool ordered = (i + 1 >= size) ||
                         ((data_usm[i] <= data_usm[i + 1]) &&
                          (data_gpu[i] <= data_gpu[i + 1]));
    pass = matches_cpu && ordered;
  }

  // Clean resources.
  free(data_cpu);
  free(data_usm, q);
  free(data_gpu);

  if (!pass) {
    cout << "\nFailed!\n";
    return -2;
  }
  cout << "\nSuccess!\n";
  return 0;
}