一个nvcc编译的小问题

下面的 CUDA 代码，为什么用 -gencode 指定 compute capability 8.0（sm_80）编译之后，在 A100 上运行得到的结果不正确？

nvcc a.cu # 正确
nvcc a.cu -gencode arch=compute_80,code=sm_80 # 结果不正确

// a.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// Element type shared by the host buffers, the device output buffer and the
// constant table below.
using ValueType = float;

// 10-entry table in device constant memory. main() fills it from the host
// before launching aa_kernel, which reads it by thread index.
__constant__ ValueType col_idx[10];

// Copies the constant table col_idx into x, one element per thread.
//
// The index is threadIdx.x only, so the block index is not taken into account.
// Every thread prints its index via device-side printf (debug output); only
// threads 0..9 store a value, so x entries at higher thread indices are left
// untouched by this kernel.
__global__ void aa_kernel(ValueType *x) {
    const int tid = threadIdx.x;
    printf("idx = %d\n", tid);
    if (tid >= 10) {
        return;  // col_idx holds only 10 entries
    }
    x[tid] = col_idx[tid];
}

// Terminates the program with a diagnostic on stderr when a CUDA runtime call
// fails. `what` names the operation that produced `status`.
static void check_cuda(cudaError_t status, const char *what) {
  if (status == cudaSuccess) return;
  fprintf(stderr, "CUDA error during %s: %s (%s)\n", what,
          cudaGetErrorName(status), cudaGetErrorString(status));
  exit(EXIT_FAILURE);
}

// Uploads the powers of two 1, 2, 4, ..., 512 into the constant table col_idx,
// runs aa_kernel with one block of 32 threads to copy the table into a device
// buffer, then prints the first 12 entries of that buffer (the last two are
// expected to be 0 because the kernel only writes 10 entries).
//
// Every CUDA call is checked: without that, a failed launch is silent and the
// program just prints whatever the output buffer happened to contain.
int main() {
  const int kTableSize = 10;   // must match the extent of col_idx
  const int kNumThreads = 32;  // block size; also the length of the output buffer
  const int kNumPrinted = 12;

  // Host copy of the table: col_h[i] = 2^i.
  ValueType col_h[kTableSize];
  col_h[0] = 1.0f;
  for (int i = 1; i < kTableSize; i++) col_h[i] = 2 * col_h[i - 1];

  cudaStream_t stream;
  check_cuda(cudaStreamCreate(&stream), "cudaStreamCreate");

  ValueType *x_d = nullptr;
  const size_t x_bytes = kNumThreads * sizeof(ValueType);
  check_cuda(cudaMalloc(&x_d, x_bytes), "cudaMalloc");

  // Enqueue the table upload and the zero-fill on the same stream as the
  // kernel so their ordering does not depend on default-stream semantics.
  check_cuda(cudaMemcpyToSymbolAsync(col_idx, col_h, sizeof(col_h), 0,
                                     cudaMemcpyHostToDevice, stream),
             "cudaMemcpyToSymbolAsync");
  // The kernel writes only the first kTableSize entries; zero the rest so the
  // values printed beyond the table are well defined.
  check_cuda(cudaMemsetAsync(x_d, 0, x_bytes, stream), "cudaMemsetAsync");

  aa_kernel<<<1, kNumThreads, 0, stream>>>(x_d);
  // Launch failures are reported here, not by the <<<>>> statement itself.
  // For example, a binary that embeds only sm_80 machine code cannot be
  // loaded on a device of a different architecture ("no kernel image is
  // available for execution on the device").
  check_cuda(cudaGetLastError(), "aa_kernel launch");
  // Faults raised while the kernel runs surface at the next synchronization.
  check_cuda(cudaDeviceSynchronize(), "aa_kernel execution");

  ValueType x[kNumThreads] = {0};
  check_cuda(cudaMemcpy(x, x_d, x_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy");
  for (int i = 0; i < kNumPrinted; i++) printf("%f\n", x[i]);

  check_cuda(cudaStreamDestroy(stream), "cudaStreamDestroy");
  check_cuda(cudaFree(x_d), "cudaFree");
  return 0;
}