Оптимизация for loop на Си

Ответить

Aleksei_Rostov 0

13 октября, 2021

Опубликовано 13 октября, 2021 · Жалоба

Добрый день! Пытаюсь ускорить выполнение функции сортировки массива по индексам, которая запускается на Cortex a53.

Оптимизируемая функция rd_rotation_old. Новая функция rd_rotation.

#include <errno.h>
#include <getopt.h>
#include <poll.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <math.h>

#include <omp.h>
#include <fcntl.h>

#define TxChnl 3
#define RxChnl 16
#define Nchirp 128
#define NrFFT  2048

#define FFTR_REAL 1
#define CHIRP_MUX 4     /* chirp multiplexing in raw data */
#define N_TX_PER_CHIP 3 //NUM_TX_PER_DM
#define FFT_WIDTH 2048


typedef struct {
    int16_t re;
    int16_t im;
} cplx_int16_t;

typedef struct {
    cplx_int16_t val[4];
} cplx_int16x4_t;



void rd_rotation_old(void *in[], size_t in_buf_num, size_t n_tx_ch, size_t n_rx_ch, size_t n_bins, size_t n_chirps_per_slot, void *out[], size_t out_buf_num)
{
    const cplx_int16x4_t *p;
    cplx_int16x4_t *      out_p;

    unsigned int d_bin_idx; /* bin index in doppler fft input */
    unsigned int tdm_bin_off;
    unsigned int tdm_bin_idx;
    unsigned int chip_off; /* chip offset in the output buffer */
    unsigned int n_bins_proc = (FFTR_REAL ? n_bins / 2 : n_bins);
    size_t       n_chirps = n_chirps_per_slot * n_tx_ch;

    /*how many x4 samples per single chip*/
    chip_off = (n_chirps * n_bins_proc);
    for (int idx_buf = 0; idx_buf < in_buf_num; idx_buf++) {
        out_p = (cplx_int16x4_t *)out[idx_buf];
        for (int idx_chirp = 0; idx_chirp < n_chirps * CHIRP_MUX; idx_chirp++) {
            /* stride over separate chirp */
            p = (cplx_int16x4_t *)in[idx_buf] + (idx_chirp * n_bins);
            d_bin_idx = idx_chirp / CHIRP_MUX;
            tdm_bin_idx = (d_bin_idx / n_tx_ch);
            tdm_bin_off = ((d_bin_idx % n_tx_ch) * n_chirps_per_slot);
            for (int idx_smpl = 0; idx_smpl < n_bins_proc; idx_smpl++, p++) {
                *(out_p + idx_smpl * n_chirps + chip_off * (idx_chirp % CHIRP_MUX) + tdm_bin_off + tdm_bin_idx) = *p;
            }
        }
    }
}



void rd_rotation(cplx_int16x4_t *array_in, size_t n_tx_ch, size_t n_rx_ch, size_t n_bins, size_t n_chirps_per_slot, cplx_int16x4_t *array_out )
{
    unsigned int blockTxRx = n_tx_ch*n_rx_ch/4*n_bins;
    unsigned int blockTx   = n_rx_ch/4*n_bins;
    unsigned int blockRx   = n_bins/4*4;
    unsigned int ptr_out   = 0;
    unsigned int rx = 0, rbin = 0, tx = 0, nChirp = 0;

    unsigned int tmp_tx = 0; 

// #pragma omp parallel
{
// #pragma omp for
    for ( rx = 0; rx < n_rx_ch/4; ++rx ){
        for( rbin = 0; rbin < n_bins/2; ++rbin ){
             for( tx = 0; tx < n_tx_ch; ++tx ){
                 tmp_tx = tx*blockTx + rbin;
                for( nChirp = 0; nChirp < n_chirps_per_slot;   ++nChirp)
                {
                        ptr_out = nChirp + rx*TxChnl*NrFFT/2*Nchirp + rbin*TxChnl*Nchirp + tx*Nchirp;
                        *(array_out + ptr_out) = *(array_in + nChirp*blockTxRx +  tmp_tx + rx*blockRx);
                } // nChirp
            } // tx

        } // rbin
    } // rx            
} // omp
    
}

// m2nF9zXyiBRD
int main()
{
    printf("<- Start Application \n");

    
    #ifdef _OPENMP
        printf("<- OpenMP is supported \n");
    #endif


    cplx_int16x4_t *cube_in, *cube_out, *cube_gld;
    cube_in  = (cplx_int16x4_t *)malloc(Nchirp*TxChnl*RxChnl/4*NrFFT*sizeof(cplx_int16x4_t));
    cube_out = (cplx_int16x4_t *)malloc(Nchirp*TxChnl*RxChnl/4*NrFFT/2*sizeof(cplx_int16x4_t));
    cube_gld = (cplx_int16x4_t *)malloc(Nchirp*TxChnl*RxChnl/4*NrFFT/2*sizeof(cplx_int16x4_t));

    void *cube_in_arr[1]   = {cube_in};
    void *cube_out_arr[1]  = {cube_gld};

    if(cube_in == NULL || cube_out == NULL || cube_gld == NULL)
    {
        printf("Error malloc \n");
        exit(-1);
    }

    printf("<- Output buffers initialization \n");
    for(int i = 0; i < Nchirp*TxChnl*RxChnl/4*NrFFT/2; i ++)
        for(int k = 0; k < 4; k ++)
        {
            cube_out[i].val[k].re = (int16_t)rand() % 16382;
            cube_out[i].val[k].im = (int16_t)rand() % 16382;

            cube_gld[i].val[k].re = (int16_t)rand() % 16382;
            cube_gld[i].val[k].im = (int16_t)rand() % 16382;

        }
  
    for(int i = 0; i < Nchirp*TxChnl*RxChnl/4*NrFFT; i ++)
        for(int k = 0; k < 4; k ++)
        {
            cube_in[i].val[k].re = (int16_t)rand() % 16382;
            cube_in[i].val[k].im = (int16_t)rand() % 16382;
        }


    


    size_t in_buf_num = 1;
    size_t n_tx_ch    = (size_t)TxChnl;
    size_t n_rx_ch    = (size_t)RxChnl;
    size_t n_bins     = (size_t)NrFFT;
    size_t n_chirps_per_slot = (size_t)Nchirp;
    size_t out_buf_num = 1;

    double start, end, runTime;
    

    start = omp_get_wtime();
    rd_rotation_old( (void *)&cube_in_arr, in_buf_num, n_tx_ch, n_rx_ch, n_bins, n_chirps_per_slot, (void *)&cube_out_arr, out_buf_num);
    end = omp_get_wtime();
    runTime = end - start;
    printf("<- rd_rotation_old run time is %g ms \n", runTime*1000);


    start = omp_get_wtime();
    rd_rotation(&cube_in[0], n_tx_ch, n_rx_ch, n_bins, n_chirps_per_slot, &cube_out[0]);
    end = omp_get_wtime();
    runTime = end - start;

    printf("<- rd_rotation run time is %g ms \n", runTime*1000);


    int32_t diff = 0;

    for(int i = 0; i < Nchirp*TxChnl*RxChnl/4*NrFFT/2; i ++)
        for(int k = 0; k < 4; k ++)
            diff += (int32_t)abs(cube_out[i].val[k].re-cube_gld[i].val[k].re+cube_out[i].val[k].im-cube_gld[i].val[k].im);

    printf("<- Check difference: error is  %d \n", diff);

         
    free(cube_in);
    free(cube_out);
    free(cube_gld);
    printf("<- End Application \n");
    return 0;
}

Запускаю на на виртуальной машине с Ubuntu x86-64

gcc -Wall -O3 -fopenmp -o main main.c
./main
<- Start Application
<- OpenMP is supported
<- Output buffers initialization
<- rd_rotation_old run time is 21.8302 ms
<- rd_rotation run time is 9.4134 ms
<- Check difference: error is  0
<- End Application

Запускаю на Cortex A53

aarch64-linux-gnu-gcc -Wall -O3 -fopenmp  -o main main.c
./main
<- Start Application
<- OpenMP is supported
<- Output buffers initialization
<- rd_rotation_old run time is 86.7562 ms
<- rd_rotation run time is 282.807 ms
<- Check difference: error is  0
<- End Application

И там и там работают 4 ядра, но результаты противоположные.

Вопрос: как можно ускорить rd_rotation в том числе и с использованием OpenMP? И в чем причина различий во времени выполнения кода ?

Цитата

Поделиться сообщением

Ссылка на сообщение

Поделиться на другие сайты

gosha 0

29 ноября, 2021

Опубликовано 29 ноября, 2021 · Жалоба

On 10/13/2021 at 3:43 PM, Aleksei_Rostov said:

Оптимизируемая функция rd_rotation_old. Новая функция rd_rotation.



void rd_rotation(cplx_int16x4_t *array_in, size_t n_tx_ch, size_t n_rx_ch, size_t n_bins, size_t n_chirps_per_slot, cplx_int16x4_t *array_out )
{
    unsigned int blockTxRx = n_tx_ch*n_rx_ch/4*n_bins;
    unsigned int blockTx   = n_rx_ch/4*n_bins;
    unsigned int blockRx   = n_bins/4*4;
    unsigned int ptr_out   = 0;
    unsigned int rx = 0, rbin = 0, tx = 0, nChirp = 0;

    unsigned int tmp_tx = 0; 

// #pragma omp parallel
{
// #pragma omp for
    for ( rx = 0; rx < n_rx_ch/4; ++rx ){
        for( rbin = 0; rbin < n_bins/2; ++rbin ){
             for( tx = 0; tx < n_tx_ch; ++tx ){
                 tmp_tx = tx*blockTx + rbin;
                for( nChirp = 0; nChirp < n_chirps_per_slot;   ++nChirp)
                {
                        ptr_out = nChirp + rx*TxChnl*NrFFT/2*Nchirp + rbin*TxChnl*Nchirp + tx*Nchirp;
                        *(array_out + ptr_out) = *(array_in + nChirp*blockTxRx +  tmp_tx + rx*blockRx);
                } // nChirp
            } // tx

        } // rbin
    } // rx            
} // omp
    
}

// m2nF9zXyiBRD

Запускаю на на виртуальной машине с Ubuntu x86-64


gcc -Wall -O3 -fopenmp -o main main.c
./main
<- Start Application
<- OpenMP is supported
<- Output buffers initialization
<- rd_rotation_old run time is 21.8302 ms
<- rd_rotation run time is 9.4134 ms
<- Check difference: error is  0
<- End Application

Запускаю на Cortex A53


aarch64-linux-gnu-gcc -Wall -O3 -fopenmp  -o main main.c
./main
<- Start Application
<- OpenMP is supported
<- Output buffers initialization
<- rd_rotation_old run time is 86.7562 ms
<- rd_rotation run time is 282.807 ms
<- Check difference: error is  0
<- End Application

И там и там работают 4 ядра, но результаты противоположные.

Какая разрядность size_t 64 бит ?- Какая разрядность ARM - 32 бит ?

Уберите операцию деления- быстрее станет ?