Jump to content

    
Aleksei_Rostov

Оптимизация for loop на Си

Recommended Posts

Добрый день! Пытаюсь ускорить выполнение функции сортировки массива по индексам, которая запускается на Cortex-A53.

Оптимизируемая функция rd_rotation_old. Новая функция rd_rotation.

#include <errno.h>
#include <getopt.h>
#include <poll.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <math.h>

#include <omp.h>
#include <fcntl.h>

/* Dimensions of the test data cube. main() uses them to size the buffers and
 * passes them on as n_tx_ch, n_rx_ch, n_chirps_per_slot and n_bins. */
#define TxChnl 3
#define RxChnl 16
#define Nchirp 128
#define NrFFT  2048

/* When non-zero, rd_rotation_old() copies only the first n_bins / 2 samples
 * of every input row. */
#define FFTR_REAL 1
#define CHIRP_MUX 4     /* chirp multiplexing in raw data */
/* NOTE(review): the two macros below are not referenced by the code visible
 * in this file — confirm whether they are still needed. */
#define N_TX_PER_CHIP 3 //NUM_TX_PER_DM
#define FFT_WIDTH 2048


/* One complex sample: signed 16-bit real and imaginary parts. */
struct cplx_int16 {
    int16_t re; /* real part */
    int16_t im; /* imaginary part */
};
typedef struct cplx_int16 cplx_int16_t;

/* Four complex samples grouped into a single array element; the rotation
 * routines copy data in units of this type. */
struct cplx_int16x4 {
    cplx_int16_t val[4];
};
typedef struct cplx_int16x4 cplx_int16x4_t;



/**
 * Reference reordering routine: copies rows of x4-packed samples from each
 * input buffer into the matching output buffer in transposed order.
 *
 * Every input buffer is read as (n_chirps_per_slot * n_tx_ch * CHIRP_MUX)
 * consecutive rows of n_bins elements. Row r belongs to chip (r % CHIRP_MUX)
 * and to chirp d = r / CHIRP_MUX. From each row the first n_bins_proc samples
 * (n_bins / 2 when FFTR_REAL is non-zero, otherwise n_bins) are written to
 *
 *   out[chip * chip_off + smpl * n_chirps
 *       + (d % n_tx_ch) * n_chirps_per_slot + d / n_tx_ch]
 *
 * with n_chirps = n_chirps_per_slot * n_tx_ch and
 * chip_off = n_chirps * n_bins_proc.
 *
 * @param in                 array of in_buf_num input buffers (cplx_int16x4_t elements)
 * @param in_buf_num         number of buffers to process
 * @param n_tx_ch            number of TX channels interleaved in the chirp sequence
 * @param n_rx_ch            not used by this implementation
 * @param n_bins             number of elements in one input row
 * @param n_chirps_per_slot  number of chirps per TX channel
 * @param out                array of output buffers; out[i] receives the data of in[i]
 * @param out_buf_num        not used by this implementation; out[] is indexed with
 *                           the same buffer index as in[]
 */
void rd_rotation_old(void *in[], size_t in_buf_num, size_t n_tx_ch, size_t n_rx_ch, size_t n_bins, size_t n_chirps_per_slot, void *out[], size_t out_buf_num)
{
    /* Part of the interface, but not consulted here. */
    (void)n_rx_ch;
    (void)out_buf_num;

    /* All sizes and offsets are kept in size_t so that no product is
     * truncated to 32 bits and no signed counter is compared with size_t. */
    const size_t n_bins_proc = (FFTR_REAL ? n_bins / 2 : n_bins);
    const size_t n_chirps = n_chirps_per_slot * n_tx_ch;
    /* how many x4 samples per single chip */
    const size_t chip_off = n_chirps * n_bins_proc;
    const size_t n_rows = n_chirps * CHIRP_MUX;

    for (size_t idx_buf = 0; idx_buf < in_buf_num; idx_buf++) {
        const cplx_int16x4_t *src = (const cplx_int16x4_t *)in[idx_buf];
        cplx_int16x4_t *dst = (cplx_int16x4_t *)out[idx_buf];

        for (size_t idx_chirp = 0; idx_chirp < n_rows; idx_chirp++) {
            /* bin index in doppler fft input */
            const size_t d_bin_idx = idx_chirp / CHIRP_MUX;
            const size_t chip = idx_chirp % CHIRP_MUX;
            const size_t tdm_bin_idx = d_bin_idx / n_tx_ch;
            const size_t tdm_bin_off = (d_bin_idx % n_tx_ch) * n_chirps_per_slot;

            /* Everything that does not depend on the sample index is
             * resolved once per row. */
            const size_t src_base = idx_chirp * n_bins;
            const size_t dst_base = chip_off * chip + tdm_bin_off + tdm_bin_idx;

            /* Contiguous reads, writes n_chirps elements apart. */
            for (size_t idx_smpl = 0; idx_smpl < n_bins_proc; idx_smpl++) {
                dst[dst_base + idx_smpl * n_chirps] = src[src_base + idx_smpl];
            }
        }
    }
}



/**
 * Reorders a cube of x4-packed samples from chirp-major to chirp-minor order.
 *
 * Input element (chirp, tx, rx, bin) is read from
 *   array_in[chirp * in_chirp_stride + tx * in_tx_stride + rx * n_bins + bin]
 * and written to
 *   array_out[rx * out_rx_stride + bin * out_bin_stride
 *             + tx * n_chirps_per_slot + chirp]
 * for rx < n_rx_ch / 4, bin < n_bins / 2, tx < n_tx_ch and
 * chirp < n_chirps_per_slot.
 *
 * All strides are derived from the arguments, so the routine no longer
 * depends on the TxChnl / NrFFT / Nchirp macros matching the values passed
 * in; for the values used by main() the result is unchanged.
 *
 * @param array_in           input cube; rows are n_bins elements long
 * @param n_tx_ch            number of TX channels
 * @param n_rx_ch            number of RX channels; n_rx_ch / 4 x4-packed groups are processed
 * @param n_bins             elements per input row; only the first n_bins / 2 are copied
 * @param n_chirps_per_slot  number of chirps per TX channel
 * @param array_out          output cube with room for
 *                           (n_rx_ch / 4) * (n_bins / 2) * n_tx_ch * n_chirps_per_slot elements
 */
void rd_rotation(cplx_int16x4_t *array_in, size_t n_tx_ch, size_t n_rx_ch, size_t n_bins, size_t n_chirps_per_slot, cplx_int16x4_t *array_out )
{
    const size_t n_rx_groups = n_rx_ch / 4;
    const size_t n_bins_out  = n_bins / 2;

    /* Input strides, in elements. Consecutive rx rows are one full row
     * (n_bins elements) apart, consistent with in_tx_stride below. */
    const size_t in_rx_stride    = n_bins;
    const size_t in_tx_stride    = n_rx_ch / 4 * n_bins;
    const size_t in_chirp_stride = n_tx_ch * n_rx_ch / 4 * n_bins;

    /* Output strides, in elements. */
    const size_t out_tx_stride  = n_chirps_per_slot;
    const size_t out_bin_stride = n_tx_ch * n_chirps_per_slot;
    const size_t out_rx_stride  = n_bins_out * out_bin_stride;

    /* Every index is declared inside its loop, so nothing is shared between
     * iterations. Each rx value writes its own out_rx_stride-sized slice of
     * array_out, which keeps the outer loop free of write conflicts should it
     * be parallelised. */
    for (size_t rx = 0; rx < n_rx_groups; ++rx) {
        for (size_t rbin = 0; rbin < n_bins_out; ++rbin) {
            for (size_t tx = 0; tx < n_tx_ch; ++tx) {
                const size_t src_base = tx * in_tx_stride + rx * in_rx_stride + rbin;
                const size_t dst_base = rx * out_rx_stride + rbin * out_bin_stride + tx * out_tx_stride;

                /* Writes are contiguous; reads are in_chirp_stride elements apart.
                 * NOTE(review): these widely strided reads are a likely cause of the
                 * slowdown reported on Cortex-A53 — confirm with a profiler. */
                for (size_t chirp = 0; chirp < n_chirps_per_slot; ++chirp) {
                    array_out[dst_base + chirp] = array_in[src_base + chirp * in_chirp_stride];
                }
            }
        }
    }
}

// m2nF9zXyiBRD
/* Fills count elements of buf with pseudo-random samples in 0..16381. */
static void fill_random(cplx_int16x4_t *buf, size_t count)
{
    for (size_t i = 0; i < count; i++) {
        for (int k = 0; k < 4; k++) {
            /* The modulo is applied to rand()'s int result first and only
             * then narrowed, so the stored value is always in range. */
            buf[i].val[k].re = (int16_t)(rand() % 16382);
            buf[i].val[k].im = (int16_t)(rand() % 16382);
        }
    }
}

/**
 * Benchmark driver: fills an input cube with pseudo-random samples, runs
 * rd_rotation_old() into a reference buffer and rd_rotation() into a second
 * buffer, prints both run times and the accumulated absolute difference
 * between the two results (0 means they agree).
 *
 * @return 0 on success; terminates with exit(-1) if a buffer cannot be allocated.
 */
int main()
{
    /* Element counts of the cubes, in x4-packed elements. The output cubes
     * hold half of the input because only NrFFT / 2 bins per row are kept. */
    const size_t n_in_elems  = (size_t)Nchirp * TxChnl * (RxChnl / 4) * NrFFT;
    const size_t n_out_elems = n_in_elems / 2;

    printf("<- Start Application \n");

    #ifdef _OPENMP
        printf("<- OpenMP is supported \n");
    #endif

    cplx_int16x4_t *cube_in  = malloc(n_in_elems  * sizeof *cube_in);
    cplx_int16x4_t *cube_out = malloc(n_out_elems * sizeof *cube_out);
    cplx_int16x4_t *cube_gld = malloc(n_out_elems * sizeof *cube_gld);

    if (cube_in == NULL || cube_out == NULL || cube_gld == NULL) {
        printf("Error malloc \n");
        free(cube_in);
        free(cube_out);
        free(cube_gld);
        exit(-1);
    }

    /* rd_rotation_old() takes arrays of buffers; its result is the reference. */
    void *cube_in_arr[1]  = {cube_in};
    void *cube_out_arr[1] = {cube_gld};

    printf("<- Output buffers initialization \n");
    /* Both output buffers start with different random contents, so a zero
     * difference at the end can only come from the two functions agreeing. */
    fill_random(cube_out, n_out_elems);
    fill_random(cube_gld, n_out_elems);
    fill_random(cube_in, n_in_elems);

    size_t in_buf_num = 1;
    size_t n_tx_ch    = (size_t)TxChnl;
    size_t n_rx_ch    = (size_t)RxChnl;
    size_t n_bins     = (size_t)NrFFT;
    size_t n_chirps_per_slot = (size_t)Nchirp;
    size_t out_buf_num = 1;

    double start = omp_get_wtime();
    /* The arrays decay to void ** as the prototype expects. */
    rd_rotation_old(cube_in_arr, in_buf_num, n_tx_ch, n_rx_ch, n_bins, n_chirps_per_slot, cube_out_arr, out_buf_num);
    double end = omp_get_wtime();
    double runTime = end - start;
    printf("<- rd_rotation_old run time is %g ms \n", runTime*1000);

    start = omp_get_wtime();
    rd_rotation(cube_in, n_tx_ch, n_rx_ch, n_bins, n_chirps_per_slot, cube_out);
    end = omp_get_wtime();
    runTime = end - start;
    printf("<- rd_rotation run time is %g ms \n", runTime*1000);

    /* Real and imaginary errors are accumulated separately so that errors of
     * opposite sign cannot cancel; long long cannot overflow for this cube size. */
    long long diff = 0;
    for (size_t i = 0; i < n_out_elems; i++) {
        for (int k = 0; k < 4; k++) {
            diff += abs(cube_out[i].val[k].re - cube_gld[i].val[k].re);
            diff += abs(cube_out[i].val[k].im - cube_gld[i].val[k].im);
        }
    }

    printf("<- Check difference: error is  %lld \n", diff);

    free(cube_in);
    free(cube_out);
    free(cube_gld);
    printf("<- End Application \n");
    return 0;
}

Запускаю на виртуальной машине с Ubuntu x86-64:

gcc -Wall -O3 -fopenmp -o main main.c
./main
<- Start Application
<- OpenMP is supported
<- Output buffers initialization
<- rd_rotation_old run time is 21.8302 ms
<- rd_rotation run time is 9.4134 ms
<- Check difference: error is  0
<- End Application

Запускаю на Cortex A53

aarch64-linux-gnu-gcc -Wall -O3 -fopenmp  -o main main.c
./main
<- Start Application
<- OpenMP is supported
<- Output buffers initialization
<- rd_rotation_old run time is 86.7562 ms
<- rd_rotation run time is 282.807 ms
<- Check difference: error is  0
<- End Application

И там, и там работают 4 ядра, но результаты противоположные.

Вопрос: как можно ускорить rd_rotation, в том числе с использованием OpenMP? И в чем причина различий во времени выполнения кода?

Share this post


Link to post
Share on other sites

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Paste as plain text instead

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.