mig_benchmark.c

#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <libgen.h>
#include <pthread.h>

/*
 * Copyright (C) 2008 GEOLAB Ltd. ( http://www.geo-lab.ru )
 *
 * This file contains a program for testing multi-CPU computers for their fit
 * to migration-type seismic processing. The test is based on the 'kirchslow'
 * subroutine from the book "Basic Earth Imaging" by Jon F.Claerbout and 
 * James L.Black. The subroutine implements a simplified version of 'Kirchhoff'
 * migration in the homogenious medium. 
 *
 * The program uses the mechanizm of POSIX threads for running
 * parallel computations. Each thread produces migration for its portion of
 * output, as the common input is all in memory.
 *
 * The input is a set of traces with all their samples but one set to zeroes,
 * as the only sample is set to 1. Thus the migration output should look like
 * a half-round.
 * 
 * The program takes 3 positional command line parameters (see 'main()'):
 *     1. Number of traces in input/output.
 *     2. Number of samples in input/output.
 *     3. Number of computing threads.
 *
 * Date: Feb-03-2007
 *
 * Modified: Nov-25-2007 - KuE - mig_one_trace(): introduced optimization 
 *                               made by the Intel ICSC CRT Team
 *
 * Author(s): KuE
 *
 * Remarks:
 *     There are also original functions from BEI book by Claerbout and Black.
 *     They are here for reference and for making modeling if required.
 */

#define max(a,b) ((a)>(b)?(a):(b))
#define min(a,b) ((a)<(b)?(a):(b))

/*
 * Purpose: 'slow' implementation of "Kirchhoff" modeling/migration.
 * Borrowed from the book "Basic Earth Imaging (Version 2.4)"
 * by Jon F.Claerbout and James L.Black pp.65-67.
 * Translated from Ratfor to C by KuE.
 */
void kirchslow(
        int adj, /* 0 - modeling, 1 - migration */
        float velhalf, /* half the interval velocity */
        float t0, /* */
        float dt, /* sampling rate, sec */
        float dx,  /* distance between traces, m */
        float ** modl, /* input for modeling, output for migration */
        int nt, /* number of samples */
        int nx, /* number of traces */
        float ** data /* input for migration, output for modeling */
        )
{
    int ix, iy, it, iz, ib, nz;
    float x0, y0, dy, z0, dz, t, x, y, z, hs;
    
    x0 = 0; y0 = 0; dy = dx; z0 = t0; dz = dt; nz = nt;
    for (ix = 0; ix < nx; ix++) {
        x = x0 + dx * ix;
        for (iy = 0; iy < nx; iy++) {
            y = y0 + dy * iy;
            for (iz = 0; iz < nz; iz++) {
                z = z0 + dz * iz;
                hs = (x - y) / velhalf;
                t = sqrt(z*z+hs*hs);
                it = 0.5+(t-t0)/dt;
                if ( it < nt ) {
                    if ( adj == 0 )
                        data[iy][it] += modl[ix][iz];
                    else
                        modl[ix][iz] += data[iy][it];
                }
            }
        }
    }
}

/*
 * Purpose: 'fast' implementation of "Kirchhoff" modeling/migration.
 * Borrowed from the book "Basic Earth Imaging (Version 2.4)"
 * by Jon F.Claerbout and James L.Black pp.67-68.
 * Translated from Ratfor to C by KuE.
 */
void kirchfast(
        int adj, /* 0 - modeling, 1 - migration */
        float * vrms, /* Vrms law, 'nt' samples, m/sec */
        float t0, /* */
        float dt, /* sampling rate, sec */
        float dx,  /* distance between traces, m */
        float ** modl, /* input for modeling, output for migration */
        int nt, /* number of samples */
        int nx, /* number of traces */
        float ** data /* input for migration, output for modeling */
        )
{
    int ix, iz, it, ib;
    float amp, t, z, b, x;

    for (ib = -nx+1; ib < nx; ib++) {
        b = dx*ib;
        for (iz = 1; iz < nt; iz++) {
            x = b*2./vrms[iz];
            z = t0+dt*iz;
            t = sqrt(z*z+x*x);
            it = 0.5+(t-t0)/dt;
            if (it >= nt)
                break;
            amp = z/t*sqrt(nt*dt/t);
            for (ix = max(0, 1-ib); ix < min(nx,nx-ib); ix++)
                if ( adj == 0 )
                    data[ix+ib][it] += modl[ix][iz]*amp;
                else
                    modl[ix][iz] += (data[ix+ib][it]*amp);
        }
    }
}

/*
 * Purpose: a modification of 'kirchslow()' by J.Claerbout to produce 
 *         migration of only 1 trace; the function is designed to test
 *         performance of SMP machines
 *
 * Remarks:
 *          2007-11-25 - this is a version optimized by the members of
 *                       Intel ICSC CRT Team: Justin Chen, Wang Zhe, 
 *                       Qiao Nan, Mao Xiaowei, and others;
 *                       they have introduced two arrays itt[] and ftt[] of
 *                       indices in order to avoid computing the index just
 *                       before its usage in the loop by iz - this makes it
 *                       possible for the Intel icc compiler to make 
 *                       straightforward vectorization;
 *                       use extra compiler flags:  "-no-prec-div -msse3"
 *                       when building this program with the Intel icc compiler
 *
 */
void mig_one_trace(
        float ** data, /* input for migration */
        int nx, /* number of traces */
        int nt, /* number of samples */
        float dx,  /* distance between traces, m */
        float dt, /* sampling rate, sec */
        float velhalf, /* half the velocity, m/sec */
        float * modl, /* migration output */
        int ix /* migration output trace number */
        )
{
    int iy, it, iz, ib, nz;
    float x0, y0, dy, z0, dz, t, x, y, z, hs;
    int itt[nt];
    float ftt[nt];
    
    x0 = 0; y0 = 0; dy = dx; z0 = 0; dz = dt; nz = nt;
    x = x0 + dx * ix;
    for (iy = 0; iy < nx; iy++) {
        y = y0 + dy * iy;
        for (iz = 0; iz < nz; iz++) {
            z = z0 + dz * iz;
            hs = (x - y) / velhalf;
            t = sqrt(z*z+hs*hs);
            ftt[iz] = 0.5+t/dt;
        }
        for (iz = 0; iz < nz; iz++) {
            itt[iz] = ftt[iz];
        }
        for (iz = 0; iz < nz; iz++) {
            if ( itt[iz] < nt ) {
                modl[iz] += data[iy][itt[iz]];
            }
        }
    }
//        for (iz = 0; iz < nz; iz++) {
//    printf("%g\n",modl[iz]);
//    }
}

/*
 * structure to pass data to a computing thread
 */
typedef struct tagTHREAD_DATA {
    float ** data; /* input */
    int nx; /* number of traces */
    int nt; /* number of samples */
    float dx;  /* distance between traces, m */
    float dt; /* sampling rate, sec */
    float velhalf; /* half the velocity, m/sec */
    float ** modl; /* migration output */
    int ix0; /* migrate from this trace */
    int dix; /* migrate every dix-th trace */
    pthread_t id; /* id of computing thread */ 
} THREAD_DATA;

/*
 * Purpose: computing thread
 */
void * thread(void * _data)
{
    THREAD_DATA * data = (THREAD_DATA *)_data;
    if ( data != NULL ) {
        int ix = 0;
        for (ix = data->ix0; ix < data->nx; ix += data->dix ) {
            mig_one_trace(data->data, data->nx, data->nt, data->dx, data->dt, 
                    data->velhalf, data->modl[ix], ix);
        }
    }
    return NULL;
}

/*
 * Purpose: prints 1D array of floats to stdout
 */
void print_1d_array(const char * what, float * data, int n)
{
    int i;
    printf("\n");
    printf("# %s\n", what);
    printf("\n");
    for (i = 0; i < n; i++)
        printf("%+14.6f\n", data[i]);
}

/*
 * Purpose: prints 2D array of floats to stdout
 */
void print_2d_array(const char * what, float ** data, int n1, int n2,
        int transpose, int reverse1, int reverse2)
{
    
    int i, j;
    printf("\n");
    printf("# %s\n", what);
    printf("\n");
    if ( transpose == 0 ) {
        for (i = 0; i < n1; i++) {
            int ix = i;
            if ( reverse1 ) ix = n1 - i - 1;
            for (j = 0; j < n2; j++) {
                int jx = j;
                if ( reverse2 ) jx = n2 - j - 1;
                if ( j > 0) printf(" ");
                printf("%+14.6f", data[ix][jx]);
            }
            if (i < n1 - 1) printf("\n");
        }
    }
    else {
        for (j = 0; j < n2; j++) {
            int jx = j;
            if ( reverse2 ) jx = n2 - j - 1;
            for (i = 0; i < n1; i++) {
                int ix = i;
                if ( reverse1 ) ix = n1 - i - 1;
                if ( i > 0) printf(" ");
                printf("%+14.6f", data[ix][jx]);
            }
            if (j < n2 - 1) printf("\n");
        }
    }
    printf("\n");
}

const int MAX_TRACES = 1000000; /* maximum number of traces */
const int MAX_SAMPLES = 100000; /* maximum number of samples */
const int MAX_THREADS = 64; /* maximum number of computing threads */

/*
 * Purpose: main function to run multi-CPU performance tests
 */
int main(int argc, char ** argv)
{
    int ret = 0; /* return code */
    float velhalf = 1000; /* half the velocity */
    float dt = 0.004; /* sampling rate */
    float dx = 25; /* distance between traces */
    float ** modl = NULL; /* migration output */
    float ** data = NULL; /* input */
    int nx = MAX_TRACES; /* number of traces */
    int nt = MAX_SAMPLES; /* number of samples */
    int n_threads = MAX_THREADS; /* number of computing threads */
    THREAD_DATA * thread_data = NULL; /* data for computing threads */
    char * prog = basename(argv[0]); /* program name */
    int i = 0;
    /*
     * get command line parameters
     */
    if ( argc != 4 ) {
        fprintf(stderr, "%s: error: too few arguments\n", prog);
        fprintf(stderr, "Usage: %s <traces> <samples> <threads>\n", prog);
        return -1;
    }
    nx = atoi(argv[1]);
    if ( nx < 1 || nx > MAX_TRACES ) {
        fprintf(stderr, 
            "%s: error: wrong number of traces: must be 1-%d\n", 
            prog, MAX_TRACES);
        return -1;
    }
    nt = atoi(argv[2]);
    if ( nt < 1 || nt > MAX_SAMPLES ) {
        fprintf(stderr, 
            "%s: error: wrong number of samples: must be 1-%d\n", 
            prog, MAX_SAMPLES);
        return -1;
    }
    n_threads = atoi(argv[3]);
    if ( n_threads < 1 || n_threads > MAX_THREADS ) {
        fprintf(stderr, 
            "%s: error: wrong number of threads: must be 1-%d\n", 
            prog, MAX_THREADS);
        return -1;
    }
    /*
     * allocate memory
     */
    modl = (float **)calloc(nx, sizeof(float *));
    if ( modl == NULL ) {
        fprintf(stderr, "%s: error: cannot allocate memory: modl\n", prog);
        ret = -1;
        goto FINISH;
    }
    for ( i = 0; i < nx; i++ ) {
        modl[i] = (float *)calloc(nt, sizeof(float));
        if ( modl[i] == NULL ) {
            fprintf(stderr, "%s: error: cannot allocate memory: modl[%d]\n",
                    prog, i);
            ret = -1;
            goto FINISH;
        }
    }
    data = (float **)calloc(nx, sizeof(float *));
    if ( data == NULL ) {
        fprintf(stderr, "%s: error: cannot allocate memory: data\n", prog);
        ret = -1;
        goto FINISH;
    }
    for ( i = 0; i < nx; i++ ) {
        data[i] = (float *)calloc(nt, sizeof(float));
        if ( data[i] == NULL ) {
            fprintf(stderr, "%s: error: cannot allocate memory: data[%d]\n",
                    prog, i);
            ret = -1;
            goto FINISH;
        }
    }
    thread_data = (THREAD_DATA *)calloc(n_threads, sizeof(THREAD_DATA));
    /*
     * init data for computing threads
     */
    for (i = 0; i < n_threads; i++) {
        thread_data[i].data = data;
        thread_data[i].nx = nx;
        thread_data[i].nt = nt;
        thread_data[i].dx = dx;
        thread_data[i].dt = dt;
        thread_data[i].velhalf = velhalf;
        thread_data[i].modl = modl;
        thread_data[i].ix0 = i;
        thread_data[i].dix = n_threads;
        thread_data[i].id = 0;
    }
    /*
     * modeling
     * do not remove - true modeling with 'kirchslow()' goes here
     *
    modl[nx/2][nt/2] = 1;
    kirchslow(0, velhalf, 0, dt, dx, modl, nt, nx, data);
    modl[nx/2][nt/2] = 0;
     */
    /*
     * init input for migration in case no true modeling is done
     */
    data[nx/2][nt/2] = 1;
    /*
     * migration
     */
    /*
     * do not remove - original 'kirchslow()' call may be used for debugging
    kirchslow(1, velhalf, 0, dt, dx, modl, nt, nx, data);
    goto FINISH;
    */
    /*
     * start computing threads
     */
    for (i = 0; i < n_threads; i++)
        if ( pthread_create(&thread_data[i].id, 0, &thread, 
                    &thread_data[i] ) != 0 ) {
            fprintf(stdout, "main: error: cannot create thread %i\n", i);
            ret = -1;
            goto FINISH;
        }
    /*
     * wait for computing threads to finish
     */
    for ( i = 0; i < n_threads; i++ )
            pthread_join(thread_data[i].id, NULL);
FINISH:
    /*
     * print output to stdout - can be plotted with gnuplot like this:
     *     1. run this program (e.g. 301 traces, 1001 samples, 2 threads)
     *         prog 301 1001 2 > output.txt
     *     2. start gnuplot
     *         gnuplot
     *     3. in gnuplot's command line
     *        set pm3d map
     *        splot 'output.txt' matrix
     *
    print_2d_array("modl after", modl, nx, nt, 1, 0, 1);
     */
    /*
     * release memory
     */
    if ( modl != NULL ) {
        for (i = 0; i < nx; i++) if ( modl[i] != NULL ) free(modl[i]);
        free(modl);
    }
    if ( data != NULL ) {
        for (i = 0; i < nx; i++) if ( data[i] != NULL ) free(data[i]);
        free(data);
    }
    if ( thread_data != NULL ) free(thread_data);
    return ret;
}