/*
 * Copyright (C) 2008 GEOLAB Ltd. ( http://www.geo-lab.ru )
 * Contents: a program to test shared disk I/O performance on clusters
 * Compilation: 
 *   mpicc -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 file_cluster_benchmark.c
 * Usage:
 *   mpirun -np <number-of-nodes> ./file_cluster_benchmark parameters
 * Parameters (space separated, in the following order):
 *   1. Size of the file to read/write on each node, gigabytes
 *   2. Size of the block every read/write operates with, bytes
 *   3. Stride: read every N-th block when testing sort-like reading
 *   4. Directory where to create/write/read files
 * Description:
 *    The program is started with the MPI mechanism on a number of nodes.
 *    For each node it creates a file in the given directory and writes it
 *    in blocks of the given size until the file reaches the given size.
 *    The disk write rates are measured, and minimum/maximum/average rates
 *    over all nodes are printed out. After that, the program reads these
 *    files successively, and minimum/maximum/average read rates are printed
 *    out. Finally, the program starts reading the files on each node in
 *    the so-called "sorting" mode: read block #1, then #101, etc., up
 *    to the end of file, then read blocks #2, #102, and so on. The read
 *    rates for this mode are also printed out. Be prepared that the last
 *    test may take a while (up to 50-100 times slower than successive
 *    reading).
 * Example:
 *   Test disk I/O for the cluster file system, mounted as /data, on 64 
 *   nodes with a 16 GB file for each of them, using a block size of 4096
 *   bytes and accessing every 100th block in the 'sorting' test:
 *   mpirun -np 64 ./file_cluster_benchmark 16 4096 100 /data
 *   On the SKIF-MSU supercomputer, use the "-as one_per_node" parameter
 *   for mpirun to ensure each node runs only one MPI process:
 *   mpirun -np 64 -as one_per_node ...
 * Author: KuE ( ekurin@geo-lab.ru )
 * Date: 2008-12-28
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <time.h>
#include <mpi.h>

#define PROGRAM "file_cluster_benchmark"
#define BUF_SIZE 1024
#define ONE_KB ((off_t)1024)
#define ONE_MB ((off_t)1048576)
#define ONE_GB ((off_t)1073741824)
#define MAX_MEM_BLOCKS 1024

void print_usage ()
      "\nUsage: %s <file-size,GB> <block-size,B> <stride> <work-dir>\n\n", 

void mem_clean ()
  int i = 0, j = 0;
  char * mem_blocks[MAX_MEM_BLOCKS];
  memset(mem_blocks, 0, MAX_MEM_BLOCKS * sizeof(char *));

  for (i = 0; i < MAX_MEM_BLOCKS; i++)
    if ( (mem_blocks[i] = (char *)malloc(ONE_GB)) == NULL )
      /*fprintf(stderr, "mem_clean: info: %d GB allocated\n", i);*/
    for (j = 0; j < ONE_GB; j++)
      mem_blocks[i][j] = (char)i;

  for (i = 0; i < MAX_MEM_BLOCKS; i++)
    if ( mem_blocks[i] != NULL ) 

int main (int argc, char ** argv)
  int ret = 0;
  int fsize_gb = 0; /* file size, gigabytes */
  int bsize = 0; /* block size, bytes */
  int stride = 0; /* stride, blocks*/
  char wd[BUF_SIZE]; /* working directory */

  char buf[BUF_SIZE];
  struct stat statbuf;
  int st = 0, n = 0, i = 0, fd = -1, j = 0, from = 0;
  char * block = NULL;
  time_t t_start, t_stop, t_elapsed;
  off_t bytes = 0;
  double rate = 0.0, rate_min = 0.0, rate_max = 0.0, rate_avg = 0.0;
  double * rates = NULL;
  char node_name[MPI_MAX_PROCESSOR_NAME];
  char * node_names = NULL;

  if ( argc == 5 )
     * get/check/init program parameters
    fsize_gb = atoi(argv[1]);
    bsize = atoi(argv[2]);
    stride = atoi(argv[3]);
    strncpy(wd, argv[4], BUF_SIZE - 1);

    if ( fsize_gb < 1 )
      fprintf(stderr, "%s: error: file size is less than 1 GB\n", PROGRAM);
      return 1;

    if ( bsize < 1 )
      fprintf(stderr, "%s: error: block size is less than 1 byte\n", PROGRAM);
      return 1;

    n = ONE_GB * (off_t)fsize_gb / bsize;
    block = (char *)malloc(bsize);
    if ( block == NULL )
      fprintf(stderr, "%s: error: malloc failed for block: %s\n", 
          PROGRAM, strerror(errno));
      return 1;

    if ( stride < 2 )
      fprintf(stderr, "%s: error: stride is less than 2\n", PROGRAM);
      return 1;

    st = stat (wd, &statbuf);
    if ( st != 0 )
      fprintf(stderr, "%s: error: accessing %s: %s\n", 
          PROGRAM, wd, strerror(errno));
      return 1;
    else if ( ! S_ISDIR(statbuf.st_mode) )
      fprintf(stderr, "%s: error: %s is not a directory\n", PROGRAM, wd);
      return 1;
    if ( access (wd, W_OK) != 0 )
      fprintf(stderr, "%s: error: cannot write to directory %s\n", 
          PROGRAM, wd);
      return 1;

     * start MPI
    if ( MPI_Init (&argc, &argv) == MPI_SUCCESS )
      int comm_size = 1;
      int rank = 0;
      MPI_Comm_size (MPI_COMM_WORLD, &comm_size);
      MPI_Comm_rank (MPI_COMM_WORLD, &rank);
      node_names = (char *)calloc(comm_size * MPI_MAX_PROCESSOR_NAME,
          sizeof(char) );
      memset(node_name, 0, MPI_MAX_PROCESSOR_NAME);
      MPI_Get_processor_name(node_name, &i);
      MPI_Gather(node_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 
          node_names, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 0, 
      rates = (double *)calloc(comm_size, sizeof(double));
       * form file name according to the rank
      strncpy(buf, wd, BUF_SIZE - 1);
      if ( buf[strlen(buf) - 1] != '/' ) strncat(buf, "/", BUF_SIZE - 1);
      snprintf (buf + strlen(buf), BUF_SIZE - 1, "%-d", rank);
       * open file for writing
      fd = open (buf, O_CREAT | O_WRONLY | O_LARGEFILE | O_TRUNC,
         S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
      if ( fd != -1 )
         * write file
        t_start = time (NULL);
        for (i = 0, bytes = 0; i < n; i++, bytes += bsize)
          if ( write (fd, block, bsize) < bsize ) 
            fprintf (stderr, "%s: error: writing %s failed: %s\n", 
              PROGRAM, buf, strerror(errno));
            ret = 1;
        close(fd); fd = -1;
        t_stop = time (NULL);
         * gather from all nodes and printout write performance figures
        t_elapsed = t_stop - t_start;
        rate = (double)bytes / t_elapsed / (double)ONE_MB;
        MPI_Reduce (&rate, &rate_min, 1, MPI_DOUBLE, MPI_MIN, 0, 
        MPI_Reduce (&rate, &rate_max, 1, MPI_DOUBLE, MPI_MAX, 0, 
        MPI_Reduce (&rate, &rate_avg, 1, MPI_DOUBLE, MPI_SUM, 0, 
        rate_avg /= (double)comm_size;
        MPI_Gather(&rate, 1, MPI_DOUBLE, rates, 1, MPI_DOUBLE, 0, 
        if ( rank == 0 )
          printf("\nWriting %d GB by %d nodes:\n\n", fsize_gb, comm_size);
          printf("\tmin rate = %-.1lf MB/sec\n", rate_min);
          printf("\tmax rate = %-.1lf MB/sec\n", rate_max);
          printf("\tavg rate = %-.1lf MB/sec\n\n", rate_avg);
          for (i = 0; i < comm_size; i++)
            printf("\tnode = %-d\thost = %-s\trate = %-.1lf MB/sec\n", 
                i, node_names + i * MPI_MAX_PROCESSOR_NAME, rates[i]);
        if ( ret == 0 ) /* no previous error */
           * re-open file for reading
          fd = open (buf, O_RDONLY | O_LARGEFILE);
          if ( fd != -1 )
             * read file successively
            mem_clean ();
            t_start = time (NULL);
            for (i = 0, bytes = 0; i < n; i++, bytes += bsize)
              if ( read (fd, block, bsize) < bsize ) 
                fprintf (stderr, "%s: error: reading %s failed: %s\n", 
                  PROGRAM, buf, strerror(errno));
                ret = 1;
            t_stop = time (NULL);
             * gather from all nodes and printout read performance figures
            t_elapsed = t_stop - t_start;
            rate = (double)bytes / t_elapsed / (double)ONE_MB;
            MPI_Reduce (&rate, &rate_min, 1, MPI_DOUBLE, MPI_MIN, 0, 
            MPI_Reduce (&rate, &rate_max, 1, MPI_DOUBLE, MPI_MAX, 0, 
            MPI_Reduce (&rate, &rate_avg, 1, MPI_DOUBLE, MPI_SUM, 0, 
            rate_avg /= (double)comm_size;
            MPI_Gather(&rate, 1, MPI_DOUBLE, rates, 1, MPI_DOUBLE, 0, 
            if ( rank == 0 )
              printf("\nSuccessive reading %d GB by %d nodes:\n\n", 
                  fsize_gb, comm_size);
              printf("\tmin rate = %-.1lf MB/sec\n", rate_min);
              printf("\tmax rate = %-.1lf MB/sec\n", rate_max);
              printf("\tavg rate = %-.1lf MB/sec\n\n", rate_avg);
              for (i = 0; i < comm_size; i++)
                printf("\tnode = %-d\thost = %-s\trate = %-.1lf MB/sec\n", 
                    i, node_names + i * MPI_MAX_PROCESSOR_NAME, rates[i]);
             * read file like in sorting
            mem_clean ();
            t_start = time (NULL);
            for (i = 0, from = 0, j = 0, bytes = 0; i < n; 
                i++, j++, bytes += bsize)
              off_t pos = 0;
              int num = from + j * stride;
              if ( num >= n )
                j = 0;
                num = from;
              pos = (off_t)num * bsize;
              if ( lseek(fd, pos, SEEK_SET) == (off_t)-1 ) 
                fprintf(stderr, "%s: error: lseek on %s failed: %s\n",
                    PROGRAM, buf, strerror(errno));
                ret = 1;
              if ( read (fd, block, bsize) < bsize ) 
                fprintf (stderr, "%s: error: reading %s failed: %s\n", 
                  PROGRAM, buf, strerror(errno));
                ret = 1;
            close(fd); fd = -1;
            t_stop = time (NULL);
             * gather from all nodes and printout read performance figures
            t_elapsed = t_stop - t_start;
            rate = (double)bytes / t_elapsed / (double)ONE_MB;
            MPI_Reduce (&rate, &rate_min, 1, MPI_DOUBLE, MPI_MIN, 0, 
            MPI_Reduce (&rate, &rate_max, 1, MPI_DOUBLE, MPI_MAX, 0, 
            MPI_Reduce (&rate, &rate_avg, 1, MPI_DOUBLE, MPI_SUM, 0, 
            rate_avg /= (double)comm_size;
            MPI_Gather(&rate, 1, MPI_DOUBLE, rates, 1, MPI_DOUBLE, 0, 
            if ( rank == 0 )
              printf("\nSort-like reading %d GB by %d nodes:\n\n", 
                  fsize_gb, comm_size);
              printf("\tmin rate = %-.1lf MB/sec\n", rate_min);
              printf("\tmax rate = %-.1lf MB/sec\n", rate_max);
              printf("\tavg rate = %-.1lf MB/sec\n\n", rate_avg);
              for (i = 0; i < comm_size; i++)
                printf("\tnode = %-d\thost = %-s\trate = %-.1lf MB/sec\n", 
                    i, node_names + i * MPI_MAX_PROCESSOR_NAME, rates[i]);
            fprintf (stderr, "%s: error: opening %s for reading failed: %s\n", 
                PROGRAM, buf, strerror(errno));
            ret = 1;
        fprintf (stderr, "%s: error: opening %s for writing failed: %s\n", 
            PROGRAM, buf, strerror(errno));
        ret = 1;
       * stop MPI environment
      MPI_Finalize ();
      fprintf (stderr, "%s: error: MPI_Init failed\n", PROGRAM);
      ret = 1;
    fprintf (stderr, "%s: error: too few parameters\n", PROGRAM);
    print_usage ();
    ret = 1;
  if ( block != NULL ) free(block);
  if ( rates != NULL ) free(rates);
  if ( node_names != NULL ) free(node_names);
  return ret;
