/*
 * Copyright (C) 2008 GEOLAB Ltd. ( http://www.geo-lab.ru )
 *
 * Contents: a program to test shared disk I/O performance on clusters
 *
 * Compilation:
 *
 *   mpicc -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 file_cluster_benchmark.c
 *
 * Usage:
 *
 *   mpirun -np <number-of-nodes> ./file_cluster_benchmark parameters
 *
 * Parameters (space separated, in the following order):
 *
 *   1. Size of the file to read/write on each node, gigabytes
 *   2. Size of the block every read/write operates with, bytes
 *   3. Read every given block when testing sort-like reading
 *   4. Directory where to create/write/read files
 *
 * Description:
 *
 *   The program is started with the MPI mechanism on a number of nodes.
 *   For each node it creates a file in the given directory and writes it
 *   by blocks of the given size until the file's size exceeds the given
 *   value. The disk write rates are measured, and minimum/maximum/average
 *   rates over all nodes are printed out. After that, the program reads
 *   these files successively, and minimum/maximum/average read rates are
 *   printed out. Finally, the program starts reading the files on each
 *   node in the so-called "sorting" mode: read block #1, then #101, etc.,
 *   up to the end of file, then read blocks #2, #102, and so on. The read
 *   rates for this mode are also printed out. Be prepared that the last
 *   test may take a while (up to 50-100 times slower than successive
 *   reading).
 *
 * Example:
 *
 *   Test disk I/O for the cluster file system, mounted as /data, on 64
 *   nodes with a 16 GB file for each of them, using block size 4096 bytes
 *   and accessing every 100th trace in the 'sorting' test:
 *
 *     mpirun -np 64 ./file_cluster_benchmark 16 4096 100 /data
 *
 *   On the SKIF-MSU supercomputer, use the "-as one_per_node" parameter
 *   for mpirun to ensure each node runs only one MPI process.
 *
 *     mpirun -np 64 -as one_per_node ...
* * Author: KuE ( ekurin@geo-lab.ru ) * * Date: 2008-12-28 * */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/types.h> #include <sys/stat.h> #include <unistd.h> #include <errno.h> #include <fcntl.h> #include <time.h> #include <mpi.h> #define PROGRAM "file_cluster_benchmark" #define BUF_SIZE 1024 #define ONE_KB ((off_t)1024) #define ONE_MB ((off_t)1048576) #define ONE_GB ((off_t)1073741824) #define MAX_MEM_BLOCKS 1024 void print_usage () { fprintf(stderr, "\nUsage: %s <file-size,GB> <block-size,B> <stride> <work-dir>\n\n", PROGRAM); } void mem_clean () { int i = 0, j = 0; char * mem_blocks[MAX_MEM_BLOCKS]; memset(mem_blocks, 0, MAX_MEM_BLOCKS * sizeof(char *)); for (i = 0; i < MAX_MEM_BLOCKS; i++) { if ( (mem_blocks[i] = (char *)malloc(ONE_GB)) == NULL ) { /*fprintf(stderr, "mem_clean: info: %d GB allocated\n", i);*/ break; } for (j = 0; j < ONE_GB; j++) mem_blocks[i][j] = (char)i; } for (i = 0; i < MAX_MEM_BLOCKS; i++) if ( mem_blocks[i] != NULL ) free(mem_blocks[i]); } int main (int argc, char ** argv) { int ret = 0; int fsize_gb = 0; /* file size, gigabytes */ int bsize = 0; /* block size, bytes */ int stride = 0; /* stride, blocks*/ char wd[BUF_SIZE]; /* working directory */ char buf[BUF_SIZE]; struct stat statbuf; int st = 0, n = 0, i = 0, fd = -1, j = 0, from = 0; char * block = NULL; time_t t_start, t_stop, t_elapsed; off_t bytes = 0; double rate = 0.0, rate_min = 0.0, rate_max = 0.0, rate_avg = 0.0; double * rates = NULL; char node_name[MPI_MAX_PROCESSOR_NAME]; char * node_names = NULL; if ( argc == 5 ) { /* * get/check/init program parameters */ fsize_gb = atoi(argv[1]); bsize = atoi(argv[2]); stride = atoi(argv[3]); strncpy(wd, argv[4], BUF_SIZE - 1); if ( fsize_gb < 1 ) { fprintf(stderr, "%s: error: file size is less than 1 GB\n", PROGRAM); return 1; } if ( bsize < 1 ) { fprintf(stderr, "%s: error: block size is less than 1 byte\n", PROGRAM); return 1; } n = ONE_GB * (off_t)fsize_gb / bsize; block = (char *)malloc(bsize); if ( 
block == NULL ) { fprintf(stderr, "%s: error: malloc failed for block: %s\n", PROGRAM, strerror(errno)); return 1; } if ( stride < 2 ) { fprintf(stderr, "%s: error: stride is less than 2\n", PROGRAM); return 1; } st = stat (wd, &statbuf); if ( st != 0 ) { fprintf(stderr, "%s: error: accessing %s: %s\n", PROGRAM, wd, strerror(errno)); return 1; } else if ( ! S_ISDIR(statbuf.st_mode) ) { fprintf(stderr, "%s: error: %s is not a directory\n", PROGRAM, wd); return 1; } if ( access (wd, W_OK) != 0 ) { fprintf(stderr, "%s: error: cannot write to directory %s\n", PROGRAM, wd); return 1; } /* * start MPI */ if ( MPI_Init (&argc, &argv) == MPI_SUCCESS ) { int comm_size = 1; int rank = 0; MPI_Comm_size (MPI_COMM_WORLD, &comm_size); MPI_Comm_rank (MPI_COMM_WORLD, &rank); node_names = (char *)calloc(comm_size * MPI_MAX_PROCESSOR_NAME, sizeof(char) ); memset(node_name, 0, MPI_MAX_PROCESSOR_NAME); MPI_Get_processor_name(node_name, &i); MPI_Gather(node_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, node_names, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 0, MPI_COMM_WORLD); rates = (double *)calloc(comm_size, sizeof(double)); /* * form file name according to the rank */ strncpy(buf, wd, BUF_SIZE - 1); if ( buf[strlen(buf) - 1] != '/' ) strncat(buf, "/", BUF_SIZE - 1); snprintf (buf + strlen(buf), BUF_SIZE - 1, "%-d", rank); /* * open file for writing */ fd = open (buf, O_CREAT | O_WRONLY | O_LARGEFILE | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); if ( fd != -1 ) { /* * write file */ MPI_Barrier(MPI_COMM_WORLD); t_start = time (NULL); for (i = 0, bytes = 0; i < n; i++, bytes += bsize) { if ( write (fd, block, bsize) < bsize ) { fprintf (stderr, "%s: error: writing %s failed: %s\n", PROGRAM, buf, strerror(errno)); ret = 1; break; } } close(fd); fd = -1; t_stop = time (NULL); /* * gather from all nodes and printout write performance figures */ t_elapsed = t_stop - t_start; rate = (double)bytes / t_elapsed / (double)ONE_MB; MPI_Reduce (&rate, &rate_min, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); 
MPI_Reduce (&rate, &rate_max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce (&rate, &rate_avg, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); rate_avg /= (double)comm_size; MPI_Gather(&rate, 1, MPI_DOUBLE, rates, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); if ( rank == 0 ) { printf("\nWriting %d GB by %d nodes:\n\n", fsize_gb, comm_size); printf("\tmin rate = %-.1lf MB/sec\n", rate_min); printf("\tmax rate = %-.1lf MB/sec\n", rate_max); printf("\tavg rate = %-.1lf MB/sec\n\n", rate_avg); for (i = 0; i < comm_size; i++) { printf("\tnode = %-d\thost = %-s\trate = %-.1lf MB/sec\n", i, node_names + i * MPI_MAX_PROCESSOR_NAME, rates[i]); } printf("\n"); } if ( ret == 0 ) /* no previous error */ { /* * re-open file for reading */ fd = open (buf, O_RDONLY | O_LARGEFILE); if ( fd != -1 ) { /* * read file successively */ mem_clean (); MPI_Barrier(MPI_COMM_WORLD); t_start = time (NULL); for (i = 0, bytes = 0; i < n; i++, bytes += bsize) { if ( read (fd, block, bsize) < bsize ) { fprintf (stderr, "%s: error: reading %s failed: %s\n", PROGRAM, buf, strerror(errno)); ret = 1; break; } } t_stop = time (NULL); /* * gather from all nodes and printout read performance figures */ t_elapsed = t_stop - t_start; rate = (double)bytes / t_elapsed / (double)ONE_MB; MPI_Reduce (&rate, &rate_min, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); MPI_Reduce (&rate, &rate_max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce (&rate, &rate_avg, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); rate_avg /= (double)comm_size; MPI_Gather(&rate, 1, MPI_DOUBLE, rates, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); if ( rank == 0 ) { printf("\nSuccessive reading %d GB by %d nodes:\n\n", fsize_gb, comm_size); printf("\tmin rate = %-.1lf MB/sec\n", rate_min); printf("\tmax rate = %-.1lf MB/sec\n", rate_max); printf("\tavg rate = %-.1lf MB/sec\n\n", rate_avg); for (i = 0; i < comm_size; i++) { printf("\tnode = %-d\thost = %-s\trate = %-.1lf MB/sec\n", i, node_names + i * MPI_MAX_PROCESSOR_NAME, rates[i]); } printf("\n"); } 
/* * read file like in sorting */ mem_clean (); MPI_Barrier(MPI_COMM_WORLD); t_start = time (NULL); for (i = 0, from = 0, j = 0, bytes = 0; i < n; i++, j++, bytes += bsize) { off_t pos = 0; int num = from + j * stride; if ( num >= n ) { from++; j = 0; num = from; } pos = (off_t)num * bsize; if ( lseek(fd, pos, SEEK_SET) == (off_t)-1 ) { fprintf(stderr, "%s: error: lseek on %s failed: %s\n", PROGRAM, buf, strerror(errno)); ret = 1; break; } if ( read (fd, block, bsize) < bsize ) { fprintf (stderr, "%s: error: reading %s failed: %s\n", PROGRAM, buf, strerror(errno)); ret = 1; break; } } close(fd); fd = -1; t_stop = time (NULL); /* * gather from all nodes and printout read performance figures */ t_elapsed = t_stop - t_start; rate = (double)bytes / t_elapsed / (double)ONE_MB; MPI_Reduce (&rate, &rate_min, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); MPI_Reduce (&rate, &rate_max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce (&rate, &rate_avg, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); rate_avg /= (double)comm_size; MPI_Gather(&rate, 1, MPI_DOUBLE, rates, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); if ( rank == 0 ) { printf("\nSort-like reading %d GB by %d nodes:\n\n", fsize_gb, comm_size); printf("\tmin rate = %-.1lf MB/sec\n", rate_min); printf("\tmax rate = %-.1lf MB/sec\n", rate_max); printf("\tavg rate = %-.1lf MB/sec\n\n", rate_avg); for (i = 0; i < comm_size; i++) { printf("\tnode = %-d\thost = %-s\trate = %-.1lf MB/sec\n", i, node_names + i * MPI_MAX_PROCESSOR_NAME, rates[i]); } printf("\n"); } } else { fprintf (stderr, "%s: error: opening %s for reading failed: %s\n", PROGRAM, buf, strerror(errno)); ret = 1; } } } else { fprintf (stderr, "%s: error: opening %s for writing failed: %s\n", PROGRAM, buf, strerror(errno)); ret = 1; } /* * stop MPI environment */ MPI_Finalize (); } else { fprintf (stderr, "%s: error: MPI_Init failed\n", PROGRAM); ret = 1; } } else { fprintf (stderr, "%s: error: too few parameters\n", PROGRAM); print_usage (); ret = 1; } if ( block != 
NULL ) free(block); if ( rates != NULL ) free(rates); if ( node_names != NULL ) free(node_names); return ret; }