! Conditional compilation : use scorep user task api #ifdef CPP_SCOREP #include "scorep/SCOREP_User.inc" #define SCOREP_DEFINE_HANDLE_ARRAY( handle ) SCOREP_USER_REGION_HANDLE, allocatable :: handle(:) #define SCOREP_ALLOCATE_HANDLE_ARRAY( handle, size ) allocate( scorep_handle(size) ) #else #define SCOREP_DEFINE_HANDLE_ARRAY(handle) #define SCOREP_ALLOCATE_HANDLE_ARRAY(handle,size) #define SCOREP_USER_REGION_INIT(handle,name,region_type) #define SCOREP_USER_REGION_ENTER(handle) #define SCOREP_USER_REGION_END(handle) #endif MODULE profiling_physic_mod use mod_phys_lmdz_omp_data, only : omp_size, omp_rank use dict_int_mod IMPLICIT NONE SAVE PRIVATE ! Shared variables CHARACTER(*), parameter :: modname = "profiling_physic_mod" LOGICAL, PARAMETER :: check_accuracy = .true. INTEGER, PARAMETER :: max_id=50, max_depth=10 INTEGER :: nb_id ! Number of timers CHARACTER(15), DIMENSION(max_id) :: name ! Timer names type(dict) :: ids ! Threadlocal variables INTEGER, ALLOCATABLE :: depth(:)!(omp_size) ! Number of current nested timers INTEGER, ALLOCATABLE :: current_id(:,:)!(omp_size, max_depth) ! Current nested timer ids REAL, ALLOCATABLE :: chrono(:,:)!(omp_size, max_depth) ! Last timer start time REAL, ALLOCATABLE :: elapsed(:,:)!(omp_size, max_id) ! Cumulative time for each timer SCOREP_DEFINE_HANDLE_ARRAY(scorep_handle) LOGICAL :: profiling_enabled = .false. PUBLIC :: enter_profile, exit_profile, print_profile CONTAINS ! Be sure to init mod_phys_lmdz_omp_data before profiling SUBROUTINE init_profiling use ioipsl_getin_p_mod, only : getin call getin("phy_profiling_enable", profiling_enabled) nb_id=0 allocate(depth(0:omp_size-1)) allocate(current_id(0:omp_size-1, max_depth)) allocate(chrono(0:omp_size-1, max_depth)) allocate(elapsed(0:omp_size-1, max_id)) depth(:)=0 SCOREP_ALLOCATE_HANDLE_ARRAY(scorep_handle, max_id) call reset_profiling() END SUBROUTINE init_profiling SUBROUTINE reset_profiling if( any( depth /= 0 ) ) call abort_physic(modname, "Impossible to reset profiling : a profiling region is still open",1) elapsed(:,:)=0 END SUBROUTINE SUBROUTINE register_id(thename, id) CHARACTER(*), INTENT(IN) :: thename INTEGER, INTENT(OUT) :: id nb_id = nb_id+1 if(nb_id > max_id) call abort_physic(modname, "Too many timers",1) id = nb_id name(id)=thename SCOREP_USER_REGION_INIT( scorep_handle(id), thename, SCOREP_USER_REGION_TYPE_COMMON ) END SUBROUTINE register_id FUNCTION get_elapsed(start) INTEGER(kind=8) :: count, count_rate REAL :: start,get_elapsed CALL SYSTEM_CLOCK(count,count_rate) if(check_accuracy .and. count < 10) print *, "Warning : Profiling elapsed time too short to be accurately measured" get_elapsed = (1.*count)/(1.*count_rate) - start IF(get_elapsed<0.) get_elapsed=0. END FUNCTION get_elapsed ! Enter profiling region with name 'name' ! If 'name' doesn't exist, new id is created ! Profiling region associated to name must be inactive (dosen't exist yet or 'exit_profile'd) ! First call for 'name' must be called for every thread in the parallel region SUBROUTINE enter_profile(name) CHARACTER(*) :: name integer :: id logical, save :: first = .true. ! Initialize profiling at first call if(first) then call checked_omp_barrier("Timers init") !$omp master first = .false. call init_profiling !$omp end master !$omp barrier end if if( profiling_enabled ) then ! Create prof_id at first call with 'name' if( .not. exists(ids, name) ) then call checked_omp_barrier(name) !$omp master call register_id(name,id) call insert_or_assign(ids, name, id) !$omp end master !$omp barrier end if call enter_profile_aux(get_val(ids, name)) endif END SUBROUTINE SUBROUTINE enter_profile_aux(id) INTEGER, INTENT(IN) :: id depth(omp_rank) = depth(omp_rank)+1 chrono(omp_rank, depth(omp_rank)) = get_elapsed(0.) current_id(omp_rank,depth(omp_rank)) = id SCOREP_USER_REGION_ENTER(scorep_handle(id)) END SUBROUTINE ! Exit profiling region with name 'name' ! Region must exist and be active on current thread SUBROUTINE exit_profile(name) CHARACTER(*) :: name if( profiling_enabled ) then if(.not. exists(ids, name)) call abort_physic(modname, "Exit unknown timer", 1) call exit_profile_aux(get_val(ids,name)) endif END SUBROUTINE SUBROUTINE exit_profile_aux(id) INTEGER, INTENT(IN) :: id INTEGER :: parent_id REAL :: my_elapsed IF(depth(omp_rank)<=0) call abort_physic(modname, "exit_profile called without a matching enter_profile (depth=0)",1) IF(id /= current_id(omp_rank, depth(omp_rank))) call abort_physic(modname, "wrong timer id : exit_profile id doesn't match enter_profile",1) SCOREP_USER_REGION_END(scorep_handle(id)) my_elapsed = get_elapsed(chrono(omp_rank, depth(omp_rank))) ! add elapsed to current profile elapsed(omp_rank,id) = elapsed(omp_rank,id) + my_elapsed depth(omp_rank) = depth(omp_rank)-1 ! and substract from parent profile IF(depth(omp_rank)>0) THEN parent_id = current_id(omp_rank,depth(omp_rank)) elapsed(omp_rank,parent_id) = elapsed(omp_rank,parent_id) - my_elapsed END IF END SUBROUTINE ! Collective operation : must be called on every process/thread SUBROUTINE print_profile !use mpi_mod !use mpipara use mod_phys_lmdz_mpi_data, only : is_mpi_root, mpi_size use mod_phys_lmdz_mpi_transfert , only : reduce_sum_mpi, reduce_min_mpi, reduce_max_mpi INTEGER :: i REAL :: mean_total, min_total, max_total, mean_total_local, min_total_local, max_total_local REAL :: omp_mean_time_local, omp_min_time_local, omp_max_time_local REAL :: mean_time, max_process, min_process, percent if( profiling_enabled ) then call enter_profile("print_profile") !$OMP MASTER ! mean_total : mean of sum of all timers on all threads and all procs mean_total_local = SUM(elapsed(:,1:nb_id))/omp_size/mpi_size !call MPI_Reduce(mean_total_local, mean_total, 1, MPI_DOUBLE, MPI_SUM, 0, comm_icosa, ierr) call reduce_sum_mpi( mean_total_local, mean_total ) ! min_total : min on all procs of sum of all timers on all threads min_total_local = MINVAL( SUM(elapsed(:,1:nb_id), 2) ) !call MPI_Reduce(min_total_local, min_total, 1, MPI_DOUBLE, MPI_MIN, 0, comm_icosa, ierr) call reduce_min_mpi(min_total_local, min_total) ! max_total : max on all procs of sum of all timers on all threads max_total_local = MAXVAL( SUM(elapsed(:,1:nb_id), 2) ) !call MPI_Reduce(max_total_local, max_total, 1, MPI_DOUBLE, MPI_MAX, 0, comm_icosa, ierr) call reduce_max_mpi(max_total_local, max_total) if(is_mpi_root) PRINT *, '---------------------- Profiling (Physic) --------------' if(is_mpi_root) PRINT ('(A15, F12.3, A, F12.3, A, F12.3, A)'), 'Total (Physic) (s) : ', mean_total, " [", min_total, ",", max_total, "]" if(is_mpi_root) PRINT ('(A15, A7, A47, A47)'), " "," % "," Process mean(s)[ min, max, diff/mean(%) ] ", "Process 1 threads(s)[ min, max, diff/mean(%) ]" DO i=1,nb_id omp_mean_time_local = SUM(elapsed(:,i))/omp_size ! Mean time of timer on local threads omp_min_time_local = MINVAL(elapsed(:,i)) ! Max time of timer on local threads omp_max_time_local = MAXVAL(elapsed(:,i)) ! Min time of timer on local threads !omp_inbalance_local = 100*(max_time-min_time)/min_time ! mean_time : mean of timer on all threads and all procs !call MPI_Reduce(omp_mean_time_local, mean_time, 1, MPI_DOUBLE, MPI_SUM, 0, comm_icosa, ierr); call reduce_sum_mpi( omp_mean_time_local, mean_time ) mean_time = mean_time/mpi_size ! max_process_time : max on procs of mean omp time !call MPI_Reduce(omp_mean_time_local, max_process, 1, MPI_DOUBLE, MPI_MAX, 0, comm_icosa, ierr) call reduce_max_mpi(omp_mean_time_local, max_process) ! min_process_time : min on procs of mean omp time !call MPI_Reduce(omp_mean_time_local, min_process, 1, MPI_DOUBLE, MPI_MIN, 0, comm_icosa, ierr) call reduce_min_mpi(omp_mean_time_local, min_process) percent = 100*mean_time/mean_total if(is_mpi_root) PRINT ('(A15, F5.1, F12.3, A, F12.3, A, F12.3, A, F6.1, A, F12.3, A, F12.3, A, F12.3, A, F6.1, A)'), name(i), percent, mean_time, " [", min_process, ",", max_process, "," , inbalance(mean_time, min_process, max_process), "]", omp_mean_time_local, " [", omp_min_time_local, ",", omp_max_time_local, "," , inbalance(omp_mean_time_local, omp_min_time_local, omp_max_time_local) , "]" END DO if(is_mpi_root) PRINT *, '---------------------- Profiling -----------------------' !$OMP END MASTER call exit_profile("print_profile") endif contains function inbalance( mean, min, max ) real :: mean, min, max real :: inbalance inbalance = 100*(max-min)/(mean+1E-10) end function END SUBROUTINE print_profile #ifdef CPP_OMP subroutine checked_omp_barrier(message) use omp_lib, only : omp_get_team_size character(*), intent(in) :: message integer, save :: count count = 0 !$omp barrier !$omp atomic update count = count + 1 !$omp barrier if( count /= omp_get_team_size(1) ) then print *, "Error in barrier in physic timers : ", trim(message) print *, count,"/",omp_get_team_size(1), "threads participated" call abort_physic(modname, "Timer barrier",1) end if end subroutine checked_omp_barrier #else subroutine checked_omp_barrier(message) character(*), intent(in) :: message end subroutine checked_omp_barrier #endif END MODULE profiling_physic_mod