trltomad_pack_unpack.F90 Source File


This file depends on

sourcefile~~trltomad_pack_unpack.f90~~EfferentGraph sourcefile~trltomad_pack_unpack.f90 trltomad_pack_unpack.F90 sourcefile~buffered_allocator_mod.f90 buffered_allocator_mod.F90 sourcefile~trltomad_pack_unpack.f90->sourcefile~buffered_allocator_mod.f90 sourcefile~ledir_mod.f90 ledir_mod.F90 sourcefile~trltomad_pack_unpack.f90->sourcefile~ledir_mod.f90 sourcefile~parkind_ectrans.f90 parkind_ectrans.F90 sourcefile~trltomad_pack_unpack.f90->sourcefile~parkind_ectrans.f90 sourcefile~tpm_dim.f90 tpm_dim.F90 sourcefile~trltomad_pack_unpack.f90->sourcefile~tpm_dim.f90 sourcefile~tpm_distr.f90 tpm_distr.F90 sourcefile~trltomad_pack_unpack.f90->sourcefile~tpm_distr.f90 sourcefile~tpm_fields.f90 tpm_fields.F90 sourcefile~trltomad_pack_unpack.f90->sourcefile~tpm_fields.f90 sourcefile~tpm_geometry.f90 tpm_geometry.F90 sourcefile~trltomad_pack_unpack.f90->sourcefile~tpm_geometry.f90 sourcefile~abort_trans_mod.f90 abort_trans_mod.F90 sourcefile~buffered_allocator_mod.f90->sourcefile~abort_trans_mod.f90 sourcefile~growing_allocator_mod.f90 growing_allocator_mod.F90 sourcefile~buffered_allocator_mod.f90->sourcefile~growing_allocator_mod.f90 sourcefile~ledir_mod.f90->sourcefile~buffered_allocator_mod.f90 sourcefile~ledir_mod.f90->sourcefile~parkind_ectrans.f90 sourcefile~ledir_mod.f90->sourcefile~tpm_dim.f90 sourcefile~ledir_mod.f90->sourcefile~tpm_distr.f90 sourcefile~ledir_mod.f90->sourcefile~tpm_geometry.f90 sourcefile~hicblas_mod.f90 hicblas_mod.F90 sourcefile~ledir_mod.f90->sourcefile~hicblas_mod.f90 sourcefile~tpm_fields_gpu.f90 tpm_fields_gpu.F90 sourcefile~ledir_mod.f90->sourcefile~tpm_fields_gpu.f90 sourcefile~tpm_gen.f90 tpm_gen.F90 sourcefile~ledir_mod.f90->sourcefile~tpm_gen.f90 sourcefile~tpm_stats.f90 tpm_stats.F90 sourcefile~ledir_mod.f90->sourcefile~tpm_stats.f90 sourcefile~abort_trans_mod.f90->sourcefile~tpm_gen.f90 sourcefile~growing_allocator_mod.f90->sourcefile~abort_trans_mod.f90 sourcefile~growing_allocator_mod.f90->sourcefile~tpm_gen.f90

Files dependent on this one

sourcefile~~trltomad_pack_unpack.f90~~AfferentGraph sourcefile~trltomad_pack_unpack.f90 trltomad_pack_unpack.F90 sourcefile~dir_trans_ctlad_mod.f90 dir_trans_ctlad_mod.F90 sourcefile~dir_trans_ctlad_mod.f90->sourcefile~trltomad_pack_unpack.f90 sourcefile~dir_transad.f90 dir_transad.F90 sourcefile~dir_transad.f90->sourcefile~dir_trans_ctlad_mod.f90 sourcefile~dir_transad.f90~2 dir_transad.F90 sourcefile~dir_transad.f90~2->sourcefile~dir_trans_ctlad_mod.f90

Source Code

#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A))
! (C) Copyright 2000- ECMWF.
! (C) Copyright 2000- Meteo-France.
! (C) Copyright 2022- NVIDIA.
! 
! This software is licensed under the terms of the Apache Licence Version 2.0
! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
! In applying this licence, ECMWF does not waive the privileges and immunities
! granted to it by virtue of its status as an intergovernmental organisation
! nor does it submit to any jurisdiction.
!

MODULE TRLTOMAD_PACK_UNPACK
  USE BUFFERED_ALLOCATOR_MOD, ONLY: ALLOCATION_RESERVATION_HANDLE
  USE PARKIND_ECTRANS,        ONLY: JPIM
  IMPLICIT NONE

  PRIVATE
  PUBLIC :: TRLTOMAD_PACK_HANDLE, PREPARE_TRLTOMAD_PACK, TRLTOMAD_PACK
  PUBLIC :: TRLTOMAD_UNPACK_HANDLE, PREPARE_TRLTOMAD_UNPACK, TRLTOMAD_UNPACK

  TYPE TRLTOMAD_PACK_HANDLE
    TYPE(ALLOCATION_RESERVATION_HANDLE) :: HREEL_COMPLEX
  END TYPE
  TYPE TRLTOMAD_UNPACK_HANDLE
    TYPE(ALLOCATION_RESERVATION_HANDLE) :: HPFBUF
  END TYPE

  INTEGER(KIND=JPIM) :: A = 8 !Alignment

CONTAINS
  FUNCTION PREPARE_TRLTOMAD_PACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_PACK)
    USE PARKIND_ECTRANS,        ONLY: JPIM, JPRBT, JPIB
    USE TPM_DISTR,              ONLY: D
    USE ISO_C_BINDING,          ONLY: C_SIZE_T, C_SIZEOF
    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE

    IMPLICIT NONE

    TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR
    INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS
    TYPE(TRLTOMAD_PACK_HANDLE) :: HTRLTOM_PACK

    REAL(KIND=JPRBT) :: DUMMY

    HTRLTOM_PACK%HREEL_COMPLEX = RESERVE(ALLOCATOR, 1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(DUMMY), "HTRLTOMAD_PACK%HREEL_COMPLEX")
  END FUNCTION PREPARE_TRLTOMAD_PACK

  SUBROUTINE TRLTOMAD_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS)
    !**** *TRLTOMAD_PACK* - Copy fourier data from local array to buffer

    !     Purpose.
    !     --------
    !        Routine for copying fourier data from local array to buffer

    !**   Interface.
    !     ----------
    !     CALL TRLTOM_PACK(...)

    !     Explicit arguments :  PREEL - local fourier/GP array
    !     --------------------  KF_FS - number of fields
    !
    !     Externals.  None.
    !     ----------

    !     Author.
    !     -------
    !        Mats Hamrud *ECMWF*

    !     ------------------------------------------------------------------

    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION
    USE PARKIND_ECTRANS,        ONLY: JPIM, JPRBT, JPIB
    USE TPM_DISTR,              ONLY: D, MYSETW
    USE TPM_GEOMETRY,           ONLY: G
    USE TPM_DIM,                ONLY: R
    USE ISO_C_BINDING,          ONLY: C_SIZE_T, C_SIZEOF
    !

    IMPLICIT NONE

    REAL(KIND=JPRBT), POINTER, INTENT(OUT) :: PREEL_COMPLEX(:)
    REAL(KIND=JPRBT), INTENT(IN) :: FOUBUF_IN(:)
    INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS
    TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR
    TYPE(TRLTOMAD_PACK_HANDLE), INTENT(IN) :: HTRLTOM_PACK

    INTEGER(KIND=JPIM) :: JM,JF,IGLG,OFFSET_VAR,KGL,J,N
    INTEGER(KIND=JPIB) :: IOFF_LAT,ISTA

    REAL(KIND=JPRBT)    :: SCAL

    ASSOCIATE(D_NSTAGTF=>D%NSTAGTF, D_NPNTGTB0=>D%NPNTGTB0, D_NPTRLS=>D%NPTRLS, &
            & D_NDGL_FS=>D%NDGL_FS, G_NMEN=>G%NMEN, G_NLOEN=>G%NLOEN, R_NSMAX=>R%NSMAX)

    CALL ASSIGN_PTR(PREEL_COMPLEX, GET_ALLOCATION(ALLOCATOR, HTRLTOM_PACK%HREEL_COMPLEX),&
                  & 1_JPIB, 1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(PREEL_COMPLEX(1)))

    N = 1_JPIB*KF_FS*D%NLENGTF

#ifdef ACCGPU
    !$ACC DATA PRESENT(PREEL_COMPLEX)
    !$ACC PARALLEL LOOP FIRSTPRIVATE(N)
#endif
#ifdef OMPGPU
    !$OMP TARGET DATA MAP(PRESENT,ALLOC:PREEL_COMPLEX)
    !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) SHARED(N,PREEL_COMPLEX)
#endif
    DO J=1,N
      PREEL_COMPLEX(J) = 0
    ENDDO
#ifdef ACCGPU
    !$ACC END DATA
#endif
#ifdef OMPGPU
    !$OMP END TARGET DATA
#endif

#ifdef OMPGPU
    !$OMP TARGET DATA MAP(PRESENT,ALLOC:G,G_NMEN,D,D_NPNTGTB0,FOUBUF_IN,PREEL_COMPLEX,D_NSTAGTF,&
    !$OMP& D_NDGL_FS,G_NLOEN,R,R_NSMAX)
#endif
#ifdef ACCGPU
    !$ACC DATA PRESENT(G,G_NMEN,D,D_NPNTGTB0,FOUBUF_IN,PREEL_COMPLEX,D_NSTAGTF,D_NDGL_FS,G_NLOEN, R,R_NSMAX) ASYNC(1)
#endif

    ! scale results and move into next transformation buffer

    OFFSET_VAR=D_NPTRLS(MYSETW)

#ifdef OMPGPU
    !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(IGLG,IOFF_LAT,ISTA,SCAL) &
    !$OMP& SHARED(D,R,KF_FS,OFFSET_VAR,G,&
    !$OMP& PREEL_COMPLEX,FOUBUF_IN) MAP(TO:KF_FS,OFFSET_VAR)
#endif
#ifdef ACCGPU
    !$ACC PARALLEL LOOP PRIVATE(IGLG,IOFF_LAT,ISTA,SCAL) FIRSTPRIVATE(KF_FS,OFFSET_VAR) &
    !$ACC& TILE(32,16,1) DEFAULT(NONE) &
#ifndef _CRAYFTN
    !$ACC& ASYNC(1)
#else
    !$ACC&
#endif
#endif
    DO KGL=1,D_NDGL_FS
      DO JM=0,R_NSMAX !(note that R_NSMAX <= G_NMEN(IGLG) for all IGLG)
        DO JF=1,KF_FS
          IGLG = OFFSET_VAR+KGL-1
          IF (JM <= G_NMEN(IGLG)) THEN
            IOFF_LAT = KF_FS*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL))

            SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT)
            ISTA  = 2_JPIB*D_NPNTGTB0(JM,KGL)*KF_FS

            PREEL_COMPLEX(IOFF_LAT+2*JM+1) = SCAL * FOUBUF_IN(ISTA+2*JF-1)
            PREEL_COMPLEX(IOFF_LAT+2*JM+2) = SCAL * FOUBUF_IN(ISTA+2*JF  )
          ENDIF
        ENDDO
      ENDDO
    ENDDO
#ifdef ACCGPU
    !$ACC END DATA

    !$ACC WAIT(1)
#endif
#ifdef OMPGPU
    !$OMP END TARGET DATA
#endif
    END ASSOCIATE
  END SUBROUTINE TRLTOMAD_PACK

  FUNCTION PREPARE_TRLTOMAD_UNPACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_UNPACK)
    USE PARKIND_ECTRANS,        ONLY: JPIM, JPRBT, JPRD, JPIB
    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE
    USE LEDIR_MOD,              ONLY: LEDIR_STRIDES
    USE ISO_C_BINDING,          ONLY: C_SIZE_T, C_SIZEOF
    USE TPM_DISTR,              ONLY: D

    IMPLICIT NONE

    TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR
    INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS
    TYPE(TRLTOMAD_UNPACK_HANDLE) :: HTRLTOM_UNPACK

    INTEGER(KIND=JPIM)  :: IIN_STRIDES0
    INTEGER(KIND=JPIB)  :: IIN_SIZE
    INTEGER(KIND=JPIM)  :: IIN0_STRIDES0, IIN0_SIZE
    INTEGER(KIND=JPIB)  :: ISIZE

    REAL(KIND=JPRBT) :: DUMMY

    CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_SIZE=IIN_SIZE,&
                       IIN0_STRIDES0=IIN0_STRIDES0,IIN0_SIZE=IIN0_SIZE)

    HTRLTOM_UNPACK%HPFBUF = RESERVE(ALLOCATOR, 2_JPIB*D%NLENGT1B*KF_FS*C_SIZEOF(DUMMY), "HTRLTOM_UNPACK%HPFBUF")
  END FUNCTION PREPARE_TRLTOMAD_UNPACK

  SUBROUTINE TRLTOMAD_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV)
    USE PARKIND_ECTRANS,             ONLY: JPIM, JPRBT, JPRD, JPIB
    USE TPM_DIM,                     ONLY: R
    USE TPM_GEOMETRY,                ONLY: G
    USE BUFFERED_ALLOCATOR_MOD,      ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION
    USE TPM_FIELDS,                  ONLY: F
    USE TPM_DISTR,                   ONLY: D
    USE LEDIR_MOD,                   ONLY: LEDIR_STRIDES
    USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF

    IMPLICIT NONE

    REAL(KIND=JPRBT), POINTER, INTENT(OUT) :: FOUBUF(:)
    REAL(KIND=JPRBT), INTENT(IN) :: ZINPS(:), ZINPA(:)
    REAL(KIND=JPRD),  INTENT(IN) :: ZINPS0(:), ZINPA0(:)
    INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV
    TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR
    TYPE(TRLTOMAD_UNPACK_HANDLE), INTENT(IN) :: HTRLTOM_UNPACK

    REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:)

    INTEGER(KIND=JPIM) :: IIN_STRIDES0
    INTEGER(KIND=JPIB) :: IIN_SIZE
    INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE

    INTEGER(KIND=JPIB) :: IALLOC_POS, IALLOC_SZ

    INTEGER(KIND=JPIB)  :: JF, OFFSET1, OFFSET2
    INTEGER(KIND=JPIM) :: KM, ISL, IGLS, JGL, KMLOC

    REAL(KIND=JPRBT) :: PAIA, PAIS

    ASSOCIATE(D_NUMP=>D%NUMP, R_NDGNH=>R%NDGNH, R_NDGL=>R%NDGL, F_RW=>F%RW, F_RACTHE=>F%RACTHE, &
            & D_MYMS=>D%MYMS, D_NPNTGTB1=>D%NPNTGTB1, D_OFFSETS_GEMM1=>D%OFFSETS_GEMM1, &
            & G_NDGLU=>G%NDGLU)
    CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_SIZE=IIN_SIZE,&
                       IIN0_STRIDES0=IIN0_STRIDES0,IIN0_SIZE=IIN0_SIZE)

    CALL ASSIGN_PTR(FOUBUF, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HPFBUF),&
        & 1_JPIB, 2_JPIB*D%NLENGT1B*KF_FS*C_SIZEOF(FOUBUF(1)))
    
#ifdef OMPGPU
    !$OMP TARGET DATA MAP(PRESENT,ALLOC:ZINPS,ZINPA,ZINPS0,ZINPA0) &
    !$OMP& MAP(PRESENT,ALLOC:F,F_RW,F_RACTHE) &
    !$OMP& MAP(PRESENT,ALLOC:D,D_MYMS,D_NUMP,R,R_NDGNH,R_NDGL,G,G_NDGLU) &
    !$OMP& MAP(PRESENT,ALLOC:D_NPNTGTB1,D_OFFSETS_GEMM1,FOUBUF)

    !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) DEFAULT(NONE) &
    !$OMP& PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,PAIA,PAIS) &
    !$OMP& SHARED(D,R,KF_FS,G,FOUBUF,F,&
    !$OMP& IIN_STRIDES0,ZINPA,ZINPS,IIN0_STRIDES0,ZINPA0,ZINPS0,KF_UV) &
    !$OMP& MAP(TO:KF_FS,KF_UV,IIN_STRIDES0,IIN0_STRIDES0)
#endif
#ifdef ACCGPU
    !$ACC DATA &
    !$ACC& PRESENT(ZINPS,ZINPA,ZINPS0,ZINPA0) &
    !$ACC& PRESENT(F,F_RW,F_RACTHE) &
    !$ACC& PRESENT(D,D_MYMS,D_NUMP,R,R_NDGNH,R_NDGL,G,G_NDGLU) &
    !$ACC& PRESENT(D_NPNTGTB1,D_OFFSETS_GEMM1,FOUBUF)

    !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS) &
    !$ACC&              FIRSTPRIVATE(KF_FS,KF_UV,IIN_STRIDES0,IIN0_STRIDES0) &
#ifndef _CRAYFTN
    !$ACC& ASYNC(1)
#else
    !$ACC&
#endif
#endif
    DO KMLOC=1,D_NUMP
      DO JGL=1,R_NDGNH
        DO JF=1,KF_FS*2
          KM = D_MYMS(KMLOC)
          ISL = R_NDGNH-G_NDGLU(KM)+1
          IF (JGL >= ISL) THEN
            !(DO JGL=ISL,R_NDGNH)
            IGLS = R_NDGL+1-JGL
            OFFSET1 = 2_JPIB*D_NPNTGTB1(KMLOC,JGL )*KF_FS
            OFFSET2 = 2_JPIB*D_NPNTGTB1(KMLOC,IGLS)*KF_FS
            PAIA = 0
            PAIS = 0
            IF (KM /= 0) THEN
              PAIA=REAL(F_RW(JGL),JPRBT)*ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC))
              PAIS=REAL(F_RW(JGL),JPRBT)*ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC))
            ELSEIF (MOD(JF-1,2) == 0) THEN
              ! every other field is sufficient because Im(KM=0) == 0
              PAIA=REAL(F_RW(JGL),JPRBT)*ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)
              PAIS=REAL(F_RW(JGL),JPRBT)*ZINPS0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)
            ENDIF
            IF (JF <= 4*KF_UV) THEN
                ! Multiply in case of velocity
              PAIA = PAIA*REAL(F_RACTHE(JGL),JPRBT)
              PAIS = PAIS*REAL(F_RACTHE(JGL),JPRBT)
            ENDIF
            FOUBUF(OFFSET1+JF) =  PAIA+PAIS
            FOUBUF(OFFSET2+JF) = -PAIA+PAIS
          ENDIF
        ENDDO
      ENDDO
    END DO

#ifdef OMPGPU
    !$OMP END TARGET DATA
#endif
#ifdef ACCGPU
    !$ACC END DATA
#endif

   END ASSOCIATE
  END SUBROUTINE TRLTOMAD_UNPACK

END MODULE TRLTOMAD_PACK_UNPACK