trmtolad_pack_unpack.F90 Source File


This file depends on

sourcefile~~trmtolad_pack_unpack.f90~~EfferentGraph sourcefile~trmtolad_pack_unpack.f90 trmtolad_pack_unpack.F90 sourcefile~buffered_allocator_mod.f90 buffered_allocator_mod.F90 sourcefile~trmtolad_pack_unpack.f90->sourcefile~buffered_allocator_mod.f90 sourcefile~leinv_mod.f90 leinv_mod.F90 sourcefile~trmtolad_pack_unpack.f90->sourcefile~leinv_mod.f90 sourcefile~parkind_ectrans.f90 parkind_ectrans.F90 sourcefile~trmtolad_pack_unpack.f90->sourcefile~parkind_ectrans.f90 sourcefile~tpm_dim.f90 tpm_dim.F90 sourcefile~trmtolad_pack_unpack.f90->sourcefile~tpm_dim.f90 sourcefile~tpm_distr.f90 tpm_distr.F90 sourcefile~trmtolad_pack_unpack.f90->sourcefile~tpm_distr.f90 sourcefile~tpm_geometry.f90 tpm_geometry.F90 sourcefile~trmtolad_pack_unpack.f90->sourcefile~tpm_geometry.f90 sourcefile~abort_trans_mod.f90 abort_trans_mod.F90 sourcefile~buffered_allocator_mod.f90->sourcefile~abort_trans_mod.f90 sourcefile~growing_allocator_mod.f90 growing_allocator_mod.F90 sourcefile~buffered_allocator_mod.f90->sourcefile~growing_allocator_mod.f90 sourcefile~leinv_mod.f90->sourcefile~buffered_allocator_mod.f90 sourcefile~leinv_mod.f90->sourcefile~parkind_ectrans.f90 sourcefile~leinv_mod.f90->sourcefile~tpm_dim.f90 sourcefile~leinv_mod.f90->sourcefile~tpm_distr.f90 sourcefile~leinv_mod.f90->sourcefile~tpm_geometry.f90 sourcefile~hicblas_mod.f90 hicblas_mod.F90 sourcefile~leinv_mod.f90->sourcefile~hicblas_mod.f90 sourcefile~tpm_fields_gpu.f90 tpm_fields_gpu.F90 sourcefile~leinv_mod.f90->sourcefile~tpm_fields_gpu.f90 sourcefile~tpm_gen.f90 tpm_gen.F90 sourcefile~leinv_mod.f90->sourcefile~tpm_gen.f90 sourcefile~tpm_stats.f90 tpm_stats.F90 sourcefile~leinv_mod.f90->sourcefile~tpm_stats.f90 sourcefile~abort_trans_mod.f90->sourcefile~tpm_gen.f90 sourcefile~growing_allocator_mod.f90->sourcefile~abort_trans_mod.f90 sourcefile~growing_allocator_mod.f90->sourcefile~tpm_gen.f90

Files dependent on this one

sourcefile~~trmtolad_pack_unpack.f90~~AfferentGraph sourcefile~trmtolad_pack_unpack.f90 trmtolad_pack_unpack.F90 sourcefile~inv_trans_ctlad_mod.f90 inv_trans_ctlad_mod.F90 sourcefile~inv_trans_ctlad_mod.f90->sourcefile~trmtolad_pack_unpack.f90 sourcefile~inv_transad.f90 inv_transad.F90 sourcefile~inv_transad.f90->sourcefile~inv_trans_ctlad_mod.f90 sourcefile~inv_transad.f90~2 inv_transad.F90 sourcefile~inv_transad.f90~2->sourcefile~inv_trans_ctlad_mod.f90

Source Code

#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A))
! (C) Copyright 2000- ECMWF.
! (C) Copyright 2000- Meteo-France.
! (C) Copyright 2022- NVIDIA.
! 
! This software is licensed under the terms of the Apache Licence Version 2.0
! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
! In applying this licence, ECMWF does not waive the privileges and immunities
! granted to it by virtue of its status as an intergovernmental organisation
! nor does it submit to any jurisdiction.
!

MODULE TRMTOLAD_PACK_UNPACK
  USE BUFFERED_ALLOCATOR_MOD, ONLY: ALLOCATION_RESERVATION_HANDLE
  IMPLICIT NONE

  PRIVATE
  PUBLIC :: TRMTOLAD_PACK, TRMTOLAD_PACK_HANDLE, PREPARE_TRMTOLAD_PACK
  PUBLIC :: TRMTOLAD_UNPACK, TRMTOLAD_UNPACK_HANDLE, PREPARE_TRMTOLAD_UNPACK

  TYPE TRMTOLAD_PACK_HANDLE
    TYPE(ALLOCATION_RESERVATION_HANDLE) :: HOUTS_AND_OUTA
  END TYPE
  TYPE TRMTOLAD_UNPACK_HANDLE
    TYPE(ALLOCATION_RESERVATION_HANDLE) :: HPFBUF
  END TYPE

CONTAINS
  FUNCTION PREPARE_TRMTOLAD_PACK(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) RESULT(HTRMTOLAD_PACK)
    USE PARKIND_ECTRANS,        ONLY: JPIM, JPRBT, JPRD, JPIB
    USE TPM_DISTR,              ONLY: D
    USE TPM_DIM,                ONLY: R
    USE ISO_C_BINDING,          ONLY: C_SIZE_T, C_SIZEOF
    USE LEINV_MOD,              ONLY: LEINV_STRIDES
    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE

    IMPLICIT NONE

    TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR
    INTEGER(KIND=JPIM), INTENT(IN) :: KF_UV,KF_SCALARS
    LOGICAL, INTENT(IN) :: LVORGP,LDIVGP,LSCDERS

    TYPE(TRMTOLAD_PACK_HANDLE) :: HTRMTOLAD_PACK

    INTEGER(KIND=JPIB) :: IALLOC_SZ
    INTEGER(KIND=JPIM)  :: IOUT_STRIDES0
    INTEGER(KIND=JPIB)  :: IOUT_SIZE
    INTEGER(KIND=JPIM)  :: IOUT0_STRIDES0, IOUT0_SIZE
    INTEGER(KIND=JPIM)  :: IIN_STRIDES0
    INTEGER(KIND=JPIB)  :: IIN_SIZE
    INTEGER(KIND=JPIM)  :: IIN0_STRIDES0, IIN0_SIZE

    REAL(KIND=JPRBT) :: ZPRBT_DUMMY
    REAL(KIND=JPRD) :: ZPRD_DUMMY

    INTEGER(KIND=JPIM) :: IF_READIN, IF_LEG

    ! # fields that are initially read. We always read vorticity
    ! and divergence! Also keep in mind that we actually have 2X
    ! this number of levels because real+complex
    IF_READIN = 0
    IF_READIN = IF_READIN + KF_UV ! Vorticity or divergence
    IF_READIN = IF_READIN + KF_UV ! Vorticity or divergence
    IF_READIN = IF_READIN + KF_UV ! U
    IF_READIN = IF_READIN + KF_UV ! V
    IF_READIN = IF_READIN + KF_SCALARS ! Scalars
    IF (LSCDERS) &
      IF_READIN = IF_READIN + KF_SCALARS ! Scalars NS Derivatives
    
    ! In Legendre space, we then ignore vorticity/divergence, if
    ! they don't need to be transformed.
    IF_LEG = IF_READIN
    IF(.NOT. LVORGP) IF_LEG = IF_LEG - KF_UV ! No vorticity needed
    IF(.NOT. LDIVGP) IF_LEG = IF_LEG - KF_UV ! No divergence needed

    CALL LEINV_STRIDES(IF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,&
                       IOUT0_STRIDES0,IOUT0_SIZE,IIN0_STRIDES0,IIN0_SIZE)

    IALLOC_SZ = 0
    ! ZOUTA
    IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT_SIZE*C_SIZEOF(ZPRBT_DUMMY),128)
    ! ZOUTS
    IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT_SIZE*C_SIZEOF(ZPRBT_DUMMY),128)
    ! ZOUTA0
    IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT0_SIZE*C_SIZEOF(ZPRD_DUMMY),128)
    ! ZOUTS0
    IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT0_SIZE*C_SIZEOF(ZPRD_DUMMY),128)

    HTRMTOLAD_PACK%HOUTS_AND_OUTA = RESERVE(ALLOCATOR, IALLOC_SZ, "HTRMTOLAD_PACK%HOUTS_AND_OUTA")

  END FUNCTION
  SUBROUTINE TRMTOLAD_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,KF_LEG)

    !**** *TRMTOL_PACK* - Packing buffer for TRMTOL

    !     Purpose.
    !     --------
    !        Packs data from LTINV outputs into FOUBUF for conversion to fourier space

    !**   Interface.
    !     ----------
    !        CALL TRMTOL_PACK(...)

    !        Explicit arguments :  ZOUTS - symmetric data
    !        --------------------  ZOUTA - asymmetric data
    !                              ZOUTS0 - symmetric data for KMLOC0
    !                              ZOUTA0 - asymmetric data for KMLOC0
    !                              FOUBUF_IN - output towards TRMTOL
    !                              KF_LEG - number of fields (we have 2XKF_LEG because complex)

    !        Implicit arguments :  None.
    !        --------------------

    !     Externals.
    !     ----------

    !     Reference.
    !     ----------
    !        ECMWF Research Department documentation of the IFS

    !     Author.
    !     -------
    !      Nils Wedi + Mats Hamrud + George Modzynski
    !
    !     Modifications.
    !     --------------
    !        J.Hague : Oct 2012 DR_HOOK round calls to DGEMM:
    !      F. Vana  05-Mar-2015  Support for single precision
    !     ------------------------------------------------------------------

    USE PARKIND_ECTRANS,        ONLY: JPIM, JPRB, JPRBT, JPRD, JPIB
    USE YOMHOOK,                ONLY: LHOOK, DR_HOOK, JPHOOK
    USE TPM_DIM,                ONLY: R
    USE TPM_GEOMETRY,           ONLY: G
    USE TPM_DISTR,              ONLY: D
    USE LEINV_MOD,              ONLY: LEINV_STRIDES
    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION
    USE ISO_C_BINDING,          ONLY: C_SIZE_T, C_SIZEOF

    IMPLICIT NONE


    !     DUMMY ARGUMENTS
    TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR
    TYPE(TRMTOLAD_PACK_HANDLE), INTENT(IN) :: HTRMTOL_PACK
    REAL(KIND=JPRB), INTENT(IN) :: FOUBUF_IN(:)
    REAL(KIND=JPRBT), INTENT(OUT), POINTER :: ZOUTS(:), ZOUTA(:)
    REAL(KIND=JPRD), INTENT(OUT), POINTER :: ZOUTS0(:), ZOUTA0(:)
    INTEGER(KIND=JPIM), INTENT(IN)  :: KF_LEG

    !     LOCAL
    REAL(KIND=JPRBT) :: ZAOA, ZSOA

    INTEGER(KIND=JPIB) :: IALLOC_POS, IALLOC_SZ
    INTEGER(KIND=JPIM) :: KMLOC, KM, ISL, JGL, JK, IGLS
    INTEGER(KIND=JPIB) :: OFFSET1, OFFSET2
    INTEGER(KIND=JPIM)  :: IOUT_STRIDES0
    INTEGER(KIND=JPIB)  :: IOUT_SIZE
    INTEGER(KIND=JPIM)  :: IOUT0_STRIDES0, IOUT0_SIZE

    REAL(KIND=JPHOOK) :: ZHOOK_HANDLE

    ASSOCIATE(D_NUMP=>D%NUMP, R_NDGNH=>R%NDGNH, R_NDGL=>R%NDGL, G_NDGLU=>G%NDGLU, &
            & D_MYMS=>D%MYMS, D_NPNTGTB1=>D%NPNTGTB1, D_OFFSETS_GEMM1=>D%OFFSETS_GEMM1)

    IF (LHOOK) CALL DR_HOOK('TRMTOLAD_PACK',0,ZHOOK_HANDLE)
    
    CALL LEINV_STRIDES(KF_LEG,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_SIZE=IOUT_SIZE,&
                       IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_SIZE=IOUT0_SIZE)

    IALLOC_POS = 1

    ! ZOUTA
    IALLOC_SZ = ALIGN(IOUT_SIZE*C_SIZEOF(ZOUTA(1)),128)
    CALL ASSIGN_PTR(ZOUTA, GET_ALLOCATION(ALLOCATOR, HTRMTOL_PACK%HOUTS_AND_OUTA),&
        & IALLOC_POS, IALLOC_SZ)
    IALLOC_POS = IALLOC_POS + IALLOC_SZ

    ! ZOUTS
    IALLOC_SZ = ALIGN(IOUT_SIZE*C_SIZEOF(ZOUTS(1)),128)
    CALL ASSIGN_PTR(ZOUTS, GET_ALLOCATION(ALLOCATOR, HTRMTOL_PACK%HOUTS_AND_OUTA),&
        & IALLOC_POS, IALLOC_SZ)
    IALLOC_POS = IALLOC_POS + IALLOC_SZ

    ! ZOUTA0
    IALLOC_SZ = ALIGN(IOUT0_SIZE*C_SIZEOF(ZOUTA0(1)),128)
    CALL ASSIGN_PTR(ZOUTA0, GET_ALLOCATION(ALLOCATOR, HTRMTOL_PACK%HOUTS_AND_OUTA),&
        & IALLOC_POS, IALLOC_SZ)
    IALLOC_POS = IALLOC_POS + IALLOC_SZ

    ! ZOUTS0
    IALLOC_SZ = ALIGN(IOUT0_SIZE*C_SIZEOF(ZOUTS0(1)),128)
    CALL ASSIGN_PTR(ZOUTS0, GET_ALLOCATION(ALLOCATOR, HTRMTOL_PACK%HOUTS_AND_OUTA),&
        & IALLOC_POS, IALLOC_SZ)
    IALLOC_POS = IALLOC_POS + IALLOC_SZ
    
#ifdef OMPGPU
    !$OMP TARGET DATA MAP(PRESENT,ALLOC:D,D_MYMS,D_NPNTGTB1,D_NUMP,G,G_NDGLU,R,R_NDGNH,R_NDGL) &
    !$OMP&            MAP(PRESENT,ALLOC:ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,D_OFFSETS_GEMM1)
#endif
#ifdef ACCGPU
    !$ACC DATA PRESENT(D,D_MYMS,D_NPNTGTB1,D_NUMP,G,G_NDGLU,R,R_NDGNH,R_NDGL) &
    !$ACC&     PRESENT(ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,D_OFFSETS_GEMM1)
#endif

#ifdef OMPGPU
    ! Directive incomplete -> putting more variables in SHARED() triggers internal compiler error
    ! ftn-7991: INTERNAL COMPILER ERROR:  "Too few arguments on the stack"
    !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) &
    !$OMP& SHARED(D,R,G,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN) &
    !$OMP& PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,ZAOA,ZSOA) &
    !$OMP& MAP(TO:KF_LEG,IOUT_STRIDES0,IOUT0_STRIDES0)
#endif
#ifdef ACCGPU
    !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,ZAOA,ZSOA) &
    !$ACC&              FIRSTPRIVATE(KF_LEG,IOUT_STRIDES0,IOUT0_STRIDES0) &
#ifndef _CRAYFTN
    !$ACC& ASYNC(1)
#else
    !$ACC&
#endif
#endif
    DO KMLOC=1,D_NUMP
      DO JGL=1,R_NDGNH
        DO JK=1,2*KF_LEG
          KM = D_MYMS(KMLOC)
          ISL = R_NDGNH-G_NDGLU(KM)+1
          IF (JGL >= ISL) THEN
            !(DO JGL=ISL,R_NDGNH)
            IGLS = R_NDGL+1-JGL
            OFFSET1 = 2_JPIB*D_NPNTGTB1(KMLOC,JGL )*KF_LEG
            OFFSET2 = 2_JPIB*D_NPNTGTB1(KMLOC,IGLS)*KF_LEG
            
            ZSOA = FOUBUF_IN(OFFSET1+JK) + FOUBUF_IN(OFFSET2+JK)
            ZAOA = FOUBUF_IN(OFFSET1+JK) - FOUBUF_IN(OFFSET2+JK)
            IF(KM /= 0) THEN
              ZOUTS(JK+(JGL-ISL)*IOUT_STRIDES0+D_OFFSETS_GEMM1(KMLOC)*IOUT_STRIDES0) = ZSOA
              ZOUTA(JK+(JGL-ISL)*IOUT_STRIDES0+D_OFFSETS_GEMM1(KMLOC)*IOUT_STRIDES0) = ZAOA
            ELSEIF (MOD((JK-1),2) .EQ. 0) THEN
              ZOUTS0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) = ZSOA
              ZOUTA0((JK-1)/2+1+(JGL-1)*IOUT0_STRIDES0) = ZAOA
            ELSE
              ! Imaginary values of KM=0 is zero, though I don't think we care
              ! ZSOA = 0_JPRBT
              ! ZAOA = 0_JPRBT
            ENDIF

          ENDIF
        ENDDO
      ENDDO
    ENDDO

#ifdef OMPGPU
    !$OMP END TARGET DATA
#endif
#ifdef ACCGPU
    !$ACC WAIT(1)

    !$ACC END DATA
#endif

    IF (LHOOK) CALL DR_HOOK('TRMTOLAD_PACK',1,ZHOOK_HANDLE)

    END ASSOCIATE
  END SUBROUTINE TRMTOLAD_PACK

  FUNCTION PREPARE_TRMTOLAD_UNPACK(ALLOCATOR,KF_LEG) RESULT(HTRMTOLAD_UNPACK)
    USE PARKIND_ECTRANS,        ONLY: JPIM, JPRBT, JPIB
    USE TPM_DISTR,              ONLY: D
    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE
    USE ISO_C_BINDING,          ONLY: C_SIZE_T, C_SIZEOF

    IMPLICIT NONE

    TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR
    INTEGER(KIND=JPIM) :: KF_LEG

    TYPE(TRMTOLAD_UNPACK_HANDLE) :: HTRMTOLAD_UNPACK

    REAL(KIND=JPRBT) :: DUMMY

    HTRMTOLAD_UNPACK%HPFBUF = RESERVE(ALLOCATOR, 2_JPIB*D%NLENGT0B*KF_LEG*C_SIZEOF(DUMMY), "HTRMTOLAD_UNPACK%HPFBUF")
  END FUNCTION PREPARE_TRMTOLAD_UNPACK
  SUBROUTINE TRMTOLAD_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL)

  !**** *TRMTOL_UNPACK* - Copy fourier data from buffer to local array

  !     Purpose.
  !     --------
  !        Routine for copying fourier data from buffer to local array

  !**   Interface.
  !     ----------
  !     CALL TRMTOL_UNPACK(...)

  !     Explicit arguments :  PREEL_COMPLEX - local fourier/GP array
  !     --------------------  KF_CURRENT - number of fields that are read (from Legendre space)
  !                           KF_TOTAL - total fields in PREEL ("stride")
  !
  !     Externals.  None.
  !     ----------

  !     Author.
  !     -------
  !        Mats Hamrud *ECMWF*

  !     Modifications.
  !     --------------
  !        Original : 2000-04-01

  !     ------------------------------------------------------------------

  USE PARKIND_ECTRANS,        ONLY: JPIM, JPRBT, JPIB
  USE TPM_DISTR,              ONLY: D, MYSETW
  USE TPM_GEOMETRY,           ONLY: G
  USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION
  USE ISO_C_BINDING,          ONLY: C_SIZE_T, C_SIZEOF
  !

  IMPLICIT NONE

  REAL(KIND=JPRBT), INTENT(OUT), POINTER :: FOUBUF(:)
  REAL(KIND=JPRBT), INTENT(IN) :: PREEL_COMPLEX(:)
  INTEGER(KIND=JPIM),INTENT(IN) :: KF_CURRENT, KF_TOTAL
  TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR
  TYPE(TRMTOLAD_UNPACK_HANDLE), INTENT(IN) :: HTRMTOL_UNPACK

  INTEGER(KIND=JPIM) :: JM,JF,IGLG,OFFSET_VAR,KGL,ILOEN_MAX
  INTEGER(KIND=JPIB) :: IOFF_LAT, ISTA
  REAL(KIND=JPRBT) :: RET_REAL, RET_COMPLEX

  ASSOCIATE(D_NDGL_FS=>D%NDGL_FS, D_NSTAGTF=>D%NSTAGTF, D_NPNTGTB0=>D%NPNTGTB0, D_NPTRLS=>D%NPTRLS, &
          & G_NLOEN=>G%NLOEN, G_NMEN=>G%NMEN)

  CALL ASSIGN_PTR(FOUBUF, GET_ALLOCATION(ALLOCATOR, HTRMTOL_UNPACK%HPFBUF),&
                & 1_JPIB, 2_JPIB*D%NLENGT0B*KF_CURRENT*C_SIZEOF(FOUBUF(1)))

#ifdef OMPGPU
  !$OMP TARGET DATA MAP(PRESENT,ALLOC:G,G_NLOEN,G_NMEN,D,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF,D_NDGL_FS)
#endif
#ifdef ACCGPU
  !$ACC DATA PRESENT(G,G_NLOEN,G_NMEN,D,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF,D_NDGL_FS) ASYNC(1)
#endif

  OFFSET_VAR=D_NPTRLS(MYSETW)
  ILOEN_MAX=MAXVAL(G_NLOEN)
#ifdef OMPGPU
  ! Directive incomplete -> putting more variables in SHARED() triggers internal compiler error
  ! ftn-7991: INTERNAL COMPILER ERROR:  "Too few arguments on the stack"
  !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) &
  !$OMP& SHARED(D,G,KF_CURRENT,ILOEN_MAX,OFFSET_VAR,FOUBUF,PREEL_COMPLEX) &
  !$OMP& PRIVATE(IGLG,ISTA) &
  !$OMP& MAP(TO:KF_CURRENT,ILOEN_MAX,OFFSET_VAR,KF_TOTAL)
#endif
#ifdef ACCGPU
  !$ACC PARALLEL LOOP PRIVATE(IGLG,IOFF_LAT,ISTA) FIRSTPRIVATE(KF_CURRENT,&
  !$ACC&              KF_TOTAL,OFFSET_VAR,ILOEN_MAX) DEFAULT(NONE) TILE(32,16,1) &
#ifndef _CRAYFTN
  !$ACC& ASYNC(1)
#else
  !$ACC&
#endif
#endif
  DO KGL=1,D_NDGL_FS
    DO JF=1,KF_CURRENT
      DO JM=0,ILOEN_MAX/2
        IGLG = OFFSET_VAR+KGL-1

        ! FFT transforms NLON real values to floor(NLON/2)+1 complex numbers. Hence we have
        ! to fill those floor(NLON/2)+1 values.
        ! Truncation happens starting at G_NMEN+1. Hence, we zero-fill those values.
        IF (JM <= G_NLOEN(IGLG)/2) THEN
          IOFF_LAT = 1_JPIB*KF_TOTAL*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL))

          IF (JM <= G_NMEN(IGLG)) THEN
            ISTA  = 2_JPIB*D_NPNTGTB0(JM,KGL)*KF_CURRENT

            FOUBUF(ISTA+2*JF-1) = PREEL_COMPLEX(IOFF_LAT+2*JM+1)
            FOUBUF(ISTA+2*JF  ) = PREEL_COMPLEX(IOFF_LAT+2*JM+2)
          ENDIF
        ENDIF
      ENDDO
    ENDDO
  ENDDO
#ifdef OMPGPU
  !$OMP END TARGET DATA
#endif
#ifdef ACCGPU
  !$ACC END DATA

  !$ACC WAIT(1)
#endif

  END ASSOCIATE

  END SUBROUTINE TRMTOLAD_UNPACK
END MODULE TRMTOLAD_PACK_UNPACK