dis_extract Subroutine

private subroutine dis_extract()

Uses

  • proc~~dis_extract~~UsesGraph proc~dis_extract dis_extract module~w90_sitesym w90_sitesym proc~dis_extract->module~w90_sitesym module~w90_io w90_io proc~dis_extract->module~w90_io module~w90_sitesym->module~w90_io module~w90_constants w90_constants module~w90_sitesym->module~w90_constants module~w90_io->module~w90_constants

Extracts an num_wann-dimensional subspace at each k by minimizing Omega_I

! send chunks of wkomegai1 to root node call comms_gatherv(wkomegai1_loc, counts(my_node_id), wkomegai1, counts, displs) ! send back the whole wkomegai1 array to other nodes call comms_bcast(wkomegai1(1), num_kpts)

! send chunks of wkomegai1 to root node call comms_gatherv(wkomegai1_loc, counts(my_node_id), wkomegai1, counts, displs) ! send back the whole wkomegai1 array to other nodes call comms_bcast(wkomegai1(1), num_kpts)

Arguments

None

Calls

proc~~dis_extract~~CallsGraph proc~dis_extract dis_extract zhpevx zhpevx proc~dis_extract->zhpevx proc~sitesym_symmetrize_zmatrix sitesym_symmetrize_zmatrix proc~dis_extract->proc~sitesym_symmetrize_zmatrix proc~io_error io_error proc~dis_extract->proc~io_error interface~comms_allreduce comms_allreduce proc~dis_extract->interface~comms_allreduce proc~comms_array_split comms_array_split proc~dis_extract->proc~comms_array_split proc~io_wallclocktime io_wallclocktime proc~dis_extract->proc~io_wallclocktime proc~comms_allreduce_real comms_allreduce_real interface~comms_allreduce->proc~comms_allreduce_real proc~comms_allreduce_cmplx comms_allreduce_cmplx interface~comms_allreduce->proc~comms_allreduce_cmplx

Called by

proc~~dis_extract~~CalledByGraph proc~dis_extract dis_extract proc~dis_main dis_main proc~dis_main->proc~dis_extract program~wannier wannier program~wannier->proc~dis_main proc~wannier_run wannier_run proc~wannier_run->proc~dis_main

Contents

Source Code


Source Code

  subroutine dis_extract()
    !==================================================================!
    !                                                                  !
    !! Extracts an num_wann-dimensional subspace at each k by
    !! minimizing Omega_I
    !                                                                  !
    !==================================================================!

    use w90_io, only: io_wallclocktime
    use w90_sitesym, only: ir2ik, ik2ir, nkptirr, nsymmetry, kptsym !YN: RS:

    implicit none

    ! MODIFIED:
    !           u_matrix_opt (At input it contains the initial guess for the optima
    ! subspace (expressed in terms of the original states inside the window)
    ! output it contains the  states that diagonalize the hamiltonian inside
    ! optimal subspace (again expressed in terms of the original window stat
    ! Giving out states that diagonalize the hamiltonian inside the optimal
    ! subspace (instead of the eigenstates of the Z matrix) is useful for
    ! performing the Wannier interpolation of the energy bands as described
    ! Sec. III.F of SMV)
    !
    !           eigval (At input: original energy eigenvalues.
    ! At output: eigenvalues of the hamiltonian inside optimal subspace)

    ! ----------------------------------------------------------------------
    ! TO DO: The complement subspace is computed but is not saved anywhere!
    ! (Check what was done with it in original code space.f)
    ! Diagonalize Z matrix only at those k points where ndimwin>num_wann?
    ! ----------------------------------------------------------------------

    ! *******************
    ! SHELLS OF K-VECTORS
    ! *******************
    ! nshells           number of shells of k-points to be used in the
    !                   finite-difference formulas for the k-derivatives
    ! aam: wb is now wb(1:nntot) 09/04/2006
    ! wb(nkp,nnx)       weight of the nnx-th b-vector (ordered along shells
    !                   of increasing length) associated with the nkp-th k-p
    ! wbtot             sum of the weights of all b-vectors associated with
    !                   given k-point (k-point 1 is used in calculation)
    ! nnlist(nkp,nnx)   vkpt(1:3,nnlist(nkp,nnx)) is the nnx-th neighboring
    !                   k-point of the nkp-th k-point vkpt(1:3,nkp) (or its
    !                   periodic image in the "home Brillouin zone")
    ! cm(n,m,nkp,nnx)   Overlap matrix <u_nk|u_{m,k+b}>

    ! Internal variables
    integer :: i, j, l, m, n, nn, nkp, nkp2, info, ierr, ndimk, p
    integer :: icompflag, iter, ndiff
    real(kind=dp) :: womegai, wkomegai, womegai1, rsum, delta_womegai
    real(kind=dp), allocatable :: wkomegai1(:)

    ! for MPI
    real(kind=dp), allocatable :: wkomegai1_loc(:)
    complex(kind=dp), allocatable :: camp_loc(:, :, :)
    complex(kind=dp), allocatable :: u_matrix_opt_loc(:, :, :)

    complex(kind=dp), allocatable :: ceamp(:, :, :)
    complex(kind=dp), allocatable :: camp(:, :, :)
    complex(kind=dp), allocatable :: czmat_in(:, :, :)
    complex(kind=dp), allocatable :: czmat_out(:, :, :)
    ! the z-matrices are now stored in local arrays
    complex(kind=dp), allocatable :: czmat_in_loc(:, :, :)
    complex(kind=dp), allocatable :: czmat_out_loc(:, :, :)
    complex(kind=dp), allocatable :: cham(:, :, :)

    integer, allocatable :: iwork(:)
    integer, allocatable :: ifail(:)
    real(kind=dp), allocatable :: w(:)
    real(kind=dp), allocatable :: rwork(:)
    complex(kind=dp), allocatable :: cap(:)
    complex(kind=dp), allocatable :: cwork(:)
    complex(kind=dp), allocatable :: cz(:, :)

    complex(kind=dp), allocatable :: cwb(:, :), cww(:, :), cbw(:, :)

    real(kind=dp), allocatable :: history(:)
    logical                       :: dis_converged
    complex(kind=dp) :: lambda(num_wann, num_wann) !RS:

    ! Needed to split an array on different nodes
    integer, dimension(0:num_nodes - 1) :: counts
    integer, dimension(0:num_nodes - 1) :: displs
    integer :: nkp_loc

    if (timing_level > 1 .and. on_root) call io_stopwatch('dis: extract', 1)

    if (on_root) write (stdout, '(/1x,a)') &
      '                  Extraction of optimally-connected subspace                  '
    if (on_root) write (stdout, '(1x,a)') &
      '                  ------------------------------------------                  '

    allocate (cwb(num_wann, num_bands), stat=ierr)
    if (ierr /= 0) call io_error('Error allocating cwb in dis_extract')
    allocate (cww(num_wann, num_wann), stat=ierr)
    if (ierr /= 0) call io_error('Error allocating cww in dis_extract')
    allocate (cbw(num_bands, num_wann), stat=ierr)
    if (ierr /= 0) call io_error('Error allocating cbw in dis_extract')
    cwb = cmplx_0; cww = cmplx_0; cbw = cmplx_0

    allocate (iwork(5*num_bands), stat=ierr)
    if (ierr /= 0) call io_error('Error allocating iwork in dis_extract')
    allocate (ifail(num_bands), stat=ierr)
    if (ierr /= 0) call io_error('Error allocating ifail in dis_extract')
    allocate (w(num_bands), stat=ierr)
    if (ierr /= 0) call io_error('Error allocating w in dis_extract')
    allocate (rwork(7*num_bands), stat=ierr)
    if (ierr /= 0) call io_error('Error allocating rwork in dis_extract')
    allocate (cap((num_bands*(num_bands + 1))/2), stat=ierr)
    if (ierr /= 0) call io_error('Error allocating cap in dis_extract')
    allocate (cwork(2*num_bands), stat=ierr)
    if (ierr /= 0) call io_error('Error allocating cwork in dis_extract')
    allocate (cz(num_bands, num_bands), stat=ierr)
    if (ierr /= 0) call io_error('Error allocating cz in dis_extract')

    ! for MPI
    call comms_array_split(num_kpts, counts, displs)
    allocate (u_matrix_opt_loc(num_bands, num_wann, max(1, counts(my_node_id))), stat=ierr)
    if (ierr /= 0) call io_error('Error allocating u_matrix_opt_loc in dis_extract')
    ! Copy matrix elements from global U matrix to local U matrix
    do nkp_loc = 1, counts(my_node_id)
      nkp = nkp_loc + displs(my_node_id)
      u_matrix_opt_loc(:, :, nkp_loc) = u_matrix_opt(:, :, nkp)
    enddo
    allocate (wkomegai1_loc(max(1, counts(my_node_id))), stat=ierr)
    if (ierr /= 0) call io_error('Error allocating wkomegai1_loc in dis_extract')
    allocate (czmat_in_loc(num_bands, num_bands, max(1, counts(my_node_id))), stat=ierr)
    if (ierr /= 0) call io_error('Error allocating czmat_in_loc in dis_extract')
    allocate (czmat_out_loc(num_bands, num_bands, max(1, counts(my_node_id))), stat=ierr)
    if (ierr /= 0) call io_error('Error allocating czmat_out_loc in dis_extract')

    allocate (wkomegai1(num_kpts), stat=ierr)
    if (ierr /= 0) call io_error('Error allocating wkomegai1 in dis_extract')
    allocate (czmat_in(num_bands, num_bands, num_kpts), stat=ierr)
    if (ierr /= 0) call io_error('Error allocating czmat_in in dis_extract')
    allocate (czmat_out(num_bands, num_bands, num_kpts), stat=ierr)
    if (ierr /= 0) call io_error('Error allocating czmat_out in dis_extract')

    allocate (history(dis_conv_window), stat=ierr)
    if (ierr /= 0) call io_error('Error allocating history in dis_extract')

    ! ********************************************
    ! ENERGY WINDOWS AND SUBSPACES AT EACH K-POINT
    ! ********************************************
    ! num_wann             dimensionality of the subspace at each k-point
    !                   (number of Wannier functions per unit cell that we w
    ! NDIMWIN(NKP)      number of bands at the nkp-th k-point that fall
    !                   within the outer energy window
    ! NDIMFROZ(NKP)     number of frozen bands at the nkp-th k-point
    ! INDXNFROZ(I,NKP)  INDEX (BETWEEN 1 AND NDIMWIN(NKP)) OF THE I-TH NON-F
    !                   ORIGINAL BAND STATE AT THE NKP-TH K-POINT
    ! U_MATRIX_OPT(J,L,NKP)    AMPLITUDE OF THE J-TH ENERGY EIGENVECTOR INSIDE THE
    !                   ENERGY WINDOW AT THE NKP-TH K-POINT IN THE EXPANSION
    !                   THE L-TH LEADING RLAMBDA EIGENVECTOR AT THE SAME K-P
    !                   If there are M_k frozen states, they occupy the lowe
    !                   entries of the second index of u_matrix_opt, and the leadin
    !                   nabnds-M_k eigenvectors of the Z matrix occupy the
    !                   remaining slots
    ! CAMP(J,L,NKP)     SAME AS U_MATRIX_OPT, BUT FOR THE COMPLEMENT SUBSPACE INSID
    !                   ENERGY WINDOW (I.E., THE NON-LEADING RLAMBDA EIGENVE
    ! CEAMP(J,L,NKPTS)  SAME AS U_MATRIX_OPT, BUT INSTEAD OF RLAMBDA EIGENVECTOR, I
    !                   FOR THE ENERGY EIGENVECTOR OBTAINED BY DIAGONALIZING
    !                   HAMILTONIAN IN THE OPTIMIZED SUBSPACE
    ! CZMAT_IN(M,N,NKP) Z-MATRIX [Eq. (21) SMV]
    ! CZMAT_OUT(M,N,NKP) OUTPUT Z-MATRIX FROM THE PRESENT ITERATION
    ! RLAMBDA           An eigenvalue of the Z matrix
    ! womegai           Gauge-invariant Wannier spread, computed usinf all s
    !                   from current iteration
    ! wkomegai1(NKP)    Eq. (18) of SMV
    ! womegai1          Eq.(11) of SMV (like wowmegai, but neighboring state
    !                   for computing overlaps are from previous iteration.
    !                   become equal at self-consistency)
    ! alphafixe         mixing parameter for the iterative procedure
    ! nitere            total number of iterations

    ! DEBUG
    if (iprint > 2) then
      if (on_root) write (stdout, '(a,/)') '  Original eigenvalues inside outer window:'
      do nkp = 1, num_kpts
        if (on_root) write (stdout, '(a,i3,3x,20(f9.5,1x))') '  K-point ', nkp, &
          (eigval_opt(i, nkp), i=1, ndimwin(nkp))
      enddo
    endif
    ! ENDDEBUG

    ! TO DO: Check if this is the best place to initialize icompflag
    icompflag = 0

    if (on_root) write (stdout, '(1x,a)') &
      '+---------------------------------------------------------------------+<-- DIS'
    if (on_root) write (stdout, '(1x,a)') &
      '|  Iter     Omega_I(i-1)      Omega_I(i)      Delta (frac.)    Time   |<-- DIS'
    if (on_root) write (stdout, '(1x,a)') &
      '+---------------------------------------------------------------------+<-- DIS'

    dis_converged = .false.

    ! ------------------
    ! BIG ITERATION LOOP
    ! ------------------
    do iter = 1, dis_num_iter

      if (timing_level > 1 .and. on_root) call io_stopwatch('dis: extract_1', 1)

      if (iter .eq. 1) then
        ! Initialize Z matrix at k points w/ non-frozen states
        do nkp_loc = 1, counts(my_node_id)
          nkp = nkp_loc + displs(my_node_id)
          if (num_wann .gt. ndimfroz(nkp)) call internal_zmatrix(nkp, nkp_loc, czmat_in_loc(:, :, nkp_loc))
        enddo

        if (lsitesymmetry) then
          call comms_gatherv(czmat_in_loc, num_bands*num_bands*counts(my_node_id), &
                             czmat_in, num_bands*num_bands*counts, num_bands*num_bands*displs)
          call comms_bcast(czmat_in(1, 1, 1), num_bands*num_bands*num_kpts)
          call sitesym_symmetrize_zmatrix(czmat_in, lwindow) !RS:
          do nkp_loc = 1, counts(my_node_id)
            nkp = nkp_loc + displs(my_node_id)
            czmat_in_loc(:, :, nkp_loc) = czmat_in(:, :, nkp)
          end do
        end if

      else
        ! [iter.ne.1]
        ! Update Z matrix at k points with non-frozen states, using a mixing sch
        do nkp_loc = 1, counts(my_node_id)
          nkp = nkp_loc + displs(my_node_id)
          if (lsitesymmetry) then                !YN: RS:
            if (ir2ik(ik2ir(nkp)) .ne. nkp) cycle !YN: RS:
          endif                                  !YN: RS:
          if (num_wann .gt. ndimfroz(nkp)) then
            ndimk = ndimwin(nkp) - ndimfroz(nkp)
            do i = 1, ndimk
              do j = 1, i
                czmat_in_loc(j, i, nkp_loc) = &
                  cmplx(dis_mix_ratio, 0.0_dp, dp)*czmat_out_loc(j, i, nkp_loc) &
                  + cmplx(1.0_dp - dis_mix_ratio, 0.0_dp, dp)*czmat_in_loc(j, i, nkp_loc)
                ! hermiticity
                czmat_in_loc(i, j, nkp_loc) = conjg(czmat_in_loc(j, i, nkp_loc))
              enddo
            enddo
          endif
        enddo
      endif
      ! [if iter=1]
      if (timing_level > 1 .and. on_root) call io_stopwatch('dis: extract_1', 2)

      if (timing_level > 1 .and. on_root) call io_stopwatch('dis: extract_2', 1)

      womegai1 = 0.0_dp
      ! wkomegai1 is defined by Eq. (18) of SMV.
      ! Contribution to wkomegai1 from frozen states should be calculated now
      ! every k (before updating any k), so that for iter>1 overlaps are with
      ! non-frozen neighboring states from the previous iteration

      wkomegai1 = real(num_wann, dp)*wbtot
      if (lsitesymmetry) then                                                                        !RS:
        do nkp = 1, nkptirr                                                                            !RS:
          wkomegai1(ir2ik(nkp)) = wkomegai1(ir2ik(nkp))*nsymmetry/count(kptsym(:, nkp) .eq. ir2ik(nkp)) !RS:
        enddo                                                                                       !RS:
      endif                                                                                          !RS:
      do nkp_loc = 1, counts(my_node_id)
        nkp = nkp_loc + displs(my_node_id)
        wkomegai1_loc(nkp_loc) = wkomegai1(nkp)
      end do

      do nkp_loc = 1, counts(my_node_id)
        nkp = nkp_loc + displs(my_node_id)
        if (ndimfroz(nkp) .gt. 0) then
          if (lsitesymmetry) call io_error('not implemented in symmetry-adapted mode') !YN: RS:
          do nn = 1, nntot
            nkp2 = nnlist(nkp, nn)
            call zgemm('C', 'N', ndimfroz(nkp), ndimwin(nkp2), ndimwin(nkp), cmplx_1, &
                       u_matrix_opt(:, :, nkp), num_bands, m_matrix_orig_local(:, :, nn, nkp_loc), num_bands, cmplx_0, &
                       cwb, num_wann)
            call zgemm('N', 'N', ndimfroz(nkp), num_wann, ndimwin(nkp2), cmplx_1, &
                       cwb, num_wann, u_matrix_opt(:, :, nkp2), num_bands, cmplx_0, cww, num_wann)
            rsum = 0.0_dp
            do n = 1, num_wann
              do m = 1, ndimfroz(nkp)
                rsum = rsum + real(cww(m, n), dp)**2 + aimag(cww(m, n))**2
              enddo
            enddo
            wkomegai1_loc(nkp_loc) = wkomegai1_loc(nkp_loc) - wb(nn)*rsum
          enddo
        endif
      enddo
      if (timing_level > 1 .and. on_root) call io_stopwatch('dis: extract_2', 2)

      if (timing_level > 1 .and. on_root) call io_stopwatch('dis: extract_3', 1)

      !! ! send chunks of wkomegai1 to root node
      !! call comms_gatherv(wkomegai1_loc, counts(my_node_id), wkomegai1, counts, displs)
      !! ! send back the whole wkomegai1 array to other nodes
      !! call comms_bcast(wkomegai1(1), num_kpts)

      ! Refine optimal subspace at k points w/ non-frozen states
      do nkp_loc = 1, counts(my_node_id)
        nkp = nkp_loc + displs(my_node_id)
        if (lsitesymmetry) then                                                     !RS:
          if (ir2ik(ik2ir(nkp)) .ne. nkp) cycle                                      !RS:
        end if                                                                      !RS:
        if (lsitesymmetry) then                                                     !RS:

          call sitesym_dis_extract_symmetry(nkp, ndimwin(nkp), czmat_in_loc(:, :, nkp_loc), lambda, u_matrix_opt_loc(:, :, nkp_loc)) !RS:

          do j = 1, num_wann                                                          !RS:
            wkomegai1_loc(nkp_loc) = wkomegai1_loc(nkp_loc) - real(lambda(j, j), kind=dp)               !RS:
          enddo                                                                    !RS:
        else                                                                        !RS:
          if (num_wann .gt. ndimfroz(nkp)) then
            ! Diagonalize Z matrix
            do j = 1, ndimwin(nkp) - ndimfroz(nkp)
              do i = 1, j
                cap(i + ((j - 1)*j)/2) = czmat_in_loc(i, j, nkp_loc)
              enddo
            enddo
            ndiff = ndimwin(nkp) - ndimfroz(nkp)
            call ZHPEVX('V', 'A', 'U', ndiff, cap, 0.0_dp, 0.0_dp, 0, 0, &
                        -1.0_dp, m, w, cz, num_bands, cwork, rwork, iwork, ifail, info)
            if (info .lt. 0) then
              if (on_root) write (stdout, *) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING Z MATRIX'
              if (on_root) write (stdout, *) ' THE ', -info, ' ARGUMENT OF ZHPEVX HAD AN ILLEGAL VALUE'
              call io_error(' dis_extract: error')
            endif
            if (info .gt. 0) then
              if (on_root) write (stdout, *) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING Z MATRIX'
              if (on_root) write (stdout, *) info, ' EIGENVECTORS FAILED TO CONVERGE'
              call io_error(' dis_extract: error')
            endif

            ! Update the optimal subspace by incorporating the num_wann-ndimfroz(nkp) l
            ! eigenvectors of the Z matrix into u_matrix_opt. Also, add contribution from
            ! non-frozen states to wkomegai1(nkp) (minus the corresponding eigenvalu
            m = ndimfroz(nkp)
            do j = ndimwin(nkp) - num_wann + 1, ndimwin(nkp) - ndimfroz(nkp)
              m = m + 1
              wkomegai1_loc(nkp_loc) = wkomegai1_loc(nkp_loc) - w(j)
              u_matrix_opt_loc(1:ndimwin(nkp), m, nkp_loc) = cmplx_0
              ndimk = ndimwin(nkp) - ndimfroz(nkp)
              do i = 1, ndimk
                p = indxnfroz(i, nkp)
                u_matrix_opt_loc(p, m, nkp_loc) = cz(i, j)
              enddo
            enddo
          endif
          ! [if num_wann>ndimfroz(nkp)]
        endif !RS:

        ! Now that we have contribs. from both frozen and non-frozen states to
        ! wkomegai1(nkp), add it to womegai1
        womegai1 = womegai1 + wkomegai1_loc(nkp_loc)

        if (index(devel_flag, 'compspace') > 0) then

          ! AT THE LAST ITERATION FIND A BASIS FOR THE (NDIMWIN(NKP)-num_wann)-DIMENS
          ! COMPLEMENT SPACE

          if (iter .eq. dis_num_iter) then
            allocate (camp(num_bands, num_bands, num_kpts), stat=ierr)
            if (ierr /= 0) call io_error('Error allocating camp in dis_extract')
            allocate (camp_loc(num_bands, num_bands, max(1, counts(my_node_id))), stat=ierr)
            if (ierr /= 0) call io_error('Error allocating ucamp_loc in dis_extract')

            if (ndimwin(nkp) .gt. num_wann) then
              do j = 1, ndimwin(nkp) - num_wann
                if (num_wann .gt. ndimfroz(nkp)) then
                  ! USE THE NON-LEADING EIGENVECTORS OF THE Z-MATRIX
                  camp_loc(1:ndimwin(nkp), j, nkp_loc) = cz(1:ndimwin(nkp), j)
                else
                  ! Then num_wann=NDIMFROZ(NKP)
                  ! USE THE ORIGINAL NON-FROZEN BLOCH EIGENSTATES
                  do i = 1, ndimwin(nkp)
                    camp_loc(i, j, nkp_loc) = cmplx_0
                    if (i .eq. indxnfroz(j, nkp)) camp_loc(i, j, nkp_loc) = cmplx_1
                  enddo
                endif
              enddo
            else
              icompflag = 1
            endif
          endif

        end if ! index(devel_flag,'compspace')>0

      enddo
      ! [Loop over k points (nkp)]

      !! ! send chunks of wkomegai1 to root node
      !! call comms_gatherv(wkomegai1_loc, counts(my_node_id), wkomegai1, counts, displs)
      !! ! send back the whole wkomegai1 array to other nodes
      !! call comms_bcast(wkomegai1(1), num_kpts)

      call comms_allreduce(womegai1, 1, 'SUM')

      call comms_gatherv(u_matrix_opt_loc, num_bands*num_wann*counts(my_node_id), &
                         u_matrix_opt, num_bands*num_wann*counts, num_bands*num_wann*displs)
      call comms_bcast(u_matrix_opt(1, 1, 1), num_bands*num_wann*num_kpts)
      if (lsitesymmetry) call sitesym_symmetrize_u_matrix(num_bands, u_matrix_opt, lwindow) !RS:

      if (index(devel_flag, 'compspace') > 0) then
        if (iter .eq. dis_num_iter) then
          call comms_gatherv(camp_loc, num_bands*num_bands*counts(my_node_id), &
                             camp, num_bands*num_bands*counts, num_bands*num_bands*displs)

          call comms_bcast(camp(1, 1, 1), num_bands*num_bands*num_kpts)
        endif
      endif

      if (timing_level > 1 .and. on_root) call io_stopwatch('dis: extract_3', 2)

      womegai1 = womegai1/real(num_kpts, dp)

      ! DEBUG
      ! Orthonormality check
      !         do nkp=1,nkpts
      !           write(*,*) ' '
      !           write(*,'(a8,i4)') 'k-point ',nkp
      !           do l=1,num_wann
      !           do m=1,l
      !             ctmp=czero
      !             do j=1,ndimwin(nkp)
      !               ctmp=ctmp+conjg(u_matrix_opt(j,m,nkp))*u_matrix_opt(j,l,nkp)
      !             enddo
      !             write(*,'(i2,2x,i2,f16.12,1x,f16.12)') l,m,ctmp
      !             if(l.eq.m) then
      !               if(abs(ctmp-cmplx(1.0d0,0.0d0)).gt.1.0e-8) then
      !                 write(*,'(a49,i4)')
      !     1           '*** ERROR *** with iterative subspace at k-point ',
      !     2           nkp
      !                 write(*,*) 'vectors in u_matrix_opt not orthonormal'
      !                 stop
      !               endif
      !             else
      !               if(abs(ctmp).gt.1.0e-8) then
      !                 write(*,'(a49,i4)')
      !     1           '*** ERROR *** with iterative subspace at k-point ',
      !     2           nkp
      !                 write(*,*) 'vectors in u_matrix_opt not orthonormal'
      !                 stop
      !               endif
      !             endif
      !           enddo
      !           enddo
      !         enddo
      ! ENDDEBUG

      ! Compute womegai  using the updated subspaces at all k, i.e.,
      ! replacing (i-1) by (i) in Eq. (12) SMV
      if (timing_level > 1 .and. on_root) call io_stopwatch('dis: extract_4', 1)

      womegai = 0.0_dp
      do nkp_loc = 1, counts(my_node_id)
        nkp = nkp_loc + displs(my_node_id)
        wkomegai = 0.0_dp
        do nn = 1, nntot
          nkp2 = nnlist(nkp, nn)
          call zgemm('C', 'N', num_wann, ndimwin(nkp2), ndimwin(nkp), cmplx_1, &
                     u_matrix_opt(:, :, nkp), num_bands, m_matrix_orig_local(:, :, nn, nkp_loc), num_bands, cmplx_0, &
                     cwb, num_wann)
          call zgemm('N', 'N', num_wann, num_wann, ndimwin(nkp2), cmplx_1, &
                     cwb, num_wann, u_matrix_opt(:, :, nkp2), num_bands, cmplx_0, cww, num_wann)
          rsum = 0.0_dp
          do n = 1, num_wann
            do m = 1, num_wann
              rsum = rsum + real(cww(m, n), dp)**2 + aimag(cww(m, n))**2
            enddo
          enddo
          wkomegai = wkomegai + wb(nn)*rsum
        enddo
        wkomegai = real(num_wann, dp)*wbtot - wkomegai
        womegai = womegai + wkomegai
      enddo

      call comms_allreduce(womegai, 1, 'SUM')

      womegai = womegai/real(num_kpts, dp)
      ! [Loop over k (nkp)]
      if (timing_level > 1 .and. on_root) call io_stopwatch('dis: extract_4', 2)

      delta_womegai = womegai1/womegai - 1.0_dp

      if (on_root) write (stdout, 124) iter, womegai1*lenconfac**2, womegai*lenconfac**2, &
        delta_womegai, io_wallclocktime()

124   format(2x, i6, 3x, f14.8, 3x, f14.8, 6x, es10.3, 2x, f8.2, 4x, '<-- DIS')

      ! Construct the updated Z matrix, CZMAT_OUT, at k points w/ non-frozen s
      do nkp_loc = 1, counts(my_node_id)
        nkp = nkp_loc + displs(my_node_id)
        if (num_wann .gt. ndimfroz(nkp)) call internal_zmatrix(nkp, nkp_loc, czmat_out_loc(:, :, nkp_loc))
      enddo

      if (lsitesymmetry) then
        call comms_gatherv(czmat_out_loc, num_bands*num_bands*counts(my_node_id), &
                           czmat_out, num_bands*num_bands*counts, num_bands*num_bands*displs)
        call comms_bcast(czmat_out(1, 1, 1), num_bands*num_bands*num_kpts)
        call sitesym_symmetrize_zmatrix(czmat_out, lwindow) !RS:
        do nkp_loc = 1, counts(my_node_id)
          nkp = nkp_loc + displs(my_node_id)
          czmat_out_loc(:, :, nkp_loc) = czmat_out(:, :, nkp)
        end do
      end if

      call internal_test_convergence()

      if (dis_converged) then
        if (on_root) write (stdout, '(/13x,a,es10.3,a,i2,a)') &
          '<<<      Delta <', dis_conv_tol, &
          '  over ', dis_conv_window, ' iterations     >>>'
        if (on_root) write (stdout, '(13x,a)') '<<< Disentanglement convergence criteria satisfied >>>'
        exit
      endif

    enddo
    ! [BIG ITERATION LOOP (iter)]

    deallocate (czmat_out, stat=ierr)
    if (ierr /= 0) call io_error('Error deallocating czmat_out in dis_extract')
    deallocate (czmat_in, stat=ierr)
    if (ierr /= 0) call io_error('Error deallocating czmat_in in dis_extract')
    deallocate (czmat_out_loc, stat=ierr)
    if (ierr /= 0) call io_error('Error deallocating czmat_out_loc in dis_extract')
    deallocate (czmat_in_loc, stat=ierr)
    if (ierr /= 0) call io_error('Error deallocating czmat_in_loc in dis_extract')

    if (on_root) then
      allocate (ceamp(num_bands, num_bands, num_kpts), stat=ierr)
      if (ierr /= 0) call io_error('Error allocating ceamp in dis_extract')
      allocate (cham(num_bands, num_bands, num_kpts), stat=ierr)
      if (ierr /= 0) call io_error('Error allocating cham in dis_extract')
    endif

    if (.not. dis_converged) then
      if (on_root) write (stdout, '(/5x,a)') &
        '<<< Warning: Maximum number of disentanglement iterations reached >>>'
      if (on_root) write (stdout, '(10x,a)') '<<< Disentanglement convergence criteria not satisfied >>>'
    endif

    if (index(devel_flag, 'compspace') > 0) then

      if (icompflag .eq. 1) then
        if (iprint > 2) then
          if (on_root) write (stdout, ('(/4x,a)')) &
            'WARNING: Complement subspace has zero dimensions at the following k-points:'
          i = 0
          if (on_root) write (stdout, '(4x)', advance='no')
          do nkp = 1, num_kpts
            if (ndimwin(nkp) .eq. num_wann) then
              i = i + 1
              if (i .le. 12) then
                if (on_root) write (stdout, '(i6)', advance='no') nkp
              else
                i = 1
                if (on_root) write (stdout, '(/4x)', advance='no')
                if (on_root) write (stdout, '(i6)', advance='no') nkp
              endif
            endif
          enddo
        endif
      endif

    endif

    ! Write the final womegai. This should remain unchanged during the
    ! subsequent minimization of Omega_tilde in wannierise.f90
    ! We store it in the checkpoint file as a sanity check
    if (on_root) write (stdout, '(/8x,a,f14.8,a/)') 'Final Omega_I ', &
      womegai*lenconfac**2, ' ('//trim(length_unit)//'^2)'

    ! Set public variable omega_invariant
    omega_invariant = womegai

    ! Currently, this part is not parallelized; thus, we perform the task only on root and then broadcast the result.
    if (on_root) then
      ! DIAGONALIZE THE HAMILTONIAN WITHIN THE OPTIMIZED SUBSPACES
      do nkp = 1, num_kpts

        do j = 1, num_wann
          do i = 1, num_wann
            cham(i, j, nkp) = cmplx_0
            do l = 1, ndimwin(nkp)
              cham(i, j, nkp) = cham(i, j, nkp) + conjg(u_matrix_opt(l, i, nkp)) &
                                *u_matrix_opt(l, j, nkp)*eigval_opt(l, nkp)
            enddo
          enddo
        enddo

        do j = 1, num_wann
          do i = 1, j
            cap(i + ((j - 1)*j)/2) = cham(i, j, nkp)
          enddo
        enddo

        call ZHPEVX('V', 'A', 'U', num_wann, cap, 0.0_dp, 0.0_dp, 0, 0, -1.0_dp, &
                    m, w, cz, num_bands, cwork, rwork, iwork, ifail, info)

        if (info .lt. 0) then
          if (on_root) write (stdout, *) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING HAMILTONIAN'
          if (on_root) write (stdout, *) ' THE ', -info, ' ARGUMENT OF ZHPEVX HAD AN ILLEGAL VALUE'
          call io_error(' dis_extract: error')
        endif
        if (info .gt. 0) then
          if (on_root) write (stdout, *) ' *** ERROR *** ZHPEVX WHILE DIAGONALIZING HAMILTONIAN'
          if (on_root) write (stdout, *) info, 'EIGENVECTORS FAILED TO CONVERGE'
          call io_error(' dis_extract: error')
        endif

        ! Store the energy eigenvalues of the optimal subspace (used in wann_ban
        eigval_opt(1:num_wann, nkp) = w(1:num_wann)

        ! CALCULATE AMPLITUDES OF THE CORRESPONDING ENERGY EIGENVECTORS IN TERMS
        ! THE ORIGINAL ("WINDOW SPACE") ENERGY EIGENVECTORS
        do j = 1, num_wann
          do i = 1, ndimwin(nkp)
            ceamp(i, j, nkp) = cmplx_0
            do l = 1, num_wann
              ceamp(i, j, nkp) = ceamp(i, j, nkp) + cz(l, j)*u_matrix_opt(i, l, nkp)
            enddo
          enddo
        enddo
        ! NKP
      enddo

      ! DEBUG
      if (iprint > 2) then
        if (on_root) write (stdout, '(/,a,/)') '  Eigenvalues inside optimal subspace:'
        do nkp = 1, num_kpts
          if (on_root) write (stdout, '(a,i3,2x,20(f9.5,1x))') '  K-point ', &
            nkp, (eigval_opt(i, nkp), i=1, num_wann)
        enddo
      endif
      ! ENDDEBUG

      ! Replace u_matrix_opt by ceamp. Both span the
      ! same space, but the latter is more convenient for the purpose of obtai
      ! an optimal Fourier-interpolated band structure: see Sec. III.E of SMV.
      if (.not. lsitesymmetry) then                                                                         !YN:
        do nkp = 1, num_kpts
          do j = 1, num_wann
            u_matrix_opt(1:ndimwin(nkp), j, nkp) = ceamp(1:ndimwin(nkp), j, nkp)
          enddo
        enddo
        !else                                                                                                        !YN:
        ! Above is skipped as we require Uopt(Rk) to be related to Uopt(k)                                        !YN: RS:
        !write(stdout,"(a)")  &                                                                                   !YN: RS:
        !  'Note(symmetry-adapted mode): u_matrix_opt are no longer the eigenstates of the subspace Hamiltonian.' !RS:
      endif                                                                                                        !YN:
    endif
    call comms_bcast(eigval_opt(1, 1), num_bands*num_kpts)
    call comms_bcast(u_matrix_opt(1, 1, 1), num_bands*num_wann*num_kpts)

    if (index(devel_flag, 'compspace') > 0) then

      if (icompflag .eq. 1) then
        if (iprint > 2) then
          if (on_root) write (stdout, *) 'AT SOME K-POINT(S) COMPLEMENT SUBSPACE HAS ZERO DIMENSIONALITY'
          if (on_root) write (stdout, *) '=> DID NOT CREATE FILE COMPSPACE.DAT'
        endif
      else
        ! DIAGONALIZE THE HAMILTONIAN IN THE COMPLEMENT SUBSPACE, WRITE THE
        ! CORRESPONDING EIGENFUNCTIONS AND ENERGY EIGENVALUES
        do nkp = 1, num_kpts
          do j = 1, ndimwin(nkp) - num_wann
            do i = 1, ndimwin(nkp) - num_wann
              cham(i, j, nkp) = cmplx_0
              do l = 1, ndimwin(nkp)
                cham(i, j, nkp) = cham(i, j, nkp) + conjg(camp(l, i, nkp)) &
                                  *camp(l, j, nkp)*eigval_opt(l, nkp)
              enddo
            enddo
          enddo
          do j = 1, ndimwin(nkp) - num_wann
            do i = 1, j
              cap(i + ((j - 1)*j)/2) = cham(i, j, nkp)
            enddo
          enddo
          ndiff = ndimwin(nkp) - num_wann
          call ZHPEVX('V', 'A', 'U', ndiff, cap, 0.0_dp, 0.0_dp, 0, 0, &
                      -1.0_dp, m, w, cz, num_bands, cwork, rwork, iwork, ifail, info)
          if (info .lt. 0) then
            if (on_root) write (stdout, *) '*** ERROR *** ZHPEVX WHILE DIAGONALIZING HAMILTONIAN'
            if (on_root) write (stdout, *) 'THE ', -info, ' ARGUMENT OF ZHPEVX HAD AN ILLEGAL VALUE'
            call io_error(' dis_extract: error')
          endif
          if (info .gt. 0) then
            if (on_root) write (stdout, *) '*** ERROR *** ZHPEVX WHILE DIAGONALIZING HAMILTONIAN'
            if (on_root) write (stdout, *) info, 'EIGENVECTORS FAILED TO CONVERGE'
            call io_error(' dis_extract: error')
          endif
          ! CALCULATE AMPLITUDES OF THE ENERGY EIGENVECTORS IN THE COMPLEMENT SUBS
          ! TERMS OF THE ORIGINAL ENERGY EIGENVECTORS
          do j = 1, ndimwin(nkp) - num_wann
            do i = 1, ndimwin(nkp)
              camp(i, j, nkp) = cmplx_0
              do l = 1, ndimwin(nkp) - num_wann
!write(stdout,*) 'i=',i,'   j=',j,'   l=',l
!write(stdout,*) '           camp(i,j,nkp)=',camp(i,j,nkp)
!write(stdout,*) '           cz(l,j)=',cz(l,j)
!write(stdout,*) '           u_matrix_opt(i,l,nkp)=',u_matrix_opt(i,l,nkp)

! aam: 20/10/2006 -- the second dimension of u_matrix_opt is out of bounds (allocated as num_wann)!
! commenting this line out.
!                     camp(i,j,nkp) = camp(i,j,nkp) + cz(l,j) * u_matrix_opt(i,l,nkp)
              enddo
            enddo
          enddo
        enddo   ! [loop over k points (nkp)]

      endif   ! [if icompflag=1]

    endif     ![if(index(devel_flag,'compspace')>0)]

    deallocate (history, stat=ierr)
    if (ierr /= 0) call io_error('Error deallocating history in dis_extract')

    if (on_root) then
      deallocate (cham, stat=ierr)
      if (ierr /= 0) call io_error('Error deallocating cham in dis_extract')
    endif
    if (allocated(camp)) then
      deallocate (camp, stat=ierr)
      if (ierr /= 0) call io_error('Error deallocating camp in dis_extract')
    end if
    if (allocated(camp_loc)) then
      deallocate (camp_loc, stat=ierr)
      if (ierr /= 0) call io_error('Error deallocating camp_loc in dis_extract')
    endif
    if (on_root) then
      deallocate (ceamp, stat=ierr)
      if (ierr /= 0) call io_error('Error deallocating ceamp in dis_extract')
    endif
    deallocate (u_matrix_opt_loc, stat=ierr)
    if (ierr /= 0) call io_error('Error deallocating u_matrix_opt_loc in dis_extract')
    deallocate (wkomegai1_loc, stat=ierr)
    if (ierr /= 0) call io_error('Error deallocating wkomegai1_loc in dis_extract')
    deallocate (wkomegai1, stat=ierr)
    if (ierr /= 0) call io_error('Error deallocating wkomegai1 in dis_extract')

    deallocate (cz, stat=ierr)
    if (ierr /= 0) call io_error('Error deallocating cz in dis_extract')
    deallocate (cwork, stat=ierr)
    if (ierr /= 0) call io_error('Error deallocating cwork in dis_extract')
    deallocate (cap, stat=ierr)
    if (ierr /= 0) call io_error('Error deallocating cap in dis_extract')
    deallocate (rwork, stat=ierr)
    if (ierr /= 0) call io_error('Error deallocating rwork in dis_extract')
    deallocate (w, stat=ierr)
    if (ierr /= 0) call io_error('Error deallocating w in dis_extract')
    deallocate (ifail, stat=ierr)
    if (ierr /= 0) call io_error('Error deallocating ifail in dis_extract')
    deallocate (iwork, stat=ierr)
    if (ierr /= 0) call io_error('Error deallocating iwork in dis_extract')

    deallocate (cbw, stat=ierr)
    if (ierr /= 0) call io_error('Error deallocating cbw in dis_extract')
    deallocate (cww, stat=ierr)
    if (ierr /= 0) call io_error('Error deallocating cww in dis_extract')
    deallocate (cwb, stat=ierr)
    if (ierr /= 0) call io_error('Error deallocating cwb in dis_extract')

    if (on_root) write (stdout, '(1x,a/)') &
      '+----------------------------------------------------------------------------+'

    if (timing_level > 1 .and. on_root) call io_stopwatch('dis: extract', 2)

    return

  contains

    subroutine internal_test_convergence()
      !! Check if we have converged

      implicit none

      integer :: ierr
      real(kind=dp), allocatable :: temp_hist(:)

      allocate (temp_hist(dis_conv_window), stat=ierr)
      if (ierr /= 0) call io_error('Error allocating temp_hist in dis_extract')

      if (iter .le. dis_conv_window) then
        history(iter) = delta_womegai
      else
        temp_hist = eoshift(history, 1, delta_womegai)
        history = temp_hist
      endif

      dis_converged = .false.
      if (iter .ge. dis_conv_window) then
        dis_converged = all(abs(history) .lt. dis_conv_tol)
      endif

      deallocate (temp_hist, stat=ierr)
      if (ierr /= 0) call io_error('Error deallocating temp_hist in dis_extract')

      return

    end subroutine internal_test_convergence

    !==================================================================!
    subroutine internal_zmatrix(nkp, nkp_loc, cmtrx)
      !==================================================================!
      !! Compute the Z-matrix
      !                                                                  !
      !                                                                  !
      !                                                                  !
      !==================================================================!

      implicit none

      integer, intent(in) :: nkp
      integer, intent(in) :: nkp_loc
      !! Which kpoint
      complex(kind=dp), intent(out) :: cmtrx(num_bands, num_bands)
      !! (M,N)-TH ENTRY IN THE (NDIMWIN(NKP)-NDIMFROZ(NKP)) x (NDIMWIN(NKP)-NDIMFRO
      !! HERMITIAN MATRIX AT THE NKP-TH K-POINT

      ! Internal variables
      integer          :: l, m, n, p, q, nn, nkp2, ndimk
      complex(kind=dp) :: csum

      if (timing_level > 1 .and. on_root) call io_stopwatch('dis: extract: zmatrix', 1)

      cmtrx = cmplx_0
      ndimk = ndimwin(nkp) - ndimfroz(nkp)
      do nn = 1, nntot
        nkp2 = nnlist(nkp, nn)
        call zgemm('N', 'N', num_bands, num_wann, ndimwin(nkp2), cmplx_1, &
                   m_matrix_orig_local(:, :, nn, nkp_loc), num_bands, u_matrix_opt(:, :, nkp2), num_bands, &
                   cmplx_0, cbw, num_bands)
        do n = 1, ndimk
          q = indxnfroz(n, nkp)
          do m = 1, n
            p = indxnfroz(m, nkp)
            csum = cmplx_0
            do l = 1, num_wann
              csum = csum + cbw(p, l)*conjg(cbw(q, l))
            enddo
            cmtrx(m, n) = cmtrx(m, n) + cmplx(wb(nn), 0.0_dp, kind=dp)*csum
            cmtrx(n, m) = conjg(cmtrx(m, n))
          enddo
        enddo
      enddo

      if (timing_level > 1 .and. on_root) call io_stopwatch('dis: extract: zmatrix', 2)

      return

    end subroutine internal_zmatrix

!~      !==================================================================!
!~!      function dis_zeig(nkp,m,cmk)
!~      function dis_zeig(nkp,m)
!~      !==================================================================!
!~      !                                                                  !
!~      !                                                                  !
!~      !                                                                  !
!~      !                                                                  !
!~      !==================================================================!
!~
!~        ! Computes <lambda>_mk = sum_{n=1}^N sum_b w_b |<u_{mk}|u_{n,k+b}>|^2
!~        ! [See Eqs. (12) and (17) of SMV]
!~
!~        implicit none
!~
!~        integer, intent(in) :: nkp
!~        integer, intent(in) :: m
!~!        complex(kind=dp), intent(in) :: cmk(num_bands,num_bands,nntot)
!~
!~        ! Internal variables
!~        real(kind=dp) :: dis_zeig
!~        complex(kind=dp) :: cdot_bloch
!~        integer :: n,nn,ndnnx,ndnn,nnsh,nkp2,l,j
!~
!~        dis_zeig=0.0_dp
!~
!~!        do nn=1,nntot
!~!           nkp2=nnlist(nkp,nn)
!~        do n = 1, num_wann
!~           do nn = 1, nntot
!~                 nkp2 = nnlist(nkp,nn)
!~                 ! Dotproduct
!~                 cdot_bloch = cmplx_0
!~                 do l = 1, ndimwin(nkp)
!~                    do j = 1, ndimwin(nkp2)
!~                       cdot_bloch = cdot_bloch + &
!~!                            conjg(u_matrix_opt(l,m,nkp)) * u_matrix_opt(j,n,nkp2) * cmk(l,j,nn)
!~                            conjg(u_matrix_opt(l,m,nkp)) * u_matrix_opt(j,n,nkp2) * m_matrix_orig(l,j,nn,nkp)
!~                    enddo
!~                 enddo
!~                 write(stdout,'(a,4i5,2f15.10)') 'zeig:',nkp,nn,m,n,cdot_bloch
!~!                 call zgemm('C','N',num_wann,ndimwin(nkp2),ndimwin(nkp),cmplx_1,&
!~!                      u_matrix_opt(:,:,nkp),num_bands,m_matrix_orig(:,:,nn,nkp),num_bands,cmplx_0,&
!~!                      cwb,num_wann)
!~!                 call zgemm('N','N',num_wann,num_wann,ndimwin(nkp2),cmplx_1,&
!~!                      cwb,num_wann,u_matrix_opt(:,:,nkp),num_bands,cmplx_0,cww,num_wann)
!~
!~                 dis_zeig = dis_zeig + wb(nn) * abs(cdot_bloch)**2
!~
!~!                 do n=1,num_wann
!~!                    dis_zeig = dis_zeig + wb(nn) * abs(cww(m,n))**2
!~!                 enddo
!~
!~              enddo
!~        enddo
!~
!~        return
!~
!~      end function dis_zeig

  end subroutine dis_extract