!+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
!                                                             
!   glissade_velo_higher.F90 - part of the Community Ice Sheet Model (CISM)  
!                                                              
!+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
!
!   Copyright (C) 2005-2018
!   CISM contributors - see AUTHORS file for list of contributors
!
!   This file is part of CISM.
!
!   CISM is free software: you can redistribute it and/or modify it
!   under the terms of the Lesser GNU General Public License as published
!   by the Free Software Foundation, either version 3 of the License, or
!   (at your option) any later version.
!
!   CISM is distributed in the hope that it will be useful,
!   but WITHOUT ANY WARRANTY; without even the implied warranty of
!   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!   Lesser GNU General Public License for more details.
!
!   You should have received a copy of the Lesser GNU General Public License
!   along with CISM. If not, see <http://www.gnu.org/licenses/>.
!
!+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
!
! This module contains routines for computing the ice velocity using a 
! variational finite-element approach.  It solves the higher-order Blatter-Pattyn
! approximation for Stokes flow, as well as several simpler approximations
! (L1L2, shallow-shelf approximation, and shallow-ice approximation).
!
! See these papers for details:
!
! J.K. Dukowicz, S.F. Price and W.H. Lipscomb, 2010: Consistent
!    approximations and boundary conditions for ice-sheet dynamics
!    using a principle of least action.  J. Glaciology, 56 (197),
!    480-495.
!
! F. Pattyn, 2003: A new three-dimensional higher-order thermomechanical 
!    ice sheet model: Basic sensitivity, ice stream development, and
!    ice flow across subglacial lakes.  J. Geophys. Res., 108 (B8),
!    2382, doi:10.1029/2002JB002329.
!
! M. Perego, M. Gunzburger, and J. Burkardt, 2012: Parallel
!    finite-element implementation for higher-order ice-sheet models.
!    J. Glaciology, 58 (207), 76-88.
!
! Author: William Lipscomb
!         Los Alamos National Laboratory
!         Group T-3, MS B216
!         Los Alamos, NM 87545
!         USA
!         <lipscomb@lanl.gov>
!
! +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 

  module glissade_velo_higher

    use glimmer_global, only: dp
    use glimmer_physcon, only: n_glen, rhoi, rhoo, grav, scyr, pi
    use glimmer_paramets, only: iulog, eps08, eps10, eps11
    use glimmer_paramets, only: velo_scale, len_scale   ! used for whichefvs = HO_EFVS_FLOWFACT
    use glimmer_utils, only: point_diag
    use glimmer_log
    use glimmer_sparse_type
    use glimmer_sparse
    use glissade_grid_operators
    use glissade_masks, only: glissade_get_masks

    use glide_types

    use glissade_velo_higher_slap, only:   &
         slap_preprocess_3d,   slap_preprocess_2d,   &
         slap_postprocess_3d,  slap_postprocess_2d,  &
         slap_compute_residual_vector, slap_solve_test_matrix

    use glissade_velo_higher_pcg, only:   &
         pcg_solver_standard_3d,   pcg_solver_standard_2d,  &
         pcg_solver_chrongear_3d,  pcg_solver_chrongear_2d, &
         matvec_multiply_structured_3d

#ifdef TRILINOS
    use glissade_velo_higher_trilinos, only: &
         trilinos_fill_pattern_3d,     trilinos_fill_pattern_2d,     &
         trilinos_global_id_3d,        trilinos_global_id_2d,        &
         trilinos_assemble_3d,         trilinos_assemble_2d,         &
         trilinos_init_velocity_3d,    trilinos_init_velocity_2d,    &
         trilinos_extract_velocity_3d, trilinos_extract_velocity_2d, &
         trilinos_test
#endif

    use cism_parallel, only: this_rank, main_task, nhalo, tasks, &
         parallel_type, parallel_halo, staggered_parallel_halo, parallel_globalindex, &
         parallel_reduce_max, parallel_reduce_sum, not_parallel

    implicit none

    private
    public :: glissade_velo_higher_init, glissade_velo_higher_solve

    !----------------------------------------------------------------
    ! Here are some definitions:
    !
    ! The horizontal mesh is composed of cells and vertices.
    ! The cells are rectangular with uniform dimensions dx and dy.
    ! Each cell can be extruded to form a column with a specified number of layers.
    ! 
    ! An element is a layer of a cell, and a node is a corner of an element.
    ! Elements and nodes live in 3D space, whereas cells and vertices live in
    !  the horizontal plane.
    !
    ! Locally owned cells have indices (nhalo+1:nx-nhalo, nhalo+1,ny-nhalo).
    ! Locally owned vertices have indices (nhalo+1:nx-nhalo, nhalo+1,ny-nhalo),
    !  except for processors on the west and south edges of the global domain with outflow BC.
    !  For those processors, locally owned vertices have indices (nhalo:nx-nhalo, nhalo,ny-nhalo).
    ! The indices (staggered_ilo:staggered_ihi, staggered_jlo:staggered_jhi)
    !  define the limits of locally owned vertices for the given BC.
    ! Active cells are cells that (1) contain ice and (2) border locally owned vertices.
    ! Active vertices are all vertices of active cells.
    ! Active nodes are all nodes in the columns associated with active vertices.
    !----------------------------------------------------------------

    !----------------------------------------------------------------
    ! Finite element properties
    ! Assume 3D hexahedral elements.
    !----------------------------------------------------------------

    integer, parameter ::        &
       nNodesPerElement_3d = 8,  & ! 8 nodes for hexahedral elements
       nQuadPoints_3d = 8,       & ! number of quadrature points per hexahedral element
                                   ! These live at +- 1/sqrt(3) for reference hexahedron
       nNodeNeighbors_3d = 27      ! number of nearest node neighbors in 3D (including the node itself)

    integer, parameter ::        &
       nNodesPerElement_2d = 4,  & ! 4 nodes for faces of hexahedral elements
       nQuadPoints_2d = 4,       & ! number of quadrature points per element face
                                   ! These live at +- 1/sqrt(3) for reference square
       nNodeNeighbors_2d = 9       ! number of nearest node neighbors in 2D (including the node itself)

    real(dp), parameter ::     &
       rsqrt3 = 1.d0/sqrt(3.d0)    ! for quadrature points
         
    !----------------------------------------------------------------
    ! Arrays used for finite-element calculations
    !
    ! Most integals are done over 3D hexahedral elements.
    ! Surface integrals are done over 2D faces of these elements. 
    !----------------------------------------------------------------

    real(dp), dimension(nNodesPerElement_3d, nQuadPoints_3d) ::   & 
       phi_3d,         &    ! trilinear basis function, evaluated at quad pts
       dphi_dxr_3d,    &    ! dphi/dx for reference hexehedral element, evaluated at quad pts
       dphi_dyr_3d,    &    ! dphi/dy for reference hexahedral element, evaluated at quad pts
       dphi_dzr_3d          ! dphi/dy for reference hexahedral element, evaluated at quad pts

    real(dp), dimension(nNodesPerElement_3d) ::   & 
       phi_3d_ctr,         &! trilinear basis function, evaluated at cell ctr
       dphi_dxr_3d_ctr,    &! dphi/dx for reference hexahedral element, evaluated at cell ctr
       dphi_dyr_3d_ctr,    &! dphi/dy for reference hexahedral element, evaluated at cell ctr
       dphi_dzr_3d_ctr      ! dphi/dz for reference hexahedral element, evaluated at cell ctr

    real(dp), dimension(nQuadPoints_3d) ::  &
       xqp_3d, yqp_3d, zqp_3d,  &! quad pt coordinates in reference element
       wqp_3d                    ! quad pt weights

    real(dp), dimension(nNodesPerElement_2d, nQuadPoints_2d) ::   & 
       phi_2d,         &    ! bilinear basis function, evaluated at quad pts
       dphi_dxr_2d,    &    ! dphi/dx for reference rectangular element, evaluated at quad pts
       dphi_dyr_2d          ! dphi/dy for reference rectangular element, evaluated at quad pts

    real(dp), dimension(nNodesPerElement_2d) ::   & 
       phi_2d_ctr,         &! bilinear basis function, evaluated at cell ctr
       dphi_dxr_2d_ctr,    &! dphi/dx for reference rectangular element, evaluated at cell ctr
       dphi_dyr_2d_ctr      ! dphi/dy for reference rectangular element, evaluated at cell ctr

    real(dp), dimension(nQuadPoints_2d) ::  &
       xqp_2d, yqp_2d, &    ! quad pt coordinates in reference square
       wqp_2d               ! quad pt weights

    integer, dimension(nNodesPerElement_3d, nNodesPerElement_3d) ::  &
       ishift, jshift, kshift   ! matrices describing relative indices of nodes in an element

    integer, dimension(-1:1,-1:1,-1:1) :: &
       indxA_3d              ! maps relative (x,y,z) coordinates to an index between 1 and 27
                             ! index order is (i,j,k)

    integer, dimension(-1:1,-1:1) :: &
       indxA_2d              ! maps relative (x,y) coordinates to an index between 1 and 9
                             ! index order is (i,j)

    real(dp), dimension(3,3) ::  &
       identity3             ! 3 x 3 identity matrix

    real(dp) :: vol0    ! volume scale (m^3), used to scale 3D matrix values

    logical, parameter ::  &
       check_symmetry = .true.   ! if true, then check symmetry of assembled matrix

    ! various options for turning diagnostic prints on and off
    logical :: verbose = .false.
!    logical :: verbose = .true.  
    logical :: verbose_init = .false.   
!    logical :: verbose_init = .true.   
    logical :: verbose_solver = .false.
!    logical :: verbose_solver = .true.
    logical :: verbose_Jac = .false.
!    logical :: verbose_Jac = .true.
    logical :: verbose_residual = .false.
!    logical :: verbose_residual = .true.
    logical :: verbose_state = .false.
!    logical :: verbose_state = .true.
    logical :: verbose_velo = .false.
!    logical :: verbose_velo = .true.
    logical :: verbose_id = .false.
!    logical :: verbose_id = .true.
    logical :: verbose_load = .false.
!    logical :: verbose_load = .true.
    logical :: verbose_shelf = .false.
!    logical :: verbose_shelf = .true.
    logical :: verbose_matrix = .false.
!    logical :: verbose_matrix = .true.
    logical :: verbose_basal = .false.
!    logical :: verbose_basal = .true.
    logical :: verbose_bfric = .false.
!    logical :: verbose_bfric = .true.
    logical :: verbose_trilinos = .false.
!    logical :: verbose_trilinos = .true.
    logical :: verbose_beta = .false.
!    logical :: verbose_beta = .true.
    logical :: verbose_efvs = .false.
!    logical :: verbose_efvs = .true.
    logical :: verbose_tau = .false.
!    logical :: verbose_tau = .true.
    logical :: verbose_gridop = .false.
!    logical :: verbose_gridop= .true.
    logical :: verbose_dirichlet = .false.
!    logical :: verbose_dirichlet= .true.
    logical :: verbose_L1L2 = .false.
!    logical :: verbose_L1L2 = .true.
    logical :: verbose_diva = .false.
!    logical :: verbose_diva = .true.
    logical :: verbose_glp = .false.
!    logical :: verbose_glp = .true.
    logical :: verbose_picard = .false.
!    logical :: verbose_picard = .true.

    integer, parameter :: ktest = 1     ! vertical level of diagnostic point
    integer, parameter :: ptest = 1     ! diagnostic quadrature point

    ! option for writing matrix entries to text files
    logical, parameter :: write_matrix = .false.
!    logical, parameter :: write_matrix = .true.
    character(*), parameter :: matrix_label = 'label_here'  ! choose an appropriate label

    !WHL - debug for efvs
    real(dp), dimension(nNodesPerElement_3d, nQuadPoints_2d) ::   & 
       phi_3d_vav,         &! vertical avg of phi_3d
       dphi_dxr_3d_vav,    &! vertical avg of dphi_dxr_3d
       dphi_dyr_3d_vav,    &! vertical avg of dphi_dyr_3d
       dphi_dzr_3d_vav      ! vertical avg of dphi_dzr_3d

  contains

!****************************************************************************

  subroutine glissade_velo_higher_init

    !----------------------------------------------------------------
    ! Initial calculations for glissade higher-order solver.
    !----------------------------------------------------------------

    integer :: i, j, k, m, n, p
    integer :: pplus
    real(dp) :: xctr, yctr, zctr
    real(dp) :: sumx, sumy, sumz

    !----------------------------------------------------------------
    ! Initialize some time-independent finite element arrays
    !----------------------------------------------------------------

    !----------------------------------------------------------------
    ! Trilinear basis set for reference hexahedron, x=(-1,1), y=(-1,1), z=(-1,1)             
    ! Indexing is counter-clockwise from SW corner, with 1-4 on lower surface
    !  and 5-8 on upper surface
    ! The code uses "phi_3d" to denote these basis functions. 
    !
    ! N1 = (1-x)*(1-y)*(1-z)/8             N4----N3
    ! N2 = (1+x)*(1-y)*(1-z)/8             |     |    Lower layer        
    ! N3 = (1+x)*(1+y)*(1-z)/8             |     |
    ! N4 = (1-x)*(1+y)*(1-z)/8             N1----N2

    ! N5 = (1-x)*(1-y)*(1+z)/8             N8----N7
    ! N6 = (1+x)*(1-y)*(1+z)/8             |     |    Upper layer
    ! N7 = (1+x)*(1+y)*(1+z)/8             |     |
    ! N8 = (1-x)*(1+y)*(1+z)/8             N5----N6
    !----------------------------------------------------------------
   
    ! Set coordinates and weights of quadrature points for reference hexahedral element.
    ! Numbering is counter-clockwise from southwest, lower face (1-4) followed by
    !  upper face (5-8).

    xqp_3d(1) = -rsqrt3; yqp_3d(1) = -rsqrt3; zqp_3d(1) = -rsqrt3
    wqp_3d(1) =  1.d0

    xqp_3d(2) =  rsqrt3; yqp_3d(2) = -rsqrt3; zqp_3d(2) = -rsqrt3
    wqp_3d(2) =  1.d0

    xqp_3d(3) =  rsqrt3; yqp_3d(3) =  rsqrt3; zqp_3d(3) = -rsqrt3
    wqp_3d(3) =  1.d0

    xqp_3d(4) = -rsqrt3; yqp_3d(4) =  rsqrt3; zqp_3d(4) = -rsqrt3
    wqp_3d(4) =  1.d0

    xqp_3d(5) = -rsqrt3; yqp_3d(5) = -rsqrt3; zqp_3d(5) =  rsqrt3
    wqp_3d(5) =  1.d0

    xqp_3d(6) =  rsqrt3; yqp_3d(6) = -rsqrt3; zqp_3d(6) =  rsqrt3
    wqp_3d(6) =  1.d0

    xqp_3d(7) =  rsqrt3; yqp_3d(7) =  rsqrt3; zqp_3d(7) =  rsqrt3
    wqp_3d(7) =  1.d0

    xqp_3d(8) = -rsqrt3; yqp_3d(8) =  rsqrt3; zqp_3d(8) =  rsqrt3
    wqp_3d(8) =  1.d0

    if (verbose_init) then
       write(iulog,*) ' '
       write(iulog,*) 'Hexahedral elements, quad points, x, y, z:'
       sumx = 0.d0; sumy = 0.d0; sumz = 0.d0
       do p = 1, nQuadPoints_3d
          write(iulog,*) p, xqp_3d(p), yqp_3d(p), zqp_3d(p)
          sumx = sumx + xqp_3d(p); sumy = sumy + yqp_3d(p); sumz = sumz + zqp_3d(p)
       enddo
       write(iulog,*) ' '
       write(iulog,*) 'sums:', sumx, sumy, sumz
    endif

    ! Evaluate trilinear basis functions and their derivatives at each quad pt

    do p = 1, nQuadPoints_3d

       phi_3d(1,p) = (1.d0 - xqp_3d(p)) * (1.d0 - yqp_3d(p)) * (1.d0 - zqp_3d(p)) / 8.d0
       phi_3d(2,p) = (1.d0 + xqp_3d(p)) * (1.d0 - yqp_3d(p)) * (1.d0 - zqp_3d(p)) / 8.d0
       phi_3d(3,p) = (1.d0 + xqp_3d(p)) * (1.d0 + yqp_3d(p)) * (1.d0 - zqp_3d(p)) / 8.d0
       phi_3d(4,p) = (1.d0 - xqp_3d(p)) * (1.d0 + yqp_3d(p)) * (1.d0 - zqp_3d(p)) / 8.d0
       phi_3d(5,p) = (1.d0 - xqp_3d(p)) * (1.d0 - yqp_3d(p)) * (1.d0 + zqp_3d(p)) / 8.d0
       phi_3d(6,p) = (1.d0 + xqp_3d(p)) * (1.d0 - yqp_3d(p)) * (1.d0 + zqp_3d(p)) / 8.d0
       phi_3d(7,p) = (1.d0 + xqp_3d(p)) * (1.d0 + yqp_3d(p)) * (1.d0 + zqp_3d(p)) / 8.d0
       phi_3d(8,p) = (1.d0 - xqp_3d(p)) * (1.d0 + yqp_3d(p)) * (1.d0 + zqp_3d(p)) / 8.d0

       dphi_dxr_3d(1,p) = -(1.d0 - yqp_3d(p)) * (1.d0 - zqp_3d(p)) / 8.d0 
       dphi_dxr_3d(2,p) =  (1.d0 - yqp_3d(p)) * (1.d0 - zqp_3d(p)) / 8.d0 
       dphi_dxr_3d(3,p) =  (1.d0 + yqp_3d(p)) * (1.d0 - zqp_3d(p)) / 8.d0 
       dphi_dxr_3d(4,p) = -(1.d0 + yqp_3d(p)) * (1.d0 - zqp_3d(p)) / 8.d0
       dphi_dxr_3d(5,p) = -(1.d0 - yqp_3d(p)) * (1.d0 + zqp_3d(p)) / 8.d0 
       dphi_dxr_3d(6,p) =  (1.d0 - yqp_3d(p)) * (1.d0 + zqp_3d(p)) / 8.d0 
       dphi_dxr_3d(7,p) =  (1.d0 + yqp_3d(p)) * (1.d0 + zqp_3d(p)) / 8.d0 
       dphi_dxr_3d(8,p) = -(1.d0 + yqp_3d(p)) * (1.d0 + zqp_3d(p)) / 8.d0

       dphi_dyr_3d(1,p) = -(1.d0 - xqp_3d(p)) * (1.d0 - zqp_3d(p)) / 8.d0 
       dphi_dyr_3d(2,p) = -(1.d0 + xqp_3d(p)) * (1.d0 - zqp_3d(p)) / 8.d0 
       dphi_dyr_3d(3,p) =  (1.d0 + xqp_3d(p)) * (1.d0 - zqp_3d(p)) / 8.d0 
       dphi_dyr_3d(4,p) =  (1.d0 - xqp_3d(p)) * (1.d0 - zqp_3d(p)) / 8.d0 
       dphi_dyr_3d(5,p) = -(1.d0 - xqp_3d(p)) * (1.d0 + zqp_3d(p)) / 8.d0 
       dphi_dyr_3d(6,p) = -(1.d0 + xqp_3d(p)) * (1.d0 + zqp_3d(p)) / 8.d0 
       dphi_dyr_3d(7,p) =  (1.d0 + xqp_3d(p)) * (1.d0 + zqp_3d(p)) / 8.d0 
       dphi_dyr_3d(8,p) =  (1.d0 - xqp_3d(p)) * (1.d0 + zqp_3d(p)) / 8.d0 

       dphi_dzr_3d(1,p) = -(1.d0 - xqp_3d(p)) * (1.d0 - yqp_3d(p)) / 8.d0 
       dphi_dzr_3d(2,p) = -(1.d0 + xqp_3d(p)) * (1.d0 - yqp_3d(p)) / 8.d0 
       dphi_dzr_3d(3,p) = -(1.d0 + xqp_3d(p)) * (1.d0 + yqp_3d(p)) / 8.d0 
       dphi_dzr_3d(4,p) = -(1.d0 - xqp_3d(p)) * (1.d0 + yqp_3d(p)) / 8.d0 
       dphi_dzr_3d(5,p) =  (1.d0 - xqp_3d(p)) * (1.d0 - yqp_3d(p)) / 8.d0 
       dphi_dzr_3d(6,p) =  (1.d0 + xqp_3d(p)) * (1.d0 - yqp_3d(p)) / 8.d0 
       dphi_dzr_3d(7,p) =  (1.d0 + xqp_3d(p)) * (1.d0 + yqp_3d(p)) / 8.d0 
       dphi_dzr_3d(8,p) =  (1.d0 - xqp_3d(p)) * (1.d0 + yqp_3d(p)) / 8.d0 

       if (verbose_init) then
          write(iulog,*) ' '
          write(iulog,*) 'Quad point, p =', p
          write(iulog,*) 'n, phi_3d, dphi_dxr_3d, dphi_dyr_3d, dphi_dzr_3d:'
          do n = 1, 8
             write(iulog,*) n, phi_3d(n,p), dphi_dxr_3d(n,p), dphi_dyr_3d(n,p), dphi_dzr_3d(n,p)
          enddo
          write(iulog,*) ' '
          write(iulog,*) 'sum(phi_3d)', sum(phi_3d(:,p))  ! verified that sum = 1
          write(iulog,*) 'sum(dphi/dx)', sum(dphi_dxr_3d(:,p))  ! verified that sum = 0 (within roundoff)
          write(iulog,*) 'sum(dphi/dy)', sum(dphi_dyr_3d(:,p))  ! verified that sum = 0 (within roundoff)
          write(iulog,*) 'sum(dphi/dz)', sum(dphi_dzr_3d(:,p))  ! verified that sum = 0 (within roundoff)
       endif

    enddo   ! nQuadPoints_3d

    ! Evaluate trilinear basis functions and their derivatives at cell center
    ! Full formulas are not really needed at (x,y,z) = (0,0,0), but are included for completeness

    xctr = 0.d0
    yctr = 0.d0
    zctr = 0.d0

    phi_3d_ctr(1) = (1.d0 - xctr) * (1.d0 - yctr) * (1.d0 - zctr) / 8.d0
    phi_3d_ctr(2) = (1.d0 + xctr) * (1.d0 - yctr) * (1.d0 - zctr) / 8.d0
    phi_3d_ctr(3) = (1.d0 + xctr) * (1.d0 + yctr) * (1.d0 - zctr) / 8.d0
    phi_3d_ctr(4) = (1.d0 - xctr) * (1.d0 + yctr) * (1.d0 - zctr) / 8.d0
    phi_3d_ctr(5) = (1.d0 - xctr) * (1.d0 - yctr) * (1.d0 + zctr) / 8.d0
    phi_3d_ctr(6) = (1.d0 + xctr) * (1.d0 - yctr) * (1.d0 + zctr) / 8.d0
    phi_3d_ctr(7) = (1.d0 + xctr) * (1.d0 + yctr) * (1.d0 + zctr) / 8.d0
    phi_3d_ctr(8) = (1.d0 - xctr) * (1.d0 + yctr) * (1.d0 + zctr) / 8.d0
    
    dphi_dxr_3d_ctr(1) = -(1.d0 - yctr) * (1.d0 - zctr) / 8.d0 
    dphi_dxr_3d_ctr(2) =  (1.d0 - yctr) * (1.d0 - zctr) / 8.d0 
    dphi_dxr_3d_ctr(3) =  (1.d0 + yctr) * (1.d0 - zctr) / 8.d0 
    dphi_dxr_3d_ctr(4) = -(1.d0 + yctr) * (1.d0 - zctr) / 8.d0
    dphi_dxr_3d_ctr(5) = -(1.d0 - yctr) * (1.d0 + zctr) / 8.d0 
    dphi_dxr_3d_ctr(6) =  (1.d0 - yctr) * (1.d0 + zctr) / 8.d0 
    dphi_dxr_3d_ctr(7) =  (1.d0 + yctr) * (1.d0 + zctr) / 8.d0 
    dphi_dxr_3d_ctr(8) = -(1.d0 + yctr) * (1.d0 + zctr) / 8.d0
    
    dphi_dyr_3d_ctr(1) = -(1.d0 - xctr) * (1.d0 - zctr) / 8.d0 
    dphi_dyr_3d_ctr(2) = -(1.d0 + xctr) * (1.d0 - zctr) / 8.d0 
    dphi_dyr_3d_ctr(3) =  (1.d0 + xctr) * (1.d0 - zctr) / 8.d0 
    dphi_dyr_3d_ctr(4) =  (1.d0 - xctr) * (1.d0 - zctr) / 8.d0 
    dphi_dyr_3d_ctr(5) = -(1.d0 - xctr) * (1.d0 + zctr) / 8.d0 
    dphi_dyr_3d_ctr(6) = -(1.d0 + xctr) * (1.d0 + zctr) / 8.d0 
    dphi_dyr_3d_ctr(7) =  (1.d0 + xctr) * (1.d0 + zctr) / 8.d0 
    dphi_dyr_3d_ctr(8) =  (1.d0 - xctr) * (1.d0 + zctr) / 8.d0 
    
    dphi_dzr_3d_ctr(1) = -(1.d0 - xctr) * (1.d0 - yctr) / 8.d0 
    dphi_dzr_3d_ctr(2) = -(1.d0 + xctr) * (1.d0 - yctr) / 8.d0 
    dphi_dzr_3d_ctr(3) = -(1.d0 + xctr) * (1.d0 + yctr) / 8.d0 
    dphi_dzr_3d_ctr(4) = -(1.d0 - xctr) * (1.d0 + yctr) / 8.d0 
    dphi_dzr_3d_ctr(5) =  (1.d0 - xctr) * (1.d0 - yctr) / 8.d0 
    dphi_dzr_3d_ctr(6) =  (1.d0 + xctr) * (1.d0 - yctr) / 8.d0 
    dphi_dzr_3d_ctr(7) =  (1.d0 + xctr) * (1.d0 + yctr) / 8.d0 
    dphi_dzr_3d_ctr(8) =  (1.d0 - xctr) * (1.d0 + yctr) / 8.d0 

    ! Identity matrix
    identity3(1,:) = (/ 1.d0, 0.d0, 0.d0 /)
    identity3(2,:) = (/ 0.d0, 1.d0, 0.d0 /)
    identity3(3,:) = (/ 0.d0, 0.d0, 1.d0 /)

    ! Initialize some matrices that describe how the i, j and k indices of each node
    ! in each element are related to one another.

    ! The ishift matrix describes how the i indices of the 8 nodes are related to one another.
    ! E.g, if ishift (1,2) = 1, this means that node 2 has an i index
    ! one greater than the i index of node 1.

    ishift(1,:) = (/ 0,  1,  1,  0,  0,  1,  1,  0/)   
    ishift(2,:) = (/-1,  0,  0, -1, -1,  0,  0, -1/)   
    ishift(3,:) = ishift(2,:)
    ishift(4,:) = ishift(1,:)
    ishift(5,:) = ishift(1,:)
    ishift(6,:) = ishift(2,:)
    ishift(7,:) = ishift(2,:)
    ishift(8,:) = ishift(1,:)

    ! The jshift matrix describes how the j indices of the 8 nodes are related to one another.
    ! E.g, if jshift (1,4) = 1, this means that node 4 has a j index
    ! one greater than the j index of node 1.

    jshift(1,:) = (/ 0,  0,  1,  1,  0,  0,  1,  1/)   
    jshift(2,:) = jshift(1,:)
    jshift(3,:) = (/-1, -1,  0,  0, -1, -1,  0,  0/)   
    jshift(4,:) = jshift(3,:)
    jshift(5,:) = jshift(1,:)
    jshift(6,:) = jshift(1,:)
    jshift(7,:) = jshift(3,:)
    jshift(8,:) = jshift(3,:)

    ! The kshift matrix describes how the k indices of the 8 nodes are related to one another.
    ! E.g, if kshift (1,5) = -1, this means that node 5 has a k index
    ! one less than the k index of node 1.  (Assume that k increases downward.)

    kshift(1,:) = (/ 0,  0,  0,  0, -1, -1, -1, -1/)   
    kshift(2,:) = kshift(1,:)
    kshift(3,:) = kshift(1,:)
    kshift(4,:) = kshift(1,:)
    kshift(5,:) = (/ 1,  1,  1,  1,  0,  0,  0,  0/)
    kshift(6,:) = kshift(5,:)
    kshift(7,:) = kshift(5,:)
    kshift(8,:) = kshift(5,:)

    if (verbose_init) then
       write(iulog,*) ' '
       write(iulog,*) 'ishift:'
       do n = 1, 8
          write(iulog,'(8i4)') ishift(n,:)
       enddo
       write(iulog,*) ' '
       write(iulog,*) 'jshift:'
       do n = 1, 8
          write(iulog,'(8i4)') jshift(n,:)
       enddo
       write(iulog,*) ' '
       write(iulog,*) 'kshift:'
       do n = 1, 8
          write(iulog,'(8i4)') kshift(n,:)
       enddo
    endif

    !----------------------------------------------------------------
    ! Bilinear basis set for reference square, x=(-1,1), y=(-1,1)             
    ! Indexing is counter-clockwise from SW corner
    ! The code uses "phi_2d" to denote these basis functions. 
    !
    ! N1 = (1-x)*(1-y)/4             N4----N3
    ! N2 = (1+x)*(1-y)/4             |     |
    ! N3 = (1+x)*(1+y)/4             |     |
    ! N4 = (1-x)*(1+y)/4             N1----N2
    !----------------------------------------------------------------

    ! Set coordinates and weights of quadrature points for reference square.
    ! Numbering is counter-clockwise from southwest

    xqp_2d(1) = -rsqrt3; yqp_2d(1) = -rsqrt3
    wqp_2d(1) =  1.d0

    xqp_2d(2) =  rsqrt3; yqp_2d(2) = -rsqrt3
    wqp_2d(2) =  1.d0

    xqp_2d(3) =  rsqrt3; yqp_2d(3) =  rsqrt3
    wqp_2d(3) =  1.d0

    xqp_2d(4) = -rsqrt3; yqp_2d(4) =  rsqrt3
    wqp_2d(4) =  1.d0

    if (verbose_init) then
       write(iulog,*) ' '
       write(iulog,*) ' '
       write(iulog,*) 'Quadrilateral elements, quad points, x, y:'
       sumx = 0.d0; sumy = 0.d0; sumz = 0.d0
       do p = 1, nQuadPoints_2d
          write(iulog,*) p, xqp_2d(p), yqp_2d(p)
          sumx = sumx + xqp_2d(p); sumy = sumy + yqp_2d(p)
       enddo
       write(iulog,*) ' '
       write(iulog,*) 'sumx, sumy:', sumx, sumy
    endif

    ! Evaluate bilinear basis functions and their derivatives at each quad pt

    do p = 1, nQuadPoints_2d

       phi_2d(1,p) = (1.d0 - xqp_2d(p)) * (1.d0 - yqp_2d(p)) / 4.d0 
       phi_2d(2,p) = (1.d0 + xqp_2d(p)) * (1.d0 - yqp_2d(p)) / 4.d0
       phi_2d(3,p) = (1.d0 + xqp_2d(p)) * (1.d0 + yqp_2d(p)) / 4.d0 
       phi_2d(4,p) = (1.d0 - xqp_2d(p)) * (1.d0 + yqp_2d(p)) / 4.d0

       dphi_dxr_2d(1,p) = -(1.d0 - yqp_2d(p)) / 4.d0
       dphi_dxr_2d(2,p) =  (1.d0 - yqp_2d(p)) / 4.d0
       dphi_dxr_2d(3,p) =  (1.d0 + yqp_2d(p)) / 4.d0
       dphi_dxr_2d(4,p) = -(1.d0 + yqp_2d(p)) / 4.d0

       dphi_dyr_2d(1,p) = -(1.d0 - xqp_2d(p)) / 4.d0 
       dphi_dyr_2d(2,p) = -(1.d0 + xqp_2d(p)) / 4.d0 
       dphi_dyr_2d(3,p) =  (1.d0 + xqp_2d(p)) / 4.d0 
       dphi_dyr_2d(4,p) =  (1.d0 - xqp_2d(p)) / 4.d0 

       if (verbose_init) then
          write(iulog,*) ' '
          write(iulog,*) 'Quad point, p =', p
          write(iulog,*) 'n, phi_2d, dphi_dxr_2d, dphi_dyr_2d:'
          do n = 1, 4
             write(iulog,*) n, phi_2d(n,p), dphi_dxr_2d(n,p), dphi_dyr_2d(n,p)
          enddo
          write(iulog,*) 'sum(phi_2d)', sum(phi_2d(:,p))        ! verified that sum = 1
          write(iulog,*) 'sum(dphi/dx_2d)', sum(dphi_dxr_2d(:,p))  ! verified that sum = 0 (within roundoff)
          write(iulog,*) 'sum(dphi/dy_2d)', sum(dphi_dyr_2d(:,p))  ! verified that sum = 0 (within roundoff)
       endif

    enddo   ! nQuadPoints_2d

    ! Evaluate bilinear basis functions and their derivatives at cell center
    ! Full formulas are not really needed at (x,y) = (0,0), but are included for completeness

    xctr = 0.d0
    yctr = 0.d0

    phi_2d_ctr(1) = (1.d0 - xctr) * (1.d0 - yctr) / 4.d0 
    phi_2d_ctr(2) = (1.d0 + xctr) * (1.d0 - yctr) / 4.d0
    phi_2d_ctr(3) = (1.d0 + xctr) * (1.d0 + yctr) / 4.d0 
    phi_2d_ctr(4) = (1.d0 - xctr) * (1.d0 + yctr) / 4.d0
    
    dphi_dxr_2d_ctr(1) = -(1.d0 - yctr) / 4.d0 
    dphi_dxr_2d_ctr(2) =  (1.d0 - yctr) / 4.d0 
    dphi_dxr_2d_ctr(3) =  (1.d0 + yctr) / 4.d0 
    dphi_dxr_2d_ctr(4) = -(1.d0 + yctr) / 4.d0

    dphi_dyr_2d_ctr(1) = -(1.d0 - xctr) / 4.d0 
    dphi_dyr_2d_ctr(2) = -(1.d0 + xctr) / 4.d0 
    dphi_dyr_2d_ctr(3) =  (1.d0 + xctr) / 4.d0 
    dphi_dyr_2d_ctr(4) =  (1.d0 - xctr) / 4.d0 

    !----------------------------------------------------------------
    ! Compute indxA_3d; maps displacements i,j,k = (-1,0,1) onto an index from 1 to 27
    ! Numbering starts in SW corner of layers k-1, finishes in NE corner of layer k+1
    ! Diagonal term has index 14
    !----------------------------------------------------------------

    ! Layer k-1:           Layer k:            Layer k+1:
    !
    !   7    8    9          16   17   18        25   26   27 
    !   4    5    6          13   14   15        22   23   24
    !   1    2    3          10   11   12        19   20   21                                                                                               

    m = 0
    do k = -1,1
       do j = -1,1
          do i = -1,1
             m = m + 1
             indxA_3d(i,j,k) = m
          enddo
       enddo
    enddo

    !----------------------------------------------------------------
    ! Compute indxA_2d; maps displacements i,j = (-1,0,1) onto an index from 1 to 9
    ! Same as indxA_3d, but for a single layer
    !----------------------------------------------------------------

    m = 0
    do j = -1,1
       do i = -1,1
          m = m + 1
          indxA_2d(i,j) = m
       enddo
    enddo

    !WHL - debug for efvs

    ! Evaluate vertical averages of dphi_dxr_3d, dphi_dyr_3d and dphi_dzr_3d at each 2d quad pts.
    ! Using these instead of the full 3d basis functions can result in similar accuracy with
    !  only half as many QP computations.

    do p = 1, nQuadPoints_2d
       pplus = p + nQuadPoints_3d/2  ! p + 4 for hexahedra
       do n = 1, nNodesPerElement_3d
          phi_3d_vav(n,p) = 0.5d0 * (phi_3d(n,p) + phi_3d(n,pplus))
          dphi_dxr_3d_vav(n,p) = 0.5d0 * (dphi_dxr_3d(n,p) + dphi_dxr_3d(n,pplus))
          dphi_dyr_3d_vav(n,p) = 0.5d0 * (dphi_dyr_3d(n,p) + dphi_dyr_3d(n,pplus))
          dphi_dzr_3d_vav(n,p) = 0.5d0 * (dphi_dzr_3d(n,p) + dphi_dzr_3d(n,pplus))
       enddo
    enddo

  end subroutine glissade_velo_higher_init

!****************************************************************************

  subroutine glissade_velo_higher_solve(model,                &
                                        nx,     ny,     nz)

    !TODO - Remove nx, ny, nz from argument list?
    !       Would then have to allocate many local arrays.

    !----------------------------------------------------------------
    ! Solve the ice sheet flow equations for the horizontal velocity (uvel, vvel)
    !  at each node of each grid cell where ice is present.
    ! The standard solver is based on the Blatter-Pattyn first-order approximation
    !  of Stokes flow (which_ho_approx = HO_APPROX_BP).
    ! There are also options to solve the shallow-ice equations (HO_APPROX_SIA),
    !  shallow-shelf equations (HO_APPROX_SIA), or L1L2 equations (HO_APPROX_L1L2).
    ! Note: The SIA solver does a full matrix solution and is much slower than
    !       the local SIA solver (HO_APPROX_LOCAL_SIA) in glissade_velo_sia.F90.
    !----------------------------------------------------------------

    use glissade_basal_traction, only: glissade_calcbeta
    use glissade_therm, only: glissade_pressure_melting_point
    use glide_thck, only: glide_calclsrf
    use profile, only: t_startf, t_stopf

    !----------------------------------------------------------------
    ! Input-output arguments
    !----------------------------------------------------------------

    type(glide_global_type), intent(inout) :: model   ! derived type holding ice-sheet info

    !----------------------------------------------------------------
    ! Note that the glissade solver uses SI units.
    ! Thus we have grid cell dimensions and ice thickness in meters,
    !  velocity in m/s, and the rate factor in Pa^(-n) s(-1).
    !----------------------------------------------------------------

    !----------------------------------------------------------------
    ! Note: nx and ny are the horizontal dimensions of scalar arrays (e.g., thck and temp).
    !       The velocity arrays have horizontal dimensions (nx-1, ny-1).
    !       nz is the number of levels at which uvel and vvel are computed.
    !       The scalar variables generally live at layer midpoints and have
    !         vertical dimension nz-1.
    !----------------------------------------------------------------

    integer, intent(in) ::   &
       nx, ny,               &  ! number of grid cells in each horizontal direction
       nz                       ! number of vertical levels where velocity is computed
                                ! (same as model%general%upn)
 
    !----------------------------------------------------------------
    ! Local variables and pointers set to components of model derived type 
    !----------------------------------------------------------------

    real(dp) ::  &
       dx,  dy                  ! grid cell length and width (m)
                                ! assumed to have the same value for each grid cell

    integer :: &
       staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid
       staggered_jlo, staggered_jhi

    type(parallel_type) :: parallel    ! info for parallel communication

    real(dp), dimension(:), pointer :: &
       sigma,                 & ! vertical sigma coordinate at layer interfaces, [0,1]
       stagsigma,             & ! staggered vertical sigma coordinate at layer midpoints
       stagwbndsigma            ! stagsigma augmented by sigma = 0 and 1 at upper and lower surfaces

    real(dp)  ::   & 
       dt,                   &  ! time step (s)
       thklim,               &  ! minimum ice thickness for active grounded cells (m)
       thck_gradient_ramp,   &  ! thickness scale over which gradients are ramped up from zero to full value (m)
       max_slope,            &  ! maximum slope allowed for surface gradient computations (unitless)
       eus,                  &  ! eustatic sea level (m), = 0. by default
       efvs_constant,        &  ! constant efvs value (Pa yr) for whichefvs = HO_EFVS_CONSTANT
       effstrain_min,        &  ! minimum value of effective strain rate (yr^-1) for viscosity computation
       pmp_threshold            ! bed is assumed thawed where Tbed >= pmptemp - pmp_threshold (deg C)

    real(dp), dimension(:,:), pointer ::  &
       thck,                 &  ! ice thickness (m)
                                ! Note: When using the subgrid CF scheme, thck => model%calving%thck_effective
                                !       Otherwise, thck => model%geometry%thck
       topg,                 &  ! elevation of topography (m)
       bpmp,                 &  ! pressure melting point temperature (C)
       beta,                 &  ! basal traction parameter (Pa/(m/yr))
       beta_internal,        &  ! beta field weighted by f_ground (such that beta = 0 beneath floating ice)
       bfricflx,             &  ! basal heat flux from friction (W/m^2) 
       f_flotation,          &  ! flotation function = (rhoi*thck) / (-rhoo*(topg-eus)) by default
                                ! used to be f_pattyn = -rhoo*(topg-eus) / (rhoi*thck)
       f_ground,             &  ! grounded ice fraction at vertices, 0 <= f_ground <= 1
       f_ground_cell            ! grounded ice fraction in cells, 0 <= f_ground_cell <= 1

    !TODO - Remove dependence on stagmask?  Currently it is needed for input to calcbeta.
    integer, dimension(:,:), pointer ::   &
       stagmask                 ! mask on staggered grid

    real(dp), dimension(:,:,:), pointer ::  &
       uvel, vvel,  &           ! velocity components (m/yr)
       temp,   &                ! ice temperature (deg C)
       flwa,   &                ! flow factor in units of Pa^(-n) yr^(-1)
       efvs,   &                ! effective viscosity (Pa yr)
       resid_u, resid_v,   &    ! u and v components of residual Ax - b (Pa/m)
       bu, bv                   ! right-hand-side vector b, divided into 2 parts

    real(dp), dimension(:,:), pointer ::  &
       uvel_2d, vvel_2d,       &! 2D velocity field; solution for SSA, L1L2 and DIVA 
       btractx, btracty,       &! components of basal traction (Pa)
       taudx, taudy             ! components of driving stress (Pa)

    real(dp), dimension(:,:,:), pointer ::  &
       tau_xz, tau_yz,         &! vertical components of stress tensor (Pa)
       tau_xx, tau_yy, tau_xy, &! horizontal components of stress tensor (Pa)
       tau_eff                  ! effective stress (Pa)

    integer,  dimension(:,:), pointer ::   &
       kinbcmask,              &! = 1 at vertices where u and v are prescribed from input data (Dirichlet BC), = 0 elsewhere
       umask_no_penetration,   &! = 1 at vertices along east/west global boundary where uvel = 0, = 0 elsewhere
       vmask_no_penetration     ! = 1 at vertices along north/south global boundary where vvel = 0, = 0 elsewhere

    integer ::   &
       whichbabc, &             ! option for basal boundary condition
       whichbeta_limit, &       ! option to limit beta for grounded ice
       whichefvs, &             ! option for effective viscosity calculation 
                                ! (calculate it or make it uniform)
       whichresid, &            ! option for method of calculating residual
       whichsparse, &           ! option for method of doing elliptic solve
                                ! (BiCG, GMRES, standalone Trilinos, etc.)
       whichnonlinear, &        ! option for nonlinear solver
                                ! (standard or accelerated Picard)
       whichapprox, &           ! option for which Stokes approximation to use
                                ! 0 = SIA, 1 = SSA, 2 = Blatter-Pattyn HO, 3 = L1L2
                                ! default = 2
       whichprecond, &          ! option for which preconditioner to use with 
                                !  structured PCG solver
                                ! 0 = none, 1 = diag, 2 = SIA-based
       whichgradient, &         ! option for gradient operator when computing grad(s)
                                ! 0 = centered, 1 = upstream
       whichgradient_margin, &  ! option for computing gradient at ice margin
                                ! 0 = include all neighbor cells in gradient calculation
                                ! 1 = include ice-covered and/or land cells
                                ! 2 = include ice-covered cells only
       whichassemble_beta,  &   ! 0 = standard finite element assembly
                                ! 1 = apply local value of beta at each vertex
       whichassemble_taud,  &   ! 0 = standard finite element assembly
                                ! 1 = apply local value of driving stress at each vertex
       whichassemble_bfric, &   ! 0 = standard finite element assembly
                                ! 1 = apply local value of basal friction at each vertex
       whichassemble_lateral, & ! 0 = standard finite element assembly
                                ! 1 = apply local value of thck and usrf at each marine edge
       whichcalving_front,  &   ! option for subgrid calving front scheme (either on or off)
       whichground,  &          ! option for computing grounded fraction of each cell
       whichflotation_function,&! option for computing flotation function at and near each vertex
       maxiter_nonlinear,    &  ! maximum number of nonlinear iterations
       linear_solve_ncheck,  &  ! number of iterations between convergence checks in the linear solver
       linear_maxiters          ! max number of linear iterations before quitting

    logical ::   &
         diva_slope_correction  ! if true, include a slope correction for the DIVA solver

    real(dp) ::  &
         linear_tolerance       ! tolerance for linear solver

    !--------------------------------------------------------
    ! Local parameters
    !--------------------------------------------------------

    real(dp), parameter :: &
         resid_target = 1.0d-04   ! assume velocity fields have converged below this resid 

    real(dp), parameter :: &
         L2_norm_large = 1.0d20   ! arbitrary large value for initializing the L2 norm

    !--------------------------------------------------------
    ! Local variables
    !--------------------------------------------------------

    real(dp), dimension(nx-1,ny-1) :: &
       xVertex, yVertex,    & ! x and y coordinates of each vertex (m)
       stagusrf,            & ! upper surface averaged to vertices, for active cells (m)
       stagthck,            & ! ice thickness averaged to vertices, for active cells (m)
       stagusrf_lateral,    & ! modified version of stagusrf; does not weight ice-free ocean cells
       stagthck_lateral,    & ! modified version of stagthck; does not weight ice-free ocean cells
       dusrf_dx, dusrf_dy,  & ! gradient of upper surface elevation (m/m)
       ubas, vbas             ! basal ice velocity (m/yr); input to calcbeta 

    integer, dimension(nx,ny) ::     &
       ice_mask,            & ! = 1 for cells where ice is present (thck > thklim)
       floating_mask,       & ! = 1 for cells where ice is present (thck > thklim) and floating
       ocean_mask,          & ! = 1 for cells where topography is below sea level and ice is absent
       land_mask,           & ! = 1 for cells where topography is above sea level
       ice_plus_land_mask     ! = 1 for active ice cells plus ice-free land cells

    real(dp), dimension(nx,ny) ::  &
       lsrf,                & ! lower surface elevation (m)
       usrf                   ! upper surface elevation (m)

    real(dp), dimension(nx-1,ny-1) :: &
       stagbedtemp,         & ! bed temperature averaged to vertices (deg C)
       stagbedpmp             ! bed pmp temperature averaged to vertices (deg C)    

    logical, dimension(nx,ny) ::     &
       active_cell            ! true for active cells (ice_mask = 1 and border locally owned vertices)

    logical, dimension(nx-1,ny-1) :: &
       active_vertex          ! true for vertices of active cells

    real(dp), dimension(nz-1,nx,ny) ::  &
       flwafact               ! temperature-based flow factor, 0.5 * A^(-1/n), 
                              ! used to compute effective viscosity
                              ! units: Pa yr^(1/n)

    real(dp), dimension(nz,nx-1,ny-1) ::   &
       usav, vsav,                 &! previous guess for velocity solution
       loadu, loadv                 ! assembled load vector, divided into 2 parts
                                    ! Note: loadu and loadv are computed only once per nonlinear solve,
                                    !       whereas bu and bv can be set each nonlinear iteration to account 
                                    !       for inhomogeneous Dirichlet BC
  
    integer, dimension(nz,nx-1,ny-1) ::    &
       umask_dirichlet,     & ! Dirichlet mask for u component of velocity, = 1 for prescribed velo, else = 0
       vmask_dirichlet        ! Dirichlet mask for v component of velocity, = 1 for prescribed velo, else = 0

    real(dp) :: &
       resid_velo,          & ! quantity related to velocity convergence
       L2_norm,             & ! L2 norm of residual, |Ax - b|
       L2_target,           & ! nonlinear convergence target for residual
       L2_norm_relative,    & ! L2 norm of residual relative to rhs, |Ax - b| / |b|
       L2_target_relative,  & ! nonlinear convergence target for relative residual
       err,                 & ! solution error from sparse_easy_solve
       outer_it_criterion,  & ! current value of outer (nonlinear) loop converence criterion
       outer_it_target        ! target value for outer-loop convergence

    logical, save ::    &
       converged_soln = .false.    ! true if we get a converged solution for velocity

    integer ::  & 
       counter,         & ! outer (nonlinear) iteration counter
       niters             ! linear iteration count

    integer :: nNonzeros  ! number of nonzero matrix entries

    ! The following large matrix arrays are allocated for a 3D solve (SIA or BP)

    real(dp), dimension(:,:,:,:), allocatable ::  &
       Auu, Auv,    &     ! assembled stiffness matrix, divided into 4 parts
       Avu, Avv           ! 1st dimension = 27 (node and its nearest neighbors in x, y and z direction) 
                          ! other dimensions = (k,i,j)

    ! The following are used for the SLAP and Trilinos solvers

    integer ::            &
       nNodesSolve        ! number of nodes where we solve for velocity

    integer, dimension(nz,nx-1,ny-1) ::  &
       nodeID             ! local ID for each node where we solve for velocity
                          ! For periodic BCs (as in ISMIP-HOM), halo node IDs will be copied
                          !  from the other side of the grid

    integer, dimension((nx-1)*(ny-1)*nz) ::   &
       iNodeIndex, jNodeIndex, kNodeIndex   ! i, j and k indices of nodes

    ! The following are used for the Trilinos solver only

    integer, dimension(nx-1,ny-1) ::  &
       global_vertex_id    ! unique global IDs for vertices on this processor

    integer, dimension(nz,nx-1,ny-1) ::  &
       global_node_id      ! unique global IDs for nodes on this processor

    integer, dimension(:), allocatable ::    &
       active_owned_unknown_map    ! maps owned active unknowns (u and v at each active node) to global IDs

    logical, dimension(:,:,:,:), allocatable ::  &
       Afill               ! true wherever the matrix value is potentially nonzero

    real(dp), dimension(:), allocatable ::   &
       velocityResult     ! velocity solution vector from Trilinos

    ! The following are used for the SLAP solver only

    type(sparse_matrix_type) ::  &
       matrix             ! sparse matrix for SLAP solver, defined in glimmer_sparse_types
                          ! includes nonzeroes, order, col, row, val 

    real(dp), dimension(:), allocatable ::   &   ! for SLAP solver
       rhs,             & ! right-hand-side (b) in Ax = b
       answer,          & ! answer (x) in Ax = b
       resid_vec          ! residual vector Ax - b

    integer ::          &
       matrix_order,    & ! order of matrix = number of rows
       max_nonzeros       ! upper bound for number of nonzero entries in sparse matrix

    ! The following arrays are used for a 2D matrix solve (SSA, L1L2 or DIVA)

    logical ::  &
       solve_2d           ! if true, solve a 2D matrix)
                          ! else solve a 3D matrix (SIA, BP)

    integer ::            &
       nVerticesSolve     ! number of vertices where we solve for velocity

    integer, dimension(nx-1,ny-1) ::  &
       vertexID           ! local ID for each vertex where we solve for velocity (in 2d)
    
    integer, dimension((nx-1)*(ny-1)) ::   &
       iVertexIndex, jVertexIndex   ! i and j indices of vertices

    real(dp), dimension(:,:,:), allocatable ::  &
       Auu_2d, Auv_2d,   &! assembled stiffness matrix, divided into 4 parts
       Avu_2d, Avv_2d     ! 3rd dimension = 9 (node and its nearest neighbors in x and y direction)
                          ! 1st and 2nd dimensions = (i,j)

    real(dp), dimension(:,:), allocatable ::  &
       bu_2d, bv_2d,     &! right-hand-side vector b, divided into 2 parts
       loadu_2d, loadv_2d ! assembled load vector, divided into 2 parts

    real(dp), dimension(:,:), allocatable ::  &
       usav_2d, vsav_2d

    real(dp), dimension(:,:), allocatable ::  &
       resid_u_2d, resid_v_2d   ! components of 2D solution residual

    logical, dimension(:,:,:), allocatable ::  &
       Afill_2d           ! true wherever the matrix value is potentially nonzero
                          ! 2D Trilinos only

    ! The following are used for the depth-integrated viscosity solve.
    ! Note: To support slope corrections, DIVA now has two version of beta_eff:
    !       one in the x direction and one in the y direction
    real(dp), dimension(:,:), allocatable :: &
       beta_eff_x,         & ! effective beta, defined by Goldberg (2011) eq. 41
       beta_eff_y,         & ! beta*u_b = beta_eff*u_av
       omega,              & ! double integral, defined by Goldberg (2011) eq. 35
                             ! Note: omega here is equal to Goldberg's omega/H
       stag_omega            ! omega interpolated to staggered grid

    real(dp), dimension(:,:,:), allocatable :: &
       omega_k,             &! single integral, defined by Goldberg (2011) eq. 32
       stag_omega_k          ! omega_k interpolated to staggered grid

    real(dp), dimension(:,:,:,:), allocatable :: &
       efvs_qp_3d            ! effective viscosity at each QP of each layer of each cell

    integer, parameter :: &
       diva_level_index = 0  ! level for which the DIVA scheme finds the 2D velocity
                             ! 0 = mean, 1 = upper surface
                             ! Results are not very sensitive to this choice                     

    real(dp), dimension(nx,ny) ::   &
       z_mean,             & ! mean of surface and basal elevation (m)
       theta_slope,        & ! slope angle (radians) based on surface and bed slopes
       theta_slope_x,      & ! slope angle (radians) in x direction based on surface and bed slopes
       theta_slope_y,      & ! slope angle (radians) in y direction based on surface and bed slopes
       diva_slope_factor_x,& ! correction factor for DIVA, based on theta_slope_x
       diva_slope_factor_y   ! correction factor for DIVA, based on theta_slope_y

    ! staggered versions of the fields above
    real(dp), dimension(nx-1,ny-1) ::  &
         stag_theta_slope, &
         stag_theta_slope_x, &
         stag_theta_slope_y, &
         stag_diva_slope_factor_x, &
         stag_diva_slope_factor_y

    real(dp) :: dsigma
    real(dp) :: maxbeta, minbeta
    integer :: i, j, k, m, n, p, r
    integer :: iA, jA, kA
    real(dp) :: maxthck, maxusrf
    logical, parameter :: test_matrix = .false.
!    logical, parameter :: test_matrix = .true.
    integer, parameter :: test_order = 4

    ! for trilinos test problem
    logical, parameter :: test_trilinos = .false.
!    logical, parameter :: test_trilinos = .true.

    ! for diagnostic prints
    integer, parameter :: xmax_print = 20

    !WHL - The remaining variables (through Auu_sav) are used by the accelerated Picard solver

    logical :: accel_picard           ! if true, do Picard acceleration; based on whichnonlinear
    real(dp) :: alpha_accel           ! factor for extending the vector (duvel, dvvel) to reduce the residual
    real(dp) :: L2_norm_alpha_sav     ! value of L2 norm of residual, given the previous alpha_accel
    logical :: assembly_is_done       ! true when the final assembled matrix is formed, based on the best value of alpha_accel

    real(dp), parameter :: &
         gamma_accel = 0.40d0         ! how much to increase alpha_accel for each attempt to extend the solution vector

    real(dp), parameter :: &
         alpha_accel_max = 3.0d0      ! max allowed value of alpha_accel

    real(dp), parameter :: &
         resid_reduction_threshold = 0.85d0  ! threshold for deciding whether to increase alpha_accel again

    ! for an accelerated 2D solve:
    real(dp), dimension(:,:), allocatable :: &
         uvel_2d_old, vvel_2d_old,  & ! velocity solution from previous nonlinear iteration
         duvel_2d, dvvel_2d,        & ! difference between current and previous velocity solutions
         uvel_2d_sav, vvel_2d_sav,  & ! current best value for velocity solution (smallest residual)
         beta_internal_sav            ! beta_internal associated with saved velocity

    real(dp), dimension(:,:,:), allocatable :: &
         Auu_2d_sav, Auv_2d_sav,    & ! assembled matrices associated with (uvel_2d_sav, vvel_2d_sav)
         Avu_2d_sav, Avv_2d_sav

    ! for an accelerated 3D solve:
    real(dp), dimension(:,:,:), allocatable :: &
         uvel_old, vvel_old,        & ! velocity solution from previous nonlinear iteration
         duvel, dvvel,              & ! difference between current and previous velocity solutions
         uvel_sav, vvel_sav           ! current best value for velocity solution (smallest residual)

    real(dp), dimension(:,:,:,:), allocatable :: &
         Auu_sav, Auv_sav,          & ! assembled matrices associated with (uvel_2d_sav, vvel_2d_sav)
         Avu_sav, Avv_sav

    integer :: itest, jtest    ! coordinates of diagnostic point
    integer :: rtest           ! task number for processor containing diagnostic point

    call t_startf('glissade_vhs_init')
    rtest = -999
    itest = 1
    jtest = 1
    if (this_rank == model%numerics%rdiag_local) then
       rtest = model%numerics%rdiag_local
       itest = model%numerics%idiag_local
       jtest = model%numerics%jdiag_local
    endif

    if (verbose .and. this_rank==rtest) then
       write(iulog,*) 'In glissade_velo_higher_solve'
       write(iulog,*) 'rank, itest, jtest, ktest =', rtest, itest, jtest, ktest
    endif

#ifdef TRILINOS
    if (test_trilinos) then
       call trilinos_test
       stop
    endif
#endif

    !--------------------------------------------------------
    ! Assign local pointers and variables to derived type components
    !--------------------------------------------------------

!    nx = model%general%ewn   ! currently passed in
!    ny = model%general%nsn
!    nz = model%general%upn

     dx = model%numerics%dew
     dy = model%numerics%dns

     parallel = model%parallel
     staggered_ilo = parallel%staggered_ilo
     staggered_ihi = parallel%staggered_ihi
     staggered_jlo = parallel%staggered_jlo
     staggered_jhi = parallel%staggered_jhi

     !TODO - Remove (:), (:,:) and (:,:,:) from pointer targets?

     !Note: If running with the subgrid CF scheme, thck points to calving%thck_effective
     !       instead of geometry%thck.  For partial_cf cells, thck_effective > thck.
     !      The goal is to compute velocities appropriate for a subgrid calving front,
     !       instead of a full cell with unrealistically thin ice.
     !      Instead of pointing to model%geometry%usrf, compute a local value of usrf
     !       that is consistent with the local value of thck.

     if (model%options%which_ho_calving_front == HO_CALVING_FRONT_SUBGRID) then
        thck  => model%calving%thck_effective(:,:)
     else
        thck  => model%geometry%thck(:,:)
     endif
     topg     => model%geometry%topg(:,:)
     sigma    => model%numerics%sigma(:)
     stagsigma=> model%numerics%stagsigma(:)
     stagwbndsigma=> model%numerics%stagwbndsigma(:)
     stagmask => model%geometry%stagmask(:,:)
     f_ground => model%geometry%f_ground(:,:)
     f_ground_cell => model%geometry%f_ground_cell(:,:)
     f_flotation => model%geometry%f_flotation(:,:)

     temp     => model%temper%temp
     flwa     => model%temper%flwa(:,:,:)
     efvs     => model%stress%efvs(:,:,:)
     beta     => model%velocity%beta(:,:)
     beta_internal => model%velocity%beta_internal(:,:)
     bfricflx => model%temper%bfricflx(:,:)
     bpmp     => model%temper%bpmp(:,:)

     uvel     => model%velocity%uvel(:,:,:)
     vvel     => model%velocity%vvel(:,:,:)
     uvel_2d  => model%velocity%uvel_2d(:,:)
     vvel_2d  => model%velocity%vvel_2d(:,:)
     resid_u  => model%velocity%resid_u(:,:,:)
     resid_v  => model%velocity%resid_v(:,:,:)
     bu       => model%velocity%rhs_u(:,:,:)
     bv       => model%velocity%rhs_v(:,:,:)

     btractx  => model%stress%btractx(:,:)
     btracty  => model%stress%btracty(:,:)
     taudx    => model%stress%taudx(:,:)
     taudy    => model%stress%taudy(:,:)
     tau_xz   => model%stress%tau%xz(:,:,:)
     tau_yz   => model%stress%tau%yz(:,:,:)
     tau_xx   => model%stress%tau%xx(:,:,:)
     tau_yy   => model%stress%tau%yy(:,:,:)
     tau_xy   => model%stress%tau%xy(:,:,:)
     tau_eff  => model%stress%tau%scalar(:,:,:)

     kinbcmask => model%velocity%kinbcmask(:,:)
     umask_no_penetration => model%velocity%umask_no_penetration(:,:)
     vmask_no_penetration => model%velocity%vmask_no_penetration(:,:)

     dt = model%numerics%dt
     thklim = model%numerics%thklim
     thck_gradient_ramp  = model%numerics%thck_gradient_ramp
     max_slope = model%paramets%max_slope
     eus = model%climate%eus
     efvs_constant = model%paramets%efvs_constant
     effstrain_min = model%paramets%effstrain_min
     pmp_threshold = model%temper%pmp_threshold

     whichbabc            = model%options%which_ho_babc
     whichbeta_limit      = model%options%which_ho_beta_limit
     whichefvs            = model%options%which_ho_efvs
     whichresid           = model%options%which_ho_resid
     whichsparse          = model%options%which_ho_sparse
     whichnonlinear       = model%options%which_ho_nonlinear
     whichapprox          = model%options%which_ho_approx
     whichprecond         = model%options%which_ho_precond
     maxiter_nonlinear    = model%options%glissade_maxiter
     linear_solve_ncheck  = model%options%linear_solve_ncheck
     linear_maxiters      = model%options%linear_maxiters
     linear_tolerance     = model%options%linear_tolerance
     whichgradient        = model%options%which_ho_gradient
     whichgradient_margin = model%options%which_ho_gradient_margin
     whichassemble_beta   = model%options%which_ho_assemble_beta
     whichassemble_taud   = model%options%which_ho_assemble_taud
     whichassemble_bfric  = model%options%which_ho_assemble_bfric
     whichassemble_lateral= model%options%which_ho_assemble_lateral
     whichcalving_front   = model%options%which_ho_calving_front
     whichground          = model%options%which_ho_ground
     whichflotation_function = model%options%which_ho_flotation_function
     diva_slope_correction= model%options%diva_slope_correction

    !--------------------------------------------------------
    ! Convert input variables to appropriate units for this solver.
    ! (Mainly SI, except that time units in flwa, velocities,
    !  and beta are years instead of seconds)
    !--------------------------------------------------------

    !TODO: Do not scale topg and eus, since we would like these fields
    !       to remain unchanged (BFB) throughout the simulation,
    !       unless isostasy is turned on.
    !      In the long run, remove the scale factors.

!pw call t_startf('glissade_velo_higher_scale_input')
    call glissade_velo_higher_scale_input(dx,      dy,            &
                                          whichcalving_front,     &
                                          thck,                   &
                                          topg,    eus,           &
                                          thklim,                 &
                                          thck_gradient_ramp,     &
                                          flwa,    efvs,          &
                                          btractx, btracty,       &
                                          uvel,    vvel,          &
                                          uvel_2d, vvel_2d)
!pw call t_stopf('glissade_velo_higher_scale_input')

    ! Now that thck and topg have the desired scaling (m), compute lsrf and usrf.
    ! Note: If using a subgrid calving scheme, these will be based on effective thickness.
    !       Will be recomputed based on the true thickness later in the diagnostic solve.
    call glide_calclsrf(thck, topg, eus, lsrf)
    usrf = max(0.d0, thck + lsrf)

    ! Set volume scale
    ! This is not strictly necessary, but dividing by this scale gives matrix coefficients 
    !  that are ~1.

    vol0  = 1.0d9    ! volume scale (m^3)

    if (whichapprox == HO_APPROX_SIA) then   ! SIA
       if (verbose_solver .and. main_task) write(iulog,*) 'Solving shallow-ice approximation'
    elseif (whichapprox == HO_APPROX_SSA) then  ! SSA
       if (verbose_solver .and. main_task) write(iulog,*) 'Solving shallow-shelf approximation'
    elseif (whichapprox == HO_APPROX_L1L2) then  ! L1L2
       if (verbose_solver .and. main_task) write(iulog,*) 'Solving depth-integrated L1L2 approximation'
    elseif (whichapprox == HO_APPROX_DIVA) then  ! DIVA, based on Goldberg (2011)
       if (verbose_solver .and. main_task) write(iulog,*) 'Solving depth-integrated viscosity approximation'
    else   ! Blatter-Pattyn higher-order 
       if (verbose_solver .and. main_task) write(iulog,*) 'Solving Blatter-Pattyn higher-order approximation'
    endif

    if (whichapprox==HO_APPROX_SSA .or. whichapprox==HO_APPROX_L1L2 .or. whichapprox==HO_APPROX_DIVA) then
       solve_2d = .true.
    else   ! 3D solve
       solve_2d = .false.
    endif

    if (whichnonlinear == HO_NONLIN_PICARD_ACCEL) then
       accel_picard = .true.
       if ((verbose_solver .or. verbose_picard) .and. main_task) write(iulog,*) 'Running with Picard acceleration'
    else
       accel_picard = .false.
       if ((verbose_solver .or. verbose_picard) .and. main_task) write(iulog,*) 'Running standard Picard'
    endif

    if (solve_2d) then
       ! allocate arrays needed for a 2D solve
       allocate(Auu_2d(nx-1,ny-1,nNodeNeighbors_2d))
       allocate(Auv_2d(nx-1,ny-1,nNodeNeighbors_2d))
       allocate(Avu_2d(nx-1,ny-1,nNodeNeighbors_2d))
       allocate(Avv_2d(nx-1,ny-1,nNodeNeighbors_2d))
       allocate(bu_2d(nx-1,ny-1))
       allocate(bv_2d(nx-1,ny-1))
       allocate(loadu_2d(nx-1,ny-1))
       allocate(loadv_2d(nx-1,ny-1))
       allocate(usav_2d(nx-1,ny-1))
       allocate(vsav_2d(nx-1,ny-1))
       allocate(resid_u_2d(nx-1,ny-1))
       allocate(resid_v_2d(nx-1,ny-1))
       if (accel_picard) then
          allocate(uvel_2d_old(nx-1,ny-1))
          allocate(vvel_2d_old(nx-1,ny-1))
          allocate(duvel_2d(nx-1,ny-1))
          allocate(dvvel_2d(nx-1,ny-1))
          allocate(uvel_2d_sav(nx-1,ny-1))
          allocate(vvel_2d_sav(nx-1,ny-1))
          allocate(Auu_2d_sav(nx-1,ny-1,nNodeNeighbors_2d))
          allocate(Auv_2d_sav(nx-1,ny-1,nNodeNeighbors_2d))
          allocate(Avu_2d_sav(nx-1,ny-1,nNodeNeighbors_2d))
          allocate(Avv_2d_sav(nx-1,ny-1,nNodeNeighbors_2d))
          allocate(beta_internal_sav(nx-1,ny-1))
       endif
    else
       allocate(Auu(nNodeNeighbors_3d,nz,nx-1,ny-1))
       allocate(Auv(nNodeNeighbors_3d,nz,nx-1,ny-1))
       allocate(Avu(nNodeNeighbors_3d,nz,nx-1,ny-1))
       allocate(Avv(nNodeNeighbors_3d,nz,nx-1,ny-1))
       if (accel_picard) then
          allocate(uvel_old(nz,nx-1,ny-1))
          allocate(vvel_old(nz,nx-1,ny-1))
          allocate(duvel(nz,nx-1,ny-1))
          allocate(dvvel(nz,nx-1,ny-1))
          allocate(uvel_sav(nz,nx-1,ny-1))
          allocate(vvel_sav(nz,nx-1,ny-1))
          allocate(Auu_sav(nNodeNeighbors_3d,nz,nx-1,ny-1))
          allocate(Auv_sav(nNodeNeighbors_3d,nz,nx-1,ny-1))
          allocate(Avu_sav(nNodeNeighbors_3d,nz,nx-1,ny-1))
          allocate(Avv_sav(nNodeNeighbors_3d,nz,nx-1,ny-1))
          allocate(beta_internal_sav(nx-1,ny-1))
       endif
    endif

    if (whichapprox == HO_APPROX_DIVA) then
!!       call parallel_halo(efvs, parallel)   ! efvs halo update is in glissade_diagnostic_variable_solve
       allocate(beta_eff_x(nx-1,ny-1))
       allocate(beta_eff_y(nx-1,ny-1))
       allocate(omega(nx,ny))
       allocate(omega_k(nz,nx,ny))
       allocate(stag_omega(nx-1,ny-1))
       allocate(stag_omega_k(nz,nx-1,ny-1))
       allocate(efvs_qp_3d(nz-1,nQuadPoints_2d,nx,ny))
       beta_eff_x(:,:) = 0.d0
       beta_eff_y(:,:) = 0.d0
       omega(:,:) = 0.d0
       omega_k(:,:,:) = 0.d0
       stag_omega(:,:) = 0.d0
       stag_omega_k(:,:,:) = 0.d0
       ! Note: Initializing efvs_qp as efvs is a reasonable first guess that allows us to
       !       write efvs to the restart file instead of efvs_qp (which is 4x larger).
       do p = 1, nQuadPoints_2d
          efvs_qp_3d(:,p,:,:) = efvs(:,:,:)
       enddo
    endif

    if (whichapprox /= HO_APPROX_DIVA) then
       ! Set the 2D velocity to the velocity at the bed
       ! Note: For L1L2 and SSA, this is the 2D velocity solution from the previous solve.
       !       For DIVA, the velocity solution from the previous solve is typically the
       !        mean velocity, which cannot be extracted exactly from the 3D velocity field
       !        and must be stored in a separate array.
       uvel_2d(:,:) = uvel(nz,:,:)
       vvel_2d(:,:) = vvel(nz,:,:)
    endif

    if (test_matrix) then
       if (whichsparse <= HO_SPARSE_GMRES) then   ! this test works for SLAP solver only
          call slap_solve_test_matrix(test_order, whichsparse)
       else
          write(iulog,*) 'Invalid value for whichsparse with test_matrix subroutine'
          stop
       endif
    endif

    ! Make sure that the geometry and flow factor are correct in halo cells.
    ! These calls are commented out, since the halo updates are done in 
    !  module glissade.F90, before calling glissade_velo_higher_solve.

!    call parallel_halo(thck, parallel)
!    call parallel_halo(topg, parallel)
!    call parallel_halo(usrf, parallel)
!    call parallel_halo(flwa, parallel)

    !------------------------------------------------------------------------------
    ! Setup for higher-order solver: Compute nodal geometry, allocate storage, etc.
    ! These are quantities that do not change during the outer nonlinear loop. 
    !------------------------------------------------------------------------------

    if (verbose_state) then
       maxthck = maxval(thck(:,:))
       maxthck = parallel_reduce_max(maxthck)
       maxusrf = maxval(usrf(:,:))
       maxusrf = parallel_reduce_max(maxusrf)

       if (this_rank==rtest) then
          write(iulog,*) ' '
          write(iulog,*) 'nx, ny, nz:', nx, ny, nz
          write(iulog,*) 'vol0:', vol0
          write(iulog,*) 'thklim:', thklim
          write(iulog,*) 'max thck, usrf:', maxthck, maxusrf
          write(iulog,*) 'sigma coordinate:'
          do k = 1, nz
             write(iulog,*) k, sigma(k)
          enddo
       endif

       call point_diag(usrf, 'usrf (m)', itest, jtest, rtest, 7, 7)
       call point_diag(thck, 'thck (m)', itest, jtest, rtest, 7, 7)
       call point_diag(topg, 'topg (m)', itest, jtest, rtest, 7, 7)
       call point_diag(uvel(1,:,:), 'sfc uvel (m/yr)', itest, jtest, rtest, 7, 7)
       call point_diag(vvel(1,:,:), 'sfc vvel (m/yr)', itest, jtest, rtest, 7, 7)
       call point_diag(flwa(1,:,:), 'flwa (Pa-3 yr_1), k = 1', itest, jtest, rtest, 7, 7, '(e12.5)')
    endif      ! verbose_state
 
    !------------------------------------------------------------------------------
    ! Specify Dirichlet boundary conditions (prescribed uvel and vvel)
    !------------------------------------------------------------------------------

    ! initialize
    umask_dirichlet(:,:,:) = 0 
    vmask_dirichlet(:,:,:) = 0   

    ! Set the Dirichlet mask at the bed for no-slip BCs.
    if (whichbabc == HO_BABC_NO_SLIP .and. whichapprox /= HO_APPROX_DIVA) then
       ! Impose zero sliding everywhere at the bed
       ! Note: For the DIVA case, this BC is handled by setting beta_eff = 1/omega
       !TODO - Allow application of no-slip BC at selected basal nodes instead of all nodes?
       umask_dirichlet(nz,:,:) = 1    ! u = v = 0 at bed
       vmask_dirichlet(nz,:,:) = 1
    endif
       
    ! Set mask in columns identified in kinbcmask, typically read from file at initialization.
    ! Note: Assuming there is no vertical shear at these points, the bed velocity is the same
    !       as the velocity throughout the column.  This allows us to use the 3D umask_dirichlet
    !       and vmask_dirichlet with a 2D solver.
    ! TODO: Support Dirichlet condition with vertical shear for L1L2 and DIVA?
    !
    ! For a no-penetration global BC, set umask_dirichlet = 0 and uvel = 0.d0 along east/west global boundaries,
    !  and set vmask_dirichlet = 0 and vvel = 0.d0 along north/south global boundaries (based on umask_no_penetration
    !  and vmask_no_penetration, which are computed at initialization). 
    !
    ! For a 2D solve, initialize uvel_2d and vvel_2d at Dirichlet points to the bed velocity.

    do j = 1, ny-1
       do i = 1, nx-1

          ! if kinbcmask = 1, set Dirichlet masks for both uvel and vvel
          if (kinbcmask(i,j) == 1) then
             umask_dirichlet(:,i,j) = 1
             vmask_dirichlet(:,i,j) = 1
             if (solve_2d) then
                uvel_2d(i,j) = uvel(nz,i,j)
                vvel_2d(i,j) = vvel(nz,i,j)
             endif
          endif

          ! for the no-penetration global BC, prescribe zero outflow velocities
          ! (v = 0 at N/S boundaries, u = 0 at E/W boundaries)
          ! for other global BCs (periodic and outflow), umask_no_penetration = vmask_no_penetration = 0 everywhere

          if (umask_no_penetration(i,j) == 1) then
             umask_dirichlet(:,i,j) = 1
             uvel(:,i,j) = 0.d0
             if (solve_2d) uvel_2d(i,j) = 0.d0
          endif

          if (vmask_no_penetration(i,j) == 1) then
             vmask_dirichlet(:,i,j) = 1
             vvel(:,i,j) = 0.d0
             if (solve_2d) vvel_2d(i,j) = 0.d0
          endif

       enddo
    enddo

    !Note: The following halo updates are not needed here, provided that kinbcmask,
    !      umask_no_penetration and vmask_no_penetration receive halo updates
    !      (as done in glissade_initialise)
!    call staggered_parallel_halo(umask_dirichlet, parallel)
!    call staggered_parallel_halo(vmask_dirichlet, parallel)

    if (verbose_dirichlet) then
       call point_diag(kinbcmask, 'kinbcmask', itest, jtest, rtest, 7, 7, 'i6')
       call point_diag(umask_no_penetration, 'umask_no_pen', itest, jtest, rtest, 7, 7, 'i6')
       call point_diag(vmask_no_penetration, 'vmask_no_pen', itest, jtest, rtest, 7, 7, 'i6')
       call point_diag(umask_dirichlet(1,:,:), 'umask_dirichlet', itest, jtest, rtest, 7, 7, 'i6')
       call point_diag(vmask_dirichlet(1,:,:), 'vmask_dirichlet', itest, jtest, rtest, 7, 7, 'i6')
       call point_diag(uvel(1,:,:), 'uvel (m/yr)', itest, jtest, rtest, 7, 7)
       call point_diag(uvel(1,:,:), 'vvel (m/yr)', itest, jtest, rtest, 7, 7)
    endif   ! verbose_dirichlet

    !------------------------------------------------------------------------------
    ! Compute masks for the velocity solver: 
    ! (1) ice mask = 1 in cells where ice is present (thck > thklim)
    ! (2) floating mask = 1 in cells where ice is present (thck > thklim) and floating
    ! (3) ocean mask = = 1 in cells where topography is below sea level and ice is absent
    ! (4) land mask = 1 in cells where topography is at or above sea level
    !------------------------------------------------------------------------------

    !TODO: Modify glissade_get_masks so that 'parallel' is not needed
    call glissade_get_masks(nx,               ny,               &
                            parallel,                           &
                            thck,             topg,             &
                            eus,              thklim,           &
                            ice_mask,                           &
                            floating_mask = floating_mask,      &
                            ocean_mask = ocean_mask,            &
                            land_mask = land_mask)

    ! Compute a mask which is the union of ice cells and land-based cells (including ice-free land).
    where (ice_mask == 1 .or. land_mask == 1)
       ice_plus_land_mask = 1
    elsewhere
       ice_plus_land_mask = 0
    endwhere

    !------------------------------------------------------------------------------
    ! Compute the ice thickness and upper surface elevation on the staggered grid.
    ! (requires that thck and usrf are up to date in all cells that border locally owned vertices).
    ! All cells, including ice-free cells, are included in the interpolation.
    !------------------------------------------------------------------------------

    call glissade_stagger(nx,           ny,         &
                          thck,         stagthck)

    call glissade_stagger(nx,           ny,         &
                          usrf,         stagusrf)


    if (verbose_gridop) then
       call point_diag(thck, 'thck (m)', itest, jtest, rtest, 7, 7)
       call point_diag(stagthck, 'stagthck (m)', itest, jtest, rtest, 7, 7)
    endif

    !------------------------------------------------------------------------------
    ! Compute the surface elevation gradient on the staggered grid
    ! (requires that usrf is up to date in halo cells)
    !
    ! Possible settings for whichgradient_margin:
    !   HO_GRADIENT_MARGIN_LAND = 0
    !   HO_GRADIENT_MARGIN_HYBRID = 1
    !   HO_GRADIENT_MARGIN_MARINE = 2
    !
    ! gradient_margin = 0 computes gradients at all edges, even if one cell
    !  if ice-free.  This is what Glide does, but is not appropriate if we have ice-covered
    !  marine-based cells lying above ice-free ocean cells, because the gradient is too big.
    ! gradient_margin_in = 1 computes gradients at edges with
    !  (1) ice-covered cells on either side, or
    !  (2) ice-covered cell (land or marine-based) above ice-free land
    !  This option is designed for both land- and ocean-terminating boundaries. It is the default.
    ! gradient_margin_in = 2 computes gradients only at edges with ice-covered cells
    !  on each side.  This is appropriate for problems with ice shelves, but is
    !  is less accurate than options 0 or 1 for land-based problems (e.g., Halfar SIA).
    !
    ! Passing in max_slope ensures that the surface elevation gradient on the edge
    !  between two cells does not exceed a prescribed value.
    ! Although slope-limiting is not very physical, it helps prevent CFL violations
    !  in regions of steep coastal topography. Some input Greenland data sets have
    !  slopes of up to ~0.3 between adjacent grid cells, leading to very large velocities
    !  even with a no-slip basal boundary condition. 
    !
    ! There are three options for whichgradient:
    ! (0) centered
    ! (1) first-order upstream
    ! (2) second-order upstream.
    ! Centered gradients are the default, but an upstream gradient may be preferred
    !  to damp checkerboard noise.
    !------------------------------------------------------------------------------

!pw call t_startf('glissade_gradient')

    call glissade_surface_elevation_gradient(nx,           ny,          &
                                             dx,           dy,          &
                                             itest, jtest, rtest,       &
                                             ice_mask,                  &
                                             land_mask,                 &
                                             usrf,         thck,        &
                                             topg,         eus,         &
                                             thklim,                    &
                                             thck_gradient_ramp,        &
                                             dusrf_dx,     dusrf_dy,    &
                                             whichgradient,             &
                                             whichgradient_margin,      &
                                             max_slope = max_slope)

    ! halo updates
    call staggered_parallel_halo(dusrf_dx, parallel)
    call staggered_parallel_halo(dusrf_dy, parallel)

!pw call t_stopf('glissade_gradient')

    if (verbose_glp) then
       call point_diag(model%basal_physics%effecpress_stag, 'N_stag', itest, jtest, rtest, 7, 7, 'f10.0')
       call point_diag(usrf, 'usrf (m)', itest, jtest, rtest, 7, 7)
       call point_diag(thck, 'thck (m)', itest, jtest, rtest, 7, 7)
       call point_diag(f_flotation, 'f_flotation (m)', itest, jtest, rtest, 7, 7)
       call point_diag(f_ground_cell, 'f_ground_cell', itest, jtest, rtest, 7, 7)
       call point_diag(f_ground, 'f_ground at vertices', itest, jtest, rtest, 7, 7)
       call point_diag(dusrf_dx, 'dusrf_dx', itest, jtest, rtest, 7, 7)
    endif

    if (verbose_gridop) then
       call point_diag(thck, 'thck (m)', itest, jtest, rtest, 7, 7)
       call point_diag(stagthck, 'stagthck (m)', itest, jtest, rtest, 7, 7)
       call point_diag(usrf, 'usrf (m)', itest, jtest, rtest, 7, 7)
       call point_diag(dusrf_dx, 'dusrf_dx', itest, jtest, rtest, 7, 7)
       call point_diag(dusrf_dy, 'dusrf_dy', itest, jtest, rtest, 7, 7)
    endif  ! verbose_gridop

    !------------------------------------------------------------------------------
    ! Identify the active cells (i.e., cells with ice_mask = 1, and bordering
    !  a locally owned vertex) and active vertices (all vertices of active cells).
    ! Compute the vertices of each element.
    ! Count the number of owned active nodes on this processor, and assign a 
    !  unique local ID to each such node.
    !------------------------------------------------------------------------------

!pw call t_startf('glissade_get_vertex_geom')
    call get_vertex_geometry(nx,             ny,                   &
                             nz,             nhalo,                &
                             parallel,                             &
                             dx,             dy,                   &
                             itest,  jtest,  rtest,                &
                             ice_mask,                      &
                             xVertex,        yVertex,              &
                             active_cell,    active_vertex,        &
                             nNodesSolve,    nVerticesSolve,       &
                             nodeID,         vertexID,             &
                             iNodeIndex,     jNodeIndex,  kNodeIndex, &
                             iVertexIndex,   jVertexIndex)
!pw call t_stopf('glissade_get_vertex_geom')

    ! Zero out the velocity for inactive vertices
    do j = staggered_jlo, staggered_jhi    ! locally owned vertices only
       do i = staggered_ilo, staggered_ihi
          if (.not.active_vertex(i,j)) then
             uvel(:,i,j) = 0.d0
             vvel(:,i,j) = 0.d0
             if (solve_2d) then
                uvel_2d(i,j) = 0.d0
                vvel_2d(i,j) = 0.d0
             endif
          endif
       enddo
    enddo

    ! Assign the appropriate local ID to vertices and nodes in the halo.
    ! NOTE: This works for single-processor runs with periodic BCs
    !       (e.g., ISMIP-HOM), but not for multiple processors.

    call t_startf('glissade_halo_nodeID')
    call staggered_parallel_halo(nodeID, parallel)
    call staggered_parallel_halo(vertexID, parallel)
    call t_stopf('glissade_halo_nodeID')

    if (verbose_id) then
       call point_diag(vertexID, 'vertexID', itest, jtest, rtest, 7, 7, '(i5)')
       call point_diag(nodeID(1,:,:), 'nodeID, k = 1', itest, jtest, rtest, 7, 7, '(i5)')
    endif

    ! Initialization for the Trilinos solver
    ! Allocate arrays, initialize the velocity solution, compute an array 
    !  that maps the local index for owned active nodes to a unique global ID,
    !  and communicate this array to Trilinos

#ifdef TRILINOS
    if (whichsparse == HO_SPARSE_TRILINOS) then   

       if (solve_2d) then

          allocate(active_owned_unknown_map(2*nVerticesSolve))
          allocate(velocityResult(2*nVerticesSolve))
          allocate(Afill_2d(nx-1,ny-1,nNodeNeighbors_2d))

          !----------------------------------------------------------------
          ! Compute global IDs needed to initialize the Trilinos solver
          !----------------------------------------------------------------

          call t_startf('glissade_trilinos_glbid')
          call trilinos_global_id_2d(nx,             ny,           &
                                     parallel,                     &
                                     nVerticesSolve,               &
                                     iVertexIndex,   jVertexIndex, &
                                     global_vertex_id,             &
                                     active_owned_unknown_map)
          call t_stopf('glissade_trilinos_glbid')

          !----------------------------------------------------------------
          ! Send this information to Trilinos (trilinosGlissadeSolver.cpp)
          !----------------------------------------------------------------

          call t_startf('glissade_init_tgs')
          call initializetgs(2*nVerticesSolve, active_owned_unknown_map, comm)
          call t_stopf('glissade_init_tgs')

          !----------------------------------------------------------------
          ! If this is the first outer iteration, then save the pattern of matrix
          ! values that are potentially nonzero and should be sent to Trilinos.
          ! Trilinos requires that this pattern remains fixed during the outer loop.
          !----------------------------------------------------------------

          call t_startf('glissade_trilinos_fill_pattern')
          call trilinos_fill_pattern_2d(nx,            ny,              &
                                        active_vertex, nVerticesSolve,  &
                                        iVertexIndex,  jVertexIndex,    &
                                        indxA_2d,      Afill_2d)
          call t_stopf('glissade_trilinos_fill_pattern')

          !----------------------------------------------------------------
          ! Initialize the solution vector from uvel/vvel.
          !----------------------------------------------------------------

          call trilinos_init_velocity_2d(nx,           ny,           &
                                         nVerticesSolve,             &
                                         iNodeIndex,   jNodeIndex,   &
                                         uvel_2d,      vvel_2d,      &
                                         velocityResult)

       else   ! 3D solve

          allocate(active_owned_unknown_map(2*nNodesSolve))
          allocate(velocityResult(2*nNodesSolve))
          allocate(Afill(nNodeNeighbors_3d,nz,nx-1,ny-1))

          !----------------------------------------------------------------
          ! Compute global IDs needed to initialize the Trilinos solver
          !----------------------------------------------------------------

          call t_startf('glissade_trilinos_glbid')
          call trilinos_global_id_3d(nx,         ny,         nz,   &
                                     parallel,                     &
                                     nNodesSolve,                  &
                                     iNodeIndex, jNodeIndex, kNodeIndex,  &
                                     global_node_id,               &
                                     active_owned_unknown_map)
          call t_stopf('glissade_trilinos_glbid')

          !----------------------------------------------------------------
          ! Send this information to Trilinos (trilinosGlissadeSolver.cpp)
          !----------------------------------------------------------------

          call t_startf('glissade_init_tgs')
          call initializetgs(2*nNodesSolve, active_owned_unknown_map, comm)
          call t_stopf('glissade_init_tgs')

          !----------------------------------------------------------------
          ! If this is the first outer iteration, then save the pattern of matrix
          ! values that are potentially nonzero and should be sent to Trilinos.
          ! Trilinos requires that this pattern remains fixed during the outer loop.
          !----------------------------------------------------------------

          call t_startf('glissade_trilinos_fill_pattern')
          call trilinos_fill_pattern_3d(nx,            ny,           nz,   &
                                        active_vertex, nNodesSolve,        &
                                        iNodeIndex,    jNodeIndex,   kNodeIndex,  &
                                        indxA_3d,      Afill)
                                     
          call t_stopf('glissade_trilinos_fill_pattern')

          !----------------------------------------------------------------
          ! Initialize the solution vector from uvel/vvel.
          !----------------------------------------------------------------

          call trilinos_init_velocity_3d(nx,           ny,                       &
                                         nz,           nNodesSolve,              &
                                         iNodeIndex,   jNodeIndex,  kNodeIndex,  &
                                         uvel,         vvel,                     &
                                         velocityResult)

       endif   ! whichapprox
    endif      ! whichsparse
#endif

    !------------------------------------------------------------------------------
    ! Initialize the basal traction parameter, beta_internal.
    ! Note: If beta is read from an external file, the external value should not be changed.
    !        This value is saved in model%velocity%beta.
    !       The glissade solver uses a beta field weighted by f_ground.
    !        This field is stored in model%velocity%beta_internal and can change over time.
    !       For a no-slip boundary condition (HO_BABC_NO_SLIP), beta_internal is not computed,
    !        so beta_internal = 0 will be written to output.
    !------------------------------------------------------------------------------

    beta_internal(:,:) = 0.d0

    ! Note: There was a call here to calc_effective_pressure, moved to the glissade diagnostic solve.

    !------------------------------------------------------------------------------
    ! For the HO_BABC_BETA_BPMP option, compute a mask of vertices where the bed is at
    ! the pressure melting point, resulting in lower traction.
    !------------------------------------------------------------------------------

    ! initialize to 0 everywhere
    model%basal_physics%bpmp_mask(:,:) = 0
  
    if (whichbabc == HO_BABC_BETA_BPMP) then

       ! interpolate bed temperature to vertices
       ! For stagger_margin_in = 1, only ice-covered cells are included in the interpolation
       call glissade_stagger(nx,           ny,           &
                             temp(nz,:,:), stagbedtemp,  &
                             ice_mask,     stagger_margin_in = 1)
       
       ! interpolate bed pmp temperature to vertices
       call glissade_stagger(nx,           ny,           &
                             bpmp(:,:),    stagbedpmp(:,:), &
                             ice_mask,     stagger_margin_in = 1)

       ! compute a bed pmp mask at vertices; this mask is passed to calcbeta below
       ! Note: The bed is considered thawed if the interpolated bed temperature is
       !       within pmp_threshold of the interpolated pmp temperature.
       where (stagbedtemp >= stagbedpmp - pmp_threshold .and. active_vertex)
          model%basal_physics%bpmp_mask = 1
       endwhere

    endif   ! HO_BABC_BETA_BPMP

    !------------------------------------------------------------------------------
    ! Compute the factor A^(-1/n) appearing in the expression for effective viscosity.
    ! This factor is often denoted as B in the literature.
    ! Note: The rate factor (flwa = A) is assumed to have units of Pa^(-n) yr^(-1).
    !       Thus flwafact = 0.5 * A^(-1/n) has units Pa yr^(1/n).
    !------------------------------------------------------------------------------

    flwafact(:,:,:) = 0.d0

    ! Note: flwa is available in all cells, so flwafact can be computed in all cells.
    !       This includes cells with thck <= thklim, in case a value of flwa is needed
    !        (e.g., inactive land-margin cells adjacent to active cells).

    ! Loop over all cells that border locally owned vertices.
    ! This includes halo rows to the north and east.
    ! OK to skip cells outside the global domain.
    !TODO - Simply compute flwafact for all cells?  We should have flwa for all cells.

    do j = 1+nhalo, ny-nhalo+1
       do i = 1+nhalo, nx-nhalo+1
          ! gn = exponent in Glen's flow law (= 3 by default)
          do k = 1, nz-1
             if (flwa(k,i,j) > 0.0d0) then
                flwafact(k,i,j) = 0.5d0 * flwa(k,i,j)**(-1.d0/n_glen)
             endif
          enddo
       enddo
    enddo

    if (verbose_efvs) then
       call point_diag(flwafact(1,:,:), 'flwafact, k = 1', itest, jtest, rtest, 7, 7, '(f10.0)')
    endif

    !------------------------------------------------------------------------------
    ! If using SLAP solver, then allocate space for the sparse matrix (A), rhs (b), 
    !  answer (x), and residual vector (Ax-b).
    !------------------------------------------------------------------------------

    if (whichsparse <= HO_SPARSE_GMRES) then  ! using SLAP solver

       if (solve_2d) then
          matrix_order = 2*nVerticesSolve
          max_nonzeros = matrix_order*2*nNodeNeighbors_2d  ! nNodeNeighbors_2d = 9
                                                           ! 18 = 2 * 9 (since solving for both u and v)
       else  ! 3D solve
          matrix_order = 2*nNodesSolve
          max_nonzeros = matrix_order*2*nNodeNeighbors_3d  ! nNodeNeighbors_3d = 27
                                                           ! 54 = 2 * 27 (since solving for both u and v)
       endif

       allocate(matrix%row(max_nonzeros), matrix%col(max_nonzeros), matrix%val(max_nonzeros))
       allocate(rhs(matrix_order), answer(matrix_order), resid_vec(matrix_order))

       answer(:) = 0.d0
       rhs(:) = 0.d0
       resid_vec(:) = 0.d0

       if (verbose_matrix) then
          write(iulog,*) 'matrix_order =', matrix_order
          write(iulog,*) 'max_nonzeros = ', max_nonzeros
       endif

    endif   ! SLAP solver
 
    !---------------------------------------------------------------
    ! Print some diagnostic info
    !---------------------------------------------------------------

    if (main_task .and. verbose_solver) then
       write(iulog,*) ' '
       if (whichresid == HO_RESID_L2NORM) then  ! use L2 norm of residual
          write(iulog,*) 'iter #     resid (L2 norm)       target resid'
       elseif (whichresid == HO_RESID_L2NORM_RELATIVE) then  ! relative residual, |Ax-b|/|b|
          write(iulog,*) 'iter #     resid, |Ax-b|/|b|     target resid'
       else                                     ! residual based on velocity
          write(iulog,*) 'iter #     velo resid            target resid'
       end if
    endif

    !------------------------------------------------------------------------------
    ! Set initial solver values 
    !------------------------------------------------------------------------------

    counter = 0
    resid_velo = 1.d0

    L2_norm   = L2_norm_large      ! arbitrary large value
    L2_target = 1.0d-4

    !WHL: For standard test cases (dome, circular shelf), a relative target of 1.0d-7 is 
    !     roughly as stringent as an absolute target of 1.0d-4.
    !       
    L2_norm_relative = L2_norm_large
    L2_target_relative = 1.0d-7

    outer_it_criterion = 1.0d10   ! guarantees at least one loop
    outer_it_target    = 1.0d-12

    ! Set initial values for the accelerated Picard solver

    if (accel_picard) then
       alpha_accel = 1.0d0
       L2_norm_alpha_sav = L2_norm_large  ! arbitrary large value
       if (solve_2d) then
          uvel_2d_old(:,:) = uvel_2d(:,:)
          vvel_2d_old(:,:) = vvel_2d(:,:)
          duvel_2d(:,:) = 0.0d0
          dvvel_2d(:,:) = 0.0d0
       else   ! 3D solve
          uvel_old(:,:,:) = uvel(:,:,:)
          vvel_old(:,:,:) = vvel(:,:,:)
          duvel(:,:,:) = 0.0d0
          dvvel(:,:,:) = 0.0d0
       endif
    endif

    !------------------------------------------------------------------------------
    ! Assemble the load vector b
    ! This goes before the outer loop because the load vector
    !  does not change from one nonlinear iteration to the next.
    !------------------------------------------------------------------------------

    loadu(:,:,:) = 0.d0
    loadv(:,:,:) = 0.d0

    !------------------------------------------------------------------------------
    ! Gravitational forcing
    !------------------------------------------------------------------------------

    call t_startf('glissade_load_vector_gravity')

    call load_vector_gravity(nx,               ny,              &
                             nz,               nhalo,           &
                             sigma,            stagwbndsigma,   &
                             dx,               dy,              &
                             itest,   jtest,   rtest,           &
                             active_cell,      active_vertex,   &
                             xVertex,          yVertex,         &
                             stagusrf,         stagthck,        &
                             dusrf_dx,         dusrf_dy,        &
                             whichassemble_taud,                &
                             loadu,            loadv)
       
    call t_stopf('glissade_load_vector_gravity')

    ! Compute components of gravitational driving stress
    taudx(:,:) = 0.d0
    taudy(:,:) = 0.d0
    do j = 1, ny-1
       do i = 1, nx-1
          do k = 1, nz
             taudx(i,j) = taudx(i,j) + loadu(k,i,j)
             taudy(i,j) = taudy(i,j) + loadv(k,i,j)
          enddo
       enddo
    enddo
    taudx(:,:) = taudx(:,:) * vol0/(dx*dy)  ! convert from model units to Pa
    taudy(:,:) = taudy(:,:) * vol0/(dx*dy)

    if (verbose_load) then
       ! Note: The first of these quantities is the load vector on the rhs of the matrix.
       !       The second is the value that would go on the rhs by simply taking rho*g*H*ds/dx.
       !       These will not agree exactly because of the way H is handled in FE assembly,
       !        but they should be close if which_ho_assemble_taud = HO_ASSEMBLE_TAUD_LOCAL.
       !       If which_ho_assemble_taud = HO_ASSEMBLE_TAUD_STANDARD, they can differ substantially.
       call point_diag(taudx, 'vert sum of grav load vector', itest, jtest, rtest, 7, 7, '(f10.0)')
       call point_diag(-rhoi*grav*stagthck(i,j)*dusrf_dx, 'rhoi*g*H*ds/dx', itest, jtest, rtest, 7, 7, '(f10.0)')
       call point_diag(uvel_2d, 'Starting uvel_2d (m/yr)', itest, jtest, rtest, 7, 7)
    endif

    !------------------------------------------------------------------------------
    ! Lateral pressure at vertical ice edge.
    !------------------------------------------------------------------------------

    ! The following is a kluge for computing lateral load at marine cliff edges.
    ! When stagthck and stagusrf are computed above, ice-free cells are included in the average.
    ! This is appropriate at land-terminating margins, but not for marine cliffs.
    ! For whichassemble_lateral = HO_ASSEMBLE_LATERAL_LOCAL, stagusrf and stagthck are not used.
    !  Instead, we use usrf and thck from the ice-filled cliff cell.
    ! But for whichassemble_lateral = HO_ASSEMBLE_LATERAL_STANDARD, the load is proportional
    !  to stagthck and stagusrf and will be too low if the staggered averages
    !  incorporate ice-free ocean.  So we need to compute appropriate staggered averages
    !  without weighting ice-free cells.
    ! At some point, we might want to deprecate HO_ASSEMBLE_LATERAL_STANDARD.

    if (whichassemble_lateral == HO_ASSEMBLE_LATERAL_STANDARD) then

       call glissade_stagger(nx,           ny,               &
                             thck,         stagthck_lateral, &
                             ice_plus_land_mask,             &
                             stagger_margin_in = 1)

       call glissade_stagger(nx,           ny,               &
                             usrf,         stagusrf_lateral, &
                             ice_plus_land_mask,             &
                             stagger_margin_in = 1)
    else
       stagusrf_lateral = stagusrf
       stagthck_lateral = stagthck
    endif

    call t_startf('glissade_load_vector_lateral_bc')
    call load_vector_lateral_bc(nx,               ny,              &
                                nz,               sigma,           &
                                nhalo,                             &
                                itest,   jtest,   rtest,           &
                                whichassemble_lateral,             &
                                land_mask,        ocean_mask,      &
                                active_cell,                       &
                                xVertex,          yVertex,         &
                                usrf,             thck,            &
                                stagusrf_lateral, stagthck_lateral,  &
                                loadu,            loadv)
    call t_stopf('glissade_load_vector_lateral_bc')

    call t_stopf('glissade_vhs_init')

    !------------------------------------------------------------------------------
    ! If solving a 2D problem (e.g., SSA at one level), sum the load vector over columns.
    ! Note: It would be slightly more efficient to compute the load vector at a single level
    !       using custom 2D subroutines. However, this would require extra code and would
    !       save little work, since the load vector is computed only once per timestep.
    !------------------------------------------------------------------------------

    if (solve_2d) then

       loadu_2d(:,:) = 0.d0
       loadv_2d(:,:) = 0.d0

       do j = 1, ny-1
          do i = 1, nx-1
             do k = 1, nz
                loadu_2d(i,j) = loadu_2d(i,j) + loadu(k,i,j)
                loadv_2d(i,j) = loadv_2d(i,j) + loadv(k,i,j)
             enddo
          enddo
       enddo

    endif

    if (verbose_load) then
       call point_diag(stagthck, 'stagthck (m)', itest, jtest, rtest, 7, 7)
       call point_diag(stagusrf, 'stagusrf (m)', itest, jtest, rtest, 7, 7)
       call point_diag(taudx*dx*dy/vol0, 'loadu_2d (taudx term)', itest, jtest, rtest, 7, 7)
       call point_diag(taudy*dx*dy/vol0, 'loadu_2d (taudy term)', itest, jtest, rtest, 7, 7)
       if (solve_2d) then
          call point_diag(loadu_2d - taudx*dx*dy/vol0, 'loadu_2d (lateral term)', itest, jtest, rtest, 7, 7)
          call point_diag(loadv_2d - taudy*dx*dy/vol0, 'loadv_2d (lateral term)', itest, jtest, rtest, 7, 7)
          call point_diag(loadu_2d, 'loadu_2d', itest, jtest, rtest, 7, 7)
          call point_diag(loadv_2d, 'loadv_2d', itest, jtest, rtest, 7, 7)
       else   ! 3d
          do k = 1, nz
             call point_diag(loadu(k,:,:), 'loadu_3d', itest, jtest, rtest, 7, 7)
             call point_diag(loadv(k,:,:), 'loadv_3d', itest, jtest, rtest, 7, 7)
          enddo
       endif
    endif
    
    ! Optional slope correction factor for DIVA

    ! Initialize to values appropriate for small slopes
    ! Modified below if diva_slope_correction = T
    theta_slope = 0.0d0
    theta_slope_x = 0.0d0
    theta_slope_y = 0.0d0
    stag_theta_slope_x = 0.0d0
    stag_theta_slope_y = 0.0d0
    stag_theta_slope = 0.0d0
    diva_slope_factor_x = 1.0d0
    diva_slope_factor_y = 1.0d0
    stag_diva_slope_factor_x = 1.0d0
    stag_diva_slope_factor_y = 1.0d0

    if (whichapprox==HO_APPROX_DIVA) then
       if (diva_slope_correction) then

          ! Compute the slope angle at each cell center.
          ! Note: Results will differ based on whether we use the slope of usrf, lsrf, or an average
          !       (e.g. for ISMIP-HOM Test A, which has steep lsrf and gentler usrf).
          ! Note: theta_slope is a required output argument but is not used.

!!          z_mean = usrf
!!          z_mean = lsrf
          z_mean = 0.5d0 * (usrf + lsrf)

          call glissade_slope_angle(&
               nx,           ny,     &
               dx,           dy,     &  ! m
               itest, jtest, rtest,  &
               z_mean,               &  ! m
               theta_slope,          &  ! radians
               theta_slope_x,        &
               theta_slope_y)

          call parallel_halo(theta_slope_x, parallel)
          call parallel_halo(theta_slope_y, parallel)

          ! Repeat for vertices.
          ! We do separate calls for centers and vertices to avoid issues with angle interpolation.
          ! Note: stag_theta_slope is a required output argument but is not used.
          call glissade_slope_angle_staggered(&
               nx,           ny,     &
               dx,           dy,     &  ! m
               itest, jtest, rtest,  &
               z_mean,               &  ! m
               stag_theta_slope,     &  ! radians
               stag_theta_slope_x,   &
               stag_theta_slope_y)

          call parallel_halo(stag_theta_slope_x, parallel)
          call parallel_halo(stag_theta_slope_y, parallel)

          ! Compute the slope factors used to correct vertical derivatives (du/dz) and (dv/dz)
          diva_slope_factor_x = cos(theta_slope_x)**2 + 4.d0*sin(theta_slope_x)**2
          diva_slope_factor_y = cos(theta_slope_y)**2 + 4.d0*sin(theta_slope_y)**2
          stag_diva_slope_factor_x = cos(stag_theta_slope_x)**2 + 4.d0*sin(stag_theta_slope_x)**2
          stag_diva_slope_factor_y = cos(stag_theta_slope_y)**2 + 4.d0*sin(stag_theta_slope_y)**2

          if (verbose_diva) then
             call point_diag(z_mean, 'z_mean (m)', itest, jtest, rtest, 7, 7)
             call point_diag(theta_slope_x*180.d0/pi, 'theta_slope_x (deg)', itest, jtest, rtest, 7, 7)
             call point_diag(theta_slope_y*180.d0/pi, 'theta_slope_y (deg)', itest, jtest, rtest, 7, 7)
             call point_diag(diva_slope_factor_x, 'diva_slope_factor_x', itest, jtest, rtest, 7, 7)
             call point_diag(diva_slope_factor_y, 'diva_slope_factor_y', itest, jtest, rtest, 7, 7)
             if (this_rank == rtest) then
                i = itest
                j = jtest
                write(iulog,*) ' '
                write(iulog,*) 'i, j, uvel_2d, vvel_2d, beta_eff_x, beta_eff_y, btractx, btracty:',  &
                     i, j, uvel_2d(i,j), vvel_2d(i,j), beta_eff_x(i,j), beta_eff_y(i,j), btractx(i,j), btracty(i,j)
             endif
          endif

       endif   ! diva_slope_correction

    endif   ! DIVA

    !------------------------------------------------------------------------------
    ! Main outer loop: Iterate to solve the nonlinear problem
    !------------------------------------------------------------------------------

    call t_startf('glissade_vhs_nonlinear_loop')
    do while (outer_it_criterion >= outer_it_target .and. counter < maxiter_nonlinear)

       ! Advance the iteration counter

       counter = counter + 1

       !------------------------------------------------------------------------------
       ! Loop to support an accelerated Picard solver
       !
       ! This loop wraps the following computations:
       ! (1) If accel_picard = T, adjust the velocity solution using the method of Dukowicz (2019).
       ! (2) Calculate beta
       ! (3) Assemble the matrix
       ! (4) Compute the residual R = A(u)*u - b
       ! (5) If accel_picard = T, check whether the adjusted solution has reduced the residual.
       !     If so, then repeat (1)-(4). Iterate until the adjustment no longer reduces the residual.
       ! If accel_picard = F, we simply do (2)-(4) once, and proceed to the matrix solution.
       !
       ! Note: We need to solve once for the velocity before we can adjust it and
       !       compare residuals. So (1) and (5) are done only when counter >= 2.
       !------------------------------------------------------------------------------

       ! If using the accelerated Picard solver, then update the velocity.
       ! Note: For the first velocity update during the new nonlinear iteration,
       !       we have alpha_accel = 1 and (duvel_2d, dvvel_2d) = 0,
       !       so uvel_2d and vvel_2d are the final values from the previous nonliner iteration.

       assembly_is_done = .false.

       do while (.not.assembly_is_done)

          if (accel_picard .and. counter >= 2) then

             if (solve_2d) then

                ! Update the 2D velocity by extending the vector (duvel_2d, dvvel_2d)
                uvel_2d(:,:) = uvel_2d_old(:,:) + alpha_accel * duvel_2d(:,:)
                vvel_2d(:,:) = vvel_2d_old(:,:) + alpha_accel * dvvel_2d(:,:)
                call staggered_parallel_halo(uvel_2d, parallel)
                call staggered_parallel_halo(vvel_2d, parallel)

                if (whichapprox == HO_APPROX_DIVA) then

                   ! Update the basal traction and the 3D velocity, given the new 2D velocity.
                   ! Note: The updated quantities needed for Picard acceleration are ubas and vbas (for calcbeta)
                   !       along with btractx and btracty (for effective viscosity).
                   !       We do not need velocity at all levels, but it is simplest just to call compute_3d_velocity_diva.

                   call compute_3d_velocity_diva(&
                        nx,                 ny,                   &
                        nz,                 sigma,                &
                        itest,   jtest,     rtest,                &
                        active_vertex,      diva_level_index,     &
                        ice_plus_land_mask,                       &
                        stag_omega,         omega_k,              &
                        beta_internal,                            &
                        beta_eff_x,         beta_eff_y,           &
                        stag_theta_slope_x, stag_theta_slope_y,   &
                        stag_diva_slope_factor_x,                 &
                        stag_diva_slope_factor_y,                 &
                        uvel_2d,            vvel_2d,              &
                        btractx,            btracty,              &
                        uvel,               vvel)

                endif   ! DIVA

                if (verbose_picard .and. this_rank == rtest) then
                   write(iulog,*) ' '
                   write(iulog,*) 'Picard accel: counter, alpha =', counter, alpha_accel
                   i = itest
                   j = jtest
                   write(iulog,*) 'rank, i, j =', rtest, i, j
                   write(iulog,*) '   old uvel_2d, vvel_2d:', uvel_2d_old(i,j), vvel_2d_old(i,j)
                   write(iulog,*) '     duvel_2d, dvvel_2d:', duvel_2d(i,j), dvvel_2d(i,j)
                   write(iulog,*) '   new uvel_2d, vvel_2d:', uvel_2d(i,j), vvel_2d(i,j)
                   write(iulog,*) '      new ubas, vbas   :', uvel(nz,i,j), vvel(nz,i,j)
                endif

             else   ! 3D solve

                ! Update the 3D velocity by extending the vector (duvel, dvvel)
                uvel(:,:,:) = uvel_old(:,:,:) + alpha_accel * duvel(:,:,:)
                vvel(:,:,:) = vvel_old(:,:,:) + alpha_accel * dvvel(:,:,:)
                call staggered_parallel_halo(uvel, parallel)
                call staggered_parallel_halo(vvel, parallel)

                if (verbose_picard .and. this_rank == rtest) then
                   write(iulog,*) ' '
                   write(iulog,*) 'Picard accel: counter, alpha =', counter, alpha_accel
                   i = itest
                   j = jtest
                   k = 1
                   write(iulog,*) 'rank, i, j, k =', rtest, k, i, j
                   write(iulog,*) '   old uvel, vvel:', uvel_old(k,i,j), vvel_old(k,i,j)
                   write(iulog,*) '     duvel, dvvel:', duvel(k,i,j), dvvel(k,i,j)
                   write(iulog,*) '   new uvel, vvel:', uvel(k,i,j), vvel(k,i,j)
                   write(iulog,*) '      new ubas, vbas   :', uvel(nz,i,j), vvel(nz,i,j)
                endif

             endif   ! 2D or 3D

          endif  ! accelerated Picard

          !---------------------------------------------------------------------------
          ! Compute or prescribe the basal traction field 'beta'.
          !
          ! Notes:
          ! (1) We could compute beta before the main outer loop if beta
          !     were assumed to be independent of velocity.  Computing beta here,
          !     however, allows for more general sliding laws where beta depends
          !     on the velocity.
          ! (2) The units of the input arguments in calcbeta are assumed to be the
          !     same as the Glissade units.
          ! (3) The computed beta (called beta_internal) is weighted by f_ground,
          !     the grounded fraction at each vertex.  With a GLP, f_ground is
          !     between 0 and 1 for vertices adjacent to the GL, allowing for a smooth
          !     change in beta as the GL advances and retreats.
          ! (4) The basal velocity is a required input to calcbeta.
          !     DIVA does not compute the basal velocity in the 2D matrix solve,
          !     but computes the full 3D velocity after each iteration so that
          !     uvel/vvel(nz,:,:) are available here.
          ! (5) For which_ho_babc = HO_BABC_BETA_EXTERNAL, beta currently has
          !     dimensionless Glimmer units. Rather than incur roundoff errors by
          !     repeatedly multiplying and dividing by scaling constants, the conversion
          !     to Pa yr/m is done here in the argument list.
          ! (6) Subroutine calcbeta includes a halo update for beta_internal at the end.
          !-------------------------------------------------------------------

          if (whichapprox == HO_APPROX_SSA .or. whichapprox == HO_APPROX_L1L2) then
             ubas(:,:) = uvel_2d(:,:)
             vbas(:,:) = vvel_2d(:,:)
          else  ! 3D solve or DIVA
             ubas(:,:) = uvel(nz,:,:)
             vbas(:,:) = vvel(nz,:,:)
          endif

!!       if (verbose_beta) then
          if (verbose_beta .and. counter==1) then
             if (this_rank == rtest) write(iulog,*) 'Before calcbeta, counter =', counter
             call point_diag(usrf, 'usrf (m)', itest, jtest, rtest, 7, 7)
             call point_diag(thck, 'thck (m)', itest, jtest, rtest, 7, 7)
             call point_diag(topg, 'topg (m)', itest, jtest, rtest, 7, 7)
             call point_diag(ice_mask, 'ice_mask', itest, jtest, rtest, 7, 7)
             call point_diag(floating_mask, 'floating_mask', itest, jtest, rtest, 7, 7)
             call point_diag(f_flotation, 'f_flotation (m)', itest, jtest, rtest, 7, 7)
             call point_diag(f_ground_cell, 'f_ground_cell', itest, jtest, rtest, 7, 7)
             call point_diag(f_ground, 'f_ground at vertices', itest, jtest, rtest, 7, 7)
!             call point_diag(ocean_mask, 'ocean_mask', itest, jtest, rtest, 7, 7)
!             call point_diag(bpmp, 'bpmp', itest, jtest, rtest, 7, 7)
!             call point_diag(btemp, 'btemp', itest, jtest, rtest, 7, 7)
!             call point_diag(bpmp - btemp, 'bpmp - btemp', itest, jtest, rtest, 7, 7)
!             call point_diag(model%basal_physics%effecpress, 'effecpress (Pa)', itest, jtest, rtest, 7, 7, 'f10.0')
!             call point_diag(model%basal_physics%effecpress_stag, 'effecpress_stag (Pa)', itest, jtest, rtest, 7, 7, 'f10.0')
          endif  ! verbose_beta

          call glissade_calcbeta(&
               whichbabc,                        &
               parallel,                         &
               dx,            dy,                &
               nx,            ny,                &
               ubas,          vbas,              &
               model%basal_physics,              &
               flwa(nz-1,:,:),                   &  ! basal flwa layer
               thck,                             &
               topg,          eus,               &
               ice_mask,                         &
               land_mask,                        &
               f_ground,                         &
               beta*scyr,                        &  ! external beta (intent in)
               beta_internal,                    &  ! beta weighted by f_ground (intent inout)
               whichbeta_limit,                  &
               itest = itest, jtest = jtest, rtest = rtest)

!          if (verbose_beta) then
!             maxbeta = maxval(beta_internal(:,:))
!             maxbeta = parallel_reduce_max(maxbeta)
!             minbeta = minval(beta_internal(:,:))
!             minbeta = parallel_reduce_min(minbeta)
!          endif

!          if (verbose_beta .and. main_task) then
!             write(iulog,*) 'max, min beta (Pa/(m/yr)) =', maxbeta, minbeta
!          endif

          !-------------------------------------------------------------------
          ! Assemble the linear system Ax = b
          !
          ! Depending on the value of whichapprox, we can assemble either a 2D system
          ! (to solve for uvel and vvel at one level) or a 3D system (to solve for
          !  uvel and vvel at all levels).
          !-------------------------------------------------------------------
       
          if (solve_2d) then  ! assemble 2D matrix

             call t_startf('glissade_assemble_2d')

             ! save current velocity
             usav_2d(:,:) = uvel_2d(:,:)
             vsav_2d(:,:) = vvel_2d(:,:)

             ! Assemble the matrix

             call assemble_stiffness_matrix_2d(nx,               ny,              &
                                               nz,                                &
                                               sigma,            stagsigma,       &
                                               nhalo,                             &
                                               itest,   jtest,   rtest,           &
                                               active_cell,                       &
                                               xVertex,          yVertex,         &
                                               uvel_2d,          vvel_2d,         &
                                               stagusrf,         stagthck,        &
                                               flwa,             flwafact,        &
                                               whichapprox,                       &
                                               diva_slope_factor_x, diva_slope_factor_y,  &
                                               whichefvs,        efvs,            &
                                               efvs_constant,    effstrain_min,   &
                                               Auu_2d,           Auv_2d,          &
                                               Avu_2d,           Avv_2d,          &
                                               dusrf_dx,         dusrf_dy,        &
                                               thck,                              &
                                               btractx,          btracty,         &
                                               omega_k,          omega,   &
                                               efvs_qp_3d)

             if (whichapprox == HO_APPROX_DIVA) then

                ! Halo update for omega
                ! This is needed so that beta_eff, computed below, will be correct in halos

                call parallel_halo(omega, parallel)

                ! Interpolate the appropriate integral
                if (diva_level_index == 0) then   ! solving for 2D mean velocity field

                   ! Interpolate omega to the staggered grid
                   call glissade_stagger(nx,           ny,               &
                                         omega(:,:),   stag_omega(:,:),  &
                                         ice_plus_land_mask,             &
                                         stagger_margin_in = 1)

                else  ! solving for the velocity at level k (k = 1 at upper surface)

                   k = diva_level_index

                   call parallel_halo(omega_k(k,:,:), parallel)

                   ! Interpolate omega_k to the staggered grid
                   call glissade_stagger(nx,              ny,               &
                                         omega_k(k,:,:),  stag_omega(:,:),  &
                                         ice_plus_land_mask,                &
                                         stagger_margin_in = 1)

                endif
                
                !-------------------------------------------------------------------
                ! Compute effective beta based on Goldberg (2011) eq. 40 and 41
                !
                ! If solving for the depth-integrated velocity u_mean:
                !
                !       beta_eff * u_mean = beta * u_b
                !
                ! where beta_eff = beta / (1 + beta*omega)
                !          omega = int_b^z {[(s-z)/H]^2 * 1/efvs * dz}
                !
                ! If solving for the surface velocity u_sfc:
                !
                !       beta_eff * u_sfc = beta * u_b
                !
                ! where beta_eff = beta / (1 + beta*omega_1)
                !        omega_1 = int_b^s {[(s-z)/H] * 1/efvs * dz}
                !                = omega_k for k = 1
                !
                ! To implement a no-slip basal BC, set beta_eff = 1/omega
                !
                ! May 2025: Added some slope correction terms.
                !           As a result, there is one version of beta_eff for the x direction
                !            and another for the y direction.
                !--------------------------------------------------------------------

                beta_eff_x(:,:) = 0.d0
                beta_eff_y(:,:) = 0.d0

                !Note: The 'if' is not strictly needed, since the corrected beta_eff is equal
                !      to the uncorrected beta_eff whe slope_factor = 1.0 and theta_slope = 0.0
                if (diva_slope_correction) then  ! compute a larger beta_eff at each vertex based on the slope

                   if (whichbabc == HO_BABC_NO_SLIP) then
                      where(stag_omega > 0.0d0)
                         beta_eff_x = stag_diva_slope_factor_x / (stag_omega*cos(stag_theta_slope_x))
                         beta_eff_y = stag_diva_slope_factor_y / (stag_omega*cos(stag_theta_slope_y))
                      endwhere
                   else
                      beta_eff_x = (stag_diva_slope_factor_x*beta_internal)  &
                        / (stag_diva_slope_factor_x + beta_internal*stag_omega*cos(stag_theta_slope_x))
                      beta_eff_y = (stag_diva_slope_factor_y*beta_internal)  &
                        / (stag_diva_slope_factor_y + beta_internal*stag_omega*cos(stag_theta_slope_y))
                   endif

                else   ! no slope correction

                   if (whichbabc == HO_BABC_NO_SLIP) then
                      where (stag_omega > 0.d0)
                         beta_eff_x = 1.d0 / stag_omega
                         beta_eff_y = 1.d0 / stag_omega
                      endwhere
                   else   ! slip allowed at bed
                      beta_eff_x = beta_internal(:,:) / (1.d0 + beta_internal*stag_omega)
                      beta_eff_y = beta_internal(:,:) / (1.d0 + beta_internal*stag_omega)
                   endif

                endif

                if (verbose_diva) then
                   if (this_rank == rtest) then
                      i = itest
                      j = jtest
                      write(iulog,*) ' '
                      write(iulog,*) 'uvel, beta_eff_x, btractx:', uvel_2d(i,j), beta_eff_x(i,j), btractx(i,j)
                   endif
                   call point_diag(omega, 'omega', itest, jtest, rtest, 7, 7, '(e10.3)')
                   call point_diag(stag_omega, 'stag_omega', itest, jtest, rtest, 7, 7, '(e10.3)')
                   call point_diag(beta_eff_x, 'beta_eff_x', itest, jtest, rtest, 7, 7, '(e10.3)')
                   call point_diag(beta_eff_y, 'beta_eff_y', itest, jtest, rtest, 7, 7, '(e10.3)')
                endif

                if (diva_slope_correction) then

                   ! Incorporate basal sliding boundary conditions with basal curvature,
                   ! based on beta_eff_x and beta_eff_y

                   call basal_sliding_bc_2d_diva(&
                        nx,                ny,              &
                        nNodeNeighbors_2d, nhalo,           &
                        parallel,                           &
                        dx,                dy,              &
                        itest,   jtest,    rtest,           &
                        active_cell,       active_vertex,   &
                        beta_eff_x,        beta_eff_y,      &
                        lsrf,                               &
                        xVertex,           yVertex,         &
                        whichassemble_beta,                 &
                        Auu_2d,            Avv_2d)

                else

                   ! Incorporate basal sliding boundary conditions based on beta_eff,
                   !  but without basal curvature

                   call basal_sliding_bc_2d(&
                        nx,                ny,              &
                        nNodeNeighbors_2d, nhalo,           &
                        parallel,                           &
                        dx,                dy,              &
                        itest,   jtest,    rtest,           &
                        active_cell,       active_vertex,   &
                        beta_eff_x,                         &  ! same as beta_eff_y
                        lsrf,                               &
                        xVertex,           yVertex,         &
                        whichassemble_beta,                 &
                        Auu_2d,            Avv_2d)

                endif   ! diva_slope_correction

             else    ! L1L2, SSA

                ! Incorporate basal sliding boundary conditions, based on beta_internal

                call basal_sliding_bc_2d(&
                     nx,                ny,              &
                     nNodeNeighbors_2d, nhalo,           &
                     parallel,                           &
                     dx,                dy,              &
                     itest,   jtest,    rtest,           &
                     active_cell,       active_vertex,   &
                     beta_internal,                      &
                     lsrf,                               &
                     xVertex,           yVertex,         &
                     whichassemble_beta,                 &
                     Auu_2d,            Avv_2d)

             endif    ! whichapprox (SSA, L1L2, DIVA)

             call t_stopf('glissade_assemble_2d')

             if (verbose_matrix .and. this_rank==rtest) write(iulog,*) 'Assembled the 2D stiffness matrix'

             !---------------------------------------------------------------------------
             ! Set rhs to the load vector
             ! The rhs can be adjusted below to account for inhomogeneous Dirichlet BC
             !---------------------------------------------------------------------------

             bu_2d(:,:) = loadu_2d(:,:)
             bv_2d(:,:) = loadv_2d(:,:)

             !---------------------------------------------------------------------------
             ! Incorporate Dirichlet boundary conditions (prescribed uvel and vvel)
             ! Note: With a no-slip BC, umask_dirichlet(nz,:,:) = vmask_dirichlet(nz,:,:) = .true.,
             !        except for the DIVA scheme.
             !       For DIVA, the no-slip BC is enforced by setting beta_eff = 1/omega.
             !---------------------------------------------------------------------------

             if (verbose_dirichlet .and. main_task) then
                write(iulog,*) 'Call Dirichlet_bc'
             endif

             call t_startf('glissade_dirichlet_2d')
             call dirichlet_boundary_conditions_2d(nx,                       ny,                      &
                                                   nhalo,                                             &
                                                   active_vertex,                                     &
                                                   umask_dirichlet(nz,:,:),  vmask_dirichlet(nz,:,:), &
                                                   uvel_2d,                  vvel_2d,                 &
                                                   Auu_2d,                   Auv_2d,                  &
                                                   Avu_2d,                   Avv_2d,                  &
                                                   bu_2d,                    bv_2d)
             call t_stopf('glissade_dirichlet_2d')

             !---------------------------------------------------------------------------
             ! Halo updates for matrices
             !---------------------------------------------------------------------------
     
             call t_startf('glissade_halo_Aijm')
             do m = 1, nNodeNeighbors_2d
                call staggered_parallel_halo(Auu_2d(:,:,m), parallel)
                call staggered_parallel_halo(Auv_2d(:,:,m), parallel)
                call staggered_parallel_halo(Avu_2d(:,:,m), parallel)
                call staggered_parallel_halo(Avv_2d(:,:,m), parallel)
             enddo
             call t_stopf('glissade_halo_Aijm')

             !---------------------------------------------------------------------------
             ! Halo updates for rhs vectors
             ! (Not sure if these are necessary, but leaving them for now)
             !---------------------------------------------------------------------------

             call t_startf('glissade_halo_bxxs')
             call staggered_parallel_halo(bu_2d(:,:), parallel)
             call staggered_parallel_halo(bv_2d(:,:), parallel)
             call t_stopf('glissade_halo_bxxs')

             !---------------------------------------------------------------------------
             ! Check symmetry of assembled matrix
             !
             ! There may be small differences from perfect symmetry due to roundoff errors.
             ! If sufficiently small, these differences are fixed by averaging the two values
             !  that should be symmetric.  Otherwise the code aborts.
             !---------------------------------------------------------------------------

             if (check_symmetry) then
                call t_startf('glissade_chk_symmetry')
                call check_symmetry_assembled_matrix_2d(nx,            ny,       &
                                                        parallel,                &
                                                        active_vertex,           &
                                                        Auu_2d,        Auv_2d,   &
                                                        Avu_2d,        Avv_2d)
                call t_stopf('glissade_chk_symmetry')
             endif

             !---------------------------------------------------------------------------
             ! Count the total number of nonzero entries on all processors.
             !---------------------------------------------------------------------------

             call count_nonzeros_2d(nx,            ny,       &
                                    parallel,                &
                                    Auu_2d,        Auv_2d,   &
                                    Avu_2d,        Avv_2d,   &
                                    active_vertex,           &
                                    nNonzeros)

             if (write_matrix) then
                if (counter == 1) then    ! first outer iteration only
                   call t_startf('glissade_wrt_mat')
                   call write_matrix_elements_2d(nx,             ny,            &
                                                 nVerticesSolve, vertexID,      &
                                                 iVertexIndex,   jVertexIndex,  &
                                                 Auu_2d,         Auv_2d,        &
                                                 Avu_2d,         Avv_2d,        &
                                                 bu_2d,          bv_2d)
                   call t_stopf('glissade_wrt_mat')
                endif
             endif   ! write_matrix

             if (verbose_matrix .and. this_rank==rtest) then
                i = itest
                j = jtest
                write(iulog,*) ' '
                write(iulog,*) 'After assembly and BC, i, j =', i, j
                write(iulog,*) 'Auu_2d sum =', sum(Auu_2d(i,j,:))
                write(iulog,*) 'Auv_2d sum =', sum(Auv_2d(i,j,:))
                write(iulog,*) 'Avu_2d sum =', sum(Avu_2d(i,j,:))
                write(iulog,*) 'Avv_2d sum =', sum(Avv_2d(i,j,:))

                m = indxA_2d(0,0)  ! diag entry
                write(iulog,*) ' '
                write(iulog,*) 'Matrix row properties, j =', j
                write(iulog,*) ' '
                write(iulog,*) 'i, diag, max, min, sum:'
                do i = itest-3, itest+3
                   write(iulog,*) ' '
                   write(iulog,'(a8, i4, 4f20.8)') 'Auu_2d:', i, Auu_2d(i,j,m), maxval(Auu_2d(i,j,:)), &
                                                      minval(Auu_2d(i,j,:)),   sum(Auu_2d(i,j,:))
                   write(iulog,'(a8, i4, 4f20.8)') 'Auv_2d:', i, Auv_2d(i,j,m), maxval(Auv_2d(i,j,:)), &
                                                      minval(Auv_2d(i,j,:)),   sum(Auv_2d(i,j,:))
                   write(iulog,'(a8, i4, 4f20.8)') 'Avv_2d:', i, Avv_2d(i,j,m), maxval(Avv_2d(i,j,:)), &
                                                      minval(Avv_2d(i,j,:)),   sum(Avv_2d(i,j,:))
                enddo

                i = itest
                j = jtest
                write(iulog,*) 'i, j =', i, j
                write(iulog,*) 'iA, jA, Auu_2d, Auv_2d, Avu_2d, Avv_2d:'
                do jA = -1, 1
                   do iA = -1, 1
                      m = indxA_2d(iA,jA)
                      write(iulog,*) iA, jA, Auu_2d(i,j,m), Auv_2d(i,j,m), Avu_2d(i,j,m), Avv_2d(i,j,m)
                   enddo
                enddo
                write(iulog,*) ' '
                write(iulog,*) 'bu_2d =', bu_2d(i,j)
                write(iulog,*) 'bv_2d =', bv_2d(i,j)

             endif  ! verbose_matrix

          else  ! assemble 3D matrix

             ! save current velocity
             usav(:,:,:) = uvel(:,:,:)
             vsav(:,:,:) = vvel(:,:,:)

             !---------------------------------------------------------------------------
             ! Assemble the stiffness matrix A
             !---------------------------------------------------------------------------

             call t_startf('glissade_assemble_3d')
             call assemble_stiffness_matrix_3d(nx,               ny,              &
                                               nz,               sigma,           &
                                               nhalo,                             &
                                               itest,   jtest,   rtest,           &
                                               active_cell,                       &
                                               xVertex,          yVertex,         &
                                               uvel,             vvel,            &
                                               stagusrf,         stagthck,        &
                                               flwafact,         whichapprox,     &
                                               whichefvs,        efvs,            &
                                               efvs_constant,    effstrain_min,   &
                                               Auu,              Auv,             &
                                               Avu,              Avv)
             call t_stopf('glissade_assemble_3d')
          
             if (verbose_matrix .and. this_rank==rtest) write(iulog,*) 'Assembled the 3D stiffness matrix'

             !---------------------------------------------------------------------------
             ! Incorporate basal sliding boundary conditions, based on beta_internal
             !---------------------------------------------------------------------------

             if (whichbabc /= HO_BABC_NO_SLIP) then

                call basal_sliding_bc_3d(&
                     nx,                  ny,              &
                     nNodeNeighbors_3d,   nhalo,           &
                     parallel,                             &
                     dx,                  dy,              &
                     itest,    jtest,     rtest,           &
                     active_cell,         active_vertex,   &
                     beta_internal,       lsrf,            &
                     xVertex,             yVertex,         &
                     whichassemble_beta,                   &
                     Auu(:,nz,:,:),       Avv(:,nz,:,:))

             endif   ! whichbabc

             !---------------------------------------------------------------------------
             ! Set rhs to the load vector
             ! The rhs can be adjusted below to account for inhomogeneous Dirichlet BC
             !---------------------------------------------------------------------------

             bu(:,:,:) = loadu(:,:,:)
             bv(:,:,:) = loadv(:,:,:)

             !---------------------------------------------------------------------------
             ! Incorporate Dirichlet boundary conditions (prescribed uvel and vvel)
             !---------------------------------------------------------------------------

             if (verbose_dirichlet .and. main_task) write(iulog,*) 'Call Dirichlet_bc'

             call t_startf('glissade_dirichlet_3d')
             call dirichlet_boundary_conditions_3d(nx,              ny,                &
                                                   nz,              nhalo,             &
                                                   active_vertex,                      &
                                                   umask_dirichlet, vmask_dirichlet,   &
                                                   uvel,            vvel,              &
                                                   Auu,             Auv,               &
                                                   Avu,             Avv,               &
                                                   bu,              bv)
             call t_stopf('glissade_dirichlet_3d')
          
             !---------------------------------------------------------------------------
             ! Halo updates for matrices
             !---------------------------------------------------------------------------
          
             call t_startf('glissade_halo_Axxs')
             call staggered_parallel_halo(Auu(:,:,:,:), parallel)
             call staggered_parallel_halo(Auv(:,:,:,:), parallel)
             call staggered_parallel_halo(Avu(:,:,:,:), parallel)
             call staggered_parallel_halo(Avv(:,:,:,:), parallel)
             call t_stopf('glissade_halo_Axxs')
          
             !---------------------------------------------------------------------------
             ! Halo updates for rhs vectors
             ! (Not sure if these are necessary, but leaving them for now)
             !---------------------------------------------------------------------------

             call t_startf('glissade_halo_bxxs')
             call staggered_parallel_halo(bu(:,:,:), parallel)
             call staggered_parallel_halo(bv(:,:,:), parallel)
             call t_stopf('glissade_halo_bxxs')

             !---------------------------------------------------------------------------
             ! Check symmetry of assembled matrix
             !
             ! There may be small differences from perfect symmetry due to roundoff errors.
             ! If sufficiently small, these differences are fixed by averaging the two values
             !  that should be symmetric.  Otherwise the code aborts.
             !
             ! Note: It might be OK to skip this check for production code.  However,
             !       small violations of symmetry are not tolerated well by some solvers.
             !       For example, the SLAP PCG solver with incomplete Cholesky preconditioning
             !       can crash if symmetry is not perfect.
             !---------------------------------------------------------------------------

             if (check_symmetry) then
                call t_startf('glissade_chk_symmetry')
                call check_symmetry_assembled_matrix_3d(nx,    ny,     nz,       &
                                                        parallel,                &
                                                        active_vertex,           &
                                                        Auu,           Auv,      &
                                                        Avu,           Avv)
                call t_stopf('glissade_chk_symmetry')
             endif

             !---------------------------------------------------------------------------
             ! Count the total number of nonzero entries on all processors.
             !---------------------------------------------------------------------------

             call count_nonzeros_3d(nx,     ny,    nz,      &
                                    parallel,               &
                                    Auu,           Auv,     &
                                    Avu,           Avv,     &
                                    active_vertex,          &
                                    nNonzeros)

             if (write_matrix) then
                if (counter == 1) then    ! first outer iteration only
                   call t_startf('glissade_wrt_mat')
                   call write_matrix_elements_3d(nx,          ny,         nz,         &
                                                 nNodesSolve, nodeID,                 &
                                                 iNodeIndex,  jNodeIndex, kNodeIndex, &
                                                 Auu,         Auv,                    &
                                                 Avu,         Avv,                    &
                                                 bu,          bv)
                   call t_stopf('glissade_wrt_mat')
                endif
             endif   ! write_matrix

             if (verbose_matrix .and. this_rank==rtest) then
                i = itest
                j = jtest
                k = ktest
                write(iulog,*) ' '
                write(iulog,*) 'i,j,k =', i, j, k
                write(iulog,*) 'Auu sum =', sum(Auu(:,k,i,j))
                write(iulog,*) 'Auv sum =', sum(Auv(:,k,i,j))
                write(iulog,*) 'Avu sum =', sum(Avu(:,k,i,j))
                write(iulog,*) 'Avv sum =', sum(Avv(:,k,i,j))
                write(iulog,*) ' '
                write(iulog,*) 'iA, jA, kA, Auu, Auv, Avu, Avv:'
                do kA = -1, 1
                   do jA = -1, 1
                      do iA = -1, 1
                         m = indxA_3d(iA,jA,kA)
                         write(iulog,*) iA, jA, kA, Auu(m,k,i,j), Auv(m,k,i,j), Avu(m,k,i,j), Avv(m,k,i,j)
                      enddo
                   enddo
                enddo
                write(iulog,*) 'i, j, k: ', i, j, k
                write(iulog,*) 'bu =', bu(k,i,j)
                write(iulog,*) 'bv =', bv(k,i,j)
                j = jtest
                k = ktest
                m = indxA_3d(0,0,0)  ! diag entry
                write(iulog,*) ' '
                write(iulog,*) 'Matrix row properties, j, k =', j, k
                write(iulog,*) ' '
                write(iulog,*) 'i, diag, max, min, sum:'
                do i = 1, nx-1
                   write(iulog,*) ' '
                   write(iulog,'(a4, i4, 4f16.8)') 'Auu:', i, Auu(m,k,i,j), maxval(Auu(:,k,i,j)), minval(Auu(:,k,i,j)), sum(Auu(:,k,i,j))
                   write(iulog,'(a4, i4, 4f16.8)') 'Auv:', i, Auv(m,k,i,j), maxval(Auv(:,k,i,j)), minval(Auv(:,k,i,j)), sum(Auv(:,k,i,j))
                   write(iulog,'(a4, i4, 4f16.8)') 'Avv:', i, Avv(m,k,i,j), maxval(Avv(:,k,i,j)), minval(Avv(:,k,i,j)), sum(Avv(:,k,i,j))
                enddo
             
             endif  ! verbose_matrix

          endif  ! assemble 2d or 3d matrix

          !---------------------------------------------------------------------------
          ! If the matrix has no nonzero entries, then set velocities to zero and exit the solver.
          !---------------------------------------------------------------------------

          if (verbose_matrix .and. main_task) write(iulog,*) 'nNonzeros in matrix =', nNonzeros

          if (nNonzeros == 0) then  ! clean up and return

             resid_u(:,:,:) = 0.d0
             resid_v(:,:,:) = 0.d0
             bu(:,:,:) = 0.d0
             bv(:,:,:) = 0.d0
             uvel(:,:,:) = 0.d0
             vvel(:,:,:) = 0.d0

             call t_startf('glissade_velo_higher_scale_output')
             call glissade_velo_higher_scale_output(whichcalving_front,     &
                                                    thck,    topg,          &
                                                    flwa,    efvs,          &
                                                    beta_internal,          &
                                                    resid_u, resid_v,       &
                                                    bu,      bv,            &
                                                    uvel,    vvel,          &
                                                    uvel_2d, vvel_2d,       &
                                                    btractx, btracty,       &
                                                    taudx,   taudy,         &
                                                    tau_xz,  tau_yz,        &
                                                    tau_xx,  tau_yy,        &
                                                    tau_xy,  tau_eff)
             call t_stopf('glissade_velo_higher_scale_output')
          
             if (main_task) write(iulog,*) 'No nonzeros in matrix; exit glissade_velo_higher_solve'
             return

          endif  ! nNonzeros = 0

          !---------------------------------------------------------------------------
          ! Given the current velocity solution u_n and the assembled matrix A(u_n),
          !  compute the residual R = Au - b and its L2 norm.
          ! If the norm satisfies a convergence criterion, the code will exit the
          !  outer nonlinear loop below.
          ! Note: The residual is a global quantity that is broadcast to each local task,
          !       so all tasks will agree on whether or not the solver has converged
          !       (and for accelerated Picard, whether the residual norm has decreased).
          !---------------------------------------------------------------------------

          if (solve_2d) then

             call t_startf('glissade_resid_vec')
             call compute_residual_vector_2d(nx,            ny,            &
                                             parallel,                     &
                                             itest,  jtest, rtest,         &
                                             active_vertex,                &
                                             Auu_2d,        Auv_2d,        &
                                             Avu_2d,        Avv_2d,        &
                                             bu_2d,         bv_2d,         &
                                             uvel_2d,       vvel_2d,       &
                                             resid_u_2d,    resid_v_2d,    &
                                             L2_norm,       L2_norm_relative)
             call t_stopf('glissade_resid_vec')

             call t_startf('glissade_accel_picard')
             if (accel_picard) then

                if (verbose_picard) then
                   if (this_rank == rtest) then
                      write(iulog,*) ' '
                      write(iulog,*) 'Saved L2 norm, new L2 norm:', L2_norm_alpha_sav, L2_norm
                   endif
                   call point_diag(resid_u_2d, 'resid_u_2d', itest, jtest, rtest, 7, 7, '(e10.3)')
                   call point_diag(uvel_2d, 'uvel_2d', itest, jtest, rtest, 7, 7, '(f10.3)')
                endif

                if (counter >= 2) then

                   call evaluate_accelerated_picard_2d(nx,            ny,                   &
                                                       L2_norm,       L2_norm_large,        &
                                                       L2_norm_alpha_sav,                   &
                                                       alpha_accel,   alpha_accel_max,      &
                                                       gamma_accel,   resid_reduction_threshold,  &
                                                       uvel_2d,       vvel_2d,              &
                                                       Auu_2d,        Auv_2d,               &
                                                       Avu_2d,        Avv_2d,               &
                                                       uvel_2d_old,   vvel_2d_old,          &
                                                       duvel_2d,      dvvel_2d,             &
                                                       uvel_2d_sav,   vvel_2d_sav,          &
                                                       Auu_2d_sav,    Auv_2d_sav,           &
                                                       Avu_2d_sav,    Avv_2d_sav,           &
                                                       beta_internal, beta_internal_sav,    &
                                                       assembly_is_done)

                else   ! counter = 1

                   ! proceed to the matrix solution
                   assembly_is_done = .true.

                   if (verbose_picard .and. main_task) then
                      write(iulog,*) 'nonlinear counter = 1; continue to matrix solver'
                   endif

                endif  ! counter >= 2

             else   ! accel_picard = F

                ! proceed to the matrix solution
                assembly_is_done = .true.

             endif   ! accel_picard
             call t_stopf('glissade_accel_picard')

          else  ! 3D solve

             call t_startf('glissade_resid_vec')
             call compute_residual_vector_3d(nx,    ny,     nz,            &
                                             parallel,                     &
                                             itest,  jtest, rtest,         &
                                             active_vertex,                &
                                             Auu,           Auv,           &
                                             Avu,           Avv,           &
                                             bu,            bv,            &
                                             uvel,          vvel,          &
                                             resid_u,       resid_v,       &
                                             L2_norm,       L2_norm_relative)
             call t_stopf('glissade_resid_vec')

             call t_startf('glissade_accel_picard')
             if (accel_picard) then

                if (verbose_picard) then
                   if (this_rank == rtest) then
                      write(iulog,*) ' '
                      write(iulog,*) 'Saved L2 norm, new L2 norm:', L2_norm_alpha_sav, L2_norm
                   endif
                   call point_diag(resid_u(1,:,:), 'resid_u, k = 1', itest, jtest, rtest, 7, 7, '(e10.3)')
                   call point_diag(uvel(1,:,:), 'uvel', itest, jtest, rtest, 7, 7, '(f14.9)')
                endif

                if (counter >= 2) then

                   call evaluate_accelerated_picard_3d(L2_norm,       L2_norm_large,        &
                                                       L2_norm_alpha_sav,                   &
                                                       alpha_accel,   alpha_accel_max,      &
                                                       gamma_accel,   resid_reduction_threshold,  &
                                                       uvel,          vvel,                 &
                                                       Auu,           Auv,                  &
                                                       Avu,           Avv,                  &
                                                       uvel_old,      vvel_old,             &
                                                       duvel,         dvvel,                &
                                                       uvel_sav,      vvel_sav,             &
                                                       Auu_sav,       Auv_sav,              &
                                                       Avu_sav,       Avv_sav,              &
                                                       beta_internal, beta_internal_sav,    &
                                                       assembly_is_done)

                else   ! counter = 1

                   ! proceed to the matrix solution
                   assembly_is_done = .true.

                   !WHL - debug
                   if (verbose_picard .and. main_task) then
                      write(iulog,*) 'nonlinear counter = 1; continue to matrix solver'
                   endif

                endif  ! counter >= 2

             else   ! accel_picard = F

                ! proceed to the matrix solution
                assembly_is_done = .true.

             endif   ! accel_picard
             call t_stopf('glissade_accel_picard')

          endif   ! 2D or 3D solve

       enddo  ! while (.not.assembly_is_done)

       ! Optional diagnostics

       if (verbose_beta .and. counter > 1 .and. mod(counter-1,12)==0) then
!!       if (verbose_beta) then

          call point_diag(log10(max(beta_internal,1.d-99)), 'log_beta', itest, jtest, rtest, 7, 7, '(f10.5)')
          if (solve_2d) then
             call point_diag(uvel_2d, 'Mean uvel (m/yr)', itest, jtest, rtest, 7, 7)
             call point_diag(vvel_2d, 'Mean vvel (m/yr)', itest, jtest, rtest, 7, 7)
          else	 ! 3D velocity solve
             call point_diag(uvel(nz,:,:), 'Basal uvel (m/yr)', itest, jtest, rtest, 7, 7)
             call point_diag(vvel(nz,:,:), 'Basal vvel (m/yr)', itest, jtest, rtest, 7, 7)
             call point_diag(uvel(1,:,:), 'Sfc uvel (m/yr)', itest, jtest, rtest, 7, 7)
             call point_diag(vvel(1,:,:), 'Sfc vvel (m/yr)', itest, jtest, rtest, 7, 7)
          endif

          if (whichbabc == HO_BABC_BETA_BPMP) then
             call point_diag(stagbedtemp, 'staggered bed temp', itest, jtest, rtest, 7, 7, '(f10.5)')
             call point_diag(stagbedpmp, 'staggered bed pmp', itest, jtest, rtest, 7, 7, '(f10.5)')
             call point_diag(model%basal_physics%bpmp_mask, 'bpmp mask', itest, jtest, rtest, 7, 7)
          endif  ! HO_BABC_BETA_BPMP

          if (whichbabc == HO_BABC_YIELD_PICARD) then
             call point_diag(model%basal_physics%mintauf, 'mintauf', itest, jtest, rtest, 7, 7, '(e10.3)')
          endif

       endif   ! verbose_beta

       !---------------------------------------------------------------------------
       ! Solve the 2D or 3D matrix system.
       !---------------------------------------------------------------------------

       !---------------------------------------------------------------------------
       ! First, handle a possible problem case: Set uvel_2d = vvel_2d = 0 for the case
       !  of a Dirichlet no-slip basal BC and a 2D L1L2 solve.
       ! It would be pointless to apply the SSA to a no-slip problem, but this case
       !  is included for completeness.
       ! Note: DIVA computes a nonzero 2D velocity with a no-slip BC.
       !---------------------------------------------------------------------------

       if ((whichapprox==HO_APPROX_L1L2 .or. whichapprox==HO_APPROX_SSA) .and. &
              whichbabc==HO_BABC_NO_SLIP) then

          ! zero out velocity and related fields
          uvel_2d(:,:) = 0.d0
          vvel_2d(:,:) = 0.d0
          resid_u_2d(:,:) = 0.d0
          resid_v_2d(:,:) = 0.d0
          L2_norm = 0.d0   ! to force convergence on first step
          L2_norm_relative = 0.d0

       elseif (whichsparse == HO_SPARSE_PCG_STANDARD .or.   &
               whichsparse == HO_SPARSE_PCG_CHRONGEAR) then   ! native PCG solver
                                                              ! works for both serial and parallel runs

          if (solve_2d) then

             !------------------------------------------------------------------------
             ! Call linear PCG solver, compute uvel and vvel on local processor
             !------------------------------------------------------------------------

             !WHL - Passing itest, jtest, rtest for debugging

             call t_startf('glissade_pcg_slv_struct')

             if (whichsparse == HO_SPARSE_PCG_CHRONGEAR) then   ! use Chronopoulos-Gear PCG algorithm
                                                                ! (better scaling for large problems)
                call pcg_solver_chrongear_2d(nx,           ny,            &
                                             parallel,                    &
                                             indxA_2d,     active_vertex, &
                                             Auu_2d,       Auv_2d,        &
                                             Avu_2d,       Avv_2d,        &
                                             bu_2d,        bv_2d,         &
                                             uvel_2d,      vvel_2d,       &
                                             whichprecond, linear_solve_ncheck, &
                                             linear_tolerance,            &
                                             linear_maxiters,             &
                                             err,          niters,        &
                                             itest, jtest, rtest)

             else   ! use standard PCG algorithm
             
                call pcg_solver_standard_2d(nx,           ny,            &
                                            parallel,                    &
                                            indxA_2d,     active_vertex, &
                                            Auu_2d,       Auv_2d,        &
                                            Avu_2d,       Avv_2d,        &
                                            bu_2d,        bv_2d,         &
                                            uvel_2d,      vvel_2d,       &
                                            whichprecond, linear_solve_ncheck, &
                                            linear_tolerance,            &
                                            linear_maxiters,             &
                                            err,          niters,        &
                                            itest, jtest, rtest)

             endif  ! whichsparse

          else   ! 3D solve

             !------------------------------------------------------------------------
             ! Call linear PCG solver, compute uvel and vvel on local processor
             !------------------------------------------------------------------------

             !WHL - Passing itest, jtest, rtest for debugging

             call t_startf('glissade_pcg_slv_struct')

             if (whichsparse == HO_SPARSE_PCG_CHRONGEAR) then   ! use Chronopoulos-Gear PCG algorithm
                                                                ! (better scaling for large problems)

                call pcg_solver_chrongear_3d(nx,           ny,            &
                                             nz,           parallel,      &
                                             indxA_2d,     indxA_3d,      &
                                             active_vertex,               &
                                             Auu,          Auv,           &
                                             Avu,          Avv,           &
                                             bu,           bv,            &
                                             uvel,         vvel,          &
                                             whichprecond, linear_solve_ncheck, &
                                             linear_tolerance,            &
                                             linear_maxiters,             &
                                             err,          niters,        &
                                             itest, jtest, rtest)

             else   ! use standard PCG algorithm
             
                call pcg_solver_standard_3d(nx,           ny,            &
                                            nz,           parallel,      &
                                            indxA_3d,     active_vertex, &
                                            Auu,          Auv,           &
                                            Avu,          Avv,           &
                                            bu,           bv,            &
                                            uvel,         vvel,          &
                                            whichprecond, linear_solve_ncheck, &
                                            linear_tolerance,            &
                                            linear_maxiters,             &
                                            err,          niters,        &
                                            itest, jtest, rtest)

             endif   ! whichsparse

          endif      ! whichapprox

          call t_stopf('glissade_pcg_slv_struct')

#ifdef TRILINOS
       elseif (whichsparse == HO_SPARSE_TRILINOS) then   ! solve with Trilinos

          !------------------------------------------------------------------------
          ! Compute the residual vector and its L2 norm
          !------------------------------------------------------------------------

          if (solve_2d) then

             !------------------------------------------------------------------------
             ! Given Auu, bu, etc., assemble the matrix and RHS in a form
             ! suitable for Trilinos
             !------------------------------------------------------------------------

             if (verbose_trilinos .and. main_task) then
                write(iulog,*) 'L2_norm, L2_target =', L2_norm, L2_target
                write(iulog,*) 'Assemble matrix for Trilinos'
             endif

             call t_startf('glissade_trilinos_assemble')
             call trilinos_assemble_2d(nx,             ny,               &   
                                       nVerticesSolve, global_vertex_id, &
                                       iVertexIndex,   jVertexIndex,     &
                                       indxA_2d,       Afill_2d,         &
                                       Auu_2d,         Auv_2d,           &
                                       Avu_2d,         Avv_2d,           &
                                       bu_2d,          bv_2d)
             call t_stopf('glissade_trilinos_assemble')

             !------------------------------------------------------------------------
             ! Solve the linear matrix problem
             !------------------------------------------------------------------------

             if (verbose_trilinos .and. main_task) write(iulog,*) 'Solve the matrix using Trilinos'

             call t_startf('glissade_vel_tgs')
             call solvevelocitytgs(velocityResult)
             call t_stopf('glissade_vel_tgs')

             !------------------------------------------------------------------------
             ! Put the velocity solution back into 2D arrays
             !------------------------------------------------------------------------

             call t_startf('glissade_trilinos_post')
             call trilinos_extract_velocity_2d(nx,            ny,           &
                                               nVerticesSolve,              &
                                               iVertexIndex,  jVertexIndex, &
                                               velocityResult,              &
                                               uvel_2d,       vvel_2d)
             call t_stopf('glissade_trilinos_post')

          else   ! 3D solve

             !------------------------------------------------------------------------
             ! Given Auu, bu, etc., assemble the matrix and RHS in a form
             ! suitable for Trilinos
             !------------------------------------------------------------------------

             if (verbose_trilinos .and. main_task) then
                write(iulog,*) 'L2_norm, L2_target =', L2_norm, L2_target
                write(iulog,*) 'Assemble matrix for Trilinos'
             endif

             call t_startf('glissade_trilinos_assemble')
             call trilinos_assemble_3d(nx,           ny,            nz,  &   
                                       nNodesSolve,  global_node_id,     &
                                       iNodeIndex,   jNodeIndex,    kNodeIndex,  &
                                       indxA_3d,     Afill,              &
                                       Auu,          Auv,                &
                                       Avu,          Avv,                &
                                       bu,           bv)
             call t_stopf('glissade_trilinos_assemble')

             !------------------------------------------------------------------------
             ! Solve the linear matrix problem
             !------------------------------------------------------------------------

             if (verbose_trilinos .and. main_task) write(iulog,*) 'Solve the matrix using Trilinos'

             call t_startf('glissade_vel_tgs')
             call solvevelocitytgs(velocityResult)
             call t_stopf('glissade_vel_tgs')

             !------------------------------------------------------------------------
             ! Put the velocity solution back into 3D arrays
             !------------------------------------------------------------------------

             call t_startf('glissade_trilinos_post')
             call trilinos_extract_velocity_3d(nx,          ny,         nz,  &
                                               nNodesSolve,                  &
                                               iNodeIndex,  jNodeIndex, kNodeIndex, &
                                               velocityResult,               &
                                               uvel,        vvel)
             call t_stopf('glissade_trilinos_post')

          endif  ! whichapprox
#endif

       else   ! one-processor SLAP solve   
          
          !------------------------------------------------------------------------
          ! Given the stiffness matrices (Auu, etc.) and rhs vector (bu, bv) in
          !  structured format, form the global matrix and rhs in SLAP format.
          !------------------------------------------------------------------------

          if (verbose) write(iulog,*) 'Form global matrix in SLAP sparse format'
 
          matrix%order = matrix_order
          matrix%nonzeros = max_nonzeros
          matrix%symmetric = .false.   ! Although the matrix is symmetric, we don't pass it to SLAP in symmetric form

          call t_startf('glissade_slap_preprocess')
          if (solve_2d) then

             call slap_preprocess_2d(nx,             ny,           &   
                                     nVerticesSolve, vertexID,     &
                                     iVertexIndex,   jVertexIndex, &
                                     indxA_2d,                     &
                                     Auu_2d,         Auv_2d,       &
                                     Avu_2d,         Avv_2d,       &
                                     bu_2d,          bv_2d,        &
                                     uvel_2d,        vvel_2d,      &
                                     matrix_order,                 &
                                     matrix,         rhs,          &
                                     answer)

          else   ! 3D solve

             call slap_preprocess_3d(nx,           ny,          nz, &   
                                     nNodesSolve,  nodeID,      &
                                     iNodeIndex,   jNodeIndex,  &
                                     kNodeIndex,   indxA_3d,    &
                                     Auu,          Auv,         &
                                     Avu,          Avv,         &
                                     bu,           bv,          &
                                     uvel,         vvel,        &
                                     matrix_order,              &
                                     matrix,       rhs,         &
                                     answer)

          endif  ! whichapprox
          call t_stopf('glissade_slap_preprocess')

          !------------------------------------------------------------------------
          ! Compute the residual vector and its L2_norm
          ! Note: The residual was already computed above using the native Fortran
          !       data structures. May be OK to omit this call.
          !------------------------------------------------------------------------

          call t_startf('glissade_slap_resid_vec')
          call slap_compute_residual_vector(matrix,  answer,    &
                                            rhs,     resid_vec, &
                                            L2_norm, L2_norm_relative)
          call t_stopf('glissade_slap_resid_vec')

          if (verbose_residual .and. main_task) then
             write(iulog,*) 'L2_norm of residual =', L2_norm
          endif

          !------------------------------------------------------------------------
          ! Solve the linear matrix problem
          !------------------------------------------------------------------------

          call t_startf('glissade_easy_slv')
          call sparse_easy_solve(matrix, rhs,    answer,  &
                                 err,    niters, whichsparse)
          call t_stopf('glissade_easy_slv')

          !------------------------------------------------------------------------
          ! Put the velocity solution back into the uvel and vvel arrays
          !------------------------------------------------------------------------

          call t_startf('glissade_slap_post')

          if (solve_2d) then

             call slap_postprocess_2d(nVerticesSolve,              &
                                      iVertexIndex, jVertexIndex,  &
                                      answer,       resid_vec,     &
                                      uvel_2d,      vvel_2d,       &
                                      resid_u_2d,   resid_v_2d)

          else   ! 3D solve

             call slap_postprocess_3d(nNodesSolve,                            &
                                      iNodeIndex,   jNodeIndex,  kNodeIndex,  &
                                      answer,       resid_vec,                &
                                      uvel,         vvel,                     &
                                      resid_u,      resid_v)

          endif   ! whichapprox

          call t_stopf('glissade_slap_post')

       endif   ! whichsparse 

       if (whichsparse /= HO_SPARSE_TRILINOS) then
          ! niters isn't set when using the trilinos solver
          if (main_task .and. verbose_solver) then
             write(iulog,*) 'Solved the linear system, niters, err =', niters, err
          endif
       end if

       if (solve_2d) then

          !------------------------------------------------------------------------
          ! Halo updates for uvel and vvel
          !------------------------------------------------------------------------

          call t_startf('glissade_halo_xvel')
          call staggered_parallel_halo(uvel_2d, parallel)
          call staggered_parallel_halo(vvel_2d, parallel)
          call t_stopf('glissade_halo_xvel')

          if (verbose_velo .and. this_rank==rtest) then
             i = itest
             j = jtest
             write(iulog,*) 'rank, i, j, uvel_2d, vvel_2d (m/yr):', &
                      this_rank, i, j, uvel_2d(i,j), vvel_2d(i,j)               
          endif

          !---------------------------------------------------------------------------
          ! Compute residual quantities based on the velocity solution
          !---------------------------------------------------------------------------

          call t_startf('glissade_resid_vec2')
          call compute_residual_velocity_2d(whichresid,    parallel,      &
                                            uvel_2d,       vvel_2d,       &
                                            usav_2d,       vsav_2d,       &
                                            resid_velo)
          call t_stopf('glissade_resid_vec2')

          if (accel_picard) then
             ! Compute the velocity difference (du,dv).
             ! For the next nonlinear iteration, we will see if extending the difference vector
             !  (i.e., alpha_accel > 1) reduces the residual.
             duvel_2d = uvel_2d - uvel_2d_old
             dvvel_2d = vvel_2d - vvel_2d_old
          endif

          !---------------------------------------------------------------------------
          ! Do some calculations specific to the DIVA scheme.
          ! Given the new 2D velocity, compute the new basal traction and 3D velocity.
          !---------------------------------------------------------------------------

          if (whichapprox == HO_APPROX_DIVA) then

             call compute_3d_velocity_diva(&
                  nx,                 ny,                   &
                  nz,                 sigma,                &
                  itest,   jtest,     rtest,                &
                  active_vertex,      diva_level_index,     &
                  ice_plus_land_mask,                       &
                  stag_omega,         omega_k,              &
                  beta_internal,                            &
                  beta_eff_x,         beta_eff_y,           &
                  stag_theta_slope_x, stag_theta_slope_y,   &
                  stag_diva_slope_factor_x,                 &
                  stag_diva_slope_factor_y,                 &
                  uvel_2d,            vvel_2d,              &
                  btractx,            btracty,              &
                  uvel,               vvel)

             call staggered_parallel_halo(uvel, parallel)
             call staggered_parallel_halo(vvel, parallel)

          endif   ! DIVA

       else   ! 3D solve

          !------------------------------------------------------------------------
          ! Halo updates for uvel and vvel
          !------------------------------------------------------------------------

          call t_startf('glissade_halo_xvel')
          call staggered_parallel_halo(uvel, parallel)
          call staggered_parallel_halo(vvel, parallel)
          call t_stopf('glissade_halo_xvel')
          
          if ((verbose_velo .or. verbose_residual) .and. this_rank==rtest) then
             i = itest
             j = jtest
             write(iulog,*) ' '
             write(iulog,*) 'iter, rank, i, j:', counter, this_rank, i, j
             write(iulog,*) 'k, uvel, vvel, resid_u, resid_v:'
             do k = 1, nz
                write(iulog,*) k, uvel(k,i,j), vvel(k,i,j), resid_u(k,i,j), resid_v(k,i,j)
             enddo
          endif

          !---------------------------------------------------------------------------
          ! Compute residual quantities based on the velocity solution
          !---------------------------------------------------------------------------

          call t_startf('glissade_resid_vec2')
          call compute_residual_velocity_3d(whichresid,    parallel,   &
                                            uvel,          vvel,       &
                                            usav,          vsav,       &
                                            resid_velo)
          call t_stopf('glissade_resid_vec2')

          if (accel_picard) then
             ! Compute the velocity difference (du,dv).
             ! For the next nonlinear iteration, we will see if extending the difference vector
             !  (i.e., alpha_accel > 1) reduces the residual.
             duvel = uvel - uvel_old
             dvvel = vvel - vvel_old
          endif

       endif ! 2D or 3D solve

       !---------------------------------------------------------------------------
       ! Write diagnostics (iteration number, max residual, and residual target
       !---------------------------------------------------------------------------

       if (main_task .and. verbose_solver) then
          if (whichresid == HO_RESID_L2NORM) then
             write(iulog,'(i4,2g20.6)') counter, L2_norm, L2_target
          elseif (whichresid == HO_RESID_L2NORM_RELATIVE) then
             write(iulog,'(i4,2g20.6)') counter, L2_norm_relative, L2_target_relative
          else
             write(iulog,'(i4,2g20.6)') counter, resid_velo, resid_target
          end if
       endif

       !---------------------------------------------------------------------------
       ! Update the outer loop stopping criterion
       !---------------------------------------------------------------------------

       if (whichresid == HO_RESID_L2NORM) then
          outer_it_criterion = L2_norm
          outer_it_target = L2_target           ! L2_target is currently set to 1.d-4 and held constant
       elseif (whichresid == HO_RESID_L2NORM_RELATIVE) then
          outer_it_criterion = L2_norm_relative
          outer_it_target = L2_target_relative  ! L2_target_relative is currently set to 1.d-7 and held constant
       else
          outer_it_criterion = resid_velo
          outer_it_target = resid_target   ! resid_target is currently a parameter = 1.d-4  
       end if

    enddo  ! while (outer_it_criterion >= outer_it_target .and. counter < maxiter_nonlinear)

    call t_stopf('glissade_vhs_nonlinear_loop')

    if (counter < maxiter_nonlinear) then
       converged_soln = .true.
       if (main_task) then
          write(iulog,*) 'Solution has converged: counter, err =', counter, L2_norm
       endif
    else
       converged_soln = .false.
       if (main_task) then
          write(iulog,*) 'Solution has NOT converged: counter, err =', counter, L2_norm
       endif
    endif

    if (verbose_glp) then
       call point_diag(beta_internal, 'beta_internal', itest, jtest, rtest, 7, 7)
    endif

    !------------------------------------------------------------------------------
    ! After a 2D solve, fill in the full 3D velocity arrays.
    ! This is a simple copy for SSA, but requires vertical integrals for L1L2 and DIVA.
    ! Note: We store redundant 3D residual info rather than creating a separate 2D residual array.
    !------------------------------------------------------------------------------

    if (whichapprox == HO_APPROX_SSA) then ! fill the 3D velocity and residual arrays with the 2D values

       do k = 1, nz
          uvel(k,:,:) = uvel_2d(:,:)
          vvel(k,:,:) = vvel_2d(:,:)
          resid_u(k,:,:) = resid_u_2d(:,:)
          resid_v(k,:,:) = resid_v_2d(:,:)
       enddo

    elseif (whichapprox == HO_APPROX_L1L2) then

       if (verbose_L1L2 .and. main_task) write(iulog,*) 'Compute 3D velocity, L1L2'

       uvel(nz,:,:) = uvel_2d(:,:)
       vvel(nz,:,:) = vvel_2d(:,:)
       do k = 1, nz
          resid_u(k,:,:) = resid_u_2d(:,:)
          resid_v(k,:,:) = resid_v_2d(:,:)
       enddo

       call compute_3d_velocity_L1L2(nx,               ny,              &
                                     nz,                                &
                                     sigma,            stagsigma,       &
                                     dx,               dy,              &
                                     itest,   jtest,   rtest,           &
                                     parallel,                          &
                                     ice_mask,         floating_mask,   &
                                     active_cell,      active_vertex,   &
                                     umask_dirichlet(nz,:,:),           &
                                     vmask_dirichlet(nz,:,:),           &
                                     xVertex,          yVertex,         &
                                     thck,             stagthck,        &
                                     usrf,                              &
                                     dusrf_dx,         dusrf_dy,        &
                                     flwa,             efvs,            &
                                     whichefvs,                         &
                                     uvel,             vvel)

       call staggered_parallel_halo(uvel, parallel)
       call staggered_parallel_halo(vvel, parallel)

    elseif (whichapprox == HO_APPROX_DIVA) then

       do k = 1, nz
          resid_u(k,:,:) = resid_u_2d(:,:)
          resid_v(k,:,:) = resid_v_2d(:,:)
       enddo

       if (verbose_diva .and. this_rank==rtest) then
          write(iulog,*) 'Computed 3D velocity, DIVA'
          i = itest
          j = jtest
          write(iulog,*) ' '
          write(iulog,*) 'i, j, beta, beta_eff_x, beta_eff_y:', &
               i, j, beta_internal(i,j), beta_eff_x(i,j), beta_eff_y(i,j)
          write(iulog,*) ' '
          i = itest
          j = jtest
          write(iulog,*) 'Computed 3D velocities, i, j =', i, j
          write(iulog,*) 'k, uvel, vvel:'
          do k = 1, nz
             write(iulog,*) k, uvel(k,i,j), vvel(k,i,j)
          enddo
          write(iulog,*) ' '
          write(iulog,*) 'Mean velocity =', uvel_2d(i,j), vvel_2d(i,j)
       endif

    endif   ! whichapprox

    !WHL - debug
    !TODO - One diagnostic to write out column velocities for any approximation
    if (whichapprox == HO_APPROX_BP .and. this_rank==rtest) then
       write(iulog,*) ' '
       i = itest
       j = jtest
       write(iulog,*) 'Computed 3D BP velocities, i, j =', i, j
       write(iulog,*) 'k, uvel, vvel:'
       do k = 1, nz
          write(iulog,*) k, uvel(k,i,j), vvel(k,i,j)
       enddo
       write(iulog,*) ' '
       !TODO - Make this more exact (based on sigma layers)
       write(iulog,*) 'Rough mean velocity =', sum(uvel(:,i,j))/nz, sum(vvel(:,i,j))/nz
    endif

    !------------------------------------------------------------------------------
    ! Compute the components of the 3D stress tensor.
    ! These are diagnostic, except that tau_eff is used in the temperature calculation.
    !------------------------------------------------------------------------------

    call compute_internal_stress(nx,            ny,            &
                                 nz,            sigma,         &
                                 nhalo,                        &
                                 itest, jtest,  rtest,         &
                                 active_cell,                  &
                                 xVertex,       yVertex,       &
                                 stagusrf,      stagthck,      &
                                 whichapprox,   flwafact,      &
                                 whichefvs,     efvs,          &
                                 efvs_constant, effstrain_min, &
                                 uvel,          vvel,          &
                                 tau_xz,        tau_yz,        &
                                 tau_xx,        tau_yy,        &
                                 tau_xy,        tau_eff)

    !------------------------------------------------------------------------------
    ! Compute the heat flux due to basal friction for each grid cell.
    !------------------------------------------------------------------------------

    call compute_basal_friction_heatflx(nx,            ny,            &
                                        nhalo,                        &
                                        itest, jtest,  rtest,         &
                                        active_cell,   active_vertex, &
                                        xVertex,       yVertex,       &
                                        uvel(nz,:,:),  vvel(nz,:,:),  &
                                        beta_internal, whichassemble_bfric,  &
                                        bfricflx)

    call parallel_halo(bfricflx, parallel)

    if (verbose_bfric) then
       call point_diag(bfricflx, 'basal friction heat flux (W/m2)', itest, jtest, rtest, 7, 7, '(e10.3)')
    endif

    !------------------------------------------------------------------------------
    ! Compute the components of basal traction.
    !------------------------------------------------------------------------------

    btractx(:,:) = beta_internal(:,:) * uvel(nz,:,:)
    btracty(:,:) = beta_internal(:,:) * vvel(nz,:,:)

    if (verbose_velo) then
       call point_diag(uvel(1,:,:), 'uvel, k = 1', itest, jtest, rtest, 7, 7)
       call point_diag(vvel(1,:,:), 'vvel, k = 1', itest, jtest, rtest, 7, 7)
       call point_diag(uvel(nz,:,:), 'uvel, k = nz', itest, jtest, rtest, 7, 7)
       call point_diag(vvel(nz,:,:), 'vvel, k = nz', itest, jtest, rtest, 7, 7)
       if (this_rank == rtest) then
          i = itest
          j = jtest
          write(iulog,*) 'max(uvel, vvel) =', maxval(uvel), maxval(vvel)
          write(iulog,*) 'New velocity: rank, i, j =', this_rank, i, j
          do k = 1, nz
             write(iulog,*) k, uvel(k,i,j), vvel(k,i,j)
          enddo
          if (solve_2d) write(iulog,*) '2D velo:', uvel_2d(i,j), vvel_2d(i,j)
       endif
    endif  ! verbose_velo

    !------------------------------------------------------------------------------
    ! Clean up
    !------------------------------------------------------------------------------

    call t_startf('glissade_vhs_cleanup')
    if (whichsparse <= HO_SPARSE_GMRES) then  ! using SLAP solver
       deallocate(matrix%row, matrix%col, matrix%val)
       deallocate(rhs, answer, resid_vec)
    endif

#ifdef TRILINOS
    if (whichsparse == HO_SPARSE_TRILINOS) then
       deallocate(active_owned_unknown_map)
       deallocate(velocityResult)
       if (solve_2d) then
          deallocate(Afill_2d)
       else
          deallocate(Afill)
       endif
    endif
#endif

    if (solve_2d) then
       deallocate(Auu_2d, Auv_2d, Avu_2d, Avv_2d)
       deallocate(bu_2d, bv_2d)
       deallocate(loadu_2d, loadv_2d)
       deallocate(usav_2d, vsav_2d)
       deallocate(resid_u_2d, resid_v_2d)
       if (accel_picard) then
          deallocate(uvel_2d_old, vvel_2d_old)
          deallocate(duvel_2d, dvvel_2d)
          deallocate(uvel_2d_sav, vvel_2d_sav)
          deallocate(Auu_2d_sav, Auv_2d_sav, Avu_2d_sav, Avv_2d_sav)
          deallocate(beta_internal_sav)
       endif
    else
       deallocate(Auu, Auv, Avu, Avv)
       !TODO - any other arrays to deallocate here?
       if (accel_picard) then
          deallocate(uvel_old, vvel_old)
          deallocate(duvel, dvvel)
          deallocate(uvel_sav, vvel_sav)
          deallocate(Auu_sav, Auv_sav, Avu_sav, Avv_sav)
          deallocate(beta_internal_sav)
       endif
    endif

    if (whichapprox == HO_APPROX_DIVA) then
       deallocate(beta_eff_x)
       deallocate(beta_eff_y)
       deallocate(omega)
       deallocate(omega_k)
       deallocate(stag_omega)
       deallocate(stag_omega_k)
       deallocate(efvs_qp_3d)
    endif

    !------------------------------------------------------------------------------
    ! Convert output variables to appropriate CISM units (generally dimensionless).
    ! Note: bfricflx already has the desired units (W/m^2).
    !------------------------------------------------------------------------------

!pw call t_startf('glissade_velo_higher_scale_output')
    call glissade_velo_higher_scale_output(whichcalving_front,     &
                                           thck,    topg,          &
                                           flwa,    efvs,          &
                                           beta_internal,          &
                                           resid_u, resid_v,       &
                                           bu,      bv,            &
                                           uvel,    vvel,          &
                                           uvel_2d, vvel_2d,       &
                                           btractx, btracty,       &
                                           taudx,   taudy,         &
                                           tau_xz,  tau_yz,        &
                                           tau_xx,  tau_yy,        &
                                           tau_xy,  tau_eff)
!pw call t_stopf('glissade_velo_higher_scale_output')
    call t_stopf('glissade_vhs_cleanup')

  end subroutine glissade_velo_higher_solve

!****************************************************************************

  subroutine glissade_velo_higher_scale_input(dx,      dy,            &
                                              whichcalving_front,     &
                                              thck,                   &
                                              topg,    eus,           &
                                              thklim,                 &
                                              thck_gradient_ramp,     &
                                              flwa,    efvs,          &
                                              btractx, btracty,       &
                                              uvel,    vvel,          &
                                              uvel_2d, vvel_2d)

    !--------------------------------------------------------
    ! Convert input variables (generally dimensionless)
    ! to appropriate units for the Glissade solver.
    !--------------------------------------------------------

    real(dp), intent(inout) ::   &
       dx, dy                  ! grid cell length and width 

    integer, intent(in) :: &
         whichcalving_front    ! = 1 for subgrid CF, else = 0

    real(dp), dimension(:,:), intent(inout) ::   &
       thck,                &  ! ice thickness
       topg                    ! elevation of topography

    real(dp), intent(inout) ::   &
       eus,                 &  ! eustatic sea level (= 0 by default)
       thklim,              &  ! minimum ice thickness for active grounded cells
       thck_gradient_ramp      ! thickness scale over which gradients are ramped up from zero to full value

    real(dp), dimension(:,:,:), intent(inout) ::  &
       flwa,   &               ! flow factor in units of Pa^(-n) yr^(-1)
       efvs                    ! effective viscosity (Pa yr)

    real(dp), dimension(:,:), intent(inout)  ::  &
       btractx, btracty,  &    ! components of basal traction (Pa)
       uvel_2d, vvel_2d        ! components of 2D velocity (m/yr)

    real(dp), dimension(:,:,:), intent(inout) ::  &
       uvel, vvel              ! components of 3D velocity (m/yr)

    !TODO - Remove this rescaling; use SI units (s instead of yr) in the code.

    ! rate factor: rescale from Pa^(-n) s^(-1) to Pa^(-n) yr^(-1)
    flwa = flwa * scyr

    ! effective viscosity: rescale from Pa s to Pa yr
    efvs = efvs / scyr

    ! ice velocity: rescale from m/s to m/yr
    uvel = uvel * scyr
    vvel = vvel * scyr
    uvel_2d = uvel_2d * scyr
    vvel_2d = vvel_2d * scyr

  end subroutine glissade_velo_higher_scale_input

!****************************************************************************

  subroutine glissade_velo_higher_scale_output(whichcalving_front,      &
                                               thck,    topg,           &
                                               flwa,    efvs,           &                                       
                                               beta_internal,           &
                                               resid_u, resid_v,        &
                                               bu,      bv,             &
                                               uvel,    vvel,           &
                                               uvel_2d, vvel_2d,        &
                                               btractx, btracty,        &
                                               taudx,   taudy,          &
                                               tau_xz,  tau_yz,         &
                                               tau_xx,  tau_yy,         &
                                               tau_xy,  tau_eff)

    !--------------------------------------------------------
    ! Convert output variables to appropriate CISM units
    ! (generally dimensionless)
    !--------------------------------------------------------

    integer, intent(in) :: &
         whichcalving_front    ! = 1 for subgrid CF, else = 0

    real(dp), dimension(:,:), intent(inout) ::  &
       thck,                 &  ! ice thickness
       topg                     ! elevation of topography

    real(dp), dimension(:,:,:), intent(inout) ::  &
       flwa,   &                ! flow factor in units of Pa^(-n) yr^(-1)
       efvs                     ! effective viscosity (Pa yr)

    real(dp), dimension(:,:), intent(inout)  ::  &
       beta_internal            ! basal traction parameter (Pa/(m/yr))

    real(dp), dimension(:,:,:), intent(inout) ::  &
       uvel, vvel,    &         ! components of 3D velocity (m/yr)
       resid_u, resid_v,  &     ! components of residual Ax - b (Pa/m)
       bu, bv                   ! components of b in Ax = b (Pa/m)

    real(dp), dimension(:,:), intent(inout) ::  &
       uvel_2d, vvel_2d,       &! components of 2D velocity (m/yr)
       btractx, btracty,       &! components of basal traction (Pa)
       taudx,   taudy           ! components of driving stress (Pa)

    real(dp), dimension(:,:,:), intent(inout) ::  &
       tau_xz, tau_yz,         &! vertical components of stress tensor (Pa)
       tau_xx, tau_yy, tau_xy, &! horizontal components of stress tensor (Pa)
       tau_eff                  ! effective stress (Pa)

    !TODO - Remove the rescaling of input and output fields, using SI units
    !       (s instead of yr) in the code

    ! Convert flow factor from Pa^(-n) yr^(-1) to Pa^(-n) s^(-1)
    flwa = flwa / scyr

    ! Convert effective viscosity from Pa yr to Pa s
    efvs = efvs * scyr

    ! Convert beta_internal from Pa/(m/yr) to Pa/(m/s)
    beta_internal = beta_internal * scyr

    ! Convert velocity from m/yr to m/s
    uvel = uvel / scyr
    vvel = vvel / scyr
    uvel_2d = uvel_2d / scyr
    vvel_2d = vvel_2d / scyr

  end subroutine glissade_velo_higher_scale_output

!****************************************************************************

  subroutine get_vertex_geometry(nx,             ny,                   &
                                 nz,             nhalo,                &
                                 parallel,                             &
                                 dx,             dy,                   &
                                 itest,  jtest,  rtest,                &
                                 ice_mask,                             &
                                 xVertex,        yVertex,              &
                                 active_cell,    active_vertex,        &
                                 nNodesSolve,    nVerticesSolve,       &
                                 nodeID,         vertexID,             &
                                 iNodeIndex,     jNodeIndex,  kNodeIndex, &
                                 iVertexIndex,   jVertexIndex)
                            
    !----------------------------------------------------------------
    ! Compute coordinates for each vertex.
    ! Identify and count the active cells and vertices for the finite-element calculations.
    ! Active cells include all cells that contain ice (thck > thklim) and border locally owned vertices.
    ! Active vertices include all vertices of active cells.
    !
    ! Also compute some indices needed for the SLAP and Trilinos solvers.
    !TODO - Move SLAP/Trilinos part to a different subroutine?
    !----------------------------------------------------------------

    !----------------------------------------------------------------
    ! Input-output arguments
    !----------------------------------------------------------------

    integer, intent(in) ::   &
       nx,  ny,              &    ! number of grid cells in each direction
       nz,                   &    ! number of vertical levels where velocity is computed
       nhalo                      ! number of halo layers

    type(parallel_type), intent(in) :: &
         parallel                  ! info for parallel communication

    real(dp), intent(in) ::  &
       dx,  dy                ! grid cell length and width (m)
                              ! assumed to have the same value for each grid cell

    integer, intent(in) :: &
       itest, jtest, rtest    ! coordinates of diagnostic point

    integer, dimension(nx,ny), intent(in) ::  &
       ice_mask        ! = 1 for cells with active ice, else = 0

    real(dp), dimension(nx-1,ny-1), intent(out) :: &
       xVertex, yVertex       ! x and y coordinates of each vertex

    logical, dimension(nx,ny), intent(out) :: &
       active_cell            ! true for active cells 
                              ! (thck > thklim, bordering a locally owned vertex)

    logical, dimension(nx-1,ny-1), intent(out) :: &
       active_vertex          ! true for vertices of active cells

    ! The remaining input/output arguments are for the SLAP and Trilinos solvers

    integer, intent(out) :: &
       nNodesSolve,         & ! number of locally owned nodes where we solve for velocity
       nVerticesSolve         ! number of locally owned vertices where we solve for velocity

    integer, dimension(nz,nx-1,ny-1), intent(out) ::  &
       nodeID                 ! local ID for each node where we solve for velocity

    integer, dimension(nx-1,ny-1), intent(out) ::  &
       vertexID               ! local ID for each vertex where we solve for velocity

    integer, dimension((nx-1)*(ny-1)*nz), intent(out) ::   &
       iNodeIndex, jNodeIndex, kNodeIndex   ! i, j and k indices of nodes

    integer, dimension((nx-1)*(ny-1)), intent(out) ::   &
       iVertexIndex, jVertexIndex   ! i and j indices of vertices

    !---------------------------------------------------------
    ! Local variables
    !---------------------------------------------------------

    integer :: i, j, k

    integer :: &
         staggered_ilo, staggered_ihi, &  ! bounds of locally owned vertices on staggered grid
         staggered_jlo, staggered_jhi

    staggered_ilo = parallel%staggered_ilo
    staggered_ihi = parallel%staggered_ihi
    staggered_jlo = parallel%staggered_jlo
    staggered_jhi = parallel%staggered_jhi

    !----------------------------------------------------------------
    ! Compute the x and y coordinates of each vertex.
    ! By convention, vertex (i,j) lies at the NE corner of cell(i,j).
    !----------------------------------------------------------------

    xVertex(:,:) = 0.d0
    yVertex(:,:) = 0.d0
    do j = 1, ny-1
    do i = 1, nx-1
       xVertex(i,j) = dx * i
       yVertex(i,j) = dy * j
    enddo
    enddo

    ! Identify the active cells.
    ! Include all cells that border locally owned vertices and contain ice.

    active_cell(:,:) = .false.

    do j = nhalo+1, ny-nhalo+1  ! include east and north layer of halo cells
    do i = nhalo+1, nx-nhalo+1
       if (ice_mask(i,j) == 1) then
          active_cell(i,j) = .true.
       endif
    enddo
    enddo

    ! Identify the active vertices
    ! Include all vertices of active cells

    active_vertex(:,:) = .false.

    do j = nhalo+1, ny-nhalo+1
    do i = nhalo+1, nx-nhalo+1
       if (active_cell(i,j)) then
          active_vertex(i-1:i, j-1:j) = .true.  ! all vertices of this cell
       endif
    enddo
    enddo

    ! Identify and count the nodes where we must solve for the velocity.
    ! This indexing is used for pre- and post-processing of the assembled matrix
    !  when we call the SLAP or Trilinos solver (one processor only).
    ! It is not required by the native PCG solver.

    nVerticesSolve  = 0
    vertexID(:,:)   = 0
    iVertexIndex(:) = 0
    jVertexIndex(:) = 0

    nNodesSolve   = 0
    nodeID(:,:,:) = 0
    iNodeIndex(:) = 0
    jNodeIndex(:) = 0
    kNodeIndex(:) = 0

    do j = staggered_jlo, staggered_jhi    ! locally owned vertices only
    do i = staggered_ilo, staggered_ihi
       if (active_vertex(i,j)) then   ! all nodes in ice column are active
          nVerticesSolve = nVerticesSolve + 1
          vertexID(i,j) = nVerticesSolve     ! unique local index for each vertex
          iVertexIndex(nVerticesSolve) = i
          jVertexIndex(nVerticesSolve) = j
          do k = 1, nz               
             nNodesSolve = nNodesSolve + 1   
             nodeID(k,i,j) = nNodesSolve     ! unique local index for each node
             iNodeIndex(nNodesSolve) = i
             jNodeIndex(nNodesSolve) = j
             kNodeIndex(nNodesSolve) = k
           enddo   ! k
        endif      ! active vertex
    enddo          ! i
    enddo          ! j

    if (verbose .and. this_rank==rtest) then
       write(iulog,*) ' '
       write(iulog,*) 'nVerticesSolve, nNodesSolve =', nVerticesSolve, nNodesSolve
    endif

  end subroutine get_vertex_geometry

!****************************************************************************

  subroutine load_vector_gravity(nx,               ny,              &
                                 nz,               nhalo,           &
                                 sigma,            stagwbndsigma,   & 
                                 dx,               dy,              &
                                 itest,  jtest,    rtest,           &
                                 active_cell,      active_vertex,   &
                                 xVertex,          yVertex,         &
                                 stagusrf,         stagthck,        &
                                 dusrf_dx,         dusrf_dy,        &
                                 whichassemble_taud,                &
                                 loadu,            loadv)

    integer, intent(in) ::      &
       nx, ny,                  &    ! horizontal grid dimensions
       nz,                      &    ! number of vertical levels at which velocity is computed
                                     ! Note: the number of elements per column is nz-1
       nhalo                         ! number of halo layers

    real(dp), dimension(nz), intent(in) ::    &
       sigma                         ! sigma vertical coordinate

    real(dp), dimension(0:nz), intent(in) ::    &
       stagwbndsigma                 ! stagsigma augmented by sigma = 0 and 1 at upper and lower surfaces

    real(dp), intent(in) ::     &
       dx, dy                        ! grid cell length and width

    integer, intent(in) :: &
       itest, jtest, rtest           ! coordinates of diagnostic point

    logical, dimension(nx,ny), intent(in) ::  &
       active_cell                   ! true if cell contains ice and borders a locally owned vertex

    logical, dimension(nx-1,ny-1), intent(in) ::  &
       active_vertex                 ! true for vertices of active cells

    real(dp), dimension(nx-1,ny-1), intent(in) ::   &
       xVertex, yVertex     ! x and y coordinates of vertices

    real(dp), dimension(nx-1,ny-1), intent(in) ::  &
       stagusrf,       &  ! upper surface elevation on staggered grid (m)
       stagthck           ! ice thickness on staggered grid (m)

    real(dp), dimension(nx-1,ny-1), intent(in) ::  &
       dusrf_dx,       &  ! upper surface elevation gradient on staggered grid (m/m)
       dusrf_dy

    integer, intent(in) :: &
       whichassemble_taud   ! = 0 for standard finite element computation of driving stress terms
                            ! = 1 for computation that uses only the local value of the driving stress at each node

    real(dp), dimension(nz,nx-1,ny-1), intent(inout) ::  &
       loadu, loadv       ! load vector, divided into u and v components

    !----------------------------------------------------------------
    ! Local variables
    !----------------------------------------------------------------

    real(dp), dimension(nNodesPerElement_3d) ::     &
       x, y, z,         & ! Cartesian coordinates of nodes
       dsdx, dsdy         ! upper surface elevation gradient at nodes

    real(dp)  ::   &
       dz,              & ! element height
       detJ               ! determinant of Jacobian for the transformation
                          !  between the reference element and true element

    !Note - These are not currently used except as dummy arguments
    real(dp), dimension(nNodesPerElement_3d) ::   &
       dphi_dx_3d, dphi_dy_3d, dphi_dz_3d  ! derivatives of basis functions, evaluated at quad pts

    real(dp) ::    &
       dsdx_qp, dsdy_qp       ! upper surface elevation gradient at quad pt

    integer :: i, j, k, n, p

    integer :: iNode, jNode, kNode

    if (verbose_load .and. this_rank==rtest) then
       write(iulog,*) ' '
       write(iulog,*) 'In load_vector_gravity: itest, jtest, ktest, rtest =', itest, jtest, ktest, rtest
    endif
                
    if (whichassemble_taud == HO_ASSEMBLE_TAUD_LOCAL) then

       ! Sum over active vertices
       do j = 1, ny-1
          do i = 1, nx-1
             if (active_vertex(i,j)) then

                if (verbose_load .and. this_rank==rtest .and. i==itest .and. j==jtest) then
                   write(iulog,*) 'i, j, dsdx, dsdy:', i, j, dusrf_dx(i,j), dusrf_dy(i,j)
                endif

                do k = 1, nz      ! loop over vertices in this column
                                  ! assume k increases from upper surface to bed

                   dz = stagthck(i,j) * (stagwbndsigma(k) - stagwbndsigma(k-1))

                   ! Add the ds/dx and ds/dy terms to the load vector for this node
                   loadu(k,i,j) = loadu(k,i,j) - rhoi*grav * dx*dy*dz/vol0 * dusrf_dx(i,j)
                   loadv(k,i,j) = loadv(k,i,j) - rhoi*grav * dx*dy*dz/vol0 * dusrf_dy(i,j)

                   if (verbose_load .and. this_rank==rtest .and. i==itest .and. j==jtest) then
                      write(iulog,*) 'k, dz, delta(loadu), delta(loadv):', k, dz, -rhoi*grav*dx*dy*dz/vol0 * dusrf_dx(i,j), &
                                                                           -rhoi*grav*dx*dy*dz/vol0 * dusrf_dy(i,j)
                   endif

                enddo   ! k

             endif   ! active_vertex
          enddo   ! i
       enddo   ! j

       return

    else   ! standard assembly

       ! Sum over elements in active cells 
       ! Loop over all cells that border locally owned vertices
       ! This includes halo cells to the north and east

       do j = nhalo+1, ny-nhalo+1
       do i = nhalo+1, nx-nhalo+1
       
          if (active_cell(i,j)) then

             do k = 1, nz-1    ! loop over elements in this column 
                               ! assume k increases from upper surface to bed

                ! compute spatial coordinates and upper surface elevation gradient for each node

                do n = 1, nNodesPerElement_3d

                   ! Determine (k,i,j) for this node
                   ! The reason for the '7' is that node 7, in the NE corner of the upper layer, has index (k,i,j).
                   ! Indices for other nodes are computed relative to this node.
                   iNode = i + ishift(7,n)
                   jNode = j + jshift(7,n)
                   kNode = k + kshift(7,n)

                   x(n) = xVertex(iNode,jNode)
                   y(n) = yVertex(iNode,jNode)
                   z(n) = stagusrf(iNode,jNode) - sigma(kNode)*stagthck(iNode,jNode)
                   dsdx(n) = dusrf_dx(iNode,jNode)
                   dsdy(n) = dusrf_dy(iNode,jNode)

                   if (verbose_load .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then
                      write(iulog,*) 'i, j, k, n, x, y, z, dsdx, dsdy:', i, j, k, n, x(n), y(n), z(n), dsdx(n), dsdy(n)
                   endif

                enddo   ! nodes per element

                ! Loop over quadrature points for this element
   
                do p = 1, nQuadPoints_3d

                   ! Evaluate detJ at the quadrature point.
                   ! TODO: The derivatives are not used.  Make these optional arguments?
                   !WHL - debug - Pass in i, j, k, and p for now

                   call get_basis_function_derivatives_3d(x(:),          y(:),          z(:),                    &
                                                          dphi_dxr_3d(:,p), dphi_dyr_3d(:,p), dphi_dzr_3d(:,p),  &
                                                          dphi_dx_3d(:),    dphi_dy_3d(:),    dphi_dz_3d(:),     &
                                                          detJ,                                                  &
                                                          itest, jtest, rtest,                                   &
                                                          i, j, k, p)

                   ! Increment the load vector with the gravitational contribution from this quadrature point

                   ! Evaluate dsdx and dsdy at this quadrature point
                   dsdx_qp = 0.d0
                   dsdy_qp = 0.d0
                   do n = 1, nNodesPerElement_3d
                      dsdx_qp = dsdx_qp + phi_3d(n,p) * dsdx(n)
                      dsdy_qp = dsdy_qp + phi_3d(n,p) * dsdy(n)
                   enddo

                   if (verbose_load .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then
                      write(iulog,*) ' '
                      write(iulog,*) 'Increment load vector, i, j, k, p =', i, j, k, p
                      write(iulog,*) 'ds/dx, ds/dy =', dsdx_qp, dsdy_qp
                      write(iulog,*) 'detJ/vol0 =', detJ/vol0
                      write(iulog,*) 'detJ/vol0* (ds/dx, ds/dy) =', detJ/vol0*dsdx_qp, detJ/vol0*dsdy_qp
                   endif

                   ! Loop over the nodes of the element
                   do n = 1, nNodesPerElement_3d

                      ! Determine (k,i,j) for this node
                      iNode = i + ishift(7,n)
                      jNode = j + jshift(7,n)
                      kNode = k + kshift(7,n)
         
                      ! Add the ds/dx and ds/dy terms to the load vector for this node
                      loadu(kNode,iNode,jNode) = loadu(kNode,iNode,jNode) - &
                           rhoi*grav * wqp_3d(p) * detJ/vol0 * dsdx_qp * phi_3d(n,p)
                      loadv(kNode,iNode,jNode) = loadv(kNode,iNode,jNode) - &
                           rhoi*grav * wqp_3d(p) * detJ/vol0 * dsdy_qp * phi_3d(n,p)

                      if (verbose_load .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest .and. p==ptest) then
                         write(iulog,*) ' '
                         write(iulog,*) 'n, phi_3d(n), delta(loadu), delta(loadv):', n, phi_3d(n,p), &
                                  rhoi*grav*wqp_3d(p)*detJ/vol0 * dsdx_qp * phi_3d(n,p), &
                                  rhoi*grav*wqp_3d(p)*detJ/vol0 * dsdy_qp * phi_3d(n,p)
                      endif

                   enddo   ! nNodesPerElement_3d

                enddo      ! nQuadPoints_3d

             enddo         ! k

          endif            ! active_cell

       enddo               ! i
       enddo               ! j

    endif   ! whichasssemble_taud

  end subroutine load_vector_gravity

!****************************************************************************

  subroutine load_vector_lateral_bc(nx,               ny,              &
                                    nz,               sigma,           &
                                    nhalo,                             &
                                    itest,   jtest,   rtest,           &
                                    whichassemble_lateral,             &
                                    land_mask,                         &
                                    ocean_mask,                        &
                                    active_cell,                       &
                                    xVertex,          yVertex,         &
                                    usrf,             thck,            &
                                    stagusrf,         stagthck,        &
                                    loadu,            loadv)

    integer, intent(in) ::      &
       nx, ny,                  &    ! horizontal grid dimensions
       nz,                      &    ! number of vertical levels at which velocity is computed
                                     ! Note: the number of elements per column is nz-1
       nhalo,                   &    ! number of halo layers
       whichassemble_lateral         ! = 0 for standard finite element computation of lateral load terms
                                     ! = 1 for computation that uses usrf and thck of the cell containing the marine edge

    real(dp), dimension(nz), intent(in) ::    &
       sigma                         ! sigma vertical coordinate

    integer, intent(in) :: &
       itest, jtest, rtest           ! coordinates of diagnostic point

    logical, dimension(nx,ny), intent(in) ::  &
       active_cell                   ! true if cell contains ice, borders a locally owned vertex,
                                     ! and is not an inactive calving_front cell

    integer, dimension(nx,ny), intent(in) ::  &
       land_mask,                  & ! = 1 if topg >= eus
       ocean_mask                    ! = 1 if topography is below sea level and ice is absent

    real(dp), dimension(nx-1,ny-1), intent(in) ::   &
       xVertex, yVertex     ! x and y coordinates of vertices

    real(dp), dimension(nx,ny), intent(in) ::  &
       usrf,            & ! upper surface elevation (m) on ice grid
       thck               ! ice thickness (m) on ice grid

    real(dp), dimension(nx-1,ny-1), intent(in) ::  &
       stagusrf,        & ! upper surface elevation (m) on staggered grid
       stagthck           ! ice thickness (m) on staggered grid

    real(dp), dimension(nz,nx-1,ny-1), intent(inout) ::  &
       loadu, loadv       ! load vector, divided into u and v components

    !----------------------------------------------------------------
    ! Local variables
    !----------------------------------------------------------------

    integer :: i, j

    ! Sum over elements in active cells 
    ! Loop over cells that contain locally owned vertices

    ! Note: Lateral shelf BCs are applied to active cells (either floating or grounded) that border the ocean.

    do j = nhalo+1, ny-nhalo+1
    do i = nhalo+1, nx-nhalo+1
       
       if ((verbose_shelf .or. verbose_load) .and. i==itest .and. j==jtest .and. this_rank==rtest) then
          write(iulog,*) 'rank, i, j =', this_rank, i, j
          write(iulog,*) 'ocean_mask (i-1:i,j)  =', ocean_mask(i-1:i, j)
          write(iulog,*) 'ocean_mask (i-1:i,j-1)=', ocean_mask(i-1:i, j-1)
       endif

       ! Compute the spreading term for all active cells that share an edge with an ice-free ocean cell.

       if (active_cell(i,j)) then

          if ( ocean_mask(i-1,j) == 1) then

             call lateral_shelf_bc(nx,              ny,              &
                                   nz,              sigma,           &
                                   itest,   jtest,  rtest,           &
                                   whichassemble_lateral,            &
                                   'west',                           &
                                   i,               j,               &
                                   usrf,            thck,            &
                                   stagusrf,        stagthck,        &
                                   xVertex,         yVertex,         &
                                   loadu,           loadv)
          endif

          if ( ocean_mask(i+1,j) == 1) then

             call lateral_shelf_bc(nx,              ny,              &
                                   nz,              sigma,           &
                                   itest,   jtest,  rtest,           &
                                   whichassemble_lateral,            &
                                   'east',                           &
                                   i,               j,               &
                                   usrf,            thck,            &
                                   stagusrf,        stagthck,        &
                                   xVertex,         yVertex,         &
                                   loadu,           loadv)
          endif

          if ( ocean_mask(i,j-1) == 1) then

             call lateral_shelf_bc(nx,              ny,              &
                                   nz,              sigma,           &
                                   itest,   jtest,  rtest,           &
                                   whichassemble_lateral,            &
                                   'south',                          &
                                   i,               j,               &
                                   usrf,            thck,            &
                                   stagusrf,        stagthck,        &
                                   xVertex,         yVertex,         &
                                   loadu,           loadv)
          endif

          if ( ocean_mask(i,j+1) == 1) then

             call lateral_shelf_bc(nx,              ny,              &
                                   nz,              sigma,           &
                                   itest,   jtest,  rtest,           &
                                   whichassemble_lateral,            &
                                   'north',                          &
                                   i,               j,               &
                                   usrf,            thck,            &
                                   stagusrf,        stagthck,        &
                                   xVertex,         yVertex,         &
                                   loadu,           loadv)
          endif

       endif      ! active_cell

    enddo         ! i
    enddo         ! j

  end subroutine load_vector_lateral_bc

!****************************************************************************

  subroutine lateral_shelf_bc(nx,                  ny,              &
                              nz,                  sigma,           &
                              itest,   jtest,      rtest,           &
                              whichassemble_lateral,                &
                              face,                                 &
                              iCell,               jCell,           &
                              usrf,                thck,            &
                              stagusrf,            stagthck,        &
                              xVertex,             yVertex,         &
                              loadu,               loadv)

    !----------------------------------------------------------------------------------
    ! Determine the contribution to the load vector from ice and water pressure at the
    !  vertical boundary between ice and ocean (or alternatively, from ice pressure alone
    !  at a vertical boundary between ice and air).
    !
    ! This subroutine computes the vertically averaged hydrostatic pressure at a vertical face
    !  associated with the grid cell column (iCell, jCell).
    !
    ! At a given point, this pressure is proportional to the difference between
    ! (1) the vertically averaged pressure exerted outward (toward the ocean) by the ice front
    ! (2) the vertically averaged pressure exerted by the ocean back toward the ice
    ! 
    ! (1) is given by p_out = 0.5*rhoi*grav*H
    ! (2) is given by p_in  = 0.5*rhoi*grav*H*(rhoi/rhoo) for a floating shelf
    !                       = 0.5*rhoo*grav*H*(1 - s/H)^2 for s <= H but ice not necessarily afloat
    !
    ! The second term goes to zero for a land-terminating cliff. 
    ! The two pressure terms are opposite in sign, so the net vertically averaged pressure,
    !  directed toward the ocean (or air), is given by
    ! 
    !                    p_av = 0.5*rhoi*grav*H*(1 - rhoi/rhoo) for a floating shelf
    !                           0.5*rhoi*grav*H - 0.5*rhoo*grav*H * (1 - min(s/H,1))^2 for ice not necessarily afloat
    !
    ! Here we sum over quadrature points for each ocean-bordering face of each element.
    ! The contribution from each quadrature point to node N is proportional to the product
    !
    !                    p_av(s,H) * detJ * phi(n,p)
    !
    ! where s and H are the surface elevation and ice thickness evaluated at that point,
    !  detJ is the determinant of the transformation linking the reference 2D element coordinates
    !  to the true coordinates at that point, and phi(n,p) is the basis function evaluated at that point.
    !
    !-----------------------------------------------------------------------------------

    integer, intent(in) ::      &
       nx, ny,                  &    ! horizontal grid dimensions
       nz,                      &    ! number of vertical levels at which velocity is computed
                                     ! Note: the number of elements per column is nz-1
       whichassemble_lateral,   &    ! = 0 for standard finite element computation of lateral load terms
                                     ! = 1 for computation that uses usrf and thck of the cell containing the marine edge
       iCell, jCell                  ! i and j indices for this cell

    character(len=*), intent(in) ::  &
       face                          ! 'north', 'south', 'east', or 'west'

    real(dp), dimension(nz), intent(in) ::    &
       sigma                         ! sigma vertical coordinate

    integer, intent(in) :: &
       itest, jtest, rtest           ! coordinates of diagnostic point

    real(dp), dimension(nx-1,ny-1), intent(in) ::   &
       xVertex, yVertex              ! x and y coordinates of vertices

    !Note: usrf and thck are used for local assembly; stagusrf and stagthck are used for standard assembly
    real(dp), dimension(nx,ny), intent(in) ::  &
       usrf,                      &  ! upper surface elevation (m) on ice grid
       thck                          ! ice thickness (m) on ice grid (m)

    real(dp), dimension(nx-1,ny-1), intent(in) ::  &
       stagusrf,                  &  ! upper surface elevation (m) on staggered grid
       stagthck                      ! ice thickness (m) on staggered grid (m)

    real(dp), dimension(nz,nx-1,ny-1), intent(inout) ::  &
       loadu, loadv                  ! load vector, divided into u and v components

    !----------------------------------------------------------------
    ! Local variables
    !----------------------------------------------------------------

    real(dp), dimension(nNodesPerElement_2d) ::     &
       x, y,               & ! local coordinates of nodes
       s,                  & ! upper surface elevation at nodes
       h                     ! ice thickness at nodes

    integer, dimension(nNodesPerElement_2d) ::     &
       iNode, jNode, kNode   ! global indices of each node

    !Note: These are not currently used except as dummy arguments
    real(dp), dimension(nNodesPerElement_2d) ::   &
       dphi_dx_2d, dphi_dy_2d    ! derivatives of basis functions, evaluated at quad pts

    real(dp)  ::        &
       h_qp,            & ! ice thickness at a given quadrature point (m)
       s_qp,            & ! ice surface elevation at a given quadrature point (m)
       p_av,            & ! net outward pressure from ice, p_out - p_in
       detJ               ! determinant of Jacobian for the transformation
                          !  between the reference element and true element

    integer :: k, n, p

    if ((verbose_shelf .or. verbose_load) .and. &
         iCell == itest .and. jCell == jtest .and. this_rank == rtest) then
       write(iulog,*) ' '
       write(iulog,*) 'In lateral_shelf_bc, rank, i, j =', this_rank, iCell, jCell
       write(iulog,*) 'thck, usrf =', thck(iCell,jCell), usrf(iCell,jCell)
    endif

    ! Compute nodal geometry in a local xy reference system.
    ! Note: The local y direction is really the vertical direction.
    !       The local x direction depends on the face (N/S/E/W).
    ! The diagrams below show the node indexing convention, along with the true
    !  directions for each face.  The true directions are mapped to local (x,y).

    iNode(:) = 0
    jNode(:) = 0

    if (face=='west') then

       !     4-----3       z
       !     |     |       ^
       !     |     |       |
       !     1-----2       ---> -y

       iNode(1) = iCell-1
       jNode(1) = jCell

       iNode(2) = iCell-1
       jNode(2) = jCell-1

       x(1) = yvertex(iNode(1), jNode(1))
       x(2) = yvertex(iNode(2), jNode(2))

    elseif (face=='east') then

       !     4-----3       z
       !     |     |       ^
       !     |     |       |
       !     1-----2       ---> y

       iNode(1) = iCell
       jNode(1) = jCell-1

       iNode(2) = iCell
       jNode(2) = jCell

       x(1) = yvertex(iNode(1), jNode(1))
       x(2) = yvertex(iNode(2), jNode(2))

    elseif (face=='south') then

       !     4-----3       z
       !     |     |       ^
       !     |     |       |
       !     1-----2       ---> x

       iNode(1) = iCell-1
       jNode(1) = jCell-1

       iNode(2) = iCell
       jNode(2) = jCell-1

       x(1) = xvertex(iNode(1), jNode(1))
       x(2) = xvertex(iNode(2), jNode(2))

    elseif (face=='north') then

       !     4-----3       z
       !     |     |       ^
       !     |     |       |
       !     1-----2       ---> -x

       iNode(1) = iCell
       jNode(1) = jCell

       iNode(2) = iCell-1
       jNode(2) = jCell

       x(1) = xvertex(iNode(1), jNode(1))
       x(2) = xvertex(iNode(2), jNode(2))

    endif

    iNode(3) = iNode(2)
    jNode(3) = jNode(2)

    iNode(4) = iNode(1)
    jNode(4) = jNode(1)

    x(3) = x(2)
    x(4) = x(1)

    if (whichassemble_lateral == HO_ASSEMBLE_LATERAL_LOCAL) then  ! use thck and usrf of the cell that owns the marine edge

       s(1:4) = usrf(iCell,jCell)
       h(1:4) = thck(iCell,jCell)

    else  ! use stagthck and stagusrf at vertices

       s(1) = stagusrf(iNode(1), jNode(1))
       s(2) = stagusrf(iNode(2), jNode(2))
       s(3) = s(2)
       s(4) = s(1)

       h(1) = stagthck(iNode(1), jNode(1))
       h(2) = stagthck(iNode(2), jNode(2))
       h(3) = h(2)
       h(4) = h(1)

    endif

    ! loop over element faces in column
    ! assume k increases from upper surface to bottom 

    do k = 1, nz-1

       ! Compute the local y coordinate (i.e., the actual z coordinate)
       y(1) = s(1) - sigma(k+1)*h(1)   ! lower left
       y(2) = s(2) - sigma(k+1)*h(2)   ! lower right
       y(3) = s(3) - sigma(k)  *h(3)   ! upper right
       y(4) = s(4) - sigma(k)  *h(4)   ! upper left

       ! Set the k index for each node
       kNode(1) = k+1
       kNode(2) = k+1
       kNode(3) = k
       kNode(4) = k

       ! loop over quadrature points

       do p = 1, nQuadPoints_2d

          ! Compute basis function derivatives and det(J) for this quadrature point
          ! For now, pass in i, j, k, p for debugging
          !TODO - Modify this subroutine to return only detJ, and not the derivatives?

          if (verbose_shelf .and. this_rank==rtest .and. iCell==itest .and. jCell==jtest .and. k==ktest) then
!             write(iulog,*) 'Get detJ, i, j, k, p =', iCell, jCell, k, p
!             write(iulog,*) 'x =', x(:)
!             write(iulog,*) 'y =', y(:)
!             write(iulog,*) 'dphi_dxr_2d =', dphi_dxr_2d(:,p)
!             write(iulog,*) 'dphi_dyr_2d =', dphi_dyr_2d(:,p)
          endif

          call get_basis_function_derivatives_2d(x(:),              y(:),               &
                                                 dphi_dxr_2d(:,p),  dphi_dyr_2d(:,p),   &
                                                 dphi_dx_2d(:),     dphi_dy_2d(:),      &
                                                 detJ,                                  &
                                                 itest, jtest, rtest,                   &
                                                 iCell, jCell, p)

          ! For some faces, detJ is computed to be a negative number because the face is
          ! oriented opposite the x or y axis.  Fix this by taking the absolute value.

          detJ = abs(detJ)

          ! Evaluate the ice thickness and surface elevation at this quadrature point

          if (whichassemble_lateral == HO_ASSEMBLE_LATERAL_LOCAL) then  ! use thck and usrf of the cell that owns the marine edge

             h_qp = thck(iCell,jCell)
             s_qp = usrf(iCell,jCell)

          else   ! standard finite-element assembly, use staggered thck and usrf at vertices

             h_qp = 0.d0
             s_qp = 0.d0
             do n = 1, nNodesPerElement_2d
                h_qp = h_qp + phi_2d(n,p) * h(n)
                s_qp = s_qp + phi_2d(n,p) * s(n)
             enddo

          endif

          if ( (verbose_shelf .or. verbose_load) .and. &
               this_rank==rtest .and. iCell==itest .and. jCell==jtest .and. k==ktest) then
             write(iulog,*) ' '
             write(iulog,*) 'Increment shelf load vector, i, j, face, k, p =', iCell, jCell, trim(face), k, p
!!             write(iulog,*) 'whichassemble_lateral =', whichassemble_lateral
!!             write(iulog,*) 'h_qp, s_qp =', h_qp, s_qp
!!             write(iulog,*) 'detJ/vol0 =', detJ/vol0
          endif

          ! Increment the load vector with the shelf water pressure contribution from 
          !  this quadrature point.
          ! Increment loadu for east/west faces and loadv for north/south faces.

          ! The following formula works not just for floating ice, but for any edge between
          !  an ice-covered marine-based cell and an ocean cell.
          p_av = 0.5d0*rhoi*grav*h_qp &                                   ! p_out
               - 0.5d0*rhoo*grav*h_qp * (1.d0 - min(s_qp/h_qp,1.d0))**2   ! p_in

          ! The following formula works for floating ice.
          ! It can be derived from the formula above using Archimedes: rhoi*h = rhoo*(h-s) 
!!          p_av = 0.5d0*rhoi*grav*h_qp * (1.d0 - rhoi/rhoo)

          if ( (verbose_shelf .or. verbose_load) .and. &
               this_rank==rtest .and. iCell==itest .and. jCell==jtest .and. k==ktest) then
             write(iulog,*) ' p_av =', p_av
          endif

          if (trim(face) == 'west') then  ! net force in -x direction

             do n = 1, nNodesPerElement_2d
                loadu(kNode(n),iNode(n),jNode(n)) = loadu(kNode(n),iNode(n),jNode(n))    &
                                                  - p_av * wqp_2d(p) * detJ/vol0 * phi_2d(n,p)
             enddo

          elseif (trim(face) == 'east') then  ! net force in x direction

             do n = 1, nNodesPerElement_2d
                loadu(kNode(n),iNode(n),jNode(n)) = loadu(kNode(n),iNode(n),jNode(n))    &
                                                  + p_av * wqp_2d(p) * detJ/vol0 * phi_2d(n,p)
             enddo

          elseif (trim(face) == 'south') then  ! net force in -y direction

             do n = 1, nNodesPerElement_2d
                loadv(kNode(n),iNode(n),jNode(n)) = loadv(kNode(n),iNode(n),jNode(n))    &
                                                  - p_av * wqp_2d(p) * detJ/vol0 * phi_2d(n,p)
             enddo

          elseif (trim(face) == 'north') then  ! net force in y direction
 
             do n = 1, nNodesPerElement_2d
                loadv(kNode(n),iNode(n),jNode(n)) = loadv(kNode(n),iNode(n),jNode(n))    &
                                                  + p_av * wqp_2d(p) * detJ/vol0 * phi_2d(n,p)
             enddo

          endif   ! face = N/S/E/W

       enddo   ! nQuadPoints_2d

    enddo   ! k (element faces in column)

  end subroutine lateral_shelf_bc

!****************************************************************************

  subroutine assemble_stiffness_matrix_3d(nx,               ny,              &
                                          nz,               sigma,           &
                                          nhalo,                             &
                                          itest,   jtest,   rtest,           &
                                          active_cell,                       &
                                          xVertex,          yVertex,         &
                                          uvel,             vvel,            &
                                          stagusrf,         stagthck,        &
                                          flwafact,         whichapprox,     &
                                          whichefvs,        efvs,            &
                                          efvs_constant,    effstrain_min,   &
                                          Auu,              Auv,             &
                                          Avu,              Avv)

    !----------------------------------------------------------------
    ! Assemble the stiffness matrix A in the linear system Ax = b.
    ! This subroutine is called for each nonlinear iteration if
    !  we are iterating on the effective viscosity.
    !----------------------------------------------------------------

    !----------------------------------------------------------------
    ! Input-output arguments
    !----------------------------------------------------------------

    integer, intent(in) ::      &
       nx, ny,                  &    ! horizontal grid dimensions
       nz,                      &    ! number of vertical levels at which velocity is computed
                                     ! Note: the number of elements per column is nz-1
       nhalo                         ! number of halo layers

    real(dp), dimension(nz), intent(in) ::    &
       sigma                         ! sigma vertical coordinate

    integer, intent(in) :: &
       itest, jtest, rtest           ! coordinates of diagnostic point

    logical, dimension(nx,ny), intent(in) ::  &
       active_cell                   ! true if cell contains ice and borders a locally owned vertex

    real(dp), dimension(nx-1,ny-1), intent(in) ::   &
       xVertex, yVertex     ! x and y coordinates of vertices

    real(dp), dimension(nz,nx-1,ny-1), intent(in) ::  &
       uvel, vvel         ! velocity components (m/yr)

    real(dp), dimension(nx-1,ny-1), intent(in) ::  &
       stagusrf,       &  ! upper surface elevation on staggered grid (m)
       stagthck           ! ice thickness on staggered grid (m)

    real(dp), dimension(nz-1,nx,ny), intent(in) ::  &
       flwafact           ! temperature-based flow factor, 0.5 * A^(-1/n), 
                          ! used to compute the effective viscosity
                          ! units: Pa yr^(1/n)

    integer, intent(in) ::   &
       whichapprox,     & ! option for Stokes approximation (BP, SSA, SIA)
       whichefvs          ! option for effective viscosity calculation 

    real(dp), dimension(nz-1,nx,ny), intent(out) ::  &
       efvs               ! effective viscosity (Pa yr)

    real(dp), intent(in) :: &
       efvs_constant,   & ! constant value of effective viscosity (Pa yr)
       effstrain_min      ! minimum value of effective strain rate (yr^-1) for computing viscosity

    real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(out) ::  &
       Auu, Auv,    &     ! assembled stiffness matrix, divided into 4 parts
       Avu, Avv

    !---------------------------------------------------------
    ! Local variables
    !---------------------------------------------------------

    real(dp), dimension(nQuadPoints_3d) ::   &
       detJ               ! determinant of J

    real(dp), dimension(nNodesPerElement_3d) ::   &
       dphi_dx_3d, dphi_dy_3d, dphi_dz_3d  ! derivatives of basis function, evaluated at quad pt

    !----------------------------------------------------------------
    ! Note: Kuu, Kuv, Kvu, and Kvv are 8x8 components of the stiffness matrix
    !       for the local element.  (The combined stiffness matrix is 16x16.)
    !
    ! Once these matrices are formed, their coefficients are summed into the assembled
    !  matrices Auu, Auv, Avu, Avv.  The A matrices each have as many rows as there are
    !  active nodes, but only 27 columns, corresponding to the 27 vertices that belong to
    !  the 8 elements sharing a given node.
    !
    ! The native structured PCG solver works with the dense A matrices in the form
    ! computed here.  For the SLAP solver, the terms of the A matrices are put
    ! in a sparse matrix during preprocessing.  For the Trilinos solver, the terms
    ! of the A matrices are passed to Trilinos one row at a time. 
    !----------------------------------------------------------------

    real(dp), dimension(nNodesPerElement_3d, nNodesPerElement_3d) ::   &   !
       Kuu,          &    ! element stiffness matrix, divided into 4 parts as shown below
       Kuv,          &    !  
       Kvu,          &    !
       Kvv                !    Kuu  | Kuv
                          !    _____|____          
                          !         |
                          !    Kvu  | Kvv
                          !         
                          ! Kvu may not be needed if matrix is symmetric, but is included for now

    real(dp), dimension(nNodesPerElement_3d) ::     &
       x, y, z,         & ! Cartesian coordinates of nodes
       u, v,            & ! u and v at nodes
       s                  ! upper surface elevation at nodes

    real(dp), dimension(nQuadPoints_3d) ::    &
       efvs_qp            ! effective viscosity at a quad pt

    logical, parameter ::   &
       check_symmetry_element = .true.  ! if true, then check symmetry of element matrix
                                        !Note: Can speed up assembly a bit by setting to false for production

    integer :: i, j, k, n, p
    integer :: iNode, jNode, kNode

    if (verbose_matrix .and. main_task) then
       write(iulog,*) ' '
       write(iulog,*) 'In assemble_stiffness_matrix_3d'
       write(iulog,*) 'itest, jtest, ktest, rtest =', itest, jtest, ktest, rtest
    endif

    ! Initialize effective viscosity
    efvs(:,:,:) = 0.d0

    ! Initialize global stiffness matrix

    Auu(:,:,:,:) = 0.d0
    Auv(:,:,:,:) = 0.d0
    Avu(:,:,:,:) = 0.d0
    Avv(:,:,:,:) = 0.d0

    ! Sum over elements in active cells 
    ! Loop over all cells that border locally owned vertices.

    do j = nhalo+1, ny-nhalo+1
    do i = nhalo+1, nx-nhalo+1

       if (active_cell(i,j)) then

          do k = 1, nz-1    ! loop over elements in this column 
                            ! assume k increases from upper surface to bed

             ! Initialize element stiffness matrix
             Kuu(:,:) = 0.d0
             Kuv(:,:) = 0.d0
             Kvu(:,:) = 0.d0
             Kvv(:,:) = 0.d0
  
             ! compute spatial coordinates, velocity, and upper surface elevation for each node

             do n = 1, nNodesPerElement_3d

                ! Determine (k,i,j) for this node
                ! The reason for the '7' is that node 7, in the NE corner of the upper layer, has index (k,i,j).
                ! Indices for other nodes are computed relative to this node.
                iNode = i + ishift(7,n)
                jNode = j + jshift(7,n)
                kNode = k + kshift(7,n)

                x(n) = xVertex(iNode,jNode)
                y(n) = yVertex(iNode,jNode)
                z(n) = stagusrf(iNode,jNode) - sigma(kNode)*stagthck(iNode,jNode)
                u(n) = uvel(kNode,iNode,jNode)
                v(n) = vvel(kNode,iNode,jNode)
                s(n) = stagusrf(iNode,jNode)

                if (verbose_matrix .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then
                   write(iulog,*) ' '
                   write(iulog,*) 'i, j, k, n, x, y, z:', i, j, k, n, x(n), y(n), z(n)
                   write(iulog,*) 's, u, v:', s(n), u(n), v(n)
                endif

             enddo   ! nodes per element

             ! Loop over quadrature points for this element
   
             do p = 1, nQuadPoints_3d

                ! Evaluate the derivatives of the element basis functions at this quadrature point.
                !WHL - Pass in i, j, k, and p to the following subroutines for debugging.

                call get_basis_function_derivatives_3d(x(:),             y(:),             z(:),              &          
                                                       dphi_dxr_3d(:,p), dphi_dyr_3d(:,p), dphi_dzr_3d(:,p),  &
                                                       dphi_dx_3d(:),    dphi_dy_3d(:),    dphi_dz_3d(:),     &
                                                       detJ(p),                                               &
                                                       itest, jtest, rtest,                                   &
                                                       i, j, k, p)

!          call t_startf('glissade_effective_viscosity')
                call compute_effective_viscosity(whichefvs,        whichapprox,                       &
                                                 efvs_constant,    effstrain_min,                     &
                                                 nNodesPerElement_3d,                                 &
                                                 dphi_dx_3d(:),    dphi_dy_3d(:),    dphi_dz_3d(:),   &
                                                 u(:),             v(:),                              & 
                                                 flwafact(k,i,j),  efvs_qp(p),                        &
                                                 itest, jtest, rtest,                                 &
                                                 i, j, k, p )
!          call t_stopf('glissade_effective_viscosity')

                if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. p==ptest) then
                   write(iulog,*) 'i, j, k, p, efvs (Pa yr):', i, j, k, p, efvs_qp(p)
                endif

                ! Increment the element stiffness matrix with the contribution from each quadrature point.

!          call t_startf('glissade_compute_element_matrix')
                call compute_element_matrix(whichapprox,     nNodesPerElement_3d,               & 
                                            wqp_3d(p),       detJ(p),          efvs_qp(p),      &
                                            dphi_dx_3d(:),   dphi_dy_3d(:),    dphi_dz_3d(:),   &
                                            Kuu(:,:),        Kuv(:,:),                          &
                                            Kvu(:,:),        Kvv(:,:),                          &
                                            itest, jtest, rtest,                                &
                                            i, j, k, p )
!          call t_stopf('glissade_compute_element_matrix')

             enddo   ! nQuadPoints_3d

             ! Compute average of effective viscosity over quad pts
             efvs(k,i,j) = 0.d0

             do p = 1, nQuadPoints_3d
                efvs(k,i,j) = efvs(k,i,j) + efvs_qp(p)
             enddo
             efvs(k,i,j) = efvs(k,i,j) / nQuadPoints_3d
             
             if (check_symmetry_element) then
                call check_symmetry_element_matrix(nNodesPerElement_3d,  &
                                                   Kuu, Kuv, Kvu, Kvv)
             endif

             ! Sum terms of element matrix K into dense assembled matrix A

             call element_to_global_matrix_3d(nx,           ny,          nz,    &
                                              i,            j,           k,     &
                                              itest,        jtest,       rtest, &
                                              Kuu,          Kuv,                &
                                              Kvu,          Kvv,                &
                                              Auu,          Auv,                &
                                              Avu,          Avv)

          enddo   ! nz  (loop over elements in this column)

          if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest) then
             write(iulog,*) ' '
             write(iulog,*) 'Assembled 3D matrix, i, j =', i, j
             write(iulog,*) 'k, flwafact, efvs:'
             do k = 1, nz-1
                write(iulog,*) k, flwafact(k,i,j), efvs(k,i,j)
             enddo
          endif

       endif   ! active_cell

    enddo      ! i
    enddo      ! j

  end subroutine assemble_stiffness_matrix_3d

!****************************************************************************

  subroutine assemble_stiffness_matrix_2d(nx,               ny,              &
                                          nz,                                &
                                          sigma,            stagsigma,       &
                                          nhalo,                             &
                                          itest,   jtest,   rtest,           &
                                          active_cell,                       &
                                          xVertex,          yVertex,         &
                                          uvel_2d,          vvel_2d,         &
                                          stagusrf,         stagthck,        &
                                          flwa,             flwafact,        &
                                          whichapprox,                       &
                                          diva_slope_factor_x, diva_slope_factor_y, &
                                          whichefvs,        efvs,            &
                                          efvs_constant,    effstrain_min,   &
                                          Auu,              Auv,             &
                                          Avu,              Avv,             &
                                          dusrf_dx,         dusrf_dy,        &
                                          thck,                              &
                                          btractx,          btracty,         &
                                          omega_k,          omega,           &
                                          efvs_qp_3d)
  
    !----------------------------------------------------------------
    ! Assemble the stiffness matrix A in the linear system Ax = b.
    ! This subroutine is called for each nonlinear iteration if
    !  we are iterating on the effective viscosity.
    ! The matrix A can be based on the shallow-shelf approximation or 
    !  the depth-integrated L1L2 approximation (Schoof and Hindmarsh, 2010).
    !----------------------------------------------------------------

    use glissade_grid_operators, only: glissade_vertical_average

    !----------------------------------------------------------------
    ! Input-output arguments
    !----------------------------------------------------------------

    integer, intent(in) ::      &
       nx, ny,                  &    ! horizontal grid dimensions
       nz,                      &    ! number of vertical levels at which velocity is computed
                                     ! (used for flwafact)
       nhalo                         ! number of halo layers

    real(dp), dimension(nz), intent(in) ::    &
       sigma              ! sigma vertical coordinate

    real(dp), dimension(nz-1), intent(in) ::    &
       stagsigma          ! staggered sigma vertical coordinate

    integer, intent(in) :: &
       itest, jtest, rtest           ! coordinates of diagnostic point

    logical, dimension(nx,ny), intent(in) ::  &
       active_cell        ! true if cell contains ice and borders a locally owned vertex

    real(dp), dimension(nx-1,ny-1), intent(in) ::   &
       xVertex, yVertex   ! x and y coordinates of vertices

    real(dp), dimension(nx-1,ny-1), intent(in) ::  &
       uvel_2d, vvel_2d   ! 2D velocity components (m/yr)

    real(dp), dimension(nx-1,ny-1), intent(in) ::  &
       stagusrf,       &  ! upper surface elevation on staggered grid (m)
       stagthck           ! ice thickness on staggered grid (m)

    !TODO - Pass in flwa only, and compute flwafact here?
    real(dp), dimension(nz-1,nx,ny), intent(in) ::  &
       flwa,             &! temperature-based flow factor A, Pa^{-n} yr^{-1}
       flwafact           ! temperature-based flow factor, 0.5 * A^(-1/n), Pa yr^(1/n)
                          ! used to compute the effective viscosity

    integer, intent(in) ::   &
       whichapprox,     & ! option for Stokes approximation (BP, L1L2, SSA, SIA)
       whichefvs          ! option for effective viscosity calculation 

    real(dp), dimension(nx,ny), intent(in) ::   &
       diva_slope_factor_x, &  ! correction factor for DIVA in x direction, based on theta_slope_x
       diva_slope_factor_y     ! correction factor for DIVA in y direction, based on theta_slope_y

    real(dp), dimension(nz-1,nx,ny), intent(out) ::  &
       efvs               ! effective viscosity (Pa yr)

    real(dp), intent(in) :: &
       efvs_constant,   & ! constant value of effective viscosity (Pa yr)
       effstrain_min      ! minimum value of effective strain rate (yr^-1) for computing viscosity

    real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(out) ::  &
       Auu, Auv,    &     ! assembled stiffness matrix, divided into 4 parts
       Avu, Avv                                    

    ! The following optional arguments are used for the L1L2 approximation only

    real(dp), dimension(nx-1,ny-1), intent(in), optional ::  &
       dusrf_dx,       &  ! upper surface elevation gradient on staggered grid (m/m)
       dusrf_dy           ! needed for L1L2 assembly only

    ! The following optional arguments are used for DIVA only

    real(dp), dimension(nx,ny), intent(in), optional ::   &
       thck               ! ice thickness (m)

    real(dp), dimension(nx-1,ny-1), intent(in), optional ::   &
       btractx, btracty         ! components of basal traction (Pa)

    real(dp), dimension(nz,nx,ny), intent(out), optional :: &
       omega_k            ! single integral, defined by Goldberg (2011) eq. 32

    real(dp), dimension(nx,ny), intent(out), optional :: &
       omega              ! double integral, defined by Goldberg (2011) eq. 35
                          ! Note: omega here = Goldberg's omega/H

    real(dp), dimension(nz-1,nQuadPoints_2d,nx,ny), intent(inout), optional ::  &
       efvs_qp_3d         ! effective viscosity (Pa yr)

    !---------------------------------------------------------
    ! Local variables
    !---------------------------------------------------------

    real(dp), dimension(nQuadPoints_2d) ::   &
       detJ               ! determinant of J

    real(dp), dimension(nNodesPerElement_2d) ::   &
       dphi_dx_2d, dphi_dy_2d, dphi_dz_2d  ! derivatives of basis function, evaluated at quad pts
                                           ! set dphi_dz = 0 for 2D problem

    !----------------------------------------------------------------
    ! Note: Kuu, Kuv, Kvu, and Kvv are 4x4 components of the stiffness matrix
    !       for the local element.  (The combined stiffness matrix is 8x8.)
    !
    ! Once these matrices are formed, their coefficients are summed into the global
    !  matrices Auu_2d, Auv_2d, Avu_2d, Avv_2d.  The global matrices each have as 
    !  many rows as there are active vertices, but only 9 columns, corresponding to 
    !  the 9 vertices of the 4 elements sharing a given node.
    !
    ! The native structured PCG solver works with the dense A matrices in the form
    ! computed here.  For the SLAP solver, the terms of the A matrices are put
    ! in a sparse matrix format during preprocessing.  For the Trilinos solver, 
    ! the terms of the A matrices are passed to Trilinos one row at a time. 
    !----------------------------------------------------------------

    real(dp), dimension(nNodesPerElement_2d, nNodesPerElement_2d) ::   &   !
       Kuu,          &    ! element stiffness matrix, divided into 4 parts as shown below
       Kuv,          &    !  
       Kvu,          &    !
       Kvv                !    Kuu  | Kuv
                          !    _____|____          
                          !         |
                          !    Kvu  | Kvv
                          !         
                          ! Kvu may not be needed if matrix is symmetric, but is included for now

    real(dp), dimension(nNodesPerElement_2d) ::     &
       x, y,            & ! Cartesian coordinates of vertices
       u, v,            & ! depth-integrated mean velocity at vertices (m/yr)
       h,               & ! thickness at vertices (m)
       s,               & ! upper surface elevation at vertices (m)
       bx, by,          & ! basal traction at vertices (Pa) (DIVA only)
       dsdx, dsdy         ! upper surface elevation gradient at vertices (m/m) (L1L2 only)

    real(dp), dimension(nQuadPoints_2d) ::    &
       efvs_qp_vertavg    ! vertically averaged effective viscosity at a quad pt

    real(dp) ::         &
       h_qp               ! thickness at a quad pt

    real(dp), dimension(nz-1,nQuadPoints_2d) ::    &
       efvs_qp            ! effective viscosity at each layer in a cell column
                          ! corresponding to a quad pt

    logical, parameter ::   &
       check_symmetry_element = .true.  ! if true, then check symmetry of element matrix

    real(dp), dimension(nx,ny) ::  &
       flwafact_2d        ! vertically averaged flow factor

    integer :: i, j, k, n, p
    integer :: iVertex, jVertex

    if (verbose_matrix .and. main_task) then
       write(iulog,*) ' '
       write(iulog,*) 'In assemble_stiffness_matrix_2d'
    endif

    ! Initialize effective viscosity
    efvs(:,:,:) = 0.d0

    ! Initialize global stiffness matrix

    Auu(:,:,:) = 0.d0
    Auv(:,:,:) = 0.d0
    Avu(:,:,:) = 0.d0
    Avv(:,:,:) = 0.d0

    ! Compute vertical average of flow factor (SSA only)
    if (whichapprox == HO_APPROX_SSA) then
       call glissade_vertical_average(nx,       ny,      &
                                      nz,       sigma,   &
                                      flwafact, flwafact_2d)
    endif

    ! Sum over elements in active cells 
    ! Loop over all cells that border locally owned vertices.

    do j = nhalo+1, ny-nhalo+1
    do i = nhalo+1, nx-nhalo+1
       
       if (active_cell(i,j)) then

          ! Initialize element stiffness matrix
          Kuu(:,:) = 0.d0
          Kuv(:,:) = 0.d0
          Kvu(:,:) = 0.d0
          Kvv(:,:) = 0.d0
  
          ! Compute spatial coordinates, velocity, thickness and surface elevation for each vertex
          ! Also compute surface elevation gradient (for L1L2) and basal traction (for DIVA)
          do n = 1, nNodesPerElement_2d

             ! Determine (i,j) for this vertex
             ! The reason for the '3' is that node 3, in the NE corner of the grid cell, has index (i,j).
             ! Indices for other nodes are computed relative to this vertex.
             iVertex = i + ishift(3,n)
             jVertex = j + jshift(3,n)

             x(n) = xVertex(iVertex,jVertex)
             y(n) = yVertex(iVertex,jVertex)
             u(n) = uvel_2d(iVertex,jVertex)
             v(n) = vvel_2d(iVertex,jVertex)
             s(n) = stagusrf(iVertex,jVertex)
             h(n) = stagthck(iVertex,jVertex)
             if (present(dusrf_dx) .and. present(dusrf_dy)) then  ! L1L2
                dsdx(n) = dusrf_dx(iVertex,jVertex)
                dsdy(n) = dusrf_dy(iVertex,jVertex)
             endif
             if (present(btractx) .and. present(btracty)) then    ! DIVA
                bx(n) = btractx(iVertex,jVertex)
                by(n) = btracty(iVertex,jVertex)
             endif

             if (verbose_matrix .and. this_rank==rtest .and. i==itest .and. j==jtest) then
                write(iulog,*) ' '
                write(iulog,*) 'i, j, n, x, y:', i, j, n, x(n), y(n)
                write(iulog,*) 's, h, u, v:', s(n), h(n), u(n), v(n)
                if (present(btractx) .and. present(btracty)) write(iulog,*) 'bx, by:', bx(n), by(n)
             endif

          enddo   ! vertices per element

          ! Loop over quadrature points for this element
   
          do p = 1, nQuadPoints_2d

             ! Evaluate the derivatives of the element basis functions at this quadrature point.

             call get_basis_function_derivatives_2d(x(:),             y(:),          &
                                                    dphi_dxr_2d(:,p), dphi_dyr_2d(:,p), &
                                                    dphi_dx_2d(:),    dphi_dy_2d(:),    &
                                                    detJ(p),                            &
                                                    itest, jtest, rtest,                &
                                                    i, j, p)

             dphi_dz_2d(:) = 0.d0

             if (whichapprox == HO_APPROX_L1L2) then

                ! Compute effective viscosity for each layer at this quadrature point
                !TODO - sigma -> stagsigma for L1L2 viscosity?
                call compute_effective_viscosity_L1L2(whichefvs,                               &
                                                      efvs_constant,        effstrain_min,     &
                                                      nz,                   sigma,             &
                                                      nNodesPerElement_2d,  phi_2d(:,p),       &
                                                      dphi_dx_2d(:),        dphi_dy_2d(:),     &
                                                      u(:),                 v(:),              & 
                                                      h(:),                                    &
                                                      dsdx(:),              dsdy(:),           &
                                                      flwa(:,i,j),          flwafact(:,i,j),   &
                                                      efvs_qp(:,p),                            &
                                                      itest,    jtest,      rtest,             &
                                                      i, j, p)

                ! Compute vertical average of effective viscosity
                efvs_qp_vertavg(p) = 0.d0
                do k = 1, nz-1
                   efvs_qp_vertavg(p) = efvs_qp_vertavg(p) + efvs_qp(k,p) * (sigma(k+1) - sigma(k))
                enddo

             elseif (whichapprox == HO_APPROX_DIVA) then

                ! Copy efvs_qp from global array to local column array
                efvs_qp(:,:) = efvs_qp_3d(:,:,i,j)

                ! Compute effective viscosity for each layer at this quadrature point
                ! Note: efvs_qp_3d is intent(inout); old value is used to compute new value
                call compute_effective_viscosity_diva(whichefvs,                               &
                                                      efvs_constant,        effstrain_min,     &
                                                      nz,                   stagsigma,         &
                                                      nNodesPerElement_2d,  phi_2d(:,p),       &
                                                      dphi_dx_2d(:),        dphi_dy_2d(:),     &
                                                      u(:),                 v(:),              & 
                                                      bx(:),                by(:),             &
                                                      diva_slope_factor_x(i,j),                &
                                                      diva_slope_factor_y(i,j),                &
                                                      h(:),                                    &
                                                      flwa(:,i,j),          flwafact(:,i,j),   &
                                                      efvs_qp(:,p),                            &
                                                      itest,    jtest,      rtest,             &
                                                      i, j, p)

                if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. p==ptest) then
                   write(iulog,*) 'i, j, k, p, efvs (Pa yr):', i, j, k, p, efvs_qp(:,p)
                endif

                !WHL - Copy local efvs_qp to the global array
                efvs_qp_3d(:,:,i,j) = efvs_qp(:,:)

                ! Compute vertical average of effective viscosity
                efvs_qp_vertavg(p) = 0.d0
                do k = 1, nz-1
                   efvs_qp_vertavg(p) = efvs_qp_vertavg(p) + efvs_qp(k,p)*(sigma(k+1) - sigma(k))
                enddo

             else     ! SSA

                ! Compute vertically averaged effective viscosity at this quadrature point
                !TODO - Why do we pass in dphi_dz_2d here and not elsewhere?
                call compute_effective_viscosity(whichefvs,        whichapprox,                       &
                                                 efvs_constant,    effstrain_min,                     &
                                                 nNodesPerElement_2d,                                 &
                                                 dphi_dx_2d(:),    dphi_dy_2d(:),    dphi_dz_2d(:),   &
                                                 u(:),             v(:),                              & 
                                                 flwafact_2d(i,j), efvs_qp_vertavg(p),                &
                                                 itest, jtest, rtest,                                 &
                                                 i, j, 1, p)

                ! Copy vertically averaged value to all levels
                efvs_qp(:,p) = efvs_qp_vertavg(p)

             endif    ! whichapprox

             ! Compute ice thickness at this quadrature point

             h_qp = 0.d0
             do n = 1, nNodesPerElement_2d
                h_qp = h_qp + phi_2d(n,p) * h(n)
             enddo

             ! Increment the element stiffness matrix with the contribution from each quadrature point.
             ! Note: The effective viscosity is multiplied by thickness since the equation to be solved
             !       is vertically integrated.

             call compute_element_matrix(whichapprox,     nNodesPerElement_2d,               & 
                                         wqp_2d(p),       detJ(p),                           &
                                         h_qp*efvs_qp_vertavg(p),                            &
                                         dphi_dx_2d(:),   dphi_dy_2d(:),    dphi_dz_2d(:),   &
                                         Kuu(:,:),        Kuv(:,:),                          &
                                         Kvu(:,:),        Kvv(:,:),                          &
                                         itest, jtest, rtest,                                &
                                         i, j, 1, p )

          enddo   ! nQuadPoints_2d

          if (whichapprox == HO_APPROX_DIVA) then

             ! Compute vertical integrals needed for the 2D solve and 3D velocity reconstruction
             call compute_integrals_diva(nz,             sigma,         &
                                         itest,  jtest,  rtest,         &
                                         thck(i,j),      efvs_qp(:,:),  &
                                         omega_k(:,i,j), omega(i,j),    &
                                         i, j)

          endif

          ! Compute average of effective viscosity over quad points.
          ! For L1L2 and DIVA there is a different efvs in each layer.
          ! For SSA, simply write the vertical average value to each layer.

          efvs(:,i,j) = 0.d0
          do p = 1, nQuadPoints_2d
             do k = 1, nz-1
                efvs(k,i,j) = efvs(k,i,j) + efvs_qp(k,p)
             enddo
          enddo
          efvs(:,i,j) = efvs(:,i,j) / nQuadPoints_2d

          if (check_symmetry_element) then
             call check_symmetry_element_matrix(nNodesPerElement_2d,   &
                                                Kuu, Kuv, Kvu, Kvv)
          endif

          ! Sum the terms of element matrix K into the dense assembled matrix A

          call element_to_global_matrix_2d(nx,           ny,        &
                                           i,            j,         &
                                           itest, jtest, rtest,     &
                                           Kuu,          Kuv,       &
                                           Kvu,          Kvv,       &
                                           Auu,          Auv,       &
                                           Avu,          Avv)

          if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest) then
             write(iulog,*) ' '
             write(iulog,*) 'Assembled 2D matrix, i, j =', i, j
             write(iulog,*) 'k, flwafact, efvs:'
             do k = 1, nz-1
                write(iulog,*) k, flwafact(k,i,j), efvs(k,i,j)
             enddo
          endif

       endif   ! active_cell

    enddo      ! i
    enddo      ! j

  end subroutine assemble_stiffness_matrix_2d

!****************************************************************************

! For now, passing in i and j for debugging

  subroutine compute_integrals_diva(nz,           sigma,         &
                                    itest, jtest, rtest,         &
                                    thck,         efvs_qp,       &
                                    omega_k,      omega,   i, j)

    !----------------------------------------------------------------
    ! Compute some integrals used by the DIVA solver to relate velocities
    ! in different parts of the column:
    !
    !    F1(z) = int_b^z {[(s-z)/H] * 1/efvs * dz}
    !    F2    = int_b^s {[(s-z)/H]^2 * 1/efvs * dz}
    !          = int_b^s {F1(z)/H * dz}
    !
    ! Because efvs is highly nonlinear and appears in the denominator,
    ! it should be more accurate to compute the integral at each quadrature
    ! point and then average to the cell center, rather than average efvs 
    ! to the cell center and then integrate.
    !----------------------------------------------------------------

    !----------------------------------------------------------------
    ! Input-output arguments
    !----------------------------------------------------------------

    integer, intent(in) ::    &
       nz                 ! number of vertical levels at which velocity is computed

    real(dp), dimension(nz), intent(in) ::    &
       sigma              ! sigma vertical coordinate

    integer, intent(in) :: &
       itest, jtest, rtest           ! coordinates of diagnostic point

    real(dp), intent(in) ::  &
       thck               ! ice thickness (m)

    real(dp), dimension(nz-1,nQuadPoints_2d), intent(in) ::  &
       efvs_qp            ! effective viscosity (Pa yr) at each quad point in each layer

    real(dp), dimension(nz), intent(out) :: &
       omega_k            ! single integral, defined by Goldberg (2011) eq. 32

    real(dp), intent(out) :: &
       omega              ! double integral, defined by Goldberg (2011) eq. 35
                          ! Note: omega here = Goldberg's omega/H

    integer, intent(in) :: i, j   ! temporary, for debugging

    !---------------------------------------------------------
    ! Local variables
    !---------------------------------------------------------

    integer :: k, p

    real(dp), dimension(nz,nQuadPoints_2d) :: &
       omega_kp   ! omega_k in a column associated with a quad point

    real(dp) :: &
       layer_avg, dz, depth

    !WHL - debug
    real(dp), dimension(nz) :: fact_k

    omega_k(:) = 0.d0
    omega = 0.d0

    ! Compute omega_k in the vertical column at each quad point
    do p = 1, nQuadPoints_2d
       omega_kp(nz,p) = 0.d0
       do k = nz-1, 1, -1
          depth = 0.5d0*(sigma(k)+sigma(k+1))   ! depth/thck
          dz = (sigma(k+1)-sigma(k)) * thck
          omega_kp(k,p) = omega_kp(k+1,p) + depth/efvs_qp(k,p) * dz
       enddo
    enddo

    ! Average from quad points to the cell center
    do k = 1, nz
       omega_k(k) = sum(omega_kp(k,:)) / nQuadPoints_2d
    enddo

    ! Integrate omega_k in the vertical to obtain omega
    omega = 0.d0
    do k = 1, nz-1
       layer_avg = 0.5d0*(omega_k(k) + omega_k(k+1))
       dz = sigma(k+1)-sigma(k)  ! dz/thck
       omega = omega + layer_avg * dz
    enddo
             
    if (verbose_diva .and. this_rank==rtest .and. i==itest .and. j==jtest) then
       write(iulog,*) ' '
       write(iulog,*) 'DIVA integrals, i, j =', i, j
       write(iulog,*) 'k, integral_k:'
       do k = 1, nz
          write(iulog,*) k, omega_k(k)
       enddo
       write(iulog,*) 'omega =', omega
    endif

    !TODO - Test results further with this integral
    !Note - The following code computes the integral Arthern-style.

    do p = 1, nQuadPoints_2d
       omega_kp(nz,p) = 0.d0
       do k = nz-1, 1, -1
          depth = 0.5d0*(sigma(k)+sigma(k+1))   ! depth/thck
          dz = (sigma(k+1)-sigma(k)) * thck
          omega_kp(k,p) = omega_kp(k+1,p) + depth**2/efvs_qp(k,p) * dz
       enddo
    enddo

    ! Average from quad points to the cell center
    do k = 1, nz
       fact_k(k) = sum(omega_kp(k,:)) / nQuadPoints_2d
    enddo
!!    omega = fact_k(1)  ! Uncomment to use Arthern value of omega
    
    if (verbose_diva .and. this_rank==rtest .and. i==itest .and. j==jtest) then
       write(iulog,*) ' '
       write(iulog,*) 'Arthern integrals, i, j =', i, j
       write(iulog,*) 'k, fact_k:'
       do k = 1, nz
          write(iulog,*) k, fact_k(k)
       enddo
       write(iulog,*) 'omega =', fact_k(1)
    endif

  end subroutine compute_integrals_diva

!****************************************************************************

  subroutine compute_3d_velocity_diva(&
       nx,                  ny,                 &
       nz,                  sigma,              &
       itest,    jtest,     rtest,              &
       active_vertex,       diva_level_index,   &
       ice_plus_land_mask,                      &
       stag_omega,          omega_k,            &
       beta,                                    &
       beta_eff_x,          beta_eff_y,         &
       stag_theta_slope_x,  stag_theta_slope_y, &
       stag_diva_slope_factor_x,                &
       stag_diva_slope_factor_y,                &
       uvel_2d,             vvel_2d,            &
       btractx,             btracty,            &
       uvel,                vvel)
    
    !----------------------------------------------------------------
    ! Compute the 3D velocity field for the DIVA scheme,
    ! given the 2D velocity solution and the 3D effective viscosity.
    !----------------------------------------------------------------

    !----------------------------------------------------------------
    ! Input-output arguments
    !----------------------------------------------------------------

    integer, intent(in) ::      &
       nx, ny,                  &    ! horizontal grid dimensions
       nz                            ! number of vertical levels at which velocity is computed

    real(dp), dimension(nz), intent(in) ::    &
       sigma              ! sigma vertical coordinate

    integer, intent(in) :: &
       itest, jtest, rtest           ! coordinates of diagnostic point

    logical, dimension(nx-1,ny-1), intent(in) ::  &
       active_vertex      ! true for vertices of active cells

    integer, intent(in) ::   &
       diva_level_index   ! level for which the DIVA scheme finds the 2D velocity
                          ! 0 = mean, 1 = upper surface

    integer, dimension(nx,ny), intent(in) :: &
       ice_plus_land_mask ! = 1 for active ice cells plus ice-free land cells

    real(dp), dimension(nz,nx,ny), intent(in) ::  &
       omega_k            ! single integral, defined by Goldberg eq. 32 (m^2/(Pa yr))
                          ! interpolated to staggered grid below

    real(dp), dimension(nx-1,ny-1), intent(in) ::  &
       beta,            & ! basal friction coefficient (Pa/(m/yr))
       beta_eff_x,      & ! effective beta, defined by Goldberg (2011) eq. 41
       beta_eff_y,      & ! beta*u_b = beta_eff_x*u_av, beta*v_b = beta_eff_y*v_av
       stag_theta_slope_x, & ! slope angle (radians) in x direction at vertices
       stag_theta_slope_y, & ! slope angle (radians) in y direction at vertices
       stag_diva_slope_factor_x, & ! slope correction factor in x direction
       stag_diva_slope_factor_y, & ! slope correction factor in y direction
       stag_omega,      & ! double integral, defined by Goldberg eq. 35 (m^2/(Pa yr))
                          ! already interpolated to staggered grid
                          ! Note: omega here = Goldberg's omega/H
       uvel_2d, vvel_2d   ! depth-integrated mean velocity; solution of 2D velocity solve (m/yr)

    real(dp), dimension(nx-1,ny-1), intent(out) ::  &
       btractx, btracty   ! components of basal traction (Pa); btractx = beta_eff * uvel

    real(dp), dimension(nz,nx-1,ny-1), intent(inout) ::  &
       uvel, vvel         ! 3D velocity components (m/yr)

    ! Local variables

    integer :: i, j, k

    real(dp), dimension(nz,nx-1,ny-1) ::  &
         stag_omega_k       ! single integral, defined by Goldberg eq. 32 (m^2/(Pa yr))
                            ! interpolated to staggered grid

    real(dp), dimension(nx-1,ny-1) ::  &
         stag_integral      ! integral that relates bed velocity to uvel_2d and vvel_2d
                            ! = stag_omega for diva_level_index = 0
                            ! = stag_omega_k(k,:,:) for other values of diva_level_index


    real(dp) ::  &
         slope_correction_x, & ! slope-based correction for vertical shear in x direction
         slope_correction_y    ! slope-based correction for vertical shear in y direction

    ! Compute the components of basal traction, based on Goldberg (2011) eq. 38-39.
    ! These are needed to compute the effective viscosity on the next nonlinear iteration.

    btractx(:,:) = beta_eff_x(:,:) * uvel_2d(:,:)
    btracty(:,:) = beta_eff_y(:,:) * vvel_2d(:,:)

    ! Interpolate omega_k to the staggered grid
    !TODO - Remove ice_plus_land_mask and do a standard staggering?
    do k = 1, nz
       call glissade_stagger(nx,              ny,                   &
                             omega_k(k,:,:),  stag_omega_k(k,:,:),  &
                             ice_plus_land_mask,                    &
                             stagger_margin_in = 1)
    enddo

    ! Identify the appropriate integral for relating uvel_2d/vvel_2d to the bed velocity

    if (diva_level_index == 0) then  ! solved for mean velocity
       stag_integral(:,:) = stag_omega(:,:)
    else
       k = diva_level_index
       stag_integral(:,:) = stag_omega_k(k,:,:)
    endif

    !----------------------------------------------------------------
    ! Compute the 3D velocity field
    !----------------------------------------------------------------

    do j = 1, ny-1
       do i = 1, nx-1
          if (active_vertex(i,j)) then

             ! basal velocity (Goldberg eq. 34)
             if (beta(i,j) > 0.0d0) then
                uvel(nz,i,j) = uvel_2d(i,j) * beta_eff_x(i,j)/beta(i,j)
                vvel(nz,i,j) = vvel_2d(i,j) * beta_eff_y(i,j)/beta(i,j)
             else
                uvel(nz,i,j) = uvel_2d(i,j)
                vvel(nz,i,j) = vvel_2d(i,j)
             endif

             ! vertical velocity profile (Goldberg eq. 32, with slope correction added))
             ! Note: slope_correction = 1 if diva_slope_correction = F
             slope_correction_x = cos(stag_theta_slope_x(i,j)) / stag_diva_slope_factor_x(i,j)
             slope_correction_y = cos(stag_theta_slope_y(i,j)) / stag_diva_slope_factor_y(i,j)
             do k = 1, nz-1
                uvel(k,i,j) = uvel(nz,i,j) + btractx(i,j)*stag_omega_k(k,i,j)*slope_correction_x
                vvel(k,i,j) = vvel(nz,i,j) + btracty(i,j)*stag_omega_k(k,i,j)*slope_correction_y
             enddo

          endif   ! active_vertex
       enddo      ! i
    enddo         ! j

  end subroutine compute_3d_velocity_diva

!****************************************************************************

  subroutine compute_3d_velocity_L1L2(nx,               ny,              &
                                      nz,                                &
                                      sigma,            stagsigma,       &
                                      dx,               dy,              &
                                      itest,   jtest,   rtest,           &
                                      parallel,                          &
                                      ice_mask,         land_mask,       &
                                      active_cell,      active_vertex,   &
                                      umask_dirichlet,  vmask_dirichlet, &
                                      xVertex,          yVertex,         &
                                      thck,             stagthck,        &
                                      usrf,                              &
                                      dusrf_dx,         dusrf_dy,        &
                                      flwa,             efvs,            &
                                      whichefvs,                         &
                                      uvel,             vvel)

    !----------------------------------------------------------------
    ! Given the basal velocity and the 3D profile of effective viscosity and
    !  horizontal-plane stresses, construct the 3D stress and velocity profiles
    !  for the L1L2 approximation, following Perego et al. (J. Glaciol., 2012).
    !----------------------------------------------------------------

    !----------------------------------------------------------------
    ! Input-output arguments
    !----------------------------------------------------------------

    integer, intent(in) ::      &
       nx, ny,                  &  ! horizontal grid dimensions
       nz                          ! number of vertical levels at which velocity is computed

    real(dp), intent(in) ::     &
       dx, dy                      ! grid cell length and width 

    integer, intent(in) :: &
       itest, jtest, rtest         ! coordinates of diagnostic point

    type(parallel_type), intent(in) :: &
       parallel                    ! info for parallel communication

    real(dp), dimension(nz), intent(in) ::    &
       sigma                       ! sigma vertical coordinate at layer boundaries

    real(dp), dimension(nz-1), intent(in) ::    &
       stagsigma                   ! sigma vertical coordinate at layer midpoints

    integer, dimension(nx,ny), intent(in) ::  &
       ice_mask,        & ! = 1 for cells where ice is present (thck > thklim), else = 0
       land_mask          ! = 1 for cells with topg >= eus, else = 0

    logical, dimension(nx,ny), intent(in) ::  &
       active_cell        ! true if cell contains ice and borders a locally owned vertex

    logical, dimension(nx-1,ny-1), intent(in) ::  &
       active_vertex      ! true for vertices of active cells

    real(dp), dimension(nx-1,ny-1), intent(in) ::   &
       xVertex, yVertex   ! x and y coordinates of vertices

    integer, dimension(nx-1,ny-1), intent(in) ::  &
       umask_dirichlet,  &! Dirichlet mask for u velocity, = 1 for prescribed velo, else = 0
       vmask_dirichlet    ! Dirichlet mask for v velocity, = 1 for prescribed velo, else = 0

    real(dp), dimension(nx,ny), intent(in) ::  &
       thck,             &! ice thickness at cell centers (m)
       usrf               ! upper surface elevation at cell centers (m)

    real(dp), dimension(nx-1,ny-1), intent(in) ::  &
       stagthck,       &  ! ice thickness at vertices (m)
       dusrf_dx,       &  ! upper surface elevation gradient at cell vertices (m/m)
       dusrf_dy

    real(dp), dimension(nz-1,nx,ny), intent(in) ::  &
       flwa,           &  ! temperature-based flow factor A, Pa^{-n} yr^{-1}
       efvs               ! effective viscosity, Pa yr

    integer, intent(in) :: &
       whichefvs          ! option for effective viscosity calculation

    real(dp), dimension(nz,nx-1,ny-1), intent(inout) ::  &
       uvel, vvel         ! velocity components (m/yr)
                          ! on input, only the basal component (index nz) is known
                          ! on output, the full 3D velocity field is known

    !----------------------------------------------------------------
    ! Local variables
    !----------------------------------------------------------------

    integer :: iVertex, jVertex  ! indices of element vertices

    real(dp), dimension(nNodesPerElement_2d) ::   &
       x, y,                    &! x and y coordinates of element vertices 
       u, v,                    &! basal velocity components at element vertices
       dphi_dx_2d, dphi_dy_2d    ! derivatives of basis functions, evaluated at cell center

    real(dp) ::   &
       detJ                      ! determinant of J (never used in calculation)

    real(dp), dimension(nx,ny) ::  &
       du_dx, du_dy,            &! basal strain rate components, evaluated at cell centers
       dv_dx, dv_dy,            &!
       work1, work2, work3       ! work arrays for computing tau_xz and tau_yz; located at cell centers

    real(dp), dimension(nz-1,nx,ny) ::   &
       tau_parallel,            &! tau_parallel, evaluated at cell centers
       efvs_integral_z_to_s      ! integral of effective viscosity from base of layer k
                                 ! to the upper surface (Pa yr m)

    ! Note: These L1L2 stresses are located at nodes.
    !       The diagnostic stresses (model%stress%tau%xz, etc.) are located at cell centers.
    real(dp), dimension(nz-1,nx-1,ny-1) ::   &
       tau_xz, tau_yz,          &! vertical shear stress components at layer midpoints for each vertex
       tau_xz_sia, tau_yz_sia    ! like tau_xz and tau_yz, but with SIA terms only
       
    real(dp), dimension(nx-1,ny-1) ::   &
       dwork1_dx, dwork1_dy,    &! derivatives of work arrays; located at vertices
       dwork2_dx, dwork2_dy,    &!
       dwork3_dx, dwork3_dy,    &!
       stagtau_parallel_sq,     &! tau_parallel^2, interpolated to staggered grid
       stagflwa                  ! flwa, interpolated to staggered grid

    real(dp) ::   &
       depth,                   &! distance from upper surface to midpoint of a given layer
       eps_parallel,            &! parallel effective strain rate, evaluated at cell centers
       tau_eff_sq,              &! square of effective stress (Pa^2)
                                 ! = tau_parallel^2 + tau_perp^2 for L1L2
       fact                      ! factor in velocity integral

    integer :: i, j, k, n

    !-----------------------------------------------------------------------------------------------
    ! Compute velocity at vertices following Perego et al. (2012).
    !
    ! The latest version was implemented in fall 2021 for the paper by Robinson et al. (TC, 2021).
    ! An important change was to evaluate both the membrane stress and SIA stress terms at layer midpoints.
    ! Previously, the membrane stress was evaluated at lower layer boundaries.
    ! With the change, the stability curve is parallel to the SIA curve at fine resolution for the slab problem.
    ! Before the change, the model did not converge on the solution for dx <~ 200 m.
    !-----------------------------------------------------------------------------------------------

    logical, parameter :: &
         include_membrane_stress_in_tau = .true.  ! if true, include membrane stresses in tau_xz and tau_yz;
                                                  ! if false, include the SIA stress only
    integer :: &
         staggered_ilo, staggered_ihi, &  ! bounds of locally owned vertices on staggered grid
         staggered_jlo, staggered_jhi

    staggered_ilo = parallel%staggered_ilo
    staggered_ihi = parallel%staggered_ihi
    staggered_jlo = parallel%staggered_jlo
    staggered_jhi = parallel%staggered_jhi

    ! Initialize
    efvs_integral_z_to_s(:,:,:) = 0.d0
    tau_parallel(:,:,:) = 0.d0
    du_dx(:,:) = 0.d0
    du_dy(:,:) = 0.d0
    dv_dx(:,:) = 0.d0
    dv_dy(:,:) = 0.d0
    tau_xz(:,:,:) = 0.d0
    tau_yz(:,:,:) = 0.d0
    tau_xz_sia(:,:,:) = 0.d0
    tau_yz_sia(:,:,:) = 0.d0

    ! initialize uvel = vvel = 0 except at bed
    uvel(1:nz-1,:,:) = 0.d0
    vvel(1:nz-1,:,:) = 0.d0

    ! Compute viscosity integral and strain rates in elements.
    ! Loop over all cells that border locally owned vertices.

    do j = nhalo+1, ny-nhalo+1
       do i = nhalo+1, nx-nhalo+1
          if (active_cell(i,j)) then

             ! Load x and y coordinates and basal velocity at cell vertices

             do n = 1, nNodesPerElement_2d

                ! Determine (i,j) for this vertex
                ! The reason for the '3' is that node 3, in the NE corner of the grid cell, has index (i,j).
                ! Indices for other nodes are computed relative to this vertex.
                iVertex = i + ishift(3,n)
                jVertex = j + jshift(3,n)

                x(n) = xVertex(iVertex,jVertex)
                y(n) = yVertex(iVertex,jVertex)

                u(n) = uvel(nz,iVertex,jVertex)   ! basal velocity
                v(n) = vvel(nz,iVertex,jVertex)

             enddo

             ! Compute dphi_dx and dphi_dy at cell center

             call get_basis_function_derivatives_2d(x(:),               y(:),               &
                                                    dphi_dxr_2d_ctr(:), dphi_dyr_2d_ctr(:), &
                                                    dphi_dx_2d(:),      dphi_dy_2d(:),      &
                                                    detJ,                                   &
                                                    itest, jtest, rtest,                    &
                                                    i, j, 1)

             ! Compute basal strain rate components at cell center
             
             do n = 1, nNodesPerElement_2d
                du_dx(i,j) = du_dx(i,j) + dphi_dx_2d(n)*u(n)
                du_dy(i,j) = du_dy(i,j) + dphi_dy_2d(n)*u(n)
                
                dv_dx(i,j) = dv_dx(i,j) + dphi_dx_2d(n)*v(n)
                dv_dy(i,j) = dv_dy(i,j) + dphi_dy_2d(n)*v(n)
             enddo

             ! Compute effective strain rate (squared) at cell centers
             ! See Perego et al. Eq. 17:
             !     eps_parallel^2 = eps_xx^2 + eps_yy^2 + eps_xx*eps_yy + eps_xy^2

             eps_parallel = sqrt(du_dx(i,j)**2 + dv_dy(i,j)**2 + du_dx(i,j)*dv_dy(i,j)  &
                                 + 0.25d0*(dv_dx(i,j) + du_dy(i,j))**2)

             ! For each layer k, compute tau_parallel at cell centers
             do k = 1, nz-1
                tau_parallel(k,i,j) = 2.d0 * efvs(k,i,j) * eps_parallel
             enddo

             ! For each layer k, compute the integral of the effective viscosity from
             ! the midpoint of layer k to the upper surface.

             efvs_integral_z_to_s(1,i,j) = efvs(1,i,j) * (stagsigma(1))*thck(i,j)
             do k = 2, nz-1
                efvs_integral_z_to_s(k,i,j) = efvs_integral_z_to_s(k-1,i,j)  &
                                            + efvs(k-1,i,j) * (sigma(k) - stagsigma(k-1))*thck(i,j)  &
                                            + efvs(k,i,j)   * (stagsigma(k) - sigma(k))*thck(i,j)
             enddo   ! k

          endif   ! active_cell
       enddo      ! i
    enddo         ! j

    call parallel_halo(tau_parallel, parallel)

    !--------------------------------------------------------------------------------
    ! For each active vertex, compute the vertical shear stresses tau_xz and tau_yz
    ! in each layer of the column.
    !
    ! These stresses are given by Perego et al. Eq. 27:
    !
    !   tau_xz(z) = -rhoi*grav*ds_dx*(s-z) + 2*d/dx[efvs_int(z) * (2*du_dx + dv_dy)]
    !                                      + 2*d/dy[efvs_int(z) *   (du_dy + dv_dx)] 
    !
    !   tau_yz(z) = -rhoi*grav*ds_dy*(s-z) + 2*d/dx[efvs_int(z) *   (du_dy + dv_dx)] 
    !                                      + 2*d/dy[efvs_int(z) * (2*dv_dy + du_dx)]
    !
    ! where efvs_int is the integral of efvs from z to s computed above;
    ! the strain rate components of basal velocity are also as computed above.
    !
    ! There is not a clean way to compute these stresses using finite-element techniques,
    ! because strain rates are discontinuous at cell edges and vertices.  Instead, we use
    ! a standard centered finite difference method to evaluate d/dx and d/dy of the
    ! bracketed terms.
    !
    ! Given the vertical shear stresses tau_xz and tau_yz for each layer k,
    !  compute the velocity components at each level, following Perego et al. Eq. 30:
    ! 
    !    u(z) = u_b + 2 * integral_b_to_z [A*tau_eff^(n-1)*tau_xz dz]
    !    v(z) = v_b + 2 * integral_b_to_z [A*tau_eff^(n-1)*tau_yz dz]
    ! 
    ! where tau_eff^2 = tau_parallel^2 + tau_perp^2
    !
    !    tau_parallel^2 = (2 * efvs * eps_parallel)^2
    !    tau_perp ^2 = tau_xz^2 + tau_yz^2
    !--------------------------------------------------------------------------------

    do k = nz-1, 1, -1   ! loop over velocity levels above the bed

       ! Average tau_parallel and flwa to vertices
       ! With stagger_margin_in = 1, only cells with ice are included in the average.

       call glissade_stagger(nx,                   ny,                         &
                             tau_parallel(k,:,:),  stagtau_parallel_sq(:,:),   &
                             ice_mask,             stagger_margin_in = 1)
       stagtau_parallel_sq(:,:) = stagtau_parallel_sq(:,:)**2

       call glissade_stagger(nx,          ny,              &
                             flwa(k,:,:), stagflwa(:,:),   &
                             ice_mask,    stagger_margin_in = 1)

       ! Compute work arrays at cell centers.
       ! These are needed to find tau_xz and tau_yz.

       work1(:,:) = efvs_integral_z_to_s(k,:,:) * (2.d0*du_dx(:,:) + dv_dy(:,:))
       work2(:,:) = efvs_integral_z_to_s(k,:,:) *      (du_dy(:,:) + dv_dx(:,:))
       work3(:,:) = efvs_integral_z_to_s(k,:,:) * (2.d0*dv_dy(:,:) + du_dx(:,:))

       call parallel_halo(work1, parallel)
       call parallel_halo(work2, parallel)
       call parallel_halo(work3, parallel)

       ! Compute horizontal gradients of the work arrays.
       ! We need dwork1_dx, dwork2_dx, dwork2_dy and dwork3_dx at vertices.
       ! The calls to glissade_centered_gradient compute a couple of extraneous derivatives,
       !  but these calls are simpler than inlining the gradient code.
       ! With gradient_margin_in = 1, only ice-covered cells are included in the gradient.
       ! This is the appropriate setting, since efvs and strain rates have no meaning in ice-free cells.
       
       call glissade_gradient(nx,           ny,         &
                              dx,           dy,         &
                              itest, jtest, rtest,      &
                              work1,                    &
                              dwork1_dx,    dwork1_dy,  &
                              ice_mask,                 &
                              gradient_margin_in = 1)

       call glissade_gradient(nx,           ny,         &
                              dx,           dy,         &
                              itest, jtest, rtest,      &
                              work2,                    &
                              dwork2_dx,    dwork2_dy,  &
                              ice_mask,                 &
                              gradient_margin_in = 1)

       call glissade_gradient(nx,           ny,         &
                              dx,           dy,         &
                              itest, jtest, rtest,      &
                              work3,                    &
                              dwork3_dx,    dwork3_dy,  &
                              ice_mask,                 &
                              gradient_margin_in = 1)

       ! loop over locally owned active vertices
       do j = staggered_jlo, staggered_jhi
          do i = staggered_ilo, staggered_ihi
             if (active_vertex(i,j)) then

                ! Evaluate tau_xz and tau_yz for this layer
                ! Compute two versions of these stresses: with all terms including membrane stresses,
                !  and with SIA terms only.  Optionally, the SIA-only versions can be used in velocity integrals.

                depth = stagsigma(k) * stagthck(i,j)   ! depth at layer midpoint
                tau_xz_sia(k,i,j) = -rhoi*grav*depth*dusrf_dx(i,j)
                tau_yz_sia(k,i,j) = -rhoi*grav*depth*dusrf_dy(i,j)

                tau_xz(k,i,j) = tau_xz_sia(k,i,j) + 2.d0*dwork1_dx(i,j) + dwork2_dy(i,j)
                tau_yz(k,i,j) = tau_yz_sia(k,i,j) + dwork2_dx(i,j) + 2.d0*dwork3_dy(i,j)

                tau_eff_sq = stagtau_parallel_sq(i,j) + tau_xz(k,i,j)**2 + tau_yz(k,i,j)**2

                ! Note: The first formula below is correct for whichefvs = 2 (efvs computed from effective strain rate),
                !        but not for whichefvs = 0 (constant efvs) or whichefvs = 1 (multiple of flow factor).
                !       For these options we need a modified formula.
                !
                ! Recall: efvs = 1/2 * A^(-1/n) * eps_e^[(1-n)/n]
                !              = 1/2 * A^(-1/n) * [A tau_e^n]^[(1-n)/n]
                !              = 1/2 * A^(-1) * tau_e^(1-n)
                !  =>   1/efvs = 2 * A * tau_e(n-1)
                !
                ! Thus, for options 0 and 1, we can replace 2 * A * tau_e^(n-1) below with 1/efvs.

                if (whichefvs == HO_EFVS_NONLINEAR) then
                   fact = 2.d0 * stagflwa(i,j) * tau_eff_sq**((n_glen-1.d0)/2.d0) &
                        * (sigma(k+1) - sigma(k))*stagthck(i,j)
                else   ! HO_EFVS_CONSTANT, HO_EFVS_FLOWFACT
                   if (efvs(k,i,j) > 0.0d0) then
                      fact = (sigma(k+1) - sigma(k))*stagthck(i,j) / efvs(k,i,j)
                   else
                      fact = 0.0d0
                   endif
                endif

                ! Reset velocity to prescribed basal value if Dirichlet condition applies,
                ! else compute velocity at this level

                if (umask_dirichlet(i,j) == 1) then
                   uvel(k,i,j) = uvel(nz,i,j)
                else
                   if (include_membrane_stress_in_tau) then
                      uvel(k,i,j) = uvel(k+1,i,j) + fact * tau_xz(k,i,j)
                   else   ! SIA stress term only
                      uvel(k,i,j) = uvel(k+1,i,j) + fact * tau_xz_sia(k,i,j)
                   endif
                endif

                if (vmask_dirichlet(i,j) == 1) then
                   vvel(k,i,j) = vvel(nz,i,j)
                else
                   if (include_membrane_stress_in_tau) then
                      vvel(k,i,j) = vvel(k+1,i,j) + fact * tau_yz(k,i,j)
                   else   ! SIA stress term only
                      vvel(k,i,j) = vvel(k+1,i,j) + fact * tau_yz_sia(k,i,j)
                   endif
                endif

                if (verbose_L1L2 .and. this_rank==rtest .and. i==itest .and. j==jtest) then
                   depth = stagsigma(k) * stagthck(i,j)
                   write(iulog,*) ' '
                   write(iulog,*) 'k, depth, fact:', &
                        k, depth, fact
                   write(iulog,*) 'tau_xz(i,j): SIA term, membrane term, total:', &
                        tau_xz_sia(k,i,j), tau_xz(k,i,j) - tau_xz_sia(k,i,j), tau_xz(k,i,j)
                   write(iulog,*) 'tau_yz(i,j): SIA term, membrane term, total:', &
                        tau_yz_sia(k,i,j), tau_yz(k,i,j) - tau_yz_sia(k,i,j), tau_yz(k,i,j)
                   write(iulog,*) 'uvel(k), vvel(k):', uvel(k,i,j), vvel(k,i,j)
                endif

             endif

          enddo   ! i
       enddo      ! j

    enddo         ! k

  end subroutine compute_3d_velocity_L1L2

!****************************************************************************

  subroutine get_basis_function_derivatives_3d(xNode,       yNode,       zNode,       &
                                               dphi_dxr_3d, dphi_dyr_3d, dphi_dzr_3d, &
                                               dphi_dx_3d,  dphi_dy_3d,  dphi_dz_3d,  &
                                               detJ,                                  &
                                               itest, jtest, rtest,                   &
                                               i, j, k, p)

    !------------------------------------------------------------------
    ! Evaluate the x, y and z derivatives of the element basis functions
    ! at a particular quadrature point.
    !
    ! Also determine the Jacobian of the transformation between the
    ! reference element and the true element.
    ! 
    ! This subroutine should work for any 3D element with any number of nodes.
    !------------------------------------------------------------------
 
    real(dp), dimension(nNodesPerElement_3d), intent(in) :: &
       xNode, yNode, zNode,          &! nodal coordinates
       dphi_dxr_3d, dphi_dyr_3d, dphi_dzr_3d   ! derivatives of basis functions at quad pt
                                               !  wrt x, y and z in reference element

    real(dp), dimension(nNodesPerElement_3d), intent(out) :: &
       dphi_dx_3d, dphi_dy_3d, dphi_dz_3d      ! derivatives of basis functions at quad pt
                                               !  wrt x, y and z in true Cartesian coordinates  

    real(dp), intent(out) :: &
         detJ      ! determinant of Jacobian matrix

    real(dp), dimension(3,3) ::  &
         Jac,      &! Jacobian matrix
         Jinv,     &! inverse Jacobian matrix
         cofactor   ! matrix of cofactors

    integer, intent(in) :: &
       itest, jtest, rtest              ! coordinates of diagnostic point

    integer, intent(in) :: i, j, k, p   ! indices passed in for debugging

    integer :: n, row, col

    logical, parameter :: Jac_bug_check = .false.   ! set to true for debugging
    real(dp), dimension(3,3) :: prod     ! Jac * Jinv (should be identity matrix)

    !------------------------------------------------------------------
    ! Compute the Jacobian for the transformation from the reference
    ! coordinates to the true coordinates:
    !
    !                 |                                                                          |
    !                 | sum_n{dphi_n/dxr * xn}   sum_n{dphi_n/dxr * yn}   sum_n{dphi_n/dxr * zn} |
    !   J(xr,yr,zr) = |                                                                          |
    !                 | sum_n{dphi_n/dyr * xn}   sum_n{dphi_n/dyr * yn}   sum_n{dphi_n/dyr * zn} |
    !                 |                                                                          |
    !                 | sum_n{dphi_n/dzr * xn}   sum_n{dphi_n/dzr * yn}   sum_n{dphi_n/dzr * zn} |
    !                 !                                                                          |
    !
    ! where (xn,yn,zn) are the true Cartesian nodal coordinates,
    !       (xr,yr,zr) are the coordinates of the quad point in the reference element,
    !       and sum_n denotes a sum over nodes.
    !------------------------------------------------------------------

    if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then
       write(iulog,*) ' '
       write(iulog,*) 'In get_basis_function_derivatives_3d: i, j, k, p =', i, j, k, p
    endif

    Jac(:,:) = 0.d0

    do n = 1, nNodesPerElement_3d
       Jac(1,1) = Jac(1,1) + dphi_dxr_3d(n) * xNode(n)
       Jac(1,2) = Jac(1,2) + dphi_dxr_3d(n) * yNode(n)
       Jac(1,3) = Jac(1,3) + dphi_dxr_3d(n) * zNode(n)
       Jac(2,1) = Jac(2,1) + dphi_dyr_3d(n) * xNode(n)
       Jac(2,2) = Jac(2,2) + dphi_dyr_3d(n) * yNode(n)
       Jac(2,3) = Jac(2,3) + dphi_dyr_3d(n) * zNode(n)
       Jac(3,1) = Jac(3,1) + dphi_dzr_3d(n) * xNode(n)
       Jac(3,2) = Jac(3,2) + dphi_dzr_3d(n) * yNode(n)
       Jac(3,3) = Jac(3,3) + dphi_dzr_3d(n) * zNode(n)
    enddo

    !------------------------------------------------------------------
    ! Compute the determinant and inverse of J
    !------------------------------------------------------------------

    cofactor(1,1) =   Jac(2,2)*Jac(3,3) - Jac(2,3)*Jac(3,2)
    cofactor(1,2) = -(Jac(2,1)*Jac(3,3) - Jac(2,3)*Jac(3,1))
    cofactor(1,3) =   Jac(2,1)*Jac(3,2) - Jac(2,2)*Jac(3,1)
    cofactor(2,1) = -(Jac(1,2)*Jac(3,3) - Jac(1,3)*Jac(3,2))
    cofactor(2,2) =   Jac(1,1)*Jac(3,3) - Jac(1,3)*Jac(3,1)
    cofactor(2,3) = -(Jac(1,1)*Jac(3,2) - Jac(1,2)*Jac(3,1))
    cofactor(3,1) =   Jac(1,2)*Jac(2,3) - Jac(1,3)*Jac(2,2)
    cofactor(3,2) = -(Jac(1,1)*Jac(2,3) - Jac(1,3)*Jac(2,1))
    cofactor(3,3) =   Jac(1,1)*Jac(2,2) - Jac(1,2)*Jac(2,1)

    detJ = Jac(1,1)*cofactor(1,1) + Jac(1,2)*cofactor(1,2) + Jac(1,3)*cofactor(1,3)

    if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then
       write(iulog,*) ' '
       write(iulog,*) 'detJ1:', Jac(1,1)*cofactor(1,1) + Jac(1,2)*cofactor(1,2) + Jac(1,3)*cofactor(1,3)
       write(iulog,*) 'detJ2:', Jac(2,1)*cofactor(2,1) + Jac(2,2)*cofactor(2,2) + Jac(2,3)*cofactor(2,3)
       write(iulog,*) 'detJ3:', Jac(3,1)*cofactor(3,1) + Jac(3,2)*cofactor(3,2) + Jac(3,3)*cofactor(3,3)
    endif

    if (abs(detJ) > 0.d0) then
       do col = 1, 3
          do row = 1, 3
             Jinv(row,col) = cofactor(col,row)
          enddo
       enddo
       Jinv(:,:) = Jinv(:,:) / detJ
    else
       write(iulog,*) 'stopping, det J = 0'
       write(iulog,*) 'i, j, k, p:', i, j, k, p
       write(iulog,*) 'Jacobian matrix:'
       write(iulog,*) Jac(1,:)
       write(iulog,*) Jac(2,:)
       write(iulog,*) Jac(3,:) 
       call write_log('Jacobian matrix is singular', GM_FATAL)
    endif

    if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then
       write(iulog,*) ' '
       write(iulog,*) 'Jacobian calc, p =', p
       write(iulog,*) 'det J =', detJ
       write(iulog,*) ' '
       write(iulog,*) 'Jacobian matrix:'
       write(iulog,*) Jac(1,:)
       write(iulog,*) Jac(2,:)
       write(iulog,*) Jac(3,:)
       write(iulog,*) ' '
       write(iulog,*) 'cofactor matrix:'
       write(iulog,*) cofactor(1,:)
       write(iulog,*) cofactor(2,:)
       write(iulog,*) cofactor(3,:)
       write(iulog,*) ' '
       write(iulog,*) 'Inverse matrix:'
       write(iulog,*) Jinv(1,:)
       write(iulog,*) Jinv(2,:)
       write(iulog,*) Jinv(3,:)
       write(iulog,*) ' '
       prod = matmul(Jac, Jinv)
       write(iulog,*) 'Jac*Jinv:'
       write(iulog,*) prod(1,:)
       write(iulog,*) prod(2,:)
       write(iulog,*) prod(3,:)
    endif

    ! Optional bug check: Verify that J * Jinv = I

    if (Jac_bug_check) then
       prod = matmul(Jac,Jinv)
       do col = 1, 3
          do row = 1, 3
             if (abs(prod(row,col) - identity3(row,col)) > 1.d-11) then
                write(iulog,*) 'stopping, Jac * Jinv /= identity'
                write(iulog,*) 'i, j, k, p:', i, j, k, p
                write(iulog,*) 'Jac*Jinv:'
                write(iulog,*) prod(1,:)
                write(iulog,*) prod(2,:)
                write(iulog,*) prod(3,:)
                call write_log('Jacobian matrix was not correctly inverted', GM_FATAL)
             endif
          enddo
       enddo
    endif  ! Jac_bug_check

    !------------------------------------------------------------------
    ! Compute the contribution of this quadrature point to dphi/dx and dphi/dy
    ! for each basis function.
    !
    !   | dphi_n/dx |          | dphi_n/dxr |
    !   |           |          |            | 
    !   | dphi_n/dy | = Jinv * | dphi_n/dyr |
    !   |           |          |            |
    !   | dphi_n/dz |          | dphi_n/dzr |
    !
    !------------------------------------------------------------------

    dphi_dx_3d(:) = 0.d0
    dphi_dy_3d(:) = 0.d0
    dphi_dz_3d(:) = 0.d0

    do n = 1, nNodesPerElement_3d
       dphi_dx_3d(n) = Jinv(1,1)*dphi_dxr_3d(n)  &
                     + Jinv(1,2)*dphi_dyr_3d(n)  &
                     + Jinv(1,3)*dphi_dzr_3d(n)
       dphi_dy_3d(n) = Jinv(2,1)*dphi_dxr_3d(n)  &
                     + Jinv(2,2)*dphi_dyr_3d(n)  &
                     + Jinv(2,3)*dphi_dzr_3d(n)
       dphi_dz_3d(n) = Jinv(3,1)*dphi_dxr_3d(n)  &
                     + Jinv(3,2)*dphi_dyr_3d(n)  &
                     + Jinv(3,3)*dphi_dzr_3d(n)
    enddo

    if (Jac_bug_check) then

       ! Check that the sum of dphi_dx, etc. is close to zero  

       if (abs( sum(dphi_dx_3d)/maxval(dphi_dx_3d) ) > 1.d-11) then
          write(iulog,*) 'stopping, sum over basis functions of dphi_dx > 0'
          write(iulog,*) 'dphi_dx_3d =', dphi_dx_3d(:)
          write(iulog,*) 'sum =', sum(dphi_dx_3d)
          write(iulog,*) 'i, j, k, p =', i, j, k, p
          call write_log('Sum over basis functions of dphi_dx /= 0', GM_FATAL)
       endif

       if (abs( sum(dphi_dy_3d)/maxval(dphi_dy_3d) ) > 1.d-11) then
          write(iulog,*) 'stopping, sum over basis functions of dphi_dy > 0'
          write(iulog,*) 'dphi_dy_3d =', dphi_dy_3d(:)
          write(iulog,*) 'sum =', sum(dphi_dy_3d)
          write(iulog,*) 'i, j, k, p =', i, j, k, p
          call write_log('Sum over basis functions of dphi_dy /= 0', GM_FATAL)
       endif

       if (abs( sum(dphi_dz_3d)/maxval(dphi_dz_3d) ) > 1.d-11) then
          write(iulog,*) 'stopping, sum over basis functions of dphi_dz > 0'
          write(iulog,*) 'dphi_dz_3d =', dphi_dz_3d(:)
          write(iulog,*) 'sum =', sum(dphi_dz_3d)
          write(iulog,*) 'i, j, k, p =', i, j, k, p
          call write_log('Sum over basis functions of dphi_dz /= 0', GM_FATAL)
       endif

    endif  ! Jac_bug_check

  end subroutine get_basis_function_derivatives_3d

!****************************************************************************

  subroutine get_basis_function_derivatives_2d(xNode,       yNode,         &
                                               dphi_dxr_2d, dphi_dyr_2d,   &
                                               dphi_dx_2d,  dphi_dy_2d,    &
                                               detJ,                       &
                                               itest, jtest, rtest,        &
                                               i, j, p)

    !------------------------------------------------------------------
    ! Evaluate the x and y derivatives of 2D element basis functions
    ! at a particular quadrature point.
    !
    ! Also determine the Jacobian of the transformation between the
    ! reference element and the true element.
    ! 
    ! This subroutine should work for any 2D element with any number of nodes.
    !------------------------------------------------------------------

    real(dp), dimension(nNodesPerElement_2d), intent(in) :: &
       xNode, yNode,                   &! nodal coordinates
       dphi_dxr_2d, dphi_dyr_2d         ! derivatives of basis functions at quad pt
                                        !  wrt x and y in reference element

    real(dp), dimension(nNodesPerElement_2d), intent(out) :: &
       dphi_dx_2d, dphi_dy_2d           ! derivatives of basis functions at quad pt
                                        !  wrt x and y in true Cartesian coordinates  

    real(dp), intent(out) :: &
                detJ      ! determinant of Jacobian matrix

    real(dp), dimension(2,2) ::  &
                Jac,      &! Jacobian matrix
                Jinv       ! inverse Jacobian matrix

    integer, intent(in) :: &
       itest, jtest, rtest              ! coordinates of diagnostic point

    integer, intent(in) :: i, j, p

    integer :: n, row, col

    logical, parameter :: Jac_bug_check = .false.   ! set to true for debugging
    real(dp), dimension(2,2) :: prod     ! Jac * Jinv (should be identity matrix)

    !------------------------------------------------------------------
    ! Compute the Jacobian for the transformation from the reference
    ! coordinates to the true coordinates:
    !
    !              |                                                  |
    !              | sum_n{dphi_n/dxr * xn}   sum_n{dphi_n/dxr * yn}  |
    !   J(xr,yr) = |                                                  |
    !              | sum_n{dphi_n/dyr * xn}   sum_n{dphi_n/dyr * yn}  |
    !              |                                                  |
    !
    ! where (xn,yn) are the true Cartesian nodal coordinates,
    !       (xr,yr) are the coordinates of the quad point in the reference element,
    !       and sum_n denotes a sum over nodes.
    !------------------------------------------------------------------

    Jac(:,:) = 0.d0

    if ((verbose_Jac .or. verbose_diva) .and. this_rank==rtest .and. i==itest .and. j==jtest) then
       write(iulog,*) ' '
       write(iulog,*) 'In get_basis_function_derivatives_2d: i, j, p =', i, j, p
    endif

    do n = 1, nNodesPerElement_2d
       if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest) then
          write(iulog,*) ' '
          write(iulog,*) 'n, x, y:', n, xNode(n), yNode(n)
          write(iulog,*) 'dphi_dxr_2d, dphi_dyr_2d:', dphi_dxr_2d(n), dphi_dyr_2d(n)
       endif
       Jac(1,1) = Jac(1,1) + dphi_dxr_2d(n) * xNode(n)
       Jac(1,2) = Jac(1,2) + dphi_dxr_2d(n) * yNode(n)
       Jac(2,1) = Jac(2,1) + dphi_dyr_2d(n) * xNode(n)
       Jac(2,2) = Jac(2,2) + dphi_dyr_2d(n) * yNode(n)
    enddo

    !------------------------------------------------------------------
    ! Compute the determinant and inverse of J
    !------------------------------------------------------------------

    detJ = Jac(1,1)*Jac(2,2) - Jac(1,2)*Jac(2,1)

    if (abs(detJ) > 0.d0) then
       Jinv(1,1) =  Jac(2,2)/detJ
       Jinv(1,2) = -Jac(1,2)/detJ
       Jinv(2,1) = -Jac(2,1)/detJ
       Jinv(2,2) =  Jac(1,1)/detJ
    else
       write(iulog,*) 'stopping, det J = 0'
       write(iulog,*) 'i, j, p:', i, j, p
       write(iulog,*) 'Jacobian matrix:'
       write(iulog,*) Jac(1,:)
       write(iulog,*) Jac(2,:)
       call write_log('Jacobian matrix is singular', GM_FATAL)
    endif

    if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest) then
       write(iulog,*) ' '
       write(iulog,*) 'Jacobian calc, p =', p
       write(iulog,*) 'det J =', detJ
       write(iulog,*) ' '
       write(iulog,*) 'Jacobian matrix:'
       write(iulog,*) Jac(1,:)
       write(iulog,*) Jac(2,:)
       write(iulog,*) ' '
       write(iulog,*) 'Inverse matrix:'
       write(iulog,*) Jinv(1,:)
       write(iulog,*) Jinv(2,:)
       write(iulog,*) ' '
       prod = matmul(Jac, Jinv)
       write(iulog,*) 'Jac*Jinv:'
       write(iulog,*) prod(1,:)
       write(iulog,*) prod(2,:)
    endif

    ! Optional bug check - Verify that J * Jinv = I

    if (Jac_bug_check) then
       prod = matmul(Jac,Jinv)
       do col = 1, 2
          do row = 1, 2
             if (abs(prod(row,col) - identity3(row,col)) > 1.d-12) then
                write(iulog,*) 'stopping, Jac * Jinv /= identity'
                write(iulog,*) 'i, j, p:', i, j, p
                write(iulog,*) 'Jac*Jinv:'
                write(iulog,*) prod(1,:)
                write(iulog,*) prod(2,:)
                call write_log('Jacobian matrix was not correctly inverted', GM_FATAL)
             endif
          enddo
       enddo
    endif

    !------------------------------------------------------------------
    ! Compute the contribution of this quadrature point to dphi/dx and dphi/dy
    ! for each basis function.
    !
    !   | dphi_n/dx |          | dphi_n/dxr |
    !   |           | = Jinv * |            |
    !   | dphi_n/dy |          | dphi_n/dyr |
    !
    !------------------------------------------------------------------

    dphi_dx_2d(:) = 0.d0
    dphi_dy_2d(:) = 0.d0

    do n = 1, nNodesPerElement_2d
       dphi_dx_2d(n) = dphi_dx_2d(n) + Jinv(1,1)*dphi_dxr_2d(n)  &
                                     + Jinv(1,2)*dphi_dyr_2d(n)
       dphi_dy_2d(n) = dphi_dy_2d(n) + Jinv(2,1)*dphi_dxr_2d(n)  &
                                     + Jinv(2,2)*dphi_dyr_2d(n)
    enddo

    if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest) then
       write(iulog,*) ' '
       write(iulog,*) 'dphi_dx_2d:', dphi_dx_2d(:)
       write(iulog,*) 'dphi_dy_2d:', dphi_dy_2d(:)
    endif

    if (Jac_bug_check) then

       ! Check that the sum of dphi_dx, etc. is close to zero  
       if (abs( sum(dphi_dx_2d)/maxval(dphi_dx_2d) ) > 1.d-11) then
          write(iulog,*) 'stopping, sum over basis functions of dphi_dx > 0'
          write(iulog,*) 'dphi_dx_2d =', dphi_dx_2d(:)
          write(iulog,*) 'i, j, p =', i, j, p
          call write_log('Sum over basis functions of dphi_dx /= 0', GM_FATAL)
       endif

       if (abs( sum(dphi_dy_2d)/maxval(dphi_dy_2d) ) > 1.d-11) then
          write(iulog,*) 'stopping, sum over basis functions of dphi_dy > 0'
          write(iulog,*) 'dphi_dy =', dphi_dy_2d(:)
          write(iulog,*) 'i, j, p =', i, j, p
          call write_log('Sum over basis functions of dphi_dy /= 0', GM_FATAL)
       endif

    endif

  end subroutine get_basis_function_derivatives_2d

!****************************************************************************

  subroutine get_area_scale_factor_curved_2d(&
       xNode,       yNode,       zNode,      &
       dphi_dxr_2d, dphi_dyr_2d,             &
       itest, jtest, rtest,                  &
       i, j, p,                              &
       area_scale_factor)

    !------------------------------------------------------------------
    ! Compute a scaling factor for area transformation between flat 2D coordinates
    ! and a curved 2D surface (e.g., the base of an ice sheet) embedded in 3D space.
    !
    ! This subroutine should work for any 2D element with any number of nodes.
    !------------------------------------------------------------------

    real(dp), dimension(nNodesPerElement_2d), intent(in) :: &
         xNode, yNode, zNode,           & ! nodal coordinates
         dphi_dxr_2d, dphi_dyr_2d         ! derivatives of basis functions at quad pt
                                          !  wrt x and y in reference element

    integer, intent(in) :: &
       itest, jtest, rtest              ! coordinates of diagnostic point

    integer, intent(in) :: i, j, p

    real(dp), intent(out) :: &
         area_scale_factor   ! area scaling factor for transformations between a flat 2D surface
                             ! and a curved 2D surface in 3D space;

    ! Local variables

    real(dp), dimension(2,3) ::  &
         Jac               ! Jacobian matrix J

    integer :: n
    real(dp) :: term1, term2, term3

    !------------------------------------------------------------------
    ! Compute the Jacobian for the transformation from the reference coordinates (in R2)
    ! to the 3D true coordinates (in R3):
    !
    !              |                                                                           |
    !              | sum_n{dphi_n/dxr * xn}   sum_n{dphi_n/dxr * yn}   sum_n{dphi_n/dxr * zn}  |
    !   J(xr,yr) = |                                                                           |
    !              | sum_n{dphi_n/dyr * xn}   sum_n{dphi_n/dyr * yn}   sum_n{dphi_n/dyr * zn}  |
    !              |                                                                           |
    !
    ! where (xn,yn,zn) are the true Cartesian nodal coordinates,
    !       (xr,yr) are the coordinates of the quad point in the reference element,
    !       and sum_n denotes a sum over nodes.
    !------------------------------------------------------------------

    if ((verbose_Jac .or. verbose_diva) .and. this_rank==rtest .and. i==itest .and. j==jtest) then
       write(iulog,*) ' '
       write(iulog,*) 'In get_area_scale_factor_curved_2d: i, j, p =', i, j, p
    endif

    Jac(:,:) = 0.d0

    do n = 1, nNodesPerElement_2d
       if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest) then
          write(iulog,*) ' '
          write(iulog,*) 'n, x, y, z:', n, xNode(n), yNode(n), zNode(n)
          write(iulog,*) 'dphi_dxr_2d, dphi_dyr_2d:', dphi_dxr_2d(n), dphi_dyr_2d(n)
       endif
       Jac(1,1) = Jac(1,1) + dphi_dxr_2d(n) * xNode(n)
       Jac(1,2) = Jac(1,2) + dphi_dxr_2d(n) * yNode(n)
       Jac(1,3) = Jac(1,3) + dphi_dxr_2d(n) * zNode(n)
       Jac(2,1) = Jac(2,1) + dphi_dyr_2d(n) * xNode(n)
       Jac(2,2) = Jac(2,2) + dphi_dyr_2d(n) * yNode(n)
       Jac(2,3) = Jac(2,3) + dphi_dyr_2d(n) * zNode(n)
    enddo

    if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest) then
       write(iulog,*) ' '
       write(iulog,*) '2x3 Jacobian matrix:'
       write(iulog,*) Jac(1,:)
       write(iulog,*) Jac(2,:)
    endif

    !------------------------------------------------------------------
    ! Compute the area scaling factor for this transformation.
    ! For a square matrix, this would be det(J).
    ! For non-square J, the following formula works. It is similar to
    !  computing a cross product of the vectors joining the nodes.
    !------------------------------------------------------------------

    term1 =  (Jac(1,1)*Jac(2,2) - Jac(1,2)*Jac(2,1))
    term2 = -(Jac(1,1)*Jac(2,3) - Jac(1,3)*Jac(2,1))
    term3 =  (Jac(1,2)*Jac(2,3) - Jac(1,3)*Jac(2,2))
    area_scale_factor = sqrt(term1**2 + term2**2 + term3**2)

  end subroutine get_area_scale_factor_curved_2d

!****************************************************************************

  subroutine compute_basal_friction_heatflx(nx,            ny,            &
                                            nhalo,                        &
                                            itest, jtest,  rtest,         &
                                            active_cell,   active_vertex, &
                                            xVertex,       yVertex,       &
                                            uvel,          vvel,          &
                                            beta,          whichassemble_bfric,  &
                                            bfricflx)

    !----------------------------------------------------------------
    ! Compute the heat flux due to basal friction, given the 2D basal
    !  velocity and beta fields.
    !
    ! Assume a sliding law of the form:
    !   tau_x = -beta*u
    !   tau_y = -beta*v
    ! where beta and (u,v) are defined at vertices.
    !
    ! The frictional heat flux (W/m^2) is given by q_b = tau_b * u_b,
    ! where tau_b and u_b are the magnitudes of the basal stress
    ! and velocity (e.g., Cuffey & Paterson, p. 418).
    !
    ! Note: There is a choice of two methods for this calculation:
    !       (0) a finite-element method, summing over beta*(u^2 + v^2) at quadrature points
    !       (1) a simple method, computing beta*(u^2 + v^2) at vertices
    !       Method (0) should formally be more accurate, at least where the flow is smooth.
    !       However, it can lead to inaccurate and hugely excessive frictional fluxes where
    !        the flow transitions steeply from high beta/low velo to low beta/high velo
    !        (e.g., at the edge of fjords). In this case there are QPs with relatively
    !        high velocity combined with large beta.
    !       To choose method (1), set which_ho_assemble_bfric = 1 in the config file.
    !----------------------------------------------------------------

    !----------------------------------------------------------------
    ! Input-output arguments
    !----------------------------------------------------------------

    integer, intent(in) ::      &
       nx, ny,                  &    ! horizontal grid dimensions
       nhalo                         ! number of halo layers

    integer, intent(in) :: &
       itest, jtest, rtest              ! coordinates of diagnostic point

    logical, dimension(nx,ny), intent(in) ::  &
       active_cell            ! true if cell contains ice and borders a locally owned vertex

    logical, dimension(nx-1,ny-1), intent(in) ::  &
       active_vertex          ! true for vertices of active cells

    real(dp), dimension(nx-1,ny-1), intent(in) :: &
       xVertex, yVertex       ! x and y coordinates of each vertex (m)

    real(dp), dimension(nx-1,ny-1), intent(in) :: &
       uvel, vvel,          & ! basal velocity components at each vertex (m/yr)
       beta                   ! basal traction parameter (Pa/(m/yr))
                              ! typically = beta_internal (beta weighted by f_ground)

    integer, intent(in) ::  &
       whichassemble_bfric    ! = 0 for standard finite element computation of basal friction
                              ! = 1 for computation that uses only the local value of the basal friction at each vertex

    real(dp), dimension(nx,ny), intent(out) :: &
       bfricflx               ! basal heat flux from friction (W/m^2), computed at cell centers

    !----------------------------------------------------------------
    ! Local variables
    !----------------------------------------------------------------

    integer :: i, j, n, p
    integer :: iVertex, jVertex

    real(dp), dimension(nx-1,ny-1) :: &
         stagbfricflx     ! basal heat flux from friction, computed at vertices

    real(dp), dimension(nNodesPerElement_2d) ::   &
       x, y,            & ! spatial coordinates of nodes
       u, v,            & ! velocity components at nodes
       b                  ! beta at nodes

    real(dp) ::         &
       u_qp, v_qp,      & ! u and v at quadrature points
       beta_qp,         & ! beta at quadrature points
       sum_wqp            ! sum of weighting factors

    logical, parameter :: bfricflx_finite_element = .false.  ! if true, do a finite-element summation
                                                             ! if false, take beta*(u^2 + v^2) at active vertices
                                                             ! (see comments above)
    ! initialize
    bfricflx(:,:) = 0.d0

    if (whichassemble_bfric == HO_ASSEMBLE_BFRIC_STANDARD) then

       ! do finite-element calculation (can be inaccurate at sharp transitions in beta and velocity)

       ! Loop over locally owned cells
       do j = nhalo+1, ny-nhalo
          do i = nhalo+1, nx-nhalo

             if (active_cell(i,j)) then   ! ice is present

                ! Load x and y coordinates, basal velocity, and beta at cell vertices

                do n = 1, nNodesPerElement_2d

                   ! Determine (i,j) for this vertex
                   ! The reason for the '3' is that node 3, in the NE corner of the grid cell, has index (i,j).
                   ! Indices for other nodes are computed relative to this vertex.
                   iVertex = i + ishift(3,n)
                   jVertex = j + jshift(3,n)
                   
                   x(n) = xVertex(iVertex,jVertex)
                   y(n) = yVertex(iVertex,jVertex)
                   u(n) = uvel(iVertex,jVertex)
                   v(n) = vvel(iVertex,jVertex)
                   b(n) = beta(iVertex,jVertex)
                   
                enddo

                sum_wqp = 0.d0

                ! loop over quadrature points
                do p = 1, nQuadPoints_2d
                   
                   ! Evaluate u, v and beta at this quadrature point
                   
                   u_qp = 0.d0
                   v_qp = 0.d0
                   beta_qp = 0.d0
                   do n = 1, nNodesPerElement_2d
                      u_qp = u_qp + phi_2d(n,p) * u(n)
                      v_qp = v_qp + phi_2d(n,p) * v(n)
                      beta_qp = beta_qp + phi_2d(n,p) * b(n)
                   enddo
                   
                   ! Increment basal frictional heating
                   
                   bfricflx(i,j) = bfricflx(i,j) + wqp_2d(p) * beta_qp * (u_qp**2 + v_qp**2)
                   sum_wqp = sum_wqp + wqp_2d(p)
                   
                   if (verbose_bfric .and. this_rank==rtest .and. i==itest .and. j==jtest) then
                      write(iulog,*) ' '
                      write(iulog,*) 'Increment basal friction heating, i, j, p =', i, j, p
                      write(iulog,*) 'u, v, beta_qp =', u_qp, v_qp, beta_qp
                      write(iulog,*) 'local increment =', beta_qp * (u_qp**2 + v_qp**2) / scyr
                   endif
                   
                enddo   ! nQuadPoints_2d
                
                ! Scale the result:
                ! Divide by sum_wqp to get average of beta*(u^2 + v^2) over cell
                ! Divide by scyr to convert Pa m/yr to Pa m/s = W/m^2
                
                bfricflx(i,j) = bfricflx(i,j) / (sum_wqp * scyr)
                
                if (verbose_bfric .and. this_rank==rtest .and. i==itest .and. j==jtest) then
                   write(iulog,*) ' '
                   write(iulog,*) 'i, j, bfricflx:', i, j, bfricflx(i,j)
                   write(iulog,*) 'beta, uvel, vvel:', beta(i,j), uvel(i,j), vvel(i,j)
                endif
                
             endif      ! active_cell
             
          enddo         ! i
       enddo            ! j
       
    else   ! whichassemble_bfric = HO_ASSEMBLE_BFRIC_LOCAL; local calculation at active vertices

       ! compute frictional heating at vertices

       stagbfricflx(:,:) = 0.d0

       do j = 1, ny-1
          do i = 1, nx-1
       
             if (active_vertex(i,j)) then

                stagbfricflx(i,j) = beta(i,j) * (uvel(i,j)**2 + vvel(i,j)**2)
                stagbfricflx(i,j) = stagbfricflx(i,j) / scyr   ! convert Pa m/yr to Pa m/s = W/m^2

             endif      ! active_vertex
             
          enddo         ! i
       enddo            ! j

       ! interpolate from vertices to cell centers
       ! Note: The optional arguments vmask and stagger_margin_in are not included.
       !       This means that zero values at inactive vertices are included in the average
       !       for a given cell.

       call glissade_unstagger(nx,            ny,               &
                               stagbfricflx,  bfricflx)

       ! Note: Halo update of bfricflux moved to higher level

       if (verbose_bfric .and. this_rank==rtest) then
          i = itest
          j = jtest
          write(iulog,*) ' '
          write(iulog,*) 'i, j, bfricflx:', i, j, bfricflx(i,j)
          write(iulog,*) ' '
          write(iulog,*) 'i, j, beta, uvel, vvel, stagbfricflx:'
          do j = jtest-1, jtest
             do i = itest-1, itest
                write(iulog,*) i, j, beta(i,j), uvel(i,j), vvel(i,j), stagbfricflx(i,j)
             enddo
          enddo
       endif

    endif  ! whichassemble_bfric

  end subroutine compute_basal_friction_heatflx

!****************************************************************************

  subroutine compute_internal_stress (nx,            ny,            &
                                      nz,            sigma,         &
                                      nhalo,                        &
                                      itest, jtest,  rtest,         &
                                      active_cell,                  &
                                      xVertex,       yVertex,       &
                                      stagusrf,      stagthck,      &
                                      whichapprox,   flwafact,      &
                                      whichefvs,     efvs,          &
                                      efvs_constant, effstrain_min, &
                                      uvel,          vvel,          &
                                      tau_xz,        tau_yz,        &
                                      tau_xx,        tau_yy,        &
                                      tau_xy,        tau_eff)

    !----------------------------------------------------------------
    ! Compute internal ice stresses at the center of each element,
    !  given the 3D velocity field and flow factor.
    !----------------------------------------------------------------

    !----------------------------------------------------------------
    ! Input-output arguments
    !----------------------------------------------------------------

    integer, intent(in) ::      &
       nx, ny,                  &    ! horizontal grid dimensions
       nz,                      &    ! number of vertical levels at which velocity is computed
       nhalo                         ! number of halo layers

    real(dp), dimension(nz), intent(in) ::    &
       sigma              ! sigma vertical coordinate

    integer, intent(in) :: &
       itest, jtest, rtest              ! coordinates of diagnostic point

    logical, dimension(nx,ny), intent(in) ::  &
       active_cell        ! true if cell contains ice and borders a locally owned vertex

    real(dp), dimension(nx-1,ny-1), intent(in) :: &
       xVertex, yVertex       ! x and y coordinates of each vertex (m)

    real(dp), dimension(nx-1,ny-1), intent(in) ::  &
       stagusrf,       &  ! upper surface elevation on staggered grid (m)
       stagthck           ! ice thickness on staggered grid (m)

    integer, intent(in) ::   &
       whichapprox,     & ! option for Stokes approximation (BP, L1L2, SSA, SIA)
       whichefvs          ! option for effective viscosity calculation 

    real(dp), dimension(nz-1,nx,ny), intent(in) ::  &
       efvs,           &  ! precomputed effective viscosity
                          ! used for L1L2 only; efvs is recomputed at QPs for other approximations
       flwafact           ! temperature-based flow factor, 0.5 * A^(-1/n), Pa yr^(1/n)
                          ! used to compute the effective viscosity

    real(dp), intent(in) :: &
       efvs_constant,   & ! constant value of effective viscosity (Pa yr)
       effstrain_min      ! minimum value of effective strain rate (yr^-1) for computing viscosity

    real(dp), dimension(nz,nx-1,ny-1), intent(in) ::  &
       uvel, vvel         ! velocity components at each node (m/yr)

    ! stress tensor components, co-located with efvs at the center of each element
    real(dp), dimension(nz-1,nx,ny), intent(out) ::   &
       tau_xz, tau_yz,         &! vertical components of stress tensor (Pa)
       tau_xx, tau_yy, tau_xy, &! horizontal components of stress tensor (Pa)
       tau_eff                  ! effective stress (Pa)

    !----------------------------------------------------------------
    ! Local variables
    !----------------------------------------------------------------

    real(dp), dimension(nNodesPerElement_3d) ::  &
       dphi_dx_3d, dphi_dy_3d, dphi_dz_3d   ! derivatives of 3D nodal basis functions at a quadrature point

    real(dp) ::               &
       detJ,                  & ! determinant of Jacobian at a quad pt
                                ! not used but part of interface to get_basis_function_derivatives
       du_dx, du_dy, du_dz,   & ! strain rate components
       dv_dx, dv_dy, dv_dz,   & 
       efvs_qp                  ! effective viscosity at a quad pt (Pa yr)

    real(dp), dimension(nNodesPerElement_3d) ::   &
       x, y, z,         & ! spatial coordinates of nodes
       u, v               ! velocity components at nodes

    integer :: i, j, k, n, p
    integer :: iNode, jNode, kNode
   
    ! initialize stresses
    tau_xz (:,:,:) = 0.d0
    tau_yz (:,:,:) = 0.d0
    tau_xx (:,:,:) = 0.d0
    tau_yy (:,:,:) = 0.d0
    tau_xy (:,:,:) = 0.d0
    tau_eff(:,:,:) = 0.d0

    ! Loop over cells that border locally owned vertices

    do j = nhalo+1, ny-nhalo+1
       do i = nhalo+1, nx-nhalo+1
       
          if (active_cell(i,j)) then

             ! Loop over layers
             do k = 1, nz-1

                ! compute spatial coordinates and velocity for each node of this element
                do n = 1, nNodesPerElement_3d

                   ! Determine (k,i,j) for this node
                   ! The reason for the '7' is that node 7, in the NE corner of the upper layer, has index (k,i,j).
                   ! Indices for other nodes are computed relative to this node.
                   iNode = i + ishift(7,n)
                   jNode = j + jshift(7,n)
                   kNode = k + kshift(7,n)
                   
                   x(n) = xVertex(iNode,jNode)
                   y(n) = yVertex(iNode,jNode)
                   z(n) = stagusrf(iNode,jNode) - sigma(kNode)*stagthck(iNode,jNode)
                   u(n) = uvel(kNode,iNode,jNode)
                   v(n) = vvel(kNode,iNode,jNode)
                   
                enddo   ! nodes per element

                ! Loop over quadrature points
                do p = 1, nQuadPoints_3d

                   ! Compute derivative of basis functions at this quad pt
                   call get_basis_function_derivatives_3d(x(:),             y(:),             z(:),              &          
                                                          dphi_dxr_3d(:,p), dphi_dyr_3d(:,p), dphi_dzr_3d(:,p),  &
                                                          dphi_dx_3d(:),    dphi_dy_3d(:),    dphi_dz_3d(:),     &
                                                          detJ,                                                  &
                                                          itest, jtest, rtest,                                   &
                                                          i, j, k, p)

                   ! Compute strain rates at this quadrature point, looping over nodes of element
                   du_dx = 0.d0
                   du_dy = 0.d0
                   du_dz = 0.d0
                   dv_dx = 0.d0
                   dv_dy = 0.d0
                   dv_dz = 0.d0

                   if (whichapprox == HO_APPROX_SIA) then

                      do n = 1, nNodesPerElement_3d
                         du_dz = du_dz + dphi_dz_3d(n)*u(n)
                         dv_dz = dv_dz + dphi_dz_3d(n)*v(n)
                      enddo

                   elseif (whichapprox == HO_APPROX_SSA) then

                      do n = 1, nNodesPerElement_3d
                         du_dx = du_dx + dphi_dx_3d(n)*u(n)
                         du_dy = du_dy + dphi_dy_3d(n)*u(n)
                         dv_dx = dv_dx + dphi_dx_3d(n)*v(n)
                         dv_dy = dv_dy + dphi_dy_3d(n)*v(n)
                      enddo

                   else    !  3D higher-order (BP or L1L2)
 
                      do n = 1, nNodesPerElement_3d
                         du_dx = du_dx + dphi_dx_3d(n)*u(n)
                         du_dy = du_dy + dphi_dy_3d(n)*u(n)
                         du_dz = du_dz + dphi_dz_3d(n)*u(n)
                         dv_dx = dv_dx + dphi_dx_3d(n)*v(n)
                         dv_dy = dv_dy + dphi_dy_3d(n)*v(n)
                         dv_dz = dv_dz + dphi_dz_3d(n)*v(n)
                      enddo

                   endif  ! whichapprox

                   if (whichapprox == HO_APPROX_L1L2) then

                      ! efvs is computed in a complicated way for L1L2.
                      ! Instead of recomputing it here for each QP, simply assume that the value at each QP
                      !  is equal to the average efvs in the element. This will give a small averaging error.

                      efvs_qp = efvs(k,i,j)

                   else  ! other approximations (SIA, SSA, BP)

                      ! Compute the effective viscosity at this quadrature point.

                      call compute_effective_viscosity(whichefvs,        whichapprox,                       &
                                                       efvs_constant,    effstrain_min,                     &
                                                       nNodesPerElement_3d,                                 &
                                                       dphi_dx_3d(:),    dphi_dy_3d(:),    dphi_dz_3d(:),   &
                                                       u(:),             v(:),                              & 
                                                       flwafact(k,i,j),  efvs_qp,                           &
                                                       itest, jtest, rtest,                                 &
                                                       i, j, k, p)

                   endif

                   ! Increment stresses, adding the value at this quadrature point

                   tau_xz(k,i,j) = tau_xz(k,i,j) + efvs_qp * du_dz            ! 2 * efvs * eps_xz
                   tau_yz(k,i,j) = tau_yz(k,i,j) + efvs_qp * dv_dz            ! 2 * efvs * eps_yz
                   tau_xx(k,i,j) = tau_xx(k,i,j) + 2.d0 * efvs_qp * du_dx     ! 2 * efvs * eps_xx
                   tau_yy(k,i,j) = tau_yy(k,i,j) + 2.d0 * efvs_qp * dv_dy     ! 2 * efvs * eps_yy
                   tau_xy(k,i,j) = tau_xy(k,i,j) + efvs_qp * (dv_dx + du_dy)  ! 2 * efvs * eps_xy

                enddo     ! p

                ! Final stress tensor components, averaged over quad pts
                tau_xz(k,i,j) = tau_xz(k,i,j) / nQuadPoints_3d
                tau_yz(k,i,j) = tau_yz(k,i,j) / nQuadPoints_3d
                tau_xx(k,i,j) = tau_xx(k,i,j) / nQuadPoints_3d
                tau_yy(k,i,j) = tau_yy(k,i,j) / nQuadPoints_3d
                tau_xy(k,i,j) = tau_xy(k,i,j) / nQuadPoints_3d
                
                ! Effective stress
                tau_eff(k,i,j) = sqrt(tau_xx(k,i,j)**2 + tau_yy(k,i,j)**2             &
                                    + tau_xx(k,i,j)*tau_yy(k,i,j) + tau_xy(k,i,j)**2  &
                                    + tau_xz(k,i,j)**2 + tau_yz(k,i,j)**2)

             enddo  ! k

             if (verbose_tau .and. this_rank==rtest .and. i==itest .and. j==jtest) then
                write(iulog,*) ' '
                write(iulog,*) 'i, j =', i, j
                write(iulog,*) 'k, tau_xz, tau_yz, tau_xx, tau_yy, tau_xy, tau_eff:'
                do k = 1, nz-1
                   write(iulog,*) k, tau_xz(k,i,j), tau_yz(k,i,j), tau_xx(k,i,j), &
                              tau_yy(k,i,j), tau_xy(k,i,j), tau_eff(k,i,j)
                enddo
             endif   ! verbose_tau

          endif     ! active cell

       enddo        ! i
    enddo           ! j

  end subroutine compute_internal_stress

!****************************************************************************

  subroutine compute_effective_viscosity (whichefvs,     whichapprox,            &
                                          efvs_constant, effstrain_min,          &
                                          nNodesPerElement,                      &
                                          dphi_dx,       dphi_dy,    dphi_dz,    &
                                          uvel,          vvel,                   &
                                          flwafact,      efvs,                   &
                                          itest, jtest,  rtest,                  &
                                          i, j, k, p )

    ! Compute effective viscosity at a quadrature point, based on the latest
    !  guess for the velocity field
    ! Note: Elements can be either 2D or 3D

    !----------------------------------------------------------------
    ! Input-output arguments
    !----------------------------------------------------------------

    integer, intent(in) :: &
       whichefvs       ! method for computing effective viscosity
                       ! 0 = constant value
                       ! 1 = proportional to flow factor
                       ! 2 = nonlinear function of effective strain rate 

    integer, intent(in) :: &
       whichapprox     ! option for Stokes approximation (BP, SSA, SIA)

    real(dp), intent(in) :: &
       efvs_constant   ! constant value of effective viscosity (Pa yr)

    ! Note: Mauro Perego suggests 1.e-8 yr^{-1} for effstrain_min.
    !       This value seems adequate for SSA and DIVA, but can be too low for BP.
    !       For Antarctic problems, a value of 1.e-6 improves BP convergence (WHL, July 2020).
    !       Alex Robinson uses 1.e-6 for Yelmo.
    real(dp), intent(in) :: &
       effstrain_min   ! minimum value of effective strain rate (yr^-1) for computing viscosity

    integer, intent(in) :: nNodesPerElement   ! number of nodes per element
                                              ! = 4 for 2D, = 8 for 3D

    real(dp), dimension(nNodesPerElement), intent(in) ::  &
       dphi_dx, dphi_dy, dphi_dz   ! derivatives of basis functions at this quadrature point
                                   ! dphi_dz = 0 for 2D SSA

    real(dp), dimension(nNodesPerElement), intent(in) ::  &
       uvel, vvel      ! current guess for velocity at each node of element (m/yr)

    real(dp), intent(in) ::  &
       flwafact        ! temperature-based flow factor for this element, 0.5 * A^{-1/n}
                       ! units: Pa yr^{1/n}

    real(dp), intent(out) ::   &
       efvs            ! effective viscosity at this quadrature point (Pa yr)
                       ! computed as 0.5 * A^{-1/n) * effstrain^{(1-n)/n)}

    integer, intent(in) :: &
       itest, jtest, rtest            ! coordinates of diagnostic point

    integer, intent(in) :: i, j, k, p

    !----------------------------------------------------------------
    ! Local variables
    !----------------------------------------------------------------

    real(dp) ::               &
       du_dx, du_dy, du_dz,   & ! strain rate components
       dv_dx, dv_dy, dv_dz,   &
       effstrain,             & ! effective strain rate, yr^{-1}
       effstrainsq              ! square of effective strain rate
        
    integer :: n

    real(dp) :: &
       p_effstr                 ! exponent (1-n)/n in effective viscosity relation

    real(dp), parameter :: p2 = -1.d0/3.d0

    ! Set exponent that depends on Glen's exponent
    p_effstr  = (1.d0 - n_glen)/n_glen

    select case(whichefvs)

    case(HO_EFVS_CONSTANT)

       ! Steve Price recommends 10^6 to 10^7 Pa yr
       ! ISMIP-HOM Test F requires 2336041.42829 Pa yr; this is the default value set in glide_types.F90
       efvs = efvs_constant

       if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then
          write(iulog,*) 'Set efvs = constant (Pa yr):', efvs
       endif

    case(HO_EFVS_FLOWFACT)      ! set the effective viscosity to a multiple of the flow factor, 0.5*A^(-1/n)
                 
       ! Units: flwafact has units Pa yr^{1/n}
       !        effstrain has units yr^{-1}
       !        p_effstr = (1-n)/n 
       !                 = -2/3 for n=3
       ! Thus efvs has units Pa yr
 
       !TODO - Test HO_EFVS_FLOWFACT option and make sure the units and scales are OK

       effstrain = velo_scale/len_scale * scyr  ! typical strain rate, yr^{-1}
       efvs = flwafact * effstrain**p_effstr  

       if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then
          write(iulog,*) 'flwafact, effstrain (yr-1), efvs (Pa yr)=', flwafact, effstrain, efvs
       endif

    case(HO_EFVS_NONLINEAR)    ! compute effective viscosity based on effective strain rate

       ! initialize strain rates
       du_dx = 0.d0
       du_dy = 0.d0
       du_dz = 0.d0
       dv_dx = 0.d0
       dv_dy = 0.d0
       dv_dz = 0.d0
       
       ! Compute effective strain rate (squared) at this quadrature point (PGB 2012, eq. 3 and 9)
       ! Units are yr^(-1)

       if (whichapprox == HO_APPROX_SIA) then

          do n = 1, nNodesPerElement
             du_dz = du_dz + dphi_dz(n)*uvel(n)
             dv_dz = dv_dz + dphi_dz(n)*vvel(n)
          enddo

          effstrainsq = effstrain_min**2          &
                      + 0.25d0 * (du_dz**2 + dv_dz**2)

       elseif (whichapprox == HO_APPROX_SSA) then

          do n = 1, nNodesPerElement

             du_dx = du_dx + dphi_dx(n)*uvel(n)
             du_dy = du_dy + dphi_dy(n)*uvel(n)

             dv_dx = dv_dx + dphi_dx(n)*vvel(n)
             dv_dy = dv_dy + dphi_dy(n)*vvel(n)

          enddo

          effstrainsq = effstrain_min**2          &
                      + (du_dx**2 + dv_dy**2 + du_dx*dv_dy + 0.25d0*(dv_dx + du_dy)**2)

       else   ! 3D higher-order

          do n = 1, nNodesPerElement

             du_dx = du_dx + dphi_dx(n)*uvel(n)
             du_dy = du_dy + dphi_dy(n)*uvel(n)
             du_dz = du_dz + dphi_dz(n)*uvel(n)

             dv_dx = dv_dx + dphi_dx(n)*vvel(n)
             dv_dy = dv_dy + dphi_dy(n)*vvel(n)
             dv_dz = dv_dz + dphi_dz(n)*vvel(n)

          enddo

          effstrainsq = effstrain_min**2                                      &
                      + (du_dx**2 + dv_dy**2 + du_dx*dv_dy + 0.25d0*(dv_dx + du_dy)**2)  &
                      + 0.25d0*(du_dz**2 + dv_dz**2)

       endif  ! whichapprox

       ! Compute effective viscosity (PGB 2012, eq. 4)
       ! Units: flwafact has units Pa yr^{1/n}
       !        effstrain has units yr^{-1}
       !        p_effstr = (1-n)/n 
       !                  = -2/3 for n=3
       ! Thus efvs has units Pa yr
 
       efvs = flwafact * effstrainsq**(p_effstr/2.d0)

       if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest .and. p==ptest) then
          write(iulog,*) ' '
          write(iulog,*) 'i, j, k, p =', i, j, k, p
          write(iulog,*) 'flwafact, effstrain (yr-1), efvs(Pa yr) =', flwafact, sqrt(effstrainsq), efvs
          write(iulog,*) 'du_dx, du_dy, du_dz:', du_dx, du_dy, du_dz
          write(iulog,*) 'dv_dx, dv_dy, dv_dz:', dv_dx, dv_dy, dv_dz
       endif
 
   end select

  end subroutine compute_effective_viscosity

!****************************************************************************

  subroutine compute_effective_viscosity_L1L2(whichefvs,                            &
                                              efvs_constant,    effstrain_min,      &
                                              nz,               sigma,              &
                                              nNodesPerElement, phi,                &
                                              dphi_dx,          dphi_dy,            &
                                              uvel,             vvel,               &
                                              stagthck,                             &
                                              dsdx,             dsdy,               &
                                              flwa,             flwafact,           &
                                              efvs,                                 &
                                              itest,   jtest,   rtest,              &
                                              i, j, p )

    ! Compute the effective viscosity at each layer of an ice column corresponding
    !  to a particular quadrature point, based on the L1L2 formulation.
    ! See PGB(2012), section 2.3

    !----------------------------------------------------------------
    ! Input-output arguments
    !----------------------------------------------------------------

    integer, intent(in) :: &
       whichefvs          ! method for computing effective viscosity
                          ! 0 = constant value
                          ! 1 = proportional to flow factor
                          ! 2 = nonlinear function of effective strain rate 

    real(dp), intent(in) :: &
       efvs_constant      ! constant value of effective viscosity (Pa yr)
                          ! (used for option HO_EFVS_CONSTANT)

    real(dp), intent(in) :: &
       effstrain_min      ! minimum value of effective strain rate (yr^-1) for computing viscosity
                          ! see comments above in compute_effective_viscosity

    integer, intent(in) ::  &
       nz,               &! number of vertical levels at which velocity is computed
                          ! Note: The number of layers (or elements in a column) is nz-1
       nNodesPerElement   ! number of nodes per element, = 4 for 2D rectangular faces

    real(dp), dimension(nz), intent(in) ::    &
       sigma              ! sigma vertical coordinate

    integer, intent(in) :: &
       itest, jtest, rtest         ! coordinates of diagnostic point

    integer, intent(in) :: i, j, p

    real(dp), dimension(nNodesPerElement), intent(in) ::  &
       phi,           &   ! basic functions at this quadrature point
       dphi_dx, dphi_dy   ! derivatives of basis functions at this quadrature point

    real(dp), dimension(nNodesPerElement), intent(in) ::  &
       uvel, vvel,       &! current guess for basal velocity at cell vertices (m/yr)
       dsdx, dsdy,       &! upper surface elevation gradient at vertices (m/m)
       stagthck           ! ice thickness at vertices

    real(dp), dimension(nz-1), intent(in) ::  &
       flwa,             &! temperature-based flow factor A at each layer of this cell column
                          ! units: Pa^{-n} yr^{-1}
       flwafact           ! temperature-based flow factor for this element, 0.5 * A^{-1/n}
                          ! units: Pa yr^{1/n}  (used for option HO_EFVS_FLOWFACT)

    real(dp), dimension(nz-1), intent(out) ::   &
       efvs               ! effective viscosity of each layer corresponding to this quadrature point (Pa yr)
                          ! computed as 1 / (2*A*tau_eff^{(n-1)/2})
                          !           = 1 / (2*A*tau_eff^2) given n = 3
                          ! where tau_eff^2 = tau_parallel^2 + tau_perp^2
 
    !----------------------------------------------------------------
    ! Local parameters
    !----------------------------------------------------------------

    real(dp) ::  &
       p_effstr              ! exponent (1-n)/n in effective viscosity relation
                                                               
    !----------------------------------------------------------------
    ! Local variables
    !----------------------------------------------------------------

    real(dp) ::            &
       du_dx, du_dy,       & ! horizontal strain rate components at this quadrature point, yr^{-1}
       dv_dx, dv_dy,       &
       ds_dx, ds_dy,       & ! gradient of upper surface elevation at this QP (m/m)
       thck,               & ! ice thickness (m) at this QP
       effstrain,          & ! effective strain rate at QP, yr^{-1}
       effstrainsq,        & ! square of effective strain rate
       tau_parallel,       & ! norm of tau_parallel at each layer of this cell column, 
                             !  where |tau_parallel|^2 = tau_xx^2 + tau_yy^2 + tau_xx*tau_yy + tau_xy^2
                             !  See PGB(2012), eq. 17 and 20
       tau_perp,           & ! norm of tau_perp at a given layer of a cell column,
                             !  where |tau_perp|^2 = [rhoi*grav*(s-z)*|grad(s)|]^2
       grads,              & ! norm of sfc elevation gradient at this QP, sqrt(ds_dx^2 + ds_dy^2)
       depth                 ! distance (m) from surface to level k at this QP 

    real(dp) :: a, b, c, rootA, rootB   ! terms in cubic equation

    integer :: n, k

    ! Set exponent that depends on Glen's exponent
    p_effstr = (1.d0 - n_glen) / n_glen

    select case(whichefvs)

    case(HO_EFVS_CONSTANT)

       ! Steve Price recommends 10^6 to 10^7 Pa yr
       ! ISMIP-HOM Test F requires 2336041.42829 Pa yr; this is the default value set in glide_types.F90
       efvs(:) = efvs_constant

       if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest) then
          write(iulog,*) 'Set efvs = constant (Pa yr):', efvs
       endif

    case(HO_EFVS_FLOWFACT)      ! set the effective viscosity to a multiple of the flow factor, 0.5*A^(-1/n)
                
       ! Set the effective strain rate (s^{-1}) based on typical velocity and length scales
       !
       ! Units: flwafact has units Pa yr^{1/n}
       !        effstrain has units yr^{-1}
       !        p_effstr = (1-n)/n
       !                 = -2/3 for n=3
       ! Thus efvs has units Pa yr
   
       effstrain = velo_scale/len_scale * scyr  ! typical strain rate, yr^{-1}
       efvs(:) = flwafact(:) * effstrain**p_effstr  

       if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest) then
          write(iulog,*) 'flwafact, effstrain (yr-1), efvs (Pa yr)=', flwafact, effstrain, efvs
       endif

    case(HO_EFVS_NONLINEAR)    ! compute effective viscosity based on effective strain rate

       du_dx = 0.d0
       du_dy = 0.d0
       dv_dx = 0.d0
       dv_dy = 0.d0
       ds_dx = 0.d0
       ds_dy = 0.d0
       thck  = 0.d0

       do n = 1, nNodesPerElement

          du_dx = du_dx + dphi_dx(n)*uvel(n)
          du_dy = du_dy + dphi_dy(n)*uvel(n)

          dv_dx = dv_dx + dphi_dx(n)*vvel(n)
          dv_dy = dv_dy + dphi_dy(n)*vvel(n)

          ds_dx = ds_dx + phi(n)*dsdx(n)
          ds_dy = ds_dy + phi(n)*dsdy(n)

          thck = thck + phi(n)*stagthck(n)

       enddo

       ! Compute effective strain rate at this quadrature point (PGB 2012, eq. 17)

       effstrainsq = effstrain_min**2          &
                   + du_dx**2 + dv_dy**2 + du_dx*dv_dy + 0.25d0*(dv_dx + du_dy)**2
       effstrain = sqrt(effstrainsq)

       if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. p==ptest) then
          write(iulog,*) ' '
          write(iulog,*) 'i, j, p, effstrain (yr-1):', i, j, p, effstrain
          write(iulog,*) 'du_dx, du_dy =', du_dx, du_dy
          write(iulog,*) 'dv_dx, dv_dy =', dv_dx, dv_dy
          write(iulog,*) 'ds_dx, ds_dy =', ds_dx, ds_dy
!          write(iulog,*) 'n, phi, dphi_dx, dphi_dy:'
!          do n = 1, nNodesPerElement_2d
!             write(iulog,*) n, phi(n), dphi_dx(n), dphi_dy(n)
!          enddo
       endif

       !---------------------------------------------------------------------------
       ! Solve for tau_parallel in the relation (PGB 2012, eq. 22)
       !
       !     effstrain = A * (tau_parallel^2 + tau_perp^2)^{(n-1)/2} * tau_parallel
       !
       !     where tau_perp^2 = [(pg)*(s-z)*|grad(s)|]^2 = SIA stress
       !              grad(s) = sqrt(ds_dx^2 + ds_dy^2)
       !                    n = 3, so we have a cubic equation
       !
       ! This relation can be written as a cubic equation of the form
       !
       !            x^3 + a*x + b = 0,
       !
       ! where for this problem, x = tau_parallel > 0,
       !                         a = tau_perp^2 >= 0,
       !                         b = -effstrain/A < 0.
       !
       ! If (b^2)/4 + (a^3)/27 > 0, then there is one real root A + B, where
       ! 
       !     A = [-b/2 + sqrt((b^2)/4 + (a^3)/27)]^(1/3)
       !     B = -[b/2 + sqrt((b^2)/4 + (a^3)/27)]^(1/3)
       !  
       ! There is also a pair of complex conjugate roots that are not of interest here.
       !
       ! Note: If a^3/27 << b^2/4 (as can happen if |grad(s)| is small), then the
       !       bracketed term in B is given to a good approximation by 
       !
       !       b/2 + (|b|/2)*(1 + 2a^3/(27b^2)) = a^3 / (27|b|).
       !
       ! Hence B = -a / (3 * |b|^(1/3)).
       !
       ! We use the alternate expression for B when a^3/27 < 1.d-6 * b^2/4,
       !  so as to avoid roundoff error from subtracting two large numbers of nearly
       !  the same size. 
       !---------------------------------------------------------------------------
       !TODO - Code an iterative solution for tau_parallel, for n /= 3.
       !TODO - Replace sigma with stagsigma?  Not sure if depth should be at layer midpt or base

       do k = 1, nz-1   ! loop over layers
          depth = thck * sigma(k+1)
          grads = sqrt(ds_dx**2 + ds_dy**2)
          tau_perp = rhoi*grav*depth*grads
          a = tau_perp**2
          b = -effstrain / flwa(k)
          c = sqrt(b**2/4.d0 + a**3/27.d0)
          rootA = (-b/2.d0 + c)**(1.d0/3.d0)
          if (a**3/(27.d0) > 1.d-6 * (b**2/4.d0)) then
             rootB = -(b/2.d0 + c)**(1.d0/3.d0)
          else    ! b/2 + c is small; compute solution to first order without subtracting two large, nearly equal numbers
             rootB = -a / (3.d0*(abs(b))**(1.d0/3.d0))
          endif
          tau_parallel = rootA + rootB

          !TODO - Currently limited to n = 1 and n = 3.  Allow arbitrary n.
          if (abs(n_glen - 1.d0) < 1.d-10) then  ! n = 1
             efvs(k) = 1.d0 / (2.d0 * flwa(k))
          elseif (abs(n_glen - 3.d0) < 1.d-10) then  ! n = 3
             efvs(k) = 1.d0 / (2.d0 * flwa(k) * (tau_parallel**2 + tau_perp**2))  ! given n = 3
          else
             call write_log('Invalid value of n_glen for L1L2 solver', GM_FATAL)
          endif

          !WHL - debug
          if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest .and. p==ptest) then
             write(iulog,*) 'i, j, k, p =', i, j, k, p
!             write(iulog,*) 'a, b, c:', a, b, c
!             write(iulog,*) '-b/2 + c, -b/2 - c:', -b/2 + c, -b/2 - c
!             write(iulog,*) 'roots A, B:', rootA, rootB
!             write(iulog,*) 'tau_perp, tau_parallel:', tau_perp, tau_parallel
!             write(iulog,*) 'flwa:', flwa(k)
             write(iulog,*) 'flwafact, effstrain, efvs_BP, efvs:', 0.5d0*flwa(k)**(-1.d0/3.d0), effstrain,  &
                                                            0.5d0*flwa(k)**(-1.d0/3.d0) * effstrain**(-2.d0/3.d0), efvs(k)
          endif

       enddo   ! k

    end select

  end subroutine compute_effective_viscosity_L1L2

!****************************************************************************

  subroutine compute_effective_viscosity_diva(whichefvs,                            &
                                              efvs_constant,    effstrain_min,      &
                                              nz,               stagsigma,          &
                                              nNodesPerElement, phi,                &
                                              dphi_dx,          dphi_dy,            &
                                              uvel,             vvel,               &
                                              btractx,          btracty,            &
                                              diva_slope_factor_x, diva_slope_factor_y, &
                                              stagthck,                             &
                                              flwa,             flwafact,           &
                                              efvs,                                 &
                                              itest,  jtest,    rtest,              &
                                              i, j, p )
    
    ! Compute the effective viscosity at each layer of an ice column corresponding
    !  to a particular quadrature point, based on the depth-integrated formulation.
    ! See Goldberg(2011) for details.

    !----------------------------------------------------------------
    ! Input-output arguments
    !----------------------------------------------------------------

    integer, intent(in) :: &
       whichefvs          ! method for computing effective viscosity
                          ! 0 = constant value
                          ! 1 = proportional to flow factor
                          ! 2 = nonlinear function of effective strain rate 

    real(dp), intent(in) :: &
       efvs_constant      ! constant value of effective viscosity (Pa yr)
                          ! (used for option HO_EFVS_CONSTANT)

    real(dp), intent(in) :: &
       effstrain_min      ! minimum value of effective strain rate (yr^-1) for computing viscosity
                          ! see comments above in compute_effective_viscosity

    integer, intent(in) ::  &
       nz,               &! number of vertical levels at which velocity is computed
                          ! Note: The number of layers (or elements in a column) is nz-1
       nNodesPerElement   ! number of nodes per element, = 4 for 2D rectangular faces

    real(dp), dimension(nz-1), intent(in) ::    &
       stagsigma          ! staggered sigma vertical coordinate

    real(dp), dimension(nNodesPerElement), intent(in) ::  &
       phi,           &   ! basic functions at this quadrature point
       dphi_dx, dphi_dy   ! derivatives of basis functions at this quadrature point

    real(dp), dimension(nNodesPerElement), intent(in) ::  &
       uvel, vvel,       &! current guess for depth_integrated mean velocity at cell vertices (m/yr)
       btractx, btracty, &! components of basal traction (Pa)
       stagthck           ! ice thickness at vertices

    real(dp), intent(in) :: &
         diva_slope_factor_x, &  ! slope factor in x direction for this vertex
         diva_slope_factor_y     ! slope factor in y direction for this vertex

    real(dp), dimension(nz-1), intent(in) ::  &
       flwa,             &! temperature-based flow factor A at each layer of this cell column
                          ! units: Pa^{-n} yr^{-1}
       flwafact           ! temperature-based flow factor for this element, 0.5 * A^{-1/n}
                          ! units: Pa yr^{1/n}  (used for option HO_EFVS_FLOWFACT)

    !WHL - intent(out) if solving cubic, but (inout) if using old efvs in calculation
    real(dp), dimension(nz-1), intent(inout) ::   &
       efvs               ! effective viscosity of each layer corresponding to this quadrature point (Pa yr)

    integer, intent(in) :: &
       itest, jtest, rtest         ! coordinates of diagnostic point

    integer, intent(in) :: i, j, p

    !----------------------------------------------------------------
    ! Local variables
    !----------------------------------------------------------------

    real(dp) ::            &
       du_dx, du_dy,       & ! horizontal strain rate components at this quadrature point (yr^{-1})
       dv_dx, dv_dy,       &
       taux,  tauy,        & ! basal shear stress components at this QP (Pa)
       thck,               & ! ice thickness (m) at this QP
       effstrain,          & ! effective strain rate at QP  (yr^{-1})
       effstrainsq,        & ! square of effective strain rate
       depth                 ! distance (m) from surface to layer k at this QP 

    real(dp) :: facta, factb, a, b, c, rootA, rootB   ! terms in cubic equation

    integer :: n, k
    real(dp) :: du_dz, dv_dz

    real(dp) :: &
       p_effstr              ! exponent (1-n)/n in effective viscosity relation

    !WHL - For ISMIP-HOM, the cubic solve is not robust.  It leads to oscillations
    !      in successive iterations between uvel_2d/vvel_2d and btractx/btracty
    !TODO - Remove the cubic solve for efvs, unless we find a way to make it robust?
    logical, parameter :: cubic = .false.

    ! Set exponent that depends on Glen's exponent
    p_effstr  = (1.d0 - n_glen)/n_glen

    select case(whichefvs)

    case(HO_EFVS_CONSTANT)

       ! Steve Price recommends 10^6 to 10^7 Pa yr
       ! ISMIP-HOM Test F requires 2336041.42829 Pa yr; this is the default value set in glide_types.F90
       efvs(:) = efvs_constant

       if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest) then
          write(iulog,*) 'Set efvs = constant (Pa yr):', efvs
       endif

    case(HO_EFVS_FLOWFACT)      ! set the effective viscosity to a multiple of the flow factor, 0.5*A^(-1/n)
                
       ! Set the effective strain rate (s^{-1}) based on typical velocity and length scales
       !
       ! Units: flwafact has units Pa yr^{1/n}
       !        effstrain has units yr^{-1}
       !        p_effstr = (1-n)/n 
       !                 = -2/3 for n=3
       ! Thus efvs has units Pa yr
   
       effstrain = velo_scale/len_scale * scyr  ! typical strain rate, yr^{-1}
       efvs(:) = flwafact(:) * effstrain**p_effstr  

       if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest) then
          write(iulog,*) 'flwafact, effstrain (yr-1), efvs (Pa yr)=', flwafact, effstrain, efvs
       endif

    case(HO_EFVS_NONLINEAR)    ! compute effective viscosity based on effective strain rate

       du_dx = 0.d0
       du_dy = 0.d0
       dv_dx = 0.d0
       dv_dy = 0.d0
       thck  = 0.d0
       taux  = 0.d0
       tauy  = 0.d0

       do n = 1, nNodesPerElement

          du_dx = du_dx + dphi_dx(n)*uvel(n)
          du_dy = du_dy + dphi_dy(n)*uvel(n)

          dv_dx = dv_dx + dphi_dx(n)*vvel(n)
          dv_dy = dv_dy + dphi_dy(n)*vvel(n)

          taux = taux + phi(n)*btractx(n)
          tauy = tauy + phi(n)*btracty(n)

          thck = thck + phi(n)*stagthck(n)

       enddo

    if (cubic) then

       ! Compute effective strain rate (squared) at this quadrature point

       effstrainsq = effstrain_min**2          &
                   + du_dx**2 + dv_dy**2 + du_dx*dv_dy + 0.25d0*(dv_dx + du_dy)**2

       if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. p==ptest) then
          write(iulog,*) ' '
          write(iulog,*) 'i, j, p, effstrain (yr-1):', i, j, p, sqrt(effstrainsq)
          write(iulog,*) 'p_effstr =', p_effstr
          write(iulog,*) 'du_dx, du_dy =', du_dx, du_dy
          write(iulog,*) 'dv_dx, dv_dy =', dv_dx, dv_dy
          write(iulog,*) 'btractx, btracty =',  btractx, btracty
          write(iulog,*) 'taux, tauy =', taux, tauy
       endif

       !---------------------------------------------------------------------------
       ! Solve for efvs in the relation
       !
       ! efvs = 1/2 * A^(-1/n) * [effstrainsq + (1/4)*(u_z^2 + v_z^2)]^[(1-n)/(2n)]
       !
       ! where effstrainsq = du_dx**2 + dv_dy**2 + du_dx*dv_dy + (1/4)*(dv_dx + du_dy)**2
       !                     + small regularization term
       !       u_z = tau_x*(s-z) / (H*efvs)
       !       v_z = tau_y*(s-z) / (H*efvs)
       !
       !       tau_x = beta*u_b = beta_eff_x*u
       !       tau_y = beta*v_b = beta_eff_y*v
       !
       !       (u,v) is the depth-averaged mean velocity
       !
       ! For n = 3, this relation can be written as a cubic equation of the form
       !
       !       x^3 + a*x + b = 0,
       !
       ! where x = efvs
       !       a = [(tau_x^2 + tau_y^2)*(s-z)^2 / (4*H^2*effstrainsq) >= 0
       !       b = -1/(8*A*effstrainsq) < 0
       !
       ! See comments in compute_effective_viscosity_L1L2 for more details on the cubic solve.
       !
       ! NOTE: This scheme does not reliably converge.
       !
       !       The problem is that taux and tauy are proportional to beta_eff, which is
       !        a function of the old viscosity.  Mixing the old and new viscosity in the
       !        expression for vertical shear can lead to oscillations.
       !---------------------------------------------------------------------------

       facta = (taux**2 + tauy**2) / (4.d0 * thck**2 * effstrainsq)
       factb = -1.d0 / (8.d0 * effstrainsq)
       do k = 1, nz-1   ! loop over layers
          depth = thck * stagsigma(k)
          a = facta * depth**2
          b = factb / flwa(k)
          c = sqrt(b**2/4.d0 + a**3/27.d0)
          rootA = (-b/2.d0 + c)**(1.d0/3.d0)
          if (a**3/(27.d0) > 1.d-6 * (b**2/4.d0)) then
             rootB = -(b/2.d0 + c)**(1.d0/3.d0)
          else    ! b/2 + c is small; compute solution to first order without subtracting two large, nearly equal numbers
             rootB = -a / (3.d0*(abs(b))**(1.d0/3.d0))
          endif
          efvs(k) = rootA + rootB

          if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest .and. p==ptest) then
             write(iulog,*) ' '
             write(iulog,*) 'i, j, k, p, depth =', i, j, k, p, depth
             write(iulog,*) 'a, b, c:', a, b, c
             write(iulog,*) '-b/2 + c, -b/2 - c:', -b/2 + c, -b/2 - c
             write(iulog,*) 'roots A, B:', rootA, rootB
             write(iulog,*) 'flwa:', flwa(k)
             effstrain = sqrt(effstrainsq)
             write(iulog,*) 'flwafact, effstrain, efvs_SSA, efvs:', flwafact(k), effstrain,  &
                                                             flwafact(k)*effstrain**(-2.d0/3.d0), efvs(k)
          endif

       enddo   ! k

    else  ! solve for efvs, using the old value of efvs to estimate the vertical strain rates

       !Notes: taux is based on basal traction; it is a sum over btractx = beta_eff*uvel_2d from the last iteration
       !       Optionally, the (du/dz) and (dv/dz) terms in the effective strain rate are multiplied
       !        by a slope correction factor to enhance the viscosity.
       !       (The cubic scheme above does not include this correction.)

       do k = 1, nz-1   ! loop over layers
          if (efvs(k)==0.d0) then
             efvs(k) = flwafact(k) * effstrain_min**p_effstr  ! efvs associated with minimum strain rate
          endif
          du_dz = taux * stagsigma(k) / (efvs(k)*diva_slope_factor_x)   ! old value of efvs on RHS
          dv_dz = tauy * stagsigma(k) / (efvs(k)*diva_slope_factor_y)
          effstrainsq = effstrain_min**2          &
                      + du_dx**2 + dv_dy**2 + du_dx*dv_dy + 0.25d0*(dv_dx + du_dy)**2  &
                      + 0.25d0 * du_dz**2 * diva_slope_factor_x   &
                      + 0.25d0 * dv_dz**2 * diva_slope_factor_y
          efvs(k) = flwafact(k) * effstrainsq**(p_effstr/2.d0)

          if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. p==ptest) then
             write(iulog,*) ' '
             write(iulog,*) 'i, j, k, p, effstrain (yr-1):', i, j, k, p, sqrt(effstrainsq)
             write(iulog,*) 'p_effstr =', p_effstr
             write(iulog,*) 'du_dx, du_dy, du_dz =', du_dx, du_dy, du_dz
             write(iulog,*) 'dv_dx, dv_dy, dv_dz =', dv_dx, dv_dy, dv_dz
             write(iulog,*) 'flwafact, efvs (Pa yr)=', flwafact(k), efvs(k)
          endif

       enddo

    endif   ! cubic

    end select

  end subroutine compute_effective_viscosity_diva

!****************************************************************************

  subroutine compute_element_matrix(whichapprox, nNodesPerElement,     &
                                    wqp,         detJ,                 &
                                    efvs,                              &
                                    dphi_dx,     dphi_dy,    dphi_dz,  &
                                    Kuu,         Kuv,                  &
                                    Kvu,         Kvv,                  &
                                    itest, jtest, rtest,               &
                                    i, j, k, p)

    !------------------------------------------------------------------
    ! Increment the stiffness matrices Kuu, Kuv, Kvu, Kvv with the
    ! contribution from a particular quadrature point, 
    ! based on the chosen Stokes approximation.
    !
    ! Note: Elements can be either 2D or 3D
    !------------------------------------------------------------------

    !----------------------------------------------------------------
    ! Input-output arguments
    !----------------------------------------------------------------

    integer, intent(in) :: &
         whichapprox     ! which Stokes approximation to use (BP, SIA, SSA)

    integer, intent(in) :: nNodesPerElement  ! number of nodes per element

    real(dp), intent(in) ::    &
             wqp,        &! weight for this quadrature point
             detJ,       &! determinant of Jacobian for the transformation
                          !  between the reference element and true element
             efvs         ! effective viscosity at this quadrature point

    real(dp), dimension(nNodesPerElement), intent(in) ::  &
             dphi_dx, dphi_dy, dphi_dz   ! derivatives of basis functions,
                                         ! evaluated at this quadrature point

    real(dp), dimension(nNodesPerElement,nNodesPerElement), intent(inout) :: &
             Kuu, Kuv, Kvu, Kvv     ! components of element stiffness matrix

    integer, intent(in) :: &
       itest, jtest, rtest         ! coordinates of diagnostic point

    integer, intent(in) :: i, j, k, p

    !----------------------------------------------------------------
    ! Local variables
    !----------------------------------------------------------------

    real(dp) :: efvs_factor
    integer :: nr, nc

    if (verbose_matrix .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then
       write(iulog,*) ' '
       write(iulog,*) 'Increment element matrix, i, j, k, p =', i, j, k, p
    endif

    ! Increment the element stiffness matrices for the appropriate approximation.

    !Note: Scaling by volume such that detJ/vol0 is close to unity
    efvs_factor = efvs * wqp * detJ/vol0
    
    if (verbose_matrix .and. this_rank==rtest .and. i==itest .and. j==jtest .and. &
         k==ktest .and. p==ptest) then
       write(iulog,*) ' '
       write(iulog,*) 'i, j, k, p:', i, j, k, p
       write(iulog,*) 'efvs, wqp, detJ/vol0 =', efvs, wqp, detJ/vol0
       write(iulog,*) 'dphi_dz(1) =', dphi_dz(1)
       write(iulog,*) 'dphi_dx(1) =', dphi_dx(1)
       write(iulog,*) 'Kuu dphi/dz increment(1,1) =', efvs_factor*dphi_dz(1)*dphi_dz(1)
       write(iulog,*) 'Kuu dphi/dx increment(1,1) =', efvs_factor*4.d0*dphi_dx(1)*dphi_dx(1)
    endif

    if (whichapprox == HO_APPROX_SIA) then

       do nc = 1, nNodesPerElement      ! columns of K
          do nr = 1, nNodesPerElement   ! rows of K

             Kuu(nr,nc) = Kuu(nr,nc) + efvs_factor * (dphi_dz(nr)*dphi_dz(nc))
             Kvv(nr,nc) = Kvv(nr,nc) + efvs_factor * (dphi_dz(nr)*dphi_dz(nc))             

          enddo  ! row
       enddo     ! column

    elseif (whichapprox == HO_APPROX_SSA) then

       do nc = 1, nNodesPerElement      ! columns of K
          do nr = 1, nNodesPerElement   ! rows of K

             Kuu(nr,nc) = Kuu(nr,nc) + efvs_factor * (4.d0*dphi_dx(nr)*dphi_dx(nc) + dphi_dy(nr)*dphi_dy(nc))
             Kuv(nr,nc) = Kuv(nr,nc) + efvs_factor * (2.d0*dphi_dx(nr)*dphi_dy(nc) + dphi_dy(nr)*dphi_dx(nc))
             Kvu(nr,nc) = Kvu(nr,nc) + efvs_factor * (2.d0*dphi_dy(nr)*dphi_dx(nc) + dphi_dx(nr)*dphi_dy(nc))
             Kvv(nr,nc) = Kvv(nr,nc) + efvs_factor * (4.d0*dphi_dy(nr)*dphi_dy(nc) + dphi_dx(nr)*dphi_dx(nc))

          enddo
       enddo

    else   ! Blatter-Pattyn higher-order (or DIVA)
           ! The terms in parentheses can be derived from PGB 2012, eq. 13 and 15.
           ! The factor of 2 in front of efvs has been absorbed into the quantities in parentheses.

       do nc = 1, nNodesPerElement      ! columns of K
          do nr = 1, nNodesPerElement   ! rows of K

             Kuu(nr,nc) = Kuu(nr,nc) + efvs_factor *                                             &
                                    ( 4.d0*dphi_dx(nr)*dphi_dx(nc) + dphi_dy(nr)*dphi_dy(nc)     &
                                    + dphi_dz(nr)*dphi_dz(nc) )

             Kuv(nr,nc) = Kuv(nr,nc) + efvs_factor *                                             &
                                     (2.d0*dphi_dx(nr)*dphi_dy(nc) + dphi_dy(nr)*dphi_dx(nc))

             Kvu(nr,nc) = Kvu(nr,nc) + efvs_factor *                                             &
                                     (2.d0*dphi_dy(nr)*dphi_dx(nc) + dphi_dx(nr)*dphi_dy(nc))

             Kvv(nr,nc) = Kvv(nr,nc) + efvs_factor *                                             &
                                    ( 4.d0*dphi_dy(nr)*dphi_dy(nc) + dphi_dx(nr)*dphi_dx(nc)        &
                                    + dphi_dz(nr)*dphi_dz(nc) )

          enddo  ! nr (rows)
       enddo     ! nc (columns)

    endif  ! whichapprox

  end subroutine compute_element_matrix

!****************************************************************************

  subroutine element_to_global_matrix_3d(nx,           ny,          nz,          &
                                         iElement,     jElement,    kElement,    &
                                         itest,        jtest,       rtest,       &
                                         Kuu,          Kuv,                      &
                                         Kvu,          Kvv,                      &
                                         Auu,          Auv,                      &
                                         Avu,          Avv)
             
    ! Sum terms of element matrix K into dense assembled matrix A
    ! K is partitioned into Kuu, Kuv, Kvu, and Kvv, and similarly for A.

    integer, intent(in) ::   &
       nx, ny,               &  ! horizontal grid dimensions
       nz                       ! number of vertical levels where velocity is computed

    integer, intent(in) ::   &
       iElement, jElement, kElement     ! i, j and k indices for this element

    integer, intent(in) :: &
       itest, jtest, rtest         ! coordinates of diagnostic point

    real(dp), dimension(nNodesPerElement_3d,nNodesPerElement_3d), intent(in) ::  &
       Kuu, Kuv, Kvu, Kvv       ! element matrix

    real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(inout) ::    &
       Auu, Auv, Avu, Avv       ! assembled matrix

    integer :: i, j, k, m
    integer :: iA, jA, kA
    integer :: n, nr, nc

    if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. jElement==jtest .and. kElement==ktest) then
       write(iulog,*) 'Element i, j, k:', iElement, jElement, kElement 
       write(iulog,*) 'Rows of Kuu:'
       do n = 1, nNodesPerElement_3d
          write(iulog, '(8e12.4)') Kuu(n,:)
       enddo
    endif

    !WHL - On a Mac I tried switching the loops to put nc on the outside, but 
    !      the one with nr on the outside is faster.
    do nr = 1, nNodesPerElement_3d       ! rows of K

       ! Determine row of A to be incremented by finding (k,i,j) for this node
       ! The reason for the '7' is that node 7, in the NE corner of the upper layer, has index (k,i,j).
       ! Indices for other nodes are computed relative to this node.
       i = iElement + ishift(7,nr)
       j = jElement + jshift(7,nr)
       k = kElement + kshift(7,nr)
      
       do nc = 1, nNodesPerElement_3d    ! columns of K

          ! Determine column of A to be incremented
          kA = kshift(nr,nc)           ! k index of A into which K(m,n) is summed
          iA = ishift(nr,nc)           ! similarly for i and j indices 
          jA = jshift(nr,nc)           ! these indices can take values -1, 0 and 1
          m = indxA_3d(iA,jA,kA)

          ! Increment A
          Auu(m,k,i,j) = Auu(m,k,i,j) + Kuu(nr,nc)
          Auv(m,k,i,j) = Auv(m,k,i,j) + Kuv(nr,nc)
          Avu(m,k,i,j) = Avu(m,k,i,j) + Kvu(nr,nc)
          Avv(m,k,i,j) = Avv(m,k,i,j) + Kvv(nr,nc)

       enddo     ! nc

    enddo        ! nr

  end subroutine element_to_global_matrix_3d

!****************************************************************************

  subroutine element_to_global_matrix_2d(nx,           ny,        &
                                         iElement,     jElement,  &
                                         itest, jtest, rtest,     &
                                         Kuu,          Kuv,       &
                                         Kvu,          Kvv,       &
                                         Auu,          Auv,       &
                                         Avu,          Avv)

    ! Sum terms of element matrix K into dense assembled matrix A
    ! K is partitioned into Kuu, Kuv, Kvu, and Kvv, and similarly for A.

    integer, intent(in) ::   &
       nx, ny                   ! horizontal grid dimensions

    integer, intent(in) ::   &
       iElement, jElement       ! i and j indices for this element

    integer, intent(in) :: &
       itest, jtest, rtest         ! coordinates of diagnostic point

    real(dp), dimension(nNodesPerElement_2d,nNodesPerElement_2d), intent(in) ::  &
       Kuu, Kuv, Kvu, Kvv       ! element matrix

    real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) ::    &
       Auu, Auv, Avu, Avv       ! assembled matrix

    integer :: i, j, m
    integer :: iA, jA
    integer :: n, nr, nc

    if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. jElement==jtest) then
       write(iulog,*) 'Element i, j:', iElement, jElement 
       write(iulog,*) 'Rows of Kuu:'
       do n = 1, nNodesPerElement_2d
          write(iulog, '(8e12.4)') Kuu(n,:)
       enddo
    endif

    do nr = 1, nNodesPerElement_2d       ! rows of K

       ! Determine row of A to be incremented by finding (i,j) for this node
       ! The reason for the '3' is that node 3, in the NE corner of this gridcell, has index (i,j).
       ! Indices for other nodes are computed relative to this node.
       i = iElement + ishift(3,nr)
       j = jElement + jshift(3,nr)
      
       do nc = 1, nNodesPerElement_2d    ! columns of K

          ! Determine column of A to be incremented
          iA = ishift(nr,nc)           ! similarly for i and j indices 
          jA = jshift(nr,nc)           ! these indices can take values -1, 0 and 1
          m = indxA_2d(iA,jA)

          ! Increment A
          Auu(i,j,m) = Auu(i,j,m) + Kuu(nr,nc)
          Auv(i,j,m) = Auv(i,j,m) + Kuv(nr,nc)
          Avu(i,j,m) = Avu(i,j,m) + Kvu(nr,nc)
          Avv(i,j,m) = Avv(i,j,m) + Kvv(nr,nc)

          if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. jElement==jtest) then
             write(iulog,*) 'Increment Auu, element i, j, nr, nc =', iElement, jElement, nr, nc
             write(iulog,*) '     i, j, m, Kuu, new Auu:', i, j, m, Kuu(nr,nc), Auu(i,j,m)
          endif

       enddo     ! nc
    enddo        ! nr

  end subroutine element_to_global_matrix_2d

!****************************************************************************
    !WHL, May 2025:
    ! Note: There are two subroutines that assemble matrix elements for sliding for 2D solvers:
    !        basal_sliding_bc_2d and basal_sliding_bc_2d_diva.
    !        The latter is used with DIVA when diva_slope_correction = T
    !        The former is used with DIVA when diva_slope_correction = F,
    !        and with other 2D solvers (L1L2 and SSA).
    !       Set diva_slope_correction = F to reproduce older results.

  !TODO - Call this subroutine for both 2D and 3D solvers.
  !       First need to switch the index order for 3D matrices.
  subroutine basal_sliding_bc_2d(nx,               ny,              &
                                 nNeighbors,       nhalo,           &
                                 parallel,                          &
                                 dx,               dy,              &
                                 itest,  jtest,    rtest,           &
                                 active_cell,      active_vertex,   &
                                 beta,                              &
                                 lsrf,                              &
                                 xVertex,          yVertex,         &
                                 whichassemble_beta,                &
                                 Auu,              Avv)


    !------------------------------------------------------------------------
    ! Increment the Auu and Avv matrices with basal traction terms.
    ! Do a surface integral over all basal faces that contain at least one node with a stress BC.
    ! (Not Dirichlet or free-slip)
    ! Note: Basal Dirichlet BCs are enforced after matrix assembly.
    !
    ! Assume a sliding law of the form:
    !   tau_x = -beta*u
    !   tau_y = -beta*v
    ! where beta is defined at vertices (and beta may depend
    ! on the velocity from a previous iteration).
    !
    ! Note: The input beta field should already have been weighted by f_ground. We should have
    !       beta = 0 for floating ice (f_ground = 0). If using a GLP, then beta will
    !       have less than its full value for partially floating ice (0 < f_ground < 1).
    !------------------------------------------------------------------------

    use glissade_grid_operators, only: glissade_stagger

    integer, intent(in) ::      &
       nx, ny,                  &    ! horizontal grid dimensions
       nNeighbors,              &    ! number of neighbors of each node (used for last dimension of Auu/Avv)
                                     ! = 27 for 3D solve, = 9 for 2D solve
       nhalo                         ! number of halo layers

    type(parallel_type), intent(in) :: &
       parallel                      ! info for parallel communication

    real(dp), intent(in) ::     &
       dx, dy                        ! grid cell length and width

    integer, intent(in) :: &
       itest, jtest, rtest         ! coordinates of diagnostic point

    logical, dimension(nx,ny), intent(in) ::  &
       active_cell                   ! true if cell contains ice and borders a locally owned vertex

    logical, dimension(nx-1,ny-1), intent(in) ::  &
       active_vertex                 ! true for vertices of active cells

    real(dp), dimension(nx-1,ny-1), intent(in) ::    &
       beta                          ! basal traction field (Pa/(m/yr)) at cell vertices
                                     ! typically = beta_internal (beta weighted by f_ground)

    real(dp), dimension(nx,ny), intent(in) ::    &
       lsrf                          ! lower ice surface elevation (m)

    real(dp), dimension(nx-1,ny-1), intent(in) ::   &
       xVertex, yVertex     ! x and y coordinates of vertices

    integer, intent(in) :: &
       whichassemble_beta   ! = 0 for standard finite element computation of basal forcing terms
                            ! = 1 for computation that uses only the local value of beta at each node

    real(dp), dimension(nx-1,ny-1,nNeighbors), intent(inout) ::  &
       Auu, Avv             ! parts of stiffness matrix (basal layer only)

    !----------------------------------------------------------------
    ! Local variables
    !----------------------------------------------------------------

    integer :: i, j, n, p, nr, nc, iA, jA, m, ii, jj

    real(dp), dimension(nNodesPerElement_2d) ::   &
       x, y, z,     & ! Cartesian coordinates of basal nodes
       b              ! beta at basal nodes

    !TODO - These are not currently used except as dummy arguments
    real(dp), dimension(nNodesPerElement_2d) ::   &
       dphi_dx_2d, dphi_dy_2d, dphi_dz_2d  ! derivatives of basis functions, evaluated at quad pts

    real(dp) ::   &
       beta_qp,     & ! beta evaluated at quadrature point
       detJ           ! determinant of Jacobian for the transformation
                      !  between the reference element and true element

    real(dp), dimension(nNodesPerElement_2d, nNodesPerElement_2d) ::   &
       Kuu, Kvv       ! components of element matrix associated with basal sliding

    if (verbose_basal) then
       call point_diag(beta, 'beta', itest, jtest, rtest, 7, 7, '(f10.0)')
    endif

    if (whichassemble_beta == HO_ASSEMBLE_BETA_LOCAL) then

       if (nNeighbors == nNodeNeighbors_3d) then  ! 3D problem
          m = indxA_3d(0,0,0)
       else  ! 2D problem
          m = indxA_2d(0,0)
       endif

       ! Sum over active vertices
       do j = 1, ny-1
          do i = 1, nx-1
             if (active_vertex(i,j)) then
                Auu(i,j,m) = Auu(i,j,m) + dx*dy/vol0 * beta(i,j)
                Avv(i,j,m) = Avv(i,j,m) + dx*dy/vol0 * beta(i,j)
             endif   ! active_vertex
          enddo   ! i
       enddo   ! j

    else   ! standard assembly

       ! Sum over elements in active cells
       ! Loop over all cells that contain locally owned vertices
       do j = nhalo+1, ny-nhalo+1
       do i = nhalo+1, nx-nhalo+1

          !TODO - Should we exclude cells that have Dirichlet basal BCs for all vertices?

          if (active_cell(i,j)) then

             ! Set x and y for each node

             !     4-----3       y
             !     |     |       ^
             !     |     |       |
             !     1-----2       ---> x

             x(1) = xVertex(i-1,j-1)
             x(2) = xVertex(i,j-1)
             x(3) = xVertex(i,j)
             x(4) = xVertex(i-1,j)

             y(1) = yVertex(i-1,j-1)
             y(2) = yVertex(i,j-1)
             y(3) = yVertex(i,j)
             y(4) = yVertex(i-1,j)

             b(1) = beta(i-1,j-1)
             b(2) = beta(i,j-1)
             b(3) = beta(i,j)
             b(4) = beta(i-1,j)

             ! loop over quadrature points

             do p = 1, nQuadPoints_2d

                ! Compute basis function derivatives and det(J) for this quadrature point
                ! For now, pass in i, j, k, p for debugging
                !TODO - Modify this subroutine so that the output derivatives are optional?

                call get_basis_function_derivatives_2d(&
                     x(:),             y(:),               &
                     dphi_dxr_2d(:,p), dphi_dyr_2d(:,p),   &
                     dphi_dx_2d(:),    dphi_dy_2d(:),      &
                     detJ,                                 &
                     itest, jtest, rtest,                  &
                     i, j, p)

                ! Evaluate beta at this quadrature point, taking a phi-weighted sum over neighboring vertices.
                beta_qp = 0.d0
                do n = 1, nNodesPerElement_2d
                   beta_qp = beta_qp + phi_2d(n,p) * b(n)
                enddo

                if (verbose_basal .and. this_rank==rtest .and. i==itest .and. j==jtest) then
                   write(iulog,*) ' '
                   write(iulog,*) 'Increment basal traction, i, j, p =', i, j, p
                   write(iulog,*) 'beta_qp, detJ/vol0 =', beta_qp, detJ/vol0
                endif

                ! Compute the element matrix for this quadrature point
                ! (Note volume scaling)
                !TODO - Replace detJ/vol0 with dx*dy?

                Kuu(:,:) = 0.d0

                do nc = 1, nNodesPerElement_2d      ! columns of K
                   do nr = 1, nNodesPerElement_2d   ! rows of K
                      Kuu(nr,nc) = Kuu(nr,nc) + beta_qp * wqp_2d(p) * detJ/vol0 * phi_2d(nr,p)*phi_2d(nc,p)
                   enddo  ! m (rows)
                enddo     ! n (columns)

                !Note: Is this true for all sliding laws?
                Kvv(:,:) = Kuu(:,:)

                ! Insert terms of basal element matrices into global matrices Auu and Avv

                do nr = 1, nNodesPerElement_2d     ! rows of K

                   ! Determine (i,j) for this node
                   ! The reason for the '3' is that node 3, in the NE corner of the cell, has horizontal indices (i,j).
                   ! Indices for other nodes are computed relative to this node.

                   ii = i + ishift(3,nr)
                   jj = j + jshift(3,nr)

                   do nc = 1, nNodesPerElement_2d ! columns of K

                      iA = ishift(nr,nc)          ! iA index of A into which K(nr,nc) is summed
                      jA = jshift(nr,nc)          ! similarly for jA

                      if (nNeighbors == nNodeNeighbors_3d) then  ! 3D problem
                         m = indxA_3d(iA,jA,0)
                      else  ! 2D problem
                         m = indxA_2d(iA,jA)
                      endif

                      Auu(ii,jj,m) = Auu(ii,jj,m) + Kuu(nr,nc)
                      Avv(ii,jj,m) = Avv(ii,jj,m) + Kvv(nr,nc)

                      if (verbose_basal .and. this_rank==rtest .and. ii==itest .and. jj==jtest .and. m==5) then
                         ! m = 5 gives the influence of beta at vertex(i,j) on velocity at vertex(ii,jj).
                         ! For local assembly, Auu and Avv get nonzero increments only for m = 5.
                         write(iulog,*) 'Basal increment for Auu and Avv: source (i,j), Kuu, new Auu, ii, jj, m =', &
                              i, j, Kuu(nr,nc), Auu(ii,jj,m), ii, jj, m
                      endif

                   enddo     ! nc
                enddo        ! nr

                if (verbose_basal .and. this_rank==rtest .and. i==itest .and. j==jtest) then
!                  write(iulog,*) ' '
!                  write(iulog,*) 'i, j =', i, j
!                  write(iulog,*) 'Kuu:'
!                  do nr = 1, nNodesPerElement_2d
!                     write(iulog,*) nr, Kuu(nr,:)
!                  enddo
!                  write(iulog,*) ' '
!                  write(iulog,*) 'rowsum(Kuu):'
!                  do nr = 1, nNodesPerElement_2d
!                     write(iulog,*) nr, sum(Kuu(nr,:))
!                  enddo
!                  write(iulog,*) ' '
!                  write(iulog,*) 'sum(Kuu):', sum(Kuu(:,:))
                endif

             enddo   ! nQuadPoints_2d

          endif      ! active_cell

       enddo         ! i
       enddo         ! j

    endif   ! whichassemble_beta

    if (verbose_basal .and. this_rank==rtest) then
       i = itest
       j = jtest
       if (nNeighbors == nNodeNeighbors_3d) then  ! 3D problem
          m = indxA_3d(0,0,0)
       else
          m = indxA_2d(0,0)
       endif
       write(iulog,*) ' '
       write(iulog,*) 'Basal BC: i, j, diagonal index =', i, j, m
       write(iulog,*) 'New Auu diagonal:', Auu(i,j,m)
       write(iulog,*) 'New Avv diagonal:', Avv(i,j,m)
    endif

  end subroutine basal_sliding_bc_2d

!****************************************************************************

  subroutine basal_sliding_bc_2d_diva(&
       nx,               ny,              &
       nNeighbors,       nhalo,           &
       parallel,                          &
       dx,               dy,              &
       itest,  jtest,    rtest,           &
       active_cell,      active_vertex,   &
       beta_eff_x,       beta_eff_y,      &
       lsrf,                              &
       xVertex,          yVertex,         &
       whichassemble_beta,                &
       Auu,              Avv)

    !------------------------------------------------------------------------
    ! Increment the Auu and Avv matrices with basal traction terms.
    ! Do a surface integral over all basal faces that contain at least one node with a stress BC.
    ! (Not Dirichlet or free-slip)
    ! Note: Basal Dirichlet BCs are enforced after matrix assembly.
    !
    ! This subroutine differs from the one above in that it includes terms
    !  to improve accuracy in regions of steep slopes.
    ! For example, there are two values of beta_eff: one depending on the slope
    !  in the x direction and the other on the slope in the y direction.
    !
    ! Assume a sliding law of the form:
    !   tau_x = -beta*u
    !   tau_y = -beta*v
    ! where beta is defined at vertices (and may depend on the velocity from a previous iteration).
    !
    ! Note: The input beta field should already have been weighted by f_ground. We should have
    !       beta = 0 for floating ice (f_ground = 0). If using a GLP, then beta will
    !       have less than its full value for partially floating ice (0 < f_ground < 1).
    !------------------------------------------------------------------------

    use glissade_grid_operators, only: glissade_stagger, glissade_slope_angle

    integer, intent(in) ::      &
         nx, ny,                  &    ! horizontal grid dimensions
         nNeighbors,              &    ! number of neighbors of each node (used for last dimension of Auu/Avv)
                                       ! = 27 for 3D solve, = 9 for 2D solve
         nhalo                         ! number of halo layers

    type(parallel_type), intent(in) :: &
         parallel                      ! info for parallel communication

    real(dp), intent(in) ::     &
         dx, dy                        ! grid cell length and width

    integer, intent(in) :: &
         itest, jtest, rtest         ! coordinates of diagnostic point

    logical, dimension(nx,ny), intent(in) ::  &
         active_cell                   ! true if cell contains ice and borders a locally owned vertex

    logical, dimension(nx-1,ny-1), intent(in) ::  &
         active_vertex                 ! true for vertices of active cells

    real(dp), dimension(nx-1,ny-1), intent(in) ::  &
         beta_eff_x, beta_eff_y        ! basal traction coefficients(Pa/(m/yr)) at cell vertices
                                       ! defined such that beta_eff*umean = beta*ub

    real(dp), dimension(nx,ny), intent(in) ::    &
         lsrf                          ! lower ice surface elevation (m)

    real(dp), dimension(nx-1,ny-1), intent(in) ::   &
         xVertex, yVertex     ! x and y coordinates of vertices

    integer, intent(in) :: &
         whichassemble_beta   ! = 0 for standard finite element computation of basal forcing terms
                              ! = 1 for computation that uses only the local value of beta at each node

    real(dp), dimension(nx-1,ny-1,nNeighbors), intent(inout) ::  &
         Auu, Avv             ! parts of stiffness matrix (basal layer only)

    !----------------------------------------------------------------
    ! Local variables
    !----------------------------------------------------------------

    integer :: i, j, n, p, nr, nc, iA, jA, m, ii, jj

    real(dp), dimension(nNodesPerElement_2d) ::   &
         x, y, z,     & ! Cartesian coordinates of basal nodes
         bx, by         ! beta_eff_x and beta_eff_y at basal nodes

    real(dp) ::   &
         beta_qpx,    & ! beta_eff_x evaluated at quadrature point
         beta_qpy,    & ! beta_eff_y evaluated at quadrature point
         detJ           ! determinant of Jacobian for the transformation
                        !  between the reference element and true element

    real(dp), dimension(nNodesPerElement_2d, nNodesPerElement_2d) ::   &
         Kuu, Kvv       ! components of element matrix associated with basal sliding

    real(dp), dimension(nx-1,ny-1) ::   &
         staglsrf       ! lsrf interpolated to the staggered grid

    real(dp), dimension(nx-1,ny-1) ::   &
         theta_basal_slope,   & ! basal slope angle (radians)
         theta_basal_slope_x, & ! basal slope angle in x direction
         theta_basal_slope_y    ! basal slope angle in y direction

    if (verbose_basal) then
       call point_diag(beta_eff_x, 'DIVA beta_eff_x', itest, jtest, rtest, 7, 7, '(f10.0)')
       call point_diag(beta_eff_y, 'DIVA beta_eff_y', itest, jtest, rtest, 7, 7, '(f10.0)')
    endif

    if (whichassemble_beta == HO_ASSEMBLE_BETA_LOCAL) then

       if (nNeighbors == nNodeNeighbors_3d) then  ! 3D problem
          m = indxA_3d(0,0,0)
       else  ! 2D problem
          m = indxA_2d(0,0)
       endif

       ! Average the lower ice surface elevation to vertices
       call glissade_stagger(&
            nx,       ny,         &
            lsrf,     staglsrf)

       ! Compute the angle between the lower ice surface and the horizontal.
       !TODO - Make sure this doesn't give bad values in ice-free regions.
       call glissade_slope_angle(&
            nx-1,         ny-1,   &
            dx,           dy,     &  ! m
            itest, jtest, rtest,  &
            staglsrf,             &  ! m
            theta_basal_slope,    &  ! radians
            theta_basal_slope_x,  &
            theta_basal_slope_y)

       call parallel_halo(theta_basal_slope_x, parallel)
       call parallel_halo(theta_basal_slope_y, parallel)

       if (verbose_basal) then
          call point_diag(theta_basal_slope_x*180.d0/pi, 'theta_basal_slope_x (deg)', itest, jtest, rtest, 7, 7, '(f10.0)')
          call point_diag(theta_basal_slope_y*180.d0/pi, 'theta_basal_slope_y (deg)', itest, jtest, rtest, 7, 7, '(f10.0)')
       endif

       ! Sum over active vertices
       do j = 1, ny-1
          do i = 1, nx-1
             if (active_vertex(i,j)) then
                Auu(i,j,m) = Auu(i,j,m) + dx*dy/vol0 * beta_eff_x(i,j) / cos(theta_basal_slope_x(i,j))
                Avv(i,j,m) = Avv(i,j,m) + dx*dy/vol0 * beta_eff_y(i,j) / cos(theta_basal_slope_y(i,j))
             endif   ! active_vertex
          enddo   ! i
       enddo   ! j

    else   ! standard assembly

       ! Average the lower ice surface elevation to vertices
       call glissade_stagger(&
            nx,       ny,         &
            lsrf,     staglsrf)

       ! Sum over elements in active cells
       ! Loop over all cells that contain locally owned vertices
       do j = nhalo+1, ny-nhalo+1
       do i = nhalo+1, nx-nhalo+1

          !TODO - Should we exclude cells that have Dirichlet basal BCs for all vertices?

          if (active_cell(i,j)) then

             ! Set x and y for each node

             !     4-----3       y
             !     |     |       ^
             !     |     |       |
             !     1-----2       ---> x

             x(1) = xVertex(i-1,j-1)
             x(2) = xVertex(i,j-1)
             x(3) = xVertex(i,j)
             x(4) = xVertex(i-1,j)

             y(1) = yVertex(i-1,j-1)
             y(2) = yVertex(i,j-1)
             y(3) = yVertex(i,j)
             y(4) = yVertex(i-1,j)

             z(1) = staglsrf(i-1,j-1)
             z(2) = staglsrf(i,j-1)
             z(3) = staglsrf(i,j)
             z(4) = staglsrf(i-1,j)

             bx(1) = beta_eff_x(i-1,j-1)
             bx(2) = beta_eff_x(i,j-1)
             bx(3) = beta_eff_x(i,j)
             bx(4) = beta_eff_x(i-1,j)

             by(1) = beta_eff_y(i-1,j-1)
             by(2) = beta_eff_y(i,j-1)
             by(3) = beta_eff_y(i,j)
             by(4) = beta_eff_y(i-1,j)

             ! loop over quadrature points

             do p = 1, nQuadPoints_2d

                ! Compute an area scale factor for this quadrature point

                call get_area_scale_factor_curved_2d(&
                     x(:),             y(:),             z(:),  &
                     dphi_dxr_2d(:,p), dphi_dyr_2d(:,p),        &
                     itest, jtest, rtest,                       &
                     i,     j,     p,                           &
                     detJ)

                beta_qpx = 0.d0
                beta_qpy = 0.d0
                do n = 1, nNodesPerElement_2d
                   beta_qpx = beta_qpx + phi_2d(n,p) * bx(n)
                   beta_qpy = beta_qpy + phi_2d(n,p) * by(n)
                enddo

                if (verbose_basal .and. this_rank==rtest .and. i==itest .and. j==jtest) then
                   write(iulog,*) ' '
                   write(iulog,*) 'Increment basal traction, i, j, p =', i, j, p
                   write(iulog,*) 'beta_qpx, beta_qpy, detJ/vol0 =', beta_qpx, beta_qpy, detJ/vol0
                endif

                ! Compute the element matrix for this quadrature point
                ! (Note volume scaling)
                !TODO - Replace detJ/vol0 with dx*dy?

                Kuu(:,:) = 0.d0
                Kvv(:,:) = 0.d0

                do nc = 1, nNodesPerElement_2d      ! columns of K
                   do nr = 1, nNodesPerElement_2d   ! rows of K
                      Kuu(nr,nc) = Kuu(nr,nc) + beta_qpx * wqp_2d(p) * detJ/vol0 * phi_2d(nr,p)*phi_2d(nc,p)
                      Kvv(nr,nc) = Kvv(nr,nc) + beta_qpy * wqp_2d(p) * detJ/vol0 * phi_2d(nr,p)*phi_2d(nc,p)
                   enddo  ! m (rows)
                enddo  ! n (columns)

                ! Insert terms of basal element matrices into global matrices Auu and Avv

                do nr = 1, nNodesPerElement_2d     ! rows of K

                   ! Determine (i,j) for this node
                   ! The reason for the '3' is that node 3, in the NE corner of the cell, has horizontal indices (i,j).
                   ! Indices for other nodes are computed relative to this node.

                   ii = i + ishift(3,nr)
                   jj = j + jshift(3,nr)

                   do nc = 1, nNodesPerElement_2d ! columns of K

                      iA = ishift(nr,nc)          ! iA index of A into which K(nr,nc) is summed
                      jA = jshift(nr,nc)          ! similarly for jA

                      if (nNeighbors == nNodeNeighbors_3d) then  ! 3D problem
                         m = indxA_3d(iA,jA,0)
                      else  ! 2D problem
                         m = indxA_2d(iA,jA)
                      endif

                      Auu(ii,jj,m) = Auu(ii,jj,m) + Kuu(nr,nc)
                      Avv(ii,jj,m) = Avv(ii,jj,m) + Kvv(nr,nc)

                      if (verbose_basal .and. this_rank==rtest .and. ii==itest .and. jj==jtest .and. m==5) then
                         ! m = 5 gives the influence of beta at vertex(i,j) on velocity at vertex(ii,jj).
                         ! For local assembly, Auu and Avv get nonzero increments only for m = 5.
                         write(iulog,*) 'Basal increment for Auu and Avv: source (i,j), Kuu, new Auu, ii, jj, m =', &
                              i, j, Kuu(nr,nc), Auu(ii,jj,m), ii, jj, m
                      endif

                   enddo     ! nc
                enddo        ! nr

             enddo   ! nQuadPoints_2d

          endif      ! active_cell

       enddo         ! i
       enddo         ! j

    endif   ! whichassemble_beta

    if (verbose_basal .and. this_rank==rtest) then
       i = itest
       j = jtest
       if (nNeighbors == nNodeNeighbors_3d) then  ! 3D problem
          m = indxA_3d(0,0,0)
       else
          m = indxA_2d(0,0)
       endif
       write(iulog,*) ' '
       write(iulog,*) 'Basal BC: i, j, diagonal index =', i, j, m
       write(iulog,*) 'New Auu diagonal:', Auu(i,j,m)
       write(iulog,*) 'New Avv diagonal:', Avv(i,j,m)
    endif

  end subroutine basal_sliding_bc_2d_diva

!****************************************************************************

  subroutine basal_sliding_bc_3d(nx,               ny,              &
                              nNeighbors,       nhalo,           &
                              parallel,                          &
                              dx,               dy,              &
                              itest,  jtest,    rtest,           &
                              active_cell,      active_vertex,   &
                              beta,             lsrf,            &
                              xVertex,          yVertex,         &
                              whichassemble_beta,                &
                              Auu,              Avv)

    !------------------------------------------------------------------------
    ! Increment the Auu and Avv matrices with basal traction terms.
    ! Do a surface integral over all basal faces that contain at least one node with a stress BC. 
    ! (Not Dirichlet or free-slip)
    ! Note: Basal Dirichlet BCs are enforced after matrix assembly. 
    !
    ! Assume a sliding law of the form:
    !   tau_x = -beta*u
    !   tau_y = -beta*v
    ! where beta is defined at vertices (and beta may depend
    ! on the velocity from a previous iteration).
    !
    ! Note: The input beta field should already have been weighted by f_ground. We should have
    !       beta = 0 for floating ice (f_ground = 0). If using a GLP, then beta will
    !       have less than its full value for partially floating ice (0 < f_ground < 1). 
    !------------------------------------------------------------------------

    use glissade_grid_operators, only: glissade_stagger, glissade_slope_angle

    integer, intent(in) ::      &
       nx, ny,                  &    ! horizontal grid dimensions
       nNeighbors,              &    ! number of neighbors of each node (used for first dimension of Auu/Avv)
                                     ! = 27 for 3D solve, = 9 for 2D solve
       nhalo                         ! number of halo layers

    type(parallel_type), intent(in) :: &
       parallel                      ! info for parallel communication

    real(dp), intent(in) ::     &
       dx, dy                        ! grid cell length and width

    integer, intent(in) :: &
       itest, jtest, rtest         ! coordinates of diagnostic point

    logical, dimension(nx,ny), intent(in) ::  &
       active_cell                   ! true if cell contains ice and borders a locally owned vertex

    logical, dimension(nx-1,ny-1), intent(in) ::  &
       active_vertex                 ! true for vertices of active cells

    real(dp), dimension(nx-1,ny-1), intent(in) ::    &
       beta                          ! basal traction field (Pa/(m/yr)) at cell vertices
                                     ! typically = beta_internal (beta weighted by f_ground)

    real(dp), dimension(nx,ny), intent(in) ::    &
       lsrf                          ! lower ice surface elevation (m)

    real(dp), dimension(nx-1,ny-1), intent(in) ::   &
       xVertex, yVertex     ! x and y coordinates of vertices

    integer, intent(in) :: &
       whichassemble_beta   ! = 0 for standard finite element computation of basal forcing terms
                            ! = 1 for computation that uses only the local value of beta at each node

    real(dp), dimension(nNeighbors,nx-1,ny-1), intent(inout) ::  &
       Auu, Avv             ! parts of stiffness matrix (basal layer only)

    !----------------------------------------------------------------
    ! Local variables
    !----------------------------------------------------------------

    integer :: i, j, n, p, nr, nc, iA, jA, m, ii, jj

    real(dp), dimension(nNodesPerElement_2d) ::   &
       x, y, z,     & ! Cartesian coordinates of basal nodes
       b              ! beta at basal nodes

    ! Note: These are not currently used except as dummy arguments
    real(dp), dimension(nNodesPerElement_2d) ::   &
       dphi_dx_2d, dphi_dy_2d, dphi_dz_2d  ! derivatives of basis functions, evaluated at quad pts

    real(dp) ::   &
       beta_qp,     & ! beta evaluated at quadrature point
       detJ           ! determinant of Jacobian for the transformation
                      !  between the reference element and true element

    real(dp), dimension(nNodesPerElement_2d, nNodesPerElement_2d) ::   &
       Kuu, Kvv       ! components of element matrix associated with basal sliding

    real(dp), dimension(nx-1,ny-1) ::   &
       staglsrf       ! lsrf interpolated to the staggered grid

    real(dp), dimension(nx-1,ny-1) ::   &
       theta_basal_slope    ! basal slope angle (radians)

    !WHL, May 2025:
    ! Set this parameter to true for more accurate basal sliding on steep slopes.
    ! Set to false for back compatiblity.
    logical, parameter :: curved_2d_basal_jacobian = .true.
!    logical, parameter :: curved_2d_basal_jacobian = .false.

    if (verbose_basal) then
       call point_diag(beta, 'beta', itest, jtest, rtest, 7, 7, '(f10.0)')
    endif

    if (whichassemble_beta == HO_ASSEMBLE_BETA_LOCAL) then

       if (nNeighbors == nNodeNeighbors_3d) then  ! 3D problem
          m = indxA_3d(0,0,0)
       else  ! 2D problem
          m = indxA_2d(0,0)
       endif
       
       ! Average the lower ice surface elevation to vertices
       call glissade_stagger(&
            nx,       ny,         &
            lsrf,     staglsrf)

       ! Compute the angle between the lower ice surface and the horizontal.
       !TODO - Make sure this doesn't give bad values in ice-free regions.
       call glissade_slope_angle(&
            nx-1,         ny-1,   &
            dx,           dy,     &  ! m
            itest, jtest, rtest,  &
            staglsrf,             &  ! m
            theta_basal_slope)         ! radians

       call parallel_halo(theta_basal_slope, parallel)

       ! Sum over active vertices
       do j = 1, ny-1
          do i = 1, nx-1
             if (active_vertex(i,j)) then
                Auu(m,i,j) = Auu(m,i,j) + dx*dy/vol0 * beta(i,j) / cos(theta_basal_slope(i,j))
                Avv(m,i,j) = Avv(m,i,j) + dx*dy/vol0 * beta(i,j) / cos(theta_basal_slope(i,j))
             endif   ! active_vertex
          enddo   ! i
       enddo   ! j

    else   ! standard assembly

       ! Average the lower ice surface elevation to vertices
       call glissade_stagger(&
            nx,       ny,         &
            lsrf,     staglsrf)

       ! Sum over elements in active cells
       ! Loop over all cells that contain locally owned vertices
       do j = nhalo+1, ny-nhalo+1
       do i = nhalo+1, nx-nhalo+1
       
          !TODO - Should we exclude cells that have Dirichlet basal BCs for all vertices?

          if (active_cell(i,j)) then

             ! Set (x,y,z) for each node

             !     4-----3       y
             !     |     |       ^
             !     |     |       |
             !     1-----2       ---> x

             x(1) = xVertex(i-1,j-1)
             x(2) = xVertex(i,j-1)
             x(3) = xVertex(i,j)
             x(4) = xVertex(i-1,j)

             y(1) = yVertex(i-1,j-1)
             y(2) = yVertex(i,j-1)
             y(3) = yVertex(i,j)
             y(4) = yVertex(i-1,j)

             z(1) = staglsrf(i-1,j-1)
             z(2) = staglsrf(i,j-1)
             z(3) = staglsrf(i,j)
             z(4) = staglsrf(i-1,j)

             b(1) = beta(i-1,j-1)
             b(2) = beta(i,j-1)
             b(3) = beta(i,j)
             b(4) = beta(i-1,j)

             ! loop over quadrature points

             do p = 1, nQuadPoints_2d

                ! Compute basis function derivatives and det(J) for this quadrature point
                ! For now, pass in i, j, k, p for debugging
                !TODO - Modify this subroutine so that the output derivatives are optional?

                if (curved_2d_basal_jacobian) then

                   ! Compute an area scale factor for this quadrature point

                   call get_area_scale_factor_curved_2d(&
                        x(:),             y(:),             z(:),  &
                        dphi_dxr_2d(:,p), dphi_dyr_2d(:,p),        &
                        itest, jtest, rtest,                       &
                        i,     j,     p,                           &
                        detJ)

                else   ! old calculation with detJ based on a flat bed

                   call get_basis_function_derivatives_2d(x(:),             y(:),               &
                                                       dphi_dxr_2d(:,p), dphi_dyr_2d(:,p),   &
                                                       dphi_dx_2d(:),    dphi_dy_2d(:),      &
                                                       detJ,                                 &
                                                       itest, jtest, rtest,                  &
                                                       i, j, p)
                endif

                ! Evaluate beta at this quadrature point, taking a phi-weighted sum over neighboring vertices.
                beta_qp = 0.d0
                do n = 1, nNodesPerElement_2d
                   beta_qp = beta_qp + phi_2d(n,p) * b(n)
                enddo

                if (verbose_basal .and. this_rank==rtest .and. i==itest .and. j==jtest) then
                   write(iulog,*) ' '
                   write(iulog,*) 'Increment basal traction, i, j, p =', i, j, p
                   write(iulog,*) 'beta_qp, detJ/vol0 =', beta_qp, detJ/vol0
                endif

                ! Compute the element matrix for this quadrature point
                ! (Note volume scaling)
                !TODO - Replace detJ/vol0 with dx*dy?

                Kuu(:,:) = 0.d0

                do nc = 1, nNodesPerElement_2d      ! columns of K
                   do nr = 1, nNodesPerElement_2d   ! rows of K
                      Kuu(nr,nc) = Kuu(nr,nc) + beta_qp * wqp_2d(p) * detJ/vol0 * phi_2d(nr,p)*phi_2d(nc,p)
                   enddo  ! m (rows)
                enddo     ! n (columns)

                !Note: Is this true for all sliding laws?
                Kvv(:,:) = Kuu(:,:)

                ! Insert terms of basal element matrices into global matrices Auu and Avv

                do nr = 1, nNodesPerElement_2d     ! rows of K

                   ! Determine (i,j) for this node
                   ! The reason for the '3' is that node 3, in the NE corner of the cell, has horizontal indices (i,j).
                   ! Indices for other nodes are computed relative to this node.

                   ii = i + ishift(3,nr)
                   jj = j + jshift(3,nr)

                   do nc = 1, nNodesPerElement_2d ! columns of K

                      iA = ishift(nr,nc)          ! iA index of A into which K(nr,nc) is summed
                      jA = jshift(nr,nc)          ! similarly for jA

                      if (nNeighbors == nNodeNeighbors_3d) then  ! 3D problem
                         m = indxA_3d(iA,jA,0)
                      else  ! 2D problem
                         m = indxA_2d(iA,jA)
                      endif

                      Auu(m,ii,jj) = Auu(m,ii,jj) + Kuu(nr,nc)
                      Avv(m,ii,jj) = Avv(m,ii,jj) + Kvv(nr,nc)

                      if (verbose_basal .and. this_rank==rtest .and. ii==itest .and. jj==jtest .and. m==5) then
                         ! m = 5 gives the influence of beta at vertex(i,j) on velocity at vertex(ii,jj).
                         ! For local assembly, Auu and Avv get nonzero increments only for m = 5.
                         write(iulog,*) 'Basal increment for Auu and Avv: source (i,j), Kuu, new Auu, ii, jj, m =', &
                              i, j, Kuu(nr,nc), Auu(m,ii,jj), ii, jj, m
                      endif

                   enddo     ! nc
                enddo        ! nr

                if (verbose_basal .and. this_rank==rtest .and. i==itest .and. j==jtest) then
!                  write(iulog,*) ' '
!                  write(iulog,*) 'i, j =', i, j
!                  write(iulog,*) 'Kuu:'
!                  do nr = 1, nNodesPerElement_2d
!                     write(iulog,*) nr, Kuu(nr,:)
!                  enddo
!                  write(iulog,*) ' '
!                  write(iulog,*) 'rowsum(Kuu):'
!                  do nr = 1, nNodesPerElement_2d
!                     write(iulog,*) nr, sum(Kuu(nr,:))
!                  enddo
!                  write(iulog,*) ' '
!                  write(iulog,*) 'sum(Kuu):', sum(Kuu(:,:))
                endif

             enddo   ! nQuadPoints_2d

          endif      ! active_cell

       enddo         ! i
       enddo         ! j

    endif   ! whichassemble_beta

    if (verbose_basal .and. this_rank==rtest) then
       i = itest
       j = jtest
       if (nNeighbors == nNodeNeighbors_3d) then  ! 3D problem
          m = indxA_3d(0,0,0)
       else
          m = indxA_2d(0,0)
       endif
       write(iulog,*) ' '
       write(iulog,*) 'Basal BC: i, j, diagonal index =', i, j, m
       write(iulog,*) 'New Auu diagonal:', Auu(m,i,j)
       write(iulog,*) 'New Avv diagonal:', Avv(m,i,j)
    endif

  end subroutine basal_sliding_bc_3d

!****************************************************************************

  subroutine dirichlet_boundary_conditions_3d(nx,              ny,               &
                                              nz,              nhalo,            &
                                              active_vertex,                     &
                                              umask_dirichlet, vmask_dirichlet,  &
                                              uvel,            vvel,             &
                                              Auu,             Auv,              &
                                              Avu,             Avv,              &
                                              bu,              bv)

    !----------------------------------------------------------------
    ! Modify the global matrix and RHS for Dirichlet boundary conditions,
    !  where uvel and vvel are prescribed at certain nodes.
    ! For each such node, we zero out the row, except for setting the diagonal term to 1.
    ! We also zero out the column, moving terms containing uvel/vvel to the rhs.
    !----------------------------------------------------------------

    !----------------------------------------------------------------
    ! Input-output arguments
    !----------------------------------------------------------------

    integer, intent(in) ::   &
       nx, ny,               &  ! horizontal grid dimensions
       nz,                   &  ! number of vertical levels where velocity is computed
       nhalo                    ! number of halo layers

    logical, dimension(nx-1,ny-1), intent(in) ::  &
       active_vertex       ! true for active vertices (vertices of active cells)

      integer, dimension(nz,nx-1,ny-1), intent(in) ::  &
       umask_dirichlet,   &! Dirichlet mask for u velocity (if true, u is prescribed)
       vmask_dirichlet     ! Dirichlet mask for v velocity (if true, v is prescribed)

    real(dp), dimension(nz,nx-1,ny-1), intent(in) ::  &
       uvel, vvel          ! velocity components

    real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(inout) ::   &
       Auu, Auv,    &      ! assembled stiffness matrix, divided into 4 parts
       Avu, Avv                                    

    real(dp), dimension(nz,nx-1,ny-1), intent(inout) ::   &
       bu, bv              ! assembled load vector, divided into 2 parts

    !----------------------------------------------------------------
    ! Local variables
    !----------------------------------------------------------------
    
    integer :: i, j, k     ! Cartesian indices of nodes
    integer :: iA, jA, kA  ! i, j, and k offsets of neighboring nodes 
    integer :: m

    ! Loop over all vertices that border locally owned vertices.
    ! For outflow BC, OK to skip vertices outside the global domain (i < nhalo or j < nhalo).
    ! Note: Need nhalo >= 2 so as not to step out of bounds.

     do j = nhalo, ny-nhalo+1
        do i = nhalo, nx-nhalo+1
          if (active_vertex(i,j)) then
             do k = 1, nz

                if (umask_dirichlet(k,i,j) == 1) then

                   ! set the rhs to the prescribed velocity
                   bu(k,i,j) = uvel(k,i,j)

                   ! loop through matrix values in the rows associated with this node
                   ! (Auu contains one row, Avu contains a second row)
                   do kA = -1,1
                   do jA = -1,1
                   do iA = -1,1

                      if ( (k+kA >= 1 .and. k+kA <= nz)         &
                                      .and.                     &
                           (i+iA >= 1 .and. i+iA <= nx-1)       &
                                      .and.                     &
                           (j+jA >= 1 .and. j+jA <= ny-1) ) then

                         if (iA==0 .and. jA==0 .and. kA==0) then  ! main diagonal

                            ! Set Auu = 1 on the main diagonal
                            ! Set Auv term = 0; this term is off-diagonal for the fully assembled matrix
                            ! Set Avu term = 0 to preserve matrix symmetry (given that Auv term = 0)
                            m = indxA_3d(0,0,0)
                            Auu(m,k,i,j) = 1.d0
                            Auv(m,k,i,j) = 0.d0
                            Avu(m,k,i,j) = 0.d0

                            !TODO - Set bu above, outside iA/jA loop
                            ! Set the rhs to the prescribed velocity, forcing u = prescribed uvel for this vertex
!!                            bu(k,i,j) = uvel(k,i,j)
                            
                         else     ! not on the diagonal

                            ! Zero out non-diagonal matrix terms in the rows associated with this node
                            m = indxA_3d(iA,jA,kA)
                            Auu(m, k, i, j) = 0.d0
                            Auv(m, k, i, j) = 0.d0

                            ! Shift terms associated with this velocity to the rhs.
                            ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix.
                            m = indxA_3d(-iA,-jA,-kA)

                            if (umask_dirichlet(k+kA, i+iA, j+jA) /= 1) then
                               ! Move (Auu term) * uvel to rhs
                               bu(k+kA, i+iA, j+jA) = bu(k+kA, i+iA, j+jA) - Auu(m, k+kA, i+iA, j+jA) * uvel(k,i,j) 
                               Auu(m, k+kA, i+iA, j+jA) = 0.d0
                            endif

                            if (vmask_dirichlet(k+kA, i+iA, j+jA) /= 1) then
                               ! Move (Avu term) * uvel to rhs
                               bv(k+kA, i+iA, j+jA) = bv(k+kA, i+iA, j+jA) - Avu(m, k+kA, i+iA, j+jA) * uvel(k,i,j)
                               Avu(m, k+kA, i+iA, j+jA) = 0.d0
                            endif

                         endif  ! on the diagonal

                     endif     ! i+iA, j+jA, and k+kA in bounds

                  enddo        ! kA
                  enddo        ! iA
                  enddo        ! jA

                endif    ! umask_dirichlet

                if (vmask_dirichlet(k,i,j) == 1) then

                   ! set the rhs to the prescribed velocity
                   bv(k,i,j) = vvel(k,i,j)

                   ! loop through matrix values in the rows associated with this node
                   ! (Auu contains one row, Avu contains a second row)
                   do kA = -1,1
                   do jA = -1,1
                   do iA = -1,1

                      if ( (k+kA >= 1 .and. k+kA <= nz)         &
                                      .and.                     &
                           (i+iA >= 1 .and. i+iA <= nx-1)       &
                                      .and.                     &
                           (j+jA >= 1 .and. j+jA <= ny-1) ) then

                         if (iA==0 .and. jA==0 .and. kA==0) then  ! main diagonal

                            ! Set Avv = 1 on the main diagonal
                            ! Set Avu term = 0; this term is off-diagonal for the fully assembled matrix
                            ! Set Auv term = 0 to preserve matrix symmetry (given that Avu term = 0)
                            m = indxA_3d(0,0,0)

                            Auv(m,k,i,j) = 0.d0
                            Avu(m,k,i,j) = 0.d0
                            Avv(m,k,i,j) = 1.d0

                            !TODO - Set bv above, outside iA/jA loop
                            ! Set the rhs to the prescribed velocity, forcing v = prescribed vvel for this node
!!                            bv(k,i,j) = vvel(k,i,j)
                            
                         else     ! not on the diagonal

                            ! Zero out non-diagonal matrix terms in the rows associated with this node
                            m = indxA_3d(iA,jA,kA)
                            Avu(m, k, i, j) = 0.d0
                            Avv(m, k, i, j) = 0.d0

                            ! Shift terms associated with this velocity to the rhs.
                            ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix.
                            m = indxA_3d(-iA,-jA,-kA)

                            if (umask_dirichlet(k+kA, i+iA, j+jA) /= 1) then
                               ! Move (Auv term) * vvel to rhs
                               bu(k+kA, i+iA, j+jA) = bu(k+kA, i+iA, j+jA) - Auv(m, k+kA, i+iA, j+jA) * vvel(k,i,j)
                               Auv(m, k+kA, i+iA, j+jA) = 0.d0
                            endif

                            if (vmask_dirichlet(k+kA, i+iA, j+jA) /= 1) then
                               ! Move (Avv term) * vvel to rhs
                               bv(k+kA, i+iA, j+jA) = bv(k+kA, i+iA, j+jA) - Avv(m, k+kA, i+iA, j+jA) * vvel(k,i,j)
                               Avv(m, k+kA, i+iA, j+jA) = 0.d0
                            endif

                         endif  ! on the diagonal

                     endif     ! i+iA, j+jA, and k+kA in bounds

                  enddo        ! kA
                  enddo        ! iA
                  enddo        ! jA

                endif    ! vmask_dirichlet

             enddo       ! k
          endif          ! active_vertex
       enddo             ! i
    enddo                ! j

  end subroutine dirichlet_boundary_conditions_3d

!****************************************************************************

  subroutine dirichlet_boundary_conditions_2d(nx,              ny,               &
                                              nhalo,                             &
                                              active_vertex,                     &
                                              umask_dirichlet, vmask_dirichlet,  &
                                              uvel,            vvel,             &
                                              Auu,             Auv,              &
                                              Avu,             Avv,              &
                                              bu,              bv)

    !----------------------------------------------------------------
    ! Modify the global matrix and RHS for Dirichlet boundary conditions,
    !  where uvel and vvel are prescribed at certain nodes.
    ! For each such node, we zero out the row, except for setting the diagonal term to 1.
    ! We also zero out the column, moving terms containing uvel/vvel to the rhs.
    !----------------------------------------------------------------

    !----------------------------------------------------------------
    ! Input-output arguments
    !----------------------------------------------------------------

    integer, intent(in) ::   &
       nx, ny,               &  ! horizontal grid dimensions
       nhalo                    ! number of halo layers

    logical, dimension(nx-1,ny-1), intent(in) ::  &
       active_vertex       ! true for active vertices (vertices of active cells)

    integer, dimension(nx-1,ny-1), intent(in) ::  &
       umask_dirichlet,   &! Dirichlet mask for velocity (if true, u is prescribed)
       vmask_dirichlet     ! Dirichlet mask for velocity (if true, v is prescribed)

    real(dp), dimension(nx-1,ny-1), intent(in) ::  &
       uvel, vvel          ! velocity components

    real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) ::   &
       Auu, Auv,    &      ! assembled stiffness matrix, divided into 4 parts
       Avu, Avv                                    

    real(dp), dimension(nx-1,ny-1), intent(inout) ::   &
       bu, bv              ! assembled load vector, divided into 2 parts

    !----------------------------------------------------------------
    ! Local variables
    !----------------------------------------------------------------
    
    integer :: i, j     ! Cartesian indices of nodes
    integer :: iA, jA   ! i and j offsets of neighboring nodes 
    integer :: m, mm

    ! Loop over all vertices that border locally owned vertices.
    ! Locally owned vertices are (staggered_ilo:staggered_ihi, staggered_jlo_staggered_jhi).
    ! OK to skip vertices outside the global domain (i < nhalo or j < nhalo).
    ! Note: Need nhalo >= 2 so as not to step out of bounds.

    do jA = -1,1
    do iA = -1,1
       m  = indxA_2d(iA,jA)
       mm = indxA_2d(-iA,-jA)

       do j = nhalo, ny-nhalo+1
          do i = nhalo, nx-nhalo+1
             if (active_vertex(i,j)) then

                if (umask_dirichlet(i,j) == 1) then

                   ! set the rhs to the prescribed velocity
                   bu(i,j) = uvel(i,j)

                   ! loop through matrix values in the rows associated with this vertex
                   ! (Auu contains one row, Avu contains a second row)

                   if ( (i+iA >= 1 .and. i+iA <= nx-1)       &
                                   .and.                     &
                        (j+jA >= 1 .and. j+jA <= ny-1) ) then

                      if (iA==0 .and. jA==0) then  ! main diagonal

                         ! Set Auu = 1 on the main diagonal
                         ! Set Auv term = 0; this term is off-diagonal for the fully assembled matrix
                         ! Set Avu term = 0 to preserve matrix symmetry (given that Auv term = 0)
                         Auu(i,j,indxA_2d(0,0)) = 1.d0
                         Auv(i,j,indxA_2d(0,0)) = 1.d0
                         Avu(i,j,indxA_2d(0,0)) = 1.d0

                      else     ! not on the diagonal

                         ! Zero out non-diagonal matrix terms in the row associated with this vertex
                         Auu(i,j,m) = 0.d0
                         Auv(i,j,m) = 0.d0

                         ! Shift terms associated with this velocity to the rhs.
                         ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix.
                         ! Recall mm = indxA_2d(-iA,-jA)

                         if (umask_dirichlet(i+iA, j+jA) /= 1) then
                            ! Move (Auu term) * uvel to rhs
                            bu(i+iA, j+jA) = bu(i+iA, j+jA) - Auu(i+iA, j+jA, mm) * uvel(i,j)
                            Auu(i+iA, j+jA, mm) = 0.d0
                         endif

                         if (vmask_dirichlet(i+iA, j+jA) /= 1) then
                            ! Move (Avu term) * uvel to rhs
                            bv(i+iA, j+jA) = bv(i+iA, j+jA) - Avu(i+iA, j+jA, mm) * uvel(i,j)
                            Avu(i+iA, j+jA, mm) = 0.d0
                         endif

                      endif  ! on the diagonal

                   endif     ! i+iA and j+jA in bounds

                endif       ! umask_dirichlet

                if (vmask_dirichlet(i,j) == 1) then

                   ! set the rhs to the prescribed velocity
                   bv(i,j) = vvel(i,j)

                   ! loop through matrix values in the rows associated with this vertex
                   ! (Auv contains one row, Avv contains a second row)

                   if ( (i+iA >= 1 .and. i+iA <= nx-1)       &
                                   .and.                     &
                        (j+jA >= 1 .and. j+jA <= ny-1) ) then

                      if (iA==0 .and. jA==0) then  ! main diagonal

                         ! Set Avv = 1 on the main diagonal
                         ! Set Avu term = 0; this term is off-diagonal for the fully assembled matrix
                         ! Set Auv term = 0 to preserve matrix symmetry (given that Avu term = 0)
                         Auv(i,j,indxA_2d(0,0)) = 0.d0
                         Avu(i,j,indxA_2d(0,0)) = 0.d0
                         Avv(i,j,indxA_2d(0,0)) = 1.d0

                      else     ! not on the diagonal

                         ! Zero out non-diagonal matrix terms in the rows associated with this vertex
                         Avu(i,j,m) = 0.d0
                         Avv(i,j,m) = 0.d0

                         ! Shift terms associated with this velocity to the rhs.
                         ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix.
                         ! Recall mm = indxA_2d(-iA,-jA)

                         if (umask_dirichlet(i+iA, j+jA) /= 1) then
                            ! Move (Auv term) * vvel to rhs
                            bu(i+iA, j+jA) = bu(i+iA, j+jA) - Auv(i+iA, j+jA, mm) * vvel(i,j)
                            Auv(i+iA, j+jA, mm) = 0.d0
                         endif

                         if (vmask_dirichlet(i+iA, j+jA) /= 1) then
                            ! Move (Avv term) * vvel to rhs
                            bv(i+iA, j+jA) = bv(i+iA, j+jA) - Avv(i+iA, j+jA, mm) * vvel(i,j)
                            Avv(i+iA, j+jA, mm) = 0.d0
                         endif

                      endif  ! on the diagonal

                   endif     ! i+iA and j+jA in bounds

                endif       ! vmask_dirichlet

             endif          ! active_vertex
          enddo             ! i
       enddo                ! j

    enddo    ! iA
    enddo    ! jA

  end subroutine dirichlet_boundary_conditions_2d

!****************************************************************************

  subroutine compute_residual_vector_3d(nx,    ny,     nz,            &
                                        parallel,                     &
                                        itest,  jtest, rtest,         &
                                        active_vertex,                &
                                        Auu,           Auv,           &
                                        Avu,           Avv,           &
                                        bu,            bv,            &
                                        uvel,          vvel,          &
                                        resid_u,       resid_v,       &
                                        L2_norm,       L2_norm_relative)

    ! Compute the residual vector Ax - b and its L2 norm.
    ! This subroutine assumes that the matrix is stored in structured (x/y/z) format.

    integer, intent(in) ::   &
       nx, ny,             &  ! horizontal grid dimensions (for scalars)
       nz                     ! number of vertical levels where velocity is computed

    type(parallel_type), intent(in) :: &
       parallel               ! info for parallel communication

    integer, intent(in) :: &
       itest, jtest, rtest         ! coordinates of diagnostic point

    logical, dimension(nx-1,ny-1), intent(in) ::   &
       active_vertex          ! T for columns (i,j) where velocity is computed, else F

    real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) ::   &
       Auu, Auv, Avu, Avv     ! four components of assembled matrix
                              ! 1st dimension = 3 (node and its nearest neighbors in x, y and z direction)
                              ! other dimensions = (z,x,y) indices
                              !
                              !    Auu  | Auv
                              !    _____|____
                              !    Avu  | Avv
                              !         |

    real(dp), dimension(nz,nx-1,ny-1), intent(in) ::  &
       bu, bv              ! assembled load (rhs) vector, divided into 2 parts

   real(dp), dimension(nz,nx-1,ny-1), intent(in) ::   &
       uvel, vvel          ! u and v components of velocity (m/yr)

    real(dp), dimension(nz,nx-1,ny-1), intent(out) ::   &
       resid_u,      & ! residual vector, divided into 2 parts
       resid_v         !

    real(dp), intent(out) ::    &
       L2_norm             ! L2 norm of residual vector, |Ax - b|

    real(dp), intent(out), optional ::    &
       L2_norm_relative    ! L2 norm of residual vector relative to rhs, |Ax - b| / |b|

    real(dp), dimension(nz,nx-1,ny-1) ::   &
       resid_sq        ! resid_u^2 + resid_v^2

    real(dp) :: my_max_resid, global_max_resid

    integer :: i, j, k, iA, jA, kA, m, iglobal, jglobal

    real(dp) :: L2_norm_rhs   ! L2 norm of rhs vector, |b|

    integer :: &
         staggered_ilo, staggered_ihi, &  ! bounds of locally owned vertices on staggered grid
         staggered_jlo, staggered_jhi

    staggered_ilo = parallel%staggered_ilo
    staggered_ihi = parallel%staggered_ihi
    staggered_jlo = parallel%staggered_jlo
    staggered_jhi = parallel%staggered_jhi

    ! Compute u and v components of A*x

    resid_u(:,:,:) = 0.d0
    resid_v(:,:,:) = 0.d0

    !TODO - Replace the following by a call to matvec_multiply_structured_3d
    ! Loop over locally owned vertices

    do j = staggered_jlo, staggered_jhi
    do i = staggered_ilo, staggered_ihi

       if (active_vertex(i,j)) then

          do k = 1, nz

             do kA = -1,1
             do jA = -1,1
             do iA = -1,1

                if ( (k+kA >= 1 .and. k+kA <= nz)      &
                                .and.                  &
                     (i+iA >= 1 .and. i+iA <= nx-1)    &
                             .and.                     &
                     (j+jA >= 1 .and. j+jA <= ny-1) ) then

                   m = indxA_3d(iA,jA,kA)

                   resid_u(k,i,j) = resid_u(k,i,j)                     & 
                                  + Auu(m,k,i,j)*uvel(k+kA,i+iA,j+jA)  &
                                  + Auv(m,k,i,j)*vvel(k+kA,i+iA,j+jA)

                   resid_v(k,i,j) = resid_v(k,i,j)                     &
                                  + Avu(m,k,i,j)*uvel(k+kA,i+iA,j+jA)  &
                                  + Avv(m,k,i,j)*vvel(k+kA,i+iA,j+jA)

                endif   ! in bounds

             enddo   ! kA
             enddo   ! iA
             enddo   ! jA

          enddo   ! k

       endif   ! active_vertex

    enddo   ! i
    enddo   ! j

    ! Subtract b to get A*x - b
    ! Sum up squared L2 norm as we go

    L2_norm = 0.d0
    resid_sq(:,:,:) = 0.0d0

    ! Loop over locally owned vertices

    do j = staggered_jlo, staggered_jhi
    do i = staggered_ilo, staggered_ihi
       if (active_vertex(i,j)) then
          do k = 1, nz
             resid_u(k,i,j) = resid_u(k,i,j) - bu(k,i,j)
             resid_v(k,i,j) = resid_v(k,i,j) - bv(k,i,j)
             resid_sq(k,i,j) = resid_u(k,i,j)*resid_u(k,i,j) + resid_v(k,i,j)*resid_v(k,i,j)
             L2_norm = L2_norm + resid_sq(k,i,j)
          enddo  ! k
       endif     ! active vertex
    enddo        ! i
    enddo        ! j

    ! Take global sum, then take square root
    L2_norm = parallel_reduce_sum(L2_norm)
    L2_norm = sqrt(L2_norm)

    if (verbose_residual) then

       if (this_rank==rtest) then
          i = itest
          j = jtest
          k = ktest
          write(iulog,*) 'In compute_residual_vector_3d: task, i, j, k =', this_rank, i, j, k
          write(iulog, '(a16, 2f13.7, 2e13.5)') &
               '  u, v, ru, rv: ', uvel(k,i,j), vvel(k,i,j), resid_u(k,i,j), resid_v(k,i,j)
       endif

       ! Compute max value of (squared) residual on this task.
       ! If this task owns the vertex with the global max residual, then print a diagnostic message.
       my_max_resid = maxval(resid_sq)
       global_max_resid = parallel_reduce_max(my_max_resid)

       if (abs((my_max_resid - global_max_resid)/global_max_resid) < 1.0d-6) then
          do j = staggered_jlo, staggered_jhi
             do i = staggered_ilo, staggered_ihi
                do k = 1, nz
                   if (abs((resid_sq(k,i,j) - global_max_resid)/global_max_resid) < 1.0d-6) then
                      call parallel_globalindex(i, j, iglobal, jglobal, parallel)
                      write(iulog, '(a24, 2i6, i4, 2e13.5, e16.8)') 'ig, jg, k, ru, rv, rmax:', &
                           iglobal, jglobal, k, resid_u(k,i,j), resid_v(k,i,j), sqrt(global_max_resid)
                      write(iulog,*) ' '
                   endif
                enddo
             enddo
          enddo
       endif

    endif  ! verbose_residual

    if (present(L2_norm_relative)) then   ! compute L2_norm relative to rhs

       L2_norm_rhs = 0.d0

       do j = staggered_jlo, staggered_jhi
       do i = staggered_ilo, staggered_ihi
          if (active_vertex(i,j)) then
             do k = 1, nz
                L2_norm_rhs = L2_norm_rhs + bu(k,i,j)*bu(k,i,j) + bv(k,i,j)*bv(k,i,j)
             enddo  ! k
          endif     ! active vertex
       enddo        ! i
       enddo        ! j

       ! Take global sum, then take square root
       L2_norm_rhs = parallel_reduce_sum(L2_norm_rhs)
       L2_norm_rhs = sqrt(L2_norm_rhs)

       if (L2_norm_rhs > 0.d0) then
          L2_norm_relative = L2_norm / L2_norm_rhs
       else
          L2_norm_relative = 0.d0
       endif

    endif

  end subroutine compute_residual_vector_3d

!****************************************************************************

  subroutine compute_residual_vector_2d(nx,            ny,            &
                                        parallel,                     &
                                        itest,  jtest, rtest,         &
                                        active_vertex,                &
                                        Auu,           Auv,           &
                                        Avu,           Avv,           &
                                        bu,            bv,            &
                                        uvel,          vvel,          &
                                        resid_u,       resid_v,       &
                                        L2_norm,       L2_norm_relative)

    ! Compute the residual vector Ax - b and its L2 norm.
    ! This subroutine assumes that the matrix is stored in structured (x/y/z) format.

    integer, intent(in) ::   &
       nx, ny                 ! horizontal grid dimensions (for scalars)

    type(parallel_type), intent(in) :: &
       parallel               ! info for parallel communication

    integer, intent(in) :: &
       itest, jtest, rtest         ! coordinates of diagnostic point

    logical, dimension(nx-1,ny-1), intent(in) ::   &
       active_vertex          ! T for columns (i,j) where velocity is computed, else F

    real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) ::   &
       Auu, Auv, Avu, Avv     ! four components of assembled matrix
                              ! 3rd dimension = 9 (node and its nearest neighbors in x and y directions)
                              ! 1st and 2nd dimensions = (x,y) indices
                              !
                              !    Auu  | Auv
                              !    _____|____
                              !    Avu  | Avv
                              !         |

    real(dp), dimension(nx-1,ny-1), intent(in) ::  &
       bu, bv              ! assembled load (rhs) vector, divided into 2 parts

   real(dp), dimension(nx-1,ny-1), intent(in) ::   &
       uvel, vvel          ! u and v components of velocity (m/yr)

    real(dp), dimension(nx-1,ny-1), intent(out) ::   &
       resid_u,      & ! residual vector, divided into 2 parts
       resid_v

    real(dp), intent(out) ::    &
       L2_norm             ! L2 norm of residual vector, |Ax - b|

    real(dp), intent(out), optional ::    &
       L2_norm_relative    ! L2 norm of residual vector relative to rhs, |Ax - b| / |b|

    real(dp), dimension(nx-1,ny-1) ::  &
       resid_sq            ! resid_u^2 + resid_v^2

    real(dp) :: my_max_resid, global_max_resid

    integer :: i, j, iA, jA, m, iglobal, jglobal

    real(dp) :: L2_norm_rhs   ! L2 norm of rhs vector, |b|

    integer :: &
         staggered_ilo, staggered_ihi, &  ! bounds of locally owned vertices on staggered grid
         staggered_jlo, staggered_jhi

    staggered_ilo = parallel%staggered_ilo
    staggered_ihi = parallel%staggered_ihi
    staggered_jlo = parallel%staggered_jlo
    staggered_jhi = parallel%staggered_jhi

    ! Compute u and v components of A*x

    resid_u(:,:) = 0.d0
    resid_v(:,:) = 0.d0

    ! Loop over locally owned vertices
    do jA = -1,1
       do iA = -1,1
          m = indxA_2d(iA,jA)
          do j = staggered_jlo, staggered_jhi
             do i = staggered_ilo, staggered_ihi
                if (active_vertex(i,j)) then
                   if ( (i+iA >= 1 .and. i+iA <= nx-1)    &
                                   .and.                     &
                        (j+jA >= 1 .and. j+jA <= ny-1) ) then
                      resid_u(i,j) = resid_u(i,j)                     &
                                   + Auu(i,j,m)*uvel(i+iA,j+jA)  &
                                   + Auv(i,j,m)*vvel(i+iA,j+jA)
                      resid_v(i,j) = resid_v(i,j)                     &
                                   + Avu(i,j,m)*uvel(i+iA,j+jA)  &
                                   + Avv(i,j,m)*vvel(i+iA,j+jA)
                   endif   ! in bounds
                endif   ! active_vertex
             enddo   ! i
          enddo   ! j
       enddo   ! iA
    enddo      ! jA

    ! Subtract b to get A*x - b
    ! Sum up squared L2 norm as we go

    L2_norm = 0.d0
    resid_sq(:,:) = 0.0d0

    ! Loop over locally owned vertices

    do j = staggered_jlo, staggered_jhi
    do i = staggered_ilo, staggered_ihi
       if (active_vertex(i,j)) then
          resid_u(i,j) = resid_u(i,j) - bu(i,j)
          resid_v(i,j) = resid_v(i,j) - bv(i,j)
          resid_sq(i,j) = resid_u(i,j)*resid_u(i,j) + resid_v(i,j)*resid_v(i,j)
          L2_norm = L2_norm + resid_sq(i,j)
       endif     ! active vertex
    enddo        ! i
    enddo        ! j

    ! Take global sum, then take square root

    L2_norm = parallel_reduce_sum(L2_norm)
    L2_norm = sqrt(L2_norm)

    if (verbose_residual) then

       if (this_rank==rtest) then
          i = itest
          j = jtest
          write(iulog,*) 'In compute_residual_vector_2d: task, i, j =', this_rank, i, j
          write(iulog, '(a16, 2f13.7, 2e13.5)') &
               '  u, v, ru, rv: ', uvel(i,j), vvel(i,j), resid_u(i,j), resid_v(i,j)
       endif

       ! Compute max value of (squared) residual on this task.
       ! If this task owns the vertex with the global max residual, then print a diagnostic message.
       my_max_resid = maxval(resid_sq)
       global_max_resid = parallel_reduce_max(my_max_resid)

       if (abs((my_max_resid - global_max_resid)/global_max_resid) < 1.0d-6) then
          do j = staggered_jlo, staggered_jhi
             do i = staggered_ilo, staggered_ihi
                if (abs((resid_sq(i,j) - global_max_resid)/global_max_resid) < 1.0d-6) then
                   call parallel_globalindex(i, j, iglobal, jglobal, parallel)
                   write(iulog, '(a24, 2i6, 2e13.5, e16.8)') 'ig, jg, ru, rv, rmax:', &
                        iglobal, jglobal, resid_u(i,j), resid_v(i,j), sqrt(global_max_resid)
                   write(iulog,*) ' '
                endif
             enddo
          enddo
       endif

    endif  ! verbose_residual

    if (present(L2_norm_relative)) then   ! compute L2_norm relative to rhs

       L2_norm_rhs = 0.d0

       do j = staggered_jlo, staggered_jhi
       do i = staggered_ilo, staggered_ihi
          if (active_vertex(i,j)) then
             L2_norm_rhs = L2_norm_rhs + bu(i,j)*bu(i,j) + bv(i,j)*bv(i,j)
          endif     ! active vertex
       enddo        ! i
       enddo        ! j

       ! Take global sum, then take square root
       L2_norm_rhs = parallel_reduce_sum(L2_norm_rhs)
       L2_norm_rhs = sqrt(L2_norm_rhs)

       if (L2_norm_rhs > 0.d0) then
          L2_norm_relative = L2_norm / L2_norm_rhs
       else
          L2_norm_relative = 0.d0
       endif

    endif

  end subroutine compute_residual_vector_2d

!****************************************************************************

  subroutine evaluate_accelerated_picard_3d(&
       L2_norm,       L2_norm_large,        &
       L2_norm_alpha_sav,                   &
       alpha_accel,   alpha_accel_max,      &
       gamma_accel,   resid_reduction_threshold,  &
       uvel,          vvel,                 &
       Auu,           Auv,                  &
       Avu,           Avv,                  &
       uvel_old,      vvel_old,             &
       duvel,         dvvel,                &
       uvel_sav,      vvel_sav,             &
       Auu_sav,       Auv_sav,              &
       Avu_sav,       Avv_sav,              &
       beta_internal, beta_internal_sav,    &
       assembly_is_done)

    real(dp), intent(in) :: &
         L2_norm,                &  ! latest value of L2 norm of residual
         L2_norm_large,          &  ! large value for re-initializing the L2 norm
         gamma_accel,            &  ! how much to increase alpha_accel for each attempt to extend the solution vector
         alpha_accel_max,        &  ! max allowed value of alpha_accel
         resid_reduction_threshold  ! threshold for deciding whether to increase alpha_accel again

    real(dp), intent(inout) :: &
         alpha_accel,            &  ! factor for extending the vector (duvel, dvvel) to reduce the residual
         L2_norm_alpha_sav          ! value of L2 norm of residual, given the previous alpha_accel

    real(dp), dimension(:,:), intent(inout) ::  &
         beta_internal,                & ! beta_internal as a function of uvel_2d and vvel_2d
         beta_internal_sav               ! beta_internal as a function of uvel_2d_sav and vvel_2d_sav

    real(dp), dimension(:,:,:), intent(inout) ::  &
         uvel,        vvel,            & ! latest guess for the velocity solution
         uvel_old,    vvel_old,        & ! velocity solution from previous nonlinear iteration
         duvel,       dvvel,           & ! difference between old velocity solution and latest solution
         uvel_sav,    vvel_sav           ! best velocity solution so far, based on the residual norm

    real(dp), dimension(:,:,:,:), intent(inout) ::  &
         Auu,         Auv,             & ! latest assembled matrices as a function of uvel_2d and vvel_2d
         Avu,         Avv,             &
         Auu_sav,     Auv_sav,         & ! assembled matrices as a function of uvel_2d_sav and vvel_2d_sav
         Avu_sav,     Avv_sav

    logical, intent(inout) :: &
         assembly_is_done                ! if true, then accept the current assembled matrices and proceed to solution

    if (L2_norm < resid_reduction_threshold*L2_norm_alpha_sav .and. &
         alpha_accel + gamma_accel <= alpha_accel_max) then

       ! The residual norm decreased substantially, relative to the previous value.
       ! ("Substantially" is defined by the factor resid_reduction_threshold < 1.)

       ! Save the latest values of the solver inputs

       uvel_sav = uvel
       vvel_sav = vvel
       Auu_sav = Auu
       Auv_sav = Auv
       Avu_sav = Avu
       Avv_sav = Avv
       beta_internal_sav = beta_internal

       ! Increase alpha_accel and see if the residual keeps getting smaller.
       ! If not, we will back off to the saved values above.
       alpha_accel = alpha_accel + gamma_accel
       L2_norm_alpha_sav = L2_norm

       if (verbose_picard .and. main_task) then
          write(iulog,*) 'Keep going, alpha =', alpha_accel
       endif

       ! Since assembly_is_done = F, we now return to the start of the loop:
       ! do while (.not.assembly_is_done)

    elseif (L2_norm < L2_norm_alpha_sav) then

       ! The residual norm decreased only a little (or we have reached alpha_accel_max).
       ! Call it good and move on to the solver.

       if (verbose_picard .and. main_task) then
          write(iulog,*) 'Hold, alpha =', alpha_accel
       endif

       ! Save this velocity as the starting point for the next nonlinear iteration
       uvel_old = uvel
       vvel_old = vvel

       ! Reset alpha_accel and L2_norm_alpha_sav for the next nonlinear iteration
       alpha_accel = 1.0d0
       L2_norm_alpha_sav = L2_norm_large

       ! proceed to the matrix solution
       assembly_is_done = .true.

    else

       ! The residual is larger than the previous value.
       ! Switch back to the previously saved velocity and matrix with the lower residual.
       uvel = uvel_sav
       vvel = vvel_sav
       Auu = Auu_sav
       Auv = Auv_sav
       Avu = Avu_sav
       Avv = Avv_sav
       beta_internal = beta_internal_sav

       ! Save this velocity as the starting point for the next nonlinear iteration
       uvel_old = uvel
       vvel_old = vvel

       if (verbose_picard .and. main_task) then
          write(iulog,*) 'Back up to alpha =', alpha_accel - gamma_accel
          write(iulog,*) 'Continue to matrix solver'
       endif

       ! Reset alpha_accel and L2_norm_alpha_sav for the next nonlinear iteration
       alpha_accel = 1.0d0
       L2_norm_alpha_sav = L2_norm_large

       ! proceed to the matrix solution
       assembly_is_done = .true.

    endif  ! L2_norm of residual has reduced

  end subroutine evaluate_accelerated_picard_3d

!****************************************************************************

  subroutine evaluate_accelerated_picard_2d(&
       nx,            ny,                   &
       L2_norm,       L2_norm_large,        &
       L2_norm_alpha_sav,                   &
       alpha_accel,   alpha_accel_max,      &
       gamma_accel,   resid_reduction_threshold,  &
       uvel_2d,       vvel_2d,              &
       Auu_2d,        Auv_2d,               &
       Avu_2d,        Avv_2d,               &
       uvel_2d_old,   vvel_2d_old,          &
       duvel_2d,      dvvel_2d,             &
       uvel_2d_sav,   vvel_2d_sav,          &
       Auu_2d_sav,    Auv_2d_sav,           &
       Avu_2d_sav,    Avv_2d_sav,           &
       beta_internal, beta_internal_sav,    &
       assembly_is_done)

    integer, intent(in) :: &
         nx,  ny                    ! number of grid cells in each direction

    real(dp), intent(in) :: &
         L2_norm,                &  ! latest value of L2 norm of residual
         L2_norm_large,          &  ! large value for re-initializing the L2 norm
         gamma_accel,            &  ! how much to increase alpha_accel for each attempt to extend the solution vector
         alpha_accel_max,        &  ! max allowed value of alpha_accel
         resid_reduction_threshold  ! threshold for deciding whether to increase alpha_accel again

    real(dp), intent(inout) :: &
         alpha_accel,            &  ! factor for extending the vector (duvel, dvvel) to reduce the residual
         L2_norm_alpha_sav          ! value of L2 norm of residual, given the previous alpha_accel

    real(dp), dimension(nx-1,ny-1), intent(inout) ::  &
         uvel_2d,     vvel_2d,         & ! latest guess for the velocity solution
         uvel_2d_old, vvel_2d_old,     & ! velocity solution from previous nonlinear iteration
         duvel_2d,    dvvel_2d,        & ! difference between old velocity solution and latest solution
         uvel_2d_sav, vvel_2d_sav,     & ! best velocity solution so far, based on the residual norm
         beta_internal,                & ! beta_internal as a function of uvel_2d and vvel_2d
         beta_internal_sav               ! beta_internal as a function of uvel_2d_sav and vvel_2d_sav

    real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) ::  &
         Auu_2d,      Auv_2d,          & ! latest assembled matrices as a function of uvel_2d and vvel_2d
         Avu_2d,      Avv_2d,          &
         Auu_2d_sav,  Auv_2d_sav,      & ! assembled matrices as a function of uvel_2d_sav and vvel_2d_sav
         Avu_2d_sav,  Avv_2d_sav

    logical, intent(inout) :: &
         assembly_is_done                ! if true, then accept the current assembled matrices and proceed to solution

    if (L2_norm < resid_reduction_threshold*L2_norm_alpha_sav .and. &
         alpha_accel + gamma_accel <= alpha_accel_max) then

       ! The residual norm decreased substantially, relative to the previous value.
       ! ("Substantially" is defined by the factor resid_reduction_threshold < 1.)

       ! Save the latest values of the solver inputs

       uvel_2d_sav = uvel_2d
       vvel_2d_sav = vvel_2d
       Auu_2d_sav = Auu_2d
       Auv_2d_sav = Auv_2d
       Avu_2d_sav = Avu_2d
       Avv_2d_sav = Avv_2d
       beta_internal_sav = beta_internal

       ! Increase alpha_accel and see if the residual keeps getting smaller.
       ! If not, we will back off to the saved values above.
       alpha_accel = alpha_accel + gamma_accel
       L2_norm_alpha_sav = L2_norm

       if (verbose_picard .and. main_task) then
          write(iulog,*) 'Keep going, alpha =', alpha_accel
       endif

       ! Since assembly_is_done = F, we now return to the start of the loop:
       ! do while (.not.assembly_is_done)

    elseif (L2_norm < L2_norm_alpha_sav) then

       ! The residual norm decreased only a little (or we have reached alpha_accel_max).
       ! Call it good and move on to the solver.

       if (verbose_picard .and. main_task) then
          write(iulog,*) 'Hold, alpha =', alpha_accel
       endif

       ! Save this velocity as the starting point for the next nonlinear iteration
       uvel_2d_old = uvel_2d
       vvel_2d_old = vvel_2d

       ! Reset alpha_accel and L2_norm_alpha_sav for the next nonlinear iteration
       alpha_accel = 1.0d0
       L2_norm_alpha_sav = L2_norm_large

       ! proceed to the matrix solution
       assembly_is_done = .true.

    else

       ! The residual is larger than the previous value.
       ! Switch back to the previously saved velocity and matrix with the lower residual.
       uvel_2d = uvel_2d_sav
       vvel_2d = vvel_2d_sav
       Auu_2d = Auu_2d_sav
       Auv_2d = Auv_2d_sav
       Avu_2d = Avu_2d_sav
       Avv_2d = Avv_2d_sav
       beta_internal = beta_internal_sav

       ! Save this velocity as the starting point for the next nonlinear iteration
       uvel_2d_old = uvel_2d
       vvel_2d_old = vvel_2d

       if (verbose_picard .and. main_task) then
          write(iulog,*) 'Back up to alpha =', alpha_accel - gamma_accel
          write(iulog,*) 'Continue to matrix solver'
       endif

       ! Reset alpha_accel and L2_norm_alpha_sav for the next nonlinear iteration
       alpha_accel = 1.0d0
       L2_norm_alpha_sav = L2_norm_large

       ! proceed to the matrix solution
       assembly_is_done = .true.

    endif  ! L2_norm of residual has reduced

  end subroutine evaluate_accelerated_picard_2d

!****************************************************************************

  subroutine compute_residual_velocity_3d(whichresid,    parallel,      &
                                          uvel,          vvel,          &
                                          usav,          vsav,          &
                                          resid_velo)

    integer, intent(in) ::   &
       whichresid             ! option for method to use when calculating residual

    type(parallel_type), intent(in) :: &
         parallel             ! info for parallel communication

    real(dp), dimension(:,:,:), intent(in) ::  &
       uvel, vvel,      & ! current guess for velocity
       usav, vsav         ! previous guess for velocity

    real(dp), intent(out) ::    &
       resid_velo         ! quantity related to velocity convergence

    integer ::   &
       imaxdiff, jmaxdiff, kmaxdiff   ! location of maximum speed difference
                                      ! currently computed but not used
 
    integer :: i, j, k, count

    real(dp) ::   &
       speed,      &   ! current guess for ice speed
       oldspeed,   &   ! previous guess for ice speed
       diffspeed       ! abs(speed-oldspeed)

    integer :: &
         staggered_ilo, staggered_ihi, &  ! bounds of locally owned vertices on staggered grid
         staggered_jlo, staggered_jhi

    staggered_ilo = parallel%staggered_ilo
    staggered_ihi = parallel%staggered_ihi
    staggered_jlo = parallel%staggered_jlo
    staggered_jhi = parallel%staggered_jhi

    ! Compute a residual quantity based on convergence of the velocity field.
    !TODO - Remove some of these velocity residual methods?  They are rarely if ever used.

    ! options for residual calculation method, as specified in configuration file
    ! case(0): use max of abs( vel_old - vel ) / vel )
    ! case(1): use max of abs( vel_old - vel ) / vel ) but ignore basal vels
    ! case(2): use mean of abs( vel_old - vel ) / vel )
    ! case(3): use max of abs( vel_old - vel ) / vel ) (in addition to L2 norm)
    
    resid_velo = 0.d0
    imaxdiff = 0
    jmaxdiff = 0
    kmaxdiff = 0

    select case (whichresid)

    case(HO_RESID_MAXU_NO_UBAS)   ! max speed difference, excluding the bed

       ! Loop over locally owned vertices

       do j = staggered_jlo, staggered_jhi
          do i = staggered_ilo, staggered_ihi
             do k = 1, size(uvel,1) - 1         ! ignore bed velocity
                speed = sqrt(uvel(k,i,j)**2 + vvel(k,i,j)**2)
                if (speed /= 0.d0) then
                   oldspeed = sqrt(usav(k,i,j)**2 + vsav(k,i,j)**2)
                   diffspeed = abs((oldspeed - speed)/speed)
                   if (diffspeed > resid_velo) then
                      resid_velo = diffspeed
                      imaxdiff = i
                      jmaxdiff = j
                      kmaxdiff = k
                   endif
                endif
             enddo
          enddo
       enddo
       
       ! take global max
       resid_velo = parallel_reduce_max(resid_velo)

    case(HO_RESID_MEANU)   ! mean relative speed difference

       count = 0

       ! Loop over locally owned vertices

       do j = staggered_jlo, staggered_jhi
          do i = staggered_ilo, staggered_ihi
             do k = 1, size(uvel,1) - 1         ! ignore bed velocity
                speed = sqrt(uvel(k,i,j)**2 + vvel(k,i,j)**2)
                if (speed /= 0.d0) then
                   count = count+1
                   oldspeed = sqrt(usav(k,i,j)**2 + vsav(k,i,j)**2)
                   diffspeed = abs((oldspeed - speed)/speed)                
                   resid_velo = resid_velo + diffspeed
                endif
             enddo
          enddo
       enddo

       if (count > 0) resid_velo = resid_velo / count

       !TODO - Need to convert the mean residual to a global value.
       !       (Or simply remove this case, which is rarely if ever used)
       call not_parallel(__FILE__, __LINE__)

   case default    ! max speed difference, including basal speeds
                   ! (case HO_RESID_MAXU or HO_RESID_L2NORM or HO_RESID_L2NORM_RELATIVE)

       ! Loop over locally owned vertices

       do j = staggered_jlo, staggered_jhi
          do i = staggered_ilo, staggered_ihi
             do k = 1, size(uvel,1)
                speed = sqrt(uvel(k,i,j)**2 + vvel(k,i,j)**2)
                if (speed /= 0.d0) then
                   oldspeed = sqrt(usav(k,i,j)**2 + vsav(k,i,j)**2)
                   diffspeed = abs((oldspeed - speed)/speed)
                   if (diffspeed > resid_velo) then
                      resid_velo = diffspeed
                      imaxdiff = i
                      jmaxdiff = j
                      kmaxdiff = k
                   endif
                endif
             enddo
          enddo
       enddo

       resid_velo = parallel_reduce_max(resid_velo)
       
    end select

  end subroutine compute_residual_velocity_3d

!****************************************************************************

  subroutine compute_residual_velocity_2d(whichresid,    parallel,    &
                                          uvel,          vvel,        &
                                          usav,          vsav,        &
                                          resid_velo)

    integer, intent(in) ::   &
         whichresid         ! option for method to use when calculating residual

    type(parallel_type), intent(in) :: &
         parallel             ! info for parallel communication

    real(dp), dimension(:,:), intent(in) ::  &
         uvel, vvel,      & ! current guess for velocity
         usav, vsav         ! previous guess for velocity

    real(dp), intent(out) ::    &
         resid_velo         ! quantity related to velocity convergence

    integer ::   &
         imaxdiff, jmaxdiff   ! location of maximum speed difference
                              ! currently computed but not used
 
    integer :: i, j, count

    real(dp) ::   &
         speed,      &   ! current guess for ice speed
         oldspeed,   &   ! previous guess for ice speed
         diffspeed       ! abs(speed-oldspeed)

    integer :: &
         staggered_ilo, staggered_ihi, &  ! bounds of locally owned vertices on staggered grid
         staggered_jlo, staggered_jhi

    staggered_ilo = parallel%staggered_ilo
    staggered_ihi = parallel%staggered_ihi
    staggered_jlo = parallel%staggered_jlo
    staggered_jhi = parallel%staggered_jhi

    ! Compute a residual quantity based on convergence of the velocity field.

    ! options for residual calculation method, as specified in configuration file
    ! case(0): use max of abs( vel_old - vel ) / vel )
    ! case(1): use max of abs( vel_old - vel ) / vel ) but ignore basal vels
    ! case(2): use mean of abs( vel_old - vel ) / vel )
    ! case(3): use max of abs( vel_old - vel ) / vel ) (in addition to L2 norm)
    
    resid_velo = 0.d0
    imaxdiff = 0
    jmaxdiff = 0

    select case (whichresid)

    case(HO_RESID_MAXU_NO_UBAS)   ! max speed difference, excluding the bed

       ! Loop over locally owned vertices

       do j = staggered_jlo, staggered_jhi
          do i = staggered_ilo, staggered_ihi
             speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2)
             if (speed /= 0.d0) then
                oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2)
                diffspeed = abs((oldspeed - speed)/speed)
                if (diffspeed > resid_velo) then
                   resid_velo = diffspeed
                   imaxdiff = i
                   jmaxdiff = j
                endif
             endif
          enddo
       enddo
       
       ! take global max
       resid_velo = parallel_reduce_max(resid_velo)

    case(HO_RESID_MEANU)   ! mean relative speed difference

       count = 0

       ! Loop over locally owned vertices

       do j = staggered_jlo, staggered_jhi
          do i = staggered_ilo, staggered_ihi
             speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2)
             if (speed /= 0.d0) then
                count = count+1
                oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2)
                diffspeed = abs((oldspeed - speed)/speed)                
                resid_velo = resid_velo + diffspeed
             endif
          enddo
       enddo

       if (count > 0) resid_velo = resid_velo / count

       !TODO - Need to convert the mean residual to a global value.
       !       (Or simply remove this case, which is rarely if ever used)
       call not_parallel(__FILE__, __LINE__)

   case default    ! max speed difference, including basal speeds
                   ! (case HO_RESID_MAXU or HO_RESID_L2NORM)

       ! Loop over locally owned vertices

       do j = staggered_jlo, staggered_jhi
          do i = staggered_ilo, staggered_ihi
             speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2)
             if (speed /= 0.d0) then
                oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2)
                diffspeed = abs((oldspeed - speed)/speed)
                if (diffspeed > resid_velo) then
                   resid_velo = diffspeed
                   imaxdiff = i
                   jmaxdiff = j
                endif
             endif
          enddo
       enddo

       resid_velo = parallel_reduce_max(resid_velo)
       
    end select

  end subroutine compute_residual_velocity_2d

!****************************************************************************

  subroutine count_nonzeros_3d(nx,    ny,     nz,      &
                               parallel,               &
                               Auu,           Auv,     &
                               Avu,           Avv,     &
                               active_vertex,          &
                               nNonzeros)

    !----------------------------------------------------------------
    ! Input-output arguments
    !----------------------------------------------------------------

    integer, intent(in) ::   &
         nx,  ny,           & ! number of grid cells in each direction
         nz                   ! number of vertical levels where velocity is computed

    type(parallel_type), intent(in) :: &
         parallel             ! info for parallel communication

    real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) ::  &
         Auu, Auv,    &     ! assembled stiffness matrix, divided into 4 parts
         Avu, Avv                                    

    logical, dimension(nx-1,ny-1), intent(in) :: &
         active_vertex      ! true for vertices of active cells

    integer, intent(out) ::   &
         nNonzeros          ! number of nonzero matrix elements

    !----------------------------------------------------------------
    ! Local variables
    !----------------------------------------------------------------

    integer :: i, j, k, iA, jA, kA, m

    integer :: &
         staggered_ilo, staggered_ihi, &  ! bounds of locally owned vertices on staggered grid
         staggered_jlo, staggered_jhi

    staggered_ilo = parallel%staggered_ilo
    staggered_ihi = parallel%staggered_ihi
    staggered_jlo = parallel%staggered_jlo
    staggered_jhi = parallel%staggered_jhi

    nNonzeros = 0
    do j = staggered_jlo, staggered_jhi
       do i = staggered_ilo, staggered_ihi
          if (active_vertex(i,j)) then
             do k = 1, nz
                do kA = -1, 1
                do jA = -1, 1
                do iA = -1, 1
                   m = indxA_3d(iA,jA,kA)
                   if (Auu(m,k,i,j) /= 0.d0) nNonzeros = nNonzeros + 1
                   if (Auv(m,k,i,j) /= 0.d0) nNonzeros = nNonzeros + 1
                   if (Avu(m,k,i,j) /= 0.d0) nNonzeros = nNonzeros + 1
                   if (Avv(m,k,i,j) /= 0.d0) nNonzeros = nNonzeros + 1
                enddo 
                enddo
                enddo
             enddo  ! k
          endif     ! active_vertex
       enddo        ! i
    enddo           ! j

    nNonzeros = parallel_reduce_sum(nNonzeros)

  end subroutine count_nonzeros_3d

!****************************************************************************

  subroutine count_nonzeros_2d(nx,            ny,     &
                               parallel,              &
                               Auu,           Auv,    &
                               Avu,           Avv,    &
                               active_vertex,         &
                               nNonzeros)

    !----------------------------------------------------------------
    ! Input-output arguments
    !----------------------------------------------------------------

    integer, intent(in) ::   &
       nx, ny             ! number of grid cells in each direction

    type(parallel_type), intent(in) :: &
         parallel             ! info for parallel communication

    real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) ::  &
       Auu, Auv,    &     ! assembled stiffness matrix, divided into 4 parts
       Avu, Avv                                    

    logical, dimension(nx-1,ny-1), intent(in) :: &
       active_vertex      ! true for vertices of active cells

    integer, intent(out) ::   &
       nNonzeros          ! number of nonzero matrix elements

    !----------------------------------------------------------------
    ! Local variables
    !----------------------------------------------------------------

    integer :: i, j, m

    integer :: &
         staggered_ilo, staggered_ihi, &  ! bounds of locally owned vertices on staggered grid
         staggered_jlo, staggered_jhi

    staggered_ilo = parallel%staggered_ilo
    staggered_ihi = parallel%staggered_ihi
    staggered_jlo = parallel%staggered_jlo
    staggered_jhi = parallel%staggered_jhi

    nNonzeros = 0

    do m = 1, nNodeNeighbors_2d
       do j = staggered_jlo, staggered_jhi
          do i = staggered_ilo, staggered_ihi
             if (active_vertex(i,j)) then
                if (Auu(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1
                if (Auv(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1
                if (Avu(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1
                if (Avv(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1
             endif     ! active_vertex
          enddo        ! i
       enddo           ! j
    enddo              ! m

    nNonzeros = parallel_reduce_sum(nNonzeros)

  end subroutine count_nonzeros_2d

!****************************************************************************

  subroutine check_symmetry_element_matrix(nNodesPerElement,  &
                                           Kuu, Kuv, Kvu, Kvv)

    !------------------------------------------------------------------
    ! Check that the element stiffness matrix is symmetric.
    ! This is true provided that (1) Kuu = (Kuu)^T
    !                            (2) Kvv = (Kvv)^T
    !                            (3) Kuv = (Kvu)^T
    ! This subroutine works for either 2D or 3D elements.
    ! A symmetry check should not be needed for production runs with a well-tested code,
    !  but is included for now to help with debugging.
    !------------------------------------------------------------------

    integer, intent(in) :: nNodesPerElement  ! number of nodes per element

    real(dp), dimension(nNodesPerElement, nNodesPerElement), intent(in) ::   &
             Kuu, Kuv, Kvu, Kvv     ! component of element stiffness matrix
                                    !
                                    !    Kuu  | Kuv
                                    !    _____|____          
                                    !    Kvu  | Kvv
                                    !         |

    integer :: i, j

    ! make sure Kuu = (Kuu)^T

    do j = 1, nNodesPerElement
       do i = j, nNodesPerElement
          if (abs(Kuu(i,j) - Kuu(j,i)) > eps10) then
             write(iulog,*) 'Kuu is not symmetric'
             write(iulog,*) 'i, j, Kuu(i,j), Kuu(j,i):', i, j, Kuu(i,j), Kuu(j,i)
             stop
          endif    
       enddo
    enddo

    ! check that Kvv = (Kvv)^T

    do j = 1, nNodesPerElement
       do i = j, nNodesPerElement
          if (abs(Kvv(i,j) - Kvv(j,i)) > eps10) then
             write(iulog,*) 'Kvv is not symmetric'
             write(iulog,*) 'i, j, Kvv(i,j), Kvv(j,i):', i, j, Kvv(i,j), Kvv(j,i)
             stop
          endif    
       enddo
    enddo

    ! Check that Kuv = (Kvu)^T

    do j = 1, nNodesPerElement
       do i = 1, nNodesPerElement
          if (abs(Kuv(i,j) - Kvu(j,i)) > eps10) then
             write(iulog,*) 'Kuv /= (Kvu)^T'
             write(iulog,*) 'i, j, Kuv(i,j), Kvu(j,i):', i, j, Kuv(i,j), Kvu(j,i)
             stop
          endif    
       enddo
    enddo

  end subroutine check_symmetry_element_matrix

!****************************************************************************

  subroutine check_symmetry_assembled_matrix_3d(nx,    ny,     nz,      &
                                                parallel,               &
                                                active_vertex,          &
                                                Auu, Auv, Avu, Avv)

    !------------------------------------------------------------------
    ! Check that the assembled stiffness matrix is symmetric.
    ! This is true provided that (1) Auu = (Auu)^T
    !                            (2) Avv = (Avv)^T
    !                            (3) Auv = (Avu)^T
    ! The A matrices are assembled in a dense fashion to save storage
    !  and preserve the i/j/k structure of the grid.
    !
    ! There can be small differences from perfect symmetry due to roundoff error.
    ! These differences are fixed provided they are small enough.
    !------------------------------------------------------------------    

    integer, intent(in) ::   &
         nx, ny,        &   ! horizontal grid dimensions
         nz                 ! number of vertical levels where velocity is computed

    type(parallel_type), intent(in) :: &
         parallel             ! info for parallel communication

    logical, dimension(nx-1,ny-1), intent(in) ::   &
         active_vertex            ! T for columns (i,j) where velocity is computed, else F

    real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(inout) ::   &
         Auu, Auv, Avu, Avv       ! components of assembled stiffness matrix
                                  !
                                  !    Auu  | Auv
                                  !    _____|____
                                  !         |
                                  !    Avu  | Avv

    integer :: i, j, k, iA, jA, kA, m, mm
    integer :: iglobal, jglobal

    real(dp) :: val1, val2          ! values of matrix coefficients

    real(dp) :: maxdiff, diag_entry, avg_val

    integer :: &
         staggered_ilo, staggered_ihi, &  ! bounds of locally owned vertices on staggered grid
         staggered_jlo, staggered_jhi

    staggered_ilo = parallel%staggered_ilo
    staggered_ihi = parallel%staggered_ihi
    staggered_jlo = parallel%staggered_jlo
    staggered_jhi = parallel%staggered_jhi

    ! Check matrix for symmetry

    ! Here we correct for small differences from symmetry due to roundoff error.
    ! The maximum departure from symmetry is set to be a small fraction 
    !  of the diagonal entry for the row.
    ! If the departure from symmetry is larger than this, then the model prints a warning 
    !  and/or aborts.

    maxdiff = 0.d0

    ! Loop over locally owned vertices.
    ! Each active vertex is associate with 2*nz matrix rows belonging to this processor.

    do j = staggered_jlo, staggered_jhi
       do i = staggered_ilo, staggered_ihi
          if (active_vertex(i,j)) then
             do k = 1, nz

                ! Check Auu and Auv for symmetry

                m = indxA_3d(0,0,0)
                diag_entry = Auu(m,k,i,j)

                !WHL - debug
                if (diag_entry /= diag_entry) then
                   write(iulog,*) 'Diagonal NaN: k, i, j =', k, i, j
                   call parallel_globalindex(i, j, iglobal, jglobal, parallel)
                   write(iulog,*) '   iglobal, jglobal:', iglobal, jglobal
!!                   stop
                endif

                do jA = -1, 1
                do iA = -1, 1
                do kA = -1, 1

                   if (k+kA >= 1 .and. k+kA <=nz) then  ! to keep k index in bounds

                      m =  indxA_3d( iA, jA, kA)
                      mm = indxA_3d(-iA,-jA,-kA)

                      ! Check that Auu = Auu^T

                      val1 = Auu( m, k,    i,    j   )   ! value of Auu(row,col)
                      val2 = Auu(mm, k+kA, i+iA, j+jA)   ! value of Auu(col,row)

                      if (val2 /= val1) then
                          
                         if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1)

                         ! if difference is small, then fix the asymmetry by averaging values
                         ! else print a warning and abort

                         if ( abs(val2-val1) < eps08*abs(diag_entry) ) then
                            avg_val = 0.5d0 * (val1 + val2)
                            Auu( m, k,   i,   j   ) = avg_val
                            Auu(mm, k+kA,i+iA,j+jA) = avg_val
                         else
                            write(iulog,*) 'WARNING: Auu is not symmetric: this_rank, i, j, k, iA, jA, kA =', &
                                 this_rank, i, j, k, iA, jA, kA
                            write(iulog,*) 'Auu(row,col), Auu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry
                            call parallel_globalindex(i, j, iglobal, jglobal, parallel)
                            write(iulog,*) '   iglobal, jglobal:', iglobal, jglobal
!!                            stop
                         endif

                      endif   ! val2 /= val1
                
                      ! Check that Auv = (Avu)^T

                      val1 = Auv( m, k,    i,    j)      ! value of Auv(row,col)
                      val2 = Avu(mm, k+kA, i+iA, j+jA)   ! value of Avu(col,row)

                      if (val2 /= val1) then

                         if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1)

                         ! if difference is small, then fix the asymmetry by averaging values
                         ! else print a warning and abort

                         if ( abs(val2-val1) < eps08*abs(diag_entry) ) then
                            avg_val = 0.5d0 * (val1 + val2)
                            Auv( m, k,   i,   j   ) = avg_val
                            Avu(mm, k+kA,i+iA,j+jA) = avg_val
                         else
                            write(iulog,*) 'WARNING: Auv is not equal to (Avu)^T, this_rank, i, j, k, iA, jA, kA =', &
                                 this_rank, i, j, k, iA, jA, kA
                            write(iulog,*) 'Auv(row,col), Avu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry
                            call parallel_globalindex(i, j, iglobal, jglobal, parallel)
                            write(iulog,*) '   iglobal, jglobal:', iglobal, jglobal
!!                            stop
                         endif

                      endif  ! val2 /= val1

                   endif     ! k+kA in bounds
            
                enddo        ! kA
                enddo        ! iA
                enddo        ! jA

                ! Now check Avu and Avv

                m = indxA_3d(0,0,0)
                diag_entry = Avv(m,k,i,j)

                !WHL - debug
                if (diag_entry /= diag_entry) then
                   write(iulog,*) 'WARNING: Diagonal NaN: k, i, j =', k, i, j
                   call parallel_globalindex(i, j, iglobal, jglobal, parallel)
                   write(iulog,*) '   iglobal, jglobal:', iglobal, jglobal
!!                   stop
                endif

                ! check that Avv = (Avv)^T

                do jA = -1, 1
                do iA = -1, 1
                do kA = -1, 1

                   if (k+kA >= 1 .and. k+kA <=nz) then  ! to keep k index in bounds

                      m  = indxA_3d( iA, jA, kA)
                      mm = indxA_3d(-iA,-jA,-kA)

                      val1 = Avv( m, k,    i,    j)      ! value of Avv(row,col)
                      val2 = Avv(mm, k+kA, i+iA, j+jA)   ! value of Avv(col,row)

                      if (val2 /= val1) then
                          
                         if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1)

                         ! if difference is small, then fix the asymmetry by averaging values
                         ! else print a warning and abort

                         if ( abs(val2-val1) < eps08*abs(diag_entry) ) then
                            avg_val = 0.5d0 * (val1 + val2)
                            Avv( m, k,   i,   j   ) = avg_val
                            Avv(mm, k+kA,i+iA,j+jA) = avg_val
                         else
                            write(iulog,*) 'WARNING: Avv is not symmetric: this_rank, i, j, k, iA, jA, kA =', &
                                 this_rank, i, j, k, iA, jA, kA
                            write(iulog,*) 'Avv(row,col), Avv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry
                            call parallel_globalindex(i, j, iglobal, jglobal, parallel)
                            write(iulog,*) '   iglobal, jglobal:', iglobal, jglobal
!!                            stop
                         endif

                      endif   ! val2 /= val1

                      ! Check that Avu = (Auv)^T

                      val1 = Avu( m, k,    i,    j)      ! value of Avu(row,col)
                      val2 = Auv(mm, k+kA, i+iA, j+jA)   ! value of Auv(col,row)

                      if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1)

                      if (val2 /= val1) then

                         ! if difference is small, then fix the asymmetry by averaging values
                         ! else print a warning and abort

                         if ( abs(val2-val1) < eps08*abs(diag_entry) ) then
                            avg_val = 0.5d0 * (val1 + val2)
                            Avu( m, k,   i,   j   ) = avg_val
                            Auv(mm, k+kA,i+iA,j+jA) = avg_val
                         else
                            write(iulog,*) 'WARNING: Avu is not equal to (Auv)^T, this_rank, i, j, k, iA, jA, kA =', &
                                 this_rank, i, j, k, iA, jA, kA
                            write(iulog,*) 'Avu(row,col), Auv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry
                            call parallel_globalindex(i, j, iglobal, jglobal, parallel)
                            write(iulog,*) '   iglobal, jglobal:', iglobal, jglobal
!!                            stop
                         endif

                      endif  ! val2 /= val1

                   endif     ! k+kA in bounds

                enddo        ! kA
                enddo        ! iA
                enddo        ! jA

             enddo     ! k
          endif        ! active_vertex
       enddo           ! i
    enddo              ! j

    if (verbose_matrix) maxdiff = parallel_reduce_max(maxdiff)

    if (verbose_matrix .and. main_task) then
       write(iulog,*) ' '
       write(iulog,*) 'Max difference from symmetry =', maxdiff
    endif

  end subroutine check_symmetry_assembled_matrix_3d

!****************************************************************************

  subroutine check_symmetry_assembled_matrix_2d(nx,            ny,       &
                                                parallel,                &
                                                active_vertex,           &
                                                Auu, Auv, Avu, Avv)

    !------------------------------------------------------------------
    ! Check that the assembled stiffness matrix is symmetric.
    ! This is true provided that (1) Auu = (Auu)^T
    !                            (2) Avv = (Avv)^T
    !                            (3) Auv = (Avu)^T
    ! The A matrices are assembled in a dense fashion to save storage
    !  and preserve the i/j/k structure of the grid.
    !
    ! There can be small differences from perfect symmetry due to roundoff error.
    ! These differences are fixed provided they are small enough.
    !------------------------------------------------------------------    

    integer, intent(in) ::   &
       nx, ny                   ! horizontal grid dimensions

    type(parallel_type), intent(in) :: &
         parallel             ! info for parallel communication

    logical, dimension(nx-1,ny-1), intent(in) ::   &
       active_vertex            ! T for columns (i,j) where velocity is computed, else F

    real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) ::   &
       Auu, Auv, Avu, Avv       ! components of assembled stiffness matrix
                                !
                                !    Auu  | Auv
                                !    _____|____          
                                !         |
                                !    Avu  | Avv                                    

    integer :: i, j, iA, jA, m, mm, iglobal, jglobal

    real(dp) :: val1, val2          ! values of matrix coefficients

    real(dp) :: maxdiff, diag_entry, avg_val

    integer :: &
         staggered_ilo, staggered_ihi, &  ! bounds of locally owned vertices on staggered grid
         staggered_jlo, staggered_jhi

    staggered_ilo = parallel%staggered_ilo
    staggered_ihi = parallel%staggered_ihi
    staggered_jlo = parallel%staggered_jlo
    staggered_jhi = parallel%staggered_jhi

    ! Check matrix for symmetry

    ! Here we correct for small differences from symmetry due to roundoff error.
    ! The maximum departure from symmetry is set to be a small fraction
    !  of the diagonal entry for the row.
    ! If the departure from symmetry is larger than this, then the model prints a warning 
    !  and/or aborts.

    maxdiff = 0.d0

    ! Loop over locally owned vertices.
    ! Each active vertex is associate with 2*nz matrix rows belonging to this processor.

    do jA = -1, 1
    do iA = -1, 1
       m =  indxA_2d( iA, jA)
       mm = indxA_2d(-iA,-jA)

       do j = staggered_jlo, staggered_jhi
          do i = staggered_ilo, staggered_ihi
             if (active_vertex(i,j)) then

                ! Check Auu and Auv for symmetry
                diag_entry = Auu(i,j,indxA_2d(0,0))

                !WHL - debug
                if (diag_entry /= diag_entry) then
                   write(iulog,*) 'WARNING: Diagonal NaN: i, j =', i, j
                   call parallel_globalindex(i, j, iglobal, jglobal, parallel)
                   write(iulog,*) '   iglobal, jglobal:', iglobal, jglobal
!!                   stop
                endif

                ! Check that Auu = Auu^T
                val1 = Auu(i,    j,    m )   ! value of Auu(row,col)
                val2 = Auu(i+iA, j+jA, mm)   ! value of Auu(col,row)
                if (val2 /= val1) then
                   if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1)
                   ! if difference is small, then fix the asymmetry by averaging values
                   ! else print a warning and abort
                   if ( abs(val2-val1) < eps08*abs(diag_entry) ) then
                      avg_val = 0.5d0 * (val1 + val2)
                      Auu(i,    j,    m ) = avg_val
                      Auu(i+iA, j+jA, mm) = avg_val
                   else
                      write(iulog,*) 'WARNING: Auu is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA
                      write(iulog,*) 'Auu(row,col), Auu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry
                      call parallel_globalindex(i, j, iglobal, jglobal, parallel)
                      write(iulog,*) '   iglobal, jglobal:', iglobal, jglobal
!!                      stop
                   endif
                endif   ! val2 /= val1

                ! Check that Auv = (Avu)^T
                val1 = Auv(i,    j,    m )   ! value of Auv(row,col)
                val2 = Avu(i+iA, j+jA, mm)   ! value of Avu(col,row)
                if (val2 /= val1) then
                   if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1)
                   ! if difference is small, then fix the asymmetry by averaging values
                   ! else print a warning and abort
                   if ( abs(val2-val1) < eps08*abs(diag_entry) ) then
                      avg_val = 0.5d0 * (val1 + val2)
                      Auv(i,    j,    m ) = avg_val
                      Avu(i+iA, j+jA, mm) = avg_val
                   else
                      write(iulog,*) 'WARNING: Auv is not equal to (Avu)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA
                      write(iulog,*) 'Auv(row,col), Avu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry
                      call parallel_globalindex(i, j, iglobal, jglobal, parallel)
                      write(iulog,*) '   iglobal, jglobal:', iglobal, jglobal
!!                      stop
                   endif
                endif  ! val2 /= val1

                ! Now check Avu and Avv
                diag_entry = Avv(i,j,indxA_2d(0,0))

                ! check that Avv = (Avv)^T
                val1 = Avv(i,    j,    m )   ! value of Avv(row,col)
                val2 = Avv(i+iA, j+jA, mm)   ! value of Avv(col,row)

                if (val2 /= val1) then
                   if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1)

                   ! if difference is small, then fix the asymmetry by averaging values
                   ! else print a warning and abort
                   if ( abs(val2-val1) < eps08*abs(diag_entry) ) then
                      avg_val = 0.5d0 * (val1 + val2)
                      Avv(i,    j,    m ) = avg_val
                      Avv(i+iA, j+jA, mm) = avg_val
                   else
                      write(iulog,*) 'WARNING: Avv is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA
                      write(iulog,*) 'Avv(row,col), Avv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry
                      call parallel_globalindex(i, j, iglobal, jglobal, parallel)
                      write(iulog,*) '   iglobal, jglobal:', iglobal, jglobal
!!                      stop
                   endif

                endif   ! val2 /= val1

                ! Check that Avu = (Auv)^T
                val1 = Avu(i,    j,    m )   ! value of Avu(row,col)
                val2 = Auv(i+iA, j+jA, mm)   ! value of Auv(col,row)

                if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1)

                if (val2 /= val1) then

                   ! if difference is small, then fix the asymmetry by averaging values
                   ! else print a warning and abort
                   if ( abs(val2-val1) < eps08*abs(diag_entry) ) then
                      avg_val = 0.5d0 * (val1 + val2)
                      Avu(i,    j,    m ) = avg_val
                      Auv(i+iA, j+jA, mm) = avg_val
                   else
                      write(iulog,*) 'WARNING: Avu is not equal to (Auv)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA
                      write(iulog,*) 'Avu(row,col), Auv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry
                      call parallel_globalindex(i, j, iglobal, jglobal, parallel)
                      write(iulog,*) '   iglobal, jglobal:', iglobal, jglobal
!!                      stop
                   endif

                endif  ! val2 /= val1

             endif        ! active_vertex
          enddo           ! i
       enddo              ! j
    enddo     ! iA
    enddo     ! jA

    if (verbose_matrix) maxdiff = parallel_reduce_max(maxdiff)

    if (verbose_matrix .and. main_task) then
       write(iulog,*) ' '
       write(iulog,*) 'Max difference from symmetry =', maxdiff
    endif

  end subroutine check_symmetry_assembled_matrix_2d

!****************************************************************************

  subroutine write_matrix_elements_3d(nx,    ny,   nz,     &
                                      nNodesSolve, nodeID, &
                                      iNodeIndex,  jNodeIndex,  &
                                      kNodeIndex,          &
                                      Auu,         Auv,    &
                                      Avu,         Avv,    &
                                      bu,          bv)

    ! Write matrix elements to text files.
    ! Note: Does not work when running on more than one task.

    integer, intent(in) ::   &
       nx, ny,               &  ! horizontal grid dimensions
       nz,                   &  ! number of vertical levels at which velocity is computed
       nNodesSolve              ! number of nodes where we solve for velocity

    integer, dimension(nz,nx-1,ny-1), intent(in) ::  &
       nodeID             ! ID for each node

    integer, dimension(:), intent(in) ::   &
       iNodeIndex, jNodeIndex, kNodeIndex   ! i, j and k indices of active nodes

    real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) ::  &
       Auu, Auv,    &     ! assembled stiffness matrix, divided into 4 parts
       Avu, Avv           ! 1st dimension = node and its nearest neighbors in x, y and z direction 
                          ! other dimensions = (k,i,j) indices

    real(dp), dimension(nz,nx-1,ny-1), intent(in) ::  &
       bu, bv             ! assembled load (rhs) vector, divided into 2 parts

    ! Local variables

    integer :: rowA, colA
    integer :: i, j, k, m, iA, jA, kA

    real(dp), dimension(nNodesSolve, nNodesSolve) ::   &
       Auu_val, Auv_val, Avu_val, Avv_val   ! dense matrices

    real(dp), dimension(nNodesSolve) :: nonzeros

    if (tasks > 1) then
       call write_log('Error: Cannot write matrix elements to files when tasks > 1', GM_FATAL)
    endif

    Auu_val(:,:) = 0.d0
    Auv_val(:,:) = 0.d0
    Avu_val(:,:) = 0.d0
    Avv_val(:,:) = 0.d0

    do rowA = 1, nNodesSolve

       i = iNodeIndex(rowA)
       j = jNodeIndex(rowA)
       k = kNodeIndex(rowA)

       do kA = -1, 1
       do jA = -1, 1
       do iA = -1, 1

          if ( (k+kA >= 1 .and. k+kA <= nz)         &
                          .and.                     &
               (i+iA >= 1 .and. i+iA <= nx-1)       &
                          .and.                     &
               (j+jA >= 1 .and. j+jA <= ny-1) ) then

             colA = nodeID(k+kA, i+iA, j+jA)   ! ID for neighboring node
             m = indxA_3d(iA,jA,kA)

             if (colA > 0) then 
                Auu_val(rowA, colA) = Auu(m,k,i,j)
                Auv_val(rowA, colA) = Auv(m,k,i,j)
                Avu_val(rowA, colA) = Avu(m,k,i,j)
                Avv_val(rowA, colA) = Avv(m,k,i,j)
             endif

          endif     ! i+iA, j+jA, and k+kA in bounds

       enddo        ! kA
       enddo        ! iA
       enddo        ! jA

    enddo           ! rowA 

    !WHL - bug check
    write(iulog,*) ' '
    write(iulog,*) 'nonzeros per row:'
    do rowA = 1, nNodesSolve
       nonzeros(rowA) = 0
       do colA = 1, nNodesSolve
          if (abs(Auu_val(rowA,colA)) > 1.d-11) then
             nonzeros(rowA) = nonzeros(rowA) + 1
          endif
       enddo
!       write(iulog,*) rowA, nonzeros(rowA)
    enddo

    write(iulog,*) 'Write matrix elements to file, label =', matrix_label

    ! Write matrices to file (one line of file corresponding to each row of matrix)

    open(unit=10, file='Auu.'//matrix_label, status='unknown')
    open(unit=11, file='Auv.'//matrix_label, status='unknown')
    open(unit=12, file='Avu.'//matrix_label, status='unknown')
    open(unit=13, file='Avv.'//matrix_label, status='unknown')

    do rowA = 1, nNodesSolve
       write(10,'(i6)',advance='no') rowA
       write(11,'(i6)',advance='no') rowA
       write(12,'(i6)',advance='no') rowA
       write(13,'(i6)',advance='no') rowA
       do colA = 1, nNodesSolve
          write(10,'(e16.8)',advance='no') Auu_val(rowA,colA)
          write(11,'(e16.8)',advance='no') Auv_val(rowA,colA)
          write(12,'(e16.8)',advance='no') Avu_val(rowA,colA)
          write(13,'(e16.8)',advance='no') Avv_val(rowA,colA)
       enddo
       write(10,*) ' '
       write(11,*) ' '
       write(12,*) ' '
       write(13,*) ' '
    enddo

    close(10)
    close(11)
    close(12)
    close(13)

    write(iulog,*) 'Done writing matrix elements'

    ! write load vectors to file
    open(unit=14, file='bu.'//matrix_label, status='unknown')
    open(unit=15, file='bv.'//matrix_label, status='unknown')
    do rowA = 1, nNodesSolve
       i = iNodeIndex(rowA)
       j = jNodeIndex(rowA)
       k = kNodeIndex(rowA)
       write(14,'(i6, e16.8)') rowA, bu(k,i,j)
       write(15,'(i6, e16.8)') rowA, bv(k,i,j)
    enddo
    close(14)
    close(15)

  end subroutine write_matrix_elements_3d
  
!****************************************************************************

  subroutine write_matrix_elements_2d(nx,             ny,            &
                                      nVerticesSolve, vertexID,      &
                                      iVertexIndex,   jVertexIndex,  &
                                      Auu,            Auv,           &
                                      Avu,            Avv,           &
                                      bu,             bv)

    ! Write matrix elements to text files.
    ! Note: Does not work when running on more than one task.

    integer, intent(in) ::   &
       nx, ny,               &  ! horizontal grid dimensions
       nVerticesSolve           ! number of vertices where we solve for velocity

    integer, dimension(nx-1,ny-1), intent(in) ::  &
       vertexID             ! ID for each vertex

    integer, dimension(:), intent(in) ::   &
       iVertexIndex, jVertexIndex   ! i and j indices of active vertices

    real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) ::  &
       Auu, Auv,    &     ! assembled stiffness matrix, divided into 4 parts
       Avu, Avv           ! 1st dimension = vertex and its nearest neighbors in x and y direction 
                          ! other dimensions = (i,j) indices

    real(dp), dimension(nx-1,ny-1), intent(in) ::  &
       bu, bv             ! assembled load (rhs) vector, divided into 2 parts

    ! Local variables

    integer :: rowA, colA
    integer :: i, j, m, iA, jA

    real(dp), dimension(nVerticesSolve, nVerticesSolve) ::   &
       Auu_val, Auv_val, Avu_val, Avv_val   ! dense matrices

    real(dp), dimension(nVerticesSolve) :: nonzeros

    if (tasks > 1) then
       call write_log('Error: Cannot write matrix elements to files when tasks > 1', GM_FATAL)
    endif

    Auu_val(:,:) = 0.d0
    Auv_val(:,:) = 0.d0
    Avu_val(:,:) = 0.d0
    Avv_val(:,:) = 0.d0

    do rowA = 1, nVerticesSolve

       i = iVertexIndex(rowA)
       j = jVertexIndex(rowA)
       do jA = -1, 1
       do iA = -1, 1

          if ( (i+iA >= 1 .and. i+iA <= nx-1)       &
                          .and.                     &
               (j+jA >= 1 .and. j+jA <= ny-1) ) then

             colA = vertexID(i+iA, j+jA)   ! ID for neighboring vertex
             m = indxA_2d(iA,jA)

             if (colA > 0) then 
                Auu_val(rowA, colA) = Auu(i,j,m)
                Auv_val(rowA, colA) = Auv(i,j,m)
                Avu_val(rowA, colA) = Avu(i,j,m)
                Avv_val(rowA, colA) = Avv(i,j,m)
             endif

          endif     ! i+iA and j+jA in bounds

       enddo        ! iA
       enddo        ! jA

    enddo           ! rowA 

    !WHL - bug check
    write(iulog,*) ' '
    write(iulog,*) 'nonzeros per row:'
    do rowA = 1, nVerticesSolve
       nonzeros(rowA) = 0
       do colA = 1, nVerticesSolve
          if (abs(Auu_val(rowA,colA)) > 1.d-11) then
             nonzeros(rowA) = nonzeros(rowA) + 1
          endif
       enddo
!       write(iulog,*) rowA, nonzeros(rowA)
    enddo

    write(iulog,*) 'Write matrix elements to file, label =', matrix_label

    ! Write matrices to file (one line of file corresponding to each row of matrix)

    open(unit=10, file='Auu.'//matrix_label, status='unknown')
    open(unit=11, file='Auv.'//matrix_label, status='unknown')
    open(unit=12, file='Avu.'//matrix_label, status='unknown')
    open(unit=13, file='Avv.'//matrix_label, status='unknown')

    do rowA = 1, nVerticesSolve
       write(10,'(i6)',advance='no') rowA
       write(11,'(i6)',advance='no') rowA
       write(12,'(i6)',advance='no') rowA
       write(13,'(i6)',advance='no') rowA
       do colA = 1, nVerticesSolve
          write(10,'(e16.8)',advance='no') Auu_val(rowA,colA)
          write(11,'(e16.8)',advance='no') Auv_val(rowA,colA)
          write(12,'(e16.8)',advance='no') Avu_val(rowA,colA)
          write(13,'(e16.8)',advance='no') Avv_val(rowA,colA)
       enddo
       write(10,*) ' '
       write(11,*) ' '
       write(12,*) ' '
       write(13,*) ' '
    enddo

    close(10)
    close(11)
    close(12)
    close(13)

    write(iulog,*) 'Done writing matrix elements'

    ! write load vectors to file
    open(unit=14, file='bu.'//matrix_label, status='unknown')
    open(unit=15, file='bv.'//matrix_label, status='unknown')
    do rowA = 1, nVerticesSolve
       i = iVertexIndex(rowA)
       j = jVertexIndex(rowA)
       write(14,'(i6, e16.8)') rowA, bu(i,j)
       write(15,'(i6, e16.8)') rowA, bv(i,j)
    enddo
    close(14)
    close(15)

  end subroutine write_matrix_elements_2d

!****************************************************************************
  !TODO - Either delete this subroutine, or switch the indices.  Not currently used.
  subroutine compress_3d_to_2d(nx,        ny,      nz,  &
                               Auu,       Auv,          &
                               Avu,       Avv,          &
                               bu,        bv,           &
                               Auu_2d,    Auv_2d,       &
                               Avu_2d,    Avv_2d,       &
                               bu_2d,     bv_2d)

    !----------------------------------------------------------------
    ! Form the 2D matrix and rhs by combining terms from the 3D matrix and rhs.
    ! This combination is based on the assumption of no vertical shear;
    !  i.e., uvel and vvel have the same value at each level in a given column.
    !----------------------------------------------------------------

    !----------------------------------------------------------------
    ! Input-output arguments
    !----------------------------------------------------------------

    integer, intent(in) ::   &
       nx, ny,               &  ! horizontal grid dimensions
       nz                       ! number of vertical levels where velocity is computed

    real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) ::  &
       Auu, Auv,    &     ! assembled 3D stiffness matrix, divided into 4 parts
       Avu, Avv           
                          
    real(dp), dimension(nz,nx-1,ny-1), intent(in) ::  &
       bu, bv             ! assembled 3D rhs vector, divided into 2 parts
                          
    real(dp), dimension(nNodeNeighbors_2d,nx-1,ny-1), intent(out) ::  &
       Auu_2d, Auv_2d,   &! assembled 2D (SSA) stiffness matrix, divided into 4 parts
       Avu_2d, Avv_2d           
                          
    real(dp), dimension(nx-1,ny-1), intent(out) ::  &
       bu_2d, bv_2d       ! assembled 2D (SSA) rhs vector, divided into 2 parts
                          
    !----------------------------------------------------------------
    ! Local variables
    !----------------------------------------------------------------

    integer :: i, j, k, iA, jA, kA, m, m2

    ! Initialize 2D matrix and rhs

    Auu_2d(:,:,:) = 0.d0
    Auv_2d(:,:,:) = 0.d0
    Avu_2d(:,:,:) = 0.d0
    Avv_2d(:,:,:) = 0.d0
    bu_2d(:,:) = 0.d0
    bv_2d(:,:) = 0.d0

    ! Form 2D matrix and rhs

    do j = 1, ny-1
       do i = 1, nx-1
          do k = 1, nz

             ! matrix
             do kA = -1,1
                do jA = -1,1
                   do iA = -1,1
                      m = indxA_3d(iA,jA,kA)
                      m2 = indxA_2d(iA,jA)
                      Auu_2d(m2,i,j) = Auu_2d(m2,i,j) + Auu(m,k,i,j)
                      Auv_2d(m2,i,j) = Auv_2d(m2,i,j) + Auv(m,k,i,j)
                      Avu_2d(m2,i,j) = Avu_2d(m2,i,j) + Avu(m,k,i,j)
                      Avv_2d(m2,i,j) = Avv_2d(m2,i,j) + Avv(m,k,i,j)
                   enddo   ! iA
                enddo      ! jA
             enddo         ! kA

             ! rhs
             bu_2d(i,j) = bu_2d(i,j) + bu(k,i,j)
             bv_2d(i,j) = bv_2d(i,j) + bv(k,i,j)

          enddo            ! k
       enddo               ! i
    enddo                  ! j

  end subroutine compress_3d_to_2d

!****************************************************************************

  end module glissade_velo_higher

!****************************************************************************
