Ladhall timings
                       (work in progress)
       From: original table sent by P. Prelovsek 99/02/11


SUMMARY
    The best performance on the original program is obtained with
    "f90 -r8000 -mips4 -64 -O3 -r8" (144 sec). This is about 30% faster
    than "f90 -O2 -r8" (193 sec). "f77 -Ofast=ip21 -r8" achieves 161 sec.
    

PROGRAM
    Small memory footprint (total 24 MB, resident about 3 MB).
    In the main loop, the if statement that tests the sign is .true.
    in only 2520 cases and .false. in 17808, which suggests that a
    symmetric if construct (ladhall_m2_bench.f, Appendix C) may not
    be beneficial. However, once the loops and floating point operations
    are distributed, the topology of the if construct is no longer important.


MACHINES
    saturn:    SGI Power Challenge, 75 MHz R8000 (300 MFLOPS)
    dune:      Sun Enterprise 4500, 336 MHz


Table I.  Ladhall timings. RF is the floating point efficiency, i.e. MFLOPS
divided by the 300 MFLOPS peak of saturn (integer operations not counted);
the actual efficiency is larger, but we do not know the theoretical MINTOPS
needed to calculate it. Parameters for case "b" are shown in Appendices A
and B. Parameters for case "a" are unknown.
------------------------------------------------------------------------
          case  time    rsize   FP op.    INT op.  MFLOPS  MINTOPS   RF
                 [s]     [MB]  count *1  count *1
                               [10**6]   [10**6]
------------------------------------------------------------------------
saturn *2   a    466      3     12219     16728       26     36     0.09

       *2   b    193      3      6065      8306       31     43     0.10
       *3        231
       *4        144             6162      8027       43            0.14
       *5        144

       *11       219
       *12       177
       *13       180
       *14       775 
       *14a      789
       *14b      183
       *15       161                                  38            0.13
       *15a      172

       *16       176
       *17       276
       *18       276

       *21       170
       *22       244

       *23       241
       *24       175

       *31       142
       *32       142
       
       *41       137             6065                 44            0.15

       *51       202
       *52       227
       *53       202
       *54       223
       *55       155
       *56       169
       *57       178

       *61       169
       *62       169
       *63       225
------------------------------------------------------------------------
dune   *a   b    171
------------------------------------------------------------------------
*1  From pixie/prof operation count.
*2  f90 -O2 -r8.
*3  f90 -r8000 -mips4 -64 -O2 -r8.
*4  f90 -r8000 -mips4 -64 -O3 -r8 (the speedup versus *2 is due to -O3;
    compare *3, which uses the same flags with -O2).
*5  f90 -r8000 -mips4 -64 -O3 -r8 -pfa -WK,-p=1.
*11 f77 -O3 -r8.
*12 f77 -r8000 -mips4 -64 -O3 -r8.
*13 f77 -r8000 -mips4 -64 -O3 -r8 -pfa -WK,-p=1.
*14 f77 -r8000 -mips4 -64 -O3 -LNO:ou=4:cs1=16k:cs2=4m -TENV:X=4 
    -pfa -WK,-so=3,-ro=3,-o=5 -WK,-p=1.
*14a f77 -r8000 -mips4 -64 -O3 -LNO:cs1=16k:cs2=4m -TENV:X=4 
    -pfa -WK,-so=3,-ro=3,-o=5 -WK,-p=1.
*14b f77 -r8000 -mips4 -64 -O3 -TENV:X=4.
*15 f77 -Ofast=ip21 -r8.
*15a f77 -Ofast=ip21 -r8 -LNO:cs1=16k:cs2=4m -TENV:X=4 
     -pfa -WK,-so=3,-ro=3,-o=5 -WK,-p=1.
*16 f77 -Ofast=ip21 -r8 -lcomplib.sgimath (TQL2 from precompiled library,
    PYTHAG from source file).
*17  f77 -r8000 -mips4 -64 -O2 -r8.
*18  f77 -r8000 -mips4 -64 -O2 -r8 -pfa -WK,-p=1.
*21  Modified main loop: ladhall_m1_bench.f (Appendix C),
     "f90 -r8000 -mips4 -64 -O3 -r8".
*22  As *21, but "f77 -Ofast=ip21 -r8".
*23  Modified main loop: ladhall_m2_bench.f (Appendix C),
     "f77 -Ofast=ip21 -r8" (*8).
*24  As *23, but "f90 -r8000 -mips4 -64 -O2 -r8".
*31  Modified main loop: ladhall_m2a_bench.f (Appendix C),
     f90 -r8000 -mips4 -64 -O3 -r8.
*32  As *31, but ladhall_m2b_bench.f.
*41  ladhall_m3_bench.f (index array hamfli transposed).
*51  ladhall_m4_bench.f (index array hamfli transposed, two sets of DO-loops,
     one for storing indices, the other for calculations),
     "f90 -r8000 -mips4 -64 -O3 -r8".
*52  As *51, but "f77 -Ofast=ip21 -r8".
*53  As *51, but INTEGER*4 temporary array (instead of INTEGER*1). Same speed
     (ladhall_m4a_bench.f).
*54  As *51, but the "if" statement moved into the second (multiplication) 
     loop (ladhall_m4b_bench.f).
*55  As *51, but multiplications distributed between loops 
     (ladhall_m4c_bench.f).
*56  As *55, but a different distribution (ladhall_m4c1_bench.f).
*57  As *56, but with an if-then-else construct. Now that the loops and FP
     operations are distributed, the topology of the if is no longer important
     (ladhall_m4c2_bench.f).
*61  Only the inner loop and FP ops are distributed, one-dimensional 
     temporary array (ladhall_m5_bench.f).
*62  As *61, but "f90 -r8000 -mips4 -64 -O3 -r8 -pfa -WK,-p=1".
*63  As *62, but "f77 -r8000 -mips4 -64 -O3 -r8 -pfa -WK,-p=1".
*a  f90 -fast.


Runs and source modifications performed by R. Krivec


APPENDIX A.  Sample output for case "b", dune.

  nx =  6  ny =   2  N =  12  Nh =   1  nud =  0  mq =  0

  nth =   16  ith1,2 =   0   8  iter =  15   lstmax =  80
 t,t1 =  -1.00  -1.00  Jmin, max =   0.20   1.00  ncj =   5
  alpha = 1.000 gamma =  1.000
 cb =  0.0500  del =  0.01000  dth0 =  0.01000

 S_z loop, nu=5
    Total configurations : 5544
    Parent configurations: 924
    Degenerated parents  : 0


  cj, cj1=  0.20  0.20 cth0=  0.28580  en00=   -4.2146    rhall =  0.151
  edd=   -0.680  ett=   0.9514  etdb=  -0.0980  edb=  -0.1068
  del0= -0.0078 etd0=   0.0000 eddt=   0.2179  etdb1=  -0.1322  rhall1=  0.2043

  cj, cj1=  0.40  0.40 cth0=  0.29301  en00=   -5.9237    rhall = -0.312
  edd=   -0.912  ett=   0.6320  etdb=   0.1799  edb=  -0.3113
  del0= -0.0171 etd0=   0.0000 eddt=   1.2638  etdb1=  -0.2515  rhall1=  0.4364

  cj, cj1=  0.60  0.60 cth0=  0.29375  en00=   -7.6978    rhall = -0.512
  edd=   -0.971  ett=   0.5860  etdb=   0.2917  edb=  -0.3973
  del0= -0.0204 etd0=   0.0000 eddt=   1.7681  etdb1=  -0.4314  rhall1=  0.7578

  cj, cj1=  0.80  0.80 cth0=  0.28996  en00=   -9.5049    rhall = -0.490
  edd=   -1.005  ett=   0.6053  etdb=   0.2979  edb=  -0.4381
  del0= -0.0218 etd0=   0.0000 eddt=   2.0349  etdb1=  -0.5892  rhall1=  0.9686

  cj, cj1=  1.00  1.00 cth0=  0.28524  en00=  -11.3324    rhall = -0.342
  edd=   -1.029  ett=   0.6353  etdb=   0.2233  edb=  -0.4530
  del0= -0.0220 etd0=   0.0000 eddt=   2.1472  etdb1=  -0.7222  rhall1=  1.1050
TIME: Total    170.87317


APPENDIX B.  Sample output for case "b", saturn.

  nx =  6  ny =   2  N =  12  Nh =   1  nud =  0  mq =  0

  nth =   16  ith1,2 =   0   8  iter =  15   lstmax =  80
 t,t1 =  -1.00  -1.00  Jmin, max =   0.20   1.00  ncj =   5
  alpha = 1.000 gamma =  1.000
 cb =  0.0500  del =  0.01000  dth0 =  0.01000

 S_z loop, nu=           5
    Total configurations :         5544
    Parent configurations:          924
    Degenerated parents  :            0


  cj, cj1=  0.20  0.20 cth0=  0.28581  en00=   -4.2146    rhall =  0.152
  edd=   -0.680  ett=   0.9514  etdb=  -0.0983  edb=  -0.1068
  del0= -0.0078 etd0=   0.0000 eddt=   0.2177  etdb1=  -0.1324  rhall1=  0.2047

  cj, cj1=  0.40  0.40 cth0=  0.29301  en00=   -5.9237    rhall = -0.312
  edd=   -0.912  ett=   0.6320  etdb=   0.1799  edb=  -0.3113
  del0= -0.0171 etd0=   0.0000 eddt=   1.2639  etdb1=  -0.2515  rhall1=  0.4364

  cj, cj1=  0.60  0.60 cth0=  0.29375  en00=   -7.6978    rhall = -0.512
  edd=   -0.971  ett=   0.5860  etdb=   0.2916  edb=  -0.3973
  del0= -0.0204 etd0=   0.0000 eddt=   1.7680  etdb1=  -0.4314  rhall1=  0.7578

  cj, cj1=  0.80  0.80 cth0=  0.28996  en00=   -9.5049    rhall = -0.490
  edd=   -1.005  ett=   0.6053  etdb=   0.2979  edb=  -0.4381
  del0= -0.0218 etd0=   0.0000 eddt=   2.0350  etdb1=  -0.5892  rhall1=  0.9687

  cj, cj1=  1.00  1.00 cth0=  0.28522  en00=  -11.3324    rhall = -0.342
  edd=   -1.029  ett=   0.6353  etdb=   0.2234  edb=  -0.4530
  del0= -0.0220 etd0=   0.0000 eddt=   2.1483  etdb1=  -0.7224  rhall1=  1.1051
TIME: Total    192.67506


APPENDIX C.  Critical sections of the original ladhall and of the modified
versions.

ladhall_bench.f (original):
      isig=isign(1,hamfli(i,l))
      cjjj=cjj
      if( isig.eq.-1 ) cjjj=cjj1
      psi1=psi1+phi(ib,1)*irelm(ia)*cjjj*dnf(ib)

ladhall_m1_bench.f:
      if( isign(1,hamfli(i,l)) .eq. -1 ) then
          psi1=psi1+phi(ib,1)*irelm(ia)*cjj1*dnf(ib)
      else
          psi1=psi1+phi(ib,1)*irelm(ia)*cjj*dnf(ib)
      endif

ladhall_m2_bench.f:
      zzz = phi(ib,1)*irelm(ia)*dnf(ib)
      if( isign(1,hamfli(i,l)) .eq.-1 ) then
          psi1 = psi1 + zzz * cjj1
      else
          psi1 = psi1 + zzz * cjj
      endif

ladhall_m2a_bench.f:
      zzz = phi(ib,1)*irelm(ia)*dnf(ib)
      cjjj = cjj
      if( isign(1,hamfli(i,l)) .eq.-1 ) cjjj = cjj1
      psi1 = psi1 + zzz * cjjj

ladhall_m2b_bench.f:
      zzz = irelm(ia)*phi(ib,1)*dnf(ib)
      cjjj = cjj
      if( isign(1,hamfli(i,l)) .eq.-1 ) cjjj = cjj1
      psi1 = psi1 + zzz * cjjj

ladhall_m3_bench.f (as original, but transposed hamfli):
      isig=isign(1,hamfli(l,i))
      cjjj=cjj
      if( isig.eq.-1 ) cjjj=cjj1
      psi1=psi1+phi(ib,1)*irelm(ia)*cjjj*dnf(ib)

ladhall_m4_bench.f (as original, but transposed hamfli, and two sets of DO 
loops):
      c16xxx(l,i)   = phi(ib,1)
      i1xxxx(l,i)   = irelm(ia)
      r8xxxx(1,l,i) = dnf(ib)
      ...
      psi1 = psi1 + c16xxx(l,i) *
     &    r8xxxx(1,l,i) * r8xxxx(2,l,i) * i1xxxx(l,i) 

ladhall_m4c1_bench.f (FP ops distributed between loops):
      cjj=cj*gamma/2.d0
      cjj1=cj1*gamma/2.d0
c     rk: transposed hamfli
      do i=1,np
        if (dnf(i).gt.1.d-8) then
          do l=1,nsf
            ia=iabs(hamfli(l,i))
            ib=irelp(ia)
            i4xxxx(l,i) = isign(1,hamfli(l,i))
            c16xxx(l,i) = phi(ib,1) * dnf(ib) * irelm(ia)
          enddo
        endif
      enddo
      do i=1,np
        if (dnf(i).gt.1.d-8) then
          psi1=cmplx(0.d0,0.d0)
          do l=1,nsf
            cjjj=cjj
            if( i4xxxx(l,i) .eq.-1 ) cjjj=cjj1
            psi1 = psi1 + c16xxx(l,i) * cjjj
          enddo
          psi(i)=psi(i)+psi1/dnf(i)
        endif
      enddo

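ladhall_m5_bench.f (NOTE: as excerpted, isig is a scalar assigned in the first
l-loop, so the second l-loop tests only its last value, from l=nsf, for every
l; verify against the actual source file):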
      cjj=cj*gamma/2.d0
      cjj1=cj1*gamma/2.d0
c     rk: transposed hamfli
      do i=1,np
        if (dnf(i).gt.1.d-8) then
          do l=1,nsf
            ia=iabs(hamfli(l,i))
            isig=isign(1,hamfli(l,i))
            ib=irelp(ia)
            c16xxx(l) = phi(ib,1) * dnf(ib) * irelm(ia)
          enddo
          psi1=cmplx(0.d0,0.d0)
          do l=1,nsf
            if( isig.eq.-1 ) then
                psi1 = psi1 + c16xxx(l) * cjj1
            else
                psi1 = psi1 + c16xxx(l) * cjj
            endif
          enddo
          psi(i)=psi(i)+psi1/dnf(i)
        endif
      enddo