MIDAPACK - MIcrowave Data Analysis PACKage  1.1b
Parallel software tools for high performance CMB DA analysis
Application example

Here is an application example of least squares problem resolution, implemented in test_pcg_mapmat.c. Considering a problem formulated as $ A^t A x = A^t b $, the solution x can be computed iteratively using the conjugate gradient method. Instead of computing and storing the whole $A^tA$ matrix, we successively apply the pointing product, $A^t$, and the unpointing product, $A$, at each iteration.
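The normal matrix $A^tA$ is therefore never formed explicitly: applying it to an overlapped-domain vector amounts to one unpointing product ($A$) followed by one pointing product ($A^t$). Below is a minimal sketch of this composition, using the MatVecProd and TrMatVecProd routines from the full example further down (the helper name apply_AtA and its caller-provided scratch buffer are illustrative, not part of the library):

//y = A^t A x without forming A^t A
//x, y: overlapped-domain vectors (length A->lcount)
//scratch: caller-provided distributed-domain buffer (length m)
void apply_AtA(Mat *A, double *x, double *y, double *scratch)
{
  MatVecProd(A, x, scratch, 0);   //scratch = A x (unpointing, distributed domain)
  TrMatVecProd(A, scratch, y, 0); //y = A^t (A x) (pointing, overlapped domain)
}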

The classic conjugate gradient algorithm has been slightly modified: as explained above, $A^t$ and $A$ are applied successively. Furthermore, the dot product operations have been moved from the overlapped domain to the distributed domain, using the relation $< A^t y, x > = < y , Ax >$.
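For instance, the squared norm of the gradient $g = A^t(Ax-b)$, an overlapped-domain quantity, can be evaluated from distributed-domain vectors only: $ ||g||^2 = < A^t(Ax-b), g > = < Ax-b, Ag > $. This is exactly how the residual resnew is computed in the code below.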

The algorithm needs 6 vectors in memory:

  • 3 in the overlapped domain (the solution x, the gradient g, and the direction d), each of length A.lcount,
  • 3 in the distributed domain (Ax-b, Ag, Ad), each of length m.

The complexity, counting the operations performed at each iteration, is detailed as follows (see the dot product sketch after this list):

  • 4 dot products in the time domain (communication: one MPI_Allreduce each),
  • 3 axpy operations in the map domain (no communication),
  • 3 multiplications by $A$ (no communication),
  • 1 multiplication by $A^t$ (communication: the MIDAPACK communication scheme).
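Every distributed-domain dot product in the code below follows the same pattern: a local partial sum over the m local time samples, followed by a single MPI_Allreduce. A minimal sketch of that pattern (the helper name dot_dist is illustrative, not part of the MIDAPACK API):

//distributed dot product: <u, v> summed over all processes of comm
static double dot_dist(const double *u, const double *v, int m, MPI_Comm comm)
{
  double local = 0.0, global = 0.0;
  int i;
  for(i=0; i<m; i++)                                            //local partial sum
    local += u[i]*v[i];
  MPI_Allreduce(&local, &global, 1, MPI_DOUBLE, MPI_SUM, comm); //global sum
  return global;
}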
Mat A;                                     //pointing matrix
double *x, *g, *d;                         //vectors in the overlapped domain
double *Ax_b, *Ag, *Ad;                    //vectors in the distributed domain
double alpha, beta, gamma, resold, resnew; //conjugate gradient scalars
double localreduce;                        //local partial sum for dot products
int i, j, k;
//m, nnz, indices, values, b, tol and KMAX are assumed set by the calling program
MatCreate(&A, m, nnz, MPI_COMM_WORLD);     //allocate matrix tabs
MatSetIndices(&A, m*nnz, 0, nnz, indices); //copy indices into the matrix structure
MatSetValues(&A, m*nnz, 0, nnz, values);   //copy values into the matrix structure
MatLocalShape(&A, SORT_FLAG, OMP_FLAG);    //reindex the local data structure
MatComShape(&A, COM_SCHEME_FLAG);          //build the communication pattern
//conjugate gradient initialization
//allocate vectors in the overlapped domain
x = (double *) malloc(A.lcount*sizeof(double)); //x (solution)
g = (double *) malloc(A.lcount*sizeof(double)); //g (gradient)
d = (double *) malloc(A.lcount*sizeof(double)); //d (direction)
for(j=0; j<A.lcount; j++)                       //initial guess x = 0 (any guess works)
  x[j] = 0.0;
//allocate vectors in the distributed domain
Ax_b = (double *) malloc(m*sizeof(double));     //Ax_b = Ax-b
Ad = (double *) malloc(m*sizeof(double));       //Ad = A d
Ag = (double *) malloc(m*sizeof(double));       //Ag = A g
MatVecProd(&A, x, Ax_b, 0);                     //Ax_b = Ax
for(i=0; i<m; i++)                              //Ax_b = Ax-b
  Ax_b[i] = Ax_b[i]-b[i];
TrMatVecProd(&A, Ax_b, d, 0);                   //d = A^t(Ax-b), initial direction
MatVecProd(&A, d, Ad, 0);                       //Ad = A d = A A^t(Ax-b)
resnew=0.0;                                     //initial residual, resnew = ||A^t(Ax-b)||^2 = <Ax_b, Ad>
localreduce=0.0;
for(i=0; i<m; i++)
  localreduce+=Ax_b[i]*Ad[i];
MPI_Allreduce(&localreduce, &resnew, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
//conjugate gradient iterations
for(k=0; k<KMAX; k++){                          //begin loop
  alpha=0.0;                                    //alpha = <Ad, Ax_b>
  localreduce=0.0;
  for(i=0; i<m; i++)
    localreduce+=Ad[i]*Ax_b[i];
  MPI_Allreduce(&localreduce, &alpha, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  gamma=0.0;                                    //gamma = <Ad, Ad>
  localreduce=0.0;
  for(i=0; i<m; i++)
    localreduce+=Ad[i]*Ad[i];
  MPI_Allreduce(&localreduce, &gamma, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  for(j=0; j<A.lcount; j++)                     //x = x - (alpha/gamma) d
    x[j] = x[j] - (alpha/gamma)*d[j];
  MatVecProd(&A, x, Ax_b, 0);                   //Ax_b = Ax
  for(i=0; i<m; i++)                            //Ax_b = Ax-b
    Ax_b[i] = Ax_b[i]-b[i];
  TrMatVecProd(&A, Ax_b, g, 0);                 //g = A^t(Ax-b)
  MatVecProd(&A, g, Ag, 0);                     //Ag = A A^t(Ax-b)
  resold=resnew;                                //residual, resnew = ||g||^2 = <Ax_b, Ag>
  resnew=0.0;
  localreduce=0.0;
  for(i=0; i<m; i++)
    localreduce+=Ax_b[i]*Ag[i];
  MPI_Allreduce(&localreduce, &resnew, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  beta=0.0;                                     //beta = <Ag, Ad>
  localreduce=0.0;
  for(i=0; i<m; i++)
    localreduce+=Ag[i]*Ad[i];
  MPI_Allreduce(&localreduce, &beta, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  if(resnew<tol)                                //convergence test on the squared residual norm
    break;
  for(j=0; j<A.lcount; j++)                     //d = -g + (beta/gamma) d
    d[j] = -g[j] + (beta/gamma)*d[j];
  MatVecProd(&A, d, Ad, 0);                     //Ad = A d
}
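Note that resnew holds the squared norm of the map-domain gradient, so the convergence test compares $||A^t(Ax-b)||^2$ against tol; a relative stopping criterion can be obtained by scaling tol with the initial value of resnew.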

More information about the pointing operator is detailed in the pointing function synopsis.