
function TestGPU
%TESTGPU  Compare linear-solve (A\x) timings on CPU, distributed CPU and GPU.
%
%   For each problem size N a random sparse symmetric matrix with unit
%   diagonal shift is created (sprandsym + identity) together with a
%   right-hand side of ones. The system is then solved using:
%
%       column 2 : CPU, full matrix
%       column 3 : GPU, full matrix
%       column 4 : CPU distributed, full matrix
%       column 5 : CPU, sparse matrix
%       column 6 : CPU distributed, sparse matrix
%       column 7 : GPU, sparse matrix (double precision)
%       column 8 : GPU, sparse matrix (single precision)
%
%   Column 1 of the timings table holds N. Each solve is repeated nRepeat
%   times and only the time of the LAST repetition is kept, so that the
%   earlier repetitions act as warm-up runs. The results are plotted as
%   time against problem size.
%
%   Requires Parallel Computing Toolbox (parpool, distributed, gpuArray)
%   and a supported GPU.
%
%   Conclusion on 19 April, 2019. Same conclusions on 14 Sep, 2023:
%     * GPU linsolve using sparse matrices is about 3 to 4 times slower
%       than CPU.
%     * GPU linsolve using full matrices is a bit faster than CPU.
%

%% Make sure a process-based pool with the requested number of workers exists
NumWorkers = 8;

ParPool = gcp('nocreate');

if isempty(ParPool)

    parpool('Processes', NumWorkers);

elseif ParPool.NumWorkers ~= NumWorkers

    % Existing pool has the wrong size: replace it.
    delete(ParPool)
    parpool('Processes', NumWorkers);

end

%% Experiment set-up
density = 0.01;                      % density of the random sparse matrix
nRepeat = 2;                         % repetitions per solve; last one is recorded
problemSizes = [100 1000 5000];      % e.g. also 500 2000 3000 20000

nColumns = 8;                        % N plus seven timing columns
timings = NaN(numel(problemSizes), nColumns);

% Handle to the currently selected GPU, used to synchronise before reading
% the timer (GPU operations may execute asynchronously).
gpu = gpuDevice;

for iExperiment = 1:numel(problemSizes)

    N = problemSizes(iExperiment);

    fprintf('\n \n N=%i \n ', N)
    fprintf(' Creating arrays.\n')

    % Asparse = sprand(N,N,density);
    Asparse = sprandsym(N, density);
    Asparse = Asparse + sparse(1:N, 1:N, 1);   % add identity to the diagonal
    Afull   = full(Asparse);
    x       = ones(N, 1);

    Adistfull   = distributed(Afull);
    Adistsparse = distributed(Asparse);
    xdist       = distributed(x);

    Agpufull   = gpuArray(Afull);
    Agpusparse = gpuArray(Asparse);
    xgpu       = gpuArray(x);

    AgpusparseSingle = single(Agpusparse);
    xgpuSingle       = single(xgpu);

    fprintf('A CPU distributed full. \n')
    CPUdistfull = timeLastSolve(@() Adistfull\xdist, nRepeat, []);

    % Note: MATLAB 2019a did not work with sprandsym here, only sprand.
    fprintf('A CPU distributed sparse. \n')
    CPUdistsparse = timeLastSolve(@() Adistsparse\xdist, nRepeat, []);

    fprintf('A CPU full  \n')
    CPUfull = timeLastSolve(@() Afull\x, nRepeat, []);

    fprintf('A CPU sparse \n')
    CPUsparse = timeLastSolve(@() Asparse\x, nRepeat, []);

    fprintf('A GPU \n')
    GPUfull = timeLastSolve(@() Agpufull\xgpu, nRepeat, gpu);

    fprintf('A GPU sparse \n')
    GPUsparse = timeLastSolve(@() Agpusparse\xgpu, nRepeat, gpu);

    fprintf('A GPU sparse single \n')
    GPUsparseSingle = timeLastSolve(@() AgpusparseSingle\xgpuSingle, nRepeat, gpu);

    timings(iExperiment, :) = [N, CPUfull, GPUfull, CPUdistfull, ...
                               CPUsparse, CPUdistsparse, GPUsparse, GPUsparseSingle];

end

%% Plot time against problem size
figure
plot(timings(:,1),timings(:,2),"o-r",DisplayName="CPU full")
hold on
plot(timings(:,1),timings(:,3),"x-b",DisplayName="GPU full")
plot(timings(:,1),timings(:,4),"+-g",DisplayName="CPU distributed")
plot(timings(:,1),timings(:,5),"*-c",DisplayName="CPU sparse")
plot(timings(:,1),timings(:,6),"^-m",DisplayName="CPU sparse distributed")
plot(timings(:,1),timings(:,7),"o-k",DisplayName="GPU sparse")
% Column 8 holds the single-precision GPU sparse timings.
plot(timings(:,1),timings(:,8),Marker="diamond",MarkerFaceColor="r",LineStyle="--",DisplayName="GPU sparse single")

legend(Location='northwest')
xlabel("Problem size N")
ylabel("time (sec)")

end

function elapsed = timeLastSolve(solveFcn, nRepeat, gpu)
%TIMELASTSOLVE  Run solveFcn nRepeat times; return the time of the last run.
%
%   solveFcn : zero-argument function handle performing the solve
%   nRepeat  : number of times the solve is executed
%   gpu      : GPU device object to synchronise with, or [] for CPU solves
%
%   Returns NaN if nRepeat is less than 1.

elapsed = NaN;

for k = 1:nRepeat

    if ~isempty(gpu)
        wait(gpu)            % make sure no earlier GPU work is still pending
    end

    timer = tic;
    y = solveFcn(); %#ok<NASGU> keep the result alive until the timer is read

    if ~isempty(gpu)
        wait(gpu)            % block until the GPU has actually finished
    end

    elapsed = toc(timer);

end

end