From a597ee2cb6042c8fc3c39d779c5dcec89fa22728 Mon Sep 17 00:00:00 2001 From: Johannes Taraz Date: Tue, 20 Feb 2024 16:54:29 +0100 Subject: [PATCH 1/9] t2 --- src/ExtendableSparse.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ExtendableSparse.jl b/src/ExtendableSparse.jl index 700bbda..36dca95 100644 --- a/src/ExtendableSparse.jl +++ b/src/ExtendableSparse.jl @@ -16,6 +16,7 @@ if USE_GPL_LIBS using SuiteSparse end +@info "test2" using DocStringExtensions From 873ff1e9bbf88993ec62a3eb23ac46b6a112f7ed Mon Sep 17 00:00:00 2001 From: Johannes Taraz Date: Tue, 20 Feb 2024 17:48:30 +0100 Subject: [PATCH 2/9] add ExtendableSparseParallel --- Project.toml | 2 + src/ExtendableSparse.jl | 13 +- .../ExtendableSparseParallel.jl | 258 ++++++ .../preparatory.jl | 427 ++++++++++ .../struct_flush.jl | 263 ++++++ .../supersparse.jl | 788 ++++++++++++++++++ 6 files changed, 1749 insertions(+), 2 deletions(-) create mode 100644 src/matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl create mode 100644 src/matrix/ExtendableSparseMatrixParallel/preparatory.jl create mode 100644 src/matrix/ExtendableSparseMatrixParallel/struct_flush.jl create mode 100644 src/matrix/ExtendableSparseMatrixParallel/supersparse.jl diff --git a/Project.toml b/Project.toml index 89ce6e1..46d6e61 100644 --- a/Project.toml +++ b/Project.toml @@ -15,6 +15,8 @@ Sparspak = "e56a9233-b9d6-4f03-8d0f-1825330902ac" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" SuiteSparse = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b" +ExtendableGrids = "cfc395e8-590f-11e8-1f13-43a2532b2fa8" [weakdeps] AMGCLWrap = "4f76b812-4ba5-496d-b042-d70715554288" diff --git a/src/ExtendableSparse.jl b/src/ExtendableSparse.jl index 36dca95..372df82 100644 --- a/src/ExtendableSparse.jl +++ b/src/ExtendableSparse.jl @@ -4,6 +4,10 @@ using LinearAlgebra using Sparspak using ILUZero +using Metis +using Base.Threads +using ExtendableGrids + if !isdefined(Base, :get_extension) using Requires end @@ -16,8 +20,6 @@ if USE_GPL_LIBS using SuiteSparse end -@info "test2" - using DocStringExtensions import SparseArrays: AbstractSparseMatrixCSC, rowvals, getcolptr, nonzeros @@ -31,6 +33,13 @@ export SparseMatrixLNK, export eliminate_dirichlet, eliminate_dirichlet!, mark_dirichlet + +include("matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl") + +export ExtendableSparseMatrixParallel, SuperSparseMatrixLNK +export addtoentry!, reset!, dummy_assembly!, preparatory_multi_ps_less_reverse, fr, addtoentry!, rawupdateindex!, updateindex!, compare_matrices_light + + include("factorizations/factorizations.jl") export JacobiPreconditioner, diff --git a/src/matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl b/src/matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl new file mode 100644 index 0000000..68dace8 --- /dev/null +++ b/src/matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl @@ -0,0 +1,258 @@ +include("supersparse.jl") +include("preparatory.jl") +#include("prep_time.jl") + +mutable struct ExtendableSparseMatrixParallel{Tv, Ti <: Integer} <: AbstractSparseMatrix{Tv, Ti} + """ + Final matrix data + """ + cscmatrix::SparseMatrixCSC{Tv, Ti} + + """ + Linked list structure holding data of extension + """ + lnkmatrices::Vector{SuperSparseMatrixLNK{Tv, Ti}} + + grid::ExtendableGrid + + nnts::Vector{Ti} + + sortednodesperthread::Matrix{Ti} + + old_noderegions::Matrix{Ti} + + cellsforpart::Vector{Vector{Ti}} + + 
globalindices::Vector{Vector{Ti}} + + new_indices::Vector{Ti} + + rev_new_indices::Vector{Ti} + + start::Vector{Ti} + + cellparts::Vector{Ti} + + nt::Ti + + depth::Ti + + +end + + + +function ExtendableSparseMatrixParallel{Tv, Ti}(nm, nt, depth; x0=0.0, x1=1.0) where {Tv, Ti <: Integer} + grid, nnts, s, onr, cfp, gi, gc, ni, rni, starts, cellparts = preparatory_multi_ps_less_reverse(nm, nt, depth, Ti; x0, x1) + csc = spzeros(Tv, Ti, num_nodes(grid), num_nodes(grid)) + lnk = [SuperSparseMatrixLNK{Tv, Ti}(num_nodes(grid), nnts[tid]) for tid=1:nt] + ExtendableSparseMatrixParallel{Tv, Ti}(csc, lnk, grid, nnts, s, onr, cfp, gi, ni, rni, starts, cellparts, nt, depth) +end + + + +function addtoentry!(A::ExtendableSparseMatrixParallel{Tv, Ti}, i, j, tid, v; known_that_unknown=false) where {Tv, Ti <: Integer} + if known_that_unknown + A.lnkmatrices[tid][i, A.sortednodesperthread[tid, j]] += v + return + end + + if updatentryCSC2!(A.cscmatrix, i, j, v) + else + A.lnkmatrices[tid][i, A.sortednodesperthread[tid, j]] += v + end +end + + +#= +function addtoentry!(A::ExtendableSparseMatrixParallel{Tv, Ti}, i, j, v; known_that_unknown=false) where {Tv, Ti <: Integer} + if known_that_unknown + level, tid = last_nz(ext.old_noderegions[:, ext.rev_new_indices[j]]) + A.lnkmatrices[tid][i, A.sortednodesperthread[tid, j]] += v + return + end + + if updatentryCSC2!(A.cscmatrix, i, j, v) + else + level, tid = last_nz(ext.old_noderegions[:, ext.rev_new_indices[j]]) + A.lnkmatrices[tid][i, A.sortednodesperthread[tid, j]] += v + end +end +=# + + +""" +`function addtoentry!(A::ExtendableSparseMatrixParallel{Tv, Ti}, i, j, v; known_that_unknown=true) where {Tv, Ti <: Integer}` + +A[i,j] += v, using any partition. +If the partition should be specified (for parallel use), use +`function addtoentry!(A::ExtendableSparseMatrixParallel{Tv, Ti}, i, j, tid, v; known_that_unknown=true) where {Tv, Ti <: Integer}`. 
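+
+A minimal sequential sketch (matrix size and indices are illustrative only; in a
+threaded assembly loop the five-argument form with the owning partition id `tid`
+should be preferred):
+
+```
+A = ExtendableSparseMatrixParallel{Float64, Int64}((10, 10), 4, 1)
+addtoentry!(A, 1, 2, 1.0)   # A[1,2] += 1.0, owning partition is looked up internally
+addtoentry!(A, 1, 1, 0.5)   # A[1,1] += 0.5
+flush!(A)                   # merge the per-thread LNK parts into the CSC part
+```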
+""" +function addtoentry!(A::ExtendableSparseMatrixParallel{Tv, Ti}, i, j, v; known_that_unknown=false) where {Tv, Ti <: Integer} + if known_that_unknown + level, tid = last_nz(A.old_noderegions[:, A.rev_new_indices[j]]) + A.lnkmatrices[tid][i, A.sortednodesperthread[tid, j]] += v + return + end + + if updatentryCSC2!(A.cscmatrix, i, j, v) + else + level, tid = last_nz(A.old_noderegions[:, A.rev_new_indices[j]]) + A.lnkmatrices[tid][i, A.sortednodesperthread[tid, j]] += v + end +end + +#--------------------------------- + + +function updateindex!(ext::ExtendableSparseMatrixParallel{Tv, Ti}, + op, + v, + i, + j) where {Tv, Ti <: Integer} + k = ExtendableSparse.findindex(ext.cscmatrix, i, j) + if k > 0 + ext.cscmatrix.nzval[k] = op(ext.cscmatrix.nzval[k], v) + return + else + level, tid = last_nz(ext.old_noderegions[:, ext.rev_new_indices[j]]) + updateindex!(ext.lnkmatrices[tid], op, v, i, ext.sortednodesperthread[tid, j]) + end + ext +end + +function updateindex!(ext::ExtendableSparseMatrixParallel{Tv, Ti}, + op, + v, + i, + j, + tid) where {Tv, Ti <: Integer} + k = ExtendableSparse.findindex(ext.cscmatrix, i, j) + if k > 0 + ext.cscmatrix.nzval[k] = op(ext.cscmatrix.nzval[k], v) + return + else + updateindex!(ext.lnkmatrices[tid], op, v, i, ext.sortednodesperthread[tid, j]) + end + ext +end + +function rawupdateindex!(ext::ExtendableSparseMatrixParallel{Tv, Ti}, + op, + v, + i, + j) where {Tv, Ti <: Integer} + k = ExtendableSparse.findindex(ext.cscmatrix, i, j) + if k > 0 + ext.cscmatrix.nzval[k] = op(ext.cscmatrix.nzval[k], v) + else + level, tid = last_nz(ext.old_noderegions[:, ext.rev_new_indices[j]]) + rawupdateindex!(ext.lnkmatrices[tid], op, v, i, ext.sortednodesperthread[tid, j]) + end + ext +end + +function rawupdateindex!(ext::ExtendableSparseMatrixParallel{Tv, Ti}, + op, + v, + i, + j, + tid) where {Tv, Ti <: Integer} + k = ExtendableSparse.findindex(ext.cscmatrix, i, j) + if k > 0 + ext.cscmatrix.nzval[k] = op(ext.cscmatrix.nzval[k], v) + else + rawupdateindex!(ext.lnkmatrices[tid], op, v, i, ext.sortednodesperthread[tid, j]) + end + ext +end + +function Base.getindex(ext::ExtendableSparseMatrixParallel{Tv, Ti}, + i::Integer, + j::Integer) where {Tv, Ti <: Integer} + k = ExtendableSparse.findindex(ext.cscmatrix, i, j) + if k > 0 + return ext.cscmatrix.nzval[k] + end + + level, tid = last_nz(ext.old_noderegions[:, ext.rev_new_indices[j]]) + ext.lnkmatrices[tid][i, ext.sortednodesperthread[tid, j]] + +end + +function Base.setindex!(ext::ExtendableSparseMatrixParallel{Tv, Ti}, + v::Union{Number,AbstractVecOrMat}, + i::Integer, + j::Integer) where {Tv, Ti} + k = ExtendableSparse.findindex(ext.cscmatrix, i, j) + if k > 0 + ext.cscmatrix.nzval[k] = v + else + level, tid = last_nz(ext.old_noderegions[:, ext.rev_new_indices[j]]) + #@info typeof(tid), typeof(j) + jj = ext.sortednodesperthread[tid, j] + ext.lnkmatrices[tid][i, jj] = v + end +end + + + +#------------------------------------ + +function reset!(A::ExtendableSparseMatrixParallel{Tv, Ti}) where {Tv, Ti <: Integer} + A.cscmatrix = spzeros(Tv, Ti, num_nodes(A.grid), num_nodes(A.grid)) + A.lnkmatrices = [SuperSparseMatrixLNK{Tv, Ti}(num_nodes(A.grid), A.nnts[tid]) for tid=1:A.nt] +end + +function nnz_flush(ext::ExtendableSparseMatrixParallel) + flush!(ext) + return nnz(ext.cscmatrix) +end + +function nnz_noflush(ext::ExtendableSparseMatrixParallel) + return nnz(ext.cscmatrix), sum([ext.lnkmatrices[i].nnz for i=1:ext.nt]) +end + +function matrixindextype(A::ExtendableSparseMatrixParallel{Tv, Ti}) where {Tv, Ti <: Integer} + Ti 
+end + +function matrixvaluetype(A::ExtendableSparseMatrixParallel{Tv, Ti}) where {Tv, Ti <: Integer} + Tv +end + + + +function Base.show(io::IO, ::MIME"text/plain", ext::ExtendableSparseMatrixParallel) + #flush!(ext) + xnnzCSC, xnnzLNK = nnz_noflush(ext) + m, n = size(ext) + print(io, + m, + "×", + n, + " ", + typeof(ext), + " with ", + xnnzCSC, + " stored ", + xnnzCSC == 1 ? "entry" : "entries", + " in CSC and ", + xnnzLNK, + " stored ", + xnnzLNK == 1 ? "entry" : "entries", + " in LNK. CSC:") + + if !haskey(io, :compact) + io = IOContext(io, :compact => true) + end + + if !(m == 0 || n == 0 || xnnzCSC == 0) + print(io, ":\n") + Base.print_array(IOContext(io), ext.cscmatrix) + end +end + +Base.size(A::ExtendableSparseMatrixParallel) = (A.cscmatrix.m, A.cscmatrix.n) + +include("struct_flush.jl") diff --git a/src/matrix/ExtendableSparseMatrixParallel/preparatory.jl b/src/matrix/ExtendableSparseMatrixParallel/preparatory.jl new file mode 100644 index 0000000..e14a066 --- /dev/null +++ b/src/matrix/ExtendableSparseMatrixParallel/preparatory.jl @@ -0,0 +1,427 @@ +""" +`function preparatory_multi_ps_less_reverse(nm, nt, depth)` + +`nm` is the number of nodes in each dimension (Examples: 2d: nm = (100,100) -> 100 x 100 grid, 3d: nm = (50,50,50) -> 50 x 50 x 50 grid). +`nt` is the number of threads. +`depth` is the number of partition layers, for depth=1, there are nt parts and 1 separator, for depth=2, the separator is partitioned again, leading to 2*nt+1 submatrices... +To assemble the system matrix parallely, things such as `cellsforpart` (= which thread takes which cells) need to be computed in advance. This is done here. +""" +function preparatory_multi_ps_less_reverse(nm, nt, depth, Ti; sequential=false, x0=0.0, x1=1.0) + grid = getgrid(nm; x0, x1) + + if sequential + (allcells, start, cellparts) = grid_to_graph_ps_multi!(grid, nt, depth)#) + else + (allcells, start, cellparts) = grid_to_graph_ps_multi_par!(grid, nt, depth) + end + + (nnts, s, onr, gi, gc, ni, rni, starts) = get_nnnts_and_sortednodesperthread_and_noderegs_from_cellregs_ps_less_reverse_nopush( + cellparts, allcells, start, num_nodes(grid), Ti, nt + ) + + cfp = bettercellsforpart(cellparts, depth*nt+1) + return grid, nnts, s, onr, cfp, gi, gc, ni, rni, starts, cellparts +end + + +""" +`function get_nnnts_and_sortednodesperthread_and_noderegs_from_cellregs_ps_less_reverse_nopush(cellregs, allcells, start, nn, Ti, nt)` + +After the cellregions (partitioning of the grid) of the grid have been computed, other things have to be computed, such as `sortednodesperthread` a depth+1 x num_nodes matrix, here `sortednodesperthreads[i,j]` is the point at which the j-th node appears in the i-th level matrix in the corresponding submatrix. +`cellregs` contains the partiton for each cell. +Furthermore, `nnts` (number of nodes of the threads) is computed, which contain for each thread the number of nodes that are contained in the cells of that thread. +`allcells` and `start` together behave like the rowval and colptr arrays of a CSC matrix, such that `allcells[start[j]:start[j+1]-1]` are all cells that contain the j-th node. +`nn` is the number of nodes in the grid. +`Ti` is the type (Int64,...) of the elements in the created arrays. +`nt` is the number of threads. 
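+
+For illustration of the `allcells`/`start` convention: if node 3 is contained in cells 7 and 9 only,
+then `start[4] - start[3] == 2` and `allcells[start[3]:start[4]-1] == [7, 9]`.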
+""" +function get_nnnts_and_sortednodesperthread_and_noderegs_from_cellregs_ps_less_reverse_nopush(cellregs, allcells, start, nn, Ti, nt) + + num_matrices = maximum(cellregs) + depth = Int(floor((num_matrices-1)/nt)) + + #loop over each node, get the cellregion of the cell (the one not in the separator) write the position of that node inside the cellregions sorted ranking into a long vector + #nnts = [zeros(Ti, nt+1) for i=1:depth+1] + nnts = zeros(Ti, nt) + #noderegs_max_tmp = 0 + old_noderegions = zeros(Ti, (depth+1, nn)) + + # Count nodes per thread: + tmp = zeros(depth+1) + for j=1:nn + cells = @view allcells[start[j]:start[j+1]-1] + sortedcellregs = unique(sort(cellregs[cells])) + #tmp = [] + tmpctr = 1 + for cr in sortedcellregs + crmod = (cr-1)%nt+1 + level = Int(ceil(cr/nt)) + #nnts[crmod] += 1 + old_noderegions[level,j] = crmod + if !(crmod in tmp[1:tmpctr-1]) + nnts[crmod] += 1 + #sortednodesperthread[crmod,j] = nnts[crmod] #nnts[i][cr] + #push!(tmp, crmod) + tmp[tmpctr] = crmod + tmpctr += 1 + end + end + end + + # Reorder inidices to receive a block structure: + # Taking the original matrix [a_ij] and mapping each i and j to new_indices[i] and new_indices[j], gives a block structure + # the reverse is also defined rev_new_indices[new_indices[k]] = k + # From now on we will only use this new ordering + counter_for_reorder = zeros(Ti, depth*nt+1) + for j=1:nn + level, reg = last_nz(old_noderegions[:, j]) + counter_for_reorder[(level-1)*nt + reg] += 1 #(reg-1)*depth + level] += 1 + end + + starts = vcat([0], cumsum(counter_for_reorder)) + counter_for_reorder2 = zeros(Ti, depth*nt+1) + new_indices = Vector{Ti}(undef, nn) + rev_new_indices = Vector{Ti}(undef, nn) + origin = Vector{Ti}(undef, nn) + for j=1:nn + level, reg = last_nz(old_noderegions[:, j]) + counter_for_reorder2[(level-1)*nt + reg] += 1 + origin[j] = reg + new_indices[j] = starts[(level-1)*nt + reg]+counter_for_reorder2[(level-1)*nt + reg] + rev_new_indices[new_indices[j]] = j + end + starts .+= 1 + + # Build sortednodesperthread and globalindices array: + # They are inverses of each other: globalindices[tid][sortednodeperthread[tid][j]] = j + # Note that j has to be a `new index` + + sortednodesperthread = zeros(Ti, (nt, nn)) #vvcons(Ti, nnts) + globalindices = vvcons(Ti, nnts) + gictrs = zeros(Ti, nt) + + for nj=1:nn + oj = rev_new_indices[nj] + cells = @view allcells[start[oj]:start[oj+1]-1] + sortedcellregs = unique(sort(cellregs[cells])) + #tmp = [] + tmpctr = 1 + for cr in sortedcellregs + crmod = (cr-1)%nt+1 + level = Int(ceil(cr/nt)) + if !(crmod in tmp[1:tmpctr-1]) + gictrs[crmod] += 1 # , level] += 1 + sortednodesperthread[crmod,nj] = gictrs[crmod] + globalindices[crmod][gictrs[crmod]] = nj + #push!(tmp, crmod) + tmp[tmpctr] = crmod + tmpctr += 1 + end + end + end + + nnts, sortednodesperthread, old_noderegions, globalindices, gictrs, new_indices, rev_new_indices, starts +end + + + + + + + + +""" +`function separate!(cellregs, nc, ACSC, nt, level0, ctr_sepanodes)` + +This function partitons the separator, which is done if `depth`>1 (see `grid_to_graph_ps_multi!` and/or `preparatory_multi_ps`). +`cellregs` contains the regions/partitions/colors of each cell. +`nc` is the number of cells in the grid. +`ACSC` is the adjacency matrix of the graph of the (separator-) grid (vertex in graph is cell in grid, edge in graph means two cells share a node) stored as a CSC. +`nt` is the number of threads. 
+`level0` is the separator-partitoning level, if the (first) separator is partitioned, level0 = 1, in the next iteration, level0 = 2... +`preparatory_multi_ps` is the number of separator-cells. +""" +function separate!(cellregs, nc, ACSC, nt, level0, ctr_sepanodes) + sepanodes = findall(x->x==nt+1, cellregs) + + indptr = collect(1:nc+1) + indices = zeros(Int64, nc) + rowval = zeros(Int64, nc) + + indptrT = collect(1:ctr_sepanodes+1) + indicesT = zeros(Int64, ctr_sepanodes) + rowvalT = zeros(Int64, ctr_sepanodes) + + for (i,j) in enumerate(sepanodes) + indices[j] = i + indicesT[i] = j + rowval[j] = 1 + rowvalT[i] = 1 + end + + R = SparseMatrixCSC(ctr_sepanodes, nc, indptr, indices, rowval) + RT = SparseMatrixCSC(nc, ctr_sepanodes, indptrT, indicesT, rowvalT) + prod = ACSC*dropzeros(RT) + RART = dropzeros(R)*ACSC*dropzeros(RT) + + partition2 = Metis.partition(RART, nt) + cellregs2 = copy(partition2) + + ctr_sepanodes = 0 + for (i,j) in enumerate(sepanodes) + rows = RART.rowval[RART.colptr[i]:(RART.colptr[i+1]-1)] + cellregs[j] = level0*nt + cellregs2[i] + if minimum(partition2[rows]) != maximum(partition2[rows]) + cellregs[j] = (level0+1)*nt+1 + ctr_sepanodes += 1 + end + end + + RART, ctr_sepanodes +end + + + +""" +`function grid_to_graph_ps_multi!(grid, nt, depth)` + +The function assigns colors/partitons to each cell in the `grid`. First, the grid is partitoned into `nt` partitions. If `depth` > 1, the separator is partitioned again... +`grid` is a simplexgrid. +`nt` is the number of threads. +`depth` is the number of partition layers, for depth=1, there are nt parts and 1 separator, for depth=2, the separator is partitioned again, leading to 2*nt+1 submatrices... +""" +function grid_to_graph_ps_multi!(grid, nt, depth) + A = SparseMatrixLNK{Int64, Int64}(num_cells(grid), num_cells(grid)) + number_cells_per_node = zeros(Int64, num_nodes(grid)) + for j=1:num_cells(grid) + for node_id in grid[CellNodes][:,j] + number_cells_per_node[node_id] += 1 + end + end + allcells = zeros(Int64, sum(number_cells_per_node)) + start = ones(Int64, num_nodes(grid)+1) + start[2:end] += cumsum(number_cells_per_node) + number_cells_per_node .= 0 + for j=1:num_cells(grid) + for node_id in grid[CellNodes][:,j] + allcells[start[node_id] + number_cells_per_node[node_id]] = j + number_cells_per_node[node_id] += 1 + end + end + + for j=1:num_nodes(grid) + cells = @view allcells[start[j]:start[j+1]-1] + for (i,id1) in enumerate(cells) + for id2 in cells[i+1:end] + A[id1,id2] = 1 + A[id2,id1] = 1 + end + end + end + + ACSC = SparseArrays.SparseMatrixCSC(A) + + partition = Metis.partition(ACSC, nt) + cellregs = copy(partition) + + ctr_sepanodes = 0 + for j=1:num_cells(grid) + rows = ACSC.rowval[ACSC.colptr[j]:(ACSC.colptr[j+1]-1)] + if minimum(partition[rows]) != maximum(partition[rows]) + cellregs[j] = nt+1 + ctr_sepanodes += 1 + end + end + RART = ACSC + for level=1:depth-1 + RART, ctr_sepanodes = separate!(cellregs, num_cells(grid), RART, nt, level, ctr_sepanodes) + end + + + return allcells, start, cellregs +end + + + +function grid_to_graph_ps_multi_par!(grid, nt, depth) + time = zeros(12) + As = [ExtendableSparseMatrix{Int64, Int64}(num_cells(grid), num_cells(grid)) for tid=1:nt] + number_cells_per_node = zeros(Int64, num_nodes(grid)) + + cn = grid[CellNodes] + + for j=1:num_cells(grid) + tmp = view(cn, :, j) + for node_id in tmp + number_cells_per_node[node_id] += 1 + end + end + + + allcells = zeros(Int64, sum(number_cells_per_node)) + start = ones(Int64, num_nodes(grid)+1) + start[2:end] += 
cumsum(number_cells_per_node) + number_cells_per_node .= 0 + + for j=1:num_cells(grid) + tmp = view(cn, :, j) + for node_id in tmp + allcells[start[node_id] + number_cells_per_node[node_id]] = j + number_cells_per_node[node_id] += 1 + end + end + + node_range = get_starts(num_nodes(grid), nt) + Threads.@threads for tid=1:nt + for j in node_range[tid]:node_range[tid+1]-1 + cells = @view allcells[start[j]:start[j+1]-1] + l = length(cells) + for (i,id1) in enumerate(cells) + ce = view(cells, i+1:l) + for id2 in ce + As[tid][id1,id2] = 1 + As[tid][id2,id1] = 1 + end + end + end + ExtendableSparse.flush!(As[tid]) + end + + ACSC = add_all_par!(As).cscmatrix + + #SparseArrays.SparseMatrixCSC(A)) + + + partition = Metis.partition(ACSC, nt) + cellregs = copy(partition) + + ctr_sepanodes_a = zeros(Int64, nt) + + cell_range = get_starts(num_cells(grid), nt) + Threads.@threads :static for tid=1:nt + for j in cell_range[tid]:cell_range[tid+1]-1 + rows = @view ACSC.rowval[ACSC.colptr[j]:(ACSC.colptr[j+1]-1)] + if minimum(partition[rows]) != maximum(partition[rows]) + cellregs[j] = nt+1 + ctr_sepanodes_a[tid] += 1 + end + end + end + + ctr_sepanodes = sum(ctr_sepanodes_a) + + #= + time[10] = @elapsed for j=1:num_cells(grid) + rows = ACSC.rowval[ACSC.colptr[j]:(ACSC.colptr[j+1]-1)] + if minimum(partition[rows]) != maximum(partition[rows]) + cellregs[j] = nt+1 + ctr_sepanodes += 1 + end + end + =# + RART = ACSC + for level=1:depth-1 + RART, ctr_sepanodes = separate!(cellregs, num_cells(grid), RART, nt, level, ctr_sepanodes) + end + + + return allcells, start, cellregs +end + + +function add_all_par!(As) + nt = length(As) + depth = Int(floor(log2(nt))) + ende = nt + for level=1:depth + + @threads :static for tid=1:2^(depth-level) + #@info "$level, $tid" + start = tid+2^(depth-level) + while start <= ende + As[tid] += As[start] + start += 2^(depth-level) + end + end + ende = 2^(depth-level) + end + As[1] + +end + + +""" +`function vvcons(Ti, lengths)` + +`lengths` is a vector of integers. +The function creates a vector of zero vectors of type `Ti` of length `lengths[i]`. +""" +function vvcons(Ti, lengths) + x::Vector{Vector{Ti}} = [zeros(Ti, i) for i in lengths] + return x +end + + +""" +`function bettercellsforpart(xx, upper)` + +`xx` are the CellRegions (i.e. the color/partition of each cell). +`upper` is the number of partitions (upper=depth*nt+1). +The function returns a vector e.g. [v1, v2, v3, v4, v5]. +The element v1 would be the list of cells that are in partition 1 etc. +The function is basically a faster findall. +""" +function bettercellsforpart(xx, upper) + ctr = zeros(Int64, upper) + for x in xx + ctr[x] += 1 + end + cfp = vvcons(Int64, ctr) + ctr .= 1 + for (i,x) in enumerate(xx) + cfp[x][ctr[x]] = i + ctr[x] += 1 + end + cfp +end + +""" +`function getgrid(nm)` + +Returns a simplexgrid with a given number of nodes in each dimension. +`nm` is the number of nodes in each dimension (Examples: 2d: nm = (100,100) -> 100 x 100 grid, 3d: nm = (50,50,50) -> 50 x 50 x 50 grid). 
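+
+For example (the keyword arguments `x0` and `x1` of the method below set the coordinate range):
+
+```
+grid2d = getgrid((100, 100))                     # 100 x 100 grid on (0,1) x (0,1)
+grid3d = getgrid((50, 50, 50); x0=-1.0, x1=1.0)  # 50 x 50 x 50 grid on (-1,1)^3
+```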
+""" +function getgrid(nm; x0=0.0, x1=1.0) + if length(nm) == 2 + n,m = nm + xx = collect(LinRange(x0, x1, n)) + yy = collect(LinRange(x0, x1, m)) + grid = simplexgrid(xx, yy) + else + n,m,l = nm + xx = collect(LinRange(x0, x1, n)) + yy = collect(LinRange(x0, x1, m)) + zz = collect(LinRange(x0, x1, l)) + grid = simplexgrid(xx, yy, zz) + end + grid +end + +function get_starts(n, nt) + ret = ones(Int64, nt+1) + ret[end] = n+1 + for i=nt:-1:2 + ret[i] = ret[i+1] - Int(round(ret[i+1]/i)) #Int(round(n/nt))-1 + end + ret +end + +function last_nz(x) + n = length(x) + for j=n:-1:1 + if x[j] != 0 + return (j, x[j]) + end + end +end + diff --git a/src/matrix/ExtendableSparseMatrixParallel/struct_flush.jl b/src/matrix/ExtendableSparseMatrixParallel/struct_flush.jl new file mode 100644 index 0000000..c27aab0 --- /dev/null +++ b/src/matrix/ExtendableSparseMatrixParallel/struct_flush.jl @@ -0,0 +1,263 @@ +function flush!(A::ExtendableSparseMatrixParallel; do_dense=false, keep_zeros=true) + + + if !do_dense + A.cscmatrix = A.cscmatrix+sparse_flush!(A; keep_zeros) + + else + if keep_zeros + A.cscmatrix = dense_flush_keepzeros!(A.lnkmatrices, A.old_noderegions, A.sortednodesperthread, A.nt, A.rev_new_indices) + else + A.cscmatrix = dense_flush_removezeros!(A.lnkmatrices, A.old_noderegions, A.sortednodesperthread, A.nt, A.rev_new_indices) + end + end + + A.lnkmatrices = [SuperSparseMatrixLNK{matrixvaluetype(A), matrixindextype(A)}(num_nodes(A.grid), A.nnts[tid]) for tid=1:A.nt] + +end + +""" +`CSC_RLNK_plusequals_less3_reordered_super!` from `plusequals.jl` +""" +function sparse_flush!(A::ExtendableSparseMatrixParallel; keep_zeros=true) + + #dropzeros!( + plus_remap(A.lnkmatrices, A.cscmatrix, A.globalindices; keep_zeros) + #) + +end + + + +""" +`CSC_RLNK_si_oc_ps_dz_less_reordered` from `conversion.jl` +""" +function dense_flush_keepzeros!( + As::Vector{SuperSparseMatrixLNK{Tv, Ti}}, + onr, s, nt, rni + ) where {Tv, Ti <: Integer} + + nnz = sum([As[i].nnz for i=1:nt]) #you could also subtract the diagonal entries from shared columns, since those are definitely double + indptr = zeros(Ti, As[1].m+1) + indices = zeros(Ti, nnz) #sum(As.nnz)) + data = zeros(Float64, nnz) #sum(As.nnz)) + ctr = 1 + eqctr = 0 + tmp = zeros(Ti, size(onr)[1]) + + for nj=1:As[1].m + indptr[nj] = ctr + oj = rni[nj] + regionctr = 1 + jc = 0 + nrr = view(onr, :, oj) + tmp .= 0 + for region in nrr #nrr #[:,j] + regmod = region #(region-1)%nt+1 + if (region > 0) & !(region in tmp) + k = s[regmod, nj] + if regionctr == 1 + while k>0 + #if As[regmod].nzval[k] != 0.0 + indices[ctr] = As[regmod].rowval[k] + data[ctr] = As[regmod].nzval[k] + + for jcc=1:jc + if indices[ctr-jcc] > indices[ctr-jcc+1] + tmp_i = indices[ctr-jcc+1] + tmp_d = data[ctr-jcc+1] + indices[ctr-jcc+1] = indices[ctr-jcc] + data[ctr-jcc+1] = data[ctr-jcc] + + indices[ctr-jcc] = tmp_i + data[ctr-jcc] = tmp_d + else + break + end + end + + ctr += 1 + jc += 1 + #end + k = As[regmod].colptr[k] + end + else + while k>0 + #if As[regmod].nzval[k] != 0.0 + indices[ctr] = As[regmod].rowval[k] + data[ctr] = As[regmod].nzval[k] + + for jcc=1:jc + if indices[ctr-jcc] > indices[ctr-jcc+1] + tmp_i = indices[ctr-jcc+1] + tmp_d = data[ctr-jcc+1] + indices[ctr-jcc+1] = indices[ctr-jcc] + data[ctr-jcc+1] = data[ctr-jcc] + + indices[ctr-jcc] = tmp_i + data[ctr-jcc] = tmp_d + elseif indices[ctr-jcc] == indices[ctr-jcc+1] + data[ctr-jcc] += data[ctr-jcc+1] + eqctr += 1 + + for jccc=1:jcc + indices[ctr-jcc+jccc] = indices[ctr-jcc+jccc+1] + data[ctr-jcc+jccc] = data[ctr-jcc+jccc+1] + end + + ctr 
-= 1 + jc -= 1 + + break + else + break + end + end + + ctr += 1 + jc += 1 + #end + k = As[regmod].colptr[k] + end + + end + tmp[regionctr] = region + regionctr += 1 + + end + + end + + end + + #@warn ctr/nnz + + indptr[end] = ctr + resize!(indices, ctr-1) + resize!(data, ctr-1) + + + SparseArrays.SparseMatrixCSC( + As[1].m, As[1].m, indptr, indices, data + ) + +end + + +function dense_flush_removezeros!( + As::Vector{SuperSparseMatrixLNK{Tv, Ti}}, + onr, s, nt, rni + ) where {Tv, Ti <: Integer} + + nnz = sum([As[i].nnz for i=1:nt]) #you could also subtract the diagonal entries from shared columns, since those are definitely double + indptr = zeros(Ti, As[1].m+1) + indices = zeros(Ti, nnz) #sum(As.nnz)) + data = zeros(Float64, nnz) #sum(As.nnz)) + ctr = 1 + eqctr = 0 + tmp = zeros(Ti, size(onr)[1]) + + for nj=1:As[1].m + indptr[nj] = ctr + oj = rni[nj] + regionctr = 1 + jc = 0 + nrr = view(onr, :, oj) + tmp .= 0 + for region in nrr #nrr #[:,j] + regmod = region #(region-1)%nt+1 + if (region > 0) & !(region in tmp) + k = s[regmod, nj] + if regionctr == 1 + while k>0 + if As[regmod].nzval[k] != 0.0 + indices[ctr] = As[regmod].rowval[k] + data[ctr] = As[regmod].nzval[k] + + for jcc=1:jc + if indices[ctr-jcc] > indices[ctr-jcc+1] + tmp_i = indices[ctr-jcc+1] + tmp_d = data[ctr-jcc+1] + indices[ctr-jcc+1] = indices[ctr-jcc] + data[ctr-jcc+1] = data[ctr-jcc] + + indices[ctr-jcc] = tmp_i + data[ctr-jcc] = tmp_d + else + break + end + end + + ctr += 1 + jc += 1 + end + k = As[regmod].colptr[k] + end + else + while k>0 + if As[regmod].nzval[k] != 0.0 + indices[ctr] = As[regmod].rowval[k] + data[ctr] = As[regmod].nzval[k] + + for jcc=1:jc + if indices[ctr-jcc] > indices[ctr-jcc+1] + tmp_i = indices[ctr-jcc+1] + tmp_d = data[ctr-jcc+1] + indices[ctr-jcc+1] = indices[ctr-jcc] + data[ctr-jcc+1] = data[ctr-jcc] + + indices[ctr-jcc] = tmp_i + data[ctr-jcc] = tmp_d + elseif indices[ctr-jcc] == indices[ctr-jcc+1] + data[ctr-jcc] += data[ctr-jcc+1] + eqctr += 1 + + for jccc=1:jcc + indices[ctr-jcc+jccc] = indices[ctr-jcc+jccc+1] + data[ctr-jcc+jccc] = data[ctr-jcc+jccc+1] + end + + ctr -= 1 + jc -= 1 + + break + else + break + end + end + + ctr += 1 + jc += 1 + end + k = As[regmod].colptr[k] + end + + end + tmp[regionctr] = region + regionctr += 1 + + end + + end + + end + + #@warn ctr/nnz + + indptr[end] = ctr + resize!(indices, ctr-1) + resize!(data, ctr-1) + + + SparseArrays.SparseMatrixCSC( + As[1].m, As[1].m, indptr, indices, data + ) + +end + + + + + + + diff --git a/src/matrix/ExtendableSparseMatrixParallel/supersparse.jl b/src/matrix/ExtendableSparseMatrixParallel/supersparse.jl new file mode 100644 index 0000000..ae52f60 --- /dev/null +++ b/src/matrix/ExtendableSparseMatrixParallel/supersparse.jl @@ -0,0 +1,788 @@ + +using SparseArrays +using ExtendableSparse + +mutable struct SuperSparseMatrixLNK{Tv, Ti <: Integer} <: AbstractSparseMatrix{Tv, Ti} + """ + Number of rows + """ + m::Ti + + """ + Number of columns + """ + n::Ti + + """ + Number of nonzeros + """ + nnz::Ti + + """ + Length of arrays + """ + nentries::Ti + + """ + Linked list of column entries. Initial length is n, + it grows with each new entry. + + colptr[index] contains the next + index in the list or zero, in the later case terminating the list which + starts at index 1<=j<=n for each column j. + """ + colptr::Vector{Ti} + + """ + Row numbers. For each index it contains the zero (initial state) + or the row numbers corresponding to the column entry list in colptr. + + Initial length is n, + it grows with each new entry. 
+ """ + rowval::Vector{Ti} + + """ + Nonzero entry values correspondin to each pair + (colptr[index],rowval[index]) + + Initial length is n, it grows with each new entry. + """ + nzval::Vector{Tv} + + + collnk::Vector{Ti} + + colctr::Ti +end + + +function SparseArrays.SparseMatrixCSC(A::SuperSparseMatrixLNK{Tv, Ti})::SparseArrays.SparseMatrixCSC where {Tv, Ti <: Integer} + SparseArrays.SparseMatrixCSC(SparseMatrixLNK{Tv, Ti}(A.m, A.n, A.nnz, A.nentries, A.colptr, A.rowval, A.nzval)) + +end + +function SuperSparseMatrixLNK{Tv, Ti}(m, n) where {Tv, Ti <: Integer} + SuperSparseMatrixLNK{Tv, Ti}(m, n, 0, n, zeros(Ti, n), zeros(Ti, n), zeros(Tv, n), zeros(Ti, n), 0) +end + + +function findindex(lnk::SuperSparseMatrixLNK, i, j) + if !((1 <= i <= lnk.m) & (1 <= j <= lnk.n)) + throw(BoundsError(lnk, (i, j))) + end + + k = j + k0 = j + while k > 0 + if lnk.rowval[k] == i + return k, 0 + end + k0 = k + k = lnk.colptr[k] + end + return 0, k0 +end + +""" +Return tuple containing size of the matrix. +""" +Base.size(lnk::SuperSparseMatrixLNK) = (lnk.m, lnk.n) + +""" +Return value stored for entry or zero if not found +""" +function Base.getindex(lnk::SuperSparseMatrixLNK{Tv, Ti}, i, j) where {Tv, Ti} + k, k0 = findindex(lnk, i, j) + if k == 0 + return zero(Tv) + else + return lnk.nzval[k] + end +end + +function addentry!(lnk::SuperSparseMatrixLNK, i, j, k, k0) + # increase number of entries + lnk.nentries += 1 + if length(lnk.nzval) < lnk.nentries + newsize = Int(ceil(5.0 * lnk.nentries / 4.0)) + resize!(lnk.nzval, newsize) + resize!(lnk.rowval, newsize) + resize!(lnk.colptr, newsize) + end + + # Append entry if not found + lnk.rowval[lnk.nentries] = i + + # Shift the end of the list + lnk.colptr[lnk.nentries] = 0 + lnk.colptr[k0] = lnk.nentries + + # Update number of nonzero entries + lnk.nnz += 1 + return lnk.nentries +end + +""" +Update value of existing entry, otherwise extend matrix if v is nonzero. +""" +function Base.setindex!(lnk::SuperSparseMatrixLNK, v, i, j) + if !((1 <= i <= lnk.m) & (1 <= j <= lnk.n)) + throw(BoundsError(lnk, (i, j))) + end + + # Set the first column entry if it was not yet set. + if lnk.rowval[j] == 0 && !iszero(v) + lnk.colctr += 1 + lnk.collnk[lnk.colctr] = j + lnk.rowval[j] = i + lnk.nzval[j] = v + lnk.nnz += 1 + return lnk + end + + k, k0 = findindex(lnk, i, j) + if k > 0 + lnk.nzval[k] = v + return lnk + end + if !iszero(v) + k = addentry!(lnk, i, j, k, k0) + lnk.nzval[k] = v + end + return lnk +end + +""" +Update element of the matrix with operation `op`. +It assumes that `op(0,0)==0`. If `v` is zero, no new +entry is created. +""" +function updateindex!(lnk::SuperSparseMatrixLNK{Tv, Ti}, op, v, i, j) where {Tv, Ti} + # Set the first column entry if it was not yet set. + if lnk.rowval[j] == 0 && !iszero(v) + lnk.colctr += 1 + lnk.collnk[lnk.colctr] = j + lnk.rowval[j] = i + lnk.nzval[j] = op(lnk.nzval[j], v) + lnk.nnz += 1 + return lnk + end + k, k0 = findindex(lnk, i, j) + if k > 0 + lnk.nzval[k] = op(lnk.nzval[k], v) + return lnk + end + if !iszero(v) + k = addentry!(lnk, i, j, k, k0) + lnk.nzval[k] = op(zero(Tv), v) + end + lnk +end + +function rawupdateindex!(lnk::SuperSparseMatrixLNK{Tv, Ti}, op, v, i, j) where {Tv, Ti} + # Set the first column entry if it was not yet set. 
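+    # Unlike `updateindex!` above, the first entry of a column is created here even
+    # when `v` is zero.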
+ if lnk.rowval[j] == 0 + lnk.colctr += 1 + lnk.collnk[lnk.colctr] = j + lnk.rowval[j] = i + lnk.nzval[j] = op(lnk.nzval[j], v) + lnk.nnz += 1 + return lnk + end + k, k0 = findindex(lnk, i, j) + if k > 0 + lnk.nzval[k] = op(lnk.nzval[k], v) + return lnk + end + if !iszero(v) + k = addentry!(lnk, i, j, k, k0) + lnk.nzval[k] = op(zero(Tv), v) + end + lnk +end + +#= +mutable struct ColEntry{Tv, Ti <: Integer} + rowval::Ti + nzval::Tv +end + +# Comparison method for sorting +Base.isless(x::ColEntry, y::ColEntry) = (x.rowval < y.rowval) +=# + +function get_column!(col::Vector{ColEntry{Tv, Ti}}, lnk::SuperSparseMatrixLNK{Tv, Ti}, j::Ti)::Ti where {Tv, Ti <: Integer} + k = j + ctr = zero(Ti) + while k>0 + if abs(lnk.nzval[k]) > 0 + ctr += 1 + col[ctr] = ColEntry(lnk.rowval[k], lnk.nzval[k]) + end + k = lnk.colptr[k] + end + sort!(col, 1, ctr, Base.QuickSort, Base.Forward) + ctr +end + + +function remove_doubles!(col, coll) + #input_ctr = 1 + last = 1 + for j=2:coll + if col[j].rowval == col[last].rowval + col[last].nzval += col[j].nzval + else + last += 1 + if last != j + col[last] = col[j] + end + end + end + last +end + +function get_column_removezeros!(col::Vector{ColEntry{Tv, Ti}}, lnks::Vector{SuperSparseMatrixLNK{Tv, Ti}}, js, tids, length)::Ti where {Tv, Ti <: Integer} + ctr = zero(Ti) + for i=1:length + tid = tids[i] + k = js[i] + #for (tid,j) in zip(tids, js) #j0:j1 + #tid = tids[j] + #k = j + while k>0 + if abs(lnks[tid].nzval[k]) > 0 + ctr += 1 + col[ctr] = ColEntry(lnks[tid].rowval[k], lnks[tid].nzval[k]) + end + k = lnks[tid].colptr[k] + end + end + + sort!(col, 1, ctr, Base.QuickSort, Base.Forward) + ctr = remove_doubles!(col, ctr) + #print_col(col, ctr) + ctr + +end + +function get_column_keepzeros!(col::Vector{ColEntry{Tv, Ti}}, lnks::Vector{SuperSparseMatrixLNK{Tv, Ti}}, js, tids, length)::Ti where {Tv, Ti <: Integer} + ctr = zero(Ti) + for i=1:length + tid = tids[i] + k = js[i] + #for (tid,j) in zip(tids, js) #j0:j1 + #tid = tids[j] + #k = j + while k>0 + #if abs(lnks[tid].nzval[k]) > 0 + ctr += 1 + col[ctr] = ColEntry(lnks[tid].rowval[k], lnks[tid].nzval[k]) + #end + k = lnks[tid].colptr[k] + end + end + + sort!(col, 1, ctr, Base.QuickSort, Base.Forward) + ctr = remove_doubles!(col, ctr) + #print_col(col, ctr) + ctr + +end + +function merge_into!(rowval::Vector{Ti}, nzval::Vector{Tv}, C::SparseArrays.SparseMatrixCSC{Tv, Ti}, col::Vector{ColEntry{Tv, Ti}}, J::Ti, coll::Ti, ptr1::Ti) where {Tv, Ti <: Integer} + j_min = 1 + numshifts = 0 + j_last = 0 + last_row = 0 + + #@warn "MERGING $J" + + #rowval0 = copy(C.rowval[C.colptr[J]:C.colptr[J+1]-1]) + #endptr = C.colptr[J+1] + + for (di,i) in enumerate(C.colptr[J]:C.colptr[J+1]-1) + for j=j_min:coll + #if col[j].rowval == last_row + # #@info "!! 
col j rowval == last row" + #end + if col[j].rowval < C.rowval[i] #ptr1+di+numshifts] #i+numshifts] + if col[j].rowval == last_row + #@info "$(ptr1+di+numshifts) : backwards EQUALITY: " + nzval[ptr1+di+numshifts] += col[j].nzval + else + #@info "$(ptr1+di+numshifts) : Insert from col: j=$j" + #shift_e!(C.rowval, C.nzval, 1, i+numshifts, C.colptr[end]-1) + rowval[ptr1+di+numshifts] = col[j].rowval + nzval[ptr1+di+numshifts] = col[j].nzval + numshifts += 1 + #endptr += 1 + end + j_last = j + elseif col[j].rowval > C.rowval[i] #if col[j].rowval + #@info "$(ptr1+di+numshifts) : Insert from C: i=$i" + rowval[ptr1+di+numshifts] = C.rowval[i] + nzval[ptr1+di+numshifts] = C.nzval[i] + j_min = j + break + else + #@info "$(ptr1+di+numshifts) : normal EQUALITY: i=$i, j=$j" + rowval[ptr1+di+numshifts] = C.rowval[i] + nzval[ptr1+di+numshifts] = C.nzval[i]+col[j].nzval + #numshifts += 1 + j_min = j+1 + j_last = j + + if j == coll + #@info "$(ptr1+di+numshifts+1) → $(ptr1+numshifts+(C.colptr[J+1]-C.colptr[J]))" + rowval[ptr1+di+numshifts+1:ptr1+numshifts+(C.colptr[J+1]-C.colptr[J])] = view(C.rowval, i+1:C.colptr[J+1]-1) #C.rowval[i:C.colptr[J+1]-1] + nzval[ptr1+di+numshifts+1:ptr1+numshifts+(C.colptr[J+1]-C.colptr[J])] = view(C.nzval, i+1:C.colptr[J+1]-1) #C.nzval[i:C.colptr[J+1]-1] + + #@info "FINISH" + return numshifts + end + + break + end + + if j == coll + #@info "$(ptr1+di+numshifts) → $(ptr1+numshifts+(C.colptr[J+1]-C.colptr[J]))" + rowval[ptr1+di+numshifts:ptr1+numshifts+(C.colptr[J+1]-C.colptr[J])] = view(C.rowval, i:C.colptr[J+1]-1) #C.rowval[i:C.colptr[J+1]-1] + nzval[ptr1+di+numshifts:ptr1+numshifts+(C.colptr[J+1]-C.colptr[J])] = view(C.nzval, i:C.colptr[J+1]-1) #C.nzval[i:C.colptr[J+1]-1] + + #@info "FINISH" + return numshifts + end + + last_row = col[j].rowval + end + end + endptr = ptr1 + numshifts + (C.colptr[J+1]-C.colptr[J]) + last_row = 0 + numshifts_old = numshifts + numshifts = 0 + #start_ptr = endptr - 1 #C.colptr[J+1]-1 + if j_last > 0 + last_row = col[j_last].rowval + end + + if j_last != coll + for j=j_last+1:coll + if col[j].rowval != last_row + numshifts += 1 + #shift_e!(C.rowval, C.nzval, 1, start_ptr+numshifts, C.colptr[end]-1) + #for k=start_ptr+numshifts: + #@info "$(endptr+numshifts) : after..." 
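+                    # append this remaining column entry behind the part of the column merged so far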
+ rowval[endptr+numshifts] = col[j].rowval + nzval[endptr+numshifts] = col[j].nzval + last_row = rowval[endptr+numshifts] + #colptr[J+1:end] .+= 1 + else + nzval[endptr+numshifts] += col[j].nzval + end + end + end + + return numshifts + numshifts_old + +end + + +function print_col(col, coll) + v = zeros((2, coll)) + for j=1:coll + v[1,j] = col[j].rowval + v[2,j] = col[j].nzval + end + @info v +end + +function plus(lnk::SparseMatrixLNK{Tv, Ti}, csc::SparseArrays.SparseMatrixCSC) where {Tv, Ti <: Integer} + if lnk.nnz == 0 + return csc + elseif length(csc.rowval) == 0 + return SparseMatrixCSC(lnk) + else + return lnk + csc + end +end + +function plus(lnk::SuperSparseMatrixLNK{Tv, Ti}, csc::SparseArrays.SparseMatrixCSC) where {Tv, Ti <: Integer} + gi = collect(1:csc.n) + + + supersparsecolumns = gi[lnk.collnk[1:lnk.colctr]] + sortedcolumnids = sortperm(supersparsecolumns) + sortedcolumns = supersparsecolumns[sortedcolumnids] + #sortedcolumns = vcat([1], sortedcolumns) + sortedcolumns = vcat(sortedcolumns, [csc.n+1]) + + col = [ColEntry{Tv, Ti}(0, zero(Tv)) for i=1:csc.m] + + #@info sortedcolumnids + + nnz_sum = length(csc.rowval) + lnk.nnz + colptr = Vector{Ti}(undef, csc.n+1) + rowval = Vector{Ti}(undef, nnz_sum) + nzval = Vector{Tv}(undef, nnz_sum) + colptr[1] = one(Ti) + + #first part: columns between 1 and first column of lnk + + colptr[1:sortedcolumns[1]] = view(csc.colptr, 1:sortedcolumns[1]) + rowval[1:csc.colptr[sortedcolumns[1]]-1] = view(csc.rowval, 1:csc.colptr[sortedcolumns[1]]-1) + nzval[1:csc.colptr[sortedcolumns[1]]-1] = view(csc.nzval, 1:csc.colptr[sortedcolumns[1]]-1) + + numshifts = 0 + + for J=1:length(sortedcolumns)-1 + #@info ">>>>>>> $J <<<<<<<<<<<<<<<" + # insert new added column here / dummy + i = sortedcolumns[J] + coll = get_column!(col, lnk, i) + #print_col(col, coll) + + nns = merge_into!(rowval, nzval, csc, col, i, coll, colptr[i]-1) + + numshifts += nns + #j = colptr[i] #sortedcolumns[J]] + #rowval[j] = J + #nzval[j] = J + # insertion end + + #colptr[i+1] = colptr[i] + csc.colptr[i+1]-csc.colptr[i] + numshifts + + #a = i+1 + #b = sortedcolumns[J+1] + #@info a, b + + + #colptr[i+1:sortedcolumns[J+1]] = (csc.colptr[i+1:sortedcolumns[J+1]]-csc.colptr[i:sortedcolumns[J+1]-1]).+(colptr[i] + nns) + + colptr[i+1:sortedcolumns[J+1]] = csc.colptr[i+1:sortedcolumns[J+1]].+(-csc.colptr[i]+colptr[i] + nns) + + + rowval[colptr[i+1]:colptr[sortedcolumns[J+1]]-1] = view(csc.rowval, csc.colptr[i+1]:csc.colptr[sortedcolumns[J+1]]-1) + nzval[colptr[i+1]:colptr[sortedcolumns[J+1]]-1] = view(csc.nzval, csc.colptr[i+1]:csc.colptr[sortedcolumns[J+1]]-1) + + + #= + + @info csc.colptr[a:b] + + colptr[a:b] = csc.colptr[a:b].+numshifts + + #colptr[i+2:sortedcolumns[J+1]] = csc.colptr[i+2:sortedcolumns[J+1]].+numshifts + @info i, J, colptr[i+2], colptr[sortedcolumns[J+1]], csc.colptr[i+2], csc.colptr[sortedcolumns[J+1]] + @info i, J, colptr[a], colptr[b], csc.colptr[a], csc.colptr[b] + rowval[colptr[i+2]:colptr[sortedcolumns[J+1]]] = view(csc.rowval, csc.colptr[i+2]:csc.colptr[sortedcolumns[J+1]]) + nzval[colptr[i+2]:colptr[sortedcolumns[J+1]]] = view(csc.nzval, csc.colptr[i+2]:csc.colptr[sortedcolumns[J+1]]) + #rowval[colptrsortedcolumns[J+1]] + =# + end + + #@info colptr + + resize!(rowval, length(csc.rowval)+numshifts) + resize!(nzval, length(csc.rowval)+numshifts) + + + SparseMatrixCSC(csc.m, csc.n, colptr, rowval, nzval) + + + +end + + +function plus_remap(lnks::Vector{SuperSparseMatrixLNK{Tv, Ti}}, csc::SparseArrays.SparseMatrixCSC, gi::Vector{Vector{Ti}}; keep_zeros=true) where {Tv, 
Ti <: Integer} + nt = length(lnks) + + if keep_zeros + get_col! = get_column_keepzeros! + else + get_col! = get_column_removezeros! + end + lnkscols = vcat([lnks[i].collnk[1:lnks[i].colctr] for i=1:nt]...) + supersparsecolumns = vcat([gi[i][lnks[i].collnk[1:lnks[i].colctr]] for i=1:nt]...) + num_cols = sum([lnks[i].colctr for i=1:nt]) + tids = Vector{Ti}(undef, num_cols) + ctr = 0 + for i=1:nt + for j=1:lnks[i].colctr + ctr += 1 + tids[ctr] = i + end + end + + + sortedcolumnids = sortperm(supersparsecolumns) + sortedcolumns = supersparsecolumns[sortedcolumnids] + sortedcolumns = vcat(sortedcolumns, [Ti(csc.n+1)]) + + coll = sum([lnks[i].nnz for i=1:nt]) + nnz_sum = length(csc.rowval) + coll + colptr = Vector{Ti}(undef, csc.n+1) + rowval = Vector{Ti}(undef, nnz_sum) + nzval = Vector{Tv}(undef, nnz_sum) + colptr[1] = one(Ti) + + if csc.m < coll + coll = csc.m + end + + col = [ColEntry{Tv, Ti}(0, zero(Tv)) for i=1:coll] + numshifts = 0 + + colptr[1:sortedcolumns[1]] = view(csc.colptr, 1:sortedcolumns[1]) + rowval[1:csc.colptr[sortedcolumns[1]]-1] = view(csc.rowval, 1:csc.colptr[sortedcolumns[1]]-1) + nzval[1:csc.colptr[sortedcolumns[1]]-1] = view(csc.nzval, 1:csc.colptr[sortedcolumns[1]]-1) + + J = 1 + i0 = 0 + #lj_last = [] + #tid_last = [] + lj_last = Vector{Ti}(undef, nt) + tid_last = Vector{Ti}(undef, nt) + ctr_last = 1 + gj_last = 0 + for J=1:length(sortedcolumns)-1 + gj_now = sortedcolumns[J] + gj_next = sortedcolumns[J+1] + + lj_last[ctr_last] = lnkscols[sortedcolumnids[J]] + tid_last[ctr_last] = tids[sortedcolumnids[J]] + + if gj_now != gj_next + #@info typeof(lnks) + # do stuff from gj_last to gj_now / from last_lj to J + #@info lj_last, tid_last + coll = get_col!(col, lnks, lj_last, tid_last, ctr_last) + + nns = merge_into!(rowval, nzval, csc, col, gj_now, coll, colptr[gj_now]-one(Ti)) + numshifts += nns + + + colptr[gj_now+1:sortedcolumns[J+1]] = csc.colptr[gj_now+1:sortedcolumns[J+1]].+(-csc.colptr[gj_now]+colptr[gj_now] + nns) + + rowval[colptr[gj_now+1]:colptr[sortedcolumns[J+1]]-1] = view(csc.rowval, csc.colptr[gj_now+1]:csc.colptr[sortedcolumns[J+1]]-1) + nzval[colptr[gj_now+1]:colptr[sortedcolumns[J+1]]-1] = view(csc.nzval, csc.colptr[gj_now+1]:csc.colptr[sortedcolumns[J+1]]-1) + + #rowval[colptr[gj_now+1]:colptr[sortedcolumns[J+1]]-1] = csc.rowval[csc.colptr[gj_now+1]:csc.colptr[sortedcolumns[J+1]]-1] + #nzval[colptr[gj_now+1]:colptr[sortedcolumns[J+1]]-1] = csc.nzval[csc.colptr[gj_now+1]:csc.colptr[sortedcolumns[J+1]]-1] + + + #for k=csc.colptr[gj_now+1]:csc.colptr[sortedcolumns[J+1]]-1 + # k2 = k+(-csc.colptr[gj_now]+colptr[gj_now] + nns) + # rowval[k2] = csc.rowval[k] + # nzval[k2] = csc.nzval[k] + #end + + gj_last = gj_now + ctr_last = 0 #tids[sortedcolumnids[J]]] + end + + ctr_last += 1 + + + end + + + resize!(rowval, length(csc.rowval)+numshifts) + resize!(nzval, length(csc.rowval)+numshifts) + + + SparseArrays.SparseMatrixCSC(csc.m, csc.n, colptr, rowval, nzval) + + + #for ... 
+ # take many columns together if necessary in `get_column` + #end + + + +end + + + +function plus_remap(lnk::SuperSparseMatrixLNK{Tv, Ti}, csc::SparseArrays.SparseMatrixCSC, gi::Vector{Ti}) where {Tv, Ti <: Integer} + + #@info lnk.collnk[1:lnk.colctr] + + + supersparsecolumns = gi[lnk.collnk[1:lnk.colctr]] + sortedcolumnids = sortperm(supersparsecolumns) + sortedcolumns = supersparsecolumns[sortedcolumnids] + #sortedcolumns = vcat([1], sortedcolumns) + #@info typeof(supersparsecolumns), typeof(sortedcolumns) + + sortedcolumns = vcat(sortedcolumns, [Ti(csc.n+1)]) + + #@info typeof(supersparsecolumns), typeof(sortedcolumns) + + + #@info supersparsecolumns + #@info sortedcolumns + #@info lnk.collnk[1:length(sortedcolumns)-1] + #@info lnk.collnk[sortedcolumnids[1:length(sortedcolumns)-1]] + + col = [ColEntry{Tv, Ti}(0, zero(Tv)) for i=1:csc.m] + + #@info sortedcolumnids + + nnz_sum = length(csc.rowval) + lnk.nnz + colptr = Vector{Ti}(undef, csc.n+1) + rowval = Vector{Ti}(undef, nnz_sum) + nzval = Vector{Tv}(undef, nnz_sum) + colptr[1] = one(Ti) + + #first part: columns between 1 and first column of lnk + + colptr[1:sortedcolumns[1]] = view(csc.colptr, 1:sortedcolumns[1]) + rowval[1:csc.colptr[sortedcolumns[1]]-1] = view(csc.rowval, 1:csc.colptr[sortedcolumns[1]]-1) + nzval[1:csc.colptr[sortedcolumns[1]]-1] = view(csc.nzval, 1:csc.colptr[sortedcolumns[1]]-1) + + numshifts = 0 + + for J=1:length(sortedcolumns)-1 + i = sortedcolumns[J] + + coll = get_column!(col, lnk, lnk.collnk[sortedcolumnids[J]] ) + #@info typeof(i), typeof(coll), typeof(colptr), typeof(colptr[i]), typeof(colptr[i]-1) + nns = merge_into!(rowval, nzval, csc, col, i, coll, colptr[i]-one(Ti)) + numshifts += nns + + + colptr[i+1:sortedcolumns[J+1]] = csc.colptr[i+1:sortedcolumns[J+1]].+(-csc.colptr[i]+colptr[i] + nns) + rowval[colptr[i+1]:colptr[sortedcolumns[J+1]]-1] = view(csc.rowval, csc.colptr[i+1]:csc.colptr[sortedcolumns[J+1]]-1) + nzval[colptr[i+1]:colptr[sortedcolumns[J+1]]-1] = view(csc.nzval, csc.colptr[i+1]:csc.colptr[sortedcolumns[J+1]]-1) + + #= + for k=csc.colptr[i+1]:csc.colptr[sortedcolumns[J+1]]-1 + k2 = k+(-csc.colptr[i]+colptr[i] + nns) + rowval[k2] = csc.rowval[k] + nzval[k2] = csc.nzval[k] + end + =# + end + + + resize!(rowval, length(csc.rowval)+numshifts) + resize!(nzval, length(csc.rowval)+numshifts) + + + SparseArrays.SparseMatrixCSC(csc.m, csc.n, colptr, rowval, nzval) + +end + + + + +function plus_loop(lnk::SuperSparseMatrixLNK{Tv, Ti}, csc::SparseArrays.SparseMatrixCSC) where {Tv, Ti <: Integer} + gi = collect(1:csc.n) + + supersparsecolumns = gi[lnk.collnk[1:lnk.colctr]] + sortedcolumnids = sortperm(supersparsecolumns) + sortedcolumns = supersparsecolumns[sortedcolumnids] + #sortedcolumns = vcat([1], sortedcolumns) + sortedcolumns = vcat(sortedcolumns, [csc.n+1]) + + col = [ColEntry{Tv, Ti}(0, zero(Tv)) for i=1:csc.m] + + #@info sortedcolumnids + + nnz_sum = length(csc.rowval) + lnk.nnz + colptr = Vector{Ti}(undef, csc.n+1) + rowval = Vector{Ti}(undef, nnz_sum) + nzval = Vector{Tv}(undef, nnz_sum) + colptr[1] = one(Ti) + + #first part: columns between 1 and first column of lnk + + colptr[1:sortedcolumns[1]] = view(csc.colptr, 1:sortedcolumns[1]) + rowval[1:csc.colptr[sortedcolumns[1]]-1] = view(csc.rowval, 1:csc.colptr[sortedcolumns[1]]-1) + nzval[1:csc.colptr[sortedcolumns[1]]-1] = view(csc.nzval, 1:csc.colptr[sortedcolumns[1]]-1) + + numshifts = 0 + + for J=1:length(sortedcolumns)-1 + i = sortedcolumns[J] + coll = get_column!(col, lnk, i) + + nns = merge_into!(rowval, nzval, csc, col, i, coll, 
colptr[i]-1) + numshifts += nns + + colptr[i+1:sortedcolumns[J+1]] = csc.colptr[i+1:sortedcolumns[J+1]].+(-csc.colptr[i]+colptr[i] + nns) + + + for k=csc.colptr[i+1]:csc.colptr[sortedcolumns[J+1]]-1 + k2 = k+(-csc.colptr[i]+colptr[i] + nns) + rowval[k2] = csc.rowval[k] + nzval[k2] = csc.nzval[k] + end + + + end + + #@info colptr + + resize!(rowval, length(csc.rowval)+numshifts) + resize!(nzval, length(csc.rowval)+numshifts) + + + SparseMatrixCSC(csc.m, csc.n, colptr, rowval, nzval) + + + +end + + + +function twodisjointsets(n, k) + A = rand(1:n, k) + B = zeros(Int64, k) + done = false + ctr = 0 + while ctr != k + v = rand(1:n) + if !(v in A) + ctr += 1 + B[ctr] = v + end + end + + A, B +end + +function distinct(x, n) + y = zeros(typeof(x[1]), n) + ctr = 0 + while ctr != n + v = rand(x) + if !(v in y[1:ctr]) + ctr += 1 + y[ctr] = v + end + end + y +end + + +function mean(x) + sum(x)/length(x) +end + +function form(x) + [minimum(x), mean(x), maximum(x)] +end + + + + + + + + + + + From 4d15e47275db1ac41de6e08b935ea32d3c32e1f2 Mon Sep 17 00:00:00 2001 From: Johannes Taraz Date: Tue, 20 Feb 2024 17:58:45 +0100 Subject: [PATCH 3/9] add Mittal,Al-Kurdi ILU --- src/ExtendableSparse.jl | 2 + src/factorizations/factorizations.jl | 2 + src/factorizations/ilu_Al-Kurdi_Mittal.jl | 136 ++++++++++++++++++++++ src/factorizations/iluam.jl | 35 ++++++ 4 files changed, 175 insertions(+) create mode 100644 src/factorizations/ilu_Al-Kurdi_Mittal.jl create mode 100644 src/factorizations/iluam.jl diff --git a/src/ExtendableSparse.jl b/src/ExtendableSparse.jl index 372df82..2894027 100644 --- a/src/ExtendableSparse.jl +++ b/src/ExtendableSparse.jl @@ -39,6 +39,8 @@ include("matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl") export ExtendableSparseMatrixParallel, SuperSparseMatrixLNK export addtoentry!, reset!, dummy_assembly!, preparatory_multi_ps_less_reverse, fr, addtoentry!, rawupdateindex!, updateindex!, compare_matrices_light +include("factorizations/ilu_Al-Kurdi_Mittal.jl") +using .ILUAM include("factorizations/factorizations.jl") diff --git a/src/factorizations/factorizations.jl b/src/factorizations/factorizations.jl index d278d8b..ead23c5 100644 --- a/src/factorizations/factorizations.jl +++ b/src/factorizations/factorizations.jl @@ -157,6 +157,7 @@ end include("jacobi.jl") include("ilu0.jl") include("iluzero.jl") +include("iluam.jl") include("parallel_jacobi.jl") include("parallel_ilu0.jl") include("sparspak.jl") @@ -165,6 +166,7 @@ include("blockpreconditioner.jl") @eval begin @makefrommatrix ILU0Preconditioner @makefrommatrix ILUZeroPreconditioner + @makefrommatrix ILUAMPreconditioner @makefrommatrix PointBlockILUZeroPreconditioner @makefrommatrix JacobiPreconditioner @makefrommatrix ParallelJacobiPreconditioner diff --git a/src/factorizations/ilu_Al-Kurdi_Mittal.jl b/src/factorizations/ilu_Al-Kurdi_Mittal.jl new file mode 100644 index 0000000..a47bd50 --- /dev/null +++ b/src/factorizations/ilu_Al-Kurdi_Mittal.jl @@ -0,0 +1,136 @@ +module ILUAM +using LinearAlgebra, SparseArrays + +import LinearAlgebra.ldiv!, LinearAlgebra.\, SparseArrays.nnz + + +mutable struct ILUAMPrecon{T,N} + + diag::AbstractVector + nzval::AbstractVector + rowval::AbstractVector + colptr::AbstractVector + +end + +function ILUAMPrecon(A::SparseMatrixCSC{T,N}, b_type=T) where {T,N<:Integer} + n = A.n # number of columns + nzval = copy(A.nzval) + diag = Vector{N}(undef, n) + + ILUAMPrecon{T, N}(diag, copy(A.nzval), copy(A.rowval), copy(A.colptr)) +end + +function iluAM!(LU::ILUAMPrecon{T,N}, A::SparseMatrixCSC{T,N}) 
where {T,N<:Integer} + nzval = LU.nzval + diag = LU.diag + + colptr = LU.colptr + rowval = LU.rowval + n = A.n # number of columns + point = zeros(N, n) #Vector{N}(undef, n) + + # find diagonal entries + for j=1:n + for v=colptr[j]:colptr[j+1]-1 + if rowval[v] == j + diag[j] = v + break + end + #elseif rowval[v] + end + end + + # compute L and U + for j=1:n + for v=colptr[j]:colptr[j+1]-1 ## start at colptr[j]+1 ?? + point[rowval[v]] = v + end + + for v=colptr[j]:diag[j]-1 + i = rowval[v] + #nzval[v] /= nzval[diag[i]] + for w=diag[i]+1:colptr[i+1]-1 + k = point[rowval[w]] + if k>0 + nzval[k] -= nzval[v]*nzval[w] + end + end + end + + for v=diag[j]+1:colptr[j+1]-1 + nzval[v] /= nzval[diag[j]] + end + + + for v=colptr[j]:colptr[j+1]-1 + point[rowval[v]] = zero(N) + end + end +end + +function iluAM(A::SparseMatrixCSC{T,N}) where {T,N<:Integer} + LU = ILUAMPrecon(A::SparseMatrixCSC{T,N}) + iluAM!(LU, A) + LU +end + + +function forward_substitution!(y, ilu::ILUAMPrecon{T,N}, v) where {T,N<:Integer} + n = ilu.A.n + nzval = ilu.nzval + colptr = ilu.colptr + rowval = ilu.rowval + diag = ilu.diag + y .= 0 + @inbounds for j=1:n + y[j] += v[j] + for v=diag[j]+1:colptr[j+1]-1 + y[rowval[v]] -= nzval[v]*y[j] + end + end + y +end + + +function backward_substitution!(x, ilu::ILUAMPrecon{T,N}, y) where {T,N<:Integer} + n = ilu.A.n + nzval = ilu.nzval + colptr = ilu.colptr + rowval = ilu.rowval + diag = ilu.diag + wrk = copy(y) + @inbounds for j=n:-1:1 + x[j] = wrk[j] / nzval[diag[j]] + for i=colptr[j]:diag[j]-1 + wrk[rowval[i]] -= nzval[i]*x[j] + end + end + x +end + +function ldiv!(x, ilu::ILUAMPrecon{T,N}, b) where {T,N<:Integer} + y = copy(b) + forward_substitution!(y, ilu, b) + backward_substitution!(x, ilu, y) + x +end + +function ldiv!(ilu::ILUAMPrecon{T,N}, b) where {T,N<:Integer} + y = copy(b) + forward_substitution!(y, ilu, b) + backward_substitution!(b, ilu, y) + b +end + +function \(ilu::ILUAMPrecon{T,N}, b) where {T,N<:Integer} + x = copy(b) + ldiv!(x, ilu, b) +end + +function nnz(ilu::ILUAMPrecon{T,N}) where {T,N<:Integer} + length(ilu.nzval) +end + + +end \ No newline at end of file diff --git a/src/factorizations/iluam.jl b/src/factorizations/iluam.jl new file mode 100644 index 0000000..5d65e40 --- /dev/null +++ b/src/factorizations/iluam.jl @@ -0,0 +1,35 @@ +mutable struct ILUAMPreconditioner <: AbstractPreconditioner + A::ExtendableSparseMatrix + factorization::ILUAM.ILUAMPrecon + phash::UInt64 + function ILUAMPreconditioner() + p = new() + p.phash = 0 + p + end +end + +""" +``` +ILUAMPreconditioner() +ILUAMPreconditioner(matrix) +``` +Incomplete LU preconditioner with zero fill-in using ... . This preconditioner +also calculates and stores updates to the off-diagonal entries and thus delivers better convergence than the [`ILU0Preconditioner`](@ref). 
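+
+A usage sketch of the intended API (the `fdrand` test matrix helper is exported by this
+package; sizes are illustrative):
+
+```
+A = fdrand(20, 20)                  # small finite-difference-like test matrix
+p = ILUAMPreconditioner(A)
+b = ones(size(A, 1))
+x = similar(b)
+ldiv!(x, p, b)                      # apply the preconditioner once
+```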
+""" +function ILUAMPreconditioner end + +function update!(p::ILUAMPreconditioner) + flush!(p.A) + if p.A.phash != p.phash + p.factorization = ILUAM.iluAM(p.A.cscmatrix) + p.phash=p.A.phash + else + ILUAM.ilu0!(p.factorization, p.A.cscmatrix) + end + p +end + +allow_views(::ILUAMPreconditioner)=true +allow_views(::Type{ILUAMPreconditioner})=true + From 4e0e30958e8f7e51606e4b6d76b71f5ec8761988 Mon Sep 17 00:00:00 2001 From: Johannes Taraz Date: Fri, 23 Feb 2024 12:47:51 +0100 Subject: [PATCH 4/9] implement ILU (sequential and parallel) based on Mittal and Al-Kurdi --- Project.toml | 6 +- src/ExtendableSparse.jl | 33 ++- src/factorizations/factorizations.jl | 119 +++++--- src/factorizations/ilu_Al-Kurdi_Mittal.jl | 176 ++++++++---- src/factorizations/ilu_Al-Kurdi_Mittal_0.jl | 146 ++++++++++ src/factorizations/ilu_Al-Kurdi_Mittal_1.jl | 229 +++++++++++++++ src/factorizations/iluam.jl | 7 +- src/factorizations/pilu_Al-Kurdi_Mittal.jl | 270 ++++++++++++++++++ src/factorizations/piluam.jl | 36 +++ .../ExtendableSparseParallel.jl | 34 ++- .../struct_flush.jl | 2 +- 11 files changed, 941 insertions(+), 117 deletions(-) create mode 100644 src/factorizations/ilu_Al-Kurdi_Mittal_0.jl create mode 100644 src/factorizations/ilu_Al-Kurdi_Mittal_1.jl create mode 100644 src/factorizations/pilu_Al-Kurdi_Mittal.jl create mode 100644 src/factorizations/piluam.jl diff --git a/Project.toml b/Project.toml index 46d6e61..8776054 100644 --- a/Project.toml +++ b/Project.toml @@ -1,13 +1,15 @@ name = "ExtendableSparse" uuid = "95c220a8-a1cf-11e9-0c77-dbfce5f500b3" authors = ["Juergen Fuhrmann "] -version = "1.4" +version = "1.4.0" [deps] AMGCLWrap = "4f76b812-4ba5-496d-b042-d70715554288" DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +ExtendableGrids = "cfc395e8-590f-11e8-1f13-43a2532b2fa8" ILUZero = "88f59080-6952-5380-9ea5-54057fb9a43f" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Requires = "ae029012-a4dd-5104-9daa-d747884805df" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" @@ -15,8 +17,6 @@ Sparspak = "e56a9233-b9d6-4f03-8d0f-1825330902ac" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" SuiteSparse = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -Metis = "2679e427-3c69-5b7f-982b-ece356f1e94b" -ExtendableGrids = "cfc395e8-590f-11e8-1f13-43a2532b2fa8" [weakdeps] AMGCLWrap = "4f76b812-4ba5-496d-b042-d70715554288" diff --git a/src/ExtendableSparse.jl b/src/ExtendableSparse.jl index 2894027..285d0c2 100644 --- a/src/ExtendableSparse.jl +++ b/src/ExtendableSparse.jl @@ -28,25 +28,39 @@ include("matrix/sparsematrixcsc.jl") include("matrix/sparsematrixlnk.jl") include("matrix/extendable.jl") -export SparseMatrixLNK, - ExtendableSparseMatrix, flush!, nnz, updateindex!, rawupdateindex!, colptrs, sparse +export SparseMatrixLNK, ExtendableSparseMatrix, flush!, nnz, updateindex!, rawupdateindex!, colptrs, sparse export eliminate_dirichlet, eliminate_dirichlet!, mark_dirichlet +@warn "ESMP!" 
include("matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl") -export ExtendableSparseMatrixParallel, SuperSparseMatrixLNK -export addtoentry!, reset!, dummy_assembly!, preparatory_multi_ps_less_reverse, fr, addtoentry!, rawupdateindex!, updateindex!, compare_matrices_light include("factorizations/ilu_Al-Kurdi_Mittal.jl") -using .ILUAM - +#using .ILUAM +include("factorizations/pilu_Al-Kurdi_Mittal.jl") +#using .PILUAM include("factorizations/factorizations.jl") +include("factorizations/simple_iteration.jl") +export simple, simple! + +include("matrix/sprand.jl") +export sprand!, sprand_sdd!, fdrand, fdrand!, fdrand_coo, solverbenchmark + + + + +export ExtendableSparseMatrixParallel, SuperSparseMatrixLNK +export addtoentry!, reset!, dummy_assembly!, preparatory_multi_ps_less_reverse, fr, addtoentry!, rawupdateindex!, updateindex!, compare_matrices_light + + export JacobiPreconditioner, ILU0Preconditioner, ILUZeroPreconditioner, + ILUAMPreconditioner, + PILUAMPreconditioner, PointBlockILUZeroPreconditioner, ParallelJacobiPreconditioner, ParallelILU0Preconditioner, @@ -57,13 +71,6 @@ export AbstractFactorization, LUFactorization, CholeskyFactorization, SparspakLU export issolver export factorize!, update! -include("factorizations/simple_iteration.jl") -export simple, simple! - -include("matrix/sprand.jl") -export sprand!, sprand_sdd!, fdrand, fdrand!, fdrand_coo, solverbenchmark - - @static if !isdefined(Base, :get_extension) function __init__() @require Pardiso = "46dd5b70-b6fb-5a00-ae2d-e8fea33afaf2" begin diff --git a/src/factorizations/factorizations.jl b/src/factorizations/factorizations.jl index ead23c5..2d56fce 100644 --- a/src/factorizations/factorizations.jl +++ b/src/factorizations/factorizations.jl @@ -51,6 +51,52 @@ Determine if factorization is a solver or not issolver(::AbstractLUFactorization) = true issolver(::AbstractPreconditioner) = false + + +"""" + @makefrommatrix(fact) + +For an AbstractFactorization `MyFact`, provide methods +``` + MyFact(A::ExtendableSparseMatrix; kwargs...) + MyFact(A::SparseMatrixCSC; kwargs...) +``` +""" +macro makefrommatrix(fact) + return quote + function $(esc(fact))(A::ExtendableSparseMatrix; kwargs...) + factorize!($(esc(fact))(;kwargs...), A) + end + function $(esc(fact))(A::SparseMatrixCSC; kwargs...) + $(esc(fact))(ExtendableSparseMatrix(A); kwargs...) 
+ end + end +end + +include("ilu0.jl") +include("iluzero.jl") +include("iluam.jl") +include("piluam.jl") +include("parallel_jacobi.jl") +include("parallel_ilu0.jl") +include("sparspak.jl") +include("blockpreconditioner.jl") +include("jacobi.jl") + +@eval begin + @makefrommatrix ILU0Preconditioner + @makefrommatrix ILUZeroPreconditioner + @makefrommatrix ILUAMPreconditioner + @makefrommatrix PILUAMPreconditioner + @makefrommatrix PointBlockILUZeroPreconditioner + @makefrommatrix JacobiPreconditioner + @makefrommatrix ParallelJacobiPreconditioner + @makefrommatrix ParallelILU0Preconditioner + @makefrommatrix SparspakLU + @makefrommatrix UpdateteableBlockpreconditioner + @makefrommatrix BlockPreconditioner +end + """ ``` factorize!(factorization, matrix) @@ -65,8 +111,40 @@ function factorize!(p::AbstractFactorization, A::ExtendableSparseMatrix) p end +function factorize!(p::PILUAMPreconditioner, A::ExtendableSparseMatrixParallel) + p.A = A + update!(p) + p +end + +#function factorize!(p::AbstractFactorization, A::ExtendableSparseMatrixParallel) +# p.A = A +# update!(p) +# p +#end + +#factorize!(p::AbstractFactorization, A::ExtendableSparseMatrixParallel)=factorize!(p,ExtendableSparseMatrix(A.cscmatrix)) + +#factorize!(p::PILUAMPrecon, A::ExtendableSparseMatrixParallel)=factorize!(p,ExtendableSparseMatrix(A.cscmatrix)) factorize!(p::AbstractFactorization, A::SparseMatrixCSC)=factorize!(p,ExtendableSparseMatrix(A)) + +#function factorize!(p::PILUAMPrecon, A::ExtendableSparseMatrixParallel) +# factorize!(p, A) +#end + +#function factorize!(p::AbstractFactorization, A::ExtendableSparseMatrixParallel) +# factorize!(p, A.cscmatrix) +#end + + +#function factorize!(p::AbstractFactorization, A::ExtendableSparseMatrix) +# factorize!(p, A.cscmatrix) +#end + + +#factorize!(p::PILUAMPrecon, A::ExtendableSparseMatrixParallel)=factorize!(p,A) + """ ``` lu!(factorization, matrix) @@ -134,47 +212,6 @@ LinearAlgebra.ldiv!(fact::AbstractFactorization, v) = ldiv!(fact.factorization, -"""" - @makefrommatrix(fact) - -For an AbstractFactorization `MyFact`, provide methods -``` - MyFact(A::ExtendableSparseMatrix; kwargs...) - MyFact(A::SparseMatrixCSC; kwargs...) -``` -""" -macro makefrommatrix(fact) - return quote - function $(esc(fact))(A::ExtendableSparseMatrix; kwargs...) - factorize!($(esc(fact))(;kwargs...), A) - end - function $(esc(fact))(A::SparseMatrixCSC; kwargs...) - $(esc(fact))(ExtendableSparseMatrix(A); kwargs...) 
- end - end -end - -include("jacobi.jl") -include("ilu0.jl") -include("iluzero.jl") -include("iluam.jl") -include("parallel_jacobi.jl") -include("parallel_ilu0.jl") -include("sparspak.jl") -include("blockpreconditioner.jl") - -@eval begin - @makefrommatrix ILU0Preconditioner - @makefrommatrix ILUZeroPreconditioner - @makefrommatrix ILUAMPreconditioner - @makefrommatrix PointBlockILUZeroPreconditioner - @makefrommatrix JacobiPreconditioner - @makefrommatrix ParallelJacobiPreconditioner - @makefrommatrix ParallelILU0Preconditioner - @makefrommatrix SparspakLU - @makefrommatrix UpdateteableBlockpreconditioner - @makefrommatrix BlockPreconditioner -end if USE_GPL_LIBS #requires SuiteSparse which is not available in non-GPL builds diff --git a/src/factorizations/ilu_Al-Kurdi_Mittal.jl b/src/factorizations/ilu_Al-Kurdi_Mittal.jl index a47bd50..97bb9a8 100644 --- a/src/factorizations/ilu_Al-Kurdi_Mittal.jl +++ b/src/factorizations/ilu_Al-Kurdi_Mittal.jl @@ -1,34 +1,27 @@ -module ILUAM -using LinearAlgebra, SparseArrays +#module ILUAM +#using LinearAlgebra, SparseArrays import LinearAlgebra.ldiv!, LinearAlgebra.\, SparseArrays.nnz +@info "ILUAM" mutable struct ILUAMPrecon{T,N} diag::AbstractVector nzval::AbstractVector - rowval::AbstractVector - colptr::AbstractVector + A::AbstractMatrix end -function ILUAMPrecon(A::SparseMatrixCSC{T,N}, b_type=T) where {T,N<:Integer} - n = A.n # number of columns - nzval = copy(A.nzval) - diag = Vector{N}(undef, n) - - ILUAMPrecon{T, N}(diag, copy(A.nzval), copy(A.rowval), copy(A.colptr)) -end - -function iluAM!(LU::ILUAMPrecon{T,N}, A::SparseMatrixCSC{T,N}) where {T,N<:Integer} - nzval = LU.nzval - diag = LU.diag - - colptr = LU.colptr - rowval = LU.rowval - n = A.n # number of columns - point = zeros(N, n) #Vector{N}(undef, n) +function iluAM(A::SparseMatrixCSC{Tv,Ti}) where {Tv, Ti <:Integer} + @info "iluAM" + nzval = copy(A.nzval) + colptr = A.colptr + rowval = A.rowval + #nzval = ILU.nzval + n = A.n # number of columns + point = zeros(Ti, n) #Vector{Ti}(undef, n) + diag = Vector{Ti}(undef, n) # find diagonal entries for j=1:n @@ -64,25 +57,23 @@ function iluAM!(LU::ILUAMPrecon{T,N}, A::SparseMatrixCSC{T,N}) where {T,N<:Integ for v=colptr[j]:colptr[j+1]-1 - point[rowval[v]] = zero(N) + point[rowval[v]] = zero(Ti) end end + #nzval, diag + ILUAMPrecon{Tv,Ti}(diag, nzval, A) end -function iluAM(A::SparseMatrixCSC{T,N}) where {T,N<:Integer} - LU = ILUAMPrecon(A::SparseMatrixCSC{T,N}) - iluAM!(LU, A) - LU -end - +function forward_subst_old!(y, v, nzval, diag, A) + n = A.n + colptr = A.colptr + rowval = A.rowval + + for i in eachindex(y) + y[i] = zero(Float64) + end -function forward_substitution!(y, ilu::ILUAMPrecon{T,N}, v) where {T,N<:Integer} - n = ilu.A.n - nzval = ilu.nzval - colptr = ilu.colptr - rowval = ilu.rowval - diag = ilu.diag - y .= 0 + #y .= 0 @inbounds for j=1:n y[j] += v[j] for v=diag[j]+1:colptr[j+1]-1 @@ -93,44 +84,119 @@ function forward_substitution!(y, ilu::ILUAMPrecon{T,N}, v) where {T,N<:Integer} end -function backward_substitution!(x, ilu::ILUAMPrecon{T,N}, y) where {T,N<:Integer} - n = ilu.A.n - nzval = ilu.nzval - colptr = ilu.colptr - rowval = ilu.rowval - diag = ilu.diag - wrk = copy(y) +function backward_subst_old!(x, y, nzval, diag, A) + n = A.n + colptr = A.colptr + rowval = A.rowval @inbounds for j=n:-1:1 - x[j] = wrk[j] / nzval[diag[j]] + x[j] = y[j] / nzval[diag[j]] + for i=colptr[j]:diag[j]-1 - wrk[rowval[i]] -= nzval[i]*x[j] + y[rowval[i]] -= nzval[i]*x[j] end + end - x + x end -function ldiv!(x, ilu::ILUAMPrecon{T,N}, b) where 
{T,N<:Integer} - y = copy(b) - forward_substitution!(y, ilu, b) - backward_substitution!(x, ilu, y) - x +function ldiv!(x, ILU::ILUAMPrecon, b) + nzval = ILU.nzval + diag = ILU.diag + A = ILU.A + y = copy(b) + #forward_subst!(y, b, ILU) + forward_subst_old!(y, b, nzval, diag, A) + backward_subst_old!(x, y, nzval, diag, A) + x end -function ldiv!(ilu::ILUAMPrecon{T,N}, b) where {T,N<:Integer} - y = copy(b) - forward_substitution!(y, ilu, b) - backward_substitution!(b, ilu, y) - b +function ldiv!(ILU::ILUAMPrecon, b) + nzval = ILU.nzval + diag = ILU.diag + A = ILU.A + y = copy(b) + #forward_subst!(y, b, ILU) + forward_subst_old!(y, b, nzval, diag, A) + backward_subst_old!(b, y, nzval, diag, A) + b end function \(ilu::ILUAMPrecon{T,N}, b) where {T,N<:Integer} x = copy(b) ldiv!(x, ilu, b) + x end function nnz(ilu::ILUAMPrecon{T,N}) where {T,N<:Integer} length(ilu.nzval) end +#= +function forward_subst!(y, v, ilu) #::ILUAMPrecon{T,N}) where {T,N<:Integer} + @info "fw" + n = ilu.A.n + nzval = ilu.nzval + diag = ilu.diag + colptr = ilu.A.colptr + rowval = ilu.A.rowval + + for i in eachindex(y) + y[i] = zero(Float64) + end + + #y .= 0 + @inbounds for j=1:n + y[j] += v[j] + for v=diag[j]+1:colptr[j+1]-1 + y[rowval[v]] -= nzval[v]*y[j] + end + end + y +end + +function backward_subst!(x, y, ilu) #::ILUAMPrecon{T,N}) where {T,N<:Integer} + @info "bw" + n = ilu.A.n + nzval = ilu.nzval + diag = ilu.diag + colptr = ilu.A.colptr + rowval = ilu.A.rowval + #wrk = copy(y) + @inbounds for j=n:-1:1 + x[j] = y[j] / nzval[diag[j]] + + for i=colptr[j]:diag[j]-1 + y[rowval[i]] -= nzval[i]*x[j] + end + + end + x +end + +function iluam_subst(ILU::ILUAMPrecon, b) + y = copy(b) + forward_subst!(y, b, ILU) + z = copy(b) + backward_subst!(z, y, ILU) + z +end + + + +function iluam_subst_old(ILU::ILUAMPrecon, b) + nzval = ILU.nzval + diag = ILU.diag + A = ILU.A + y = copy(b) + #forward_subst!(y, b, ILU) + forward_subst_old!(y, b, nzval, diag, A) + z = copy(b) + backward_subst_old!(z, y, nzval, diag, A) + #backward_subst!(z, y, ILU) + z +end +=# + + -end \ No newline at end of file +#end \ No newline at end of file diff --git a/src/factorizations/ilu_Al-Kurdi_Mittal_0.jl b/src/factorizations/ilu_Al-Kurdi_Mittal_0.jl new file mode 100644 index 0000000..26f9788 --- /dev/null +++ b/src/factorizations/ilu_Al-Kurdi_Mittal_0.jl @@ -0,0 +1,146 @@ +module ILUAM +using LinearAlgebra, SparseArrays + +import LinearAlgebra.ldiv!, LinearAlgebra.\, SparseArrays.nnz + + +mutable struct ILUAMPrecon{T,N} + + diag::AbstractVector + nzval::AbstractVector + rowval::AbstractVector + colptr::AbstractVector + +end + +function ILUAMPrecon(A::SparseMatrixCSC{T,N}, b_type=T) where {T,N<:Integer} + @info "ILUAMPrecon" + n = A.n # number of columns + nzval = copy(A.nzval) + diag = Vector{N}(undef, n) + + ILUAMPrecon{T, N}(diag, copy(A.nzval), copy(A.rowval), copy(A.colptr)) +end + +function iluAM!(LU::ILUAMPrecon{T,N}, A::SparseMatrixCSC{T,N}) where {T,N<:Integer} + @info "iluAM!" + nzval = LU.nzval + diag = LU.diag + + colptr = LU.colptr + rowval = LU.rowval + n = A.n # number of columns + point = zeros(N, n) #Vector{N}(undef, n) + + t = zeros(5) + + # find diagonal entries + t[1] = @elapsed for j=1:n + for v=colptr[j]:colptr[j+1]-1 + if rowval[v] == j + diag[j] = v + break + end + #elseif rowval[v] + end + end + + # compute L and U + for j=1:n + t[2] += @elapsed for v=colptr[j]:colptr[j+1]-1 ## start at colptr[j]+1 ?? 
+ point[rowval[v]] = v + end + + t[3] += @elapsed for v=colptr[j]:diag[j]-1 + i = rowval[v] + #nzval[v] /= nzval[diag[i]] + for w=diag[i]+1:colptr[i+1]-1 + k = point[rowval[w]] + if k>0 + nzval[k] -= nzval[v]*nzval[w] + end + end + end + + t[4] += @elapsed for v=diag[j]+1:colptr[j+1]-1 + nzval[v] /= nzval[diag[j]] + end + + + t[5] += @elapsed for v=colptr[j]:colptr[j+1]-1 + point[rowval[v]] = zero(N) + end + end + t +end + +function iluAM(A::SparseMatrixCSC{T,N}) where {T,N<:Integer} + t = zeros(6) + t[1] = @elapsed (LU = ILUAMPrecon(A::SparseMatrixCSC{T,N})) + t[2:6] = iluAM!(LU, A) + @info t + LU +end + + +function forward_substitution!(y, ilu::ILUAMPrecon{T,N}, v) where {T,N<:Integer} + n = ilu.A.n + nzval = ilu.nzval + colptr = ilu.colptr + rowval = ilu.rowval + diag = ilu.diag + y .= 0 + @inbounds for j=1:n + y[j] += v[j] + for v=diag[j]+1:colptr[j+1]-1 + y[rowval[v]] -= nzval[v]*y[j] + end + end + y +end + + +function backward_substitution!(x, ilu::ILUAMPrecon{T,N}, y) where {T,N<:Integer} + n = ilu.A.n + nzval = ilu.nzval + colptr = ilu.colptr + rowval = ilu.rowval + diag = ilu.diag + wrk = copy(y) + @inbounds for j=n:-1:1 + x[j] = wrk[j] / nzval[diag[j]] + for i=colptr[j]:diag[j]-1 + wrk[rowval[i]] -= nzval[i]*x[j] + end + end + x +end + +function ldiv!(x, ilu::ILUAMPrecon{T,N}, b) where {T,N<:Integer} + @info "AM ldiv1" + y = copy(b) + forward_substitution!(y, ilu, b) + backward_substitution!(x, ilu, y) + x +end + +function ldiv!(ilu::ILUAMPrecon{T,N}, b) where {T,N<:Integer} + @info "AM ldiv2" + y = copy(b) + forward_substitution!(y, ilu, b) + backward_substitution!(b, ilu, y) + b +end + +function \(ilu::ILUAMPrecon{T,N}, b) where {T,N<:Integer} + @info "AM bs " + x = copy(b) + ldiv!(x, ilu, b) +end + +function nnz(ilu::ILUAMPrecon{T,N}) where {T,N<:Integer} + length(ilu.nzval) +end + + +end \ No newline at end of file diff --git a/src/factorizations/ilu_Al-Kurdi_Mittal_1.jl b/src/factorizations/ilu_Al-Kurdi_Mittal_1.jl new file mode 100644 index 0000000..a599094 --- /dev/null +++ b/src/factorizations/ilu_Al-Kurdi_Mittal_1.jl @@ -0,0 +1,229 @@ +module ILUAM +using LinearAlgebra, SparseArrays + +#import LinearAlgebra.ldiv!, LinearAlgebra.\, SparseArrays.nnz + +@info "ILUAM" + +mutable struct ILUAMPrecon{T,N} + + diag::AbstractVector + nzval::AbstractVector + A::AbstractMatrix + +end + +function ILUAMPrecon(A::SparseMatrixCSC{T,N}, b_type=T) where {T,N<:Integer} + @info "ILUAMPrecon" + n = A.n # number of columns + nzval = copy(A.nzval) + diag = Vector{N}(undef, n) + + ILUAMPrecon{T, N}(diag, copy(A.nzval), A) +end + + + +function iluAM!(LU::ILUAMPrecon{T,N}, A::SparseMatrixCSC{T,N}) where {T,N<:Integer} + @info "iluAM!" + nzval = LU.nzval + diag = LU.diag + + colptr = LU.A.colptr + rowval = LU.A.rowval + n = A.n # number of columns + point = zeros(N, n) #Vector{N}(undef, n) + + t = zeros(5) + + # find diagonal entries + t[1] = @elapsed for j=1:n + for v=colptr[j]:colptr[j+1]-1 + if rowval[v] == j + diag[j] = v + break + end + #elseif rowval[v] + end + end + + # compute L and U + for j=1:n + t[2] += @elapsed for v=colptr[j]:colptr[j+1]-1 ## start at colptr[j]+1 ?? 
+ point[rowval[v]] = v + end + + t[3] += @elapsed for v=colptr[j]:diag[j]-1 + i = rowval[v] + #nzval[v] /= nzval[diag[i]] + for w=diag[i]+1:colptr[i+1]-1 + k = point[rowval[w]] + if k>0 + nzval[k] -= nzval[v]*nzval[w] + end + end + end + + t[4] += @elapsed for v=diag[j]+1:colptr[j+1]-1 + nzval[v] /= nzval[diag[j]] + end + + + t[5] += @elapsed for v=colptr[j]:colptr[j+1]-1 + point[rowval[v]] = zero(N) + end + end + t +end + + +function iluAM(A::SparseMatrixCSC{Tv,Ti}) where {Tv, Ti <:Integer} + @info "iluAM" + nzval = copy(A.nzval) + colptr = A.colptr + rowval = A.rowval + #nzval = ILU.nzval + n = A.n # number of columns + point = zeros(Ti, n) #Vector{Ti}(undef, n) + diag = Vector{Ti}(undef, n) + + # find diagonal entries + for j=1:n + for v=colptr[j]:colptr[j+1]-1 + if rowval[v] == j + diag[j] = v + break + end + #elseif rowval[v] + end + end + + # compute L and U + for j=1:n + for v=colptr[j]:colptr[j+1]-1 ## start at colptr[j]+1 ?? + point[rowval[v]] = v + end + + for v=colptr[j]:diag[j]-1 + i = rowval[v] + #nzval[v] /= nzval[diag[i]] + for w=diag[i]+1:colptr[i+1]-1 + k = point[rowval[w]] + if k>0 + nzval[k] -= nzval[v]*nzval[w] + end + end + end + + for v=diag[j]+1:colptr[j+1]-1 + nzval[v] /= nzval[diag[j]] + end + + + for v=colptr[j]:colptr[j+1]-1 + point[rowval[v]] = zero(Ti) + end + end + #nzval, diag + ILUAMPrecon{Tv,Ti}(diag, nzval, A) +end + +#function iluAM(A::SparseMatrixCSC{T,N}) where {T,N<:Integer} +# t = zeros(6) +# t[1] = @elapsed (LU = ILUAMPrecon(A::SparseMatrixCSC{T,N})) +# t[2:6] = iluAM!(LU, A) +# @info t +# LU +#end + + +function forward_substitution!(y, ilu::ILUAMPrecon{T,N}, v) where {T,N<:Integer} + n = ilu.A.n + nzval = ilu.nzval + colptr = ilu.A.colptr + rowval = ilu.A.rowval + diag = ilu.diag + y .= 0 + @inbounds for j=1:n + y[j] += v[j] + for v=diag[j]+1:colptr[j+1]-1 + y[rowval[v]] -= nzval[v]*y[j] + end + end + y +end + + +function backward_substitution!(x, ilu::ILUAMPrecon{T,N}, y) where {T,N<:Integer} + n = ilu.A.n + nzval = ilu.nzval + colptr = ilu.A.colptr + rowval = ilu.A.rowval + diag = ilu.diag + wrk = copy(y) + @inbounds for j=n:-1:1 + x[j] = wrk[j] / nzval[diag[j]] + for i=colptr[j]:diag[j]-1 + wrk[rowval[i]] -= nzval[i]*x[j] + end + end + x +end + +function ldiv_new!(x, ilu, v) + + n = ilu.A.n + y = Vector{Float64}(undef, n) + y .= 0 + nzval = ilu.nzval + colptr = ilu.A.colptr + rowval = ilu.A.rowval + diag = ilu.diag + #forward + @inbounds for j=1:n + y[j] += v[j] + for v=diag[j]+1:colptr[j+1]-1 + y[rowval[v]] -= nzval[v]*y[j] + end + end + + #backward + wrk = copy(y) + @inbounds for j=n:-1:1 + x[j] = wrk[j] / nzval[diag[j]] + for i=colptr[j]:diag[j]-1 + wrk[rowval[i]] -= nzval[i]*x[j] + end + end + x +end + +function ldiv!(x, ilu::ILUAMPrecon{T,N}, b) where {T,N<:Integer} + #@info "AM ldiv1" + y = copy(b) + forward_substitution!(y, ilu, b) + backward_substitution!(x, ilu, y) + x +end + +function ldiv!(ilu::ILUAMPrecon{T,N}, b) where {T,N<:Integer} + @info "AM ldiv2" + y = copy(b) + forward_substitution!(y, ilu, b) + backward_substitution!(b, ilu, y) + b +end + +function \(ilu::ILUAMPrecon{T,N}, b) where {T,N<:Integer} + @info "AM bs " + x = copy(b) + ldiv!(x, ilu, b) + x +end + +function nnz(ilu::ILUAMPrecon{T,N}) where {T,N<:Integer} + length(ilu.nzval) +end + + +end \ No newline at end of file diff --git a/src/factorizations/iluam.jl b/src/factorizations/iluam.jl index 5d65e40..a4aed06 100644 --- a/src/factorizations/iluam.jl +++ b/src/factorizations/iluam.jl @@ -1,6 +1,6 @@ mutable struct ILUAMPreconditioner <: AbstractPreconditioner 
A::ExtendableSparseMatrix - factorization::ILUAM.ILUAMPrecon + factorization::ILUAMPrecon phash::UInt64 function ILUAMPreconditioner() p = new() @@ -22,10 +22,11 @@ function ILUAMPreconditioner end function update!(p::ILUAMPreconditioner) flush!(p.A) if p.A.phash != p.phash - p.factorization = ILUAM.iluAM(p.A.cscmatrix) + p.factorization = iluAM(p.A.cscmatrix) p.phash=p.A.phash else - ILUAM.ilu0!(p.factorization, p.A.cscmatrix) + @warn "fuck?" + ilu0!(p.factorization, p.A.cscmatrix) end p end diff --git a/src/factorizations/pilu_Al-Kurdi_Mittal.jl b/src/factorizations/pilu_Al-Kurdi_Mittal.jl new file mode 100644 index 0000000..a1ef818 --- /dev/null +++ b/src/factorizations/pilu_Al-Kurdi_Mittal.jl @@ -0,0 +1,270 @@ +#module PILUAM +#using Base.Threads +#using LinearAlgebra, SparseArrays + +import LinearAlgebra.ldiv!, LinearAlgebra.\, SparseArrays.nnz + +@info "PILUAM" + +mutable struct PILUAMPrecon{T,N} + + diag::AbstractVector + nzval::AbstractVector + A::AbstractMatrix + start::AbstractVector + nt::Integer + depth::Integer + +end + +function use_vector_par(n, nt, Ti) + point = [Vector{Ti}(undef, n) for tid=1:nt] + @threads for tid=1:nt + point[tid] = zeros(Ti, n) + end + point +end + +function compute_lu!(nzval, point, j0, j1, tid, rowval, colptr, diag, Ti) + for j=j0:j1-1 + for v=colptr[j]:colptr[j+1]-1 + point[tid][rowval[v]] = v + end + + for v=colptr[j]:diag[j]-1 + i = rowval[v] + for w=diag[i]+1:colptr[i+1]-1 + k = point[tid][rowval[w]] + if k>0 + nzval[k] -= nzval[v]*nzval[w] + end + end + end + + for v=diag[j]+1:colptr[j+1]-1 + nzval[v] /= nzval[diag[j]] + end + + for v=colptr[j]:colptr[j+1]-1 + point[tid][rowval[v]] = zero(Ti) + end + end +end + +function piluAM(A::ExtendableSparseMatrixParallel{Tv,Ti}) where {Tv, Ti <:Integer} + start = A.start + nt = A.nt + depth = A.depth + + colptr = A.cscmatrix.colptr + rowval = A.cscmatrix.rowval + nzval = Vector{Tv}(undef, length(rowval)) #copy(A.nzval) + n = A.cscmatrix.n # number of columns + diag = Vector{Ti}(undef, n) + point = use_vector_par(n, A.nt, Int32) + + # find diagonal entries + # + @threads for tid=1:depth*nt+1 + for j=start[tid]:start[tid+1]-1 + for v=colptr[j]:colptr[j+1]-1 + nzval[v] = A.cscmatrix.nzval[v] + if rowval[v] == j + diag[j] = v + end + #elseif rowval[v] + end + end + end + + #= + @info "piluAM" + nzval = copy(A.cscmatrix.nzval) + colptr = A.cscmatrix.colptr + rowval = A.cscmatrix.rowval + #nzval = ILU.nzval + n = A.n # number of columns + diag = Vector{Ti}(undef, n) + start = A.start + nt = A.nt + depth = A.depth + point = use_vector_par(n, nt, Ti) + + # find diagonal entries + @threads for tid=1:depth*nt+1 + for j=start[tid]:start[tid+1]-1 + for v=colptr[j]:colptr[j+1]-1 + if rowval[v] == j + diag[j] = v + break + end + #elseif rowval[v] + end + end + end + + # compute L and U + for level=1:depth + @threads for tid=1:nt + compute_lu!(nzval, point, start[(level-1)*nt+tid], start[(level-1)*nt+tid+1], tid, rowval, colptr, diag, Ti) + end + end + + compute_lu!(nzval, point, start[depth*nt+1], start[depth*nt+2], 1, rowval, colptr, diag, Ti) + =# + + for level=1:depth + @threads for tid=1:nt + for j=start[(level-1)*nt+tid]:start[(level-1)*nt+tid+1]-1 + for v=colptr[j]:colptr[j+1]-1 + point[tid][rowval[v]] = v + end + + for v=colptr[j]:diag[j]-1 + i = rowval[v] + for w=diag[i]+1:colptr[i+1]-1 + k = point[tid][rowval[w]] + if k>0 + nzval[k] -= nzval[v]*nzval[w] + end + end + end + + for v=diag[j]+1:colptr[j+1]-1 + nzval[v] /= nzval[diag[j]] + end + + for v=colptr[j]:colptr[j+1]-1 + point[tid][rowval[v]] = zero(Ti) + 
end + end + end + end + + #point = zeros(Ti, n) #Vector{Ti}(undef, n) + for j=start[depth*nt+1]:start[depth*nt+2]-1 + for v=colptr[j]:colptr[j+1]-1 + point[1][rowval[v]] = v + end + + for v=colptr[j]:diag[j]-1 + i = rowval[v] + for w=diag[i]+1:colptr[i+1]-1 + k = point[1][rowval[w]] + if k>0 + nzval[k] -= nzval[v]*nzval[w] + end + end + end + + for v=diag[j]+1:colptr[j+1]-1 + nzval[v] /= nzval[diag[j]] + end + + for v=colptr[j]:colptr[j+1]-1 + point[1][rowval[v]] = zero(Ti) + end + end + + #nzval, diag + PILUAMPrecon{Tv,Ti}(diag, nzval, A.cscmatrix, start, nt, depth) +end + +function forward_subst_old!(y, v, nzval, diag, start, nt, depth, A) + #@info "fwo" + n = A.n + colptr = A.colptr + rowval = A.rowval + + y .= 0 + + for level=1:depth + @threads for tid=1:nt + @inbounds for j=start[(level-1)*nt+tid]:start[(level-1)*nt+tid+1]-1 + y[j] += v[j] + for v=diag[j]+1:colptr[j+1]-1 + y[rowval[v]] -= nzval[v]*y[j] + end + end + end + end + + @inbounds for j=start[depth*nt+1]:start[depth*nt+2]-1 + y[j] += v[j] + for v=diag[j]+1:colptr[j+1]-1 + y[rowval[v]] -= nzval[v]*y[j] + end + end + +end + + +function backward_subst_old!(x, y, nzval, diag, start, nt, depth, A) + #@info "bwo" + n = A.n + colptr = A.colptr + rowval = A.rowval + #wrk = copy(y) + + + @inbounds for j=start[depth*nt+2]-1:-1:start[depth*nt+1] + x[j] = y[j] / nzval[diag[j]] + + for i=colptr[j]:diag[j]-1 + y[rowval[i]] -= nzval[i]*x[j] + end + + end + + for level=depth:-1:1 + @threads for tid=1:nt + @inbounds for j=start[(level-1)*nt+tid+1]-1:-1:start[(level-1)*nt+tid] + x[j] = y[j] / nzval[diag[j]] + for i=colptr[j]:diag[j]-1 + y[rowval[i]] -= nzval[i]*x[j] + end + end + end + end + +end + +function ldiv!(x, ILU::PILUAMPrecon, b) + nzval = ILU.nzval + diag = ILU.diag + A = ILU.A + start = ILU.start + nt = ILU.nt + depth = ILU.depth + y = copy(b) + #forward_subst!(y, b, ILU) + forward_subst_old!(y, b, nzval, diag, start, nt, depth, A) + backward_subst_old!(x, y, nzval, diag, start, nt, depth, A) + x +end + +function ldiv!(ILU::PILUAMPrecon, b) + nzval = ILU.nzval + diag = ILU.diag + A = ILU.A + start = ILU.start + nt = ILU.nt + depth = ILU.depth + y = copy(b) + #forward_subst!(y, b, ILU) + forward_subst_old!(y, b, nzval, diag, start, nt, depth, A) + backward_subst_old!(b, y, nzval, diag, start, nt, depth, A) + b +end + +function \(ilu::PILUAMPrecon{T,N}, b) where {T,N<:Integer} + x = copy(b) + ldiv!(x, ilu, b) + x +end + +function nnz(ilu::PILUAMPrecon{T,N}) where {T,N<:Integer} + length(ilu.nzval) +end + +#end \ No newline at end of file diff --git a/src/factorizations/piluam.jl b/src/factorizations/piluam.jl new file mode 100644 index 0000000..4a5fcdc --- /dev/null +++ b/src/factorizations/piluam.jl @@ -0,0 +1,36 @@ +mutable struct PILUAMPreconditioner <: AbstractPreconditioner + A::ExtendableSparseMatrixParallel + factorization::PILUAMPrecon + phash::UInt64 + function PILUAMPreconditioner() + p = new() + p.phash = 0 + p + end +end + +""" +``` +PILUAMPreconditioner() +PILUAMPreconditioner(matrix) +``` +Incomplete LU preconditioner with zero fill-in using ... . This preconditioner +also calculates and stores updates to the off-diagonal entries and thus delivers better convergence than the [`ILU0Preconditioner`](@ref). +""" +function PILUAMPreconditioner end + +function update!(p::PILUAMPreconditioner) + flush!(p.A) + if p.A.phash != p.phash + p.factorization = piluAM(p.A) + p.phash=p.A.phash + else + @warn "fuck?" 
+ ilu0!(p.factorization, p.A.cscmatrix) + end + p +end + +allow_views(::PILUAMPreconditioner)=true +allow_views(::Type{PILUAMPreconditioner})=true + diff --git a/src/matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl b/src/matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl index 68dace8..9e63c37 100644 --- a/src/matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl +++ b/src/matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl @@ -36,6 +36,12 @@ mutable struct ExtendableSparseMatrixParallel{Tv, Ti <: Integer} <: AbstractSpar nt::Ti depth::Ti + + phash::UInt64 + + n::Ti + + m::Ti end @@ -46,7 +52,7 @@ function ExtendableSparseMatrixParallel{Tv, Ti}(nm, nt, depth; x0=0.0, x1=1.0) w grid, nnts, s, onr, cfp, gi, gc, ni, rni, starts, cellparts = preparatory_multi_ps_less_reverse(nm, nt, depth, Ti; x0, x1) csc = spzeros(Tv, Ti, num_nodes(grid), num_nodes(grid)) lnk = [SuperSparseMatrixLNK{Tv, Ti}(num_nodes(grid), nnts[tid]) for tid=1:nt] - ExtendableSparseMatrixParallel{Tv, Ti}(csc, lnk, grid, nnts, s, onr, cfp, gi, ni, rni, starts, cellparts, nt, depth) + ExtendableSparseMatrixParallel{Tv, Ti}(csc, lnk, grid, nnts, s, onr, cfp, gi, ni, rni, starts, cellparts, nt, depth, phash(csc), csc.n, csc.m) end @@ -253,6 +259,32 @@ function Base.show(io::IO, ::MIME"text/plain", ext::ExtendableSparseMatrixParall end end +""" +`function entryexists2(CSC, i, j)` + +Find out if CSC already has an nonzero entry at i,j without any allocations +""" +function entryexists2(CSC, i, j) #find out if CSC already has an nonzero entry at i,j + #vals = + #ids = CSC.colptr[j]:(CSC.colptr[j+1]-1) + i in view(CSC.rowval, CSC.colptr[j]:(CSC.colptr[j+1]-1)) +end + + +function updatentryCSC2!(CSC::SparseArrays.SparseMatrixCSC{Tv, Ti}, i::Integer, j::Integer, v) where {Tv, Ti <: Integer} + p1 = CSC.colptr[j] + p2 = CSC.colptr[j+1]-1 + + searchk = searchsortedfirst(view(CSC.rowval, p1:p2), i) + p1 - 1 + + if (searchk <= p2) && (CSC.rowval[searchk] == i) + CSC.nzval[searchk] += v + return true + else + return false + end +end + Base.size(A::ExtendableSparseMatrixParallel) = (A.cscmatrix.m, A.cscmatrix.n) include("struct_flush.jl") diff --git a/src/matrix/ExtendableSparseMatrixParallel/struct_flush.jl b/src/matrix/ExtendableSparseMatrixParallel/struct_flush.jl index c27aab0..38608ad 100644 --- a/src/matrix/ExtendableSparseMatrixParallel/struct_flush.jl +++ b/src/matrix/ExtendableSparseMatrixParallel/struct_flush.jl @@ -11,7 +11,7 @@ function flush!(A::ExtendableSparseMatrixParallel; do_dense=false, keep_zeros=tr A.cscmatrix = dense_flush_removezeros!(A.lnkmatrices, A.old_noderegions, A.sortednodesperthread, A.nt, A.rev_new_indices) end end - + A.phash = phash(A.cscmatrix) A.lnkmatrices = [SuperSparseMatrixLNK{matrixvaluetype(A), matrixindextype(A)}(num_nodes(A.grid), A.nnts[tid]) for tid=1:A.nt] end From bb0935c46ec2d0dff30283a36064c9e62c5017ca Mon Sep 17 00:00:00 2001 From: Johannes Taraz Date: Fri, 23 Feb 2024 15:38:05 +0100 Subject: [PATCH 5/9] ColEntry from struct to mutable struct --- src/matrix/sparsematrixlnk.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/matrix/sparsematrixlnk.jl b/src/matrix/sparsematrixlnk.jl index 2976e88..b00c6fc 100644 --- a/src/matrix/sparsematrixlnk.jl +++ b/src/matrix/sparsematrixlnk.jl @@ -278,7 +278,7 @@ end # Struct holding pair of value and row # number, for sorting -struct ColEntry{Tv, Ti <: Integer} +mutable struct ColEntry{Tv, Ti <: Integer} rowval::Ti nzval::Tv end From 47a6a96f71e06c216bb3eeb32403ff45c8179128 
Mon Sep 17 00:00:00 2001 From: Johannes Taraz Date: Mon, 26 Feb 2024 14:10:38 +0100 Subject: [PATCH 6/9] enable deeper partitioning / fixing some preparatory functions --- src/factorizations/ilu_Al-Kurdi_Mittal.jl | 4 +- src/factorizations/iluam.jl | 1 + src/factorizations/pilu_Al-Kurdi_Mittal.jl | 2 + src/factorizations/piluam.jl | 1 + .../ExtendableSparseParallel.jl | 2 +- .../preparatory.jl | 416 ++++++++++++++---- 6 files changed, 336 insertions(+), 90 deletions(-) diff --git a/src/factorizations/ilu_Al-Kurdi_Mittal.jl b/src/factorizations/ilu_Al-Kurdi_Mittal.jl index 97bb9a8..0b6b1b2 100644 --- a/src/factorizations/ilu_Al-Kurdi_Mittal.jl +++ b/src/factorizations/ilu_Al-Kurdi_Mittal.jl @@ -14,7 +14,7 @@ mutable struct ILUAMPrecon{T,N} end function iluAM(A::SparseMatrixCSC{Tv,Ti}) where {Tv, Ti <:Integer} - @info "iluAM" + #@info "iluAM" nzval = copy(A.nzval) colptr = A.colptr rowval = A.rowval @@ -100,6 +100,7 @@ function backward_subst_old!(x, y, nzval, diag, A) end function ldiv!(x, ILU::ILUAMPrecon, b) + #@info "iluam ldiv 1" nzval = ILU.nzval diag = ILU.diag A = ILU.A @@ -111,6 +112,7 @@ function ldiv!(x, ILU::ILUAMPrecon, b) end function ldiv!(ILU::ILUAMPrecon, b) + #@info "iluam ldiv 2" nzval = ILU.nzval diag = ILU.diag A = ILU.A diff --git a/src/factorizations/iluam.jl b/src/factorizations/iluam.jl index a4aed06..6d061b0 100644 --- a/src/factorizations/iluam.jl +++ b/src/factorizations/iluam.jl @@ -22,6 +22,7 @@ function ILUAMPreconditioner end function update!(p::ILUAMPreconditioner) flush!(p.A) if p.A.phash != p.phash + @warn "p.A.phash != p.phash" p.factorization = iluAM(p.A.cscmatrix) p.phash=p.A.phash else diff --git a/src/factorizations/pilu_Al-Kurdi_Mittal.jl b/src/factorizations/pilu_Al-Kurdi_Mittal.jl index a1ef818..15a8b23 100644 --- a/src/factorizations/pilu_Al-Kurdi_Mittal.jl +++ b/src/factorizations/pilu_Al-Kurdi_Mittal.jl @@ -230,6 +230,7 @@ function backward_subst_old!(x, y, nzval, diag, start, nt, depth, A) end function ldiv!(x, ILU::PILUAMPrecon, b) + #@info "piluam ldiv 1" nzval = ILU.nzval diag = ILU.diag A = ILU.A @@ -244,6 +245,7 @@ function ldiv!(x, ILU::PILUAMPrecon, b) end function ldiv!(ILU::PILUAMPrecon, b) + #@info "piluam ldiv 2" nzval = ILU.nzval diag = ILU.diag A = ILU.A diff --git a/src/factorizations/piluam.jl b/src/factorizations/piluam.jl index 4a5fcdc..075f73f 100644 --- a/src/factorizations/piluam.jl +++ b/src/factorizations/piluam.jl @@ -22,6 +22,7 @@ function PILUAMPreconditioner end function update!(p::PILUAMPreconditioner) flush!(p.A) if p.A.phash != p.phash + @warn "p.A.phash != p.phash" p.factorization = piluAM(p.A) p.phash=p.A.phash else diff --git a/src/matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl b/src/matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl index 9e63c37..b635a33 100644 --- a/src/matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl +++ b/src/matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl @@ -49,7 +49,7 @@ end function ExtendableSparseMatrixParallel{Tv, Ti}(nm, nt, depth; x0=0.0, x1=1.0) where {Tv, Ti <: Integer} - grid, nnts, s, onr, cfp, gi, gc, ni, rni, starts, cellparts = preparatory_multi_ps_less_reverse(nm, nt, depth, Ti; x0, x1) + grid, nnts, s, onr, cfp, gi, gc, ni, rni, starts, cellparts, depth = preparatory_multi_ps_less_reverse(nm, nt, depth, Ti; x0, x1) csc = spzeros(Tv, Ti, num_nodes(grid), num_nodes(grid)) lnk = [SuperSparseMatrixLNK{Tv, Ti}(num_nodes(grid), nnts[tid]) for tid=1:nt] ExtendableSparseMatrixParallel{Tv, Ti}(csc, lnk, grid, nnts, 
s, onr, cfp, gi, ni, rni, starts, cellparts, nt, depth, phash(csc), csc.n, csc.m) diff --git a/src/matrix/ExtendableSparseMatrixParallel/preparatory.jl b/src/matrix/ExtendableSparseMatrixParallel/preparatory.jl index e14a066..a29356c 100644 --- a/src/matrix/ExtendableSparseMatrixParallel/preparatory.jl +++ b/src/matrix/ExtendableSparseMatrixParallel/preparatory.jl @@ -6,23 +6,36 @@ `depth` is the number of partition layers, for depth=1, there are nt parts and 1 separator, for depth=2, the separator is partitioned again, leading to 2*nt+1 submatrices... To assemble the system matrix parallely, things such as `cellsforpart` (= which thread takes which cells) need to be computed in advance. This is done here. """ -function preparatory_multi_ps_less_reverse(nm, nt, depth, Ti; sequential=false, x0=0.0, x1=1.0) +function preparatory_multi_ps_less_reverse(nm, nt, depth, Ti; sequential=false, x0=0.0, x1=1.0, minsize_sepa=10, do_print=false, check_partition=false) grid = getgrid(nm; x0, x1) - + adepth = 0 if sequential - (allcells, start, cellparts) = grid_to_graph_ps_multi!(grid, nt, depth)#) + (allcells, start, cellparts, adepth) = grid_to_graph_ps_multi!(grid, nt, depth; minsize_sepa, do_print)#) else - (allcells, start, cellparts) = grid_to_graph_ps_multi_par!(grid, nt, depth) + (allcells, start, cellparts, adepth) = grid_to_graph_ps_multi_par!(grid, nt, depth; minsize_sepa, do_print) + end + + if (adepth != depth) && do_print + @info "The requested depth of partitioning is too high. The depth is set to $adepth." end + depth = adepth + cfp = bettercellsforpart(cellparts, depth*nt+1) + + if check_partition + validate_partition(grid, cellparts, start, allcells, nt, depth) + end + + @info length.(cfp) + @info minimum(cellparts), maximum(cellparts), nt, depth + (nnts, s, onr, gi, gc, ni, rni, starts) = get_nnnts_and_sortednodesperthread_and_noderegs_from_cellregs_ps_less_reverse_nopush( - cellparts, allcells, start, num_nodes(grid), Ti, nt + cellparts, allcells, start, num_nodes(grid), Ti, nt, depth ) - cfp = bettercellsforpart(cellparts, depth*nt+1) - return grid, nnts, s, onr, cfp, gi, gc, ni, rni, starts, cellparts -end + return grid, nnts, s, onr, cfp, gi, gc, ni, rni, starts, cellparts, adepth +end """ `function get_nnnts_and_sortednodesperthread_and_noderegs_from_cellregs_ps_less_reverse_nopush(cellregs, allcells, start, nn, Ti, nt)` @@ -35,10 +48,10 @@ Furthermore, `nnts` (number of nodes of the threads) is computed, which contain `Ti` is the type (Int64,...) of the elements in the created arrays. `nt` is the number of threads. 
""" -function get_nnnts_and_sortednodesperthread_and_noderegs_from_cellregs_ps_less_reverse_nopush(cellregs, allcells, start, nn, Ti, nt) +function get_nnnts_and_sortednodesperthread_and_noderegs_from_cellregs_ps_less_reverse_nopush(cellregs, allcells, start, nn, Ti, nt, depth) - num_matrices = maximum(cellregs) - depth = Int(floor((num_matrices-1)/nt)) + #num_matrices = maximum(cellregs) + #depth = Int(floor((num_matrices-1)/nt)) #loop over each node, get the cellregion of the cell (the one not in the separator) write the position of that node inside the cellregions sorted ranking into a long vector #nnts = [zeros(Ti, nt+1) for i=1:depth+1] @@ -62,6 +75,11 @@ function get_nnnts_and_sortednodesperthread_and_noderegs_from_cellregs_ps_less_r nnts[crmod] += 1 #sortednodesperthread[crmod,j] = nnts[crmod] #nnts[i][cr] #push!(tmp, crmod) + if tmpctr > depth+1 + @info "Cellregs: ", sortedcellregs + @info "Levels : ", Int.(ceil.(sortedcellregs/nt)) + @info "PartsMod: ", ((sortedcellregs.-1).%nt).+1 + end tmp[tmpctr] = crmod tmpctr += 1 end @@ -127,9 +145,6 @@ end - - - """ `function separate!(cellregs, nc, ACSC, nt, level0, ctr_sepanodes)` @@ -141,47 +156,77 @@ This function partitons the separator, which is done if `depth`>1 (see `grid_to_ `level0` is the separator-partitoning level, if the (first) separator is partitioned, level0 = 1, in the next iteration, level0 = 2... `preparatory_multi_ps` is the number of separator-cells. """ -function separate!(cellregs, nc, ACSC, nt, level0, ctr_sepanodes) - sepanodes = findall(x->x==nt+1, cellregs) +function separate!(cellregs, nc, ACSC, nt, level0, ctr_sepanodes, ri, gi, do_print) + # current number of cells treated + nc2 = size(ACSC, 1) - indptr = collect(1:nc+1) - indices = zeros(Int64, nc) - rowval = zeros(Int64, nc) + indptr = collect(1:nc2+1) + indices = zeros(Int64, nc2) + rowval = zeros(Int64, nc2) indptrT = collect(1:ctr_sepanodes+1) indicesT = zeros(Int64, ctr_sepanodes) rowvalT = zeros(Int64, ctr_sepanodes) - for (i,j) in enumerate(sepanodes) - indices[j] = i + for i=1:ctr_sepanodes + j = ri[i] + indices[j] = i indicesT[i] = j rowval[j] = 1 rowvalT[i] = 1 end - R = SparseMatrixCSC(ctr_sepanodes, nc, indptr, indices, rowval) - RT = SparseMatrixCSC(nc, ctr_sepanodes, indptrT, indicesT, rowvalT) - prod = ACSC*dropzeros(RT) + + + R = SparseMatrixCSC(ctr_sepanodes, nc2, indptr, indices, rowval) + RT = SparseMatrixCSC(nc2, ctr_sepanodes, indptrT, indicesT, rowvalT) + # current adjacency matrix, taken as a part of the given one ACSC RART = dropzeros(R)*ACSC*dropzeros(RT) - partition2 = Metis.partition(RART, nt) - cellregs2 = copy(partition2) - - ctr_sepanodes = 0 - for (i,j) in enumerate(sepanodes) - rows = RART.rowval[RART.colptr[i]:(RART.colptr[i+1]-1)] - cellregs[j] = level0*nt + cellregs2[i] - if minimum(partition2[rows]) != maximum(partition2[rows]) - cellregs[j] = (level0+1)*nt+1 - ctr_sepanodes += 1 - end - end - - RART, ctr_sepanodes + cellregs2 = Metis.partition(RART, nt) + + + for i=1:ctr_sepanodes + if cellregs[gi[i]] < level0*nt+1 + @warn "cell treated in this iteration was not a separator-cell last iteration" + end + cellregs[gi[i]] = level0*nt + cellregs2[i] + end + + # how many cells are in the separator of the new partiton (which is only computed on the separator of the old partition) + new_ctr_sepanodes = 0 + ri2 = Vector{Int64}(undef, ctr_sepanodes) + gi2 = Vector{Int64}(undef, ctr_sepanodes) + + for tid=1:nt + for i=1:ctr_sepanodes + if cellregs2[i] == tid + neighbors = RART.rowval[RART.colptr[i]:(RART.colptr[i+1]-1)] + rows = 
gi[vcat(neighbors, [i])] + #counts how many different regions (besides) the separator are adjacent to the current cell + x = how_many_different_below(cellregs[rows], (level0+1)*nt+1) + if x > 1 + cellregs[gi[i]] = (level0+1)*nt+1 + new_ctr_sepanodes += 1 + gi2[new_ctr_sepanodes] = gi[i] + ri2[new_ctr_sepanodes] = i + end + end + end + end + + + ri2 = ri2[1:new_ctr_sepanodes] + gi2 = gi2[1:new_ctr_sepanodes] + + if do_print + @info "At level $(level0+1), we found $new_ctr_sepanodes cells that have to be treated in the next iteration!" + end + + RART, new_ctr_sepanodes, ri2, gi2 end - """ `function grid_to_graph_ps_multi!(grid, nt, depth)` @@ -190,7 +235,7 @@ The function assigns colors/partitons to each cell in the `grid`. First, the gri `nt` is the number of threads. `depth` is the number of partition layers, for depth=1, there are nt parts and 1 separator, for depth=2, the separator is partitioned again, leading to 2*nt+1 submatrices... """ -function grid_to_graph_ps_multi!(grid, nt, depth) +function grid_to_graph_ps_multi!(grid, nt, depth; minsize_sepa=10, do_print=false) A = SparseMatrixLNK{Int64, Int64}(num_cells(grid), num_cells(grid)) number_cells_per_node = zeros(Int64, num_nodes(grid)) for j=1:num_cells(grid) @@ -224,27 +269,46 @@ function grid_to_graph_ps_multi!(grid, nt, depth) partition = Metis.partition(ACSC, nt) cellregs = copy(partition) + sn = Vector{Int64}(undef, num_cells(grid)) + gi = Vector{Int64}(undef, num_cells(grid)) ctr_sepanodes = 0 - for j=1:num_cells(grid) - rows = ACSC.rowval[ACSC.colptr[j]:(ACSC.colptr[j+1]-1)] - if minimum(partition[rows]) != maximum(partition[rows]) - cellregs[j] = nt+1 - ctr_sepanodes += 1 - end + + for tid=1:nt + for j=1:num_cells(grid) + if cellregs[j] == tid + rows = vcat(ACSC.rowval[ACSC.colptr[j]:(ACSC.colptr[j+1]-1)], [j]) + if how_many_different_below(cellregs[rows], nt+1) > 1 + cellregs[j] = nt+1 #+ctr_sepanodes + ctr_sepanodes += 1 + sn[ctr_sepanodes] = j + gi[ctr_sepanodes] = j + end + end + end end - RART = ACSC + + sn = sn[1:ctr_sepanodes] + gi = gi[1:ctr_sepanodes] + + if do_print + @info "At level $(1), we found $ctr_sepanodes cells that have to be treated in the next iteration!" 
+ end + + RART = copy(ACSC) + actual_depth = 1 for level=1:depth-1 - RART, ctr_sepanodes = separate!(cellregs, num_cells(grid), RART, nt, level, ctr_sepanodes) + RART, ctr_sepanodes, sn, gi = separate!(cellregs, num_cells(grid), RART, nt, level, ctr_sepanodes, sn, gi, do_print) + actual_depth += 1 + if ctr_sepanodes < minsize_sepa + break + end end - - - return allcells, start, cellregs + + return allcells, start, cellregs, actual_depth, ACSC end - -function grid_to_graph_ps_multi_par!(grid, nt, depth) - time = zeros(12) +function grid_to_graph_ps_multi_par!(grid, nt, depth; minsize_sepa=10, do_print=false) As = [ExtendableSparseMatrix{Int64, Int64}(num_cells(grid), num_cells(grid)) for tid=1:nt] number_cells_per_node = zeros(Int64, num_nodes(grid)) @@ -288,54 +352,64 @@ function grid_to_graph_ps_multi_par!(grid, nt, depth) end ACSC = add_all_par!(As).cscmatrix - - #SparseArrays.SparseMatrixCSC(A)) - - - partition = Metis.partition(ACSC, nt) - cellregs = copy(partition) - ctr_sepanodes_a = zeros(Int64, nt) + cellregs = Metis.partition(ACSC, nt) - cell_range = get_starts(num_cells(grid), nt) - Threads.@threads :static for tid=1:nt - for j in cell_range[tid]:cell_range[tid+1]-1 - rows = @view ACSC.rowval[ACSC.colptr[j]:(ACSC.colptr[j+1]-1)] - if minimum(partition[rows]) != maximum(partition[rows]) - cellregs[j] = nt+1 - ctr_sepanodes_a[tid] += 1 - end - end + sn = [Vector{Int64}(undef, Int(ceil(num_cells(grid)/nt))) for tid=1:nt] + ctr_sepanodess = zeros(Int64, nt) + + @threads for tid=1:nt + for j=1:num_cells(grid) + if cellregs[j] == tid + rows = vcat(ACSC.rowval[ACSC.colptr[j]:(ACSC.colptr[j+1]-1)], [j]) + if how_many_different_below(cellregs[rows], nt+1) > 1 + cellregs[j] = nt+1 #+ctr_sepanodes + ctr_sepanodess[tid] += 1 + sn[tid][ctr_sepanodess[tid]] = j + end + end + end end - - ctr_sepanodes = sum(ctr_sepanodes_a) - - #= - time[10] = @elapsed for j=1:num_cells(grid) - rows = ACSC.rowval[ACSC.colptr[j]:(ACSC.colptr[j+1]-1)] - if minimum(partition[rows]) != maximum(partition[rows]) - cellregs[j] = nt+1 - ctr_sepanodes += 1 - end - end - =# - RART = ACSC + + for tid=1:nt + sn[tid] = sn[tid][1:ctr_sepanodess[tid]] + end + ctr_sepanodes = sum(ctr_sepanodess) + sn = vcat(sn...) + gi = copy(sn) + + if do_print + @info "At level $(1), we found $ctr_sepanodes cells that have to be treated in the next iteration!" + end + + RART = ACSC + actual_depth = 1 for level=1:depth-1 - RART, ctr_sepanodes = separate!(cellregs, num_cells(grid), RART, nt, level, ctr_sepanodes) + RART, ctr_sepanodes, sn, gi = separate!(cellregs, num_cells(grid), RART, nt, level, ctr_sepanodes, sn, gi, do_print) + actual_depth += 1 + if ctr_sepanodes < minsize_sepa + break + end end - - - return allcells, start, cellregs + + #grid[CellRegions] = cellregs + #grid + return allcells, start, cellregs, actual_depth end +""" +`function add_all_par!(As)` +Add LNK matrices (stored in a vector) parallely (tree structure). +The result is stored in the first LNK matrix. 
+""" function add_all_par!(As) nt = length(As) depth = Int(floor(log2(nt))) ende = nt for level=1:depth - @threads :static for tid=1:2^(depth-level) + @threads for tid=1:2^(depth-level) #@info "$level, $tid" start = tid+2^(depth-level) while start <= ende @@ -425,3 +499,169 @@ function last_nz(x) end end + +function how_many_different_below(x0, y; u=0) + x = copy(x0) + z = unique(x) + t = findall(w->ww>u,z[t]) + length(t) +end + + + +function lookat_grid_to_graph_ps_multi!(nm, nt, depth) + grid = getgrid(nm) + A = SparseMatrixLNK{Int64, Int64}(num_cells(grid), num_cells(grid)) + number_cells_per_node = zeros(Int64, num_nodes(grid)) + for j=1:num_cells(grid) + for node_id in grid[CellNodes][:,j] + number_cells_per_node[node_id] += 1 + end + end + allcells = zeros(Int64, sum(number_cells_per_node)) + start = ones(Int64, num_nodes(grid)+1) + start[2:end] += cumsum(number_cells_per_node) + number_cells_per_node .= 0 + for j=1:num_cells(grid) + for node_id in grid[CellNodes][:,j] + allcells[start[node_id] + number_cells_per_node[node_id]] = j + number_cells_per_node[node_id] += 1 + end + end + + for j=1:num_nodes(grid) + cells = @view allcells[start[j]:start[j+1]-1] + for (i,id1) in enumerate(cells) + for id2 in cells[i+1:end] + A[id1,id2] = 1 + A[id2,id1] = 1 + end + end + end + + ACSC = SparseArrays.SparseMatrixCSC(A) + + partition = Metis.partition(ACSC, nt) + cellregs = copy(partition) + + sn = [] + gi = [] + ctr_sepanodes = 0 + for j=1:num_cells(grid) + rows = ACSC.rowval[ACSC.colptr[j]:(ACSC.colptr[j+1]-1)] + if minimum(partition[rows]) != maximum(partition[rows]) + cellregs[j] = nt+1 + ctr_sepanodes += 1 + push!(sn, j) + push!(gi, j) + end + end + RART = ACSC + #sn = 1:num_cells(grid) + #gi = 1:num_cells(grid) + for level=1:depth-1 + RART, ctr_sepanodes, sn, gi = separate_careful!(cellregs, num_cells(grid), RART, nt, level, ctr_sepanodes, sn, gi) + if ctr_sepanodes == 0 + return RART + end + end + + + #return allcells, start, cellregs + RART +end + + +function adjacencies(grid) + A = SparseMatrixLNK{Int64, Int64}(num_cells(grid), num_cells(grid)) + number_cells_per_node = zeros(Int64, num_nodes(grid)) + for j=1:num_cells(grid) + for node_id in grid[CellNodes][:,j] + number_cells_per_node[node_id] += 1 + end + end + allcells = zeros(Int64, sum(number_cells_per_node)) + start = ones(Int64, num_nodes(grid)+1) + start[2:end] += cumsum(number_cells_per_node) + number_cells_per_node .= 0 + for j=1:num_cells(grid) + for node_id in grid[CellNodes][:,j] + allcells[start[node_id] + number_cells_per_node[node_id]] = j + number_cells_per_node[node_id] += 1 + end + end + + for j=1:num_nodes(grid) + cells = @view allcells[start[j]:start[j+1]-1] + for (i,id1) in enumerate(cells) + for id2 in cells[i+1:end] + A[id1,id2] = 1 + A[id2,id1] = 1 + end + end + end + + allcells, start, SparseArrays.SparseMatrixCSC(A) +end + +function check_adjacencies(nm) + grid = getgrid(nm) + allcells, start, A = adjacencies(grid) + + i = 1 + cells1 = sort(vcat([i], A.rowval[A.colptr[i]:(A.colptr[i+1]-1)])) #adjacent cells + nodes2 = grid[CellNodes][:,i] + cells2 = sort(unique(vcat([allcells[start[j]:start[j+1]-1] for j in nodes2]...))) + + @info cells1 + @info cells2 + @info maximum(abs.(cells1-cells2)) + + +end + +#= +function check_partition(nm, nt, depth) + grid = getgrid(nm) + + (allcells, start, cellregs, adepth, ACSC) = grid_to_graph_ps_multi!(grid, nt, depth; minsize_sepa=10, do_print=true)#) + + if (adepth != depth) + @info "The requested depth of partitioning is too high. The depth is set to $adepth." 
+ end + depth = adepth + + validate_partition(num_nodes(grid), num_cells(grid), grid, cellregs, start, allcells, nt, depth, ACSC) +end +=# + +function validate_partition(grid, cellregs, start, allcells, nt, depth) + @info "Node based validation" + violation_ctr = 0 + + for j=1:num_nodes(grid) + cells = @view allcells[start[j]:start[j+1]-1] + sortedcellregs = unique(sort(cellregs[cells])) + levels = Int.(ceil.(sortedcellregs/nt)) + + for i=1:depth+1 + ids_lev = findall(x->x==i, levels) + if length(ids_lev) > 1 + violation_ctr += 1 + + if violation_ctr == 1 + @info "Node Id : ", j + @info "Cellregs: ", sortedcellregs + @info "Levels : ", levels + + loc = findall(x->x==4, Int.(ceil.(cellregs[allcells[start[j]:start[j+1]-1]]/nt))) + cells_at_level4 = allcells[loc.+(start[j]-1)] + @info cells_at_level4, cellregs[cells_at_level4] + @info grid[CellNodes][:,cells_at_level4[1]], grid[CellNodes][:,cells_at_level4[2]] + end + end + end + end + @info "We found $violation_ctr violation(s)" +end \ No newline at end of file From b2693ec41fc21fb3fdbb71648f47a6d21aa1e690 Mon Sep 17 00:00:00 2001 From: Johannes Taraz Date: Sun, 17 Mar 2024 10:57:59 +0100 Subject: [PATCH 7/9] add parallel matrix vector product --- src/ExtendableSparse.jl | 1 + src/factorizations/filu_Al-Kurdi_Mittal.jl | 160 ++++++++++++++++++ src/factorizations/ilu_Al-Kurdi_Mittal.jl | 65 ++++++- src/factorizations/iluam.jl | 4 +- src/factorizations/pilu_Al-Kurdi_Mittal.jl | 149 ++++++++++++---- src/factorizations/piluam.jl | 5 +- .../ExtendableSparseParallel.jl | 63 ++++++- .../preparatory.jl | 4 +- .../struct_flush.jl | 4 + 9 files changed, 410 insertions(+), 45 deletions(-) create mode 100644 src/factorizations/filu_Al-Kurdi_Mittal.jl diff --git a/src/ExtendableSparse.jl b/src/ExtendableSparse.jl index 285d0c2..bcf85e6 100644 --- a/src/ExtendableSparse.jl +++ b/src/ExtendableSparse.jl @@ -37,6 +37,7 @@ export eliminate_dirichlet, eliminate_dirichlet!, mark_dirichlet include("matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl") + include("factorizations/ilu_Al-Kurdi_Mittal.jl") #using .ILUAM include("factorizations/pilu_Al-Kurdi_Mittal.jl") diff --git a/src/factorizations/filu_Al-Kurdi_Mittal.jl b/src/factorizations/filu_Al-Kurdi_Mittal.jl new file mode 100644 index 0000000..2099208 --- /dev/null +++ b/src/factorizations/filu_Al-Kurdi_Mittal.jl @@ -0,0 +1,160 @@ +#module PILUAM +#using Base.Threads +#using LinearAlgebra, SparseArrays + +import LinearAlgebra.ldiv!, LinearAlgebra.\, SparseArrays.nnz + +@info "PILUAM" + +mutable struct PILUAMPrecon{T,N} + + diag::AbstractVector + nzval::AbstractVector + A::AbstractMatrix + +end + +function iluAM!(ILU::PILUAMPrecon{Tv,Ti}, A::ExtendableSparseMatrixParallel{Tv, Ti}) where {Tv, Ti <:Integer} + @info "filuAM!" + diag = ILU.diag + nzval = ILU.nzval + + nzval = copy(A.cscmatrix.nzval) + diag = Vector{Ti}(undef, n) + ILU.A = A + colptr = A.cscmatrix.colptr + rowval = A.cscmatrix.rowval + n = A.n + point = zeros(Ti, n) + + for j=1:n + for v=colptr[j]:colptr[j+1]-1 + if rowval[v] == j + diag[j] = v + break + end + #elseif rowval[v] + end + end + + # compute L and U + for j=1:n + for v=colptr[j]:colptr[j+1]-1 ## start at colptr[j]+1 ?? 
+ point[rowval[v]] = v + end + + for v=colptr[j]:diag[j]-1 + i = rowval[v] + #nzval[v] /= nzval[diag[i]] + for w=diag[i]+1:colptr[i+1]-1 + k = point[rowval[w]] + if k>0 + nzval[k] -= nzval[v]*nzval[w] + end + end + end + + for v=diag[j]+1:colptr[j+1]-1 + nzval[v] /= nzval[diag[j]] + end + + + for v=colptr[j]:colptr[j+1]-1 + point[rowval[v]] = zero(Ti) + end + end + +end + + +function piluAM(A::ExtendableSparseMatrixParallel{Tv,Ti}) where {Tv, Ti <:Integer} + @info "filuAM, $(A[1,1])" + nzval = copy(A.cscmatrix.nzval) + colptr = A.cscmatrix.colptr + rowval = A.cscmatrix.rowval + #nzval = ILU.nzval + n = A.n # number of columns + point = zeros(Ti, n) #Vector{Ti}(undef, n) + diag = Vector{Ti}(undef, n) + + # find diagonal entries + for j=1:n + for v=colptr[j]:colptr[j+1]-1 + if rowval[v] == j + diag[j] = v + break + end + #elseif rowval[v] + end + end + + # compute L and U + for j=1:n + for v=colptr[j]:colptr[j+1]-1 ## start at colptr[j]+1 ?? + point[rowval[v]] = v + end + + for v=colptr[j]:diag[j]-1 + i = rowval[v] + #nzval[v] /= nzval[diag[i]] + for w=diag[i]+1:colptr[i+1]-1 + k = point[rowval[w]] + if k>0 + nzval[k] -= nzval[v]*nzval[w] + end + end + end + + for v=diag[j]+1:colptr[j+1]-1 + nzval[v] /= nzval[diag[j]] + end + + + for v=colptr[j]:colptr[j+1]-1 + point[rowval[v]] = zero(Ti) + end + end + #nzval, diag + PILUAMPrecon{Tv,Ti}(diag, nzval, A) +end + + + +function ldiv!(x, ILU::PILUAMPrecon, b) + #@info "iluam ldiv 1" + nzval = ILU.nzval + diag = ILU.diag + A = ILU.A.cscmatrix + y = copy(b) + #forward_subst!(y, b, ILU) + forward_subst_old!(y, b, nzval, diag, A) + backward_subst_old!(x, y, nzval, diag, A) + @info "FILUAM:", b[1], y[1], x[1], maximum(abs.(b-A*x)) + #, maximum(abs.(b-A*x)), b[1], x[1], y[1] + x +end + + +function ldiv!(ILU::PILUAMPrecon, b) + #@info "iluam ldiv 2" + nzval = ILU.nzval + diag = ILU.diag + A = ILU.A.cscmatrix + y = copy(b) + #forward_subst!(y, b, ILU) + forward_subst_old!(y, b, nzval, diag, A) + backward_subst_old!(b, y, nzval, diag, A) + b +end + +function \(ilu::PILUAMPrecon{T,N}, b) where {T,N<:Integer} + x = copy(b) + ldiv!(x, ilu, b) + x +end + +function nnz(ilu::PILUAMPrecon{T,N}) where {T,N<:Integer} + length(ilu.nzval) +end + +#end \ No newline at end of file diff --git a/src/factorizations/ilu_Al-Kurdi_Mittal.jl b/src/factorizations/ilu_Al-Kurdi_Mittal.jl index 0b6b1b2..ad2207d 100644 --- a/src/factorizations/ilu_Al-Kurdi_Mittal.jl +++ b/src/factorizations/ilu_Al-Kurdi_Mittal.jl @@ -3,7 +3,7 @@ import LinearAlgebra.ldiv!, LinearAlgebra.\, SparseArrays.nnz -@info "ILUAM" +#@info "ILUAM" mutable struct ILUAMPrecon{T,N} @@ -13,6 +13,58 @@ mutable struct ILUAMPrecon{T,N} end + +function iluAM!(ILU::ILUAMPrecon{Tv,Ti}, A::SparseMatrixCSC{Tv, Ti}) where {Tv, Ti <:Integer} + diag = ILU.diag + nzval = ILU.nzval + + nzval = copy(A.nzval) + diag = Vector{Ti}(undef, n) + ILU.A = A + colptr = A.colptr + rowval = A.rowval + n = A.n + point = zeros(Ti, n) + + for j=1:n + for v=colptr[j]:colptr[j+1]-1 + if rowval[v] == j + diag[j] = v + break + end + #elseif rowval[v] + end + end + + # compute L and U + for j=1:n + for v=colptr[j]:colptr[j+1]-1 ## start at colptr[j]+1 ?? 
+			point[rowval[v]] = v
+		end
+
+		for v=colptr[j]:diag[j]-1
+			i = rowval[v]
+			#nzval[v] /= nzval[diag[i]]
+			for w=diag[i]+1:colptr[i+1]-1
+				k = point[rowval[w]]
+				if k>0
+					nzval[k] -= nzval[v]*nzval[w]
+				end
+			end
+		end
+
+		for v=diag[j]+1:colptr[j+1]-1
+			nzval[v] /= nzval[diag[j]]
+		end
+
+
+		for v=colptr[j]:colptr[j+1]-1
+			point[rowval[v]] = zero(Ti)
+		end
+	end
+
+end
+
 function iluAM(A::SparseMatrixCSC{Tv,Ti}) where {Tv, Ti <:Integer}
 	#@info "iluAM"
 	nzval = copy(A.nzval)
@@ -33,6 +85,9 @@ function iluAM(A::SparseMatrixCSC{Tv,Ti}) where {Tv, Ti <:Integer}
 			#elseif rowval[v]
 		end
 	end
+	
+	#@info diag[1:20]'
+	#@info diag[end-20:end]'
 	
 	# compute L and U
 	for j=1:n
@@ -65,6 +120,7 @@ function iluAM(A::SparseMatrixCSC{Tv,Ti}) where {Tv, Ti <:Integer}
 end
 
 function forward_subst_old!(y, v, nzval, diag, A)
+	#@info "fso, $(sum(nzval)), $(sum(nzval.^2)), $(sum(diag)), $(A[1,1])"
 	n = A.n
 	colptr = A.colptr
 	rowval = A.rowval
@@ -85,6 +141,7 @@ end
 
 
 function backward_subst_old!(x, y, nzval, diag, A)
+	#@info "bso, $(sum(nzval)), $(sum(nzval.^2)), $(sum(diag)), $(A[1,1])"
 	n = A.n
 	colptr = A.colptr
 	rowval = A.rowval
@@ -99,7 +156,9 @@ function backward_subst_old!(x, y, nzval, diag, A)
 	x
 end
 
+
 function ldiv!(x, ILU::ILUAMPrecon, b)
+	#t = @elapsed begin
 	#@info "iluam ldiv 1"
 	nzval = ILU.nzval
 	diag = ILU.diag
@@ -108,6 +167,10 @@ function ldiv!(x, ILU::ILUAMPrecon, b)
 	#forward_subst!(y, b, ILU)
 	forward_subst_old!(y, b, nzval, diag, A)
 	backward_subst_old!(x, y, nzval, diag, A)
+	#@info "ILUAM:", b[1], y[1], x[1], maximum(abs.(b-A*x)), nnz(A) #, A[10,10]
+	#, b[1], x[1], y[1]#maximum(abs.(b)), maximum(abs.(x))
+	#end
+	#println("$t") #@info t
 	x
 end
 
diff --git a/src/factorizations/iluam.jl b/src/factorizations/iluam.jl
index 6d061b0..24b75be 100644
--- a/src/factorizations/iluam.jl
+++ b/src/factorizations/iluam.jl
@@ -22,12 +22,10 @@ function ILUAMPreconditioner end
 function update!(p::ILUAMPreconditioner)
     flush!(p.A)
     if p.A.phash != p.phash
-        @warn "p.A.phash != p.phash"
         p.factorization = iluAM(p.A.cscmatrix)
         p.phash=p.A.phash
     else
-        @warn "fuck?"
-        ilu0!(p.factorization, p.A.cscmatrix)
+        iluAM!(p.factorization, p.A.cscmatrix)
     end
     p
 end
diff --git a/src/factorizations/pilu_Al-Kurdi_Mittal.jl b/src/factorizations/pilu_Al-Kurdi_Mittal.jl
index 15a8b23..f2861ed 100644
--- a/src/factorizations/pilu_Al-Kurdi_Mittal.jl
+++ b/src/factorizations/pilu_Al-Kurdi_Mittal.jl
@@ -4,7 +4,7 @@
 
 import LinearAlgebra.ldiv!, LinearAlgebra.\, SparseArrays.nnz
 
-@info "PILUAM"
+#@info "PILUAM"
 
 mutable struct PILUAMPrecon{T,N}
 
@@ -51,6 +51,95 @@ function compute_lu!(nzval, point, j0, j1, tid, rowval, colptr, diag, Ti)
     end
 end
 
+function piluAM!(ILU::PILUAMPrecon{Tv,Ti}, A::ExtendableSparseMatrixParallel{Tv,Ti}) where {Tv, Ti <:Integer}
+    @info "piluAM!"
+ diag = ILU.diag + nzval = ILU.nzval + ILU.A = A + start = ILU.start + + ILU.nt = A.nt + nt = A.nt + + ILU.depth = A.depth + depth = A.depth + + + colptr = A.cscmatrix.colptr + rowval = A.cscmatrix.rowval + n = A.cscmatrix.n # number of columns + diag = Vector{Ti}(undef, n) + nzval = Vector{Tv}(undef, length(rowval)) #copy(A.nzval) + point = use_vector_par(n, A.nt, Int32) + + + @threads for tid=1:depth*nt+1 + for j=start[tid]:start[tid+1]-1 + for v=colptr[j]:colptr[j+1]-1 + nzval[v] = A.cscmatrix.nzval[v] + if rowval[v] == j + diag[j] = v + end + #elseif rowval[v] + end + end + end + + for level=1:depth + @threads for tid=1:nt + for j=start[(level-1)*nt+tid]:start[(level-1)*nt+tid+1]-1 + for v=colptr[j]:colptr[j+1]-1 + point[tid][rowval[v]] = v + end + + for v=colptr[j]:diag[j]-1 + i = rowval[v] + for w=diag[i]+1:colptr[i+1]-1 + k = point[tid][rowval[w]] + if k>0 + nzval[k] -= nzval[v]*nzval[w] + end + end + end + + for v=diag[j]+1:colptr[j+1]-1 + nzval[v] /= nzval[diag[j]] + end + + for v=colptr[j]:colptr[j+1]-1 + point[tid][rowval[v]] = zero(Ti) + end + end + end + end + + #point = zeros(Ti, n) #Vector{Ti}(undef, n) + for j=start[depth*nt+1]:start[depth*nt+2]-1 + for v=colptr[j]:colptr[j+1]-1 + point[1][rowval[v]] = v + end + + for v=colptr[j]:diag[j]-1 + i = rowval[v] + for w=diag[i]+1:colptr[i+1]-1 + k = point[1][rowval[w]] + if k>0 + nzval[k] -= nzval[v]*nzval[w] + end + end + end + + for v=diag[j]+1:colptr[j+1]-1 + nzval[v] /= nzval[diag[j]] + end + + for v=colptr[j]:colptr[j+1]-1 + point[1][rowval[v]] = zero(Ti) + end + end + +end + function piluAM(A::ExtendableSparseMatrixParallel{Tv,Ti}) where {Tv, Ti <:Integer} start = A.start nt = A.nt @@ -65,6 +154,22 @@ function piluAM(A::ExtendableSparseMatrixParallel{Tv,Ti}) where {Tv, Ti <:Intege # find diagonal entries # + + #= + for j=1:n + for v=colptr[j]:colptr[j+1]-1 + nzval[v] = A.cscmatrix.nzval[v] + if rowval[v] == j + diag[j] = v + #break + end + #elseif rowval[v] + end + end + =# + + + @threads for tid=1:depth*nt+1 for j=start[tid]:start[tid+1]-1 for v=colptr[j]:colptr[j+1]-1 @@ -77,41 +182,9 @@ function piluAM(A::ExtendableSparseMatrixParallel{Tv,Ti}) where {Tv, Ti <:Intege end end - #= - @info "piluAM" - nzval = copy(A.cscmatrix.nzval) - colptr = A.cscmatrix.colptr - rowval = A.cscmatrix.rowval - #nzval = ILU.nzval - n = A.n # number of columns - diag = Vector{Ti}(undef, n) - start = A.start - nt = A.nt - depth = A.depth - point = use_vector_par(n, nt, Ti) - # find diagonal entries - @threads for tid=1:depth*nt+1 - for j=start[tid]:start[tid+1]-1 - for v=colptr[j]:colptr[j+1]-1 - if rowval[v] == j - diag[j] = v - break - end - #elseif rowval[v] - end - end - end - - # compute L and U - for level=1:depth - @threads for tid=1:nt - compute_lu!(nzval, point, start[(level-1)*nt+tid], start[(level-1)*nt+tid+1], tid, rowval, colptr, diag, Ti) - end - end - - compute_lu!(nzval, point, start[depth*nt+1], start[depth*nt+2], 1, rowval, colptr, diag, Ti) - =# + #@info diag[1:20]' + #@info diag[end-20:end]' for level=1:depth @threads for tid=1:nt @@ -171,6 +244,7 @@ function piluAM(A::ExtendableSparseMatrixParallel{Tv,Ti}) where {Tv, Ti <:Intege end function forward_subst_old!(y, v, nzval, diag, start, nt, depth, A) + #@info "pfso, $(sum(nzval)), $(sum(nzval.^2)), $(sum(diag)), $(A[1,1])" #@info "fwo" n = A.n colptr = A.colptr @@ -200,6 +274,8 @@ end function backward_subst_old!(x, y, nzval, diag, start, nt, depth, A) + #@info "pbso, $(sum(nzval)), $(sum(nzval.^2)), $(sum(diag)), $(A[1,1])" + #@info "bwo" n = A.n colptr = A.colptr @@ -229,6 
+305,7 @@ function backward_subst_old!(x, y, nzval, diag, start, nt, depth, A) end + function ldiv!(x, ILU::PILUAMPrecon, b) #@info "piluam ldiv 1" nzval = ILU.nzval @@ -241,6 +318,8 @@ function ldiv!(x, ILU::PILUAMPrecon, b) #forward_subst!(y, b, ILU) forward_subst_old!(y, b, nzval, diag, start, nt, depth, A) backward_subst_old!(x, y, nzval, diag, start, nt, depth, A) + #@info "PILUAM:", b[1], y[1], x[1], maximum(abs.(b-A*x)), nnz(A) #, A[10,10] + #@info "PILUAM:", maximum(abs.(b-A*x)), b[1], x[1], maximum(abs.(b)), maximum(abs.(x)) x end diff --git a/src/factorizations/piluam.jl b/src/factorizations/piluam.jl index 075f73f..50a46fd 100644 --- a/src/factorizations/piluam.jl +++ b/src/factorizations/piluam.jl @@ -20,14 +20,13 @@ also calculates and stores updates to the off-diagonal entries and thus delivers function PILUAMPreconditioner end function update!(p::PILUAMPreconditioner) + #@warn "Should flush now", nnz_noflush(p.A) flush!(p.A) if p.A.phash != p.phash - @warn "p.A.phash != p.phash" p.factorization = piluAM(p.A) p.phash=p.A.phash else - @warn "fuck?" - ilu0!(p.factorization, p.A.cscmatrix) + piluAM!(p.factorization, p.A) end p end diff --git a/src/matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl b/src/matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl index b635a33..2c91a12 100644 --- a/src/matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl +++ b/src/matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl @@ -285,6 +285,67 @@ function updatentryCSC2!(CSC::SparseArrays.SparseMatrixCSC{Tv, Ti}, i::Integer, end end -Base.size(A::ExtendableSparseMatrixParallel) = (A.cscmatrix.m, A.cscmatrix.n) + + +Base.size(A::ExtendableSparseMatrixParallel) = (A.cscmatrix.m, A.cscmatrix.n) include("struct_flush.jl") + + + +import LinearAlgebra.mul! + +""" +```function LinearAlgebra.mul!(y, A, x)``` + +This overwrites the mul! 
function for A::ExtendableSparseMatrixParallel +""" +function LinearAlgebra.mul!(y::AbstractVector{Tv}, A::ExtendableSparseMatrixParallel{Tv, Ti}, x::AbstractVector{Tv}) where {Tv, Ti<:Integer} + #@info "my matvec" + _, nnzLNK = nnz_noflush(A) + @assert nnzLNK == 0 + #mul!(y, A.cscmatrix, x) + matvec!(y, A, x) +end + + +""" +```function matvec!(y, A, x)``` + +y <- A*x, where y and x are vectors and A is an ExtendableSparseMatrixParallel +this computation is done in parallel, it has the same result as y = A.cscmatrix*x +""" +function matvec!(y::AbstractVector{Tv}, A::ExtendableSparseMatrixParallel{Tv,Ti}, x::AbstractVector{Tv}) where {Tv, Ti<:Integer} + #a1 = @allocated begin + nt = A.nt + depth = A.depth + colptr = A.cscmatrix.colptr + nzv = A.cscmatrix.nzval + rv = A.cscmatrix.rowval + + LinearAlgebra._rmul_or_fill!(y, 0.0) + + #end + #a2 = @allocated + for level=1:depth + @threads for tid::Int64=1:nt + for col::Int64=A.start[(level-1)*nt+tid]:A.start[(level-1)*nt+tid+1]-1 + for row::Int64=colptr[col]:colptr[col+1]-1 # in nzrange(A, col) + y[rv[row]] += nzv[row]*x[col] + end + end + end + end + + @threads for tid=1:1 + #a3 = @allocated + for col::Int64=A.start[depth*nt+1]:A.start[depth*nt+2]-1 + for row::Int64=colptr[col]:colptr[col+1]-1 #nzrange(A, col) + y[rv[row]] += nzv[row]*x[col] + end + end + end + + #println(a1,a2,a3) + y +end diff --git a/src/matrix/ExtendableSparseMatrixParallel/preparatory.jl b/src/matrix/ExtendableSparseMatrixParallel/preparatory.jl index a29356c..7eeb3d3 100644 --- a/src/matrix/ExtendableSparseMatrixParallel/preparatory.jl +++ b/src/matrix/ExtendableSparseMatrixParallel/preparatory.jl @@ -26,8 +26,8 @@ function preparatory_multi_ps_less_reverse(nm, nt, depth, Ti; sequential=false, validate_partition(grid, cellparts, start, allcells, nt, depth) end - @info length.(cfp) - @info minimum(cellparts), maximum(cellparts), nt, depth + #@info length.(cfp) + #@info minimum(cellparts), maximum(cellparts), nt, depth (nnts, s, onr, gi, gc, ni, rni, starts) = get_nnnts_and_sortednodesperthread_and_noderegs_from_cellregs_ps_less_reverse_nopush( cellparts, allcells, start, num_nodes(grid), Ti, nt, depth diff --git a/src/matrix/ExtendableSparseMatrixParallel/struct_flush.jl b/src/matrix/ExtendableSparseMatrixParallel/struct_flush.jl index 38608ad..73471dc 100644 --- a/src/matrix/ExtendableSparseMatrixParallel/struct_flush.jl +++ b/src/matrix/ExtendableSparseMatrixParallel/struct_flush.jl @@ -1,5 +1,9 @@ function flush!(A::ExtendableSparseMatrixParallel; do_dense=false, keep_zeros=true) + _, nnzLNK = nnz_noflush(A) + if nnzLNK == 0 + return + end if !do_dense A.cscmatrix = A.cscmatrix+sparse_flush!(A; keep_zeros) From c3f262c09f64648f35222c82c5775f266df344e9 Mon Sep 17 00:00:00 2001 From: Johannes Taraz Date: Sun, 17 Mar 2024 11:03:16 +0100 Subject: [PATCH 8/9] remove old code --- src/ExtendableSparse.jl | 2 +- src/factorizations/filu_Al-Kurdi_Mittal.jl | 160 -------------- src/factorizations/ilu_Al-Kurdi_Mittal_0.jl | 146 ------------- src/factorizations/ilu_Al-Kurdi_Mittal_1.jl | 229 -------------------- 4 files changed, 1 insertion(+), 536 deletions(-) delete mode 100644 src/factorizations/filu_Al-Kurdi_Mittal.jl delete mode 100644 src/factorizations/ilu_Al-Kurdi_Mittal_0.jl delete mode 100644 src/factorizations/ilu_Al-Kurdi_Mittal_1.jl diff --git a/src/ExtendableSparse.jl b/src/ExtendableSparse.jl index bcf85e6..9c490ca 100644 --- a/src/ExtendableSparse.jl +++ b/src/ExtendableSparse.jl @@ -33,7 +33,7 @@ export SparseMatrixLNK, ExtendableSparseMatrix, flush!, nnz, 
updateindex!, rawup export eliminate_dirichlet, eliminate_dirichlet!, mark_dirichlet -@warn "ESMP!" +#@warn "ESMP!" include("matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl") diff --git a/src/factorizations/filu_Al-Kurdi_Mittal.jl b/src/factorizations/filu_Al-Kurdi_Mittal.jl deleted file mode 100644 index 2099208..0000000 --- a/src/factorizations/filu_Al-Kurdi_Mittal.jl +++ /dev/null @@ -1,160 +0,0 @@ -#module PILUAM -#using Base.Threads -#using LinearAlgebra, SparseArrays - -import LinearAlgebra.ldiv!, LinearAlgebra.\, SparseArrays.nnz - -@info "PILUAM" - -mutable struct PILUAMPrecon{T,N} - - diag::AbstractVector - nzval::AbstractVector - A::AbstractMatrix - -end - -function iluAM!(ILU::PILUAMPrecon{Tv,Ti}, A::ExtendableSparseMatrixParallel{Tv, Ti}) where {Tv, Ti <:Integer} - @info "filuAM!" - diag = ILU.diag - nzval = ILU.nzval - - nzval = copy(A.cscmatrix.nzval) - diag = Vector{Ti}(undef, n) - ILU.A = A - colptr = A.cscmatrix.colptr - rowval = A.cscmatrix.rowval - n = A.n - point = zeros(Ti, n) - - for j=1:n - for v=colptr[j]:colptr[j+1]-1 - if rowval[v] == j - diag[j] = v - break - end - #elseif rowval[v] - end - end - - # compute L and U - for j=1:n - for v=colptr[j]:colptr[j+1]-1 ## start at colptr[j]+1 ?? - point[rowval[v]] = v - end - - for v=colptr[j]:diag[j]-1 - i = rowval[v] - #nzval[v] /= nzval[diag[i]] - for w=diag[i]+1:colptr[i+1]-1 - k = point[rowval[w]] - if k>0 - nzval[k] -= nzval[v]*nzval[w] - end - end - end - - for v=diag[j]+1:colptr[j+1]-1 - nzval[v] /= nzval[diag[j]] - end - - - for v=colptr[j]:colptr[j+1]-1 - point[rowval[v]] = zero(Ti) - end - end - -end - - -function piluAM(A::ExtendableSparseMatrixParallel{Tv,Ti}) where {Tv, Ti <:Integer} - @info "filuAM, $(A[1,1])" - nzval = copy(A.cscmatrix.nzval) - colptr = A.cscmatrix.colptr - rowval = A.cscmatrix.rowval - #nzval = ILU.nzval - n = A.n # number of columns - point = zeros(Ti, n) #Vector{Ti}(undef, n) - diag = Vector{Ti}(undef, n) - - # find diagonal entries - for j=1:n - for v=colptr[j]:colptr[j+1]-1 - if rowval[v] == j - diag[j] = v - break - end - #elseif rowval[v] - end - end - - # compute L and U - for j=1:n - for v=colptr[j]:colptr[j+1]-1 ## start at colptr[j]+1 ?? 
- point[rowval[v]] = v - end - - for v=colptr[j]:diag[j]-1 - i = rowval[v] - #nzval[v] /= nzval[diag[i]] - for w=diag[i]+1:colptr[i+1]-1 - k = point[rowval[w]] - if k>0 - nzval[k] -= nzval[v]*nzval[w] - end - end - end - - for v=diag[j]+1:colptr[j+1]-1 - nzval[v] /= nzval[diag[j]] - end - - - for v=colptr[j]:colptr[j+1]-1 - point[rowval[v]] = zero(Ti) - end - end - #nzval, diag - PILUAMPrecon{Tv,Ti}(diag, nzval, A) -end - - - -function ldiv!(x, ILU::PILUAMPrecon, b) - #@info "iluam ldiv 1" - nzval = ILU.nzval - diag = ILU.diag - A = ILU.A.cscmatrix - y = copy(b) - #forward_subst!(y, b, ILU) - forward_subst_old!(y, b, nzval, diag, A) - backward_subst_old!(x, y, nzval, diag, A) - @info "FILUAM:", b[1], y[1], x[1], maximum(abs.(b-A*x)) - #, maximum(abs.(b-A*x)), b[1], x[1], y[1] - x -end - - -function ldiv!(ILU::PILUAMPrecon, b) - #@info "iluam ldiv 2" - nzval = ILU.nzval - diag = ILU.diag - A = ILU.A.cscmatrix - y = copy(b) - #forward_subst!(y, b, ILU) - forward_subst_old!(y, b, nzval, diag, A) - backward_subst_old!(b, y, nzval, diag, A) - b -end - -function \(ilu::PILUAMPrecon{T,N}, b) where {T,N<:Integer} - x = copy(b) - ldiv!(x, ilu, b) - x -end - -function nnz(ilu::PILUAMPrecon{T,N}) where {T,N<:Integer} - length(ilu.nzval) -end - -#end \ No newline at end of file diff --git a/src/factorizations/ilu_Al-Kurdi_Mittal_0.jl b/src/factorizations/ilu_Al-Kurdi_Mittal_0.jl deleted file mode 100644 index 26f9788..0000000 --- a/src/factorizations/ilu_Al-Kurdi_Mittal_0.jl +++ /dev/null @@ -1,146 +0,0 @@ -module ILUAM -using LinearAlgebra, SparseArrays - -import LinearAlgebra.ldiv!, LinearAlgebra.\, SparseArrays.nnz - - -mutable struct ILUAMPrecon{T,N} - - diag::AbstractVector - nzval::AbstractVector - rowval::AbstractVector - colptr::AbstractVector - -end - -function ILUAMPrecon(A::SparseMatrixCSC{T,N}, b_type=T) where {T,N<:Integer} - @info "ILUAMPrecon" - n = A.n # number of columns - nzval = copy(A.nzval) - diag = Vector{N}(undef, n) - - ILUAMPrecon{T, N}(diag, copy(A.nzval), copy(A.rowval), copy(A.colptr)) -end - -function iluAM!(LU::ILUAMPrecon{T,N}, A::SparseMatrixCSC{T,N}) where {T,N<:Integer} - @info "iluAM!" - nzval = LU.nzval - diag = LU.diag - - colptr = LU.colptr - rowval = LU.rowval - n = A.n # number of columns - point = zeros(N, n) #Vector{N}(undef, n) - - t = zeros(5) - - # find diagonal entries - t[1] = @elapsed for j=1:n - for v=colptr[j]:colptr[j+1]-1 - if rowval[v] == j - diag[j] = v - break - end - #elseif rowval[v] - end - end - - # compute L and U - for j=1:n - t[2] += @elapsed for v=colptr[j]:colptr[j+1]-1 ## start at colptr[j]+1 ?? 
- point[rowval[v]] = v - end - - t[3] += @elapsed for v=colptr[j]:diag[j]-1 - i = rowval[v] - #nzval[v] /= nzval[diag[i]] - for w=diag[i]+1:colptr[i+1]-1 - k = point[rowval[w]] - if k>0 - nzval[k] -= nzval[v]*nzval[w] - end - end - end - - t[4] += @elapsed for v=diag[j]+1:colptr[j+1]-1 - nzval[v] /= nzval[diag[j]] - end - - - t[5] += @elapsed for v=colptr[j]:colptr[j+1]-1 - point[rowval[v]] = zero(N) - end - end - t -end - -function iluAM(A::SparseMatrixCSC{T,N}) where {T,N<:Integer} - t = zeros(6) - t[1] = @elapsed (LU = ILUAMPrecon(A::SparseMatrixCSC{T,N})) - t[2:6] = iluAM!(LU, A) - @info t - LU -end - - -function forward_substitution!(y, ilu::ILUAMPrecon{T,N}, v) where {T,N<:Integer} - n = ilu.A.n - nzval = ilu.nzval - colptr = ilu.colptr - rowval = ilu.rowval - diag = ilu.diag - y .= 0 - @inbounds for j=1:n - y[j] += v[j] - for v=diag[j]+1:colptr[j+1]-1 - y[rowval[v]] -= nzval[v]*y[j] - end - end - y -end - - -function backward_substitution!(x, ilu::ILUAMPrecon{T,N}, y) where {T,N<:Integer} - n = ilu.A.n - nzval = ilu.nzval - colptr = ilu.colptr - rowval = ilu.rowval - diag = ilu.diag - wrk = copy(y) - @inbounds for j=n:-1:1 - x[j] = wrk[j] / nzval[diag[j]] - for i=colptr[j]:diag[j]-1 - wrk[rowval[i]] -= nzval[i]*x[j] - end - end - x -end - -function ldiv!(x, ilu::ILUAMPrecon{T,N}, b) where {T,N<:Integer} - @info "AM ldiv1" - y = copy(b) - forward_substitution!(y, ilu, b) - backward_substitution!(x, ilu, y) - x -end - -function ldiv!(ilu::ILUAMPrecon{T,N}, b) where {T,N<:Integer} - @info "AM ldiv2" - y = copy(b) - forward_substitution!(y, ilu, b) - backward_substitution!(b, ilu, y) - b -end - -function \(ilu::ILUAMPrecon{T,N}, b) where {T,N<:Integer} - @info "AM bs " - x = copy(b) - ldiv!(x, ilu, b) -end - -function nnz(ilu::ILUAMPrecon{T,N}) where {T,N<:Integer} - length(ilu.nzval) -end - - -end \ No newline at end of file diff --git a/src/factorizations/ilu_Al-Kurdi_Mittal_1.jl b/src/factorizations/ilu_Al-Kurdi_Mittal_1.jl deleted file mode 100644 index a599094..0000000 --- a/src/factorizations/ilu_Al-Kurdi_Mittal_1.jl +++ /dev/null @@ -1,229 +0,0 @@ -module ILUAM -using LinearAlgebra, SparseArrays - -#import LinearAlgebra.ldiv!, LinearAlgebra.\, SparseArrays.nnz - -@info "ILUAM" - -mutable struct ILUAMPrecon{T,N} - - diag::AbstractVector - nzval::AbstractVector - A::AbstractMatrix - -end - -function ILUAMPrecon(A::SparseMatrixCSC{T,N}, b_type=T) where {T,N<:Integer} - @info "ILUAMPrecon" - n = A.n # number of columns - nzval = copy(A.nzval) - diag = Vector{N}(undef, n) - - ILUAMPrecon{T, N}(diag, copy(A.nzval), A) -end - - - -function iluAM!(LU::ILUAMPrecon{T,N}, A::SparseMatrixCSC{T,N}) where {T,N<:Integer} - @info "iluAM!" - nzval = LU.nzval - diag = LU.diag - - colptr = LU.A.colptr - rowval = LU.A.rowval - n = A.n # number of columns - point = zeros(N, n) #Vector{N}(undef, n) - - t = zeros(5) - - # find diagonal entries - t[1] = @elapsed for j=1:n - for v=colptr[j]:colptr[j+1]-1 - if rowval[v] == j - diag[j] = v - break - end - #elseif rowval[v] - end - end - - # compute L and U - for j=1:n - t[2] += @elapsed for v=colptr[j]:colptr[j+1]-1 ## start at colptr[j]+1 ?? 
- point[rowval[v]] = v - end - - t[3] += @elapsed for v=colptr[j]:diag[j]-1 - i = rowval[v] - #nzval[v] /= nzval[diag[i]] - for w=diag[i]+1:colptr[i+1]-1 - k = point[rowval[w]] - if k>0 - nzval[k] -= nzval[v]*nzval[w] - end - end - end - - t[4] += @elapsed for v=diag[j]+1:colptr[j+1]-1 - nzval[v] /= nzval[diag[j]] - end - - - t[5] += @elapsed for v=colptr[j]:colptr[j+1]-1 - point[rowval[v]] = zero(N) - end - end - t -end - - -function iluAM(A::SparseMatrixCSC{Tv,Ti}) where {Tv, Ti <:Integer} - @info "iluAM" - nzval = copy(A.nzval) - colptr = A.colptr - rowval = A.rowval - #nzval = ILU.nzval - n = A.n # number of columns - point = zeros(Ti, n) #Vector{Ti}(undef, n) - diag = Vector{Ti}(undef, n) - - # find diagonal entries - for j=1:n - for v=colptr[j]:colptr[j+1]-1 - if rowval[v] == j - diag[j] = v - break - end - #elseif rowval[v] - end - end - - # compute L and U - for j=1:n - for v=colptr[j]:colptr[j+1]-1 ## start at colptr[j]+1 ?? - point[rowval[v]] = v - end - - for v=colptr[j]:diag[j]-1 - i = rowval[v] - #nzval[v] /= nzval[diag[i]] - for w=diag[i]+1:colptr[i+1]-1 - k = point[rowval[w]] - if k>0 - nzval[k] -= nzval[v]*nzval[w] - end - end - end - - for v=diag[j]+1:colptr[j+1]-1 - nzval[v] /= nzval[diag[j]] - end - - - for v=colptr[j]:colptr[j+1]-1 - point[rowval[v]] = zero(Ti) - end - end - #nzval, diag - ILUAMPrecon{Tv,Ti}(diag, nzval, A) -end - -#function iluAM(A::SparseMatrixCSC{T,N}) where {T,N<:Integer} -# t = zeros(6) -# t[1] = @elapsed (LU = ILUAMPrecon(A::SparseMatrixCSC{T,N})) -# t[2:6] = iluAM!(LU, A) -# @info t -# LU -#end - - -function forward_substitution!(y, ilu::ILUAMPrecon{T,N}, v) where {T,N<:Integer} - n = ilu.A.n - nzval = ilu.nzval - colptr = ilu.A.colptr - rowval = ilu.A.rowval - diag = ilu.diag - y .= 0 - @inbounds for j=1:n - y[j] += v[j] - for v=diag[j]+1:colptr[j+1]-1 - y[rowval[v]] -= nzval[v]*y[j] - end - end - y -end - - -function backward_substitution!(x, ilu::ILUAMPrecon{T,N}, y) where {T,N<:Integer} - n = ilu.A.n - nzval = ilu.nzval - colptr = ilu.A.colptr - rowval = ilu.A.rowval - diag = ilu.diag - wrk = copy(y) - @inbounds for j=n:-1:1 - x[j] = wrk[j] / nzval[diag[j]] - for i=colptr[j]:diag[j]-1 - wrk[rowval[i]] -= nzval[i]*x[j] - end - end - x -end - -function ldiv_new!(x, ilu, v) - - n = ilu.A.n - y = Vector{Float64}(undef, n) - y .= 0 - nzval = ilu.nzval - colptr = ilu.A.colptr - rowval = ilu.A.rowval - diag = ilu.diag - #forward - @inbounds for j=1:n - y[j] += v[j] - for v=diag[j]+1:colptr[j+1]-1 - y[rowval[v]] -= nzval[v]*y[j] - end - end - - #backward - wrk = copy(y) - @inbounds for j=n:-1:1 - x[j] = wrk[j] / nzval[diag[j]] - for i=colptr[j]:diag[j]-1 - wrk[rowval[i]] -= nzval[i]*x[j] - end - end - x -end - -function ldiv!(x, ilu::ILUAMPrecon{T,N}, b) where {T,N<:Integer} - #@info "AM ldiv1" - y = copy(b) - forward_substitution!(y, ilu, b) - backward_substitution!(x, ilu, y) - x -end - -function ldiv!(ilu::ILUAMPrecon{T,N}, b) where {T,N<:Integer} - @info "AM ldiv2" - y = copy(b) - forward_substitution!(y, ilu, b) - backward_substitution!(b, ilu, y) - b -end - -function \(ilu::ILUAMPrecon{T,N}, b) where {T,N<:Integer} - @info "AM bs " - x = copy(b) - ldiv!(x, ilu, b) - x -end - -function nnz(ilu::ILUAMPrecon{T,N}) where {T,N<:Integer} - length(ilu.nzval) -end - - -end \ No newline at end of file From 85de6691898ecf8c8b9b1abad545e253183c6d71 Mon Sep 17 00:00:00 2001 From: Johannes Taraz Date: Sun, 24 Mar 2024 17:40:05 +0100 Subject: [PATCH 9/9] added preparation for edgewise assembly --- src/factorizations/pilu_Al-Kurdi_Mittal.jl | 2 +- 
.../ExtendableSparseParallel.jl | 9 +- .../preparatory.jl | 234 ++++++++++++++++-- .../struct_flush.jl | 14 +- 4 files changed, 231 insertions(+), 28 deletions(-) diff --git a/src/factorizations/pilu_Al-Kurdi_Mittal.jl b/src/factorizations/pilu_Al-Kurdi_Mittal.jl index f2861ed..ad9529b 100644 --- a/src/factorizations/pilu_Al-Kurdi_Mittal.jl +++ b/src/factorizations/pilu_Al-Kurdi_Mittal.jl @@ -52,7 +52,7 @@ function compute_lu!(nzval, point, j0, j1, tid, rowval, colptr, diag, Ti) end function piluAM!(ILU::PILUAMPrecon{Tv,Ti}, A::ExtendableSparseMatrixParallel{Tv,Ti}) where {Tv, Ti <:Integer} - @info "piluAM!" + #@info "piluAM!" diag = ILU.diag nzval = ILU.nzval ILU.A = A diff --git a/src/matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl b/src/matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl index 2c91a12..b413c5c 100644 --- a/src/matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl +++ b/src/matrix/ExtendableSparseMatrixParallel/ExtendableSparseParallel.jl @@ -103,7 +103,7 @@ function addtoentry!(A::ExtendableSparseMatrixParallel{Tv, Ti}, i, j, v; known_t if updatentryCSC2!(A.cscmatrix, i, j, v) else - level, tid = last_nz(A.old_noderegions[:, A.rev_new_indices[j]]) + _, tid = last_nz(A.old_noderegions[:, A.rev_new_indices[j]]) A.lnkmatrices[tid][i, A.sortednodesperthread[tid, j]] += v end end @@ -316,7 +316,6 @@ y <- A*x, where y and x are vectors and A is an ExtendableSparseMatrixParallel this computation is done in parallel, it has the same result as y = A.cscmatrix*x """ function matvec!(y::AbstractVector{Tv}, A::ExtendableSparseMatrixParallel{Tv,Ti}, x::AbstractVector{Tv}) where {Tv, Ti<:Integer} - #a1 = @allocated begin nt = A.nt depth = A.depth colptr = A.cscmatrix.colptr @@ -325,8 +324,6 @@ function matvec!(y::AbstractVector{Tv}, A::ExtendableSparseMatrixParallel{Tv,Ti} LinearAlgebra._rmul_or_fill!(y, 0.0) - #end - #a2 = @allocated for level=1:depth @threads for tid::Int64=1:nt for col::Int64=A.start[(level-1)*nt+tid]:A.start[(level-1)*nt+tid+1]-1 @@ -337,8 +334,9 @@ function matvec!(y::AbstractVector{Tv}, A::ExtendableSparseMatrixParallel{Tv,Ti} end end + + @threads for tid=1:1 - #a3 = @allocated for col::Int64=A.start[depth*nt+1]:A.start[depth*nt+2]-1 for row::Int64=colptr[col]:colptr[col+1]-1 #nzrange(A, col) y[rv[row]] += nzv[row]*x[col] @@ -346,6 +344,5 @@ function matvec!(y::AbstractVector{Tv}, A::ExtendableSparseMatrixParallel{Tv,Ti} end end - #println(a1,a2,a3) y end diff --git a/src/matrix/ExtendableSparseMatrixParallel/preparatory.jl b/src/matrix/ExtendableSparseMatrixParallel/preparatory.jl index 7eeb3d3..033a2fa 100644 --- a/src/matrix/ExtendableSparseMatrixParallel/preparatory.jl +++ b/src/matrix/ExtendableSparseMatrixParallel/preparatory.jl @@ -6,24 +6,31 @@ `depth` is the number of partition layers, for depth=1, there are nt parts and 1 separator, for depth=2, the separator is partitioned again, leading to 2*nt+1 submatrices... To assemble the system matrix parallely, things such as `cellsforpart` (= which thread takes which cells) need to be computed in advance. This is done here. 
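The submatrix numbering this implies can be sketched as follows (the helper function name is made up for this illustration; the mapping mirrors the `crmod = (cr-1)%nt+1` and `level = ceil(cr/nt)` arithmetic used in this file):

    # submatrices/colors are numbered 1 … depth*nt+1:
    # color (level-1)*nt + tid is handled by thread `tid` on layer `level`,
    # color depth*nt+1 is the final separator, which is processed by a single thread.
    function color_to_tid_level(cr, nt)   # illustrative helper, not part of the package
        tid   = (cr - 1) % nt + 1
        level = ceil(Int, cr / nt)
        return tid, level
    end

    color_to_tid_level(5, 4)   # with nt = 4 this yields (1, 2): thread 1, layer 2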
""" -function preparatory_multi_ps_less_reverse(nm, nt, depth, Ti; sequential=false, x0=0.0, x1=1.0, minsize_sepa=10, do_print=false, check_partition=false) +function preparatory_multi_ps_less_reverse(nm, nt, depth, Ti; sequential=false, assembly=:cellwise, x0=0.0, x1=1.0, minsize_sepa=10, do_print=false, check_partition=false) grid = getgrid(nm; x0, x1) adepth = 0 if sequential - (allcells, start, cellparts, adepth) = grid_to_graph_ps_multi!(grid, nt, depth; minsize_sepa, do_print)#) + (allcells, start, cellparts, adepth) = grid_to_graph_cellwise!(grid, nt, depth; minsize_sepa, do_print)#) else - (allcells, start, cellparts, adepth) = grid_to_graph_ps_multi_par!(grid, nt, depth; minsize_sepa, do_print) + (allcells, start, cellparts, adepth) = grid_to_graph_cellwise_par!(grid, nt, depth; minsize_sepa, do_print) end if (adepth != depth) && do_print @info "The requested depth of partitioning is too high. The depth is set to $adepth." end - depth = adepth - cfp = bettercellsforpart(cellparts, depth*nt+1) + + if assembly == :cellwise + cfp = bettercellsforpart(cellparts, depth*nt+1) + + else + edgeparts = edgewise_partition_from_cellwise_partition(grid, cellparts) + cfp = bettercellsforpart(edgeparts, depth*nt+1) + end + if check_partition - validate_partition(grid, cellparts, start, allcells, nt, depth) + validate_partition(grid, cellparts, start, allcells, nt, depth, assembly) end #@info length.(cfp) @@ -126,7 +133,7 @@ function get_nnnts_and_sortednodesperthread_and_noderegs_from_cellregs_ps_less_r tmpctr = 1 for cr in sortedcellregs crmod = (cr-1)%nt+1 - level = Int(ceil(cr/nt)) + #level = Int(ceil(cr/nt)) if !(crmod in tmp[1:tmpctr-1]) gictrs[crmod] += 1 # , level] += 1 sortednodesperthread[crmod,nj] = gictrs[crmod] @@ -235,7 +242,7 @@ The function assigns colors/partitons to each cell in the `grid`. First, the gri `nt` is the number of threads. `depth` is the number of partition layers, for depth=1, there are nt parts and 1 separator, for depth=2, the separator is partitioned again, leading to 2*nt+1 submatrices... 
""" -function grid_to_graph_ps_multi!(grid, nt, depth; minsize_sepa=10, do_print=false) +function grid_to_graph_cellwise!(grid, nt, depth; minsize_sepa=10, do_print=false) A = SparseMatrixLNK{Int64, Int64}(num_cells(grid), num_cells(grid)) number_cells_per_node = zeros(Int64, num_nodes(grid)) for j=1:num_cells(grid) @@ -304,11 +311,10 @@ function grid_to_graph_ps_multi!(grid, nt, depth; minsize_sepa=10, do_print=fals end end - return allcells, start, cellregs, actual_depth, ACSC + return allcells, start, cellregs, actual_depth end - -function grid_to_graph_ps_multi_par!(grid, nt, depth; minsize_sepa=10, do_print=false) +function grid_to_graph_cellwise_par!(grid, nt, depth; minsize_sepa=10, do_print=false) As = [ExtendableSparseMatrix{Int64, Int64}(num_cells(grid), num_cells(grid)) for tid=1:nt] number_cells_per_node = zeros(Int64, num_nodes(grid)) @@ -397,6 +403,195 @@ function grid_to_graph_ps_multi_par!(grid, nt, depth; minsize_sepa=10, do_print= return allcells, start, cellregs, actual_depth end +function grid_to_graph_edgewise!(grid, nt, depth; minsize_sepa=10, do_print=false) + ce = grid[CellEdges] + A = SparseMatrixLNK{Int64, Int64}(num_edges(grid), num_edges(grid)) + number_edges_per_node = zeros(Int64, num_nodes(grid)) + + for i=1:num_edges(grid) + for node_id in grid[EdgeNodes][:,i] + number_edges_per_node[node_id] += 1 + end + end + + alledges = zeros(Int64, sum(number_edges_per_node)) + start = ones(Int64, num_nodes(grid)+1) + start[2:end] += cumsum(number_edges_per_node) + number_edges_per_node .= 0 + + for j=1:num_edges(grid) + for node_id in grid[EdgeNodes][:,j] + alledges[start[node_id] + number_edges_per_node[node_id]] = j + number_edges_per_node[node_id] += 1 + end + end + + for j=1:num_nodes(grid) + edges = @view alledges[start[j]:start[j+1]-1] + for (i,id1) in enumerate(edges) + for id2 in edges[i+1:end] + A[id1,id2] = 1 + A[id2,id1] = 1 + end + end + end + + ACSC = SparseArrays.SparseMatrixCSC(A) + + partition = Metis.partition(ACSC, nt) + + sn = Vector{Int64}(undef, num_edges(grid)) + gi = Vector{Int64}(undef, num_edges(grid)) + ctr_sepanodes = 0 + + edgeregs = copy(partition) + for tid=1:nt + for j=1:num_edges(grid) + if edgeregs[j] == tid + rows = vcat(ACSC.rowval[ACSC.colptr[j]:(ACSC.colptr[j+1]-1)], [j]) + if how_many_different_below(edgeregs[rows], nt+1) > 1 + edgeregs[j] = nt+1 #+ctr_sepanodes + ctr_sepanodes += 1 + sn[ctr_sepanodes] = j + gi[ctr_sepanodes] = j + end + end + end + end + + sn = sn[1:ctr_sepanodes] + gi = gi[1:ctr_sepanodes] + + if do_print + @info "At level $(1), we found $ctr_sepanodes cells that have to be treated in the next iteration!" 
+ end + + RART = copy(ACSC) + actual_depth = 1 + for level=1:depth-1 + RART, ctr_sepanodes, sn, gi = separate!(edgeregs, num_edges(grid), RART, nt, level, ctr_sepanodes, sn, gi, do_print) + actual_depth += 1 + if ctr_sepanodes < minsize_sepa + break + end + end + + return alledges, start, edgeregs, actual_depth +end + +function grid_to_graph_edgewise_par!(grid, nt, depth; minsize_sepa=10, do_print=false) + ce = grid[CellEdges] + cn = grid[EdgeNodes] + + As = [ExtendableSparseMatrix{Int64, Int64}(num_edges(grid), num_edges(grid)) for tid=1:nt] + number_edges_per_node = zeros(Int64, num_nodes(grid)) + + + for j=1:num_edges(grid) + tmp = view(cn, :, j) + for node_id in tmp + number_edges_per_node[node_id] += 1 + end + end + + + alledges = zeros(Int64, sum(number_edges_per_node)) + start = ones(Int64, num_nodes(grid)+1) + start[2:end] += cumsum(number_edges_per_node) + number_edges_per_node .= 0 + + for j=1:num_edges(grid) + tmp = view(cn, :, j) + for node_id in tmp + alledges[start[node_id] + number_edges_per_node[node_id]] = j + number_edges_per_node[node_id] += 1 + end + end + + node_range = get_starts(num_nodes(grid), nt) + Threads.@threads for tid=1:nt + for j in node_range[tid]:node_range[tid+1]-1 + edges = @view alledges[start[j]:start[j+1]-1] + l = length(edges) + for (i,id1) in enumerate(edges) + ce = view(edges, i+1:l) + for id2 in ce + As[tid][id1,id2] = 1 + As[tid][id2,id1] = 1 + + end + end + end + ExtendableSparse.flush!(As[tid]) + end + + ACSC = add_all_par!(As).cscmatrix + + cellregs = Metis.partition(ACSC, nt) + + sn = [Vector{Int64}(undef, Int(ceil(num_cells(grid)/nt))) for tid=1:nt] + ctr_sepanodess = zeros(Int64, nt) + + @threads for tid=1:nt + for j=1:num_edges(grid) + if cellregs[j] == tid + rows = vcat(ACSC.rowval[ACSC.colptr[j]:(ACSC.colptr[j+1]-1)], [j]) + if how_many_different_below(cellregs[rows], nt+1) > 1 + cellregs[j] = nt+1 #+ctr_sepanodes + ctr_sepanodess[tid] += 1 + sn[tid][ctr_sepanodess[tid]] = j + end + end + end + end + + for tid=1:nt + sn[tid] = sn[tid][1:ctr_sepanodess[tid]] + end + ctr_sepanodes = sum(ctr_sepanodess) + sn = vcat(sn...) + gi = copy(sn) + + if do_print + @info "At level $(1), we found $ctr_sepanodes edges that have to be treated in the next iteration!" 
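+        # as in the sequential variant above, the per-thread separator lists sn were
+        # merged before this point; separate! below refines the separator per level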
+ end + + RART = ACSC + actual_depth = 1 + for level=1:depth-1 + RART, ctr_sepanodes, sn, gi = separate!(cellregs, num_cells(grid), RART, nt, level, ctr_sepanodes, sn, gi, do_print) + actual_depth += 1 + if ctr_sepanodes < minsize_sepa + break + end + end + + #grid[CellRegions] = cellregs + #grid + return alledges, start, cellregs, actual_depth +end + + +function edgewise_partition_from_cellwise_partition(grid, cellregs) + ce = grid[CellEdges] + if num_edges(grid) == 0 + grid[EdgeNodes] + end + + edgeregs = maximum(cellregs)*ones(Int64, num_edges(grid)) + + for icell=1:num_cells(grid) + tmp = cellregs[icell] + for iedge in ce[:,icell] + if tmp < edgeregs[iedge] + edgeregs[iedge] = tmp + end + end + end + + edgeregs +end + """ `function add_all_par!(As)` @@ -636,10 +831,15 @@ function check_partition(nm, nt, depth) end =# -function validate_partition(grid, cellregs, start, allcells, nt, depth) - @info "Node based validation" +function validate_partition(grid, cellregs, start, allcells, nt, depth, assemblytype) violation_ctr = 0 + if assemblytype == :cellwise + key = CellNodes + else + key = EdgeNodes + end + for j=1:num_nodes(grid) cells = @view allcells[start[j]:start[j+1]-1] sortedcellregs = unique(sort(cellregs[cells])) @@ -651,14 +851,14 @@ function validate_partition(grid, cellregs, start, allcells, nt, depth) violation_ctr += 1 if violation_ctr == 1 - @info "Node Id : ", j - @info "Cellregs: ", sortedcellregs - @info "Levels : ", levels + @info "Node Id : $j (we only show one violation)" + @info "Cellregs: $sortedcellregs" + @info "Levels : $levels" loc = findall(x->x==4, Int.(ceil.(cellregs[allcells[start[j]:start[j+1]-1]]/nt))) cells_at_level4 = allcells[loc.+(start[j]-1)] @info cells_at_level4, cellregs[cells_at_level4] - @info grid[CellNodes][:,cells_at_level4[1]], grid[CellNodes][:,cells_at_level4[2]] + @info grid[key][:,cells_at_level4[1]], grid[key][:,cells_at_level4[2]] end end end diff --git a/src/matrix/ExtendableSparseMatrixParallel/struct_flush.jl b/src/matrix/ExtendableSparseMatrixParallel/struct_flush.jl index 73471dc..1b8fc48 100644 --- a/src/matrix/ExtendableSparseMatrixParallel/struct_flush.jl +++ b/src/matrix/ExtendableSparseMatrixParallel/struct_flush.jl @@ -49,6 +49,9 @@ function dense_flush_keepzeros!( eqctr = 0 tmp = zeros(Ti, size(onr)[1]) + #@warn [As[i].nnz for i=1:nt], [As[i].n for i=1:nt], [As[i].m for i=1:nt] + #@info maximum.([As[i].colptr for i=1:nt]) + for nj=1:As[1].m indptr[nj] = ctr oj = rni[nj] @@ -62,7 +65,10 @@ function dense_flush_keepzeros!( k = s[regmod, nj] if regionctr == 1 while k>0 - #if As[regmod].nzval[k] != 0.0 + if As[regmod].rowval[k] != 0 + if ctr > nnz + @info "ctr > nnz, $nj, $oj" + end indices[ctr] = As[regmod].rowval[k] data[ctr] = As[regmod].nzval[k] @@ -82,12 +88,12 @@ function dense_flush_keepzeros!( ctr += 1 jc += 1 - #end + end k = As[regmod].colptr[k] end else while k>0 - #if As[regmod].nzval[k] != 0.0 + if As[regmod].rowval[k] != 0 indices[ctr] = As[regmod].rowval[k] data[ctr] = As[regmod].nzval[k] @@ -120,7 +126,7 @@ function dense_flush_keepzeros!( ctr += 1 jc += 1 - #end + end k = As[regmod].colptr[k] end