//go:build !noasm && !appengine && !gccgo // +build !noasm,!appengine,!gccgo // Copyright 2015, Klaus Post, see LICENSE for details. // Copyright 2019, Minio, Inc. package reedsolomon import ( "sync" ) //go:noescape func _galMulAVX512Parallel81(in, out [][]byte, matrix *[matrixSize81]byte, addTo bool) //go:noescape func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool) //go:noescape func _galMulAVX512Parallel84(in, out [][]byte, matrix *[matrixSize84]byte, addTo bool) const ( dimIn = 8 // Number of input rows processed simultaneously dimOut81 = 1 // Number of output rows processed simultaneously for x1 routine dimOut82 = 2 // Number of output rows processed simultaneously for x2 routine dimOut84 = 4 // Number of output rows processed simultaneously for x4 routine matrixSize81 = (16 + 16) * dimIn * dimOut81 // Dimension of slice of matrix coefficient passed into x1 routine matrixSize82 = (16 + 16) * dimIn * dimOut82 // Dimension of slice of matrix coefficient passed into x2 routine matrixSize84 = (16 + 16) * dimIn * dimOut84 // Dimension of slice of matrix coefficient passed into x4 routine ) // Construct block of matrix coefficients for single output row in parallel func setupMatrix81(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize81]byte) { offset := 0 for c := inputOffset; c < inputOffset+dimIn; c++ { for iRow := outputOffset; iRow < outputOffset+dimOut81; iRow++ { if c < len(matrixRows[iRow]) { coeff := matrixRows[iRow][c] copy(matrix[offset*32:], mulTableLow[coeff][:]) copy(matrix[offset*32+16:], mulTableHigh[coeff][:]) } else { // coefficients not used for this input shard (so null out) v := matrix[offset*32 : offset*32+32] for i := range v { v[i] = 0 } } offset += dimIn if offset >= dimIn*dimOut81 { offset -= dimIn*dimOut81 - 1 } } } } // Construct block of matrix coefficients for 2 output rows in parallel func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize82]byte) { offset := 0 for c := inputOffset; c < inputOffset+dimIn; c++ { for iRow := outputOffset; iRow < outputOffset+dimOut82; iRow++ { if c < len(matrixRows[iRow]) { coeff := matrixRows[iRow][c] copy(matrix[offset*32:], mulTableLow[coeff][:]) copy(matrix[offset*32+16:], mulTableHigh[coeff][:]) } else { // coefficients not used for this input shard (so null out) v := matrix[offset*32 : offset*32+32] for i := range v { v[i] = 0 } } offset += dimIn if offset >= dimIn*dimOut82 { offset -= dimIn*dimOut82 - 1 } } } } // Construct block of matrix coefficients for 4 output rows in parallel func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize84]byte) { offset := 0 for c := inputOffset; c < inputOffset+dimIn; c++ { for iRow := outputOffset; iRow < outputOffset+dimOut84; iRow++ { if c < len(matrixRows[iRow]) { coeff := matrixRows[iRow][c] copy(matrix[offset*32:], mulTableLow[coeff][:]) copy(matrix[offset*32+16:], mulTableHigh[coeff][:]) } else { // coefficients not used for this input shard (so null out) v := matrix[offset*32 : offset*32+32] for i := range v { v[i] = 0 } } offset += dimIn if offset >= dimIn*dimOut84 { offset -= dimIn*dimOut84 - 1 } } } } // Invoke AVX512 routine for single output row in parallel func galMulAVX512Parallel81(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix81 *[matrixSize81]byte) { done := stop - start if done <= 0 { return } inputEnd := inputOffset + dimIn if inputEnd > len(in) { inputEnd = len(in) } outputEnd := outputOffset + dimOut81 if outputEnd > len(out) { outputEnd = len(out) } // We know the max size, alloc temp array. var inTmp [dimIn][]byte for i, v := range in[inputOffset:inputEnd] { inTmp[i] = v[start:stop] } var outTmp [dimOut81][]byte for i, v := range out[outputOffset:outputEnd] { outTmp[i] = v[start:stop] } addTo := inputOffset != 0 // Except for the first input column, add to previous results _galMulAVX512Parallel81(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix81, addTo) done = start + ((done >> 6) << 6) if done < stop { galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in) } } // Invoke AVX512 routine for 2 output rows in parallel func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix82 *[matrixSize82]byte) { done := stop - start if done <= 0 { return } inputEnd := inputOffset + dimIn if inputEnd > len(in) { inputEnd = len(in) } outputEnd := outputOffset + dimOut82 if outputEnd > len(out) { outputEnd = len(out) } // We know the max size, alloc temp array. var inTmp [dimIn][]byte for i, v := range in[inputOffset:inputEnd] { inTmp[i] = v[start:stop] } var outTmp [dimOut82][]byte for i, v := range out[outputOffset:outputEnd] { outTmp[i] = v[start:stop] } addTo := inputOffset != 0 // Except for the first input column, add to previous results _galMulAVX512Parallel82(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix82, addTo) done = start + ((done >> 6) << 6) if done < stop { galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in) } } // Invoke AVX512 routine for 4 output rows in parallel func galMulAVX512Parallel84(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix84 *[matrixSize84]byte) { done := stop - start if done <= 0 { return } inputEnd := inputOffset + dimIn if inputEnd > len(in) { inputEnd = len(in) } outputEnd := outputOffset + dimOut84 if outputEnd > len(out) { outputEnd = len(out) } // We know the max size, alloc temp array. var inTmp [dimIn][]byte for i, v := range in[inputOffset:inputEnd] { inTmp[i] = v[start:stop] } var outTmp [dimOut84][]byte for i, v := range out[outputOffset:outputEnd] { outTmp[i] = v[start:stop] } addTo := inputOffset != 0 // Except for the first input column, add to previous results _galMulAVX512Parallel84(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix84, addTo) done = start + ((done >> 6) << 6) if done < stop { galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in) } } func galMulAVX512LastInput(inputOffset int, inputEnd int, outputOffset int, outputEnd int, matrixRows [][]byte, done int, stop int, out [][]byte, in [][]byte) { for c := inputOffset; c < inputEnd; c++ { for iRow := outputOffset; iRow < outputEnd; iRow++ { if c < len(matrixRows[iRow]) { mt := mulTable[matrixRows[iRow][c]][:256] for i := done; i < stop; i++ { if c == 0 { // only set value for first input column out[iRow][i] = mt[in[c][i]] } else { // and add for all others out[iRow][i] ^= mt[in[c][i]] } } } } } } // Perform the same as codeSomeShards, but taking advantage of // AVX512 parallelism for up to 4x faster execution as compared to AVX2 func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { // Process using no goroutines start, end := 0, r.o.perRound if end > byteCount { end = byteCount } for start < byteCount { matrix84 := [matrixSize84]byte{} matrix82 := [matrixSize82]byte{} matrix81 := [matrixSize81]byte{} outputRow := 0 // First process (multiple) batches of 4 output rows in parallel if outputRow+dimOut84 <= outputCount { for ; outputRow+dimOut84 <= outputCount; outputRow += dimOut84 { for inputRow := 0; inputRow < len(inputs); inputRow += dimIn { setupMatrix84(matrixRows, inputRow, outputRow, &matrix84) galMulAVX512Parallel84(inputs, outputs, matrixRows, inputRow, outputRow, start, end, &matrix84) } } } // Then process a (single) batch of 2 output rows in parallel if outputRow+dimOut82 <= outputCount { for inputRow := 0; inputRow < len(inputs); inputRow += dimIn { setupMatrix82(matrixRows, inputRow, outputRow, &matrix82) galMulAVX512Parallel82(inputs, outputs, matrixRows, inputRow, outputRow, start, end, &matrix82) } outputRow += dimOut82 } // Lastly, we may have a single output row left (for uneven parity) if outputRow < outputCount { for inputRow := 0; inputRow < len(inputs); inputRow += dimIn { setupMatrix81(matrixRows, inputRow, outputRow, &matrix81) galMulAVX512Parallel81(inputs, outputs, matrixRows, inputRow, outputRow, start, end, &matrix81) } } start = end end += r.o.perRound if end > byteCount { end = byteCount } } } // Perform the same as codeSomeShards, but taking advantage of // AVX512 parallelism for up to 4x faster execution as compared to AVX2 func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { var wg sync.WaitGroup do := byteCount / r.o.maxGoroutines if do < r.o.minSplitSize { do = r.o.minSplitSize } // Make sizes divisible by 64 do = (do + 63) & (^63) start := 0 for start < byteCount { if start+do > byteCount { do = byteCount - start } wg.Add(1) go func(grStart, grStop int) { start, stop := grStart, grStart+r.o.perRound if stop > grStop { stop = grStop } // Loop for each round. matrix84 := [matrixSize84]byte{} matrix82 := [matrixSize82]byte{} matrix81 := [matrixSize81]byte{} for start < grStop { outputRow := 0 // First process (multiple) batches of 4 output rows in parallel if outputRow+dimOut84 <= outputCount { // 1K matrix buffer for ; outputRow+dimOut84 <= outputCount; outputRow += dimOut84 { for inputRow := 0; inputRow < len(inputs); inputRow += dimIn { setupMatrix84(matrixRows, inputRow, outputRow, &matrix84) galMulAVX512Parallel84(inputs, outputs, matrixRows, inputRow, outputRow, start, stop, &matrix84) } } } // Then process a (single) batch of 2 output rows in parallel if outputRow+dimOut82 <= outputCount { // 512B matrix buffer for inputRow := 0; inputRow < len(inputs); inputRow += dimIn { setupMatrix82(matrixRows, inputRow, outputRow, &matrix82) galMulAVX512Parallel82(inputs, outputs, matrixRows, inputRow, outputRow, start, stop, &matrix82) } outputRow += dimOut82 } // Lastly, we may have a single output row left (for uneven parity) if outputRow < outputCount { for inputRow := 0; inputRow < len(inputs); inputRow += dimIn { setupMatrix81(matrixRows, inputRow, outputRow, &matrix81) galMulAVX512Parallel81(inputs, outputs, matrixRows, inputRow, outputRow, start, stop, &matrix81) } } start = stop stop += r.o.perRound if stop > grStop { stop = grStop } } wg.Done() }(start, start+do) start += do } wg.Wait() }