/* SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only */
/* Copyright (c) 2022-2025 Brett Sheffield <bacs@librecast.net> */

#include <matrix.h>
#include <gf256.h>

/*
 * XOR row `srow` of `src` into row `drow` of `dst`:
 * dst[drow][j] ^= src[srow][j] for every column j.
 *
 * Both matrices must have the same number of columns (asserted).
 * Columns are consumed 32 bytes at a time, then 16 bytes at a time, and any
 * remainder is handled one byte at a time.
 */
void matrix_row_add_avx2(matrix_t *dst, const int drow, const matrix_t *src, const int srow)
{
	assert(matrix_cols(dst) == matrix_cols(src));
	uint8_t *out = matrix_ptr_row(dst, drow);
	const uint8_t *in = matrix_ptr_row(src, srow);
	const int ncols = matrix_cols(dst);
	const int end32 = ncols - ncols % 32; /* last offset reachable in 32-byte steps */
	const int end16 = ncols - ncols % 16; /* last offset reachable in 16-byte steps */
	int k = 0;

	/* unaligned load/store intrinsics are used throughout */
	while (k < end32) {
		const __m256i a = _mm256_loadu_si256((const __m256i_u *)(in + k));
		const __m256i b = _mm256_loadu_si256((const __m256i_u *)(out + k));
		_mm256_storeu_si256((__m256i *)(out + k), _mm256_xor_si256(b, a));
		k += 32;
	}
	/* at most one 16-byte chunk can remain after the 32-byte pass */
	while (k < end16) {
		const __m128i a = _mm_loadu_si128((const __m128i_u *)(in + k));
		const __m128i b = _mm_loadu_si128((const __m128i_u *)(out + k));
		_mm_storeu_si128((__m128i *)(out + k), _mm_xor_si128(b, a));
		k += 16;
	}
	/* scalar tail */
	while (k < ncols) {
		out[k] ^= in[k];
		k++;
	}
}

/*
 * Method adapted from the technique described in:
 * J. S. Plank and K. M. Greenan and E. L. Miller (2013)
 * "Screaming Fast Galois Field Arithmetic Using Intel SIMD Instructions"
 * http://web.eecs.utk.edu/~jplank/plank/papers/FAST-2013-GF.html
 */
/*
 * 16-byte table-lookup multiply. For every byte a of A the result byte is
 *     t1[a & 0x0f] ^ t2[a >> 4]
 * where only the low 128 bits of *t1 and *t2 are used as 16-entry tables.
 * Callers in this file load t1/t2 from GF256LR[y][0] and GF256LR[y][1].
 */
static __m128i m256_mul_128(const __m256i *t1, const __m256i *t2, __m128i A)
{
	const __m128i mask1 = _mm_set1_epi8((uint8_t)0x0f);
	const __m128i mask2 = _mm_set1_epi8((uint8_t)0xf0);
	__m128i l, h;
	/* low nibble of each byte indexes t1 */
	l = _mm_and_si128(A, mask1);
	l = _mm_shuffle_epi8(_mm256_castsi256_si128(*t1), l);
	/* mask before the 64-bit shift so no bits leak in from the next byte */
	h = _mm_and_si128(A, mask2);
	h = _mm_srli_epi64(h, 4);
	/* high nibble indexes t2; keep it in h so the t1 product in l survives */
	h = _mm_shuffle_epi8(_mm256_castsi256_si128(*t2), h);
	return _mm_xor_si128(h, l);
}

static void m256_mul_128_inplace(const __m256i *t1, const __m256i *t2, uint8_t *d)
{
	__m128i D = _mm_loadu_si128((const __m128i_u *)d);
	D = m256_mul_128(t1, t2, D);
	_mm_storeu_si128((__m128i*)d, D);
}

/*
 * 32-byte table-lookup multiply. For every byte a of A the result byte is
 * the XOR of a *t1 entry selected by (a & 0x0f) and a *t2 entry selected by
 * (a >> 4). The byte shuffle works within each 128-bit half, so each half of
 * A is looked up in the matching half of the tables.
 */
static __m256i m256_mul_256(const __m256i *t1, const __m256i *t2, __m256i A)
{
	const __m256i nib_lo = _mm256_set1_epi8((uint8_t)0x0f);
	const __m256i nib_hi = _mm256_set1_epi8((uint8_t)0xf0);

	/* per-byte table indices: low nibble as-is, high nibble moved down */
	const __m256i idx_lo = _mm256_and_si256(A, nib_lo);
	/* masking first keeps the 64-bit shift from pulling in neighbouring bits */
	const __m256i idx_hi = _mm256_srli_epi64(_mm256_and_si256(A, nib_hi), 4);

	const __m256i part_lo = _mm256_shuffle_epi8(*t1, idx_lo);
	const __m256i part_hi = _mm256_shuffle_epi8(*t2, idx_hi);

	return _mm256_xor_si256(part_hi, part_lo);
}

static void m256_mul_256_inplace(const __m256i *t1, const __m256i *t2, uint8_t *d)
{
	__m256i D = _mm256_loadu_si256((const __m256i_u *)d);
	D = m256_mul_256(t1, t2, D);
	_mm256_storeu_si256((__m256i*)d, D);
}

/*
 * Multiply row `row` of `m` by `y` in place, starting at column `off`
 * and running to the last column (m->cols).
 *
 * The two 16-byte tables GF256LR[y][0] and GF256LR[y][1] are copied into
 * both halves of a 256-bit register and handed to the lookup helpers as
 * t1 and t2. Bytes not covered by the vector loops use GF256MUL().
 */
void matrix_row_mul_avx2(matrix_t *m, const int row, const int off, const uint8_t y)
{
	const __m128i tbl0 = _mm_loadu_si128((const __m128i_u *)GF256LR[y][0]);
	const __m128i tbl1 = _mm_loadu_si128((const __m128i_u *)GF256LR[y][1]);
	const __m256i t1 = _mm256_broadcastsi128_si256(tbl0);
	const __m256i t2 = _mm256_broadcastsi128_si256(tbl1);
	uint8_t *base = matrix_ptr_row(m, row) + off;
	const int len = m->cols - off;
	const int vec_end = len - len % 32;
	int k = 0;

	while (k < vec_end) {
		m256_mul_256_inplace(&t1, &t2, base + k);
		k += 32;
	}
	/*
	 * NOTE(review): vec_end is a multiple of 32 and the loop above leaves
	 * k >= vec_end, so this 16-byte pass never executes; up to 31 trailing
	 * bytes fall through to the scalar loop below.
	 */
	while (k < vec_end) {
		m256_mul_128_inplace(&t1, &t2, base + k);
		k += 16;
	}
	while (k < len) {
		base[k] = GF256MUL(base[k], y);
		k++;
	}
}

/*
 * Multiply-accumulate one row of `m` into another:
 * m[rdst][j] ^= m[rsrc][j] * y for every column j from `off` to the last
 * column (m->cols), where the product is the table-lookup multiply done by
 * m256_mul_256()/m256_mul_128(), or GF256MUL() for the scalar tail.
 *
 * `y` must be non-zero (asserted). The tables GF256LR[y][0] and
 * GF256LR[y][1] are copied into both halves of a 256-bit register and
 * passed to the helpers as t1 and t2.
 */
void matrix_row_mul_byrow_avx2(matrix_t *m, const int rdst, const int off, const int rsrc, const uint8_t y)
{
	assert(y);
	const __m128i tbl0 = _mm_loadu_si128((const __m128i_u *)GF256LR[y][0]);
	const __m128i tbl1 = _mm_loadu_si128((const __m128i_u *)GF256LR[y][1]);
	const __m256i t1 = _mm256_broadcastsi128_si256(tbl0);
	const __m256i t2 = _mm256_broadcastsi128_si256(tbl1);
	uint8_t *out = matrix_ptr_row(m, rdst) + off;
	const uint8_t *in = matrix_ptr_row(m, rsrc) + off;
	const int len = m->cols - off;
	const int vec_end = len - len % 32;
	int k = 0;

	while (k < vec_end) {
		const __m256i src = _mm256_loadu_si256((const __m256i_u *)(in + k));
		const __m256i acc = _mm256_loadu_si256((const __m256i_u *)(out + k));
		const __m256i prod = m256_mul_256(&t1, &t2, src);
		_mm256_storeu_si256((__m256i *)(out + k), _mm256_xor_si256(acc, prod));
		k += 32;
	}
	/*
	 * NOTE(review): vec_end is a multiple of 32 and the loop above leaves
	 * k >= vec_end, so this 16-byte pass never executes; up to 31 trailing
	 * bytes fall through to the scalar loop below.
	 */
	while (k < vec_end) {
		const __m128i src = _mm_loadu_si128((const __m128i_u *)(in + k));
		const __m128i acc = _mm_loadu_si128((const __m128i_u *)(out + k));
		const __m128i prod = m256_mul_128(&t1, &t2, src);
		_mm_storeu_si128((__m128i *)(out + k), _mm_xor_si128(acc, prod));
		k += 16;
	}
	while (k < len) {
		out[k] ^= GF256MUL(in[k], y);
		k++;
	}
}
