/*
 * riscv-crc32c.c
 *
 * RISC-V Zbc CRC32C (Castagnoli) hardware acceleration test
 *
 * Based on Abseil's production implementation.
 * https://github.com/abseil/abseil-cpp/pull/1986
 * absl/crc/internal/crc_riscv.cc
 *
 * Build commands:
 *   gcc -O2 -o crc32c-gcc-sw riscv-crc32c.c
 *   gcc -O2 -march=rv64gc_zbc -o crc32c-gcc-zbc riscv-crc32c.c
 *   clang -O2 -o crc32c-clang-sw riscv-crc32c.c
 *   clang -O2 -march=rv64gc_zbc -o crc32c-clang-zbc riscv-crc32c.c
 */

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <endian.h>

#define TEST_SIZE (1024 * 1024)  /* 1 MB */
#define ITERATIONS 100

/* CRC-32C (Castagnoli) polynomial: 0x1EDC6F41 (normal), 0x82F63B78 (reflected) */
#define CRC32C_POLY 0x82F63B78

/*
 * Software CRC32C implementation
 * Standard table-driven algorithm used as baseline
 */
static uint32_t crc32c_table[256];

/*
 * Populate crc32c_table with one entry per byte value using the
 * bit-at-a-time definition of the reflected CRC32C polynomial.
 * Must be called once before crc32c_sw().
 */
static void
init_crc32c_table(void)
{
	for (int i = 0; i < 256; i++)
	{
		uint32_t entry = (uint32_t)i;

		for (int bit = 0; bit < 8; bit++)
			entry = (entry & 1) ? (entry >> 1) ^ CRC32C_POLY
			                    : (entry >> 1);
		crc32c_table[i] = entry;
	}
}

/*
 * Table-driven CRC32C over `len` bytes starting at `data`.
 *
 * `crc` is the CRC of any preceding data (pass 0 to start a fresh
 * stream). The state is inverted on entry and exit per the CRC32C
 * convention, so calls can be chained byte-range by byte-range.
 */
static uint32_t
crc32c_sw(uint32_t crc, const uint8_t *data, size_t len)
{
	uint32_t state = ~crc;

	for (size_t i = 0; i < len; i++)
		state = crc32c_table[(state ^ data[i]) & 0xFF] ^ (state >> 8);

	return ~state;
}

#if defined(__riscv) && (__riscv_xlen == 64) && \
    (defined(__riscv_zbc) || defined(__riscv_zbkc))

/*
 * Hardware CRC32C implementation using RISC-V Zbc extension
 *
 * Algorithm from Abseil (PR #1986):
 * - Fold 16-byte blocks using carry-less multiply (clmul/clmulh)
 * - Reduce 128-bit to 64-bit to 32-bit using precomputed constants
 * - Barrett reduction for final 32-bit CRC
 *
 * This is the production-tested algorithm used by Google Abseil.
 */

/*
 * 128-bit value modeled as two 64-bit halves, standing in for a SIMD
 * register during the carry-less-multiply folding below.
 */
typedef struct {
	uint64_t lo;	/* bits 0..63 (least significant) */
	uint64_t hi;	/* bits 64..127 (most significant) */
} V128;

/* Carry-less multiply intrinsics */
/*
 * Zbc clmul: low 64 bits of the 128-bit carry-less product a x b.
 * Requires a core implementing the Zbc (or Zbkc) extension.
 */
static inline uint64_t
clmul(uint64_t a, uint64_t b)
{
	uint64_t out;
	__asm__("clmul %0, %1, %2" : "=r"(out) : "r"(a), "r"(b));
	return out;
}

/*
 * Zbc clmulh: high 64 bits of the 128-bit carry-less product a x b.
 * Companion to clmul() above; together they form the full product.
 */
static inline uint64_t
clmulh(uint64_t a, uint64_t b)
{
	uint64_t out;
	__asm__("clmulh %0, %1, %2" : "=r"(out) : "r"(a), "r"(b));
	return out;
}

static inline V128
clmul128(uint64_t a, uint64_t b)
{
	V128 result;
	result.lo = clmul(a, b);
	result.hi = clmulh(a, b);
	return result;
}

/* Lane-wise XOR of two 128-bit values. */
static inline V128
v128_xor(V128 a, V128 b)
{
	return (V128){ .lo = a.lo ^ b.lo, .hi = a.hi ^ b.hi };
}

/* Keep only the low 32 bits of each 64-bit lane. */
static inline V128
v128_and_mask32(V128 a)
{
	const uint64_t lane_mask = 0x00000000FFFFFFFFull;

	return (V128){ .lo = a.lo & lane_mask, .hi = a.hi & lane_mask };
}

/* Logical right shift of the full 128-bit value by 64 bits. */
static inline V128
v128_shift_right64(V128 a)
{
	return (V128){ .lo = a.hi, .hi = 0 };
}

/* Logical right shift of the full 128-bit value by 32 bits. */
static inline V128
v128_shift_right32(V128 a)
{
	return (V128){
		.lo = (a.lo >> 32) | (a.hi << 32),
		.hi = a.hi >> 32,
	};
}

/*
 * Load 16 little-endian bytes from p into a V128.
 *
 * Uses memcpy instead of a direct (const uint64_t *) cast: the cast
 * form is a strict-aliasing violation and performs a potentially
 * misaligned access (both UB) — crc32c_hw() only guarantees the
 * length is a multiple of 16, not that the buffer is 8-byte aligned.
 * The memcpy compiles to the same loads on targets that allow
 * unaligned access.
 */
static inline V128
v128_load(const uint8_t *p)
{
	uint64_t lo, hi;
	V128 result;

	memcpy(&lo, p, sizeof lo);
	memcpy(&hi, p + 8, sizeof hi);
	result.lo = le64toh(lo);
	result.hi = le64toh(hi);
	return result;
}

/*
 * Core CRC32C algorithm using carry-less multiplication
 * Input: crc in working form (already inverted with ~crc)
 * Output: crc in working form (still inverted)
 */
static uint32_t
crc32c_clmul_core(uint32_t crc_inverted, const uint8_t *buf, uint64_t len)
{
	/*
	 * Precondition (guaranteed by caller crc32c_hw): len >= 16 — the
	 * first 16-byte block is loaded unconditionally — and len is a
	 * multiple of 16, so no tail bytes remain after the fold loop.
	 *
	 * CRC32C (Castagnoli) constants from Abseil. kK5/kK6 fold the
	 * 128-bit accumulator across each 16-byte block, kK6/kK7 perform
	 * the 128->64->32 reduction, and kP1/kP2 drive the final Barrett
	 * reduction. NOTE(review): values copied verbatim from Abseil
	 * crc_riscv.cc (PR #1986); not re-derived here.
	 */
	const uint64_t kK5 = 0x0f20c0dfeull;  /* Folding constant */
	const uint64_t kK6 = 0x14cd00bd6ull;  /* Folding constant */
	const uint64_t kK7 = 0x0dd45aab8ull;  /* 64->32 reduction */
	const uint64_t kP1 = 0x105ec76f0ull;  /* Barrett reduction */
	const uint64_t kP2 = 0x0dea713f1ull;  /* Barrett reduction */

	/* Load first 16-byte block and fold in CRC */
	V128 x = v128_load(buf);
	x.lo ^= (uint64_t)crc_inverted;
	buf += 16;
	len -= 16;

	/* Fold 16-byte blocks into 128-bit accumulator */
	while (len >= 16)
	{
		V128 block = v128_load(buf);
		/* x = clmul(x.lo, kK5) ^ clmul(x.hi, kK6) ^ block */
		V128 lo = clmul128(x.lo, kK5);
		V128 hi = clmul128(x.hi, kK6);
		x = v128_xor(v128_xor(lo, hi), block);
		buf += 16;
		len -= 16;
	}

	/* Reduce 128-bit to 64-bit */
	{
		V128 tmp = clmul128(kK6, x.lo);
		x = v128_xor(v128_shift_right64(x), tmp);
	}

	/* Reduce 64-bit to 32-bit */
	{
		V128 tmp = v128_shift_right32(x);
		x = v128_and_mask32(x);
		x = clmul128(kK7, x.lo);
		x = v128_xor(x, tmp);
	}

	/* Barrett reduction to final 32-bit CRC */
	{
		V128 tmp = v128_and_mask32(x);
		tmp = clmul128(kP2, tmp.lo);
		tmp = v128_and_mask32(tmp);
		tmp = clmul128(kP1, tmp.lo);
		x = v128_xor(x, tmp);
	}

	/* Extract result from second 32-bit lane */
	return (uint32_t)((x.lo >> 32) & 0xFFFFFFFFu);
}

/*
 * High-level CRC32C with hardware acceleration
 * Handles alignment and small buffers, delegates to hardware for large aligned blocks
 */
/*
 * CRC32C with Zbc acceleration.
 *
 * Dispatches small buffers to the table-driven path, folds any leading
 * remainder (length mod 16) in software so the clmul core always sees
 * a non-empty multiple of 16 bytes, then runs the folding kernel.
 * Note the split is by length, not by pointer alignment.
 */
static uint32_t
crc32c_hw(uint32_t crc, const uint8_t *buf, size_t length)
{
	const size_t kMinLen = 32;
	const size_t kChunkLen = 16;
	size_t head;

	/* Folding only pays off with at least two 16-byte blocks. */
	if (length < kMinLen)
		return crc32c_sw(crc, buf, length);

	/* Consume the leading remainder so the rest is a multiple of 16. */
	head = length % kChunkLen;
	if (head != 0)
	{
		crc = crc32c_sw(crc, buf, head);
		buf += head;
		length -= head;
	}

	if (length != 0)
	{
		/* The clmul core operates on the inverted (working-form) CRC. */
		crc = ~crc32c_clmul_core(~crc, buf, length);
	}

	return crc;
}

#else

/* On non-RISC-V or without Zbc, hardware version is same as software */
static uint32_t
crc32c_hw(uint32_t crc, const uint8_t *buf, size_t length)
{
	/* Same interface as the accelerated version; delegates to software. */
	return crc32c_sw(crc, buf, length);
}

#endif /* __riscv && __riscv_zbc */

/* Monotonic timestamp in seconds, for benchmark intervals. */
static double
now(void)
{
	struct timespec t;

	clock_gettime(CLOCK_MONOTONIC, &t);
	return (double)t.tv_sec + t.tv_nsec / 1e9;
}

/*
 * Benchmark crc32c_sw vs crc32c_hw over a 1 MB pseudo-random buffer,
 * then cross-check the two implementations and a known test vector.
 * Returns 0 on success, 1 on allocation failure or any CRC mismatch.
 */
int
main(void)
{
	uint8_t *data;
	uint32_t crc_sw = 0, crc_hw = 0;
	double start, elapsed_sw, elapsed_hw;
	double mb_per_sec;
	size_t i;
	int iter;

	/* Initialize lookup table */
	init_crc32c_table();

	/* Allocate test data; fail cleanly rather than dereference NULL. */
	data = malloc(TEST_SIZE);
	if (data == NULL)
	{
		fprintf(stderr, "malloc of %d bytes failed\n", TEST_SIZE);
		return 1;
	}

	/* Fill with random data (fixed seed keeps runs reproducible) */
	srand(42);
	for (i = 0; i < TEST_SIZE; i++)
		data[i] = rand() & 0xFF;

	/* Benchmark software implementation */
	start = now();
	for (iter = 0; iter < ITERATIONS; iter++)
	{
		crc_sw = crc32c_sw(0, data, TEST_SIZE);
	}
	elapsed_sw = now() - start;
	/* Compute throughput in double to avoid int overflow if sizes grow. */
	mb_per_sec = ((double)TEST_SIZE * ITERATIONS / (1024.0 * 1024.0)) / elapsed_sw;
	printf("sw crc32c: %8.3f sec  (%10.2f MB/s)\n",
	       elapsed_sw, mb_per_sec);

	/* Benchmark hardware implementation */
	start = now();
	for (iter = 0; iter < ITERATIONS; iter++)
	{
		crc_hw = crc32c_hw(0, data, TEST_SIZE);
	}
	elapsed_hw = now() - start;
	mb_per_sec = ((double)TEST_SIZE * ITERATIONS / (1024.0 * 1024.0)) / elapsed_hw;
	printf("hw crc32c: %8.3f sec  (%10.2f MB/s)\n",
	       elapsed_hw, mb_per_sec);

	printf("\ndiff: %.2fx\n", elapsed_sw / elapsed_hw);

	/* Verify correctness */
	if (crc_sw != crc_hw)
	{
		printf("\n[ERROR] Results don't match!\n");
		printf("\tsw: 0x%08X\n", crc_sw);
		printf("\thw: 0x%08X\n", crc_hw);
		free(data);
		return 1;
	}

	printf("match: 0x%08X\n", crc_sw);

	/* Test with known vector; a wrong value now fails the run. */
	{
		const char *test = "123456789";
		const uint32_t expected = 0xE3069283;
		uint32_t result;

		result = crc32c_sw(0, (const uint8_t *)test, strlen(test));
		printf("\nvalidation: CRC32C(\"%s\") = 0x%08X %s\n",
		       test, result,
		       (result == expected) ? "(correct)" : "(WRONG!)");
		if (result != expected)
		{
			free(data);
			return 1;
		}
	}

	/* Test 48-byte pattern: three 16-byte blocks, exercises the
	 * clmul fold loop on a stack buffer of arbitrary alignment. */
	{
		uint8_t buf[48];
		uint32_t sw, hw;

		for (i = 0; i < sizeof buf; i++)
			buf[i] = (uint8_t)i;

		printf("\n=== Testing 48-byte pattern [0..47] ===\n");
		/* %p requires a void * argument to be well-defined. */
		printf("Buffer address: %p (mod 16 = %lu)\n",
		       (void *)buf, (unsigned long)((uintptr_t)buf % 16));

		sw = crc32c_sw(0, buf, sizeof buf);
		hw = crc32c_hw(0, buf, sizeof buf);

		printf("Software: 0x%08X\n", sw);
		printf("Hardware: 0x%08X\n", hw);
		printf("Match: %s\n", (sw == hw) ? "YES" : "NO");
		/* A mismatch here must fail the run like the 1 MB check does. */
		if (sw != hw)
		{
			free(data);
			return 1;
		}
	}

	free(data);
	return 0;
}
