# Python - Requirements to run other scripts...
from CayleyKernel import multiply as kernel
import cupy as cp
import numpy as np
import matplotlib.pyplot as plt

# Configuration
# Highest Cayley-Dickson doubling level to exercise; the largest vector
# dimension used is 2**LEVELS. Keep this at 10 or below: per the original
# author's note, the recursion silently fails above that level.
LEVELS = 9

# Exponents 0..LEVELS and the matching dimensions 1, 2, 4, ..., 2**LEVELS.
powers = np.arange(0, LEVELS + 1)
dimensions = np.power(2, powers)

def conj(x):
    """Return the conjugate of the coefficient vector ``x``.

    The leading component keeps its sign and every other component is
    negated. The result comes from ``-x``, so for NumPy/CuPy arrays the
    input itself is left unmodified.
    """
    flipped = -x      # every component negated, including the leading one
    flipped[0] *= -1  # restore the original sign of the leading component
    return flipped

def _classic_product(x, y):
    """Recursive core of ``classic``; expects equal power-of-two lengths."""
    n = len(x)
    if n == 1:
        return x * y
    m = n // 2
    a, b = x[:m], x[m:]
    c, d = y[:m], y[m:]
    z = cp.zeros(n, dtype=x.dtype)
    # (a, b)(c, d) = (a c - conj(d) b,  d a + b conj(c))
    z[:m] = _classic_product(a, c) - _classic_product(conj(d), b)
    z[m:] = _classic_product(d, a) + _classic_product(b, conj(c))
    return z


def classic(x, y):
    """Multiply two coefficient vectors with the recursive doubling formula.

    Each vector is split into halves ``x = (a, b)`` and ``y = (c, d)`` and the
    product is assembled as ``(a c - conj(d) b, d a + b conj(c))``, recursing
    (four half-size products per level) until the halves have length 1.

    Args:
        x: Left operand, a 1-D array whose length is a power of two.
        y: Right operand, same length as ``x``.

    Returns:
        For length 1, ``x * y``. Otherwise a new CuPy array of the same
        length with ``x``'s dtype.

    Raises:
        ValueError: If the lengths differ, or the length is zero or not a
            power of two. Without this check an empty input recursed until
            ``RecursionError`` and other bad lengths failed deep inside the
            recursion with an unrelated-looking broadcasting error.
    """
    n = len(x)
    if len(y) != n:
        raise ValueError(
            f"operands must have the same length, got {n} and {len(y)}"
        )
    # n & (n - 1) is zero exactly when n has a single bit set.
    if n == 0 or n & (n - 1):
        raise ValueError(f"operand length must be a power of two, got {n}")
    # Validate once here; every half produced by the recursion is then valid.
    return _classic_product(x, y)

# Marker printed once the setup definitions above have been executed.
print('ready')

Results for float32 (<class 'numpy.float32'>)
Dimension    | Level              | Max Absolute Error   | Vector Point Error   | Status
-----------------------------------------------------------------------------------------------
1            | Level 0            | 0.00e+00              | 0.00e+00             | ✅ PASSED
2            | Level 1            | 5.96e-08              | 6.03e-08             | ✅ PASSED
4            | Level 2            | 1.19e-07              | 1.20e-07             | ✅ PASSED
8            | Level 3            | 2.38e-07              | 3.37e-07             | ✅ PASSED
16           | Level 4            | 1.19e-07              | 2.91e-07             | ✅ PASSED
32           | Level 5            | 7.15e-07              | 1.24e-06             | ✅ PASSED
64           | Level 6            | 1.91e-06              | 3.25e-06             | ✅ PASSED
128          | Level 7            | 3.81e-06              | 7.85e-06             | ✅ PASSED
256          | Level 8            | 7.87e-06              | 2.25e-05             | ❌ FAILED
512          | Level 9            | 4.58e-05              | 6.70e-05             | ❌ FAILED

Results for float64 (<class 'numpy.float64'>)
Dimension    | Level              | Max Absolute Error   | Vector Point Error   | Status
-----------------------------------------------------------------------------------------------
1            | Level 0            | 0.00e+00              | 0.00e+00             | ✅ PASSED
2            | Level 1            | 0.00e+00              | 0.00e+00             | ✅ PASSED
4            | Level 2            | 1.11e-16              | 1.57e-16             | ✅ PASSED
8            | Level 3            | 2.22e-16              | 3.10e-16             | ✅ PASSED
16           | Level 4            | 3.33e-16              | 5.43e-16             | ✅ PASSED
32           | Level 5            | 8.88e-16              | 1.72e-15             | ✅ PASSED
64           | Level 6            | 2.00e-15              | 5.42e-15             | ✅ PASSED
128          | Level 7            | 5.33e-15              | 1.35e-14             | ✅ PASSED
256          | Level 8            | 1.75e-14              | 4.07e-14             | ✅ PASSED
512          | Level 9            | 1.42e-13              | 1.77e-13             | ✅ PASSED

done

done

done

Level  | Dimension  | Max Result Component Value     | Zero Divisor Status
-------------------------------------------------------------------------------------
0      | 1          | N/A (Division Algebra)         | 🚫 No ZDs Allowed
1      | 2          | N/A (Division Algebra)         | 🚫 No ZDs Allowed
2      | 4          | N/A (Division Algebra)         | 🚫 No ZDs Allowed
3      | 8          | N/A (Division Algebra)         | 🚫 No ZDs Allowed
4      | 16         | 0.00e+00                       | ✅ PERFECT ZERO
5      | 32         | 0.00e+00                       | ✅ PERFECT ZERO
6      | 64         | 0.00e+00                       | ✅ PERFECT ZERO
7      | 128        | 0.00e+00                       | ✅ PERFECT ZERO
8      | 256        | 0.00e+00                       | ✅ PERFECT ZERO
9      | 512        | 0.00e+00                       | ✅ PERFECT ZERO
10     | 1024       | 0.00e+00                       | ✅ PERFECT ZERO
11     | 2048       | 0.00e+00                       | ✅ PERFECT ZERO
12     | 4096       | 0.00e+00                       | ✅ PERFECT ZERO
13     | 8192       | 0.00e+00                       | ✅ PERFECT ZERO
14     | 16384      | 0.00e+00                       | ✅ PERFECT ZERO
15     | 32768      | 0.00e+00                       | ✅ PERFECT ZERO
16     | 65536      | 0.00e+00                       | ✅ PERFECT ZERO
17     | 131072     | 0.00e+00                       | ✅ PERFECT ZERO
18     | 262144     | 0.00e+00                       | ✅ PERFECT ZERO
done

Level  | Dimension  | CUDA Kernel (ms)   | Classic Loop (ms)  | Speedup
-----------------------------------------------------------------------
0      | 1          | 0.0130             | 0.0055             | 0.4x
1      | 2          | 0.0106             | 0.0962             | 9.0x
2      | 4          | 0.0108             | 0.3890             | 35.9x
3      | 8          | 0.0165             | 1.6218             | 98.4x
4      | 16         | 0.0163             | 6.5328             | 400.0x
5      | 32         | 0.0245             | 25.8794            | 1054.9x
6      | 64         | 0.0453             | 103.6712           | 2286.4x
7      | 128        | 0.0867             | 412.6849           | 4757.4x
8      | 256        | 0.1990             | 1652.9630          | 8305.9x

done

done

Level  | Dimension  | Max Result Component Value     | Zero Divisor Status
-------------------------------------------------------------------------------------
0      | 1          | N/A (Division Algebra)         | 🚫 No ZDs Allowed
1      | 2          | N/A (Division Algebra)         | 🚫 No ZDs Allowed
2      | 4          | N/A (Division Algebra)         | 🚫 No ZDs Allowed
3      | 8          | N/A (Division Algebra)         | 🚫 No ZDs Allowed
4      | 16         | 0.00e+00                       | ✅ PERFECT ZERO
5      | 32         | 0.00e+00                       | ✅ PERFECT ZERO
6      | 64         | 0.00e+00                       | ✅ PERFECT ZERO
7      | 128        | 0.00e+00                       | ✅ PERFECT ZERO
8      | 256        | 0.00e+00                       | ✅ PERFECT ZERO
9      | 512        | 0.00e+00                       | ✅ PERFECT ZERO
10     | 1024       | 0.00e+00                       | ✅ PERFECT ZERO
11     | 2048       | 0.00e+00                       | ✅ PERFECT ZERO
12     | 4096       | 0.00e+00                       | ✅ PERFECT ZERO
13     | 8192       | 0.00e+00                       | ✅ PERFECT ZERO
14     | 16384      | 0.00e+00                       | ✅ PERFECT ZERO
15     | 32768      | 0.00e+00                       | ✅ PERFECT ZERO
16     | 65536      | 0.00e+00                       | ✅ PERFECT ZERO
17     | 131072     | 0.00e+00                       | ✅ PERFECT ZERO
18     | 262144     | 0.00e+00                       | ✅ PERFECT ZERO
done

Level  | Dimension  | CUDA Kernel (ms)   | Classic Loop (ms)  | Speedup
-----------------------------------------------------------------------
0      | 1          | 0.0130             | 0.0055             | 0.4x
1      | 2          | 0.0106             | 0.0962             | 9.0x
2      | 4          | 0.0108             | 0.3890             | 35.9x
3      | 8          | 0.0165             | 1.6218             | 98.4x
4      | 16         | 0.0163             | 6.5328             | 400.0x
5      | 32         | 0.0245             | 25.8794            | 1054.9x
6      | 64         | 0.0453             | 103.6712           | 2286.4x
7      | 128        | 0.0867             | 412.6849           | 4757.4x
8      | 256        | 0.1990             | 1652.9630          | 8305.9x

done

Level  | Dimension  | Max Result Component Value     | Zero Divisor Status
-------------------------------------------------------------------------------------
0      | 1          | N/A (Division Algebra)         | 🚫 No ZDs Allowed
1      | 2          | N/A (Division Algebra)         | 🚫 No ZDs Allowed
2      | 4          | N/A (Division Algebra)         | 🚫 No ZDs Allowed
3      | 8          | N/A (Division Algebra)         | 🚫 No ZDs Allowed
4      | 16         | 0.00e+00                       | ✅ PERFECT ZERO
5      | 32         | 0.00e+00                       | ✅ PERFECT ZERO
6      | 64         | 0.00e+00                       | ✅ PERFECT ZERO
7      | 128        | 0.00e+00                       | ✅ PERFECT ZERO
8      | 256        | 0.00e+00                       | ✅ PERFECT ZERO
9      | 512        | 0.00e+00                       | ✅ PERFECT ZERO
10     | 1024       | 0.00e+00                       | ✅ PERFECT ZERO
11     | 2048       | 0.00e+00                       | ✅ PERFECT ZERO
12     | 4096       | 0.00e+00                       | ✅ PERFECT ZERO
13     | 8192       | 0.00e+00                       | ✅ PERFECT ZERO
14     | 16384      | 0.00e+00                       | ✅ PERFECT ZERO
15     | 32768      | 0.00e+00                       | ✅ PERFECT ZERO
16     | 65536      | 0.00e+00                       | ✅ PERFECT ZERO
17     | 131072     | 0.00e+00                       | ✅ PERFECT ZERO
18     | 262144     | 0.00e+00                       | ✅ PERFECT ZERO
done

Level  | Dimension  | CUDA Kernel (ms)   | Classic Loop (ms)  | Speedup
-----------------------------------------------------------------------
0      | 1          | 0.0130             | 0.0055             | 0.4x
1      | 2          | 0.0106             | 0.0962             | 9.0x
2      | 4          | 0.0108             | 0.3890             | 35.9x
3      | 8          | 0.0165             | 1.6218             | 98.4x
4      | 16         | 0.0163             | 6.5328             | 400.0x
5      | 32         | 0.0245             | 25.8794            | 1054.9x
6      | 64         | 0.0453             | 103.6712           | 2286.4x
7      | 128        | 0.0867             | 412.6849           | 4757.4x
8      | 256        | 0.1990             | 1652.9630          | 8305.9x

done

# Python - Requirements to run other scripts...
from CayleyKernel import multiply as kernel
import cupy as cp
import numpy as np
import matplotlib.pyplot as plt

# Configuration
# Highest Cayley-Dickson doubling level to exercise; the largest vector
# dimension used is 2**LEVELS. Keep this at 10 or below: per the original
# author's note, the recursion silently fails above that level.
LEVELS = 9

# Exponents 0..LEVELS and the matching dimensions 1, 2, 4, ..., 2**LEVELS.
powers = np.arange(0, LEVELS + 1)
dimensions = np.power(2, powers)

def conj(x):
    """Return the conjugate of the coefficient vector ``x``.

    The leading component keeps its sign and every other component is
    negated. The result comes from ``-x``, so for NumPy/CuPy arrays the
    input itself is left unmodified.
    """
    flipped = -x      # every component negated, including the leading one
    flipped[0] *= -1  # restore the original sign of the leading component
    return flipped

def _classic_product(x, y):
    """Recursive core of ``classic``; expects equal power-of-two lengths."""
    n = len(x)
    if n == 1:
        return x * y
    m = n // 2
    a, b = x[:m], x[m:]
    c, d = y[:m], y[m:]
    z = cp.zeros(n, dtype=x.dtype)
    # (a, b)(c, d) = (a c - conj(d) b,  d a + b conj(c))
    z[:m] = _classic_product(a, c) - _classic_product(conj(d), b)
    z[m:] = _classic_product(d, a) + _classic_product(b, conj(c))
    return z


def classic(x, y):
    """Multiply two coefficient vectors with the recursive doubling formula.

    Each vector is split into halves ``x = (a, b)`` and ``y = (c, d)`` and the
    product is assembled as ``(a c - conj(d) b, d a + b conj(c))``, recursing
    (four half-size products per level) until the halves have length 1.

    Args:
        x: Left operand, a 1-D array whose length is a power of two.
        y: Right operand, same length as ``x``.

    Returns:
        For length 1, ``x * y``. Otherwise a new CuPy array of the same
        length with ``x``'s dtype.

    Raises:
        ValueError: If the lengths differ, or the length is zero or not a
            power of two. Without this check an empty input recursed until
            ``RecursionError`` and other bad lengths failed deep inside the
            recursion with an unrelated-looking broadcasting error.
    """
    n = len(x)
    if len(y) != n:
        raise ValueError(
            f"operands must have the same length, got {n} and {len(y)}"
        )
    # n & (n - 1) is zero exactly when n has a single bit set.
    if n == 0 or n & (n - 1):
        raise ValueError(f"operand length must be a power of two, got {n}")
    # Validate once here; every half produced by the recursion is then valid.
    return _classic_product(x, y)

# Marker printed once the setup definitions above have been executed.
print('ready')

# Python - Kernel vs Classic - Zero Divisor checks
import cupy as cp
import numpy as np
from CayleyKernel import multiply as kernel

def get_zero_divisor_error(target_level):
    """
    Multiply a fixed pair of sparse, non-zero vectors with the CUDA kernel
    and report how far their product is from the zero vector.

    Returns None when target_level is below 4 (no probe is run there);
    otherwise the largest absolute component of the product, as a float.
    """
    # Levels 0-3 are skipped: division algebras do not contain zero divisors.
    if target_level < 4:
        return None

    size = 2**target_level
    upper = size // 2  # index where the second half of the vector starts

    lhs = cp.zeros(size, dtype=cp.float64)
    rhs = cp.zeros(size, dtype=cp.float64)

    # Left operand: unit entries at index 1 and at index 2 of the upper half.
    lhs[1] = 1.0
    lhs[upper + 2] = 1.0

    # Right operand: +1 at index 6 and -1 at index 5 of the upper half.
    rhs[6] = 1.0
    rhs[upper + 5] = -1.0

    product = kernel(lhs, rhs)

    # NOTE(review): a kernel that wrongly returned all zeros would also score
    # 0.0 here; consider cross-checking with a product known to be non-zero.
    magnitudes = cp.abs(product)
    return float(cp.max(magnitudes))


# --- Zero-divisor report: one table row per level from 0 to MAX_LEVELS ---
MAX_LEVELS = 18
collected_levels = [*range(MAX_LEVELS + 1)]

print(f"{'Level':<6} | {'Dimension':<10} | {'Max Result Component Value':<30} | {'Zero Divisor Status'}")
print("-" * 85)

for lvl in collected_levels:
    dim = 2**lvl

    # Levels below 4 are reported as not applicable; no product is computed.
    if lvl < 4:
        print(f"{lvl:<6} | {dim:<10} | {'N/A (Division Algebra)':<30} | 🚫 No ZDs Allowed")
        continue

    # Largest absolute component of the product computed by the GPU kernel.
    max_err = get_zero_divisor_error(lvl)

    # Anything under 1e-12 is accepted as zero at double precision.
    status = "✅ PERFECT ZERO" if max_err < 1e-12 else "❌ ERROR DETECTED"

    print(f"{lvl:<6} | {dim:<10} | {max_err:<30.2e} | {status}")
print('done')

import cupy as cp
import numpy as np
import matplotlib.pyplot as plt
from CayleyKernel import multiply as kernel

# Highest doubling level to benchmark; profile_multiplication_speed() times
# dimensions 2**0 through 2**LEVELS.
LEVELS = 8

def _mean_gpu_time_ms(fn, x, y, iterations):
    """Return the mean time in milliseconds of ``fn(x, y)`` over ``iterations`` calls.

    A pair of CUDA events is recorded before and after the loop, and the end
    event is synchronized before the elapsed time is read.
    """
    start_evt = cp.cuda.Event()
    end_evt = cp.cuda.Event()
    start_evt.record()
    for _ in range(iterations):
        fn(x, y)
    end_evt.record()
    end_evt.synchronize()
    return cp.cuda.get_elapsed_time(start_evt, end_evt) / iterations


def _plot_timings(levels, kernel_times, classic_times):
    """Draw both timing curves on a dark-themed, log-scaled chart and show it."""
    _, ax = plt.subplots(figsize=(12, 6.5), facecolor='black')
    ax.set_facecolor('black')

    # Plot performance metrics
    ax.plot(levels, classic_times, marker='o', color='coral', linewidth=2.5, label='Classic Recursive Baseline')
    ax.plot(levels, kernel_times, marker='s', color='mediumspringgreen', linewidth=2.5, label='Custom CUDA Kernel Wrapper')

    # Log-scaled y-axis: the two curves differ by several orders of magnitude
    ax.set_yscale('log')
    ax.set_title("Cayley-Dickson Multiplication Performance Scaling", color='white', fontsize=14, fontweight='bold', pad=15)
    ax.set_xlabel("Dimension Power ($n$ in $2^n$)", color='white', fontsize=12, labelpad=10)
    ax.set_ylabel("Execution Time per Operation (Milliseconds, Log Scale)", color='white', fontsize=12, labelpad=10)

    # Layout grids, ticks, and spine configurations
    ax.set_xticks(levels)
    ax.set_xticklabels([f"$2^{{{p}}}$\n({2**p})" for p in levels], color='white', fontsize=10)
    ax.tick_params(axis='y', colors='white')
    ax.grid(True, which="both", linestyle=':', alpha=0.3, color='gray')

    for spine in ax.spines.values():
        spine.set_color('#444444')

    ax.legend(fontsize=10, loc='upper left', facecolor='#111111', edgecolor='gray', labelcolor='white')
    plt.tight_layout()
    plt.show()


def profile_multiplication_speed(iterations=100):
    """Time the CUDA kernel against the recursive baseline and plot the result.

    For every level from 0 to the module-level ``LEVELS`` (dimension
    ``2**level``), random float64 inputs are generated, both implementations
    are called once as a warm-up, and each is then timed over ``iterations``
    calls. One table row is printed per level and a chart is shown at the end.

    Relies on the module-level names ``LEVELS``, ``kernel`` and ``classic``.

    Args:
        iterations: Number of timed calls per implementation and level. The
            default of 100 matches the previously hard-coded count.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be a positive integer")

    levels = list(range(LEVELS + 1))
    kernel_times = []
    classic_times = []

    print(f"{'Level':<6} | {'Dimension':<10} | {'CUDA Kernel (ms)':<18} | {'Classic Loop (ms)':<18} | {'Speedup'}")
    print("-" * 71)

    for lvl in levels:
        dim = 2**lvl

        # Initialize random test inputs
        x = cp.random.rand(dim, dtype=cp.float64)
        y = cp.random.rand(dim, dtype=cp.float64)

        # Warm-up calls so one-time start-up costs (e.g. compilation) are not timed
        kernel(x, y)
        classic(x, y)
        cp.cuda.Device().synchronize()

        # 1. Benchmark custom CUDA kernel
        t_kernel = _mean_gpu_time_ms(kernel, x, y, iterations)
        kernel_times.append(t_kernel)

        # 2. Benchmark classic recursive baseline
        t_classic = _mean_gpu_time_ms(classic, x, y, iterations)
        classic_times.append(t_classic)

        # Floor the divisor so a zero kernel time cannot divide by zero
        speedup = t_classic / max(t_kernel, 1e-9)
        print(f"{lvl:<6} | {dim:<10} | {t_kernel:<18.4f} | {t_classic:<18.4f} | {speedup:.1f}x")

    _plot_timings(levels, kernel_times, classic_times)

# Run the benchmark (prints the timing table and shows the plot), then print
# a completion marker.
profile_multiplication_speed()
print('done')

Cayley-Dickson CUDA Kernel vs Recursion
~ validation by comparison ~

📊 1. Algorithmic Validation & Precision Analysis

Key Observations:

Recommendation for Production:

🧬 2. Structural Algebra Validation

🌪️ 3. Zero Divisor (ZD) Structural Isolation & Validation

🚀 4. Performance Scaling & Acceleration Analysis

1. The Low-Dimension Latency Overhead ($2^0$)

2. The Algorithmic Complexity Divergence ($2^1$ to $2^8$)

3. Scaling Efficiency & Speedup Metrics

Scaling Summary

🏁 Summary & Findings

🔑 Verified Checklist:

🏁 Appendix Code:

Cayley-Dickson CUDA Kernel vs Recursion ~ validation by comparison ~

📊 1. Algorithmic Validation & Precision Analysis

Key Observations:

Recommendation for Production:

🧬 2. Structural Algebra Validation

🌪️ 3. Zero Divisor (ZD) Structural Isolation & Validation

🚀 4. Performance Scaling & Acceleration Analysis

1. The Low-Dimension Latency Overhead ($2^0$)

2. The Algorithmic Complexity Divergence ($2^1$ to $2^8$)

3. Scaling Efficiency & Speedup Metrics

Scaling Summary

🏁 Summary & Findings

🔑 Verified Checklist:

🏁 Appendix Code:

Cayley-Dickson CUDA Kernel vs Recursion
~ validation by comparison ~