/* -*- coding: utf-8 -*- * p448.c - Modular calculation with p448: 2^448 - 2^224 - 1 * * Copyright (C) 2021 Free Software Initiative of Japan * Author: NIIBE Yutaka * * This file is a part of Gnuk, a GnuPG USB Token implementation. * * Gnuk is free software: you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Gnuk is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public * License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . * */ #include #include "p448.h" #define MASK_28BITS 0x0fffffff static void p448_add_raw (p448_t *x, const p448_t *a, const p448_t *b) { int i; for (i = 0; i < N_REDUNDANT_LIMBS; i++) x->limb[i] = a->limb[i] + b->limb[i]; } static void p448_sub_raw (p448_t *x, const p448_t *a, const p448_t *b) { int i; for (i = 0; i < N_REDUNDANT_LIMBS; i++) x->limb[i] = a->limb[i] - b->limb[i]; } static uint64_t mul64_32x32 (const uint32_t a, const uint32_t b) { return ((uint64_t)a) * b; } /** * Compute X = A * B mod p448 */ /* * When we set phi = 2^224, p448 can be expressed as: * * p448 = phi^2 - phy - 1 * * Here, using the right hand side and make a fomula * * phi^2 - phy - 1 = 0 * * it is the fomula where it's solution is golden ratio. * * By analogy, so, p448 is called "golden-ratio prime". * * When we set phi = 2^224, Karatsuba multiplication goes like: * * (p + q * phi) * (r + s * phi) * = pr + (ps + qr)*phy + qs*phi^2 * == (pr + qs) + (ps + qr + qs) * phy (mod p448) * = (pr + qs) + ((p + q)*(r + s) - pr) * phy * * That is, it can be done by three times of 224-bit multiplications * (instead of four). * * Let us see more detail. 
*
 * The formula above is calculated to:
 *
 *     = lower224(pr + qs) + upper224(pr + qs)*phi
 *       + lower224((p + q)*(r + s) - pr)*phi
 *       + upper224((p + q)*(r + s) - pr)*phi^2       (mod p448)
 *    == lower224(pr + qs)
 *       + upper224((p + q)*(r + s) - pr)
 *       + (upper224(pr + qs)
 *          + lower224((p + q)*(r + s) - pr)
 *          + upper224((p + q)*(r + s) - pr))*phi     (mod p448)
 *     = lower224(pr + qs)
 *       + upper224((p + q)*(r + s) - pr)
 *       + (lower224((p + q)*(r + s) - pr)
 *          + upper224((p + q)*(r + s) + qs)) * phi
 *
 */
/* Here is a figure of: multiplication by 8-limb * 8-limb
 * (a..h are limbs of one operand, i..p of the other):
 *
 *                            a  b  c  d  e  f  g  h
 *                            i  j  k  l  m  n  o  p
 *   ---------------------------------------------
 *                           ap bp cp dp ep fp gp hp
 *                        ao bo co do eo fo go ho
 *                     an bn cn dn en fn gn hn
 *                  am bm cm dm em fm gm hm
 *               al bl cl dl el fl gl hl
 *            ak bk ck dk ek fk gk hk
 *         aj bj cj dj ej fj gj hj
 *      ai bi ci di ei fi gi hi
 *
 * Considering lower224, it's:
 *                           ap bp cp dp ep fp gp hp
 *                              bo co do eo fo go ho
 *                                 cn dn en fn gn hn
 *                                    dm em fm gm hm
 *                                       el fl gl hl
 *                                          fk gk hk
 *                                             gj hj
 *                                                hi
 * Considering upper224, it's:
 *                        ao
 *                     an bn
 *                  am bm cm
 *               al bl cl dl
 *            ak bk ck dk ek
 *         aj bj cj dj ej fj
 *      ai bi ci di ei fi gi
 */
void
p448_mul (p448_t *__restrict__ x, const p448_t *a, const p448_t *b)
{
  int i, j;
  uint64_t v64_0, v64_1, v64_2;
  uint32_t p_q[8], r_s[8];
  uint32_t *px;
  const uint32_t *pa, *pb;

  px = x->limb;
  pa = a->limb;
  pb = b->limb;

  /* Firstly, we do Karatsuba preparation: p_q = p + q, r_s = r + s.  */
  for (i = 0; i < 8; i++)
    {
      p_q[i] = pa[i] + pa[i+8];
      r_s[i] = pb[i] + pb[i+8];
    }

  v64_0 = v64_1 = 0;

  /* v64_0 accumulates the low-224 result limb, v64_1 the high-224 one;
     their upper bits carry into the next iteration.  */
  for (j = 0; j < 8; j++)
    {
      v64_2 = 0;
      /* Compute lower half of limbs (lower224).
       *     __   <-- j
       *    | /  |
       *    |/   v i
       */
      for (i = 0; i <= j; i++)
        {
          v64_0 += mul64_32x32 (pa[8+j-i], pb[8+i]); /* accumulating q*s */
          v64_1 += mul64_32x32 (p_q[j-i], r_s[i]);   /* accumulating p_q*r_s */
          v64_2 += mul64_32x32 (pa[j-i], pb[i]);     /* accumulating p*r */
        }

      v64_0 += v64_2;           /* Compute pr + qs.  */
      v64_1 -= v64_2;           /* Compute p_q*r_s - pr.  */

      v64_2 = 0;
      /* Compute upper half of limbs (upper224).
       *          <-- j
       *     /|  |
       *    /_|  v i
       */
      for (; i < 8; i++)
        {
          v64_0 -= mul64_32x32 (pa[8+j-i], pb[i]);   /* accumulating -p*r */
          v64_1 += mul64_32x32 (pa[16+j-i], pb[8+i]);/* accumulating q*s */
          v64_2 += mul64_32x32 (p_q[8+j-i], r_s[i]); /* accumulating p_q*r_s */
        }

      v64_0 += v64_2;           /* Compute p_q*r_s - pr.  */
      v64_1 += v64_2;           /* Compute p_q*r_s + qs.  */

      px[j] = v64_0 & MASK_28BITS;
      px[j+8] = v64_1 & MASK_28BITS;
      v64_0 >>= 28;
      v64_1 >>= 28;
    }

  /* "Carry" remains as: 2^448 * v64_1 + 2^224 * v64_0 */

  /*
   * Subtract p448 times v64_1 to clear msbs, meaning, clear those
   * bits and adding v64_1 to px[0] and px[8] (in mod p448
   * calculation).
   */
  v64_0 += v64_1;
  v64_0 += px[8];
  v64_1 += px[0];
  px[8] = v64_0 & MASK_28BITS;
  px[0] = v64_1 & MASK_28BITS;

  /* Still, it carries to... */
  v64_0 >>= 28;
  v64_1 >>= 28;
  px[9] += v64_0;
  px[1] += v64_1;
  /* DONE.  */
}

/**
 * Compute X = A * 39081
 *
 * (39081 is the constant d-related multiplier used by Ed448 / X448.)
 */
void
p448_mul_39081 (p448_t *x, const p448_t *a)
{
  int i;
  const uint32_t w = 39081;
  uint32_t *px;
  const uint32_t *pa;
  uint64_t v64;
  uint32_t carry;

  px = x->limb;
  pa = a->limb;

  /* Schoolbook multiply by the small constant, 28 bits at a time.  */
  v64 = 0;
  for (i = 0; i < N_REDUNDANT_LIMBS; i++)
    {
      v64 += mul64_32x32 (w, pa[i]);
      px[i] = v64 & MASK_28BITS;
      v64 >>= 28;
    }

  /* Fold the final carry (worth 2^448) back in as 2^224 + 1,
     since 2^448 == 2^224 + 1 (mod p448).  */
  carry = v64;
  carry += px[0];
  px[0] = carry & MASK_28BITS;
  px[1] += carry >> 28;

  carry = v64;
  carry += px[8];
  px[8] = carry & MASK_28BITS;
  px[9] += carry >> 28;
}

/* Figure for squaring (a..h = low limbs; capital pairs denote the
 * doubled cross-products, diagonal terms appear once):
 *
 *    ah bh ch dh eh fh gh HH
 *       bg cg dg eg fg GG
 *          cf df ef FF
 *             de EE
 *    DD
 *    CC dc ec
 *    BB cb db eb fb
 *    AA ba ca da ea fa ga
 */
/**
 * Compute X = A^2 mod p448
 *
 * Same Karatsuba scheme as p448_mul, exploiting symmetry: each
 * off-diagonal product is accumulated once and shifted left (doubled),
 * diagonal squares are added once.
 */
void
p448_sqr (p448_t *__restrict__ x, const p448_t *a)
{
  int i, j;
  uint64_t v64_0, v64_1, v64_2, v64_3;
  uint32_t p_q[8];
  uint32_t *px;
  const uint32_t *pa;

  px = x->limb;
  pa = a->limb;

  /* Firstly, we do Karatsuba preparation: p_q = p + q.  */
  for (i = 0; i < 8; i++)
    p_q[i] = pa[i] + pa[i+8];

  v64_0 = v64_1 = 0;

  for (j = 0; j < 8; j++)
    {
      v64_2 = 0;
      /* Compute lower half of limbs (lower224).
       *     __   <-- j
       *    | /  |
       *    |/   v i
       */
      for (i = 0; i <= j/2; i++)
        {
          /* cond is 1 for off-diagonal products (counted twice via the
             shift), 0 for the diagonal square term.  */
          int cond = ((j & 1) || i != j/2);

          v64_3 = mul64_32x32 (pa[8+j-i], pa[8+i]);/* accumulating q*q */
          v64_0 += (v64_3 << cond);
          v64_3 = mul64_32x32 (p_q[j-i], p_q[i]);  /* accumulating p_q^2 */
          v64_1 += (v64_3 << cond);
          v64_3 = mul64_32x32 (pa[j-i], pa[i]);    /* accumulating p*p */
          v64_2 += (v64_3 << cond);
        }

      v64_0 += v64_2;           /* Compute pp + qq.  */
      v64_1 -= v64_2;           /* Compute p_q^2 - pp.  */

      v64_2 = 0;
      /* Compute upper half of limbs (upper224).
       *          <-- j
       *     /|  |
       *     /_| v i
       */
      if (!(j & 1))
        {
          /* Diagonal term for even j, added once (not doubled).  */
          v64_0 -= mul64_32x32 (pa[4+i-1], pa[4+i-1]); /* accumulating -p*p */
          v64_1 += mul64_32x32 (pa[12+i-1], pa[12+i-1]);/* accumulating q*q */
          v64_2 += mul64_32x32 (p_q[4+i-1], p_q[4+i-1]);/* accumulating p_q^2 */
        }
      for (; i < 4; i++)
        {
          v64_3 = mul64_32x32 (pa[4+j-i], pa[4+i]);
          v64_0 -= (v64_3 << 1);                   /* accumulating -p*p */
          v64_3 = mul64_32x32 (pa[12+j-i], pa[12+i]);
          v64_1 += (v64_3 << 1);                   /* accumulating q*q */
          v64_3 = mul64_32x32 (p_q[4+j-i], p_q[4+i]);
          v64_2 += (v64_3 << 1);                   /* accumulating p_q^2 */
        }

      v64_0 += v64_2;           /* Compute p_q^2 - p^2.  */
      v64_1 += v64_2;           /* Compute p_q^2 + q^2.  */

      px[j] = v64_0 & MASK_28BITS;
      px[j+8] = v64_1 & MASK_28BITS;
      v64_0 >>= 28;
      v64_1 >>= 28;
    }

  /* "Carry" remains as: 2^448 * v64_1 + 2^224 * v64_0 */

  /*
   * Subtract p448 times v64_1 to clear msbs, meaning, clear those
   * bits and adding v64_1 to px[0] and px[8] (in mod p448
   * calculation).
   */
  v64_0 += v64_1;
  v64_0 += px[8];
  v64_1 += px[0];
  px[8] = v64_0 & MASK_28BITS;
  px[0] = v64_1 & MASK_28BITS;

  /* Still, it carries to... */
  v64_0 >>= 28;
  v64_1 >>= 28;
  px[9] += v64_0;
  px[1] += v64_1;
  /* DONE.  */
}

/**
 * Weak reduce - Make each limb of redundant representation smaller.
 * Do our best weakly to zeroing most significant 4-bit.
*
 * Note that: p448 = 2^448 - 2^224 - 1
 *
 * Subtracting p448 means that subtracting 2^448 then adding 2^224 + 1.
 */
void
p448_weak_reduce (p448_t *a)
{
  int i;
  uint32_t tmp = a->limb[15] >> 28;

  a->limb[8] += tmp;            /* Adding TMP * 2^224 (28 * 8 = 224) */

  /* Compute top to bottom: push each limb's excess (>28 bits) up
     into the next limb.  */
  for (i = 0; i < N_REDUNDANT_LIMBS - 1; i++)
    a->limb[N_REDUNDANT_LIMBS - i - 1]
      = (a->limb[N_REDUNDANT_LIMBS - i - 1] & MASK_28BITS)
        + (a->limb[N_REDUNDANT_LIMBS - i - 2] >> 28);

  a->limb[0] = (a->limb[0] & MASK_28BITS) + tmp;
}

/* The prime p448 itself, in 16-limb 28-bit redundant form.
   Note limb[8] is 0x0ffffffe, reflecting the -2^224 term.  */
static const p448_t p448[1] =
  { {
      {
        0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff,
        0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff,
        0x0ffffffe, 0x0fffffff, 0x0fffffff, 0x0fffffff,
        0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff
      }
  } };

/*
 * X = A + B if COND is 1, else X = A (B is masked away).
 * COND must be 0 or 1; the data flow is the same either way so the
 * selection does not branch on COND.  Returns the final carry.
 */
static uint32_t
p448_add_carry_cond (p448_t *x, const p448_t *a, const p448_t *b,
                     uint32_t cond)
{
  int i;
  uint32_t v;
  uint32_t carry = 0;
  uint32_t *px;
  const uint32_t *pa, *pb;

  /* Expand COND (0 or 1) into an all-zeros / 28-bit all-ones mask.  */
  cond = cond * MASK_28BITS;

  px = x->limb;
  pa = a->limb;
  pb = b->limb;

  for (i = 0; i < N_REDUNDANT_LIMBS; i++)
    {
      v = *pb & cond;
      *px = *pa + carry;
      carry = (*px < carry);
      *px = (*px + v) & MASK_28BITS;
      carry += (*px < v);
      px++;
      pa++;
      pb++;
    }

  return carry;
}

/*
 * X = A - B with borrow propagation across limbs.
 * Returns the final borrow (1 when A < B, i.e. the result went
 * "negative" before masking).
 */
static uint32_t
p448_sub_borrow (p448_t *x, const p448_t *a, const p448_t *b)
{
  int i;
  uint32_t v;
  uint32_t borrow = 0;
  uint32_t *px;
  const uint32_t *pa, *pb;

  px = x->limb;
  pa = a->limb;
  pb = b->limb;

  for (i = 0; i < N_REDUNDANT_LIMBS; i++)
    {
      uint32_t borrow0 = (*pa < borrow);

      v = *pb;
      *px = *pa - borrow;
      borrow = (*px < v) + borrow0;
      *px = (*px - v) & MASK_28BITS;
      px++;
      pa++;
      pb++;
    }

  return borrow;
}

/**
 * Strong reduce - Make sure that each limb of redundant
 * representation has zeros of significant 4-bit, and the value is
 * the canonical representative in [0, p448).
 */
void
p448_strong_reduce (p448_t *a)
{
  uint32_t tmp;
  uint32_t is_negative;

  /*
   * Clear the 4-bit of the last (top) limb.  As stated in the comment
   * of weak_reduce, subtracting p448 means that subtracting 2^448
   * then adding 2^224 + 1.
   */
  tmp = a->limb[15] >> 28;
  a->limb[8] += tmp;
  a->limb[0] += tmp;
  a->limb[15] &= MASK_28BITS;

  /*
   * Here, it's: 0 <= v < 2*p448
   *
   * When v > p448, subtract p448 from v, then it becomes strongly reduced.
   * Otherwise, it's already strongly reduced.
   */

  /* Subtract p448 */
  is_negative = p448_sub_borrow (a, a, p448);

  /* Add p448 conditionally, when it becomes negative.  */
  p448_add_carry_cond (a, a, p448, is_negative);
}

/**
 * Convert to wire-format from internal redundant representation.
 *
 * Packs two 28-bit limbs into each 7-byte group, little-endian,
 * after strongly reducing a local copy (X itself is not modified).
 */
void
p448_serialize (uint8_t serial[56], const struct p448_t *x)
{
  int i;
  p448_t tmp[1];
  uint8_t *p = serial;

  *tmp = *x;
  p448_strong_reduce (tmp);

  for (i = 0; i < 8; i++)
    {
      uint32_t limb0 = tmp->limb[2*i];
      uint32_t limb1 = tmp->limb[2*i+1];

      *p++ = limb0;
      *p++ = (limb0 >> 8);
      *p++ = (limb0 >> 16);
      /* Middle byte: top 4 bits of limb0 and bottom 4 bits of limb1.  */
      *p++ = ((limb0 >> 24) & 0x0f) | ((limb1 & 0x0f) << 4);
      *p++ = (limb1 >> 4);
      *p++ = (limb1 >> 12);
      *p++ = (limb1 >> 20);
    }
}

/**
 * Convert from wire-format to internal redundant representation.
 *
 * Inverse of p448_serialize: each 7-byte little-endian group is
 * split into two 28-bit limbs, unpacked from the top down.
 */
void
p448_deserialize (p448_t *x, const uint8_t serial[56])
{
  int i;
  const uint8_t *p = serial + 56;

  for (i = 0; i < 8; i++)
    {
      uint32_t v;

      v = *--p;
      v <<= 8;
      v |= *--p;
      v <<= 8;
      v |= *--p;
      v <<= 8;
      v |= *--p;

      x->limb[N_REDUNDANT_LIMBS-2*i-1] = (v >> 4);

      /* Keep the low nibble; it is the top of the next (lower) limb.  */
      v = (v & 0x0f);
      v <<= 8;
      v |= *--p;
      v <<= 8;
      v |= *--p;
      v <<= 8;
      v |= *--p;

      x->limb[N_REDUNDANT_LIMBS-2*i-2] = v & MASK_28BITS;
    }
}

/* X = A^(2^N): square N times (N >= 1).  */
static void
p448_sqrn (p448_t *__restrict__ x, const p448_t *a, int n)
{
  p448_t tmp[1];

  /* Handle parity first so the remaining count is even, then square
     in pairs bouncing between TMP and X.  */
  if ((n&1))
    {
      p448_sqr (x, a);
      n--;
    }
  else
    {
      p448_sqr (tmp, a);
      p448_sqr (x, tmp);
      n -= 2;
    }

  for (; n; n -= 2)
    {
      p448_sqr (tmp, x);
      p448_sqr (x, tmp);
    }
}

/**
 * Compute X = A^(-1) mod p448 (if A=0, return X = 0)
 *
 * Internally, do A^(p448 - 2) to get A^(-1), by Fermat's little
 * theorem, using a fixed addition chain (constant sequence of
 * squarings and multiplications).
 */
void
p448_inv (p448_t *__restrict__ x, const p448_t *a)
{
  p448_t t[1], u[1];

  /*
   * Bit pattern of p448-2: 1{223} 0 1{222}01
   *
   * 222-bit can be composed by 3-bit three times to get 9-bit, 9-bit
   * two times to get 18-bit, 18-bit two times plus 1-bit to get 37-bit,
   * 37-bit three times to get 111-bit, and lastly 111-bit two times.
   * 222 = 111*2 = 37*3*2 = (18*2+1)*3*2 = (9*2*2+1)*3*2 = (3*3*2*2+1)*3*2
   */
  p448_sqr  ( x, a );           /* 10 */
  p448_mul  ( t, a, x );        /* 11 */
  p448_sqr  ( x, t );           /* 110 */
  p448_mul  ( t, a, x );        /* 111 */
  p448_sqrn ( x, t, 3 );        /* 111000 */
  p448_mul  ( u, t, x );        /* 111111 */
  p448_sqrn ( x, u, 3 );        /* 111111000 */
  p448_mul  ( u, t, x );        /* 111111111 */
  p448_sqrn ( t, u, 9 );        /* 1{9} 0{9} */
  p448_mul  ( x, u, t );        /* 1{18} */
  p448_sqr  ( t, x );           /* 1{18} 0 */
  p448_mul  ( u, a, t );        /* 1{19} */
  p448_sqrn ( t, u, 18 );       /* 1{19} 0{18} */
  p448_mul  ( u, x, t );        /* 1{37} */
  p448_sqrn ( t, u, 37 );       /* 1{37} 0{37} */
  p448_mul  ( x, u, t );        /* 1{74} */
  p448_sqrn ( t, x, 37 );       /* 1{74} 0{37} */
  p448_mul  ( x, u, t );        /* 1{111} */
  p448_sqrn ( t, x, 111 );      /* 1{111} 0{111} */
  p448_mul  ( u, x, t );        /* 1{222} */
  p448_sqr  ( t, u );           /* 1{222} 0 */
  p448_mul  ( x, a, t );        /* 1{223} */
  p448_sqrn ( u, x, 224 );      /* 1{223} 0{224} */
  p448_mul  ( x, u, t );        /* 1{223} 0 1{222}0 */
  p448_sqr  ( t, x );           /* 1{223} 0 1{222}00 */
  p448_mul  ( x, a, t );        /* 1{223} 0 1{222}01 */
}

/* 2*p448 in redundant form; added before raw subtraction so no limb
   of the difference can go negative.  */
static const p448_t p448_times_2[1] =
  { {
      {
        0x1ffffffe, 0x1ffffffe, 0x1ffffffe, 0x1ffffffe,
        0x1ffffffe, 0x1ffffffe, 0x1ffffffe, 0x1ffffffe,
        0x1ffffffc, 0x1ffffffe, 0x1ffffffe, 0x1ffffffe,
        0x1ffffffe, 0x1ffffffe, 0x1ffffffe, 0x1ffffffe
      }
  } };

/**
 * Compute X = A + B mod p448, result is weakly reduced.
 *
 */
void
p448_add (p448_t *x, const p448_t *a, const p448_t *b)
{
  p448_add_raw (x, a, b);
  p448_weak_reduce (x);
}

/**
 * Compute X = A - B mod p448, result is weakly reduced.
 *
 */
void
p448_sub (p448_t *x, const p448_t *a, const p448_t *b)
{
  p448_t tmp[1];

  /* Add 2*p448 so the raw subtraction cannot underflow.  */
  p448_sub_raw (tmp, a, b);
  p448_add_raw (x, p448_times_2, tmp);
  p448_weak_reduce (x);
}