large.c: replace binary_load/add() etc with uint64_t based large_load/add() etc for performance

gitlost 2020-06-14 14:42:40 +01:00
parent 3690c19749
commit e8a238aad1
14 changed files with 1566 additions and 804 deletions


@@ -2,7 +2,7 @@
/*
libzint - the open source barcode library
Copyright (C) 2008-2017 Robin Stuart <rstuart114@gmail.com>
Copyright (C) 2008 - 2020 Robin Stuart <rstuart114@gmail.com>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -31,181 +31,287 @@
*/
/* vim: set ts=4 sw=4 et : */
/* `large_mul_u64()` and `large_div_u64()` are adapted from articles by F. W. Jacob
* https://www.codeproject.com/Tips/618570/UInt-Multiplication-Squaring
* "This article, along with any associated source code and files, is licensed under The BSD License"
* http://www.codeproject.com/Tips/785014/UInt-Division-Modulus
* "This article, along with any associated source code and files, is licensed under The BSD License"
*
* These in turn are based on Hacker's Delight (2nd Edition, 2012) by Henry S. Warren, Jr.
* "You are free to use, copy, and distribute any of the code on this web site, whether modified by you or not."
* https://web.archive.org/web/20190716204559/http://www.hackersdelight.org/permissions.htm
*
* `clz_u64()` and other bits and pieces are adapted from r128.h by Alan Hickman (fahickman)
* https://github.com/fahickman/r128/blob/master/r128.h
* "R128 is released into the public domain. See LICENSE for details." LICENSE is The Unlicense.
*/
#include <stdio.h>
#include <string.h>
#ifdef _MSC_VER
#include <malloc.h>
#endif
#include "common.h"
#include "large.h"
INTERNAL void binary_add(short int accumulator[], short int input_buffer[]) { /* Binary addition */
int i, carry;
carry = 0;
#define MASK32 0xFFFFFFFF
for (i = 0; i < 112; i++) {
int done = 0;
if (((input_buffer[i] == 0) && (accumulator[i] == 0))
&& ((carry == 0) && (done == 0))) {
accumulator[i] = 0;
carry = 0;
done = 1;
}
if (((input_buffer[i] == 0) && (accumulator[i] == 0))
&& ((carry == 1) && (done == 0))) {
accumulator[i] = 1;
carry = 0;
done = 1;
}
if (((input_buffer[i] == 0) && (accumulator[i] == 1))
&& ((carry == 0) && (done == 0))) {
accumulator[i] = 1;
carry = 0;
done = 1;
}
if (((input_buffer[i] == 0) && (accumulator[i] == 1))
&& ((carry == 1) && (done == 0))) {
accumulator[i] = 0;
carry = 1;
done = 1;
}
if (((input_buffer[i] == 1) && (accumulator[i] == 0))
&& ((carry == 0) && (done == 0))) {
accumulator[i] = 1;
carry = 0;
done = 1;
}
if (((input_buffer[i] == 1) && (accumulator[i] == 0))
&& ((carry == 1) && (done == 0))) {
accumulator[i] = 0;
carry = 1;
done = 1;
}
if (((input_buffer[i] == 1) && (accumulator[i] == 1))
&& ((carry == 0) && (done == 0))) {
accumulator[i] = 0;
carry = 1;
done = 1;
}
if (((input_buffer[i] == 1) && (accumulator[i] == 1))
&& ((carry == 1) && (done == 0))) {
accumulator[i] = 1;
carry = 1;
done = 1;
}
/* Convert decimal string `s` of (at most) length `length` to 64-bit and place in 128-bit `t` */
INTERNAL void large_load_str_u64(large_int *t, const unsigned char *s, int length) {
uint64_t val = 0;
const unsigned char *se = s + length;
for (; s < se && *s >= '0' && *s <= '9'; s++) {
val *= 10;
val += *s - '0';
}
t->lo = val;
t->hi = 0;
}
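The loop above is plain decimal accumulation: each digit multiplies the running value by 10 and adds the digit, stopping at the first non-digit, so the input must stay within 64 bits (19 decimal digits always fit; 20 may overflow). A minimal standalone sketch of the same accumulation, not part of this commit, using a bare uint64_t rather than the large_int struct:

#include <assert.h>
#include <stdint.h>

int main(void) {
    const unsigned char s[] = "1234567890123456789"; /* 19 digits: still fits in 64 bits */
    const unsigned char *p = s;
    uint64_t val = 0;
    for (; *p >= '0' && *p <= '9'; p++) {
        val *= 10;
        val += *p - '0'; /* same accumulation as large_load_str_u64() */
    }
    assert(val == 1234567890123456789ULL);
    return 0;
}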
/* Add 128-bit `s` to 128-bit `t` */
INTERNAL void large_add(large_int *t, const large_int *s) {
t->lo += s->lo;
t->hi += s->hi + (t->lo < s->lo);
}
/* Add 64-bit `s` to 128-bit `t` */
INTERNAL void large_add_u64(large_int *t, uint64_t s) {
t->lo += s;
if (t->lo < s) {
t->hi++;
}
}
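Unsigned addition wraps modulo 2^64, so after `t->lo += s` the low word is smaller than the addend exactly when the addition wrapped; that comparison is the carry into the high word (the same test, folded into the high-word sum, appears in `large_add()`). A small self-contained sketch of the idea, not from the commit, using a plain hi/lo pair in place of large_int:

#include <assert.h>
#include <stdint.h>

/* Illustrative only: same wrap-around carry test as large_add_u64() above */
static void add128_u64(uint64_t *hi, uint64_t *lo, uint64_t s) {
    *lo += s;        /* wraps modulo 2^64 */
    if (*lo < s) {   /* result smaller than addend => it wrapped => carry */
        (*hi)++;
    }
}

int main(void) {
    uint64_t hi = 0, lo = UINT64_MAX; /* 2^64 - 1 */
    add128_u64(&hi, &lo, 1);          /* + 1 = 2^64 */
    assert(hi == 1 && lo == 0);
    return 0;
}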
INTERNAL void binary_subtract(short int accumulator[], short int input_buffer[]) {
/* 2's complement subtraction */
/* take input_buffer from accumulator and put answer in accumulator */
int i;
short int sub_buffer[112];
/* Subtract 64-bit `s` from 128-bit `t` */
INTERNAL void large_sub_u64(large_int *t, uint64_t s) {
uint64_t r = t->lo - s;
if (r > t->lo) {
t->hi--;
}
t->lo = r;
}
for (i = 0; i < 112; i++) {
if (input_buffer[i] == 0) {
sub_buffer[i] = 1;
/* Multiply 128-bit `t` by 64-bit `s`
* See Jacob `mult64to128()` and Warren Section 8-2
* Note '0' denotes low 32-bits, '1' high 32-bits
* if p00 == s0 * tlo0
* k00 == carry of p00
* p01 == s0 * tlo1
* k01 == carry of (p01 + k00)
* p10 == s1 * tlo0
* k10 == carry of p10
* p11 == s1 * tlo1 (unmasked, i.e. including unshifted carry if any)
* then t->lo == (p01 + p10 + k00) << 32 + p00
* and t->hi == p11 + k10 + k01 + thi * s
*
* (thi) tlo1 tlo0
* x s1 s0
* -------------------------
* p00
* k01 p01 + k00
* p10
* p11 + k10
*/
INTERNAL void large_mul_u64(large_int *t, uint64_t s) {
uint64_t thi = t->hi;
uint64_t tlo0 = t->lo & MASK32;
uint64_t tlo1 = t->lo >> 32;
uint64_t s0 = s & MASK32;
uint64_t s1 = s >> 32;
uint64_t tmp = s0 * tlo0; /* p00 (unmasked) */
uint64_t p00 = tmp & MASK32;
uint64_t k10;
tmp = (s1 * tlo0) + (tmp >> 32); /* (p10 + k00) (p10 unmasked) */
k10 = tmp >> 32;
tmp = (s0 * tlo1) + (tmp & MASK32); /* (p01 + p10 + k00) (p01 unmasked) */
t->lo = (tmp << 32) + p00; /* (p01 + p10 + k00) << 32 + p00 (note any carry from unmasked p01 shifted out) */
t->hi = (s1 * tlo1) + k10 + (tmp >> 32) + thi * s; /* p11 + k10 + k01 + thi * s */
}
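A case of the partial-product scheme above that can be checked by hand: (2^64 - 1) * 2 = 2^65 - 2, i.e. high word 1 and low word 0xFFFFFFFFFFFFFFFE. A standalone sketch (illustrative only, not the library function) repeating the same 32-bit partial-product steps over a plain hi/lo pair:

#include <assert.h>
#include <stdint.h>

static void mul128_u64(uint64_t *hi, uint64_t *lo, uint64_t s) {
    const uint64_t m32 = 0xFFFFFFFF;
    uint64_t tlo0 = *lo & m32, tlo1 = *lo >> 32;
    uint64_t s0 = s & m32, s1 = s >> 32;
    uint64_t tmp = s0 * tlo0;               /* p00 (carry k00 in the top half) */
    uint64_t p00 = tmp & m32;
    uint64_t k10;
    tmp = (s1 * tlo0) + (tmp >> 32);        /* p10 + k00 */
    k10 = tmp >> 32;
    tmp = (s0 * tlo1) + (tmp & m32);        /* p01 + p10 + k00 */
    *hi = (s1 * tlo1) + k10 + (tmp >> 32) + *hi * s; /* p11 + k10 + k01 + thi * s */
    *lo = (tmp << 32) + p00;
}

int main(void) {
    uint64_t hi = 0, lo = UINT64_MAX;        /* 2^64 - 1 */
    mul128_u64(&hi, &lo, 2);
    assert(hi == 1 && lo == UINT64_MAX - 1); /* 2^65 - 2 */
    return 0;
}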
/* Count leading zeroes. See Hickman `r128__clz64()` */
STATIC_UNLESS_ZINT_TEST int clz_u64(uint64_t x) {
uint64_t n = 64, y;
y = x >> 32; if (y) { n -= 32; x = y; }
y = x >> 16; if (y) { n -= 16; x = y; }
y = x >> 8; if (y) { n -= 8; x = y; }
y = x >> 4; if (y) { n -= 4; x = y; }
y = x >> 2; if (y) { n -= 2; x = y; }
y = x >> 1; if (y) { n -= 1; x = y; }
return (int) (n - x);
}
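Each step of the binary search halves the remaining width: if the top 32 (then 16, 8, 4, 2, 1) bits contain a set bit, the count drops by that amount and the search continues in that half. After the last step `x` is either 0 (input was 0) or 1, so `n - x` yields 64 for zero and the leading-zero count otherwise. A naive reference version, not from the commit, useful only to illustrate the result (not how the function computes it):

#include <assert.h>
#include <stdint.h>

static int clz_u64_ref(uint64_t x) {
    int n = 0;
    uint64_t bit = (uint64_t) 1 << 63;
    while (n < 64 && !(x & bit)) { /* scan from the most significant bit down */
        n++;
        bit >>= 1;
    }
    return n;
}

int main(void) {
    assert(clz_u64_ref(0) == 64);                 /* clz_u64(0) is also defined as 64 */
    assert(clz_u64_ref(1) == 63);
    assert(clz_u64_ref((uint64_t) 1 << 63) == 0);
    assert(clz_u64_ref(UINT64_MAX) == 0);
    return 0;
}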
/* Divide 128-bit dividend `t` by 64-bit divisor `v`
* See Jacob `divmod128by128/64()` and Warren Section 9-2 (divmnu64.c.txt)
* Note digits are 32-bit parts */
INTERNAL uint64_t large_div_u64(large_int *t, uint64_t v) {
const uint64_t b = 0x100000000; /* Number base (2**32) */
uint64_t qhi = 0; /* High digit of returned quotient */
uint64_t tnhi, tnlo, tnlo1, tnlo0, vn1, vn0; /* Normalized forms of (parts of) t and v */
uint64_t rnhilo1; /* Remainder after dividing 1st 3 digits of t by v */
uint64_t qhat1, qhat0; /* Estimated quotient digits */
uint64_t rhat; /* Remainder of estimated quotient digit */
uint64_t tmp;
int norm_shift;
/* Deal with single-digit (i.e. 32-bit) divisor here */
if (v < b) {
qhi = t->hi / v;
tmp = ((t->hi - qhi * v) << 32) + (t->lo >> 32); /* k * b + tlo1 */
qhat1 = tmp / v;
tmp = ((tmp - qhat1 * v) << 32) + (t->lo & MASK32); /* k * b + tlo0 */
qhat0 = tmp / v;
t->lo = (qhat1 << 32) | qhat0;
t->hi = qhi;
return tmp - qhat0 * v;
}
/* Main algorithm requires t->hi < v */
if (t->hi >= v) {
qhi = t->hi / v;
t->hi %= v;
}
/* Normalize by shifting v left just enough so that its high-order
* bit is on, and shift t left the same amount. Note don't need extra
* high-end digit for dividend as t->hi < v */
norm_shift = clz_u64(v);
v <<= norm_shift;
vn1 = v >> 32;
vn0 = v & MASK32;
if (norm_shift > 0) {
tnhi = (t->hi << norm_shift) | (t->lo >> (64 - norm_shift));
tnlo = t->lo << norm_shift;
} else {
tnhi = t->hi;
tnlo = t->lo;
}
tnlo1 = tnlo >> 32;
tnlo0 = tnlo & MASK32;
/* Compute qhat1 estimate */
qhat1 = tnhi / vn1; /* Divide first digit of v into first 2 digits of t */
rhat = tnhi % vn1;
/* Loop until qhat1 one digit and <= (rhat * b + 3rd digit of t) / vn0 */
for (tmp = qhat1 * vn0; qhat1 >= b || tmp > (rhat << 32) + tnlo1; tmp -= vn0) {
--qhat1;
rhat += vn1;
if (rhat >= b) { /* Must check here as (rhat << 32) would overflow */
break; /* qhat1 * vn0 < b * b (since vn0 < b) */
}
}
/* Note qhat1 will be exact as have fully divided by 2-digit divisor
* (can only be too high by 1 (and require "add back" step) if divisor at least 3 digits) */
rnhilo1 = (tnhi << 32) + tnlo1 - (qhat1 * v); /* Note high digit (if any) of both tnhi and (qhat1 * v) shifted out */
/* Compute qhat0 estimate */
qhat0 = rnhilo1 / vn1; /* Divide first digit of v into 2-digit remains of first 3 digits of t */
rhat = rnhilo1 % vn1;
/* Loop until qhat0 one digit and <= (rhat * b + 4th digit of t) / vn0 */
for (tmp = qhat0 * vn0; qhat0 >= b || tmp > (rhat << 32) + tnlo0; tmp -= vn0) {
--qhat0;
rhat += vn1;
if (rhat >= b) {
break;
}
}
/* Similarly qhat0 will be exact */
t->lo = (qhat1 << 32) | qhat0;
t->hi = qhi;
/* Unnormalize remainder */
return ((rnhilo1 << 32) + tnlo0 - (qhat0 * v)) >> norm_shift;
}
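The `v < b` fast path at the top is ordinary schoolbook long division with the 32-bit parts of `t` as digits: divide the highest digit, carry the remainder down into the next, and so on. A standalone sketch of just that path over a plain hi/lo pair, not from the commit, checked against 2^64 = 3 * 6148914691236517205 + 1 (the two-digit-divisor path that follows is the Knuth-style estimate-and-correct algorithm described in the comments):

#include <assert.h>
#include <stdint.h>

/* Quotient is left in hi/lo, remainder is returned; requires v < 2^32 */
static uint64_t div128_u32(uint64_t *hi, uint64_t *lo, uint32_t v) {
    const uint64_t m32 = 0xFFFFFFFF;
    uint64_t qhi, q1, q0, tmp;
    qhi = *hi / v;
    tmp = ((*hi - qhi * v) << 32) + (*lo >> 32); /* remainder carried into next digit */
    q1 = tmp / v;
    tmp = ((tmp - q1 * v) << 32) + (*lo & m32);
    q0 = tmp / v;
    *lo = (q1 << 32) | q0;
    *hi = qhi;
    return tmp - q0 * v;
}

int main(void) {
    uint64_t hi = 1, lo = 0;              /* 2^64 */
    uint64_t r = div128_u32(&hi, &lo, 3);
    assert(hi == 0 && lo == 6148914691236517205ULL && r == 1);
    return 0;
}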
/* Unset a bit (zero-based) */
INTERNAL void large_unset_bit(large_int *t, int bit) {
if (bit < 64) {
t->lo &= ~(((uint64_t) 1) << bit);
} else if (bit < 128) {
t->hi &= ~(((uint64_t) 1) << (bit - 64));
}
}
/* Output large_int into an unsigned int array of size `size`, each element containing `bits` bits */
INTERNAL void large_uint_array(const large_int *t, unsigned int *uint_array, int size, int bits) {
int i, j;
uint64_t mask;
if (bits <= 0) {
bits = 8;
} else if (bits > 32) {
bits = 32;
}
mask = ~(((uint64_t) -1) << bits);
for (i = 0, j = 0; i < size && j < 64; i++, j += bits) {
uint_array[size - 1 - i] = (t->lo >> j) & mask; /* Little-endian order */
}
if (i < size) {
if (j != 64) {
j -= 64;
/* (first j bits of t->hi) << (bits - j) | (last (bits - j) bits of t->lo) */
uint_array[size - i] = ((t->hi & ~((((uint64_t) -1) << j))) << (bits - j)) | (t->lo >> (64 - (bits - j)) & mask);
} else {
sub_buffer[i] = 0;
j = 0;
}
for (; i < size && j < 64; i++, j += bits) {
uint_array[size - 1 - i] = (t->hi >> j) & mask;
}
if (i < size && j != 128) {
uint_array[size - 1 - i] = t->hi >> (j - bits) & mask;
}
}
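The loop stores chunk `i` (counted from the least-significant end) at index `size - 1 - i`, so the array comes out with the most-significant chunk first; the extra branch only exists to stitch together a chunk that straddles the boundary between `t->lo` and `t->hi` when `bits` does not divide 64. A simplified standalone sketch, not from the commit, for the easy case where `bits` divides 64 (so nothing straddles) and `size * bits` is at most 128:

#include <assert.h>
#include <stdint.h>

static void pack_bits(uint64_t hi, uint64_t lo, unsigned int *out, int size, int bits) {
    const uint64_t mask = ~(((uint64_t) -1) << bits);
    int i;
    for (i = 0; i < size; i++) {
        int shift = i * bits; /* chunk i, counted from the least-significant end */
        uint64_t word = shift < 64 ? lo >> shift : hi >> (shift - 64);
        out[size - 1 - i] = (unsigned int) (word & mask); /* most-significant chunk first */
    }
}

int main(void) {
    unsigned int out[16];
    pack_bits(0, 0x0102030405060708ULL, out, 16, 8); /* 8-bit chunks of a 128-bit value */
    assert(out[0] == 0x00 && out[8] == 0x01 && out[15] == 0x08);
    return 0;
}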
binary_add(accumulator, sub_buffer);
sub_buffer[0] = 1;
for (i = 1; i < 112; i++) {
sub_buffer[i] = 0;
}
binary_add(accumulator, sub_buffer);
}
INTERNAL void binary_multiply(short int reg[], char data[]) {
/* Multiply the contents of reg[] by a number */
short int temp[112] = {0};
short int accum[112] = {0};
/* As `large_uint_array()` above, except output to unsigned char array */
INTERNAL void large_uchar_array(const large_int *t, unsigned char *uchar_array, int size, int bits) {
int i;
binary_load(temp, data, strlen(data));
for (i = 0; i < 102; i++) {
if (temp[i] == 1) {
binary_add(accum, reg);
}
shiftup(reg);
}
for (i = 0; i < 112; i++) {
reg[i] = accum[i];
#ifndef _MSC_VER
unsigned int uint_array[size ? size : 1]; /* Avoid run-time warning if size is 0 */
#else
unsigned int *uint_array = _alloca((size ? size : 1) * sizeof(unsigned int));
#endif
large_uint_array(t, uint_array, size, bits);
for (i = 0; i < size; i++) {
uchar_array[i] = uint_array[i];
}
}
INTERNAL void shiftdown(short int buffer[]) {
int i;
/* Output formatted large_int to stdout */
INTERNAL void large_print(large_int *t) {
char buf[35]; /* 2 (0x) + 32 (hex) + 1 */
buffer[102] = 0;
buffer[103] = 0;
for (i = 0; i < 102; i++) {
buffer[i] = buffer[i + 1];
}
puts(large_dump(t, buf));
}
INTERNAL void shiftup(short int buffer[]) {
int i;
/* Format large_int into buffer, which should be at least 35 chars in size */
INTERNAL char *large_dump(large_int *t, char *buf) {
unsigned int tlo1 = large_lo(t) >> 32;
unsigned int tlo0 = large_lo(t) & MASK32;
unsigned int thi1 = large_hi(t) >> 32;
unsigned int thi0 = large_hi(t) & MASK32;
for (i = 102; i > 0; i--) {
buffer[i] = buffer[i - 1];
}
buffer[0] = 0;
}
INTERNAL short int islarger(short int accum[], short int reg[]) {
/* Returns 1 if accum[] is larger than reg[], else 0 */
int i, latch, larger;
latch = 0;
i = 103;
larger = 0;
do {
if ((accum[i] == 1) && (reg[i] == 0)) {
latch = 1;
larger = 1;
}
if ((accum[i] == 0) && (reg[i] == 1)) {
latch = 1;
}
i--;
} while ((latch == 0) && (i >= 0));
return larger;
}
INTERNAL void binary_load(short int reg[], char data[], const size_t src_len) {
size_t read;
int i;
short int temp[112] = {0};
for (i = 0; i < 112; i++) {
reg[i] = 0;
}
for (read = 0; read < src_len; read++) {
for (i = 0; i < 112; i++) {
temp[i] = reg[i];
}
for (i = 0; i < 9; i++) {
binary_add(reg, temp);
}
for (i = 0; i < 112; i++) {
temp[i] = 0;
}
for (i = 0; i < 4; i++) {
if (ctoi(data[read]) & (0x01 << i)) temp[i] = 1;
}
binary_add(reg, temp);
if (thi1) {
sprintf(buf, "0x%X%08X%08X%08X", thi1, thi0, tlo1, tlo0);
} else if (thi0) {
sprintf(buf, "0x%X%08X%08X", thi0, tlo1, tlo0);
} else if (tlo1) {
sprintf(buf, "0x%X%08X", tlo1, tlo0);
} else {
sprintf(buf, "0x%X", tlo0);
}
return buf;
}
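The tiered formatting prints the highest non-zero 32-bit word without padding and every lower word zero-padded to 8 hex digits, so leading zero words are dropped and the worst case is "0x" plus 32 hex digits plus the terminator, matching the 35-byte buffer in `large_print()`. A standalone sketch of the same formatting over a plain hi/lo pair, not from the commit and illustrative only:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static char *dump128(uint64_t hi, uint64_t lo, char *buf) {
    unsigned int tlo1 = (unsigned int) (lo >> 32), tlo0 = (unsigned int) (lo & 0xFFFFFFFF);
    unsigned int thi1 = (unsigned int) (hi >> 32), thi0 = (unsigned int) (hi & 0xFFFFFFFF);
    if (thi1) {
        sprintf(buf, "0x%X%08X%08X%08X", thi1, thi0, tlo1, tlo0);
    } else if (thi0) {
        sprintf(buf, "0x%X%08X%08X", thi0, tlo1, tlo0);
    } else if (tlo1) {
        sprintf(buf, "0x%X%08X", tlo1, tlo0);
    } else {
        sprintf(buf, "0x%X", tlo0);
    }
    return buf;
}

int main(void) {
    char buf[35]; /* "0x" + up to 32 hex digits + NUL */
    assert(strcmp(dump128(0, 0, buf), "0x0") == 0);
    assert(strcmp(dump128(0, 0x123456789ULL, buf), "0x123456789") == 0);
    return 0;
}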