large.c: replace binary_load/add() etc with uint64_t based large_load/add() etc for performance

gitlost 2020-06-14 14:42:40 +01:00
parent 3690c19749
commit e8a238aad1
14 changed files with 1566 additions and 804 deletions


@@ -2,7 +2,7 @@
/*
libzint - the open source barcode library
Copyright (C) 2008-2017 Robin Stuart <rstuart114@gmail.com>
Copyright (C) 2008 - 2020 Robin Stuart <rstuart114@gmail.com>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -31,181 +31,287 @@
*/
/* vim: set ts=4 sw=4 et : */
/* `large_mul_u64()` and `large_div_u64()` are adapted from articles by F. W. Jacob
* https://www.codeproject.com/Tips/618570/UInt-Multiplication-Squaring
* "This article, along with any associated source code and files, is licensed under The BSD License"
* http://www.codeproject.com/Tips/785014/UInt-Division-Modulus
* "This article, along with any associated source code and files, is licensed under The BSD License"
*
* These in turn are based on Hacker's Delight (2nd Edition, 2012) by Henry S. Warren, Jr.
* "You are free to use, copy, and distribute any of the code on this web site, whether modified by you or not."
* https://web.archive.org/web/20190716204559/http://www.hackersdelight.org/permissions.htm
*
* `clz_u64()` and other bits and pieces are adapted from r128.h by Alan Hickman (fahickman)
* https://github.com/fahickman/r128/blob/master/r128.h
* "R128 is released into the public domain. See LICENSE for details." LICENSE is The Unlicense.
*/
#include <stdio.h>
#include <string.h>
#ifdef _MSC_VER
#include <malloc.h>
#endif
#include "common.h"
#include "large.h"
INTERNAL void binary_add(short int accumulator[], short int input_buffer[]) { /* Binary addition */
int i, carry;
carry = 0;
#define MASK32 0xFFFFFFFF
for (i = 0; i < 112; i++) {
int done = 0;
if (((input_buffer[i] == 0) && (accumulator[i] == 0))
&& ((carry == 0) && (done == 0))) {
accumulator[i] = 0;
carry = 0;
done = 1;
}
if (((input_buffer[i] == 0) && (accumulator[i] == 0))
&& ((carry == 1) && (done == 0))) {
accumulator[i] = 1;
carry = 0;
done = 1;
}
if (((input_buffer[i] == 0) && (accumulator[i] == 1))
&& ((carry == 0) && (done == 0))) {
accumulator[i] = 1;
carry = 0;
done = 1;
}
if (((input_buffer[i] == 0) && (accumulator[i] == 1))
&& ((carry == 1) && (done == 0))) {
accumulator[i] = 0;
carry = 1;
done = 1;
}
if (((input_buffer[i] == 1) && (accumulator[i] == 0))
&& ((carry == 0) && (done == 0))) {
accumulator[i] = 1;
carry = 0;
done = 1;
}
if (((input_buffer[i] == 1) && (accumulator[i] == 0))
&& ((carry == 1) && (done == 0))) {
accumulator[i] = 0;
carry = 1;
done = 1;
}
if (((input_buffer[i] == 1) && (accumulator[i] == 1))
&& ((carry == 0) && (done == 0))) {
accumulator[i] = 0;
carry = 1;
done = 1;
}
if (((input_buffer[i] == 1) && (accumulator[i] == 1))
&& ((carry == 1) && (done == 0))) {
accumulator[i] = 1;
carry = 1;
done = 1;
}
/* Convert decimal string `s` of (at most) length `length` to 64-bit and place in 128-bit `t` */
INTERNAL void large_load_str_u64(large_int *t, const unsigned char *s, int length) {
uint64_t val = 0;
const unsigned char *se = s + length;
for (; s < se && *s >= '0' && *s <= '9'; s++) {
val *= 10;
val += *s - '0';
}
t->lo = val;
t->hi = 0;
}
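The loop above is plain decimal accumulation: each digit multiplies the running value by 10 and adds the digit, stopping at the first non-digit, so the input must stay within 64 bits (19 decimal digits always fit; 20 may overflow). A minimal standalone sketch of the same accumulation, not part of this commit, using a bare uint64_t rather than the large_int struct:

#include <assert.h>
#include <stdint.h>

int main(void) {
    const unsigned char s[] = "1234567890123456789"; /* 19 digits: still fits in 64 bits */
    const unsigned char *p = s;
    uint64_t val = 0;
    for (; *p >= '0' && *p <= '9'; p++) {
        val *= 10;
        val += *p - '0'; /* same accumulation as large_load_str_u64() */
    }
    assert(val == 1234567890123456789ULL);
    return 0;
}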
/* Add 128-bit `s` to 128-bit `t` */
INTERNAL void large_add(large_int *t, const large_int *s) {
t->lo += s->lo;
t->hi += s->hi + (t->lo < s->lo);
}
/* Add 64-bit `s` to 128-bit `t` */
INTERNAL void large_add_u64(large_int *t, uint64_t s) {
t->lo += s;
if (t->lo < s) {
t->hi++;
}
}
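Unsigned addition wraps modulo 2^64, so after `t->lo += s` the low word is smaller than the addend exactly when the addition wrapped; that comparison is the carry into the high word (the same test, folded into the high-word sum, appears in `large_add()`). A small self-contained sketch of the idea, not from the commit, using a plain hi/lo pair in place of large_int:

#include <assert.h>
#include <stdint.h>

/* Illustrative only: same wrap-around carry test as large_add_u64() above */
static void add128_u64(uint64_t *hi, uint64_t *lo, uint64_t s) {
    *lo += s;        /* wraps modulo 2^64 */
    if (*lo < s) {   /* result smaller than addend => it wrapped => carry */
        (*hi)++;
    }
}

int main(void) {
    uint64_t hi = 0, lo = UINT64_MAX; /* 2^64 - 1 */
    add128_u64(&hi, &lo, 1);          /* + 1 = 2^64 */
    assert(hi == 1 && lo == 0);
    return 0;
}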
INTERNAL void binary_subtract(short int accumulator[], short int input_buffer[]) {
/* 2's complement subtraction */
/* take input_buffer from accumulator and put answer in accumulator */
int i;
short int sub_buffer[112];
/* Subtract 64-bit `s` from 128-bit `t` */
INTERNAL void large_sub_u64(large_int *t, uint64_t s) {
uint64_t r = t->lo - s;
if (r > t->lo) {
t->hi--;
}
t->lo = r;
}
for (i = 0; i < 112; i++) {
if (input_buffer[i] == 0) {
sub_buffer[i] = 1;
/* Multiply 128-bit `t` by 64-bit `s`
* See Jacob `mult64to128()` and Warren Section 8-2
* Note '0' denotes low 32-bits, '1' high 32-bits
* if p00 == s0 * tlo0
* k00 == carry of p00
* p01 == s0 * tlo1
* k01 == carry of (p01 + k00)
* p10 == s1 * tlo0
* k10 == carry of p10
* p11 == s1 * tlo1 (unmasked, i.e. including unshifted carry if any)
* then t->lo == (p01 + p10 + k00) << 32 + p00
* and t->hi == p11 + k10 + k01 + thi * s
*
* (thi) tlo1 tlo0
* x s1 s0
* -------------------------
* p00
* k01 p01 + k00
* p10
* p11 + k10
*/
INTERNAL void large_mul_u64(large_int *t, uint64_t s) {
uint64_t thi = t->hi;
uint64_t tlo0 = t->lo & MASK32;
uint64_t tlo1 = t->lo >> 32;
uint64_t s0 = s & MASK32;
uint64_t s1 = s >> 32;
uint64_t tmp = s0 * tlo0; /* p00 (unmasked) */
uint64_t p00 = tmp & MASK32;
uint64_t k10;
tmp = (s1 * tlo0) + (tmp >> 32); /* (p10 + k00) (p10 unmasked) */
k10 = tmp >> 32;
tmp = (s0 * tlo1) + (tmp & MASK32); /* (p01 + p10 + k00) (p01 unmasked) */
t->lo = (tmp << 32) + p00; /* (p01 + p10 + k00) << 32 + p00 (note any carry from unmasked p01 shifted out) */
t->hi = (s1 * tlo1) + k10 + (tmp >> 32) + thi * s; /* p11 + k10 + k01 + thi * s */
}
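A case of the partial-product scheme above that can be checked by hand: (2^64 - 1) * 2 = 2^65 - 2, i.e. high word 1 and low word 0xFFFFFFFFFFFFFFFE. A standalone sketch (illustrative only, not the library function) repeating the same 32-bit partial-product steps over a plain hi/lo pair:

#include <assert.h>
#include <stdint.h>

static void mul128_u64(uint64_t *hi, uint64_t *lo, uint64_t s) {
    const uint64_t m32 = 0xFFFFFFFF;
    uint64_t tlo0 = *lo & m32, tlo1 = *lo >> 32;
    uint64_t s0 = s & m32, s1 = s >> 32;
    uint64_t tmp = s0 * tlo0;               /* p00 (carry k00 in the top half) */
    uint64_t p00 = tmp & m32;
    uint64_t k10;
    tmp = (s1 * tlo0) + (tmp >> 32);        /* p10 + k00 */
    k10 = tmp >> 32;
    tmp = (s0 * tlo1) + (tmp & m32);        /* p01 + p10 + k00 */
    *hi = (s1 * tlo1) + k10 + (tmp >> 32) + *hi * s; /* p11 + k10 + k01 + thi * s */
    *lo = (tmp << 32) + p00;
}

int main(void) {
    uint64_t hi = 0, lo = UINT64_MAX;        /* 2^64 - 1 */
    mul128_u64(&hi, &lo, 2);
    assert(hi == 1 && lo == UINT64_MAX - 1); /* 2^65 - 2 */
    return 0;
}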
/* Count leading zeroes. See Hickman `r128__clz64()` */
STATIC_UNLESS_ZINT_TEST int clz_u64(uint64_t x) {
uint64_t n = 64, y;
y = x >> 32; if (y) { n -= 32; x = y; }
y = x >> 16; if (y) { n -= 16; x = y; }
y = x >> 8; if (y) { n -= 8; x = y; }
y = x >> 4; if (y) { n -= 4; x = y; }
y = x >> 2; if (y) { n -= 2; x = y; }
y = x >> 1; if (y) { n -= 1; x = y; }
return (int) (n - x);
}
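Each step of the binary search halves the remaining width: if the top 32 (then 16, 8, 4, 2, 1) bits contain a set bit, the count drops by that amount and the search continues in that half. After the last step `x` is either 0 (input was 0) or 1, so `n - x` yields 64 for zero and the leading-zero count otherwise. A naive reference version, not from the commit, useful only to illustrate the result (not how the function computes it):

#include <assert.h>
#include <stdint.h>

static int clz_u64_ref(uint64_t x) {
    int n = 0;
    uint64_t bit = (uint64_t) 1 << 63;
    while (n < 64 && !(x & bit)) { /* scan from the most significant bit down */
        n++;
        bit >>= 1;
    }
    return n;
}

int main(void) {
    assert(clz_u64_ref(0) == 64);                 /* clz_u64(0) is also defined as 64 */
    assert(clz_u64_ref(1) == 63);
    assert(clz_u64_ref((uint64_t) 1 << 63) == 0);
    assert(clz_u64_ref(UINT64_MAX) == 0);
    return 0;
}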
/* Divide 128-bit dividend `t` by 64-bit divisor `v`
* See Jacob `divmod128by128/64()` and Warren Section 9-2 (divmnu64.c.txt)
* Note digits are 32-bit parts */
INTERNAL uint64_t large_div_u64(large_int *t, uint64_t v) {
const uint64_t b = 0x100000000; /* Number base (2**32) */
uint64_t qhi = 0; /* High digit of returned quotient */
uint64_t tnhi, tnlo, tnlo1, tnlo0, vn1, vn0; /* Normalized forms of (parts of) t and v */
uint64_t rnhilo1; /* Remainder after dividing 1st 3 digits of t by v */
uint64_t qhat1, qhat0; /* Estimated quotient digits */
uint64_t rhat; /* Remainder of estimated quotient digit */
uint64_t tmp;
int norm_shift;
/* Deal with single-digit (i.e. 32-bit) divisor here */
if (v < b) {
qhi = t->hi / v;
tmp = ((t->hi - qhi * v) << 32) + (t->lo >> 32); /* k * b + tlo1 */
qhat1 = tmp / v;
tmp = ((tmp - qhat1 * v) << 32) + (t->lo & MASK32); /* k * b + tlo0 */
qhat0 = tmp / v;
t->lo = (qhat1 << 32) | qhat0;
t->hi = qhi;
return tmp - qhat0 * v;
}
/* Main algorithm requires t->hi < v */
if (t->hi >= v) {
qhi = t->hi / v;
t->hi %= v;
}
/* Normalize by shifting v left just enough so that its high-order
* bit is on, and shift t left the same amount. Note don't need extra
* high-end digit for dividend as t->hi < v */
norm_shift = clz_u64(v);
v <<= norm_shift;
vn1 = v >> 32;
vn0 = v & MASK32;
if (norm_shift > 0) {
tnhi = (t->hi << norm_shift) | (t->lo >> (64 - norm_shift));
tnlo = t->lo << norm_shift;
} else {
tnhi = t->hi;
tnlo = t->lo;
}
tnlo1 = tnlo >> 32;
tnlo0 = tnlo & MASK32;
/* Compute qhat1 estimate */
qhat1 = tnhi / vn1; /* Divide first digit of v into first 2 digits of t */
rhat = tnhi % vn1;
/* Loop until qhat1 one digit and <= (rhat * b + 3rd digit of t) / vn0 */
for (tmp = qhat1 * vn0; qhat1 >= b || tmp > (rhat << 32) + tnlo1; tmp -= vn0) {
--qhat1;
rhat += vn1;
if (rhat >= b) { /* Must check here as (rhat << 32) would overflow */
break; /* qhat1 * vn0 < b * b (since vn0 < b) */
}
}
/* Note qhat1 will be exact as have fully divided by 2-digit divisor
* (can only be too high by 1 (and require "add back" step) if divisor at least 3 digits) */
rnhilo1 = (tnhi << 32) + tnlo1 - (qhat1 * v); /* Note high digit (if any) of both tnhi and (qhat1 * v) shifted out */
/* Compute qhat0 estimate */
qhat0 = rnhilo1 / vn1; /* Divide first digit of v into 2-digit remains of first 3 digits of t */
rhat = rnhilo1 % vn1;
/* Loop until qhat0 one digit and <= (rhat * b + 4th digit of t) / vn0 */
for (tmp = qhat0 * vn0; qhat0 >= b || tmp > (rhat << 32) + tnlo0; tmp -= vn0) {
--qhat0;
rhat += vn1;
if (rhat >= b) {
break;
}
}
/* Similarly qhat0 will be exact */
t->lo = (qhat1 << 32) | qhat0;
t->hi = qhi;
/* Unnormalize remainder */
return ((rnhilo1 << 32) + tnlo0 - (qhat0 * v)) >> norm_shift;
}
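The `v < b` fast path at the top is ordinary schoolbook long division with the 32-bit parts of `t` as digits: divide the highest digit, carry the remainder down into the next, and so on. A standalone sketch of just that path over a plain hi/lo pair, not from the commit, checked against 2^64 = 3 * 6148914691236517205 + 1 (the two-digit-divisor path that follows is the Knuth-style estimate-and-correct algorithm described in the comments):

#include <assert.h>
#include <stdint.h>

/* Quotient is left in hi/lo, remainder is returned; requires v < 2^32 */
static uint64_t div128_u32(uint64_t *hi, uint64_t *lo, uint32_t v) {
    const uint64_t m32 = 0xFFFFFFFF;
    uint64_t qhi, q1, q0, tmp;
    qhi = *hi / v;
    tmp = ((*hi - qhi * v) << 32) + (*lo >> 32); /* remainder carried into next digit */
    q1 = tmp / v;
    tmp = ((tmp - q1 * v) << 32) + (*lo & m32);
    q0 = tmp / v;
    *lo = (q1 << 32) | q0;
    *hi = qhi;
    return tmp - q0 * v;
}

int main(void) {
    uint64_t hi = 1, lo = 0;              /* 2^64 */
    uint64_t r = div128_u32(&hi, &lo, 3);
    assert(hi == 0 && lo == 6148914691236517205ULL && r == 1);
    return 0;
}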
/* Unset a bit (zero-based) */
INTERNAL void large_unset_bit(large_int *t, int bit) {
if (bit < 64) {
t->lo &= ~(((uint64_t) 1) << bit);
} else if (bit < 128) {
t->hi &= ~(((uint64_t) 1) << (bit - 64));
}
}
/* Output large_int into an unsigned int array of size `size`, each element containing `bits` bits */
INTERNAL void large_uint_array(const large_int *t, unsigned int *uint_array, int size, int bits) {
int i, j;
uint64_t mask;
if (bits <= 0) {
bits = 8;
} else if (bits > 32) {
bits = 32;
}
mask = ~(((uint64_t) -1) << bits);
for (i = 0, j = 0; i < size && j < 64; i++, j += bits) {
uint_array[size - 1 - i] = (t->lo >> j) & mask; /* Little-endian order */
}
if (i < size) {
if (j != 64) {
j -= 64;
/* (first j bits of t->hi) << (bits - j) | (last (bits - j) bits of t->lo) */
uint_array[size - i] = ((t->hi & ~((((uint64_t) -1) << j))) << (bits - j)) | (t->lo >> (64 - (bits - j)) & mask);
} else {
sub_buffer[i] = 0;
j = 0;
}
for (; i < size && j < 64; i++, j += bits) {
uint_array[size - 1 - i] = (t->hi >> j) & mask;
}
if (i < size && j != 128) {
uint_array[size - 1 - i] = t->hi >> (j - bits) & mask;
}
}
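The loop stores chunk `i` (counted from the least-significant end) at index `size - 1 - i`, so the array comes out with the most-significant chunk first; the extra branch only exists to stitch together a chunk that straddles the boundary between `t->lo` and `t->hi` when `bits` does not divide 64. A simplified standalone sketch, not from the commit, for the easy case where `bits` divides 64 (so nothing straddles) and `size * bits` is at most 128:

#include <assert.h>
#include <stdint.h>

static void pack_bits(uint64_t hi, uint64_t lo, unsigned int *out, int size, int bits) {
    const uint64_t mask = ~(((uint64_t) -1) << bits);
    int i;
    for (i = 0; i < size; i++) {
        int shift = i * bits; /* chunk i, counted from the least-significant end */
        uint64_t word = shift < 64 ? lo >> shift : hi >> (shift - 64);
        out[size - 1 - i] = (unsigned int) (word & mask); /* most-significant chunk first */
    }
}

int main(void) {
    unsigned int out[16];
    pack_bits(0, 0x0102030405060708ULL, out, 16, 8); /* 8-bit chunks of a 128-bit value */
    assert(out[0] == 0x00 && out[8] == 0x01 && out[15] == 0x08);
    return 0;
}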
binary_add(accumulator, sub_buffer);
sub_buffer[0] = 1;
for (i = 1; i < 112; i++) {
sub_buffer[i] = 0;
}
binary_add(accumulator, sub_buffer);
}
INTERNAL void binary_multiply(short int reg[], char data[]) {
/* Multiply the contents of reg[] by a number */
short int temp[112] = {0};
short int accum[112] = {0};
/* As `large_uint_array()` above, except output to unsigned char array */
INTERNAL void large_uchar_array(const large_int *t, unsigned char *uchar_array, int size, int bits) {
int i;
binary_load(temp, data, strlen(data));
for (i = 0; i < 102; i++) {
if (temp[i] == 1) {
binary_add(accum, reg);
}
shiftup(reg);
}
for (i = 0; i < 112; i++) {
reg[i] = accum[i];
#ifndef _MSC_VER
unsigned int uint_array[size ? size : 1]; /* Avoid run-time warning if size is 0 */
#else
unsigned int *uint_array = _alloca((size ? size : 1) * sizeof(unsigned int));
#endif
large_uint_array(t, uint_array, size, bits);
for (i = 0; i < size; i++) {
uchar_array[i] = uint_array[i];
}
}
INTERNAL void shiftdown(short int buffer[]) {
int i;
/* Output formatted large_int to stdout */
INTERNAL void large_print(large_int *t) {
char buf[35]; /* 2 (0x) + 32 (hex) + 1 */
buffer[102] = 0;
buffer[103] = 0;
for (i = 0; i < 102; i++) {
buffer[i] = buffer[i + 1];
}
puts(large_dump(t, buf));
}
INTERNAL void shiftup(short int buffer[]) {
int i;
/* Format large_int into buffer, which should be at least 35 chars in size */
INTERNAL char *large_dump(large_int *t, char *buf) {
unsigned int tlo1 = large_lo(t) >> 32;
unsigned int tlo0 = large_lo(t) & MASK32;
unsigned int thi1 = large_hi(t) >> 32;
unsigned int thi0 = large_hi(t) & MASK32;
for (i = 102; i > 0; i--) {
buffer[i] = buffer[i - 1];
}
buffer[0] = 0;
}
INTERNAL short int islarger(short int accum[], short int reg[]) {
/* Returns 1 if accum[] is larger than reg[], else 0 */
int i, latch, larger;
latch = 0;
i = 103;
larger = 0;
do {
if ((accum[i] == 1) && (reg[i] == 0)) {
latch = 1;
larger = 1;
}
if ((accum[i] == 0) && (reg[i] == 1)) {
latch = 1;
}
i--;
} while ((latch == 0) && (i >= 0));
return larger;
}
INTERNAL void binary_load(short int reg[], char data[], const size_t src_len) {
size_t read;
int i;
short int temp[112] = {0};
for (i = 0; i < 112; i++) {
reg[i] = 0;
}
for (read = 0; read < src_len; read++) {
for (i = 0; i < 112; i++) {
temp[i] = reg[i];
}
for (i = 0; i < 9; i++) {
binary_add(reg, temp);
}
for (i = 0; i < 112; i++) {
temp[i] = 0;
}
for (i = 0; i < 4; i++) {
if (ctoi(data[read]) & (0x01 << i)) temp[i] = 1;
}
binary_add(reg, temp);
if (thi1) {
sprintf(buf, "0x%X%08X%08X%08X", thi1, thi0, tlo1, tlo0);
} else if (thi0) {
sprintf(buf, "0x%X%08X%08X", thi0, tlo1, tlo0);
} else if (tlo1) {
sprintf(buf, "0x%X%08X", tlo1, tlo0);
} else {
sprintf(buf, "0x%X", tlo0);
}
return buf;
}
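The tiered formatting prints the highest non-zero 32-bit word without padding and every lower word zero-padded to 8 hex digits, so leading zero words are dropped and the worst case is "0x" plus 32 hex digits plus the terminator, matching the 35-byte buffer in `large_print()`. A standalone sketch of the same formatting over a plain hi/lo pair, not from the commit and illustrative only:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static char *dump128(uint64_t hi, uint64_t lo, char *buf) {
    unsigned int tlo1 = (unsigned int) (lo >> 32), tlo0 = (unsigned int) (lo & 0xFFFFFFFF);
    unsigned int thi1 = (unsigned int) (hi >> 32), thi0 = (unsigned int) (hi & 0xFFFFFFFF);
    if (thi1) {
        sprintf(buf, "0x%X%08X%08X%08X", thi1, thi0, tlo1, tlo0);
    } else if (thi0) {
        sprintf(buf, "0x%X%08X%08X", thi0, tlo1, tlo0);
    } else if (tlo1) {
        sprintf(buf, "0x%X%08X", tlo1, tlo0);
    } else {
        sprintf(buf, "0x%X", tlo0);
    }
    return buf;
}

int main(void) {
    char buf[35]; /* "0x" + up to 32 hex digits + NUL */
    assert(strcmp(dump128(0, 0, buf), "0x0") == 0);
    assert(strcmp(dump128(0, 0x123456789ULL, buf), "0x123456789") == 0);
    return 0;
}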