Skip to content

Commit 41a1437

Browse files
committed
add MP_SMALL_STACK_SIZE option
This adds an option to use a heap-buffer for the usually stack-based `MP_WARRAY`-sized temporary buffers. By default it will reserve a single buffer; this number can be modified at compile-time via the `MP_WARRAY_NUM` define, or at run-time by calling `mp_warray_init()`. The internal structure can only be created once. If one wants to modify the maximum number of elements, the entire structure first has to be freed by calling `mp_warray_free()`. In case one wants to use this option with multiple threads, one shall use the `mp_warray_init()` function and pass appropriate locking functions. Signed-off-by: Steffen Jaeckel <s@jaeckel.eu>
1 parent 7f39a72 commit 41a1437

20 files changed

+394
-9
lines changed

.github/workflows/main.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,12 @@ jobs:
7070
# RSA superclass with tests (no sanitizer, but debug info)
7171
- { BUILDOPTIONS: '--with-cc=gcc --with-m64 --cflags=-DLTM_NOTHING --cflags=-DSC_RSA_1_WITH_TESTS --limit-valgrind', SANITIZER: '', COMPILE_DEBUG: '1', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: '' }
7272

73+
# Build with small stack-size
74+
- { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE', SANITIZER: '', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'gcc-multilib' }
75+
- { BUILDOPTIONS: '--with-cc=gcc --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --cflags=-DMP_NO_LOCKING', SANITIZER: '', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'gcc-multilib' }
76+
- { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE', SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' }
77+
- { BUILDOPTIONS: '--with-cc=clang-10 --with-m32 --with-m64 --cflags=-DMP_SMALL_STACK_SIZE --cflags=-DMP_TEST_LOCKING', SANITIZER: '1', COMPILE_DEBUG: '0', COMPILE_LTO: '0', CONV_WARNINGS: '', OTHERDEPS: 'clang-10 llvm-10 gcc-multilib' }
78+
7379
# Test "autotuning", the automatic evaluation and setting of the Toom-Cook cut-offs.
7480
#- env: SANITIZER=1 BUILDOPTIONS='--with-cc=gcc-5 --cflags=-DMP_16BIT --limit-valgrind --make-option=tune'
7581
#- env: SANITIZER=1 BUILDOPTIONS='--with-cc=gcc-5 --cflags=-DMP_32BIT --limit-valgrind --make-option=tune'

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ if(COMPILE_LTO)
122122
if(COMPILER_SUPPORTS_LTO)
123123
set_property(TARGET ${PROJECT_NAME} PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
124124
else()
125-
message(SEND_ERROR "This compiler does not support LTO. Reconfigure ${PROJECT_NAME} with -DCOMPILE_LTO=OFF.")
125+
message(FATAL_ERROR "This compiler does not support LTO. Reconfigure ${PROJECT_NAME} with -DCOMPILE_LTO=OFF.")
126126
endif()
127127
endif()
128128

demo/test.c

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2451,6 +2451,21 @@ static int test_mp_pack_unpack(void)
24512451
return EXIT_FAILURE;
24522452
}
24532453

2454+
2455+
#ifdef MP_TEST_LOCKING
2456+
#ifdef MP_NO_LOCKING
2457+
#error "Can't test locking when locking is disabled"
2458+
#endif
2459+
static mp_lock lock_ctx;
2460+
static int noop_lock_unlock(void *ctx)
2461+
{
2462+
EXPECT(ctx == &lock_ctx);
2463+
return 0;
2464+
LBL_ERR:
2465+
return -1;
2466+
}
2467+
#endif
2468+
24542469
#ifndef LTM_TEST_DYNAMIC
24552470
#define ONLY_PUBLIC_API_C
24562471
#endif
@@ -2525,14 +2540,22 @@ static int unit_tests(int argc, char **argv)
25252540
unsigned long i, ok, fail, nop;
25262541
uint64_t t;
25272542
int j;
2543+
#ifdef MP_TEST_LOCKING
2544+
lock_ctx.lock = noop_lock_unlock;
2545+
lock_ctx.unlock = noop_lock_unlock;
2546+
lock_ctx.ctx = &lock_ctx;
25282547

2548+
if (mp_warray_init(MP_WARRAY_NUM, true, &lock_ctx) != MP_OKAY)
2549+
return EXIT_FAILURE;
2550+
#endif
25292551
ok = fail = nop = 0;
25302552

25312553
t = (uint64_t)time(NULL);
25322554
printf("SEED: 0x%" PRIx64 "\n\n", t);
25332555
s_mp_rand_jenkins_init(t);
25342556
mp_rand_source(s_mp_rand_jenkins);
25352557

2558+
25362559
for (i = 0; i < (sizeof(test) / sizeof(test[0])); ++i) {
25372560
if (argc > 1) {
25382561
for (j = 1; j < argc; ++j) {
@@ -2556,8 +2579,12 @@ static int unit_tests(int argc, char **argv)
25562579
}
25572580
fprintf(fail?stderr:stdout, "Tests OK/NOP/FAIL: %lu/%lu/%lu\n", ok, nop, fail);
25582581

2559-
if (fail != 0) return EXIT_FAILURE;
2560-
else return EXIT_SUCCESS;
2582+
EXPECT(mp_warray_free() != -2);
2583+
2584+
if (fail == 0)
2585+
return EXIT_SUCCESS;
2586+
LBL_ERR:
2587+
return EXIT_FAILURE;
25612588
}
25622589

25632590
int main(int argc, char **argv)

doc/bn.tex

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,20 @@ \subsubsection{Operand Size Related}
352352
\end{center}
353353
\end{small}
354354

355+
\subsection{Small-Stack option}
356+
\label{ch:SMALL_STACK_INTRO}
357+
The library can be compiled with the symbol \texttt{MP\_SMALL\_STACK\_SIZE} defined, which results in
358+
the temporary \texttt{MP\_WARRAY}-sized stack buffers being put on the heap.
359+
This comes with one drawback: the thread-safety promised for the default configuration no longer holds.
360+
Therefore if the Small-Stack option is enabled while doing multi threading, the provided locking
361+
mechanism shall be used.
362+
For some use cases it can be desired to use the Small-Stack option, but there are no threads and
363+
therefore we provide the possibility to disable locking by defining the symbol \texttt{MP\_NO\_LOCKING}.
364+
365+
In case one already knows how many threads must be supported, the symbol \texttt{MP\_WARRAY\_NUM} can
366+
be useful. It can be pre-defined at compile time to the number of heap buffers created on automatic
367+
initialisation. See \ref{ch:SMALL_STACK_API} for the dynamic API and further details.
368+
355369
\section{Purpose of LibTomMath}
356370
Unlike GNU MP (GMP) Library, LIP, OpenSSL or various other commercial kits (Miracl), LibTomMath
357371
was not written with bleeding edge performance in mind. First and foremost LibTomMath was written
@@ -428,7 +442,9 @@ \chapter{Getting Started with LibTomMath}
428442
\section{Building Programs}
429443
In order to use LibTomMath you must include ``tommath.h'' and link against the appropriate library
430444
file (typically
431-
libtommath.a). There is no library initialization required and the entire library is thread safe.
445+
libtommath.a). There is no library initialization required and the entire library is thread safe
446+
if it is used in its default configuration. Locking is recommended if the small-stack option
447+
is enabled and multiple threads are used, cf. \ref{ch:SMALL_STACK_INTRO} and \ref{ch:SMALL_STACK_API}.
432448

433449
\section{Return Codes}
434450
There are five possible return codes a function may return.
@@ -813,6 +829,52 @@ \subsection{Adding additional digits}
813829
\end{alltt}
814830
\end{small}
815831

832+
\section{Small-Stack option}
833+
\label{ch:SMALL_STACK_API}
834+
835+
In case the \texttt{MP\_SMALL\_STACK\_SIZE} symbol is defined the following functions
836+
can be useful.
837+
838+
To initialize the internal structure the following function shall be called.
839+
840+
\index{mp\_warray\_init}
841+
\begin{alltt}
842+
mp_err mp_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock);
843+
\end{alltt}
844+
845+
The flag \texttt{preallocate} controls whether the internal buffers --
846+
\texttt{n\_alloc} buffers of size \texttt{MP\_WARRAY} -- will be allocated when
847+
\texttt{mp\_warray\_init()} is called, or whether they will be allocated when required.
848+
The \texttt{mp\_lock} struct looks as follows and shall be used to protect the
849+
internal structure when using the library in a multi-threaded application.
850+
851+
\index{mp\_lock}
852+
\begin{alltt}
853+
typedef struct {
854+
int (*lock)(void *ctx);
855+
int (*unlock)(void *ctx);
856+
void *ctx;
857+
} mp_lock;
858+
\end{alltt}
859+
860+
The \texttt{mp\_lock.lock} function will be called before, and the \texttt{mp\_lock.unlock} function
861+
after, modifying the internal struct.
862+
The \texttt{mp\_lock.ctx} element will be passed to those functions.
863+
864+
To free the internally allocated memory the following function shall be called. It returns $0$ on success and $-2$ if a buffer is still in use, in which case nothing is freed.
865+
866+
\index{mp\_warray\_free}
867+
\begin{alltt}
868+
int mp_warray_free(void);
869+
\end{alltt}
870+
871+
872+
Those two API functions are always available, even if the \texttt{MP\_SMALL\_STACK\_SIZE} option
873+
has been disabled at compile time.
874+
In that case \texttt{mp\_warray\_init()} will return \texttt{MP\_ERR} and \texttt{mp\_warray\_free()}
875+
will return $-1$.
876+
877+
816878
\chapter{Basic Operations}
817879
\section{Copying}
818880

helper.pl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -394,7 +394,7 @@ sub update_dep
394394
foreach my $filename (glob '*mp_*.c') {
395395
my $content;
396396
my $cc = $ENV{'CC'} || 'gcc';
397-
$content = `$cc -E -x c -DLTM_ALL $filename`;
397+
$content = `$cc -E -x c -DLTM_ALL -DMP_SMALL_STACK_SIZE $filename`;
398398
$content =~ s/^# 1 "$filename".*?^# 2 "$filename"//ms;
399399

400400
# convert filename to upper case so we can use it as a define

mp_warray_free.c

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#include "tommath_private.h"
2+
#ifdef MP_WARRAY_FREE_C
3+
/* LibTomMath, multiple-precision integer library -- Tom St Denis */
4+
/* SPDX-License-Identifier: Unlicense */
5+
6+
/* static check that the multiplication won't overflow */
7+
MP_STATIC_ASSERT(warray_free_sz_does_not_overflow, (sizeof(mp_word) * MP_WARRAY) >= MP_WARRAY)
8+
9+
static int s_warray_free(void)
10+
{
11+
int ret = 0;
12+
size_t n;
13+
S_MP_WARRAY_LOCK();
14+
for (n = 0; n < s_mp_warray.allocated; ++n) {
15+
if (s_mp_warray.l_used[n].warray) {
16+
ret = -2;
17+
goto ERR_OUT;
18+
}
19+
}
20+
for (n = 0; n < s_mp_warray.allocated; ++n) {
21+
MP_FREE(s_mp_warray.l_free[n].warray, sizeof(mp_word) * MP_WARRAY);
22+
s_mp_warray.l_free[n].warray = NULL;
23+
}
24+
s_mp_warray_free(s_mp_warray.usable);
25+
ERR_OUT:
26+
S_MP_WARRAY_UNLOCK();
27+
return ret;
28+
}
29+
30+
int mp_warray_free(void)
31+
{
32+
if (MP_HAS(MP_SMALL_STACK_SIZE)) return s_warray_free();
33+
return -1;
34+
}
35+
36+
#endif

mp_warray_init.c

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#include "tommath_private.h"
2+
#ifdef MP_WARRAY_INIT_C
3+
/* LibTomMath, multiple-precision integer library -- Tom St Denis */
4+
/* SPDX-License-Identifier: Unlicense */
5+
6+
/*
 * Set up the global work-array pool.
 *
 * n_alloc      number of MP_WARRAY-sized buffers the pool can hold (must be > 0)
 * preallocate  true:  allocate all n_alloc buffers right away
 *              false: only create the bookkeeping lists
 * lock         optional lock/unlock callbacks; copied into the pool when
 *              locking support is compiled in (MP_USE_LOCKING) and `lock`
 *              is non-NULL, otherwise locking is switched off
 *
 * Returns MP_OKAY on success,
 *         MP_VAL  if the pool already exists, n_alloc is zero, or `lock`
 *                 is missing one of its callbacks,
 *         MP_MEM  if an allocation fails; the pool is then left
 *                 uninitialised with locking switched off.
 */
static mp_err s_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock)
{
   size_t n;

   /* the pool can only be created once; it has to be freed before re-init */
   if ((s_mp_warray.l_free != NULL) || (s_mp_warray.l_used != NULL)) {
      return MP_VAL;
   }

   /*
    * A zero-sized calloc() may return either NULL or a unique pointer, which
    * would surface as a bogus MP_MEM or as an unusable zero-capacity pool.
    */
   if (n_alloc == 0u) {
      return MP_VAL;
   }

   if (MP_HAS(MP_USE_LOCKING) && (lock != NULL)) {
      if ((lock->lock == NULL) || (lock->unlock == NULL)) {
         return MP_VAL;
      }
      s_mp_warray.lock = *lock;
      s_mp_warray.locking_enabled = true;
   } else {
      s_mp_zero_buf(&s_mp_warray.lock, sizeof(s_mp_warray.lock));
      /* make sure a flag left over from an earlier init cannot survive */
      s_mp_warray.locking_enabled = false;
   }

   s_mp_warray.l_free = MP_CALLOC(n_alloc, sizeof(*(s_mp_warray.l_free)));
   s_mp_warray.l_used = MP_CALLOC(n_alloc, sizeof(*(s_mp_warray.l_used)));
   if ((s_mp_warray.l_free == NULL) || (s_mp_warray.l_used == NULL)) {
      goto LBL_MEM;
   }

   if (preallocate) {
      for (n = 0; n < n_alloc; ++n) {
         s_mp_warray.l_free[n].warray = MP_CALLOC(MP_WARRAY, sizeof(mp_word));
         if (s_mp_warray.l_free[n].warray == NULL) {
            /* roll back the buffers allocated so far */
            while (n > 0) {
               n--;
               MP_FREE(s_mp_warray.l_free[n].warray, MP_WARRAY * sizeof(mp_word));
               s_mp_warray.l_free[n].warray = NULL;
            }
            goto LBL_MEM;
         }
      }
      s_mp_warray.allocated = n_alloc;
   }

   s_mp_warray.usable = n_alloc;
   return MP_OKAY;

LBL_MEM:
   s_mp_warray_free(n_alloc);
   /* a failed init must not leave the caller's callbacks installed */
   s_mp_zero_buf(&s_mp_warray.lock, sizeof(s_mp_warray.lock));
   s_mp_warray.locking_enabled = false;
   return MP_MEM;
}
48+
49+
/*
 * Public entry point for creating the work-array pool.
 *
 * Returns MP_ERR when the library was built without MP_SMALL_STACK_SIZE,
 * otherwise forwards all arguments to s_warray_init() and returns its result.
 */
mp_err mp_warray_init(size_t n_alloc, bool preallocate, mp_lock *lock)
{
   if (!MP_HAS(MP_SMALL_STACK_SIZE)) {
      return MP_ERR;
   }
   return s_warray_init(n_alloc, preallocate, lock);
}
54+
55+
#endif

s_mp_montgomery_reduce_comba.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,12 @@ mp_err s_mp_montgomery_reduce_comba(mp_int *x, const mp_int *n, mp_digit rho)
1515
{
1616
int ix, oldused;
1717
mp_err err;
18-
mp_word W[MP_WARRAY];
18+
mp_word MP_ALLOC_WARRAY(W);
19+
20+
MP_CHECK_WARRAY(W);
1921

2022
if (x->used > MP_WARRAY) {
23+
MP_FREE_WARRAY(W);
2124
return MP_VAL;
2225
}
2326

@@ -26,6 +29,7 @@ mp_err s_mp_montgomery_reduce_comba(mp_int *x, const mp_int *n, mp_digit rho)
2629

2730
/* grow a as required */
2831
if ((err = mp_grow(x, n->used + 1)) != MP_OKAY) {
32+
MP_FREE_WARRAY(W);
2933
return err;
3034
}
3135

@@ -110,6 +114,7 @@ mp_err s_mp_montgomery_reduce_comba(mp_int *x, const mp_int *n, mp_digit rho)
110114

111115
mp_clamp(x);
112116

117+
MP_FREE_WARRAY(W);
113118
/* if A >= m then A = A - m */
114119
if (mp_cmp_mag(x, n) != MP_LT) {
115120
return s_mp_sub(x, n, x);

s_mp_mul_comba.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,18 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
2323
{
2424
int oldused, pa, ix;
2525
mp_err err;
26-
mp_digit W[MP_WARRAY];
26+
mp_digit MP_ALLOC_WARRAY(W);
2727
mp_word _W;
2828

2929
if (digs < 0) {
3030
return MP_VAL;
3131
}
3232

33+
MP_CHECK_WARRAY(W);
34+
3335
/* grow the destination as required */
3436
if ((err = mp_grow(c, digs)) != MP_OKAY) {
37+
MP_FREE_WARRAY(W);
3538
return err;
3639
}
3740

@@ -77,6 +80,7 @@ mp_err s_mp_mul_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs)
7780
s_mp_zero_digs(c->dp + c->used, oldused - c->used);
7881

7982
mp_clamp(c);
83+
MP_FREE_WARRAY(W);
8084
return MP_OKAY;
8185
}
8286
#endif

s_mp_mul_high_comba.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,19 @@ mp_err s_mp_mul_high_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs
1616
{
1717
int oldused, pa, ix;
1818
mp_err err;
19-
mp_digit W[MP_WARRAY];
19+
mp_digit MP_ALLOC_WARRAY(W);
2020
mp_word _W;
2121

2222
if (digs < 0) {
2323
return MP_VAL;
2424
}
2525

26+
MP_CHECK_WARRAY(W);
27+
2628
/* grow the destination as required */
2729
pa = a->used + b->used;
2830
if ((err = mp_grow(c, pa)) != MP_OKAY) {
31+
MP_FREE_WARRAY(W);
2932
return err;
3033
}
3134

@@ -69,6 +72,7 @@ mp_err s_mp_mul_high_comba(const mp_int *a, const mp_int *b, mp_int *c, int digs
6972
s_mp_zero_digs(c->dp + c->used, oldused - c->used);
7073

7174
mp_clamp(c);
75+
MP_FREE_WARRAY(W);
7276
return MP_OKAY;
7377
}
7478
#endif

0 commit comments

Comments
 (0)