TLS/SSL and crypto library
Revisión | 67956bda58a1692d67a9ec0c75390a29e5ce27cd (tree) |
---|---|
Tiempo | 2009-03-25 21:08:15 |
Autor | cvs2svn <cvs2svn> |
Committer | cvs2svn |
This commit was manufactured by cvs2svn to create branch
'BRANCH_OpenSSL_0_9_8k'.
@@ -0,0 +1,440 @@ | ||
1 | +/* apps/genpkey.c */ | |
2 | +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | |
3 | + * project 2006 | |
4 | + */ | |
5 | +/* ==================================================================== | |
6 | + * Copyright (c) 2006 The OpenSSL Project. All rights reserved. | |
7 | + * | |
8 | + * Redistribution and use in source and binary forms, with or without | |
9 | + * modification, are permitted provided that the following conditions | |
10 | + * are met: | |
11 | + * | |
12 | + * 1. Redistributions of source code must retain the above copyright | |
13 | + * notice, this list of conditions and the following disclaimer. | |
14 | + * | |
15 | + * 2. Redistributions in binary form must reproduce the above copyright | |
16 | + * notice, this list of conditions and the following disclaimer in | |
17 | + * the documentation and/or other materials provided with the | |
18 | + * distribution. | |
19 | + * | |
20 | + * 3. All advertising materials mentioning features or use of this | |
21 | + * software must display the following acknowledgment: | |
22 | + * "This product includes software developed by the OpenSSL Project | |
23 | + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | |
24 | + * | |
25 | + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | |
26 | + * endorse or promote products derived from this software without | |
27 | + * prior written permission. For written permission, please contact | |
28 | + * licensing@OpenSSL.org. | |
29 | + * | |
30 | + * 5. Products derived from this software may not be called "OpenSSL" | |
31 | + * nor may "OpenSSL" appear in their names without prior written | |
32 | + * permission of the OpenSSL Project. | |
33 | + * | |
34 | + * 6. Redistributions of any form whatsoever must retain the following | |
35 | + * acknowledgment: | |
36 | + * "This product includes software developed by the OpenSSL Project | |
37 | + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | |
38 | + * | |
39 | + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | |
40 | + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
41 | + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
42 | + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | |
43 | + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
44 | + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | |
45 | + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
46 | + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
47 | + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | |
48 | + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
49 | + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | |
50 | + * OF THE POSSIBILITY OF SUCH DAMAGE. | |
51 | + * ==================================================================== | |
52 | + * | |
53 | + * This product includes cryptographic software written by Eric Young | |
54 | + * (eay@cryptsoft.com). This product includes software written by Tim | |
55 | + * Hudson (tjh@cryptsoft.com). | |
56 | + * | |
57 | + */ | |
58 | +#include <stdio.h> | |
59 | +#include <string.h> | |
60 | +#include "apps.h" | |
61 | +#include <openssl/pem.h> | |
62 | +#include <openssl/err.h> | |
63 | +#include <openssl/evp.h> | |
64 | +#ifndef OPENSSL_NO_ENGINE | |
65 | +#include <openssl/engine.h> | |
66 | +#endif | |
67 | + | |
68 | +static int init_keygen_file(BIO *err, EVP_PKEY_CTX **pctx, | |
69 | + const char *file, ENGINE *e); | |
70 | +static int genpkey_cb(EVP_PKEY_CTX *ctx); | |
71 | + | |
72 | +#define PROG genpkey_main | |
73 | + | |
74 | +int MAIN(int, char **); | |
75 | + | |
76 | +int MAIN(int argc, char **argv) | |
77 | + { | |
78 | + ENGINE *e = NULL; | |
79 | + char **args, *outfile = NULL; | |
80 | + char *passarg = NULL; | |
81 | + BIO *in = NULL, *out = NULL; | |
82 | + const EVP_CIPHER *cipher = NULL; | |
83 | + int outformat; | |
84 | + int text = 0; | |
85 | + EVP_PKEY *pkey=NULL; | |
86 | + EVP_PKEY_CTX *ctx = NULL; | |
87 | + char *pass = NULL; | |
88 | + int badarg = 0; | |
89 | + int ret = 1, rv; | |
90 | + | |
91 | + int do_param = 0; | |
92 | + | |
93 | + if (bio_err == NULL) | |
94 | + bio_err = BIO_new_fp (stderr, BIO_NOCLOSE); | |
95 | + | |
96 | + if (!load_config(bio_err, NULL)) | |
97 | + goto end; | |
98 | + | |
99 | + outformat=FORMAT_PEM; | |
100 | + | |
101 | + ERR_load_crypto_strings(); | |
102 | + OpenSSL_add_all_algorithms(); | |
103 | + args = argv + 1; | |
104 | + while (!badarg && *args && *args[0] == '-') | |
105 | + { | |
106 | + if (!strcmp(*args,"-outform")) | |
107 | + { | |
108 | + if (args[1]) | |
109 | + { | |
110 | + args++; | |
111 | + outformat=str2fmt(*args); | |
112 | + } | |
113 | + else badarg = 1; | |
114 | + } | |
115 | + else if (!strcmp(*args,"-pass")) | |
116 | + { | |
117 | + if (!args[1]) goto bad; | |
118 | + passarg= *(++args); | |
119 | + } | |
120 | +#ifndef OPENSSL_NO_ENGINE | |
121 | + else if (strcmp(*args,"-engine") == 0) | |
122 | + { | |
123 | + if (!args[1]) | |
124 | + goto bad; | |
125 | + e = setup_engine(bio_err, *(++args), 0); | |
126 | + } | |
127 | +#endif | |
128 | + else if (!strcmp (*args, "-paramfile")) | |
129 | + { | |
130 | + if (!args[1]) | |
131 | + goto bad; | |
132 | + args++; | |
133 | + if (do_param == 1) | |
134 | + goto bad; | |
135 | + if (!init_keygen_file(bio_err, &ctx, *args, e)) | |
136 | + goto end; | |
137 | + } | |
138 | + else if (!strcmp (*args, "-out")) | |
139 | + { | |
140 | + if (args[1]) | |
141 | + { | |
142 | + args++; | |
143 | + outfile = *args; | |
144 | + } | |
145 | + else badarg = 1; | |
146 | + } | |
147 | + else if (strcmp(*args,"-algorithm") == 0) | |
148 | + { | |
149 | + if (!args[1]) | |
150 | + goto bad; | |
151 | + if (!init_gen_str(bio_err, &ctx, *(++args),e, do_param)) | |
152 | + goto end; | |
153 | + } | |
154 | + else if (strcmp(*args,"-pkeyopt") == 0) | |
155 | + { | |
156 | + if (!args[1]) | |
157 | + goto bad; | |
158 | + if (!ctx) | |
159 | + { | |
160 | + BIO_puts(bio_err, "No keytype specified\n"); | |
161 | + goto bad; | |
162 | + } | |
163 | + else if (pkey_ctrl_string(ctx, *(++args)) <= 0) | |
164 | + { | |
165 | + BIO_puts(bio_err, "parameter setting error\n"); | |
166 | + ERR_print_errors(bio_err); | |
167 | + goto end; | |
168 | + } | |
169 | + } | |
170 | + else if (strcmp(*args,"-genparam") == 0) | |
171 | + { | |
172 | + if (ctx) | |
173 | + goto bad; | |
174 | + do_param = 1; | |
175 | + } | |
176 | + else if (strcmp(*args,"-text") == 0) | |
177 | + text=1; | |
178 | + else | |
179 | + { | |
180 | + cipher = EVP_get_cipherbyname(*args + 1); | |
181 | + if (!cipher) | |
182 | + { | |
183 | + BIO_printf(bio_err, "Unknown cipher %s\n", | |
184 | + *args + 1); | |
185 | + badarg = 1; | |
186 | + } | |
187 | + if (do_param == 1) | |
188 | + badarg = 1; | |
189 | + } | |
190 | + args++; | |
191 | + } | |
192 | + | |
193 | + if (!ctx) | |
194 | + badarg = 1; | |
195 | + | |
196 | + if (badarg) | |
197 | + { | |
198 | + bad: | |
199 | + BIO_printf(bio_err, "Usage: genpkey [options]\n"); | |
200 | + BIO_printf(bio_err, "where options may be\n"); | |
201 | + BIO_printf(bio_err, "-out file output file\n"); | |
202 | + BIO_printf(bio_err, "-outform X output format (DER or PEM)\n"); | |
203 | + BIO_printf(bio_err, "-pass arg output file pass phrase source\n"); | |
204 | + BIO_printf(bio_err, "-<cipher> use cipher <cipher> to encrypt the key\n"); | |
205 | +#ifndef OPENSSL_NO_ENGINE | |
206 | + BIO_printf(bio_err, "-engine e use engine e, possibly a hardware device.\n"); | |
207 | +#endif | |
208 | + BIO_printf(bio_err, "-paramfile file parameters file\n"); | |
209 | + BIO_printf(bio_err, "-algorithm alg the public key algorithm\n"); | |
210 | + BIO_printf(bio_err, "-pkeyopt opt:value set the public key algorithm option <opt>\n" | |
211 | + " to value <value>\n"); | |
212 | + BIO_printf(bio_err, "-genparam generate parameters, not key\n"); | |
213 | + BIO_printf(bio_err, "-text print the in text\n"); | |
214 | + BIO_printf(bio_err, "NB: options order may be important! See the manual page.\n"); | |
215 | + goto end; | |
216 | + } | |
217 | + | |
218 | + if (!app_passwd(bio_err, passarg, NULL, &pass, NULL)) | |
219 | + { | |
220 | + BIO_puts(bio_err, "Error getting password\n"); | |
221 | + goto end; | |
222 | + } | |
223 | + | |
224 | + if (outfile) | |
225 | + { | |
226 | + if (!(out = BIO_new_file (outfile, "wb"))) | |
227 | + { | |
228 | + BIO_printf(bio_err, | |
229 | + "Can't open output file %s\n", outfile); | |
230 | + goto end; | |
231 | + } | |
232 | + } | |
233 | + else | |
234 | + { | |
235 | + out = BIO_new_fp (stdout, BIO_NOCLOSE); | |
236 | +#ifdef OPENSSL_SYS_VMS | |
237 | + { | |
238 | + BIO *tmpbio = BIO_new(BIO_f_linebuffer()); | |
239 | + out = BIO_push(tmpbio, out); | |
240 | + } | |
241 | +#endif | |
242 | + } | |
243 | + | |
244 | + EVP_PKEY_CTX_set_cb(ctx, genpkey_cb); | |
245 | + EVP_PKEY_CTX_set_app_data(ctx, bio_err); | |
246 | + | |
247 | + if (do_param) | |
248 | + { | |
249 | + if (EVP_PKEY_paramgen(ctx, &pkey) <= 0) | |
250 | + { | |
251 | + BIO_puts(bio_err, "Error generating parameters\n"); | |
252 | + ERR_print_errors(bio_err); | |
253 | + goto end; | |
254 | + } | |
255 | + } | |
256 | + else | |
257 | + { | |
258 | + if (EVP_PKEY_keygen(ctx, &pkey) <= 0) | |
259 | + { | |
260 | + BIO_puts(bio_err, "Error generating key\n"); | |
261 | + ERR_print_errors(bio_err); | |
262 | + goto end; | |
263 | + } | |
264 | + } | |
265 | + | |
266 | + if (do_param) | |
267 | + rv = PEM_write_bio_Parameters(out, pkey); | |
268 | + else if (outformat == FORMAT_PEM) | |
269 | + rv = PEM_write_bio_PrivateKey(out, pkey, cipher, NULL, 0, | |
270 | + NULL, pass); | |
271 | + else if (outformat == FORMAT_ASN1) | |
272 | + rv = i2d_PrivateKey_bio(out, pkey); | |
273 | + else | |
274 | + { | |
275 | + BIO_printf(bio_err, "Bad format specified for key\n"); | |
276 | + goto end; | |
277 | + } | |
278 | + | |
279 | + if (rv <= 0) | |
280 | + { | |
281 | + BIO_puts(bio_err, "Error writing key\n"); | |
282 | + ERR_print_errors(bio_err); | |
283 | + } | |
284 | + | |
285 | + if (text) | |
286 | + { | |
287 | + if (do_param) | |
288 | + rv = EVP_PKEY_print_params(out, pkey, 0, NULL); | |
289 | + else | |
290 | + rv = EVP_PKEY_print_private(out, pkey, 0, NULL); | |
291 | + | |
292 | + if (rv <= 0) | |
293 | + { | |
294 | + BIO_puts(bio_err, "Error printing key\n"); | |
295 | + ERR_print_errors(bio_err); | |
296 | + } | |
297 | + } | |
298 | + | |
299 | + ret = 0; | |
300 | + | |
301 | + end: | |
302 | + if (pkey) | |
303 | + EVP_PKEY_free(pkey); | |
304 | + if (ctx) | |
305 | + EVP_PKEY_CTX_free(ctx); | |
306 | + if (out) | |
307 | + BIO_free_all(out); | |
308 | + BIO_free(in); | |
309 | + if (pass) | |
310 | + OPENSSL_free(pass); | |
311 | + | |
312 | + return ret; | |
313 | + } | |
314 | + | |
315 | +static int init_keygen_file(BIO *err, EVP_PKEY_CTX **pctx, | |
316 | + const char *file, ENGINE *e) | |
317 | + { | |
318 | + BIO *pbio; | |
319 | + EVP_PKEY *pkey = NULL; | |
320 | + EVP_PKEY_CTX *ctx = NULL; | |
321 | + if (*pctx) | |
322 | + { | |
323 | + BIO_puts(err, "Parameters already set!\n"); | |
324 | + return 0; | |
325 | + } | |
326 | + | |
327 | + pbio = BIO_new_file(file, "r"); | |
328 | + if (!pbio) | |
329 | + { | |
330 | + BIO_printf(err, "Can't open parameter file %s\n", file); | |
331 | + return 0; | |
332 | + } | |
333 | + | |
334 | + pkey = PEM_read_bio_Parameters(pbio, NULL); | |
335 | + BIO_free(pbio); | |
336 | + | |
337 | + if (!pkey) | |
338 | + { | |
339 | + BIO_printf(bio_err, "Error reading parameter file %s\n", file); | |
340 | + return 0; | |
341 | + } | |
342 | + | |
343 | + ctx = EVP_PKEY_CTX_new(pkey, e); | |
344 | + if (!ctx) | |
345 | + goto err; | |
346 | + if (EVP_PKEY_keygen_init(ctx) <= 0) | |
347 | + goto err; | |
348 | + EVP_PKEY_free(pkey); | |
349 | + *pctx = ctx; | |
350 | + return 1; | |
351 | + | |
352 | + err: | |
353 | + BIO_puts(err, "Error initializing context\n"); | |
354 | + ERR_print_errors(err); | |
355 | + if (ctx) | |
356 | + EVP_PKEY_CTX_free(ctx); | |
357 | + if (pkey) | |
358 | + EVP_PKEY_free(pkey); | |
359 | + return 0; | |
360 | + | |
361 | + } | |
362 | + | |
363 | +int init_gen_str(BIO *err, EVP_PKEY_CTX **pctx, | |
364 | + const char *algname, ENGINE *e, int do_param) | |
365 | + { | |
366 | + EVP_PKEY_CTX *ctx = NULL; | |
367 | + const EVP_PKEY_ASN1_METHOD *ameth; | |
368 | + ENGINE *tmpeng = NULL; | |
369 | + int pkey_id; | |
370 | + | |
371 | + if (*pctx) | |
372 | + { | |
373 | + BIO_puts(err, "Algorithm already set!\n"); | |
374 | + return 0; | |
375 | + } | |
376 | + | |
377 | + ameth = EVP_PKEY_asn1_find_str(&tmpeng, algname, -1); | |
378 | + | |
379 | +#ifndef OPENSSL_NO_ENGINE | |
380 | + if (!ameth && e) | |
381 | + ameth = ENGINE_get_pkey_asn1_meth_str(e, algname, -1); | |
382 | +#endif | |
383 | + | |
384 | + if (!ameth) | |
385 | + { | |
386 | + BIO_printf(bio_err, "Algorithm %s not found\n", algname); | |
387 | + return 0; | |
388 | + } | |
389 | + | |
390 | + ERR_clear_error(); | |
391 | + | |
392 | + EVP_PKEY_asn1_get0_info(&pkey_id, NULL, NULL, NULL, NULL, ameth); | |
393 | +#ifndef OPENSSL_NO_ENGINE | |
394 | + if (tmpeng) | |
395 | + ENGINE_finish(tmpeng); | |
396 | +#endif | |
397 | + ctx = EVP_PKEY_CTX_new_id(pkey_id, e); | |
398 | + | |
399 | + if (!ctx) | |
400 | + goto err; | |
401 | + if (do_param) | |
402 | + { | |
403 | + if (EVP_PKEY_paramgen_init(ctx) <= 0) | |
404 | + goto err; | |
405 | + } | |
406 | + else | |
407 | + { | |
408 | + if (EVP_PKEY_keygen_init(ctx) <= 0) | |
409 | + goto err; | |
410 | + } | |
411 | + | |
412 | + *pctx = ctx; | |
413 | + return 1; | |
414 | + | |
415 | + err: | |
416 | + BIO_printf(err, "Error initializing %s context\n", algname); | |
417 | + ERR_print_errors(err); | |
418 | + if (ctx) | |
419 | + EVP_PKEY_CTX_free(ctx); | |
420 | + return 0; | |
421 | + | |
422 | + } | |
423 | + | |
424 | +static int genpkey_cb(EVP_PKEY_CTX *ctx) | |
425 | + { | |
426 | + char c='*'; | |
427 | + BIO *b = EVP_PKEY_CTX_get_app_data(ctx); | |
428 | + int p; | |
429 | + p = EVP_PKEY_CTX_get_keygen_info(ctx, 0); | |
430 | + if (p == 0) c='.'; | |
431 | + if (p == 1) c='+'; | |
432 | + if (p == 2) c='*'; | |
433 | + if (p == 3) c='\n'; | |
434 | + BIO_write(b,&c,1); | |
435 | + (void)BIO_flush(b); | |
436 | +#ifdef LINT | |
437 | + p=n; | |
438 | +#endif | |
439 | + return 1; | |
440 | + } |
@@ -0,0 +1,284 @@ | ||
1 | +/* apps/pkey.c */ | |
2 | +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | |
3 | + * project 2006 | |
4 | + */ | |
5 | +/* ==================================================================== | |
6 | + * Copyright (c) 2006 The OpenSSL Project. All rights reserved. | |
7 | + * | |
8 | + * Redistribution and use in source and binary forms, with or without | |
9 | + * modification, are permitted provided that the following conditions | |
10 | + * are met: | |
11 | + * | |
12 | + * 1. Redistributions of source code must retain the above copyright | |
13 | + * notice, this list of conditions and the following disclaimer. | |
14 | + * | |
15 | + * 2. Redistributions in binary form must reproduce the above copyright | |
16 | + * notice, this list of conditions and the following disclaimer in | |
17 | + * the documentation and/or other materials provided with the | |
18 | + * distribution. | |
19 | + * | |
20 | + * 3. All advertising materials mentioning features or use of this | |
21 | + * software must display the following acknowledgment: | |
22 | + * "This product includes software developed by the OpenSSL Project | |
23 | + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | |
24 | + * | |
25 | + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | |
26 | + * endorse or promote products derived from this software without | |
27 | + * prior written permission. For written permission, please contact | |
28 | + * licensing@OpenSSL.org. | |
29 | + * | |
30 | + * 5. Products derived from this software may not be called "OpenSSL" | |
31 | + * nor may "OpenSSL" appear in their names without prior written | |
32 | + * permission of the OpenSSL Project. | |
33 | + * | |
34 | + * 6. Redistributions of any form whatsoever must retain the following | |
35 | + * acknowledgment: | |
36 | + * "This product includes software developed by the OpenSSL Project | |
37 | + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | |
38 | + * | |
39 | + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | |
40 | + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
41 | + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
42 | + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | |
43 | + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
44 | + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | |
45 | + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
46 | + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
47 | + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | |
48 | + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
49 | + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | |
50 | + * OF THE POSSIBILITY OF SUCH DAMAGE. | |
51 | + * ==================================================================== | |
52 | + * | |
53 | + * This product includes cryptographic software written by Eric Young | |
54 | + * (eay@cryptsoft.com). This product includes software written by Tim | |
55 | + * Hudson (tjh@cryptsoft.com). | |
56 | + * | |
57 | + */ | |
58 | +#include <stdio.h> | |
59 | +#include <string.h> | |
60 | +#include "apps.h" | |
61 | +#include <openssl/pem.h> | |
62 | +#include <openssl/err.h> | |
63 | +#include <openssl/evp.h> | |
64 | + | |
65 | +#define PROG pkey_main | |
66 | + | |
67 | +int MAIN(int, char **); | |
68 | + | |
69 | +int MAIN(int argc, char **argv) | |
70 | + { | |
71 | + ENGINE *e = NULL; | |
72 | + char **args, *infile = NULL, *outfile = NULL; | |
73 | + char *passargin = NULL, *passargout = NULL; | |
74 | + BIO *in = NULL, *out = NULL; | |
75 | + const EVP_CIPHER *cipher = NULL; | |
76 | + int informat, outformat; | |
77 | + int pubin = 0, pubout = 0, pubtext = 0, text = 0, noout = 0; | |
78 | + EVP_PKEY *pkey=NULL; | |
79 | + char *passin = NULL, *passout = NULL; | |
80 | + int badarg = 0; | |
81 | +#ifndef OPENSSL_NO_ENGINE | |
82 | + char *engine=NULL; | |
83 | +#endif | |
84 | + int ret = 1; | |
85 | + | |
86 | + if (bio_err == NULL) | |
87 | + bio_err = BIO_new_fp (stderr, BIO_NOCLOSE); | |
88 | + | |
89 | + if (!load_config(bio_err, NULL)) | |
90 | + goto end; | |
91 | + | |
92 | + informat=FORMAT_PEM; | |
93 | + outformat=FORMAT_PEM; | |
94 | + | |
95 | + ERR_load_crypto_strings(); | |
96 | + OpenSSL_add_all_algorithms(); | |
97 | + args = argv + 1; | |
98 | + while (!badarg && *args && *args[0] == '-') | |
99 | + { | |
100 | + if (!strcmp(*args,"-inform")) | |
101 | + { | |
102 | + if (args[1]) | |
103 | + { | |
104 | + args++; | |
105 | + informat=str2fmt(*args); | |
106 | + } | |
107 | + else badarg = 1; | |
108 | + } | |
109 | + else if (!strcmp(*args,"-outform")) | |
110 | + { | |
111 | + if (args[1]) | |
112 | + { | |
113 | + args++; | |
114 | + outformat=str2fmt(*args); | |
115 | + } | |
116 | + else badarg = 1; | |
117 | + } | |
118 | + else if (!strcmp(*args,"-passin")) | |
119 | + { | |
120 | + if (!args[1]) goto bad; | |
121 | + passargin= *(++args); | |
122 | + } | |
123 | + else if (!strcmp(*args,"-passout")) | |
124 | + { | |
125 | + if (!args[1]) goto bad; | |
126 | + passargout= *(++args); | |
127 | + } | |
128 | +#ifndef OPENSSL_NO_ENGINE | |
129 | + else if (strcmp(*args,"-engine") == 0) | |
130 | + { | |
131 | + if (!args[1]) goto bad; | |
132 | + engine= *(++args); | |
133 | + } | |
134 | +#endif | |
135 | + else if (!strcmp (*args, "-in")) | |
136 | + { | |
137 | + if (args[1]) | |
138 | + { | |
139 | + args++; | |
140 | + infile = *args; | |
141 | + } | |
142 | + else badarg = 1; | |
143 | + } | |
144 | + else if (!strcmp (*args, "-out")) | |
145 | + { | |
146 | + if (args[1]) | |
147 | + { | |
148 | + args++; | |
149 | + outfile = *args; | |
150 | + } | |
151 | + else badarg = 1; | |
152 | + } | |
153 | + else if (strcmp(*args,"-pubin") == 0) | |
154 | + { | |
155 | + pubin=1; | |
156 | + pubout=1; | |
157 | + pubtext=1; | |
158 | + } | |
159 | + else if (strcmp(*args,"-pubout") == 0) | |
160 | + pubout=1; | |
161 | + else if (strcmp(*args,"-text_pub") == 0) | |
162 | + { | |
163 | + pubtext=1; | |
164 | + text=1; | |
165 | + } | |
166 | + else if (strcmp(*args,"-text") == 0) | |
167 | + text=1; | |
168 | + else if (strcmp(*args,"-noout") == 0) | |
169 | + noout=1; | |
170 | + else | |
171 | + { | |
172 | + cipher = EVP_get_cipherbyname(*args + 1); | |
173 | + if (!cipher) | |
174 | + { | |
175 | + BIO_printf(bio_err, "Unknown cipher %s\n", | |
176 | + *args + 1); | |
177 | + badarg = 1; | |
178 | + } | |
179 | + } | |
180 | + args++; | |
181 | + } | |
182 | + | |
183 | + if (badarg) | |
184 | + { | |
185 | + bad: | |
186 | + BIO_printf(bio_err, "Usage pkey [options]\n"); | |
187 | + BIO_printf(bio_err, "where options are\n"); | |
188 | + BIO_printf(bio_err, "-in file input file\n"); | |
189 | + BIO_printf(bio_err, "-inform X input format (DER or PEM)\n"); | |
190 | + BIO_printf(bio_err, "-passin arg input file pass phrase source\n"); | |
191 | + BIO_printf(bio_err, "-outform X output format (DER or PEM)\n"); | |
192 | + BIO_printf(bio_err, "-out file output file\n"); | |
193 | + BIO_printf(bio_err, "-passout arg output file pass phrase source\n"); | |
194 | +#ifndef OPENSSL_NO_ENGINE | |
195 | + BIO_printf(bio_err, "-engine e use engine e, possibly a hardware device.\n"); | |
196 | +#endif | |
197 | + return 1; | |
198 | + } | |
199 | + | |
200 | +#ifndef OPENSSL_NO_ENGINE | |
201 | + e = setup_engine(bio_err, engine, 0); | |
202 | +#endif | |
203 | + | |
204 | + if (!app_passwd(bio_err, passargin, passargout, &passin, &passout)) | |
205 | + { | |
206 | + BIO_printf(bio_err, "Error getting passwords\n"); | |
207 | + goto end; | |
208 | + } | |
209 | + | |
210 | + if (outfile) | |
211 | + { | |
212 | + if (!(out = BIO_new_file (outfile, "wb"))) | |
213 | + { | |
214 | + BIO_printf(bio_err, | |
215 | + "Can't open output file %s\n", outfile); | |
216 | + goto end; | |
217 | + } | |
218 | + } | |
219 | + else | |
220 | + { | |
221 | + out = BIO_new_fp (stdout, BIO_NOCLOSE); | |
222 | +#ifdef OPENSSL_SYS_VMS | |
223 | + { | |
224 | + BIO *tmpbio = BIO_new(BIO_f_linebuffer()); | |
225 | + out = BIO_push(tmpbio, out); | |
226 | + } | |
227 | +#endif | |
228 | + } | |
229 | + | |
230 | + if (pubin) | |
231 | + pkey = load_pubkey(bio_err, infile, informat, 1, | |
232 | + passin, e, "Public Key"); | |
233 | + else | |
234 | + pkey = load_key(bio_err, infile, informat, 1, | |
235 | + passin, e, "key"); | |
236 | + if (!pkey) | |
237 | + goto end; | |
238 | + | |
239 | + if (!noout) | |
240 | + { | |
241 | + if (outformat == FORMAT_PEM) | |
242 | + { | |
243 | + if (pubout) | |
244 | + PEM_write_bio_PUBKEY(out,pkey); | |
245 | + else | |
246 | + PEM_write_bio_PrivateKey(out, pkey, cipher, | |
247 | + NULL, 0, NULL, passout); | |
248 | + } | |
249 | + else if (outformat == FORMAT_ASN1) | |
250 | + { | |
251 | + if (pubout) | |
252 | + i2d_PUBKEY_bio(out, pkey); | |
253 | + else | |
254 | + i2d_PrivateKey_bio(out, pkey); | |
255 | + } | |
256 | + else | |
257 | + { | |
258 | + BIO_printf(bio_err, "Bad format specified for key\n"); | |
259 | + goto end; | |
260 | + } | |
261 | + | |
262 | + } | |
263 | + | |
264 | + if (text) | |
265 | + { | |
266 | + if (pubtext) | |
267 | + EVP_PKEY_print_public(out, pkey, 0, NULL); | |
268 | + else | |
269 | + EVP_PKEY_print_private(out, pkey, 0, NULL); | |
270 | + } | |
271 | + | |
272 | + ret = 0; | |
273 | + | |
274 | + end: | |
275 | + EVP_PKEY_free(pkey); | |
276 | + BIO_free_all(out); | |
277 | + BIO_free(in); | |
278 | + if (passin) | |
279 | + OPENSSL_free(passin); | |
280 | + if (passout) | |
281 | + OPENSSL_free(passout); | |
282 | + | |
283 | + return ret; | |
284 | + } |
@@ -0,0 +1,201 @@ | ||
1 | +/* apps/pkeyparam.c */ | |
2 | +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | |
3 | + * project 2006 | |
4 | + */ | |
5 | +/* ==================================================================== | |
6 | + * Copyright (c) 2006 The OpenSSL Project. All rights reserved. | |
7 | + * | |
8 | + * Redistribution and use in source and binary forms, with or without | |
9 | + * modification, are permitted provided that the following conditions | |
10 | + * are met: | |
11 | + * | |
12 | + * 1. Redistributions of source code must retain the above copyright | |
13 | + * notice, this list of conditions and the following disclaimer. | |
14 | + * | |
15 | + * 2. Redistributions in binary form must reproduce the above copyright | |
16 | + * notice, this list of conditions and the following disclaimer in | |
17 | + * the documentation and/or other materials provided with the | |
18 | + * distribution. | |
19 | + * | |
20 | + * 3. All advertising materials mentioning features or use of this | |
21 | + * software must display the following acknowledgment: | |
22 | + * "This product includes software developed by the OpenSSL Project | |
23 | + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | |
24 | + * | |
25 | + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | |
26 | + * endorse or promote products derived from this software without | |
27 | + * prior written permission. For written permission, please contact | |
28 | + * licensing@OpenSSL.org. | |
29 | + * | |
30 | + * 5. Products derived from this software may not be called "OpenSSL" | |
31 | + * nor may "OpenSSL" appear in their names without prior written | |
32 | + * permission of the OpenSSL Project. | |
33 | + * | |
34 | + * 6. Redistributions of any form whatsoever must retain the following | |
35 | + * acknowledgment: | |
36 | + * "This product includes software developed by the OpenSSL Project | |
37 | + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | |
38 | + * | |
39 | + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | |
40 | + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
41 | + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
42 | + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | |
43 | + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
44 | + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | |
45 | + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
46 | + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
47 | + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | |
48 | + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
49 | + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | |
50 | + * OF THE POSSIBILITY OF SUCH DAMAGE. | |
51 | + * ==================================================================== | |
52 | + * | |
53 | + * This product includes cryptographic software written by Eric Young | |
54 | + * (eay@cryptsoft.com). This product includes software written by Tim | |
55 | + * Hudson (tjh@cryptsoft.com). | |
56 | + * | |
57 | + */ | |
58 | +#include <stdio.h> | |
59 | +#include <string.h> | |
60 | +#include "apps.h" | |
61 | +#include <openssl/pem.h> | |
62 | +#include <openssl/err.h> | |
63 | +#include <openssl/evp.h> | |
64 | + | |
65 | +#define PROG pkeyparam_main | |
66 | + | |
67 | +int MAIN(int, char **); | |
68 | + | |
69 | +int MAIN(int argc, char **argv) | |
70 | + { | |
71 | + char **args, *infile = NULL, *outfile = NULL; | |
72 | + BIO *in = NULL, *out = NULL; | |
73 | + int text = 0, noout = 0; | |
74 | + EVP_PKEY *pkey=NULL; | |
75 | + int badarg = 0; | |
76 | +#ifndef OPENSSL_NO_ENGINE | |
77 | + ENGINE *e = NULL; | |
78 | + char *engine=NULL; | |
79 | +#endif | |
80 | + int ret = 1; | |
81 | + | |
82 | + if (bio_err == NULL) | |
83 | + bio_err = BIO_new_fp (stderr, BIO_NOCLOSE); | |
84 | + | |
85 | + if (!load_config(bio_err, NULL)) | |
86 | + goto end; | |
87 | + | |
88 | + ERR_load_crypto_strings(); | |
89 | + OpenSSL_add_all_algorithms(); | |
90 | + args = argv + 1; | |
91 | + while (!badarg && *args && *args[0] == '-') | |
92 | + { | |
93 | + if (!strcmp (*args, "-in")) | |
94 | + { | |
95 | + if (args[1]) | |
96 | + { | |
97 | + args++; | |
98 | + infile = *args; | |
99 | + } | |
100 | + else badarg = 1; | |
101 | + } | |
102 | + else if (!strcmp (*args, "-out")) | |
103 | + { | |
104 | + if (args[1]) | |
105 | + { | |
106 | + args++; | |
107 | + outfile = *args; | |
108 | + } | |
109 | + else badarg = 1; | |
110 | + } | |
111 | +#ifndef OPENSSL_NO_ENGINE | |
112 | + else if (strcmp(*args,"-engine") == 0) | |
113 | + { | |
114 | + if (!args[1]) goto bad; | |
115 | + engine= *(++args); | |
116 | + } | |
117 | +#endif | |
118 | + | |
119 | + else if (strcmp(*args,"-text") == 0) | |
120 | + text=1; | |
121 | + else if (strcmp(*args,"-noout") == 0) | |
122 | + noout=1; | |
123 | + args++; | |
124 | + } | |
125 | + | |
126 | + if (badarg) | |
127 | + { | |
128 | +#ifndef OPENSSL_NO_ENGINE | |
129 | + bad: | |
130 | +#endif | |
131 | + BIO_printf(bio_err, "Usage pkeyparam [options]\n"); | |
132 | + BIO_printf(bio_err, "where options are\n"); | |
133 | + BIO_printf(bio_err, "-in file input file\n"); | |
134 | + BIO_printf(bio_err, "-out file output file\n"); | |
135 | + BIO_printf(bio_err, "-text print parameters as text\n"); | |
136 | + BIO_printf(bio_err, "-noout don't output encoded parameters\n"); | |
137 | +#ifndef OPENSSL_NO_ENGINE | |
138 | + BIO_printf(bio_err, "-engine e use engine e, possibly a hardware device.\n"); | |
139 | +#endif | |
140 | + return 1; | |
141 | + } | |
142 | + | |
143 | +#ifndef OPENSSL_NO_ENGINE | |
144 | + e = setup_engine(bio_err, engine, 0); | |
145 | +#endif | |
146 | + | |
147 | + if (infile) | |
148 | + { | |
149 | + if (!(in = BIO_new_file (infile, "r"))) | |
150 | + { | |
151 | + BIO_printf(bio_err, | |
152 | + "Can't open input file %s\n", infile); | |
153 | + goto end; | |
154 | + } | |
155 | + } | |
156 | + else | |
157 | + in = BIO_new_fp (stdin, BIO_NOCLOSE); | |
158 | + | |
159 | + if (outfile) | |
160 | + { | |
161 | + if (!(out = BIO_new_file (outfile, "w"))) | |
162 | + { | |
163 | + BIO_printf(bio_err, | |
164 | + "Can't open output file %s\n", outfile); | |
165 | + goto end; | |
166 | + } | |
167 | + } | |
168 | + else | |
169 | + { | |
170 | + out = BIO_new_fp (stdout, BIO_NOCLOSE); | |
171 | +#ifdef OPENSSL_SYS_VMS | |
172 | + { | |
173 | + BIO *tmpbio = BIO_new(BIO_f_linebuffer()); | |
174 | + out = BIO_push(tmpbio, out); | |
175 | + } | |
176 | +#endif | |
177 | + } | |
178 | + | |
179 | + pkey = PEM_read_bio_Parameters(in, NULL); | |
180 | + if (!pkey) | |
181 | + { | |
182 | + BIO_printf(bio_err, "Error reading paramters\n"); | |
183 | + ERR_print_errors(bio_err); | |
184 | + goto end; | |
185 | + } | |
186 | + | |
187 | + if (!noout) | |
188 | + PEM_write_bio_Parameters(out,pkey); | |
189 | + | |
190 | + if (text) | |
191 | + EVP_PKEY_print_params(out, pkey, 0, NULL); | |
192 | + | |
193 | + ret = 0; | |
194 | + | |
195 | + end: | |
196 | + EVP_PKEY_free(pkey); | |
197 | + BIO_free_all(out); | |
198 | + BIO_free(in); | |
199 | + | |
200 | + return ret; | |
201 | + } |
@@ -0,0 +1,570 @@ | ||
1 | +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | |
2 | + * project 2006. | |
3 | + */ | |
4 | +/* ==================================================================== | |
5 | + * Copyright (c) 2006 The OpenSSL Project. All rights reserved. | |
6 | + * | |
7 | + * Redistribution and use in source and binary forms, with or without | |
8 | + * modification, are permitted provided that the following conditions | |
9 | + * are met: | |
10 | + * | |
11 | + * 1. Redistributions of source code must retain the above copyright | |
12 | + * notice, this list of conditions and the following disclaimer. | |
13 | + * | |
14 | + * 2. Redistributions in binary form must reproduce the above copyright | |
15 | + * notice, this list of conditions and the following disclaimer in | |
16 | + * the documentation and/or other materials provided with the | |
17 | + * distribution. | |
18 | + * | |
19 | + * 3. All advertising materials mentioning features or use of this | |
20 | + * software must display the following acknowledgment: | |
21 | + * "This product includes software developed by the OpenSSL Project | |
22 | + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | |
23 | + * | |
24 | + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | |
25 | + * endorse or promote products derived from this software without | |
26 | + * prior written permission. For written permission, please contact | |
27 | + * licensing@OpenSSL.org. | |
28 | + * | |
29 | + * 5. Products derived from this software may not be called "OpenSSL" | |
30 | + * nor may "OpenSSL" appear in their names without prior written | |
31 | + * permission of the OpenSSL Project. | |
32 | + * | |
33 | + * 6. Redistributions of any form whatsoever must retain the following | |
34 | + * acknowledgment: | |
35 | + * "This product includes software developed by the OpenSSL Project | |
36 | + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | |
37 | + * | |
38 | + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | |
39 | + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
40 | + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
41 | + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | |
42 | + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
43 | + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | |
44 | + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
45 | + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
46 | + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | |
47 | + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
48 | + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | |
49 | + * OF THE POSSIBILITY OF SUCH DAMAGE. | |
50 | + * ==================================================================== | |
51 | + * | |
52 | + * This product includes cryptographic software written by Eric Young | |
53 | + * (eay@cryptsoft.com). This product includes software written by Tim | |
54 | + * Hudson (tjh@cryptsoft.com). | |
55 | + * | |
56 | + */ | |
57 | + | |
58 | + | |
59 | +#include "apps.h" | |
60 | +#include <string.h> | |
61 | +#include <openssl/err.h> | |
62 | +#include <openssl/pem.h> | |
63 | +#include <openssl/evp.h> | |
64 | + | |
65 | +#define KEY_PRIVKEY 1 | |
66 | +#define KEY_PUBKEY 2 | |
67 | +#define KEY_CERT 3 | |
68 | + | |
69 | +static void usage(void); | |
70 | + | |
71 | +#undef PROG | |
72 | + | |
73 | +#define PROG pkeyutl_main | |
74 | + | |
75 | +static EVP_PKEY_CTX *init_ctx(int *pkeysize, | |
76 | + char *keyfile, int keyform, int key_type, | |
77 | + char *passargin, int pkey_op, ENGINE *e); | |
78 | + | |
79 | +static int setup_peer(BIO *err, EVP_PKEY_CTX *ctx, int peerform, | |
80 | + const char *file); | |
81 | + | |
82 | +static int do_keyop(EVP_PKEY_CTX *ctx, int pkey_op, | |
83 | + unsigned char *out, size_t *poutlen, | |
84 | + unsigned char *in, size_t inlen); | |
85 | + | |
86 | +int MAIN(int argc, char **); | |
87 | + | |
88 | +int MAIN(int argc, char **argv) | |
89 | +{ | |
90 | + BIO *in = NULL, *out = NULL; | |
91 | + char *infile = NULL, *outfile = NULL, *sigfile = NULL; | |
92 | + ENGINE *e = NULL; | |
93 | + int pkey_op = EVP_PKEY_OP_SIGN, key_type = KEY_PRIVKEY; | |
94 | + int keyform = FORMAT_PEM, peerform = FORMAT_PEM; | |
95 | + char badarg = 0, rev = 0; | |
96 | + char hexdump = 0, asn1parse = 0; | |
97 | + EVP_PKEY_CTX *ctx = NULL; | |
98 | + char *passargin = NULL; | |
99 | + int keysize = -1; | |
100 | + | |
101 | + unsigned char *buf_in = NULL, *buf_out = NULL, *sig = NULL; | |
102 | + size_t buf_outlen; | |
103 | + int buf_inlen = 0, siglen = -1; | |
104 | + | |
105 | + int ret = 1, rv = -1; | |
106 | + | |
107 | + argc--; | |
108 | + argv++; | |
109 | + | |
110 | + if(!bio_err) bio_err = BIO_new_fp(stderr, BIO_NOCLOSE); | |
111 | + | |
112 | + if (!load_config(bio_err, NULL)) | |
113 | + goto end; | |
114 | + ERR_load_crypto_strings(); | |
115 | + OpenSSL_add_all_algorithms(); | |
116 | + | |
117 | + while(argc >= 1) | |
118 | + { | |
119 | + if (!strcmp(*argv,"-in")) | |
120 | + { | |
121 | + if (--argc < 1) badarg = 1; | |
122 | + infile= *(++argv); | |
123 | + } | |
124 | + else if (!strcmp(*argv,"-out")) | |
125 | + { | |
126 | + if (--argc < 1) badarg = 1; | |
127 | + outfile= *(++argv); | |
128 | + } | |
129 | + else if (!strcmp(*argv,"-sigfile")) | |
130 | + { | |
131 | + if (--argc < 1) badarg = 1; | |
132 | + sigfile= *(++argv); | |
133 | + } | |
134 | + else if(!strcmp(*argv, "-inkey")) | |
135 | + { | |
136 | + if (--argc < 1) | |
137 | + badarg = 1; | |
138 | + else | |
139 | + { | |
140 | + ctx = init_ctx(&keysize, | |
141 | + *(++argv), keyform, key_type, | |
142 | + passargin, pkey_op, e); | |
143 | + if (!ctx) | |
144 | + { | |
145 | + BIO_puts(bio_err, | |
146 | + "Error initializing context\n"); | |
147 | + ERR_print_errors(bio_err); | |
148 | + badarg = 1; | |
149 | + } | |
150 | + } | |
151 | + } | |
152 | + else if (!strcmp(*argv,"-peerkey")) | |
153 | + { | |
154 | + if (--argc < 1) | |
155 | + badarg = 1; | |
156 | + else if (!setup_peer(bio_err, ctx, peerform, *(++argv))) | |
157 | + badarg = 1; | |
158 | + } | |
159 | + else if (!strcmp(*argv,"-passin")) | |
160 | + { | |
161 | + if (--argc < 1) badarg = 1; | |
162 | + passargin= *(++argv); | |
163 | + } | |
164 | + else if (strcmp(*argv,"-peerform") == 0) | |
165 | + { | |
166 | + if (--argc < 1) badarg = 1; | |
167 | + peerform=str2fmt(*(++argv)); | |
168 | + } | |
169 | + else if (strcmp(*argv,"-keyform") == 0) | |
170 | + { | |
171 | + if (--argc < 1) badarg = 1; | |
172 | + keyform=str2fmt(*(++argv)); | |
173 | + } | |
174 | +#ifndef OPENSSL_NO_ENGINE | |
175 | + else if(!strcmp(*argv, "-engine")) | |
176 | + { | |
177 | + if (--argc < 1) | |
178 | + badarg = 1; | |
179 | + else | |
180 | + e = setup_engine(bio_err, *(++argv), 0); | |
181 | + } | |
182 | +#endif | |
183 | + else if(!strcmp(*argv, "-pubin")) | |
184 | + key_type = KEY_PUBKEY; | |
185 | + else if(!strcmp(*argv, "-certin")) | |
186 | + key_type = KEY_CERT; | |
187 | + else if(!strcmp(*argv, "-asn1parse")) | |
188 | + asn1parse = 1; | |
189 | + else if(!strcmp(*argv, "-hexdump")) | |
190 | + hexdump = 1; | |
191 | + else if(!strcmp(*argv, "-sign")) | |
192 | + pkey_op = EVP_PKEY_OP_SIGN; | |
193 | + else if(!strcmp(*argv, "-verify")) | |
194 | + pkey_op = EVP_PKEY_OP_VERIFY; | |
195 | + else if(!strcmp(*argv, "-verifyrecover")) | |
196 | + pkey_op = EVP_PKEY_OP_VERIFYRECOVER; | |
197 | + else if(!strcmp(*argv, "-rev")) | |
198 | + rev = 1; | |
199 | + else if(!strcmp(*argv, "-encrypt")) | |
200 | + pkey_op = EVP_PKEY_OP_ENCRYPT; | |
201 | + else if(!strcmp(*argv, "-decrypt")) | |
202 | + pkey_op = EVP_PKEY_OP_DECRYPT; | |
203 | + else if(!strcmp(*argv, "-derive")) | |
204 | + pkey_op = EVP_PKEY_OP_DERIVE; | |
205 | + else if (strcmp(*argv,"-pkeyopt") == 0) | |
206 | + { | |
207 | + if (--argc < 1) | |
208 | + badarg = 1; | |
209 | + else if (!ctx) | |
210 | + { | |
211 | + BIO_puts(bio_err, | |
212 | + "-pkeyopt command before -inkey\n"); | |
213 | + badarg = 1; | |
214 | + } | |
215 | + else if (pkey_ctrl_string(ctx, *(++argv)) <= 0) | |
216 | + { | |
217 | + BIO_puts(bio_err, "parameter setting error\n"); | |
218 | + ERR_print_errors(bio_err); | |
219 | + goto end; | |
220 | + } | |
221 | + } | |
222 | + else badarg = 1; | |
223 | + if(badarg) | |
224 | + { | |
225 | + usage(); | |
226 | + goto end; | |
227 | + } | |
228 | + argc--; | |
229 | + argv++; | |
230 | + } | |
231 | + | |
232 | + if (!ctx) | |
233 | + { | |
234 | + usage(); | |
235 | + goto end; | |
236 | + } | |
237 | + | |
238 | + if (sigfile && (pkey_op != EVP_PKEY_OP_VERIFY)) | |
239 | + { | |
240 | + BIO_puts(bio_err, "Signature file specified for non verify\n"); | |
241 | + goto end; | |
242 | + } | |
243 | + | |
244 | + if (!sigfile && (pkey_op == EVP_PKEY_OP_VERIFY)) | |
245 | + { | |
246 | + BIO_puts(bio_err, "No signature file specified for verify\n"); | |
247 | + goto end; | |
248 | + } | |
249 | + | |
250 | +/* FIXME: seed PRNG only if needed */ | |
251 | + app_RAND_load_file(NULL, bio_err, 0); | |
252 | + | |
253 | + if (pkey_op != EVP_PKEY_OP_DERIVE) | |
254 | + { | |
255 | + if(infile) | |
256 | + { | |
257 | + if(!(in = BIO_new_file(infile, "rb"))) | |
258 | + { | |
259 | + BIO_puts(bio_err, | |
260 | + "Error Opening Input File\n"); | |
261 | + ERR_print_errors(bio_err); | |
262 | + goto end; | |
263 | + } | |
264 | + } | |
265 | + else | |
266 | + in = BIO_new_fp(stdin, BIO_NOCLOSE); | |
267 | + } | |
268 | + | |
269 | + if(outfile) | |
270 | + { | |
271 | + if(!(out = BIO_new_file(outfile, "wb"))) | |
272 | + { | |
273 | + BIO_printf(bio_err, "Error Creating Output File\n"); | |
274 | + ERR_print_errors(bio_err); | |
275 | + goto end; | |
276 | + } | |
277 | + } | |
278 | + else | |
279 | + { | |
280 | + out = BIO_new_fp(stdout, BIO_NOCLOSE); | |
281 | +#ifdef OPENSSL_SYS_VMS | |
282 | + { | |
283 | + BIO *tmpbio = BIO_new(BIO_f_linebuffer()); | |
284 | + out = BIO_push(tmpbio, out); | |
285 | + } | |
286 | +#endif | |
287 | + } | |
288 | + | |
289 | + if (sigfile) | |
290 | + { | |
291 | + BIO *sigbio = BIO_new_file(sigfile, "rb"); | |
292 | + if (!sigbio) | |
293 | + { | |
294 | + BIO_printf(bio_err, "Can't open signature file %s\n", | |
295 | + sigfile); | |
296 | + goto end; | |
297 | + } | |
298 | + siglen = bio_to_mem(&sig, keysize * 10, sigbio); | |
299 | + BIO_free(sigbio); | |
300 | + if (siglen <= 0) | |
301 | + { | |
302 | + BIO_printf(bio_err, "Error reading signature data\n"); | |
303 | + goto end; | |
304 | + } | |
305 | + } | |
306 | + | |
307 | + if (in) | |
308 | + { | |
309 | + /* Read the input data */ | |
310 | + buf_inlen = bio_to_mem(&buf_in, keysize * 10, in); | |
311 | + if(buf_inlen <= 0) | |
312 | + { | |
313 | + BIO_printf(bio_err, "Error reading input Data\n"); | |
314 | + exit(1); | |
315 | + } | |
316 | + if(rev) | |
317 | + { | |
318 | + size_t i; | |
319 | + unsigned char ctmp; | |
320 | + size_t l = (size_t)buf_inlen; | |
321 | + for(i = 0; i < l/2; i++) | |
322 | + { | |
323 | + ctmp = buf_in[i]; | |
324 | + buf_in[i] = buf_in[l - 1 - i]; | |
325 | + buf_in[l - 1 - i] = ctmp; | |
326 | + } | |
327 | + } | |
328 | + } | |
329 | + | |
330 | + if(pkey_op == EVP_PKEY_OP_VERIFY) | |
331 | + { | |
332 | + rv = EVP_PKEY_verify(ctx, sig, (size_t)siglen, | |
333 | + buf_in, (size_t)buf_inlen); | |
334 | + if (rv == 0) | |
335 | + BIO_puts(out, "Signature Verification Failure\n"); | |
336 | + else if (rv == 1) | |
337 | + BIO_puts(out, "Signature Verified Successfully\n"); | |
338 | + if (rv >= 0) | |
339 | + goto end; | |
340 | + } | |
341 | + else | |
342 | + { | |
343 | + rv = do_keyop(ctx, pkey_op, NULL, (size_t *)&buf_outlen, | |
344 | + buf_in, (size_t)buf_inlen); | |
345 | + if (rv > 0) | |
346 | + { | |
347 | + buf_out = OPENSSL_malloc(buf_outlen); | |
348 | + if (!buf_out) | |
349 | + rv = -1; | |
350 | + else | |
351 | + rv = do_keyop(ctx, pkey_op, | |
352 | + buf_out, (size_t *)&buf_outlen, | |
353 | + buf_in, (size_t)buf_inlen); | |
354 | + } | |
355 | + } | |
356 | + | |
357 | + if(rv <= 0) | |
358 | + { | |
359 | + BIO_printf(bio_err, "Public Key operation error\n"); | |
360 | + ERR_print_errors(bio_err); | |
361 | + goto end; | |
362 | + } | |
363 | + ret = 0; | |
364 | + if(asn1parse) | |
365 | + { | |
366 | + if(!ASN1_parse_dump(out, buf_out, buf_outlen, 1, -1)) | |
367 | + ERR_print_errors(bio_err); | |
368 | + } | |
369 | + else if(hexdump) | |
370 | + BIO_dump(out, (char *)buf_out, buf_outlen); | |
371 | + else | |
372 | + BIO_write(out, buf_out, buf_outlen); | |
373 | + | |
374 | + end: | |
375 | + if (ctx) | |
376 | + EVP_PKEY_CTX_free(ctx); | |
377 | + BIO_free(in); | |
378 | + BIO_free_all(out); | |
379 | + if (buf_in) | |
380 | + OPENSSL_free(buf_in); | |
381 | + if (buf_out) | |
382 | + OPENSSL_free(buf_out); | |
383 | + if (sig) | |
384 | + OPENSSL_free(sig); | |
385 | + return ret; | |
386 | +} | |
387 | + | |
388 | +static void usage() | |
389 | +{ | |
390 | + BIO_printf(bio_err, "Usage: pkeyutl [options]\n"); | |
391 | + BIO_printf(bio_err, "-in file input file\n"); | |
392 | + BIO_printf(bio_err, "-out file output file\n"); | |
393 | + BIO_printf(bio_err, "-signature file signature file (verify operation only)\n"); | |
394 | + BIO_printf(bio_err, "-inkey file input key\n"); | |
395 | + BIO_printf(bio_err, "-keyform arg private key format - default PEM\n"); | |
396 | + BIO_printf(bio_err, "-pubin input is a public key\n"); | |
397 | + BIO_printf(bio_err, "-certin input is a certificate carrying a public key\n"); | |
398 | + BIO_printf(bio_err, "-pkeyopt X:Y public key options\n"); | |
399 | + BIO_printf(bio_err, "-sign sign with private key\n"); | |
400 | + BIO_printf(bio_err, "-verify verify with public key\n"); | |
401 | + BIO_printf(bio_err, "-verifyrecover verify with public key, recover original data\n"); | |
402 | + BIO_printf(bio_err, "-encrypt encrypt with public key\n"); | |
403 | + BIO_printf(bio_err, "-decrypt decrypt with private key\n"); | |
404 | + BIO_printf(bio_err, "-derive derive shared secret\n"); | |
405 | + BIO_printf(bio_err, "-hexdump hex dump output\n"); | |
406 | +#ifndef OPENSSL_NO_ENGINE | |
407 | + BIO_printf(bio_err, "-engine e use engine e, possibly a hardware device.\n"); | |
408 | +#endif | |
409 | + BIO_printf(bio_err, "-passin arg pass phrase source\n"); | |
410 | + | |
411 | +} | |
412 | + | |
413 | +static EVP_PKEY_CTX *init_ctx(int *pkeysize, | |
414 | + char *keyfile, int keyform, int key_type, | |
415 | + char *passargin, int pkey_op, ENGINE *e) | |
416 | + { | |
417 | + EVP_PKEY *pkey = NULL; | |
418 | + EVP_PKEY_CTX *ctx = NULL; | |
419 | + char *passin = NULL; | |
420 | + int rv = -1; | |
421 | + X509 *x; | |
422 | + if(((pkey_op == EVP_PKEY_OP_SIGN) || (pkey_op == EVP_PKEY_OP_DECRYPT) | |
423 | + || (pkey_op == EVP_PKEY_OP_DERIVE)) | |
424 | + && (key_type != KEY_PRIVKEY)) | |
425 | + { | |
426 | + BIO_printf(bio_err, "A private key is needed for this operation\n"); | |
427 | + goto end; | |
428 | + } | |
429 | + if(!app_passwd(bio_err, passargin, NULL, &passin, NULL)) | |
430 | + { | |
431 | + BIO_printf(bio_err, "Error getting password\n"); | |
432 | + goto end; | |
433 | + } | |
434 | + switch(key_type) | |
435 | + { | |
436 | + case KEY_PRIVKEY: | |
437 | + pkey = load_key(bio_err, keyfile, keyform, 0, | |
438 | + passin, e, "Private Key"); | |
439 | + break; | |
440 | + | |
441 | + case KEY_PUBKEY: | |
442 | + pkey = load_pubkey(bio_err, keyfile, keyform, 0, | |
443 | + NULL, e, "Public Key"); | |
444 | + break; | |
445 | + | |
446 | + case KEY_CERT: | |
447 | + x = load_cert(bio_err, keyfile, keyform, | |
448 | + NULL, e, "Certificate"); | |
449 | + if(x) | |
450 | + { | |
451 | + pkey = X509_get_pubkey(x); | |
452 | + X509_free(x); | |
453 | + } | |
454 | + break; | |
455 | + | |
456 | + } | |
457 | + | |
458 | + *pkeysize = EVP_PKEY_size(pkey); | |
459 | + | |
460 | + if (!pkey) | |
461 | + goto end; | |
462 | + | |
463 | + ctx = EVP_PKEY_CTX_new(pkey, e); | |
464 | + | |
465 | + EVP_PKEY_free(pkey); | |
466 | + | |
467 | + if (!ctx) | |
468 | + goto end; | |
469 | + | |
470 | + switch(pkey_op) | |
471 | + { | |
472 | + case EVP_PKEY_OP_SIGN: | |
473 | + rv = EVP_PKEY_sign_init(ctx); | |
474 | + break; | |
475 | + | |
476 | + case EVP_PKEY_OP_VERIFY: | |
477 | + rv = EVP_PKEY_verify_init(ctx); | |
478 | + break; | |
479 | + | |
480 | + case EVP_PKEY_OP_VERIFYRECOVER: | |
481 | + rv = EVP_PKEY_verify_recover_init(ctx); | |
482 | + break; | |
483 | + | |
484 | + case EVP_PKEY_OP_ENCRYPT: | |
485 | + rv = EVP_PKEY_encrypt_init(ctx); | |
486 | + break; | |
487 | + | |
488 | + case EVP_PKEY_OP_DECRYPT: | |
489 | + rv = EVP_PKEY_decrypt_init(ctx); | |
490 | + break; | |
491 | + | |
492 | + case EVP_PKEY_OP_DERIVE: | |
493 | + rv = EVP_PKEY_derive_init(ctx); | |
494 | + break; | |
495 | + } | |
496 | + | |
497 | + if (rv <= 0) | |
498 | + { | |
499 | + EVP_PKEY_CTX_free(ctx); | |
500 | + ctx = NULL; | |
501 | + } | |
502 | + | |
503 | + end: | |
504 | + | |
505 | + if (passin) | |
506 | + OPENSSL_free(passin); | |
507 | + | |
508 | + return ctx; | |
509 | + | |
510 | + | |
511 | + } | |
512 | + | |
513 | +static int setup_peer(BIO *err, EVP_PKEY_CTX *ctx, int peerform, | |
514 | + const char *file) | |
515 | + { | |
516 | + EVP_PKEY *peer = NULL; | |
517 | + int ret; | |
518 | + if (!ctx) | |
519 | + { | |
520 | + BIO_puts(err, "-peerkey command before -inkey\n"); | |
521 | + return 0; | |
522 | + } | |
523 | + | |
524 | + peer = load_pubkey(bio_err, file, peerform, 0, NULL, NULL, "Peer Key"); | |
525 | + | |
526 | + if (!peer) | |
527 | + { | |
528 | + BIO_printf(bio_err, "Error reading peer key %s\n", file); | |
529 | + ERR_print_errors(err); | |
530 | + return 0; | |
531 | + } | |
532 | + | |
533 | + ret = EVP_PKEY_derive_set_peer(ctx, peer); | |
534 | + | |
535 | + EVP_PKEY_free(peer); | |
536 | + if (ret <= 0) | |
537 | + ERR_print_errors(err); | |
538 | + return ret; | |
539 | + } | |
540 | + | |
541 | +static int do_keyop(EVP_PKEY_CTX *ctx, int pkey_op, | |
542 | + unsigned char *out, size_t *poutlen, | |
543 | + unsigned char *in, size_t inlen) | |
544 | + { | |
545 | + int rv = 0; | |
546 | + switch(pkey_op) | |
547 | + { | |
548 | + case EVP_PKEY_OP_VERIFYRECOVER: | |
549 | + rv = EVP_PKEY_verify_recover(ctx, out, poutlen, in, inlen); | |
550 | + break; | |
551 | + | |
552 | + case EVP_PKEY_OP_SIGN: | |
553 | + rv = EVP_PKEY_sign(ctx, out, poutlen, in, inlen); | |
554 | + break; | |
555 | + | |
556 | + case EVP_PKEY_OP_ENCRYPT: | |
557 | + rv = EVP_PKEY_encrypt(ctx, out, poutlen, in, inlen); | |
558 | + break; | |
559 | + | |
560 | + case EVP_PKEY_OP_DECRYPT: | |
561 | + rv = EVP_PKEY_decrypt(ctx, out, poutlen, in, inlen); | |
562 | + break; | |
563 | + | |
564 | + case EVP_PKEY_OP_DERIVE: | |
565 | + rv = EVP_PKEY_derive(ctx, out, poutlen); | |
566 | + break; | |
567 | + | |
568 | + } | |
569 | + return rv; | |
570 | + } |
@@ -0,0 +1,1144 @@ | ||
1 | +/* apps/ts.c */ | |
2 | +/* Written by Zoltan Glozik (zglozik@stones.com) for the OpenSSL | |
3 | + * project 2002. | |
4 | + */ | |
5 | +/* ==================================================================== | |
6 | + * Copyright (c) 2001 The OpenSSL Project. All rights reserved. | |
7 | + * | |
8 | + * Redistribution and use in source and binary forms, with or without | |
9 | + * modification, are permitted provided that the following conditions | |
10 | + * are met: | |
11 | + * | |
12 | + * 1. Redistributions of source code must retain the above copyright | |
13 | + * notice, this list of conditions and the following disclaimer. | |
14 | + * | |
15 | + * 2. Redistributions in binary form must reproduce the above copyright | |
16 | + * notice, this list of conditions and the following disclaimer in | |
17 | + * the documentation and/or other materials provided with the | |
18 | + * distribution. | |
19 | + * | |
20 | + * 3. All advertising materials mentioning features or use of this | |
21 | + * software must display the following acknowledgment: | |
22 | + * "This product includes software developed by the OpenSSL Project | |
23 | + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | |
24 | + * | |
25 | + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | |
26 | + * endorse or promote products derived from this software without | |
27 | + * prior written permission. For written permission, please contact | |
28 | + * licensing@OpenSSL.org. | |
29 | + * | |
30 | + * 5. Products derived from this software may not be called "OpenSSL" | |
31 | + * nor may "OpenSSL" appear in their names without prior written | |
32 | + * permission of the OpenSSL Project. | |
33 | + * | |
34 | + * 6. Redistributions of any form whatsoever must retain the following | |
35 | + * acknowledgment: | |
36 | + * "This product includes software developed by the OpenSSL Project | |
37 | + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | |
38 | + * | |
39 | + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | |
40 | + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
41 | + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
42 | + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | |
43 | + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
44 | + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | |
45 | + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
46 | + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
47 | + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | |
48 | + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
49 | + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | |
50 | + * OF THE POSSIBILITY OF SUCH DAMAGE. | |
51 | + * ==================================================================== | |
52 | + * | |
53 | + * This product includes cryptographic software written by Eric Young | |
54 | + * (eay@cryptsoft.com). This product includes software written by Tim | |
55 | + * Hudson (tjh@cryptsoft.com). | |
56 | + * | |
57 | + */ | |
58 | + | |
59 | +#include <stdio.h> | |
60 | +#include <stdlib.h> | |
61 | +#include <string.h> | |
62 | +#include "apps.h" | |
63 | +#include <openssl/bio.h> | |
64 | +#include <openssl/err.h> | |
65 | +#include <openssl/pem.h> | |
66 | +#include <openssl/rand.h> | |
67 | +#include <openssl/ts.h> | |
68 | +#include <openssl/bn.h> | |
69 | + | |
70 | +#undef PROG | |
71 | +#define PROG ts_main | |
72 | + | |
73 | +/* Length of the nonce of the request in bits (must be a multiple of 8). */ | |
74 | +#define NONCE_LENGTH 64 | |
75 | + | |
76 | +/* Macro definitions for the configuration file. */ | |
77 | +#define ENV_OID_FILE "oid_file" | |
78 | + | |
79 | +/* Local function declarations. */ | |
80 | + | |
81 | +static ASN1_OBJECT *txt2obj(const char *oid); | |
82 | +static CONF *load_config_file(const char *configfile); | |
83 | + | |
84 | +/* Query related functions. */ | |
85 | +static int query_command(const char *data, char *digest, | |
86 | + const EVP_MD *md, const char *policy, int no_nonce, | |
87 | + int cert, const char *in, const char *out, int text); | |
88 | +static BIO *BIO_open_with_default(const char *file, const char *mode, | |
89 | + FILE *default_fp); | |
90 | +static TS_REQ *create_query(BIO *data_bio, char *digest, const EVP_MD *md, | |
91 | + const char *policy, int no_nonce, int cert); | |
92 | +static int create_digest(BIO *input, char *digest, | |
93 | + const EVP_MD *md, unsigned char **md_value); | |
94 | +static ASN1_INTEGER *create_nonce(int bits); | |
95 | + | |
96 | +/* Reply related functions. */ | |
97 | +static int reply_command(CONF *conf, char *section, char *engine, | |
98 | + char *queryfile, char *passin, char *inkey, | |
99 | + char *signer, char *chain, const char *policy, | |
100 | + char *in, int token_in, char *out, int token_out, | |
101 | + int text); | |
102 | +static TS_RESP *read_PKCS7(BIO *in_bio); | |
103 | +static TS_RESP *create_response(CONF *conf, const char *section, char *engine, | |
104 | + char *queryfile, char *passin, char *inkey, | |
105 | + char *signer, char *chain, const char *policy); | |
106 | +static ASN1_INTEGER * MS_CALLBACK serial_cb(TS_RESP_CTX *ctx, void *data); | |
107 | +static ASN1_INTEGER *next_serial(const char *serialfile); | |
108 | +static int save_ts_serial(const char *serialfile, ASN1_INTEGER *serial); | |
109 | + | |
110 | +/* Verify related functions. */ | |
111 | +static int verify_command(char *data, char *digest, char *queryfile, | |
112 | + char *in, int token_in, | |
113 | + char *ca_path, char *ca_file, char *untrusted); | |
114 | +static TS_VERIFY_CTX *create_verify_ctx(char *data, char *digest, | |
115 | + char *queryfile, | |
116 | + char *ca_path, char *ca_file, | |
117 | + char *untrusted); | |
118 | +static X509_STORE *create_cert_store(char *ca_path, char *ca_file); | |
119 | +static int MS_CALLBACK verify_cb(int ok, X509_STORE_CTX *ctx); | |
120 | + | |
121 | +/* Main function definition. */ | |
122 | +int MAIN(int, char **); | |
123 | + | |
124 | +int MAIN(int argc, char **argv) | |
125 | + { | |
126 | + int ret = 1; | |
127 | + char *configfile = NULL; | |
128 | + char *section = NULL; | |
129 | + CONF *conf = NULL; | |
130 | + enum mode { | |
131 | + CMD_NONE, CMD_QUERY, CMD_REPLY, CMD_VERIFY | |
132 | + } mode = CMD_NONE; | |
133 | + char *data = NULL; | |
134 | + char *digest = NULL; | |
135 | + const EVP_MD *md = NULL; | |
136 | + char *rnd = NULL; | |
137 | + char *policy = NULL; | |
138 | + int no_nonce = 0; | |
139 | + int cert = 0; | |
140 | + char *in = NULL; | |
141 | + char *out = NULL; | |
142 | + int text = 0; | |
143 | + char *queryfile = NULL; | |
144 | + char *passin = NULL; /* Password source. */ | |
145 | + char *password =NULL; /* Password itself. */ | |
146 | + char *inkey = NULL; | |
147 | + char *signer = NULL; | |
148 | + char *chain = NULL; | |
149 | + char *ca_path = NULL; | |
150 | + char *ca_file = NULL; | |
151 | + char *untrusted = NULL; | |
152 | + char *engine = NULL; | |
153 | + /* Input is ContentInfo instead of TimeStampResp. */ | |
154 | + int token_in = 0; | |
155 | + /* Output is ContentInfo instead of TimeStampResp. */ | |
156 | + int token_out = 0; | |
157 | + int free_bio_err = 0; | |
158 | + | |
159 | + ERR_load_crypto_strings(); | |
160 | + apps_startup(); | |
161 | + | |
162 | + if (bio_err == NULL && (bio_err = BIO_new(BIO_s_file())) != NULL) | |
163 | + { | |
164 | + free_bio_err = 1; | |
165 | + BIO_set_fp(bio_err, stderr, BIO_NOCLOSE | BIO_FP_TEXT); | |
166 | + } | |
167 | + | |
168 | + for (argc--, argv++; argc > 0; argc--, argv++) | |
169 | + { | |
170 | + if (strcmp(*argv, "-config") == 0) | |
171 | + { | |
172 | + if (argc-- < 1) goto usage; | |
173 | + configfile = *++argv; | |
174 | + } | |
175 | + else if (strcmp(*argv, "-section") == 0) | |
176 | + { | |
177 | + if (argc-- < 1) goto usage; | |
178 | + section = *++argv; | |
179 | + } | |
180 | + else if (strcmp(*argv, "-query") == 0) | |
181 | + { | |
182 | + if (mode != CMD_NONE) goto usage; | |
183 | + mode = CMD_QUERY; | |
184 | + } | |
185 | + else if (strcmp(*argv, "-data") == 0) | |
186 | + { | |
187 | + if (argc-- < 1) goto usage; | |
188 | + data = *++argv; | |
189 | + } | |
190 | + else if (strcmp(*argv, "-digest") == 0) | |
191 | + { | |
192 | + if (argc-- < 1) goto usage; | |
193 | + digest = *++argv; | |
194 | + } | |
195 | + else if (strcmp(*argv, "-rand") == 0) | |
196 | + { | |
197 | + if (argc-- < 1) goto usage; | |
198 | + rnd = *++argv; | |
199 | + } | |
200 | + else if (strcmp(*argv, "-policy") == 0) | |
201 | + { | |
202 | + if (argc-- < 1) goto usage; | |
203 | + policy = *++argv; | |
204 | + } | |
205 | + else if (strcmp(*argv, "-no_nonce") == 0) | |
206 | + { | |
207 | + no_nonce = 1; | |
208 | + } | |
209 | + else if (strcmp(*argv, "-cert") == 0) | |
210 | + { | |
211 | + cert = 1; | |
212 | + } | |
213 | + else if (strcmp(*argv, "-in") == 0) | |
214 | + { | |
215 | + if (argc-- < 1) goto usage; | |
216 | + in = *++argv; | |
217 | + } | |
218 | + else if (strcmp(*argv, "-token_in") == 0) | |
219 | + { | |
220 | + token_in = 1; | |
221 | + } | |
222 | + else if (strcmp(*argv, "-out") == 0) | |
223 | + { | |
224 | + if (argc-- < 1) goto usage; | |
225 | + out = *++argv; | |
226 | + } | |
227 | + else if (strcmp(*argv, "-token_out") == 0) | |
228 | + { | |
229 | + token_out = 1; | |
230 | + } | |
231 | + else if (strcmp(*argv, "-text") == 0) | |
232 | + { | |
233 | + text = 1; | |
234 | + } | |
235 | + else if (strcmp(*argv, "-reply") == 0) | |
236 | + { | |
237 | + if (mode != CMD_NONE) goto usage; | |
238 | + mode = CMD_REPLY; | |
239 | + } | |
240 | + else if (strcmp(*argv, "-queryfile") == 0) | |
241 | + { | |
242 | + if (argc-- < 1) goto usage; | |
243 | + queryfile = *++argv; | |
244 | + } | |
245 | + else if (strcmp(*argv, "-passin") == 0) | |
246 | + { | |
247 | + if (argc-- < 1) goto usage; | |
248 | + passin = *++argv; | |
249 | + } | |
250 | + else if (strcmp(*argv, "-inkey") == 0) | |
251 | + { | |
252 | + if (argc-- < 1) goto usage; | |
253 | + inkey = *++argv; | |
254 | + } | |
255 | + else if (strcmp(*argv, "-signer") == 0) | |
256 | + { | |
257 | + if (argc-- < 1) goto usage; | |
258 | + signer = *++argv; | |
259 | + } | |
260 | + else if (strcmp(*argv, "-chain") == 0) | |
261 | + { | |
262 | + if (argc-- < 1) goto usage; | |
263 | + chain = *++argv; | |
264 | + } | |
265 | + else if (strcmp(*argv, "-verify") == 0) | |
266 | + { | |
267 | + if (mode != CMD_NONE) goto usage; | |
268 | + mode = CMD_VERIFY; | |
269 | + } | |
270 | + else if (strcmp(*argv, "-CApath") == 0) | |
271 | + { | |
272 | + if (argc-- < 1) goto usage; | |
273 | + ca_path = *++argv; | |
274 | + } | |
275 | + else if (strcmp(*argv, "-CAfile") == 0) | |
276 | + { | |
277 | + if (argc-- < 1) goto usage; | |
278 | + ca_file = *++argv; | |
279 | + } | |
280 | + else if (strcmp(*argv, "-untrusted") == 0) | |
281 | + { | |
282 | + if (argc-- < 1) goto usage; | |
283 | + untrusted = *++argv; | |
284 | + } | |
285 | + else if (strcmp(*argv, "-engine") == 0) | |
286 | + { | |
287 | + if (argc-- < 1) goto usage; | |
288 | + engine = *++argv; | |
289 | + } | |
290 | + else if ((md = EVP_get_digestbyname(*argv + 1)) != NULL) | |
291 | + { | |
292 | + /* empty. */ | |
293 | + } | |
294 | + else | |
295 | + goto usage; | |
296 | + } | |
297 | + | |
298 | + /* Seed the random number generator if it is going to be used. */ | |
299 | + if (mode == CMD_QUERY && !no_nonce) | |
300 | + { | |
301 | + if (!app_RAND_load_file(NULL, bio_err, 1) && rnd == NULL) | |
302 | + BIO_printf(bio_err, "warning, not much extra random " | |
303 | + "data, consider using the -rand option\n"); | |
304 | + if (rnd != NULL) | |
305 | + BIO_printf(bio_err,"%ld semi-random bytes loaded\n", | |
306 | + app_RAND_load_files(rnd)); | |
307 | + } | |
308 | + | |
309 | + /* Get the password if required. */ | |
310 | + if(mode == CMD_REPLY && passin && | |
311 | + !app_passwd(bio_err, passin, NULL, &password, NULL)) | |
312 | + { | |
313 | + BIO_printf(bio_err,"Error getting password.\n"); | |
314 | + goto cleanup; | |
315 | + } | |
316 | + | |
317 | + /* Check consistency of parameters and execute | |
318 | + the appropriate function. */ | |
319 | + switch (mode) | |
320 | + { | |
321 | + case CMD_NONE: | |
322 | + goto usage; | |
323 | + case CMD_QUERY: | |
324 | + /* Data file and message imprint cannot be specified | |
325 | + at the same time. */ | |
326 | + ret = data != NULL && digest != NULL; | |
327 | + if (ret) goto usage; | |
328 | + /* Load the config file for possible policy OIDs. */ | |
329 | + conf = load_config_file(configfile); | |
330 | + ret = !query_command(data, digest, md, policy, no_nonce, cert, | |
331 | + in, out, text); | |
332 | + break; | |
333 | + case CMD_REPLY: | |
334 | + conf = load_config_file(configfile); | |
335 | + if (in == NULL) | |
336 | + { | |
337 | + ret = !(queryfile != NULL && conf != NULL && !token_in); | |
338 | + if (ret) goto usage; | |
339 | + } | |
340 | + else | |
341 | + { | |
342 | + /* 'in' and 'queryfile' are exclusive. */ | |
343 | + ret = !(queryfile == NULL); | |
344 | + if (ret) goto usage; | |
345 | + } | |
346 | + | |
347 | + ret = !reply_command(conf, section, engine, queryfile, | |
348 | + password, inkey, signer, chain, policy, | |
349 | + in, token_in, out, token_out, text); | |
350 | + break; | |
351 | + case CMD_VERIFY: | |
352 | + ret = !(((queryfile && !data && !digest) | |
353 | + || (!queryfile && data && !digest) | |
354 | + || (!queryfile && !data && digest)) | |
355 | + && in != NULL); | |
356 | + if (ret) goto usage; | |
357 | + | |
358 | + ret = !verify_command(data, digest, queryfile, in, token_in, | |
359 | + ca_path, ca_file, untrusted); | |
360 | + } | |
361 | + | |
362 | + goto cleanup; | |
363 | + | |
364 | + usage: | |
365 | + BIO_printf(bio_err, "usage:\n" | |
366 | + "ts -query [-rand file%cfile%c...] [-config configfile] " | |
367 | + "[-data file_to_hash] [-digest digest_bytes]" | |
368 | + "[-md2|-md4|-md5|-sha|-sha1|-mdc2|-ripemd160] " | |
369 | + "[-policy object_id] [-no_nonce] [-cert] " | |
370 | + "[-in request.tsq] [-out request.tsq] [-text]\n", | |
371 | + LIST_SEPARATOR_CHAR, LIST_SEPARATOR_CHAR); | |
372 | + BIO_printf(bio_err, "or\n" | |
373 | + "ts -reply [-config configfile] [-section tsa_section] " | |
374 | + "[-queryfile request.tsq] [-passin password] " | |
375 | + "[-signer tsa_cert.pem] [-inkey private_key.pem] " | |
376 | + "[-chain certs_file.pem] [-policy object_id] " | |
377 | + "[-in response.tsr] [-token_in] " | |
378 | + "[-out response.tsr] [-token_out] [-text] [-engine id]\n"); | |
379 | + BIO_printf(bio_err, "or\n" | |
380 | + "ts -verify [-data file_to_hash] [-digest digest_bytes] " | |
381 | + "[-queryfile request.tsq] " | |
382 | + "-in response.tsr [-token_in] " | |
383 | + "-CApath ca_path -CAfile ca_file.pem " | |
384 | + "-untrusted cert_file.pem\n"); | |
385 | + cleanup: | |
386 | + /* Clean up. */ | |
387 | + app_RAND_write_file(NULL, bio_err); | |
388 | + NCONF_free(conf); | |
389 | + OPENSSL_free(password); | |
390 | + OBJ_cleanup(); | |
391 | + if (free_bio_err) | |
392 | + { | |
393 | + BIO_free_all(bio_err); | |
394 | + bio_err = NULL; | |
395 | + } | |
396 | + | |
397 | + OPENSSL_EXIT(ret); | |
398 | + } | |
399 | + | |
400 | +/* | |
401 | + * Configuration file-related function definitions. | |
402 | + */ | |
403 | + | |
404 | +static ASN1_OBJECT *txt2obj(const char *oid) | |
405 | + { | |
406 | + ASN1_OBJECT *oid_obj = NULL; | |
407 | + | |
408 | + if (!(oid_obj = OBJ_txt2obj(oid, 0))) | |
409 | + BIO_printf(bio_err, "cannot convert %s to OID\n", oid); | |
410 | + | |
411 | + return oid_obj; | |
412 | + } | |
413 | + | |
414 | +static CONF *load_config_file(const char *configfile) | |
415 | + { | |
416 | + CONF *conf = NULL; | |
417 | + long errorline = -1; | |
418 | + | |
419 | + if (!configfile) configfile = getenv("OPENSSL_CONF"); | |
420 | + if (!configfile) configfile = getenv("SSLEAY_CONF"); | |
421 | + | |
422 | + if (configfile && | |
423 | + (!(conf = NCONF_new(NULL)) || | |
424 | + NCONF_load(conf, configfile, &errorline) <= 0)) | |
425 | + { | |
426 | + if (errorline <= 0) | |
427 | + BIO_printf(bio_err, "error loading the config file " | |
428 | + "'%s'\n", configfile); | |
429 | + else | |
430 | + BIO_printf(bio_err, "error on line %ld of config file " | |
431 | + "'%s'\n", errorline, configfile); | |
432 | + } | |
433 | + | |
434 | + if (conf != NULL) | |
435 | + { | |
436 | + const char *p; | |
437 | + | |
438 | + BIO_printf(bio_err,"Using configuration from %s\n", configfile); | |
439 | + p = NCONF_get_string(conf, NULL, ENV_OID_FILE); | |
440 | + if (p != NULL) | |
441 | + { | |
442 | + BIO *oid_bio = BIO_new_file(p, "r"); | |
443 | + if (!oid_bio) | |
444 | + ERR_print_errors(bio_err); | |
445 | + else | |
446 | + { | |
447 | + OBJ_create_objects(oid_bio); | |
448 | + BIO_free_all(oid_bio); | |
449 | + } | |
450 | + } | |
451 | + else | |
452 | + ERR_clear_error(); | |
453 | + if(!add_oid_section(bio_err, conf)) | |
454 | + ERR_print_errors(bio_err); | |
455 | + } | |
456 | + return conf; | |
457 | + } | |
458 | + | |
459 | +/* | |
460 | + * Query-related method definitions. | |
461 | + */ | |
462 | + | |
463 | +static int query_command(const char *data, char *digest, const EVP_MD *md, | |
464 | + const char *policy, int no_nonce, | |
465 | + int cert, const char *in, const char *out, int text) | |
466 | + { | |
467 | + int ret = 0; | |
468 | + TS_REQ *query = NULL; | |
469 | + BIO *in_bio = NULL; | |
470 | + BIO *data_bio = NULL; | |
471 | + BIO *out_bio = NULL; | |
472 | + | |
473 | + /* Build query object either from file or from scratch. */ | |
474 | + if (in != NULL) | |
475 | + { | |
476 | + if ((in_bio = BIO_new_file(in, "rb")) == NULL) goto end; | |
477 | + query = d2i_TS_REQ_bio(in_bio, NULL); | |
478 | + } | |
479 | + else | |
480 | + { | |
481 | + /* Open the file if no explicit digest bytes were specified. */ | |
482 | + if (!digest | |
483 | + && !(data_bio = BIO_open_with_default(data, "rb", stdin))) | |
484 | + goto end; | |
485 | + /* Creating the query object. */ | |
486 | + query = create_query(data_bio, digest, md, | |
487 | + policy, no_nonce, cert); | |
488 | + /* Saving the random number generator state. */ | |
489 | + } | |
490 | + if (query == NULL) goto end; | |
491 | + | |
492 | + /* Write query either in ASN.1 or in text format. */ | |
493 | + if ((out_bio = BIO_open_with_default(out, "wb", stdout)) == NULL) | |
494 | + goto end; | |
495 | + if (text) | |
496 | + { | |
497 | + /* Text output. */ | |
498 | + if (!TS_REQ_print_bio(out_bio, query)) | |
499 | + goto end; | |
500 | + } | |
501 | + else | |
502 | + { | |
503 | + /* ASN.1 output. */ | |
504 | + if (!i2d_TS_REQ_bio(out_bio, query)) | |
505 | + goto end; | |
506 | + } | |
507 | + | |
508 | + ret = 1; | |
509 | + | |
510 | + end: | |
511 | + ERR_print_errors(bio_err); | |
512 | + | |
513 | + /* Clean up. */ | |
514 | + BIO_free_all(in_bio); | |
515 | + BIO_free_all(data_bio); | |
516 | + BIO_free_all(out_bio); | |
517 | + TS_REQ_free(query); | |
518 | + | |
519 | + return ret; | |
520 | + } | |
521 | + | |
522 | +static BIO *BIO_open_with_default(const char *file, const char *mode, | |
523 | + FILE *default_fp) | |
524 | + { | |
525 | + return file == NULL ? | |
526 | + BIO_new_fp(default_fp, BIO_NOCLOSE) | |
527 | + : BIO_new_file(file, mode); | |
528 | + } | |
529 | + | |
530 | +static TS_REQ *create_query(BIO *data_bio, char *digest, const EVP_MD *md, | |
531 | + const char *policy, int no_nonce, int cert) | |
532 | + { | |
533 | + int ret = 0; | |
534 | + TS_REQ *ts_req = NULL; | |
535 | + int len; | |
536 | + TS_MSG_IMPRINT *msg_imprint = NULL; | |
537 | + X509_ALGOR *algo = NULL; | |
538 | + unsigned char *data = NULL; | |
539 | + ASN1_OBJECT *policy_obj = NULL; | |
540 | + ASN1_INTEGER *nonce_asn1 = NULL; | |
541 | + | |
542 | + /* Setting default message digest. */ | |
543 | + if (!md && !(md = EVP_get_digestbyname("sha1"))) goto err; | |
544 | + | |
545 | + /* Creating request object. */ | |
546 | + if (!(ts_req = TS_REQ_new())) goto err; | |
547 | + | |
548 | + /* Setting version. */ | |
549 | + if (!TS_REQ_set_version(ts_req, 1)) goto err; | |
550 | + | |
551 | + /* Creating and adding MSG_IMPRINT object. */ | |
552 | + if (!(msg_imprint = TS_MSG_IMPRINT_new())) goto err; | |
553 | + | |
554 | + /* Adding algorithm. */ | |
555 | + if (!(algo = X509_ALGOR_new())) goto err; | |
556 | + if (!(algo->algorithm = OBJ_nid2obj(EVP_MD_type(md)))) goto err; | |
557 | + if (!(algo->parameter = ASN1_TYPE_new())) goto err; | |
558 | + algo->parameter->type = V_ASN1_NULL; | |
559 | + if (!TS_MSG_IMPRINT_set_algo(msg_imprint, algo)) goto err; | |
560 | + | |
561 | + /* Adding message digest. */ | |
562 | + if ((len = create_digest(data_bio, digest, md, &data)) == 0) | |
563 | + goto err; | |
564 | + if (!TS_MSG_IMPRINT_set_msg(msg_imprint, data, len)) goto err; | |
565 | + | |
566 | + if (!TS_REQ_set_msg_imprint(ts_req, msg_imprint)) goto err; | |
567 | + | |
568 | + /* Setting policy if requested. */ | |
569 | + if (policy && !(policy_obj = txt2obj(policy))) goto err; | |
570 | + if (policy_obj && !TS_REQ_set_policy_id(ts_req, policy_obj)) goto err; | |
571 | + | |
572 | + /* Setting nonce if requested. */ | |
573 | + if (!no_nonce && !(nonce_asn1 = create_nonce(NONCE_LENGTH))) goto err; | |
574 | + if (nonce_asn1 && !TS_REQ_set_nonce(ts_req, nonce_asn1)) goto err; | |
575 | + | |
576 | + /* Setting certificate request flag if requested. */ | |
577 | + if (!TS_REQ_set_cert_req(ts_req, cert)) goto err; | |
578 | + | |
579 | + ret = 1; | |
580 | + err: | |
581 | + if (!ret) | |
582 | + { | |
583 | + TS_REQ_free(ts_req); | |
584 | + ts_req = NULL; | |
585 | + BIO_printf(bio_err, "could not create query\n"); | |
586 | + } | |
587 | + TS_MSG_IMPRINT_free(msg_imprint); | |
588 | + X509_ALGOR_free(algo); | |
589 | + OPENSSL_free(data); | |
590 | + ASN1_OBJECT_free(policy_obj); | |
591 | + ASN1_INTEGER_free(nonce_asn1); | |
592 | + return ts_req; | |
593 | + } | |
594 | + | |
595 | +static int create_digest(BIO *input, char *digest, const EVP_MD *md, | |
596 | + unsigned char **md_value) | |
597 | + { | |
598 | + int md_value_len; | |
599 | + | |
600 | + md_value_len = EVP_MD_size(md); | |
601 | + if (md_value_len < 0) | |
602 | + goto err; | |
603 | + if (input) | |
604 | + { | |
605 | + /* Digest must be computed from an input file. */ | |
606 | + EVP_MD_CTX md_ctx; | |
607 | + unsigned char buffer[4096]; | |
608 | + int length; | |
609 | + | |
610 | + *md_value = OPENSSL_malloc(md_value_len); | |
611 | + if (*md_value == 0) goto err; | |
612 | + | |
613 | + EVP_DigestInit(&md_ctx, md); | |
614 | + while ((length = BIO_read(input, buffer, sizeof(buffer))) > 0) | |
615 | + { | |
616 | + EVP_DigestUpdate(&md_ctx, buffer, length); | |
617 | + } | |
618 | + EVP_DigestFinal(&md_ctx, *md_value, NULL); | |
619 | + } | |
620 | + else | |
621 | + { | |
622 | + /* Digest bytes are specified with digest. */ | |
623 | + long digest_len; | |
624 | + *md_value = string_to_hex(digest, &digest_len); | |
625 | + if (!*md_value || md_value_len != digest_len) | |
626 | + { | |
627 | + OPENSSL_free(*md_value); | |
628 | + *md_value = NULL; | |
629 | + BIO_printf(bio_err, "bad digest, %d bytes " | |
630 | + "must be specified\n", md_value_len); | |
631 | + goto err; | |
632 | + } | |
633 | + } | |
634 | + | |
635 | + return md_value_len; | |
636 | + err: | |
637 | + return 0; | |
638 | + } | |
639 | + | |
640 | +static ASN1_INTEGER *create_nonce(int bits) | |
641 | + { | |
642 | + unsigned char buf[20]; | |
643 | + ASN1_INTEGER *nonce = NULL; | |
644 | + int len = (bits - 1) / 8 + 1; | |
645 | + int i; | |
646 | + | |
647 | + /* Generating random byte sequence. */ | |
648 | + if (len > (int)sizeof(buf)) goto err; | |
649 | + if (!RAND_bytes(buf, len)) goto err; | |
650 | + | |
651 | + /* Find the first non-zero byte and creating ASN1_INTEGER object. */ | |
652 | + for (i = 0; i < len && !buf[i]; ++i); | |
653 | + if (!(nonce = ASN1_INTEGER_new())) goto err; | |
654 | + OPENSSL_free(nonce->data); | |
655 | + /* Allocate at least one byte. */ | |
656 | + nonce->length = len - i; | |
657 | + if (!(nonce->data = OPENSSL_malloc(nonce->length + 1))) goto err; | |
658 | + memcpy(nonce->data, buf + i, nonce->length); | |
659 | + | |
660 | + return nonce; | |
661 | + err: | |
662 | + BIO_printf(bio_err, "could not create nonce\n"); | |
663 | + ASN1_INTEGER_free(nonce); | |
664 | + return NULL; | |
665 | + } | |
666 | +/* | |
667 | + * Reply-related method definitions. | |
668 | + */ | |
669 | + | |
670 | +static int reply_command(CONF *conf, char *section, char *engine, | |
671 | + char *queryfile, char *passin, char *inkey, | |
672 | + char *signer, char *chain, const char *policy, | |
673 | + char *in, int token_in, | |
674 | + char *out, int token_out, int text) | |
675 | + { | |
676 | + int ret = 0; | |
677 | + TS_RESP *response = NULL; | |
678 | + BIO *in_bio = NULL; | |
679 | + BIO *query_bio = NULL; | |
680 | + BIO *inkey_bio = NULL; | |
681 | + BIO *signer_bio = NULL; | |
682 | + BIO *out_bio = NULL; | |
683 | + | |
684 | + /* Build response object either from response or query. */ | |
685 | + if (in != NULL) | |
686 | + { | |
687 | + if ((in_bio = BIO_new_file(in, "rb")) == NULL) goto end; | |
688 | + if (token_in) | |
689 | + { | |
690 | + /* We have a ContentInfo (PKCS7) object, add | |
691 | + 'granted' status info around it. */ | |
692 | + response = read_PKCS7(in_bio); | |
693 | + } | |
694 | + else | |
695 | + { | |
696 | + /* We have a ready-made TS_RESP object. */ | |
697 | + response = d2i_TS_RESP_bio(in_bio, NULL); | |
698 | + } | |
699 | + } | |
700 | + else | |
701 | + { | |
702 | + response = create_response(conf, section, engine, queryfile, | |
703 | + passin, inkey, signer, chain, | |
704 | + policy); | |
705 | + if (response) | |
706 | + BIO_printf(bio_err, "Response has been generated.\n"); | |
707 | + else | |
708 | + BIO_printf(bio_err, "Response is not generated.\n"); | |
709 | + } | |
710 | + if (response == NULL) goto end; | |
711 | + | |
712 | + /* Write response either in ASN.1 or text format. */ | |
713 | + if ((out_bio = BIO_open_with_default(out, "wb", stdout)) == NULL) | |
714 | + goto end; | |
715 | + if (text) | |
716 | + { | |
717 | + /* Text output. */ | |
718 | + if (token_out) | |
719 | + { | |
720 | + TS_TST_INFO *tst_info = TS_RESP_get_tst_info(response); | |
721 | + if (!TS_TST_INFO_print_bio(out_bio, tst_info)) goto end; | |
722 | + } | |
723 | + else | |
724 | + { | |
725 | + if (!TS_RESP_print_bio(out_bio, response)) goto end; | |
726 | + } | |
727 | + } | |
728 | + else | |
729 | + { | |
730 | + /* ASN.1 DER output. */ | |
731 | + if (token_out) | |
732 | + { | |
733 | + PKCS7 *token = TS_RESP_get_token(response); | |
734 | + if (!i2d_PKCS7_bio(out_bio, token)) goto end; | |
735 | + } | |
736 | + else | |
737 | + { | |
738 | + if (!i2d_TS_RESP_bio(out_bio, response)) goto end; | |
739 | + } | |
740 | + } | |
741 | + | |
742 | + ret = 1; | |
743 | + | |
744 | + end: | |
745 | + ERR_print_errors(bio_err); | |
746 | + | |
747 | + /* Clean up. */ | |
748 | + BIO_free_all(in_bio); | |
749 | + BIO_free_all(query_bio); | |
750 | + BIO_free_all(inkey_bio); | |
751 | + BIO_free_all(signer_bio); | |
752 | + BIO_free_all(out_bio); | |
753 | + TS_RESP_free(response); | |
754 | + | |
755 | + return ret; | |
756 | + } | |
757 | + | |
758 | +/* Reads a PKCS7 token and adds default 'granted' status info to it. */ | |
759 | +static TS_RESP *read_PKCS7(BIO *in_bio) | |
760 | + { | |
761 | + int ret = 0; | |
762 | + PKCS7 *token = NULL; | |
763 | + TS_TST_INFO *tst_info = NULL; | |
764 | + TS_RESP *resp = NULL; | |
765 | + TS_STATUS_INFO *si = NULL; | |
766 | + | |
767 | + /* Read PKCS7 object and extract the signed time stamp info. */ | |
768 | + if (!(token = d2i_PKCS7_bio(in_bio, NULL))) goto end; | |
769 | + if (!(tst_info = PKCS7_to_TS_TST_INFO(token))) goto end; | |
770 | + | |
771 | + /* Creating response object. */ | |
772 | + if (!(resp = TS_RESP_new())) goto end; | |
773 | + | |
774 | + /* Create granted status info. */ | |
775 | + if (!(si = TS_STATUS_INFO_new())) goto end; | |
776 | + if (!(ASN1_INTEGER_set(si->status, TS_STATUS_GRANTED))) goto end; | |
777 | + if (!TS_RESP_set_status_info(resp, si)) goto end; | |
778 | + | |
779 | + /* Setting encapsulated token. */ | |
780 | + TS_RESP_set_tst_info(resp, token, tst_info); | |
781 | + token = NULL; /* Ownership is lost. */ | |
782 | + tst_info = NULL; /* Ownership is lost. */ | |
783 | + | |
784 | + ret = 1; | |
785 | + end: | |
786 | + PKCS7_free(token); | |
787 | + TS_TST_INFO_free(tst_info); | |
788 | + if (!ret) | |
789 | + { | |
790 | + TS_RESP_free(resp); | |
791 | + resp = NULL; | |
792 | + } | |
793 | + TS_STATUS_INFO_free(si); | |
794 | + return resp; | |
795 | + } | |
796 | + | |
797 | +static TS_RESP *create_response(CONF *conf, const char *section, char *engine, | |
798 | + char *queryfile, char *passin, char *inkey, | |
799 | + char *signer, char *chain, const char *policy) | |
800 | + { | |
801 | + int ret = 0; | |
802 | + TS_RESP *response = NULL; | |
803 | + BIO *query_bio = NULL; | |
804 | + TS_RESP_CTX *resp_ctx = NULL; | |
805 | + | |
806 | + if (!(query_bio = BIO_new_file(queryfile, "rb"))) | |
807 | + goto end; | |
808 | + | |
809 | + /* Getting TSA configuration section. */ | |
810 | + if (!(section = TS_CONF_get_tsa_section(conf, section))) | |
811 | + goto end; | |
812 | + | |
813 | + /* Setting up response generation context. */ | |
814 | + if (!(resp_ctx = TS_RESP_CTX_new())) goto end; | |
815 | + | |
816 | + /* Setting serial number provider callback. */ | |
817 | + if (!TS_CONF_set_serial(conf, section, serial_cb, resp_ctx)) goto end; | |
818 | +#ifndef OPENSSL_NO_ENGINE | |
819 | + /* Setting default OpenSSL engine. */ | |
820 | + if (!TS_CONF_set_crypto_device(conf, section, engine)) goto end; | |
821 | +#endif | |
822 | + | |
823 | + /* Setting TSA signer certificate. */ | |
824 | + if (!TS_CONF_set_signer_cert(conf, section, signer, resp_ctx)) goto end; | |
825 | + | |
826 | + /* Setting TSA signer certificate chain. */ | |
827 | + if (!TS_CONF_set_certs(conf, section, chain, resp_ctx)) goto end; | |
828 | + | |
829 | + /* Setting TSA signer private key. */ | |
830 | + if (!TS_CONF_set_signer_key(conf, section, inkey, passin, resp_ctx)) | |
831 | + goto end; | |
832 | + | |
833 | + /* Setting default policy OID. */ | |
834 | + if (!TS_CONF_set_def_policy(conf, section, policy, resp_ctx)) goto end; | |
835 | + | |
836 | + /* Setting acceptable policy OIDs. */ | |
837 | + if (!TS_CONF_set_policies(conf, section, resp_ctx)) goto end; | |
838 | + | |
839 | + /* Setting the acceptable one-way hash algorithms. */ | |
840 | + if (!TS_CONF_set_digests(conf, section, resp_ctx)) goto end; | |
841 | + | |
842 | + /* Setting guaranteed time stamp accuracy. */ | |
843 | + if (!TS_CONF_set_accuracy(conf, section, resp_ctx)) goto end; | |
844 | + | |
845 | + /* Setting the precision of the time. */ | |
846 | + if (!TS_CONF_set_clock_precision_digits(conf, section, resp_ctx)) | |
847 | + goto end; | |
848 | + | |
849 | + /* Setting the ordering flaf if requested. */ | |
850 | + if (!TS_CONF_set_ordering(conf, section, resp_ctx)) goto end; | |
851 | + | |
852 | + /* Setting the TSA name required flag if requested. */ | |
853 | + if (!TS_CONF_set_tsa_name(conf, section, resp_ctx)) goto end; | |
854 | + | |
855 | + /* Setting the ESS cert id chain flag if requested. */ | |
856 | + if (!TS_CONF_set_ess_cert_id_chain(conf, section, resp_ctx)) goto end; | |
857 | + | |
858 | + /* Creating the response. */ | |
859 | + if (!(response = TS_RESP_create_response(resp_ctx, query_bio))) | |
860 | + goto end; | |
861 | + | |
862 | + ret = 1; | |
863 | + end: | |
864 | + if (!ret) | |
865 | + { | |
866 | + TS_RESP_free(response); | |
867 | + response = NULL; | |
868 | + } | |
869 | + TS_RESP_CTX_free(resp_ctx); | |
870 | + BIO_free_all(query_bio); | |
871 | + | |
872 | + return response; | |
873 | + } | |
874 | + | |
875 | +static ASN1_INTEGER * MS_CALLBACK serial_cb(TS_RESP_CTX *ctx, void *data) | |
876 | + { | |
877 | + const char *serial_file = (const char *) data; | |
878 | + ASN1_INTEGER *serial = next_serial(serial_file); | |
879 | + | |
880 | + if (!serial) | |
881 | + { | |
882 | + TS_RESP_CTX_set_status_info(ctx, TS_STATUS_REJECTION, | |
883 | + "Error during serial number " | |
884 | + "generation."); | |
885 | + TS_RESP_CTX_add_failure_info(ctx, | |
886 | + TS_INFO_ADD_INFO_NOT_AVAILABLE); | |
887 | + } | |
888 | + else | |
889 | + save_ts_serial(serial_file, serial); | |
890 | + | |
891 | + return serial; | |
892 | + } | |
893 | + | |
894 | +static ASN1_INTEGER *next_serial(const char *serialfile) | |
895 | + { | |
896 | + int ret = 0; | |
897 | + BIO *in = NULL; | |
898 | + ASN1_INTEGER *serial = NULL; | |
899 | + BIGNUM *bn = NULL; | |
900 | + | |
901 | + if (!(serial = ASN1_INTEGER_new())) goto err; | |
902 | + | |
903 | + if (!(in = BIO_new_file(serialfile, "r"))) | |
904 | + { | |
905 | + ERR_clear_error(); | |
906 | + BIO_printf(bio_err, "Warning: could not open file %s for " | |
907 | + "reading, using serial number: 1\n", serialfile); | |
908 | + if (!ASN1_INTEGER_set(serial, 1)) goto err; | |
909 | + } | |
910 | + else | |
911 | + { | |
912 | + char buf[1024]; | |
913 | + if (!a2i_ASN1_INTEGER(in, serial, buf, sizeof(buf))) | |
914 | + { | |
915 | + BIO_printf(bio_err, "unable to load number from %s\n", | |
916 | + serialfile); | |
917 | + goto err; | |
918 | + } | |
919 | + if (!(bn = ASN1_INTEGER_to_BN(serial, NULL))) goto err; | |
920 | + ASN1_INTEGER_free(serial); | |
921 | + serial = NULL; | |
922 | + if (!BN_add_word(bn, 1)) goto err; | |
923 | + if (!(serial = BN_to_ASN1_INTEGER(bn, NULL))) goto err; | |
924 | + } | |
925 | + ret = 1; | |
926 | + err: | |
927 | + if (!ret) | |
928 | + { | |
929 | + ASN1_INTEGER_free(serial); | |
930 | + serial = NULL; | |
931 | + } | |
932 | + BIO_free_all(in); | |
933 | + BN_free(bn); | |
934 | + return serial; | |
935 | + } | |
936 | + | |
937 | +static int save_ts_serial(const char *serialfile, ASN1_INTEGER *serial) | |
938 | + { | |
939 | + int ret = 0; | |
940 | + BIO *out = NULL; | |
941 | + | |
942 | + if (!(out = BIO_new_file(serialfile, "w"))) goto err; | |
943 | + if (i2a_ASN1_INTEGER(out, serial) <= 0) goto err; | |
944 | + if (BIO_puts(out, "\n") <= 0) goto err; | |
945 | + ret = 1; | |
946 | + err: | |
947 | + if (!ret) | |
948 | + BIO_printf(bio_err, "could not save serial number to %s\n", | |
949 | + serialfile); | |
950 | + BIO_free_all(out); | |
951 | + return ret; | |
952 | + } | |
953 | + | |
954 | +/* | |
955 | + * Verify-related method definitions. | |
956 | + */ | |
957 | + | |
958 | +static int verify_command(char *data, char *digest, char *queryfile, | |
959 | + char *in, int token_in, | |
960 | + char *ca_path, char *ca_file, char *untrusted) | |
961 | + { | |
962 | + BIO *in_bio = NULL; | |
963 | + PKCS7 *token = NULL; | |
964 | + TS_RESP *response = NULL; | |
965 | + TS_VERIFY_CTX *verify_ctx = NULL; | |
966 | + int ret = 0; | |
967 | + | |
968 | + /* Decode the token (PKCS7) or response (TS_RESP) files. */ | |
969 | + if (!(in_bio = BIO_new_file(in, "rb"))) goto end; | |
970 | + if (token_in) | |
971 | + { | |
972 | + if (!(token = d2i_PKCS7_bio(in_bio, NULL))) goto end; | |
973 | + } | |
974 | + else | |
975 | + { | |
976 | + if (!(response = d2i_TS_RESP_bio(in_bio, NULL))) goto end; | |
977 | + } | |
978 | + | |
979 | + if (!(verify_ctx = create_verify_ctx(data, digest, queryfile, | |
980 | + ca_path, ca_file, untrusted))) | |
981 | + goto end; | |
982 | + | |
983 | + /* Checking the token or response against the request. */ | |
984 | + ret = token_in ? | |
985 | + TS_RESP_verify_token(verify_ctx, token) : | |
986 | + TS_RESP_verify_response(verify_ctx, response); | |
987 | + | |
988 | + end: | |
989 | + printf("Verification: "); | |
990 | + if (ret) | |
991 | + printf("OK\n"); | |
992 | + else | |
993 | + { | |
994 | + printf("FAILED\n"); | |
995 | + /* Print errors, if there are any. */ | |
996 | + ERR_print_errors(bio_err); | |
997 | + } | |
998 | + | |
999 | + /* Clean up. */ | |
1000 | + BIO_free_all(in_bio); | |
1001 | + PKCS7_free(token); | |
1002 | + TS_RESP_free(response); | |
1003 | + TS_VERIFY_CTX_free(verify_ctx); | |
1004 | + return ret; | |
1005 | + } | |
1006 | + | |
1007 | +static TS_VERIFY_CTX *create_verify_ctx(char *data, char *digest, | |
1008 | + char *queryfile, | |
1009 | + char *ca_path, char *ca_file, | |
1010 | + char *untrusted) | |
1011 | + { | |
1012 | + TS_VERIFY_CTX *ctx = NULL; | |
1013 | + BIO *input = NULL; | |
1014 | + TS_REQ *request = NULL; | |
1015 | + int ret = 0; | |
1016 | + | |
1017 | + if (data != NULL || digest != NULL) | |
1018 | + { | |
1019 | + if (!(ctx = TS_VERIFY_CTX_new())) goto err; | |
1020 | + ctx->flags = TS_VFY_VERSION | TS_VFY_SIGNER; | |
1021 | + if (data != NULL) | |
1022 | + { | |
1023 | + ctx->flags |= TS_VFY_DATA; | |
1024 | + if (!(ctx->data = BIO_new_file(data, "rb"))) goto err; | |
1025 | + } | |
1026 | + else if (digest != NULL) | |
1027 | + { | |
1028 | + long imprint_len; | |
1029 | + ctx->flags |= TS_VFY_IMPRINT; | |
1030 | + if (!(ctx->imprint = string_to_hex(digest, | |
1031 | + &imprint_len))) | |
1032 | + { | |
1033 | + BIO_printf(bio_err, "invalid digest string\n"); | |
1034 | + goto err; | |
1035 | + } | |
1036 | + ctx->imprint_len = imprint_len; | |
1037 | + } | |
1038 | + | |
1039 | + } | |
1040 | + else if (queryfile != NULL) | |
1041 | + { | |
1042 | + /* The request has just to be read, decoded and converted to | |
1043 | + a verify context object. */ | |
1044 | + if (!(input = BIO_new_file(queryfile, "rb"))) goto err; | |
1045 | + if (!(request = d2i_TS_REQ_bio(input, NULL))) goto err; | |
1046 | + if (!(ctx = TS_REQ_to_TS_VERIFY_CTX(request, NULL))) goto err; | |
1047 | + } | |
1048 | + else | |
1049 | + return NULL; | |
1050 | + | |
1051 | + /* Add the signature verification flag and arguments. */ | |
1052 | + ctx->flags |= TS_VFY_SIGNATURE; | |
1053 | + | |
1054 | + /* Initialising the X509_STORE object. */ | |
1055 | + if (!(ctx->store = create_cert_store(ca_path, ca_file))) goto err; | |
1056 | + | |
1057 | + /* Loading untrusted certificates. */ | |
1058 | + if (untrusted && !(ctx->certs = TS_CONF_load_certs(untrusted))) | |
1059 | + goto err; | |
1060 | + | |
1061 | + ret = 1; | |
1062 | + err: | |
1063 | + if (!ret) | |
1064 | + { | |
1065 | + TS_VERIFY_CTX_free(ctx); | |
1066 | + ctx = NULL; | |
1067 | + } | |
1068 | + BIO_free_all(input); | |
1069 | + TS_REQ_free(request); | |
1070 | + return ctx; | |
1071 | + } | |
1072 | + | |
1073 | +static X509_STORE *create_cert_store(char *ca_path, char *ca_file) | |
1074 | + { | |
1075 | + X509_STORE *cert_ctx = NULL; | |
1076 | + X509_LOOKUP *lookup = NULL; | |
1077 | + int i; | |
1078 | + | |
1079 | + /* Creating the X509_STORE object. */ | |
1080 | + cert_ctx = X509_STORE_new(); | |
1081 | + | |
1082 | + /* Setting the callback for certificate chain verification. */ | |
1083 | + X509_STORE_set_verify_cb_func(cert_ctx, verify_cb); | |
1084 | + | |
1085 | + /* Adding a trusted certificate directory source. */ | |
1086 | + if (ca_path) | |
1087 | + { | |
1088 | + lookup = X509_STORE_add_lookup(cert_ctx, | |
1089 | + X509_LOOKUP_hash_dir()); | |
1090 | + if (lookup == NULL) | |
1091 | + { | |
1092 | + BIO_printf(bio_err, "memory allocation failure\n"); | |
1093 | + goto err; | |
1094 | + } | |
1095 | + i = X509_LOOKUP_add_dir(lookup, ca_path, X509_FILETYPE_PEM); | |
1096 | + if (!i) | |
1097 | + { | |
1098 | + BIO_printf(bio_err, "Error loading directory %s\n", | |
1099 | + ca_path); | |
1100 | + goto err; | |
1101 | + } | |
1102 | + } | |
1103 | + | |
1104 | + /* Adding a trusted certificate file source. */ | |
1105 | + if (ca_file) | |
1106 | + { | |
1107 | + lookup = X509_STORE_add_lookup(cert_ctx, X509_LOOKUP_file()); | |
1108 | + if (lookup == NULL) | |
1109 | + { | |
1110 | + BIO_printf(bio_err, "memory allocation failure\n"); | |
1111 | + goto err; | |
1112 | + } | |
1113 | + i = X509_LOOKUP_load_file(lookup, ca_file, X509_FILETYPE_PEM); | |
1114 | + if (!i) | |
1115 | + { | |
1116 | + BIO_printf(bio_err, "Error loading file %s\n", ca_file); | |
1117 | + goto err; | |
1118 | + } | |
1119 | + } | |
1120 | + | |
1121 | + return cert_ctx; | |
1122 | + err: | |
1123 | + X509_STORE_free(cert_ctx); | |
1124 | + return NULL; | |
1125 | + } | |
1126 | + | |
1127 | +static int MS_CALLBACK verify_cb(int ok, X509_STORE_CTX *ctx) | |
1128 | + { | |
1129 | + /* | |
1130 | + char buf[256]; | |
1131 | + | |
1132 | + if (!ok) | |
1133 | + { | |
1134 | + X509_NAME_oneline(X509_get_subject_name(ctx->current_cert), | |
1135 | + buf, sizeof(buf)); | |
1136 | + printf("%s\n", buf); | |
1137 | + printf("error %d at %d depth lookup: %s\n", | |
1138 | + ctx->error, ctx->error_depth, | |
1139 | + X509_verify_cert_error_string(ctx->error)); | |
1140 | + } | |
1141 | + */ | |
1142 | + | |
1143 | + return ok; | |
1144 | + } |
@@ -0,0 +1,195 @@ | ||
1 | +#!/usr/bin/perl -w | |
2 | +# Written by Zoltan Glozik <zglozik@stones.com>. | |
3 | +# Copyright (c) 2002 The OpenTSA Project. All rights reserved. | |
4 | +$::version = '$Id: tsget,v 1.1 2006/02/12 23:11:21 ulf Exp $'; | |
5 | + | |
6 | +use strict; | |
7 | +use IO::Handle; | |
8 | +use Getopt::Std; | |
9 | +use File::Basename; | |
10 | +use WWW::Curl::easy; | |
11 | + | |
12 | +use vars qw(%options); | |
13 | + | |
# Read callback: hands out the request body chunk by chunk.
#   $maxlength - largest number of bytes accepted in this call
#   $state     - hash ref with 'data' (reference to the body string) and
#                'bytes' (offset of the first byte not yet handed out)
# Returns the next chunk, or "" once the whole body has been consumed.
sub read_body {
    my ($maxlength, $state) = @_;

    my $offset = $state->{bytes};
    my $remaining = length(${$state->{data}}) - $offset;
    return "" unless $remaining > 0;

    # Never hand out more than the caller asked for.
    my $chunk_len = $remaining > $maxlength ? $maxlength : $remaining;
    $state->{bytes} = $offset + $chunk_len;
    return substr(${$state->{data}}, $offset, $chunk_len);
}
27 | + | |
# Write callback: appends a received chunk to a caller-owned scalar.
#   $data    - the chunk just received
#   $pointer - reference to the scalar that accumulates the body
# Returns the length of the chunk, i.e. everything was consumed.
sub write_body {
    my ($chunk, $sink) = @_;
    my $consumed = length($chunk);
    $$sink .= $chunk;
    return $consumed;
}
34 | + | |
# Builds a Curl handle configured for posting time stamp queries.
#   $url - address of the time stamp server
# Optional settings (debug output, client key/certificate, CA locations,
# randomness sources) are taken from the global %options hash.
# Returns the configured handle.
sub create_curl {
    my ($url) = @_;

    my $handle = WWW::Curl::easy::new();

    # Diagnostics and failure reporting.
    if ($options{d}) {
        $handle->setopt(CURLOPT_VERBOSE, 1);
    }
    $handle->setopt(CURLOPT_FAILONERROR, 1);
    # Third word of the RCS id string is the revision number.
    my $agent = "OpenTSA tsget.pl/" . (split / /, $::version)[2];
    $handle->setopt(CURLOPT_USERAGENT, $agent);

    # The query is uploaded as the body of a POST request.
    my @headers = ("Content-Type: application/timestamp-query",
                   "Accept: application/timestamp-reply");
    $handle->setopt(CURLOPT_UPLOAD, 1);
    $handle->setopt(CURLOPT_CUSTOMREQUEST, "POST");
    $handle->setopt(CURLOPT_HTTPHEADER, \@headers);
    $handle->setopt(CURLOPT_READFUNCTION, \&read_body);
    # Response headers are accepted but otherwise ignored.
    $handle->setopt(CURLOPT_HEADERFUNCTION, sub { return length($_[0]); });

    # The reply body is collected by write_body.
    $handle->setopt(CURLOPT_WRITEFUNCTION, \&write_body);

    # SSL: always verify the server; client credentials only on request.
    $handle->setopt(CURLOPT_SSLKEYTYPE, "PEM");
    $handle->setopt(CURLOPT_SSL_VERIFYPEER, 1); # Verify server's certificate.
    $handle->setopt(CURLOPT_SSL_VERIFYHOST, 2); # Check server's CN.
    if (defined($options{k})) { $handle->setopt(CURLOPT_SSLKEY, $options{k}); }
    if (defined($options{p})) { $handle->setopt(CURLOPT_SSLKEYPASSWD, $options{p}); }
    if (defined($options{c})) { $handle->setopt(CURLOPT_SSLCERT, $options{c}); }
    if (defined($options{C})) { $handle->setopt(CURLOPT_CAINFO, $options{C}); }
    if (defined($options{P})) { $handle->setopt(CURLOPT_CAPATH, $options{P}); }
    if (defined($options{r})) { $handle->setopt(CURLOPT_RANDOM_FILE, $options{r}); }
    if (defined($options{g})) { $handle->setopt(CURLOPT_EGDSOCKET, $options{g}); }

    # Destination of the request.
    $handle->setopt(CURLOPT_URL, $url);

    return $handle;
}
76 | + | |
# Sends one request through $curl and returns the reply body.
#   $curl - handle prepared by create_curl
#   $body - reference to the scalar holding the request
# Returns the list ($ts_body, $error_string); $error_string stays undef
# when the transfer succeeded and the reply has the expected content type.
sub get_timestamp {
    my $curl = shift;
    my $body = shift;
    my $ts_body;
    local $::error_buf;

    # Error-handling related options.
    $curl->setopt(CURLOPT_ERRORBUFFER, "::error_buf");

    # Options for POST method: the hash carries the 'data'/'bytes' state
    # that read_body works on.
    $curl->setopt(CURLOPT_INFILE, {data => $body, bytes => 0});
    $curl->setopt(CURLOPT_INFILESIZE, length(${$body}));

    # Options for getting the result.
    $curl->setopt(CURLOPT_FILE, \$ts_body);

    # Send the request...
    my $error_code = $curl->perform();
    my $error_string;
    if ($error_code != 0) {
        my $http_code = $curl->getinfo(CURLINFO_HTTP_CODE);
        $error_string = "could not get timestamp";
        $error_string .= ", http code: $http_code"
            if defined($http_code) && $http_code != 0;
        $error_string .= ", curl code: $error_code";
        $error_string .= " ($::error_buf)" if defined($::error_buf);
    } else {
        my $ct = $curl->getinfo(CURLINFO_CONTENT_TYPE);
        if (!defined($ct)) {
            # No Content-Type in the reply: report it explicitly instead
            # of feeding undef to lc() and into the message below.
            $error_string = "no content type returned";
        } elsif (lc($ct) ne "application/timestamp-reply") {
            $error_string = "unexpected content type returned: $ct";
        }
    }
    return ($ts_body, $error_string);
}
112 | + | |
# Prints the usage summary to STDERR and exits with status 1.
sub usage {
    my @parts = (
        "usage: $0 -h <server_url> [-e <extension>] [-o <output>] ",
        "[-v] [-d] [-k <private_key.pem>] [-p <key_password>] ",
        "[-c <client_cert.pem>] [-C <CA_certs.pem>] [-P <CA_path>] ",
        "[-r <file:file...>] [-g <EGD_socket>] [<request>]...\n",
    );
    print STDERR $_ foreach @parts;
    exit 1;
}
122 | + | |
# ----------------------------------------------------------------------
#   Main program
# ----------------------------------------------------------------------

# Getting command-line options.  Defaults come from the TSGET environment
# variable; the real command line is parsed into the same hash afterwards,
# so options given there replace the defaults.
my $getopt_arg = "h:e:o:vdk:p:c:C:P:r:g:";
if (exists $ENV{TSGET}) {
    my @old_argv = @ARGV;
    # split ' ' skips leading whitespace.  With /\s+/ a value such as
    # " -h url" produced an empty first word, which made getopts stop
    # before it had seen a single option.
    @ARGV = split ' ', $ENV{TSGET};
    getopts($getopt_arg, \%options) or usage();
    @ARGV = @old_argv;
}
getopts($getopt_arg, \%options) or usage();

# Checking argument consistency: a server is mandatory, reading from
# STDIN needs an explicit output, and one output cannot hold several
# replies.
if (!exists($options{h}) || (@ARGV == 0 && !exists($options{o}))
    || (@ARGV > 1 && exists($options{o}))) {
    print STDERR "Inconsistent command line options.\n";
    usage();
}
# Setting defaults.
@ARGV = ("-") unless @ARGV != 0;
$options{e} = ".tsr" unless defined($options{e});

# Processing requests.
my $curl = create_curl($options{h});
undef $/;   # For reading whole files.
REQUEST: foreach my $input (@ARGV) {
    # Default output name: the input name with its last extension
    # replaced by the reply extension, in the same directory.
    my ($base, $path) = fileparse($input, '\.[^.]*');
    my $output_base = $base . $options{e};
    my $output = defined($options{o}) ? $options{o} : $path . $output_base;

    STDERR->printflush("$input: ") if $options{v};
    # Read request.  Requests and replies are binary, so every stream is
    # switched to binary mode to keep the bytes intact on platforms that
    # translate line endings.
    my $body;
    if ($input eq "-") {
        # Read the request from STDIN;
        binmode STDIN;
        $body = <STDIN>;
    } else {
        # Read the request from file.  Three-argument open keeps unusual
        # characters in the file name from being read as an open mode.
        open my $in, "<", $input
            or warn("$input: could not open input file: $!\n"), next REQUEST;
        binmode $in;
        $body = <$in>;
        close $in
            or warn("$input: could not close input file: $!\n"), next REQUEST;
    }

    # Send request.
    STDERR->printflush("sending request") if $options{v};

    my ($ts_body, $error) = get_timestamp($curl, \$body);
    if (defined($error)) {
        die "$input: fatal error: $error\n";
    }
    STDERR->printflush(", reply received") if $options{v};

    # Write response.
    if ($output eq "-") {
        # Write to STDOUT.
        binmode STDOUT;
        print $ts_body;
    } else {
        # Write to file.
        open my $out, ">", $output
            or warn("$output: could not open output file: $!\n"), next REQUEST;
        binmode $out;
        print $out $ts_body;
        close $out
            or warn("$output: could not close output file: $!\n"), next REQUEST;
    }
    STDERR->printflush(", $output written.\n") if $options{v};
}
$curl->cleanup();
WWW::Curl::easy::global_cleanup();
@@ -0,0 +1,1063 @@ | ||
1 | +/* crypto/aes/aes_core.c -*- mode:C; c-file-style: "eay" -*- */ | |
2 | +/** | |
3 | + * rijndael-alg-fst.c | |
4 | + * | |
5 | + * @version 3.0 (December 2000) | |
6 | + * | |
7 | + * Optimised ANSI C code for the Rijndael cipher (now AES) | |
8 | + * | |
9 | + * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be> | |
10 | + * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be> | |
11 | + * @author Paulo Barreto <paulo.barreto@terra.com.br> | |
12 | + * | |
13 | + * This code is hereby placed in the public domain. | |
14 | + * | |
15 | + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS | |
16 | + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
17 | + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
18 | + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE | |
19 | + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
20 | + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
21 | + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR | |
22 | + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, | |
23 | + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE | |
24 | + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, | |
25 | + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
26 | + */ | |
27 | + | |
28 | +/* | |
29 | + * This is experimental x86[_64] derivative. It assumes little-endian | |
30 | + * byte order and expects CPU to sustain unaligned memory references. | |
31 | + * It is used as playground for cache-time attack mitigations and | |
32 | + * serves as reference C implementation for x86[_64] assembler. | |
33 | + * | |
34 | + * <appro@fy.chalmers.se> | |
35 | + */ | |
36 | + | |
37 | + | |
38 | +#ifndef AES_DEBUG | |
39 | +# ifndef NDEBUG | |
40 | +# define NDEBUG | |
41 | +# endif | |
42 | +#endif | |
43 | +#include <assert.h> | |
44 | + | |
45 | +#include <stdlib.h> | |
46 | +#include <openssl/aes.h> | |
47 | +#include "aes_locl.h" | |
48 | + | |
49 | +/* | |
50 | + * AES_COMPACT_IN_OUTER_ROUNDS and AES_COMPACT_IN_INNER_ROUNDS control which | |
51 | + * table, 256-byte or 2KB, is referenced in outer and respectively inner rounds. | |
52 | + */ | |
53 | +#define AES_COMPACT_IN_OUTER_ROUNDS | |
54 | +#ifdef AES_COMPACT_IN_OUTER_ROUNDS | |
55 | +/* AES_COMPACT_IN_OUTER_ROUNDS costs ~30% in performance, while | |
56 | + * adding AES_COMPACT_IN_INNER_ROUNDS reduces benchmark *further* | |
57 | + * by factor of ~2, so the inner-round switch is forced off here. */ | |
58 | +# undef AES_COMPACT_IN_INNER_ROUNDS | |
59 | +#endif | |
60 | + | |
#if 1
/*
 * Reads one word out of every 32-byte stride of a 256-byte table so that
 * the whole table is touched before it is used for lookups.
 *
 * The loads go through a volatile pointer and their XOR is stored into a
 * volatile local, so the compiler may not drop them even though the
 * value is never used.
 *
 * table: start of the 256-byte table; it is only read.
 */
static void prefetch256(const void *table)
{
	const volatile unsigned long *t = table;
	volatile unsigned long ret;
	unsigned long sum = 0;
	size_t i;

	/* 32 is common least cache-line size */
	for (i = 0; i < 256 / sizeof(t[0]); i += 32 / sizeof(t[0]))
		sum ^= t[i];

	ret = sum;
}
#else
# define prefetch256(t)
#endif
76 | + | |
/*
 * GETU32(p) loads a u32 directly from address p through a pointer cast,
 * with no byte swapping and no alignment handling.  Any earlier
 * definition is dropped first.  This matches the assumptions stated in
 * the file header (little-endian byte order, CPU tolerates unaligned
 * references).
 * NOTE(review): presumably the definition being replaced comes from
 * aes_locl.h -- confirm.
 */
77 | +#undef GETU32 | |
78 | +#define GETU32(p) (*((u32*)(p))) | |
79 | + | |
/*
 * u64 is the unsigned integer type used for the 64-bit table entries and
 * U64(C) pastes the matching literal suffix onto constant C:
 *   - Windows compilers other than MinGW: unsigned __int64 / UI64
 *   - targets defining __arch64__:        unsigned long    / UL
 *   - everything else:                    unsigned long long / ULL
 * NOTE(review): the __arch64__ branch assumes unsigned long is 64 bits
 * wide on those targets -- confirm.
 */
80 | +#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__) | |
81 | +typedef unsigned __int64 u64; | |
82 | +#define U64(C) C##UI64 | |
83 | +#elif defined(__arch64__) | |
84 | +typedef unsigned long u64; | |
85 | +#define U64(C) C##UL | |
86 | +#else | |
87 | +typedef unsigned long long u64; | |
88 | +#define U64(C) C##ULL | |
89 | +#endif | |
90 | + | |
/*
 * ROTATE(a,n) rotates a left by n bits.
 *   - MSVC and ICC: mapped to the _lrotl intrinsic.
 *   - GCC 2 or later on x86 / x86_64: a statement expression around an
 *     inline "roll" instruction; the "I" constraint means n has to be a
 *     compile-time constant.
 * On any other compiler or target ROTATE is left undefined after the
 * #undef below.
 * NOTE(review): presumably later code tests defined(ROTATE) before using
 * it -- confirm.
 */
91 | +#undef ROTATE | |
92 | +#if defined(_MSC_VER) || defined(__ICC) | |
93 | +# define ROTATE(a,n) _lrotl(a,n) | |
94 | +#elif defined(__GNUC__) && __GNUC__>=2 | |
95 | +# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__) | |
96 | +# define ROTATE(a,n) ({ register unsigned int ret; \ | |
97 | + asm ( \ | |
98 | + "roll %1,%0" \ | |
99 | + : "=r"(ret) \ | |
100 | + : "I"(n), "0"(a) \ | |
101 | + : "cc"); \ | |
102 | + ret; \ | |
103 | + }) | |
104 | +# endif | |
105 | +#endif | |
106 | +/* | |
107 | +Te [x] = S [x].[02, 01, 01, 03, 02, 01, 01, 03]; | |
108 | +Te0[x] = S [x].[02, 01, 01, 03]; | |
109 | +Te1[x] = S [x].[03, 02, 01, 01]; | |
110 | +Te2[x] = S [x].[01, 03, 02, 01]; | |
111 | +Te3[x] = S [x].[01, 01, 03, 02]; | |
112 | +*/ | |
/*
 * Every 64-bit Te entry stores the same 4-byte pattern twice, so a
 * 4-byte value taken at byte offset 0, 3, 2 or 1 inside an entry is the
 * Te0, Te1, Te2 or Te3 word listed above (given the little-endian layout
 * this file assumes).  Te0..Te3 are therefore views of the single Te
 * table shifted by that many bytes.
 * The macros are left without outer parentheses: presumably they are
 * used as TeN[i], where indexing binds tighter than the (u32) cast, so
 * the cast truncates the indexed 64-bit element -- confirm at the call
 * sites.
 * NOTE(review): with a nonzero byte offset the 8-byte element addressed
 * for i == 255 extends past the end of Te; verify this is acceptable.
 */
113 | +#define Te0 (u32)((u64*)((u8*)Te+0)) | |
114 | +#define Te1 (u32)((u64*)((u8*)Te+3)) | |
115 | +#define Te2 (u32)((u64*)((u8*)Te+2)) | |
116 | +#define Te3 (u32)((u64*)((u8*)Te+1)) | |
117 | +/* | |
118 | +Td [x] = Si[x].[0e, 09, 0d, 0b, 0e, 09, 0d, 0b]; | |
119 | +Td0[x] = Si[x].[0e, 09, 0d, 0b]; | |
120 | +Td1[x] = Si[x].[0b, 0e, 09, 0d]; | |
121 | +Td2[x] = Si[x].[0d, 0b, 0e, 09]; | |
122 | +Td3[x] = Si[x].[09, 0d, 0b, 0e]; | |
123 | +Td4[x] = Si[x].[01]; | |
124 | +*/ | |
/*
 * Decryption counterpart of the Te views: every 64-bit Td entry stores
 * its 4-byte pattern twice, and Td0..Td3 address the single Td table
 * shifted by 0, 3, 2 and 1 bytes to obtain the words listed above.
 * The macros are left without outer parentheses: presumably they are
 * used as TdN[i], so that the (u32) cast applies to the indexed 64-bit
 * element -- confirm at the call sites.
 * NOTE(review): with a nonzero byte offset the 8-byte element addressed
 * for i == 255 extends past the end of Td; verify this is acceptable.
 */
125 | +#define Td0 (u32)((u64*)((u8*)Td+0)) | |
126 | +#define Td1 (u32)((u64*)((u8*)Td+3)) | |
127 | +#define Td2 (u32)((u64*)((u8*)Td+2)) | |
128 | +#define Td3 (u32)((u64*)((u8*)Td+1)) | |
129 | + | |
130 | +static const u64 Te[256] = { | |
131 | + U64(0xa56363c6a56363c6), U64(0x847c7cf8847c7cf8), | |
132 | + U64(0x997777ee997777ee), U64(0x8d7b7bf68d7b7bf6), | |
133 | + U64(0x0df2f2ff0df2f2ff), U64(0xbd6b6bd6bd6b6bd6), | |
134 | + U64(0xb16f6fdeb16f6fde), U64(0x54c5c59154c5c591), | |
135 | + U64(0x5030306050303060), U64(0x0301010203010102), | |
136 | + U64(0xa96767cea96767ce), U64(0x7d2b2b567d2b2b56), | |
137 | + U64(0x19fefee719fefee7), U64(0x62d7d7b562d7d7b5), | |
138 | + U64(0xe6abab4de6abab4d), U64(0x9a7676ec9a7676ec), | |
139 | + U64(0x45caca8f45caca8f), U64(0x9d82821f9d82821f), | |
140 | + U64(0x40c9c98940c9c989), U64(0x877d7dfa877d7dfa), | |
141 | + U64(0x15fafaef15fafaef), U64(0xeb5959b2eb5959b2), | |
142 | + U64(0xc947478ec947478e), U64(0x0bf0f0fb0bf0f0fb), | |
143 | + U64(0xecadad41ecadad41), U64(0x67d4d4b367d4d4b3), | |
144 | + U64(0xfda2a25ffda2a25f), U64(0xeaafaf45eaafaf45), | |
145 | + U64(0xbf9c9c23bf9c9c23), U64(0xf7a4a453f7a4a453), | |
146 | + U64(0x967272e4967272e4), U64(0x5bc0c09b5bc0c09b), | |
147 | + U64(0xc2b7b775c2b7b775), U64(0x1cfdfde11cfdfde1), | |
148 | + U64(0xae93933dae93933d), U64(0x6a26264c6a26264c), | |
149 | + U64(0x5a36366c5a36366c), U64(0x413f3f7e413f3f7e), | |
150 | + U64(0x02f7f7f502f7f7f5), U64(0x4fcccc834fcccc83), | |
151 | + U64(0x5c3434685c343468), U64(0xf4a5a551f4a5a551), | |
152 | + U64(0x34e5e5d134e5e5d1), U64(0x08f1f1f908f1f1f9), | |
153 | + U64(0x937171e2937171e2), U64(0x73d8d8ab73d8d8ab), | |
154 | + U64(0x5331316253313162), U64(0x3f15152a3f15152a), | |
155 | + U64(0x0c0404080c040408), U64(0x52c7c79552c7c795), | |
156 | + U64(0x6523234665232346), U64(0x5ec3c39d5ec3c39d), | |
157 | + U64(0x2818183028181830), U64(0xa1969637a1969637), | |
158 | + U64(0x0f05050a0f05050a), U64(0xb59a9a2fb59a9a2f), | |
159 | + U64(0x0907070e0907070e), U64(0x3612122436121224), | |
160 | + U64(0x9b80801b9b80801b), U64(0x3de2e2df3de2e2df), | |
161 | + U64(0x26ebebcd26ebebcd), U64(0x6927274e6927274e), | |
162 | + U64(0xcdb2b27fcdb2b27f), U64(0x9f7575ea9f7575ea), | |
163 | + U64(0x1b0909121b090912), U64(0x9e83831d9e83831d), | |
164 | + U64(0x742c2c58742c2c58), U64(0x2e1a1a342e1a1a34), | |
165 | + U64(0x2d1b1b362d1b1b36), U64(0xb26e6edcb26e6edc), | |
166 | + U64(0xee5a5ab4ee5a5ab4), U64(0xfba0a05bfba0a05b), | |
167 | + U64(0xf65252a4f65252a4), U64(0x4d3b3b764d3b3b76), | |
168 | + U64(0x61d6d6b761d6d6b7), U64(0xceb3b37dceb3b37d), | |
169 | + U64(0x7b2929527b292952), U64(0x3ee3e3dd3ee3e3dd), | |
170 | + U64(0x712f2f5e712f2f5e), U64(0x9784841397848413), | |
171 | + U64(0xf55353a6f55353a6), U64(0x68d1d1b968d1d1b9), | |
172 | + U64(0x0000000000000000), U64(0x2cededc12cededc1), | |
173 | + U64(0x6020204060202040), U64(0x1ffcfce31ffcfce3), | |
174 | + U64(0xc8b1b179c8b1b179), U64(0xed5b5bb6ed5b5bb6), | |
175 | + U64(0xbe6a6ad4be6a6ad4), U64(0x46cbcb8d46cbcb8d), | |
176 | + U64(0xd9bebe67d9bebe67), U64(0x4b3939724b393972), | |
177 | + U64(0xde4a4a94de4a4a94), U64(0xd44c4c98d44c4c98), | |
178 | + U64(0xe85858b0e85858b0), U64(0x4acfcf854acfcf85), | |
179 | + U64(0x6bd0d0bb6bd0d0bb), U64(0x2aefefc52aefefc5), | |
180 | + U64(0xe5aaaa4fe5aaaa4f), U64(0x16fbfbed16fbfbed), | |
181 | + U64(0xc5434386c5434386), U64(0xd74d4d9ad74d4d9a), | |
182 | + U64(0x5533336655333366), U64(0x9485851194858511), | |
183 | + U64(0xcf45458acf45458a), U64(0x10f9f9e910f9f9e9), | |
184 | + U64(0x0602020406020204), U64(0x817f7ffe817f7ffe), | |
185 | + U64(0xf05050a0f05050a0), U64(0x443c3c78443c3c78), | |
186 | + U64(0xba9f9f25ba9f9f25), U64(0xe3a8a84be3a8a84b), | |
187 | + U64(0xf35151a2f35151a2), U64(0xfea3a35dfea3a35d), | |
188 | + U64(0xc0404080c0404080), U64(0x8a8f8f058a8f8f05), | |
189 | + U64(0xad92923fad92923f), U64(0xbc9d9d21bc9d9d21), | |
190 | + U64(0x4838387048383870), U64(0x04f5f5f104f5f5f1), | |
191 | + U64(0xdfbcbc63dfbcbc63), U64(0xc1b6b677c1b6b677), | |
192 | + U64(0x75dadaaf75dadaaf), U64(0x6321214263212142), | |
193 | + U64(0x3010102030101020), U64(0x1affffe51affffe5), | |
194 | + U64(0x0ef3f3fd0ef3f3fd), U64(0x6dd2d2bf6dd2d2bf), | |
195 | + U64(0x4ccdcd814ccdcd81), U64(0x140c0c18140c0c18), | |
196 | + U64(0x3513132635131326), U64(0x2fececc32fececc3), | |
197 | + U64(0xe15f5fbee15f5fbe), U64(0xa2979735a2979735), | |
198 | + U64(0xcc444488cc444488), U64(0x3917172e3917172e), | |
199 | + U64(0x57c4c49357c4c493), U64(0xf2a7a755f2a7a755), | |
200 | + U64(0x827e7efc827e7efc), U64(0x473d3d7a473d3d7a), | |
201 | + U64(0xac6464c8ac6464c8), U64(0xe75d5dbae75d5dba), | |
202 | + U64(0x2b1919322b191932), U64(0x957373e6957373e6), | |
203 | + U64(0xa06060c0a06060c0), U64(0x9881811998818119), | |
204 | + U64(0xd14f4f9ed14f4f9e), U64(0x7fdcdca37fdcdca3), | |
205 | + U64(0x6622224466222244), U64(0x7e2a2a547e2a2a54), | |
206 | + U64(0xab90903bab90903b), U64(0x8388880b8388880b), | |
207 | + U64(0xca46468cca46468c), U64(0x29eeeec729eeeec7), | |
208 | + U64(0xd3b8b86bd3b8b86b), U64(0x3c1414283c141428), | |
209 | + U64(0x79dedea779dedea7), U64(0xe25e5ebce25e5ebc), | |
210 | + U64(0x1d0b0b161d0b0b16), U64(0x76dbdbad76dbdbad), | |
211 | + U64(0x3be0e0db3be0e0db), U64(0x5632326456323264), | |
212 | + U64(0x4e3a3a744e3a3a74), U64(0x1e0a0a141e0a0a14), | |
213 | + U64(0xdb494992db494992), U64(0x0a06060c0a06060c), | |
214 | + U64(0x6c2424486c242448), U64(0xe45c5cb8e45c5cb8), | |
215 | + U64(0x5dc2c29f5dc2c29f), U64(0x6ed3d3bd6ed3d3bd), | |
216 | + U64(0xefacac43efacac43), U64(0xa66262c4a66262c4), | |
217 | + U64(0xa8919139a8919139), U64(0xa4959531a4959531), | |
218 | + U64(0x37e4e4d337e4e4d3), U64(0x8b7979f28b7979f2), | |
219 | + U64(0x32e7e7d532e7e7d5), U64(0x43c8c88b43c8c88b), | |
220 | + U64(0x5937376e5937376e), U64(0xb76d6ddab76d6dda), | |
221 | + U64(0x8c8d8d018c8d8d01), U64(0x64d5d5b164d5d5b1), | |
222 | + U64(0xd24e4e9cd24e4e9c), U64(0xe0a9a949e0a9a949), | |
223 | + U64(0xb46c6cd8b46c6cd8), U64(0xfa5656acfa5656ac), | |
224 | + U64(0x07f4f4f307f4f4f3), U64(0x25eaeacf25eaeacf), | |
225 | + U64(0xaf6565caaf6565ca), U64(0x8e7a7af48e7a7af4), | |
226 | + U64(0xe9aeae47e9aeae47), U64(0x1808081018080810), | |
227 | + U64(0xd5baba6fd5baba6f), U64(0x887878f0887878f0), | |
228 | + U64(0x6f25254a6f25254a), U64(0x722e2e5c722e2e5c), | |
229 | + U64(0x241c1c38241c1c38), U64(0xf1a6a657f1a6a657), | |
230 | + U64(0xc7b4b473c7b4b473), U64(0x51c6c69751c6c697), | |
231 | + U64(0x23e8e8cb23e8e8cb), U64(0x7cdddda17cdddda1), | |
232 | + U64(0x9c7474e89c7474e8), U64(0x211f1f3e211f1f3e), | |
233 | + U64(0xdd4b4b96dd4b4b96), U64(0xdcbdbd61dcbdbd61), | |
234 | + U64(0x868b8b0d868b8b0d), U64(0x858a8a0f858a8a0f), | |
235 | + U64(0x907070e0907070e0), U64(0x423e3e7c423e3e7c), | |
236 | + U64(0xc4b5b571c4b5b571), U64(0xaa6666ccaa6666cc), | |
237 | + U64(0xd8484890d8484890), U64(0x0503030605030306), | |
238 | + U64(0x01f6f6f701f6f6f7), U64(0x120e0e1c120e0e1c), | |
239 | + U64(0xa36161c2a36161c2), U64(0x5f35356a5f35356a), | |
240 | + U64(0xf95757aef95757ae), U64(0xd0b9b969d0b9b969), | |
241 | + U64(0x9186861791868617), U64(0x58c1c19958c1c199), | |
242 | + U64(0x271d1d3a271d1d3a), U64(0xb99e9e27b99e9e27), | |
243 | + U64(0x38e1e1d938e1e1d9), U64(0x13f8f8eb13f8f8eb), | |
244 | + U64(0xb398982bb398982b), U64(0x3311112233111122), | |
245 | + U64(0xbb6969d2bb6969d2), U64(0x70d9d9a970d9d9a9), | |
246 | + U64(0x898e8e07898e8e07), U64(0xa7949433a7949433), | |
247 | + U64(0xb69b9b2db69b9b2d), U64(0x221e1e3c221e1e3c), | |
248 | + U64(0x9287871592878715), U64(0x20e9e9c920e9e9c9), | |
249 | + U64(0x49cece8749cece87), U64(0xff5555aaff5555aa), | |
250 | + U64(0x7828285078282850), U64(0x7adfdfa57adfdfa5), | |
251 | + U64(0x8f8c8c038f8c8c03), U64(0xf8a1a159f8a1a159), | |
252 | + U64(0x8089890980898909), U64(0x170d0d1a170d0d1a), | |
253 | + U64(0xdabfbf65dabfbf65), U64(0x31e6e6d731e6e6d7), | |
254 | + U64(0xc6424284c6424284), U64(0xb86868d0b86868d0), | |
255 | + U64(0xc3414182c3414182), U64(0xb0999929b0999929), | |
256 | + U64(0x772d2d5a772d2d5a), U64(0x110f0f1e110f0f1e), | |
257 | + U64(0xcbb0b07bcbb0b07b), U64(0xfc5454a8fc5454a8), | |
258 | + U64(0xd6bbbb6dd6bbbb6d), U64(0x3a16162c3a16162c) | |
259 | +}; | |
260 | + | |
261 | +static const u8 Te4[256] = { | |
262 | + 0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U, | |
263 | + 0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U, | |
264 | + 0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U, | |
265 | + 0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U, | |
266 | + 0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU, | |
267 | + 0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U, | |
268 | + 0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU, | |
269 | + 0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U, | |
270 | + 0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U, | |
271 | + 0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U, | |
272 | + 0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU, | |
273 | + 0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU, | |
274 | + 0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U, | |
275 | + 0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U, | |
276 | + 0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U, | |
277 | + 0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U, | |
278 | + 0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U, | |
279 | + 0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U, | |
280 | + 0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U, | |
281 | + 0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU, | |
282 | + 0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU, | |
283 | + 0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U, | |
284 | + 0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U, | |
285 | + 0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U, | |
286 | + 0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U, | |
287 | + 0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU, | |
288 | + 0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU, | |
289 | + 0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU, | |
290 | + 0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U, | |
291 | + 0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU, | |
292 | + 0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U, | |
293 | + 0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U | |
294 | +}; | |
295 | + | |
296 | +static const u64 Td[256] = { | |
297 | + U64(0x50a7f45150a7f451), U64(0x5365417e5365417e), | |
298 | + U64(0xc3a4171ac3a4171a), U64(0x965e273a965e273a), | |
299 | + U64(0xcb6bab3bcb6bab3b), U64(0xf1459d1ff1459d1f), | |
300 | + U64(0xab58faacab58faac), U64(0x9303e34b9303e34b), | |
301 | + U64(0x55fa302055fa3020), U64(0xf66d76adf66d76ad), | |
302 | + U64(0x9176cc889176cc88), U64(0x254c02f5254c02f5), | |
303 | + U64(0xfcd7e54ffcd7e54f), U64(0xd7cb2ac5d7cb2ac5), | |
304 | + U64(0x8044352680443526), U64(0x8fa362b58fa362b5), | |
305 | + U64(0x495ab1de495ab1de), U64(0x671bba25671bba25), | |
306 | + U64(0x980eea45980eea45), U64(0xe1c0fe5de1c0fe5d), | |
307 | + U64(0x02752fc302752fc3), U64(0x12f04c8112f04c81), | |
308 | + U64(0xa397468da397468d), U64(0xc6f9d36bc6f9d36b), | |
309 | + U64(0xe75f8f03e75f8f03), U64(0x959c9215959c9215), | |
310 | + U64(0xeb7a6dbfeb7a6dbf), U64(0xda595295da595295), | |
311 | + U64(0x2d83bed42d83bed4), U64(0xd3217458d3217458), | |
312 | + U64(0x2969e0492969e049), U64(0x44c8c98e44c8c98e), | |
313 | + U64(0x6a89c2756a89c275), U64(0x78798ef478798ef4), | |
314 | + U64(0x6b3e58996b3e5899), U64(0xdd71b927dd71b927), | |
315 | + U64(0xb64fe1beb64fe1be), U64(0x17ad88f017ad88f0), | |
316 | + U64(0x66ac20c966ac20c9), U64(0xb43ace7db43ace7d), | |
317 | + U64(0x184adf63184adf63), U64(0x82311ae582311ae5), | |
318 | + U64(0x6033519760335197), U64(0x457f5362457f5362), | |
319 | + U64(0xe07764b1e07764b1), U64(0x84ae6bbb84ae6bbb), | |
320 | + U64(0x1ca081fe1ca081fe), U64(0x942b08f9942b08f9), | |
321 | + U64(0x5868487058684870), U64(0x19fd458f19fd458f), | |
322 | + U64(0x876cde94876cde94), U64(0xb7f87b52b7f87b52), | |
323 | + U64(0x23d373ab23d373ab), U64(0xe2024b72e2024b72), | |
324 | + U64(0x578f1fe3578f1fe3), U64(0x2aab55662aab5566), | |
325 | + U64(0x0728ebb20728ebb2), U64(0x03c2b52f03c2b52f), | |
326 | + U64(0x9a7bc5869a7bc586), U64(0xa50837d3a50837d3), | |
327 | + U64(0xf2872830f2872830), U64(0xb2a5bf23b2a5bf23), | |
328 | + U64(0xba6a0302ba6a0302), U64(0x5c8216ed5c8216ed), | |
329 | + U64(0x2b1ccf8a2b1ccf8a), U64(0x92b479a792b479a7), | |
330 | + U64(0xf0f207f3f0f207f3), U64(0xa1e2694ea1e2694e), | |
331 | + U64(0xcdf4da65cdf4da65), U64(0xd5be0506d5be0506), | |
332 | + U64(0x1f6234d11f6234d1), U64(0x8afea6c48afea6c4), | |
333 | + U64(0x9d532e349d532e34), U64(0xa055f3a2a055f3a2), | |
334 | + U64(0x32e18a0532e18a05), U64(0x75ebf6a475ebf6a4), | |
335 | + U64(0x39ec830b39ec830b), U64(0xaaef6040aaef6040), | |
336 | + U64(0x069f715e069f715e), U64(0x51106ebd51106ebd), | |
337 | + U64(0xf98a213ef98a213e), U64(0x3d06dd963d06dd96), | |
338 | + U64(0xae053eddae053edd), U64(0x46bde64d46bde64d), | |
339 | + U64(0xb58d5491b58d5491), U64(0x055dc471055dc471), | |
340 | + U64(0x6fd406046fd40604), U64(0xff155060ff155060), | |
341 | + U64(0x24fb981924fb9819), U64(0x97e9bdd697e9bdd6), | |
342 | + U64(0xcc434089cc434089), U64(0x779ed967779ed967), | |
343 | + U64(0xbd42e8b0bd42e8b0), U64(0x888b8907888b8907), | |
344 | + U64(0x385b19e7385b19e7), U64(0xdbeec879dbeec879), | |
345 | + U64(0x470a7ca1470a7ca1), U64(0xe90f427ce90f427c), | |
346 | + U64(0xc91e84f8c91e84f8), U64(0x0000000000000000), | |
347 | + U64(0x8386800983868009), U64(0x48ed2b3248ed2b32), | |
348 | + U64(0xac70111eac70111e), U64(0x4e725a6c4e725a6c), | |
349 | + U64(0xfbff0efdfbff0efd), U64(0x5638850f5638850f), | |
350 | + U64(0x1ed5ae3d1ed5ae3d), U64(0x27392d3627392d36), | |
351 | + U64(0x64d90f0a64d90f0a), U64(0x21a65c6821a65c68), | |
352 | + U64(0xd1545b9bd1545b9b), U64(0x3a2e36243a2e3624), | |
353 | + U64(0xb1670a0cb1670a0c), U64(0x0fe757930fe75793), | |
354 | + U64(0xd296eeb4d296eeb4), U64(0x9e919b1b9e919b1b), | |
355 | + U64(0x4fc5c0804fc5c080), U64(0xa220dc61a220dc61), | |
356 | + U64(0x694b775a694b775a), U64(0x161a121c161a121c), | |
357 | + U64(0x0aba93e20aba93e2), U64(0xe52aa0c0e52aa0c0), | |
358 | + U64(0x43e0223c43e0223c), U64(0x1d171b121d171b12), | |
359 | + U64(0x0b0d090e0b0d090e), U64(0xadc78bf2adc78bf2), | |
360 | + U64(0xb9a8b62db9a8b62d), U64(0xc8a91e14c8a91e14), | |
361 | + U64(0x8519f1578519f157), U64(0x4c0775af4c0775af), | |
362 | + U64(0xbbdd99eebbdd99ee), U64(0xfd607fa3fd607fa3), | |
363 | + U64(0x9f2601f79f2601f7), U64(0xbcf5725cbcf5725c), | |
364 | + U64(0xc53b6644c53b6644), U64(0x347efb5b347efb5b), | |
365 | + U64(0x7629438b7629438b), U64(0xdcc623cbdcc623cb), | |
366 | + U64(0x68fcedb668fcedb6), U64(0x63f1e4b863f1e4b8), | |
367 | + U64(0xcadc31d7cadc31d7), U64(0x1085634210856342), | |
368 | + U64(0x4022971340229713), U64(0x2011c6842011c684), | |
369 | + U64(0x7d244a857d244a85), U64(0xf83dbbd2f83dbbd2), | |
370 | + U64(0x1132f9ae1132f9ae), U64(0x6da129c76da129c7), | |
371 | + U64(0x4b2f9e1d4b2f9e1d), U64(0xf330b2dcf330b2dc), | |
372 | + U64(0xec52860dec52860d), U64(0xd0e3c177d0e3c177), | |
373 | + U64(0x6c16b32b6c16b32b), U64(0x99b970a999b970a9), | |
374 | + U64(0xfa489411fa489411), U64(0x2264e9472264e947), | |
375 | + U64(0xc48cfca8c48cfca8), U64(0x1a3ff0a01a3ff0a0), | |
376 | + U64(0xd82c7d56d82c7d56), U64(0xef903322ef903322), | |
377 | + U64(0xc74e4987c74e4987), U64(0xc1d138d9c1d138d9), | |
378 | + U64(0xfea2ca8cfea2ca8c), U64(0x360bd498360bd498), | |
379 | + U64(0xcf81f5a6cf81f5a6), U64(0x28de7aa528de7aa5), | |
380 | + U64(0x268eb7da268eb7da), U64(0xa4bfad3fa4bfad3f), | |
381 | + U64(0xe49d3a2ce49d3a2c), U64(0x0d9278500d927850), | |
382 | + U64(0x9bcc5f6a9bcc5f6a), U64(0x62467e5462467e54), | |
383 | + U64(0xc2138df6c2138df6), U64(0xe8b8d890e8b8d890), | |
384 | + U64(0x5ef7392e5ef7392e), U64(0xf5afc382f5afc382), | |
385 | + U64(0xbe805d9fbe805d9f), U64(0x7c93d0697c93d069), | |
386 | + U64(0xa92dd56fa92dd56f), U64(0xb31225cfb31225cf), | |
387 | + U64(0x3b99acc83b99acc8), U64(0xa77d1810a77d1810), | |
388 | + U64(0x6e639ce86e639ce8), U64(0x7bbb3bdb7bbb3bdb), | |
389 | + U64(0x097826cd097826cd), U64(0xf418596ef418596e), | |
390 | + U64(0x01b79aec01b79aec), U64(0xa89a4f83a89a4f83), | |
391 | + U64(0x656e95e6656e95e6), U64(0x7ee6ffaa7ee6ffaa), | |
392 | + U64(0x08cfbc2108cfbc21), U64(0xe6e815efe6e815ef), | |
393 | + U64(0xd99be7bad99be7ba), U64(0xce366f4ace366f4a), | |
394 | + U64(0xd4099fead4099fea), U64(0xd67cb029d67cb029), | |
395 | + U64(0xafb2a431afb2a431), U64(0x31233f2a31233f2a), | |
396 | + U64(0x3094a5c63094a5c6), U64(0xc066a235c066a235), | |
397 | + U64(0x37bc4e7437bc4e74), U64(0xa6ca82fca6ca82fc), | |
398 | + U64(0xb0d090e0b0d090e0), U64(0x15d8a73315d8a733), | |
399 | + U64(0x4a9804f14a9804f1), U64(0xf7daec41f7daec41), | |
400 | + U64(0x0e50cd7f0e50cd7f), U64(0x2ff691172ff69117), | |
401 | + U64(0x8dd64d768dd64d76), U64(0x4db0ef434db0ef43), | |
402 | + U64(0x544daacc544daacc), U64(0xdf0496e4df0496e4), | |
403 | + U64(0xe3b5d19ee3b5d19e), U64(0x1b886a4c1b886a4c), | |
404 | + U64(0xb81f2cc1b81f2cc1), U64(0x7f5165467f516546), | |
405 | + U64(0x04ea5e9d04ea5e9d), U64(0x5d358c015d358c01), | |
406 | + U64(0x737487fa737487fa), U64(0x2e410bfb2e410bfb), | |
407 | + U64(0x5a1d67b35a1d67b3), U64(0x52d2db9252d2db92), | |
408 | + U64(0x335610e9335610e9), U64(0x1347d66d1347d66d), | |
409 | + U64(0x8c61d79a8c61d79a), U64(0x7a0ca1377a0ca137), | |
410 | + U64(0x8e14f8598e14f859), U64(0x893c13eb893c13eb), | |
411 | + U64(0xee27a9ceee27a9ce), U64(0x35c961b735c961b7), | |
412 | + U64(0xede51ce1ede51ce1), U64(0x3cb1477a3cb1477a), | |
413 | + U64(0x59dfd29c59dfd29c), U64(0x3f73f2553f73f255), | |
414 | + U64(0x79ce141879ce1418), U64(0xbf37c773bf37c773), | |
415 | + U64(0xeacdf753eacdf753), U64(0x5baafd5f5baafd5f), | |
416 | + U64(0x146f3ddf146f3ddf), U64(0x86db447886db4478), | |
417 | + U64(0x81f3afca81f3afca), U64(0x3ec468b93ec468b9), | |
418 | + U64(0x2c3424382c342438), U64(0x5f40a3c25f40a3c2), | |
419 | + U64(0x72c31d1672c31d16), U64(0x0c25e2bc0c25e2bc), | |
420 | + U64(0x8b493c288b493c28), U64(0x41950dff41950dff), | |
421 | + U64(0x7101a8397101a839), U64(0xdeb30c08deb30c08), | |
422 | + U64(0x9ce4b4d89ce4b4d8), U64(0x90c1566490c15664), | |
423 | + U64(0x6184cb7b6184cb7b), U64(0x70b632d570b632d5), | |
424 | + U64(0x745c6c48745c6c48), U64(0x4257b8d04257b8d0) | |
425 | +}; | |
426 | +static const u8 Td4[256] = { /* byte-wide substitution table; AES_decrypt indexes it with one state byte at a time (the values match the AES inverse S-box) */ | |
427 | + 0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U, | |
428 | + 0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU, | |
429 | + 0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U, | |
430 | + 0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU, | |
431 | + 0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU, | |
432 | + 0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU, | |
433 | + 0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U, | |
434 | + 0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U, | |
435 | + 0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U, | |
436 | + 0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U, | |
437 | + 0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU, | |
438 | + 0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U, | |
439 | + 0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU, | |
440 | + 0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U, | |
441 | + 0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U, | |
442 | + 0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU, | |
443 | + 0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU, | |
444 | + 0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U, | |
445 | + 0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U, | |
446 | + 0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU, | |
447 | + 0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U, | |
448 | + 0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU, | |
449 | + 0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U, | |
450 | + 0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U, | |
451 | + 0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U, | |
452 | + 0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU, | |
453 | + 0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU, | |
454 | + 0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU, | |
455 | + 0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U, | |
456 | + 0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U, | |
457 | + 0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U, | |
458 | + 0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU | |
459 | +}; | |
460 | + | |
461 | +static const u32 rcon[] = { /* round constants: AES_set_encrypt_key XORs rcon[i] into the first word of each key-expansion step */ | |
462 | + 0x00000001U, 0x00000002U, 0x00000004U, 0x00000008U, | |
463 | + 0x00000010U, 0x00000020U, 0x00000040U, 0x00000080U, | |
464 | + 0x0000001bU, 0x00000036U, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ | |
465 | +}; | |
466 | + | |
467 | +/** | |
468 | + * Expand the cipher key into the encryption key schedule: sets key->rounds (10, 12 or 14 for 128-, 192- or 256-bit keys) and fills key->rd_key. Te4 entries are cast to u32 before being shifted, because an entry narrower than int would be promoted to signed int and shifting a value >= 0x80 left by 24 is undefined behaviour. | |
469 | + * Returns 0 on success, -1 if userKey or key is NULL, -2 if bits is not 128, 192 or 256. */ | |
470 | +int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | |
471 | + AES_KEY *key) { | |
472 | + | |
473 | + u32 *rk; | |
474 | + int i = 0; /* index of the next rcon[] entry to use */ | |
475 | + u32 temp; | |
476 | + | |
477 | + if (!userKey || !key) | |
478 | + return -1; | |
479 | + if (bits != 128 && bits != 192 && bits != 256) | |
480 | + return -2; | |
481 | + | |
482 | + rk = key->rd_key; | |
483 | + | |
484 | + if (bits==128) | |
485 | + key->rounds = 10; | |
486 | + else if (bits==192) | |
487 | + key->rounds = 12; | |
488 | + else | |
489 | + key->rounds = 14; | |
490 | + | |
491 | + rk[0] = GETU32(userKey ); /* the first four key words are needed for every key size */ | |
492 | + rk[1] = GETU32(userKey + 4); | |
493 | + rk[2] = GETU32(userKey + 8); | |
494 | + rk[3] = GETU32(userKey + 12); | |
495 | + if (bits == 128) { | |
496 | + while (1) { /* each pass derives words 4..7 from words 0..3, then slides rk forward by four */ | |
497 | + temp = rk[3]; | |
498 | + rk[4] = rk[0] ^ | |
499 | + ((u32)Te4[(temp >> 8) & 0xff] ) ^ | |
500 | + ((u32)Te4[(temp >> 16) & 0xff] << 8) ^ | |
501 | + ((u32)Te4[(temp >> 24) ] << 16) ^ | |
502 | + ((u32)Te4[(temp ) & 0xff] << 24) ^ | |
503 | + rcon[i]; | |
504 | + rk[5] = rk[1] ^ rk[4]; | |
505 | + rk[6] = rk[2] ^ rk[5]; | |
506 | + rk[7] = rk[3] ^ rk[6]; | |
507 | + if (++i == 10) { /* ten expansion steps complete the 128-bit schedule */ | |
508 | + return 0; | |
509 | + } | |
510 | + rk += 4; | |
511 | + } | |
512 | + } | |
513 | + rk[4] = GETU32(userKey + 16); /* 192- and 256-bit keys supply two more words */ | |
514 | + rk[5] = GETU32(userKey + 20); | |
515 | + if (bits == 192) { | |
516 | + while (1) { /* each pass derives up to six new words from the previous six */ | |
517 | + temp = rk[ 5]; | |
518 | + rk[ 6] = rk[ 0] ^ | |
519 | + ((u32)Te4[(temp >> 8) & 0xff] ) ^ | |
520 | + ((u32)Te4[(temp >> 16) & 0xff] << 8) ^ | |
521 | + ((u32)Te4[(temp >> 24) ] << 16) ^ | |
522 | + ((u32)Te4[(temp ) & 0xff] << 24) ^ | |
523 | + rcon[i]; | |
524 | + rk[ 7] = rk[ 1] ^ rk[ 6]; | |
525 | + rk[ 8] = rk[ 2] ^ rk[ 7]; | |
526 | + rk[ 9] = rk[ 3] ^ rk[ 8]; | |
527 | + if (++i == 8) { /* the eighth step returns after writing only four of its six words */ | |
528 | + return 0; | |
529 | + } | |
530 | + rk[10] = rk[ 4] ^ rk[ 9]; | |
531 | + rk[11] = rk[ 5] ^ rk[10]; | |
532 | + rk += 6; | |
533 | + } | |
534 | + } | |
535 | + rk[6] = GETU32(userKey + 24); /* 256-bit keys supply the last two of eight words */ | |
536 | + rk[7] = GETU32(userKey + 28); | |
537 | + if (bits == 256) { | |
538 | + while (1) { /* each pass derives up to eight new words from the previous eight */ | |
539 | + temp = rk[ 7]; | |
540 | + rk[ 8] = rk[ 0] ^ | |
541 | + ((u32)Te4[(temp >> 8) & 0xff] ) ^ | |
542 | + ((u32)Te4[(temp >> 16) & 0xff] << 8) ^ | |
543 | + ((u32)Te4[(temp >> 24) ] << 16) ^ | |
544 | + ((u32)Te4[(temp ) & 0xff] << 24) ^ | |
545 | + rcon[i]; | |
546 | + rk[ 9] = rk[ 1] ^ rk[ 8]; | |
547 | + rk[10] = rk[ 2] ^ rk[ 9]; | |
548 | + rk[11] = rk[ 3] ^ rk[10]; | |
549 | + if (++i == 7) { /* the seventh step returns after writing only its first four words */ | |
550 | + return 0; | |
551 | + } | |
552 | + temp = rk[11]; /* second half of the step: bytes are substituted in place (no byte rotation) and no rcon is mixed in */ | |
553 | + rk[12] = rk[ 4] ^ | |
554 | + ((u32)Te4[(temp ) & 0xff] ) ^ | |
555 | + ((u32)Te4[(temp >> 8) & 0xff] << 8) ^ | |
556 | + ((u32)Te4[(temp >> 16) & 0xff] << 16) ^ | |
557 | + ((u32)Te4[(temp >> 24) ] << 24); | |
558 | + rk[13] = rk[ 5] ^ rk[12]; | |
559 | + rk[14] = rk[ 6] ^ rk[13]; | |
560 | + rk[15] = rk[ 7] ^ rk[14]; | |
561 | + | |
562 | + rk += 8; | |
563 | + } | |
564 | + } | |
565 | + return 0; /* not reached for validated sizes: each branch above returns from inside its loop */ | |
566 | +} | |
567 | + | |
568 | +/** | |
569 | + * Expand the cipher key into the decryption key schedule: builds the encryption schedule with AES_set_encrypt_key, reverses the order of the four-word round keys, then transforms every round key except the first and the last. | |
570 | + * Returns 0 on success, or the negative status from AES_set_encrypt_key unchanged. */ | |
571 | +int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | |
572 | + AES_KEY *key) { | |
573 | + | |
574 | + u32 *rk; | |
575 | + int i, j, status; | |
576 | + u32 temp; | |
577 | + | |
578 | + /* first, start with an encryption schedule */ | |
579 | + status = AES_set_encrypt_key(userKey, bits, key); | |
580 | + if (status < 0) | |
581 | + return status; | |
582 | + | |
583 | + rk = key->rd_key; | |
584 | + | |
585 | + /* invert the order of the round keys: i walks up from the first round key and j walks down from the last, swapping four words at a time until they meet */ | |
586 | + for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) { | |
587 | + temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp; | |
588 | + temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp; | |
589 | + temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp; | |
590 | + temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp; | |
591 | + } | |
592 | + /* apply the inverse MixColumn transform to all round keys but the first and the last: */ | |
593 | + for (i = 1; i < (key->rounds); i++) { | |
594 | + rk += 4; /* advance to the next four-word round key */ | |
595 | +#if 1 /* the word-wise arithmetic version is the one compiled; the table-lookup version under #else is disabled */ | |
596 | + for (j = 0; j < 4; j++) { | |
597 | + u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m; | |
598 | + | |
599 | + tp1 = rk[j]; | |
600 | + m = tp1 & 0x80808080; /* top bit of each of the four bytes */ | |
601 | + tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^ | |
602 | + ((m - (m >> 7)) & 0x1b1b1b1b); /* doubles each byte independently: shift left, then XOR 0x1b into the bytes whose top bit was set */ | |
603 | + m = tp2 & 0x80808080; | |
604 | + tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^ | |
605 | + ((m - (m >> 7)) & 0x1b1b1b1b); /* same doubling applied to tp2 */ | |
606 | + m = tp4 & 0x80808080; | |
607 | + tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^ | |
608 | + ((m - (m >> 7)) & 0x1b1b1b1b); /* same doubling applied to tp4 */ | |
609 | + tp9 = tp8 ^ tp1; /* 8x ^ 1x */ | |
610 | + tpb = tp9 ^ tp2; /* 8x ^ 2x ^ 1x */ | |
611 | + tpd = tp9 ^ tp4; /* 8x ^ 4x ^ 1x */ | |
612 | + tpe = tp8 ^ tp4 ^ tp2; /* 8x ^ 4x ^ 2x */ | |
613 | +#if defined(ROTATE) | |
614 | + rk[j] = tpe ^ ROTATE(tpd,16) ^ | |
615 | + ROTATE(tp9,8) ^ ROTATE(tpb,24); | |
616 | +#else /* no ROTATE macro available: each rotation is spelled out as a pair of shifts */ | |
617 | + rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^ | |
618 | + (tp9 >> 24) ^ (tp9 << 8) ^ | |
619 | + (tpb >> 8) ^ (tpb << 24); | |
620 | +#endif | |
621 | + } | |
622 | +#else | |
623 | + rk[0] = | |
624 | + Td0[Te2[(rk[0] ) & 0xff] & 0xff] ^ | |
625 | + Td1[Te2[(rk[0] >> 8) & 0xff] & 0xff] ^ | |
626 | + Td2[Te2[(rk[0] >> 16) & 0xff] & 0xff] ^ | |
627 | + Td3[Te2[(rk[0] >> 24) ] & 0xff]; | |
628 | + rk[1] = | |
629 | + Td0[Te2[(rk[1] ) & 0xff] & 0xff] ^ | |
630 | + Td1[Te2[(rk[1] >> 8) & 0xff] & 0xff] ^ | |
631 | + Td2[Te2[(rk[1] >> 16) & 0xff] & 0xff] ^ | |
632 | + Td3[Te2[(rk[1] >> 24) ] & 0xff]; | |
633 | + rk[2] = | |
634 | + Td0[Te2[(rk[2] ) & 0xff] & 0xff] ^ | |
635 | + Td1[Te2[(rk[2] >> 8) & 0xff] & 0xff] ^ | |
636 | + Td2[Te2[(rk[2] >> 16) & 0xff] & 0xff] ^ | |
637 | + Td3[Te2[(rk[2] >> 24) ] & 0xff]; | |
638 | + rk[3] = | |
639 | + Td0[Te2[(rk[3] ) & 0xff] & 0xff] ^ | |
640 | + Td1[Te2[(rk[3] >> 8) & 0xff] & 0xff] ^ | |
641 | + Td2[Te2[(rk[3] >> 16) & 0xff] & 0xff] ^ | |
642 | + Td3[Te2[(rk[3] >> 24) ] & 0xff]; | |
643 | +#endif | |
644 | + } | |
645 | + return 0; | |
646 | +} | |
647 | + | |
648 | +/* | |
649 | + * Encrypt a single block: reads 16 bytes from in, writes 16 bytes to out, using the round keys in key->rd_key and the round count in key->rounds. Te4 entries are cast to u32 before being shifted, because an entry narrower than int would be promoted to signed int and shifting a value >= 0x80 left by 24 is undefined behaviour. | |
650 | + * in and out can overlap | |
651 | + */ | |
652 | +void AES_encrypt(const unsigned char *in, unsigned char *out, | |
653 | + const AES_KEY *key) { | |
654 | + | |
655 | + const u32 *rk; | |
656 | + u32 s0, s1, s2, s3, t[4]; | |
657 | + int r; | |
658 | + | |
659 | + assert(in && out && key); | |
660 | + rk = key->rd_key; | |
661 | + | |
662 | + /* | |
663 | + * map byte array block to cipher state | |
664 | + * and add initial round key: | |
665 | + */ | |
666 | + s0 = GETU32(in ) ^ rk[0]; | |
667 | + s1 = GETU32(in + 4) ^ rk[1]; | |
668 | + s2 = GETU32(in + 8) ^ rk[2]; | |
669 | + s3 = GETU32(in + 12) ^ rk[3]; | |
670 | + | |
671 | +#if defined(AES_COMPACT_IN_OUTER_ROUNDS) /* first round using only the 256-entry Te4 table plus word arithmetic */ | |
672 | + prefetch256(Te4); | |
673 | + | |
674 | + t[0] = (u32)Te4[(s0 ) & 0xff] ^ | |
675 | + (u32)Te4[(s1 >> 8) & 0xff] << 8 ^ | |
676 | + (u32)Te4[(s2 >> 16) & 0xff] << 16 ^ | |
677 | + (u32)Te4[(s3 >> 24) ] << 24; | |
678 | + t[1] = (u32)Te4[(s1 ) & 0xff] ^ | |
679 | + (u32)Te4[(s2 >> 8) & 0xff] << 8 ^ | |
680 | + (u32)Te4[(s3 >> 16) & 0xff] << 16 ^ | |
681 | + (u32)Te4[(s0 >> 24) ] << 24; | |
682 | + t[2] = (u32)Te4[(s2 ) & 0xff] ^ | |
683 | + (u32)Te4[(s3 >> 8) & 0xff] << 8 ^ | |
684 | + (u32)Te4[(s0 >> 16) & 0xff] << 16 ^ | |
685 | + (u32)Te4[(s1 >> 24) ] << 24; | |
686 | + t[3] = (u32)Te4[(s3 ) & 0xff] ^ | |
687 | + (u32)Te4[(s0 >> 8) & 0xff] << 8 ^ | |
688 | + (u32)Te4[(s1 >> 16) & 0xff] << 16 ^ | |
689 | + (u32)Te4[(s2 >> 24) ] << 24; | |
690 | + | |
691 | + /* now do the linear transform using words */ | |
692 | + { int i; | |
693 | + u32 r0, r1, r2; | |
694 | + | |
695 | + for (i = 0; i < 4; i++) { | |
696 | + r0 = t[i]; | |
697 | + r1 = r0 & 0x80808080; /* top bit of each of the four bytes */ | |
698 | + r2 = ((r0 & 0x7f7f7f7f) << 1) ^ | |
699 | + ((r1 - (r1 >> 7)) & 0x1b1b1b1b); /* doubles each byte independently: shift left, then XOR 0x1b into the bytes whose top bit was set */ | |
700 | +#if defined(ROTATE) | |
701 | + t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^ | |
702 | + ROTATE(r0,16) ^ ROTATE(r0,8); | |
703 | +#else /* no ROTATE macro available: each rotation is spelled out as a pair of shifts */ | |
704 | + t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^ | |
705 | + (r0 << 16) ^ (r0 >> 16) ^ | |
706 | + (r0 << 8) ^ (r0 >> 24); | |
707 | +#endif | |
708 | + t[i] ^= rk[4+i]; /* add the second round key (words 4..7) */ | |
709 | + } | |
710 | + } | |
711 | +#else /* first round using the Te0..Te3 lookup tables */ | |
712 | + t[0] = Te0[(s0 ) & 0xff] ^ | |
713 | + Te1[(s1 >> 8) & 0xff] ^ | |
714 | + Te2[(s2 >> 16) & 0xff] ^ | |
715 | + Te3[(s3 >> 24) ] ^ | |
716 | + rk[4]; | |
717 | + t[1] = Te0[(s1 ) & 0xff] ^ | |
718 | + Te1[(s2 >> 8) & 0xff] ^ | |
719 | + Te2[(s3 >> 16) & 0xff] ^ | |
720 | + Te3[(s0 >> 24) ] ^ | |
721 | + rk[5]; | |
722 | + t[2] = Te0[(s2 ) & 0xff] ^ | |
723 | + Te1[(s3 >> 8) & 0xff] ^ | |
724 | + Te2[(s0 >> 16) & 0xff] ^ | |
725 | + Te3[(s1 >> 24) ] ^ | |
726 | + rk[6]; | |
727 | + t[3] = Te0[(s3 ) & 0xff] ^ | |
728 | + Te1[(s0 >> 8) & 0xff] ^ | |
729 | + Te2[(s1 >> 16) & 0xff] ^ | |
730 | + Te3[(s2 >> 24) ] ^ | |
731 | + rk[7]; | |
732 | +#endif | |
733 | + s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3]; | |
734 | + | |
735 | + /* | |
736 | + * Nr - 2 full rounds: | |
737 | + */ | |
738 | + for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) { /* rk skips the two round keys already used and then advances one round key per pass */ | |
739 | +#if defined(AES_COMPACT_IN_INNER_ROUNDS) | |
740 | + t[0] = (u32)Te4[(s0 ) & 0xff] ^ | |
741 | + (u32)Te4[(s1 >> 8) & 0xff] << 8 ^ | |
742 | + (u32)Te4[(s2 >> 16) & 0xff] << 16 ^ | |
743 | + (u32)Te4[(s3 >> 24) ] << 24; | |
744 | + t[1] = (u32)Te4[(s1 ) & 0xff] ^ | |
745 | + (u32)Te4[(s2 >> 8) & 0xff] << 8 ^ | |
746 | + (u32)Te4[(s3 >> 16) & 0xff] << 16 ^ | |
747 | + (u32)Te4[(s0 >> 24) ] << 24; | |
748 | + t[2] = (u32)Te4[(s2 ) & 0xff] ^ | |
749 | + (u32)Te4[(s3 >> 8) & 0xff] << 8 ^ | |
750 | + (u32)Te4[(s0 >> 16) & 0xff] << 16 ^ | |
751 | + (u32)Te4[(s1 >> 24) ] << 24; | |
752 | + t[3] = (u32)Te4[(s3 ) & 0xff] ^ | |
753 | + (u32)Te4[(s0 >> 8) & 0xff] << 8 ^ | |
754 | + (u32)Te4[(s1 >> 16) & 0xff] << 16 ^ | |
755 | + (u32)Te4[(s2 >> 24) ] << 24; | |
756 | + | |
757 | + /* now do the linear transform using words */ | |
758 | + { int i; | |
759 | + u32 r0, r1, r2; | |
760 | + | |
761 | + for (i = 0; i < 4; i++) { | |
762 | + r0 = t[i]; | |
763 | + r1 = r0 & 0x80808080; | |
764 | + r2 = ((r0 & 0x7f7f7f7f) << 1) ^ | |
765 | + ((r1 - (r1 >> 7)) & 0x1b1b1b1b); /* per-byte doubling, as in the first round */ | |
766 | +#if defined(ROTATE) | |
767 | + t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^ | |
768 | + ROTATE(r0,16) ^ ROTATE(r0,8); | |
769 | +#else | |
770 | + t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^ | |
771 | + (r0 << 16) ^ (r0 >> 16) ^ | |
772 | + (r0 << 8) ^ (r0 >> 24); | |
773 | +#endif | |
774 | + t[i] ^= rk[i]; | |
775 | + } | |
776 | + } | |
777 | +#else | |
778 | + t[0] = Te0[(s0 ) & 0xff] ^ | |
779 | + Te1[(s1 >> 8) & 0xff] ^ | |
780 | + Te2[(s2 >> 16) & 0xff] ^ | |
781 | + Te3[(s3 >> 24) ] ^ | |
782 | + rk[0]; | |
783 | + t[1] = Te0[(s1 ) & 0xff] ^ | |
784 | + Te1[(s2 >> 8) & 0xff] ^ | |
785 | + Te2[(s3 >> 16) & 0xff] ^ | |
786 | + Te3[(s0 >> 24) ] ^ | |
787 | + rk[1]; | |
788 | + t[2] = Te0[(s2 ) & 0xff] ^ | |
789 | + Te1[(s3 >> 8) & 0xff] ^ | |
790 | + Te2[(s0 >> 16) & 0xff] ^ | |
791 | + Te3[(s1 >> 24) ] ^ | |
792 | + rk[2]; | |
793 | + t[3] = Te0[(s3 ) & 0xff] ^ | |
794 | + Te1[(s0 >> 8) & 0xff] ^ | |
795 | + Te2[(s1 >> 16) & 0xff] ^ | |
796 | + Te3[(s2 >> 24) ] ^ | |
797 | + rk[3]; | |
798 | +#endif | |
799 | + s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3]; | |
800 | + } | |
801 | + /* | |
802 | + * apply last round and | |
803 | + * map cipher state to byte array block: | |
804 | + */ | |
805 | +#if defined(AES_COMPACT_IN_OUTER_ROUNDS) /* last round: byte substitution and round key only, no linear transform */ | |
806 | + prefetch256(Te4); | |
807 | + | |
808 | + *(u32*)(out+0) = | |
809 | + (u32)Te4[(s0 ) & 0xff] ^ | |
810 | + (u32)Te4[(s1 >> 8) & 0xff] << 8 ^ | |
811 | + (u32)Te4[(s2 >> 16) & 0xff] << 16 ^ | |
812 | + (u32)Te4[(s3 >> 24) ] << 24 ^ | |
813 | + rk[0]; | |
814 | + *(u32*)(out+4) = | |
815 | + (u32)Te4[(s1 ) & 0xff] ^ | |
816 | + (u32)Te4[(s2 >> 8) & 0xff] << 8 ^ | |
817 | + (u32)Te4[(s3 >> 16) & 0xff] << 16 ^ | |
818 | + (u32)Te4[(s0 >> 24) ] << 24 ^ | |
819 | + rk[1]; | |
820 | + *(u32*)(out+8) = | |
821 | + (u32)Te4[(s2 ) & 0xff] ^ | |
822 | + (u32)Te4[(s3 >> 8) & 0xff] << 8 ^ | |
823 | + (u32)Te4[(s0 >> 16) & 0xff] << 16 ^ | |
824 | + (u32)Te4[(s1 >> 24) ] << 24 ^ | |
825 | + rk[2]; | |
826 | + *(u32*)(out+12) = | |
827 | + (u32)Te4[(s3 ) & 0xff] ^ | |
828 | + (u32)Te4[(s0 >> 8) & 0xff] << 8 ^ | |
829 | + (u32)Te4[(s1 >> 16) & 0xff] << 16 ^ | |
830 | + (u32)Te4[(s2 >> 24) ] << 24 ^ | |
831 | + rk[3]; | |
832 | +#else /* last round taken from the big tables: one byte lane is masked out of each looked-up word */ | |
833 | + *(u32*)(out+0) = | |
834 | + (Te2[(s0 ) & 0xff] & 0x000000ffU) ^ | |
835 | + (Te3[(s1 >> 8) & 0xff] & 0x0000ff00U) ^ | |
836 | + (Te0[(s2 >> 16) & 0xff] & 0x00ff0000U) ^ | |
837 | + (Te1[(s3 >> 24) ] & 0xff000000U) ^ | |
838 | + rk[0]; | |
839 | + *(u32*)(out+4) = | |
840 | + (Te2[(s1 ) & 0xff] & 0x000000ffU) ^ | |
841 | + (Te3[(s2 >> 8) & 0xff] & 0x0000ff00U) ^ | |
842 | + (Te0[(s3 >> 16) & 0xff] & 0x00ff0000U) ^ | |
843 | + (Te1[(s0 >> 24) ] & 0xff000000U) ^ | |
844 | + rk[1]; | |
845 | + *(u32*)(out+8) = | |
846 | + (Te2[(s2 ) & 0xff] & 0x000000ffU) ^ | |
847 | + (Te3[(s3 >> 8) & 0xff] & 0x0000ff00U) ^ | |
848 | + (Te0[(s0 >> 16) & 0xff] & 0x00ff0000U) ^ | |
849 | + (Te1[(s1 >> 24) ] & 0xff000000U) ^ | |
850 | + rk[2]; | |
851 | + *(u32*)(out+12) = | |
852 | + (Te2[(s3 ) & 0xff] & 0x000000ffU) ^ | |
853 | + (Te3[(s0 >> 8) & 0xff] & 0x0000ff00U) ^ | |
854 | + (Te0[(s1 >> 16) & 0xff] & 0x00ff0000U) ^ | |
855 | + (Te1[(s2 >> 24) ] & 0xff000000U) ^ | |
856 | + rk[3]; | |
857 | +#endif | |
858 | +} | |
859 | + | |
860 | +/* | |
861 | + * Decrypt a single block: reads 16 bytes from in, writes 16 bytes to out, using the round keys in key->rd_key and the round count in key->rounds. Td4 is a byte table, so its entries are cast to u32 before being shifted; without the cast an entry is promoted to signed int and shifting a value >= 0x80 left by 24 is undefined behaviour. | |
862 | + * in and out can overlap | |
863 | + */ | |
864 | +void AES_decrypt(const unsigned char *in, unsigned char *out, | |
865 | + const AES_KEY *key) { | |
866 | + | |
867 | + const u32 *rk; | |
868 | + u32 s0, s1, s2, s3, t[4]; | |
869 | + int r; | |
870 | + | |
871 | + assert(in && out && key); | |
872 | + rk = key->rd_key; | |
873 | + | |
874 | + /* | |
875 | + * map byte array block to cipher state | |
876 | + * and add initial round key: | |
877 | + */ | |
878 | + s0 = GETU32(in ) ^ rk[0]; | |
879 | + s1 = GETU32(in + 4) ^ rk[1]; | |
880 | + s2 = GETU32(in + 8) ^ rk[2]; | |
881 | + s3 = GETU32(in + 12) ^ rk[3]; | |
882 | + | |
883 | +#if defined(AES_COMPACT_IN_OUTER_ROUNDS) /* first round using only the 256-entry Td4 table plus word arithmetic */ | |
884 | + prefetch256(Td4); | |
885 | + | |
886 | + t[0] = (u32)Td4[(s0 ) & 0xff] ^ | |
887 | + (u32)Td4[(s3 >> 8) & 0xff] << 8 ^ | |
888 | + (u32)Td4[(s2 >> 16) & 0xff] << 16 ^ | |
889 | + (u32)Td4[(s1 >> 24) ] << 24; | |
890 | + t[1] = (u32)Td4[(s1 ) & 0xff] ^ | |
891 | + (u32)Td4[(s0 >> 8) & 0xff] << 8 ^ | |
892 | + (u32)Td4[(s3 >> 16) & 0xff] << 16 ^ | |
893 | + (u32)Td4[(s2 >> 24) ] << 24; | |
894 | + t[2] = (u32)Td4[(s2 ) & 0xff] ^ | |
895 | + (u32)Td4[(s1 >> 8) & 0xff] << 8 ^ | |
896 | + (u32)Td4[(s0 >> 16) & 0xff] << 16 ^ | |
897 | + (u32)Td4[(s3 >> 24) ] << 24; | |
898 | + t[3] = (u32)Td4[(s3 ) & 0xff] ^ | |
899 | + (u32)Td4[(s2 >> 8) & 0xff] << 8 ^ | |
900 | + (u32)Td4[(s1 >> 16) & 0xff] << 16 ^ | |
901 | + (u32)Td4[(s0 >> 24) ] << 24; | |
902 | + | |
903 | + /* now do the linear transform using words */ | |
904 | + { int i; | |
905 | + u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m; | |
906 | + | |
907 | + for (i = 0; i < 4; i++) { | |
908 | + tp1 = t[i]; | |
909 | + m = tp1 & 0x80808080; /* top bit of each of the four bytes */ | |
910 | + tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^ | |
911 | + ((m - (m >> 7)) & 0x1b1b1b1b); /* doubles each byte independently: shift left, then XOR 0x1b into the bytes whose top bit was set */ | |
912 | + m = tp2 & 0x80808080; | |
913 | + tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^ | |
914 | + ((m - (m >> 7)) & 0x1b1b1b1b); | |
915 | + m = tp4 & 0x80808080; | |
916 | + tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^ | |
917 | + ((m - (m >> 7)) & 0x1b1b1b1b); | |
918 | + tp9 = tp8 ^ tp1; /* 8x ^ 1x */ | |
919 | + tpb = tp9 ^ tp2; /* 8x ^ 2x ^ 1x */ | |
920 | + tpd = tp9 ^ tp4; /* 8x ^ 4x ^ 1x */ | |
921 | + tpe = tp8 ^ tp4 ^ tp2; /* 8x ^ 4x ^ 2x */ | |
922 | +#if defined(ROTATE) | |
923 | + t[i] = tpe ^ ROTATE(tpd,16) ^ | |
924 | + ROTATE(tp9,8) ^ ROTATE(tpb,24); | |
925 | +#else /* no ROTATE macro available: each rotation is spelled out as a pair of shifts */ | |
926 | + t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^ | |
927 | + (tp9 >> 24) ^ (tp9 << 8) ^ | |
928 | + (tpb >> 8) ^ (tpb << 24); | |
929 | +#endif | |
930 | + t[i] ^= rk[4+i]; /* add the second round key (words 4..7) */ | |
931 | + } | |
932 | + } | |
933 | +#else /* first round using the Td0..Td3 lookup tables */ | |
934 | + t[0] = Td0[(s0 ) & 0xff] ^ | |
935 | + Td1[(s3 >> 8) & 0xff] ^ | |
936 | + Td2[(s2 >> 16) & 0xff] ^ | |
937 | + Td3[(s1 >> 24) ] ^ | |
938 | + rk[4]; | |
939 | + t[1] = Td0[(s1 ) & 0xff] ^ | |
940 | + Td1[(s0 >> 8) & 0xff] ^ | |
941 | + Td2[(s3 >> 16) & 0xff] ^ | |
942 | + Td3[(s2 >> 24) ] ^ | |
943 | + rk[5]; | |
944 | + t[2] = Td0[(s2 ) & 0xff] ^ | |
945 | + Td1[(s1 >> 8) & 0xff] ^ | |
946 | + Td2[(s0 >> 16) & 0xff] ^ | |
947 | + Td3[(s3 >> 24) ] ^ | |
948 | + rk[6]; | |
949 | + t[3] = Td0[(s3 ) & 0xff] ^ | |
950 | + Td1[(s2 >> 8) & 0xff] ^ | |
951 | + Td2[(s1 >> 16) & 0xff] ^ | |
952 | + Td3[(s0 >> 24) ] ^ | |
953 | + rk[7]; | |
954 | +#endif | |
955 | + s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3]; | |
956 | + | |
957 | + /* | |
958 | + * Nr - 2 full rounds: | |
959 | + */ | |
960 | + for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) { /* rk skips the two round keys already used and then advances one round key per pass */ | |
961 | +#if defined(AES_COMPACT_IN_INNER_ROUNDS) | |
962 | + t[0] = (u32)Td4[(s0 ) & 0xff] ^ | |
963 | + (u32)Td4[(s3 >> 8) & 0xff] << 8 ^ | |
964 | + (u32)Td4[(s2 >> 16) & 0xff] << 16 ^ | |
965 | + (u32)Td4[(s1 >> 24) ] << 24; | |
966 | + t[1] = (u32)Td4[(s1 ) & 0xff] ^ | |
967 | + (u32)Td4[(s0 >> 8) & 0xff] << 8 ^ | |
968 | + (u32)Td4[(s3 >> 16) & 0xff] << 16 ^ | |
969 | + (u32)Td4[(s2 >> 24) ] << 24; | |
970 | + t[2] = (u32)Td4[(s2 ) & 0xff] ^ | |
971 | + (u32)Td4[(s1 >> 8) & 0xff] << 8 ^ | |
972 | + (u32)Td4[(s0 >> 16) & 0xff] << 16 ^ | |
973 | + (u32)Td4[(s3 >> 24) ] << 24; | |
974 | + t[3] = (u32)Td4[(s3 ) & 0xff] ^ | |
975 | + (u32)Td4[(s2 >> 8) & 0xff] << 8 ^ | |
976 | + (u32)Td4[(s1 >> 16) & 0xff] << 16 ^ | |
977 | + (u32)Td4[(s0 >> 24) ] << 24; | |
978 | + | |
979 | + /* now do the linear transform using words */ | |
980 | + { int i; | |
981 | + u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m; | |
982 | + | |
983 | + for (i = 0; i < 4; i++) { | |
984 | + tp1 = t[i]; | |
985 | + m = tp1 & 0x80808080; | |
986 | + tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^ | |
987 | + ((m - (m >> 7)) & 0x1b1b1b1b); /* per-byte doubling, as in the first round */ | |
988 | + m = tp2 & 0x80808080; | |
989 | + tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^ | |
990 | + ((m - (m >> 7)) & 0x1b1b1b1b); | |
991 | + m = tp4 & 0x80808080; | |
992 | + tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^ | |
993 | + ((m - (m >> 7)) & 0x1b1b1b1b); | |
994 | + tp9 = tp8 ^ tp1; | |
995 | + tpb = tp9 ^ tp2; | |
996 | + tpd = tp9 ^ tp4; | |
997 | + tpe = tp8 ^ tp4 ^ tp2; | |
998 | +#if defined(ROTATE) | |
999 | + t[i] = tpe ^ ROTATE(tpd,16) ^ | |
1000 | + ROTATE(tp9,8) ^ ROTATE(tpb,24); | |
1001 | +#else | |
1002 | + t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^ | |
1003 | + (tp9 >> 24) ^ (tp9 << 8) ^ | |
1004 | + (tpb >> 8) ^ (tpb << 24); | |
1005 | +#endif | |
1006 | + t[i] ^= rk[i]; | |
1007 | + } | |
1008 | + } | |
1009 | +#else | |
1010 | + t[0] = Td0[(s0 ) & 0xff] ^ | |
1011 | + Td1[(s3 >> 8) & 0xff] ^ | |
1012 | + Td2[(s2 >> 16) & 0xff] ^ | |
1013 | + Td3[(s1 >> 24) ] ^ | |
1014 | + rk[0]; | |
1015 | + t[1] = Td0[(s1 ) & 0xff] ^ | |
1016 | + Td1[(s0 >> 8) & 0xff] ^ | |
1017 | + Td2[(s3 >> 16) & 0xff] ^ | |
1018 | + Td3[(s2 >> 24) ] ^ | |
1019 | + rk[1]; | |
1020 | + t[2] = Td0[(s2 ) & 0xff] ^ | |
1021 | + Td1[(s1 >> 8) & 0xff] ^ | |
1022 | + Td2[(s0 >> 16) & 0xff] ^ | |
1023 | + Td3[(s3 >> 24) ] ^ | |
1024 | + rk[2]; | |
1025 | + t[3] = Td0[(s3 ) & 0xff] ^ | |
1026 | + Td1[(s2 >> 8) & 0xff] ^ | |
1027 | + Td2[(s1 >> 16) & 0xff] ^ | |
1028 | + Td3[(s0 >> 24) ] ^ | |
1029 | + rk[3]; | |
1030 | +#endif | |
1031 | + s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3]; | |
1032 | + } | |
1033 | + /* | |
1034 | + * apply last round and | |
1035 | + * map cipher state to byte array block: | |
1036 | + */ | |
1037 | + prefetch256(Td4); | |
1038 | + | |
1039 | + *(u32*)(out+0) = | |
1040 | + ((u32)Td4[(s0 ) & 0xff]) ^ | |
1041 | + ((u32)Td4[(s3 >> 8) & 0xff] << 8) ^ | |
1042 | + ((u32)Td4[(s2 >> 16) & 0xff] << 16) ^ | |
1043 | + ((u32)Td4[(s1 >> 24) ] << 24) ^ | |
1044 | + rk[0]; | |
1045 | + *(u32*)(out+4) = | |
1046 | + ((u32)Td4[(s1 ) & 0xff]) ^ | |
1047 | + ((u32)Td4[(s0 >> 8) & 0xff] << 8) ^ | |
1048 | + ((u32)Td4[(s3 >> 16) & 0xff] << 16) ^ | |
1049 | + ((u32)Td4[(s2 >> 24) ] << 24) ^ | |
1050 | + rk[1]; | |
1051 | + *(u32*)(out+8) = | |
1052 | + ((u32)Td4[(s2 ) & 0xff]) ^ | |
1053 | + ((u32)Td4[(s1 >> 8) & 0xff] << 8) ^ | |
1054 | + ((u32)Td4[(s0 >> 16) & 0xff] << 16) ^ | |
1055 | + ((u32)Td4[(s3 >> 24) ] << 24) ^ | |
1056 | + rk[2]; | |
1057 | + *(u32*)(out+12) = | |
1058 | + ((u32)Td4[(s3 ) & 0xff]) ^ | |
1059 | + ((u32)Td4[(s2 >> 8) & 0xff] << 8) ^ | |
1060 | + ((u32)Td4[(s1 >> 16) & 0xff] << 16) ^ | |
1061 | + ((u32)Td4[(s0 >> 24) ] << 24) ^ | |
1062 | + rk[3]; | |
1063 | +} |
@@ -0,0 +1,1030 @@ | ||
1 | +#!/usr/bin/env perl | |
2 | + | |
3 | +# ==================================================================== | |
4 | +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
5 | +# project. The module is, however, dual licensed under OpenSSL and | |
6 | +# CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | +# details see http://www.openssl.org/~appro/cryptogams/. | |
8 | +# ==================================================================== | |
9 | + | |
10 | +# AES for ARMv4 | |
11 | + | |
12 | +# January 2007. | |
13 | +# | |
14 | +# Code uses a single 1K S-box and is >2 times faster than code generated | |
15 | +# by gcc-3.4.1. This is thanks to a unique feature of the ARMv4 ISA, which | |
16 | +# allows merging a logical or arithmetic operation with a shift or rotate | |
17 | +# in one instruction and emitting the combined result every cycle. The module | |
18 | +# is endian-neutral. The performance is ~42 cycles/byte for a 128-bit | |
19 | +# key. | |
20 | + | |
21 | +# May 2007. | |
22 | +# | |
23 | +# AES_set_[en|de]crypt_key is added. | |
24 | + | |
25 | +$s0="r0"; | |
26 | +$s1="r1"; | |
27 | +$s2="r2"; | |
28 | +$s3="r3"; | |
29 | +$t1="r4"; | |
30 | +$t2="r5"; | |
31 | +$t3="r6"; | |
32 | +$i1="r7"; | |
33 | +$i2="r8"; | |
34 | +$i3="r9"; | |
35 | + | |
36 | +$tbl="r10"; | |
37 | +$key="r11"; | |
38 | +$rounds="r12"; | |
39 | + | |
40 | +$code=<<___; | |
41 | +.text | |
42 | +.code 32 | |
43 | + | |
44 | +.type AES_Te,%object | |
45 | +.align 5 | |
46 | +AES_Te: | |
47 | +.word 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d | |
48 | +.word 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554 | |
49 | +.word 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d | |
50 | +.word 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a | |
51 | +.word 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87 | |
52 | +.word 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b | |
53 | +.word 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea | |
54 | +.word 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b | |
55 | +.word 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a | |
56 | +.word 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f | |
57 | +.word 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108 | |
58 | +.word 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f | |
59 | +.word 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e | |
60 | +.word 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5 | |
61 | +.word 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d | |
62 | +.word 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f | |
63 | +.word 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e | |
64 | +.word 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb | |
65 | +.word 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce | |
66 | +.word 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497 | |
67 | +.word 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c | |
68 | +.word 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed | |
69 | +.word 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b | |
70 | +.word 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a | |
71 | +.word 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16 | |
72 | +.word 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594 | |
73 | +.word 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81 | |
74 | +.word 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3 | |
75 | +.word 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a | |
76 | +.word 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504 | |
77 | +.word 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163 | |
78 | +.word 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d | |
79 | +.word 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f | |
80 | +.word 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739 | |
81 | +.word 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47 | |
82 | +.word 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395 | |
83 | +.word 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f | |
84 | +.word 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883 | |
85 | +.word 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c | |
86 | +.word 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76 | |
87 | +.word 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e | |
88 | +.word 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4 | |
89 | +.word 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6 | |
90 | +.word 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b | |
91 | +.word 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7 | |
92 | +.word 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0 | |
93 | +.word 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25 | |
94 | +.word 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818 | |
95 | +.word 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72 | |
96 | +.word 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651 | |
97 | +.word 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21 | |
98 | +.word 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85 | |
99 | +.word 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa | |
100 | +.word 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12 | |
101 | +.word 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0 | |
102 | +.word 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9 | |
103 | +.word 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133 | |
104 | +.word 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7 | |
105 | +.word 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920 | |
106 | +.word 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a | |
107 | +.word 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17 | |
108 | +.word 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8 | |
109 | +.word 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11 | |
110 | +.word 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a | |
111 | +@ Te4[256] | |
112 | +.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 | |
113 | +.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 | |
114 | +.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 | |
115 | +.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 | |
116 | +.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc | |
117 | +.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 | |
118 | +.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a | |
119 | +.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 | |
120 | +.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 | |
121 | +.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 | |
122 | +.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b | |
123 | +.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf | |
124 | +.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 | |
125 | +.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 | |
126 | +.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 | |
127 | +.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 | |
128 | +.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 | |
129 | +.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 | |
130 | +.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 | |
131 | +.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb | |
132 | +.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c | |
133 | +.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 | |
134 | +.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 | |
135 | +.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 | |
136 | +.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 | |
137 | +.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a | |
138 | +.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e | |
139 | +.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e | |
140 | +.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 | |
141 | +.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf | |
142 | +.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 | |
143 | +.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 | |
144 | +@ rcon[] | |
145 | +.word 0x01000000, 0x02000000, 0x04000000, 0x08000000 | |
146 | +.word 0x10000000, 0x20000000, 0x40000000, 0x80000000 | |
147 | +.word 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0 | |
148 | +.size AES_Te,.-AES_Te | |
149 | + | |
150 | +@ void AES_encrypt(const unsigned char *in, unsigned char *out, | |
151 | +@ const AES_KEY *key) { | |
152 | +.global AES_encrypt | |
153 | +.type AES_encrypt,%function | |
154 | +.align 5 | |
155 | +AES_encrypt: | |
156 | + sub r3,pc,#8 @ AES_encrypt | |
157 | + stmdb sp!,{r1,r4-r12,lr} | |
158 | + mov $rounds,r0 @ inp | |
159 | + mov $key,r2 | |
160 | + sub $tbl,r3,#AES_encrypt-AES_Te @ Te | |
161 | + | |
162 | + ldrb $s0,[$rounds,#3] @ load input data in endian-neutral | |
163 | + ldrb $t1,[$rounds,#2] @ manner... | |
164 | + ldrb $t2,[$rounds,#1] | |
165 | + ldrb $t3,[$rounds,#0] | |
166 | + orr $s0,$s0,$t1,lsl#8 | |
167 | + orr $s0,$s0,$t2,lsl#16 | |
168 | + orr $s0,$s0,$t3,lsl#24 | |
169 | + ldrb $s1,[$rounds,#7] | |
170 | + ldrb $t1,[$rounds,#6] | |
171 | + ldrb $t2,[$rounds,#5] | |
172 | + ldrb $t3,[$rounds,#4] | |
173 | + orr $s1,$s1,$t1,lsl#8 | |
174 | + orr $s1,$s1,$t2,lsl#16 | |
175 | + orr $s1,$s1,$t3,lsl#24 | |
176 | + ldrb $s2,[$rounds,#11] | |
177 | + ldrb $t1,[$rounds,#10] | |
178 | + ldrb $t2,[$rounds,#9] | |
179 | + ldrb $t3,[$rounds,#8] | |
180 | + orr $s2,$s2,$t1,lsl#8 | |
181 | + orr $s2,$s2,$t2,lsl#16 | |
182 | + orr $s2,$s2,$t3,lsl#24 | |
183 | + ldrb $s3,[$rounds,#15] | |
184 | + ldrb $t1,[$rounds,#14] | |
185 | + ldrb $t2,[$rounds,#13] | |
186 | + ldrb $t3,[$rounds,#12] | |
187 | + orr $s3,$s3,$t1,lsl#8 | |
188 | + orr $s3,$s3,$t2,lsl#16 | |
189 | + orr $s3,$s3,$t3,lsl#24 | |
190 | + | |
191 | + bl _armv4_AES_encrypt | |
192 | + | |
193 | + ldr $rounds,[sp],#4 @ pop out | |
194 | + mov $t1,$s0,lsr#24 @ write output in endian-neutral | |
195 | + mov $t2,$s0,lsr#16 @ manner... | |
196 | + mov $t3,$s0,lsr#8 | |
197 | + strb $t1,[$rounds,#0] | |
198 | + strb $t2,[$rounds,#1] | |
199 | + strb $t3,[$rounds,#2] | |
200 | + strb $s0,[$rounds,#3] | |
201 | + mov $t1,$s1,lsr#24 | |
202 | + mov $t2,$s1,lsr#16 | |
203 | + mov $t3,$s1,lsr#8 | |
204 | + strb $t1,[$rounds,#4] | |
205 | + strb $t2,[$rounds,#5] | |
206 | + strb $t3,[$rounds,#6] | |
207 | + strb $s1,[$rounds,#7] | |
208 | + mov $t1,$s2,lsr#24 | |
209 | + mov $t2,$s2,lsr#16 | |
210 | + mov $t3,$s2,lsr#8 | |
211 | + strb $t1,[$rounds,#8] | |
212 | + strb $t2,[$rounds,#9] | |
213 | + strb $t3,[$rounds,#10] | |
214 | + strb $s2,[$rounds,#11] | |
215 | + mov $t1,$s3,lsr#24 | |
216 | + mov $t2,$s3,lsr#16 | |
217 | + mov $t3,$s3,lsr#8 | |
218 | + strb $t1,[$rounds,#12] | |
219 | + strb $t2,[$rounds,#13] | |
220 | + strb $t3,[$rounds,#14] | |
221 | + strb $s3,[$rounds,#15] | |
222 | + | |
223 | + ldmia sp!,{r4-r12,lr} | |
224 | + tst lr,#1 | |
225 | + moveq pc,lr @ be binary compatible with V4, yet | |
226 | + bx lr @ interoperable with Thumb ISA:-) | |
227 | +.size AES_encrypt,.-AES_encrypt | |
228 | + | |
229 | +.type _armv4_AES_encrypt,%function | |
230 | +.align 2 | |
231 | +_armv4_AES_encrypt: | |
232 | + str lr,[sp,#-4]! @ push lr; lr doubles as the 0xff byte mask until it is popped | |
233 | + ldr $t1,[$key],#16 | |
234 | + ldr $t2,[$key,#-12] | |
235 | + ldr $t3,[$key,#-8] | |
236 | + ldr $i1,[$key,#-4] | |
237 | + ldr $rounds,[$key,#240-16] | |
238 | + eor $s0,$s0,$t1 | |
239 | + eor $s1,$s1,$t2 | |
240 | + eor $s2,$s2,$t3 | |
241 | + eor $s3,$s3,$i1 | |
242 | + sub $rounds,$rounds,#1 | |
243 | + mov lr,#255 | |
244 | + | |
245 | +.Lenc_loop: | |
246 | + and $i2,lr,$s0,lsr#8 | |
247 | + and $i3,lr,$s0,lsr#16 | |
248 | + and $i1,lr,$s0 | |
249 | + mov $s0,$s0,lsr#24 | |
250 | + ldr $t1,[$tbl,$i1,lsl#2] @ Te3[s0>>0] (one table serves Te0..Te3; the rotation is applied at the eor) | |
251 | + ldr $s0,[$tbl,$s0,lsl#2] @ Te0[s0>>24] | |
252 | + ldr $t2,[$tbl,$i2,lsl#2] @ Te2[s0>>8] | |
253 | + ldr $t3,[$tbl,$i3,lsl#2] @ Te1[s0>>16] | |
254 | + | |
255 | + and $i1,lr,$s1,lsr#16 @ i0 = (s1>>16)&0xff | |
256 | + and $i2,lr,$s1 | |
257 | + and $i3,lr,$s1,lsr#8 | |
258 | + mov $s1,$s1,lsr#24 | |
259 | + ldr $i1,[$tbl,$i1,lsl#2] @ Te1[s1>>16] | |
260 | + ldr $s1,[$tbl,$s1,lsl#2] @ Te0[s1>>24] | |
261 | + ldr $i2,[$tbl,$i2,lsl#2] @ Te3[s1>>0] | |
262 | + ldr $i3,[$tbl,$i3,lsl#2] @ Te2[s1>>8] | |
263 | + eor $s0,$s0,$i1,ror#8 | |
264 | + eor $s1,$s1,$t1,ror#24 | |
265 | + eor $t2,$t2,$i2,ror#8 | |
266 | + eor $t3,$t3,$i3,ror#8 | |
267 | + | |
268 | + and $i1,lr,$s2,lsr#8 @ i0 = (s2>>8)&0xff | |
269 | + and $i2,lr,$s2,lsr#16 @ i1 = (s2>>16)&0xff | |
270 | + and $i3,lr,$s2 | |
271 | + mov $s2,$s2,lsr#24 | |
272 | + ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8] | |
273 | + ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16] | |
274 | + ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24] | |
275 | + ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0] | |
276 | + eor $s0,$s0,$i1,ror#16 | |
277 | + eor $s1,$s1,$i2,ror#8 | |
278 | + eor $s2,$s2,$t2,ror#16 | |
279 | + eor $t3,$t3,$i3,ror#16 | |
280 | + | |
281 | + and $i1,lr,$s3 @ i0 = s3&0xff | |
282 | + and $i2,lr,$s3,lsr#8 @ i1 = (s3>>8)&0xff | |
283 | + and $i3,lr,$s3,lsr#16 @ i2 = (s3>>16)&0xff | |
284 | + mov $s3,$s3,lsr#24 | |
285 | + ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0] | |
286 | + ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8] | |
287 | + ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16] | |
288 | + ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24] | |
289 | + eor $s0,$s0,$i1,ror#24 | |
290 | + eor $s1,$s1,$i2,ror#16 | |
291 | + eor $s2,$s2,$i3,ror#8 | |
292 | + eor $s3,$s3,$t3,ror#8 | |
293 | + | |
294 | + ldr $t1,[$key],#16 | |
295 | + ldr $t2,[$key,#-12] | |
296 | + ldr $t3,[$key,#-8] | |
297 | + ldr $i1,[$key,#-4] | |
298 | + eor $s0,$s0,$t1 | |
299 | + eor $s1,$s1,$t2 | |
300 | + eor $s2,$s2,$t3 | |
301 | + eor $s3,$s3,$i1 | |
302 | + | |
303 | + subs $rounds,$rounds,#1 | |
304 | + bne .Lenc_loop | |
305 | + | |
306 | + add $tbl,$tbl,#2 | |
307 | + | |
308 | + and $i1,lr,$s0 | |
309 | + and $i2,lr,$s0,lsr#8 | |
310 | + and $i3,lr,$s0,lsr#16 | |
311 | + mov $s0,$s0,lsr#24 | |
312 | + ldrb $t1,[$tbl,$i1,lsl#2] @ Te4[s0>>0] (byte at offset 2 of each 4-byte table entry) | |
313 | + ldrb $s0,[$tbl,$s0,lsl#2] @ Te4[s0>>24] | |
314 | + ldrb $t2,[$tbl,$i2,lsl#2] @ Te4[s0>>8] | |
315 | + ldrb $t3,[$tbl,$i3,lsl#2] @ Te4[s0>>16] | |
316 | + | |
317 | + and $i1,lr,$s1,lsr#16 @ i0 = (s1>>16)&0xff | |
318 | + and $i2,lr,$s1 | |
319 | + and $i3,lr,$s1,lsr#8 | |
320 | + mov $s1,$s1,lsr#24 | |
321 | + ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s1>>16] | |
322 | + ldrb $s1,[$tbl,$s1,lsl#2] @ Te4[s1>>24] | |
323 | + ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s1>>0] | |
324 | + ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s1>>8] | |
325 | + eor $s0,$i1,$s0,lsl#8 | |
326 | + eor $s1,$t1,$s1,lsl#24 | |
327 | + eor $t2,$i2,$t2,lsl#8 | |
328 | + eor $t3,$i3,$t3,lsl#8 | |
329 | + | |
330 | + and $i1,lr,$s2,lsr#8 @ i0 = (s2>>8)&0xff | |
331 | + and $i2,lr,$s2,lsr#16 @ i1 = (s2>>16)&0xff | |
332 | + and $i3,lr,$s2 | |
333 | + mov $s2,$s2,lsr#24 | |
334 | + ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8] | |
335 | + ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16] | |
336 | + ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24] | |
337 | + ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0] | |
338 | + eor $s0,$i1,$s0,lsl#8 | |
339 | + eor $s1,$s1,$i2,lsl#16 | |
340 | + eor $s2,$t2,$s2,lsl#24 | |
341 | + eor $t3,$i3,$t3,lsl#8 | |
342 | + | |
343 | + and $i1,lr,$s3 @ i0 = s3&0xff | |
344 | + and $i2,lr,$s3,lsr#8 @ i1 = (s3>>8)&0xff | |
345 | + and $i3,lr,$s3,lsr#16 @ i2 = (s3>>16)&0xff | |
346 | + mov $s3,$s3,lsr#24 | |
347 | + ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0] | |
348 | + ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8] | |
349 | + ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16] | |
350 | + ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24] | |
351 | + eor $s0,$i1,$s0,lsl#8 | |
352 | + eor $s1,$s1,$i2,lsl#8 | |
353 | + eor $s2,$s2,$i3,lsl#16 | |
354 | + eor $s3,$t3,$s3,lsl#24 | |
355 | + | |
356 | + ldr lr,[sp],#4 @ pop lr (return address saved on entry) | |
357 | + ldr $t1,[$key,#0] | |
358 | + ldr $t2,[$key,#4] | |
359 | + ldr $t3,[$key,#8] | |
360 | + ldr $i1,[$key,#12] | |
361 | + eor $s0,$s0,$t1 | |
362 | + eor $s1,$s1,$t2 | |
363 | + eor $s2,$s2,$t3 | |
364 | + eor $s3,$s3,$i1 | |
365 | + | |
366 | + sub $tbl,$tbl,#2 | |
367 | + mov pc,lr @ return; the table pointer is back at its entry value | |
368 | +.size _armv4_AES_encrypt,.-_armv4_AES_encrypt | |
369 | + | |
370 | +.global AES_set_encrypt_key | |
371 | +.type AES_set_encrypt_key,%function | |
372 | +.align 5 | |
373 | +AES_set_encrypt_key: | |
374 | + sub r3,pc,#8 @ AES_set_encrypt_key: r3 = address of this entry point (pc reads 8 bytes ahead) | |
375 | + teq r0,#0 | |
376 | + moveq r0,#-1 | |
377 | + beq .Labrt | |
378 | + teq r2,#0 | |
379 | + moveq r0,#-1 | |
380 | + beq .Labrt | |
381 | + | |
382 | + teq r1,#128 | |
383 | + beq .Lok | |
384 | + teq r1,#192 | |
385 | + beq .Lok | |
386 | + teq r1,#256 | |
387 | + movne r0,#-1 | |
388 | + bne .Labrt | |
389 | + | |
390 | +.Lok: stmdb sp!,{r4-r12,lr} | |
391 | + sub $tbl,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4: position-independent address of AES_Te+1024 | |
392 | + | |
393 | + mov $rounds,r0 @ inp: source key bytes (r0), rejected above when null | |
394 | + mov lr,r1 @ bits: key length (r1), only 128/192/256 reach this point | |
395 | + mov $key,r2 @ key: output schedule pointer (r2), rejected above when null | |
396 | + | |
397 | + ldrb $s0,[$rounds,#3] @ load input data in endian-neutral | |
398 | + ldrb $t1,[$rounds,#2] @ manner: bytes are combined big-endian into words | |
399 | + ldrb $t2,[$rounds,#1] | |
400 | + ldrb $t3,[$rounds,#0] | |
401 | + orr $s0,$s0,$t1,lsl#8 | |
402 | + orr $s0,$s0,$t2,lsl#16 | |
403 | + orr $s0,$s0,$t3,lsl#24 | |
404 | + ldrb $s1,[$rounds,#7] | |
405 | + ldrb $t1,[$rounds,#6] | |
406 | + ldrb $t2,[$rounds,#5] | |
407 | + ldrb $t3,[$rounds,#4] | |
408 | + orr $s1,$s1,$t1,lsl#8 | |
409 | + orr $s1,$s1,$t2,lsl#16 | |
410 | + orr $s1,$s1,$t3,lsl#24 | |
411 | + ldrb $s2,[$rounds,#11] | |
412 | + ldrb $t1,[$rounds,#10] | |
413 | + ldrb $t2,[$rounds,#9] | |
414 | + ldrb $t3,[$rounds,#8] | |
415 | + orr $s2,$s2,$t1,lsl#8 | |
416 | + orr $s2,$s2,$t2,lsl#16 | |
417 | + orr $s2,$s2,$t3,lsl#24 | |
418 | + ldrb $s3,[$rounds,#15] | |
419 | + ldrb $t1,[$rounds,#14] | |
420 | + ldrb $t2,[$rounds,#13] | |
421 | + ldrb $t3,[$rounds,#12] | |
422 | + orr $s3,$s3,$t1,lsl#8 | |
423 | + orr $s3,$s3,$t2,lsl#16 | |
424 | + orr $s3,$s3,$t3,lsl#24 | |
425 | + str $s0,[$key],#16 | |
426 | + str $s1,[$key,#-12] | |
427 | + str $s2,[$key,#-8] | |
428 | + str $s3,[$key,#-4] | |
429 | + | |
430 | + teq lr,#128 | |
431 | + bne .Lnot128 | |
432 | + mov $rounds,#10 | |
433 | + str $rounds,[$key,#240-16] | |
434 | + add $t3,$tbl,#256 @ rcon: words stored 256 bytes past Te4 | |
435 | + mov lr,#255 | |
436 | + | |
437 | +.L128_loop: | |
438 | + and $t2,lr,$s3,lsr#24 | |
439 | + and $i1,lr,$s3,lsr#16 | |
440 | + and $i2,lr,$s3,lsr#8 | |
441 | + and $i3,lr,$s3 | |
442 | + ldrb $t2,[$tbl,$t2] | |
443 | + ldrb $i1,[$tbl,$i1] | |
444 | + ldrb $i2,[$tbl,$i2] | |
445 | + ldrb $i3,[$tbl,$i3] | |
446 | + ldr $t1,[$t3],#4 @ rcon[i++] | |
447 | + orr $t2,$t2,$i1,lsl#24 | |
448 | + orr $t2,$t2,$i2,lsl#16 | |
449 | + orr $t2,$t2,$i3,lsl#8 | |
450 | + eor $t2,$t2,$t1 | |
451 | + eor $s0,$s0,$t2 @ rk[4]=rk[0]^... | |
452 | + eor $s1,$s1,$s0 @ rk[5]=rk[1]^rk[4] | |
453 | + eor $s2,$s2,$s1 @ rk[6]=rk[2]^rk[5] | |
454 | + eor $s3,$s3,$s2 @ rk[7]=rk[3]^rk[6] | |
455 | + str $s0,[$key],#16 | |
456 | + str $s1,[$key,#-12] | |
457 | + str $s2,[$key,#-8] | |
458 | + str $s3,[$key,#-4] | |
459 | + | |
460 | + subs $rounds,$rounds,#1 | |
461 | + bne .L128_loop | |
462 | + sub r2,$key,#176 | |
463 | + b .Ldone | |
464 | + | |
465 | +.Lnot128: | |
466 | + ldrb $i2,[$rounds,#19] | |
467 | + ldrb $t1,[$rounds,#18] | |
468 | + ldrb $t2,[$rounds,#17] | |
469 | + ldrb $t3,[$rounds,#16] | |
470 | + orr $i2,$i2,$t1,lsl#8 | |
471 | + orr $i2,$i2,$t2,lsl#16 | |
472 | + orr $i2,$i2,$t3,lsl#24 | |
473 | + ldrb $i3,[$rounds,#23] | |
474 | + ldrb $t1,[$rounds,#22] | |
475 | + ldrb $t2,[$rounds,#21] | |
476 | + ldrb $t3,[$rounds,#20] | |
477 | + orr $i3,$i3,$t1,lsl#8 | |
478 | + orr $i3,$i3,$t2,lsl#16 | |
479 | + orr $i3,$i3,$t3,lsl#24 | |
480 | + str $i2,[$key],#8 | |
481 | + str $i3,[$key,#-4] | |
482 | + | |
483 | + teq lr,#192 | |
484 | + bne .Lnot192 | |
485 | + mov $rounds,#12 | |
486 | + str $rounds,[$key,#240-24] | |
487 | + add $t3,$tbl,#256 @ rcon: words stored 256 bytes past Te4 | |
488 | + mov lr,#255 | |
489 | + mov $rounds,#8 | |
490 | + | |
491 | +.L192_loop: | |
492 | + and $t2,lr,$i3,lsr#24 | |
493 | + and $i1,lr,$i3,lsr#16 | |
494 | + and $i2,lr,$i3,lsr#8 | |
495 | + and $i3,lr,$i3 | |
496 | + ldrb $t2,[$tbl,$t2] | |
497 | + ldrb $i1,[$tbl,$i1] | |
498 | + ldrb $i2,[$tbl,$i2] | |
499 | + ldrb $i3,[$tbl,$i3] | |
500 | + ldr $t1,[$t3],#4 @ rcon[i++] | |
501 | + orr $t2,$t2,$i1,lsl#24 | |
502 | + orr $t2,$t2,$i2,lsl#16 | |
503 | + orr $t2,$t2,$i3,lsl#8 | |
504 | + eor $i3,$t2,$t1 | |
505 | + eor $s0,$s0,$i3 @ rk[6]=rk[0]^... | |
506 | + eor $s1,$s1,$s0 @ rk[7]=rk[1]^rk[6] | |
507 | + eor $s2,$s2,$s1 @ rk[8]=rk[2]^rk[7] | |
508 | + eor $s3,$s3,$s2 @ rk[9]=rk[3]^rk[8] | |
509 | + str $s0,[$key],#24 | |
510 | + str $s1,[$key,#-20] | |
511 | + str $s2,[$key,#-16] | |
512 | + str $s3,[$key,#-12] | |
513 | + | |
514 | + subs $rounds,$rounds,#1 | |
515 | + subeq r2,$key,#216 | |
516 | + beq .Ldone | |
517 | + | |
518 | + ldr $i1,[$key,#-32] | |
519 | + ldr $i2,[$key,#-28] | |
520 | + eor $i1,$i1,$s3 @ rk[10]=rk[4]^rk[9] | |
521 | + eor $i3,$i2,$i1 @ rk[11]=rk[5]^rk[10] | |
522 | + str $i1,[$key,#-8] | |
523 | + str $i3,[$key,#-4] | |
524 | + b .L192_loop | |
525 | + | |
526 | +.Lnot192: | |
527 | + ldrb $i2,[$rounds,#27] | |
528 | + ldrb $t1,[$rounds,#26] | |
529 | + ldrb $t2,[$rounds,#25] | |
530 | + ldrb $t3,[$rounds,#24] | |
531 | + orr $i2,$i2,$t1,lsl#8 | |
532 | + orr $i2,$i2,$t2,lsl#16 | |
533 | + orr $i2,$i2,$t3,lsl#24 | |
534 | + ldrb $i3,[$rounds,#31] | |
535 | + ldrb $t1,[$rounds,#30] | |
536 | + ldrb $t2,[$rounds,#29] | |
537 | + ldrb $t3,[$rounds,#28] | |
538 | + orr $i3,$i3,$t1,lsl#8 | |
539 | + orr $i3,$i3,$t2,lsl#16 | |
540 | + orr $i3,$i3,$t3,lsl#24 | |
541 | + str $i2,[$key],#8 | |
542 | + str $i3,[$key,#-4] | |
543 | + | |
544 | + mov $rounds,#14 | |
545 | + str $rounds,[$key,#240-32] | |
546 | + add $t3,$tbl,#256 @ rcon: words stored 256 bytes past Te4 | |
547 | + mov lr,#255 | |
548 | + mov $rounds,#7 | |
549 | + | |
550 | +.L256_loop: | |
551 | + and $t2,lr,$i3,lsr#24 | |
552 | + and $i1,lr,$i3,lsr#16 | |
553 | + and $i2,lr,$i3,lsr#8 | |
554 | + and $i3,lr,$i3 | |
555 | + ldrb $t2,[$tbl,$t2] | |
556 | + ldrb $i1,[$tbl,$i1] | |
557 | + ldrb $i2,[$tbl,$i2] | |
558 | + ldrb $i3,[$tbl,$i3] | |
559 | + ldr $t1,[$t3],#4 @ rcon[i++] | |
560 | + orr $t2,$t2,$i1,lsl#24 | |
561 | + orr $t2,$t2,$i2,lsl#16 | |
562 | + orr $t2,$t2,$i3,lsl#8 | |
563 | + eor $i3,$t2,$t1 | |
564 | + eor $s0,$s0,$i3 @ rk[8]=rk[0]^... | |
565 | + eor $s1,$s1,$s0 @ rk[9]=rk[1]^rk[8] | |
566 | + eor $s2,$s2,$s1 @ rk[10]=rk[2]^rk[9] | |
567 | + eor $s3,$s3,$s2 @ rk[11]=rk[3]^rk[10] | |
568 | + str $s0,[$key],#32 | |
569 | + str $s1,[$key,#-28] | |
570 | + str $s2,[$key,#-24] | |
571 | + str $s3,[$key,#-20] | |
572 | + | |
573 | + subs $rounds,$rounds,#1 | |
574 | + subeq r2,$key,#256 | |
575 | + beq .Ldone | |
576 | + | |
577 | + and $t2,lr,$s3 | |
578 | + and $i1,lr,$s3,lsr#8 | |
579 | + and $i2,lr,$s3,lsr#16 | |
580 | + and $i3,lr,$s3,lsr#24 | |
581 | + ldrb $t2,[$tbl,$t2] | |
582 | + ldrb $i1,[$tbl,$i1] | |
583 | + ldrb $i2,[$tbl,$i2] | |
584 | + ldrb $i3,[$tbl,$i3] | |
585 | + orr $t2,$t2,$i1,lsl#8 | |
586 | + orr $t2,$t2,$i2,lsl#16 | |
587 | + orr $t2,$t2,$i3,lsl#24 | |
588 | + | |
589 | + ldr $t1,[$key,#-48] | |
590 | + ldr $i1,[$key,#-44] | |
591 | + ldr $i2,[$key,#-40] | |
592 | + ldr $i3,[$key,#-36] | |
593 | + eor $t1,$t1,$t2 @ rk[12]=rk[4]^... | |
594 | + eor $i1,$i1,$t1 @ rk[13]=rk[5]^rk[12] | |
595 | + eor $i2,$i2,$i1 @ rk[14]=rk[6]^rk[13] | |
596 | + eor $i3,$i3,$i2 @ rk[15]=rk[7]^rk[14] | |
597 | + str $t1,[$key,#-16] | |
598 | + str $i1,[$key,#-12] | |
599 | + str $i2,[$key,#-8] | |
600 | + str $i3,[$key,#-4] | |
601 | + b .L256_loop | |
602 | + | |
603 | +.Ldone: mov r0,#0 | |
604 | + ldmia sp!,{r4-r12,lr} | |
605 | +.Labrt: tst lr,#1 | |
606 | + moveq pc,lr @ be binary compatible with V4, yet | |
607 | + bx lr @ interoperable with Thumb ISA:-) | |
608 | +.size AES_set_encrypt_key,.-AES_set_encrypt_key | |
609 | + | |
610 | +.global AES_set_decrypt_key | |
611 | +.type AES_set_decrypt_key,%function | |
612 | +.align 5 | |
613 | +AES_set_decrypt_key: | |
614 | + str lr,[sp,#-4]! @ push lr, because the bl below overwrites it | |
615 | + bl AES_set_encrypt_key | |
616 | + teq r0,#0 | |
617 | + ldrne lr,[sp],#4 @ pop lr: key setup returned non-zero, leave through .Labrt with that r0 | |
618 | + bne .Labrt | |
619 | + | |
620 | + stmdb sp!,{r4-r12} | |
621 | + | |
622 | + ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2, | |
623 | + mov $key,r2 @ which is AES_KEY *key | |
624 | + mov $i1,r2 | |
625 | + add $i2,r2,$rounds,lsl#4 | |
626 | + | |
627 | +.Linv: ldr $s0,[$i1] | |
628 | + ldr $s1,[$i1,#4] | |
629 | + ldr $s2,[$i1,#8] | |
630 | + ldr $s3,[$i1,#12] | |
631 | + ldr $t1,[$i2] | |
632 | + ldr $t2,[$i2,#4] | |
633 | + ldr $t3,[$i2,#8] | |
634 | + ldr $i3,[$i2,#12] | |
635 | + str $s0,[$i2],#-16 | |
636 | + str $s1,[$i2,#16+4] | |
637 | + str $s2,[$i2,#16+8] | |
638 | + str $s3,[$i2,#16+12] | |
639 | + str $t1,[$i1],#16 | |
640 | + str $t2,[$i1,#-12] | |
641 | + str $t3,[$i1,#-8] | |
642 | + str $i3,[$i1,#-4] | |
643 | + teq $i1,$i2 | |
644 | + bne .Linv | |
645 | +___ | |
646 | +$mask80=$i1; | |
647 | +$mask1b=$i2; | |
648 | +$mask7f=$i3; | |
649 | +$code.=<<___; | |
650 | + ldr $s0,[$key,#16]! @ prefetch tp1, stepping past the first 16-byte round key | |
651 | + mov $mask80,#0x80 | |
652 | + mov $mask1b,#0x1b | |
653 | + orr $mask80,$mask80,#0x8000 | |
654 | + orr $mask1b,$mask1b,#0x1b00 | |
655 | + orr $mask80,$mask80,$mask80,lsl#16 | |
656 | + orr $mask1b,$mask1b,$mask1b,lsl#16 | |
657 | + sub $rounds,$rounds,#1 | |
658 | + mvn $mask7f,$mask80 | |
659 | + mov $rounds,$rounds,lsl#2 @ (rounds-1)*4 = number of words the .Lmix loop rewrites | |
660 | + | |
661 | +.Lmix: and $t1,$s0,$mask80 | |
662 | + and $s1,$s0,$mask7f | |
663 | + sub $t1,$t1,$t1,lsr#7 | |
664 | + and $t1,$t1,$mask1b | |
665 | + eor $s1,$t1,$s1,lsl#1 @ tp2: each byte of tp1 shifted left by 1, 0x1b folded in where bit 7 was set | |
666 | + | |
667 | + and $t1,$s1,$mask80 | |
668 | + and $s2,$s1,$mask7f | |
669 | + sub $t1,$t1,$t1,lsr#7 | |
670 | + and $t1,$t1,$mask1b | |
671 | + eor $s2,$t1,$s2,lsl#1 @ tp4: same step applied to tp2 | |
672 | + | |
673 | + and $t1,$s2,$mask80 | |
674 | + and $s3,$s2,$mask7f | |
675 | + sub $t1,$t1,$t1,lsr#7 | |
676 | + and $t1,$t1,$mask1b | |
677 | + eor $s3,$t1,$s3,lsl#1 @ tp8: same step applied to tp4 | |
678 | + | |
679 | + eor $t1,$s1,$s2 | |
680 | + eor $t2,$s0,$s3 @ tp9 | |
681 | + eor $t1,$t1,$s3 @ tpe | |
682 | + eor $t1,$t1,$s1,ror#24 | |
683 | + eor $t1,$t1,$t2,ror#24 @ ^= ROTATE(tpb=tp9^tp2,8) | |
684 | + eor $t1,$t1,$s2,ror#16 | |
685 | + eor $t1,$t1,$t2,ror#16 @ ^= ROTATE(tpd=tp9^tp4,16) | |
686 | + eor $t1,$t1,$t2,ror#8 @ ^= ROTATE(tp9,24) | |
687 | + | |
688 | + ldr $s0,[$key,#4] @ prefetch next tp1 before the post-indexed store advances the pointer | |
689 | + str $t1,[$key],#4 | |
690 | + subs $rounds,$rounds,#1 | |
691 | + bne .Lmix | |
692 | + | |
693 | + mov r0,#0 | |
694 | + ldmia sp!,{r4-r12,lr} | |
695 | + tst lr,#1 | |
696 | + moveq pc,lr @ be binary compatible with V4, yet | |
697 | + bx lr @ interoperable with Thumb ISA:-) | |
698 | +.size AES_set_decrypt_key,.-AES_set_decrypt_key | |
699 | + | |
700 | +.type AES_Td,%object | |
701 | +.align 5 | |
702 | +AES_Td: | |
703 | +.word 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96 | |
704 | +.word 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393 | |
705 | +.word 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25 | |
706 | +.word 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f | |
707 | +.word 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1 | |
708 | +.word 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6 | |
709 | +.word 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da | |
710 | +.word 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844 | |
711 | +.word 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd | |
712 | +.word 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4 | |
713 | +.word 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45 | |
714 | +.word 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94 | |
715 | +.word 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7 | |
716 | +.word 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a | |
717 | +.word 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5 | |
718 | +.word 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c | |
719 | +.word 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1 | |
720 | +.word 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a | |
721 | +.word 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75 | |
722 | +.word 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051 | |
723 | +.word 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46 | |
724 | +.word 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff | |
725 | +.word 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77 | |
726 | +.word 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb | |
727 | +.word 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000 | |
728 | +.word 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e | |
729 | +.word 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927 | |
730 | +.word 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a | |
731 | +.word 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e | |
732 | +.word 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16 | |
733 | +.word 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d | |
734 | +.word 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8 | |
735 | +.word 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd | |
736 | +.word 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34 | |
737 | +.word 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163 | |
738 | +.word 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120 | |
739 | +.word 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d | |
740 | +.word 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0 | |
741 | +.word 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422 | |
742 | +.word 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef | |
743 | +.word 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36 | |
744 | +.word 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4 | |
745 | +.word 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662 | |
746 | +.word 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5 | |
747 | +.word 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3 | |
748 | +.word 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b | |
749 | +.word 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8 | |
750 | +.word 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6 | |
751 | +.word 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6 | |
752 | +.word 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0 | |
753 | +.word 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815 | |
754 | +.word 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f | |
755 | +.word 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df | |
756 | +.word 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f | |
757 | +.word 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e | |
758 | +.word 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713 | |
759 | +.word 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89 | |
760 | +.word 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c | |
761 | +.word 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf | |
762 | +.word 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86 | |
763 | +.word 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f | |
764 | +.word 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541 | |
765 | +.word 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190 | |
766 | +.word 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742 | |
767 | +@ Td4[256]: byte table that starts 1024 bytes past AES_Td, right after the 256 words above | |
768 | +.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 | |
769 | +.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb | |
770 | +.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 | |
771 | +.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb | |
772 | +.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d | |
773 | +.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e | |
774 | +.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 | |
775 | +.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 | |
776 | +.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 | |
777 | +.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 | |
778 | +.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda | |
779 | +.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 | |
780 | +.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a | |
781 | +.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 | |
782 | +.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 | |
783 | +.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b | |
784 | +.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea | |
785 | +.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 | |
786 | +.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 | |
787 | +.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e | |
788 | +.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 | |
789 | +.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b | |
790 | +.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 | |
791 | +.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 | |
792 | +.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 | |
793 | +.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f | |
794 | +.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d | |
795 | +.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef | |
796 | +.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 | |
797 | +.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 | |
798 | +.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 | |
799 | +.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d | |
800 | +.size AES_Td,.-AES_Td | |
801 | + | |
802 | +@ void AES_decrypt(const unsigned char *in, unsigned char *out, | |
803 | +@ const AES_KEY *key) { -- reads 16 bytes at in, writes 16 bytes at out | |
804 | +.global AES_decrypt | |
805 | +.type AES_decrypt,%function | |
806 | +.align 5 | |
807 | +AES_decrypt: | |
808 | + sub r3,pc,#8 @ AES_decrypt: r3 = address of this entry point (pc reads 8 bytes ahead) | |
809 | + stmdb sp!,{r1,r4-r12,lr} | |
810 | + mov $rounds,r0 @ inp (r0) | |
811 | + mov $key,r2 | |
812 | + sub $tbl,r3,#AES_decrypt-AES_Td @ Td: position-independent address of AES_Td | |
813 | + | |
814 | + ldrb $s0,[$rounds,#3] @ load input data in endian-neutral | |
815 | + ldrb $t1,[$rounds,#2] @ manner: bytes are combined big-endian into words | |
816 | + ldrb $t2,[$rounds,#1] | |
817 | + ldrb $t3,[$rounds,#0] | |
818 | + orr $s0,$s0,$t1,lsl#8 | |
819 | + orr $s0,$s0,$t2,lsl#16 | |
820 | + orr $s0,$s0,$t3,lsl#24 | |
821 | + ldrb $s1,[$rounds,#7] | |
822 | + ldrb $t1,[$rounds,#6] | |
823 | + ldrb $t2,[$rounds,#5] | |
824 | + ldrb $t3,[$rounds,#4] | |
825 | + orr $s1,$s1,$t1,lsl#8 | |
826 | + orr $s1,$s1,$t2,lsl#16 | |
827 | + orr $s1,$s1,$t3,lsl#24 | |
828 | + ldrb $s2,[$rounds,#11] | |
829 | + ldrb $t1,[$rounds,#10] | |
830 | + ldrb $t2,[$rounds,#9] | |
831 | + ldrb $t3,[$rounds,#8] | |
832 | + orr $s2,$s2,$t1,lsl#8 | |
833 | + orr $s2,$s2,$t2,lsl#16 | |
834 | + orr $s2,$s2,$t3,lsl#24 | |
835 | + ldrb $s3,[$rounds,#15] | |
836 | + ldrb $t1,[$rounds,#14] | |
837 | + ldrb $t2,[$rounds,#13] | |
838 | + ldrb $t3,[$rounds,#12] | |
839 | + orr $s3,$s3,$t1,lsl#8 | |
840 | + orr $s3,$s3,$t2,lsl#16 | |
841 | + orr $s3,$s3,$t3,lsl#24 | |
842 | + | |
843 | + bl _armv4_AES_decrypt | |
844 | + | |
845 | + ldr $rounds,[sp],#4 @ pop out (the r1 saved by the stmdb on entry) | |
846 | + mov $t1,$s0,lsr#24 @ write output in endian-neutral | |
847 | + mov $t2,$s0,lsr#16 @ manner: most significant byte of each word goes first | |
848 | + mov $t3,$s0,lsr#8 | |
849 | + strb $t1,[$rounds,#0] | |
850 | + strb $t2,[$rounds,#1] | |
851 | + strb $t3,[$rounds,#2] | |
852 | + strb $s0,[$rounds,#3] | |
853 | + mov $t1,$s1,lsr#24 | |
854 | + mov $t2,$s1,lsr#16 | |
855 | + mov $t3,$s1,lsr#8 | |
856 | + strb $t1,[$rounds,#4] | |
857 | + strb $t2,[$rounds,#5] | |
858 | + strb $t3,[$rounds,#6] | |
859 | + strb $s1,[$rounds,#7] | |
860 | + mov $t1,$s2,lsr#24 | |
861 | + mov $t2,$s2,lsr#16 | |
862 | + mov $t3,$s2,lsr#8 | |
863 | + strb $t1,[$rounds,#8] | |
864 | + strb $t2,[$rounds,#9] | |
865 | + strb $t3,[$rounds,#10] | |
866 | + strb $s2,[$rounds,#11] | |
867 | + mov $t1,$s3,lsr#24 | |
868 | + mov $t2,$s3,lsr#16 | |
869 | + mov $t3,$s3,lsr#8 | |
870 | + strb $t1,[$rounds,#12] | |
871 | + strb $t2,[$rounds,#13] | |
872 | + strb $t3,[$rounds,#14] | |
873 | + strb $s3,[$rounds,#15] | |
874 | + | |
875 | + ldmia sp!,{r4-r12,lr} | |
876 | + tst lr,#1 | |
877 | + moveq pc,lr @ be binary compatible with V4, yet | |
878 | + bx lr @ interoperable with Thumb ISA:-) | |
879 | +.size AES_decrypt,.-AES_decrypt | |
880 | + | |
881 | +.type _armv4_AES_decrypt,%function | |
882 | +.align 2 | |
883 | +_armv4_AES_decrypt: | |
884 | + str lr,[sp,#-4]! @ push lr; lr doubles as the 0xff byte mask until it is popped | |
885 | + ldr $t1,[$key],#16 | |
886 | + ldr $t2,[$key,#-12] | |
887 | + ldr $t3,[$key,#-8] | |
888 | + ldr $i1,[$key,#-4] | |
889 | + ldr $rounds,[$key,#240-16] | |
890 | + eor $s0,$s0,$t1 | |
891 | + eor $s1,$s1,$t2 | |
892 | + eor $s2,$s2,$t3 | |
893 | + eor $s3,$s3,$i1 | |
894 | + sub $rounds,$rounds,#1 | |
895 | + mov lr,#255 | |
896 | + | |
897 | +.Ldec_loop: | |
898 | + and $i1,lr,$s0,lsr#16 | |
899 | + and $i2,lr,$s0,lsr#8 | |
900 | + and $i3,lr,$s0 | |
901 | + mov $s0,$s0,lsr#24 | |
902 | + ldr $t1,[$tbl,$i1,lsl#2] @ Td1[s0>>16] (one table serves Td0..Td3; the rotation is applied at the eor) | |
903 | + ldr $s0,[$tbl,$s0,lsl#2] @ Td0[s0>>24] | |
904 | + ldr $t2,[$tbl,$i2,lsl#2] @ Td2[s0>>8] | |
905 | + ldr $t3,[$tbl,$i3,lsl#2] @ Td3[s0>>0] | |
906 | + | |
907 | + and $i1,lr,$s1 @ i0 = s1&0xff | |
908 | + and $i2,lr,$s1,lsr#16 | |
909 | + and $i3,lr,$s1,lsr#8 | |
910 | + mov $s1,$s1,lsr#24 | |
911 | + ldr $i1,[$tbl,$i1,lsl#2] @ Td3[s1>>0] | |
912 | + ldr $s1,[$tbl,$s1,lsl#2] @ Td0[s1>>24] | |
913 | + ldr $i2,[$tbl,$i2,lsl#2] @ Td1[s1>>16] | |
914 | + ldr $i3,[$tbl,$i3,lsl#2] @ Td2[s1>>8] | |
915 | + eor $s0,$s0,$i1,ror#24 | |
916 | + eor $s1,$s1,$t1,ror#8 | |
917 | + eor $t2,$i2,$t2,ror#8 | |
918 | + eor $t3,$i3,$t3,ror#8 | |
919 | + | |
920 | + and $i1,lr,$s2,lsr#8 @ i0 = (s2>>8)&0xff | |
921 | + and $i2,lr,$s2 @ i1 = s2&0xff | |
922 | + and $i3,lr,$s2,lsr#16 | |
923 | + mov $s2,$s2,lsr#24 | |
924 | + ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8] | |
925 | + ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0] | |
926 | + ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24] | |
927 | + ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16] | |
928 | + eor $s0,$s0,$i1,ror#16 | |
929 | + eor $s1,$s1,$i2,ror#24 | |
930 | + eor $s2,$s2,$t2,ror#8 | |
931 | + eor $t3,$i3,$t3,ror#8 | |
932 | + | |
933 | + and $i1,lr,$s3,lsr#16 @ i0 = (s3>>16)&0xff | |
934 | + and $i2,lr,$s3,lsr#8 @ i1 = (s3>>8)&0xff | |
935 | + and $i3,lr,$s3 @ i2 = s3&0xff | |
936 | + mov $s3,$s3,lsr#24 | |
937 | + ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16] | |
938 | + ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8] | |
939 | + ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0] | |
940 | + ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24] | |
941 | + eor $s0,$s0,$i1,ror#8 | |
942 | + eor $s1,$s1,$i2,ror#16 | |
943 | + eor $s2,$s2,$i3,ror#24 | |
944 | + eor $s3,$s3,$t3,ror#8 | |
945 | + | |
946 | + ldr $t1,[$key],#16 | |
947 | + ldr $t2,[$key,#-12] | |
948 | + ldr $t3,[$key,#-8] | |
949 | + ldr $i1,[$key,#-4] | |
950 | + eor $s0,$s0,$t1 | |
951 | + eor $s1,$s1,$t2 | |
952 | + eor $s2,$s2,$t3 | |
953 | + eor $s3,$s3,$i1 | |
954 | + | |
955 | + subs $rounds,$rounds,#1 | |
956 | + bne .Ldec_loop | |
957 | + | |
958 | + add $tbl,$tbl,#1024 | |
959 | + | |
960 | + ldr $t1,[$tbl,#0] @ prefetch Td4: eight loads, one per 32 bytes; the loaded values are overwritten before use | |
961 | + ldr $t2,[$tbl,#32] | |
962 | + ldr $t3,[$tbl,#64] | |
963 | + ldr $i1,[$tbl,#96] | |
964 | + ldr $i2,[$tbl,#128] | |
965 | + ldr $i3,[$tbl,#160] | |
966 | + ldr $t1,[$tbl,#192] | |
967 | + ldr $t2,[$tbl,#224] | |
968 | + | |
969 | + and $i1,lr,$s0,lsr#16 | |
970 | + and $i2,lr,$s0,lsr#8 | |
971 | + and $i3,lr,$s0 | |
972 | + ldrb $s0,[$tbl,$s0,lsr#24] @ Td4[s0>>24] (byte table, reached by the 1024-byte advance above) | |
973 | + ldrb $t1,[$tbl,$i1] @ Td4[s0>>16] | |
974 | + ldrb $t2,[$tbl,$i2] @ Td4[s0>>8] | |
975 | + ldrb $t3,[$tbl,$i3] @ Td4[s0>>0] | |
976 | + | |
977 | + and $i1,lr,$s1 @ i0 = s1&0xff | |
978 | + and $i2,lr,$s1,lsr#16 | |
979 | + and $i3,lr,$s1,lsr#8 | |
980 | + ldrb $i1,[$tbl,$i1] @ Td4[s1>>0] | |
981 | + ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24] | |
982 | + ldrb $i2,[$tbl,$i2] @ Td4[s1>>16] | |
983 | + ldrb $i3,[$tbl,$i3] @ Td4[s1>>8] | |
984 | + eor $s0,$i1,$s0,lsl#24 | |
985 | + eor $s1,$t1,$s1,lsl#8 | |
986 | + eor $t2,$t2,$i2,lsl#8 | |
987 | + eor $t3,$t3,$i3,lsl#8 | |
988 | + | |
989 | + and $i1,lr,$s2,lsr#8 @ i0 = (s2>>8)&0xff | |
990 | + and $i2,lr,$s2 @ i1 = s2&0xff | |
991 | + and $i3,lr,$s2,lsr#16 | |
992 | + ldrb $i1,[$tbl,$i1] @ Td4[s2>>8] | |
993 | + ldrb $i2,[$tbl,$i2] @ Td4[s2>>0] | |
994 | + ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24] | |
995 | + ldrb $i3,[$tbl,$i3] @ Td4[s2>>16] | |
996 | + eor $s0,$s0,$i1,lsl#8 | |
997 | + eor $s1,$i2,$s1,lsl#16 | |
998 | + eor $s2,$t2,$s2,lsl#16 | |
999 | + eor $t3,$t3,$i3,lsl#16 | |
1000 | + | |
1001 | + and $i1,lr,$s3,lsr#16 @ i0 = (s3>>16)&0xff | |
1002 | + and $i2,lr,$s3,lsr#8 @ i1 = (s3>>8)&0xff | |
1003 | + and $i3,lr,$s3 @ i2 = s3&0xff | |
1004 | + ldrb $i1,[$tbl,$i1] @ Td4[s3>>16] | |
1005 | + ldrb $i2,[$tbl,$i2] @ Td4[s3>>8] | |
1006 | + ldrb $i3,[$tbl,$i3] @ Td4[s3>>0] | |
1007 | + ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24] | |
1008 | + eor $s0,$s0,$i1,lsl#16 | |
1009 | + eor $s1,$s1,$i2,lsl#8 | |
1010 | + eor $s2,$i3,$s2,lsl#8 | |
1011 | + eor $s3,$t3,$s3,lsl#24 | |
1012 | + | |
1013 | + ldr lr,[sp],#4 @ pop lr (return address saved on entry) | |
1014 | + ldr $t1,[$key,#0] | |
1015 | + ldr $t2,[$key,#4] | |
1016 | + ldr $t3,[$key,#8] | |
1017 | + ldr $i1,[$key,#12] | |
1018 | + eor $s0,$s0,$t1 | |
1019 | + eor $s1,$s1,$t2 | |
1020 | + eor $s2,$s2,$t3 | |
1021 | + eor $s3,$s3,$i1 | |
1022 | + | |
1023 | + sub $tbl,$tbl,#1024 | |
1024 | + mov pc,lr @ return; the table pointer is back at its entry value | |
1025 | +.size _armv4_AES_decrypt,.-_armv4_AES_decrypt | |
1026 | +.asciz "AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" | |
1027 | +___ | |
1028 | + | |
1029 | +$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # emit every "bx lr" as a raw .word so the output can be compiled with -march=armv4 | |
1030 | +print $code; |
@@ -0,0 +1,1176 @@ | ||
1 | +#!/usr/bin/env perl | |
2 | + | |
3 | +# ==================================================================== | |
4 | +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
5 | +# project. The module is, however, dual licensed under OpenSSL and | |
6 | +# CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | +# details see http://www.openssl.org/~appro/cryptogams/. | |
8 | +# ==================================================================== | |
9 | + | |
10 | +# Needs more work: key setup, page boundaries, CBC routine... | |
11 | +# | |
12 | +# ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with | |
13 | +# 128-bit key, which is ~40% better than 64-bit code generated by gcc | |
14 | +# 4.0. But these are not the ones currently used! Their "compact" | |
15 | +# counterparts are, for security reasons. ppc_AES_encrypt_compact runs | |
16 | +# at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact runs | |
17 | +# at 1/3 of ppc_AES_decrypt speed. | |
18 | + | |
19 | +$flavour = shift; # first command-line argument selects the target flavour | |
20 | + | |
21 | +{ # Scoped lookup: pointer size plus store-with-update, load and store mnemonics. | |
22 | + my %abi = ( | |
23 | + 64 => [ 8, "stdu", "ld", "std" ], | |
24 | + 32 => [ 4, "stwu", "lwz", "stw" ], | |
25 | + ); | |
26 | + # 64 takes precedence over 32 when both numbers appear in the flavour. | |
27 | + my ($bits) = grep { $flavour =~ /$_/ } (64, 32); | |
28 | + | |
29 | + ($SIZE_T,$STU,$POP,$PUSH) = @{$abi{$bits}} if defined($bits); | |
30 | + | |
31 | + die "nonsense $flavour" unless defined($bits); } | |
32 | + | |
33 | +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of the path this script was started with | |
34 | +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or # look for the translator beside this script ... | |
35 | +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or # ... then in ../../perlasm | |
36 | +die "can't locate ppc-xlate.pl"; | |
37 | +# Pipe STDOUT through the translator; low-precedence "or" makes die test the result of open itself. | |
38 | +open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!"; | |
39 | + | |
40 | +$FRAME=32*$SIZE_T; | |
41 | + | |
42 | +sub _data_word # append one ".long w,w" line to $code per argument (each word is emitted twice) | |
43 | +{ my $word; # no "()" prototype: the sub consumes an argument list, stopping at the first undef | |
44 | + while(defined($word=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$word,$word; } | |
45 | +} | |
46 | + | |
47 | +$sp="r1"; # stack pointer: the entry points allocate and release their frame through it | |
48 | +$toc="r2"; # saved and restored by the entry points | |
49 | +$inp="r3"; # pointer to the 16-byte input block | |
50 | +$out="r4"; # pointer to the 16-byte output block | |
51 | +$key="r5"; # pointer to the key data (round keys; round count read from offset 240) | |
52 | + | |
53 | +$Tbl0="r3"; # table base returned by LAES_Te/LAES_Td; shares r3 with $inp | |
54 | +$Tbl1="r6"; | |
55 | +$Tbl2="r7"; | |
56 | +$Tbl3="r2"; # shares r2 with $toc; reassigned below when SIZE_T is not 8 | |
57 | + | |
58 | +$s0="r8"; # s0..s3: the four 32-bit words of the block being processed | |
59 | +$s1="r9"; | |
60 | +$s2="r10"; | |
61 | +$s3="r11"; | |
62 | + | |
63 | +$t0="r12"; # t0..t3: round-key words and intermediate results | |
64 | +$t1="r13"; | |
65 | +$t2="r14"; | |
66 | +$t3="r15"; | |
67 | + | |
68 | +$acc00="r16"; # acc00..acc15: scratch registers for table indices and looked-up values | |
69 | +$acc01="r17"; | |
70 | +$acc02="r18"; | |
71 | +$acc03="r19"; | |
72 | + | |
73 | +$acc04="r20"; | |
74 | +$acc05="r21"; | |
75 | +$acc06="r22"; | |
76 | +$acc07="r23"; | |
77 | + | |
78 | +$acc08="r24"; | |
79 | +$acc09="r25"; | |
80 | +$acc10="r26"; | |
81 | +$acc11="r27"; | |
82 | + | |
83 | +$acc12="r28"; | |
84 | +$acc13="r29"; | |
85 | +$acc14="r30"; | |
86 | +$acc15="r31"; | |
87 | + | |
88 | +# stay away from TLS pointer: r13 is given up when SIZE_T is 8, r2 otherwise (NOTE(review): presumably the thread-pointer register of each ABI - confirm) | |
89 | +if ($SIZE_T==8) { die if ($t1 ne "r13"); $t1="r0"; } # $t1 moves from r13 to r0 | |
90 | +else { die if ($Tbl3 ne "r2"); $Tbl3=$t0; $t0="r0"; } # $Tbl3 takes over the old $t0 register, $t0 moves to r0 | |
91 | +$mask80=$Tbl2; # Lppc_AES_encrypt_compact loads 0x80808080 into this register | |
92 | +$mask1b=$Tbl3; # Lppc_AES_encrypt_compact loads 0x1b1b1b1b into this register | |
93 | + | |
94 | +$code.=<<___; # LAES_Te/LAES_Td: return in $Tbl0 the address of table data placed after this 128-byte stub area | |
95 | +.machine "any" | |
96 | +.text | |
97 | + | |
98 | +.align 7 | |
99 | +LAES_Te: | |
100 | + mflr r0 ; keep the return address of the caller in r0 | |
101 | + bcl 20,31,\$+4 ; branch-and-link to the next instruction, so LR now holds its address | |
102 | + mflr $Tbl0 ; vvvvv "distance" between . and 1st data entry | |
103 | + addi $Tbl0,$Tbl0,`128-8` ; 8 = offset of the mflr above from LAES_Te, 128 = size of the stub area | |
104 | + mtlr r0 ; put the return address of the caller back | |
105 | + blr ; six instructions = 24 bytes, padded below to 32 | |
106 | + .space `32-24` | |
107 | +LAES_Td: | |
108 | + mflr r0 ; same sequence as LAES_Te | |
109 | + bcl 20,31,\$+4 | |
110 | + mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry | |
111 | + addi $Tbl0,$Tbl0,`128-8-32+2048+256` ; as above, less the 32 bytes of LAES_Te, plus the 2048-byte word table and 256-byte byte table that come first | |
112 | + mtlr r0 | |
113 | + blr ; padded below so that both stubs together fill 128 bytes | |
114 | + .space `128-32-24` | |
115 | +___ | |
116 | +&_data_word( | |
117 | + 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, | |
118 | + 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554, | |
119 | + 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d, | |
120 | + 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a, | |
121 | + 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87, | |
122 | + 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b, | |
123 | + 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea, | |
124 | + 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b, | |
125 | + 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a, | |
126 | + 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f, | |
127 | + 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108, | |
128 | + 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f, | |
129 | + 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e, | |
130 | + 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5, | |
131 | + 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d, | |
132 | + 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f, | |
133 | + 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e, | |
134 | + 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb, | |
135 | + 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce, | |
136 | + 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497, | |
137 | + 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c, | |
138 | + 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed, | |
139 | + 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b, | |
140 | + 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a, | |
141 | + 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16, | |
142 | + 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594, | |
143 | + 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81, | |
144 | + 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3, | |
145 | + 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a, | |
146 | + 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504, | |
147 | + 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163, | |
148 | + 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d, | |
149 | + 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f, | |
150 | + 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739, | |
151 | + 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47, | |
152 | + 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395, | |
153 | + 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f, | |
154 | + 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883, | |
155 | + 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c, | |
156 | + 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76, | |
157 | + 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e, | |
158 | + 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4, | |
159 | + 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6, | |
160 | + 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b, | |
161 | + 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7, | |
162 | + 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0, | |
163 | + 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25, | |
164 | + 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818, | |
165 | + 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72, | |
166 | + 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651, | |
167 | + 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21, | |
168 | + 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85, | |
169 | + 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa, | |
170 | + 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12, | |
171 | + 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0, | |
172 | + 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9, | |
173 | + 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133, | |
174 | + 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7, | |
175 | + 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920, | |
176 | + 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a, | |
177 | + 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17, | |
178 | + 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8, | |
179 | + 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11, | |
180 | + 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a); | |
181 | +$code.=<<___; | |
182 | +.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 | |
183 | +.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 | |
184 | +.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 | |
185 | +.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 | |
186 | +.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc | |
187 | +.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 | |
188 | +.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a | |
189 | +.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 | |
190 | +.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 | |
191 | +.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 | |
192 | +.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b | |
193 | +.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf | |
194 | +.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 | |
195 | +.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 | |
196 | +.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 | |
197 | +.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 | |
198 | +.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 | |
199 | +.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 | |
200 | +.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 | |
201 | +.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb | |
202 | +.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c | |
203 | +.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 | |
204 | +.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 | |
205 | +.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 | |
206 | +.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 | |
207 | +.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a | |
208 | +.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e | |
209 | +.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e | |
210 | +.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 | |
211 | +.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf | |
212 | +.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 | |
213 | +.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 | |
214 | +___ | |
215 | +&_data_word( | |
216 | + 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96, | |
217 | + 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393, | |
218 | + 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25, | |
219 | + 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f, | |
220 | + 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1, | |
221 | + 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6, | |
222 | + 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da, | |
223 | + 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844, | |
224 | + 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd, | |
225 | + 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4, | |
226 | + 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45, | |
227 | + 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94, | |
228 | + 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7, | |
229 | + 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a, | |
230 | + 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5, | |
231 | + 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c, | |
232 | + 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1, | |
233 | + 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a, | |
234 | + 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75, | |
235 | + 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051, | |
236 | + 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46, | |
237 | + 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff, | |
238 | + 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77, | |
239 | + 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb, | |
240 | + 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000, | |
241 | + 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e, | |
242 | + 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927, | |
243 | + 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a, | |
244 | + 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e, | |
245 | + 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16, | |
246 | + 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d, | |
247 | + 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8, | |
248 | + 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd, | |
249 | + 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34, | |
250 | + 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163, | |
251 | + 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120, | |
252 | + 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d, | |
253 | + 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0, | |
254 | + 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422, | |
255 | + 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef, | |
256 | + 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36, | |
257 | + 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4, | |
258 | + 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662, | |
259 | + 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5, | |
260 | + 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3, | |
261 | + 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b, | |
262 | + 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8, | |
263 | + 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6, | |
264 | + 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6, | |
265 | + 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0, | |
266 | + 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815, | |
267 | + 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f, | |
268 | + 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df, | |
269 | + 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f, | |
270 | + 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e, | |
271 | + 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713, | |
272 | + 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89, | |
273 | + 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c, | |
274 | + 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf, | |
275 | + 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86, | |
276 | + 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f, | |
277 | + 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541, | |
278 | + 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190, | |
279 | + 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742); | |
280 | +$code.=<<___; | |
281 | +.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 | |
282 | +.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb | |
283 | +.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 | |
284 | +.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb | |
285 | +.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d | |
286 | +.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e | |
287 | +.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 | |
288 | +.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 | |
289 | +.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 | |
290 | +.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 | |
291 | +.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda | |
292 | +.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 | |
293 | +.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a | |
294 | +.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 | |
295 | +.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 | |
296 | +.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b | |
297 | +.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea | |
298 | +.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 | |
299 | +.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 | |
300 | +.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e | |
301 | +.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 | |
302 | +.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b | |
303 | +.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 | |
304 | +.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 | |
305 | +.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 | |
306 | +.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f | |
307 | +.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d | |
308 | +.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef | |
309 | +.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 | |
310 | +.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 | |
311 | +.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 | |
312 | +.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d | |
313 | + | |
314 | + | |
315 | +.globl .AES_encrypt | |
316 | +.align 7 | |
317 | +.AES_encrypt: | |
318 | + mflr r0 # entry point: r3 = input block, r4 = output block, r5 = key data; fetch the return address | |
319 | + $STU $sp,-$FRAME($sp) # allocate the stack frame | |
320 | + | |
321 | + $PUSH r0,`$FRAME-$SIZE_T*21`($sp) # save the return address, then r2 and r13-r31, at the top of the frame | |
322 | + $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) | |
323 | + $PUSH r13,`$FRAME-$SIZE_T*19`($sp) | |
324 | + $PUSH r14,`$FRAME-$SIZE_T*18`($sp) | |
325 | + $PUSH r15,`$FRAME-$SIZE_T*17`($sp) | |
326 | + $PUSH r16,`$FRAME-$SIZE_T*16`($sp) | |
327 | + $PUSH r17,`$FRAME-$SIZE_T*15`($sp) | |
328 | + $PUSH r18,`$FRAME-$SIZE_T*14`($sp) | |
329 | + $PUSH r19,`$FRAME-$SIZE_T*13`($sp) | |
330 | + $PUSH r20,`$FRAME-$SIZE_T*12`($sp) | |
331 | + $PUSH r21,`$FRAME-$SIZE_T*11`($sp) | |
332 | + $PUSH r22,`$FRAME-$SIZE_T*10`($sp) | |
333 | + $PUSH r23,`$FRAME-$SIZE_T*9`($sp) | |
334 | + $PUSH r24,`$FRAME-$SIZE_T*8`($sp) | |
335 | + $PUSH r25,`$FRAME-$SIZE_T*7`($sp) | |
336 | + $PUSH r26,`$FRAME-$SIZE_T*6`($sp) | |
337 | + $PUSH r27,`$FRAME-$SIZE_T*5`($sp) | |
338 | + $PUSH r28,`$FRAME-$SIZE_T*4`($sp) | |
339 | + $PUSH r29,`$FRAME-$SIZE_T*3`($sp) | |
340 | + $PUSH r30,`$FRAME-$SIZE_T*2`($sp) | |
341 | + $PUSH r31,`$FRAME-$SIZE_T*1`($sp) | |
342 | + | |
343 | + lwz $s0,0($inp) # load the 16-byte input block as four 32-bit words | |
344 | + lwz $s1,4($inp) | |
345 | + lwz $s2,8($inp) | |
346 | + lwz $s3,12($inp) | |
347 | + bl LAES_Te # table address into r3; the input pointer held there is no longer needed | |
348 | + bl Lppc_AES_encrypt_compact # process the four state words with the compact routine | |
349 | + stw $s0,0($out) # store the four result words to the output block | |
350 | + stw $s1,4($out) | |
351 | + stw $s2,8($out) | |
352 | + stw $s3,12($out) | |
353 | + | |
354 | + $POP r0,`$FRAME-$SIZE_T*21`($sp) # reload the return address and every register saved above | |
355 | + $POP $toc,`$FRAME-$SIZE_T*20`($sp) | |
356 | + $POP r13,`$FRAME-$SIZE_T*19`($sp) | |
357 | + $POP r14,`$FRAME-$SIZE_T*18`($sp) | |
358 | + $POP r15,`$FRAME-$SIZE_T*17`($sp) | |
359 | + $POP r16,`$FRAME-$SIZE_T*16`($sp) | |
360 | + $POP r17,`$FRAME-$SIZE_T*15`($sp) | |
361 | + $POP r18,`$FRAME-$SIZE_T*14`($sp) | |
362 | + $POP r19,`$FRAME-$SIZE_T*13`($sp) | |
363 | + $POP r20,`$FRAME-$SIZE_T*12`($sp) | |
364 | + $POP r21,`$FRAME-$SIZE_T*11`($sp) | |
365 | + $POP r22,`$FRAME-$SIZE_T*10`($sp) | |
366 | + $POP r23,`$FRAME-$SIZE_T*9`($sp) | |
367 | + $POP r24,`$FRAME-$SIZE_T*8`($sp) | |
368 | + $POP r25,`$FRAME-$SIZE_T*7`($sp) | |
369 | + $POP r26,`$FRAME-$SIZE_T*6`($sp) | |
370 | + $POP r27,`$FRAME-$SIZE_T*5`($sp) | |
371 | + $POP r28,`$FRAME-$SIZE_T*4`($sp) | |
372 | + $POP r29,`$FRAME-$SIZE_T*3`($sp) | |
373 | + $POP r30,`$FRAME-$SIZE_T*2`($sp) | |
374 | + $POP r31,`$FRAME-$SIZE_T*1`($sp) | |
375 | + mtlr r0 | |
376 | + addi $sp,$sp,$FRAME # release the stack frame | |
377 | + blr | |
378 | + | |
379 | +.align 4 | |
380 | +Lppc_AES_encrypt: | |
381 | + lwz $acc00,240($key) # round count, read from offset 240 of the key data | |
382 | + lwz $t0,0($key) # first round key | |
383 | + lwz $t1,4($key) | |
384 | + lwz $t2,8($key) | |
385 | + lwz $t3,12($key) | |
386 | + addi $Tbl1,$Tbl0,3 # Tbl1/Tbl2/Tbl3 = table base +3/+2/+1: every table word is stored twice, | |
387 | + addi $Tbl2,$Tbl0,2 # so a word load at these offsets reads a byte-rotated copy of an entry | |
388 | + addi $Tbl3,$Tbl0,1 | |
389 | + addi $acc00,$acc00,-1 # the loop runs one time fewer than the round count; the last round follows it | |
390 | + addi $key,$key,16 | |
391 | + xor $s0,$s0,$t0 # initial round-key addition | |
392 | + xor $s1,$s1,$t1 | |
393 | + xor $s2,$s2,$t2 | |
394 | + xor $s3,$s3,$t3 | |
395 | + mtctr $acc00 | |
396 | +.align 4 | |
397 | +Lenc_loop: | |
398 | + rlwinm $acc00,$s0,`32-24+3`,21,28 # top byte of the word times 8 = byte offset of its 8-byte table entry | |
399 | + rlwinm $acc01,$s1,`32-24+3`,21,28 | |
400 | + lwz $t0,0($key) # round key for this iteration, loaded between the index computations | |
401 | + lwz $t1,4($key) | |
402 | + rlwinm $acc02,$s2,`32-24+3`,21,28 | |
403 | + rlwinm $acc03,$s3,`32-24+3`,21,28 | |
404 | + lwz $t2,8($key) | |
405 | + lwz $t3,12($key) | |
406 | + rlwinm $acc04,$s1,`32-16+3`,21,28 # second byte, taken from the next state word | |
407 | + rlwinm $acc05,$s2,`32-16+3`,21,28 | |
408 | + lwzx $acc00,$Tbl0,$acc00 | |
409 | + lwzx $acc01,$Tbl0,$acc01 | |
410 | + rlwinm $acc06,$s3,`32-16+3`,21,28 | |
411 | + rlwinm $acc07,$s0,`32-16+3`,21,28 | |
412 | + lwzx $acc02,$Tbl0,$acc02 | |
413 | + lwzx $acc03,$Tbl0,$acc03 | |
414 | + rlwinm $acc08,$s2,`32-8+3`,21,28 # third byte | |
415 | + rlwinm $acc09,$s3,`32-8+3`,21,28 | |
416 | + lwzx $acc04,$Tbl1,$acc04 | |
417 | + lwzx $acc05,$Tbl1,$acc05 | |
418 | + rlwinm $acc10,$s0,`32-8+3`,21,28 | |
419 | + rlwinm $acc11,$s1,`32-8+3`,21,28 | |
420 | + lwzx $acc06,$Tbl1,$acc06 | |
421 | + lwzx $acc07,$Tbl1,$acc07 | |
422 | + rlwinm $acc12,$s3,`0+3`,21,28 # lowest byte | |
423 | + rlwinm $acc13,$s0,`0+3`,21,28 | |
424 | + lwzx $acc08,$Tbl2,$acc08 | |
425 | + lwzx $acc09,$Tbl2,$acc09 | |
426 | + rlwinm $acc14,$s1,`0+3`,21,28 | |
427 | + rlwinm $acc15,$s2,`0+3`,21,28 | |
428 | + lwzx $acc10,$Tbl2,$acc10 | |
429 | + lwzx $acc11,$Tbl2,$acc11 | |
430 | + xor $t0,$t0,$acc00 # fold the four lookups of each column into the round key | |
431 | + xor $t1,$t1,$acc01 | |
432 | + lwzx $acc12,$Tbl3,$acc12 | |
433 | + lwzx $acc13,$Tbl3,$acc13 | |
434 | + xor $t2,$t2,$acc02 | |
435 | + xor $t3,$t3,$acc03 | |
436 | + lwzx $acc14,$Tbl3,$acc14 | |
437 | + lwzx $acc15,$Tbl3,$acc15 | |
438 | + xor $t0,$t0,$acc04 | |
439 | + xor $t1,$t1,$acc05 | |
440 | + xor $t2,$t2,$acc06 | |
441 | + xor $t3,$t3,$acc07 | |
442 | + xor $t0,$t0,$acc08 | |
443 | + xor $t1,$t1,$acc09 | |
444 | + xor $t2,$t2,$acc10 | |
445 | + xor $t3,$t3,$acc11 | |
446 | + xor $s0,$t0,$acc12 # the result becomes the new state | |
447 | + xor $s1,$t1,$acc13 | |
448 | + xor $s2,$t2,$acc14 | |
449 | + xor $s3,$t3,$acc15 | |
450 | + addi $key,$key,16 # advance to the next round key | |
451 | + bdnz- Lenc_loop | |
452 | + | |
453 | + addi $Tbl2,$Tbl0,2048 # Tbl2 = byte table located 2048 bytes past the table base | |
454 | + nop | |
455 | + lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4 | |
456 | + lwz $acc09,`2048+32`($Tbl0) | |
457 | + lwz $acc10,`2048+64`($Tbl0) | |
458 | + lwz $acc11,`2048+96`($Tbl0) | |
459 | + lwz $acc08,`2048+128`($Tbl0) | |
460 | + lwz $acc09,`2048+160`($Tbl0) | |
461 | + lwz $acc10,`2048+192`($Tbl0) | |
462 | + lwz $acc11,`2048+224`($Tbl0) | |
463 | + rlwinm $acc00,$s0,`32-24`,24,31 # last round: plain byte indices into the byte table | |
464 | + rlwinm $acc01,$s1,`32-24`,24,31 | |
465 | + lwz $t0,0($key) # last round key | |
466 | + lwz $t1,4($key) | |
467 | + rlwinm $acc02,$s2,`32-24`,24,31 | |
468 | + rlwinm $acc03,$s3,`32-24`,24,31 | |
469 | + lwz $t2,8($key) | |
470 | + lwz $t3,12($key) | |
471 | + rlwinm $acc04,$s1,`32-16`,24,31 | |
472 | + rlwinm $acc05,$s2,`32-16`,24,31 | |
473 | + lbzx $acc00,$Tbl2,$acc00 | |
474 | + lbzx $acc01,$Tbl2,$acc01 | |
475 | + rlwinm $acc06,$s3,`32-16`,24,31 | |
476 | + rlwinm $acc07,$s0,`32-16`,24,31 | |
477 | + lbzx $acc02,$Tbl2,$acc02 | |
478 | + lbzx $acc03,$Tbl2,$acc03 | |
479 | + rlwinm $acc08,$s2,`32-8`,24,31 | |
480 | + rlwinm $acc09,$s3,`32-8`,24,31 | |
481 | + lbzx $acc04,$Tbl2,$acc04 | |
482 | + lbzx $acc05,$Tbl2,$acc05 | |
483 | + rlwinm $acc10,$s0,`32-8`,24,31 | |
484 | + rlwinm $acc11,$s1,`32-8`,24,31 | |
485 | + lbzx $acc06,$Tbl2,$acc06 | |
486 | + lbzx $acc07,$Tbl2,$acc07 | |
487 | + rlwinm $acc12,$s3,`0`,24,31 | |
488 | + rlwinm $acc13,$s0,`0`,24,31 | |
489 | + lbzx $acc08,$Tbl2,$acc08 | |
490 | + lbzx $acc09,$Tbl2,$acc09 | |
491 | + rlwinm $acc14,$s1,`0`,24,31 | |
492 | + rlwinm $acc15,$s2,`0`,24,31 | |
493 | + lbzx $acc10,$Tbl2,$acc10 | |
494 | + lbzx $acc11,$Tbl2,$acc11 | |
495 | + rlwinm $s0,$acc00,24,0,7 # reassemble each output word from its four looked-up bytes, top byte first | |
496 | + rlwinm $s1,$acc01,24,0,7 | |
497 | + lbzx $acc12,$Tbl2,$acc12 | |
498 | + lbzx $acc13,$Tbl2,$acc13 | |
499 | + rlwinm $s2,$acc02,24,0,7 | |
500 | + rlwinm $s3,$acc03,24,0,7 | |
501 | + lbzx $acc14,$Tbl2,$acc14 | |
502 | + lbzx $acc15,$Tbl2,$acc15 | |
503 | + rlwimi $s0,$acc04,16,8,15 | |
504 | + rlwimi $s1,$acc05,16,8,15 | |
505 | + rlwimi $s2,$acc06,16,8,15 | |
506 | + rlwimi $s3,$acc07,16,8,15 | |
507 | + rlwimi $s0,$acc08,8,16,23 | |
508 | + rlwimi $s1,$acc09,8,16,23 | |
509 | + rlwimi $s2,$acc10,8,16,23 | |
510 | + rlwimi $s3,$acc11,8,16,23 | |
511 | + or $s0,$s0,$acc12 | |
512 | + or $s1,$s1,$acc13 | |
513 | + or $s2,$s2,$acc14 | |
514 | + or $s3,$s3,$acc15 | |
515 | + xor $s0,$s0,$t0 # final round-key addition | |
516 | + xor $s1,$s1,$t1 | |
517 | + xor $s2,$s2,$t2 | |
518 | + xor $s3,$s3,$t3 | |
519 | + blr | |
520 | + | |
521 | +.align 4 | |
522 | +Lppc_AES_encrypt_compact: | |
523 | + lwz $acc00,240($key) # round count, read from offset 240 of the key data | |
524 | + lwz $t0,0($key) # first round key | |
525 | + lwz $t1,4($key) | |
526 | + lwz $t2,8($key) | |
527 | + lwz $t3,12($key) | |
528 | + addi $Tbl1,$Tbl0,2048 # Tbl1 = 256-byte table located 2048 bytes past the table base | |
529 | + lis $mask80,0x8080 # mask80 = 0x80808080 and mask1b = 0x1b1b1b1b, each built with lis + ori | |
530 | + lis $mask1b,0x1b1b | |
531 | + addi $key,$key,16 | |
532 | + ori $mask80,$mask80,0x8080 | |
533 | + ori $mask1b,$mask1b,0x1b1b | |
534 | + mtctr $acc00 # one loop iteration per round | |
535 | +.align 4 | |
536 | +Lenc_compact_loop: | |
537 | + xor $s0,$s0,$t0 # add the current round key to the state | |
538 | + xor $s1,$s1,$t1 | |
539 | + xor $s2,$s2,$t2 | |
540 | + xor $s3,$s3,$t3 | |
541 | + rlwinm $acc00,$s0,`32-24`,24,31 # split the state into bytes and replace each one through the byte table | |
542 | + rlwinm $acc01,$s1,`32-24`,24,31 | |
543 | + rlwinm $acc02,$s2,`32-24`,24,31 | |
544 | + rlwinm $acc03,$s3,`32-24`,24,31 | |
545 | + lbzx $acc00,$Tbl1,$acc00 | |
546 | + lbzx $acc01,$Tbl1,$acc01 | |
547 | + rlwinm $acc04,$s1,`32-16`,24,31 | |
548 | + rlwinm $acc05,$s2,`32-16`,24,31 | |
549 | + lbzx $acc02,$Tbl1,$acc02 | |
550 | + lbzx $acc03,$Tbl1,$acc03 | |
551 | + rlwinm $acc06,$s3,`32-16`,24,31 | |
552 | + rlwinm $acc07,$s0,`32-16`,24,31 | |
553 | + lbzx $acc04,$Tbl1,$acc04 | |
554 | + lbzx $acc05,$Tbl1,$acc05 | |
555 | + rlwinm $acc08,$s2,`32-8`,24,31 | |
556 | + rlwinm $acc09,$s3,`32-8`,24,31 | |
557 | + lbzx $acc06,$Tbl1,$acc06 | |
558 | + lbzx $acc07,$Tbl1,$acc07 | |
559 | + rlwinm $acc10,$s0,`32-8`,24,31 | |
560 | + rlwinm $acc11,$s1,`32-8`,24,31 | |
561 | + lbzx $acc08,$Tbl1,$acc08 | |
562 | + lbzx $acc09,$Tbl1,$acc09 | |
563 | + rlwinm $acc12,$s3,`0`,24,31 | |
564 | + rlwinm $acc13,$s0,`0`,24,31 | |
565 | + lbzx $acc10,$Tbl1,$acc10 | |
566 | + lbzx $acc11,$Tbl1,$acc11 | |
567 | + rlwinm $acc14,$s1,`0`,24,31 | |
568 | + rlwinm $acc15,$s2,`0`,24,31 | |
569 | + lbzx $acc12,$Tbl1,$acc12 | |
570 | + lbzx $acc13,$Tbl1,$acc13 | |
571 | + rlwinm $s0,$acc00,24,0,7 # rebuild s0..s3; each word takes one byte from each of the four old state words | |
572 | + rlwinm $s1,$acc01,24,0,7 | |
573 | + lbzx $acc14,$Tbl1,$acc14 | |
574 | + lbzx $acc15,$Tbl1,$acc15 | |
575 | + rlwinm $s2,$acc02,24,0,7 | |
576 | + rlwinm $s3,$acc03,24,0,7 | |
577 | + rlwimi $s0,$acc04,16,8,15 | |
578 | + rlwimi $s1,$acc05,16,8,15 | |
579 | + rlwimi $s2,$acc06,16,8,15 | |
580 | + rlwimi $s3,$acc07,16,8,15 | |
581 | + rlwimi $s0,$acc08,8,16,23 | |
582 | + rlwimi $s1,$acc09,8,16,23 | |
583 | + rlwimi $s2,$acc10,8,16,23 | |
584 | + rlwimi $s3,$acc11,8,16,23 | |
585 | + lwz $t0,0($key) # load the next round key | |
586 | + lwz $t1,4($key) | |
587 | + or $s0,$s0,$acc12 | |
588 | + or $s1,$s1,$acc13 | |
589 | + lwz $t2,8($key) | |
590 | + lwz $t3,12($key) | |
591 | + or $s2,$s2,$acc14 | |
592 | + or $s3,$s3,$acc15 | |
593 | + | |
594 | + addi $key,$key,16 | |
595 | + bdz Lenc_compact_done # counter exhausted: this was the last round, skip the mixing step below | |
596 | + | |
597 | + and $acc00,$s0,$mask80 # r1=r0&0x80808080 | |
598 | + and $acc01,$s1,$mask80 | |
599 | + and $acc02,$s2,$mask80 | |
600 | + and $acc03,$s3,$mask80 | |
601 | + srwi $acc04,$acc00,7 # r1>>7 | |
602 | + srwi $acc05,$acc01,7 | |
603 | + srwi $acc06,$acc02,7 | |
604 | + srwi $acc07,$acc03,7 | |
605 | + andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f | |
606 | + andc $acc09,$s1,$mask80 | |
607 | + andc $acc10,$s2,$mask80 | |
608 | + andc $acc11,$s3,$mask80 | |
609 | + sub $acc00,$acc00,$acc04 # r1-(r1>>7) | |
610 | + sub $acc01,$acc01,$acc05 | |
611 | + sub $acc02,$acc02,$acc06 | |
612 | + sub $acc03,$acc03,$acc07 | |
613 | + add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1 | |
614 | + add $acc09,$acc09,$acc09 | |
615 | + add $acc10,$acc10,$acc10 | |
616 | + add $acc11,$acc11,$acc11 | |
617 | + and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b | |
618 | + and $acc01,$acc01,$mask1b | |
619 | + and $acc02,$acc02,$mask1b | |
620 | + and $acc03,$acc03,$mask1b | |
621 | + xor $acc00,$acc00,$acc08 # r2 | |
622 | + xor $acc01,$acc01,$acc09 | |
623 | + xor $acc02,$acc02,$acc10 | |
624 | + xor $acc03,$acc03,$acc11 | |
625 | + | |
626 | + rotlwi $acc12,$s0,16 # ROTATE(r0,16) | |
627 | + rotlwi $acc13,$s1,16 | |
628 | + rotlwi $acc14,$s2,16 | |
629 | + rotlwi $acc15,$s3,16 | |
630 | + xor $s0,$s0,$acc00 # r0^r2 | |
631 | + xor $s1,$s1,$acc01 | |
632 | + xor $s2,$s2,$acc02 | |
633 | + xor $s3,$s3,$acc03 | |
634 | + rotrwi $s0,$s0,24 # ROTATE(r2^r0,24) | |
635 | + rotrwi $s1,$s1,24 | |
636 | + rotrwi $s2,$s2,24 | |
637 | + rotrwi $s3,$s3,24 | |
638 | + xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2 | |
639 | + xor $s1,$s1,$acc01 | |
640 | + xor $s2,$s2,$acc02 | |
641 | + xor $s3,$s3,$acc03 | |
642 | + rotlwi $acc08,$acc12,8 # ROTATE(r0,24) | |
643 | + rotlwi $acc09,$acc13,8 | |
644 | + rotlwi $acc10,$acc14,8 | |
645 | + rotlwi $acc11,$acc15,8 | |
646 | + xor $s0,$s0,$acc12 # ^ROTATE(r0,16) | |
647 | + xor $s1,$s1,$acc13 | |
648 | + xor $s2,$s2,$acc14 | |
649 | + xor $s3,$s3,$acc15 | |
650 | + xor $s0,$s0,$acc08 # ^ROTATE(r0,24) | |
651 | + xor $s1,$s1,$acc09 | |
652 | + xor $s2,$s2,$acc10 | |
653 | + xor $s3,$s3,$acc11 | |
654 | + | |
655 | + b Lenc_compact_loop | |
656 | +.align 4 | |
657 | +Lenc_compact_done: | |
658 | + xor $s0,$s0,$t0 # final round-key addition | |
659 | + xor $s1,$s1,$t1 | |
660 | + xor $s2,$s2,$t2 | |
661 | + xor $s3,$s3,$t3 | |
662 | + blr | |
663 | + | |
664 | +.globl .AES_decrypt | |
665 | +.align 7 | |
666 | +.AES_decrypt: | |
667 | + mflr r0 # entry point: r3 = input block, r4 = output block, r5 = key data; fetch the return address | |
668 | + $STU $sp,-$FRAME($sp) # allocate the stack frame | |
669 | + | |
670 | + $PUSH r0,`$FRAME-$SIZE_T*21`($sp) # save the return address, then r2 and r13-r31, at the top of the frame | |
671 | + $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) | |
672 | + $PUSH r13,`$FRAME-$SIZE_T*19`($sp) | |
673 | + $PUSH r14,`$FRAME-$SIZE_T*18`($sp) | |
674 | + $PUSH r15,`$FRAME-$SIZE_T*17`($sp) | |
675 | + $PUSH r16,`$FRAME-$SIZE_T*16`($sp) | |
676 | + $PUSH r17,`$FRAME-$SIZE_T*15`($sp) | |
677 | + $PUSH r18,`$FRAME-$SIZE_T*14`($sp) | |
678 | + $PUSH r19,`$FRAME-$SIZE_T*13`($sp) | |
679 | + $PUSH r20,`$FRAME-$SIZE_T*12`($sp) | |
680 | + $PUSH r21,`$FRAME-$SIZE_T*11`($sp) | |
681 | + $PUSH r22,`$FRAME-$SIZE_T*10`($sp) | |
682 | + $PUSH r23,`$FRAME-$SIZE_T*9`($sp) | |
683 | + $PUSH r24,`$FRAME-$SIZE_T*8`($sp) | |
684 | + $PUSH r25,`$FRAME-$SIZE_T*7`($sp) | |
685 | + $PUSH r26,`$FRAME-$SIZE_T*6`($sp) | |
686 | + $PUSH r27,`$FRAME-$SIZE_T*5`($sp) | |
687 | + $PUSH r28,`$FRAME-$SIZE_T*4`($sp) | |
688 | + $PUSH r29,`$FRAME-$SIZE_T*3`($sp) | |
689 | + $PUSH r30,`$FRAME-$SIZE_T*2`($sp) | |
690 | + $PUSH r31,`$FRAME-$SIZE_T*1`($sp) | |
691 | + | |
692 | + lwz $s0,0($inp) # load the 16-byte input block as four 32-bit words | |
693 | + lwz $s1,4($inp) | |
694 | + lwz $s2,8($inp) | |
695 | + lwz $s3,12($inp) | |
696 | + bl LAES_Td # table address into r3; the input pointer held there is no longer needed | |
697 | + bl Lppc_AES_decrypt_compact # process the four state words with the compact routine | |
698 | + stw $s0,0($out) # store the four result words to the output block | |
699 | + stw $s1,4($out) | |
700 | + stw $s2,8($out) | |
701 | + stw $s3,12($out) | |
702 | + | |
703 | + $POP r0,`$FRAME-$SIZE_T*21`($sp) # reload the return address and every register saved above | |
704 | + $POP $toc,`$FRAME-$SIZE_T*20`($sp) | |
705 | + $POP r13,`$FRAME-$SIZE_T*19`($sp) | |
706 | + $POP r14,`$FRAME-$SIZE_T*18`($sp) | |
707 | + $POP r15,`$FRAME-$SIZE_T*17`($sp) | |
708 | + $POP r16,`$FRAME-$SIZE_T*16`($sp) | |
709 | + $POP r17,`$FRAME-$SIZE_T*15`($sp) | |
710 | + $POP r18,`$FRAME-$SIZE_T*14`($sp) | |
711 | + $POP r19,`$FRAME-$SIZE_T*13`($sp) | |
712 | + $POP r20,`$FRAME-$SIZE_T*12`($sp) | |
713 | + $POP r21,`$FRAME-$SIZE_T*11`($sp) | |
714 | + $POP r22,`$FRAME-$SIZE_T*10`($sp) | |
715 | + $POP r23,`$FRAME-$SIZE_T*9`($sp) | |
716 | + $POP r24,`$FRAME-$SIZE_T*8`($sp) | |
717 | + $POP r25,`$FRAME-$SIZE_T*7`($sp) | |
718 | + $POP r26,`$FRAME-$SIZE_T*6`($sp) | |
719 | + $POP r27,`$FRAME-$SIZE_T*5`($sp) | |
720 | + $POP r28,`$FRAME-$SIZE_T*4`($sp) | |
721 | + $POP r29,`$FRAME-$SIZE_T*3`($sp) | |
722 | + $POP r30,`$FRAME-$SIZE_T*2`($sp) | |
723 | + $POP r31,`$FRAME-$SIZE_T*1`($sp) | |
724 | + mtlr r0 | |
725 | + addi $sp,$sp,$FRAME # release the stack frame | |
726 | + blr | |
727 | + | |
728 | +.align 4 | |
729 | +Lppc_AES_decrypt: | |
730 | + lwz $acc00,240($key) # decrypt core (state in $s0-$s3, keys at $key, table at $Tbl0); word at key+240 seeds the loop count | |
731 | + lwz $t0,0($key) | |
732 | + lwz $t1,4($key) | |
733 | + lwz $t2,8($key) | |
734 | + lwz $t3,12($key) | |
735 | + addi $Tbl1,$Tbl0,3 # $Tbl1/$Tbl2/$Tbl3 = table base + 3/2/1 bytes | |
736 | + addi $Tbl2,$Tbl0,2 | |
737 | + addi $Tbl3,$Tbl0,1 | |
738 | + addi $acc00,$acc00,-1 # loop runs count-1 times; the last round is done after the loop | |
739 | + addi $key,$key,16 # step past the round key just loaded | |
740 | + xor $s0,$s0,$t0 # mix the first round key into the state | |
741 | + xor $s1,$s1,$t1 | |
742 | + xor $s2,$s2,$t2 | |
743 | + xor $s3,$s3,$t3 | |
744 | + mtctr $acc00 # CTR = loop count | |
745 | +.align 4 | |
746 | +Ldec_loop: | |
747 | + rlwinm $acc00,$s0,`32-24+3`,21,28 # top byte of the word, scaled by 8, as a table offset | |
748 | + rlwinm $acc01,$s1,`32-24+3`,21,28 | |
749 | + lwz $t0,0($key) # fetch the next round key into $t0-$t3 | |
750 | + lwz $t1,4($key) | |
751 | + rlwinm $acc02,$s2,`32-24+3`,21,28 | |
752 | + rlwinm $acc03,$s3,`32-24+3`,21,28 | |
753 | + lwz $t2,8($key) | |
754 | + lwz $t3,12($key) | |
755 | + rlwinm $acc04,$s3,`32-16+3`,21,28 # second-highest byte, scaled by 8 | |
756 | + rlwinm $acc05,$s0,`32-16+3`,21,28 | |
757 | + lwzx $acc00,$Tbl0,$acc00 | |
758 | + lwzx $acc01,$Tbl0,$acc01 | |
759 | + rlwinm $acc06,$s1,`32-16+3`,21,28 | |
760 | + rlwinm $acc07,$s2,`32-16+3`,21,28 | |
761 | + lwzx $acc02,$Tbl0,$acc02 | |
762 | + lwzx $acc03,$Tbl0,$acc03 | |
763 | + rlwinm $acc08,$s2,`32-8+3`,21,28 # third byte, scaled by 8 | |
764 | + rlwinm $acc09,$s3,`32-8+3`,21,28 | |
765 | + lwzx $acc04,$Tbl1,$acc04 | |
766 | + lwzx $acc05,$Tbl1,$acc05 | |
767 | + rlwinm $acc10,$s0,`32-8+3`,21,28 | |
768 | + rlwinm $acc11,$s1,`32-8+3`,21,28 | |
769 | + lwzx $acc06,$Tbl1,$acc06 | |
770 | + lwzx $acc07,$Tbl1,$acc07 | |
771 | + rlwinm $acc12,$s1,`0+3`,21,28 # low byte, scaled by 8 | |
772 | + rlwinm $acc13,$s2,`0+3`,21,28 | |
773 | + lwzx $acc08,$Tbl2,$acc08 | |
774 | + lwzx $acc09,$Tbl2,$acc09 | |
775 | + rlwinm $acc14,$s3,`0+3`,21,28 | |
776 | + rlwinm $acc15,$s0,`0+3`,21,28 | |
777 | + lwzx $acc10,$Tbl2,$acc10 | |
778 | + lwzx $acc11,$Tbl2,$acc11 | |
779 | + xor $t0,$t0,$acc00 # fold the looked-up table words into the round key | |
780 | + xor $t1,$t1,$acc01 | |
781 | + lwzx $acc12,$Tbl3,$acc12 | |
782 | + lwzx $acc13,$Tbl3,$acc13 | |
783 | + xor $t2,$t2,$acc02 | |
784 | + xor $t3,$t3,$acc03 | |
785 | + lwzx $acc14,$Tbl3,$acc14 | |
786 | + lwzx $acc15,$Tbl3,$acc15 | |
787 | + xor $t0,$t0,$acc04 | |
788 | + xor $t1,$t1,$acc05 | |
789 | + xor $t2,$t2,$acc06 | |
790 | + xor $t3,$t3,$acc07 | |
791 | + xor $t0,$t0,$acc08 | |
792 | + xor $t1,$t1,$acc09 | |
793 | + xor $t2,$t2,$acc10 | |
794 | + xor $t3,$t3,$acc11 | |
795 | + xor $s0,$t0,$acc12 # last xor writes the new state word | |
796 | + xor $s1,$t1,$acc13 | |
797 | + xor $s2,$t2,$acc14 | |
798 | + xor $s3,$t3,$acc15 | |
799 | + addi $key,$key,16 # advance to the following round key | |
800 | + bdnz- Ldec_loop # decrement CTR and loop until it reaches zero | |
801 | + | |
802 | + addi $Tbl2,$Tbl0,2048 # $Tbl2 = byte table located 2048 bytes past $Tbl0 | |
803 | + nop | |
804 | + lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4 | |
805 | + lwz $acc09,`2048+32`($Tbl0) | |
806 | + lwz $acc10,`2048+64`($Tbl0) | |
807 | + lwz $acc11,`2048+96`($Tbl0) | |
808 | + lwz $acc08,`2048+128`($Tbl0) | |
809 | + lwz $acc09,`2048+160`($Tbl0) | |
810 | + lwz $acc10,`2048+192`($Tbl0) | |
811 | + lwz $acc11,`2048+224`($Tbl0) # loaded values are never used; $acc08-$acc11 are recomputed below | |
812 | + rlwinm $acc00,$s0,`32-24`,24,31 # last round: unscaled byte values as indices for lbzx | |
813 | + rlwinm $acc01,$s1,`32-24`,24,31 | |
814 | + lwz $t0,0($key) # last round key | |
815 | + lwz $t1,4($key) | |
816 | + rlwinm $acc02,$s2,`32-24`,24,31 | |
817 | + rlwinm $acc03,$s3,`32-24`,24,31 | |
818 | + lwz $t2,8($key) | |
819 | + lwz $t3,12($key) | |
820 | + rlwinm $acc04,$s3,`32-16`,24,31 | |
821 | + rlwinm $acc05,$s0,`32-16`,24,31 | |
822 | + lbzx $acc00,$Tbl2,$acc00 | |
823 | + lbzx $acc01,$Tbl2,$acc01 | |
824 | + rlwinm $acc06,$s1,`32-16`,24,31 | |
825 | + rlwinm $acc07,$s2,`32-16`,24,31 | |
826 | + lbzx $acc02,$Tbl2,$acc02 | |
827 | + lbzx $acc03,$Tbl2,$acc03 | |
828 | + rlwinm $acc08,$s2,`32-8`,24,31 | |
829 | + rlwinm $acc09,$s3,`32-8`,24,31 | |
830 | + lbzx $acc04,$Tbl2,$acc04 | |
831 | + lbzx $acc05,$Tbl2,$acc05 | |
832 | + rlwinm $acc10,$s0,`32-8`,24,31 | |
833 | + rlwinm $acc11,$s1,`32-8`,24,31 | |
834 | + lbzx $acc06,$Tbl2,$acc06 | |
835 | + lbzx $acc07,$Tbl2,$acc07 | |
836 | + rlwinm $acc12,$s1,`0`,24,31 | |
837 | + rlwinm $acc13,$s2,`0`,24,31 | |
838 | + lbzx $acc08,$Tbl2,$acc08 | |
839 | + lbzx $acc09,$Tbl2,$acc09 | |
840 | + rlwinm $acc14,$s3,`0`,24,31 | |
841 | + rlwinm $acc15,$s0,`0`,24,31 | |
842 | + lbzx $acc10,$Tbl2,$acc10 | |
843 | + lbzx $acc11,$Tbl2,$acc11 | |
844 | + rlwinm $s0,$acc00,24,0,7 # reassemble: first looked-up byte becomes the top byte | |
845 | + rlwinm $s1,$acc01,24,0,7 | |
846 | + lbzx $acc12,$Tbl2,$acc12 | |
847 | + lbzx $acc13,$Tbl2,$acc13 | |
848 | + rlwinm $s2,$acc02,24,0,7 | |
849 | + rlwinm $s3,$acc03,24,0,7 | |
850 | + lbzx $acc14,$Tbl2,$acc14 | |
851 | + lbzx $acc15,$Tbl2,$acc15 | |
852 | + rlwimi $s0,$acc04,16,8,15 # insert the second byte | |
853 | + rlwimi $s1,$acc05,16,8,15 | |
854 | + rlwimi $s2,$acc06,16,8,15 | |
855 | + rlwimi $s3,$acc07,16,8,15 | |
856 | + rlwimi $s0,$acc08,8,16,23 # insert the third byte | |
857 | + rlwimi $s1,$acc09,8,16,23 | |
858 | + rlwimi $s2,$acc10,8,16,23 | |
859 | + rlwimi $s3,$acc11,8,16,23 | |
860 | + or $s0,$s0,$acc12 # low byte | |
861 | + or $s1,$s1,$acc13 | |
862 | + or $s2,$s2,$acc14 | |
863 | + or $s3,$s3,$acc15 | |
864 | + xor $s0,$s0,$t0 # add the final round key | |
865 | + xor $s1,$s1,$t1 | |
866 | + xor $s2,$s2,$t2 | |
867 | + xor $s3,$s3,$t3 | |
868 | + blr # return through LR with the result in $s0-$s3 | |
869 | + | |
870 | +.align 4 | |
871 | +Lppc_AES_decrypt_compact: | |
872 | + lwz $acc00,240($key) # compact decrypt core: every lookup uses the byte table; word at key+240 becomes the loop count | |
873 | + lwz $t0,0($key) | |
874 | + lwz $t1,4($key) | |
875 | + lwz $t2,8($key) | |
876 | + lwz $t3,12($key) | |
877 | + addi $Tbl1,$Tbl0,2048 # $Tbl1 = byte table, 2048 bytes past $Tbl0 | |
878 | + lis $mask80,0x8080 # upper halfword of the 0x80808080 mask; ori below fills the lower one | |
879 | + lis $mask1b,0x1b1b # same construction for 0x1b1b1b1b | |
880 | + addi $key,$key,16 | |
881 | + ori $mask80,$mask80,0x8080 | |
882 | + ori $mask1b,$mask1b,0x1b1b | |
883 | +___ | |
884 | +$code.=<<___ if ($SIZE_T==8); | |
885 | + insrdi $mask80,$mask80,32,0 # copy the low 32 bits of the mask into the high 32 bits | |
886 | + insrdi $mask1b,$mask1b,32,0 | |
887 | +___ | |
888 | +$code.=<<___; | |
889 | + mtctr $acc00 # CTR = loop count; the loop is left through the bdz below | |
890 | +.align 4 | |
891 | +Ldec_compact_loop: | |
892 | + xor $s0,$s0,$t0 # add the round key loaded for this pass | |
893 | + xor $s1,$s1,$t1 | |
894 | + xor $s2,$s2,$t2 | |
895 | + xor $s3,$s3,$t3 | |
896 | + rlwinm $acc00,$s0,`32-24`,24,31 # top byte of each word as a byte-table index | |
897 | + rlwinm $acc01,$s1,`32-24`,24,31 | |
898 | + rlwinm $acc02,$s2,`32-24`,24,31 | |
899 | + rlwinm $acc03,$s3,`32-24`,24,31 | |
900 | + lbzx $acc00,$Tbl1,$acc00 | |
901 | + lbzx $acc01,$Tbl1,$acc01 | |
902 | + rlwinm $acc04,$s3,`32-16`,24,31 | |
903 | + rlwinm $acc05,$s0,`32-16`,24,31 | |
904 | + lbzx $acc02,$Tbl1,$acc02 | |
905 | + lbzx $acc03,$Tbl1,$acc03 | |
906 | + rlwinm $acc06,$s1,`32-16`,24,31 | |
907 | + rlwinm $acc07,$s2,`32-16`,24,31 | |
908 | + lbzx $acc04,$Tbl1,$acc04 | |
909 | + lbzx $acc05,$Tbl1,$acc05 | |
910 | + rlwinm $acc08,$s2,`32-8`,24,31 | |
911 | + rlwinm $acc09,$s3,`32-8`,24,31 | |
912 | + lbzx $acc06,$Tbl1,$acc06 | |
913 | + lbzx $acc07,$Tbl1,$acc07 | |
914 | + rlwinm $acc10,$s0,`32-8`,24,31 | |
915 | + rlwinm $acc11,$s1,`32-8`,24,31 | |
916 | + lbzx $acc08,$Tbl1,$acc08 | |
917 | + lbzx $acc09,$Tbl1,$acc09 | |
918 | + rlwinm $acc12,$s1,`0`,24,31 | |
919 | + rlwinm $acc13,$s2,`0`,24,31 | |
920 | + lbzx $acc10,$Tbl1,$acc10 | |
921 | + lbzx $acc11,$Tbl1,$acc11 | |
922 | + rlwinm $acc14,$s3,`0`,24,31 | |
923 | + rlwinm $acc15,$s0,`0`,24,31 | |
924 | + lbzx $acc12,$Tbl1,$acc12 | |
925 | + lbzx $acc13,$Tbl1,$acc13 | |
926 | + rlwinm $s0,$acc00,24,0,7 # rebuild each word from its four looked-up bytes, top byte first | |
927 | + rlwinm $s1,$acc01,24,0,7 | |
928 | + lbzx $acc14,$Tbl1,$acc14 | |
929 | + lbzx $acc15,$Tbl1,$acc15 | |
930 | + rlwinm $s2,$acc02,24,0,7 | |
931 | + rlwinm $s3,$acc03,24,0,7 | |
932 | + rlwimi $s0,$acc04,16,8,15 | |
933 | + rlwimi $s1,$acc05,16,8,15 | |
934 | + rlwimi $s2,$acc06,16,8,15 | |
935 | + rlwimi $s3,$acc07,16,8,15 | |
936 | + rlwimi $s0,$acc08,8,16,23 | |
937 | + rlwimi $s1,$acc09,8,16,23 | |
938 | + rlwimi $s2,$acc10,8,16,23 | |
939 | + rlwimi $s3,$acc11,8,16,23 | |
940 | + lwz $t0,0($key) # fetch the next round key | |
941 | + lwz $t1,4($key) | |
942 | + or $s0,$s0,$acc12 | |
943 | + or $s1,$s1,$acc13 | |
944 | + lwz $t2,8($key) | |
945 | + lwz $t3,12($key) | |
946 | + or $s2,$s2,$acc14 | |
947 | + or $s3,$s3,$acc15 | |
948 | + | |
949 | + addi $key,$key,16 | |
950 | + bdz Ldec_compact_done # on the last pass skip the mixing step below | |
951 | +___ | |
952 | +$code.=<<___ if ($SIZE_T==8); | |
953 | + # vectorized permutation improves decrypt performance by 10% | |
954 | + insrdi $s0,$s1,32,0 # put $s1 into the upper half of $s0 | |
955 | + insrdi $s2,$s3,32,0 # put $s3 into the upper half of $s2 | |
956 | + | |
957 | + and $acc00,$s0,$mask80 # r1=r0&0x80808080 | |
958 | + and $acc02,$s2,$mask80 | |
959 | + srdi $acc04,$acc00,7 # r1>>7 | |
960 | + srdi $acc06,$acc02,7 | |
961 | + andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f | |
962 | + andc $acc10,$s2,$mask80 | |
963 | + sub $acc00,$acc00,$acc04 # r1-(r1>>7) | |
964 | + sub $acc02,$acc02,$acc06 | |
965 | + add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1 | |
966 | + add $acc10,$acc10,$acc10 | |
967 | + and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b | |
968 | + and $acc02,$acc02,$mask1b | |
969 | + xor $acc00,$acc00,$acc08 # r2 | |
970 | + xor $acc02,$acc02,$acc10 | |
971 | + | |
972 | + and $acc04,$acc00,$mask80 # r1=r2&0x80808080 | |
973 | + and $acc06,$acc02,$mask80 | |
974 | + srdi $acc08,$acc04,7 # r1>>7 | |
975 | + srdi $acc10,$acc06,7 | |
976 | + andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f | |
977 | + andc $acc14,$acc02,$mask80 | |
978 | + sub $acc04,$acc04,$acc08 # r1-(r1>>7) | |
979 | + sub $acc06,$acc06,$acc10 | |
980 | + add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1 | |
981 | + add $acc14,$acc14,$acc14 | |
982 | + and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b | |
983 | + and $acc06,$acc06,$mask1b | |
984 | + xor $acc04,$acc04,$acc12 # r4 | |
985 | + xor $acc06,$acc06,$acc14 | |
986 | + | |
987 | + and $acc08,$acc04,$mask80 # r1=r4&0x80808080 | |
988 | + and $acc10,$acc06,$mask80 | |
989 | + srdi $acc12,$acc08,7 # r1>>7 | |
990 | + srdi $acc14,$acc10,7 | |
991 | + sub $acc08,$acc08,$acc12 # r1-(r1>>7) | |
992 | + sub $acc10,$acc10,$acc14 | |
993 | + andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f | |
994 | + andc $acc14,$acc06,$mask80 | |
995 | + add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1 | |
996 | + add $acc14,$acc14,$acc14 | |
997 | + and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b | |
998 | + and $acc10,$acc10,$mask1b | |
999 | + xor $acc08,$acc08,$acc12 # r8 | |
1000 | + xor $acc10,$acc10,$acc14 | |
1001 | + | |
1002 | + xor $acc00,$acc00,$s0 # r2^r0 | |
1003 | + xor $acc02,$acc02,$s2 | |
1004 | + xor $acc04,$acc04,$s0 # r4^r0 | |
1005 | + xor $acc06,$acc06,$s2 | |
1006 | + | |
1007 | + extrdi $acc01,$acc00,32,0 # copy each upper half into the odd-numbered register | |
1008 | + extrdi $acc03,$acc02,32,0 | |
1009 | + extrdi $acc05,$acc04,32,0 | |
1010 | + extrdi $acc07,$acc06,32,0 | |
1011 | + extrdi $acc09,$acc08,32,0 | |
1012 | + extrdi $acc11,$acc10,32,0 | |
1013 | +___ | |
1014 | +$code.=<<___ if ($SIZE_T==4); | |
1015 | + and $acc00,$s0,$mask80 # r1=r0&0x80808080 | |
1016 | + and $acc01,$s1,$mask80 | |
1017 | + and $acc02,$s2,$mask80 | |
1018 | + and $acc03,$s3,$mask80 | |
1019 | + srwi $acc04,$acc00,7 # r1>>7 | |
1020 | + srwi $acc05,$acc01,7 | |
1021 | + srwi $acc06,$acc02,7 | |
1022 | + srwi $acc07,$acc03,7 | |
1023 | + andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f | |
1024 | + andc $acc09,$s1,$mask80 | |
1025 | + andc $acc10,$s2,$mask80 | |
1026 | + andc $acc11,$s3,$mask80 | |
1027 | + sub $acc00,$acc00,$acc04 # r1-(r1>>7) | |
1028 | + sub $acc01,$acc01,$acc05 | |
1029 | + sub $acc02,$acc02,$acc06 | |
1030 | + sub $acc03,$acc03,$acc07 | |
1031 | + add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1 | |
1032 | + add $acc09,$acc09,$acc09 | |
1033 | + add $acc10,$acc10,$acc10 | |
1034 | + add $acc11,$acc11,$acc11 | |
1035 | + and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b | |
1036 | + and $acc01,$acc01,$mask1b | |
1037 | + and $acc02,$acc02,$mask1b | |
1038 | + and $acc03,$acc03,$mask1b | |
1039 | + xor $acc00,$acc00,$acc08 # r2 | |
1040 | + xor $acc01,$acc01,$acc09 | |
1041 | + xor $acc02,$acc02,$acc10 | |
1042 | + xor $acc03,$acc03,$acc11 | |
1043 | + | |
1044 | + and $acc04,$acc00,$mask80 # r1=r2&0x80808080 | |
1045 | + and $acc05,$acc01,$mask80 | |
1046 | + and $acc06,$acc02,$mask80 | |
1047 | + and $acc07,$acc03,$mask80 | |
1048 | + srwi $acc08,$acc04,7 # r1>>7 | |
1049 | + srwi $acc09,$acc05,7 | |
1050 | + srwi $acc10,$acc06,7 | |
1051 | + srwi $acc11,$acc07,7 | |
1052 | + andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f | |
1053 | + andc $acc13,$acc01,$mask80 | |
1054 | + andc $acc14,$acc02,$mask80 | |
1055 | + andc $acc15,$acc03,$mask80 | |
1056 | + sub $acc04,$acc04,$acc08 # r1-(r1>>7) | |
1057 | + sub $acc05,$acc05,$acc09 | |
1058 | + sub $acc06,$acc06,$acc10 | |
1059 | + sub $acc07,$acc07,$acc11 | |
1060 | + add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1 | |
1061 | + add $acc13,$acc13,$acc13 | |
1062 | + add $acc14,$acc14,$acc14 | |
1063 | + add $acc15,$acc15,$acc15 | |
1064 | + and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b | |
1065 | + and $acc05,$acc05,$mask1b | |
1066 | + and $acc06,$acc06,$mask1b | |
1067 | + and $acc07,$acc07,$mask1b | |
1068 | + xor $acc04,$acc04,$acc12 # r4 | |
1069 | + xor $acc05,$acc05,$acc13 | |
1070 | + xor $acc06,$acc06,$acc14 | |
1071 | + xor $acc07,$acc07,$acc15 | |
1072 | + | |
1073 | + and $acc08,$acc04,$mask80 # r1=r4&0x80808080 | |
1074 | + and $acc09,$acc05,$mask80 | |
1075 | + and $acc10,$acc06,$mask80 | |
1076 | + and $acc11,$acc07,$mask80 | |
1077 | + srwi $acc12,$acc08,7 # r1>>7 | |
1078 | + srwi $acc13,$acc09,7 | |
1079 | + srwi $acc14,$acc10,7 | |
1080 | + srwi $acc15,$acc11,7 | |
1081 | + sub $acc08,$acc08,$acc12 # r1-(r1>>7) | |
1082 | + sub $acc09,$acc09,$acc13 | |
1083 | + sub $acc10,$acc10,$acc14 | |
1084 | + sub $acc11,$acc11,$acc15 | |
1085 | + andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f | |
1086 | + andc $acc13,$acc05,$mask80 | |
1087 | + andc $acc14,$acc06,$mask80 | |
1088 | + andc $acc15,$acc07,$mask80 | |
1089 | + add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1 | |
1090 | + add $acc13,$acc13,$acc13 | |
1091 | + add $acc14,$acc14,$acc14 | |
1092 | + add $acc15,$acc15,$acc15 | |
1093 | + and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b | |
1094 | + and $acc09,$acc09,$mask1b | |
1095 | + and $acc10,$acc10,$mask1b | |
1096 | + and $acc11,$acc11,$mask1b | |
1097 | + xor $acc08,$acc08,$acc12 # r8 | |
1098 | + xor $acc09,$acc09,$acc13 | |
1099 | + xor $acc10,$acc10,$acc14 | |
1100 | + xor $acc11,$acc11,$acc15 | |
1101 | + | |
1102 | + xor $acc00,$acc00,$s0 # r2^r0 | |
1103 | + xor $acc01,$acc01,$s1 | |
1104 | + xor $acc02,$acc02,$s2 | |
1105 | + xor $acc03,$acc03,$s3 | |
1106 | + xor $acc04,$acc04,$s0 # r4^r0 | |
1107 | + xor $acc05,$acc05,$s1 | |
1108 | + xor $acc06,$acc06,$s2 | |
1109 | + xor $acc07,$acc07,$s3 | |
1110 | +___ | |
1111 | +$code.=<<___; | |
1112 | + rotrwi $s0,$s0,8 # = ROTATE(r0,8) | |
1113 | + rotrwi $s1,$s1,8 | |
1114 | + rotrwi $s2,$s2,8 | |
1115 | + rotrwi $s3,$s3,8 | |
1116 | + xor $s0,$s0,$acc00 # ^= r2^r0 | |
1117 | + xor $s1,$s1,$acc01 | |
1118 | + xor $s2,$s2,$acc02 | |
1119 | + xor $s3,$s3,$acc03 | |
1120 | + xor $acc00,$acc00,$acc08 | |
1121 | + xor $acc01,$acc01,$acc09 | |
1122 | + xor $acc02,$acc02,$acc10 | |
1123 | + xor $acc03,$acc03,$acc11 | |
1124 | + xor $s0,$s0,$acc04 # ^= r4^r0 | |
1125 | + xor $s1,$s1,$acc05 | |
1126 | + xor $s2,$s2,$acc06 | |
1127 | + xor $s3,$s3,$acc07 | |
1128 | + rotrwi $acc00,$acc00,24 | |
1129 | + rotrwi $acc01,$acc01,24 | |
1130 | + rotrwi $acc02,$acc02,24 | |
1131 | + rotrwi $acc03,$acc03,24 | |
1132 | + xor $acc04,$acc04,$acc08 | |
1133 | + xor $acc05,$acc05,$acc09 | |
1134 | + xor $acc06,$acc06,$acc10 | |
1135 | + xor $acc07,$acc07,$acc11 | |
1136 | + xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)] | |
1137 | + xor $s1,$s1,$acc09 | |
1138 | + xor $s2,$s2,$acc10 | |
1139 | + xor $s3,$s3,$acc11 | |
1140 | + rotrwi $acc04,$acc04,16 | |
1141 | + rotrwi $acc05,$acc05,16 | |
1142 | + rotrwi $acc06,$acc06,16 | |
1143 | + rotrwi $acc07,$acc07,16 | |
1144 | + xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24) | |
1145 | + xor $s1,$s1,$acc01 | |
1146 | + xor $s2,$s2,$acc02 | |
1147 | + xor $s3,$s3,$acc03 | |
1148 | + rotrwi $acc08,$acc08,8 | |
1149 | + rotrwi $acc09,$acc09,8 | |
1150 | + rotrwi $acc10,$acc10,8 | |
1151 | + rotrwi $acc11,$acc11,8 | |
1152 | + xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16) | |
1153 | + xor $s1,$s1,$acc05 | |
1154 | + xor $s2,$s2,$acc06 | |
1155 | + xor $s3,$s3,$acc07 | |
1156 | + xor $s0,$s0,$acc08 # ^= ROTATE(r8,8) | |
1157 | + xor $s1,$s1,$acc09 | |
1158 | + xor $s2,$s2,$acc10 | |
1159 | + xor $s3,$s3,$acc11 | |
1160 | + | |
1161 | + b Ldec_compact_loop # start the next round | |
1162 | +.align 4 | |
1163 | +Ldec_compact_done: | |
1164 | + xor $s0,$s0,$t0 # add the final round key | |
1165 | + xor $s1,$s1,$t1 | |
1166 | + xor $s2,$s2,$t2 | |
1167 | + xor $s3,$s3,$t3 | |
1168 | + blr # return through LR with the result in $s0-$s3 | |
1169 | +.long 0 | |
1170 | +.asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>" | |
1171 | +.align 7 | |
1172 | +___ | |
1173 | + | |
1174 | +$code =~ s/\`([^\`]*)\`/eval $1/gem; # replace every backtick-quoted expression with the result of evaluating it as Perl | |
1175 | +print $code; # write the accumulated assembly text to STDOUT | |
1176 | +close STDOUT; |
@@ -0,0 +1,1333 @@ | ||
1 | +#!/usr/bin/env perl | |
2 | + | |
3 | +# ==================================================================== | |
4 | +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
5 | +# project. The module is, however, dual licensed under OpenSSL and | |
6 | +# CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | +# details see http://www.openssl.org/~appro/cryptogams/. | |
8 | +# ==================================================================== | |
9 | + | |
10 | +# AES for s390x. | |
11 | + | |
12 | +# April 2007. | |
13 | +# | |
14 | +# Software performance improvement over gcc-generated code is ~70% and | |
15 | +# in absolute terms is ~73 cycles per byte processed with 128-bit key. | |
16 | +# You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are | |
17 | +# *strictly* in-order execution and issued instruction [in this case | |
18 | +# load value from memory is critical] has to complete before execution | |
19 | +# flow proceeds. S-boxes are compressed to 2KB[+256B]. | |
20 | +# | |
21 | +# As for hardware acceleration support. It's basically a "teaser," as | |
22 | +# it can and should be improved in several ways. Most notably support | |
23 | +# for CBC is not utilized, nor multiple blocks are ever processed. | |
24 | +# Then software key schedule can be postponed till hardware support | |
25 | +# detection... Performance improvement over assembler is reportedly | |
26 | +# ~2.5x, but can reach >8x [naturally on larger chunks] if proper | |
27 | +# support is implemented. | |
28 | + | |
29 | +# May 2007. | |
30 | +# | |
31 | +# Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided | |
32 | +# for 128-bit keys, if hardware support is detected. | |
33 | + | |
34 | +# January 2009. | |
35 | +# | |
36 | +# Add support for hardware AES192/256 and reschedule instructions to | |
37 | +# minimize/avoid Address Generation Interlock hazard and to favour | |
38 | +# dual-issue z10 pipeline. This gave ~25% improvement on z10 and | |
39 | +# almost 50% on z9. The gain is smaller on z10, because being dual- | |
40 | +# issue z10 makes it impossible to eliminate the interlock condition: | |
41 | +# critical path is not long enough. Yet it spends ~24 cycles per byte | |
42 | +# processed with 128-bit key. | |
43 | +# | |
44 | +# Unlike previous version hardware support detection takes place only | |
45 | +# at the moment of key schedule setup, which is denoted in key->rounds. | |
46 | +# This is done, because deferred key setup can't be made MT-safe, not | |
47 | +# for key lengths longer than 128 bits. | |
48 | +# | |
49 | +# Add AES_cbc_encrypt, which gives incredible performance improvement, | |
50 | +# it was measured to be ~6.6x. It's less than previously mentioned 8x, | |
51 | +# because software implementation was optimized. | |
52 | + | |
53 | +$softonly=0; # allow hardware support | |
54 | + | |
55 | +$t0="%r0"; $mask="%r0"; # $t0 and $mask are two names for %r0 | |
56 | +$t1="%r1"; | |
57 | +$t2="%r2"; $inp="%r2"; # $t2 and $inp are two names for %r2 | |
58 | +$t3="%r3"; $out="%r3"; $bits="%r3"; # $t3, $out and $bits are all %r3 | |
59 | +$key="%r4"; | |
60 | +$i1="%r5"; | |
61 | +$i2="%r6"; | |
62 | +$i3="%r7"; | |
63 | +$s0="%r8"; # $s0-$s3 hold the four 32-bit state words | |
64 | +$s1="%r9"; | |
65 | +$s2="%r10"; | |
66 | +$s3="%r11"; | |
67 | +$tbl="%r12"; # lookup-table base address | |
68 | +$rounds="%r13"; | |
69 | +$ra="%r14"; # return address; also reused as scratch inside the round loop | |
70 | +$sp="%r15"; | |
71 | + | |
72 | +sub _data_word() # append every argument to $code as a ".long" line holding the value twice | |
73 | +{ my $i; | |
74 | + while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } # stops at the first undefined argument | |
75 | +} | |
76 | + | |
77 | +$code=<<___; | |
78 | +.text | |
79 | + | |
80 | +.type AES_Te,\@object | |
81 | +.align 256 | |
82 | +AES_Te: | |
83 | +___ | |
84 | +&_data_word( | |
85 | + 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, | |
86 | + 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554, | |
87 | + 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d, | |
88 | + 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a, | |
89 | + 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87, | |
90 | + 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b, | |
91 | + 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea, | |
92 | + 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b, | |
93 | + 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a, | |
94 | + 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f, | |
95 | + 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108, | |
96 | + 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f, | |
97 | + 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e, | |
98 | + 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5, | |
99 | + 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d, | |
100 | + 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f, | |
101 | + 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e, | |
102 | + 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb, | |
103 | + 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce, | |
104 | + 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497, | |
105 | + 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c, | |
106 | + 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed, | |
107 | + 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b, | |
108 | + 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a, | |
109 | + 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16, | |
110 | + 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594, | |
111 | + 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81, | |
112 | + 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3, | |
113 | + 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a, | |
114 | + 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504, | |
115 | + 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163, | |
116 | + 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d, | |
117 | + 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f, | |
118 | + 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739, | |
119 | + 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47, | |
120 | + 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395, | |
121 | + 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f, | |
122 | + 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883, | |
123 | + 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c, | |
124 | + 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76, | |
125 | + 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e, | |
126 | + 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4, | |
127 | + 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6, | |
128 | + 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b, | |
129 | + 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7, | |
130 | + 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0, | |
131 | + 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25, | |
132 | + 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818, | |
133 | + 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72, | |
134 | + 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651, | |
135 | + 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21, | |
136 | + 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85, | |
137 | + 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa, | |
138 | + 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12, | |
139 | + 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0, | |
140 | + 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9, | |
141 | + 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133, | |
142 | + 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7, | |
143 | + 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920, | |
144 | + 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a, | |
145 | + 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17, | |
146 | + 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8, | |
147 | + 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11, | |
148 | + 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a); | |
149 | +$code.=<<___; | |
150 | +# Te4[256] | |
151 | +.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 | |
152 | +.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 | |
153 | +.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 | |
154 | +.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 | |
155 | +.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc | |
156 | +.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 | |
157 | +.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a | |
158 | +.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 | |
159 | +.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 | |
160 | +.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 | |
161 | +.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b | |
162 | +.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf | |
163 | +.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 | |
164 | +.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 | |
165 | +.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 | |
166 | +.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 | |
167 | +.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 | |
168 | +.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 | |
169 | +.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 | |
170 | +.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb | |
171 | +.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c | |
172 | +.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 | |
173 | +.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 | |
174 | +.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 | |
175 | +.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 | |
176 | +.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a | |
177 | +.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e | |
178 | +.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e | |
179 | +.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 | |
180 | +.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf | |
181 | +.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 | |
182 | +.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 | |
183 | +# rcon[] | |
184 | +.long 0x01000000, 0x02000000, 0x04000000, 0x08000000 | |
185 | +.long 0x10000000, 0x20000000, 0x40000000, 0x80000000 | |
186 | +.long 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0 | |
187 | +.align 256 | |
188 | +.size AES_Te,.-AES_Te | |
189 | + | |
190 | +# void AES_encrypt(const unsigned char *inp, unsigned char *out, | |
191 | +# const AES_KEY *key) { | |
192 | +.globl AES_encrypt | |
193 | +.type AES_encrypt,\@function | |
194 | +AES_encrypt: | |
195 | +___ | |
196 | +$code.=<<___ if (!$softonly); | |
197 | + l %r0,240($key) # word at key+240; a value below 16 selects the software path | |
198 | + lhi %r1,16 | |
199 | + clr %r0,%r1 # unsigned compare against 16 | |
200 | + jl .Lesoft | |
201 | + | |
202 | + la %r1,0($key) # %r1 = address of the key structure | |
203 | + #la %r2,0($inp) | |
204 | + la %r4,0($out) # %r4 = output address, first operand of the km below | |
205 | + lghi %r3,16 # single block length | |
206 | + .long 0xb92e0042 # km %r4,%r2 | |
207 | + brc 1,.-4 # can this happen? | |
208 | + br %r14 # hardware path done | |
209 | +.align 64 | |
210 | +.Lesoft: | |
211 | +___ | |
212 | +$code.=<<___; | |
213 | + stmg %r3,$ra,24($sp) # save %r3 through $ra starting at 24($sp); $out occupies the first slot | |
214 | + | |
215 | + llgf $s0,0($inp) # load the four 32-bit input words, zero-extended | |
216 | + llgf $s1,4($inp) | |
217 | + llgf $s2,8($inp) | |
218 | + llgf $s3,12($inp) | |
219 | + | |
220 | + larl $tbl,AES_Te # $tbl = address of the AES_Te table | |
221 | + bras $ra,_s390x_AES_encrypt # run the rounds; result comes back in $s0-$s3 | |
222 | + | |
223 | + lg $out,24($sp) # reload $out, whose register doubles as scratch $t3 | |
224 | + st $s0,0($out) | |
225 | + st $s1,4($out) | |
226 | + st $s2,8($out) | |
227 | + st $s3,12($out) | |
228 | + | |
229 | + lmg %r6,$ra,48($sp) # restore %r6 through $ra from the save area | |
230 | + br $ra | |
231 | +.size AES_encrypt,.-AES_encrypt | |
232 | + | |
233 | +.type _s390x_AES_encrypt,\@function | |
234 | +.align 16 | |
235 | +_s390x_AES_encrypt: | |
236 | + stg $ra,152($sp) # keep the return address; $ra is reused as scratch below | |
237 | + x $s0,0($key) # add the first round key to the state in $s0-$s3 | |
238 | + x $s1,4($key) | |
239 | + x $s2,8($key) | |
240 | + x $s3,12($key) | |
241 | + l $rounds,240($key) # $rounds = word at key+240 | |
242 | + llill $mask,`0xff<<3` # one byte pre-scaled by 8: table entries are 8 bytes apart | |
243 | + aghi $rounds,-1 # loop runs one pass fewer; the last round follows the loop | |
244 | + j .Lenc_loop | |
245 | +.align 16 | |
246 | +.Lenc_loop: | |
247 | + sllg $t1,$s0,`0+3` # shift each byte of $s0 into the masked position, then mask below | |
248 | + srlg $t2,$s0,`8-3` | |
249 | + srlg $t3,$s0,`16-3` | |
250 | + srl $s0,`24-3` | |
251 | + nr $s0,$mask | |
252 | + ngr $t1,$mask | |
253 | + nr $t2,$mask | |
254 | + nr $t3,$mask | |
255 | + | |
256 | + srlg $i1,$s1,`16-3` # i0 | |
257 | + sllg $i2,$s1,`0+3` | |
258 | + srlg $i3,$s1,`8-3` | |
259 | + srl $s1,`24-3` | |
260 | + nr $i1,$mask | |
261 | + nr $s1,$mask | |
262 | + ngr $i2,$mask | |
263 | + nr $i3,$mask | |
264 | + | |
265 | + l $s0,0($s0,$tbl) # Te0[s0>>24] | |
266 | + l $t1,1($t1,$tbl) # Te3[s0>>0] | |
267 | + l $t2,2($t2,$tbl) # Te2[s0>>8] | |
268 | + l $t3,3($t3,$tbl) # Te1[s0>>16] | |
269 | + | |
270 | + x $s0,3($i1,$tbl) # Te1[s1>>16] | |
271 | + l $s1,0($s1,$tbl) # Te0[s1>>24] | |
272 | + x $t2,1($i2,$tbl) # Te3[s1>>0] | |
273 | + x $t3,2($i3,$tbl) # Te2[s1>>8] | |
274 | + | |
275 | + srlg $i1,$s2,`8-3` # i0 | |
276 | + srlg $i2,$s2,`16-3` # i1 | |
277 | + nr $i1,$mask | |
278 | + nr $i2,$mask | |
279 | + sllg $i3,$s2,`0+3` | |
280 | + srl $s2,`24-3` | |
281 | + nr $s2,$mask | |
282 | + ngr $i3,$mask | |
283 | + | |
284 | + xr $s1,$t1 | |
285 | + srlg $ra,$s3,`8-3` # i1 | |
286 | + sllg $t1,$s3,`0+3` # i0 | |
287 | + nr $ra,$mask | |
288 | + la $key,16($key) # advance to the next round key | |
289 | + ngr $t1,$mask | |
290 | + | |
291 | + x $s0,2($i1,$tbl) # Te2[s2>>8] | |
292 | + x $s1,3($i2,$tbl) # Te1[s2>>16] | |
293 | + l $s2,0($s2,$tbl) # Te0[s2>>24] | |
294 | + x $t3,1($i3,$tbl) # Te3[s2>>0] | |
295 | + | |
296 | + srlg $i3,$s3,`16-3` # i2 | |
297 | + xr $s2,$t2 | |
298 | + srl $s3,`24-3` | |
299 | + nr $i3,$mask | |
300 | + nr $s3,$mask | |
301 | + | |
302 | + x $s0,0($key) # add this round's key words | |
303 | + x $s1,4($key) | |
304 | + x $s2,8($key) | |
305 | + x $t3,12($key) | |
306 | + | |
307 | + x $s0,1($t1,$tbl) # Te3[s3>>0] | |
308 | + x $s1,2($ra,$tbl) # Te2[s3>>8] | |
309 | + x $s2,3($i3,$tbl) # Te1[s3>>16] | |
310 | + l $s3,0($s3,$tbl) # Te0[s3>>24] | |
311 | + xr $s3,$t3 | |
312 | + | |
313 | + brct $rounds,.Lenc_loop # decrement $rounds and repeat while it is non-zero | |
314 | + .align 16 | |
315 | + | |
316 | + sllg $t1,$s0,`0+3` # final round: same byte extraction, but single-byte lookups at offset 2 | |
317 | + srlg $t2,$s0,`8-3` | |
318 | + ngr $t1,$mask | |
319 | + srlg $t3,$s0,`16-3` | |
320 | + srl $s0,`24-3` | |
321 | + nr $s0,$mask | |
322 | + nr $t2,$mask | |
323 | + nr $t3,$mask | |
324 | + | |
325 | + srlg $i1,$s1,`16-3` # i0 | |
326 | + sllg $i2,$s1,`0+3` | |
327 | + ngr $i2,$mask | |
328 | + srlg $i3,$s1,`8-3` | |
329 | + srl $s1,`24-3` | |
330 | + nr $i1,$mask | |
331 | + nr $s1,$mask | |
332 | + nr $i3,$mask | |
333 | + | |
334 | + llgc $s0,2($s0,$tbl) # Te4[s0>>24] | |
335 | + llgc $t1,2($t1,$tbl) # Te4[s0>>0] | |
336 | + sll $s0,24 | |
337 | + llgc $t2,2($t2,$tbl) # Te4[s0>>8] | |
338 | + llgc $t3,2($t3,$tbl) # Te4[s0>>16] | |
339 | + sll $t2,8 | |
340 | + sll $t3,16 | |
341 | + | |
342 | + llgc $i1,2($i1,$tbl) # Te4[s1>>16] | |
343 | + llgc $s1,2($s1,$tbl) # Te4[s1>>24] | |
344 | + llgc $i2,2($i2,$tbl) # Te4[s1>>0] | |
345 | + llgc $i3,2($i3,$tbl) # Te4[s1>>8] | |
346 | + sll $i1,16 | |
347 | + sll $s1,24 | |
348 | + sll $i3,8 | |
349 | + or $s0,$i1 | |
350 | + or $s1,$t1 | |
351 | + or $t2,$i2 | |
352 | + or $t3,$i3 | |
353 | + | |
354 | + srlg $i1,$s2,`8-3` # i0 | |
355 | + srlg $i2,$s2,`16-3` # i1 | |
356 | + nr $i1,$mask | |
357 | + nr $i2,$mask | |
358 | + sllg $i3,$s2,`0+3` | |
359 | + srl $s2,`24-3` | |
360 | + ngr $i3,$mask | |
361 | + nr $s2,$mask | |
362 | + | |
363 | + sllg $t1,$s3,`0+3` # i0 | |
364 | + srlg $ra,$s3,`8-3` # i1 | |
365 | + ngr $t1,$mask | |
366 | + | |
367 | + llgc $i1,2($i1,$tbl) # Te4[s2>>8] | |
368 | + llgc $i2,2($i2,$tbl) # Te4[s2>>16] | |
369 | + sll $i1,8 | |
370 | + llgc $s2,2($s2,$tbl) # Te4[s2>>24] | |
371 | + llgc $i3,2($i3,$tbl) # Te4[s2>>0] | |
372 | + sll $i2,16 | |
373 | + nr $ra,$mask | |
374 | + sll $s2,24 | |
375 | + or $s0,$i1 | |
376 | + or $s1,$i2 | |
377 | + or $s2,$t2 | |
378 | + or $t3,$i3 | |
379 | + | |
380 | + srlg $i3,$s3,`16-3` # i2 | |
381 | + srl $s3,`24-3` | |
382 | + nr $i3,$mask | |
383 | + nr $s3,$mask | |
384 | + | |
385 | + l $t0,16($key) # last round key: 16 bytes past the key used in the final loop pass | |
386 | + l $t2,20($key) | |
387 | + | |
388 | + llgc $i1,2($t1,$tbl) # Te4[s3>>0] | |
389 | + llgc $i2,2($ra,$tbl) # Te4[s3>>8] | |
390 | + llgc $i3,2($i3,$tbl) # Te4[s3>>16] | |
391 | + llgc $s3,2($s3,$tbl) # Te4[s3>>24] | |
392 | + sll $i2,8 | |
393 | + sll $i3,16 | |
394 | + sll $s3,24 | |
395 | + or $s0,$i1 | |
396 | + or $s1,$i2 | |
397 | + or $s2,$i3 | |
398 | + or $s3,$t3 | |
399 | + | |
400 | + lg $ra,152($sp) # restore the return address saved on entry | |
401 | + xr $s0,$t0 # add the last round key | |
402 | + xr $s1,$t2 | |
403 | + x $s2,24($key) | |
404 | + x $s3,28($key) | |
405 | + | |
406 | + br $ra | |
407 | +.size _s390x_AES_encrypt,.-_s390x_AES_encrypt | |
408 | +___ | |
409 | + | |
410 | +$code.=<<___; | |
411 | +.type AES_Td,\@object | |
412 | +.align 256 | |
413 | +AES_Td: | |
414 | +___ | |
415 | +&_data_word( | |
416 | + 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96, | |
417 | + 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393, | |
418 | + 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25, | |
419 | + 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f, | |
420 | + 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1, | |
421 | + 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6, | |
422 | + 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da, | |
423 | + 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844, | |
424 | + 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd, | |
425 | + 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4, | |
426 | + 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45, | |
427 | + 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94, | |
428 | + 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7, | |
429 | + 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a, | |
430 | + 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5, | |
431 | + 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c, | |
432 | + 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1, | |
433 | + 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a, | |
434 | + 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75, | |
435 | + 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051, | |
436 | + 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46, | |
437 | + 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff, | |
438 | + 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77, | |
439 | + 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb, | |
440 | + 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000, | |
441 | + 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e, | |
442 | + 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927, | |
443 | + 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a, | |
444 | + 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e, | |
445 | + 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16, | |
446 | + 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d, | |
447 | + 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8, | |
448 | + 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd, | |
449 | + 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34, | |
450 | + 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163, | |
451 | + 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120, | |
452 | + 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d, | |
453 | + 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0, | |
454 | + 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422, | |
455 | + 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef, | |
456 | + 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36, | |
457 | + 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4, | |
458 | + 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662, | |
459 | + 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5, | |
460 | + 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3, | |
461 | + 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b, | |
462 | + 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8, | |
463 | + 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6, | |
464 | + 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6, | |
465 | + 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0, | |
466 | + 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815, | |
467 | + 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f, | |
468 | + 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df, | |
469 | + 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f, | |
470 | + 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e, | |
471 | + 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713, | |
472 | + 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89, | |
473 | + 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c, | |
474 | + 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf, | |
475 | + 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86, | |
476 | + 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f, | |
477 | + 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541, | |
478 | + 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190, | |
479 | + 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742); | |
480 | +$code.=<<___; | |
481 | +# Td4[256]: byte-wide table used by the last decrypt round, which addresses it at AES_Td+2048 | |
482 | +.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 | |
483 | +.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb | |
484 | +.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 | |
485 | +.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb | |
486 | +.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d | |
487 | +.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e | |
488 | +.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 | |
489 | +.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 | |
490 | +.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 | |
491 | +.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 | |
492 | +.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda | |
493 | +.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 | |
494 | +.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a | |
495 | +.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 | |
496 | +.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 | |
497 | +.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b | |
498 | +.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea | |
499 | +.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 | |
500 | +.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 | |
501 | +.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e | |
502 | +.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 | |
503 | +.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b | |
504 | +.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 | |
505 | +.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 | |
506 | +.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 | |
507 | +.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f | |
508 | +.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d | |
509 | +.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef | |
510 | +.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 | |
511 | +.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 | |
512 | +.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 | |
513 | +.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d | |
514 | +.size AES_Td,.-AES_Td | |
515 | + | |
516 | +# void AES_decrypt(const unsigned char *inp, unsigned char *out, | |
517 | +# const AES_KEY *key) { decrypts one 16-byte block; when the km path is generated and the word at 240(key) is 16 or more, that word is handed to km as the function code, otherwise the table-driven path at .Ldsoft calls _s390x_AES_decrypt | |
518 | +.globl AES_decrypt | |
519 | +.type AES_decrypt,\@function | |
520 | +AES_decrypt: | |
521 | +___ | |
522 | +$code.=<<___ if (!$softonly); | |
523 | + l %r0,240($key) | |
524 | + lhi %r1,16 | |
525 | + clr %r0,%r1 | |
526 | + jl .Ldsoft | |
527 | + | |
528 | + la %r1,0($key) | |
529 | + #la %r2,0($inp) left disabled; NOTE(review): presumably inp already lives in %r2 - confirm | |
530 | + la %r4,0($out) | |
531 | + lghi %r3,16 # single block length, the source length operand of km | |
532 | + .long 0xb92e0042 # km %r4,%r2 emitted as a raw opcode word; %r0 = function code, %r1 = key address | |
533 | + brc 1,.-4 # condition code 3: re-issue km; can this happen? | |
534 | + br %r14 | |
535 | +.align 64 | |
536 | +.Ldsoft: | |
537 | +___ | |
538 | +$code.=<<___; | |
539 | + stmg %r3,$ra,24($sp) | |
540 | + | |
541 | + llgf $s0,0($inp) | |
542 | + llgf $s1,4($inp) | |
543 | + llgf $s2,8($inp) | |
544 | + llgf $s3,12($inp) | |
545 | + | |
546 | + larl $tbl,AES_Td | |
547 | + bras $ra,_s390x_AES_decrypt | |
548 | + | |
549 | + lg $out,24($sp) | |
550 | + st $s0,0($out) | |
551 | + st $s1,4($out) | |
552 | + st $s2,8($out) | |
553 | + st $s3,12($out) | |
554 | + | |
555 | + lmg %r6,$ra,48($sp) | |
556 | + br $ra | |
557 | +.size AES_decrypt,.-AES_decrypt | |
558 | + | |
559 | +.type _s390x_AES_decrypt,\@function | |
560 | +.align 16 | |
561 | +_s390x_AES_decrypt: | |
562 | + stg $ra,152($sp) | |
563 | + x $s0,0($key) | |
564 | + x $s1,4($key) | |
565 | + x $s2,8($key) | |
566 | + x $s3,12($key) | |
567 | + l $rounds,240($key) | |
568 | + llill $mask,`0xff<<3` | |
569 | + aghi $rounds,-1 | |
570 | + j .Ldec_loop | |
571 | +.align 16 | |
572 | +.Ldec_loop: | |
573 | + srlg $t1,$s0,`16-3` | |
574 | + srlg $t2,$s0,`8-3` | |
575 | + sllg $t3,$s0,`0+3` | |
576 | + srl $s0,`24-3` | |
577 | + nr $s0,$mask | |
578 | + nr $t1,$mask | |
579 | + nr $t2,$mask | |
580 | + ngr $t3,$mask | |
581 | + | |
582 | + sllg $i1,$s1,`0+3` # i0: an iN marker tags an index whose table lookup is folded into output word N | |
583 | + srlg $i2,$s1,`16-3` | |
584 | + srlg $i3,$s1,`8-3` | |
585 | + srl $s1,`24-3` | |
586 | + ngr $i1,$mask | |
587 | + nr $s1,$mask | |
588 | + nr $i2,$mask | |
589 | + nr $i3,$mask | |
590 | + | |
591 | + l $s0,0($s0,$tbl) # Td0[s0>>24]; every index is a byte value scaled by 8 (mask is 0xff shifted left by 3) | |
592 | + l $t1,3($t1,$tbl) # Td1[s0>>16], read from the same 8-byte entries at byte offset 3 | |
593 | + l $t2,2($t2,$tbl) # Td2[s0>>8], byte offset 2 | |
594 | + l $t3,1($t3,$tbl) # Td3[s0>>0], byte offset 1 | |
595 | + | |
596 | + x $s0,1($i1,$tbl) # Td3[s1>>0] | |
597 | + l $s1,0($s1,$tbl) # Td0[s1>>24] | |
598 | + x $t2,3($i2,$tbl) # Td1[s1>>16] | |
599 | + x $t3,2($i3,$tbl) # Td2[s1>>8] | |
600 | + | |
601 | + srlg $i1,$s2,`8-3` # i0 | |
602 | + sllg $i2,$s2,`0+3` # i1 | |
603 | + srlg $i3,$s2,`16-3` | |
604 | + srl $s2,`24-3` | |
605 | + nr $i1,$mask | |
606 | + ngr $i2,$mask | |
607 | + nr $s2,$mask | |
608 | + nr $i3,$mask | |
609 | + | |
610 | + xr $s1,$t1 | |
611 | + srlg $ra,$s3,`8-3` # i1 | |
612 | + srlg $t1,$s3,`16-3` # i0 | |
613 | + nr $ra,$mask | |
614 | + la $key,16($key) | |
615 | + nr $t1,$mask | |
616 | + | |
617 | + x $s0,2($i1,$tbl) # Td2[s2>>8] | |
618 | + x $s1,1($i2,$tbl) # Td3[s2>>0] | |
619 | + l $s2,0($s2,$tbl) # Td0[s2>>24] | |
620 | + x $t3,3($i3,$tbl) # Td1[s2>>16] | |
621 | + | |
622 | + sllg $i3,$s3,`0+3` # i2 | |
623 | + srl $s3,`24-3` | |
624 | + ngr $i3,$mask | |
625 | + nr $s3,$mask | |
626 | + | |
627 | + xr $s2,$t2 | |
628 | + x $s0,0($key) | |
629 | + x $s1,4($key) | |
630 | + x $s2,8($key) | |
631 | + x $t3,12($key) | |
632 | + | |
633 | + x $s0,3($t1,$tbl) # Td1[s3>>16] | |
634 | + x $s1,2($ra,$tbl) # Td2[s3>>8] | |
635 | + x $s2,1($i3,$tbl) # Td3[s3>>0] | |
636 | + l $s3,0($s3,$tbl) # Td0[s3>>24] | |
637 | + xr $s3,$t3 | |
638 | + | |
639 | + brct $rounds,.Ldec_loop | |
640 | + .align 16 | |
641 | + | |
642 | + l $t1,`2048+0`($tbl) # prefetch Td4: touch it at 64-byte strides, the loaded values are overwritten below | |
643 | + l $t2,`2048+64`($tbl) | |
644 | + l $t3,`2048+128`($tbl) | |
645 | + l $i1,`2048+192`($tbl) | |
646 | + llill $mask,0xff | |
647 | + | |
648 | + srlg $i3,$s0,24 # i0 | |
649 | + srlg $t1,$s0,16 | |
650 | + srlg $t2,$s0,8 | |
651 | + nr $s0,$mask # i3 | |
652 | + nr $t1,$mask | |
653 | + | |
654 | + srlg $i1,$s1,24 | |
655 | + nr $t2,$mask | |
656 | + srlg $i2,$s1,16 | |
657 | + srlg $ra,$s1,8 | |
658 | + nr $s1,$mask # i0 | |
659 | + nr $i2,$mask | |
660 | + nr $ra,$mask | |
661 | + | |
662 | + llgc $i3,2048($i3,$tbl) # Td4[s0>>24]; the last round indexes the byte table with unscaled byte values | |
663 | + llgc $t1,2048($t1,$tbl) # Td4[s0>>16] | |
664 | + llgc $t2,2048($t2,$tbl) # Td4[s0>>8] | |
665 | + sll $t1,16 | |
666 | + llgc $t3,2048($s0,$tbl) # Td4[s0>>0] | |
667 | + sllg $s0,$i3,24 | |
668 | + sll $t2,8 | |
669 | + | |
670 | + llgc $s1,2048($s1,$tbl) # Td4[s1>>0] | |
671 | + llgc $i1,2048($i1,$tbl) # Td4[s1>>24] | |
672 | + llgc $i2,2048($i2,$tbl) # Td4[s1>>16] | |
673 | + sll $i1,24 | |
674 | + llgc $i3,2048($ra,$tbl) # Td4[s1>>8] | |
675 | + sll $i2,16 | |
676 | + sll $i3,8 | |
677 | + or $s0,$s1 | |
678 | + or $t1,$i1 | |
679 | + or $t2,$i2 | |
680 | + or $t3,$i3 | |
681 | + | |
682 | + srlg $i1,$s2,8 # i0 | |
683 | + srlg $i2,$s2,24 | |
684 | + srlg $i3,$s2,16 | |
685 | + nr $s2,$mask # i1 | |
686 | + nr $i1,$mask | |
687 | + nr $i3,$mask | |
688 | + llgc $i1,2048($i1,$tbl) # Td4[s2>>8] | |
689 | + llgc $s1,2048($s2,$tbl) # Td4[s2>>0] | |
690 | + llgc $i2,2048($i2,$tbl) # Td4[s2>>24] | |
691 | + llgc $i3,2048($i3,$tbl) # Td4[s2>>16] | |
692 | + sll $i1,8 | |
693 | + sll $i2,24 | |
694 | + or $s0,$i1 | |
695 | + sll $i3,16 | |
696 | + or $t2,$i2 | |
697 | + or $t3,$i3 | |
698 | + | |
699 | + srlg $i1,$s3,16 # i0 | |
700 | + srlg $i2,$s3,8 # i1 | |
701 | + srlg $i3,$s3,24 | |
702 | + nr $s3,$mask # i2 | |
703 | + nr $i1,$mask | |
704 | + nr $i2,$mask | |
705 | + | |
706 | + lg $ra,152($sp) | |
707 | + or $s1,$t1 | |
708 | + l $t0,16($key) | |
709 | + l $t1,20($key) | |
710 | + | |
711 | + llgc $i1,2048($i1,$tbl) # Td4[s3>>16] | |
712 | + llgc $i2,2048($i2,$tbl) # Td4[s3>>8] | |
713 | + sll $i1,16 | |
714 | + llgc $s2,2048($s3,$tbl) # Td4[s3>>0] | |
715 | + llgc $s3,2048($i3,$tbl) # Td4[s3>>24] | |
716 | + sll $i2,8 | |
717 | + sll $s3,24 | |
718 | + or $s0,$i1 | |
719 | + or $s1,$i2 | |
720 | + or $s2,$t2 | |
721 | + or $s3,$t3 | |
722 | + | |
723 | + xr $s0,$t0 | |
724 | + xr $s1,$t1 | |
725 | + x $s2,24($key) | |
726 | + x $s3,28($key) | |
727 | + | |
728 | + br $ra | |
729 | +.size _s390x_AES_decrypt,.-_s390x_AES_decrypt | |
730 | +___ | |
731 | + | |
732 | +$code.=<<___; | |
733 | +# int AES_set_encrypt_key(const unsigned char *in, int bits, | |
734 | +# AES_KEY *key) { returns 0 on success, -1 if in or key is NULL, -2 if bits is not 128, 192 or 256 | |
735 | +.globl AES_set_encrypt_key | |
736 | +.type AES_set_encrypt_key,\@function | |
737 | +.align 16 | |
738 | +AES_set_encrypt_key: | |
739 | + lghi $t0,0 | |
740 | + clgr $inp,$t0 | |
741 | + je .Lminus1 | |
742 | + clgr $key,$t0 | |
743 | + je .Lminus1 | |
744 | + | |
745 | + lghi $t0,128 | |
746 | + clr $bits,$t0 | |
747 | + je .Lproceed | |
748 | + lghi $t0,192 | |
749 | + clr $bits,$t0 | |
750 | + je .Lproceed | |
751 | + lghi $t0,256 | |
752 | + clr $bits,$t0 | |
753 | + je .Lproceed | |
754 | + lghi %r2,-2 | |
755 | + br %r14 | |
756 | + | |
757 | +.align 16 | |
758 | +.Lproceed: | |
759 | +___ | |
760 | +$code.=<<___ if (!$softonly); | |
761 | + # convert bits to km code, [128,192,256]->[18,19,20] | |
762 | + lhi %r5,-128 | |
763 | + lhi %r0,18 | |
764 | + ar %r5,$bits | |
765 | + srl %r5,6 | |
766 | + ar %r5,%r0 | |
767 | + | |
768 | + lghi %r0,0 # query capability vector (function code 0); the answer is written at 16(sp) | |
769 | + la %r1,16($sp) | |
770 | + .long 0xb92f0042 # kmc %r4,%r2 | |
771 | + | |
772 | + llihh %r1,0x8000 | |
773 | + srlg %r1,%r1,0(%r5) | |
774 | + ng %r1,16($sp) | |
775 | + jz .Lekey_internal | |
776 | + | |
777 | + lmg %r0,%r1,0($inp) # hardware supports this key size: just copy 128 bits... | |
778 | + stmg %r0,%r1,0($key) | |
779 | + lhi %r0,192 | |
780 | + cr $bits,%r0 | |
781 | + jl 1f | |
782 | + lg %r1,16($inp) | |
783 | + stg %r1,16($key) | |
784 | + je 1f | |
785 | + lg %r1,24($inp) | |
786 | + stg %r1,24($key) | |
787 | +1: st $bits,236($key) # save bits | |
788 | + st %r5,240($key) # save km code; the software paths store a round count below 16 in this slot instead | |
789 | + lghi %r2,0 | |
790 | + br %r14 | |
791 | +___ | |
792 | +$code.=<<___; | |
793 | +.align 16 | |
794 | +.Lekey_internal: | |
795 | + stmg %r6,%r13,48($sp) # all non-volatile regs | |
796 | + | |
797 | + larl $tbl,AES_Te+2048 | |
798 | + | |
799 | + llgf $s0,0($inp) | |
800 | + llgf $s1,4($inp) | |
801 | + llgf $s2,8($inp) | |
802 | + llgf $s3,12($inp) | |
803 | + st $s0,0($key) | |
804 | + st $s1,4($key) | |
805 | + st $s2,8($key) | |
806 | + st $s3,12($key) | |
807 | + lghi $t0,128 | |
808 | + cr $bits,$t0 | |
809 | + jne .Lnot128 | |
810 | + | |
811 | + llill $mask,0xff | |
812 | + lghi $t3,0 # i=0, kept as a byte offset into rcon | |
813 | + lghi $rounds,10 | |
814 | + st $rounds,240($key) | |
815 | + | |
816 | + llgfr $t2,$s3 # temp=rk[3] | |
817 | + srlg $i1,$s3,8 | |
818 | + srlg $i2,$s3,16 | |
819 | + srlg $i3,$s3,24 | |
820 | + nr $t2,$mask | |
821 | + nr $i1,$mask | |
822 | + nr $i2,$mask | |
823 | + | |
824 | +.align 16 | |
825 | +.L128_loop: | |
826 | + la $t2,0($t2,$tbl) | |
827 | + la $i1,0($i1,$tbl) | |
828 | + la $i2,0($i2,$tbl) | |
829 | + la $i3,0($i3,$tbl) | |
830 | + icm $t2,2,0($t2) # Te4[rk[3]>>0]<<8 | |
831 | + icm $t2,4,0($i1) # Te4[rk[3]>>8]<<16 | |
832 | + icm $t2,8,0($i2) # Te4[rk[3]>>16]<<24 | |
833 | + icm $t2,1,0($i3) # Te4[rk[3]>>24] | |
834 | + x $t2,256($t3,$tbl) # rcon[i], read 256 bytes past the Te4 base (AES_Te+2048) | |
835 | + xr $s0,$t2 # rk[4]=rk[0]^... | |
836 | + xr $s1,$s0 # rk[5]=rk[1]^rk[4] | |
837 | + xr $s2,$s1 # rk[6]=rk[2]^rk[5] | |
838 | + xr $s3,$s2 # rk[7]=rk[3]^rk[6] | |
839 | + | |
840 | + llgfr $t2,$s3 # temp=rk[3] | |
841 | + srlg $i1,$s3,8 | |
842 | + srlg $i2,$s3,16 | |
843 | + nr $t2,$mask | |
844 | + nr $i1,$mask | |
845 | + srlg $i3,$s3,24 | |
846 | + nr $i2,$mask | |
847 | + | |
848 | + st $s0,16($key) | |
849 | + st $s1,20($key) | |
850 | + st $s2,24($key) | |
851 | + st $s3,28($key) | |
852 | + la $key,16($key) # key+=4 | |
853 | + la $t3,4($t3) # i++ | |
854 | + brct $rounds,.L128_loop | |
855 | + lghi %r2,0 | |
856 | + lmg %r6,%r13,48($sp) | |
857 | + br $ra | |
858 | + | |
859 | +.align 16 | |
860 | +.Lnot128: | |
861 | + llgf $t0,16($inp) | |
862 | + llgf $t1,20($inp) | |
863 | + st $t0,16($key) | |
864 | + st $t1,20($key) | |
865 | + lghi $t0,192 | |
866 | + cr $bits,$t0 | |
867 | + jne .Lnot192 | |
868 | + | |
869 | + llill $mask,0xff | |
870 | + lghi $t3,0 # i=0 | |
871 | + lghi $rounds,12 | |
872 | + st $rounds,240($key) | |
873 | + lghi $rounds,8 | |
874 | + | |
875 | + srlg $i1,$t1,8 | |
876 | + srlg $i2,$t1,16 | |
877 | + srlg $i3,$t1,24 | |
878 | + nr $t1,$mask | |
879 | + nr $i1,$mask | |
880 | + nr $i2,$mask | |
881 | + | |
882 | +.align 16 | |
883 | +.L192_loop: | |
884 | + la $t1,0($t1,$tbl) | |
885 | + la $i1,0($i1,$tbl) | |
886 | + la $i2,0($i2,$tbl) | |
887 | + la $i3,0($i3,$tbl) | |
888 | + icm $t1,2,0($t1) # Te4[rk[5]>>0]<<8 | |
889 | + icm $t1,4,0($i1) # Te4[rk[5]>>8]<<16 | |
890 | + icm $t1,8,0($i2) # Te4[rk[5]>>16]<<24 | |
891 | + icm $t1,1,0($i3) # Te4[rk[5]>>24] | |
892 | + x $t1,256($t3,$tbl) # rcon[i] | |
893 | + xr $s0,$t1 # rk[6]=rk[0]^... | |
894 | + xr $s1,$s0 # rk[7]=rk[1]^rk[6] | |
895 | + xr $s2,$s1 # rk[8]=rk[2]^rk[7] | |
896 | + xr $s3,$s2 # rk[9]=rk[3]^rk[8] | |
897 | + | |
898 | + st $s0,24($key) | |
899 | + st $s1,28($key) | |
900 | + st $s2,32($key) | |
901 | + st $s3,36($key) | |
902 | + brct $rounds,.L192_continue | |
903 | + lghi %r2,0 | |
904 | + lmg %r6,%r13,48($sp) | |
905 | + br $ra | |
906 | + | |
907 | +.align 16 | |
908 | +.L192_continue: | |
909 | + lgr $t1,$s3 | |
910 | + x $t1,16($key) # rk[10]=rk[4]^rk[9] | |
911 | + st $t1,40($key) | |
912 | + x $t1,20($key) # rk[11]=rk[5]^rk[10] | |
913 | + st $t1,44($key) | |
914 | + | |
915 | + srlg $i1,$t1,8 | |
916 | + srlg $i2,$t1,16 | |
917 | + srlg $i3,$t1,24 | |
918 | + nr $t1,$mask | |
919 | + nr $i1,$mask | |
920 | + nr $i2,$mask | |
921 | + | |
922 | + la $key,24($key) # key+=6 | |
923 | + la $t3,4($t3) # i++ | |
924 | + j .L192_loop | |
925 | + | |
926 | +.align 16 | |
927 | +.Lnot192: | |
928 | + llgf $t0,24($inp) | |
929 | + llgf $t1,28($inp) | |
930 | + st $t0,24($key) | |
931 | + st $t1,28($key) | |
932 | + llill $mask,0xff | |
933 | + lghi $t3,0 # i=0 | |
934 | + lghi $rounds,14 | |
935 | + st $rounds,240($key) | |
936 | + lghi $rounds,7 | |
937 | + | |
938 | + srlg $i1,$t1,8 | |
939 | + srlg $i2,$t1,16 | |
940 | + srlg $i3,$t1,24 | |
941 | + nr $t1,$mask | |
942 | + nr $i1,$mask | |
943 | + nr $i2,$mask | |
944 | + | |
945 | +.align 16 | |
946 | +.L256_loop: | |
947 | + la $t1,0($t1,$tbl) | |
948 | + la $i1,0($i1,$tbl) | |
949 | + la $i2,0($i2,$tbl) | |
950 | + la $i3,0($i3,$tbl) | |
951 | + icm $t1,2,0($t1) # Te4[rk[7]>>0]<<8 | |
952 | + icm $t1,4,0($i1) # Te4[rk[7]>>8]<<16 | |
953 | + icm $t1,8,0($i2) # Te4[rk[7]>>16]<<24 | |
954 | + icm $t1,1,0($i3) # Te4[rk[7]>>24] | |
955 | + x $t1,256($t3,$tbl) # rcon[i] | |
956 | + xr $s0,$t1 # rk[8]=rk[0]^... | |
957 | + xr $s1,$s0 # rk[9]=rk[1]^rk[8] | |
958 | + xr $s2,$s1 # rk[10]=rk[2]^rk[9] | |
959 | + xr $s3,$s2 # rk[11]=rk[3]^rk[10] | |
960 | + st $s0,32($key) | |
961 | + st $s1,36($key) | |
962 | + st $s2,40($key) | |
963 | + st $s3,44($key) | |
964 | + brct $rounds,.L256_continue | |
965 | + lghi %r2,0 | |
966 | + lmg %r6,%r13,48($sp) | |
967 | + br $ra | |
968 | + | |
969 | +.align 16 | |
970 | +.L256_continue: | |
971 | + lgr $t1,$s3 # temp=rk[11] | |
972 | + srlg $i1,$s3,8 | |
973 | + srlg $i2,$s3,16 | |
974 | + srlg $i3,$s3,24 | |
975 | + nr $t1,$mask | |
976 | + nr $i1,$mask | |
977 | + nr $i2,$mask | |
978 | + la $t1,0($t1,$tbl) | |
979 | + la $i1,0($i1,$tbl) | |
980 | + la $i2,0($i2,$tbl) | |
981 | + la $i3,0($i3,$tbl) | |
982 | + llgc $t1,0($t1) # Te4[rk[11]>>0] | |
983 | + icm $t1,2,0($i1) # Te4[rk[11]>>8]<<8 | |
984 | + icm $t1,4,0($i2) # Te4[rk[11]>>16]<<16 | |
985 | + icm $t1,8,0($i3) # Te4[rk[11]>>24]<<24 | |
986 | + x $t1,16($key) # rk[12]=rk[4]^... | |
987 | + st $t1,48($key) | |
988 | + x $t1,20($key) # rk[13]=rk[5]^rk[12] | |
989 | + st $t1,52($key) | |
990 | + x $t1,24($key) # rk[14]=rk[6]^rk[13] | |
991 | + st $t1,56($key) | |
992 | + x $t1,28($key) # rk[15]=rk[7]^rk[14] | |
993 | + st $t1,60($key) | |
994 | + | |
995 | + srlg $i1,$t1,8 | |
996 | + srlg $i2,$t1,16 | |
997 | + srlg $i3,$t1,24 | |
998 | + nr $t1,$mask | |
999 | + nr $i1,$mask | |
1000 | + nr $i2,$mask | |
1001 | + | |
1002 | + la $key,32($key) # key+=8 | |
1003 | + la $t3,4($t3) # i++ | |
1004 | + j .L256_loop | |
1005 | + | |
1006 | +.Lminus1: | |
1007 | + lghi %r2,-1 | |
1008 | + br $ra | |
1009 | +.size AES_set_encrypt_key,.-AES_set_encrypt_key | |
1010 | + | |
1011 | +# int AES_set_decrypt_key(const unsigned char *in, int bits, | |
1012 | +# AES_KEY *key) { returns the AES_set_encrypt_key status when it is non-zero; a hardware key (word at 240(key) is 16 or more) only gets bit 0x80 set in its code; otherwise the round keys are reversed in place (.Linv), round keys 1..rounds-1 are transformed word by word (.Lmix) and 0 is returned. NOTE(review): no branch to .Ldkey_internal is visible in this part of the file - confirm whether it is still used | |
1013 | +.globl AES_set_decrypt_key | |
1014 | +.type AES_set_decrypt_key,\@function | |
1015 | +.align 16 | |
1016 | +AES_set_decrypt_key: | |
1017 | + stg $key,32($sp) # I rely on AES_set_encrypt_key to | |
1018 | + stg $ra,112($sp) # save non-volatile registers (its software path stores %r6-%r13 at 48(sp))! | |
1019 | + bras $ra,AES_set_encrypt_key | |
1020 | + lg $key,32($sp) | |
1021 | + lg $ra,112($sp) | |
1022 | + ltgr %r2,%r2 | |
1023 | + bnzr $ra | |
1024 | +___ | |
1025 | +$code.=<<___ if (!$softonly); | |
1026 | + l $t0,240($key) | |
1027 | + lhi $t1,16 | |
1028 | + cr $t0,$t1 | |
1029 | + jl .Lgo | |
1030 | + oill $t0,0x80 # set "decrypt" bit | |
1031 | + st $t0,240($key) | |
1032 | + br $ra | |
1033 | + | |
1034 | +.align 16 | |
1035 | +.Ldkey_internal: | |
1036 | + stg $key,32($sp) | |
1037 | + stg $ra,40($sp) | |
1038 | + bras $ra,.Lekey_internal | |
1039 | + lg $key,32($sp) | |
1040 | + lg $ra,40($sp) | |
1041 | +___ | |
1042 | +$code.=<<___; | |
1043 | + | |
1044 | +.Lgo: llgf $rounds,240($key) | |
1045 | + la $i1,0($key) | |
1046 | + sllg $i2,$rounds,4 | |
1047 | + la $i2,0($i2,$key) | |
1048 | + srl $rounds,1 | |
1049 | + lghi $t1,-16 | |
1050 | + | |
1051 | +.align 16 | |
1052 | +.Linv: lmg $s0,$s1,0($i1) | |
1053 | + lmg $s2,$s3,0($i2) | |
1054 | + stmg $s0,$s1,0($i2) | |
1055 | + stmg $s2,$s3,0($i1) | |
1056 | + la $i1,16($i1) | |
1057 | + la $i2,0($t1,$i2) | |
1058 | + brct $rounds,.Linv | |
1059 | +___ | |
1060 | +$mask80=$i1; | |
1061 | +$mask1b=$i2; | |
1062 | +$maskfe=$i3; | |
1063 | +$code.=<<___; | |
1064 | + llgf $rounds,240($key) | |
1065 | + aghi $rounds,-1 | |
1066 | + sll $rounds,2 # (rounds-1)*4 words go through .Lmix | |
1067 | + llilh $mask80,0x8080 | |
1068 | + llilh $mask1b,0x1b1b | |
1069 | + llilh $maskfe,0xfefe | |
1070 | + oill $mask80,0x8080 | |
1071 | + oill $mask1b,0x1b1b | |
1072 | + oill $maskfe,0xfefe | |
1073 | + | |
1074 | +.align 16 | |
1075 | +.Lmix: l $s0,16($key) # tp1: next key word, starting with the first word of round key 1 | |
1076 | + lr $s1,$s0 | |
1077 | + ngr $s1,$mask80 | |
1078 | + srlg $t1,$s1,7 | |
1079 | + slr $s1,$t1 | |
1080 | + nr $s1,$mask1b | |
1081 | + sllg $t1,$s0,1 | |
1082 | + nr $t1,$maskfe | |
1083 | + xr $s1,$t1 # tp2 | |
1084 | + | |
1085 | + lr $s2,$s1 | |
1086 | + ngr $s2,$mask80 | |
1087 | + srlg $t1,$s2,7 | |
1088 | + slr $s2,$t1 | |
1089 | + nr $s2,$mask1b | |
1090 | + sllg $t1,$s1,1 | |
1091 | + nr $t1,$maskfe | |
1092 | + xr $s2,$t1 # tp4 | |
1093 | + | |
1094 | + lr $s3,$s2 | |
1095 | + ngr $s3,$mask80 | |
1096 | + srlg $t1,$s3,7 | |
1097 | + slr $s3,$t1 | |
1098 | + nr $s3,$mask1b | |
1099 | + sllg $t1,$s2,1 | |
1100 | + nr $t1,$maskfe | |
1101 | + xr $s3,$t1 # tp8 | |
1102 | + | |
1103 | + xr $s1,$s0 # tp2^tp1 | |
1104 | + xr $s2,$s0 # tp4^tp1 | |
1105 | + rll $s0,$s0,24 # = ROTATE(tp1,8) | |
1106 | + xr $s2,$s3 # ^=tp8 | |
1107 | + xr $s0,$s1 # ^=tp2^tp1 | |
1108 | + xr $s1,$s3 # tp2^tp1^tp8 | |
1109 | + xr $s0,$s2 # ^=tp4^tp1^tp8 | |
1110 | + rll $s1,$s1,8 | |
1111 | + rll $s2,$s2,16 | |
1112 | + xr $s0,$s1 # ^= ROTATE(tp8^tp2^tp1,24) | |
1113 | + rll $s3,$s3,24 | |
1114 | + xr $s0,$s2 # ^= ROTATE(tp8^tp4^tp1,16) | |
1115 | + xr $s0,$s3 # ^= ROTATE(tp8,8) | |
1116 | + | |
1117 | + st $s0,16($key) | |
1118 | + la $key,4($key) | |
1119 | + brct $rounds,.Lmix | |
1120 | + | |
1121 | + lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key! | |
1122 | + lghi %r2,0 | |
1123 | + br $ra | |
1124 | +.size AES_set_decrypt_key,.-AES_set_decrypt_key | |
1125 | +___ | |
1126 | + | |
1127 | +#void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, | |
1128 | +# size_t length, const AES_KEY *key, | |
1129 | +# unsigned char *ivec, const int enc) -- CBC over length bytes; a key whose word at 240(key) is 16 or more goes through kmc, anything else through the table-driven loops; enc==0 selects decryption in the software path; ivec is overwritten with the chaining value on return | |
1130 | +{ | |
1131 | +my $inp="%r2"; | |
1132 | +my $out="%r4"; # length and out are swapped | |
1133 | +my $len="%r3"; | |
1134 | +my $key="%r5"; | |
1135 | +my $ivp="%r6"; | |
1136 | + | |
1137 | +$code.=<<___; | |
1138 | +.globl AES_cbc_encrypt | |
1139 | +.type AES_cbc_encrypt,\@function | |
1140 | +.align 16 | |
1141 | +AES_cbc_encrypt: | |
1142 | + xgr %r3,%r4 # flip %r3 and %r4, out and len | |
1143 | + xgr %r4,%r3 | |
1144 | + xgr %r3,%r4 | |
1145 | +___ | |
1146 | +$code.=<<___ if (!$softonly); | |
1147 | + lhi %r0,16 | |
1148 | + cl %r0,240($key) | |
1149 | + jh .Lcbc_software | |
1150 | + | |
1151 | + lg %r0,0($ivp) # copy ivec to the start of the kmc parameter block at 16(sp) | |
1152 | + lg %r1,8($ivp) | |
1153 | + stmg %r0,%r1,16($sp) | |
1154 | + lmg %r0,%r1,0($key) # copy key, cover 256 bit | |
1155 | + stmg %r0,%r1,32($sp) | |
1156 | + lmg %r0,%r1,16($key) | |
1157 | + stmg %r0,%r1,48($sp) | |
1158 | + l %r0,240($key) # load kmc code | |
1159 | + lghi $key,15 # res=len%16, len-=res; | |
1160 | + ngr $key,$len | |
1161 | + slgr $len,$key | |
1162 | + la %r1,16($sp) # parameter block - ivec || key | |
1163 | + jz .Lkmc_truncated | |
1164 | + .long 0xb92f0042 # kmc %r4,%r2 | |
1165 | + brc 1,.-4 # pay attention to "partial completion" | |
1166 | + ltr $key,$key | |
1167 | + jnz .Lkmc_truncated | |
1168 | +.Lkmc_done: | |
1169 | + lmg %r0,%r1,16($sp) # copy ivec to caller | |
1170 | + stg %r0,0($ivp) | |
1171 | + stg %r1,8($ivp) | |
1172 | + br $ra | |
1173 | +.align 16 | |
1174 | +.Lkmc_truncated: | |
1175 | + ahi $key,-1 # res-1: it's the way it's encoded in mvc (length minus one) | |
1176 | + tmll %r0,0x80 | |
1177 | + jnz .Lkmc_truncated_dec | |
1178 | + lghi %r1,0 | |
1179 | + stg %r1,128($sp) | |
1180 | + stg %r1,136($sp) | |
1181 | + bras %r1,1f | |
1182 | + mvc 128(1,$sp),0($inp) | |
1183 | +1: ex $key,0(%r1) | |
1184 | + la %r1,16($sp) # restore parameter block; %r1 was borrowed to address the mvc executed by ex | |
1185 | + la $inp,128($sp) | |
1186 | + lghi $len,16 | |
1187 | + .long 0xb92f0042 # kmc %r4,%r2 | |
1188 | + j .Lkmc_done | |
1189 | +.align 16 | |
1190 | +.Lkmc_truncated_dec: | |
1191 | + stg $out,64($sp) | |
1192 | + la $out,128($sp) | |
1193 | + lghi $len,16 | |
1194 | + .long 0xb92f0042 # kmc %r4,%r2 | |
1195 | + lg $out,64($sp) | |
1196 | + bras %r1,2f | |
1197 | + mvc 0(1,$out),128($sp) | |
1198 | +2: ex $key,0(%r1) | |
1199 | + j .Lkmc_done | |
1200 | +.align 16 | |
1201 | +.Lcbc_software: | |
1202 | +___ | |
1203 | +$code.=<<___; | |
1204 | + stmg $key,$ra,40($sp) | |
1205 | + lhi %r0,0 | |
1206 | + cl %r0,164($sp) | |
1207 | + je .Lcbc_decrypt | |
1208 | + | |
1209 | + larl $tbl,AES_Te | |
1210 | + | |
1211 | + llgf $s0,0($ivp) | |
1212 | + llgf $s1,4($ivp) | |
1213 | + llgf $s2,8($ivp) | |
1214 | + llgf $s3,12($ivp) | |
1215 | + | |
1216 | + lghi $t0,16 | |
1217 | + slgr $len,$t0 | |
1218 | + brc 4,.Lcbc_enc_tail # if borrow, i.e. fewer than 16 bytes are left; NOTE(review): length 0 on entry also borrows, and the tail then executes mvc with a 0xff length field - confirm callers never pass 0 | |
1219 | +.Lcbc_enc_loop: | |
1220 | + stmg $inp,$out,16($sp) | |
1221 | + x $s0,0($inp) | |
1222 | + x $s1,4($inp) | |
1223 | + x $s2,8($inp) | |
1224 | + x $s3,12($inp) | |
1225 | + lgr %r4,$key | |
1226 | + | |
1227 | + bras $ra,_s390x_AES_encrypt | |
1228 | + | |
1229 | + lmg $inp,$key,16($sp) | |
1230 | + st $s0,0($out) | |
1231 | + st $s1,4($out) | |
1232 | + st $s2,8($out) | |
1233 | + st $s3,12($out) | |
1234 | + | |
1235 | + la $inp,16($inp) | |
1236 | + la $out,16($out) | |
1237 | + lghi $t0,16 | |
1238 | + ltgr $len,$len | |
1239 | + jz .Lcbc_enc_done | |
1240 | + slgr $len,$t0 | |
1241 | + brc 4,.Lcbc_enc_tail # if borrow | |
1242 | + j .Lcbc_enc_loop | |
1243 | +.align 16 | |
1244 | +.Lcbc_enc_done: | |
1245 | + lg $ivp,48($sp) | |
1246 | + st $s0,0($ivp) | |
1247 | + st $s1,4($ivp) | |
1248 | + st $s2,8($ivp) | |
1249 | + st $s3,12($ivp) | |
1250 | + | |
1251 | + lmg %r7,$ra,56($sp) | |
1252 | + br $ra | |
1253 | + | |
1254 | +.align 16 | |
1255 | +.Lcbc_enc_tail: | |
1256 | + aghi $len,15 | |
1257 | + lghi $t0,0 | |
1258 | + stg $t0,128($sp) | |
1259 | + stg $t0,136($sp) | |
1260 | + bras $t1,3f | |
1261 | + mvc 128(1,$sp),0($inp) | |
1262 | +3: ex $len,0($t1) | |
1263 | + lghi $len,0 | |
1264 | + la $inp,128($sp) | |
1265 | + j .Lcbc_enc_loop | |
1266 | + | |
1267 | +.align 16 | |
1268 | +.Lcbc_decrypt: | |
1269 | + larl $tbl,AES_Td | |
1270 | + | |
1271 | + lg $t0,0($ivp) | |
1272 | + lg $t1,8($ivp) | |
1273 | + stmg $t0,$t1,128($sp) | |
1274 | + | |
1275 | +.Lcbc_dec_loop: | |
1276 | + stmg $inp,$out,16($sp) | |
1277 | + llgf $s0,0($inp) | |
1278 | + llgf $s1,4($inp) | |
1279 | + llgf $s2,8($inp) | |
1280 | + llgf $s3,12($inp) | |
1281 | + lgr %r4,$key | |
1282 | + | |
1283 | + bras $ra,_s390x_AES_decrypt | |
1284 | + | |
1285 | + lmg $inp,$key,16($sp) | |
1286 | + sllg $s0,$s0,32 | |
1287 | + sllg $s2,$s2,32 | |
1288 | + lr $s0,$s1 | |
1289 | + lr $s2,$s3 | |
1290 | + | |
1291 | + lg $t0,0($inp) | |
1292 | + lg $t1,8($inp) | |
1293 | + xg $s0,128($sp) | |
1294 | + xg $s2,136($sp) | |
1295 | + lghi $s1,16 | |
1296 | + slgr $len,$s1 | |
1297 | + brc 4,.Lcbc_dec_tail # if borrow: fewer than 16 bytes were left, only a partial block is written | |
1298 | + brc 2,.Lcbc_dec_done # if zero: this was the last full block | |
1299 | + stg $s0,0($out) | |
1300 | + stg $s2,8($out) | |
1301 | + stmg $t0,$t1,128($sp) | |
1302 | + | |
1303 | + la $inp,16($inp) | |
1304 | + la $out,16($out) | |
1305 | + j .Lcbc_dec_loop | |
1306 | + | |
1307 | +.Lcbc_dec_done: | |
1308 | + stg $s0,0($out) | |
1309 | + stg $s2,8($out) | |
1310 | +.Lcbc_dec_exit: | |
1311 | + lmg $ivp,$ra,48($sp) | |
1312 | + stmg $t0,$t1,0($ivp) | |
1313 | + | |
1314 | + br $ra | |
1315 | + | |
1316 | +.align 16 | |
1317 | +.Lcbc_dec_tail: | |
1318 | + aghi $len,15 | |
1319 | + stg $s0,128($sp) | |
1320 | + stg $s2,136($sp) | |
1321 | + bras $s1,4f | |
1322 | + mvc 0(1,$out),128($sp) | |
1323 | +4: ex $len,0($s1) | |
1324 | + j .Lcbc_dec_exit | |
1325 | +.size AES_cbc_encrypt,.-AES_cbc_encrypt | |
1326 | +___ | |
1327 | +} | |
1328 | +$code.=<<___; | |
1329 | +.string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>" | |
1330 | +___ | |
1331 | + | |
1332 | +$code =~ s/\`([^\`]*)\`/eval $1/gem; # replace every backtick-quoted expression in the generated text with its evaluated value | |
1333 | +print $code; # write the finished assembly to standard output |
@@ -0,0 +1,1181 @@ | ||
1 | +#!/usr/bin/env perl | |
2 | +# | |
3 | +# ==================================================================== | |
4 | +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
5 | +# project. Rights for redistribution and usage in source and binary | |
6 | +# forms are granted according to the OpenSSL license. | |
7 | +# ==================================================================== | |
8 | +# | |
9 | +# Version 1.1 | |
10 | +# | |
11 | +# The major reason for undertaking this effort was to mitigate the hazard | |
12 | +# of cache-timing attacks. This is [currently and initially!] addressed in | |
13 | +# two ways. 1. S-boxes are compressed from 5KB to 2KB+256B in size each. | |
14 | +# 2. References to them are scheduled for L2 cache latency, meaning | |
15 | +# that the tables don't have to reside in L1 cache. Once again, this | |
16 | +# is an initial draft and one should expect more countermeasures to | |
17 | +# be implemented... | |
18 | +# | |
19 | +# Version 1.1 prefetches T[ed]4 in order to mitigate attack on last | |
20 | +# round. | |
21 | +# | |
22 | +# Even though performance was not the primary goal [on the contrary, | |
23 | +# extra shifts "induced" by the compressed S-box and the longer loop epilogue | |
24 | +# "induced" by scheduling for L2 have a negative effect on performance], | |
25 | +# the code turned out to run in ~23 cycles per processed byte en-/ | |
26 | +# decrypted with a 128-bit key. This is a pretty good result for code | |
27 | +# with the mentioned qualities on an UltraSPARC core. Compared to Sun C | |
28 | +# generated code my encrypt procedure runs just a few percent faster, | |
29 | +# while the decrypt one is a whole 50% faster [yes, Sun C failed to generate | |
30 | +# an optimal decrypt procedure]. Compared to GNU C generated code both | |
31 | +# procedures are more than 60% faster:-) | |
32 | + | |
33 | +$bits=32; # default: 32-bit build | |
34 | +for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } # either of these flags on the command line selects the 64-bit build | |
35 | +if ($bits==64) { $bias=2047; $frame=192; } # 64-bit: offset added to %sp-relative addresses, and the frame size handed to save | |
36 | +else { $bias=0; $frame=112; } # 32-bit: no offset, smaller frame | |
37 | +$locals=16; # extra frame bytes reserved by the core routines; the return address is parked there | |
38 | + | |
39 | +$acc0="%l0"; # acc0..acc15: scratch registers receiving the sixteen table lookups made per round | |
40 | +$acc1="%o0"; | |
41 | +$acc2="%o1"; | |
42 | +$acc3="%o2"; | |
43 | + | |
44 | +$acc4="%l1"; | |
45 | +$acc5="%o3"; | |
46 | +$acc6="%o4"; | |
47 | +$acc7="%o5"; | |
48 | + | |
49 | +$acc8="%l2"; | |
50 | +$acc9="%o7"; | |
51 | +$acc10="%g1"; | |
52 | +$acc11="%g2"; | |
53 | + | |
54 | +$acc12="%l3"; | |
55 | +$acc13="%g3"; | |
56 | +$acc14="%g4"; | |
57 | +$acc15="%g5"; | |
58 | + | |
59 | +$t0="%l4"; # t0..t3: loaded with round-key words, then xor-accumulate the lookups into the next state | |
60 | +$t1="%l5"; | |
61 | +$t2="%l6"; | |
62 | +$t3="%l7"; | |
63 | + | |
64 | +$s0="%i0"; # s0..s3: the four 32-bit state words (first four incoming registers) | |
65 | +$s1="%i1"; | |
66 | +$s2="%i2"; | |
67 | +$s3="%i3"; | |
68 | +$tbl="%i4"; # base address of the lookup table | |
69 | +$key="%i5"; # pointer into the key schedule | |
70 | +$rounds="%i7"; # aliases with return address, which is off-loaded to stack | |
71 | + | |
72 | +sub _data_word() # NOTE(review): empty prototype although arguments are passed; the &_data_word(...) call form used in this file bypasses prototype checking | |
73 | +{ my $i; | |
74 | + while(defined($i=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$i,$i; } # append one .long line per argument holding the same 32-bit value twice; stops at the first undef argument | |
75 | +} | |
76 | + | |
77 | +$code.=<<___ if ($bits==64); # 64-bit build only: declare %g2 and %g3 as scratch registers to the assembler | |
78 | +.register %g2,#scratch | |
79 | +.register %g3,#scratch | |
80 | +___ | |
81 | +$code.=<<___; # open the text section and place the AES_Te label on a 256-byte boundary | |
82 | +.section ".text",#alloc,#execinstr | |
83 | + | |
84 | +.align 256 | |
85 | +AES_Te: | |
86 | +___ | |
87 | +&_data_word( | |
88 | + 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, | |
89 | + 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554, | |
90 | + 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d, | |
91 | + 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a, | |
92 | + 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87, | |
93 | + 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b, | |
94 | + 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea, | |
95 | + 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b, | |
96 | + 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a, | |
97 | + 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f, | |
98 | + 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108, | |
99 | + 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f, | |
100 | + 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e, | |
101 | + 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5, | |
102 | + 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d, | |
103 | + 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f, | |
104 | + 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e, | |
105 | + 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb, | |
106 | + 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce, | |
107 | + 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497, | |
108 | + 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c, | |
109 | + 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed, | |
110 | + 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b, | |
111 | + 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a, | |
112 | + 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16, | |
113 | + 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594, | |
114 | + 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81, | |
115 | + 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3, | |
116 | + 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a, | |
117 | + 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504, | |
118 | + 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163, | |
119 | + 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d, | |
120 | + 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f, | |
121 | + 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739, | |
122 | + 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47, | |
123 | + 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395, | |
124 | + 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f, | |
125 | + 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883, | |
126 | + 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c, | |
127 | + 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76, | |
128 | + 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e, | |
129 | + 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4, | |
130 | + 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6, | |
131 | + 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b, | |
132 | + 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7, | |
133 | + 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0, | |
134 | + 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25, | |
135 | + 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818, | |
136 | + 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72, | |
137 | + 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651, | |
138 | + 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21, | |
139 | + 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85, | |
140 | + 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa, | |
141 | + 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12, | |
142 | + 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0, | |
143 | + 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9, | |
144 | + 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133, | |
145 | + 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7, | |
146 | + 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920, | |
147 | + 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a, | |
148 | + 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17, | |
149 | + 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8, | |
150 | + 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11, | |
151 | + 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a); | |
152 | +$code.=<<___; | |
153 | + .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 | |
154 | + .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 | |
155 | + .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 | |
156 | + .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 | |
157 | + .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc | |
158 | + .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 | |
159 | + .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a | |
160 | + .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 | |
161 | + .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 | |
162 | + .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 | |
163 | + .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b | |
164 | + .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf | |
165 | + .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 | |
166 | + .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 | |
167 | + .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 | |
168 | + .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 | |
169 | + .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 | |
170 | + .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 | |
171 | + .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 | |
172 | + .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb | |
173 | + .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c | |
174 | + .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 | |
175 | + .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 | |
176 | + .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 | |
177 | + .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 | |
178 | + .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a | |
179 | + .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e | |
180 | + .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e | |
181 | + .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 | |
182 | + .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf | |
183 | + .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 | |
184 | + .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 | |
185 | +.type AES_Te,#object | |
186 | +.size AES_Te,(.-AES_Te) | |
187 | + | |
188 | +.align 64 | |
189 | +.skip 16 | |
190 | +_sparcv9_AES_encrypt: ! core routine; as called from AES_encrypt: block in %o0-%o3, AES_Te address in %o4, key pointer in %o5 | |
191 | + save %sp,-$frame-$locals,%sp ! new register window: those arguments are now %i0-%i5 | |
192 | + stx %i7,[%sp+$bias+$frame+0] ! off-load return address | |
193 | + ld [$key+240],$rounds ! round count is read from the word at offset 240 of the key schedule | |
194 | + ld [$key+0],$t0 | |
195 | + ld [$key+4],$t1 ! | |
196 | + ld [$key+8],$t2 | |
197 | + srl $rounds,1,$rounds ! halve it: each pass through the loop below covers two rounds | |
198 | + xor $t0,$s0,$s0 ! xor the first round key into the state | |
199 | + ld [$key+12],$t3 | |
200 | + srl $s0,21,$acc0 ! together with the 2040 mask below: top byte of s0 times 8 | |
201 | + xor $t1,$s1,$s1 | |
202 | + ld [$key+16],$t0 | |
203 | + srl $s1,13,$acc1 ! | |
204 | + xor $t2,$s2,$s2 | |
205 | + ld [$key+20],$t1 | |
206 | + xor $t3,$s3,$s3 | |
207 | + ld [$key+24],$t2 | |
208 | + and $acc0,2040,$acc0 | |
209 | + ld [$key+28],$t3 | |
210 | + nop | |
211 | +.Lenc_loop: | |
212 | + srl $s2,5,$acc2 ! | |
213 | + and $acc1,2040,$acc1 | |
214 | + ldx [$tbl+$acc0],$acc0 ! each table entry is 8 bytes: the same 32-bit word stored twice | |
215 | + sll $s3,3,$acc3 | |
216 | + and $acc2,2040,$acc2 | |
217 | + ldx [$tbl+$acc1],$acc1 | |
218 | + srl $s1,21,$acc4 | |
219 | + and $acc3,2040,$acc3 | |
220 | + ldx [$tbl+$acc2],$acc2 ! | |
221 | + srl $s2,13,$acc5 | |
222 | + and $acc4,2040,$acc4 | |
223 | + ldx [$tbl+$acc3],$acc3 | |
224 | + srl $s3,5,$acc6 | |
225 | + and $acc5,2040,$acc5 | |
226 | + ldx [$tbl+$acc4],$acc4 | |
227 | + fmovs %f0,%f0 | |
228 | + sll $s0,3,$acc7 ! | |
229 | + and $acc6,2040,$acc6 | |
230 | + ldx [$tbl+$acc5],$acc5 | |
231 | + srl $s2,21,$acc8 | |
232 | + and $acc7,2040,$acc7 | |
233 | + ldx [$tbl+$acc6],$acc6 | |
234 | + srl $s3,13,$acc9 | |
235 | + and $acc8,2040,$acc8 | |
236 | + ldx [$tbl+$acc7],$acc7 ! | |
237 | + srl $s0,5,$acc10 | |
238 | + and $acc9,2040,$acc9 | |
239 | + ldx [$tbl+$acc8],$acc8 | |
240 | + sll $s1,3,$acc11 | |
241 | + and $acc10,2040,$acc10 | |
242 | + ldx [$tbl+$acc9],$acc9 | |
243 | + fmovs %f0,%f0 | |
244 | + srl $s3,21,$acc12 ! | |
245 | + and $acc11,2040,$acc11 | |
246 | + ldx [$tbl+$acc10],$acc10 | |
247 | + srl $s0,13,$acc13 | |
248 | + and $acc12,2040,$acc12 | |
249 | + ldx [$tbl+$acc11],$acc11 | |
250 | + srl $s1,5,$acc14 | |
251 | + and $acc13,2040,$acc13 | |
252 | + ldx [$tbl+$acc12],$acc12 ! | |
253 | + sll $s2,3,$acc15 | |
254 | + and $acc14,2040,$acc14 | |
255 | + ldx [$tbl+$acc13],$acc13 | |
256 | + and $acc15,2040,$acc15 | |
257 | + add $key,32,$key ! step the key pointer by 32 bytes (two sets of four round-key words) | |
258 | + ldx [$tbl+$acc14],$acc14 | |
259 | + fmovs %f0,%f0 | |
260 | + subcc $rounds,1,$rounds ! | |
261 | + ldx [$tbl+$acc15],$acc15 | |
262 | + bz,a,pn %icc,.Lenc_last ! counter reached zero: finish with the last round | |
263 | + add $tbl,2048,$rounds ! delay slot, executed only when the branch is taken: point at the byte table 2048 bytes past the table base | |
264 | + | |
265 | + srlx $acc1,8,$acc1 ! low 32 bits become the entry word rotated right by 8 (16 and 24 for the shifts that follow) | |
266 | + xor $acc0,$t0,$t0 | |
267 | + ld [$key+0],$s0 | |
268 | + fmovs %f0,%f0 | |
269 | + srlx $acc2,16,$acc2 ! | |
270 | + xor $acc1,$t0,$t0 | |
271 | + ld [$key+4],$s1 | |
272 | + srlx $acc3,24,$acc3 | |
273 | + xor $acc2,$t0,$t0 | |
274 | + ld [$key+8],$s2 | |
275 | + srlx $acc5,8,$acc5 | |
276 | + xor $acc3,$t0,$t0 | |
277 | + ld [$key+12],$s3 ! | |
278 | + srlx $acc6,16,$acc6 | |
279 | + xor $acc4,$t1,$t1 | |
280 | + fmovs %f0,%f0 | |
281 | + srlx $acc7,24,$acc7 | |
282 | + xor $acc5,$t1,$t1 | |
283 | + srlx $acc9,8,$acc9 | |
284 | + xor $acc6,$t1,$t1 | |
285 | + srlx $acc10,16,$acc10 ! | |
286 | + xor $acc7,$t1,$t1 | |
287 | + srlx $acc11,24,$acc11 | |
288 | + xor $acc8,$t2,$t2 | |
289 | + srlx $acc13,8,$acc13 | |
290 | + xor $acc9,$t2,$t2 | |
291 | + srlx $acc14,16,$acc14 | |
292 | + xor $acc10,$t2,$t2 | |
293 | + srlx $acc15,24,$acc15 ! | |
294 | + xor $acc11,$t2,$t2 | |
295 | + xor $acc12,$acc14,$acc14 | |
296 | + xor $acc13,$t3,$t3 | |
297 | + srl $t0,21,$acc0 ! begin indexing for the second round of this pass, now reading t0-t3 | |
298 | + xor $acc14,$t3,$t3 | |
299 | + srl $t1,13,$acc1 | |
300 | + xor $acc15,$t3,$t3 | |
301 | + | |
302 | + and $acc0,2040,$acc0 ! | |
303 | + srl $t2,5,$acc2 | |
304 | + and $acc1,2040,$acc1 | |
305 | + ldx [$tbl+$acc0],$acc0 | |
306 | + sll $t3,3,$acc3 | |
307 | + and $acc2,2040,$acc2 | |
308 | + ldx [$tbl+$acc1],$acc1 | |
309 | + fmovs %f0,%f0 | |
310 | + srl $t1,21,$acc4 ! | |
311 | + and $acc3,2040,$acc3 | |
312 | + ldx [$tbl+$acc2],$acc2 | |
313 | + srl $t2,13,$acc5 | |
314 | + and $acc4,2040,$acc4 | |
315 | + ldx [$tbl+$acc3],$acc3 | |
316 | + srl $t3,5,$acc6 | |
317 | + and $acc5,2040,$acc5 | |
318 | + ldx [$tbl+$acc4],$acc4 ! | |
319 | + sll $t0,3,$acc7 | |
320 | + and $acc6,2040,$acc6 | |
321 | + ldx [$tbl+$acc5],$acc5 | |
322 | + srl $t2,21,$acc8 | |
323 | + and $acc7,2040,$acc7 | |
324 | + ldx [$tbl+$acc6],$acc6 | |
325 | + fmovs %f0,%f0 | |
326 | + srl $t3,13,$acc9 ! | |
327 | + and $acc8,2040,$acc8 | |
328 | + ldx [$tbl+$acc7],$acc7 | |
329 | + srl $t0,5,$acc10 | |
330 | + and $acc9,2040,$acc9 | |
331 | + ldx [$tbl+$acc8],$acc8 | |
332 | + sll $t1,3,$acc11 | |
333 | + and $acc10,2040,$acc10 | |
334 | + ldx [$tbl+$acc9],$acc9 ! | |
335 | + srl $t3,21,$acc12 | |
336 | + and $acc11,2040,$acc11 | |
337 | + ldx [$tbl+$acc10],$acc10 | |
338 | + srl $t0,13,$acc13 | |
339 | + and $acc12,2040,$acc12 | |
340 | + ldx [$tbl+$acc11],$acc11 | |
341 | + fmovs %f0,%f0 | |
342 | + srl $t1,5,$acc14 ! | |
343 | + and $acc13,2040,$acc13 | |
344 | + ldx [$tbl+$acc12],$acc12 | |
345 | + sll $t2,3,$acc15 | |
346 | + and $acc14,2040,$acc14 | |
347 | + ldx [$tbl+$acc13],$acc13 | |
348 | + srlx $acc1,8,$acc1 | |
349 | + and $acc15,2040,$acc15 | |
350 | + ldx [$tbl+$acc14],$acc14 ! | |
351 | + | |
352 | + srlx $acc2,16,$acc2 | |
353 | + xor $acc0,$s0,$s0 | |
354 | + ldx [$tbl+$acc15],$acc15 | |
355 | + srlx $acc3,24,$acc3 | |
356 | + xor $acc1,$s0,$s0 | |
357 | + ld [$key+16],$t0 | |
358 | + fmovs %f0,%f0 | |
359 | + srlx $acc5,8,$acc5 ! | |
360 | + xor $acc2,$s0,$s0 | |
361 | + ld [$key+20],$t1 | |
362 | + srlx $acc6,16,$acc6 | |
363 | + xor $acc3,$s0,$s0 | |
364 | + ld [$key+24],$t2 | |
365 | + srlx $acc7,24,$acc7 | |
366 | + xor $acc4,$s1,$s1 | |
367 | + ld [$key+28],$t3 ! | |
368 | + srlx $acc9,8,$acc9 | |
369 | + xor $acc5,$s1,$s1 | |
370 | + ldx [$tbl+2048+0],%g0 ! prefetch te4 | |
371 | + srlx $acc10,16,$acc10 | |
372 | + xor $acc6,$s1,$s1 | |
373 | + ldx [$tbl+2048+32],%g0 ! prefetch te4 | |
374 | + srlx $acc11,24,$acc11 | |
375 | + xor $acc7,$s1,$s1 | |
376 | + ldx [$tbl+2048+64],%g0 ! prefetch te4 | |
377 | + srlx $acc13,8,$acc13 | |
378 | + xor $acc8,$s2,$s2 | |
379 | + ldx [$tbl+2048+96],%g0 ! prefetch te4 | |
380 | + srlx $acc14,16,$acc14 ! | |
381 | + xor $acc9,$s2,$s2 | |
382 | + ldx [$tbl+2048+128],%g0 ! prefetch te4 | |
383 | + srlx $acc15,24,$acc15 | |
384 | + xor $acc10,$s2,$s2 | |
385 | + ldx [$tbl+2048+160],%g0 ! prefetch te4 | |
386 | + srl $s0,21,$acc0 | |
387 | + xor $acc11,$s2,$s2 | |
388 | + ldx [$tbl+2048+192],%g0 ! prefetch te4 | |
389 | + xor $acc12,$acc14,$acc14 | |
390 | + xor $acc13,$s3,$s3 | |
391 | + ldx [$tbl+2048+224],%g0 ! prefetch te4 | |
392 | + srl $s1,13,$acc1 ! | |
393 | + xor $acc14,$s3,$s3 | |
394 | + xor $acc15,$s3,$s3 | |
395 | + ba .Lenc_loop ! next pass | |
396 | + and $acc0,2040,$acc0 ! delay slot: finish the first index for the next pass | |
397 | + | |
398 | +.align 32 | |
399 | +.Lenc_last: | |
400 | + srlx $acc1,8,$acc1 ! | |
401 | + xor $acc0,$t0,$t0 | |
402 | + ld [$key+0],$s0 ! s0-s3 are reloaded with the final four round-key words | |
403 | + srlx $acc2,16,$acc2 | |
404 | + xor $acc1,$t0,$t0 | |
405 | + ld [$key+4],$s1 | |
406 | + srlx $acc3,24,$acc3 | |
407 | + xor $acc2,$t0,$t0 | |
408 | + ld [$key+8],$s2 ! | |
409 | + srlx $acc5,8,$acc5 | |
410 | + xor $acc3,$t0,$t0 | |
411 | + ld [$key+12],$s3 | |
412 | + srlx $acc6,16,$acc6 | |
413 | + xor $acc4,$t1,$t1 | |
414 | + srlx $acc7,24,$acc7 | |
415 | + xor $acc5,$t1,$t1 | |
416 | + srlx $acc9,8,$acc9 ! | |
417 | + xor $acc6,$t1,$t1 | |
418 | + srlx $acc10,16,$acc10 | |
419 | + xor $acc7,$t1,$t1 | |
420 | + srlx $acc11,24,$acc11 | |
421 | + xor $acc8,$t2,$t2 | |
422 | + srlx $acc13,8,$acc13 | |
423 | + xor $acc9,$t2,$t2 | |
424 | + srlx $acc14,16,$acc14 ! | |
425 | + xor $acc10,$t2,$t2 | |
426 | + srlx $acc15,24,$acc15 | |
427 | + xor $acc11,$t2,$t2 | |
428 | + xor $acc12,$acc14,$acc14 | |
429 | + xor $acc13,$t3,$t3 | |
430 | + srl $t0,24,$acc0 ! final round: plain byte indices (no scaling by 8) into the byte table | |
431 | + xor $acc14,$t3,$t3 | |
432 | + srl $t1,16,$acc1 ! | |
433 | + xor $acc15,$t3,$t3 | |
434 | + | |
435 | + srl $t2,8,$acc2 | |
436 | + and $acc1,255,$acc1 | |
437 | + ldub [$rounds+$acc0],$acc0 ! %i7 now holds the byte-table address set in the branch delay slot | |
438 | + srl $t1,24,$acc4 | |
439 | + and $acc2,255,$acc2 | |
440 | + ldub [$rounds+$acc1],$acc1 | |
441 | + srl $t2,16,$acc5 ! | |
442 | + and $t3,255,$acc3 | |
443 | + ldub [$rounds+$acc2],$acc2 | |
444 | + ldub [$rounds+$acc3],$acc3 | |
445 | + srl $t3,8,$acc6 | |
446 | + and $acc5,255,$acc5 | |
447 | + ldub [$rounds+$acc4],$acc4 | |
448 | + fmovs %f0,%f0 | |
449 | + srl $t2,24,$acc8 ! | |
450 | + and $acc6,255,$acc6 | |
451 | + ldub [$rounds+$acc5],$acc5 | |
452 | + srl $t3,16,$acc9 | |
453 | + and $t0,255,$acc7 | |
454 | + ldub [$rounds+$acc6],$acc6 | |
455 | + ldub [$rounds+$acc7],$acc7 | |
456 | + fmovs %f0,%f0 | |
457 | + srl $t0,8,$acc10 ! | |
458 | + and $acc9,255,$acc9 | |
459 | + ldub [$rounds+$acc8],$acc8 | |
460 | + srl $t3,24,$acc12 | |
461 | + and $acc10,255,$acc10 | |
462 | + ldub [$rounds+$acc9],$acc9 | |
463 | + srl $t0,16,$acc13 | |
464 | + and $t1,255,$acc11 | |
465 | + ldub [$rounds+$acc10],$acc10 ! | |
466 | + srl $t1,8,$acc14 | |
467 | + and $acc13,255,$acc13 | |
468 | + ldub [$rounds+$acc11],$acc11 | |
469 | + ldub [$rounds+$acc12],$acc12 | |
470 | + and $acc14,255,$acc14 | |
471 | + ldub [$rounds+$acc13],$acc13 | |
472 | + and $t2,255,$acc15 | |
473 | + ldub [$rounds+$acc14],$acc14 ! | |
474 | + | |
475 | + sll $acc0,24,$acc0 ! shift each looked-up byte into its position within the output word | |
476 | + xor $acc3,$s0,$s0 | |
477 | + ldub [$rounds+$acc15],$acc15 | |
478 | + sll $acc1,16,$acc1 | |
479 | + xor $acc0,$s0,$s0 | |
480 | + ldx [%sp+$bias+$frame+0],%i7 ! restore return address | |
481 | + fmovs %f0,%f0 | |
482 | + sll $acc2,8,$acc2 ! | |
483 | + xor $acc1,$s0,$s0 | |
484 | + sll $acc4,24,$acc4 | |
485 | + xor $acc2,$s0,$s0 | |
486 | + sll $acc5,16,$acc5 | |
487 | + xor $acc7,$s1,$s1 | |
488 | + sll $acc6,8,$acc6 | |
489 | + xor $acc4,$s1,$s1 | |
490 | + sll $acc8,24,$acc8 ! | |
491 | + xor $acc5,$s1,$s1 | |
492 | + sll $acc9,16,$acc9 | |
493 | + xor $acc11,$s2,$s2 | |
494 | + sll $acc10,8,$acc10 | |
495 | + xor $acc6,$s1,$s1 | |
496 | + sll $acc12,24,$acc12 | |
497 | + xor $acc8,$s2,$s2 | |
498 | + sll $acc13,16,$acc13 ! | |
499 | + xor $acc9,$s2,$s2 | |
500 | + sll $acc14,8,$acc14 | |
501 | + xor $acc10,$s2,$s2 | |
502 | + xor $acc12,$acc14,$acc14 | |
503 | + xor $acc13,$s3,$s3 | |
504 | + xor $acc14,$s3,$s3 | |
505 | + xor $acc15,$s3,$s3 | |
506 | + | |
507 | + ret ! result words are in %i0-%i3 | |
508 | + restore ! delay slot: window switch hands them to the caller as %o0-%o3 | |
509 | +.type _sparcv9_AES_encrypt,#function | |
510 | +.size _sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt) | |
511 | + | |
512 | +.align 32 | |
513 | +.globl AES_encrypt | |
514 | +AES_encrypt: ! on entry: %o0 = 16-byte input block, %o1 = 16-byte output block, %o2 = key schedule | |
515 | + or %o0,%o1,%g1 ! merge both pointers so one test covers input and output | |
516 | + andcc %g1,3,%g0 ! is either pointer not 4-byte aligned? | |
517 | + bnz,pn %xcc,.Lunaligned_enc ! if so, take the byte-wise path | |
518 | + save %sp,-$frame,%sp ! delay slot: the new window is set up on both paths | |
519 | + | |
520 | + ld [%i0+0],%o0 | |
521 | + ld [%i0+4],%o1 | |
522 | + ld [%i0+8],%o2 | |
523 | + ld [%i0+12],%o3 | |
524 | + | |
525 | +1: call .+8 ! position-independent address: %o7 receives the address of label 1 | |
526 | + add %o7,AES_Te-1b,%o4 ! delay slot: %o4 = address of AES_Te | |
527 | + call _sparcv9_AES_encrypt | |
528 | + mov %i2,%o5 ! delay slot: key pointer for the core routine | |
529 | + | |
530 | + st %o0,[%i1+0] ! core routine left the encrypted block in %o0-%o3 | |
531 | + st %o1,[%i1+4] | |
532 | + st %o2,[%i1+8] | |
533 | + st %o3,[%i1+12] | |
534 | + | |
535 | + ret | |
536 | + restore | |
537 | + | |
538 | +.align 32 | |
539 | +.Lunaligned_enc: | |
540 | + ldub [%i0+0],%l0 ! build the four input words from single bytes, first byte most significant | |
541 | + ldub [%i0+1],%l1 | |
542 | + ldub [%i0+2],%l2 | |
543 | + | |
544 | + sll %l0,24,%l0 | |
545 | + ldub [%i0+3],%l3 | |
546 | + sll %l1,16,%l1 | |
547 | + ldub [%i0+4],%l4 | |
548 | + sll %l2,8,%l2 | |
549 | + or %l1,%l0,%l0 | |
550 | + ldub [%i0+5],%l5 | |
551 | + sll %l4,24,%l4 | |
552 | + or %l3,%l2,%l2 | |
553 | + ldub [%i0+6],%l6 | |
554 | + sll %l5,16,%l5 | |
555 | + or %l0,%l2,%o0 | |
556 | + ldub [%i0+7],%l7 | |
557 | + | |
558 | + sll %l6,8,%l6 | |
559 | + or %l5,%l4,%l4 | |
560 | + ldub [%i0+8],%l0 | |
561 | + or %l7,%l6,%l6 | |
562 | + ldub [%i0+9],%l1 | |
563 | + or %l4,%l6,%o1 | |
564 | + ldub [%i0+10],%l2 | |
565 | + | |
566 | + sll %l0,24,%l0 | |
567 | + ldub [%i0+11],%l3 | |
568 | + sll %l1,16,%l1 | |
569 | + ldub [%i0+12],%l4 | |
570 | + sll %l2,8,%l2 | |
571 | + or %l1,%l0,%l0 | |
572 | + ldub [%i0+13],%l5 | |
573 | + sll %l4,24,%l4 | |
574 | + or %l3,%l2,%l2 | |
575 | + ldub [%i0+14],%l6 | |
576 | + sll %l5,16,%l5 | |
577 | + or %l0,%l2,%o2 | |
578 | + ldub [%i0+15],%l7 | |
579 | + | |
580 | + sll %l6,8,%l6 | |
581 | + or %l5,%l4,%l4 | |
582 | + or %l7,%l6,%l6 | |
583 | + or %l4,%l6,%o3 | |
584 | + | |
585 | +1: call .+8 ! same position-independent AES_Te address computation as on the aligned path | |
586 | + add %o7,AES_Te-1b,%o4 | |
587 | + call _sparcv9_AES_encrypt | |
588 | + mov %i2,%o5 | |
589 | + | |
590 | + srl %o0,24,%l0 ! write the result back one byte at a time, most significant byte first | |
591 | + srl %o0,16,%l1 | |
592 | + stb %l0,[%i1+0] | |
593 | + srl %o0,8,%l2 | |
594 | + stb %l1,[%i1+1] | |
595 | + stb %l2,[%i1+2] | |
596 | + srl %o1,24,%l4 | |
597 | + stb %o0,[%i1+3] | |
598 | + | |
599 | + srl %o1,16,%l5 | |
600 | + stb %l4,[%i1+4] | |
601 | + srl %o1,8,%l6 | |
602 | + stb %l5,[%i1+5] | |
603 | + stb %l6,[%i1+6] | |
604 | + srl %o2,24,%l0 | |
605 | + stb %o1,[%i1+7] | |
606 | + | |
607 | + srl %o2,16,%l1 | |
608 | + stb %l0,[%i1+8] | |
609 | + srl %o2,8,%l2 | |
610 | + stb %l1,[%i1+9] | |
611 | + stb %l2,[%i1+10] | |
612 | + srl %o3,24,%l4 | |
613 | + stb %o2,[%i1+11] | |
614 | + | |
615 | + srl %o3,16,%l5 | |
616 | + stb %l4,[%i1+12] | |
617 | + srl %o3,8,%l6 | |
618 | + stb %l5,[%i1+13] | |
619 | + stb %l6,[%i1+14] | |
620 | + stb %o3,[%i1+15] | |
621 | + | |
622 | + ret | |
623 | + restore | |
624 | +.type AES_encrypt,#function | |
625 | +.size AES_encrypt,(.-AES_encrypt) | |
626 | + | |
627 | +___ | |
628 | + | |
629 | +$code.=<<___; | |
630 | +.align 256 | |
631 | +AES_Td: | |
632 | +___ | |
633 | +&_data_word( | |
634 | + 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96, | |
635 | + 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393, | |
636 | + 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25, | |
637 | + 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f, | |
638 | + 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1, | |
639 | + 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6, | |
640 | + 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da, | |
641 | + 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844, | |
642 | + 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd, | |
643 | + 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4, | |
644 | + 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45, | |
645 | + 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94, | |
646 | + 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7, | |
647 | + 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a, | |
648 | + 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5, | |
649 | + 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c, | |
650 | + 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1, | |
651 | + 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a, | |
652 | + 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75, | |
653 | + 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051, | |
654 | + 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46, | |
655 | + 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff, | |
656 | + 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77, | |
657 | + 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb, | |
658 | + 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000, | |
659 | + 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e, | |
660 | + 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927, | |
661 | + 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a, | |
662 | + 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e, | |
663 | + 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16, | |
664 | + 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d, | |
665 | + 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8, | |
666 | + 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd, | |
667 | + 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34, | |
668 | + 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163, | |
669 | + 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120, | |
670 | + 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d, | |
671 | + 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0, | |
672 | + 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422, | |
673 | + 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef, | |
674 | + 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36, | |
675 | + 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4, | |
676 | + 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662, | |
677 | + 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5, | |
678 | + 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3, | |
679 | + 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b, | |
680 | + 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8, | |
681 | + 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6, | |
682 | + 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6, | |
683 | + 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0, | |
684 | + 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815, | |
685 | + 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f, | |
686 | + 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df, | |
687 | + 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f, | |
688 | + 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e, | |
689 | + 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713, | |
690 | + 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89, | |
691 | + 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c, | |
692 | + 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf, | |
693 | + 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86, | |
694 | + 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f, | |
695 | + 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541, | |
696 | + 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190, | |
697 | + 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742); | |
698 | +$code.=<<___; | |
699 | + .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 | |
700 | + .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb | |
701 | + .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 | |
702 | + .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb | |
703 | + .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d | |
704 | + .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e | |
705 | + .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 | |
706 | + .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 | |
707 | + .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 | |
708 | + .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 | |
709 | + .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda | |
710 | + .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 | |
711 | + .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a | |
712 | + .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 | |
713 | + .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 | |
714 | + .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b | |
715 | + .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea | |
716 | + .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 | |
717 | + .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 | |
718 | + .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e | |
719 | + .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 | |
720 | + .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b | |
721 | + .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 | |
722 | + .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 | |
723 | + .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 | |
724 | + .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f | |
725 | + .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d | |
726 | + .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef | |
727 | + .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 | |
728 | + .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 | |
729 | + .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 | |
730 | + .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d | |
731 | +.type AES_Td,#object | |
732 | +.size AES_Td,(.-AES_Td) | |
733 | + | |
734 | +.align 64 | |
735 | +.skip 16 | |
736 | +_sparcv9_AES_decrypt: | |
737 | + save %sp,-$frame-$locals,%sp | |
738 | + stx %i7,[%sp+$bias+$frame+0] ! off-load return address | |
739 | + ld [$key+240],$rounds | |
740 | + ld [$key+0],$t0 | |
741 | + ld [$key+4],$t1 ! | |
742 | + ld [$key+8],$t2 | |
743 | + ld [$key+12],$t3 | |
744 | + srl $rounds,1,$rounds | |
745 | + xor $t0,$s0,$s0 | |
746 | + ld [$key+16],$t0 | |
747 | + xor $t1,$s1,$s1 | |
748 | + ld [$key+20],$t1 | |
749 | + srl $s0,21,$acc0 ! | |
750 | + xor $t2,$s2,$s2 | |
751 | + ld [$key+24],$t2 | |
752 | + xor $t3,$s3,$s3 | |
753 | + and $acc0,2040,$acc0 | |
754 | + ld [$key+28],$t3 | |
755 | + srl $s3,13,$acc1 | |
756 | + nop | |
757 | +.Ldec_loop: | |
758 | + srl $s2,5,$acc2 ! | |
759 | + and $acc1,2040,$acc1 | |
760 | + ldx [$tbl+$acc0],$acc0 | |
761 | + sll $s1,3,$acc3 | |
762 | + and $acc2,2040,$acc2 | |
763 | + ldx [$tbl+$acc1],$acc1 | |
764 | + srl $s1,21,$acc4 | |
765 | + and $acc3,2040,$acc3 | |
766 | + ldx [$tbl+$acc2],$acc2 ! | |
767 | + srl $s0,13,$acc5 | |
768 | + and $acc4,2040,$acc4 | |
769 | + ldx [$tbl+$acc3],$acc3 | |
770 | + srl $s3,5,$acc6 | |
771 | + and $acc5,2040,$acc5 | |
772 | + ldx [$tbl+$acc4],$acc4 | |
773 | + fmovs %f0,%f0 | |
774 | + sll $s2,3,$acc7 ! | |
775 | + and $acc6,2040,$acc6 | |
776 | + ldx [$tbl+$acc5],$acc5 | |
777 | + srl $s2,21,$acc8 | |
778 | + and $acc7,2040,$acc7 | |
779 | + ldx [$tbl+$acc6],$acc6 | |
780 | + srl $s1,13,$acc9 | |
781 | + and $acc8,2040,$acc8 | |
782 | + ldx [$tbl+$acc7],$acc7 ! | |
783 | + srl $s0,5,$acc10 | |
784 | + and $acc9,2040,$acc9 | |
785 | + ldx [$tbl+$acc8],$acc8 | |
786 | + sll $s3,3,$acc11 | |
787 | + and $acc10,2040,$acc10 | |
788 | + ldx [$tbl+$acc9],$acc9 | |
789 | + fmovs %f0,%f0 | |
790 | + srl $s3,21,$acc12 ! | |
791 | + and $acc11,2040,$acc11 | |
792 | + ldx [$tbl+$acc10],$acc10 | |
793 | + srl $s2,13,$acc13 | |
794 | + and $acc12,2040,$acc12 | |
795 | + ldx [$tbl+$acc11],$acc11 | |
796 | + srl $s1,5,$acc14 | |
797 | + and $acc13,2040,$acc13 | |
798 | + ldx [$tbl+$acc12],$acc12 ! | |
799 | + sll $s0,3,$acc15 | |
800 | + and $acc14,2040,$acc14 | |
801 | + ldx [$tbl+$acc13],$acc13 | |
802 | + and $acc15,2040,$acc15 | |
803 | + add $key,32,$key | |
804 | + ldx [$tbl+$acc14],$acc14 | |
805 | + fmovs %f0,%f0 | |
806 | + subcc $rounds,1,$rounds ! | |
807 | + ldx [$tbl+$acc15],$acc15 | |
808 | + bz,a,pn %icc,.Ldec_last | |
809 | + add $tbl,2048,$rounds | |
810 | + | |
811 | + srlx $acc1,8,$acc1 | |
812 | + xor $acc0,$t0,$t0 | |
813 | + ld [$key+0],$s0 | |
814 | + fmovs %f0,%f0 | |
815 | + srlx $acc2,16,$acc2 ! | |
816 | + xor $acc1,$t0,$t0 | |
817 | + ld [$key+4],$s1 | |
818 | + srlx $acc3,24,$acc3 | |
819 | + xor $acc2,$t0,$t0 | |
820 | + ld [$key+8],$s2 | |
821 | + srlx $acc5,8,$acc5 | |
822 | + xor $acc3,$t0,$t0 | |
823 | + ld [$key+12],$s3 ! | |
824 | + srlx $acc6,16,$acc6 | |
825 | + xor $acc4,$t1,$t1 | |
826 | + fmovs %f0,%f0 | |
827 | + srlx $acc7,24,$acc7 | |
828 | + xor $acc5,$t1,$t1 | |
829 | + srlx $acc9,8,$acc9 | |
830 | + xor $acc6,$t1,$t1 | |
831 | + srlx $acc10,16,$acc10 ! | |
832 | + xor $acc7,$t1,$t1 | |
833 | + srlx $acc11,24,$acc11 | |
834 | + xor $acc8,$t2,$t2 | |
835 | + srlx $acc13,8,$acc13 | |
836 | + xor $acc9,$t2,$t2 | |
837 | + srlx $acc14,16,$acc14 | |
838 | + xor $acc10,$t2,$t2 | |
839 | + srlx $acc15,24,$acc15 ! | |
840 | + xor $acc11,$t2,$t2 | |
841 | + xor $acc12,$acc14,$acc14 | |
842 | + xor $acc13,$t3,$t3 | |
843 | + srl $t0,21,$acc0 | |
844 | + xor $acc14,$t3,$t3 | |
845 | + xor $acc15,$t3,$t3 | |
846 | + srl $t3,13,$acc1 | |
847 | + | |
848 | + and $acc0,2040,$acc0 ! | |
849 | + srl $t2,5,$acc2 | |
850 | + and $acc1,2040,$acc1 | |
851 | + ldx [$tbl+$acc0],$acc0 | |
852 | + sll $t1,3,$acc3 | |
853 | + and $acc2,2040,$acc2 | |
854 | + ldx [$tbl+$acc1],$acc1 | |
855 | + fmovs %f0,%f0 | |
856 | + srl $t1,21,$acc4 ! | |
857 | + and $acc3,2040,$acc3 | |
858 | + ldx [$tbl+$acc2],$acc2 | |
859 | + srl $t0,13,$acc5 | |
860 | + and $acc4,2040,$acc4 | |
861 | + ldx [$tbl+$acc3],$acc3 | |
862 | + srl $t3,5,$acc6 | |
863 | + and $acc5,2040,$acc5 | |
864 | + ldx [$tbl+$acc4],$acc4 ! | |
865 | + sll $t2,3,$acc7 | |
866 | + and $acc6,2040,$acc6 | |
867 | + ldx [$tbl+$acc5],$acc5 | |
868 | + srl $t2,21,$acc8 | |
869 | + and $acc7,2040,$acc7 | |
870 | + ldx [$tbl+$acc6],$acc6 | |
871 | + fmovs %f0,%f0 | |
872 | + srl $t1,13,$acc9 ! | |
873 | + and $acc8,2040,$acc8 | |
874 | + ldx [$tbl+$acc7],$acc7 | |
875 | + srl $t0,5,$acc10 | |
876 | + and $acc9,2040,$acc9 | |
877 | + ldx [$tbl+$acc8],$acc8 | |
878 | + sll $t3,3,$acc11 | |
879 | + and $acc10,2040,$acc10 | |
880 | + ldx [$tbl+$acc9],$acc9 ! | |
881 | + srl $t3,21,$acc12 | |
882 | + and $acc11,2040,$acc11 | |
883 | + ldx [$tbl+$acc10],$acc10 | |
884 | + srl $t2,13,$acc13 | |
885 | + and $acc12,2040,$acc12 | |
886 | + ldx [$tbl+$acc11],$acc11 | |
887 | + fmovs %f0,%f0 | |
888 | + srl $t1,5,$acc14 ! | |
889 | + and $acc13,2040,$acc13 | |
890 | + ldx [$tbl+$acc12],$acc12 | |
891 | + sll $t0,3,$acc15 | |
892 | + and $acc14,2040,$acc14 | |
893 | + ldx [$tbl+$acc13],$acc13 | |
894 | + srlx $acc1,8,$acc1 | |
895 | + and $acc15,2040,$acc15 | |
896 | + ldx [$tbl+$acc14],$acc14 ! | |
897 | + | |
898 | + srlx $acc2,16,$acc2 | |
899 | + xor $acc0,$s0,$s0 | |
900 | + ldx [$tbl+$acc15],$acc15 | |
901 | + srlx $acc3,24,$acc3 | |
902 | + xor $acc1,$s0,$s0 | |
903 | + ld [$key+16],$t0 | |
904 | + fmovs %f0,%f0 | |
905 | + srlx $acc5,8,$acc5 ! | |
906 | + xor $acc2,$s0,$s0 | |
907 | + ld [$key+20],$t1 | |
908 | + srlx $acc6,16,$acc6 | |
909 | + xor $acc3,$s0,$s0 | |
910 | + ld [$key+24],$t2 | |
911 | + srlx $acc7,24,$acc7 | |
912 | + xor $acc4,$s1,$s1 | |
913 | + ld [$key+28],$t3 ! | |
914 | + srlx $acc9,8,$acc9 | |
915 | + xor $acc5,$s1,$s1 | |
916 | + ldx [$tbl+2048+0],%g0 ! prefetch td4 | |
917 | + srlx $acc10,16,$acc10 | |
918 | + xor $acc6,$s1,$s1 | |
919 | + ldx [$tbl+2048+32],%g0 ! prefetch td4 | |
920 | + srlx $acc11,24,$acc11 | |
921 | + xor $acc7,$s1,$s1 | |
922 | + ldx [$tbl+2048+64],%g0 ! prefetch td4 | |
923 | + srlx $acc13,8,$acc13 | |
924 | + xor $acc8,$s2,$s2 | |
925 | + ldx [$tbl+2048+96],%g0 ! prefetch td4 | |
926 | + srlx $acc14,16,$acc14 ! | |
927 | + xor $acc9,$s2,$s2 | |
928 | + ldx [$tbl+2048+128],%g0 ! prefetch td4 | |
929 | + srlx $acc15,24,$acc15 | |
930 | + xor $acc10,$s2,$s2 | |
931 | + ldx [$tbl+2048+160],%g0 ! prefetch td4 | |
932 | + srl $s0,21,$acc0 | |
933 | + xor $acc11,$s2,$s2 | |
934 | + ldx [$tbl+2048+192],%g0 ! prefetch td4 | |
935 | + xor $acc12,$acc14,$acc14 | |
936 | + xor $acc13,$s3,$s3 | |
937 | + ldx [$tbl+2048+224],%g0 ! prefetch td4 | |
938 | + and $acc0,2040,$acc0 ! | |
939 | + xor $acc14,$s3,$s3 | |
940 | + xor $acc15,$s3,$s3 | |
941 | + ba .Ldec_loop | |
942 | + srl $s3,13,$acc1 | |
943 | + | |
944 | +.align 32 | |
945 | +.Ldec_last: | |
946 | + srlx $acc1,8,$acc1 ! | |
947 | + xor $acc0,$t0,$t0 | |
948 | + ld [$key+0],$s0 | |
949 | + srlx $acc2,16,$acc2 | |
950 | + xor $acc1,$t0,$t0 | |
951 | + ld [$key+4],$s1 | |
952 | + srlx $acc3,24,$acc3 | |
953 | + xor $acc2,$t0,$t0 | |
954 | + ld [$key+8],$s2 ! | |
955 | + srlx $acc5,8,$acc5 | |
956 | + xor $acc3,$t0,$t0 | |
957 | + ld [$key+12],$s3 | |
958 | + srlx $acc6,16,$acc6 | |
959 | + xor $acc4,$t1,$t1 | |
960 | + srlx $acc7,24,$acc7 | |
961 | + xor $acc5,$t1,$t1 | |
962 | + srlx $acc9,8,$acc9 ! | |
963 | + xor $acc6,$t1,$t1 | |
964 | + srlx $acc10,16,$acc10 | |
965 | + xor $acc7,$t1,$t1 | |
966 | + srlx $acc11,24,$acc11 | |
967 | + xor $acc8,$t2,$t2 | |
968 | + srlx $acc13,8,$acc13 | |
969 | + xor $acc9,$t2,$t2 | |
970 | + srlx $acc14,16,$acc14 ! | |
971 | + xor $acc10,$t2,$t2 | |
972 | + srlx $acc15,24,$acc15 | |
973 | + xor $acc11,$t2,$t2 | |
974 | + xor $acc12,$acc14,$acc14 | |
975 | + xor $acc13,$t3,$t3 | |
976 | + srl $t0,24,$acc0 | |
977 | + xor $acc14,$t3,$t3 | |
978 | + xor $acc15,$t3,$t3 ! | |
979 | + srl $t3,16,$acc1 | |
980 | + | |
981 | + srl $t2,8,$acc2 | |
982 | + and $acc1,255,$acc1 | |
983 | + ldub [$rounds+$acc0],$acc0 | |
984 | + srl $t1,24,$acc4 | |
985 | + and $acc2,255,$acc2 | |
986 | + ldub [$rounds+$acc1],$acc1 | |
987 | + srl $t0,16,$acc5 ! | |
988 | + and $t1,255,$acc3 | |
989 | + ldub [$rounds+$acc2],$acc2 | |
990 | + ldub [$rounds+$acc3],$acc3 | |
991 | + srl $t3,8,$acc6 | |
992 | + and $acc5,255,$acc5 | |
993 | + ldub [$rounds+$acc4],$acc4 | |
994 | + fmovs %f0,%f0 | |
995 | + srl $t2,24,$acc8 ! | |
996 | + and $acc6,255,$acc6 | |
997 | + ldub [$rounds+$acc5],$acc5 | |
998 | + srl $t1,16,$acc9 | |
999 | + and $t2,255,$acc7 | |
1000 | + ldub [$rounds+$acc6],$acc6 | |
1001 | + ldub [$rounds+$acc7],$acc7 | |
1002 | + fmovs %f0,%f0 | |
1003 | + srl $t0,8,$acc10 ! | |
1004 | + and $acc9,255,$acc9 | |
1005 | + ldub [$rounds+$acc8],$acc8 | |
1006 | + srl $t3,24,$acc12 | |
1007 | + and $acc10,255,$acc10 | |
1008 | + ldub [$rounds+$acc9],$acc9 | |
1009 | + srl $t2,16,$acc13 | |
1010 | + and $t3,255,$acc11 | |
1011 | + ldub [$rounds+$acc10],$acc10 ! | |
1012 | + srl $t1,8,$acc14 | |
1013 | + and $acc13,255,$acc13 | |
1014 | + ldub [$rounds+$acc11],$acc11 | |
1015 | + ldub [$rounds+$acc12],$acc12 | |
1016 | + and $acc14,255,$acc14 | |
1017 | + ldub [$rounds+$acc13],$acc13 | |
1018 | + and $t0,255,$acc15 | |
1019 | + ldub [$rounds+$acc14],$acc14 ! | |
1020 | + | |
1021 | + sll $acc0,24,$acc0 | |
1022 | + xor $acc3,$s0,$s0 | |
1023 | + ldub [$rounds+$acc15],$acc15 | |
1024 | + sll $acc1,16,$acc1 | |
1025 | + xor $acc0,$s0,$s0 | |
1026 | + ldx [%sp+$bias+$frame+0],%i7 ! restore return address | |
1027 | + fmovs %f0,%f0 | |
1028 | + sll $acc2,8,$acc2 ! | |
1029 | + xor $acc1,$s0,$s0 | |
1030 | + sll $acc4,24,$acc4 | |
1031 | + xor $acc2,$s0,$s0 | |
1032 | + sll $acc5,16,$acc5 | |
1033 | + xor $acc7,$s1,$s1 | |
1034 | + sll $acc6,8,$acc6 | |
1035 | + xor $acc4,$s1,$s1 | |
1036 | + sll $acc8,24,$acc8 ! | |
1037 | + xor $acc5,$s1,$s1 | |
1038 | + sll $acc9,16,$acc9 | |
1039 | + xor $acc11,$s2,$s2 | |
1040 | + sll $acc10,8,$acc10 | |
1041 | + xor $acc6,$s1,$s1 | |
1042 | + sll $acc12,24,$acc12 | |
1043 | + xor $acc8,$s2,$s2 | |
1044 | + sll $acc13,16,$acc13 ! | |
1045 | + xor $acc9,$s2,$s2 | |
1046 | + sll $acc14,8,$acc14 | |
1047 | + xor $acc10,$s2,$s2 | |
1048 | + xor $acc12,$acc14,$acc14 | |
1049 | + xor $acc13,$s3,$s3 | |
1050 | + xor $acc14,$s3,$s3 | |
1051 | + xor $acc15,$s3,$s3 | |
1052 | + | |
1053 | + ret | |
1054 | + restore | |
1055 | +.type _sparcv9_AES_decrypt,#function | |
1056 | +.size _sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt) | |
1057 | + | |
! AES_decrypt: public entry point for decrypting one 16-byte block.
! After the save, %i0 addresses the 16 input bytes, %i1 the 16 output
! bytes, and %i2 is handed unchanged to _sparcv9_AES_decrypt in %o5.
! The block travels to and from the core routine as four 32-bit words
! in %o0-%o3, and %o4 carries the address of AES_Td.
1058 | +.align 32 | |
1059 | +.globl AES_decrypt | |
1060 | +AES_decrypt: | |
! If either buffer pointer has one of its two low bits set, take the
! byte-by-byte path below.
1061 | + or %o0,%o1,%g1 | |
1062 | + andcc %g1,3,%g0 | |
1063 | + bnz,pn %xcc,.Lunaligned_dec | |
! Branch delay slot: the new register window is opened on both paths.
1064 | + save %sp,-$frame,%sp | |
1065 | + | |
! Aligned path: fetch the input as four word loads.
1066 | + ld [%i0+0],%o0 | |
1067 | + ld [%i0+4],%o1 | |
1068 | + ld [%i0+8],%o2 | |
1069 | + ld [%i0+12],%o3 | |
1070 | + | |
! The call to .+8 only records its own address in %o7; the add in its
! delay slot turns that into the address of AES_Td (position independent).
1071 | +1: call .+8 | |
1072 | + add %o7,AES_Td-1b,%o4 | |
1073 | + call _sparcv9_AES_decrypt | |
! Delay slot of the call: forward the third argument.
1074 | + mov %i2,%o5 | |
1075 | + | |
! Write the four result words to the output buffer.
1076 | + st %o0,[%i1+0] | |
1077 | + st %o1,[%i1+4] | |
1078 | + st %o2,[%i1+8] | |
1079 | + st %o3,[%i1+12] | |
1080 | + | |
1081 | + ret | |
1082 | + restore | |
1083 | + | |
1084 | +.align 32 | |
1085 | +.Lunaligned_dec: | |
! Unaligned path: build each 32-bit input word from four single-byte
! loads, the byte at the lowest address ending up in bits 31..24.
1086 | + ldub [%i0+0],%l0 | |
1087 | + ldub [%i0+1],%l1 | |
1088 | + ldub [%i0+2],%l2 | |
1089 | + | |
1090 | + sll %l0,24,%l0 | |
1091 | + ldub [%i0+3],%l3 | |
1092 | + sll %l1,16,%l1 | |
1093 | + ldub [%i0+4],%l4 | |
1094 | + sll %l2,8,%l2 | |
1095 | + or %l1,%l0,%l0 | |
1096 | + ldub [%i0+5],%l5 | |
1097 | + sll %l4,24,%l4 | |
1098 | + or %l3,%l2,%l2 | |
1099 | + ldub [%i0+6],%l6 | |
1100 | + sll %l5,16,%l5 | |
! First word (input bytes 0..3) complete.
1101 | + or %l0,%l2,%o0 | |
1102 | + ldub [%i0+7],%l7 | |
1103 | + | |
1104 | + sll %l6,8,%l6 | |
1105 | + or %l5,%l4,%l4 | |
1106 | + ldub [%i0+8],%l0 | |
1107 | + or %l7,%l6,%l6 | |
1108 | + ldub [%i0+9],%l1 | |
! Second word (input bytes 4..7) complete.
1109 | + or %l4,%l6,%o1 | |
1110 | + ldub [%i0+10],%l2 | |
1111 | + | |
1112 | + sll %l0,24,%l0 | |
1113 | + ldub [%i0+11],%l3 | |
1114 | + sll %l1,16,%l1 | |
1115 | + ldub [%i0+12],%l4 | |
1116 | + sll %l2,8,%l2 | |
1117 | + or %l1,%l0,%l0 | |
1118 | + ldub [%i0+13],%l5 | |
1119 | + sll %l4,24,%l4 | |
1120 | + or %l3,%l2,%l2 | |
1121 | + ldub [%i0+14],%l6 | |
1122 | + sll %l5,16,%l5 | |
! Third word (input bytes 8..11) complete.
1123 | + or %l0,%l2,%o2 | |
1124 | + ldub [%i0+15],%l7 | |
1125 | + | |
1126 | + sll %l6,8,%l6 | |
1127 | + or %l5,%l4,%l4 | |
1128 | + or %l7,%l6,%l6 | |
! Fourth word (input bytes 12..15) complete.
1129 | + or %l4,%l6,%o3 | |
1130 | + | |
! Same AES_Td address computation and core call as on the aligned path.
1131 | +1: call .+8 | |
1132 | + add %o7,AES_Td-1b,%o4 | |
1133 | + call _sparcv9_AES_decrypt | |
1134 | + mov %i2,%o5 | |
1135 | + | |
! Store every result word one byte at a time, bits 31..24 first; stb
! writes only the low byte of its source register.
1136 | + srl %o0,24,%l0 | |
1137 | + srl %o0,16,%l1 | |
1138 | + stb %l0,[%i1+0] | |
1139 | + srl %o0,8,%l2 | |
1140 | + stb %l1,[%i1+1] | |
1141 | + stb %l2,[%i1+2] | |
1142 | + srl %o1,24,%l4 | |
1143 | + stb %o0,[%i1+3] | |
1144 | + | |
1145 | + srl %o1,16,%l5 | |
1146 | + stb %l4,[%i1+4] | |
1147 | + srl %o1,8,%l6 | |
1148 | + stb %l5,[%i1+5] | |
1149 | + stb %l6,[%i1+6] | |
1150 | + srl %o2,24,%l0 | |
1151 | + stb %o1,[%i1+7] | |
1152 | + | |
1153 | + srl %o2,16,%l1 | |
1154 | + stb %l0,[%i1+8] | |
1155 | + srl %o2,8,%l2 | |
1156 | + stb %l1,[%i1+9] | |
1157 | + stb %l2,[%i1+10] | |
1158 | + srl %o3,24,%l4 | |
1159 | + stb %o2,[%i1+11] | |
1160 | + | |
1161 | + srl %o3,16,%l5 | |
1162 | + stb %l4,[%i1+12] | |
1163 | + srl %o3,8,%l6 | |
1164 | + stb %l5,[%i1+13] | |
1165 | + stb %l6,[%i1+14] | |
1166 | + stb %o3,[%i1+15] | |
1167 | + | |
1168 | + ret | |
1169 | + restore | |
1170 | +.type AES_decrypt,#function | |
1171 | +.size AES_decrypt,(.-AES_decrypt) | |
1172 | +___ | |
1173 | + | |
1174 | +# fmovs instructions substituting for FP nops were originally added | |
1175 | +# to meet specific instruction alignment requirements to maximize ILP. | |
1176 | +# As UltraSPARC T1, a.k.a. Niagara, has a shared FPU, FP nops can have | |
1177 | +# an undesired effect, so just omit them and sacrifice a fraction of a | |
1178 | +# percent in performance... | |
# Remove every fmovs instruction from the generated text: each match runs
# from the mnemonic to the end of its line and is replaced by nothing.
1179 | +$code =~ s/fmovs.*$//gem; | |
1180 | + | |
# Write the resulting assembly to standard output.
1181 | +print $code; |
@@ -0,0 +1,124 @@ | ||
1 | +.text | |
2 | + | |
3 | +.set noat | |
4 | + | |
# OPENSSL_cpuid_setup: empty stub.  It performs no work and returns
# straight to the address held in $26.
5 | +.globl OPENSSL_cpuid_setup | |
6 | +.ent OPENSSL_cpuid_setup | |
7 | +OPENSSL_cpuid_setup: | |
8 | + .frame $30,0,$26 | |
9 | + .prologue 0 | |
10 | + ret ($26) | |
11 | +.end OPENSSL_cpuid_setup | |
12 | + | |
# OPENSSL_wipe_cpu: zero a fixed set of integer and floating-point
# registers, then return the current stack pointer in $0.
# Integer registers cleared: $1-$8, $16-$25, $27, $at and $29.
# Floating-point registers cleared: $f0, $f1 and $f10-$f30.
13 | +.globl OPENSSL_wipe_cpu | |
14 | +.ent OPENSSL_wipe_cpu | |
15 | +OPENSSL_wipe_cpu: | |
16 | + .frame $30,0,$26 | |
17 | + .prologue 0 | |
18 | + clr $1 | |
19 | + clr $2 | |
20 | + clr $3 | |
21 | + clr $4 | |
22 | + clr $5 | |
23 | + clr $6 | |
24 | + clr $7 | |
25 | + clr $8 | |
26 | + clr $16 | |
27 | + clr $17 | |
28 | + clr $18 | |
29 | + clr $19 | |
30 | + clr $20 | |
31 | + clr $21 | |
32 | + clr $22 | |
33 | + clr $23 | |
34 | + clr $24 | |
35 | + clr $25 | |
36 | + clr $27 | |
37 | + clr $at | |
# NOTE(review): $29 is presumably the global pointer under the usual Alpha
# calling convention - confirm callers tolerate it being cleared here.
38 | + clr $29 | |
39 | + fclr $f0 | |
40 | + fclr $f1 | |
41 | + fclr $f10 | |
42 | + fclr $f11 | |
43 | + fclr $f12 | |
44 | + fclr $f13 | |
45 | + fclr $f14 | |
46 | + fclr $f15 | |
47 | + fclr $f16 | |
48 | + fclr $f17 | |
49 | + fclr $f18 | |
50 | + fclr $f19 | |
51 | + fclr $f20 | |
52 | + fclr $f21 | |
53 | + fclr $f22 | |
54 | + fclr $f23 | |
55 | + fclr $f24 | |
56 | + fclr $f25 | |
57 | + fclr $f26 | |
58 | + fclr $f27 | |
59 | + fclr $f28 | |
60 | + fclr $f29 | |
61 | + fclr $f30 | |
# Result: the current stack pointer, placed in $0.
62 | + mov $sp,$0 | |
63 | + ret ($26) | |
64 | +.end OPENSSL_wipe_cpu | |
65 | + | |
# OPENSSL_atomic_add: add $17 to the 32-bit word addressed by $16 using a
# load-locked / store-conditional retry loop, and leave the new value
# (old value + $17) in $0.
66 | +.globl OPENSSL_atomic_add | |
67 | +.ent OPENSSL_atomic_add | |
68 | +OPENSSL_atomic_add: | |
69 | + .frame $30,0,$26 | |
70 | + .prologue 0 | |
# Load-locked read of the current value into $0.
71 | +1: ldl_l $0,($16) | |
72 | + addl $0,$17,$1 | |
# Store-conditional: $1 is overwritten with the success flag (0 = failed).
73 | + stl_c $1,($16) | |
# Retry from the load if the conditional store did not succeed.
74 | + beq $1,1b | |
# $1 no longer holds the sum, so recompute it for the return value.
75 | + addl $0,$17,$0 | |
76 | + ret ($26) | |
77 | +.end OPENSSL_atomic_add | |
78 | + | |
# OPENSSL_rdtsc: read the processor cycle counter with rpcc and return the
# value in $0.
79 | +.globl OPENSSL_rdtsc | |
80 | +.ent OPENSSL_rdtsc | |
81 | +OPENSSL_rdtsc: | |
82 | + .frame $30,0,$26 | |
83 | + .prologue 0 | |
84 | + rpcc $0 | |
85 | + ret ($26) | |
86 | +.end OPENSSL_rdtsc | |
87 | + | |
# OPENSSL_cleanse: overwrite a memory region with zero bytes.
#   $16  start address of the region (advanced as bytes are cleared)
#   $17  number of bytes still to clear
# Scratch registers: $0 (byte-loop counter), $1 (quadword being edited),
# $2 (address that quadword is written back to) and $at.
#
# Strategy: bytes that do not fill a whole aligned quadword are cleared by
# loading the enclosing quadword with ldq_u, zeroing single bytes in the
# register with mskbl and writing it back with stq_u; everything else is
# cleared eight bytes at a time with stq of the zero register $31.
#
# Fixes relative to the previous version:
#  - a zero length returns immediately instead of entering the byte loop
#    with a counter that never reaches zero;
#  - the byte loop counter is now (address & 7) - 8 counted up to zero, so
#    the loop always stops at the next 8-byte boundary.  Previously the
#    counter was (address & 7), or the remaining length, counted down, which
#    let mskbl wrap around inside one quadword (clobbering bytes in front
#    of the region and leaving trailing bytes untouched) and could enter
#    the stq loop with a misaligned address.
.globl OPENSSL_cleanse
.ent OPENSSL_cleanse
OPENSSL_cleanse:
        .frame $30,0,$26
        .prologue 0
        beq $17,.Ldone          # nothing to do for an empty region
        and $16,7,$0            # $0 = offset of the start inside its quadword
        bic $17,7,$at           # $at = length rounded down to a multiple of 8
        beq $at,.Little         # fewer than 8 bytes: byte loop only
        beq $0,.Laligned        # already aligned: go straight to quadword stores

.Little:
        subq $0,8,$0            # $0 = -(bytes left up to the next 8-byte boundary)
        ldq_u $1,0($16)         # fetch the quadword containing the current byte
        mov $16,$2              # remember where it has to be written back
.Lalign:
        mskbl $1,$16,$1         # zero the byte selected by the low bits of $16
        lda $16,1($16)
        subq $17,1,$17
        addq $0,1,$0
        beq $17,.Lout           # region exhausted
        bne $0,.Lalign          # boundary not reached yet
.Lout:  stq_u $1,0($2)          # write the edited quadword back
        beq $17,.Ldone
        bic $17,7,$at
        beq $at,.Little         # fewer than 8 bytes remain; $0 is 0 here, $16 aligned

.Laligned:
        stq $31,0($16)          # clear one whole quadword
        subq $17,8,$17
        lda $16,8($16)
        bic $17,7,$at
        bne $at,.Laligned       # at least 8 more bytes
        bne $17,.Little         # 1..7 trailing bytes; $0 is 0 here, $16 aligned
.Ldone: ret ($26)
.end OPENSSL_cleanse
@@ -0,0 +1,446 @@ | ||
1 | +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | |
2 | + * project 2006. | |
3 | + */ | |
4 | +/* ==================================================================== | |
5 | + * Copyright (c) 2006 The OpenSSL Project. All rights reserved. | |
6 | + * | |
7 | + * Redistribution and use in source and binary forms, with or without | |
8 | + * modification, are permitted provided that the following conditions | |
9 | + * are met: | |
10 | + * | |
11 | + * 1. Redistributions of source code must retain the above copyright | |
12 | + * notice, this list of conditions and the following disclaimer. | |
13 | + * | |
14 | + * 2. Redistributions in binary form must reproduce the above copyright | |
15 | + * notice, this list of conditions and the following disclaimer in | |
16 | + * the documentation and/or other materials provided with the | |
17 | + * distribution. | |
18 | + * | |
19 | + * 3. All advertising materials mentioning features or use of this | |
20 | + * software must display the following acknowledgment: | |
21 | + * "This product includes software developed by the OpenSSL Project | |
22 | + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | |
23 | + * | |
24 | + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | |
25 | + * endorse or promote products derived from this software without | |
26 | + * prior written permission. For written permission, please contact | |
27 | + * licensing@OpenSSL.org. | |
28 | + * | |
29 | + * 5. Products derived from this software may not be called "OpenSSL" | |
30 | + * nor may "OpenSSL" appear in their names without prior written | |
31 | + * permission of the OpenSSL Project. | |
32 | + * | |
33 | + * 6. Redistributions of any form whatsoever must retain the following | |
34 | + * acknowledgment: | |
35 | + * "This product includes software developed by the OpenSSL Project | |
36 | + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | |
37 | + * | |
38 | + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | |
39 | + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
40 | + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
41 | + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | |
42 | + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
43 | + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | |
44 | + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
45 | + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
46 | + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | |
47 | + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
48 | + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | |
49 | + * OF THE POSSIBILITY OF SUCH DAMAGE. | |
50 | + * ==================================================================== | |
51 | + * | |
52 | + * This product includes cryptographic software written by Eric Young | |
53 | + * (eay@cryptsoft.com). This product includes software written by Tim | |
54 | + * Hudson (tjh@cryptsoft.com). | |
55 | + * | |
56 | + */ | |
57 | + | |
58 | +#include <stdio.h> | |
59 | +#include "cryptlib.h" | |
60 | +#include <openssl/asn1t.h> | |
61 | +#include <openssl/x509.h> | |
62 | +#ifndef OPENSSL_NO_ENGINE | |
63 | +#include <openssl/engine.h> | |
64 | +#endif | |
65 | +#include "asn1_locl.h" | |
66 | + | |
67 | +extern const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[]; | |
68 | +extern const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[]; | |
69 | +extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth; | |
70 | +extern const EVP_PKEY_ASN1_METHOD eckey_asn1_meth; | |
71 | +extern const EVP_PKEY_ASN1_METHOD hmac_asn1_meth; | |
72 | + | |
73 | +/* Keep this sorted in type order !! */ | |
74 | +static const EVP_PKEY_ASN1_METHOD *standard_methods[] = | |
75 | + { | |
76 | +#ifndef OPENSSL_NO_RSA | |
77 | + &rsa_asn1_meths[0], | |
78 | + &rsa_asn1_meths[1], | |
79 | +#endif | |
80 | +#ifndef OPENSSL_NO_DH | |
81 | + &dh_asn1_meth, | |
82 | +#endif | |
83 | +#ifndef OPENSSL_NO_DSA | |
84 | + &dsa_asn1_meths[0], | |
85 | + &dsa_asn1_meths[1], | |
86 | + &dsa_asn1_meths[2], | |
87 | + &dsa_asn1_meths[3], | |
88 | + &dsa_asn1_meths[4], | |
89 | +#endif | |
90 | +#ifndef OPENSSL_NO_EC | |
91 | + &eckey_asn1_meth, | |
92 | +#endif | |
93 | + &hmac_asn1_meth | |
94 | + }; | |
95 | + | |
96 | +typedef int sk_cmp_fn_type(const char * const *a, const char * const *b); | |
97 | +DECLARE_STACK_OF(EVP_PKEY_ASN1_METHOD) | |
98 | +static STACK_OF(EVP_PKEY_ASN1_METHOD) *app_methods = NULL; | |
99 | + | |
100 | + | |
101 | + | |
#ifdef TEST
/*
 * Debugging aid, compiled only when TEST is defined.  Prints every entry of
 * standard_methods (index, pkey_id and short name) on stderr so that the
 * ordering of the table can be checked by eye.
 *
 * Returns 0.
 */
int main(void)
{
    const size_t count = sizeof(standard_methods) / sizeof(standard_methods[0]);
    size_t i;

    /* size_t index: avoids comparing a signed int with the sizeof result */
    for (i = 0; i < count; i++)
        fprintf(stderr, "Number %d id=%d (%s)\n", (int)i,
                standard_methods[i]->pkey_id,
                OBJ_nid2sn(standard_methods[i]->pkey_id));
    return 0;
}
#endif
114 | + | |
115 | +DECLARE_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_ASN1_METHOD *, | |
116 | + const EVP_PKEY_ASN1_METHOD *, ameth); | |
117 | + | |
118 | +static int ameth_cmp(const EVP_PKEY_ASN1_METHOD * const *a, | |
119 | + const EVP_PKEY_ASN1_METHOD * const *b) | |
120 | + { | |
121 | + return ((*a)->pkey_id - (*b)->pkey_id); | |
122 | + } | |
123 | + | |
124 | +IMPLEMENT_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_ASN1_METHOD *, | |
125 | + const EVP_PKEY_ASN1_METHOD *, ameth); | |
126 | + | |
127 | +int EVP_PKEY_asn1_get_count(void) | |
128 | + { | |
129 | + int num = sizeof(standard_methods)/sizeof(EVP_PKEY_ASN1_METHOD *); | |
130 | + if (app_methods) | |
131 | + num += sk_EVP_PKEY_ASN1_METHOD_num(app_methods); | |
132 | + return num; | |
133 | + } | |
134 | + | |
135 | +const EVP_PKEY_ASN1_METHOD *EVP_PKEY_asn1_get0(int idx) | |
136 | + { | |
137 | + int num = sizeof(standard_methods)/sizeof(EVP_PKEY_ASN1_METHOD *); | |
138 | + if (idx < 0) | |
139 | + return NULL; | |
140 | + if (idx < num) | |
141 | + return standard_methods[idx]; | |
142 | + idx -= num; | |
143 | + return sk_EVP_PKEY_ASN1_METHOD_value(app_methods, idx); | |
144 | + } | |
145 | + | |
146 | +static const EVP_PKEY_ASN1_METHOD *pkey_asn1_find(int type) | |
147 | + { | |
148 | + EVP_PKEY_ASN1_METHOD tmp; | |
149 | + const EVP_PKEY_ASN1_METHOD *t = &tmp, **ret; | |
150 | + tmp.pkey_id = type; | |
151 | + if (app_methods) | |
152 | + { | |
153 | + int idx; | |
154 | + idx = sk_EVP_PKEY_ASN1_METHOD_find(app_methods, &tmp); | |
155 | + if (idx >= 0) | |
156 | + return sk_EVP_PKEY_ASN1_METHOD_value(app_methods, idx); | |
157 | + } | |
158 | + ret = OBJ_bsearch_ameth(&t, standard_methods, | |
159 | + sizeof(standard_methods) | |
160 | + /sizeof(EVP_PKEY_ASN1_METHOD *)); | |
161 | + if (!ret || !*ret) | |
162 | + return NULL; | |
163 | + return *ret; | |
164 | + } | |
165 | + | |
166 | +/* Find an implementation of an ASN1 algorithm. If 'pe' is not NULL | |
167 | + * also search through engines and set *pe to a functional reference | |
168 | + * to the engine implementing 'type' or NULL if no engine implements | |
169 | + * it. | |
170 | + */ | |
171 | + | |
172 | +const EVP_PKEY_ASN1_METHOD *EVP_PKEY_asn1_find(ENGINE **pe, int type) | |
173 | + { | |
174 | + const EVP_PKEY_ASN1_METHOD *t; | |
175 | + ENGINE *e; | |
176 | + | |
177 | + for (;;) | |
178 | + { | |
179 | + t = pkey_asn1_find(type); | |
180 | + if (!t || !(t->pkey_flags & ASN1_PKEY_ALIAS)) | |
181 | + break; | |
182 | + type = t->pkey_base_id; | |
183 | + } | |
184 | + if (pe) | |
185 | + { | |
186 | +#ifndef OPENSSL_NO_ENGINE | |
187 | + /* type will contain the final unaliased type */ | |
188 | + e = ENGINE_get_pkey_asn1_meth_engine(type); | |
189 | + if (e) | |
190 | + { | |
191 | + *pe = e; | |
192 | + return ENGINE_get_pkey_asn1_meth(e, type); | |
193 | + } | |
194 | +#endif | |
195 | + *pe = NULL; | |
196 | + } | |
197 | + return t; | |
198 | + } | |
199 | + | |
200 | +const EVP_PKEY_ASN1_METHOD *EVP_PKEY_asn1_find_str(ENGINE **pe, | |
201 | + const char *str, int len) | |
202 | + { | |
203 | + int i; | |
204 | + const EVP_PKEY_ASN1_METHOD *ameth; | |
205 | + if (len == -1) | |
206 | + len = strlen(str); | |
207 | + if (pe) | |
208 | + { | |
209 | +#ifndef OPENSSL_NO_ENGINE | |
210 | + ENGINE *e; | |
211 | + ameth = ENGINE_pkey_asn1_find_str(&e, str, len); | |
212 | + if (ameth) | |
213 | + { | |
214 | + /* Convert structural into | |
215 | + * functional reference | |
216 | + */ | |
217 | + if (!ENGINE_init(e)) | |
218 | + ameth = NULL; | |
219 | + ENGINE_free(e); | |
220 | + *pe = e; | |
221 | + return ameth; | |
222 | + } | |
223 | +#endif | |
224 | + *pe = NULL; | |
225 | + } | |
226 | + for (i = 0; i < EVP_PKEY_asn1_get_count(); i++) | |
227 | + { | |
228 | + ameth = EVP_PKEY_asn1_get0(i); | |
229 | + if (ameth->pkey_flags & ASN1_PKEY_ALIAS) | |
230 | + continue; | |
231 | + if (((int)strlen(ameth->pem_str) == len) && | |
232 | + !strncasecmp(ameth->pem_str, str, len)) | |
233 | + return ameth; | |
234 | + } | |
235 | + return NULL; | |
236 | + } | |
237 | + | |
238 | +int EVP_PKEY_asn1_add0(const EVP_PKEY_ASN1_METHOD *ameth) | |
239 | + { | |
240 | + if (app_methods == NULL) | |
241 | + { | |
242 | + app_methods = sk_EVP_PKEY_ASN1_METHOD_new(ameth_cmp); | |
243 | + if (!app_methods) | |
244 | + return 0; | |
245 | + } | |
246 | + if (!sk_EVP_PKEY_ASN1_METHOD_push(app_methods, ameth)) | |
247 | + return 0; | |
248 | + sk_EVP_PKEY_ASN1_METHOD_sort(app_methods); | |
249 | + return 1; | |
250 | + } | |
251 | + | |
252 | +int EVP_PKEY_asn1_add_alias(int to, int from) | |
253 | + { | |
254 | + EVP_PKEY_ASN1_METHOD *ameth; | |
255 | + ameth = EVP_PKEY_asn1_new(from, ASN1_PKEY_ALIAS, NULL, NULL); | |
256 | + if (!ameth) | |
257 | + return 0; | |
258 | + ameth->pkey_base_id = to; | |
259 | + return EVP_PKEY_asn1_add0(ameth); | |
260 | + } | |
261 | + | |
262 | +int EVP_PKEY_asn1_get0_info(int *ppkey_id, int *ppkey_base_id, int *ppkey_flags, | |
263 | + const char **pinfo, const char **ppem_str, | |
264 | + const EVP_PKEY_ASN1_METHOD *ameth) | |
265 | + { | |
266 | + if (!ameth) | |
267 | + return 0; | |
268 | + if (ppkey_id) | |
269 | + *ppkey_id = ameth->pkey_id; | |
270 | + if (ppkey_base_id) | |
271 | + *ppkey_base_id = ameth->pkey_base_id; | |
272 | + if (ppkey_flags) | |
273 | + *ppkey_flags = ameth->pkey_flags; | |
274 | + if (pinfo) | |
275 | + *pinfo = ameth->info; | |
276 | + if (ppem_str) | |
277 | + *ppem_str = ameth->pem_str; | |
278 | + return 1; | |
279 | + } | |
280 | + | |
281 | +const EVP_PKEY_ASN1_METHOD* EVP_PKEY_get0_asn1(EVP_PKEY *pkey) | |
282 | + { | |
283 | + return pkey->ameth; | |
284 | + } | |
285 | + | |
286 | +EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags, | |
287 | + const char *pem_str, const char *info) | |
288 | + { | |
289 | + EVP_PKEY_ASN1_METHOD *ameth; | |
290 | + ameth = OPENSSL_malloc(sizeof(EVP_PKEY_ASN1_METHOD)); | |
291 | + if (!ameth) | |
292 | + return NULL; | |
293 | + | |
294 | + ameth->pkey_id = id; | |
295 | + ameth->pkey_base_id = id; | |
296 | + ameth->pkey_flags = flags | ASN1_PKEY_DYNAMIC; | |
297 | + | |
298 | + if (info) | |
299 | + { | |
300 | + ameth->info = BUF_strdup(info); | |
301 | + if (!ameth->info) | |
302 | + goto err; | |
303 | + } | |
304 | + | |
305 | + if (pem_str) | |
306 | + { | |
307 | + ameth->pem_str = BUF_strdup(pem_str); | |
308 | + if (!ameth->pem_str) | |
309 | + goto err; | |
310 | + } | |
311 | + | |
312 | + ameth->pub_decode = 0; | |
313 | + ameth->pub_encode = 0; | |
314 | + ameth->pub_cmp = 0; | |
315 | + ameth->pub_print = 0; | |
316 | + | |
317 | + ameth->priv_decode = 0; | |
318 | + ameth->priv_encode = 0; | |
319 | + ameth->priv_print = 0; | |
320 | + | |
321 | + ameth->old_priv_encode = 0; | |
322 | + ameth->old_priv_decode = 0; | |
323 | + | |
324 | + ameth->pkey_size = 0; | |
325 | + ameth->pkey_bits = 0; | |
326 | + | |
327 | + ameth->param_decode = 0; | |
328 | + ameth->param_encode = 0; | |
329 | + ameth->param_missing = 0; | |
330 | + ameth->param_copy = 0; | |
331 | + ameth->param_cmp = 0; | |
332 | + ameth->param_print = 0; | |
333 | + | |
334 | + ameth->pkey_free = 0; | |
335 | + ameth->pkey_ctrl = 0; | |
336 | + | |
337 | + return ameth; | |
338 | + | |
339 | + err: | |
340 | + | |
341 | + EVP_PKEY_asn1_free(ameth); | |
342 | + return NULL; | |
343 | + | |
344 | + } | |
345 | + | |
346 | +void EVP_PKEY_asn1_copy(EVP_PKEY_ASN1_METHOD *dst, | |
347 | + const EVP_PKEY_ASN1_METHOD *src) | |
348 | + { | |
349 | + | |
350 | + dst->pub_decode = src->pub_decode; | |
351 | + dst->pub_encode = src->pub_encode; | |
352 | + dst->pub_cmp = src->pub_cmp; | |
353 | + dst->pub_print = src->pub_print; | |
354 | + | |
355 | + dst->priv_decode = src->priv_decode; | |
356 | + dst->priv_encode = src->priv_encode; | |
357 | + dst->priv_print = src->priv_print; | |
358 | + | |
359 | + dst->old_priv_encode = src->old_priv_encode; | |
360 | + dst->old_priv_decode = src->old_priv_decode; | |
361 | + | |
362 | + dst->pkey_size = src->pkey_size; | |
363 | + dst->pkey_bits = src->pkey_bits; | |
364 | + | |
365 | + dst->param_decode = src->param_decode; | |
366 | + dst->param_encode = src->param_encode; | |
367 | + dst->param_missing = src->param_missing; | |
368 | + dst->param_copy = src->param_copy; | |
369 | + dst->param_cmp = src->param_cmp; | |
370 | + dst->param_print = src->param_print; | |
371 | + | |
372 | + dst->pkey_free = src->pkey_free; | |
373 | + dst->pkey_ctrl = src->pkey_ctrl; | |
374 | + | |
375 | + } | |
376 | + | |
377 | +void EVP_PKEY_asn1_free(EVP_PKEY_ASN1_METHOD *ameth) | |
378 | + { | |
379 | + if (ameth && (ameth->pkey_flags & ASN1_PKEY_DYNAMIC)) | |
380 | + { | |
381 | + if (ameth->pem_str) | |
382 | + OPENSSL_free(ameth->pem_str); | |
383 | + if (ameth->info) | |
384 | + OPENSSL_free(ameth->info); | |
385 | + OPENSSL_free(ameth); | |
386 | + } | |
387 | + } | |
388 | + | |
389 | +void EVP_PKEY_asn1_set_public(EVP_PKEY_ASN1_METHOD *ameth, | |
390 | + int (*pub_decode)(EVP_PKEY *pk, X509_PUBKEY *pub), | |
391 | + int (*pub_encode)(X509_PUBKEY *pub, const EVP_PKEY *pk), | |
392 | + int (*pub_cmp)(const EVP_PKEY *a, const EVP_PKEY *b), | |
393 | + int (*pub_print)(BIO *out, const EVP_PKEY *pkey, int indent, | |
394 | + ASN1_PCTX *pctx), | |
395 | + int (*pkey_size)(const EVP_PKEY *pk), | |
396 | + int (*pkey_bits)(const EVP_PKEY *pk)) | |
397 | + { | |
398 | + ameth->pub_decode = pub_decode; | |
399 | + ameth->pub_encode = pub_encode; | |
400 | + ameth->pub_cmp = pub_cmp; | |
401 | + ameth->pub_print = pub_print; | |
402 | + ameth->pkey_size = pkey_size; | |
403 | + ameth->pkey_bits = pkey_bits; | |
404 | + } | |
405 | + | |
406 | +void EVP_PKEY_asn1_set_private(EVP_PKEY_ASN1_METHOD *ameth, | |
407 | + int (*priv_decode)(EVP_PKEY *pk, PKCS8_PRIV_KEY_INFO *p8inf), | |
408 | + int (*priv_encode)(PKCS8_PRIV_KEY_INFO *p8, const EVP_PKEY *pk), | |
409 | + int (*priv_print)(BIO *out, const EVP_PKEY *pkey, int indent, | |
410 | + ASN1_PCTX *pctx)) | |
411 | + { | |
412 | + ameth->priv_decode = priv_decode; | |
413 | + ameth->priv_encode = priv_encode; | |
414 | + ameth->priv_print = priv_print; | |
415 | + } | |
416 | + | |
417 | +void EVP_PKEY_asn1_set_param(EVP_PKEY_ASN1_METHOD *ameth, | |
418 | + int (*param_decode)(EVP_PKEY *pkey, | |
419 | + const unsigned char **pder, int derlen), | |
420 | + int (*param_encode)(const EVP_PKEY *pkey, unsigned char **pder), | |
421 | + int (*param_missing)(const EVP_PKEY *pk), | |
422 | + int (*param_copy)(EVP_PKEY *to, const EVP_PKEY *from), | |
423 | + int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b), | |
424 | + int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent, | |
425 | + ASN1_PCTX *pctx)) | |
426 | + { | |
427 | + ameth->param_decode = param_decode; | |
428 | + ameth->param_encode = param_encode; | |
429 | + ameth->param_missing = param_missing; | |
430 | + ameth->param_copy = param_copy; | |
431 | + ameth->param_cmp = param_cmp; | |
432 | + ameth->param_print = param_print; | |
433 | + } | |
434 | + | |
435 | +void EVP_PKEY_asn1_set_free(EVP_PKEY_ASN1_METHOD *ameth, | |
436 | + void (*pkey_free)(EVP_PKEY *pkey)) | |
437 | + { | |
438 | + ameth->pkey_free = pkey_free; | |
439 | + } | |
440 | + | |
441 | +void EVP_PKEY_asn1_set_ctrl(EVP_PKEY_ASN1_METHOD *ameth, | |
442 | + int (*pkey_ctrl)(EVP_PKEY *pkey, int op, | |
443 | + long arg1, void *arg2)) | |
444 | + { | |
445 | + ameth->pkey_ctrl = pkey_ctrl; | |
446 | + } |
@@ -0,0 +1,134 @@ | ||
1 | +/* asn1t.h */ | |
2 | +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | |
3 | + * project 2006. | |
4 | + */ | |
5 | +/* ==================================================================== | |
6 | + * Copyright (c) 2006 The OpenSSL Project. All rights reserved. | |
7 | + * | |
8 | + * Redistribution and use in source and binary forms, with or without | |
9 | + * modification, are permitted provided that the following conditions | |
10 | + * are met: | |
11 | + * | |
12 | + * 1. Redistributions of source code must retain the above copyright | |
13 | + * notice, this list of conditions and the following disclaimer. | |
14 | + * | |
15 | + * 2. Redistributions in binary form must reproduce the above copyright | |
16 | + * notice, this list of conditions and the following disclaimer in | |
17 | + * the documentation and/or other materials provided with the | |
18 | + * distribution. | |
19 | + * | |
20 | + * 3. All advertising materials mentioning features or use of this | |
21 | + * software must display the following acknowledgment: | |
22 | + * "This product includes software developed by the OpenSSL Project | |
23 | + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | |
24 | + * | |
25 | + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | |
26 | + * endorse or promote products derived from this software without | |
27 | + * prior written permission. For written permission, please contact | |
28 | + * licensing@OpenSSL.org. | |
29 | + * | |
30 | + * 5. Products derived from this software may not be called "OpenSSL" | |
31 | + * nor may "OpenSSL" appear in their names without prior written | |
32 | + * permission of the OpenSSL Project. | |
33 | + * | |
34 | + * 6. Redistributions of any form whatsoever must retain the following | |
35 | + * acknowledgment: | |
36 | + * "This product includes software developed by the OpenSSL Project | |
37 | + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | |
38 | + * | |
39 | + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | |
40 | + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
41 | + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
42 | + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | |
43 | + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
44 | + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | |
45 | + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
46 | + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
47 | + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | |
48 | + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
49 | + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | |
50 | + * OF THE POSSIBILITY OF SUCH DAMAGE. | |
51 | + * ==================================================================== | |
52 | + * | |
53 | + * This product includes cryptographic software written by Eric Young | |
54 | + * (eay@cryptsoft.com). This product includes software written by Tim | |
55 | + * Hudson (tjh@cryptsoft.com). | |
56 | + * | |
57 | + */ | |
58 | + | |
59 | +/* Internal ASN1 structures and functions: not for application use */ | |
60 | + | |
61 | +/* ASN1 print context structure */ | |
62 | + | |
63 | +struct asn1_pctx_st | |
64 | + { | |
65 | + unsigned long flags; | |
66 | + unsigned long nm_flags; | |
67 | + unsigned long cert_flags; | |
68 | + unsigned long oid_flags; | |
69 | + unsigned long str_flags; | |
70 | + } /* ASN1_PCTX */; | |
71 | + | |
72 | +/* ASN1 public key method structure */ | |
73 | + | |
74 | +struct evp_pkey_asn1_method_st | |
75 | + { | |
76 | + int pkey_id; | |
77 | + int pkey_base_id; | |
78 | + unsigned long pkey_flags; | |
79 | + | |
80 | + char *pem_str; | |
81 | + char *info; | |
82 | + | |
83 | + int (*pub_decode)(EVP_PKEY *pk, X509_PUBKEY *pub); | |
84 | + int (*pub_encode)(X509_PUBKEY *pub, const EVP_PKEY *pk); | |
85 | + int (*pub_cmp)(const EVP_PKEY *a, const EVP_PKEY *b); | |
86 | + int (*pub_print)(BIO *out, const EVP_PKEY *pkey, int indent, | |
87 | + ASN1_PCTX *pctx); | |
88 | + | |
89 | + int (*priv_decode)(EVP_PKEY *pk, PKCS8_PRIV_KEY_INFO *p8inf); | |
90 | + int (*priv_encode)(PKCS8_PRIV_KEY_INFO *p8, const EVP_PKEY *pk); | |
91 | + int (*priv_print)(BIO *out, const EVP_PKEY *pkey, int indent, | |
92 | + ASN1_PCTX *pctx); | |
93 | + | |
94 | + int (*pkey_size)(const EVP_PKEY *pk); | |
95 | + int (*pkey_bits)(const EVP_PKEY *pk); | |
96 | + | |
97 | + int (*param_decode)(EVP_PKEY *pkey, | |
98 | + const unsigned char **pder, int derlen); | |
99 | + int (*param_encode)(const EVP_PKEY *pkey, unsigned char **pder); | |
100 | + int (*param_missing)(const EVP_PKEY *pk); | |
101 | + int (*param_copy)(EVP_PKEY *to, const EVP_PKEY *from); | |
102 | + int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b); | |
103 | + int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent, | |
104 | + ASN1_PCTX *pctx); | |
105 | + | |
106 | + void (*pkey_free)(EVP_PKEY *pkey); | |
107 | + int (*pkey_ctrl)(EVP_PKEY *pkey, int op, long arg1, void *arg2); | |
108 | + | |
109 | + /* Legacy functions for old PEM */ | |
110 | + | |
111 | + int (*old_priv_decode)(EVP_PKEY *pkey, | |
112 | + const unsigned char **pder, int derlen); | |
113 | + int (*old_priv_encode)(const EVP_PKEY *pkey, unsigned char **pder); | |
114 | + | |
115 | + } /* EVP_PKEY_ASN1_METHOD */; | |
116 | + | |
117 | +/* Method to handle CRL access. | |
118 | + * In general a CRL could be very large (several MB) and can consume large | |
119 | + * amounts of resources if stored in memory by multiple processes. | |
120 | + * This method allows general CRL operations to be redirected to more | |
121 | + * efficient callbacks: for example a CRL entry database. | |
122 | + */ | |
123 | + | |
124 | +#define X509_CRL_METHOD_DYNAMIC 1 | |
125 | + | |
126 | +struct x509_crl_method_st | |
127 | + { | |
128 | + int flags; | |
129 | + int (*crl_init)(X509_CRL *crl); | |
130 | + int (*crl_free)(X509_CRL *crl); | |
131 | + int (*crl_lookup)(X509_CRL *crl, X509_REVOKED **ret, | |
132 | + ASN1_INTEGER *ser, X509_NAME *issuer); | |
133 | + int (*crl_verify)(X509_CRL *crl, EVP_PKEY *pk); | |
134 | + }; |
@@ -0,0 +1,495 @@ | ||
1 | +/* bio_asn1.c */ | |
2 | +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | |
3 | + * project. | |
4 | + */ | |
5 | +/* ==================================================================== | |
6 | + * Copyright (c) 2006 The OpenSSL Project. All rights reserved. | |
7 | + * | |
8 | + * Redistribution and use in source and binary forms, with or without | |
9 | + * modification, are permitted provided that the following conditions | |
10 | + * are met: | |
11 | + * | |
12 | + * 1. Redistributions of source code must retain the above copyright | |
13 | + * notice, this list of conditions and the following disclaimer. | |
14 | + * | |
15 | + * 2. Redistributions in binary form must reproduce the above copyright | |
16 | + * notice, this list of conditions and the following disclaimer in | |
17 | + * the documentation and/or other materials provided with the | |
18 | + * distribution. | |
19 | + * | |
20 | + * 3. All advertising materials mentioning features or use of this | |
21 | + * software must display the following acknowledgment: | |
22 | + * "This product includes software developed by the OpenSSL Project | |
23 | + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | |
24 | + * | |
25 | + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | |
26 | + * endorse or promote products derived from this software without | |
27 | + * prior written permission. For written permission, please contact | |
28 | + * licensing@OpenSSL.org. | |
29 | + * | |
30 | + * 5. Products derived from this software may not be called "OpenSSL" | |
31 | + * nor may "OpenSSL" appear in their names without prior written | |
32 | + * permission of the OpenSSL Project. | |
33 | + * | |
34 | + * 6. Redistributions of any form whatsoever must retain the following | |
35 | + * acknowledgment: | |
36 | + * "This product includes software developed by the OpenSSL Project | |
37 | + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | |
38 | + * | |
39 | + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | |
40 | + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
41 | + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
42 | + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | |
43 | + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
44 | + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | |
45 | + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
46 | + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
47 | + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | |
48 | + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
49 | + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | |
50 | + * OF THE POSSIBILITY OF SUCH DAMAGE. | |
51 | + * ==================================================================== | |
52 | + * | |
53 | + * This product includes cryptographic software written by Eric Young | |
54 | + * (eay@cryptsoft.com). This product includes software written by Tim | |
55 | + * Hudson (tjh@cryptsoft.com). | |
56 | + * | |
57 | + */ | |
58 | + | |
59 | +/* Experimental ASN1 BIO. Data written through it is converted | |
60 | + * to an ASN1 string type: default is OCTET STRING. Additional functions | |
61 | + * can be provided to add prefix and suffix data. | |
62 | + */ | |
63 | + | |
64 | +#include <string.h> | |
65 | +#include <openssl/bio.h> | |
66 | +#include <openssl/asn1.h> | |
67 | + | |
68 | +/* Must be large enough for biggest tag+length */ | |
69 | +#define DEFAULT_ASN1_BUF_SIZE 20 | |
70 | + | |
71 | +typedef enum | |
72 | + { | |
73 | + ASN1_STATE_START, | |
74 | + ASN1_STATE_PRE_COPY, | |
75 | + ASN1_STATE_HEADER, | |
76 | + ASN1_STATE_HEADER_COPY, | |
77 | + ASN1_STATE_DATA_COPY, | |
78 | + ASN1_STATE_POST_COPY, | |
79 | + ASN1_STATE_DONE | |
80 | + } asn1_bio_state_t; | |
81 | + | |
82 | +typedef struct BIO_ASN1_EX_FUNCS_st | |
83 | + { | |
84 | + asn1_ps_func *ex_func; | |
85 | + asn1_ps_func *ex_free_func; | |
86 | + } BIO_ASN1_EX_FUNCS; | |
87 | + | |
88 | +typedef struct BIO_ASN1_BUF_CTX_t | |
89 | + { | |
90 | + /* Internal state */ | |
91 | + asn1_bio_state_t state; | |
92 | + /* Internal buffer */ | |
93 | + unsigned char *buf; | |
94 | + /* Size of buffer */ | |
95 | + int bufsize; | |
96 | + /* Current position in buffer */ | |
97 | + int bufpos; | |
98 | + /* Current buffer length */ | |
99 | + int buflen; | |
100 | + /* Amount of data to copy */ | |
101 | + int copylen; | |
102 | + /* Class and tag to use */ | |
103 | + int asn1_class, asn1_tag; | |
104 | + asn1_ps_func *prefix, *prefix_free, *suffix, *suffix_free; | |
105 | + /* Extra buffer for prefix and suffix data */ | |
106 | + unsigned char *ex_buf; | |
107 | + int ex_len; | |
108 | + int ex_pos; | |
109 | + void *ex_arg; | |
110 | + } BIO_ASN1_BUF_CTX; | |
111 | + | |
112 | + | |
113 | +static int asn1_bio_write(BIO *h, const char *buf,int num); | |
114 | +static int asn1_bio_read(BIO *h, char *buf, int size); | |
115 | +static int asn1_bio_puts(BIO *h, const char *str); | |
116 | +static int asn1_bio_gets(BIO *h, char *str, int size); | |
117 | +static long asn1_bio_ctrl(BIO *h, int cmd, long arg1, void *arg2); | |
118 | +static int asn1_bio_new(BIO *h); | |
119 | +static int asn1_bio_free(BIO *data); | |
120 | +static long asn1_bio_callback_ctrl(BIO *h, int cmd, bio_info_cb *fp); | |
121 | + | |
122 | +static int asn1_bio_init(BIO_ASN1_BUF_CTX *ctx, int size); | |
123 | +static int asn1_bio_flush_ex(BIO *b, BIO_ASN1_BUF_CTX *ctx, | |
124 | + asn1_ps_func *cleanup, asn1_bio_state_t next); | |
125 | +static int asn1_bio_setup_ex(BIO *b, BIO_ASN1_BUF_CTX *ctx, | |
126 | + asn1_ps_func *setup, | |
127 | + asn1_bio_state_t ex_state, | |
128 | + asn1_bio_state_t other_state); | |
129 | + | |
130 | +static BIO_METHOD methods_asn1= | |
131 | + { | |
132 | + BIO_TYPE_ASN1, | |
133 | + "asn1", | |
134 | + asn1_bio_write, | |
135 | + asn1_bio_read, | |
136 | + asn1_bio_puts, | |
137 | + asn1_bio_gets, | |
138 | + asn1_bio_ctrl, | |
139 | + asn1_bio_new, | |
140 | + asn1_bio_free, | |
141 | + asn1_bio_callback_ctrl, | |
142 | + }; | |
143 | + | |
144 | +BIO_METHOD *BIO_f_asn1(void) | |
145 | + { | |
146 | + return(&methods_asn1); | |
147 | + } | |
148 | + | |
149 | + | |
150 | +static int asn1_bio_new(BIO *b) | |
151 | + { | |
152 | + BIO_ASN1_BUF_CTX *ctx; | |
153 | + ctx = OPENSSL_malloc(sizeof(BIO_ASN1_BUF_CTX)); | |
154 | + if (!ctx) | |
155 | + return 0; | |
156 | + if (!asn1_bio_init(ctx, DEFAULT_ASN1_BUF_SIZE)) | |
157 | + return 0; | |
158 | + b->init = 1; | |
159 | + b->ptr = (char *)ctx; | |
160 | + b->flags = 0; | |
161 | + return 1; | |
162 | + } | |
163 | + | |
164 | +static int asn1_bio_init(BIO_ASN1_BUF_CTX *ctx, int size) | |
165 | + { | |
166 | + ctx->buf = OPENSSL_malloc(size); | |
167 | + if (!ctx->buf) | |
168 | + return 0; | |
169 | + ctx->bufsize = size; | |
170 | + ctx->bufpos = 0; | |
171 | + ctx->buflen = 0; | |
172 | + ctx->copylen = 0; | |
173 | + ctx->asn1_class = V_ASN1_UNIVERSAL; | |
174 | + ctx->asn1_tag = V_ASN1_OCTET_STRING; | |
175 | + ctx->ex_buf = 0; | |
176 | + ctx->ex_pos = 0; | |
177 | + ctx->ex_len = 0; | |
178 | + ctx->state = ASN1_STATE_START; | |
179 | + return 1; | |
180 | + } | |
181 | + | |
182 | +static int asn1_bio_free(BIO *b) | |
183 | + { | |
184 | + BIO_ASN1_BUF_CTX *ctx; | |
185 | + ctx = (BIO_ASN1_BUF_CTX *) b->ptr; | |
186 | + if (ctx == NULL) | |
187 | + return 0; | |
188 | + if (ctx->buf) | |
189 | + OPENSSL_free(ctx->buf); | |
190 | + OPENSSL_free(ctx); | |
191 | + b->init = 0; | |
192 | + b->ptr = NULL; | |
193 | + b->flags = 0; | |
194 | + return 1; | |
195 | + } | |
196 | + | |
197 | +static int asn1_bio_write(BIO *b, const char *in , int inl) | |
198 | + { | |
199 | + BIO_ASN1_BUF_CTX *ctx; | |
200 | + int wrmax, wrlen, ret; | |
201 | + unsigned char *p; | |
202 | + if (!in || (inl < 0) || (b->next_bio == NULL)) | |
203 | + return 0; | |
204 | + ctx = (BIO_ASN1_BUF_CTX *) b->ptr; | |
205 | + if (ctx == NULL) | |
206 | + return 0; | |
207 | + | |
208 | + wrlen = 0; | |
209 | + ret = -1; | |
210 | + | |
211 | + for(;;) | |
212 | + { | |
213 | + switch (ctx->state) | |
214 | + { | |
215 | + | |
216 | + /* Set up prefix data: call the prefix callback */ | |
217 | + case ASN1_STATE_START: | |
218 | + if (!asn1_bio_setup_ex(b, ctx, ctx->prefix, | |
219 | + ASN1_STATE_PRE_COPY, ASN1_STATE_HEADER)) | |
220 | + return 0; | |
221 | + break; | |
222 | + | |
223 | + /* Copy any pre data first */ | |
224 | + case ASN1_STATE_PRE_COPY: | |
225 | + | |
226 | + ret = asn1_bio_flush_ex(b, ctx, ctx->prefix_free, | |
227 | + ASN1_STATE_HEADER); | |
228 | + | |
229 | + if (ret <= 0) | |
230 | + goto done; | |
231 | + | |
232 | + break; | |
233 | + | |
234 | + case ASN1_STATE_HEADER: | |
235 | + ctx->buflen = | |
236 | + ASN1_object_size(0, inl, ctx->asn1_tag) - inl; | |
237 | + OPENSSL_assert(ctx->buflen <= ctx->bufsize); | |
238 | + p = ctx->buf; | |
239 | + ASN1_put_object(&p, 0, inl, | |
240 | + ctx->asn1_tag, ctx->asn1_class); | |
241 | + ctx->copylen = inl; | |
242 | + ctx->state = ASN1_STATE_HEADER_COPY; | |
243 | + | |
244 | + break; | |
245 | + | |
246 | + case ASN1_STATE_HEADER_COPY: | |
247 | + ret = BIO_write(b->next_bio, | |
248 | + ctx->buf + ctx->bufpos, ctx->buflen); | |
249 | + if (ret <= 0) | |
250 | + goto done; | |
251 | + | |
252 | + ctx->buflen -= ret; | |
253 | + if (ctx->buflen) | |
254 | + ctx->bufpos += ret; | |
255 | + else | |
256 | + { | |
257 | + ctx->bufpos = 0; | |
258 | + ctx->state = ASN1_STATE_DATA_COPY; | |
259 | + } | |
260 | + | |
261 | + break; | |
262 | + | |
263 | + case ASN1_STATE_DATA_COPY: | |
264 | + | |
265 | + if (inl > ctx->copylen) | |
266 | + wrmax = ctx->copylen; | |
267 | + else | |
268 | + wrmax = inl; | |
269 | + ret = BIO_write(b->next_bio, in, wrmax); | |
270 | + if (ret <= 0) | |
271 | + break; | |
272 | + wrlen += ret; | |
273 | + ctx->copylen -= ret; | |
274 | + in += ret; | |
275 | + inl -= ret; | |
276 | + | |
277 | + if (ctx->copylen == 0) | |
278 | + ctx->state = ASN1_STATE_HEADER; | |
279 | + | |
280 | + if (inl == 0) | |
281 | + goto done; | |
282 | + | |
283 | + break; | |
284 | + | |
285 | + default: | |
286 | + BIO_clear_retry_flags(b); | |
287 | + return 0; | |
288 | + | |
289 | + } | |
290 | + | |
291 | + } | |
292 | + | |
293 | + done: | |
294 | + BIO_clear_retry_flags(b); | |
295 | + BIO_copy_next_retry(b); | |
296 | + | |
297 | + return (wrlen > 0) ? wrlen : ret; | |
298 | + | |
299 | + } | |
300 | + | |
301 | +static int asn1_bio_flush_ex(BIO *b, BIO_ASN1_BUF_CTX *ctx, | |
302 | + asn1_ps_func *cleanup, asn1_bio_state_t next) | |
303 | + { | |
304 | + int ret; | |
305 | + if (ctx->ex_len <= 0) | |
306 | + return 1; | |
307 | + for(;;) | |
308 | + { | |
309 | + ret = BIO_write(b->next_bio, ctx->ex_buf + ctx->ex_pos, | |
310 | + ctx->ex_len); | |
311 | + if (ret <= 0) | |
312 | + break; | |
313 | + ctx->ex_len -= ret; | |
314 | + if (ctx->ex_len > 0) | |
315 | + ctx->ex_pos += ret; | |
316 | + else | |
317 | + { | |
318 | + if(cleanup) | |
319 | + cleanup(b, &ctx->ex_buf, &ctx->ex_len, | |
320 | + &ctx->ex_arg); | |
321 | + ctx->state = next; | |
322 | + ctx->ex_pos = 0; | |
323 | + break; | |
324 | + } | |
325 | + } | |
326 | + return ret; | |
327 | + } | |
328 | + | |
329 | +static int asn1_bio_setup_ex(BIO *b, BIO_ASN1_BUF_CTX *ctx, | |
330 | + asn1_ps_func *setup, | |
331 | + asn1_bio_state_t ex_state, | |
332 | + asn1_bio_state_t other_state) | |
333 | + { | |
334 | + if (setup && !setup(b, &ctx->ex_buf, &ctx->ex_len, &ctx->ex_arg)) | |
335 | + { | |
336 | + BIO_clear_retry_flags(b); | |
337 | + return 0; | |
338 | + } | |
339 | + if (ctx->ex_len > 0) | |
340 | + ctx->state = ex_state; | |
341 | + else | |
342 | + ctx->state = other_state; | |
343 | + return 1; | |
344 | + } | |
345 | + | |
346 | +static int asn1_bio_read(BIO *b, char *in , int inl) | |
347 | + { | |
348 | + if (!b->next_bio) | |
349 | + return 0; | |
350 | + return BIO_read(b->next_bio, in , inl); | |
351 | + } | |
352 | + | |
353 | +static int asn1_bio_puts(BIO *b, const char *str) | |
354 | + { | |
355 | + return asn1_bio_write(b, str, strlen(str)); | |
356 | + } | |
357 | + | |
358 | +static int asn1_bio_gets(BIO *b, char *str, int size) | |
359 | + { | |
360 | + if (!b->next_bio) | |
361 | + return 0; | |
362 | + return BIO_gets(b->next_bio, str , size); | |
363 | + } | |
364 | + | |
365 | +static long asn1_bio_callback_ctrl(BIO *b, int cmd, bio_info_cb *fp) | |
366 | + { | |
367 | + if (b->next_bio == NULL) return(0); | |
368 | + return BIO_callback_ctrl(b->next_bio,cmd,fp); | |
369 | + } | |
370 | + | |
371 | +static long asn1_bio_ctrl(BIO *b, int cmd, long arg1, void *arg2) | |
372 | + { | |
373 | + BIO_ASN1_BUF_CTX *ctx; | |
374 | + BIO_ASN1_EX_FUNCS *ex_func; | |
375 | + long ret = 1; | |
376 | + ctx = (BIO_ASN1_BUF_CTX *) b->ptr; | |
377 | + if (ctx == NULL) | |
378 | + return 0; | |
379 | + switch(cmd) | |
380 | + { | |
381 | + | |
382 | + case BIO_C_SET_PREFIX: | |
383 | + ex_func = arg2; | |
384 | + ctx->prefix = ex_func->ex_func; | |
385 | + ctx->prefix_free = ex_func->ex_free_func; | |
386 | + break; | |
387 | + | |
388 | + case BIO_C_GET_PREFIX: | |
389 | + ex_func = arg2; | |
390 | + ex_func->ex_func = ctx->prefix; | |
391 | + ex_func->ex_free_func = ctx->prefix_free; | |
392 | + break; | |
393 | + | |
394 | + case BIO_C_SET_SUFFIX: | |
395 | + ex_func = arg2; | |
396 | + ctx->suffix = ex_func->ex_func; | |
397 | + ctx->suffix_free = ex_func->ex_free_func; | |
398 | + break; | |
399 | + | |
400 | + case BIO_C_GET_SUFFIX: | |
401 | + ex_func = arg2; | |
402 | + ex_func->ex_func = ctx->suffix; | |
403 | + ex_func->ex_free_func = ctx->suffix_free; | |
404 | + break; | |
405 | + | |
406 | + case BIO_C_SET_EX_ARG: | |
407 | + ctx->ex_arg = arg2; | |
408 | + break; | |
409 | + | |
410 | + case BIO_C_GET_EX_ARG: | |
411 | + *(void **)arg2 = ctx->ex_arg; | |
412 | + break; | |
413 | + | |
414 | + case BIO_CTRL_FLUSH: | |
415 | + if (!b->next_bio) | |
416 | + return 0; | |
417 | + | |
418 | + /* Call post function if possible */ | |
419 | + if (ctx->state == ASN1_STATE_HEADER) | |
420 | + { | |
421 | + if (!asn1_bio_setup_ex(b, ctx, ctx->suffix, | |
422 | + ASN1_STATE_POST_COPY, ASN1_STATE_DONE)) | |
423 | + return 0; | |
424 | + } | |
425 | + | |
426 | + if (ctx->state == ASN1_STATE_POST_COPY) | |
427 | + { | |
428 | + ret = asn1_bio_flush_ex(b, ctx, ctx->suffix_free, | |
429 | + ASN1_STATE_DONE); | |
430 | + if (ret <= 0) | |
431 | + return ret; | |
432 | + } | |
433 | + | |
434 | + if (ctx->state == ASN1_STATE_DONE) | |
435 | + return BIO_ctrl(b->next_bio, cmd, arg1, arg2); | |
436 | + else | |
437 | + { | |
438 | + BIO_clear_retry_flags(b); | |
439 | + return 0; | |
440 | + } | |
441 | + break; | |
442 | + | |
443 | + | |
444 | + default: | |
445 | + if (!b->next_bio) | |
446 | + return 0; | |
447 | + return BIO_ctrl(b->next_bio, cmd, arg1, arg2); | |
448 | + | |
449 | + } | |
450 | + | |
451 | + return ret; | |
452 | + } | |
453 | + | |
454 | +static int asn1_bio_set_ex(BIO *b, int cmd, | |
455 | + asn1_ps_func *ex_func, asn1_ps_func *ex_free_func) | |
456 | + { | |
457 | + BIO_ASN1_EX_FUNCS extmp; | |
458 | + extmp.ex_func = ex_func; | |
459 | + extmp.ex_free_func = ex_free_func; | |
460 | + return BIO_ctrl(b, cmd, 0, &extmp); | |
461 | + } | |
462 | + | |
463 | +static int asn1_bio_get_ex(BIO *b, int cmd, | |
464 | + asn1_ps_func **ex_func, asn1_ps_func **ex_free_func) | |
465 | + { | |
466 | + BIO_ASN1_EX_FUNCS extmp; | |
467 | + int ret; | |
468 | + ret = BIO_ctrl(b, cmd, 0, &extmp); | |
469 | + if (ret > 0) | |
470 | + { | |
471 | + *ex_func = extmp.ex_func; | |
472 | + *ex_free_func = extmp.ex_free_func; | |
473 | + } | |
474 | + return ret; | |
475 | + } | |
476 | + | |
477 | +int BIO_asn1_set_prefix(BIO *b, asn1_ps_func *prefix, asn1_ps_func *prefix_free) | |
478 | + { | |
479 | + return asn1_bio_set_ex(b, BIO_C_SET_PREFIX, prefix, prefix_free); | |
480 | + } | |
481 | + | |
482 | +int BIO_asn1_get_prefix(BIO *b, asn1_ps_func **pprefix, asn1_ps_func **pprefix_free) | |
483 | + { | |
484 | + return asn1_bio_get_ex(b, BIO_C_GET_PREFIX, pprefix, pprefix_free); | |
485 | + } | |
486 | + | |
487 | +int BIO_asn1_set_suffix(BIO *b, asn1_ps_func *suffix, asn1_ps_func *suffix_free) | |
488 | + { | |
489 | + return asn1_bio_set_ex(b, BIO_C_SET_SUFFIX, suffix, suffix_free); | |
490 | + } | |
491 | + | |
492 | +int BIO_asn1_get_suffix(BIO *b, asn1_ps_func **psuffix, asn1_ps_func **psuffix_free) | |
493 | + { | |
494 | + return asn1_bio_get_ex(b, BIO_C_GET_SUFFIX, psuffix, psuffix_free); | |
495 | + } |
@@ -0,0 +1,246 @@ | ||
1 | +/* bio_ndef.c */ | |
2 | +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | |
3 | + * project. | |
4 | + */ | |
5 | +/* ==================================================================== | |
6 | + * Copyright (c) 2008 The OpenSSL Project. All rights reserved. | |
7 | + * | |
8 | + * Redistribution and use in source and binary forms, with or without | |
9 | + * modification, are permitted provided that the following conditions | |
10 | + * are met: | |
11 | + * | |
12 | + * 1. Redistributions of source code must retain the above copyright | |
13 | + * notice, this list of conditions and the following disclaimer. | |
14 | + * | |
15 | + * 2. Redistributions in binary form must reproduce the above copyright | |
16 | + * notice, this list of conditions and the following disclaimer in | |
17 | + * the documentation and/or other materials provided with the | |
18 | + * distribution. | |
19 | + * | |
20 | + * 3. All advertising materials mentioning features or use of this | |
21 | + * software must display the following acknowledgment: | |
22 | + * "This product includes software developed by the OpenSSL Project | |
23 | + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | |
24 | + * | |
25 | + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | |
26 | + * endorse or promote products derived from this software without | |
27 | + * prior written permission. For written permission, please contact | |
28 | + * licensing@OpenSSL.org. | |
29 | + * | |
30 | + * 5. Products derived from this software may not be called "OpenSSL" | |
31 | + * nor may "OpenSSL" appear in their names without prior written | |
32 | + * permission of the OpenSSL Project. | |
33 | + * | |
34 | + * 6. Redistributions of any form whatsoever must retain the following | |
35 | + * acknowledgment: | |
36 | + * "This product includes software developed by the OpenSSL Project | |
37 | + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | |
38 | + * | |
39 | + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | |
40 | + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
41 | + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
42 | + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | |
43 | + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
44 | + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | |
45 | + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
46 | + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
47 | + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | |
48 | + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
49 | + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | |
50 | + * OF THE POSSIBILITY OF SUCH DAMAGE. | |
51 | + * ==================================================================== | |
52 | + * | |
53 | + */ | |
54 | + | |
55 | +#include <openssl/asn1.h> | |
56 | +#include <openssl/asn1t.h> | |
57 | +#include <openssl/bio.h> | |
58 | +#include <openssl/err.h> | |
59 | + | |
60 | +#ifndef OPENSSL_SYSNAME_NETWARE | |
61 | +#include <memory.h> | |
62 | +#endif | |
63 | +#include <stdio.h> | |
64 | + | |
65 | +/* Experimental NDEF ASN1 BIO support routines */ | |
66 | + | |
67 | +/* The usage is quite simple: initialize an ASN1 structure, | |
68 | + * get a BIO from it, then any data written through the BIO | |
69 | + * will end up translated to the appropriate format on the fly. | |
70 | + * The data is streamed out and does *not* need to be | |
71 | + * all held in memory at once. | |
72 | + * | |
73 | + * When the BIO is flushed the output is finalized and any | |
74 | + * signatures etc. are written out. | |
75 | + * | |
76 | + * The BIO is a 'proper' BIO and can handle non-blocking I/O | |
77 | + * correctly. | |
78 | + * | |
79 | + * The usage is simple. The implementation is *not*... | |
80 | + */ | |
81 | + | |
82 | +/* BIO support data stored in the ASN1 BIO ex_arg */ | |
83 | + | |
84 | +typedef struct ndef_aux_st | |
85 | + { | |
86 | + /* ASN1 structure this BIO refers to */ | |
87 | + ASN1_VALUE *val; | |
88 | + const ASN1_ITEM *it; | |
89 | + /* Top of the BIO chain */ | |
90 | + BIO *ndef_bio; | |
91 | + /* Output BIO */ | |
92 | + BIO *out; | |
93 | + /* Boundary where content is inserted */ | |
94 | + unsigned char **boundary; | |
95 | + /* DER buffer start */ | |
96 | + unsigned char *derbuf; | |
97 | + } NDEF_SUPPORT; | |
98 | + | |
99 | +static int ndef_prefix(BIO *b, unsigned char **pbuf, int *plen, void *parg); | |
100 | +static int ndef_prefix_free(BIO *b, unsigned char **pbuf, int *plen, void *parg); | |
101 | +static int ndef_suffix(BIO *b, unsigned char **pbuf, int *plen, void *parg); | |
102 | +static int ndef_suffix_free(BIO *b, unsigned char **pbuf, int *plen, void *parg); | |
103 | + | |
104 | +BIO *BIO_new_NDEF(BIO *out, ASN1_VALUE *val, const ASN1_ITEM *it) | |
105 | + { | |
106 | + NDEF_SUPPORT *ndef_aux = NULL; | |
107 | + BIO *asn_bio = NULL; | |
108 | + const ASN1_AUX *aux = it->funcs; | |
109 | + ASN1_STREAM_ARG sarg; | |
110 | + | |
111 | + if (!aux || !aux->asn1_cb) | |
112 | + { | |
113 | + ASN1err(ASN1_F_BIO_NEW_NDEF, ASN1_R_STREAMING_NOT_SUPPORTED); | |
114 | + return NULL; | |
115 | + } | |
116 | + ndef_aux = OPENSSL_malloc(sizeof(NDEF_SUPPORT)); | |
117 | + asn_bio = BIO_new(BIO_f_asn1()); | |
118 | + | |
119 | + /* ASN1 bio needs to be next to output BIO */ | |
120 | + | |
121 | + out = BIO_push(asn_bio, out); | |
122 | + | |
123 | + if (!ndef_aux || !asn_bio || !out) | |
124 | + goto err; | |
125 | + | |
126 | + BIO_asn1_set_prefix(asn_bio, ndef_prefix, ndef_prefix_free); | |
127 | + BIO_asn1_set_suffix(asn_bio, ndef_suffix, ndef_suffix_free); | |
128 | + | |
129 | + /* Now let the callback prepend any digest, cipher, etc. BIOs | |
130 | + * the ASN1 structure needs. | |
131 | + */ | |
132 | + | |
133 | + sarg.out = out; | |
134 | + sarg.ndef_bio = NULL; | |
135 | + sarg.boundary = NULL; | |
136 | + | |
137 | + if (aux->asn1_cb(ASN1_OP_STREAM_PRE, &val, it, &sarg) <= 0) | |
138 | + goto err; | |
139 | + | |
140 | + ndef_aux->val = val; | |
141 | + ndef_aux->it = it; | |
142 | + ndef_aux->ndef_bio = sarg.ndef_bio; | |
143 | + ndef_aux->boundary = sarg.boundary; | |
144 | + ndef_aux->out = out; | |
145 | + | |
146 | + BIO_ctrl(asn_bio, BIO_C_SET_EX_ARG, 0, ndef_aux); | |
147 | + | |
148 | + return sarg.ndef_bio; | |
149 | + | |
150 | + err: | |
151 | + if (asn_bio) | |
152 | + BIO_free(asn_bio); | |
153 | + if (ndef_aux) | |
154 | + OPENSSL_free(ndef_aux); | |
155 | + return NULL; | |
156 | + } | |
157 | + | |
158 | +static int ndef_prefix(BIO *b, unsigned char **pbuf, int *plen, void *parg) | |
159 | + { | |
160 | + NDEF_SUPPORT *ndef_aux; | |
161 | + unsigned char *p; | |
162 | + int derlen; | |
163 | + | |
164 | + if (!parg) | |
165 | + return 0; | |
166 | + | |
167 | + ndef_aux = *(NDEF_SUPPORT **)parg; | |
168 | + | |
169 | + derlen = ASN1_item_ndef_i2d(ndef_aux->val, NULL, ndef_aux->it); | |
170 | + p = OPENSSL_malloc(derlen); | |
171 | + ndef_aux->derbuf = p; | |
172 | + *pbuf = p; | |
173 | + derlen = ASN1_item_ndef_i2d(ndef_aux->val, &p, ndef_aux->it); | |
174 | + | |
175 | + if (!*ndef_aux->boundary) | |
176 | + return 0; | |
177 | + | |
178 | + *plen = *ndef_aux->boundary - *pbuf; | |
179 | + | |
180 | + return 1; | |
181 | + } | |
182 | + | |
183 | +static int ndef_prefix_free(BIO *b, unsigned char **pbuf, int *plen, void *parg) | |
184 | + { | |
185 | + NDEF_SUPPORT *ndef_aux; | |
186 | + | |
187 | + if (!parg) | |
188 | + return 0; | |
189 | + | |
190 | + ndef_aux = *(NDEF_SUPPORT **)parg; | |
191 | + | |
192 | + if (ndef_aux->derbuf) | |
193 | + OPENSSL_free(ndef_aux->derbuf); | |
194 | + | |
195 | + ndef_aux->derbuf = NULL; | |
196 | + *pbuf = NULL; | |
197 | + *plen = 0; | |
198 | + return 1; | |
199 | + } | |
200 | + | |
201 | +static int ndef_suffix_free(BIO *b, unsigned char **pbuf, int *plen, void *parg) | |
202 | + { | |
203 | + NDEF_SUPPORT **pndef_aux = (NDEF_SUPPORT **)parg; | |
204 | + if (!ndef_prefix_free(b, pbuf, plen, parg)) | |
205 | + return 0; | |
206 | + OPENSSL_free(*pndef_aux); | |
207 | + *pndef_aux = NULL; | |
208 | + return 1; | |
209 | + } | |
210 | + | |
211 | +static int ndef_suffix(BIO *b, unsigned char **pbuf, int *plen, void *parg) | |
212 | + { | |
213 | + NDEF_SUPPORT *ndef_aux; | |
214 | + unsigned char *p; | |
215 | + int derlen; | |
216 | + const ASN1_AUX *aux; | |
217 | + ASN1_STREAM_ARG sarg; | |
218 | + | |
219 | + if (!parg) | |
220 | + return 0; | |
221 | + | |
222 | + ndef_aux = *(NDEF_SUPPORT **)parg; | |
223 | + | |
224 | + aux = ndef_aux->it->funcs; | |
225 | + | |
226 | + /* Finalize structures */ | |
227 | + sarg.ndef_bio = ndef_aux->ndef_bio; | |
228 | + sarg.out = ndef_aux->out; | |
229 | + sarg.boundary = ndef_aux->boundary; | |
230 | + if (aux->asn1_cb(ASN1_OP_STREAM_POST, | |
231 | + &ndef_aux->val, ndef_aux->it, &sarg) <= 0) | |
232 | + return 0; | |
233 | + | |
234 | + derlen = ASN1_item_ndef_i2d(ndef_aux->val, NULL, ndef_aux->it); | |
235 | + p = OPENSSL_malloc(derlen); | |
236 | + ndef_aux->derbuf = p; | |
237 | + *pbuf = p; | |
238 | + derlen = ASN1_item_ndef_i2d(ndef_aux->val, &p, ndef_aux->it); | |
239 | + | |
240 | + if (!*ndef_aux->boundary) | |
241 | + return 0; | |
242 | + *pbuf = *ndef_aux->boundary; | |
243 | + *plen = derlen - (*ndef_aux->boundary - ndef_aux->derbuf); | |
244 | + | |
245 | + return 1; | |
246 | + } |
@@ -0,0 +1,72 @@ | ||
1 | +/* x_nx509.c */ | |
2 | +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL | |
3 | + * project 2005. | |
4 | + */ | |
5 | +/* ==================================================================== | |
6 | + * Copyright (c) 2005 The OpenSSL Project. All rights reserved. | |
7 | + * | |
8 | + * Redistribution and use in source and binary forms, with or without | |
9 | + * modification, are permitted provided that the following conditions | |
10 | + * are met: | |
11 | + * | |
12 | + * 1. Redistributions of source code must retain the above copyright | |
13 | + * notice, this list of conditions and the following disclaimer. | |
14 | + * | |
15 | + * 2. Redistributions in binary form must reproduce the above copyright | |
16 | + * notice, this list of conditions and the following disclaimer in | |
17 | + * the documentation and/or other materials provided with the | |
18 | + * distribution. | |
19 | + * | |
20 | + * 3. All advertising materials mentioning features or use of this | |
21 | + * software must display the following acknowledgment: | |
22 | + * "This product includes software developed by the OpenSSL Project | |
23 | + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" | |
24 | + * | |
25 | + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to | |
26 | + * endorse or promote products derived from this software without | |
27 | + * prior written permission. For written permission, please contact | |
28 | + * licensing@OpenSSL.org. | |
29 | + * | |
30 | + * 5. Products derived from this software may not be called "OpenSSL" | |
31 | + * nor may "OpenSSL" appear in their names without prior written | |
32 | + * permission of the OpenSSL Project. | |
33 | + * | |
34 | + * 6. Redistributions of any form whatsoever must retain the following | |
35 | + * acknowledgment: | |
36 | + * "This product includes software developed by the OpenSSL Project | |
37 | + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" | |
38 | + * | |
39 | + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY | |
40 | + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
41 | + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
42 | + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR | |
43 | + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
44 | + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | |
45 | + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
46 | + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
47 | + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | |
48 | + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
49 | + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | |
50 | + * OF THE POSSIBILITY OF SUCH DAMAGE. | |
51 | + * ==================================================================== | |
52 | + * | |
53 | + * This product includes cryptographic software written by Eric Young | |
54 | + * (eay@cryptsoft.com). This product includes software written by Tim | |
55 | + * Hudson (tjh@cryptsoft.com). | |
56 | + * | |
57 | + */ | |
58 | + | |
59 | +#include <stddef.h> | |
60 | +#include <openssl/x509.h> | |
61 | +#include <openssl/asn1.h> | |
62 | +#include <openssl/asn1t.h> | |
63 | + | |
/*
 * Old netscape certificate wrapper format: a SEQUENCE made of a mandatory
 * OCTET STRING "header" followed by an optional X509 "cert".
 */

ASN1_SEQUENCE(NETSCAPE_X509) = {
	ASN1_SIMPLE(NETSCAPE_X509, header, ASN1_OCTET_STRING),
	ASN1_OPT(NETSCAPE_X509, cert, X509)
} ASN1_SEQUENCE_END(NETSCAPE_X509)

/* Instantiate the ASN.1 helper functions for NETSCAPE_X509 from the template */
IMPLEMENT_ASN1_FUNCTIONS(NETSCAPE_X509)
72 | + |
@@ -0,0 +1,317 @@ | ||
1 | +#!/usr/bin/env perl | |
2 | +# | |
3 | +# ==================================================================== | |
4 | +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
5 | +# project. The module is, however, dual licensed under OpenSSL and | |
6 | +# CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | +# details see http://www.openssl.org/~appro/cryptogams/. | |
8 | +# ==================================================================== | |
9 | +# | |
10 | +# On 21264 RSA sign performance improves by 70/35/20/15 percent for | |
11 | +# 512/1024/2048/4096 bit key lengths. This is against vendor compiler | |
12 | +# instructed to '-tune host' code with in-line assembler. Other | |
13 | +# benchmarks improve by 15-20%. To anchor it to something else, the | |
14 | +# code provides approximately the same performance per GHz as AMD64. | |
15 | +# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x | |
16 | +# difference. | |
17 | + | |
# Generator for the Alpha bn_mul_mont routine.  The register aliases
# defined below are interpolated into the assembly template stored in
# $code, which is then written to STDOUT.
#
# int bn_mul_mont(
$rp="a0"; # BN_ULONG *rp,
$ap="a1"; # const BN_ULONG *ap,
$bp="a2"; # const BN_ULONG *bp,
$np="a3"; # const BN_ULONG *np,
$n0="a4"; # const BN_ULONG *n0,
$num="a5"; # int num);

# Working registers used by the template.  s3-s5 ($i, $j, $m1) are stored
# by the prologue and reloaded at .Lexit.
$lo0="t0";
$hi0="t1";
$lo1="t2";
$hi1="t3";
$aj="t4";
$bi="t5";
$nj="t6";
$tp="t7";
$alo="t8";
$ahi="t9";
$nlo="t10";
$nhi="t11";
$tj="t12";
$i="s3";
$j="s4";
$m1="s5";

$code=<<___;
#include <asm.h>
#include <regdef.h>

.text

.set noat
.set noreorder

.globl bn_mul_mont
.align 5
.ent bn_mul_mont
bn_mul_mont:
	lda sp,-40(sp)
	stq ra,0(sp)
	stq s3,8(sp)
	stq s4,16(sp)
	stq s5,24(sp)
	stq fp,32(sp)
	mov sp,fp
	.mask 0x0400f000,-40
	.frame fp,40,ra
	.prologue 0

	.align 4
	.set reorder
	sextl $num,$num
	mov 0,v0
	cmplt $num,4,AT
	bne AT,.Lexit

	ldq $hi0,0($ap) # ap[0]
	s8addq $num,16,AT
	ldq $aj,8($ap)
	subq sp,AT,sp
	ldq $bi,0($bp) # bp[0]
	mov -4096,AT
	ldq $n0,0($n0)
	and sp,AT,sp

	mulq $hi0,$bi,$lo0
	ldq $hi1,0($np) # np[0]
	umulh $hi0,$bi,$hi0
	ldq $nj,8($np)

	mulq $lo0,$n0,$m1

	mulq $hi1,$m1,$lo1
	umulh $hi1,$m1,$hi1

	addq $lo1,$lo0,$lo1
	cmpult $lo1,$lo0,AT
	addq $hi1,AT,$hi1

	mulq $aj,$bi,$alo
	mov 2,$j
	umulh $aj,$bi,$ahi
	mov sp,$tp

	mulq $nj,$m1,$nlo
	s8addq $j,$ap,$aj
	umulh $nj,$m1,$nhi
	s8addq $j,$np,$nj
.align 4
.L1st:
	.set noreorder
	ldq $aj,($aj)
	addl $j,1,$j
	ldq $nj,($nj)
	lda $tp,8($tp)

	addq $alo,$hi0,$lo0
	mulq $aj,$bi,$alo
	cmpult $lo0,$hi0,AT
	addq $nlo,$hi1,$lo1

	mulq $nj,$m1,$nlo
	addq $ahi,AT,$hi0
	cmpult $lo1,$hi1,v0
	cmplt $j,$num,$tj

	umulh $aj,$bi,$ahi
	addq $nhi,v0,$hi1
	addq $lo1,$lo0,$lo1
	s8addq $j,$ap,$aj

	umulh $nj,$m1,$nhi
	cmpult $lo1,$lo0,v0
	addq $hi1,v0,$hi1
	s8addq $j,$np,$nj

	stq $lo1,-8($tp)
	nop
	unop
	bne $tj,.L1st
	.set reorder

	addq $alo,$hi0,$lo0
	addq $nlo,$hi1,$lo1
	cmpult $lo0,$hi0,AT
	cmpult $lo1,$hi1,v0
	addq $ahi,AT,$hi0
	addq $nhi,v0,$hi1

	addq $lo1,$lo0,$lo1
	cmpult $lo1,$lo0,v0
	addq $hi1,v0,$hi1

	stq $lo1,0($tp)

	addq $hi1,$hi0,$hi1
	cmpult $hi1,$hi0,AT
	stq $hi1,8($tp)
	stq AT,16($tp)

	mov 1,$i
.align 4
.Louter:
	s8addq $i,$bp,$bi
	ldq $hi0,($ap)
	ldq $aj,8($ap)
	ldq $bi,($bi)
	ldq $hi1,($np)
	ldq $nj,8($np)
	ldq $tj,(sp)

	mulq $hi0,$bi,$lo0
	umulh $hi0,$bi,$hi0

	addq $lo0,$tj,$lo0
	cmpult $lo0,$tj,AT
	addq $hi0,AT,$hi0

	mulq $lo0,$n0,$m1

	mulq $hi1,$m1,$lo1
	umulh $hi1,$m1,$hi1

	addq $lo1,$lo0,$lo1
	cmpult $lo1,$lo0,AT
	mov 2,$j
	addq $hi1,AT,$hi1

	mulq $aj,$bi,$alo
	mov sp,$tp
	umulh $aj,$bi,$ahi

	mulq $nj,$m1,$nlo
	s8addq $j,$ap,$aj
	umulh $nj,$m1,$nhi
.align 4
.Linner:
	.set noreorder
	ldq $tj,8($tp) #L0
	nop #U1
	ldq $aj,($aj) #L1
	s8addq $j,$np,$nj #U0

	ldq $nj,($nj) #L0
	nop #U1
	addq $alo,$hi0,$lo0 #L1
	lda $tp,8($tp)

	mulq $aj,$bi,$alo #U1
	cmpult $lo0,$hi0,AT #L0
	addq $nlo,$hi1,$lo1 #L1
	addl $j,1,$j

	mulq $nj,$m1,$nlo #U1
	addq $ahi,AT,$hi0 #L0
	addq $lo0,$tj,$lo0 #L1
	cmpult $lo1,$hi1,v0 #U0

	umulh $aj,$bi,$ahi #U1
	cmpult $lo0,$tj,AT #L0
	addq $lo1,$lo0,$lo1 #L1
	addq $nhi,v0,$hi1 #U0

	umulh $nj,$m1,$nhi #U1
	s8addq $j,$ap,$aj #L0
	cmpult $lo1,$lo0,v0 #L1
	cmplt $j,$num,$tj #U0 # borrow $tj

	addq $hi0,AT,$hi0 #L0
	addq $hi1,v0,$hi1 #U1
	stq $lo1,-8($tp) #L1
	bne $tj,.Linner #U0
	.set reorder

	ldq $tj,8($tp)
	addq $alo,$hi0,$lo0
	addq $nlo,$hi1,$lo1
	cmpult $lo0,$hi0,AT
	cmpult $lo1,$hi1,v0
	addq $ahi,AT,$hi0
	addq $nhi,v0,$hi1

	addq $lo0,$tj,$lo0
	cmpult $lo0,$tj,AT
	addq $hi0,AT,$hi0

	ldq $tj,16($tp)
	addq $lo1,$lo0,$j
	cmpult $j,$lo0,v0
	addq $hi1,v0,$hi1

	addq $hi1,$hi0,$lo1
	stq $j,($tp)
	cmpult $lo1,$hi0,$hi1
	addq $lo1,$tj,$lo1
	cmpult $lo1,$tj,AT
	addl $i,1,$i
	addq $hi1,AT,$hi1
	stq $lo1,8($tp)
	cmplt $i,$num,$tj # borrow $tj
	stq $hi1,16($tp)
	bne $tj,.Louter

	s8addq $num,sp,$tj # &tp[num]
	mov $rp,$bp # put rp aside
	mov sp,$tp
	mov sp,$ap
	mov 0,$hi0 # clear borrow bit

.align 4
.Lsub: ldq $lo0,($tp)
	ldq $lo1,($np)
	lda $tp,8($tp)
	lda $np,8($np)
	subq $lo0,$lo1,$lo1 # tp[i]-np[i]
	cmpult $lo0,$lo1,AT
	subq $lo1,$hi0,$lo0
	cmpult $lo1,$lo0,$hi0
	or $hi0,AT,$hi0
	stq $lo0,($rp)
	cmpult $tp,$tj,v0
	lda $rp,8($rp)
	bne v0,.Lsub

	subq $hi1,$hi0,$hi0 # handle upmost overflow bit
	mov sp,$tp
	mov $bp,$rp # restore rp

	and sp,$hi0,$ap
	bic $bp,$hi0,$bp
	bis $bp,$ap,$ap # ap=borrow?tp:rp

.align 4
.Lcopy: ldq $aj,($ap) # copy or in-place refresh
	lda $tp,8($tp)
	lda $rp,8($rp)
	lda $ap,8($ap)
	stq zero,-8($tp) # zap tp
	cmpult $tp,$tj,AT
	stq $aj,-8($rp)
	bne AT,.Lcopy
	mov 1,v0

.Lexit:
	.set noreorder
	mov fp,sp
	/*ldq ra,0(sp)*/
	ldq s3,8(sp)
	ldq s4,16(sp)
	ldq s5,24(sp)
	ldq fp,32(sp)
	lda sp,40(sp)
	ret (ra)
.end bn_mul_mont
.rdata
.asciiz "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
___

print $code;
# A failed flush (e.g. full disk) would otherwise leave a truncated
# assembly file behind without any diagnostic.
close STDOUT or die "error closing STDOUT: $!";
@@ -0,0 +1,200 @@ | ||
1 | +#!/usr/bin/env perl | |
2 | + | |
3 | +# ==================================================================== | |
4 | +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
5 | +# project. The module is, however, dual licensed under OpenSSL and | |
6 | +# CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | +# details see http://www.openssl.org/~appro/cryptogams/. | |
8 | +# ==================================================================== | |
9 | + | |
10 | +# January 2007. | |
11 | + | |
12 | +# Montgomery multiplication for ARMv4. | |
13 | +# | |
14 | +# Performance improvement naturally varies among CPU implementations | |
15 | +# and compilers. The code was observed to provide +65-35% improvement | |
16 | +# [depending on key length, less for longer keys] on ARM920T, and | |
17 | +# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code | |
18 | +# base and compiler generated code with in-lined umull and even umlal | |
19 | +# instructions. The latter means that this code didn't really have an | |
20 | +# "advantage" of utilizing some "secret" instruction. | |
21 | +# | |
22 | +# The code is interoperable with Thumb ISA and is rather compact, less | |
23 | +# than 1/2KB. Windows CE port would be trivial, as it's exclusively | |
24 | +# about decorations, ABI and instruction syntax are identical. | |
25 | + | |
# Register aliases interpolated into the assembly template below.  Note
# that $bp, $bi and $rp all name r2.
$num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10"; # sl, gcc uses it to keep @GOT
$ahi="r11"; # fp
$nlo="r12"; # ip
########### # r13 is stack pointer
$nhi="r14"; # lr
########### # r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4"; $_bpend=$_num;

$code=<<___;
.text

.global bn_mul_mont
.type bn_mul_mont,%function

.align 2
bn_mul_mont:
	stmdb sp!,{r0,r2} @ sp points at argument block
	ldr $num,[sp,#3*4] @ load num
	cmp $num,#2
	movlt r0,#0
	addlt sp,sp,#2*4
	blt .Labrt

	stmdb sp!,{r4-r12,lr} @ save 10 registers

	mov $num,$num,lsl#2 @ rescale $num for byte count
	sub sp,sp,$num @ alloca(4*num)
	sub sp,sp,#4 @ +extra dword
	sub $num,$num,#4 @ "num=num-1"
	add $tp,$bp,$num @ &bp[num-1]

	add $num,sp,$num @ $num to point at &tp[num-1]
	ldr $n0,[$_n0] @ &n0
	ldr $bi,[$bp] @ bp[0]
	ldr $aj,[$ap],#4 @ ap[0],ap++
	ldr $nj,[$np],#4 @ np[0],np++
	ldr $n0,[$n0] @ *n0
	str $tp,[$_bpend] @ save &bp[num]

	umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
	str $n0,[$_n0] @ save n0 value
	mul $n0,$alo,$n0 @ "tp[0]"*n0
	mov $nlo,#0
	umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
	mov $tp,sp

.L1st:
	ldr $aj,[$ap],#4 @ ap[j],ap++
	mov $alo,$ahi
	mov $ahi,#0
	umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
	ldr $nj,[$np],#4 @ np[j],np++
	mov $nhi,#0
	umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
	adds $nlo,$nlo,$alo
	str $nlo,[$tp],#4 @ tp[j-1]=,tp++
	adc $nlo,$nhi,#0
	cmp $tp,$num
	bne .L1st

	adds $nlo,$nlo,$ahi
	mov $nhi,#0
	adc $nhi,$nhi,#0
	ldr $tp,[$_bp] @ restore bp
	str $nlo,[$num] @ tp[num-1]=
	ldr $n0,[$_n0] @ restore n0
	str $nhi,[$num,#4] @ tp[num]=

.Louter:
	sub $tj,$num,sp @ "original" $num-1 value
	sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
	sub $np,$np,$tj @ "rewind" np to &np[1]
	ldr $bi,[$tp,#4]! @ *(++bp)
	ldr $aj,[$ap,#-4] @ ap[0]
	ldr $nj,[$np,#-4] @ np[0]
	ldr $alo,[sp] @ tp[0]
	ldr $tj,[sp,#4] @ tp[1]

	mov $ahi,#0
	umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
	str $tp,[$_bp] @ save bp
	mul $n0,$alo,$n0
	mov $nlo,#0
	umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
	mov $tp,sp

.Linner:
	ldr $aj,[$ap],#4 @ ap[j],ap++
	adds $alo,$ahi,$tj @ +=tp[j]
	mov $ahi,#0
	umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
	ldr $nj,[$np],#4 @ np[j],np++
	mov $nhi,#0
	umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
	ldr $tj,[$tp,#8] @ tp[j+1]
	adc $ahi,$ahi,#0
	adds $nlo,$nlo,$alo
	str $nlo,[$tp],#4 @ tp[j-1]=,tp++
	adc $nlo,$nhi,#0
	cmp $tp,$num
	bne .Linner

	adds $nlo,$nlo,$ahi
	mov $nhi,#0
	adc $nhi,$nhi,#0
	adds $nlo,$nlo,$tj
	adc $nhi,$nhi,#0
	ldr $tp,[$_bp] @ restore bp
	ldr $tj,[$_bpend] @ restore &bp[num]
	str $nlo,[$num] @ tp[num-1]=
	ldr $n0,[$_n0] @ restore n0
	str $nhi,[$num,#4] @ tp[num]=

	cmp $tp,$tj
	bne .Louter

	ldr $rp,[$_rp] @ pull rp
	add $num,$num,#4 @ $num to point at &tp[num]
	sub $aj,$num,sp @ "original" num value
	mov $tp,sp @ "rewind" $tp
	mov $ap,$tp @ "borrow" $ap
	sub $np,$np,$aj @ "rewind" $np to &np[0]

	subs $tj,$tj,$tj @ "clear" carry flag
.Lsub: ldr $tj,[$tp],#4
	ldr $nj,[$np],#4
	sbcs $tj,$tj,$nj @ tp[j]-np[j]
	str $tj,[$rp],#4 @ rp[j]=
	teq $tp,$num @ preserve carry
	bne .Lsub
	sbcs $nhi,$nhi,#0 @ upmost carry
	mov $tp,sp @ "rewind" $tp
	sub $rp,$rp,$aj @ "rewind" $rp

	and $ap,$tp,$nhi
	bic $np,$rp,$nhi
	orr $ap,$ap,$np @ ap=borrow?tp:rp

.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
	str sp,[$tp],#4 @ zap tp
	str $tj,[$rp],#4
	cmp $tp,$num
	bne .Lcopy

	add sp,$num,#4 @ skip over tp[num+1]
	ldmia sp!,{r4-r12,lr} @ restore registers
	add sp,sp,#2*4 @ skip over {r0,r2}
	mov r0,#1
.Labrt: tst lr,#1
	moveq pc,lr @ be binary compatible with V4, yet
	bx lr @ interoperable with Thumb ISA:-)
.size bn_mul_mont,.-bn_mul_mont
.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
___

$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
print $code;
# A failed flush (e.g. full disk) would otherwise leave a truncated
# assembly file behind without any diagnostic.
close STDOUT or die "error closing STDOUT: $!";
@@ -0,0 +1,327 @@ | ||
1 | +#!/usr/bin/env perl | |
2 | +# | |
3 | +# ==================================================================== | |
4 | +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
5 | +# project. The module is, however, dual licensed under OpenSSL and | |
6 | +# CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | +# details see http://www.openssl.org/~appro/cryptogams/. | |
8 | +# ==================================================================== | |
9 | + | |
10 | +# This module doesn't present direct interest for OpenSSL, because it | |
11 | +# doesn't provide better performance for longer keys. While 512-bit | |
12 | +# RSA private key operations are 40% faster, 1024-bit ones are hardly | |
13 | +# faster at all, while longer key operations are slower by up to 20%. | |
14 | +# It might be of interest to embedded system developers though, as | |
15 | +# it's smaller than 1KB, yet offers ~3x improvement over compiler | |
16 | +# generated code. | |
17 | +# | |
18 | +# The module targets N32 and N64 MIPS ABIs and currently is a bit | |
19 | +# IRIX-centric, i.e. is likely to require adaptation for other OSes. | |
20 | + | |
# Generator for the MIPS bn_mul_mont routine.  The register aliases
# defined below are interpolated into the assembly template stored in
# $code, which is then written to STDOUT.
#
# int bn_mul_mont(
$rp="a0"; # BN_ULONG *rp,
$ap="a1"; # const BN_ULONG *ap,
$bp="a2"; # const BN_ULONG *bp,
$np="a3"; # const BN_ULONG *np,
$n0="a4"; # const BN_ULONG *n0,
$num="a5"; # int num);

# Working registers used by the template.  s0-s7 are stored relative to
# $fp after .Lproceed and reloaded before the final return; s7 is used
# directly by name in the template as a second carry temporary.
$lo0="a6";
$hi0="a7";
$lo1="v0";
$hi1="v1";
$aj="t0";
$bi="t1";
$nj="t2";
$tp="t3";
$alo="s0";
$ahi="s1";
$nlo="s2";
$nhi="s3";
$tj="s4";
$i="s5";
$j="s6";
$fp="t8";
$m1="t9";

# NOTE(review): $FRAME is not referenced by the template below, which
# hard-codes a 64-byte save area - confirm whether it is still needed.
$FRAME=8*(2+8);

$code=<<___;
#include <asm.h>
#include <regdef.h>

.text

.set noat
.set reorder

.align 5
.globl bn_mul_mont
.ent bn_mul_mont
bn_mul_mont:
	.set noreorder
	PTR_SUB sp,64
	move $fp,sp
	.frame $fp,64,ra
	slt AT,$num,4
	li v0,0
	beqzl AT,.Lproceed
	nop
	jr ra
	PTR_ADD sp,$fp,64
	.set reorder
.align 5
.Lproceed:
	ld $n0,0($n0)
	ld $bi,0($bp) # bp[0]
	ld $aj,0($ap) # ap[0]
	ld $nj,0($np) # np[0]
	PTR_SUB sp,16 # place for two extra words
	sll $num,3
	li AT,-4096
	PTR_SUB sp,$num
	and sp,AT

	sd s0,0($fp)
	sd s1,8($fp)
	sd s2,16($fp)
	sd s3,24($fp)
	sd s4,32($fp)
	sd s5,40($fp)
	sd s6,48($fp)
	sd s7,56($fp)

	dmultu $aj,$bi
	ld $alo,8($ap)
	ld $nlo,8($np)
	mflo $lo0
	mfhi $hi0
	dmultu $lo0,$n0
	mflo $m1

	dmultu $alo,$bi
	mflo $alo
	mfhi $ahi

	dmultu $nj,$m1
	mflo $lo1
	mfhi $hi1
	dmultu $nlo,$m1
	daddu $lo1,$lo0
	sltu AT,$lo1,$lo0
	daddu $hi1,AT
	mflo $nlo
	mfhi $nhi

	move $tp,sp
	li $j,16
.align 4
.L1st:
	.set noreorder
	PTR_ADD $aj,$ap,$j
	ld $aj,($aj)
	PTR_ADD $nj,$np,$j
	ld $nj,($nj)

	dmultu $aj,$bi
	daddu $lo0,$alo,$hi0
	daddu $lo1,$nlo,$hi1
	sltu AT,$lo0,$hi0
	sltu s7,$lo1,$hi1
	daddu $hi0,$ahi,AT
	daddu $hi1,$nhi,s7
	mflo $alo
	mfhi $ahi

	daddu $lo1,$lo0
	sltu AT,$lo1,$lo0
	dmultu $nj,$m1
	daddu $hi1,AT
	addu $j,8
	sd $lo1,($tp)
	sltu s7,$j,$num
	mflo $nlo
	mfhi $nhi

	bnez s7,.L1st
	PTR_ADD $tp,8
	.set reorder

	daddu $lo0,$alo,$hi0
	sltu AT,$lo0,$hi0
	daddu $hi0,$ahi,AT

	daddu $lo1,$nlo,$hi1
	sltu s7,$lo1,$hi1
	daddu $hi1,$nhi,s7
	daddu $lo1,$lo0
	sltu AT,$lo1,$lo0
	daddu $hi1,AT

	sd $lo1,($tp)

	daddu $hi1,$hi0
	sltu AT,$hi1,$hi0
	sd $hi1,8($tp)
	sd AT,16($tp)

	li $i,8
.align 4
.Louter:
	PTR_ADD $bi,$bp,$i
	ld $bi,($bi)
	ld $aj,($ap)
	ld $alo,8($ap)
	ld $tj,(sp)

	dmultu $aj,$bi
	ld $nj,($np)
	ld $nlo,8($np)
	mflo $lo0
	mfhi $hi0
	daddu $lo0,$tj
	dmultu $lo0,$n0
	sltu AT,$lo0,$tj
	daddu $hi0,AT
	mflo $m1

	dmultu $alo,$bi
	mflo $alo
	mfhi $ahi

	dmultu $nj,$m1
	mflo $lo1
	mfhi $hi1

	dmultu $nlo,$m1
	daddu $lo1,$lo0
	sltu AT,$lo1,$lo0
	daddu $hi1,AT
	mflo $nlo
	mfhi $nhi

	move $tp,sp
	li $j,16
	ld $tj,8($tp)
.align 4
.Linner:
	.set noreorder
	PTR_ADD $aj,$ap,$j
	ld $aj,($aj)
	PTR_ADD $nj,$np,$j
	ld $nj,($nj)

	dmultu $aj,$bi
	daddu $lo0,$alo,$hi0
	daddu $lo1,$nlo,$hi1
	sltu AT,$lo0,$hi0
	sltu s7,$lo1,$hi1
	daddu $hi0,$ahi,AT
	daddu $hi1,$nhi,s7
	mflo $alo
	mfhi $ahi

	daddu $lo0,$tj
	addu $j,8
	dmultu $nj,$m1
	sltu AT,$lo0,$tj
	daddu $lo1,$lo0
	daddu $hi0,AT
	sltu s7,$lo1,$lo0
	ld $tj,16($tp)
	daddu $hi1,s7
	sltu AT,$j,$num
	mflo $nlo
	mfhi $nhi
	sd $lo1,($tp)
	bnez AT,.Linner
	PTR_ADD $tp,8
	.set reorder

	daddu $lo0,$alo,$hi0
	sltu AT,$lo0,$hi0
	daddu $hi0,$ahi,AT
	daddu $lo0,$tj
	sltu s7,$lo0,$tj
	daddu $hi0,s7

	ld $tj,16($tp)
	daddu $lo1,$nlo,$hi1
	sltu AT,$lo1,$hi1
	daddu $hi1,$nhi,AT
	daddu $lo1,$lo0
	sltu s7,$lo1,$lo0
	daddu $hi1,s7
	sd $lo1,($tp)

	daddu $lo1,$hi1,$hi0
	sltu $hi1,$lo1,$hi0
	daddu $lo1,$tj
	sltu AT,$lo1,$tj
	daddu $hi1,AT
	sd $lo1,8($tp)
	sd $hi1,16($tp)

	addu $i,8
	sltu s7,$i,$num
	bnez s7,.Louter

	.set noreorder
	PTR_ADD $tj,sp,$num # &tp[num]
	move $tp,sp
	move $ap,sp
	li $hi0,0 # clear borrow bit

.align 4
.Lsub: ld $lo0,($tp)
	ld $lo1,($np)
	PTR_ADD $tp,8
	PTR_ADD $np,8
	dsubu $lo1,$lo0,$lo1 # tp[i]-np[i]
	sgtu AT,$lo1,$lo0
	dsubu $lo0,$lo1,$hi0
	sgtu $hi0,$lo0,$lo1
	sd $lo0,($rp)
	or $hi0,AT
	sltu AT,$tp,$tj
	bnez AT,.Lsub
	PTR_ADD $rp,8

	dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit
	move $tp,sp
	PTR_SUB $rp,$num # restore rp
	not $hi1,$hi0

	and $ap,$hi0,sp
	and $bp,$hi1,$rp
	or $ap,$ap,$bp # ap=borrow?tp:rp

.align 4
.Lcopy: ld $aj,($ap)
	PTR_ADD $ap,8
	PTR_ADD $tp,8
	sd zero,-8($tp)
	sltu AT,$tp,$tj
	sd $aj,($rp)
	bnez AT,.Lcopy
	PTR_ADD $rp,8

	ld s0,0($fp)
	ld s1,8($fp)
	ld s2,16($fp)
	ld s3,24($fp)
	ld s4,32($fp)
	ld s5,40($fp)
	ld s6,48($fp)
	ld s7,56($fp)
	li v0,1
	jr ra
	PTR_ADD sp,$fp,64
	.set reorder
END(bn_mul_mont)
.rdata
.asciiz "Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>"
___

print $code;
# A failed flush (e.g. full disk) would otherwise leave a truncated
# assembly file behind without any diagnostic.
close STDOUT or die "error closing STDOUT: $!";
@@ -0,0 +1,323 @@ | ||
1 | +#!/usr/bin/env perl | |
2 | + | |
3 | +# ==================================================================== | |
4 | +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
5 | +# project. The module is, however, dual licensed under OpenSSL and | |
6 | +# CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | +# details see http://www.openssl.org/~appro/cryptogams/. | |
8 | +# ==================================================================== | |
9 | + | |
10 | +# April 2006 | |
11 | + | |
12 | +# "Teaser" Montgomery multiplication module for PowerPC. It's possible | |
13 | +# to gain a bit more by modulo-scheduling outer loop, then dedicated | |
14 | +# squaring procedure should give further 20% and code can be adapted | |
15 | +# for 32-bit application running on 64-bit CPU. As for the latter, | |
16 | +# it won't be able to achieve "native" 64-bit performance, because in | |
17 | +# 32-bit application context every addc instruction will have to be | |
18 | +# expanded as addc, twice right shift by 32 and finally adde, etc. | |
19 | +# So far RSA *sign* performance improvement over pre-bn_mul_mont asm | |
20 | +# for 64-bit application running on PPC970/G5 is: | |
21 | +# | |
22 | +# 512-bit +65% | |
23 | +# 1024-bit +35% | |
24 | +# 2048-bit +18% | |
25 | +# 4096-bit +4% | |
26 | + | |
27 | +$flavour = shift; | |
28 | + | |
29 | +# Code generation parameters, selected by the flavour string. "32" is | |
30 | +# tested before "64"; any other flavour is fatal. | |
31 | +# | |
32 | +# $BITS, $BNSZ limb width in bits and in bytes | |
33 | +# $SIZE_T size in bytes of one saved-register slot | |
34 | +# $RZONE, $FRAME byte counts added to num*$BNSZ when the frame is sized | |
35 | +# $LD load | |
36 | +# $LDU load and update | |
37 | +# $LDX load indexed | |
38 | +# $ST store | |
39 | +# $STU store and update | |
40 | +# $STX store indexed | |
41 | +# $STUX store indexed and update | |
42 | +# $UMULL unsigned multiply low | |
43 | +# $UMULH unsigned multiply high | |
44 | +# $UCMP unsigned compare | |
45 | +# $SHRI unsigned shift right by immediate | |
46 | +# $PUSH, $POP aliases of $ST and $LD, used to save/restore registers | |
47 | + | |
48 | +if ($flavour =~ /32/) { | |
49 | + ($BITS,$SIZE_T,$RZONE) = (32,4,224); | |
50 | + ($LD,$LDU,$LDX) = ("lwz","lwzu","lwzx"); | |
51 | + ($ST,$STU,$STX,$STUX) = ("stw","stwu","stwx","stwux"); | |
52 | + ($UMULL,$UMULH) = ("mullw","mulhwu"); | |
53 | + ($UCMP,$SHRI) = ("cmplw","srwi"); | |
54 | +} elsif ($flavour =~ /64/) { | |
55 | + # same as above, but 64-bit mnemonics... | |
56 | + ($BITS,$SIZE_T,$RZONE) = (64,8,288); | |
57 | + ($LD,$LDU,$LDX) = ("ld","ldu","ldx"); | |
58 | + ($ST,$STU,$STX,$STUX) = ("std","stdu","stdx","stdux"); | |
59 | + ($UMULL,$UMULH) = ("mulld","mulhdu"); | |
60 | + ($UCMP,$SHRI) = ("cmpld","srdi"); | |
61 | +} | |
62 | + | |
63 | +# Values derived the same way for both flavours. $BITS is only set when | |
64 | +# a flavour matched, so an unknown flavour dies with nothing assigned. | |
65 | +if ($BITS) { | |
66 | + $BNSZ= $BITS/8; | |
67 | + $FRAME= $SIZE_T*16; | |
68 | + $PUSH= $ST; | |
69 | + $POP= $LD; | |
70 | +} else { die "nonsense $flavour"; } | |
71 | + | |
72 | +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# directory part of this script's path | |
73 | +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or	# translator beside this script, ... | |
74 | +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or	# ... or in ../../perlasm | |
75 | +die "can't locate ppc-xlate.pl"; | |
76 | +# Pipe all output through the translator; "or" makes die guard open() itself. | |
77 | +open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!"; | |
78 | + | |
79 | +# Register names interpolated into the assembly text below. | |
80 | +# | |
81 | +# r3 arrives holding rp; the code moves it to r9 on entry and from then | |
82 | +# on uses r3 for the topmost overflow bit and for the return value. | |
83 | +# r4..r8 are used as the incoming ap, bp, np, n0 and num. | |
84 | +# $rp therefore names r9, where the result pointer lives for the rest | |
85 | +# of the function. | |
86 | + | |
87 | +$sp = "r1"; | |
88 | +$toc = "r2"; | |
89 | +$ovf = "r3"; | |
90 | +($ap,$bp,$np,$n0,$num) = map("r$_", 4..8); | |
91 | +$rp = "r9"; | |
92 | + | |
93 | +# scratch words, mostly ap[j], np[j] and tp[j] | |
94 | +($aj,$nj,$tj) = map("r$_", 10..12); | |
95 | + | |
96 | +# non-volatile registers, saved on entry and restored before return: | |
97 | +# loop offsets, the tp pointer and the two per-iteration multipliers, | |
98 | +($i,$j,$tp,$m0,$m1) = map("r$_", 14..18); | |
99 | +# low and high words of the two running sums, | |
100 | +($lo0,$hi0,$lo1,$hi1) = map("r$_", 19..22); | |
101 | +# low/high words of ap[j]*bp[i] and the low word of np[j]*m1 | |
102 | +($alo,$ahi,$nlo) = map("r$_", 23..25); | |
103 | + | |
104 | +# high word of np[j]*m1; r0 is not one of the registers saved on entry | |
105 | +$nhi = "r0"; | |
106 | + | |
107 | +$code=<<___;	# assembly text for .bn_mul_mont; backquoted expressions are evaluated before printing | |
108 | +.machine "any" | |
109 | +.text | |
110 | + | |
111 | +.globl .bn_mul_mont | |
112 | +.align 4 | |
113 | +.bn_mul_mont: | |
114 | + cmpwi $num,4 | |
115 | + mr $rp,r3 ; $rp is reassigned | |
116 | + li r3,0 | |
117 | + bltlr | |
118 | + | |
119 | + slwi $num,$num,`log($BNSZ)/log(2)` | |
120 | + li $tj,-4096 | |
121 | + addi $ovf,$num,`$FRAME+$RZONE` | |
122 | + subf $ovf,$ovf,$sp ; $sp-$ovf | |
123 | + and $ovf,$ovf,$tj ; minimize TLB usage | |
124 | + subf $ovf,$sp,$ovf ; $ovf-$sp | |
125 | + srwi $num,$num,`log($BNSZ)/log(2)` | |
126 | + $STUX $sp,$sp,$ovf | |
127 | + | |
128 | + $PUSH r14,`4*$SIZE_T`($sp) | |
129 | + $PUSH r15,`5*$SIZE_T`($sp) | |
130 | + $PUSH r16,`6*$SIZE_T`($sp) | |
131 | + $PUSH r17,`7*$SIZE_T`($sp) | |
132 | + $PUSH r18,`8*$SIZE_T`($sp) | |
133 | + $PUSH r19,`9*$SIZE_T`($sp) | |
134 | + $PUSH r20,`10*$SIZE_T`($sp) | |
135 | + $PUSH r21,`11*$SIZE_T`($sp) | |
136 | + $PUSH r22,`12*$SIZE_T`($sp) | |
137 | + $PUSH r23,`13*$SIZE_T`($sp) | |
138 | + $PUSH r24,`14*$SIZE_T`($sp) | |
139 | + $PUSH r25,`15*$SIZE_T`($sp) | |
140 | + | |
141 | + $LD $n0,0($n0) ; pull n0[0] value | |
142 | + addi $num,$num,-2 ; adjust $num for counter register | |
143 | + | |
144 | + $LD $m0,0($bp) ; m0=bp[0] | |
145 | + $LD $aj,0($ap) ; ap[0] | |
146 | + addi $tp,$sp,$FRAME | |
147 | + $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] | |
148 | + $UMULH $hi0,$aj,$m0 | |
149 | + | |
150 | + $LD $aj,$BNSZ($ap) ; ap[1] | |
151 | + $LD $nj,0($np) ; np[0] | |
152 | + | |
153 | + $UMULL $m1,$lo0,$n0 ; "tp[0]"*n0 | |
154 | + | |
155 | + $UMULL $alo,$aj,$m0 ; ap[1]*bp[0] | |
156 | + $UMULH $ahi,$aj,$m0 | |
157 | + | |
158 | + $UMULL $lo1,$nj,$m1 ; np[0]*m1 | |
159 | + $UMULH $hi1,$nj,$m1 | |
160 | + $LD $nj,$BNSZ($np) ; np[1] | |
161 | + addc $lo1,$lo1,$lo0 | |
162 | + addze $hi1,$hi1 | |
163 | + | |
164 | + $UMULL $nlo,$nj,$m1 ; np[1]*m1 | |
165 | + $UMULH $nhi,$nj,$m1 | |
166 | + | |
167 | + mtctr $num | |
168 | + li $j,`2*$BNSZ` | |
169 | +.align 4 | |
170 | +L1st: | |
171 | + $LDX $aj,$ap,$j ; ap[j] | |
172 | + addc $lo0,$alo,$hi0 | |
173 | + $LDX $nj,$np,$j ; np[j] | |
174 | + addze $hi0,$ahi | |
175 | + $UMULL $alo,$aj,$m0 ; ap[j]*bp[0] | |
176 | + addc $lo1,$nlo,$hi1 | |
177 | + $UMULH $ahi,$aj,$m0 | |
178 | + addze $hi1,$nhi | |
179 | + $UMULL $nlo,$nj,$m1 ; np[j]*m1 | |
180 | + addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] | |
181 | + $UMULH $nhi,$nj,$m1 | |
182 | + addze $hi1,$hi1 | |
183 | + $ST $lo1,0($tp) ; tp[j-1] | |
184 | + | |
185 | + addi $j,$j,$BNSZ ; j++ | |
186 | + addi $tp,$tp,$BNSZ ; tp++ | |
187 | + bdnz- L1st | |
188 | +;L1st | |
189 | + addc $lo0,$alo,$hi0 | |
190 | + addze $hi0,$ahi | |
191 | + | |
192 | + addc $lo1,$nlo,$hi1 | |
193 | + addze $hi1,$nhi | |
194 | + addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] | |
195 | + addze $hi1,$hi1 | |
196 | + $ST $lo1,0($tp) ; tp[j-1] | |
197 | + | |
198 | + li $ovf,0 | |
199 | + addc $hi1,$hi1,$hi0 | |
200 | + addze $ovf,$ovf ; upmost overflow bit | |
201 | + $ST $hi1,$BNSZ($tp) | |
202 | + | |
203 | + li $i,$BNSZ | |
204 | +.align 4 | |
205 | +Louter: | |
206 | + $LDX $m0,$bp,$i ; m0=bp[i] | |
207 | + $LD $aj,0($ap) ; ap[0] | |
208 | + addi $tp,$sp,$FRAME | |
209 | + $LD $tj,$FRAME($sp) ; tp[0] | |
210 | + $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] | |
211 | + $UMULH $hi0,$aj,$m0 | |
212 | + $LD $aj,$BNSZ($ap) ; ap[1] | |
213 | + $LD $nj,0($np) ; np[0] | |
214 | + addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0] | |
215 | + $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] | |
216 | + addze $hi0,$hi0 | |
217 | + $UMULL $m1,$lo0,$n0 ; tp[0]*n0 | |
218 | + $UMULH $ahi,$aj,$m0 | |
219 | + $UMULL $lo1,$nj,$m1 ; np[0]*m1 | |
220 | + $UMULH $hi1,$nj,$m1 | |
221 | + $LD $nj,$BNSZ($np) ; np[1] | |
222 | + addc $lo1,$lo1,$lo0 | |
223 | + $UMULL $nlo,$nj,$m1 ; np[1]*m1 | |
224 | + addze $hi1,$hi1 | |
225 | + $UMULH $nhi,$nj,$m1 | |
226 | + | |
227 | + mtctr $num | |
228 | + li $j,`2*$BNSZ` | |
229 | +.align 4 | |
230 | +Linner: | |
231 | + $LDX $aj,$ap,$j ; ap[j] | |
232 | + addc $lo0,$alo,$hi0 | |
233 | + $LD $tj,$BNSZ($tp) ; tp[j] | |
234 | + addze $hi0,$ahi | |
235 | + $LDX $nj,$np,$j ; np[j] | |
236 | + addc $lo1,$nlo,$hi1 | |
237 | + $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] | |
238 | + addze $hi1,$nhi | |
239 | + $UMULH $ahi,$aj,$m0 | |
240 | + addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] | |
241 | + $UMULL $nlo,$nj,$m1 ; np[j]*m1 | |
242 | + addze $hi0,$hi0 | |
243 | + $UMULH $nhi,$nj,$m1 | |
244 | + addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] | |
245 | + addi $j,$j,$BNSZ ; j++ | |
246 | + addze $hi1,$hi1 | |
247 | + $ST $lo1,0($tp) ; tp[j-1] | |
248 | + addi $tp,$tp,$BNSZ ; tp++ | |
249 | + bdnz- Linner | |
250 | +;Linner | |
251 | + $LD $tj,$BNSZ($tp) ; tp[j] | |
252 | + addc $lo0,$alo,$hi0 | |
253 | + addze $hi0,$ahi | |
254 | + addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] | |
255 | + addze $hi0,$hi0 | |
256 | + | |
257 | + addc $lo1,$nlo,$hi1 | |
258 | + addze $hi1,$nhi | |
259 | + addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] | |
260 | + addze $hi1,$hi1 | |
261 | + $ST $lo1,0($tp) ; tp[j-1] | |
262 | + | |
263 | + addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA] | |
264 | + li $ovf,0 | |
265 | + adde $hi1,$hi1,$hi0 | |
266 | + addze $ovf,$ovf | |
267 | + $ST $hi1,$BNSZ($tp) | |
268 | +; | |
269 | + slwi $tj,$num,`log($BNSZ)/log(2)` | |
270 | + $UCMP $i,$tj | |
271 | + addi $i,$i,$BNSZ | |
272 | + ble- Louter | |
273 | + | |
274 | + addi $num,$num,2 ; restore $num | |
275 | + subfc $j,$j,$j ; j=0 and "clear" XER[CA] | |
276 | + addi $tp,$sp,$FRAME | |
277 | + mtctr $num | |
278 | + | |
279 | +.align 4 | |
280 | +Lsub: $LDX $tj,$tp,$j | |
281 | + $LDX $nj,$np,$j | |
282 | + subfe $aj,$nj,$tj ; tp[j]-np[j] | |
283 | + $STX $aj,$rp,$j | |
284 | + addi $j,$j,$BNSZ | |
285 | + bdnz- Lsub | |
286 | + | |
287 | + li $j,0 | |
288 | + mtctr $num | |
289 | + subfe $ovf,$j,$ovf ; handle upmost overflow bit | |
290 | + and $ap,$tp,$ovf | |
291 | + andc $np,$rp,$ovf | |
292 | + or $ap,$ap,$np ; ap=borrow?tp:rp | |
293 | + | |
294 | +.align 4 | |
295 | +Lcopy: ; copy or in-place refresh | |
296 | + $LDX $tj,$ap,$j | |
297 | + $STX $tj,$rp,$j | |
298 | + $STX $j,$tp,$j ; zap at once | |
299 | + addi $j,$j,$BNSZ | |
300 | + bdnz- Lcopy | |
301 | + | |
302 | + $POP r14,`4*$SIZE_T`($sp) | |
303 | + $POP r15,`5*$SIZE_T`($sp) | |
304 | + $POP r16,`6*$SIZE_T`($sp) | |
305 | + $POP r17,`7*$SIZE_T`($sp) | |
306 | + $POP r18,`8*$SIZE_T`($sp) | |
307 | + $POP r19,`9*$SIZE_T`($sp) | |
308 | + $POP r20,`10*$SIZE_T`($sp) | |
309 | + $POP r21,`11*$SIZE_T`($sp) | |
310 | + $POP r22,`12*$SIZE_T`($sp) | |
311 | + $POP r23,`13*$SIZE_T`($sp) | |
312 | + $POP r24,`14*$SIZE_T`($sp) | |
313 | + $POP r25,`15*$SIZE_T`($sp) | |
314 | + $POP $sp,0($sp) | |
315 | + li r3,1 | |
316 | + blr | |
317 | + .long 0 | |
318 | +.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" | |
319 | +___ | |
320 | + | |
321 | +$code =~ s/\`([^\`]*)\`/eval $1/gem;	# replace each backquoted expression with its evaluated value | |
322 | +print $code; | |
323 | +close STDOUT or die "error closing STDOUT: $! (status $?)";	# closing a pipe fails if the translator failed | |
@@ -0,0 +1,918 @@ | ||
1 | +#!/usr/bin/env perl | |
2 | + | |
3 | +# ==================================================================== | |
4 | +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
5 | +# project. The module is, however, dual licensed under OpenSSL and | |
6 | +# CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | +# details see http://www.openssl.org/~appro/cryptogams/. | |
8 | +# ==================================================================== | |
9 | + | |
10 | +# December 2007 | |
11 | + | |
12 | +# The reason for undertaking this effort is basically the following. Even though | |
13 | +# Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI | |
14 | +# performance was observed to be less than impressive, essentially as | |
15 | +# fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope. | |
16 | +# Well, it's not surprising that IBM had to make some sacrifices to | |
17 | +# boost the clock frequency that much, but no overall improvement? | |
18 | +# Having observed how much difference did switching to FPU make on | |
19 | +# UltraSPARC, playing same stunt on Power 6 appeared appropriate... | |
20 | +# Unfortunately the resulting performance improvement is not as | |
21 | +# impressive, ~30%, and in absolute terms is still very far from what | |
22 | +# one would expect from 4.7GHz CPU. There is a chance that I'm doing | |
23 | +# something wrong, but in the lack of assembler level micro-profiling | |
24 | +# data or at least decent platform guide I can't tell... Or better | |
25 | +# results might be achieved with VMX... Anyway, this module provides | |
26 | +# *worse* performance on other PowerPC implementations, ~40-15% slower | |
27 | +# on PPC970 depending on key length and ~40% slower on Power 5 for all | |
28 | +# key lengths. As it's obviously inappropriate as "best all-round" | |
29 | +# alternative, it has to be complemented with run-time CPU family | |
30 | +# detection. Oh! It should also be noted that, unlike on other PowerPC | |
31 | +# implementations, the IALU ppc-mont.pl module performs *suboptimally* on | |
32 | +# >=1024-bit key lengths on Power 6. It should also be noted that | |
33 | +# *everything* said so far applies to 64-bit builds! As far as 32-bit | |
34 | +# application executed on 64-bit CPU goes, this module is likely to | |
35 | +# become preferred choice, because it's easy to adapt it for such | |
36 | +# case and *is* faster than 32-bit ppc-mont.pl on *all* processors. | |
37 | + | |
38 | +# February 2008 | |
39 | + | |
40 | +# Micro-profiling assisted optimization results in ~15% improvement | |
41 | +# over original ppc64-mont.pl version, or overall ~50% improvement | |
42 | +# over ppc.pl module on Power 6. If compared to ppc-mont.pl on same | |
43 | +# Power 6 CPU, this module is 5-150% faster depending on key length, | |
44 | +# [hereafter] more for longer keys. But if compared to ppc-mont.pl | |
45 | +# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive | |
46 | +# in absolute terms, but it's apparently the way Power 6 is... | |
47 | + | |
48 | +$flavour = shift; | |
49 | + | |
50 | +if ($flavour =~ /32/) { | |
51 | + # 32-bit flavour: the parameters are filled in, but generation is | |
52 | + # refused by the die at the end of this branch. | |
53 | + ($SIZE_T,$RZONE) = (4,224); | |
54 | + $FRAME= $SIZE_T*12+8*12; | |
55 | + $fname= "bn_mul_mont_ppc64"; | |
56 | + # store indexed and update, then register save/restore mnemonics | |
57 | + ($STUX,$PUSH,$POP) = ("stwux","stw","lwz"); | |
58 | + | |
59 | + die "not implemented yet"; | |
60 | +} elsif ($flavour =~ /64/) { | |
61 | + # same as above, but 64-bit mnemonics... | |
62 | + ($SIZE_T,$RZONE) = (8,288); | |
63 | + # twelve $SIZE_T-sized slots plus twelve 8-byte slots | |
64 | + $FRAME= $SIZE_T*12+8*12; | |
65 | + $fname= "bn_mul_mont"; | |
66 | + | |
67 | + # store indexed and update, then register save/restore mnemonics | |
68 | + ($STUX,$PUSH,$POP) = ("stdux","std","ld"); | |
69 | + | |
70 | +} else { die "nonsense $flavour"; } | |
71 | + | |
72 | +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# directory part of this script's path | |
73 | +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or	# translator beside this script, ... | |
74 | +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or	# ... or in ../../perlasm | |
75 | +die "can't locate ppc-xlate.pl"; | |
76 | +# Pipe all output through the translator; "or" makes die guard open() itself. | |
77 | +open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!"; | |
78 | + | |
79 | +$FRAME += 63; $FRAME &= ~63;	# round up to a multiple of 64 bytes | |
80 | +$TRANSFER = 8*16;	# sixteen 8-byte gpr<->fpr transfer slots | |
81 | + | |
82 | +# General-purpose register names interpolated into the assembly text. | |
83 | +# r3 arrives holding rp; the code moves it to r9 on entry and then uses | |
84 | +# r3 for the topmost overflow bit and for the return value, so $rp | |
85 | +# names r9. r4..r8 are used as the incoming ap, bp, np, n0 and num. | |
86 | + | |
87 | +$carry = "r0"; | |
88 | +$sp = "r1"; | |
89 | +$toc = "r2"; | |
90 | +$ovf = "r3"; | |
91 | +($ap,$bp,$np,$n0,$num) = map("r$_", 4..8); | |
92 | +$rp = "r9"; | |
93 | +# tp pointer, then the inner (j) and outer (i) loop counters | |
94 | +($tp,$j,$i) = map("r$_", 10..12); | |
95 | + | |
96 | +# non-volatile registers | |
97 | +$nap_d = "r14";	# interleaved ap and np in double format | |
98 | +$a0 = "r15";	# ap[0] | |
99 | + | |
100 | +# temporary registers; the code works on them in two groups of four, | |
101 | +# each group holding the 16-bit pieces of one 64-bit word | |
102 | +($t0,$t1,$t2,$t3) = map("r$_", 16..19); | |
103 | +($t4,$t5) = ("r20","r21"); | |
104 | +$t6 = "r22"; | |
105 | +$t7 = "r23"; | |
106 | + | |
107 | +# PPC offers enough register bank capacity to unroll inner loops twice | |
108 | +# | |
109 | +# ..A3A2A1A0 | |
110 | +# dcba | |
111 | +# ----------- | |
112 | +# A0a | |
113 | +# A0b | |
114 | +# A0c | |
115 | +# A0d | |
116 | +# A1a | |
117 | +# A1b | |
118 | +# A1c | |
119 | +# A1d | |
120 | +# A2a | |
121 | +# A2b | |
122 | +# A2c | |
123 | +# A2d | |
124 | +# A3a | |
125 | +# A3b | |
126 | +# A3c | |
127 | +# A3d | |
128 | +# ..a | |
129 | +# ..b | |
130 | +# | |
131 | +($ba,$bb,$bc,$bd) = map("f$_", 0..3);	# bp[i] as 4x16-bit values | |
132 | +($na,$nb,$nc,$nd) = map("f$_", 4..7);	# the n0 multiple as 4x16-bit values | |
133 | +($dota,$dotb) = ("f8","f9");	# products carried into the next pass | |
134 | +($A0,$A1,$A2,$A3) = map("f$_", 10..13);	# a[j], a[j+1] as 32-bit word pairs | |
135 | +($N0,$N1,$N2,$N3) = map("f$_", 14..17);	# n[j], n[j+1] as 32-bit word pairs | |
136 | +($T0a,$T0b) = ("f18","f19");	# $T0..$T3: accumulators for the partial sums | |
137 | +($T1a,$T1b) = ("f20","f21"); | |
138 | +($T2a,$T2b) = ("f22","f23"); | |
139 | +($T3a,$T3b) = ("f24","f25"); | |
140 | + | |
141 | +# sp----------->+-------------------------------+ | |
142 | +# | saved sp | | |
143 | +# +-------------------------------+ | |
144 | +# | | | |
145 | +# +-------------------------------+ | |
146 | +# | 10 saved gpr, r14-r23 | | |
147 | +# . . | |
148 | +# . . | |
149 | +# +12*size_t +-------------------------------+ | |
150 | +# | 12 saved fpr, f14-f25 | | |
151 | +# . . | |
152 | +# . . | |
153 | +# +12*8 +-------------------------------+ | |
154 | +# | padding to 64 byte boundary | | |
155 | +# . . | |
156 | +# +X +-------------------------------+ | |
157 | +# | 16 gpr<->fpr transfer zone | | |
158 | +# . . | |
159 | +# . . | |
160 | +# +16*8 +-------------------------------+ | |
161 | +# | __int64 tmp[-1] | | |
162 | +# +-------------------------------+ | |
163 | +# | __int64 tmp[num] | | |
164 | +# . . | |
165 | +# . . | |
166 | +# . . | |
167 | +# +(num+1)*8 +-------------------------------+ | |
168 | +# | padding to 64 byte boundary | | |
169 | +# . . | |
170 | +# +X +-------------------------------+ | |
171 | +# | double nap_d[4*num] | | |
172 | +# . . | |
173 | +# . . | |
174 | +# . . | |
175 | +# +-------------------------------+ | |
176 | + | |
177 | +$code=<<___; | |
178 | +.machine "any" | |
179 | +.text | |
180 | + | |
181 | +.globl .$fname | |
182 | +.align 5 | |
183 | +.$fname: | |
184 | + cmpwi $num,4 | |
185 | + mr $rp,r3 ; $rp is reassigned | |
186 | + li r3,0 ; possible "not handled" return code | |
187 | + bltlr- | |
188 | + andi. r0,$num,1 ; $num has to be even | |
189 | + bnelr- | |
190 | + | |
191 | + slwi $num,$num,3 ; num*=8 | |
192 | + li $i,-4096 | |
193 | + slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num | |
194 | + add $tp,$tp,$num ; place for tp[num+1] | |
195 | + addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE` | |
196 | + subf $tp,$tp,$sp ; $sp-$tp | |
197 | + and $tp,$tp,$i ; minimize TLB usage | |
198 | + subf $tp,$sp,$tp ; $tp-$sp | |
199 | + $STUX $sp,$sp,$tp ; alloca | |
200 | + | |
201 | + $PUSH r14,`2*$SIZE_T`($sp) | |
202 | + $PUSH r15,`3*$SIZE_T`($sp) | |
203 | + $PUSH r16,`4*$SIZE_T`($sp) | |
204 | + $PUSH r17,`5*$SIZE_T`($sp) | |
205 | + $PUSH r18,`6*$SIZE_T`($sp) | |
206 | + $PUSH r19,`7*$SIZE_T`($sp) | |
207 | + $PUSH r20,`8*$SIZE_T`($sp) | |
208 | + $PUSH r21,`9*$SIZE_T`($sp) | |
209 | + $PUSH r22,`10*$SIZE_T`($sp) | |
210 | + $PUSH r23,`11*$SIZE_T`($sp) | |
211 | + stfd f14,`12*$SIZE_T+0`($sp) | |
212 | + stfd f15,`12*$SIZE_T+8`($sp) | |
213 | + stfd f16,`12*$SIZE_T+16`($sp) | |
214 | + stfd f17,`12*$SIZE_T+24`($sp) | |
215 | + stfd f18,`12*$SIZE_T+32`($sp) | |
216 | + stfd f19,`12*$SIZE_T+40`($sp) | |
217 | + stfd f20,`12*$SIZE_T+48`($sp) | |
218 | + stfd f21,`12*$SIZE_T+56`($sp) | |
219 | + stfd f22,`12*$SIZE_T+64`($sp) | |
220 | + stfd f23,`12*$SIZE_T+72`($sp) | |
221 | + stfd f24,`12*$SIZE_T+80`($sp) | |
222 | + stfd f25,`12*$SIZE_T+88`($sp) | |
223 | + | |
224 | + ld $a0,0($ap) ; pull ap[0] value | |
225 | + ld $n0,0($n0) ; pull n0[0] value | |
226 | + ld $t3,0($bp) ; bp[0] | |
227 | + | |
228 | + addi $tp,$sp,`$FRAME+$TRANSFER+8+64` | |
229 | + li $i,-64 | |
230 | + add $nap_d,$tp,$num | |
231 | + and $nap_d,$nap_d,$i ; align to 64 bytes | |
232 | + | |
233 | + mulld $t7,$a0,$t3 ; ap[0]*bp[0] | |
234 | + ; nap_d is off by 1, because it's used with stfdu/lfdu | |
235 | + addi $nap_d,$nap_d,-8 | |
236 | + srwi $j,$num,`3+1` ; counter register, num/2 | |
237 | + mulld $t7,$t7,$n0 ; tp[0]*n0 | |
238 | + addi $j,$j,-1 | |
239 | + addi $tp,$sp,`$FRAME+$TRANSFER-8` | |
240 | + li $carry,0 | |
241 | + mtctr $j | |
242 | + | |
243 | + ; transfer bp[0] to FPU as 4x16-bit values | |
244 | + extrdi $t0,$t3,16,48 | |
245 | + extrdi $t1,$t3,16,32 | |
246 | + extrdi $t2,$t3,16,16 | |
247 | + extrdi $t3,$t3,16,0 | |
248 | + std $t0,`$FRAME+0`($sp) | |
249 | + std $t1,`$FRAME+8`($sp) | |
250 | + std $t2,`$FRAME+16`($sp) | |
251 | + std $t3,`$FRAME+24`($sp) | |
252 | + ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values | |
253 | + extrdi $t4,$t7,16,48 | |
254 | + extrdi $t5,$t7,16,32 | |
255 | + extrdi $t6,$t7,16,16 | |
256 | + extrdi $t7,$t7,16,0 | |
257 | + std $t4,`$FRAME+32`($sp) | |
258 | + std $t5,`$FRAME+40`($sp) | |
259 | + std $t6,`$FRAME+48`($sp) | |
260 | + std $t7,`$FRAME+56`($sp) | |
261 | + lwz $t0,4($ap) ; load a[j] as 32-bit word pair | |
262 | + lwz $t1,0($ap) | |
263 | + lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair | |
264 | + lwz $t3,8($ap) | |
265 | + lwz $t4,4($np) ; load n[j] as 32-bit word pair | |
266 | + lwz $t5,0($np) | |
267 | + lwz $t6,12($np) ; load n[j+1] as 32-bit word pair | |
268 | + lwz $t7,8($np) | |
269 | + lfd $ba,`$FRAME+0`($sp) | |
270 | + lfd $bb,`$FRAME+8`($sp) | |
271 | + lfd $bc,`$FRAME+16`($sp) | |
272 | + lfd $bd,`$FRAME+24`($sp) | |
273 | + lfd $na,`$FRAME+32`($sp) | |
274 | + lfd $nb,`$FRAME+40`($sp) | |
275 | + lfd $nc,`$FRAME+48`($sp) | |
276 | + lfd $nd,`$FRAME+56`($sp) | |
277 | + std $t0,`$FRAME+64`($sp) | |
278 | + std $t1,`$FRAME+72`($sp) | |
279 | + std $t2,`$FRAME+80`($sp) | |
280 | + std $t3,`$FRAME+88`($sp) | |
281 | + std $t4,`$FRAME+96`($sp) | |
282 | + std $t5,`$FRAME+104`($sp) | |
283 | + std $t6,`$FRAME+112`($sp) | |
284 | + std $t7,`$FRAME+120`($sp) | |
285 | + fcfid $ba,$ba | |
286 | + fcfid $bb,$bb | |
287 | + fcfid $bc,$bc | |
288 | + fcfid $bd,$bd | |
289 | + fcfid $na,$na | |
290 | + fcfid $nb,$nb | |
291 | + fcfid $nc,$nc | |
292 | + fcfid $nd,$nd | |
293 | + | |
294 | + lfd $A0,`$FRAME+64`($sp) | |
295 | + lfd $A1,`$FRAME+72`($sp) | |
296 | + lfd $A2,`$FRAME+80`($sp) | |
297 | + lfd $A3,`$FRAME+88`($sp) | |
298 | + lfd $N0,`$FRAME+96`($sp) | |
299 | + lfd $N1,`$FRAME+104`($sp) | |
300 | + lfd $N2,`$FRAME+112`($sp) | |
301 | + lfd $N3,`$FRAME+120`($sp) | |
302 | + fcfid $A0,$A0 | |
303 | + fcfid $A1,$A1 | |
304 | + fcfid $A2,$A2 | |
305 | + fcfid $A3,$A3 | |
306 | + fcfid $N0,$N0 | |
307 | + fcfid $N1,$N1 | |
308 | + fcfid $N2,$N2 | |
309 | + fcfid $N3,$N3 | |
310 | + addi $ap,$ap,16 | |
311 | + addi $np,$np,16 | |
312 | + | |
313 | + fmul $T1a,$A1,$ba | |
314 | + fmul $T1b,$A1,$bb | |
315 | + stfd $A0,8($nap_d) ; save a[j] in double format | |
316 | + stfd $A1,16($nap_d) | |
317 | + fmul $T2a,$A2,$ba | |
318 | + fmul $T2b,$A2,$bb | |
319 | + stfd $A2,24($nap_d) ; save a[j+1] in double format | |
320 | + stfd $A3,32($nap_d) | |
321 | + fmul $T3a,$A3,$ba | |
322 | + fmul $T3b,$A3,$bb | |
323 | + stfd $N0,40($nap_d) ; save n[j] in double format | |
324 | + stfd $N1,48($nap_d) | |
325 | + fmul $T0a,$A0,$ba | |
326 | + fmul $T0b,$A0,$bb | |
327 | + stfd $N2,56($nap_d) ; save n[j+1] in double format | |
328 | + stfdu $N3,64($nap_d) | |
329 | + | |
330 | + fmadd $T1a,$A0,$bc,$T1a | |
331 | + fmadd $T1b,$A0,$bd,$T1b | |
332 | + fmadd $T2a,$A1,$bc,$T2a | |
333 | + fmadd $T2b,$A1,$bd,$T2b | |
334 | + fmadd $T3a,$A2,$bc,$T3a | |
335 | + fmadd $T3b,$A2,$bd,$T3b | |
336 | + fmul $dota,$A3,$bc | |
337 | + fmul $dotb,$A3,$bd | |
338 | + | |
339 | + fmadd $T1a,$N1,$na,$T1a | |
340 | + fmadd $T1b,$N1,$nb,$T1b | |
341 | + fmadd $T2a,$N2,$na,$T2a | |
342 | + fmadd $T2b,$N2,$nb,$T2b | |
343 | + fmadd $T3a,$N3,$na,$T3a | |
344 | + fmadd $T3b,$N3,$nb,$T3b | |
345 | + fmadd $T0a,$N0,$na,$T0a | |
346 | + fmadd $T0b,$N0,$nb,$T0b | |
347 | + | |
348 | + fmadd $T1a,$N0,$nc,$T1a | |
349 | + fmadd $T1b,$N0,$nd,$T1b | |
350 | + fmadd $T2a,$N1,$nc,$T2a | |
351 | + fmadd $T2b,$N1,$nd,$T2b | |
352 | + fmadd $T3a,$N2,$nc,$T3a | |
353 | + fmadd $T3b,$N2,$nd,$T3b | |
354 | + fmadd $dota,$N3,$nc,$dota | |
355 | + fmadd $dotb,$N3,$nd,$dotb | |
356 | + | |
357 | + fctid $T0a,$T0a | |
358 | + fctid $T0b,$T0b | |
359 | + fctid $T1a,$T1a | |
360 | + fctid $T1b,$T1b | |
361 | + fctid $T2a,$T2a | |
362 | + fctid $T2b,$T2b | |
363 | + fctid $T3a,$T3a | |
364 | + fctid $T3b,$T3b | |
365 | + | |
366 | + stfd $T0a,`$FRAME+0`($sp) | |
367 | + stfd $T0b,`$FRAME+8`($sp) | |
368 | + stfd $T1a,`$FRAME+16`($sp) | |
369 | + stfd $T1b,`$FRAME+24`($sp) | |
370 | + stfd $T2a,`$FRAME+32`($sp) | |
371 | + stfd $T2b,`$FRAME+40`($sp) | |
372 | + stfd $T3a,`$FRAME+48`($sp) | |
373 | + stfd $T3b,`$FRAME+56`($sp) | |
374 | + | |
375 | +.align 5 | |
376 | +L1st: | |
377 | + lwz $t0,4($ap) ; load a[j] as 32-bit word pair | |
378 | + lwz $t1,0($ap) | |
379 | + lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair | |
380 | + lwz $t3,8($ap) | |
381 | + lwz $t4,4($np) ; load n[j] as 32-bit word pair | |
382 | + lwz $t5,0($np) | |
383 | + lwz $t6,12($np) ; load n[j+1] as 32-bit word pair | |
384 | + lwz $t7,8($np) | |
385 | + std $t0,`$FRAME+64`($sp) | |
386 | + std $t1,`$FRAME+72`($sp) | |
387 | + std $t2,`$FRAME+80`($sp) | |
388 | + std $t3,`$FRAME+88`($sp) | |
389 | + std $t4,`$FRAME+96`($sp) | |
390 | + std $t5,`$FRAME+104`($sp) | |
391 | + std $t6,`$FRAME+112`($sp) | |
392 | + std $t7,`$FRAME+120`($sp) | |
393 | + ld $t0,`$FRAME+0`($sp) | |
394 | + ld $t1,`$FRAME+8`($sp) | |
395 | + ld $t2,`$FRAME+16`($sp) | |
396 | + ld $t3,`$FRAME+24`($sp) | |
397 | + ld $t4,`$FRAME+32`($sp) | |
398 | + ld $t5,`$FRAME+40`($sp) | |
399 | + ld $t6,`$FRAME+48`($sp) | |
400 | + ld $t7,`$FRAME+56`($sp) | |
401 | + lfd $A0,`$FRAME+64`($sp) | |
402 | + lfd $A1,`$FRAME+72`($sp) | |
403 | + lfd $A2,`$FRAME+80`($sp) | |
404 | + lfd $A3,`$FRAME+88`($sp) | |
405 | + lfd $N0,`$FRAME+96`($sp) | |
406 | + lfd $N1,`$FRAME+104`($sp) | |
407 | + lfd $N2,`$FRAME+112`($sp) | |
408 | + lfd $N3,`$FRAME+120`($sp) | |
409 | + fcfid $A0,$A0 | |
410 | + fcfid $A1,$A1 | |
411 | + fcfid $A2,$A2 | |
412 | + fcfid $A3,$A3 | |
413 | + fcfid $N0,$N0 | |
414 | + fcfid $N1,$N1 | |
415 | + fcfid $N2,$N2 | |
416 | + fcfid $N3,$N3 | |
417 | + addi $ap,$ap,16 | |
418 | + addi $np,$np,16 | |
419 | + | |
420 | + fmul $T1a,$A1,$ba | |
421 | + fmul $T1b,$A1,$bb | |
422 | + fmul $T2a,$A2,$ba | |
423 | + fmul $T2b,$A2,$bb | |
424 | + stfd $A0,8($nap_d) ; save a[j] in double format | |
425 | + stfd $A1,16($nap_d) | |
426 | + fmul $T3a,$A3,$ba | |
427 | + fmul $T3b,$A3,$bb | |
428 | + fmadd $T0a,$A0,$ba,$dota | |
429 | + fmadd $T0b,$A0,$bb,$dotb | |
430 | + stfd $A2,24($nap_d) ; save a[j+1] in double format | |
431 | + stfd $A3,32($nap_d) | |
432 | + | |
433 | + fmadd $T1a,$A0,$bc,$T1a | |
434 | + fmadd $T1b,$A0,$bd,$T1b | |
435 | + fmadd $T2a,$A1,$bc,$T2a | |
436 | + fmadd $T2b,$A1,$bd,$T2b | |
437 | + stfd $N0,40($nap_d) ; save n[j] in double format | |
438 | + stfd $N1,48($nap_d) | |
439 | + fmadd $T3a,$A2,$bc,$T3a | |
440 | + fmadd $T3b,$A2,$bd,$T3b | |
441 | + add $t0,$t0,$carry ; can not overflow | |
442 | + fmul $dota,$A3,$bc | |
443 | + fmul $dotb,$A3,$bd | |
444 | + stfd $N2,56($nap_d) ; save n[j+1] in double format | |
445 | + stfdu $N3,64($nap_d) | |
446 | + srdi $carry,$t0,16 | |
447 | + add $t1,$t1,$carry | |
448 | + srdi $carry,$t1,16 | |
449 | + | |
450 | + fmadd $T1a,$N1,$na,$T1a | |
451 | + fmadd $T1b,$N1,$nb,$T1b | |
452 | + insrdi $t0,$t1,16,32 | |
453 | + fmadd $T2a,$N2,$na,$T2a | |
454 | + fmadd $T2b,$N2,$nb,$T2b | |
455 | + add $t2,$t2,$carry | |
456 | + fmadd $T3a,$N3,$na,$T3a | |
457 | + fmadd $T3b,$N3,$nb,$T3b | |
458 | + srdi $carry,$t2,16 | |
459 | + fmadd $T0a,$N0,$na,$T0a | |
460 | + fmadd $T0b,$N0,$nb,$T0b | |
461 | + insrdi $t0,$t2,16,16 | |
462 | + add $t3,$t3,$carry | |
463 | + srdi $carry,$t3,16 | |
464 | + | |
465 | + fmadd $T1a,$N0,$nc,$T1a | |
466 | + fmadd $T1b,$N0,$nd,$T1b | |
467 | + insrdi $t0,$t3,16,0 ; 0..63 bits | |
468 | + fmadd $T2a,$N1,$nc,$T2a | |
469 | + fmadd $T2b,$N1,$nd,$T2b | |
470 | + add $t4,$t4,$carry | |
471 | + fmadd $T3a,$N2,$nc,$T3a | |
472 | + fmadd $T3b,$N2,$nd,$T3b | |
473 | + srdi $carry,$t4,16 | |
474 | + fmadd $dota,$N3,$nc,$dota | |
475 | + fmadd $dotb,$N3,$nd,$dotb | |
476 | + add $t5,$t5,$carry | |
477 | + srdi $carry,$t5,16 | |
478 | + insrdi $t4,$t5,16,32 | |
479 | + | |
480 | + fctid $T0a,$T0a | |
481 | + fctid $T0b,$T0b | |
482 | + add $t6,$t6,$carry | |
483 | + fctid $T1a,$T1a | |
484 | + fctid $T1b,$T1b | |
485 | + srdi $carry,$t6,16 | |
486 | + fctid $T2a,$T2a | |
487 | + fctid $T2b,$T2b | |
488 | + insrdi $t4,$t6,16,16 | |
489 | + fctid $T3a,$T3a | |
490 | + fctid $T3b,$T3b | |
491 | + add $t7,$t7,$carry | |
492 | + insrdi $t4,$t7,16,0 ; 64..127 bits | |
493 | + srdi $carry,$t7,16 ; upper 33 bits | |
494 | + | |
495 | + stfd $T0a,`$FRAME+0`($sp) | |
496 | + stfd $T0b,`$FRAME+8`($sp) | |
497 | + stfd $T1a,`$FRAME+16`($sp) | |
498 | + stfd $T1b,`$FRAME+24`($sp) | |
499 | + stfd $T2a,`$FRAME+32`($sp) | |
500 | + stfd $T2b,`$FRAME+40`($sp) | |
501 | + stfd $T3a,`$FRAME+48`($sp) | |
502 | + stfd $T3b,`$FRAME+56`($sp) | |
503 | + std $t0,8($tp) ; tp[j-1] | |
504 | + stdu $t4,16($tp) ; tp[j] | |
505 | + bdnz- L1st | |
506 | + | |
507 | + fctid $dota,$dota | |
508 | + fctid $dotb,$dotb | |
509 | + | |
510 | + ld $t0,`$FRAME+0`($sp) | |
511 | + ld $t1,`$FRAME+8`($sp) | |
512 | + ld $t2,`$FRAME+16`($sp) | |
513 | + ld $t3,`$FRAME+24`($sp) | |
514 | + ld $t4,`$FRAME+32`($sp) | |
515 | + ld $t5,`$FRAME+40`($sp) | |
516 | + ld $t6,`$FRAME+48`($sp) | |
517 | + ld $t7,`$FRAME+56`($sp) | |
518 | + stfd $dota,`$FRAME+64`($sp) | |
519 | + stfd $dotb,`$FRAME+72`($sp) | |
520 | + | |
521 | + add $t0,$t0,$carry ; can not overflow | |
522 | + srdi $carry,$t0,16 | |
523 | + add $t1,$t1,$carry | |
524 | + srdi $carry,$t1,16 | |
525 | + insrdi $t0,$t1,16,32 | |
526 | + add $t2,$t2,$carry | |
527 | + srdi $carry,$t2,16 | |
528 | + insrdi $t0,$t2,16,16 | |
529 | + add $t3,$t3,$carry | |
530 | + srdi $carry,$t3,16 | |
531 | + insrdi $t0,$t3,16,0 ; 0..63 bits | |
532 | + add $t4,$t4,$carry | |
533 | + srdi $carry,$t4,16 | |
534 | + add $t5,$t5,$carry | |
535 | + srdi $carry,$t5,16 | |
536 | + insrdi $t4,$t5,16,32 | |
537 | + add $t6,$t6,$carry | |
538 | + srdi $carry,$t6,16 | |
539 | + insrdi $t4,$t6,16,16 | |
540 | + add $t7,$t7,$carry | |
541 | + insrdi $t4,$t7,16,0 ; 64..127 bits | |
542 | + srdi $carry,$t7,16 ; upper 33 bits | |
543 | + ld $t6,`$FRAME+64`($sp) | |
544 | + ld $t7,`$FRAME+72`($sp) | |
545 | + | |
546 | + std $t0,8($tp) ; tp[j-1] | |
547 | + stdu $t4,16($tp) ; tp[j] | |
548 | + | |
549 | + add $t6,$t6,$carry ; can not overflow | |
550 | + srdi $carry,$t6,16 | |
551 | + add $t7,$t7,$carry | |
552 | + insrdi $t6,$t7,48,0 | |
553 | + srdi $ovf,$t7,48 | |
554 | + std $t6,8($tp) ; tp[num-1] | |
555 | + | |
556 | + slwi $t7,$num,2 | |
557 | + subf $nap_d,$t7,$nap_d ; rewind pointer | |
558 | + | |
559 | + li $i,8 ; i=1 | |
560 | +.align 5 | |
561 | +Louter: | |
562 | + ldx $t3,$bp,$i ; bp[i] | |
563 | + ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] | |
564 | + mulld $t7,$a0,$t3 ; ap[0]*bp[i] | |
565 | + | |
566 | + addi $tp,$sp,`$FRAME+$TRANSFER` | |
567 | + add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0] | |
568 | + li $carry,0 | |
569 | + mulld $t7,$t7,$n0 ; tp[0]*n0 | |
570 | + mtctr $j | |
571 | + | |
572 | + ; transfer bp[i] to FPU as 4x16-bit values | |
573 | + extrdi $t0,$t3,16,48 | |
574 | + extrdi $t1,$t3,16,32 | |
575 | + extrdi $t2,$t3,16,16 | |
576 | + extrdi $t3,$t3,16,0 | |
577 | + std $t0,`$FRAME+0`($sp) | |
578 | + std $t1,`$FRAME+8`($sp) | |
579 | + std $t2,`$FRAME+16`($sp) | |
580 | + std $t3,`$FRAME+24`($sp) | |
581 | + ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values | |
582 | + extrdi $t4,$t7,16,48 | |
583 | + extrdi $t5,$t7,16,32 | |
584 | + extrdi $t6,$t7,16,16 | |
585 | + extrdi $t7,$t7,16,0 | |
586 | + std $t4,`$FRAME+32`($sp) | |
587 | + std $t5,`$FRAME+40`($sp) | |
588 | + std $t6,`$FRAME+48`($sp) | |
589 | + std $t7,`$FRAME+56`($sp) | |
590 | + | |
591 | + lfd $A0,8($nap_d) ; load a[j] in double format | |
592 | + lfd $A1,16($nap_d) | |
593 | + lfd $A2,24($nap_d) ; load a[j+1] in double format | |
594 | + lfd $A3,32($nap_d) | |
595 | + lfd $N0,40($nap_d) ; load n[j] in double format | |
596 | + lfd $N1,48($nap_d) | |
597 | + lfd $N2,56($nap_d) ; load n[j+1] in double format | |
598 | + lfdu $N3,64($nap_d) | |
599 | + | |
600 | + lfd $ba,`$FRAME+0`($sp) | |
601 | + lfd $bb,`$FRAME+8`($sp) | |
602 | + lfd $bc,`$FRAME+16`($sp) | |
603 | + lfd $bd,`$FRAME+24`($sp) | |
604 | + lfd $na,`$FRAME+32`($sp) | |
605 | + lfd $nb,`$FRAME+40`($sp) | |
606 | + lfd $nc,`$FRAME+48`($sp) | |
607 | + lfd $nd,`$FRAME+56`($sp) | |
608 | + | |
609 | + fcfid $ba,$ba | |
610 | + fcfid $bb,$bb | |
611 | + fcfid $bc,$bc | |
612 | + fcfid $bd,$bd | |
613 | + fcfid $na,$na | |
614 | + fcfid $nb,$nb | |
615 | + fcfid $nc,$nc | |
616 | + fcfid $nd,$nd | |
617 | + | |
618 | + fmul $T1a,$A1,$ba | |
619 | + fmul $T1b,$A1,$bb | |
620 | + fmul $T2a,$A2,$ba | |
621 | + fmul $T2b,$A2,$bb | |
622 | + fmul $T3a,$A3,$ba | |
623 | + fmul $T3b,$A3,$bb | |
624 | + fmul $T0a,$A0,$ba | |
625 | + fmul $T0b,$A0,$bb | |
626 | + | |
627 | + fmadd $T1a,$A0,$bc,$T1a | |
628 | + fmadd $T1b,$A0,$bd,$T1b | |
629 | + fmadd $T2a,$A1,$bc,$T2a | |
630 | + fmadd $T2b,$A1,$bd,$T2b | |
631 | + fmadd $T3a,$A2,$bc,$T3a | |
632 | + fmadd $T3b,$A2,$bd,$T3b | |
633 | + fmul $dota,$A3,$bc | |
634 | + fmul $dotb,$A3,$bd | |
635 | + | |
636 | + fmadd $T1a,$N1,$na,$T1a | |
637 | + fmadd $T1b,$N1,$nb,$T1b | |
638 | + lfd $A0,8($nap_d) ; load a[j] in double format | |
639 | + lfd $A1,16($nap_d) | |
640 | + fmadd $T2a,$N2,$na,$T2a | |
641 | + fmadd $T2b,$N2,$nb,$T2b | |
642 | + lfd $A2,24($nap_d) ; load a[j+1] in double format | |
643 | + lfd $A3,32($nap_d) | |
644 | + fmadd $T3a,$N3,$na,$T3a | |
645 | + fmadd $T3b,$N3,$nb,$T3b | |
646 | + fmadd $T0a,$N0,$na,$T0a | |
647 | + fmadd $T0b,$N0,$nb,$T0b | |
648 | + | |
649 | + fmadd $T1a,$N0,$nc,$T1a | |
650 | + fmadd $T1b,$N0,$nd,$T1b | |
651 | + fmadd $T2a,$N1,$nc,$T2a | |
652 | + fmadd $T2b,$N1,$nd,$T2b | |
653 | + fmadd $T3a,$N2,$nc,$T3a | |
654 | + fmadd $T3b,$N2,$nd,$T3b | |
655 | + fmadd $dota,$N3,$nc,$dota | |
656 | + fmadd $dotb,$N3,$nd,$dotb | |
657 | + | |
658 | + fctid $T0a,$T0a | |
659 | + fctid $T0b,$T0b | |
660 | + fctid $T1a,$T1a | |
661 | + fctid $T1b,$T1b | |
662 | + fctid $T2a,$T2a | |
663 | + fctid $T2b,$T2b | |
664 | + fctid $T3a,$T3a | |
665 | + fctid $T3b,$T3b | |
666 | + | |
667 | + stfd $T0a,`$FRAME+0`($sp) | |
668 | + stfd $T0b,`$FRAME+8`($sp) | |
669 | + stfd $T1a,`$FRAME+16`($sp) | |
670 | + stfd $T1b,`$FRAME+24`($sp) | |
671 | + stfd $T2a,`$FRAME+32`($sp) | |
672 | + stfd $T2b,`$FRAME+40`($sp) | |
673 | + stfd $T3a,`$FRAME+48`($sp) | |
674 | + stfd $T3b,`$FRAME+56`($sp) | |
675 | + | |
676 | +.align 5 | |
677 | +Linner: | |
678 | + fmul $T1a,$A1,$ba | |
679 | + fmul $T1b,$A1,$bb | |
680 | + fmul $T2a,$A2,$ba | |
681 | + fmul $T2b,$A2,$bb | |
682 | + lfd $N0,40($nap_d) ; load n[j] in double format | |
683 | + lfd $N1,48($nap_d) | |
684 | + fmul $T3a,$A3,$ba | |
685 | + fmul $T3b,$A3,$bb | |
686 | + fmadd $T0a,$A0,$ba,$dota | |
687 | + fmadd $T0b,$A0,$bb,$dotb | |
688 | + lfd $N2,56($nap_d) ; load n[j+1] in double format | |
689 | + lfdu $N3,64($nap_d) | |
690 | + | |
691 | + fmadd $T1a,$A0,$bc,$T1a | |
692 | + fmadd $T1b,$A0,$bd,$T1b | |
693 | + fmadd $T2a,$A1,$bc,$T2a | |
694 | + fmadd $T2b,$A1,$bd,$T2b | |
695 | + lfd $A0,8($nap_d) ; load a[j] in double format | |
696 | + lfd $A1,16($nap_d) | |
697 | + fmadd $T3a,$A2,$bc,$T3a | |
698 | + fmadd $T3b,$A2,$bd,$T3b | |
699 | + fmul $dota,$A3,$bc | |
700 | + fmul $dotb,$A3,$bd | |
701 | + lfd $A2,24($nap_d) ; load a[j+1] in double format | |
702 | + lfd $A3,32($nap_d) | |
703 | + | |
704 | + fmadd $T1a,$N1,$na,$T1a | |
705 | + fmadd $T1b,$N1,$nb,$T1b | |
706 | + ld $t0,`$FRAME+0`($sp) | |
707 | + ld $t1,`$FRAME+8`($sp) | |
708 | + fmadd $T2a,$N2,$na,$T2a | |
709 | + fmadd $T2b,$N2,$nb,$T2b | |
710 | + ld $t2,`$FRAME+16`($sp) | |
711 | + ld $t3,`$FRAME+24`($sp) | |
712 | + fmadd $T3a,$N3,$na,$T3a | |
713 | + fmadd $T3b,$N3,$nb,$T3b | |
714 | + add $t0,$t0,$carry ; can not overflow | |
715 | + ld $t4,`$FRAME+32`($sp) | |
716 | + ld $t5,`$FRAME+40`($sp) | |
717 | + fmadd $T0a,$N0,$na,$T0a | |
718 | + fmadd $T0b,$N0,$nb,$T0b | |
719 | + srdi $carry,$t0,16 | |
720 | + add $t1,$t1,$carry | |
721 | + srdi $carry,$t1,16 | |
722 | + ld $t6,`$FRAME+48`($sp) | |
723 | + ld $t7,`$FRAME+56`($sp) | |
724 | + | |
725 | + fmadd $T1a,$N0,$nc,$T1a | |
726 | + fmadd $T1b,$N0,$nd,$T1b | |
727 | + insrdi $t0,$t1,16,32 | |
728 | + ld $t1,8($tp) ; tp[j] | |
729 | + fmadd $T2a,$N1,$nc,$T2a | |
730 | + fmadd $T2b,$N1,$nd,$T2b | |
731 | + add $t2,$t2,$carry | |
732 | + fmadd $T3a,$N2,$nc,$T3a | |
733 | + fmadd $T3b,$N2,$nd,$T3b | |
734 | + srdi $carry,$t2,16 | |
735 | + insrdi $t0,$t2,16,16 | |
736 | + fmadd $dota,$N3,$nc,$dota | |
737 | + fmadd $dotb,$N3,$nd,$dotb | |
738 | + add $t3,$t3,$carry | |
739 | + ldu $t2,16($tp) ; tp[j+1] | |
740 | + srdi $carry,$t3,16 | |
741 | + insrdi $t0,$t3,16,0 ; 0..63 bits | |
742 | + add $t4,$t4,$carry | |
743 | + | |
744 | + fctid $T0a,$T0a | |
745 | + fctid $T0b,$T0b | |
746 | + srdi $carry,$t4,16 | |
747 | + fctid $T1a,$T1a | |
748 | + fctid $T1b,$T1b | |
749 | + add $t5,$t5,$carry | |
750 | + fctid $T2a,$T2a | |
751 | + fctid $T2b,$T2b | |
752 | + srdi $carry,$t5,16 | |
753 | + insrdi $t4,$t5,16,32 | |
754 | + fctid $T3a,$T3a | |
755 | + fctid $T3b,$T3b | |
756 | + add $t6,$t6,$carry | |
757 | + srdi $carry,$t6,16 | |
758 | + insrdi $t4,$t6,16,16 | |
759 | + | |
760 | + stfd $T0a,`$FRAME+0`($sp) | |
761 | + stfd $T0b,`$FRAME+8`($sp) | |
762 | + add $t7,$t7,$carry | |
763 | + addc $t3,$t0,$t1 | |
764 | + stfd $T1a,`$FRAME+16`($sp) | |
765 | + stfd $T1b,`$FRAME+24`($sp) | |
766 | + insrdi $t4,$t7,16,0 ; 64..127 bits | |
767 | + srdi $carry,$t7,16 ; upper 33 bits | |
768 | + stfd $T2a,`$FRAME+32`($sp) | |
769 | + stfd $T2b,`$FRAME+40`($sp) | |
770 | + adde $t5,$t4,$t2 | |
771 | + stfd $T3a,`$FRAME+48`($sp) | |
772 | + stfd $T3b,`$FRAME+56`($sp) | |
773 | + addze $carry,$carry | |
774 | + std $t3,-16($tp) ; tp[j-1] | |
775 | + std $t5,-8($tp) ; tp[j] | |
776 | + bdnz- Linner | |
777 | + | |
778 | + fctid $dota,$dota | |
779 | + fctid $dotb,$dotb | |
780 | + ld $t0,`$FRAME+0`($sp) | |
781 | + ld $t1,`$FRAME+8`($sp) | |
782 | + ld $t2,`$FRAME+16`($sp) | |
783 | + ld $t3,`$FRAME+24`($sp) | |
784 | + ld $t4,`$FRAME+32`($sp) | |
785 | + ld $t5,`$FRAME+40`($sp) | |
786 | + ld $t6,`$FRAME+48`($sp) | |
787 | + ld $t7,`$FRAME+56`($sp) | |
788 | + stfd $dota,`$FRAME+64`($sp) | |
789 | + stfd $dotb,`$FRAME+72`($sp) | |
790 | + | |
791 | + add $t0,$t0,$carry ; can not overflow | |
792 | + srdi $carry,$t0,16 | |
793 | + add $t1,$t1,$carry | |
794 | + srdi $carry,$t1,16 | |
795 | + insrdi $t0,$t1,16,32 | |
796 | + add $t2,$t2,$carry | |
797 | + ld $t1,8($tp) ; tp[j] | |
798 | + srdi $carry,$t2,16 | |
799 | + insrdi $t0,$t2,16,16 | |
800 | + add $t3,$t3,$carry | |
801 | + ldu $t2,16($tp) ; tp[j+1] | |
802 | + srdi $carry,$t3,16 | |
803 | + insrdi $t0,$t3,16,0 ; 0..63 bits | |
804 | + add $t4,$t4,$carry | |
805 | + srdi $carry,$t4,16 | |
806 | + add $t5,$t5,$carry | |
807 | + srdi $carry,$t5,16 | |
808 | + insrdi $t4,$t5,16,32 | |
809 | + add $t6,$t6,$carry | |
810 | + srdi $carry,$t6,16 | |
811 | + insrdi $t4,$t6,16,16 | |
812 | + add $t7,$t7,$carry | |
813 | + insrdi $t4,$t7,16,0 ; 64..127 bits | |
814 | + srdi $carry,$t7,16 ; upper 33 bits | |
815 | + ld $t6,`$FRAME+64`($sp) | |
816 | + ld $t7,`$FRAME+72`($sp) | |
817 | + | |
818 | + addc $t3,$t0,$t1 | |
819 | + adde $t5,$t4,$t2 | |
820 | + addze $carry,$carry | |
821 | + | |
822 | + std $t3,-16($tp) ; tp[j-1] | |
823 | + std $t5,-8($tp) ; tp[j] | |
824 | + | |
825 | + add $carry,$carry,$ovf ; comsume upmost overflow | |
826 | + add $t6,$t6,$carry ; can not overflow | |
827 | + srdi $carry,$t6,16 | |
828 | + add $t7,$t7,$carry | |
829 | + insrdi $t6,$t7,48,0 | |
830 | + srdi $ovf,$t7,48 | |
831 | + std $t6,0($tp) ; tp[num-1] | |
832 | + | |
833 | + slwi $t7,$num,2 | |
834 | + addi $i,$i,8 | |
835 | + subf $nap_d,$t7,$nap_d ; rewind pointer | |
836 | + cmpw $i,$num | |
837 | + blt- Louter | |
838 | + | |
839 | + subf $np,$num,$np ; rewind np | |
840 | + addi $j,$j,1 ; restore counter | |
841 | + subfc $i,$i,$i ; j=0 and "clear" XER[CA] | |
842 | + addi $tp,$sp,`$FRAME+$TRANSFER+8` | |
843 | + addi $t4,$sp,`$FRAME+$TRANSFER+16` | |
844 | + addi $t5,$np,8 | |
845 | + addi $t6,$rp,8 | |
846 | + mtctr $j | |
847 | + | |
848 | +.align 4 | |
849 | +Lsub: ldx $t0,$tp,$i | |
850 | + ldx $t1,$np,$i | |
851 | + ldx $t2,$t4,$i | |
852 | + ldx $t3,$t5,$i | |
853 | + subfe $t0,$t1,$t0 ; tp[j]-np[j] | |
854 | + subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1] | |
855 | + stdx $t0,$rp,$i | |
856 | + stdx $t2,$t6,$i | |
857 | + addi $i,$i,16 | |
858 | + bdnz- Lsub | |
859 | + | |
860 | + li $i,0 | |
861 | + subfe $ovf,$i,$ovf ; handle upmost overflow bit | |
862 | + and $ap,$tp,$ovf | |
863 | + andc $np,$rp,$ovf | |
864 | + or $ap,$ap,$np ; ap=borrow?tp:rp | |
865 | + addi $t7,$ap,8 | |
866 | + mtctr $j | |
867 | + | |
868 | +.align 4 | |
869 | +Lcopy: ; copy or in-place refresh | |
870 | + ldx $t0,$ap,$i | |
871 | + ldx $t1,$t7,$i | |
872 | + std $i,8($nap_d) ; zap nap_d | |
873 | + std $i,16($nap_d) | |
874 | + std $i,24($nap_d) | |
875 | + std $i,32($nap_d) | |
876 | + std $i,40($nap_d) | |
877 | + std $i,48($nap_d) | |
878 | + std $i,56($nap_d) | |
879 | + stdu $i,64($nap_d) | |
880 | + stdx $t0,$rp,$i | |
881 | + stdx $t1,$t6,$i | |
882 | + stdx $i,$tp,$i ; zap tp at once | |
883 | + stdx $i,$t4,$i | |
884 | + addi $i,$i,16 | |
885 | + bdnz- Lcopy | |
886 | + | |
887 | + $POP r14,`2*$SIZE_T`($sp) | |
888 | + $POP r15,`3*$SIZE_T`($sp) | |
889 | + $POP r16,`4*$SIZE_T`($sp) | |
890 | + $POP r17,`5*$SIZE_T`($sp) | |
891 | + $POP r18,`6*$SIZE_T`($sp) | |
892 | + $POP r19,`7*$SIZE_T`($sp) | |
893 | + $POP r20,`8*$SIZE_T`($sp) | |
894 | + $POP r21,`9*$SIZE_T`($sp) | |
895 | + $POP r22,`10*$SIZE_T`($sp) | |
896 | + $POP r23,`11*$SIZE_T`($sp) | |
897 | + lfd f14,`12*$SIZE_T+0`($sp) | |
898 | + lfd f15,`12*$SIZE_T+8`($sp) | |
899 | + lfd f16,`12*$SIZE_T+16`($sp) | |
900 | + lfd f17,`12*$SIZE_T+24`($sp) | |
901 | + lfd f18,`12*$SIZE_T+32`($sp) | |
902 | + lfd f19,`12*$SIZE_T+40`($sp) | |
903 | + lfd f20,`12*$SIZE_T+48`($sp) | |
904 | + lfd f21,`12*$SIZE_T+56`($sp) | |
905 | + lfd f22,`12*$SIZE_T+64`($sp) | |
906 | + lfd f23,`12*$SIZE_T+72`($sp) | |
907 | + lfd f24,`12*$SIZE_T+80`($sp) | |
908 | + lfd f25,`12*$SIZE_T+88`($sp) | |
909 | + $POP $sp,0($sp) | |
910 | + li r3,1 ; signal "handled" | |
911 | + blr | |
912 | + .long 0 | |
913 | +.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>" | |
914 | +___ | |
915 | + | |
916 | +$code =~ s/\`([^\`]*)\`/eval $1/gem; | |
917 | +print $code; | |
918 | +close STDOUT; |
@@ -0,0 +1,225 @@ | ||
1 | +#!/usr/bin/env perl | |
2 | + | |
3 | +# ==================================================================== | |
4 | +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
5 | +# project. The module is, however, dual licensed under OpenSSL and | |
6 | +# CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | +# details see http://www.openssl.org/~appro/cryptogams/. | |
8 | +# ==================================================================== | |
9 | + | |
10 | +# April 2007. | |
11 | +# | |
12 | +# Performance improvement over vanilla C code varies from 85% to 45% | |
13 | +# depending on key length and benchmark. Unfortunately in this context | |
14 | +# these are not very impressive results [for code that utilizes "wide" | |
15 | +# 64x64=128-bit multiplication, which is not commonly available to C | |
16 | +# programmers], at least hand-coded bn_asm.c replacement is known to | |
17 | +# provide 30-40% better results for longest keys. Well, on a second | |
18 | +# thought it's not very surprising, because z-CPUs are single-issue | |
19 | +# and _strictly_ in-order execution, while bn_mul_mont is more or less | |
20 | +# dependent on CPU ability to pipe-line instructions and have several | |
21 | +# of them "in-flight" at the same time. I mean while other methods, | |
22 | +# for example Karatsuba, aim to minimize amount of multiplications at | |
23 | +# the cost of other operations increase, bn_mul_mont aims to neatly | |
24 | +# "overlap" multiplications and the other operations [and on most | |
25 | +# platforms even minimize the amount of the other operations, in | |
26 | +# particular references to memory]. But it's possible to improve this | |
27 | +# module performance by implementing dedicated squaring code-path and | |
28 | +# possibly by unrolling loops... | |
29 | + | |
30 | +# January 2009. | |
31 | +# | |
32 | +# Reschedule to minimize/avoid Address Generation Interlock hazard, | |
33 | +# make inner loops counter-based. | |
34 | + | |
35 | +$mn0="%r0"; # m1: per-pass multiplier, "tp[0]"*n0, applied to every np[j] | |
36 | +$num="%r1"; # word count, pulled from the caller's frame (see the commented-out line below) | |
37 | + | |
38 | +# int bn_mul_mont( | |
39 | +$rp="%r2"; # BN_ULONG *rp, | |
40 | +$ap="%r3"; # const BN_ULONG *ap, | |
41 | +$bp="%r4"; # const BN_ULONG *bp, | |
42 | +$np="%r5"; # const BN_ULONG *np, | |
43 | +$n0="%r6"; # const BN_ULONG *n0, | |
44 | +#$num="160(%r15)" # int num); | |
45 | + | |
46 | +$bi="%r2"; # current bp[i] word; zaps rp, which the code parks at 16(sp) on entry | |
47 | +$j="%r7"; # byte offset of word j within ap/np/tp (advances by 8) | |
48 | + | |
49 | +$ahi="%r8"; # ahi:alo is the even/odd pair receiving the 128-bit ap[j]*bp[i] product | |
50 | +$alo="%r9"; | |
51 | +$nhi="%r10"; # nhi:nlo is the even/odd pair receiving the 128-bit np[j]*m1 product | |
52 | +$nlo="%r11"; | |
53 | +$AHI="%r12"; # running carry word of the ap[j]*bp[i] chain, later the top overflow bit | |
54 | +$NHI="%r13"; # running carry word of the np[j]*m1 chain | |
55 | +$count="%r14"; # brct loop counter; %r14 is saved by stmg and reloaded by lmg before returning | |
56 | +$sp="%r15"; # stack pointer | |
58 | +$code.=<<___; | |
59 | +.text | |
60 | +.globl bn_mul_mont | |
61 | +.type bn_mul_mont,\@function | |
62 | +bn_mul_mont: | |
63 | + lgf $num,164($sp) # pull $num | |
64 | + sla $num,3 # $num to enumerate bytes | |
65 | + la $bp,0($num,$bp) # point at bp[num]; saved below and used as the outer loop end marker | |
66 | + | |
67 | + stg %r2,16($sp) # park rp, %r2 carries the return value and later bp[i] | |
68 | + | |
69 | + cghi $num,16 # byte count here: 16 bytes = 2 words | |
70 | + lghi %r2,0 # preset return value 0 | |
71 | + blr %r14 # if($num<16) return 0; | |
72 | + cghi $num,128 # byte count here: 128 bytes = 16 words | |
73 | + bhr %r14 # if($num>128) return 0; | |
74 | + | |
75 | + stmg %r3,%r15,24($sp) # save %r3-%r15; the bp[num] address lands at 32(sp) | |
76 | + | |
77 | + lghi $rp,-160-8 # leave room for carry bit | |
78 | + lcgr $j,$num # -$num | |
79 | + lgr %r0,$sp # keep the caller's sp for the back chain | |
80 | + la $rp,0($rp,$sp) | |
81 | + la $sp,0($j,$rp) # alloca | |
82 | + stg %r0,0($sp) # back chain | |
83 | + | |
84 | + sra $num,3 # restore $num | |
85 | + la $bp,0($j,$bp) # restore $bp | |
86 | + ahi $num,-1 # adjust $num for inner loop | |
87 | + lg $n0,0($n0) # pull n0 | |
88 | + | |
89 | + lg $bi,0($bp) # bp[0] | |
90 | + lg $alo,0($ap) # ap[0] | |
91 | + mlgr $ahi,$bi # ap[0]*bp[0] | |
92 | + lgr $AHI,$ahi # high word becomes the carry into j=1 | |
93 | + | |
94 | + lgr $mn0,$alo # "tp[0]"*n0 | |
95 | + msgr $mn0,$n0 | |
96 | + | |
97 | + lg $nlo,0($np) # | |
98 | + mlgr $nhi,$mn0 # np[0]*m1 | |
99 | + algr $nlo,$alo # +="tp[0]" | |
100 | + lghi $NHI,0 | |
101 | + alcgr $NHI,$nhi # high word plus carry; the low word itself is not stored | |
102 | + | |
103 | + la $j,8(%r0) # j=1 | |
104 | + lr $count,$num # words 1..num-1 remain | |
105 | + | |
106 | +.align 16 | |
107 | +.L1st: | |
108 | + lg $alo,0($j,$ap) | |
109 | + mlgr $ahi,$bi # ap[j]*bp[0] | |
110 | + algr $alo,$AHI | |
111 | + lghi $AHI,0 | |
112 | + alcgr $AHI,$ahi | |
113 | + | |
114 | + lg $nlo,0($j,$np) | |
115 | + mlgr $nhi,$mn0 # np[j]*m1 | |
116 | + algr $nlo,$NHI | |
117 | + lghi $NHI,0 | |
118 | + alcgr $nhi,$NHI # +="tp[j]" | |
119 | + algr $nlo,$alo | |
120 | + alcgr $NHI,$nhi | |
121 | + | |
122 | + stg $nlo,160-8($j,$sp) # tp[j-1]= | |
123 | + la $j,8($j) # j++ | |
124 | + brct $count,.L1st | |
125 | + | |
126 | + algr $NHI,$AHI # merge the two carry chains | |
127 | + lghi $AHI,0 | |
128 | + alcgr $AHI,$AHI # upmost overflow bit | |
129 | + stg $NHI,160-8($j,$sp) # tp[num-1]= | |
130 | + stg $AHI,160($j,$sp) # tp[num]= overflow bit | |
131 | + la $bp,8($bp) # bp++ | |
132 | + | |
133 | +.Louter: | |
134 | + lg $bi,0($bp) # bp[i] | |
135 | + lg $alo,0($ap) | |
136 | + mlgr $ahi,$bi # ap[0]*bp[i] | |
137 | + alg $alo,160($sp) # +=tp[0] | |
138 | + lghi $AHI,0 | |
139 | + alcgr $AHI,$ahi | |
140 | + | |
141 | + lgr $mn0,$alo | |
142 | + msgr $mn0,$n0 # tp[0]*n0 | |
143 | + | |
144 | + lg $nlo,0($np) # np[0] | |
145 | + mlgr $nhi,$mn0 # np[0]*m1 | |
146 | + algr $nlo,$alo # +="tp[0]" | |
147 | + lghi $NHI,0 | |
148 | + alcgr $NHI,$nhi | |
149 | + | |
150 | + la $j,8(%r0) # j=1 | |
151 | + lr $count,$num # words 1..num-1 remain | |
152 | + | |
153 | +.align 16 | |
154 | +.Linner: | |
155 | + lg $alo,0($j,$ap) | |
156 | + mlgr $ahi,$bi # ap[j]*bp[i] | |
157 | + algr $alo,$AHI | |
158 | + lghi $AHI,0 | |
159 | + alcgr $ahi,$AHI | |
160 | + alg $alo,160($j,$sp)# +=tp[j] | |
161 | + alcgr $AHI,$ahi | |
162 | + | |
163 | + lg $nlo,0($j,$np) | |
164 | + mlgr $nhi,$mn0 # np[j]*m1 | |
165 | + algr $nlo,$NHI | |
166 | + lghi $NHI,0 | |
167 | + alcgr $nhi,$NHI | |
168 | + algr $nlo,$alo # +="tp[j]" | |
169 | + alcgr $NHI,$nhi | |
170 | + | |
171 | + stg $nlo,160-8($j,$sp) # tp[j-1]= | |
172 | + la $j,8($j) # j++ | |
173 | + brct $count,.Linner | |
174 | + | |
175 | + algr $NHI,$AHI # merge the two carry chains | |
176 | + lghi $AHI,0 | |
177 | + alcgr $AHI,$AHI | |
178 | + alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit | |
179 | + lghi $ahi,0 | |
180 | + alcgr $AHI,$ahi # new upmost overflow bit | |
181 | + stg $NHI,160-8($j,$sp) # tp[num-1]= | |
182 | + stg $AHI,160($j,$sp) # tp[num]= overflow bit | |
183 | + | |
184 | + la $bp,8($bp) # bp++ | |
185 | + clg $bp,160+8+32($j,$sp) # compare to &bp[num] | |
186 | + jne .Louter | |
187 | + | |
188 | + lg $rp,160+8+16($j,$sp) # reincarnate rp | |
189 | + la $ap,160($sp) # ap now points at tp | |
190 | + ahi $num,1 # restore $num, incidentally clears "borrow" | |
191 | + | |
192 | + la $j,0(%r0) # j=0 | |
193 | + lr $count,$num | |
194 | +.Lsub: lg $alo,0($j,$ap) # tp[j] | |
195 | + slbg $alo,0($j,$np) # -np[j]-borrow | |
196 | + stg $alo,0($j,$rp) # rp[j]= | |
197 | + la $j,8($j) | |
198 | + brct $count,.Lsub | |
199 | + lghi $ahi,0 | |
200 | + slbgr $AHI,$ahi # handle upmost carry | |
201 | + | |
202 | + ngr $ap,$AHI # overflow word now acts as a select mask, presumably 0 or all ones | |
203 | + lghi $np,-1 | |
204 | + xgr $np,$AHI # inverted mask | |
205 | + ngr $np,$rp | |
206 | + ogr $ap,$np # ap=borrow?tp:rp | |
207 | + | |
208 | + la $j,0(%r0) # j=0 | |
209 | + lgr $count,$num | |
210 | +.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh | |
211 | + stg $j,160($j,$sp) # zap tp | |
212 | + stg $alo,0($j,$rp) | |
213 | + la $j,8($j) | |
214 | + brct $count,.Lcopy | |
215 | + | |
216 | + la %r1,160+8+48($j,$sp) # slot 48 of the entry frame, where stmg put %r6 | |
217 | + lmg %r6,%r15,0(%r1) # restore %r6-%r15, reloading sp pops the frame | |
218 | + lghi %r2,1 # signal "processed" | |
219 | + br %r14 | |
220 | +.size bn_mul_mont,.-bn_mul_mont | |
221 | +.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>" | |
222 | +___ | |
223 | + | |
224 | +print $code; # emit the generated assembly | |
225 | +close STDOUT; | |
@@ -0,0 +1,678 @@ | ||
1 | +.ident "s390x.S, version 1.0" | |
2 | +// ==================================================================== | |
3 | +// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
4 | +// project. | |
5 | +// | |
6 | +// Rights for redistribution and usage in source and binary forms are | |
7 | +// granted according to the OpenSSL license. Warranty of any kind is | |
8 | +// disclaimed. | |
9 | +// ==================================================================== | |
10 | + | |
11 | +.text | |
12 | +// %r0 is an ordinary register: routines that use "zero" load 0 into it first (lghi zero,0). | |
13 | +#define zero %r0 | |
14 | +// rp[i] += ap[i]*w for i in [0,len); returns the carry word out of the last limb (0 if len<=0). | |
15 | +// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5); | |
16 | +.globl bn_mul_add_words | |
17 | +.type bn_mul_add_words,@function | |
18 | +.align 4 | |
19 | +bn_mul_add_words: | |
20 | + lghi zero,0 // zero = 0 | |
21 | + la %r1,0(%r2) // put rp aside | |
22 | + lghi %r2,0 // i=0; | |
23 | + ltgfr %r4,%r4 // sign-extend len and set the condition code | |
24 | + bler %r14 // if (len<=0) return 0; | |
25 | + | |
26 | + stmg %r6,%r10,48(%r15) // save %r6-%r10, used as scratch below | |
27 | + lghi %r8,0 // carry = 0 | |
28 | + srag %r10,%r4,2 // cnt=len/4 | |
29 | + jz .Loop1_madd // fewer than 4 limbs: tail loop only | |
30 | + | |
31 | +.Loop4_madd: | |
32 | + lg %r7,0(%r2,%r3) // ap[i] | |
33 | + mlgr %r6,%r5 // *=w | |
34 | + algr %r7,%r8 // +=carry | |
35 | + alcgr %r6,zero // propagate into the high word | |
36 | + alg %r7,0(%r2,%r1) // +=rp[i] | |
37 | + alcgr %r6,zero // propagate into the high word | |
38 | + stg %r7,0(%r2,%r1) // rp[i]= | |
39 | + | |
40 | + lg %r9,8(%r2,%r3) // ap[i+1]; carry-in is %r6, carry-out goes to %r8 | |
41 | + mlgr %r8,%r5 | |
42 | + algr %r9,%r6 | |
43 | + alcgr %r8,zero | |
44 | + alg %r9,8(%r2,%r1) | |
45 | + alcgr %r8,zero | |
46 | + stg %r9,8(%r2,%r1) // rp[i+1]= | |
47 | + | |
48 | + lg %r7,16(%r2,%r3) // ap[i+2]; carry-in is %r8, carry-out goes to %r6 | |
49 | + mlgr %r6,%r5 | |
50 | + algr %r7,%r8 | |
51 | + alcgr %r6,zero | |
52 | + alg %r7,16(%r2,%r1) | |
53 | + alcgr %r6,zero | |
54 | + stg %r7,16(%r2,%r1) // rp[i+2]= | |
55 | + | |
56 | + lg %r9,24(%r2,%r3) // ap[i+3]; carry-in is %r6, carry-out goes to %r8 | |
57 | + mlgr %r8,%r5 | |
58 | + algr %r9,%r6 | |
59 | + alcgr %r8,zero | |
60 | + alg %r9,24(%r2,%r1) | |
61 | + alcgr %r8,zero | |
62 | + stg %r9,24(%r2,%r1) // rp[i+3]= | |
63 | + | |
64 | + la %r2,32(%r2) // i+=4 | |
65 | + brct %r10,.Loop4_madd | |
66 | + | |
67 | + lghi %r10,3 | |
68 | + nr %r4,%r10 // cnt=len%4 | |
69 | + jz .Lend_madd // no tail limbs left | |
70 | + | |
71 | +.Loop1_madd: | |
72 | + lg %r7,0(%r2,%r3) // ap[i] | |
73 | + mlgr %r6,%r5 // *=w | |
74 | + algr %r7,%r8 // +=carry | |
75 | + alcgr %r6,zero | |
76 | + alg %r7,0(%r2,%r1) // +=rp[i] | |
77 | + alcgr %r6,zero | |
78 | + stg %r7,0(%r2,%r1) // rp[i]= | |
79 | + | |
80 | + lgr %r8,%r6 // carry for the next limb | |
81 | + la %r2,8(%r2) // i++ | |
82 | + brct %r4,.Loop1_madd | |
83 | + | |
84 | +.Lend_madd: | |
85 | + lgr %r2,%r8 // return the final carry | |
86 | + lmg %r6,%r10,48(%r15) // restore %r6-%r10 | |
87 | + br %r14 | |
88 | +.size bn_mul_add_words,.-bn_mul_add_words | |
89 | +// rp[i] = ap[i]*w + carry for i in [0,len); returns the carry word out of the last limb (0 if len<=0). | |
90 | +// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5); | |
91 | +.globl bn_mul_words | |
92 | +.type bn_mul_words,@function | |
93 | +.align 4 | |
94 | +bn_mul_words: | |
95 | + lghi zero,0 // zero = 0 | |
96 | + la %r1,0(%r2) // put rp aside | |
97 | + lghi %r2,0 // i=0; | |
98 | + ltgfr %r4,%r4 // sign-extend len and set the condition code | |
99 | + bler %r14 // if (len<=0) return 0; | |
100 | + | |
101 | + stmg %r6,%r10,48(%r15) // save %r6-%r10, used as scratch below | |
102 | + lghi %r8,0 // carry = 0 | |
103 | + srag %r10,%r4,2 // cnt=len/4 | |
104 | + jz .Loop1_mul // fewer than 4 limbs: tail loop only | |
105 | + | |
106 | +.Loop4_mul: | |
107 | + lg %r7,0(%r2,%r3) // ap[i] | |
108 | + mlgr %r6,%r5 // *=w | |
109 | + algr %r7,%r8 // +=carry | |
110 | + alcgr %r6,zero // propagate into the high word | |
111 | + stg %r7,0(%r2,%r1) // rp[i]= | |
112 | + | |
113 | + lg %r9,8(%r2,%r3) // ap[i+1]; carry-in is %r6, carry-out goes to %r8 | |
114 | + mlgr %r8,%r5 | |
115 | + algr %r9,%r6 | |
116 | + alcgr %r8,zero | |
117 | + stg %r9,8(%r2,%r1) // rp[i+1]= | |
118 | + | |
119 | + lg %r7,16(%r2,%r3) // ap[i+2]; carry-in is %r8, carry-out goes to %r6 | |
120 | + mlgr %r6,%r5 | |
121 | + algr %r7,%r8 | |
122 | + alcgr %r6,zero | |
123 | + stg %r7,16(%r2,%r1) // rp[i+2]= | |
124 | + | |
125 | + lg %r9,24(%r2,%r3) // ap[i+3]; carry-in is %r6, carry-out goes to %r8 | |
126 | + mlgr %r8,%r5 | |
127 | + algr %r9,%r6 | |
128 | + alcgr %r8,zero | |
129 | + stg %r9,24(%r2,%r1) // rp[i+3]= | |
130 | + | |
131 | + la %r2,32(%r2) // i+=4 | |
132 | + brct %r10,.Loop4_mul | |
133 | + | |
134 | + lghi %r10,3 | |
135 | + nr %r4,%r10 // cnt=len%4 | |
136 | + jz .Lend_mul // no tail limbs left | |
137 | + | |
138 | +.Loop1_mul: | |
139 | + lg %r7,0(%r2,%r3) // ap[i] | |
140 | + mlgr %r6,%r5 // *=w | |
141 | + algr %r7,%r8 // +=carry | |
142 | + alcgr %r6,zero | |
143 | + stg %r7,0(%r2,%r1) // rp[i]= | |
144 | + | |
145 | + lgr %r8,%r6 // carry for the next limb | |
146 | + la %r2,8(%r2) // i++ | |
147 | + brct %r4,.Loop1_mul | |
148 | + | |
149 | +.Lend_mul: | |
150 | + lgr %r2,%r8 // return the final carry | |
151 | + lmg %r6,%r10,48(%r15) // restore %r6-%r10 | |
152 | + br %r14 | |
153 | +.size bn_mul_words,.-bn_mul_words | |
154 | +// For each of the len input words: r[2*i] = low word, r[2*i+1] = high word of a[i]*a[i]. | |
155 | +// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r3,int r4) | |
156 | +.globl bn_sqr_words | |
157 | +.type bn_sqr_words,@function | |
158 | +.align 4 | |
159 | +bn_sqr_words: | |
160 | + ltgfr %r4,%r4 // sign-extend len and set the condition code | |
161 | + bler %r14 // if (len<=0) return; | |
162 | + | |
163 | + stmg %r6,%r7,48(%r15) // save %r6/%r7, the product register pair | |
164 | + srag %r1,%r4,2 // cnt=len/4 | |
165 | + jz .Loop1_sqr // fewer than 4 words: tail loop only | |
166 | + | |
167 | +.Loop4_sqr: | |
168 | + lg %r7,0(%r3) // a[i] | |
169 | + mlgr %r6,%r7 // %r6:%r7 = a[i]*a[i] | |
170 | + stg %r7,0(%r2) // low word | |
171 | + stg %r6,8(%r2) // high word | |
172 | + | |
173 | + lg %r7,8(%r3) // a[i+1] | |
174 | + mlgr %r6,%r7 | |
175 | + stg %r7,16(%r2) | |
176 | + stg %r6,24(%r2) | |
177 | + | |
178 | + lg %r7,16(%r3) // a[i+2] | |
179 | + mlgr %r6,%r7 | |
180 | + stg %r7,32(%r2) | |
181 | + stg %r6,40(%r2) | |
182 | + | |
183 | + lg %r7,24(%r3) // a[i+3] | |
184 | + mlgr %r6,%r7 | |
185 | + stg %r7,48(%r2) | |
186 | + stg %r6,56(%r2) | |
187 | + | |
188 | + la %r3,32(%r3) // advance a by 4 words | |
189 | + la %r2,64(%r2) // advance r by 8 words | |
190 | + brct %r1,.Loop4_sqr | |
191 | + | |
192 | + lghi %r1,3 | |
193 | + nr %r4,%r1 // cnt=len%4 | |
194 | + jz .Lend_sqr // no tail words left | |
195 | + | |
196 | +.Loop1_sqr: | |
197 | + lg %r7,0(%r3) // a[i] | |
198 | + mlgr %r6,%r7 // %r6:%r7 = a[i]*a[i] | |
199 | + stg %r7,0(%r2) // low word | |
200 | + stg %r6,8(%r2) // high word | |
201 | + | |
202 | + la %r3,8(%r3) // advance a by 1 word | |
203 | + la %r2,16(%r2) // advance r by 2 words | |
204 | + brct %r4,.Loop1_sqr | |
205 | + | |
206 | +.Lend_sqr: | |
207 | + lmg %r6,%r7,48(%r15) // restore %r6/%r7 | |
208 | + br %r14 | |
209 | +.size bn_sqr_words,.-bn_sqr_words | |
210 | +// Returns the quotient of the 128-bit value h:l divided by d (h in %r2, l in %r3, d in %r4). | |
211 | +// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d); | |
212 | +.globl bn_div_words | |
213 | +.type bn_div_words,@function | |
214 | +.align 4 | |
215 | +bn_div_words: | |
216 | + dlgr %r2,%r4 // %r2:%r3 / %r4 -> remainder in %r2, quotient in %r3 | |
217 | + lgr %r2,%r3 // return the quotient | |
218 | + br %r14 | |
219 | +.size bn_div_words,.-bn_div_words | |
220 | +// r[i] = a[i] + b[i] + carry for i in [0,len); returns the carry out of the last word (0 if len<=0). | |
221 | +// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5); | |
222 | +.globl bn_add_words | |
223 | +.type bn_add_words,@function | |
224 | +.align 4 | |
225 | +bn_add_words: | |
226 | + la %r1,0(%r2) // put rp aside | |
227 | + lghi %r2,0 // i=0 | |
228 | + ltgfr %r5,%r5 // sign-extend len and set the condition code | |
229 | + bler %r14 // if (len<=0) return 0; | |
230 | + | |
231 | + stg %r6,48(%r15) // save %r6, used for the tail count | |
232 | + lghi %r6,3 | |
233 | + nr %r6,%r5 // len%4 | |
234 | + sra %r5,2 // len/4, use sra because it sets condition code | |
235 | + jz .Loop1_add // carry is incidentally cleared if branch taken | |
236 | + algr %r2,%r2 // clear carry | |
237 | + | |
238 | +.Loop4_add: | |
239 | + lg %r0,0(%r2,%r3) // a[i] | |
240 | + alcg %r0,0(%r2,%r4) // +b[i]+carry | |
241 | + stg %r0,0(%r2,%r1) // r[i]= | |
242 | + lg %r0,8(%r2,%r3) | |
243 | + alcg %r0,8(%r2,%r4) | |
244 | + stg %r0,8(%r2,%r1) | |
245 | + lg %r0,16(%r2,%r3) | |
246 | + alcg %r0,16(%r2,%r4) | |
247 | + stg %r0,16(%r2,%r1) | |
248 | + lg %r0,24(%r2,%r3) | |
249 | + alcg %r0,24(%r2,%r4) | |
250 | + stg %r0,24(%r2,%r1) | |
251 | + | |
252 | + la %r2,32(%r2) // i+=4 | |
253 | + brct %r5,.Loop4_add | |
254 | + | |
255 | + la %r6,1(%r6) // see if len%4 is zero ... | |
256 | + brct %r6,.Loop1_add // without touching condition code:-) | |
257 | + | |
258 | +.Lexit_add: | |
259 | + lghi %r2,0 | |
260 | + alcgr %r2,%r2 // return value = pending carry | |
261 | + lg %r6,48(%r15) // restore %r6 | |
262 | + br %r14 | |
263 | + | |
264 | +.Loop1_add: | |
265 | + lg %r0,0(%r2,%r3) // a[i] | |
266 | + alcg %r0,0(%r2,%r4) // +b[i]+carry | |
267 | + stg %r0,0(%r2,%r1) // r[i]= | |
268 | + | |
269 | + la %r2,8(%r2) // i++ | |
270 | + brct %r6,.Loop1_add | |
271 | + | |
272 | + j .Lexit_add | |
273 | +.size bn_add_words,.-bn_add_words | |
274 | +// r[i] = a[i] - b[i] - borrow for i in [0,len); returns the borrow out of the last word (0 if len<=0). | |
275 | +// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5); | |
276 | +.globl bn_sub_words | |
277 | +.type bn_sub_words,@function | |
278 | +.align 4 | |
279 | +bn_sub_words: | |
280 | + la %r1,0(%r2) // put rp aside | |
281 | + lghi %r2,0 // i=0 | |
282 | + ltgfr %r5,%r5 // sign-extend len and set the condition code | |
283 | + bler %r14 // if (len<=0) return 0; | |
284 | + | |
285 | + stg %r6,48(%r15) // save %r6, used for the tail count | |
286 | + lghi %r6,3 | |
287 | + nr %r6,%r5 // len%4 | |
288 | + sra %r5,2 // len/4, use sra because it sets condition code | |
289 | + jnz .Loop4_sub // borrow is incidentally cleared if branch taken | |
290 | + slgr %r2,%r2 // clear borrow | |
291 | + | |
292 | +.Loop1_sub: | |
293 | + lg %r0,0(%r2,%r3) // a[i] | |
294 | + slbg %r0,0(%r2,%r4) // -b[i]-borrow | |
295 | + stg %r0,0(%r2,%r1) // r[i]= | |
296 | + | |
297 | + la %r2,8(%r2) // i++ | |
298 | + brct %r6,.Loop1_sub | |
299 | + j .Lexit_sub | |
300 | + | |
301 | +.Loop4_sub: | |
302 | + lg %r0,0(%r2,%r3) // a[i] | |
303 | + slbg %r0,0(%r2,%r4) // -b[i]-borrow | |
304 | + stg %r0,0(%r2,%r1) // r[i]= | |
305 | + lg %r0,8(%r2,%r3) | |
306 | + slbg %r0,8(%r2,%r4) | |
307 | + stg %r0,8(%r2,%r1) | |
308 | + lg %r0,16(%r2,%r3) | |
309 | + slbg %r0,16(%r2,%r4) | |
310 | + stg %r0,16(%r2,%r1) | |
311 | + lg %r0,24(%r2,%r3) | |
312 | + slbg %r0,24(%r2,%r4) | |
313 | + stg %r0,24(%r2,%r1) | |
314 | + | |
315 | + la %r2,32(%r2) // i+=4 | |
316 | + brct %r5,.Loop4_sub | |
317 | + | |
318 | + la %r6,1(%r6) // see if len%4 is zero ... | |
319 | + brct %r6,.Loop1_sub // without touching condition code:-) | |
320 | + | |
321 | +.Lexit_sub: | |
322 | + lghi %r2,0 | |
323 | + slbgr %r2,%r2 // 0-0-borrow | |
324 | + lcgr %r2,%r2 // negate so the pending borrow is returned as a non-negative value | |
325 | + lg %r6,48(%r15) // restore %r6 | |
326 | + br %r14 | |
327 | +.size bn_sub_words,.-bn_sub_words | |
328 | +// Three-word column accumulator of the comba routines; callers rotate the roles per column. | |
329 | +#define c1 %r1 | |
330 | +#define c2 %r5 | |
331 | +#define c3 %r8 | |
332 | +// mul_add_c: (c3:c2:c1) += a[ai]*b[bi], a in %r3, b in %r4; clobbers %r6/%r7, needs zero==0. | |
333 | +#define mul_add_c(ai,bi,c1,c2,c3) \ | |
334 | + lg %r7,ai*8(%r3); \ | |
335 | + mlg %r6,bi*8(%r4); \ | |
336 | + algr c1,%r7; \ | |
337 | + alcgr c2,%r6; \ | |
338 | + alcgr c3,zero | |
339 | +// r[0..15] = a[0..7]*b[0..7], computed one result column (fixed ai+bi) at a time. | |
340 | +// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4); | |
341 | +.globl bn_mul_comba8 | |
342 | +.type bn_mul_comba8,@function | |
343 | +.align 4 | |
344 | +bn_mul_comba8: | |
345 | + stmg %r6,%r8,48(%r15) // save %r6-%r8 (%r6/%r7 product pair, %r8 is c3) | |
346 | + | |
347 | + lghi c1,0 | |
348 | + lghi c2,0 | |
349 | + lghi c3,0 | |
350 | + lghi zero,0 | |
351 | + | |
352 | + mul_add_c(0,0,c1,c2,c3); | |
353 | + stg c1,0*8(%r2) // r[0] | |
354 | + lghi c1,0 // freed word becomes the new top of the accumulator | |
355 | + | |
356 | + mul_add_c(0,1,c2,c3,c1); | |
357 | + mul_add_c(1,0,c2,c3,c1); | |
358 | + stg c2,1*8(%r2) // r[1] | |
359 | + lghi c2,0 | |
360 | + | |
361 | + mul_add_c(2,0,c3,c1,c2); | |
362 | + mul_add_c(1,1,c3,c1,c2); | |
363 | + mul_add_c(0,2,c3,c1,c2); | |
364 | + stg c3,2*8(%r2) // r[2] | |
365 | + lghi c3,0 | |
366 | + | |
367 | + mul_add_c(0,3,c1,c2,c3); | |
368 | + mul_add_c(1,2,c1,c2,c3); | |
369 | + mul_add_c(2,1,c1,c2,c3); | |
370 | + mul_add_c(3,0,c1,c2,c3); | |
371 | + stg c1,3*8(%r2) // r[3] | |
372 | + lghi c1,0 | |
373 | + | |
374 | + mul_add_c(4,0,c2,c3,c1); | |
375 | + mul_add_c(3,1,c2,c3,c1); | |
376 | + mul_add_c(2,2,c2,c3,c1); | |
377 | + mul_add_c(1,3,c2,c3,c1); | |
378 | + mul_add_c(0,4,c2,c3,c1); | |
379 | + stg c2,4*8(%r2) // r[4] | |
380 | + lghi c2,0 | |
381 | + | |
382 | + mul_add_c(0,5,c3,c1,c2); | |
383 | + mul_add_c(1,4,c3,c1,c2); | |
384 | + mul_add_c(2,3,c3,c1,c2); | |
385 | + mul_add_c(3,2,c3,c1,c2); | |
386 | + mul_add_c(4,1,c3,c1,c2); | |
387 | + mul_add_c(5,0,c3,c1,c2); | |
388 | + stg c3,5*8(%r2) // r[5] | |
389 | + lghi c3,0 | |
390 | + | |
391 | + mul_add_c(6,0,c1,c2,c3); | |
392 | + mul_add_c(5,1,c1,c2,c3); | |
393 | + mul_add_c(4,2,c1,c2,c3); | |
394 | + mul_add_c(3,3,c1,c2,c3); | |
395 | + mul_add_c(2,4,c1,c2,c3); | |
396 | + mul_add_c(1,5,c1,c2,c3); | |
397 | + mul_add_c(0,6,c1,c2,c3); | |
398 | + stg c1,6*8(%r2) // r[6] | |
399 | + lghi c1,0 | |
400 | + | |
401 | + mul_add_c(0,7,c2,c3,c1); | |
402 | + mul_add_c(1,6,c2,c3,c1); | |
403 | + mul_add_c(2,5,c2,c3,c1); | |
404 | + mul_add_c(3,4,c2,c3,c1); | |
405 | + mul_add_c(4,3,c2,c3,c1); | |
406 | + mul_add_c(5,2,c2,c3,c1); | |
407 | + mul_add_c(6,1,c2,c3,c1); | |
408 | + mul_add_c(7,0,c2,c3,c1); | |
409 | + stg c2,7*8(%r2) // r[7] | |
410 | + lghi c2,0 | |
411 | + | |
412 | + mul_add_c(7,1,c3,c1,c2); | |
413 | + mul_add_c(6,2,c3,c1,c2); | |
414 | + mul_add_c(5,3,c3,c1,c2); | |
415 | + mul_add_c(4,4,c3,c1,c2); | |
416 | + mul_add_c(3,5,c3,c1,c2); | |
417 | + mul_add_c(2,6,c3,c1,c2); | |
418 | + mul_add_c(1,7,c3,c1,c2); | |
419 | + stg c3,8*8(%r2) // r[8] | |
420 | + lghi c3,0 | |
421 | + | |
422 | + mul_add_c(2,7,c1,c2,c3); | |
423 | + mul_add_c(3,6,c1,c2,c3); | |
424 | + mul_add_c(4,5,c1,c2,c3); | |
425 | + mul_add_c(5,4,c1,c2,c3); | |
426 | + mul_add_c(6,3,c1,c2,c3); | |
427 | + mul_add_c(7,2,c1,c2,c3); | |
428 | + stg c1,9*8(%r2) // r[9] | |
429 | + lghi c1,0 | |
430 | + | |
431 | + mul_add_c(7,3,c2,c3,c1); | |
432 | + mul_add_c(6,4,c2,c3,c1); | |
433 | + mul_add_c(5,5,c2,c3,c1); | |
434 | + mul_add_c(4,6,c2,c3,c1); | |
435 | + mul_add_c(3,7,c2,c3,c1); | |
436 | + stg c2,10*8(%r2) // r[10] | |
437 | + lghi c2,0 | |
438 | + | |
439 | + mul_add_c(4,7,c3,c1,c2); | |
440 | + mul_add_c(5,6,c3,c1,c2); | |
441 | + mul_add_c(6,5,c3,c1,c2); | |
442 | + mul_add_c(7,4,c3,c1,c2); | |
443 | + stg c3,11*8(%r2) // r[11] | |
444 | + lghi c3,0 | |
445 | + | |
446 | + mul_add_c(7,5,c1,c2,c3); | |
447 | + mul_add_c(6,6,c1,c2,c3); | |
448 | + mul_add_c(5,7,c1,c2,c3); | |
449 | + stg c1,12*8(%r2) // r[12] | |
450 | + lghi c1,0 | |
451 | + | |
452 | + | |
453 | + mul_add_c(6,7,c2,c3,c1); | |
454 | + mul_add_c(7,6,c2,c3,c1); | |
455 | + stg c2,13*8(%r2) // r[13] | |
456 | + lghi c2,0 | |
457 | + | |
458 | + mul_add_c(7,7,c3,c1,c2); | |
459 | + stg c3,14*8(%r2) // r[14] | |
460 | + stg c1,15*8(%r2) // r[15], the last carry word | |
461 | + | |
462 | + lmg %r6,%r8,48(%r15) // restore %r6-%r8 | |
463 | + br %r14 | |
464 | +.size bn_mul_comba8,.-bn_mul_comba8 | |
465 | + | |
466 | +// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4); | |
467 | +.globl bn_mul_comba4 | |
468 | +.type bn_mul_comba4,@function | |
469 | +.align 4 | |
470 | +bn_mul_comba4: | |
471 | + stmg %r6,%r8,48(%r15) | |
472 | + // 4x4-word comba multiply; %r6-%r8 were saved above, all eight result words go through %r2 | |
473 | + lghi c1,0 | |
474 | + lghi c2,0 | |
475 | + lghi c3,0 | |
476 | + lghi zero,0 | |
477 | + // word 0: product (0,0) | |
478 | + mul_add_c(0,0,c1,c2,c3); | |
479 | + stg c1,0*8(%r2) | |
480 | + lghi c1,0 | |
481 | + // word 1: products whose indices sum to 1 (accumulator registers rotate each column) | |
482 | + mul_add_c(0,1,c2,c3,c1); | |
483 | + mul_add_c(1,0,c2,c3,c1); | |
484 | + stg c2,1*8(%r2) | |
485 | + lghi c2,0 | |
486 | + // word 2: products whose indices sum to 2 | |
487 | + mul_add_c(2,0,c3,c1,c2); | |
488 | + mul_add_c(1,1,c3,c1,c2); | |
489 | + mul_add_c(0,2,c3,c1,c2); | |
490 | + stg c3,2*8(%r2) | |
491 | + lghi c3,0 | |
492 | + // word 3: products whose indices sum to 3 | |
493 | + mul_add_c(0,3,c1,c2,c3); | |
494 | + mul_add_c(1,2,c1,c2,c3); | |
495 | + mul_add_c(2,1,c1,c2,c3); | |
496 | + mul_add_c(3,0,c1,c2,c3); | |
497 | + stg c1,3*8(%r2) | |
498 | + lghi c1,0 | |
499 | + // word 4: products whose indices sum to 4 | |
500 | + mul_add_c(3,1,c2,c3,c1); | |
501 | + mul_add_c(2,2,c2,c3,c1); | |
502 | + mul_add_c(1,3,c2,c3,c1); | |
503 | + stg c2,4*8(%r2) | |
504 | + lghi c2,0 | |
505 | + // word 5: products whose indices sum to 5 | |
506 | + mul_add_c(2,3,c3,c1,c2); | |
507 | + mul_add_c(3,2,c3,c1,c2); | |
508 | + stg c3,5*8(%r2) | |
509 | + lghi c3,0 | |
510 | + // words 6 and 7: product (3,3) plus the final carry word | |
511 | + mul_add_c(3,3,c1,c2,c3); | |
512 | + stg c1,6*8(%r2) | |
513 | + stg c2,7*8(%r2) | |
514 | + // reload the registers stored by the stmg on entry before returning | |
515 | + lmg %r6,%r8,48(%r15) | |
516 | + br %r14 | |
517 | +.size bn_mul_comba4,.-bn_mul_comba4 | |
518 | + | |
519 | +#define sqr_add_c(ai,c1,c2,c3) /* (c3:c2:c1) += a[ai]^2, a[] read through %r3 */ \ | |
520 | + lg %r7,ai*8(%r3); /* %r7 = a[ai] */ \ | |
521 | + mlgr %r6,%r7; /* %r6:%r7 = %r7 * %r7 (high:low) */ \ | |
522 | + algr c1,%r7; /* add low word, sets carry */ \ | |
523 | + alcgr c2,%r6; /* add high word plus carry */ \ | |
524 | + alcgr c3,zero | |
525 | +/* sqr_add_c2: (c3:c2:c1) += 2*a[ai]*a[aj]; the product is formed once and added twice */ | |
526 | +#define sqr_add_c2(ai,aj,c1,c2,c3) \ | |
527 | + lg %r7,ai*8(%r3); /* %r7 = a[ai] */ \ | |
528 | + mlg %r6,aj*8(%r3); /* %r6:%r7 = a[ai] * a[aj] (high:low) */ \ | |
529 | + algr c1,%r7; /* first addition of the product */ \ | |
530 | + alcgr c2,%r6; \ | |
531 | + alcgr c3,zero; \ | |
532 | + algr c1,%r7; /* second addition of the same product */ \ | |
533 | + alcgr c2,%r6; \ | |
534 | + alcgr c3,zero | |
535 | + | |
536 | +// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3); | |
537 | +.globl bn_sqr_comba8 | |
538 | +.type bn_sqr_comba8,@function | |
539 | +.align 4 | |
540 | +bn_sqr_comba8: | |
541 | + stmg %r6,%r8,48(%r15) | |
542 | + // Squares words 0..7 read through %r3 and stores sixteen result words through %r2. | |
543 | + lghi c1,0 | |
544 | + lghi c2,0 | |
545 | + lghi c3,0 | |
546 | + lghi zero,0 | |
547 | + // word 0: a[0]^2 | |
548 | + sqr_add_c(0,c1,c2,c3); | |
549 | + stg c1,0*8(%r2) | |
550 | + lghi c1,0 | |
551 | + // word 1: 2*a[1]*a[0] (accumulator registers rotate each column) | |
552 | + sqr_add_c2(1,0,c2,c3,c1); | |
553 | + stg c2,1*8(%r2) | |
554 | + lghi c2,0 | |
555 | + // word 2: a[1]^2 + 2*a[2]*a[0] | |
556 | + sqr_add_c(1,c3,c1,c2); | |
557 | + sqr_add_c2(2,0,c3,c1,c2); | |
558 | + stg c3,2*8(%r2) | |
559 | + lghi c3,0 | |
560 | + // word 3: doubled cross products with index sum 3 | |
561 | + sqr_add_c2(3,0,c1,c2,c3); | |
562 | + sqr_add_c2(2,1,c1,c2,c3); | |
563 | + stg c1,3*8(%r2) | |
564 | + lghi c1,0 | |
565 | + // word 4: a[2]^2 + doubled cross products with index sum 4 | |
566 | + sqr_add_c(2,c2,c3,c1); | |
567 | + sqr_add_c2(3,1,c2,c3,c1); | |
568 | + sqr_add_c2(4,0,c2,c3,c1); | |
569 | + stg c2,4*8(%r2) | |
570 | + lghi c2,0 | |
571 | + // word 5: doubled cross products with index sum 5 | |
572 | + sqr_add_c2(5,0,c3,c1,c2); | |
573 | + sqr_add_c2(4,1,c3,c1,c2); | |
574 | + sqr_add_c2(3,2,c3,c1,c2); | |
575 | + stg c3,5*8(%r2) | |
576 | + lghi c3,0 | |
577 | + // word 6: a[3]^2 + doubled cross products with index sum 6 | |
578 | + sqr_add_c(3,c1,c2,c3); | |
579 | + sqr_add_c2(4,2,c1,c2,c3); | |
580 | + sqr_add_c2(5,1,c1,c2,c3); | |
581 | + sqr_add_c2(6,0,c1,c2,c3); | |
582 | + stg c1,6*8(%r2) | |
583 | + lghi c1,0 | |
584 | + // word 7: doubled cross products with index sum 7 | |
585 | + sqr_add_c2(7,0,c2,c3,c1); | |
586 | + sqr_add_c2(6,1,c2,c3,c1); | |
587 | + sqr_add_c2(5,2,c2,c3,c1); | |
588 | + sqr_add_c2(4,3,c2,c3,c1); | |
589 | + stg c2,7*8(%r2) | |
590 | + lghi c2,0 | |
591 | + // word 8: a[4]^2 + doubled cross products with index sum 8 | |
592 | + sqr_add_c(4,c3,c1,c2); | |
593 | + sqr_add_c2(5,3,c3,c1,c2); | |
594 | + sqr_add_c2(6,2,c3,c1,c2); | |
595 | + sqr_add_c2(7,1,c3,c1,c2); | |
596 | + stg c3,8*8(%r2) | |
597 | + lghi c3,0 | |
598 | + // word 9: doubled cross products with index sum 9 | |
599 | + sqr_add_c2(7,2,c1,c2,c3); | |
600 | + sqr_add_c2(6,3,c1,c2,c3); | |
601 | + sqr_add_c2(5,4,c1,c2,c3); | |
602 | + stg c1,9*8(%r2) | |
603 | + lghi c1,0 | |
604 | + // word 10: a[5]^2 + doubled cross products with index sum 10 | |
605 | + sqr_add_c(5,c2,c3,c1); | |
606 | + sqr_add_c2(6,4,c2,c3,c1); | |
607 | + sqr_add_c2(7,3,c2,c3,c1); | |
608 | + stg c2,10*8(%r2) | |
609 | + lghi c2,0 | |
610 | + // word 11: doubled cross products with index sum 11 | |
611 | + sqr_add_c2(7,4,c3,c1,c2); | |
612 | + sqr_add_c2(6,5,c3,c1,c2); | |
613 | + stg c3,11*8(%r2) | |
614 | + lghi c3,0 | |
615 | + // word 12: a[6]^2 + 2*a[7]*a[5] | |
616 | + sqr_add_c(6,c1,c2,c3); | |
617 | + sqr_add_c2(7,5,c1,c2,c3); | |
618 | + stg c1,12*8(%r2) | |
619 | + lghi c1,0 | |
620 | + // word 13: 2*a[7]*a[6] | |
621 | + sqr_add_c2(7,6,c2,c3,c1); | |
622 | + stg c2,13*8(%r2) | |
623 | + lghi c2,0 | |
624 | + // words 14 and 15: a[7]^2 plus the final carry word | |
625 | + sqr_add_c(7,c3,c1,c2); | |
626 | + stg c3,14*8(%r2) | |
627 | + stg c1,15*8(%r2) | |
628 | + // reload the registers stored by the stmg on entry | |
629 | + lmg %r6,%r8,48(%r15) | |
630 | + br %r14 | |
631 | +.size bn_sqr_comba8,.-bn_sqr_comba8 | |
632 | + | |
633 | +// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3); | |
634 | +.globl bn_sqr_comba4 | |
635 | +.type bn_sqr_comba4,@function | |
636 | +.align 4 | |
637 | +bn_sqr_comba4: | |
638 | + stmg %r6,%r8,48(%r15) | |
639 | + // Squares words 0..3 read through %r3 and stores eight result words through %r2. | |
640 | + lghi c1,0 | |
641 | + lghi c2,0 | |
642 | + lghi c3,0 | |
643 | + lghi zero,0 | |
644 | + // word 0: a[0]^2 | |
645 | + sqr_add_c(0,c1,c2,c3); | |
646 | + stg c1,0*8(%r2) | |
647 | + lghi c1,0 | |
648 | + // word 1: 2*a[1]*a[0] (accumulator registers rotate each column) | |
649 | + sqr_add_c2(1,0,c2,c3,c1); | |
650 | + stg c2,1*8(%r2) | |
651 | + lghi c2,0 | |
652 | + // word 2: a[1]^2 + 2*a[2]*a[0] | |
653 | + sqr_add_c(1,c3,c1,c2); | |
654 | + sqr_add_c2(2,0,c3,c1,c2); | |
655 | + stg c3,2*8(%r2) | |
656 | + lghi c3,0 | |
657 | + // word 3: 2*a[3]*a[0] + 2*a[2]*a[1] | |
658 | + sqr_add_c2(3,0,c1,c2,c3); | |
659 | + sqr_add_c2(2,1,c1,c2,c3); | |
660 | + stg c1,3*8(%r2) | |
661 | + lghi c1,0 | |
662 | + // word 4: a[2]^2 + 2*a[3]*a[1] | |
663 | + sqr_add_c(2,c2,c3,c1); | |
664 | + sqr_add_c2(3,1,c2,c3,c1); | |
665 | + stg c2,4*8(%r2) | |
666 | + lghi c2,0 | |
667 | + // word 5: 2*a[3]*a[2] | |
668 | + sqr_add_c2(3,2,c3,c1,c2); | |
669 | + stg c3,5*8(%r2) | |
670 | + lghi c3,0 | |
671 | + // words 6 and 7: a[3]^2 plus the final carry word | |
672 | + sqr_add_c(3,c1,c2,c3); | |
673 | + stg c1,6*8(%r2) | |
674 | + stg c2,7*8(%r2) | |
675 | + // reload the registers stored by the stmg on entry | |
676 | + lmg %r6,%r8,48(%r15) | |
677 | + br %r14 | |
678 | +.size bn_sqr_comba4,.-bn_sqr_comba4 |
@@ -0,0 +1,606 @@ | ||
1 | +#!/usr/bin/env perl | |
2 | + | |
3 | +# ==================================================================== | |
4 | +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
5 | +# project. The module is, however, dual licensed under OpenSSL and | |
6 | +# CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | +# details see http://www.openssl.org/~appro/cryptogams/. | |
8 | +# ==================================================================== | |
9 | + | |
10 | +# December 2005 | |
11 | +# | |
12 | +# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons | |
13 | +# for undertaking the effort are multiple. First of all, UltraSPARC is not | |
14 | +# the whole SPARCv9 universe and other VIS-free implementations deserve | |
15 | +# optimized code as much. Secondly, newly introduced UltraSPARC T1, | |
16 | +# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths, | |
17 | +# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with | |
18 | +# several integrated RSA/DSA accelerator circuits accessible through | |
19 | +# kernel driver [only(*)], but having decent user-land software | |
20 | +# implementation is important too. Finally, reasons like desire to | |
21 | +# experiment with dedicated squaring procedure. Yes, this module | |
22 | +# implements one, because it was easiest to draft it in SPARCv9 | |
23 | +# instructions... | |
24 | + | |
25 | +# (*) Engine accessing the driver in question is on my TODO list. | |
26 | +# For reference, accelerator is estimated to give 6 to 10 times | |
27 | +# improvement on single-threaded RSA sign. It should be noted | |
28 | +# that 6-10x improvement coefficient does not actually mean | |
29 | +# something extraordinary in terms of absolute [single-threaded] | |
30 | +# performance, as SPARCv9 instruction set is by all means least | |
31 | +# suitable for high performance crypto among other 64 bit | |
32 | +# platforms. 6-10x factor simply places T1 in same performance | |
33 | +# domain as say AMD64 and IA-64. Improvement of RSA verify doesn't | |
34 | +# appear impressive at all, but it's the sign operation which is | |
35 | +# far more critical/interesting. | |
36 | + | |
37 | +# You might notice that inner loops are modulo-scheduled:-) This has | |
38 | +# essentially negligible impact on UltraSPARC performance, it's | |
39 | +# Fujitsu SPARC64 V users who should notice and hopefully appreciate | |
40 | +# the advantage... Currently this module surpasses sparcv9a-mont.pl | |
41 | +# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a | |
42 | +# module still has hidden potential [see TODO list there], which is | |
43 | +# estimated to be larger than 20%... | |
44 | + | |
45 | +# int bn_mul_mont( | |
46 | +$rp="%i0"; # BN_ULONG *rp, | |
47 | +$ap="%i1"; # const BN_ULONG *ap, | |
48 | +$bp="%i2"; # const BN_ULONG *bp, | |
49 | +$np="%i3"; # const BN_ULONG *np, | |
50 | +$n0="%i4"; # const BN_ULONG *n0, | |
51 | +$num="%i5"; # int num); | |
52 | +# Target selection: 32-bit output by default, 64-bit when -m64 or -xarch=v9 is among the arguments. | |
53 | +$bits=32; | |
54 | +for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } | |
55 | +if ($bits==64) { $bias=2047; $frame=192; } # $bias is added to %sp to address the stack; $frame is the size passed to save | |
56 | +else { $bias=0; $frame=128; } | |
57 | +# Scratch registers used by the generated code. | |
58 | +$car0="%o0"; # running carry of the ap[j]*mul0 chain | |
59 | +$car1="%o1"; # running carry of the np[j]*mul1 chain; its low word is stored to tp[] | |
60 | +$car2="%o2"; # 1 bit | |
61 | +$acc0="%o3"; # ap-side product consumed in the current iteration | |
62 | +$acc1="%o4"; # np-side product consumed in the current iteration | |
63 | +$mask="%g1"; # 32 bits, what a waste... | |
64 | +$tmp0="%g4"; # product computed one step ahead, moved into $acc0 | |
65 | +$tmp1="%g5"; # product computed one step ahead, moved into $acc1 | |
66 | +# Loop state; $i and $j are byte offsets (they advance by 4 and are compared against num*4). | |
67 | +$i="%l0"; | |
68 | +$j="%l1"; | |
69 | +$mul0="%l2"; # current multiplier word (bp[i], or ap[i] in the squaring path) | |
70 | +$mul1="%l3"; # "t[0]"*n0 masked to 32 bits | |
71 | +$tp="%l4"; # pointer into the temporary vector on the stack | |
72 | +$apj="%l5"; # ap[j] | |
73 | +$npj="%l6"; # np[j] | |
74 | +$tpj="%l7"; # tp[j] | |
75 | +# Name of the emitted global symbol. | |
76 | +$fname="bn_mul_mont_int"; | |
77 | + | |
78 | +$code=<<___; | |
79 | +.section ".text",#alloc,#execinstr | |
80 | + | |
81 | +.global $fname | |
82 | +.align 32 | |
83 | +$fname: | |
84 | + cmp %o5,4 ! 128 bits minimum | |
85 | + bge,pt %icc,.Lenter | |
86 | + sethi %hi(0xffffffff),$mask | |
87 | + retl | |
88 | + clr %o0 | |
89 | +.align 32 | |
90 | +.Lenter: | |
91 | + save %sp,-$frame,%sp | |
92 | + sll $num,2,$num ! num*=4 | |
93 | + or $mask,%lo(0xffffffff),$mask | |
94 | + ld [$n0],$n0 | |
95 | + cmp $ap,$bp | |
96 | + and $num,$mask,$num | |
97 | + ld [$bp],$mul0 ! bp[0] | |
98 | + nop | |
99 | + | |
100 | + add %sp,$bias,%o7 ! real top of stack | |
101 | + ld [$ap],$car0 ! ap[0] ! redundant in squaring context | |
102 | + sub %o7,$num,%o7 | |
103 | + ld [$ap+4],$apj ! ap[1] | |
104 | + and %o7,-1024,%o7 | |
105 | + ld [$np],$car1 ! np[0] | |
106 | + sub %o7,$bias,%sp ! alloca | |
107 | + ld [$np+4],$npj ! np[1] | |
108 | + be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont | |
109 | + mov 12,$j | |
110 | + | |
111 | + mulx $car0,$mul0,$car0 ! ap[0]*bp[0] | |
112 | + mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0] | |
113 | + and $car0,$mask,$acc0 | |
114 | + add %sp,$bias+$frame,$tp | |
115 | + ld [$ap+8],$apj !prologue! | |
116 | + | |
117 | + mulx $n0,$acc0,$mul1 ! "t[0]"*n0 | |
118 | + and $mul1,$mask,$mul1 | |
119 | + | |
120 | + mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 | |
121 | + mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0 | |
122 | + srlx $car0,32,$car0 | |
123 | + add $acc0,$car1,$car1 | |
124 | + ld [$np+8],$npj !prologue! | |
125 | + srlx $car1,32,$car1 | |
126 | + mov $tmp0,$acc0 !prologue! | |
127 | + | |
128 | +.L1st: | |
129 | + mulx $apj,$mul0,$tmp0 | |
130 | + mulx $npj,$mul1,$tmp1 | |
131 | + add $acc0,$car0,$car0 | |
132 | + ld [$ap+$j],$apj ! ap[j] | |
133 | + and $car0,$mask,$acc0 | |
134 | + add $acc1,$car1,$car1 | |
135 | + ld [$np+$j],$npj ! np[j] | |
136 | + srlx $car0,32,$car0 | |
137 | + add $acc0,$car1,$car1 | |
138 | + add $j,4,$j ! j++ | |
139 | + mov $tmp0,$acc0 | |
140 | + st $car1,[$tp] | |
141 | + cmp $j,$num | |
142 | + mov $tmp1,$acc1 | |
143 | + srlx $car1,32,$car1 | |
144 | + bl %icc,.L1st | |
145 | + add $tp,4,$tp ! tp++ | |
146 | +!.L1st | |
147 | + | |
148 | + mulx $apj,$mul0,$tmp0 !epilogue! | |
149 | + mulx $npj,$mul1,$tmp1 | |
150 | + add $acc0,$car0,$car0 | |
151 | + and $car0,$mask,$acc0 | |
152 | + add $acc1,$car1,$car1 | |
153 | + srlx $car0,32,$car0 | |
154 | + add $acc0,$car1,$car1 | |
155 | + st $car1,[$tp] | |
156 | + srlx $car1,32,$car1 | |
157 | + | |
158 | + add $tmp0,$car0,$car0 | |
159 | + and $car0,$mask,$acc0 | |
160 | + add $tmp1,$car1,$car1 | |
161 | + srlx $car0,32,$car0 | |
162 | + add $acc0,$car1,$car1 | |
163 | + st $car1,[$tp+4] | |
164 | + srlx $car1,32,$car1 | |
165 | + | |
166 | + add $car0,$car1,$car1 | |
167 | + st $car1,[$tp+8] | |
168 | + srlx $car1,32,$car2 | |
169 | + | |
170 | + mov 4,$i ! i++ | |
171 | + ld [$bp+4],$mul0 ! bp[1] | |
172 | +.Louter: | |
173 | + add %sp,$bias+$frame,$tp | |
174 | + ld [$ap],$car0 ! ap[0] | |
175 | + ld [$ap+4],$apj ! ap[1] | |
176 | + ld [$np],$car1 ! np[0] | |
177 | + ld [$np+4],$npj ! np[1] | |
178 | + ld [$tp],$tmp1 ! tp[0] | |
179 | + ld [$tp+4],$tpj ! tp[1] | |
180 | + mov 12,$j | |
181 | + | |
182 | + mulx $car0,$mul0,$car0 | |
183 | + mulx $apj,$mul0,$tmp0 !prologue! | |
184 | + add $tmp1,$car0,$car0 | |
185 | + ld [$ap+8],$apj !prologue! | |
186 | + and $car0,$mask,$acc0 | |
187 | + | |
188 | + mulx $n0,$acc0,$mul1 | |
189 | + and $mul1,$mask,$mul1 | |
190 | + | |
191 | + mulx $car1,$mul1,$car1 | |
192 | + mulx $npj,$mul1,$acc1 !prologue! | |
193 | + srlx $car0,32,$car0 | |
194 | + add $acc0,$car1,$car1 | |
195 | + ld [$np+8],$npj !prologue! | |
196 | + srlx $car1,32,$car1 | |
197 | + mov $tmp0,$acc0 !prologue! | |
198 | + | |
199 | +.Linner: | |
200 | + mulx $apj,$mul0,$tmp0 | |
201 | + mulx $npj,$mul1,$tmp1 | |
202 | + add $tpj,$car0,$car0 | |
203 | + ld [$ap+$j],$apj ! ap[j] | |
204 | + add $acc0,$car0,$car0 | |
205 | + add $acc1,$car1,$car1 | |
206 | + ld [$np+$j],$npj ! np[j] | |
207 | + and $car0,$mask,$acc0 | |
208 | + ld [$tp+8],$tpj ! tp[j] | |
209 | + srlx $car0,32,$car0 | |
210 | + add $acc0,$car1,$car1 | |
211 | + add $j,4,$j ! j++ | |
212 | + mov $tmp0,$acc0 | |
213 | + st $car1,[$tp] ! tp[j-1] | |
214 | + srlx $car1,32,$car1 | |
215 | + mov $tmp1,$acc1 | |
216 | + cmp $j,$num | |
217 | + bl %icc,.Linner | |
218 | + add $tp,4,$tp ! tp++ | |
219 | +!.Linner | |
220 | + | |
221 | + mulx $apj,$mul0,$tmp0 !epilogue! | |
222 | + mulx $npj,$mul1,$tmp1 | |
223 | + add $tpj,$car0,$car0 | |
224 | + add $acc0,$car0,$car0 | |
225 | + ld [$tp+8],$tpj ! tp[j] | |
226 | + and $car0,$mask,$acc0 | |
227 | + add $acc1,$car1,$car1 | |
228 | + srlx $car0,32,$car0 | |
229 | + add $acc0,$car1,$car1 | |
230 | + st $car1,[$tp] ! tp[j-1] | |
231 | + srlx $car1,32,$car1 | |
232 | + | |
233 | + add $tpj,$car0,$car0 | |
234 | + add $tmp0,$car0,$car0 | |
235 | + and $car0,$mask,$acc0 | |
236 | + add $tmp1,$car1,$car1 | |
237 | + add $acc0,$car1,$car1 | |
238 | + st $car1,[$tp+4] ! tp[j-1] | |
239 | + srlx $car0,32,$car0 | |
240 | + add $i,4,$i ! i++ | |
241 | + srlx $car1,32,$car1 | |
242 | + | |
243 | + add $car0,$car1,$car1 | |
244 | + cmp $i,$num | |
245 | + add $car2,$car1,$car1 | |
246 | + st $car1,[$tp+8] | |
247 | + | |
248 | + srlx $car1,32,$car2 | |
249 | + bl,a %icc,.Louter | |
250 | + ld [$bp+$i],$mul0 ! bp[i] | |
251 | +!.Louter | |
252 | + | |
253 | + add $tp,12,$tp | |
254 | + | |
255 | +.Ltail: | |
256 | + add $np,$num,$np | |
257 | + add $rp,$num,$rp | |
258 | + mov $tp,$ap | |
259 | + sub %g0,$num,%o7 ! k=-num | |
260 | + ba .Lsub | |
261 | + subcc %g0,%g0,%g0 ! clear %icc.c | |
262 | +.align 16 | |
263 | +.Lsub: | |
264 | + ld [$tp+%o7],%o0 | |
265 | + ld [$np+%o7],%o1 | |
266 | + subccc %o0,%o1,%o1 ! tp[j]-np[j] | |
267 | + add $rp,%o7,$i | |
268 | + add %o7,4,%o7 | |
269 | + brnz %o7,.Lsub | |
270 | + st %o1,[$i] | |
271 | + subc $car2,0,$car2 ! handle upmost overflow bit | |
272 | + and $tp,$car2,$ap | |
273 | + andn $rp,$car2,$np | |
274 | + or $ap,$np,$ap | |
275 | + sub %g0,$num,%o7 | |
276 | + | |
277 | +.Lcopy: | |
278 | + ld [$ap+%o7],%o0 ! copy or in-place refresh | |
279 | + st %g0,[$tp+%o7] ! zap tp | |
280 | + st %o0,[$rp+%o7] | |
281 | + add %o7,4,%o7 | |
282 | + brnz %o7,.Lcopy | |
283 | + nop | |
284 | + mov 1,%i0 | |
285 | + ret | |
286 | + restore | |
287 | +___ | |
288 | + | |
289 | +######## | |
290 | +######## .Lbn_sqr_mont gives up to 20% *overall* improvement over | |
291 | +######## code without following dedicated squaring procedure. | |
292 | +######## | |
293 | +$sbit="%i2"; # re-use $bp! (the squaring path is taken only when ap==bp); carries the bit shifted out when a word is doubled | |
294 | + | |
295 | +$code.=<<___; | |
296 | +.align 32 | |
297 | +.Lbn_sqr_mont: | |
298 | + mulx $mul0,$mul0,$car0 ! ap[0]*ap[0] | |
299 | + mulx $apj,$mul0,$tmp0 !prologue! | |
300 | + and $car0,$mask,$acc0 | |
301 | + add %sp,$bias+$frame,$tp | |
302 | + ld [$ap+8],$apj !prologue! | |
303 | + | |
304 | + mulx $n0,$acc0,$mul1 ! "t[0]"*n0 | |
305 | + srlx $car0,32,$car0 | |
306 | + and $mul1,$mask,$mul1 | |
307 | + | |
308 | + mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0 | |
309 | + mulx $npj,$mul1,$acc1 !prologue! | |
310 | + and $car0,1,$sbit | |
311 | + ld [$np+8],$npj !prologue! | |
312 | + srlx $car0,1,$car0 | |
313 | + add $acc0,$car1,$car1 | |
314 | + srlx $car1,32,$car1 | |
315 | + mov $tmp0,$acc0 !prologue! | |
316 | + | |
317 | +.Lsqr_1st: | |
318 | + mulx $apj,$mul0,$tmp0 | |
319 | + mulx $npj,$mul1,$tmp1 | |
320 | + add $acc0,$car0,$car0 ! ap[j]*a0+c0 | |
321 | + add $acc1,$car1,$car1 | |
322 | + ld [$ap+$j],$apj ! ap[j] | |
323 | + and $car0,$mask,$acc0 | |
324 | + ld [$np+$j],$npj ! np[j] | |
325 | + srlx $car0,32,$car0 | |
326 | + add $acc0,$acc0,$acc0 | |
327 | + or $sbit,$acc0,$acc0 | |
328 | + mov $tmp1,$acc1 | |
329 | + srlx $acc0,32,$sbit | |
330 | + add $j,4,$j ! j++ | |
331 | + and $acc0,$mask,$acc0 | |
332 | + cmp $j,$num | |
333 | + add $acc0,$car1,$car1 | |
334 | + st $car1,[$tp] | |
335 | + mov $tmp0,$acc0 | |
336 | + srlx $car1,32,$car1 | |
337 | + bl %icc,.Lsqr_1st | |
338 | + add $tp,4,$tp ! tp++ | |
339 | +!.Lsqr_1st | |
340 | + | |
341 | + mulx $apj,$mul0,$tmp0 ! epilogue | |
342 | + mulx $npj,$mul1,$tmp1 | |
343 | + add $acc0,$car0,$car0 ! ap[j]*a0+c0 | |
344 | + add $acc1,$car1,$car1 | |
345 | + and $car0,$mask,$acc0 | |
346 | + srlx $car0,32,$car0 | |
347 | + add $acc0,$acc0,$acc0 | |
348 | + or $sbit,$acc0,$acc0 | |
349 | + srlx $acc0,32,$sbit | |
350 | + and $acc0,$mask,$acc0 | |
351 | + add $acc0,$car1,$car1 | |
352 | + st $car1,[$tp] | |
353 | + srlx $car1,32,$car1 | |
354 | + | |
355 | + add $tmp0,$car0,$car0 ! ap[j]*a0+c0 | |
356 | + add $tmp1,$car1,$car1 | |
357 | + and $car0,$mask,$acc0 | |
358 | + srlx $car0,32,$car0 | |
359 | + add $acc0,$acc0,$acc0 | |
360 | + or $sbit,$acc0,$acc0 | |
361 | + srlx $acc0,32,$sbit | |
362 | + and $acc0,$mask,$acc0 | |
363 | + add $acc0,$car1,$car1 | |
364 | + st $car1,[$tp+4] | |
365 | + srlx $car1,32,$car1 | |
366 | + | |
367 | + add $car0,$car0,$car0 | |
368 | + or $sbit,$car0,$car0 | |
369 | + add $car0,$car1,$car1 | |
370 | + st $car1,[$tp+8] | |
371 | + srlx $car1,32,$car2 | |
372 | + | |
373 | + ld [%sp+$bias+$frame],$tmp0 ! tp[0] | |
374 | + ld [%sp+$bias+$frame+4],$tmp1 ! tp[1] | |
375 | + ld [%sp+$bias+$frame+8],$tpj ! tp[2] | |
376 | + ld [$ap+4],$mul0 ! ap[1] | |
377 | + ld [$ap+8],$apj ! ap[2] | |
378 | + ld [$np],$car1 ! np[0] | |
379 | + ld [$np+4],$npj ! np[1] | |
380 | + mulx $n0,$tmp0,$mul1 | |
381 | + | |
382 | + mulx $mul0,$mul0,$car0 | |
383 | + and $mul1,$mask,$mul1 | |
384 | + | |
385 | + mulx $car1,$mul1,$car1 | |
386 | + mulx $npj,$mul1,$acc1 | |
387 | + add $tmp0,$car1,$car1 | |
388 | + and $car0,$mask,$acc0 | |
389 | + ld [$np+8],$npj ! np[2] | |
390 | + srlx $car1,32,$car1 | |
391 | + add $tmp1,$car1,$car1 | |
392 | + srlx $car0,32,$car0 | |
393 | + add $acc0,$car1,$car1 | |
394 | + and $car0,1,$sbit | |
395 | + add $acc1,$car1,$car1 | |
396 | + srlx $car0,1,$car0 | |
397 | + mov 12,$j | |
398 | + st $car1,[%sp+$bias+$frame] ! tp[0]= | |
399 | + srlx $car1,32,$car1 | |
400 | + add %sp,$bias+$frame+4,$tp | |
401 | + | |
402 | +.Lsqr_2nd: | |
403 | + mulx $apj,$mul0,$acc0 | |
404 | + mulx $npj,$mul1,$acc1 | |
405 | + add $acc0,$car0,$car0 | |
406 | + add $tpj,$car1,$car1 | |
407 | + ld [$ap+$j],$apj ! ap[j] | |
408 | + and $car0,$mask,$acc0 | |
409 | + ld [$np+$j],$npj ! np[j] | |
410 | + srlx $car0,32,$car0 | |
411 | + add $acc1,$car1,$car1 | |
412 | + ld [$tp+8],$tpj ! tp[j] | |
413 | + add $acc0,$acc0,$acc0 | |
414 | + add $j,4,$j ! j++ | |
415 | + or $sbit,$acc0,$acc0 | |
416 | + srlx $acc0,32,$sbit | |
417 | + and $acc0,$mask,$acc0 | |
418 | + cmp $j,$num | |
419 | + add $acc0,$car1,$car1 | |
420 | + st $car1,[$tp] ! tp[j-1] | |
421 | + srlx $car1,32,$car1 | |
422 | + bl %icc,.Lsqr_2nd | |
423 | + add $tp,4,$tp ! tp++ | |
424 | +!.Lsqr_2nd | |
425 | + | |
426 | + mulx $apj,$mul0,$acc0 | |
427 | + mulx $npj,$mul1,$acc1 | |
428 | + add $acc0,$car0,$car0 | |
429 | + add $tpj,$car1,$car1 | |
430 | + and $car0,$mask,$acc0 | |
431 | + srlx $car0,32,$car0 | |
432 | + add $acc1,$car1,$car1 | |
433 | + add $acc0,$acc0,$acc0 | |
434 | + or $sbit,$acc0,$acc0 | |
435 | + srlx $acc0,32,$sbit | |
436 | + and $acc0,$mask,$acc0 | |
437 | + add $acc0,$car1,$car1 | |
438 | + st $car1,[$tp] ! tp[j-1] | |
439 | + srlx $car1,32,$car1 | |
440 | + | |
441 | + add $car0,$car0,$car0 | |
442 | + or $sbit,$car0,$car0 | |
443 | + add $car0,$car1,$car1 | |
444 | + add $car2,$car1,$car1 | |
445 | + st $car1,[$tp+4] | |
446 | + srlx $car1,32,$car2 | |
447 | + | |
448 | + ld [%sp+$bias+$frame],$tmp1 ! tp[0] | |
449 | + ld [%sp+$bias+$frame+4],$tpj ! tp[1] | |
450 | + ld [$ap+8],$mul0 ! ap[2] | |
451 | + ld [$np],$car1 ! np[0] | |
452 | + ld [$np+4],$npj ! np[1] | |
453 | + mulx $n0,$tmp1,$mul1 | |
454 | + and $mul1,$mask,$mul1 | |
455 | + mov 8,$i | |
456 | + | |
457 | + mulx $mul0,$mul0,$car0 | |
458 | + mulx $car1,$mul1,$car1 | |
459 | + and $car0,$mask,$acc0 | |
460 | + add $tmp1,$car1,$car1 | |
461 | + srlx $car0,32,$car0 | |
462 | + add %sp,$bias+$frame,$tp | |
463 | + srlx $car1,32,$car1 | |
464 | + and $car0,1,$sbit | |
465 | + srlx $car0,1,$car0 | |
466 | + mov 4,$j | |
467 | + | |
468 | +.Lsqr_outer: | |
469 | +.Lsqr_inner1: | |
470 | + mulx $npj,$mul1,$acc1 | |
471 | + add $tpj,$car1,$car1 | |
472 | + add $j,4,$j | |
473 | + ld [$tp+8],$tpj | |
474 | + cmp $j,$i | |
475 | + add $acc1,$car1,$car1 | |
476 | + ld [$np+$j],$npj | |
477 | + st $car1,[$tp] | |
478 | + srlx $car1,32,$car1 | |
479 | + bl %icc,.Lsqr_inner1 | |
480 | + add $tp,4,$tp | |
481 | +!.Lsqr_inner1 | |
482 | + | |
483 | + add $j,4,$j | |
484 | + ld [$ap+$j],$apj ! ap[j] | |
485 | + mulx $npj,$mul1,$acc1 | |
486 | + add $tpj,$car1,$car1 | |
487 | + ld [$np+$j],$npj ! np[j] | |
488 | + add $acc0,$car1,$car1 | |
489 | + ld [$tp+8],$tpj ! tp[j] | |
490 | + add $acc1,$car1,$car1 | |
491 | + st $car1,[$tp] | |
492 | + srlx $car1,32,$car1 | |
493 | + | |
494 | + add $j,4,$j | |
495 | + cmp $j,$num | |
496 | + be,pn %icc,.Lsqr_no_inner2 | |
497 | + add $tp,4,$tp | |
498 | + | |
499 | +.Lsqr_inner2: | |
500 | + mulx $apj,$mul0,$acc0 | |
501 | + mulx $npj,$mul1,$acc1 | |
502 | + add $tpj,$car1,$car1 | |
503 | + add $acc0,$car0,$car0 | |
504 | + ld [$ap+$j],$apj ! ap[j] | |
505 | + and $car0,$mask,$acc0 | |
506 | + ld [$np+$j],$npj ! np[j] | |
507 | + srlx $car0,32,$car0 | |
508 | + add $acc0,$acc0,$acc0 | |
509 | + ld [$tp+8],$tpj ! tp[j] | |
510 | + or $sbit,$acc0,$acc0 | |
511 | + add $j,4,$j ! j++ | |
512 | + srlx $acc0,32,$sbit | |
513 | + and $acc0,$mask,$acc0 | |
514 | + cmp $j,$num | |
515 | + add $acc0,$car1,$car1 | |
516 | + add $acc1,$car1,$car1 | |
517 | + st $car1,[$tp] ! tp[j-1] | |
518 | + srlx $car1,32,$car1 | |
519 | + bl %icc,.Lsqr_inner2 | |
520 | + add $tp,4,$tp ! tp++ | |
521 | + | |
522 | +.Lsqr_no_inner2: | |
523 | + mulx $apj,$mul0,$acc0 | |
524 | + mulx $npj,$mul1,$acc1 | |
525 | + add $tpj,$car1,$car1 | |
526 | + add $acc0,$car0,$car0 | |
527 | + and $car0,$mask,$acc0 | |
528 | + srlx $car0,32,$car0 | |
529 | + add $acc0,$acc0,$acc0 | |
530 | + or $sbit,$acc0,$acc0 | |
531 | + srlx $acc0,32,$sbit | |
532 | + and $acc0,$mask,$acc0 | |
533 | + add $acc0,$car1,$car1 | |
534 | + add $acc1,$car1,$car1 | |
535 | + st $car1,[$tp] ! tp[j-1] | |
536 | + srlx $car1,32,$car1 | |
537 | + | |
538 | + add $car0,$car0,$car0 | |
539 | + or $sbit,$car0,$car0 | |
540 | + add $car0,$car1,$car1 | |
541 | + add $car2,$car1,$car1 | |
542 | + st $car1,[$tp+4] | |
543 | + srlx $car1,32,$car2 | |
544 | + | |
545 | + add $i,4,$i ! i++ | |
546 | + ld [%sp+$bias+$frame],$tmp1 ! tp[0] | |
547 | + ld [%sp+$bias+$frame+4],$tpj ! tp[1] | |
548 | + ld [$ap+$i],$mul0 ! ap[j] | |
549 | + ld [$np],$car1 ! np[0] | |
550 | + ld [$np+4],$npj ! np[1] | |
551 | + mulx $n0,$tmp1,$mul1 | |
552 | + and $mul1,$mask,$mul1 | |
553 | + add $i,4,$tmp0 | |
554 | + | |
555 | + mulx $mul0,$mul0,$car0 | |
556 | + mulx $car1,$mul1,$car1 | |
557 | + and $car0,$mask,$acc0 | |
558 | + add $tmp1,$car1,$car1 | |
559 | + srlx $car0,32,$car0 | |
560 | + add %sp,$bias+$frame,$tp | |
561 | + srlx $car1,32,$car1 | |
562 | + and $car0,1,$sbit | |
563 | + srlx $car0,1,$car0 | |
564 | + | |
565 | + cmp $tmp0,$num ! i<num-1 | |
566 | + bl %icc,.Lsqr_outer | |
567 | + mov 4,$j | |
568 | + | |
569 | +.Lsqr_last: | |
570 | + mulx $npj,$mul1,$acc1 | |
571 | + add $tpj,$car1,$car1 | |
572 | + add $j,4,$j | |
573 | + ld [$tp+8],$tpj | |
574 | + cmp $j,$i | |
575 | + add $acc1,$car1,$car1 | |
576 | + ld [$np+$j],$npj | |
577 | + st $car1,[$tp] | |
578 | + srlx $car1,32,$car1 | |
579 | + bl %icc,.Lsqr_last | |
580 | + add $tp,4,$tp | |
581 | +!.Lsqr_last | |
582 | + | |
583 | + mulx $npj,$mul1,$acc1 | |
584 | + add $tpj,$car1,$car1 | |
585 | + add $acc0,$car1,$car1 | |
586 | + add $acc1,$car1,$car1 | |
587 | + st $car1,[$tp] | |
588 | + srlx $car1,32,$car1 | |
589 | + | |
590 | + add $car0,$car0,$car0 ! recover $car0 | |
591 | + or $sbit,$car0,$car0 | |
592 | + add $car0,$car1,$car1 | |
593 | + add $car2,$car1,$car1 | |
594 | + st $car1,[$tp+4] | |
595 | + srlx $car1,32,$car2 | |
596 | + | |
597 | + ba .Ltail | |
598 | + add $tp,8,$tp | |
599 | +.type $fname,#function | |
600 | +.size $fname,(.-$fname) | |
601 | +.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" | |
602 | +.align 32 | |
603 | +___ | |
604 | +$code =~ s/\`([^\`]*)\`/eval($1)/gem; # replace every backquoted span with the result of evaluating it as Perl | |
605 | +print $code; # write the generated assembly to standard output | |
606 | +close STDOUT; |
@@ -0,0 +1,882 @@ | ||
1 | +#!/usr/bin/env perl | |
2 | + | |
3 | +# ==================================================================== | |
4 | +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
5 | +# project. The module is, however, dual licensed under OpenSSL and | |
6 | +# CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | +# details see http://www.openssl.org/~appro/cryptogams/. | |
8 | +# ==================================================================== | |
9 | + | |
10 | +# October 2005 | |
11 | +# | |
12 | +# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU? | |
13 | +# Because unlike integer multiplier, which simply stalls whole CPU, | |
14 | +# FPU is fully pipelined and can effectively emit 48 bit partial | |
15 | +# product every cycle. Why not blended SPARC v9? One can argue that | |
16 | +# making this module dependent on UltraSPARC VIS extension limits its | |
17 | +# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!) | |
18 | +# implementations from compatibility matrix. But the rest, whole Sun | |
19 | +# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support | |
20 | +# VIS extension instructions used in this module. This is considered | |
21 | +# good enough to not care about HAL SPARC64 users [if any] who have | |
22 | +# integer-only pure SPARCv9 module to "fall down" to. | |
23 | + | |
24 | +# USI&II cores currently exhibit uniform 2x improvement [over pre- | |
25 | +# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII | |
26 | +# performance improves a few percent for shorter keys and worsens a few | |
27 | +# percent for longer keys. This is because USIII integer multiplier | |
28 | +# is >3x faster than USI&II one, which is harder to match [but see | |
29 | +# TODO list below]. It should also be noted that SPARC64 V features | |
30 | +# out-of-order execution, which *might* mean that integer multiplier | |
31 | +# is pipelined, which in turn *might* be impossible to match... On | |
32 | +# additional note, SPARC64 V implements FP Multiply-Add instruction, | |
33 | +# which is perfectly usable in this context... In other words, as far | |
34 | +# as Fujitsu SPARC64 V goes, talk to the author:-) | |
35 | + | |
36 | +# The implementation implies following "non-natural" limitations on | |
37 | +# input arguments: | |
38 | +# - num may not be less than 4; | |
39 | +# - num has to be even; | |
40 | +# Failure to meet either condition has no fatal effects, simply | |
41 | +# doesn't give any performance gain. | |
42 | + | |
43 | +# TODO: | |
44 | +# - modulo-schedule inner loop for better performance (on in-order | |
45 | +# execution core such as UltraSPARC this shall result in further | |
46 | +# noticeable(!) improvement); | |
47 | +# - dedicated squaring procedure[?]; | |
48 | + | |
49 | +###################################################################### | |
50 | +# November 2006 | |
51 | +# | |
52 | +# Modulo-scheduled inner loops allow interleaving of floating point and | |
53 | +# integer instructions and minimize Read-After-Write penalties. This | |
54 | +# results in *further* 20-50% performance improvement [depending on | |
55 | +# key length, more for longer keys] on USI&II cores and 30-80% - on | |
56 | +# USIII&IV. | |
57 | + | |
58 | +$fname="bn_mul_mont_fpu"; | |
59 | +$bits=32; | |
60 | +for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } | |
61 | + | |
62 | +if ($bits==64) { | |
63 | + $bias=2047; | |
64 | + $frame=192; | |
65 | +} else { | |
66 | + $bias=0; | |
67 | + $frame=128; # 96 rounded up to largest known cache-line | |
68 | +} | |
69 | +$locals=64; | |
70 | + | |
71 | +# In order to provide for 32-/64-bit ABI duality, I keep integers wider | |
72 | +# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used | |
73 | +# exclusively for pointers, indexes and other small values... | |
74 | +# int bn_mul_mont( | |
75 | +$rp="%i0"; # BN_ULONG *rp, | |
76 | +$ap="%i1"; # const BN_ULONG *ap, | |
77 | +$bp="%i2"; # const BN_ULONG *bp, | |
78 | +$np="%i3"; # const BN_ULONG *np, | |
79 | +$n0="%i4"; # const BN_ULONG *n0, | |
80 | +$num="%i5"; # int num); | |
81 | + | |
82 | +$tp="%l0"; # t[num] | |
83 | +$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved | |
84 | +$ap_h="%l2"; # to these four vectors as double-precision FP values. | |
85 | +$np_l="%l3"; # This way a bunch of fxtods are eliminated in second | |
86 | +$np_h="%l4"; # loop and L1-cache aliasing is minimized... | |
87 | +$i="%l5"; | |
88 | +$j="%l6"; | |
89 | +$mask="%l7"; # 16-bit mask, 0xffff | |
90 | + | |
91 | +$n0="%g4"; # reassigned(!) to "64-bit" register | |
92 | +$carry="%i4"; # %i4 reused(!) for a carry bit | |
93 | + | |
94 | +# FP register naming chart | |
95 | +# | |
96 | +# ..HILO | |
97 | +# dcba | |
98 | +# -------- | |
99 | +# LOa | |
100 | +# LOb | |
101 | +# LOc | |
102 | +# LOd | |
103 | +# HIa | |
104 | +# HIb | |
105 | +# HIc | |
106 | +# HId | |
107 | +# ..a | |
108 | +# ..b | |
109 | +$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6"; | |
110 | +$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14"; | |
111 | +$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19"; | |
112 | +$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23"; | |
113 | + | |
114 | +$dota="%f24"; $dotb="%f26"; | |
115 | + | |
116 | +$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38"; | |
117 | +$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46"; | |
118 | +$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54"; | |
119 | +$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62"; | |
120 | + | |
121 | +$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load | |
122 | + | |
123 | +$code=<<___; | |
124 | +.section ".text",#alloc,#execinstr | |
125 | + | |
126 | +.global $fname | |
127 | +.align 32 | |
128 | +$fname: | |
129 | + save %sp,-$frame-$locals,%sp | |
130 | + | |
131 | + cmp $num,4 | |
132 | + bl,a,pn %icc,.Lret | |
133 | + clr %i0 | |
134 | + andcc $num,1,%g0 ! $num has to be even... | |
135 | + bnz,a,pn %icc,.Lret | |
136 | + clr %i0 ! signal "unsupported input value" | |
137 | + | |
138 | + srl $num,1,$num | |
139 | + sethi %hi(0xffff),$mask | |
140 | + ld [%i4+0],$n0 ! $n0 reassigned, remember? | |
141 | + or $mask,%lo(0xffff),$mask | |
142 | + ld [%i4+4],%o0 | |
143 | + sllx %o0,32,%o0 | |
144 | + or %o0,$n0,$n0 ! $n0=n0[1].n0[0] | |
145 | + | |
146 | + sll $num,3,$num ! num*=8 | |
147 | + | |
148 | + add %sp,$bias,%o0 ! real top of stack | |
149 | + sll $num,2,%o1 | |
150 | + add %o1,$num,%o1 ! %o1=num*5 | |
151 | + sub %o0,%o1,%o0 | |
152 | + and %o0,-2048,%o0 ! optimize TLB utilization | |
153 | + sub %o0,$bias,%sp ! alloca(5*num*8) | |
154 | + | |
155 | + rd %asi,%o7 ! save %asi | |
156 | + add %sp,$bias+$frame+$locals,$tp | |
157 | + add $tp,$num,$ap_l | |
158 | + add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends ! | |
159 | + add $ap_l,$num,$ap_h | |
160 | + add $ap_h,$num,$np_l | |
161 | + add $np_l,$num,$np_h | |
162 | + | |
163 | + wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads | |
164 | + | |
165 | + add $rp,$num,$rp ! readjust input pointers to point | |
166 | + add $ap,$num,$ap ! at the ends too... | |
167 | + add $bp,$num,$bp | |
168 | + add $np,$num,$np | |
169 | + | |
170 | + stx %o7,[%sp+$bias+$frame+48] ! save %asi | |
171 | + | |
172 | + sub %g0,$num,$i ! i=-num | |
173 | + sub %g0,$num,$j ! j=-num | |
174 | + | |
175 | + add $ap,$j,%o3 | |
176 | + add $bp,$i,%o4 | |
177 | + | |
178 | + ld [%o3+4],%g1 ! bp[0] | |
179 | + ld [%o3+0],%o0 | |
180 | + ld [%o4+4],%g5 ! ap[0] | |
181 | + sllx %g1,32,%g1 | |
182 | + ld [%o4+0],%o1 | |
183 | + sllx %g5,32,%g5 | |
184 | + or %g1,%o0,%o0 | |
185 | + or %g5,%o1,%o1 | |
186 | + | |
187 | + add $np,$j,%o5 | |
188 | + | |
189 | + mulx %o1,%o0,%o0 ! ap[0]*bp[0] | |
190 | + mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0 | |
191 | + stx %o0,[%sp+$bias+$frame+0] | |
192 | + | |
193 | + ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words | |
194 | + fzeros $alo | |
195 | + ld [%o3+4],$ahi_ | |
196 | + fzeros $ahi | |
197 | + ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words | |
198 | + fzeros $nlo | |
199 | + ld [%o5+4],$nhi_ | |
200 | + fzeros $nhi | |
201 | + | |
202 | + ! transfer b[i] to FPU as 4x16-bit values | |
203 | + ldda [%o4+2]%asi,$ba | |
204 | + fxtod $alo,$alo | |
205 | + ldda [%o4+0]%asi,$bb | |
206 | + fxtod $ahi,$ahi | |
207 | + ldda [%o4+6]%asi,$bc | |
208 | + fxtod $nlo,$nlo | |
209 | + ldda [%o4+4]%asi,$bd | |
210 | + fxtod $nhi,$nhi | |
211 | + | |
212 | + ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values | |
213 | + ldda [%sp+$bias+$frame+6]%asi,$na | |
214 | + fxtod $ba,$ba | |
215 | + ldda [%sp+$bias+$frame+4]%asi,$nb | |
216 | + fxtod $bb,$bb | |
217 | + ldda [%sp+$bias+$frame+2]%asi,$nc | |
218 | + fxtod $bc,$bc | |
219 | + ldda [%sp+$bias+$frame+0]%asi,$nd | |
220 | + fxtod $bd,$bd | |
221 | + | |
222 | + std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | |
223 | + fxtod $na,$na | |
224 | + std $ahi,[$ap_h+$j] | |
225 | + fxtod $nb,$nb | |
226 | + std $nlo,[$np_l+$j] ! save smashed np[j] in double format | |
227 | + fxtod $nc,$nc | |
228 | + std $nhi,[$np_h+$j] | |
229 | + fxtod $nd,$nd | |
230 | + | |
231 | + fmuld $alo,$ba,$aloa | |
232 | + fmuld $nlo,$na,$nloa | |
233 | + fmuld $alo,$bb,$alob | |
234 | + fmuld $nlo,$nb,$nlob | |
235 | + fmuld $alo,$bc,$aloc | |
236 | + faddd $aloa,$nloa,$nloa | |
237 | + fmuld $nlo,$nc,$nloc | |
238 | + fmuld $alo,$bd,$alod | |
239 | + faddd $alob,$nlob,$nlob | |
240 | + fmuld $nlo,$nd,$nlod | |
241 | + fmuld $ahi,$ba,$ahia | |
242 | + faddd $aloc,$nloc,$nloc | |
243 | + fmuld $nhi,$na,$nhia | |
244 | + fmuld $ahi,$bb,$ahib | |
245 | + faddd $alod,$nlod,$nlod | |
246 | + fmuld $nhi,$nb,$nhib | |
247 | + fmuld $ahi,$bc,$ahic | |
248 | + faddd $ahia,$nhia,$nhia | |
249 | + fmuld $nhi,$nc,$nhic | |
250 | + fmuld $ahi,$bd,$ahid | |
251 | + faddd $ahib,$nhib,$nhib | |
252 | + fmuld $nhi,$nd,$nhid | |
253 | + | |
254 | + faddd $ahic,$nhic,$dota ! $nhic | |
255 | + faddd $ahid,$nhid,$dotb ! $nhid | |
256 | + | |
257 | + faddd $nloc,$nhia,$nloc | |
258 | + faddd $nlod,$nhib,$nlod | |
259 | + | |
260 | + fdtox $nloa,$nloa | |
261 | + fdtox $nlob,$nlob | |
262 | + fdtox $nloc,$nloc | |
263 | + fdtox $nlod,$nlod | |
264 | + | |
265 | + std $nloa,[%sp+$bias+$frame+0] | |
266 | + add $j,8,$j | |
267 | + std $nlob,[%sp+$bias+$frame+8] | |
268 | + add $ap,$j,%o4 | |
269 | + std $nloc,[%sp+$bias+$frame+16] | |
270 | + add $np,$j,%o5 | |
271 | + std $nlod,[%sp+$bias+$frame+24] | |
272 | + | |
273 | + ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words | |
274 | + fzeros $alo | |
275 | + ld [%o4+4],$ahi_ | |
276 | + fzeros $ahi | |
277 | + ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words | |
278 | + fzeros $nlo | |
279 | + ld [%o5+4],$nhi_ | |
280 | + fzeros $nhi | |
281 | + | |
282 | + fxtod $alo,$alo | |
283 | + fxtod $ahi,$ahi | |
284 | + fxtod $nlo,$nlo | |
285 | + fxtod $nhi,$nhi | |
286 | + | |
287 | + ldx [%sp+$bias+$frame+0],%o0 | |
288 | + fmuld $alo,$ba,$aloa | |
289 | + ldx [%sp+$bias+$frame+8],%o1 | |
290 | + fmuld $nlo,$na,$nloa | |
291 | + ldx [%sp+$bias+$frame+16],%o2 | |
292 | + fmuld $alo,$bb,$alob | |
293 | + ldx [%sp+$bias+$frame+24],%o3 | |
294 | + fmuld $nlo,$nb,$nlob | |
295 | + | |
296 | + srlx %o0,16,%o7 | |
297 | + std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | |
298 | + fmuld $alo,$bc,$aloc | |
299 | + add %o7,%o1,%o1 | |
300 | + std $ahi,[$ap_h+$j] | |
301 | + faddd $aloa,$nloa,$nloa | |
302 | + fmuld $nlo,$nc,$nloc | |
303 | + srlx %o1,16,%o7 | |
304 | + std $nlo,[$np_l+$j] ! save smashed np[j] in double format | |
305 | + fmuld $alo,$bd,$alod | |
306 | + add %o7,%o2,%o2 | |
307 | + std $nhi,[$np_h+$j] | |
308 | + faddd $alob,$nlob,$nlob | |
309 | + fmuld $nlo,$nd,$nlod | |
310 | + srlx %o2,16,%o7 | |
311 | + fmuld $ahi,$ba,$ahia | |
312 | + add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | |
313 | + faddd $aloc,$nloc,$nloc | |
314 | + fmuld $nhi,$na,$nhia | |
315 | + !and %o0,$mask,%o0 | |
316 | + !and %o1,$mask,%o1 | |
317 | + !and %o2,$mask,%o2 | |
318 | + !sllx %o1,16,%o1 | |
319 | + !sllx %o2,32,%o2 | |
320 | + !sllx %o3,48,%o7 | |
321 | + !or %o1,%o0,%o0 | |
322 | + !or %o2,%o0,%o0 | |
323 | + !or %o7,%o0,%o0 ! 64-bit result | |
324 | + srlx %o3,16,%g1 ! 34-bit carry | |
325 | + fmuld $ahi,$bb,$ahib | |
326 | + | |
327 | + faddd $alod,$nlod,$nlod | |
328 | + fmuld $nhi,$nb,$nhib | |
329 | + fmuld $ahi,$bc,$ahic | |
330 | + faddd $ahia,$nhia,$nhia | |
331 | + fmuld $nhi,$nc,$nhic | |
332 | + fmuld $ahi,$bd,$ahid | |
333 | + faddd $ahib,$nhib,$nhib | |
334 | + fmuld $nhi,$nd,$nhid | |
335 | + | |
336 | + faddd $dota,$nloa,$nloa | |
337 | + faddd $dotb,$nlob,$nlob | |
338 | + faddd $ahic,$nhic,$dota ! $nhic | |
339 | + faddd $ahid,$nhid,$dotb ! $nhid | |
340 | + | |
341 | + faddd $nloc,$nhia,$nloc | |
342 | + faddd $nlod,$nhib,$nlod | |
343 | + | |
344 | + fdtox $nloa,$nloa | |
345 | + fdtox $nlob,$nlob | |
346 | + fdtox $nloc,$nloc | |
347 | + fdtox $nlod,$nlod | |
348 | + | |
349 | + std $nloa,[%sp+$bias+$frame+0] | |
350 | + std $nlob,[%sp+$bias+$frame+8] | |
351 | + addcc $j,8,$j | |
352 | + std $nloc,[%sp+$bias+$frame+16] | |
353 | + bz,pn %icc,.L1stskip | |
354 | + std $nlod,[%sp+$bias+$frame+24] | |
355 | + | |
356 | +.align 32 ! incidentally already aligned ! | |
357 | +.L1st: | |
358 | + add $ap,$j,%o4 | |
359 | + add $np,$j,%o5 | |
360 | + ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words | |
361 | + fzeros $alo | |
362 | + ld [%o4+4],$ahi_ | |
363 | + fzeros $ahi | |
364 | + ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words | |
365 | + fzeros $nlo | |
366 | + ld [%o5+4],$nhi_ | |
367 | + fzeros $nhi | |
368 | + | |
369 | + fxtod $alo,$alo | |
370 | + fxtod $ahi,$ahi | |
371 | + fxtod $nlo,$nlo | |
372 | + fxtod $nhi,$nhi | |
373 | + | |
374 | + ldx [%sp+$bias+$frame+0],%o0 | |
375 | + fmuld $alo,$ba,$aloa | |
376 | + ldx [%sp+$bias+$frame+8],%o1 | |
377 | + fmuld $nlo,$na,$nloa | |
378 | + ldx [%sp+$bias+$frame+16],%o2 | |
379 | + fmuld $alo,$bb,$alob | |
380 | + ldx [%sp+$bias+$frame+24],%o3 | |
381 | + fmuld $nlo,$nb,$nlob | |
382 | + | |
383 | + srlx %o0,16,%o7 | |
384 | + std $alo,[$ap_l+$j] ! save smashed ap[j] in double format | |
385 | + fmuld $alo,$bc,$aloc | |
386 | + add %o7,%o1,%o1 | |
387 | + std $ahi,[$ap_h+$j] | |
388 | + faddd $aloa,$nloa,$nloa | |
389 | + fmuld $nlo,$nc,$nloc | |
390 | + srlx %o1,16,%o7 | |
391 | + std $nlo,[$np_l+$j] ! save smashed np[j] in double format | |
392 | + fmuld $alo,$bd,$alod | |
393 | + add %o7,%o2,%o2 | |
394 | + std $nhi,[$np_h+$j] | |
395 | + faddd $alob,$nlob,$nlob | |
396 | + fmuld $nlo,$nd,$nlod | |
397 | + srlx %o2,16,%o7 | |
398 | + fmuld $ahi,$ba,$ahia | |
399 | + add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | |
400 | + and %o0,$mask,%o0 | |
401 | + faddd $aloc,$nloc,$nloc | |
402 | + fmuld $nhi,$na,$nhia | |
403 | + and %o1,$mask,%o1 | |
404 | + and %o2,$mask,%o2 | |
405 | + fmuld $ahi,$bb,$ahib | |
406 | + sllx %o1,16,%o1 | |
407 | + faddd $alod,$nlod,$nlod | |
408 | + fmuld $nhi,$nb,$nhib | |
409 | + sllx %o2,32,%o2 | |
410 | + fmuld $ahi,$bc,$ahic | |
411 | + sllx %o3,48,%o7 | |
412 | + or %o1,%o0,%o0 | |
413 | + faddd $ahia,$nhia,$nhia | |
414 | + fmuld $nhi,$nc,$nhic | |
415 | + or %o2,%o0,%o0 | |
416 | + fmuld $ahi,$bd,$ahid | |
417 | + or %o7,%o0,%o0 ! 64-bit result | |
418 | + faddd $ahib,$nhib,$nhib | |
419 | + fmuld $nhi,$nd,$nhid | |
420 | + addcc %g1,%o0,%o0 | |
421 | + faddd $dota,$nloa,$nloa | |
422 | + srlx %o3,16,%g1 ! 34-bit carry | |
423 | + faddd $dotb,$nlob,$nlob | |
424 | + bcs,a %xcc,.+8 | |
425 | + add %g1,1,%g1 | |
426 | + | |
427 | + stx %o0,[$tp] ! tp[j-1]= | |
428 | + | |
429 | + faddd $ahic,$nhic,$dota ! $nhic | |
430 | + faddd $ahid,$nhid,$dotb ! $nhid | |
431 | + | |
432 | + faddd $nloc,$nhia,$nloc | |
433 | + faddd $nlod,$nhib,$nlod | |
434 | + | |
435 | + fdtox $nloa,$nloa | |
436 | + fdtox $nlob,$nlob | |
437 | + fdtox $nloc,$nloc | |
438 | + fdtox $nlod,$nlod | |
439 | + | |
440 | + std $nloa,[%sp+$bias+$frame+0] | |
441 | + std $nlob,[%sp+$bias+$frame+8] | |
442 | + std $nloc,[%sp+$bias+$frame+16] | |
443 | + std $nlod,[%sp+$bias+$frame+24] | |
444 | + | |
445 | + addcc $j,8,$j | |
446 | + bnz,pt %icc,.L1st | |
447 | + add $tp,8,$tp | |
448 | + | |
449 | +.L1stskip: | |
450 | + fdtox $dota,$dota | |
451 | + fdtox $dotb,$dotb | |
452 | + | |
453 | + ldx [%sp+$bias+$frame+0],%o0 | |
454 | + ldx [%sp+$bias+$frame+8],%o1 | |
455 | + ldx [%sp+$bias+$frame+16],%o2 | |
456 | + ldx [%sp+$bias+$frame+24],%o3 | |
457 | + | |
458 | + srlx %o0,16,%o7 | |
459 | + std $dota,[%sp+$bias+$frame+32] | |
460 | + add %o7,%o1,%o1 | |
461 | + std $dotb,[%sp+$bias+$frame+40] | |
462 | + srlx %o1,16,%o7 | |
463 | + add %o7,%o2,%o2 | |
464 | + srlx %o2,16,%o7 | |
465 | + add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | |
466 | + and %o0,$mask,%o0 | |
467 | + and %o1,$mask,%o1 | |
468 | + and %o2,$mask,%o2 | |
469 | + sllx %o1,16,%o1 | |
470 | + sllx %o2,32,%o2 | |
471 | + sllx %o3,48,%o7 | |
472 | + or %o1,%o0,%o0 | |
473 | + or %o2,%o0,%o0 | |
474 | + or %o7,%o0,%o0 ! 64-bit result | |
475 | + ldx [%sp+$bias+$frame+32],%o4 | |
476 | + addcc %g1,%o0,%o0 | |
477 | + ldx [%sp+$bias+$frame+40],%o5 | |
478 | + srlx %o3,16,%g1 ! 34-bit carry | |
479 | + bcs,a %xcc,.+8 | |
480 | + add %g1,1,%g1 | |
481 | + | |
482 | + stx %o0,[$tp] ! tp[j-1]= | |
483 | + add $tp,8,$tp | |
484 | + | |
485 | + srlx %o4,16,%o7 | |
486 | + add %o7,%o5,%o5 | |
487 | + and %o4,$mask,%o4 | |
488 | + sllx %o5,16,%o7 | |
489 | + or %o7,%o4,%o4 | |
490 | + addcc %g1,%o4,%o4 | |
491 | + srlx %o5,48,%g1 | |
492 | + bcs,a %xcc,.+8 | |
493 | + add %g1,1,%g1 | |
494 | + | |
495 | + mov %g1,$carry | |
496 | + stx %o4,[$tp] ! tp[num-1]= | |
497 | + | |
498 | + ba .Louter | |
499 | + add $i,8,$i | |
500 | +.align 32 | |
501 | +.Louter: | |
502 | + sub %g0,$num,$j ! j=-num | |
503 | + add %sp,$bias+$frame+$locals,$tp | |
504 | + | |
505 | + add $ap,$j,%o3 | |
506 | + add $bp,$i,%o4 | |
507 | + | |
508 | + ld [%o3+4],%g1 ! bp[i] | |
509 | + ld [%o3+0],%o0 | |
510 | + ld [%o4+4],%g5 ! ap[0] | |
511 | + sllx %g1,32,%g1 | |
512 | + ld [%o4+0],%o1 | |
513 | + sllx %g5,32,%g5 | |
514 | + or %g1,%o0,%o0 | |
515 | + or %g5,%o1,%o1 | |
516 | + | |
517 | + ldx [$tp],%o2 ! tp[0] | |
518 | + mulx %o1,%o0,%o0 | |
519 | + addcc %o2,%o0,%o0 | |
520 | + mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0 | |
521 | + stx %o0,[%sp+$bias+$frame+0] | |
522 | + | |
523 | + ! transfer b[i] to FPU as 4x16-bit values | |
524 | + ldda [%o4+2]%asi,$ba | |
525 | + ldda [%o4+0]%asi,$bb | |
526 | + ldda [%o4+6]%asi,$bc | |
527 | + ldda [%o4+4]%asi,$bd | |
528 | + | |
529 | + ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values | |
530 | + ldda [%sp+$bias+$frame+6]%asi,$na | |
531 | + fxtod $ba,$ba | |
532 | + ldda [%sp+$bias+$frame+4]%asi,$nb | |
533 | + fxtod $bb,$bb | |
534 | + ldda [%sp+$bias+$frame+2]%asi,$nc | |
535 | + fxtod $bc,$bc | |
536 | + ldda [%sp+$bias+$frame+0]%asi,$nd | |
537 | + fxtod $bd,$bd | |
538 | + ldd [$ap_l+$j],$alo ! load a[j] in double format | |
539 | + fxtod $na,$na | |
540 | + ldd [$ap_h+$j],$ahi | |
541 | + fxtod $nb,$nb | |
542 | + ldd [$np_l+$j],$nlo ! load n[j] in double format | |
543 | + fxtod $nc,$nc | |
544 | + ldd [$np_h+$j],$nhi | |
545 | + fxtod $nd,$nd | |
546 | + | |
547 | + fmuld $alo,$ba,$aloa | |
548 | + fmuld $nlo,$na,$nloa | |
549 | + fmuld $alo,$bb,$alob | |
550 | + fmuld $nlo,$nb,$nlob | |
551 | + fmuld $alo,$bc,$aloc | |
552 | + faddd $aloa,$nloa,$nloa | |
553 | + fmuld $nlo,$nc,$nloc | |
554 | + fmuld $alo,$bd,$alod | |
555 | + faddd $alob,$nlob,$nlob | |
556 | + fmuld $nlo,$nd,$nlod | |
557 | + fmuld $ahi,$ba,$ahia | |
558 | + faddd $aloc,$nloc,$nloc | |
559 | + fmuld $nhi,$na,$nhia | |
560 | + fmuld $ahi,$bb,$ahib | |
561 | + faddd $alod,$nlod,$nlod | |
562 | + fmuld $nhi,$nb,$nhib | |
563 | + fmuld $ahi,$bc,$ahic | |
564 | + faddd $ahia,$nhia,$nhia | |
565 | + fmuld $nhi,$nc,$nhic | |
566 | + fmuld $ahi,$bd,$ahid | |
567 | + faddd $ahib,$nhib,$nhib | |
568 | + fmuld $nhi,$nd,$nhid | |
569 | + | |
570 | + faddd $ahic,$nhic,$dota ! $nhic | |
571 | + faddd $ahid,$nhid,$dotb ! $nhid | |
572 | + | |
573 | + faddd $nloc,$nhia,$nloc | |
574 | + faddd $nlod,$nhib,$nlod | |
575 | + | |
576 | + fdtox $nloa,$nloa | |
577 | + fdtox $nlob,$nlob | |
578 | + fdtox $nloc,$nloc | |
579 | + fdtox $nlod,$nlod | |
580 | + | |
581 | + std $nloa,[%sp+$bias+$frame+0] | |
582 | + std $nlob,[%sp+$bias+$frame+8] | |
583 | + std $nloc,[%sp+$bias+$frame+16] | |
584 | + add $j,8,$j | |
585 | + std $nlod,[%sp+$bias+$frame+24] | |
586 | + | |
587 | + ldd [$ap_l+$j],$alo ! load a[j] in double format | |
588 | + ldd [$ap_h+$j],$ahi | |
589 | + ldd [$np_l+$j],$nlo ! load n[j] in double format | |
590 | + ldd [$np_h+$j],$nhi | |
591 | + | |
592 | + fmuld $alo,$ba,$aloa | |
593 | + fmuld $nlo,$na,$nloa | |
594 | + fmuld $alo,$bb,$alob | |
595 | + fmuld $nlo,$nb,$nlob | |
596 | + fmuld $alo,$bc,$aloc | |
597 | + ldx [%sp+$bias+$frame+0],%o0 | |
598 | + faddd $aloa,$nloa,$nloa | |
599 | + fmuld $nlo,$nc,$nloc | |
600 | + ldx [%sp+$bias+$frame+8],%o1 | |
601 | + fmuld $alo,$bd,$alod | |
602 | + ldx [%sp+$bias+$frame+16],%o2 | |
603 | + faddd $alob,$nlob,$nlob | |
604 | + fmuld $nlo,$nd,$nlod | |
605 | + ldx [%sp+$bias+$frame+24],%o3 | |
606 | + fmuld $ahi,$ba,$ahia | |
607 | + | |
608 | + srlx %o0,16,%o7 | |
609 | + faddd $aloc,$nloc,$nloc | |
610 | + fmuld $nhi,$na,$nhia | |
611 | + add %o7,%o1,%o1 | |
612 | + fmuld $ahi,$bb,$ahib | |
613 | + srlx %o1,16,%o7 | |
614 | + faddd $alod,$nlod,$nlod | |
615 | + fmuld $nhi,$nb,$nhib | |
616 | + add %o7,%o2,%o2 | |
617 | + fmuld $ahi,$bc,$ahic | |
618 | + srlx %o2,16,%o7 | |
619 | + faddd $ahia,$nhia,$nhia | |
620 | + fmuld $nhi,$nc,$nhic | |
621 | + add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | |
622 | + ! why? | |
623 | + and %o0,$mask,%o0 | |
624 | + fmuld $ahi,$bd,$ahid | |
625 | + and %o1,$mask,%o1 | |
626 | + and %o2,$mask,%o2 | |
627 | + faddd $ahib,$nhib,$nhib | |
628 | + fmuld $nhi,$nd,$nhid | |
629 | + sllx %o1,16,%o1 | |
630 | + faddd $dota,$nloa,$nloa | |
631 | + sllx %o2,32,%o2 | |
632 | + faddd $dotb,$nlob,$nlob | |
633 | + sllx %o3,48,%o7 | |
634 | + or %o1,%o0,%o0 | |
635 | + faddd $ahic,$nhic,$dota ! $nhic | |
636 | + or %o2,%o0,%o0 | |
637 | + faddd $ahid,$nhid,$dotb ! $nhid | |
638 | + or %o7,%o0,%o0 ! 64-bit result | |
639 | + ldx [$tp],%o7 | |
640 | + faddd $nloc,$nhia,$nloc | |
641 | + addcc %o7,%o0,%o0 | |
642 | + ! end-of-why? | |
643 | + faddd $nlod,$nhib,$nlod | |
644 | + srlx %o3,16,%g1 ! 34-bit carry | |
645 | + fdtox $nloa,$nloa | |
646 | + bcs,a %xcc,.+8 | |
647 | + add %g1,1,%g1 | |
648 | + | |
649 | + fdtox $nlob,$nlob | |
650 | + fdtox $nloc,$nloc | |
651 | + fdtox $nlod,$nlod | |
652 | + | |
653 | + std $nloa,[%sp+$bias+$frame+0] | |
654 | + std $nlob,[%sp+$bias+$frame+8] | |
655 | + addcc $j,8,$j | |
656 | + std $nloc,[%sp+$bias+$frame+16] | |
657 | + bz,pn %icc,.Linnerskip | |
658 | + std $nlod,[%sp+$bias+$frame+24] | |
659 | + | |
660 | + ba .Linner | |
661 | + nop | |
662 | +.align 32 | |
663 | +.Linner: | |
664 | + ldd [$ap_l+$j],$alo ! load a[j] in double format | |
665 | + ldd [$ap_h+$j],$ahi | |
666 | + ldd [$np_l+$j],$nlo ! load n[j] in double format | |
667 | + ldd [$np_h+$j],$nhi | |
668 | + | |
669 | + fmuld $alo,$ba,$aloa | |
670 | + fmuld $nlo,$na,$nloa | |
671 | + fmuld $alo,$bb,$alob | |
672 | + fmuld $nlo,$nb,$nlob | |
673 | + fmuld $alo,$bc,$aloc | |
674 | + ldx [%sp+$bias+$frame+0],%o0 | |
675 | + faddd $aloa,$nloa,$nloa | |
676 | + fmuld $nlo,$nc,$nloc | |
677 | + ldx [%sp+$bias+$frame+8],%o1 | |
678 | + fmuld $alo,$bd,$alod | |
679 | + ldx [%sp+$bias+$frame+16],%o2 | |
680 | + faddd $alob,$nlob,$nlob | |
681 | + fmuld $nlo,$nd,$nlod | |
682 | + ldx [%sp+$bias+$frame+24],%o3 | |
683 | + fmuld $ahi,$ba,$ahia | |
684 | + | |
685 | + srlx %o0,16,%o7 | |
686 | + faddd $aloc,$nloc,$nloc | |
687 | + fmuld $nhi,$na,$nhia | |
688 | + add %o7,%o1,%o1 | |
689 | + fmuld $ahi,$bb,$ahib | |
690 | + srlx %o1,16,%o7 | |
691 | + faddd $alod,$nlod,$nlod | |
692 | + fmuld $nhi,$nb,$nhib | |
693 | + add %o7,%o2,%o2 | |
694 | + fmuld $ahi,$bc,$ahic | |
695 | + srlx %o2,16,%o7 | |
696 | + faddd $ahia,$nhia,$nhia | |
697 | + fmuld $nhi,$nc,$nhic | |
698 | + add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | |
699 | + and %o0,$mask,%o0 | |
700 | + fmuld $ahi,$bd,$ahid | |
701 | + and %o1,$mask,%o1 | |
702 | + and %o2,$mask,%o2 | |
703 | + faddd $ahib,$nhib,$nhib | |
704 | + fmuld $nhi,$nd,$nhid | |
705 | + sllx %o1,16,%o1 | |
706 | + faddd $dota,$nloa,$nloa | |
707 | + sllx %o2,32,%o2 | |
708 | + faddd $dotb,$nlob,$nlob | |
709 | + sllx %o3,48,%o7 | |
710 | + or %o1,%o0,%o0 | |
711 | + faddd $ahic,$nhic,$dota ! $nhic | |
712 | + or %o2,%o0,%o0 | |
713 | + faddd $ahid,$nhid,$dotb ! $nhid | |
714 | + or %o7,%o0,%o0 ! 64-bit result | |
715 | + faddd $nloc,$nhia,$nloc | |
716 | + addcc %g1,%o0,%o0 | |
717 | + ldx [$tp+8],%o7 ! tp[j] | |
718 | + faddd $nlod,$nhib,$nlod | |
719 | + srlx %o3,16,%g1 ! 34-bit carry | |
720 | + fdtox $nloa,$nloa | |
721 | + bcs,a %xcc,.+8 | |
722 | + add %g1,1,%g1 | |
723 | + fdtox $nlob,$nlob | |
724 | + addcc %o7,%o0,%o0 | |
725 | + fdtox $nloc,$nloc | |
726 | + bcs,a %xcc,.+8 | |
727 | + add %g1,1,%g1 | |
728 | + | |
729 | + stx %o0,[$tp] ! tp[j-1] | |
730 | + fdtox $nlod,$nlod | |
731 | + | |
732 | + std $nloa,[%sp+$bias+$frame+0] | |
733 | + std $nlob,[%sp+$bias+$frame+8] | |
734 | + std $nloc,[%sp+$bias+$frame+16] | |
735 | + addcc $j,8,$j | |
736 | + std $nlod,[%sp+$bias+$frame+24] | |
737 | + bnz,pt %icc,.Linner | |
738 | + add $tp,8,$tp | |
739 | + | |
740 | +.Linnerskip: | |
741 | + fdtox $dota,$dota | |
742 | + fdtox $dotb,$dotb | |
743 | + | |
744 | + ldx [%sp+$bias+$frame+0],%o0 | |
745 | + ldx [%sp+$bias+$frame+8],%o1 | |
746 | + ldx [%sp+$bias+$frame+16],%o2 | |
747 | + ldx [%sp+$bias+$frame+24],%o3 | |
748 | + | |
749 | + srlx %o0,16,%o7 | |
750 | + std $dota,[%sp+$bias+$frame+32] | |
751 | + add %o7,%o1,%o1 | |
752 | + std $dotb,[%sp+$bias+$frame+40] | |
753 | + srlx %o1,16,%o7 | |
754 | + add %o7,%o2,%o2 | |
755 | + srlx %o2,16,%o7 | |
756 | + add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15] | |
757 | + and %o0,$mask,%o0 | |
758 | + and %o1,$mask,%o1 | |
759 | + and %o2,$mask,%o2 | |
760 | + sllx %o1,16,%o1 | |
761 | + sllx %o2,32,%o2 | |
762 | + sllx %o3,48,%o7 | |
763 | + or %o1,%o0,%o0 | |
764 | + or %o2,%o0,%o0 | |
765 | + ldx [%sp+$bias+$frame+32],%o4 | |
766 | + or %o7,%o0,%o0 ! 64-bit result | |
767 | + ldx [%sp+$bias+$frame+40],%o5 | |
768 | + addcc %g1,%o0,%o0 | |
769 | + ldx [$tp+8],%o7 ! tp[j] | |
770 | + srlx %o3,16,%g1 ! 34-bit carry | |
771 | + bcs,a %xcc,.+8 | |
772 | + add %g1,1,%g1 | |
773 | + | |
774 | + addcc %o7,%o0,%o0 | |
775 | + bcs,a %xcc,.+8 | |
776 | + add %g1,1,%g1 | |
777 | + | |
778 | + stx %o0,[$tp] ! tp[j-1] | |
779 | + add $tp,8,$tp | |
780 | + | |
781 | + srlx %o4,16,%o7 | |
782 | + add %o7,%o5,%o5 | |
783 | + and %o4,$mask,%o4 | |
784 | + sllx %o5,16,%o7 | |
785 | + or %o7,%o4,%o4 | |
786 | + addcc %g1,%o4,%o4 | |
787 | + srlx %o5,48,%g1 | |
788 | + bcs,a %xcc,.+8 | |
789 | + add %g1,1,%g1 | |
790 | + | |
791 | + addcc $carry,%o4,%o4 | |
792 | + stx %o4,[$tp] ! tp[num-1] | |
793 | + mov %g1,$carry | |
794 | + bcs,a %xcc,.+8 | |
795 | + add $carry,1,$carry | |
796 | + | |
797 | + addcc $i,8,$i | |
798 | + bnz %icc,.Louter | |
799 | + nop | |
800 | + | |
801 | + add $tp,8,$tp ! adjust tp to point at the end | |
802 | + orn %g0,%g0,%g4 | |
803 | + sub %g0,$num,%o7 ! n=-num | |
804 | + ba .Lsub | |
805 | + subcc %g0,%g0,%g0 ! clear %icc.c | |
806 | + | |
807 | +.align 32 | |
808 | +.Lsub: | |
809 | + ldx [$tp+%o7],%o0 | |
810 | + add $np,%o7,%g1 | |
811 | + ld [%g1+0],%o2 | |
812 | + ld [%g1+4],%o3 | |
813 | + srlx %o0,32,%o1 | |
814 | + subccc %o0,%o2,%o2 | |
815 | + add $rp,%o7,%g1 | |
816 | + subccc %o1,%o3,%o3 | |
817 | + st %o2,[%g1+0] | |
818 | + add %o7,8,%o7 | |
819 | + brnz,pt %o7,.Lsub | |
820 | + st %o3,[%g1+4] | |
821 | + subc $carry,0,%g4 | |
822 | + sub %g0,$num,%o7 ! n=-num | |
823 | + ba .Lcopy | |
824 | + nop | |
825 | + | |
826 | +.align 32 | |
827 | +.Lcopy: | |
828 | + ldx [$tp+%o7],%o0 | |
829 | + add $rp,%o7,%g1 | |
830 | + ld [%g1+0],%o2 | |
831 | + ld [%g1+4],%o3 | |
832 | + stx %g0,[$tp+%o7] | |
833 | + and %o0,%g4,%o0 | |
834 | + srlx %o0,32,%o1 | |
835 | + andn %o2,%g4,%o2 | |
836 | + andn %o3,%g4,%o3 | |
837 | + or %o2,%o0,%o0 | |
838 | + or %o3,%o1,%o1 | |
839 | + st %o0,[%g1+0] | |
840 | + add %o7,8,%o7 | |
841 | + brnz,pt %o7,.Lcopy | |
842 | + st %o1,[%g1+4] | |
843 | + sub %g0,$num,%o7 ! n=-num | |
844 | + | |
845 | +.Lzap: | |
846 | + stx %g0,[$ap_l+%o7] | |
847 | + stx %g0,[$ap_h+%o7] | |
848 | + stx %g0,[$np_l+%o7] | |
849 | + stx %g0,[$np_h+%o7] | |
850 | + add %o7,8,%o7 | |
851 | + brnz,pt %o7,.Lzap | |
852 | + nop | |
853 | + | |
854 | + ldx [%sp+$bias+$frame+48],%o7 | |
855 | + wr %g0,%o7,%asi ! restore %asi | |
856 | + | |
857 | + mov 1,%i0 | |
858 | +.Lret: | |
859 | + ret | |
860 | + restore | |
861 | +.type $fname,#function | |
862 | +.size $fname,(.-$fname) | |
863 | +.asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>" | |
864 | +.align 32 | |
865 | +___ | |
866 | + | |
867 | +$code =~ s/\`([^\`]*)\`/eval($1)/gem; | |
868 | + | |
869 | +# Below substitution makes it possible to compile without demanding | |
870 | +# VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I | |
871 | +# dare to do this, because VIS capability is detected at run-time now | |
872 | +# and this routine is not called on CPUs incapable of executing it. Do | |
873 | +# note that fzeros is not the only VIS dependency! Another dependency | |
874 | +# is implicit and is just _a_ numerical value loaded to %asi register, | |
875 | +# which assembler can't recognize as VIS specific... | |
876 | +$code =~ s/fzeros\s+%f([0-9]+)/ | |
877 | + sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1) | |
878 | + /gem; | |
879 | + | |
880 | +print $code; | |
881 | +# flush | |
882 | +close STDOUT; |
@@ -0,0 +1,242 @@ | ||
1 | +#!/usr/bin/env perl | |
2 | +# | |
3 | +# ==================================================================== | |
4 | +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
5 | +# project. The module is, however, dual licensed under OpenSSL and | |
6 | +# CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | +# details see http://www.openssl.org/~appro/cryptogams/. | |
8 | +# ==================================================================== | |
9 | +# | |
10 | +# Wrapper around 'rep montmul', VIA-specific instruction accessing | |
11 | +# PadLock Montgomery Multiplier. The wrapper is designed as drop-in | |
12 | +# replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9]. | |
13 | +# | |
14 | +# Below are interleaved outputs from 'openssl speed rsa dsa' for 4 | |
15 | +# different software configurations on 1.5GHz VIA Esther processor. | |
16 | +# Lines marked with "software integer" denote performance of hand- | |
17 | +# coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2" | |
18 | +# refers to hand-coded SSE2 Montgomery multiplication procedure found | |
19 | +# in OpenSSL 0.9.9. "Hardware VIA SDK" refers to padlock_pmm routine from | |
20 | +# Padlock SDK 2.0.1 available for download from VIA, which naturally | |
21 | +# utilizes the magic 'repz montmul' instruction. And finally "hardware | |
22 | +# this" refers to *this* implementation which also uses 'repz montmul' | |
23 | +# | |
24 | +# sign verify sign/s verify/s | |
25 | +# rsa 512 bits 0.001720s 0.000140s 581.4 7149.7 software integer | |
26 | +# rsa 512 bits 0.000690s 0.000086s 1450.3 11606.0 software SSE2 | |
27 | +# rsa 512 bits 0.006136s 0.000201s 163.0 4974.5 hardware VIA SDK | |
28 | +# rsa 512 bits 0.000712s 0.000050s 1404.9 19858.5 hardware this | |
29 | +# | |
30 | +# rsa 1024 bits 0.008518s 0.000413s 117.4 2420.8 software integer | |
31 | +# rsa 1024 bits 0.004275s 0.000277s 233.9 3609.7 software SSE2 | |
32 | +# rsa 1024 bits 0.012136s 0.000260s 82.4 3844.5 hardware VIA SDK | |
33 | +# rsa 1024 bits 0.002522s 0.000116s 396.5 8650.9 hardware this | |
34 | +# | |
35 | +# rsa 2048 bits 0.050101s 0.001371s 20.0 729.6 software integer | |
36 | +# rsa 2048 bits 0.030273s 0.001008s 33.0 991.9 software SSE2 | |
37 | +# rsa 2048 bits 0.030833s 0.000976s 32.4 1025.1 hardware VIA SDK | |
38 | +# rsa 2048 bits 0.011879s 0.000342s 84.2 2921.7 hardware this | |
39 | +# | |
40 | +# rsa 4096 bits 0.327097s 0.004859s 3.1 205.8 software integer | |
41 | +# rsa 4096 bits 0.229318s 0.003859s 4.4 259.2 software SSE2 | |
42 | +# rsa 4096 bits 0.233953s 0.003274s 4.3 305.4 hardware VIA SDK | |
43 | +# rsa 4096 bits 0.070493s 0.001166s 14.2 857.6 hardware this | |
44 | +# | |
45 | +# dsa 512 bits 0.001342s 0.001651s 745.2 605.7 software integer | |
46 | +# dsa 512 bits 0.000844s 0.000987s 1185.3 1013.1 software SSE2 | |
47 | +# dsa 512 bits 0.001902s 0.002247s 525.6 444.9 hardware VIA SDK | |
48 | +# dsa 512 bits 0.000458s 0.000524s 2182.2 1909.1 hardware this | |
49 | +# | |
50 | +# dsa 1024 bits 0.003964s 0.004926s 252.3 203.0 software integer | |
51 | +# dsa 1024 bits 0.002686s 0.003166s 372.3 315.8 software SSE2 | |
52 | +# dsa 1024 bits 0.002397s 0.002823s 417.1 354.3 hardware VIA SDK | |
53 | +# dsa 1024 bits 0.000978s 0.001170s 1022.2 855.0 hardware this | |
54 | +# | |
55 | +# dsa 2048 bits 0.013280s 0.016518s 75.3 60.5 software integer | |
56 | +# dsa 2048 bits 0.009911s 0.011522s 100.9 86.8 software SSE2 | |
57 | +# dsa 2048 bits 0.009542s 0.011763s 104.8 85.0 hardware VIA SDK | |
58 | +# dsa 2048 bits 0.002884s 0.003352s 346.8 298.3 hardware this | |
59 | +# | |
60 | +# To give you some other reference point here is output for 2.4GHz P4 | |
61 | +# running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software | |
62 | +# SSE2" in above terms. | |
63 | +# | |
64 | +# rsa 512 bits 0.000407s 0.000047s 2454.2 21137.0 | |
65 | +# rsa 1024 bits 0.002426s 0.000141s 412.1 7100.0 | |
66 | +# rsa 2048 bits 0.015046s 0.000491s 66.5 2034.9 | |
67 | +# rsa 4096 bits 0.109770s 0.002379s 9.1 420.3 | |
68 | +# dsa 512 bits 0.000438s 0.000525s 2281.1 1904.1 | |
69 | +# dsa 1024 bits 0.001346s 0.001595s 742.7 627.0 | |
70 | +# dsa 2048 bits 0.004745s 0.005582s 210.7 179.1 | |
71 | +# | |
72 | +# Conclusions: | |
73 | +# - VIA SDK leaves a *lot* of room for improvement (which this | |
74 | +# implementation successfully fills:-); | |
75 | +# - 'rep montmul' gives up to >3x performance improvement depending on | |
76 | +# key length; | |
77 | +# - in terms of absolute performance it delivers approximately as much | |
78 | +# as modern out-of-order 32-bit cores [again, for longer keys]. | |
79 | + | |
80 | +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
81 | +push(@INC,"${dir}","${dir}../../perlasm"); | |
82 | +require "x86asm.pl"; | |
83 | + | |
84 | +&asm_init($ARGV[0],"via-mont.pl"); | |
85 | + | |
86 | +# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); | |
87 | +$func="bn_mul_mont_padlock"; | |
88 | + | |
89 | +$pad=16*1; # amount of reserved bytes on top of every vector | |
90 | + | |
91 | +# stack layout | |
92 | +$mZeroPrime=&DWP(0,"esp"); # these are specified by VIA | |
93 | +$A=&DWP(4,"esp"); | |
94 | +$B=&DWP(8,"esp"); | |
95 | +$T=&DWP(12,"esp"); | |
96 | +$M=&DWP(16,"esp"); | |
97 | +$scratch=&DWP(20,"esp"); | |
98 | +$rp=&DWP(24,"esp"); # these are mine | |
99 | +$sp=&DWP(28,"esp"); | |
100 | +# &DWP(32,"esp") # 32 byte scratch area | |
101 | +# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num] | |
102 | +# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num] | |
103 | +# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num] | |
104 | +# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num] | |
105 | +# Note that SDK suggests to unconditionally allocate 2K per vector. This | |
106 | +# has quite an impact on performance. It naturally depends on key length, | |
107 | +# but to give an example 1024 bit private RSA key operations suffer >30% | |
108 | +# penalty. I allocate only as much as actually required... | |
109 | + | |
110 | +&function_begin($func); | |
111 | + &xor ("eax","eax"); | |
112 | + &mov ("ecx",&wparam(5)); # num | |
113 | + # meet VIA's limitations for num [note that the specification | |
114 | + # expresses them in bits, while we work with amount of 32-bit words] | |
115 | + &test ("ecx",3); | |
116 | + &jnz (&label("leave")); # num % 4 != 0 | |
117 | + &cmp ("ecx",8); | |
118 | + &jb (&label("leave")); # num < 8 | |
119 | + &cmp ("ecx",1024); | |
120 | + &ja (&label("leave")); # num > 1024 | |
121 | + | |
122 | + &pushf (); | |
123 | + &cld (); | |
124 | + | |
125 | + &mov ("edi",&wparam(0)); # rp | |
126 | + &mov ("eax",&wparam(1)); # ap | |
127 | + &mov ("ebx",&wparam(2)); # bp | |
128 | + &mov ("edx",&wparam(3)); # np | |
129 | + &mov ("esi",&wparam(4)); # n0 | |
130 | + &mov ("esi",&DWP(0,"esi")); # *n0 | |
131 | + | |
132 | + &lea ("ecx",&DWP($pad,"","ecx",4)); # ecx becomes vector size in bytes | |
133 | + &lea ("ebp",&DWP(64,"","ecx",4)); # allocate 4 vectors + 64 bytes | |
134 | + &neg ("ebp"); | |
135 | + &add ("ebp","esp"); | |
136 | + &and ("ebp",-64); # align to cache-line | |
137 | + &xchg ("ebp","esp"); # alloca | |
138 | + | |
139 | + &mov ($rp,"edi"); # save rp | |
140 | + &mov ($sp,"ebp"); # save esp | |
141 | + | |
142 | + &mov ($mZeroPrime,"esi"); | |
143 | + &lea ("esi",&DWP(64,"esp")); # tp | |
144 | + &mov ($T,"esi"); | |
145 | + &lea ("edi",&DWP(32,"esp")); # scratch area | |
146 | + &mov ($scratch,"edi"); | |
147 | + &mov ("esi","eax"); | |
148 | + | |
149 | + &lea ("ebp",&DWP(-$pad,"ecx")); | |
150 | + &shr ("ebp",2); # restore original num value in ebp | |
151 | + | |
152 | + &xor ("eax","eax"); | |
153 | + | |
154 | + &mov ("ecx","ebp"); | |
155 | + &lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch | |
156 | + &data_byte(0xf3,0xab); # rep stosl, bzero | |
157 | + | |
158 | + &mov ("ecx","ebp"); | |
159 | + &lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy | |
160 | + &mov ($A,"edi"); | |
161 | + &data_byte(0xf3,0xa5); # rep movsl, memcpy | |
162 | + &mov ("ecx",$pad/4); | |
163 | + &data_byte(0xf3,0xab); # rep stosl, bzero pad | |
164 | + # edi points at the end of padded ap copy... | |
165 | + | |
166 | + &mov ("ecx","ebp"); | |
167 | + &mov ("esi","ebx"); | |
168 | + &mov ($B,"edi"); | |
169 | + &data_byte(0xf3,0xa5); # rep movsl, memcpy | |
170 | + &mov ("ecx",$pad/4); | |
171 | + &data_byte(0xf3,0xab); # rep stosl, bzero pad | |
172 | + # edi points at the end of padded bp copy... | |
173 | + | |
174 | + &mov ("ecx","ebp"); | |
175 | + &mov ("esi","edx"); | |
176 | + &mov ($M,"edi"); | |
177 | + &data_byte(0xf3,0xa5); # rep movsl, memcpy | |
178 | + &mov ("ecx",$pad/4); | |
179 | + &data_byte(0xf3,0xab); # rep stosl, bzero pad | |
180 | + # edi points at the end of padded np copy... | |
181 | + | |
182 | + # let magic happen... | |
183 | + &mov ("ecx","ebp"); | |
184 | + &mov ("esi","esp"); | |
185 | + &shl ("ecx",5); # convert word counter to bit counter | |
186 | + &align (4); | |
187 | + &data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul | |
188 | + | |
189 | + &mov ("ecx","ebp"); | |
190 | + &lea ("esi",&DWP(64,"esp")); # tp | |
191 | + # edi still points at the end of padded np copy... | |
192 | + &neg ("ebp"); | |
193 | + &lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind" | |
194 | + &mov ("edi",$rp); # restore rp | |
195 | + &xor ("edx","edx"); # i=0 and clear CF | |
196 | + | |
197 | +&set_label("sub",8); # rp[]=tp[]-np[], borrow is chained through CF across iterations | |
198 | + &mov ("eax",&DWP(0,"esi","edx",4)); # tp[i] | |
199 | + &sbb ("eax",&DWP(0,"ebp","edx",4)); # -=np[i]+borrow | |
200 | + &mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i] | |
201 | + &lea ("edx",&DWP(1,"edx")); # i++ [lea is used to leave CF intact] | |
202 | + &loop (&label("sub")); # doesn't affect CF! | |
203 | + | |
204 | + &mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit | |
205 | + &sbb ("eax",0); # fold final borrow in, eax is the selection mask | |
206 | + &and ("esi","eax"); # tp&mask | |
207 | + &not ("eax"); # invert mask [was garbled to a bare "not sign" character] | |
208 | + &mov ("ebp","edi"); | |
209 | + &and ("ebp","eax"); # rp&~mask | |
210 | + &or ("esi","ebp"); # tp=carry?tp:rp | |
211 | + | |
212 | + &mov ("ecx","edx"); # num | |
213 | + &xor ("edx","edx"); # i=0 | |
214 | + | |
215 | +&set_label("copy",8); # copy selected vector to rp and overwrite tp | |
216 | + &mov ("eax",&DWP(0,"esi","edx",4)); # read before tp[i] is overwritten | |
217 | + &mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp [with loop counter value] | |
218 | + &mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]= | |
219 | + &lea ("edx",&DWP(1,"edx")); # i++ | |
220 | + &loop (&label("copy")); | |
221 | + | |
222 | + &mov ("ebp",$sp); | |
223 | + &xor ("eax","eax"); | |
224 | + | |
225 | + &mov ("ecx",64/4); | |
226 | + &mov ("edi","esp"); # zap frame including scratch area | |
227 | + &data_byte(0xf3,0xab); # rep stosl, bzero | |
228 | + | |
229 | + # zap copies of ap, bp and np | |
230 | + &lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap | |
231 | + &lea ("ecx",&DWP(3*$pad/4,"edx","edx",2)); | |
232 | + &data_byte(0xf3,0xab); # rep stosl, bzero | |
233 | + | |
234 | + &mov ("esp","ebp"); | |
235 | + &inc ("eax"); # signal "done" | |
236 | + &popf (); | |
237 | +&set_label("leave"); | |
238 | +&function_end($func); | |
239 | + | |
240 | +&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>"); | |
241 | + | |
242 | +&asm_finish(); |
@@ -0,0 +1,591 @@ | ||
1 | +#!/usr/bin/env perl | |
2 | + | |
3 | +# ==================================================================== | |
4 | +# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
5 | +# project. The module is, however, dual licensed under OpenSSL and | |
6 | +# CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | +# details see http://www.openssl.org/~appro/cryptogams/. | |
8 | +# ==================================================================== | |
9 | + | |
10 | +# October 2005 | |
11 | +# | |
12 | +# This is a "teaser" code, as it can be improved in several ways... | |
13 | +# First of all non-SSE2 path should be implemented (yes, for now it | |
14 | +# performs Montgomery multiplication/convolution only on SSE2-capable | |
15 | +# CPUs such as P4, others fall down to original code). Then inner loop | |
16 | +# can be unrolled and modulo-scheduled to improve ILP and possibly | |
17 | +# moved to 128-bit XMM register bank (though it would require input | |
18 | +# rearrangement and/or increase bus bandwidth utilization). Dedicated | |
19 | +# squaring procedure should give further performance improvement... | |
20 | +# Yet, for being draft, the code improves rsa512 *sign* benchmark by | |
21 | +# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-) | |
22 | + | |
23 | +# December 2006 | |
24 | +# | |
25 | +# Modulo-scheduling SSE2 loops results in further 15-20% improvement. | |
26 | +# Integer-only code [being equipped with dedicated squaring procedure] | |
27 | +# gives ~40% on rsa512 sign benchmark... | |
28 | + | |
29 | +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
30 | +push(@INC,"${dir}","${dir}../../perlasm"); | |
31 | +require "x86asm.pl"; | |
32 | + | |
33 | +&asm_init($ARGV[0],$0); | |
34 | + | |
35 | +$sse2=0; | |
36 | +for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } | |
37 | + | |
38 | +&external_label("OPENSSL_ia32cap_P") if ($sse2); | |
39 | + | |
40 | +&function_begin("bn_mul_mont"); | |
41 | + | |
42 | +$i="edx"; | |
43 | +$j="ecx"; | |
44 | +$ap="esi"; $tp="esi"; # overlapping variables!!! | |
45 | +$rp="edi"; $bp="edi"; # overlapping variables!!! | |
46 | +$np="ebp"; | |
47 | +$num="ebx"; | |
48 | + | |
49 | +$_num=&DWP(4*0,"esp"); # stack top layout | |
50 | +$_rp=&DWP(4*1,"esp"); | |
51 | +$_ap=&DWP(4*2,"esp"); | |
52 | +$_bp=&DWP(4*3,"esp"); | |
53 | +$_np=&DWP(4*4,"esp"); | |
54 | +$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp"); | |
55 | +$_sp=&DWP(4*6,"esp"); | |
56 | +$_bpend=&DWP(4*7,"esp"); | |
57 | +$frame=32; # size of above frame rounded up to 16n | |
58 | + | |
59 | + &xor ("eax","eax"); | |
60 | + &mov ("edi",&wparam(5)); # int num | |
61 | + &cmp ("edi",4); | |
62 | + &jl (&label("just_leave")); | |
63 | + | |
64 | + &lea ("esi",&wparam(0)); # put aside pointer to argument block | |
65 | + &lea ("edx",&wparam(1)); # load ap | |
66 | + &mov ("ebp","esp"); # saved stack pointer! | |
67 | + &add ("edi",2); # extra two words on top of tp | |
68 | + &neg ("edi"); | |
69 | + &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2)) | |
70 | + &neg ("edi"); | |
71 | + | |
72 | + # minimize cache contention by arranging 2K window between stack | |
73 | + # pointer and ap argument [np is also position sensitive vector, | |
74 | + # but it's assumed to be near ap, as it's allocated at ~same | |
75 | + # time]. | |
76 | + &mov ("eax","esp"); | |
77 | + &sub ("eax","edx"); | |
78 | + &and ("eax",2047); | |
79 | + &sub ("esp","eax"); # this aligns sp and ap modulo 2048 | |
80 | + | |
81 | + &xor ("edx","esp"); | |
82 | + &and ("edx",2048); | |
83 | + &xor ("edx",2048); | |
84 | + &sub ("esp","edx"); # this splits them apart modulo 4096 | |
85 | + | |
86 | + &and ("esp",-64); # align to cache line | |
87 | + | |
88 | + ################################# load argument block... | |
89 | + &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp | |
90 | + &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap | |
91 | + &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp | |
92 | + &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np | |
93 | + &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0 | |
94 | + #&mov ("edi",&DWP(5*4,"esi"));# int num | |
95 | + | |
96 | + &mov ("esi",&DWP(0,"esi")); # pull n0[0] | |
97 | + &mov ($_rp,"eax"); # ... save a copy of argument block | |
98 | + &mov ($_ap,"ebx"); | |
99 | + &mov ($_bp,"ecx"); | |
100 | + &mov ($_np,"edx"); | |
101 | + &mov ($_n0,"esi"); | |
102 | + &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling | |
103 | + #&mov ($_num,$num); # redundant as $num is not reused | |
104 | + &mov ($_sp,"ebp"); # saved stack pointer! | |
105 | + | |
106 | +if($sse2) { | |
107 | +$acc0="mm0"; # mmx register bank layout | |
108 | +$acc1="mm1"; | |
109 | +$car0="mm2"; | |
110 | +$car1="mm3"; | |
111 | +$mul0="mm4"; | |
112 | +$mul1="mm5"; | |
113 | +$temp="mm6"; | |
114 | +$mask="mm7"; | |
115 | + | |
116 | + &picmeup("eax","OPENSSL_ia32cap_P"); | |
117 | + &bt (&DWP(0,"eax"),26); | |
118 | + &jnc (&label("non_sse2")); | |
119 | + | |
120 | + &mov ("eax",-1); | |
121 | + &movd ($mask,"eax"); # mask 32 lower bits | |
122 | + | |
123 | + &mov ($ap,$_ap); # load input pointers | |
124 | + &mov ($bp,$_bp); | |
125 | + &mov ($np,$_np); | |
126 | + | |
127 | + &xor ($i,$i); # i=0 | |
128 | + &xor ($j,$j); # j=0 | |
129 | + | |
130 | + &movd ($mul0,&DWP(0,$bp)); # bp[0] | |
131 | + &movd ($mul1,&DWP(0,$ap)); # ap[0] | |
132 | + &movd ($car1,&DWP(0,$np)); # np[0] | |
133 | + | |
134 | + &pmuludq($mul1,$mul0); # ap[0]*bp[0] | |
135 | + &movq ($car0,$mul1); | |
136 | + &movq ($acc0,$mul1); # I wish movd worked for | |
137 | + &pand ($acc0,$mask); # inter-register transfers | |
138 | + | |
139 | + &pmuludq($mul1,$_n0q); # *=n0 | |
140 | + | |
141 | + &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0 | |
142 | + &paddq ($car1,$acc0); | |
143 | + | |
144 | + &movd ($acc1,&DWP(4,$np)); # np[1] | |
145 | + &movd ($acc0,&DWP(4,$ap)); # ap[1] | |
146 | + | |
147 | + &psrlq ($car0,32); | |
148 | + &psrlq ($car1,32); | |
149 | + | |
150 | + &inc ($j); # j++ | |
151 | +&set_label("1st",16); | |
152 | + &pmuludq($acc0,$mul0); # ap[j]*bp[0] | |
153 | + &pmuludq($acc1,$mul1); # np[j]*m1 | |
154 | + &paddq ($car0,$acc0); # +=c0 | |
155 | + &paddq ($car1,$acc1); # +=c1 | |
156 | + | |
157 | + &movq ($acc0,$car0); | |
158 | + &pand ($acc0,$mask); | |
159 | + &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] | |
160 | + &paddq ($car1,$acc0); # +=ap[j]*bp[0]; | |
161 | + &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] | |
162 | + &psrlq ($car0,32); | |
163 | + &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]= | |
164 | + &psrlq ($car1,32); | |
165 | + | |
166 | + &lea ($j,&DWP(1,$j)); | |
167 | + &cmp ($j,$num); | |
168 | + &jl (&label("1st")); | |
169 | + | |
170 | + &pmuludq($acc0,$mul0); # ap[num-1]*bp[0] | |
171 | + &pmuludq($acc1,$mul1); # np[num-1]*m1 | |
172 | + &paddq ($car0,$acc0); # +=c0 | |
173 | + &paddq ($car1,$acc1); # +=c1 | |
174 | + | |
175 | + &movq ($acc0,$car0); | |
176 | + &pand ($acc0,$mask); | |
177 | + &paddq ($car1,$acc0); # +=ap[num-1]*bp[0]; | |
178 | + &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= | |
179 | + | |
180 | + &psrlq ($car0,32); | |
181 | + &psrlq ($car1,32); | |
182 | + | |
183 | + &paddq ($car1,$car0); | |
184 | + &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] | |
185 | + | |
186 | + &inc ($i); # i++ | |
187 | +&set_label("outer"); | |
188 | + &xor ($j,$j); # j=0 | |
189 | + | |
190 | + &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i] | |
191 | + &movd ($mul1,&DWP(0,$ap)); # ap[0] | |
192 | + &movd ($temp,&DWP($frame,"esp")); # tp[0] | |
193 | + &movd ($car1,&DWP(0,$np)); # np[0] | |
194 | + &pmuludq($mul1,$mul0); # ap[0]*bp[i] | |
195 | + | |
196 | + &paddq ($mul1,$temp); # +=tp[0] | |
197 | + &movq ($acc0,$mul1); | |
198 | + &movq ($car0,$mul1); | |
199 | + &pand ($acc0,$mask); | |
200 | + | |
201 | + &pmuludq($mul1,$_n0q); # *=n0 | |
202 | + | |
203 | + &pmuludq($car1,$mul1); | |
204 | + &paddq ($car1,$acc0); | |
205 | + | |
206 | + &movd ($temp,&DWP($frame+4,"esp")); # tp[1] | |
207 | + &movd ($acc1,&DWP(4,$np)); # np[1] | |
208 | + &movd ($acc0,&DWP(4,$ap)); # ap[1] | |
209 | + | |
210 | + &psrlq ($car0,32); | |
211 | + &psrlq ($car1,32); | |
212 | + &paddq ($car0,$temp); # +=tp[1] | |
213 | + | |
214 | + &inc ($j); # j++ | |
215 | + &dec ($num); | |
216 | +&set_label("inner"); | |
217 | + &pmuludq($acc0,$mul0); # ap[j]*bp[i] | |
218 | + &pmuludq($acc1,$mul1); # np[j]*m1 | |
219 | + &paddq ($car0,$acc0); # +=c0 | |
220 | + &paddq ($car1,$acc1); # +=c1 | |
221 | + | |
222 | + &movq ($acc0,$car0); | |
223 | + &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1] | |
224 | + &pand ($acc0,$mask); | |
225 | + &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1] | |
226 | + &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j] | |
227 | + &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1] | |
228 | + &psrlq ($car0,32); | |
229 | + &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]= | |
230 | + &psrlq ($car1,32); | |
231 | + &paddq ($car0,$temp); # +=tp[j+1] | |
232 | + | |
233 | + &dec ($num); | |
234 | + &lea ($j,&DWP(1,$j)); # j++ | |
235 | + &jnz (&label("inner")); | |
236 | + | |
237 | + &mov ($num,$j); | |
238 | + &pmuludq($acc0,$mul0); # ap[num-1]*bp[i] | |
239 | + &pmuludq($acc1,$mul1); # np[num-1]*m1 | |
240 | + &paddq ($car0,$acc0); # +=c0 | |
241 | + &paddq ($car1,$acc1); # +=c1 | |
242 | + | |
243 | + &movq ($acc0,$car0); | |
244 | + &pand ($acc0,$mask); | |
245 | + &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1] | |
246 | + &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]= | |
247 | + &psrlq ($car0,32); | |
248 | + &psrlq ($car1,32); | |
249 | + | |
250 | + &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num] | |
251 | + &paddq ($car1,$car0); | |
252 | + &paddq ($car1,$temp); | |
253 | + &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1] | |
254 | + | |
255 | + &lea ($i,&DWP(1,$i)); # i++ | |
256 | + &cmp ($i,$num); | |
257 | + &jle (&label("outer")); | |
258 | + | |
259 | + &emms (); # done with mmx bank | |
260 | + &jmp (&label("common_tail")); | |
261 | + | |
262 | +&set_label("non_sse2",16); | |
263 | +} | |
264 | + | |
265 | +if (0) { | |
266 | + &mov ("esp",$_sp); | |
267 | + &xor ("eax","eax"); # signal "not fast enough [yet]" | |
268 | + &jmp (&label("just_leave")); | |
269 | + # While the below code provides competitive performance for | |
270 | + # all key lengths on modern Intel cores, it's still more | |
271 | + # than 10% slower for 4096-bit key elsewhere:-( "Competitive" | |
272 | + # means compared to the original integer-only assembler. | |
273 | + # 512-bit RSA sign is better by ~40%, but that's about all | |
274 | + # one can say about all CPUs... | |
275 | +} else { | |
276 | +$inp="esi"; # integer path uses these registers differently | |
277 | +$word="edi"; | |
278 | +$carry="ebp"; | |
279 | + | |
280 | + &mov ($inp,$_ap); | |
281 | + &lea ($carry,&DWP(1,$num)); | |
282 | + &mov ($word,$_bp); | |
283 | + &xor ($j,$j); # j=0 | |
284 | + &mov ("edx",$inp); | |
285 | + &and ($carry,1); # see if num is even | |
286 | + &sub ("edx",$word); # see if ap==bp | |
287 | + &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num] | |
288 | + &or ($carry,"edx"); | |
289 | + &mov ($word,&DWP(0,$word)); # bp[0] | |
290 | + &jz (&label("bn_sqr_mont")); | |
291 | + &mov ($_bpend,"eax"); | |
292 | + &mov ("eax",&DWP(0,$inp)); | |
293 | + &xor ("edx","edx"); | |
294 | + | |
295 | +&set_label("mull",16); | |
296 | + &mov ($carry,"edx"); | |
297 | + &mul ($word); # ap[j]*bp[0] | |
298 | + &add ($carry,"eax"); | |
299 | + &lea ($j,&DWP(1,$j)); | |
300 | + &adc ("edx",0); | |
301 | + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1] | |
302 | + &cmp ($j,$num); | |
303 | + &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= | |
304 | + &jl (&label("mull")); | |
305 | + | |
306 | + &mov ($carry,"edx"); | |
307 | + &mul ($word); # ap[num-1]*bp[0] | |
308 | + &mov ($word,$_n0); | |
309 | + &add ("eax",$carry); | |
310 | + &mov ($inp,$_np); | |
311 | + &adc ("edx",0); | |
312 | + &imul ($word,&DWP($frame,"esp")); # n0*tp[0] | |
313 | + | |
314 | + &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]= | |
315 | + &xor ($j,$j); | |
316 | + &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= | |
317 | + &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= | |
318 | + | |
319 | + &mov ("eax",&DWP(0,$inp)); # np[0] | |
320 | + &mul ($word); # np[0]*m | |
321 | + &add ("eax",&DWP($frame,"esp")); # +=tp[0] | |
322 | + &mov ("eax",&DWP(4,$inp)); # np[1] | |
323 | + &adc ("edx",0); | |
324 | + &inc ($j); | |
325 | + | |
326 | + &jmp (&label("2ndmadd")); | |
327 | + | |
328 | +&set_label("1stmadd",16); | |
329 | + &mov ($carry,"edx"); | |
330 | + &mul ($word); # ap[j]*bp[i] | |
331 | + &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] | |
332 | + &lea ($j,&DWP(1,$j)); | |
333 | + &adc ("edx",0); | |
334 | + &add ($carry,"eax"); | |
335 | + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1] | |
336 | + &adc ("edx",0); | |
337 | + &cmp ($j,$num); | |
338 | + &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= | |
339 | + &jl (&label("1stmadd")); | |
340 | + | |
341 | + &mov ($carry,"edx"); | |
342 | + &mul ($word); # ap[num-1]*bp[i] | |
343 | + &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1] | |
344 | + &mov ($word,$_n0); | |
345 | + &adc ("edx",0); | |
346 | + &mov ($inp,$_np); | |
347 | + &add ($carry,"eax"); | |
348 | + &adc ("edx",0); | |
349 | + &imul ($word,&DWP($frame,"esp")); # n0*tp[0] | |
350 | + | |
351 | + &xor ($j,$j); | |
352 | + &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] | |
353 | + &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]= | |
354 | + &adc ($j,0); | |
355 | + &mov ("eax",&DWP(0,$inp)); # np[0] | |
356 | + &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]= | |
357 | + &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]= | |
358 | + | |
359 | + &mul ($word); # np[0]*m | |
360 | + &add ("eax",&DWP($frame,"esp")); # +=tp[0] | |
361 | + &mov ("eax",&DWP(4,$inp)); # np[1] | |
362 | + &adc ("edx",0); | |
363 | + &mov ($j,1); | |
364 | + | |
365 | +&set_label("2ndmadd",16); | |
366 | + &mov ($carry,"edx"); | |
367 | + &mul ($word); # np[j]*m | |
368 | + &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] | |
369 | + &lea ($j,&DWP(1,$j)); | |
370 | + &adc ("edx",0); | |
371 | + &add ($carry,"eax"); | |
372 | + &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1] | |
373 | + &adc ("edx",0); | |
374 | + &cmp ($j,$num); | |
375 | + &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]= | |
376 | + &jl (&label("2ndmadd")); | |
377 | + | |
378 | + &mov ($carry,"edx"); | |
379 | + &mul ($word); # np[j]*m | |
380 | + &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1] | |
381 | + &adc ("edx",0); | |
382 | + &add ($carry,"eax"); | |
383 | + &adc ("edx",0); | |
384 | + &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]= | |
385 | + | |
386 | + &xor ("eax","eax"); | |
387 | + &mov ($j,$_bp); # &bp[i] | |
388 | + &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] | |
389 | + &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1] | |
390 | + &lea ($j,&DWP(4,$j)); | |
391 | + &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]= | |
392 | + &cmp ($j,$_bpend); | |
393 | + &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]= | |
394 | + &je (&label("common_tail")); | |
395 | + | |
396 | + &mov ($word,&DWP(0,$j)); # bp[i+1] | |
397 | + &mov ($inp,$_ap); | |
398 | + &mov ($_bp,$j); # &bp[++i] | |
399 | + &xor ($j,$j); | |
400 | + &xor ("edx","edx"); | |
401 | + &mov ("eax",&DWP(0,$inp)); | |
402 | + &jmp (&label("1stmadd")); | |
403 | + | |
404 | +&set_label("bn_sqr_mont",16); | |
405 | +$sbit=$num; | |
406 | + &mov ($_num,$num); | |
407 | + &mov ($_bp,$j); # i=0 | |
408 | + | |
409 | + &mov ("eax",$word); # ap[0] | |
410 | + &mul ($word); # ap[0]*ap[0] | |
411 | + &mov (&DWP($frame,"esp"),"eax"); # tp[0]= | |
412 | + &mov ($sbit,"edx"); | |
413 | + &shr ("edx",1); | |
414 | + &and ($sbit,1); | |
415 | + &inc ($j); | |
416 | +&set_label("sqr",16); | |
417 | + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] | |
418 | + &mov ($carry,"edx"); | |
419 | + &mul ($word); # ap[j]*ap[0] | |
420 | + &add ("eax",$carry); | |
421 | + &lea ($j,&DWP(1,$j)); | |
422 | + &adc ("edx",0); | |
423 | + &lea ($carry,&DWP(0,$sbit,"eax",2)); | |
424 | + &shr ("eax",31); | |
425 | + &cmp ($j,$_num); | |
426 | + &mov ($sbit,"eax"); | |
427 | + &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= | |
428 | + &jl (&label("sqr")); | |
429 | + | |
430 | + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1] | |
431 | + &mov ($carry,"edx"); | |
432 | + &mul ($word); # ap[num-1]*ap[0] | |
433 | + &add ("eax",$carry); | |
434 | + &mov ($word,$_n0); | |
435 | + &adc ("edx",0); | |
436 | + &mov ($inp,$_np); | |
437 | + &lea ($carry,&DWP(0,$sbit,"eax",2)); | |
438 | + &imul ($word,&DWP($frame,"esp")); # n0*tp[0] | |
439 | + &shr ("eax",31); | |
440 | + &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]= | |
441 | + | |
442 | + &lea ($carry,&DWP(0,"eax","edx",2)); | |
443 | + &mov ("eax",&DWP(0,$inp)); # np[0] | |
444 | + &shr ("edx",31); | |
445 | + &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]= | |
446 | + &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]= | |
447 | + | |
448 | + &mul ($word); # np[0]*m | |
449 | + &add ("eax",&DWP($frame,"esp")); # +=tp[0] | |
450 | + &mov ($num,$j); | |
451 | + &adc ("edx",0); | |
452 | + &mov ("eax",&DWP(4,$inp)); # np[1] | |
453 | + &mov ($j,1); | |
454 | + | |
455 | +&set_label("3rdmadd",16); | |
456 | + &mov ($carry,"edx"); | |
457 | + &mul ($word); # np[j]*m | |
458 | + &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] | |
459 | + &adc ("edx",0); | |
460 | + &add ($carry,"eax"); | |
461 | + &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1] | |
462 | + &adc ("edx",0); | |
463 | + &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]= | |
464 | + | |
465 | + &mov ($carry,"edx"); | |
466 | + &mul ($word); # np[j+1]*m | |
467 | + &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1] | |
468 | + &lea ($j,&DWP(2,$j)); | |
469 | + &adc ("edx",0); | |
470 | + &add ($carry,"eax"); | |
471 | + &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2] | |
472 | + &adc ("edx",0); | |
473 | + &cmp ($j,$num); | |
474 | + &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]= | |
475 | + &jl (&label("3rdmadd")); | |
476 | + | |
477 | + &mov ($carry,"edx"); | |
478 | + &mul ($word); # np[j]*m | |
479 | + &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1] | |
480 | + &adc ("edx",0); | |
481 | + &add ($carry,"eax"); | |
482 | + &adc ("edx",0); | |
483 | + &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]= | |
484 | + | |
485 | + &mov ($j,$_bp); # i | |
486 | + &xor ("eax","eax"); | |
487 | + &mov ($inp,$_ap); | |
488 | + &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num] | |
489 | + &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1] | |
490 | + &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]= | |
491 | + &cmp ($j,$num); | |
492 | + &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]= | |
493 | + &je (&label("common_tail")); | |
494 | + | |
495 | + &mov ($word,&DWP(4,$inp,$j,4)); # ap[i] | |
496 | + &lea ($j,&DWP(1,$j)); | |
497 | + &mov ("eax",$word); | |
498 | + &mov ($_bp,$j); # ++i | |
499 | + &mul ($word); # ap[i]*ap[i] | |
500 | + &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i] | |
501 | + &adc ("edx",0); | |
502 | + &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]= | |
503 | + &xor ($carry,$carry); | |
504 | + &cmp ($j,$num); | |
505 | + &lea ($j,&DWP(1,$j)); | |
506 | + &je (&label("sqrlast")); | |
507 | + | |
508 | + &mov ($sbit,"edx"); # zaps $num | |
509 | + &shr ("edx",1); | |
510 | + &and ($sbit,1); | |
511 | +&set_label("sqradd",16); | |
512 | + &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j] | |
513 | + &mov ($carry,"edx"); | |
514 | + &mul ($word); # ap[j]*ap[i] | |
515 | + &add ("eax",$carry); | |
516 | + &lea ($carry,&DWP(0,"eax","eax")); | |
517 | + &adc ("edx",0); | |
518 | + &shr ("eax",31); | |
519 | + &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j] | |
520 | + &lea ($j,&DWP(1,$j)); | |
521 | + &adc ("eax",0); | |
522 | + &add ($carry,$sbit); | |
523 | + &adc ("eax",0); | |
524 | + &cmp ($j,$_num); | |
525 | + &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]= | |
526 | + &mov ($sbit,"eax"); | |
527 | + &jle (&label("sqradd")); | |
528 | + | |
529 | + &mov ($carry,"edx"); | |
530 | + &lea ("edx",&DWP(0,$sbit,"edx",2)); | |
531 | + &shr ($carry,31); | |
532 | +&set_label("sqrlast"); | |
533 | + &mov ($word,$_n0); | |
534 | + &mov ($inp,$_np); | |
535 | + &imul ($word,&DWP($frame,"esp")); # n0*tp[0] | |
536 | + | |
537 | + &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num] | |
538 | + &mov ("eax",&DWP(0,$inp)); # np[0] | |
539 | + &adc ($carry,0); | |
540 | + &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]= | |
541 | + &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]= | |
542 | + | |
543 | + &mul ($word); # np[0]*m | |
544 | + &add ("eax",&DWP($frame,"esp")); # +=tp[0] | |
545 | + &lea ($num,&DWP(-1,$j)); | |
546 | + &adc ("edx",0); | |
547 | + &mov ($j,1); | |
548 | + &mov ("eax",&DWP(4,$inp)); # np[1] | |
549 | + | |
550 | + &jmp (&label("3rdmadd")); | |
551 | +} | |
552 | + | |
553 | +&set_label("common_tail",16); # final reduction shared by the SSE2 and integer paths | |
554 | + &mov ($np,$_np); # load modulus pointer | |
555 | + &mov ($rp,$_rp); # load result pointer | |
556 | + &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped] | |
557 | + | |
558 | + &mov ("eax",&DWP(0,$tp)); # tp[0] | |
559 | + &mov ($j,$num); # j=num-1 | |
560 | + &xor ($i,$i); # i=0 and clear CF! | |
561 | + | |
562 | +&set_label("sub",16); # rp[]=tp[]-np[], borrow is chained through CF | |
563 | + &sbb ("eax",&DWP(0,$np,$i,4)); # tp[i]-np[i]-borrow | |
564 | + &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i] | |
565 | + &dec ($j); # doesn't affect CF! | |
566 | + &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1] | |
567 | + &lea ($i,&DWP(1,$i)); # i++ | |
568 | + &jge (&label("sub")); # branches on flags set by dec above | |
569 | + | |
570 | + &sbb ("eax",0); # handle upmost overflow bit | |
571 | + &and ($tp,"eax"); # tp&mask | |
572 | + &not ("eax"); # invert mask [was garbled to a bare "not sign" character] | |
573 | + &mov ($np,$rp); | |
574 | + &and ($np,"eax"); # rp&~mask | |
575 | + &or ($tp,$np); # tp=carry?tp:rp | |
576 | + | |
577 | +&set_label("copy",16); # copy or in-place refresh | |
578 | + &mov ("eax",&DWP(0,$tp,$num,4)); # walks from index num-1 down to 0 | |
579 | + &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i] | |
580 | + &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector | |
581 | + &dec ($num); | |
582 | + &jge (&label("copy")); | |
583 | + | |
584 | + &mov ("esp",$_sp); # pull saved stack pointer | |
585 | + &mov ("eax",1); # return 1 [eax is 0 on the early just_leave exit] | |
586 | +&set_label("just_leave"); | |
587 | +&function_end("bn_mul_mont"); | |
588 | + | |
589 | +&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>"); | |
590 | + | |
591 | +&asm_finish(); |
@@ -0,0 +1,3 @@ | ||
1 | +lib | |
2 | +Makefile.save | |
3 | +cmll-*.s |
@@ -0,0 +1,1138 @@ | ||
1 | +#!/usr/bin/env perl | |
2 | + | |
3 | +# ==================================================================== | |
4 | +# Copyright (c) 2008 Andy Polyakov <appro@openssl.org> | |
5 | +# | |
6 | +# This module may be used under the terms of either the GNU General | |
7 | +# Public License version 2 or later, the GNU Lesser General Public | |
8 | +# License version 2.1 or later, the Mozilla Public License version | |
9 | +# 1.1 or the BSD License. The exact terms of either license are | |
10 | +# distributed along with this module. For further details see | |
11 | +# http://www.openssl.org/~appro/camellia/. | |
12 | +# ==================================================================== | |
13 | + | |
14 | +# Performance in cycles per processed byte (less is better) in | |
15 | +# 'openssl speed ...' benchmark: | |
16 | +# | |
17 | +# AMD K8 Core2 PIII P4 | |
18 | +# -evp camellia-128-ecb 21.5 22.8 27.0 28.9 | |
19 | +# + over gcc 3.4.6 +90/11% +70/10% +53/4% +160/64% | |
20 | +# + over icc 8.0 +48/19% +21/15% +21/17% +55/37% | |
21 | +# | |
22 | +# camellia-128-cbc 17.3 21.1 23.9 25.9 | |
23 | +# | |
24 | +# 128-bit key setup 196 280 256 240 cycles/key | |
25 | +# + over gcc 3.4.6 +30/0% +17/11% +11/0% +63/40% | |
26 | +# + over icc 8.0 +18/3% +10/0% +10/3% +21/10% | |
27 | +# | |
28 | +# Pairs of numbers in "+" rows represent performance improvement over | |
29 | +# compiler generated position-independent code, PIC, and non-PIC | |
30 | +# respectively. PIC results are of greater relevance, as this module | |
31 | +# is position-independent, i.e. suitable for a shared library or PIE. | |
32 | +# Position independence "costs" one register, which is why compilers | |
33 | +# are so close with non-PIC results, they have an extra register to | |
34 | +# spare. CBC results are better than ECB ones thanks to "zero-copy" | |
35 | +# private _x86_* interface, and are ~30-40% better than with compiler | |
36 | +# generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on | |
37 | +# same CPU (where applicable). | |
38 | + | |
39 | +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
40 | +push(@INC,"${dir}","${dir}../../perlasm"); | |
41 | +require "x86asm.pl"; | |
42 | + | |
43 | +$OPENSSL=1; | |
44 | + | |
45 | +&asm_init($ARGV[0],"cmll-586.pl",$ARGV[$#ARGV] eq "386"); | |
46 | + | |
47 | +@T=("eax","ebx","ecx","edx"); | |
48 | +$idx="esi"; | |
49 | +$key="edi"; | |
50 | +$Tbl="ebp"; | |
51 | + | |
52 | +# stack frame layout as seen inside the _x86_Camellia_* routines, i.e. with | |
53 | +# the return address at 0(%esp); the frame itself is allocated by the caller | |
54 | +$__ra=&DWP(0,"esp"); # return address | |
55 | +$__s0=&DWP(4,"esp"); # s0 backing store | |
56 | +$__s1=&DWP(8,"esp"); # s1 backing store | |
57 | +$__s2=&DWP(12,"esp"); # s2 backing store | |
58 | +$__s3=&DWP(16,"esp"); # s3 backing store | |
59 | +$__end=&DWP(20,"esp"); # pointer to end/start of key schedule (compared against $key to end the round loop) | |
60 | + | |
61 | +# stack frame layout in Camellia_[en|de]crypt routines, which differs from | |
62 | +# above by 4 and overlaps by pointer to end/start of key schedule | |
63 | +$_end=&DWP(16,"esp"); | |
64 | +$_esp=&DWP(20,"esp"); | |
65 | + | |
66 | +# const unsigned int Camellia_SBOX[4][256]; | |
67 | +# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][], | |
68 | +# and [2][] - with [3][]. This is done to optimize code size. The values below are byte offsets from $Tbl; lookups then scale the index by 8. | |
69 | +$SBOX1_1110=0; # Camellia_SBOX[0] | |
70 | +$SBOX4_4404=4; # Camellia_SBOX[1] | |
71 | +$SBOX2_0222=2048; # Camellia_SBOX[2] | |
72 | +$SBOX3_3033=2052; # Camellia_SBOX[3] | |
73 | +&static_label("Camellia_SIGMA"); | |
74 | +&static_label("Camellia_SBOX"); | |
75 | + | |
76 | +sub Camellia_Feistel { # emit one Feistel round; expects $idx preloaded with the low word of this round's key | |
77 | +my $i=$_[0]; # round index; its parity selects which register pair is the round input | |
78 | +my $seed=defined($_[1])?$_[1]:0; # byte offset of round key 0 relative to $key (default 0) | |
79 | +my $scale=$seed<0?-8:8; # a negative seed walks the key schedule backwards | |
80 | +my $frame=defined($_[2])?$_[2]:0; # byte offset of the s[0-3] backing store from %esp (default 0) | |
81 | +my $j=($i&1)*2; | |
82 | +my ($t0,$t1,$t2,$t3)=($T[($j)%4],$T[($j+1)%4],$T[($j+2)%4],$T[($j+3)%4]); # list form of my() keeps all four lexical | |
83 | + | |
84 | + &xor ($t0,$idx); # t0^=key[0] | |
85 | + &xor ($t1,&DWP($seed+$i*$scale+4,$key)); # t1^=key[1] | |
86 | + &movz ($idx,&HB($t0)); # (t0>>8)&0xff | |
87 | + &mov ($t3,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t3=SBOX3_3033[0] | |
88 | + &movz ($idx,&LB($t0)); # (t0>>0)&0xff | |
89 | + &xor ($t3,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t3^=SBOX4_4404[0] | |
90 | + &shr ($t0,16); | |
91 | + &movz ($idx,&LB($t1)); # (t1>>0)&0xff | |
92 | + &mov ($t2,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t2=SBOX1_1110[1] | |
93 | + &movz ($idx,&HB($t0)); # (t0>>24)&0xff | |
94 | + &xor ($t3,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t3^=SBOX1_1110[0] | |
95 | + &movz ($idx,&HB($t1)); # (t1>>8)&0xff | |
96 | + &xor ($t2,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t2^=SBOX4_4404[1] | |
97 | + &shr ($t1,16); | |
98 | + &movz ($t0,&LB($t0)); # (t0>>16)&0xff | |
99 | + &xor ($t3,&DWP($SBOX2_0222,$Tbl,$t0,8)); # t3^=SBOX2_0222[0] | |
100 | + &movz ($idx,&HB($t1)); # (t1>>24)&0xff | |
101 | + &mov ($t0,&DWP($frame+4*(($j+3)%4),"esp")); # prefetch "s3" | |
102 | + &xor ($t2,$t3); # t2^=t3 | |
103 | + &rotr ($t3,8); # t3=RightRotate(t3,8) | |
104 | + &xor ($t2,&DWP($SBOX2_0222,$Tbl,$idx,8)); # t2^=SBOX2_0222[1] | |
105 | + &movz ($idx,&LB($t1)); # (t1>>16)&0xff | |
106 | + &mov ($t1,&DWP($frame+4*(($j+2)%4),"esp")); # prefetch "s2" | |
107 | + &xor ($t3,$t0); # t3^=s3 | |
108 | + &xor ($t2,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t2^=SBOX3_3033[1] | |
109 | + &mov ($idx,&DWP($seed+($i+1)*$scale,$key)); # prefetch key[i+1] | |
110 | + &xor ($t3,$t2); # t3^=t2 | |
111 | + &mov (&DWP($frame+4*(($j+3)%4),"esp"),$t3); # s3=t3 | |
112 | + &xor ($t2,$t1); # t2^=s2 | |
113 | + &mov (&DWP($frame+4*(($j+2)%4),"esp"),$t2); # s2=t2 | |
114 | +} | |
115 | + | |
116 | +# void Camellia_EncryptBlock_Rounds( | |
117 | +# int grandRounds, /* keyEnd is derived as keyTable+grandRounds*64 */ | |
118 | +# const Byte plaintext[], /* 16 bytes, read as four byte-swapped 32-bit words */ | |
119 | +# const KEY_TABLE_TYPE keyTable, | |
120 | +# Byte ciphertext[]) /* 16 bytes, written as four byte-swapped 32-bit words */ | |
121 | +&function_begin("Camellia_EncryptBlock_Rounds"); | |
122 | + &mov ("eax",&wparam(0)); # load grandRounds | |
123 | + &mov ($idx,&wparam(1)); # load plaintext pointer | |
124 | + &mov ($key,&wparam(2)); # load key schedule pointer | |
125 | + | |
126 | + &mov ("ebx","esp"); | |
127 | + &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra | |
128 | + &and ("esp",-64); | |
129 | + | |
130 | + # place stack frame just "above mod 1024" the key schedule | |
131 | + # this ensures that cache associativity of 2 suffices | |
132 | + &lea ("ecx",&DWP(-64-63,$key)); | |
133 | + &sub ("ecx","esp"); | |
134 | + &neg ("ecx"); | |
135 | + &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line | |
136 | + &sub ("esp","ecx"); | |
137 | + &add ("esp",4); # 4 is reserved for callee's return address | |
138 | + | |
139 | + &shl ("eax",6); | |
140 | + &lea ("eax",&DWP(0,$key,"eax")); | |
141 | + &mov ($_esp,"ebx"); # save the original %esp (copied into ebx before the frame was carved out) | |
142 | + &mov ($_end,"eax"); # save keyEnd = keyTable+grandRounds*64 | |
143 | + | |
144 | + &call (&label("pic_point")); | |
145 | + &set_label("pic_point"); | |
146 | + &blindpop($Tbl); | |
147 | + &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); | |
148 | + | |
149 | + &mov (@T[0],&DWP(0,$idx)); # load plaintext | |
150 | + &mov (@T[1],&DWP(4,$idx)); | |
151 | + &mov (@T[2],&DWP(8,$idx)); | |
152 | + &bswap (@T[0]); | |
153 | + &mov (@T[3],&DWP(12,$idx)); | |
154 | + &bswap (@T[1]); | |
155 | + &bswap (@T[2]); | |
156 | + &bswap (@T[3]); | |
157 | + | |
158 | + &call ("_x86_Camellia_encrypt"); | |
159 | + | |
160 | + &mov ("esp",$_esp); | |
161 | + &bswap (@T[0]); | |
162 | + &mov ($idx,&wparam(3)); # load ciphertext pointer | |
163 | + &bswap (@T[1]); | |
164 | + &bswap (@T[2]); | |
165 | + &bswap (@T[3]); | |
166 | + &mov (&DWP(0,$idx),@T[0]); # write ciphertext | |
167 | + &mov (&DWP(4,$idx),@T[1]); | |
168 | + &mov (&DWP(8,$idx),@T[2]); | |
169 | + &mov (&DWP(12,$idx),@T[3]); | |
170 | +&function_end("Camellia_EncryptBlock_Rounds"); | |
171 | +# V1.x API: wparam(0) carries keyBitLength; it is overwritten in place with a grandRounds count before jumping to Camellia_EncryptBlock_Rounds | |
172 | +&function_begin_B("Camellia_EncryptBlock"); | |
173 | + &mov ("eax",128); | |
174 | + &sub ("eax",&wparam(0)); # 128-keyBitLength; the borrow leaves CF set when keyBitLength>128 | |
175 | + &mov ("eax",3); | |
176 | + &adc ("eax",0); # eax=3+CF, i.e. keyBitLength==128?3:4 | |
177 | + &mov (&wparam(0),"eax"); | |
178 | + &jmp (&label("Camellia_EncryptBlock_Rounds")); | |
179 | +&function_end_B("Camellia_EncryptBlock"); | |
180 | + | |
181 | +if ($OPENSSL) { | |
182 | +# void Camellia_encrypt( | |
183 | +# const unsigned char *in, /* 16 bytes, read as four byte-swapped 32-bit words */ | |
184 | +# unsigned char *out, /* 16 bytes, written as four byte-swapped 32-bit words */ | |
185 | +# const CAMELLIA_KEY *key) /* grandRounds is read from byte offset 272 of this structure */ | |
186 | +&function_begin("Camellia_encrypt"); | |
187 | + &mov ($idx,&wparam(0)); # load plaintext pointer | |
188 | + &mov ($key,&wparam(2)); # load key schedule pointer | |
189 | + | |
190 | + &mov ("ebx","esp"); | |
191 | + &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra | |
192 | + &and ("esp",-64); | |
193 | + &mov ("eax",&DWP(272,$key)); # load grandRounds counter | |
194 | + | |
195 | + # place stack frame just "above mod 1024" the key schedule | |
196 | + # this ensures that cache associativity of 2 suffices | |
197 | + &lea ("ecx",&DWP(-64-63,$key)); | |
198 | + &sub ("ecx","esp"); | |
199 | + &neg ("ecx"); | |
200 | + &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line | |
201 | + &sub ("esp","ecx"); | |
202 | + &add ("esp",4); # 4 is reserved for callee's return address | |
203 | + | |
204 | + &shl ("eax",6); | |
205 | + &lea ("eax",&DWP(0,$key,"eax")); | |
206 | + &mov ($_esp,"ebx"); # save the original %esp (copied into ebx before the frame was carved out) | |
207 | + &mov ($_end,"eax"); # save keyEnd = key+grandRounds*64 | |
208 | + | |
209 | + &call (&label("pic_point")); | |
210 | + &set_label("pic_point"); | |
211 | + &blindpop($Tbl); | |
212 | + &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); | |
213 | + | |
214 | + &mov (@T[0],&DWP(0,$idx)); # load plaintext | |
215 | + &mov (@T[1],&DWP(4,$idx)); | |
216 | + &mov (@T[2],&DWP(8,$idx)); | |
217 | + &bswap (@T[0]); | |
218 | + &mov (@T[3],&DWP(12,$idx)); | |
219 | + &bswap (@T[1]); | |
220 | + &bswap (@T[2]); | |
221 | + &bswap (@T[3]); | |
222 | + | |
223 | + &call ("_x86_Camellia_encrypt"); | |
224 | + | |
225 | + &mov ("esp",$_esp); | |
226 | + &bswap (@T[0]); | |
227 | + &mov ($idx,&wparam(1)); # load ciphertext pointer | |
228 | + &bswap (@T[1]); | |
229 | + &bswap (@T[2]); | |
230 | + &bswap (@T[3]); | |
231 | + &mov (&DWP(0,$idx),@T[0]); # write ciphertext | |
232 | + &mov (&DWP(4,$idx),@T[1]); | |
233 | + &mov (&DWP(8,$idx),@T[2]); | |
234 | + &mov (&DWP(12,$idx),@T[3]); | |
235 | +&function_end("Camellia_encrypt"); | |
236 | +} | |
237 | + | |
238 | +&function_begin_B("_x86_Camellia_encrypt"); | |
239 | + &xor (@T[0],&DWP(0,$key)); # ^=key[0-3]; the block arrives in @T, with $key, $Tbl and the stack frame set up by the caller | |
240 | + &xor (@T[1],&DWP(4,$key)); | |
241 | + &xor (@T[2],&DWP(8,$key)); | |
242 | + &xor (@T[3],&DWP(12,$key)); | |
243 | + &mov ($idx,&DWP(16,$key)); # prefetch key[4] | |
244 | + | |
245 | + &mov ($__s0,@T[0]); # save s[0-3] | |
246 | + &mov ($__s1,@T[1]); | |
247 | + &mov ($__s2,@T[2]); | |
248 | + &mov ($__s3,@T[3]); | |
249 | + | |
250 | +&set_label("loop",16); | |
251 | + for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); } | |
252 | + | |
253 | + &add ($key,16*4); | |
254 | + &cmp ($key,$__end); | |
255 | + &je (&label("done")); | |
256 | + | |
257 | + # @T[0-1] are preloaded, $idx is preloaded with key[0] (relative to the just-advanced $key) | |
258 | + &and ($idx,@T[0]); | |
259 | + &mov (@T[3],$__s3); | |
260 | + &rotl ($idx,1); | |
261 | + &mov (@T[2],@T[3]); | |
262 | + &xor (@T[1],$idx); | |
263 | + &or (@T[2],&DWP(12,$key)); | |
264 | + &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[0],1); | |
265 | + &xor (@T[2],$__s2); | |
266 | + | |
267 | + &mov ($idx,&DWP(4,$key)); | |
268 | + &mov ($__s2,@T[2]); # s2^=s3|key[3]; | |
269 | + &or ($idx,@T[1]); | |
270 | + &and (@T[2],&DWP(8,$key)); | |
271 | + &xor (@T[0],$idx); | |
272 | + &rotl (@T[2],1); | |
273 | + &mov ($__s0,@T[0]); # s0^=s1|key[1]; | |
274 | + &xor (@T[3],@T[2]); | |
275 | + &mov ($idx,&DWP(16,$key)); # prefetch key[4] | |
276 | + &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[2],1); | |
277 | + &jmp (&label("loop")); | |
278 | + | |
279 | +&set_label("done",8); | |
280 | + &mov (@T[2],@T[0]); # SwapHalf | |
281 | + &mov (@T[3],@T[1]); | |
282 | + &mov (@T[0],$__s2); | |
283 | + &mov (@T[1],$__s3); | |
284 | + &xor (@T[0],$idx); # $idx is preloaded with key[0] | |
285 | + &xor (@T[1],&DWP(4,$key)); | |
286 | + &xor (@T[2],&DWP(8,$key)); | |
287 | + &xor (@T[3],&DWP(12,$key)); | |
288 | + &ret (); | |
289 | +&function_end_B("_x86_Camellia_encrypt"); | |
290 | + | |
291 | +# void Camellia_DecryptBlock_Rounds( | |
292 | +# int grandRounds, /* decryption starts at keyTable+grandRounds*64 and walks back to keyTable */ | |
293 | +# const Byte ciphertext[], /* 16 bytes, read as four byte-swapped 32-bit words */ | |
294 | +# const KEY_TABLE_TYPE keyTable, | |
295 | +# Byte plaintext[]) /* 16 bytes, written as four byte-swapped 32-bit words */ | |
296 | +&function_begin("Camellia_DecryptBlock_Rounds"); | |
297 | + &mov ("eax",&wparam(0)); # load grandRounds | |
298 | + &mov ($idx,&wparam(1)); # load ciphertext pointer | |
299 | + &mov ($key,&wparam(2)); # load key schedule pointer | |
300 | + | |
301 | + &mov ("ebx","esp"); | |
302 | + &sub ("esp",7*4); # place for s[0-3],keyStart,esp and ra | |
303 | + &and ("esp",-64); | |
304 | + | |
305 | + # place stack frame just "above mod 1024" the key schedule | |
306 | + # this ensures that cache associativity of 2 suffices | |
307 | + &lea ("ecx",&DWP(-64-63,$key)); | |
308 | + &sub ("ecx","esp"); | |
309 | + &neg ("ecx"); | |
310 | + &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line | |
311 | + &sub ("esp","ecx"); | |
312 | + &add ("esp",4); # 4 is reserved for callee's return address | |
313 | + | |
314 | + &shl ("eax",6); | |
315 | + &mov (&DWP(4*4,"esp"),$key); # save keyStart (same slot the encrypt path uses for keyEnd) | |
316 | + &lea ($key,&DWP(0,$key,"eax")); | |
317 | + &mov (&DWP(5*4,"esp"),"ebx");# save the original %esp | |
318 | + | |
319 | + &call (&label("pic_point")); | |
320 | + &set_label("pic_point"); | |
321 | + &blindpop($Tbl); | |
322 | + &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); | |
323 | + | |
324 | + &mov (@T[0],&DWP(0,$idx)); # load ciphertext | |
325 | + &mov (@T[1],&DWP(4,$idx)); | |
326 | + &mov (@T[2],&DWP(8,$idx)); | |
327 | + &bswap (@T[0]); | |
328 | + &mov (@T[3],&DWP(12,$idx)); | |
329 | + &bswap (@T[1]); | |
330 | + &bswap (@T[2]); | |
331 | + &bswap (@T[3]); | |
332 | + | |
333 | + &call ("_x86_Camellia_decrypt"); | |
334 | + | |
335 | + &mov ("esp",&DWP(5*4,"esp")); | |
336 | + &bswap (@T[0]); | |
337 | + &mov ($idx,&wparam(3)); # load plaintext pointer | |
338 | + &bswap (@T[1]); | |
339 | + &bswap (@T[2]); | |
340 | + &bswap (@T[3]); | |
341 | + &mov (&DWP(0,$idx),@T[0]); # write plaintext | |
342 | + &mov (&DWP(4,$idx),@T[1]); | |
343 | + &mov (&DWP(8,$idx),@T[2]); | |
344 | + &mov (&DWP(12,$idx),@T[3]); | |
345 | +&function_end("Camellia_DecryptBlock_Rounds"); | |
346 | +# V1.x API: wparam(0) carries keyBitLength; it is overwritten in place with a grandRounds count before jumping to Camellia_DecryptBlock_Rounds | |
347 | +&function_begin_B("Camellia_DecryptBlock"); | |
348 | + &mov ("eax",128); | |
349 | + &sub ("eax",&wparam(0)); # 128-keyBitLength; the borrow leaves CF set when keyBitLength>128 | |
350 | + &mov ("eax",3); | |
351 | + &adc ("eax",0); # eax=3+CF, i.e. keyBitLength==128?3:4 | |
352 | + &mov (&wparam(0),"eax"); | |
353 | + &jmp (&label("Camellia_DecryptBlock_Rounds")); | |
354 | +&function_end_B("Camellia_DecryptBlock"); | |
355 | + | |
356 | +if ($OPENSSL) { | |
357 | +# void Camellia_decrypt( | |
358 | +# const unsigned char *in, /* 16 bytes, read as four byte-swapped 32-bit words */ | |
359 | +# unsigned char *out, /* 16 bytes, written as four byte-swapped 32-bit words */ | |
360 | +# const CAMELLIA_KEY *key) /* grandRounds is read from byte offset 272 of this structure */ | |
361 | +&function_begin("Camellia_decrypt"); | |
362 | + &mov ($idx,&wparam(0)); # load ciphertext pointer | |
363 | + &mov ($key,&wparam(2)); # load key schedule pointer | |
364 | + | |
365 | + &mov ("ebx","esp"); | |
366 | + &sub ("esp",7*4); # place for s[0-3],keyStart,esp and ra | |
367 | + &and ("esp",-64); | |
368 | + &mov ("eax",&DWP(272,$key)); # load grandRounds counter | |
369 | + | |
370 | + # place stack frame just "above mod 1024" the key schedule | |
371 | + # this ensures that cache associativity of 2 suffices | |
372 | + &lea ("ecx",&DWP(-64-63,$key)); | |
373 | + &sub ("ecx","esp"); | |
374 | + &neg ("ecx"); | |
375 | + &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line | |
376 | + &sub ("esp","ecx"); | |
377 | + &add ("esp",4); # 4 is reserved for callee's return address | |
378 | + | |
379 | + &shl ("eax",6); | |
380 | + &mov (&DWP(4*4,"esp"),$key); # save keyStart (same slot the encrypt path uses for keyEnd) | |
381 | + &lea ($key,&DWP(0,$key,"eax")); | |
382 | + &mov (&DWP(5*4,"esp"),"ebx");# save the original %esp | |
383 | + | |
384 | + &call (&label("pic_point")); | |
385 | + &set_label("pic_point"); | |
386 | + &blindpop($Tbl); | |
387 | + &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); | |
388 | + | |
389 | + &mov (@T[0],&DWP(0,$idx)); # load ciphertext | |
390 | + &mov (@T[1],&DWP(4,$idx)); | |
391 | + &mov (@T[2],&DWP(8,$idx)); | |
392 | + &bswap (@T[0]); | |
393 | + &mov (@T[3],&DWP(12,$idx)); | |
394 | + &bswap (@T[1]); | |
395 | + &bswap (@T[2]); | |
396 | + &bswap (@T[3]); | |
397 | + | |
398 | + &call ("_x86_Camellia_decrypt"); | |
399 | + | |
400 | + &mov ("esp",&DWP(5*4,"esp")); | |
401 | + &bswap (@T[0]); | |
402 | + &mov ($idx,&wparam(1)); # load plaintext pointer | |
403 | + &bswap (@T[1]); | |
404 | + &bswap (@T[2]); | |
405 | + &bswap (@T[3]); | |
406 | + &mov (&DWP(0,$idx),@T[0]); # write plaintext | |
407 | + &mov (&DWP(4,$idx),@T[1]); | |
408 | + &mov (&DWP(8,$idx),@T[2]); | |
409 | + &mov (&DWP(12,$idx),@T[3]); | |
410 | +&function_end("Camellia_decrypt"); | |
411 | +} | |
412 | + | |
413 | +&function_begin_B("_x86_Camellia_decrypt"); | |
414 | + &xor (@T[0],&DWP(0,$key)); # ^=key[0-3]; the block arrives in @T, with $key at the far end of the schedule and $Tbl/frame set up by the caller | |
415 | + &xor (@T[1],&DWP(4,$key)); | |
416 | + &xor (@T[2],&DWP(8,$key)); | |
417 | + &xor (@T[3],&DWP(12,$key)); | |
418 | + &mov ($idx,&DWP(-8,$key)); # prefetch key[-2] | |
419 | + | |
420 | + &mov ($__s0,@T[0]); # save s[0-3] | |
421 | + &mov ($__s1,@T[1]); | |
422 | + &mov ($__s2,@T[2]); | |
423 | + &mov ($__s3,@T[3]); | |
424 | + | |
425 | +&set_label("loop",16); | |
426 | + for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); } | |
427 | + | |
428 | + &sub ($key,16*4); | |
429 | + &cmp ($key,$__end); | |
430 | + &je (&label("done")); | |
431 | + | |
432 | + # @T[0-1] are preloaded, $idx is preloaded with key[2] (relative to the just-decremented $key) | |
433 | + &and ($idx,@T[0]); | |
434 | + &mov (@T[3],$__s3); | |
435 | + &rotl ($idx,1); | |
436 | + &mov (@T[2],@T[3]); | |
437 | + &xor (@T[1],$idx); | |
438 | + &or (@T[2],&DWP(4,$key)); | |
439 | + &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[2],1); | |
440 | + &xor (@T[2],$__s2); | |
441 | + | |
442 | + &mov ($idx,&DWP(12,$key)); | |
443 | + &mov ($__s2,@T[2]); # s2^=s3|key[1]; | |
444 | + &or ($idx,@T[1]); | |
445 | + &and (@T[2],&DWP(0,$key)); | |
446 | + &xor (@T[0],$idx); | |
447 | + &rotl (@T[2],1); | |
448 | + &mov ($__s0,@T[0]); # s0^=s1|key[3]; | |
449 | + &xor (@T[3],@T[2]); | |
450 | + &mov ($idx,&DWP(-8,$key)); # prefetch key[-2] | |
451 | + &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[0],1); | |
452 | + &jmp (&label("loop")); | |
453 | + | |
454 | +&set_label("done",8); | |
455 | + &mov (@T[2],@T[0]); # SwapHalf | |
456 | + &mov (@T[3],@T[1]); | |
457 | + &mov (@T[0],$__s2); | |
458 | + &mov (@T[1],$__s3); | |
459 | + &xor (@T[2],$idx); # $idx is preloaded with key[2] | |
460 | + &xor (@T[3],&DWP(12,$key)); | |
461 | + &xor (@T[0],&DWP(0,$key)); | |
462 | + &xor (@T[1],&DWP(4,$key)); | |
463 | + &ret (); | |
464 | +&function_end_B("_x86_Camellia_decrypt"); | |
465 | + | |
466 | +# shld is very slow on Intel P4 family. Even on AMD it limits | |
467 | +# instruction decode rate [because it's VectorPath] and consequently | |
468 | +# performance. PIII, PM and Core[2] seem to be the only ones which | |
469 | +# execute this code ~7% faster... | |
470 | +sub __rotl128 { # shld-based 128-bit left rotate followed by selective stores into the key schedule | |
471 | + my ($w0,$w1,$w2,$w3,$bits,$slot,@out)=@_; # four word registers (most significant first), rotate count, slot, registers to store | |
472 | + my @w=($w0,$w1,$w2,$w3); | |
473 | + $slot *= 2; | |
474 | + if ($bits) { | |
475 | + &mov ($idx,$w[0]); # keep the old top word for the wrap-around | |
476 | + foreach my $n (0..2) { &shld ($w[$n],$w[$n+1],$bits); } # each word pulls bits in from its lower neighbour | |
477 | + &shld ($w[3],$idx,$bits); | |
478 | + } | |
479 | + | |
480 | + # a word is stored only while it matches the head of the @out list | |
481 | + foreach my $reg (@w) { | |
482 | + &mov (&DWP(-128+4*$slot++,$key),shift(@out)) if ($reg eq $out[0]); | |
483 | + } | |
484 | + | |
485 | +} | |
486 | + | |
487 | +# ... Implementing 128-bit rotate without shld gives >3x performance | |
488 | +# improvement on P4, only ~7% degradation on other Intel CPUs and | |
489 | +# not worse performance on AMD. This is therefore preferred. Arguments: four word registers, rotate count, slot index (doubled, then *4 bytes at -128 from $key), then the registers to store; $Tbl and $idx are used as scratch when the count is non-zero. | |
490 | +sub _rotl128 { | |
491 | + my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_; | |
492 | + | |
493 | + $rnd *= 2; | |
494 | + if ($rot) { | |
495 | + &mov ($Tbl,$i0); | |
496 | + &shl ($i0,$rot); | |
497 | + &mov ($idx,$i1); | |
498 | + &shr ($idx,32-$rot); | |
499 | + &shl ($i1,$rot); | |
500 | + &or ($i0,$idx); | |
501 | + &mov ($idx,$i2); | |
502 | + &shl ($i2,$rot); | |
503 | + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]); | |
504 | + &shr ($idx,32-$rot); | |
505 | + &or ($i1,$idx); | |
506 | + &shr ($Tbl,32-$rot); | |
507 | + &mov ($idx,$i3); | |
508 | + &shr ($idx,32-$rot); | |
509 | + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]); | |
510 | + &shl ($i3,$rot); | |
511 | + &or ($i2,$idx); | |
512 | + &or ($i3,$Tbl); | |
513 | + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]); | |
514 | + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]); | |
515 | + } else { | |
516 | + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]); | |
517 | + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]); | |
518 | + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]); | |
519 | + &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]); | |
520 | + } | |
521 | +} | |
522 | + | |
523 | +sub _saveround { # emit stores of up to four registers into 8-byte slot $round at $base | |
524 | +my ($round,$base,@regs)=@_; | |
525 | +my $disp=0; $disp=shift(@regs) if (int($regs[0])); # optional leading numeric bias | |
526 | +my $off=$disp+$round*8; # byte offset of the first word | |
527 | + &mov (&DWP($off+0,$base),$regs[0]); # the first word is always stored | |
528 | + foreach my $n (1..3) { # remaining words only when supplied | |
529 | + &mov (&DWP($off+4*$n,$base),$regs[$n]) if ($#regs>=$n); | |
530 | + } | |
531 | +} | |
532 | + | |
533 | +sub _loadround { # emit loads of up to four registers from 8-byte slot $round at $base | |
534 | +my ($round,$base,@regs)=@_; | |
535 | +my $disp=0; $disp=shift(@regs) if (int($regs[0])); # optional leading numeric bias | |
536 | +my $off=$disp+$round*8; # byte offset of the first word | |
537 | + &mov ($regs[0],&DWP($off+0,$base)); # the first word is always loaded | |
538 | + foreach my $n (1..3) { # remaining words only when supplied | |
539 | + &mov ($regs[$n],&DWP($off+4*$n,$base)) if ($#regs>=$n); | |
540 | + } | |
541 | +} | |
542 | + | |
543 | +# void Camellia_Ekeygen( | |
544 | +# const int keyBitLength, /* 128 and 192 are special-cased, anything else takes the 256-bit path */ | |
545 | +# const Byte *rawKey, | |
546 | +# KEY_TABLE_TYPE keyTable) /* leaves grandRounds (3 or 4) in eax and keyTable+272 in edx */ | |
547 | +&function_begin("Camellia_Ekeygen"); | |
548 | +{ my $step=0; | |
549 | + | |
550 | + &stack_push(4); # place for s[0-3] | |
551 | + | |
552 | + &mov ($Tbl,&wparam(0)); # load arguments | |
553 | + &mov ($idx,&wparam(1)); | |
554 | + &mov ($key,&wparam(2)); | |
555 | + | |
556 | + &mov (@T[0],&DWP(0,$idx)); # load 0-127 bits | |
557 | + &mov (@T[1],&DWP(4,$idx)); | |
558 | + &mov (@T[2],&DWP(8,$idx)); | |
559 | + &mov (@T[3],&DWP(12,$idx)); | |
560 | + | |
561 | + &bswap (@T[0]); | |
562 | + &bswap (@T[1]); | |
563 | + &bswap (@T[2]); | |
564 | + &bswap (@T[3]); | |
565 | + | |
566 | + &_saveround (0,$key,@T); # KL<<<0 | |
567 | + | |
568 | + &cmp ($Tbl,128); | |
569 | + &je (&label("1st128")); | |
570 | + | |
571 | + &mov (@T[0],&DWP(16,$idx)); # load 128-191 bits | |
572 | + &mov (@T[1],&DWP(20,$idx)); | |
573 | + &cmp ($Tbl,192); | |
574 | + &je (&label("1st192")); | |
575 | + &mov (@T[2],&DWP(24,$idx)); # load 192-255 bits | |
576 | + &mov (@T[3],&DWP(28,$idx)); | |
577 | + &jmp (&label("1st256")); | |
578 | +&set_label("1st192",4); | |
579 | + &mov (@T[2],@T[0]); # 192-bit key: the second 64 bits of KR are | |
580 | + &mov (@T[3],@T[1]); # the bitwise complement of the first 64 | |
581 | + &not (@T[2]); | |
582 | + &not (@T[3]); | |
583 | +&set_label("1st256",4); | |
584 | + &bswap (@T[0]); | |
585 | + &bswap (@T[1]); | |
586 | + &bswap (@T[2]); | |
587 | + &bswap (@T[3]); | |
588 | + | |
589 | + &_saveround (4,$key,@T); # temporary storage for KR! | |
590 | + | |
591 | + &xor (@T[0],&DWP(0*8+0,$key)); # KR^KL | |
592 | + &xor (@T[1],&DWP(0*8+4,$key)); | |
593 | + &xor (@T[2],&DWP(1*8+0,$key)); | |
594 | + &xor (@T[3],&DWP(1*8+4,$key)); | |
595 | + | |
596 | +&set_label("1st128",4); | |
597 | + &call (&label("pic_point")); | |
598 | + &set_label("pic_point"); | |
599 | + &blindpop($Tbl); | |
600 | + &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); | |
601 | + &lea ($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl)); # $key now walks SIGMA, which serves as the round-key array for Camellia_Feistel | |
602 | + | |
603 | + &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[0] | |
604 | + &mov (&swtmp(0),@T[0]); # save s[0-3] | |
605 | + &mov (&swtmp(1),@T[1]); | |
606 | + &mov (&swtmp(2),@T[2]); | |
607 | + &mov (&swtmp(3),@T[3]); | |
608 | + &Camellia_Feistel($step++); | |
609 | + &Camellia_Feistel($step++); | |
610 | + &mov (@T[2],&swtmp(2)); | |
611 | + &mov (@T[3],&swtmp(3)); | |
612 | + | |
613 | + &mov ($idx,&wparam(2)); | |
614 | + &xor (@T[0],&DWP(0*8+0,$idx)); # ^KL | |
615 | + &xor (@T[1],&DWP(0*8+4,$idx)); | |
616 | + &xor (@T[2],&DWP(1*8+0,$idx)); | |
617 | + &xor (@T[3],&DWP(1*8+4,$idx)); | |
618 | + | |
619 | + &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[4] | |
620 | + &mov (&swtmp(0),@T[0]); # save s[0-3] | |
621 | + &mov (&swtmp(1),@T[1]); | |
622 | + &mov (&swtmp(2),@T[2]); | |
623 | + &mov (&swtmp(3),@T[3]); | |
624 | + &Camellia_Feistel($step++); | |
625 | + &Camellia_Feistel($step++); | |
626 | + &mov (@T[2],&swtmp(2)); | |
627 | + &mov (@T[3],&swtmp(3)); | |
628 | + | |
629 | + &mov ($idx,&wparam(0)); | |
630 | + &cmp ($idx,128); | |
631 | + &jne (&label("2nd256")); | |
632 | + | |
633 | + &mov ($key,&wparam(2)); | |
634 | + &lea ($key,&DWP(128,$key)); # size optimization | |
635 | + | |
636 | + ####### process KA | |
637 | + &_saveround (2,$key,-128,@T); # KA<<<0 | |
638 | + &_rotl128 (@T,15,6,@T); # KA<<<15 | |
639 | + &_rotl128 (@T,15,8,@T); # KA<<<(15+15=30) | |
640 | + &_rotl128 (@T,15,12,@T[0],@T[1]); # KA<<<(30+15=45) | |
641 | + &_rotl128 (@T,15,14,@T); # KA<<<(45+15=60) | |
642 | + push (@T,shift(@T)); # rotl128(@T,32); | |
643 | + &_rotl128 (@T,2,20,@T); # KA<<<(60+32+2=94) | |
644 | + &_rotl128 (@T,17,24,@T); # KA<<<(94+17=111) | |
645 | + | |
646 | + ####### process KL | |
647 | + &_loadround (0,$key,-128,@T); # load KL | |
648 | + &_rotl128 (@T,15,4,@T); # KL<<<15 | |
649 | + &_rotl128 (@T,30,10,@T); # KL<<<(15+30=45) | |
650 | + &_rotl128 (@T,15,13,@T[2],@T[3]); # KL<<<(45+15=60) | |
651 | + &_rotl128 (@T,17,16,@T); # KL<<<(60+17=77) | |
652 | + &_rotl128 (@T,17,18,@T); # KL<<<(77+17=94) | |
653 | + &_rotl128 (@T,17,22,@T); # KL<<<(94+17=111) | |
654 | + | |
655 | + while (@T[0] ne "eax") # restore order | |
656 | + { unshift (@T,pop(@T)); } | |
657 | + | |
658 | + &mov ("eax",3); # 3 grandRounds | |
659 | + &jmp (&label("done")); | |
660 | + | |
661 | +&set_label("2nd256",16); | |
662 | + &mov ($idx,&wparam(2)); | |
663 | + &_saveround (6,$idx,@T); # temporary storage for KA! | |
664 | + | |
665 | + &xor (@T[0],&DWP(4*8+0,$idx)); # KA^KR | |
666 | + &xor (@T[1],&DWP(4*8+4,$idx)); | |
667 | + &xor (@T[2],&DWP(5*8+0,$idx)); | |
668 | + &xor (@T[3],&DWP(5*8+4,$idx)); | |
669 | + | |
670 | + &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[8] | |
671 | + &mov (&swtmp(0),@T[0]); # save s[0-3] | |
672 | + &mov (&swtmp(1),@T[1]); | |
673 | + &mov (&swtmp(2),@T[2]); | |
674 | + &mov (&swtmp(3),@T[3]); | |
675 | + &Camellia_Feistel($step++); | |
676 | + &Camellia_Feistel($step++); | |
677 | + &mov (@T[2],&swtmp(2)); | |
678 | + &mov (@T[3],&swtmp(3)); | |
679 | + | |
680 | + &mov ($key,&wparam(2)); | |
681 | + &lea ($key,&DWP(128,$key)); # size optimization | |
682 | + | |
683 | + ####### process KB | |
684 | + &_saveround (2,$key,-128,@T); # KB<<<0 | |
685 | + &_rotl128 (@T,30,10,@T); # KB<<<30 | |
686 | + &_rotl128 (@T,30,20,@T); # KB<<<(30+30=60) | |
687 | + push (@T,shift(@T)); # rotl128(@T,32); | |
688 | + &_rotl128 (@T,19,32,@T); # KB<<<(60+32+19=111) | |
689 | + | |
690 | + ####### process KR | |
691 | + &_loadround (4,$key,-128,@T); # load KR | |
692 | + &_rotl128 (@T,15,4,@T); # KR<<<15 | |
693 | + &_rotl128 (@T,15,8,@T); # KR<<<(15+15=30) | |
694 | + &_rotl128 (@T,30,18,@T); # KR<<<(30+30=60) | |
695 | + push (@T,shift(@T)); # rotl128(@T,32); | |
696 | + &_rotl128 (@T,2,26,@T); # KR<<<(60+32+2=94) | |
697 | + | |
698 | + ####### process KA | |
699 | + &_loadround (6,$key,-128,@T); # load KA | |
700 | + &_rotl128 (@T,15,6,@T); # KA<<<15 | |
701 | + &_rotl128 (@T,30,14,@T); # KA<<<(15+30=45) | |
702 | + push (@T,shift(@T)); # rotl128(@T,32); | |
703 | + &_rotl128 (@T,0,24,@T); # KA<<<(45+32+0=77) | |
704 | + &_rotl128 (@T,17,28,@T); # KA<<<(77+17=94) | |
705 | + | |
706 | + ####### process KL | |
707 | + &_loadround (0,$key,-128,@T); # load KL | |
708 | + push (@T,shift(@T)); # rotl128(@T,32); | |
709 | + &_rotl128 (@T,13,12,@T); # KL<<<(32+13=45) | |
710 | + &_rotl128 (@T,15,16,@T); # KL<<<(45+15=60) | |
711 | + &_rotl128 (@T,17,22,@T); # KL<<<(60+17=77) | |
712 | + push (@T,shift(@T)); # rotl128(@T,32); | |
713 | + &_rotl128 (@T,2,30,@T); # KL<<<(77+32+2=111) | |
714 | + | |
715 | + while (@T[0] ne "eax") # restore order | |
716 | + { unshift (@T,pop(@T)); } | |
717 | + | |
718 | + &mov ("eax",4); # 4 grandRounds | |
719 | +&set_label("done"); | |
720 | + &lea ("edx",&DWP(272-128,$key)); # end of key schedule, i.e. keyTable+272 since $key is biased by 128 | |
721 | + &stack_pop(4); | |
722 | +} | |
723 | +&function_end("Camellia_Ekeygen"); | |
724 | + | |
725 | +if ($OPENSSL) { | |
726 | +# int Camellia_set_key ( | |
727 | +# const unsigned char *userKey, /* returns -1 when NULL */ | |
728 | +# int bits, /* must be 128, 192 or 256, otherwise returns -2 */ | |
729 | +# CAMELLIA_KEY *key) /* returns -1 when NULL; 0 is returned on success */ | |
730 | +&function_begin_B("Camellia_set_key"); | |
731 | + &push ("ebx"); | |
732 | + &mov ("ecx",&wparam(0)); # pull arguments | |
733 | + &mov ("ebx",&wparam(1)); | |
734 | + &mov ("edx",&wparam(2)); | |
735 | + | |
736 | + &mov ("eax",-1); | |
737 | + &test ("ecx","ecx"); | |
738 | + &jz (&label("done")); # userKey==NULL? | |
739 | + &test ("edx","edx"); | |
740 | + &jz (&label("done")); # key==NULL? | |
741 | + | |
742 | + &mov ("eax",-2); | |
743 | + &cmp ("ebx",256); | |
744 | + &je (&label("arg_ok")); # bits==256? | |
745 | + &cmp ("ebx",192); | |
746 | + &je (&label("arg_ok")); # bits==192? | |
747 | + &cmp ("ebx",128); | |
748 | + &jne (&label("done")); # bits!=128? | |
749 | +&set_label("arg_ok",4); | |
750 | + | |
751 | + &push ("edx"); # push arguments in Camellia_Ekeygen order: (bits, userKey, key) | |
752 | + &push ("ecx"); | |
753 | + &push ("ebx"); | |
754 | + &call ("Camellia_Ekeygen"); | |
755 | + &stack_pop(3); | |
756 | + | |
757 | + # eax holds grandRounds and edx points at where to put it (Camellia_Ekeygen leaves keyTable+272 in edx) | |
758 | + &mov (&DWP(0,"edx"),"eax"); | |
759 | + &xor ("eax","eax"); | |
760 | +&set_label("done",4); | |
761 | + &pop ("ebx"); | |
762 | + &ret (); | |
763 | +&function_end_B("Camellia_set_key"); | |
764 | +} | |
765 | + | |
766 | +@SBOX=( | |
767 | +112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65, | |
768 | + 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189, | |
769 | +134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26, | |
770 | +166,225, 57,202,213, 71, 93, 61,217, 1, 90,214, 81, 86,108, 77, | |
771 | +139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153, | |
772 | +223, 76,203,194, 52,126,118, 5,109,183,169, 49,209, 23, 4,215, | |
773 | + 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34, | |
774 | +254, 68,207,178,195,181,122,145, 36, 8,232,168, 96,252,105, 80, | |
775 | +170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210, | |
776 | + 16,196, 0, 72,163,247,117,219,138, 3,230,218, 9, 63,221,148, | |
777 | +135, 92,131, 2,205, 74,144, 51,115,103,246,243,157,127,191,226, | |
778 | + 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46, | |
779 | +233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89, | |
780 | +120,152, 6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250, | |
781 | +114, 7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164, | |
782 | + 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158); | |
783 | + | |
784 | +sub S1110 { my $s=$SBOX[$_[0]]; return ($s<<24)|($s<<16)|($s<<8); } # SBOX value replicated into the three upper bytes | |
785 | +sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; return $i<<24|$i<<16|$i; } | |
786 | +sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; return $i<<16|$i<<8|$i; } | |
787 | +sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; return $i<<24|$i<<8|$i; } | |
788 | + | |
789 | +&set_label("Camellia_SIGMA",64); | |
790 | +&data_word( | |
791 | + 0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2, | |
792 | + 0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c, | |
793 | + 0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd, | |
794 | + 0, 0, 0, 0); | |
795 | +&set_label("Camellia_SBOX",64); | |
796 | +# tables are interleaved, remember? | |
797 | +for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); } | |
798 | +for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); } | |
799 | + | |
800 | +# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out, | |
801 | +# size_t length, const CAMELLIA_KEY *key, | |
802 | +# unsigned char *ivp,const int enc); | |
803 | +{ | |
804 | +# stack frame layout (left column: offsets as addressed in this function; right column: apparently the same slots shifted by 4, i.e. once a call has pushed its return address) | |
805 | +# -4(%esp) # return address 0(%esp) | |
806 | +# 0(%esp) # s0 4(%esp) | |
807 | +# 4(%esp) # s1 8(%esp) | |
808 | +# 8(%esp) # s2 12(%esp) | |
809 | +# 12(%esp) # s3 16(%esp) | |
810 | +# 16(%esp) # end of key schedule 20(%esp) | |
811 | +# 20(%esp) # %esp backup | |
812 | +my $_inp=&DWP(24,"esp"); # copy of wparam(0), the input pointer | |
813 | +my $_out=&DWP(28,"esp"); # copy of wparam(1), the output pointer | |
814 | +my $_len=&DWP(32,"esp"); # copy of wparam(2), the remaining length | |
815 | +my $_key=&DWP(36,"esp"); # copy of wparam(3), the key schedule pointer | |
816 | +my $_ivp=&DWP(40,"esp"); # copy of wparam(4), the caller's iv pointer | |
817 | +my $ivec=&DWP(44,"esp"); # ivec[16], a 16-byte scratch block used by the decrypt paths | |
818 | +my $_tmp=&DWP(44,"esp"); # volatile variable, holds the temporary ivp in dec_loop [yes, aliases with ivec] | |
819 | +my ($s0,$s1,$s2,$s3) = @T; | |
820 | + | |
821 | +&function_begin("Camellia_cbc_encrypt"); | |
822 | + &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len [emits an empty operand unless $s2 is ecx] | |
823 | + &cmp ($s2,0); | |
824 | + &je (&label("enc_out")); | |
825 | + | |
826 | + &pushf (); | |
827 | + &cld (); | |
828 | + | |
829 | + &mov ($s0,&wparam(0)); # load inp | |
830 | + &mov ($s1,&wparam(1)); # load out | |
831 | + #&mov ($s2,&wparam(2)); # load len [already loaded at function entry] | |
832 | + &mov ($s3,&wparam(3)); # load key | |
833 | + &mov ($Tbl,&wparam(4)); # load ivp | |
834 | + | |
835 | + # allocate 64-byte aligned stack frame... | |
836 | + &lea ($idx,&DWP(-64,"esp")); | |
837 | + &and ($idx,-64); | |
838 | + | |
839 | + # place stack frame just "above mod 1024" the key schedule | |
840 | + # this ensures that cache associativity of 2 suffices | |
841 | + &lea ($key,&DWP(-64-63,$s3)); | |
842 | + &sub ($key,$idx); | |
843 | + &neg ($key); | |
844 | + &and ($key,0x3C0); # modulo 1024, but aligned to cache-line | |
845 | + &sub ($idx,$key); | |
846 | + | |
847 | + &mov ($key,&wparam(5)); # load enc [last wparam access before %esp is switched] | |
848 | + | |
849 | + &exch ("esp",$idx); | |
850 | + &add ("esp",4); # reserve for return address! | |
851 | + &mov ($_esp,$idx); # save original %esp [held in $idx after the exchange] | |
852 | + | |
853 | + &mov ($_inp,$s0); # save copy of inp | |
854 | + &mov ($_out,$s1); # save copy of out | |
855 | + &mov ($_len,$s2); # save copy of len | |
856 | + &mov ($_key,$s3); # save copy of key | |
857 | + &mov ($_ivp,$Tbl); # save copy of ivp | |
858 | + | |
859 | + &call (&label("pic_point")); # make it PIC: call+pop yields the address of pic_point | |
860 | + &set_label("pic_point"); | |
861 | + &blindpop($Tbl); | |
862 | + &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); | |
863 | + | |
864 | + &mov ($idx,32); | |
865 | + &set_label("prefetch_sbox",4); | |
866 | + &mov ($s0,&DWP(0,$Tbl)); | |
867 | + &mov ($s1,&DWP(32,$Tbl)); | |
868 | + &mov ($s2,&DWP(64,$Tbl)); | |
869 | + &mov ($s3,&DWP(96,$Tbl)); | |
870 | + &lea ($Tbl,&DWP(128,$Tbl)); | |
871 | + &dec ($idx); | |
872 | + &jnz (&label("prefetch_sbox")); | |
873 | + &mov ($s0,$_key); | |
874 | + &sub ($Tbl,4096); | |
875 | + &mov ($idx,$_inp); | |
876 | + &mov ($s3,&DWP(272,$s0)); # load grandRounds [read from byte offset 272 of the key structure] | |
877 | + | |
878 | + &cmp ($key,0); | |
879 | + &je (&label("DECRYPT")); | |
880 | + | |
881 | + &mov ($s2,$_len); | |
882 | + &mov ($key,$_ivp); | |
883 | + &shl ($s3,6); | |
884 | + &lea ($s3,&DWP(0,$s0,$s3)); | |
885 | + &mov ($_end,$s3); | |
886 | + | |
887 | + &test ($s2,0xFFFFFFF0); | |
888 | + &jz (&label("enc_tail")); # short input [fewer than 16 bytes]... | |
889 | + | |
890 | + &mov ($s0,&DWP(0,$key)); # load first two dwords of iv | |
891 | + &mov ($s1,&DWP(4,$key)); | |
892 | + | |
893 | + &set_label("enc_loop",4); | |
894 | + &mov ($s2,&DWP(8,$key)); | |
895 | + &mov ($s3,&DWP(12,$key)); | |
896 | + | |
897 | + &xor ($s0,&DWP(0,$idx)); # xor input data into the chaining value | |
898 | + &xor ($s1,&DWP(4,$idx)); | |
899 | + &xor ($s2,&DWP(8,$idx)); | |
900 | + &bswap ($s0); | |
901 | + &xor ($s3,&DWP(12,$idx)); | |
902 | + &bswap ($s1); | |
903 | + &mov ($key,$_key); # load key | |
904 | + &bswap ($s2); | |
905 | + &bswap ($s3); | |
906 | + | |
907 | + &call ("_x86_Camellia_encrypt"); | |
908 | + | |
909 | + &mov ($idx,$_inp); # load inp | |
910 | + &mov ($key,$_out); # load out | |
911 | + | |
912 | + &bswap ($s0); | |
913 | + &bswap ($s1); | |
914 | + &bswap ($s2); | |
915 | + &mov (&DWP(0,$key),$s0); # save output data | |
916 | + &bswap ($s3); | |
917 | + &mov (&DWP(4,$key),$s1); | |
918 | + &mov (&DWP(8,$key),$s2); | |
919 | + &mov (&DWP(12,$key),$s3); | |
920 | + | |
921 | + &mov ($s2,$_len); # load len | |
922 | + | |
923 | + &lea ($idx,&DWP(16,$idx)); | |
924 | + &mov ($_inp,$idx); # save inp | |
925 | + | |
926 | + &lea ($s3,&DWP(16,$key)); | |
927 | + &mov ($_out,$s3); # save out | |
928 | + | |
929 | + &sub ($s2,16); | |
930 | + &test ($s2,0xFFFFFFF0); | |
931 | + &mov ($_len,$s2); # save len | |
932 | + &jnz (&label("enc_loop")); | |
933 | + &test ($s2,15); | |
934 | + &jnz (&label("enc_tail")); | |
935 | + &mov ($idx,$_ivp); # load ivp | |
936 | + &mov ($s2,&DWP(8,$key)); # reload last two output dwords [$s2/$s3 were reused for len and out above] | |
937 | + &mov ($s3,&DWP(12,$key)); | |
938 | + &mov (&DWP(0,$idx),$s0); # save last output block as ivec | |
939 | + &mov (&DWP(4,$idx),$s1); | |
940 | + &mov (&DWP(8,$idx),$s2); | |
941 | + &mov (&DWP(12,$idx),$s3); | |
942 | + | |
943 | + &mov ("esp",$_esp); | |
944 | + &popf (); | |
945 | + &set_label("enc_out"); | |
946 | + &function_end_A(); | |
947 | + &pushf (); # kludge, never executed [NOTE(review): presumably pairs with the second popf at dec_out for perlasm's stack accounting - confirm] | |
948 | + | |
949 | + &set_label("enc_tail",4); | |
950 | + &mov ($s0,$key eq "edi" ? $key : ""); | |
951 | + &mov ($key,$_out); # load out | |
952 | + &push ($s0); # push ivp [or the previous output block when arriving from enc_loop] | |
953 | + &mov ($s1,16); | |
954 | + &sub ($s1,$s2); | |
955 | + &cmp ($key,$idx); # compare with inp | |
956 | + &je (&label("enc_in_place")); | |
957 | + &align (4); | |
958 | + &data_word(0xA4F3F689); # [mov esi,esi;] rep movsb # copy the remaining input bytes to out | |
959 | + &jmp (&label("enc_skip_in_place")); | |
960 | + &set_label("enc_in_place"); | |
961 | + &lea ($key,&DWP(0,$key,$s2)); | |
962 | + &set_label("enc_skip_in_place"); | |
963 | + &mov ($s2,$s1); | |
964 | + &xor ($s0,$s0); | |
965 | + &align (4); | |
966 | + &data_word(0xAAF3F689); # [mov esi,esi;] rep stosb # zero tail up to 16 bytes | |
967 | + &pop ($key); # pop ivp | |
968 | + | |
969 | + &mov ($idx,$_out); # zero-padded output block serves as input | |
970 | + &mov ($s0,&DWP(0,$key)); | |
971 | + &mov ($s1,&DWP(4,$key)); | |
972 | + &mov ($_len,16); # len=16 | |
973 | + &jmp (&label("enc_loop")); # one more spin... | |
974 | + | |
975 | +#----------------------------- DECRYPT -----------------------------# | |
976 | +&set_label("DECRYPT",16); | |
977 | + &shl ($s3,6); | |
978 | + &lea ($s3,&DWP(0,$s0,$s3)); | |
979 | + &mov ($_end,$s0); | |
980 | + &mov ($_key,$s3); | |
981 | + | |
982 | + &cmp ($idx,$_out); | |
983 | + &je (&label("dec_in_place")); # in-place processing [inp==out]... | |
984 | + | |
985 | + &mov ($key,$_ivp); # load ivp | |
986 | + &mov ($_tmp,$key); | |
987 | + | |
988 | + &set_label("dec_loop",4); | |
989 | + &mov ($s0,&DWP(0,$idx)); # read input | |
990 | + &mov ($s1,&DWP(4,$idx)); | |
991 | + &mov ($s2,&DWP(8,$idx)); | |
992 | + &bswap ($s0); | |
993 | + &mov ($s3,&DWP(12,$idx)); | |
994 | + &bswap ($s1); | |
995 | + &mov ($key,$_key); # load key | |
996 | + &bswap ($s2); | |
997 | + &bswap ($s3); | |
998 | + | |
999 | + &call ("_x86_Camellia_decrypt"); | |
1000 | + | |
1001 | + &mov ($key,$_tmp); # load temp ivp | |
1002 | + &mov ($idx,$_len); # load len | |
1003 | + | |
1004 | + &bswap ($s0); | |
1005 | + &bswap ($s1); | |
1006 | + &bswap ($s2); | |
1007 | + &xor ($s0,&DWP(0,$key)); # xor iv | |
1008 | + &bswap ($s3); | |
1009 | + &xor ($s1,&DWP(4,$key)); | |
1010 | + &xor ($s2,&DWP(8,$key)); | |
1011 | + &xor ($s3,&DWP(12,$key)); | |
1012 | + | |
1013 | + &sub ($idx,16); | |
1014 | + &jc (&label("dec_partial")); | |
1015 | + &mov ($_len,$idx); # save len | |
1016 | + &mov ($idx,$_inp); # load inp | |
1017 | + &mov ($key,$_out); # load out | |
1018 | + | |
1019 | + &mov (&DWP(0,$key),$s0); # write output | |
1020 | + &mov (&DWP(4,$key),$s1); | |
1021 | + &mov (&DWP(8,$key),$s2); | |
1022 | + &mov (&DWP(12,$key),$s3); | |
1023 | + | |
1024 | + &mov ($_tmp,$idx); # save ivp [the input block just processed is the next iv] | |
1025 | + &lea ($idx,&DWP(16,$idx)); | |
1026 | + &mov ($_inp,$idx); # save inp | |
1027 | + | |
1028 | + &lea ($key,&DWP(16,$key)); | |
1029 | + &mov ($_out,$key); # save out | |
1030 | + | |
1031 | + &jnz (&label("dec_loop")); | |
1032 | + &mov ($key,$_tmp); # load temp ivp | |
1033 | + &set_label("dec_end"); | |
1034 | + &mov ($idx,$_ivp); # load user ivp | |
1035 | + &mov ($s0,&DWP(0,$key)); # load iv | |
1036 | + &mov ($s1,&DWP(4,$key)); | |
1037 | + &mov ($s2,&DWP(8,$key)); | |
1038 | + &mov ($s3,&DWP(12,$key)); | |
1039 | + &mov (&DWP(0,$idx),$s0); # copy back to user | |
1040 | + &mov (&DWP(4,$idx),$s1); | |
1041 | + &mov (&DWP(8,$idx),$s2); | |
1042 | + &mov (&DWP(12,$idx),$s3); | |
1043 | + &jmp (&label("dec_out")); | |
1044 | + | |
1045 | + &set_label("dec_partial",4); | |
1046 | + &lea ($key,$ivec); | |
1047 | + &mov (&DWP(0,$key),$s0); # dump the full decrypted block to the stack scratch area | |
1048 | + &mov (&DWP(4,$key),$s1); | |
1049 | + &mov (&DWP(8,$key),$s2); | |
1050 | + &mov (&DWP(12,$key),$s3); | |
1051 | + &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$idx)); | |
1052 | + &mov ($idx eq "esi" ? $idx : "",$key); | |
1053 | + &mov ($key eq "edi" ? $key : "",$_out); # load out | |
1054 | + &data_word(0xA4F3F689); # [mov esi,esi;] rep movsb # copy only the remaining output bytes | |
1055 | + &mov ($key,$_inp); # use inp [the current input block] as temp ivp | |
1056 | + &jmp (&label("dec_end")); | |
1057 | + | |
1058 | + &set_label("dec_in_place",4); | |
1059 | + &set_label("dec_in_place_loop"); | |
1060 | + &lea ($key,$ivec); | |
1061 | + &mov ($s0,&DWP(0,$idx)); # read input | |
1062 | + &mov ($s1,&DWP(4,$idx)); | |
1063 | + &mov ($s2,&DWP(8,$idx)); | |
1064 | + &mov ($s3,&DWP(12,$idx)); | |
1065 | + | |
1066 | + &mov (&DWP(0,$key),$s0); # copy input block to temp [it is about to be overwritten by the output] | |
1067 | + &mov (&DWP(4,$key),$s1); | |
1068 | + &mov (&DWP(8,$key),$s2); | |
1069 | + &bswap ($s0); | |
1070 | + &mov (&DWP(12,$key),$s3); | |
1071 | + &bswap ($s1); | |
1072 | + &mov ($key,$_key); # load key | |
1073 | + &bswap ($s2); | |
1074 | + &bswap ($s3); | |
1075 | + | |
1076 | + &call ("_x86_Camellia_decrypt"); | |
1077 | + | |
1078 | + &mov ($key,$_ivp); # load ivp | |
1079 | + &mov ($idx,$_out); # load out | |
1080 | + | |
1081 | + &bswap ($s0); | |
1082 | + &bswap ($s1); | |
1083 | + &bswap ($s2); | |
1084 | + &xor ($s0,&DWP(0,$key)); # xor iv | |
1085 | + &bswap ($s3); | |
1086 | + &xor ($s1,&DWP(4,$key)); | |
1087 | + &xor ($s2,&DWP(8,$key)); | |
1088 | + &xor ($s3,&DWP(12,$key)); | |
1089 | + | |
1090 | + &mov (&DWP(0,$idx),$s0); # write output | |
1091 | + &mov (&DWP(4,$idx),$s1); | |
1092 | + &mov (&DWP(8,$idx),$s2); | |
1093 | + &mov (&DWP(12,$idx),$s3); | |
1094 | + | |
1095 | + &lea ($idx,&DWP(16,$idx)); | |
1096 | + &mov ($_out,$idx); # save out | |
1097 | + | |
1098 | + &lea ($idx,$ivec); | |
1099 | + &mov ($s0,&DWP(0,$idx)); # read temp [the saved input block] | |
1100 | + &mov ($s1,&DWP(4,$idx)); | |
1101 | + &mov ($s2,&DWP(8,$idx)); | |
1102 | + &mov ($s3,&DWP(12,$idx)); | |
1103 | + | |
1104 | + &mov (&DWP(0,$key),$s0); # copy it into the user's iv | |
1105 | + &mov (&DWP(4,$key),$s1); | |
1106 | + &mov (&DWP(8,$key),$s2); | |
1107 | + &mov (&DWP(12,$key),$s3); | |
1108 | + | |
1109 | + &mov ($idx,$_inp); # load inp | |
1110 | + | |
1111 | + &lea ($idx,&DWP(16,$idx)); | |
1112 | + &mov ($_inp,$idx); # save inp | |
1113 | + | |
1114 | + &mov ($s2,$_len); # load len | |
1115 | + &sub ($s2,16); | |
1116 | + &jc (&label("dec_in_place_partial")); | |
1117 | + &mov ($_len,$s2); # save len | |
1118 | + &jnz (&label("dec_in_place_loop")); | |
1119 | + &jmp (&label("dec_out")); | |
1120 | + | |
1121 | + &set_label("dec_in_place_partial",4); | |
1122 | + # one can argue if this is actually required... | |
1123 | + &mov ($key eq "edi" ? $key : "",$_out); | |
1124 | + &lea ($idx eq "esi" ? $idx : "",$ivec); | |
1125 | + &lea ($key,&DWP(0,$key,$s2)); | |
1126 | + &lea ($idx,&DWP(16,$idx,$s2)); | |
1127 | + &neg ($s2 eq "ecx" ? $s2 : ""); | |
1128 | + &data_word(0xA4F3F689); # [mov esi,esi;] rep movsb # restore tail [bytes past len, taken from the saved input block] | |
1129 | + | |
1130 | + &set_label("dec_out",4); | |
1131 | + &mov ("esp",$_esp); | |
1132 | + &popf (); | |
1133 | +&function_end("Camellia_cbc_encrypt"); | |
1134 | +} | |
1135 | + | |
1136 | +&asciz("Camellia for x86 by <appro@openssl.org>"); | |
1137 | + | |
1138 | +&asm_finish(); |
@@ -0,0 +1,1080 @@ | ||
1 | +#!/usr/bin/env perl | |
2 | + | |
3 | +# ==================================================================== | |
4 | +# Copyright (c) 2008 Andy Polyakov <appro@openssl.org> | |
5 | +# | |
6 | +# This module may be used under the terms of either the GNU General | |
7 | +# Public License version 2 or later, the GNU Lesser General Public | |
8 | +# License version 2.1 or later, the Mozilla Public License version | |
9 | +# 1.1 or the BSD License. The exact terms of either license are | |
10 | +# distributed along with this module. For further details see | |
11 | +# http://www.openssl.org/~appro/camellia/. | |
12 | +# ==================================================================== | |
13 | + | |
14 | +# Performance in cycles per processed byte (less is better) in | |
15 | +# 'openssl speed ...' benchmark: | |
16 | +# | |
17 | +# AMD64 Core2 EM64T | |
18 | +# -evp camellia-128-ecb 16.7 21.0 22.7 | |
19 | +# + over gcc 3.4.6 +25% +5% 0% | |
20 | +# | |
21 | +# camellia-128-cbc 15.7 20.4 21.1 | |
22 | +# | |
23 | +# 128-bit key setup 128 216 205 cycles/key | |
24 | +# + over gcc 3.4.6 +54% +39% +15% | |
25 | +# | |
26 | +# Numbers in "+" rows represent performance improvement over compiler | |
27 | +# generated code. Key setup timings are impressive on AMD and Core2 | |
28 | +# thanks to 64-bit operations being covertly deployed. Improvement on | |
29 | +# EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it | |
30 | +# apparently emulates some of 64-bit operations in [32-bit] microcode. | |
31 | + | |
32 | +$flavour = shift; | |
33 | +$output = shift; | |
34 | +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | |
35 | + | |
36 | +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | |
37 | + | |
38 | +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
39 | +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | |
40 | +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | |
41 | +die "can't locate x86_64-xlate.pl"; | |
42 | + | |
43 | +open STDOUT,"| $^X $xlate $flavour $output"; | |
44 | + | |
45 | +sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; } | |
46 | +sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/; | |
47 | + $r =~ s/%[er]([sd]i)/%\1l/; | |
48 | + $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } | |
49 | + | |
50 | +$t0="%eax";$t1="%ebx";$t2="%ecx";$t3="%edx"; | |
51 | +@S=("%r8d","%r9d","%r10d","%r11d"); | |
52 | +$i0="%esi"; | |
53 | +$i1="%edi"; | |
54 | +$Tbl="%rbp"; # size optimization | |
55 | +$inp="%r12"; | |
56 | +$out="%r13"; | |
57 | +$key="%r14"; | |
58 | +$keyend="%r15"; | |
59 | +$arg0d=$win64?"%ecx":"%edi"; | |
60 | + | |
61 | +# const unsigned int Camellia_SBOX[4][256]; | |
62 | +# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][], | |
63 | +# and [2][] - with [3][]. This is done to minimize code size. | |
64 | +$SBOX1_1110=0; # Camellia_SBOX[0]: byte displacement from $Tbl, indexed with scale 8 | |
65 | +$SBOX4_4404=4; # Camellia_SBOX[1]: second word of each pair in the first half | |
66 | +$SBOX2_0222=2048; # Camellia_SBOX[2]: second half starts 256*8 bytes in | |
67 | +$SBOX3_3033=2052; # Camellia_SBOX[3]: second word of each pair in the second half | |
68 | + | |
69 | +sub Camellia_Feistel { | |
70 | +my $i=@_[0]; | |
71 | +my $seed=defined(@_[1])?@_[1]:0; | |
72 | +my $scale=$seed<0?-8:8; | |
73 | +my $j=($i&1)*2; | |
74 | +my $s0=@S[($j)%4],$s1=@S[($j+1)%4],$s2=@S[($j+2)%4],$s3=@S[($j+3)%4]; | |
75 | + | |
76 | +$code.=<<___; | |
77 | + xor $s0,$t0 # t0^=key[0] | |
78 | + xor $s1,$t1 # t1^=key[1] | |
79 | + movz `&hi("$t0")`,$i0 # (t0>>8)&0xff | |
80 | + movz `&lo("$t1")`,$i1 # (t1>>0)&0xff | |
81 | + mov $SBOX3_3033($Tbl,$i0,8),$t3 # t3=SBOX3_3033[0] | |
82 | + mov $SBOX1_1110($Tbl,$i1,8),$t2 # t2=SBOX1_1110[1] | |
83 | + movz `&lo("$t0")`,$i0 # (t0>>0)&0xff | |
84 | + shr \$16,$t0 | |
85 | + movz `&hi("$t1")`,$i1 # (t1>>8)&0xff | |
86 | + xor $SBOX4_4404($Tbl,$i0,8),$t3 # t3^=SBOX4_4404[0] | |
87 | + shr \$16,$t1 | |
88 | + xor $SBOX4_4404($Tbl,$i1,8),$t2 # t2^=SBOX4_4404[1] | |
89 | + movz `&hi("$t0")`,$i0 # (t0>>24)&0xff | |
90 | + movz `&lo("$t1")`,$i1 # (t1>>16)&0xff | |
91 | + xor $SBOX1_1110($Tbl,$i0,8),$t3 # t3^=SBOX1_1110[0] | |
92 | + xor $SBOX3_3033($Tbl,$i1,8),$t2 # t2^=SBOX3_3033[1] | |
93 | + movz `&lo("$t0")`,$i0 # (t0>>16)&0xff | |
94 | + movz `&hi("$t1")`,$i1 # (t1>>24)&0xff | |
95 | + xor $SBOX2_0222($Tbl,$i0,8),$t3 # t3^=SBOX2_0222[0] | |
96 | + xor $SBOX2_0222($Tbl,$i1,8),$t2 # t2^=SBOX2_0222[1] | |
97 | + mov `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1] | |
98 | + mov `$seed+($i+1)*$scale+4`($key),$t0 | |
99 | + xor $t3,$t2 # t2^=t3 | |
100 | + ror \$8,$t3 # t3=RightRotate(t3,8) | |
101 | + xor $t2,$s2 | |
102 | + xor $t2,$s3 | |
103 | + xor $t3,$s3 | |
104 | +___ | |
105 | +} | |
106 | + | |
107 | +# void Camellia_EncryptBlock_Rounds( | |
108 | +# int grandRounds, | |
109 | +# const Byte plaintext[], | |
110 | +# const KEY_TABLE_TYPE keyTable, | |
111 | +# Byte ciphertext[]) | |
112 | +$code=<<___; | |
113 | +.text | |
114 | + | |
115 | +# V1.x API | |
116 | +.globl Camellia_EncryptBlock | |
117 | +.type Camellia_EncryptBlock,\@abi-omnipotent | |
118 | +.align 16 | |
119 | +Camellia_EncryptBlock: | |
120 | + movl \$128,%eax | |
121 | + subl $arg0d,%eax | |
122 | + movl \$3,$arg0d | |
123 | + adcl \$0,$arg0d # keyBitLength==128?3:4 | |
124 | + jmp .Lenc_rounds | |
125 | +.size Camellia_EncryptBlock,.-Camellia_EncryptBlock | |
126 | +# V2 | |
127 | +.globl Camellia_EncryptBlock_Rounds | |
128 | +.type Camellia_EncryptBlock_Rounds,\@function,4 | |
129 | +.align 16 | |
130 | +.Lenc_rounds: | |
131 | +Camellia_EncryptBlock_Rounds: | |
132 | + push %rbx | |
133 | + push %rbp | |
134 | + push %r13 | |
135 | + push %r14 | |
136 | + push %r15 | |
137 | +.Lenc_prologue: | |
138 | + | |
139 | + #mov %rsi,$inp # put away arguments | |
140 | + mov %rcx,$out | |
141 | + mov %rdx,$key | |
142 | + | |
143 | + shl \$6,%edi # process grandRounds | |
144 | + lea .LCamellia_SBOX(%rip),$Tbl | |
145 | + lea ($key,%rdi),$keyend | |
146 | + | |
147 | + mov 0(%rsi),@S[0] # load plaintext | |
148 | + mov 4(%rsi),@S[1] | |
149 | + mov 8(%rsi),@S[2] | |
150 | + bswap @S[0] | |
151 | + mov 12(%rsi),@S[3] | |
152 | + bswap @S[1] | |
153 | + bswap @S[2] | |
154 | + bswap @S[3] | |
155 | + | |
156 | + call _x86_64_Camellia_encrypt | |
157 | + | |
158 | + bswap @S[0] | |
159 | + bswap @S[1] | |
160 | + bswap @S[2] | |
161 | + mov @S[0],0($out) | |
162 | + bswap @S[3] | |
163 | + mov @S[1],4($out) | |
164 | + mov @S[2],8($out) | |
165 | + mov @S[3],12($out) | |
166 | + | |
167 | + mov 0(%rsp),%r15 | |
168 | + mov 8(%rsp),%r14 | |
169 | + mov 16(%rsp),%r13 | |
170 | + mov 24(%rsp),%rbp | |
171 | + mov 32(%rsp),%rbx | |
172 | + lea 40(%rsp),%rsp | |
173 | +.Lenc_epilogue: | |
174 | + ret | |
175 | +.size Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds | |
176 | + | |
177 | +.type _x86_64_Camellia_encrypt,\@abi-omnipotent | |
178 | +.align 16 | |
179 | +_x86_64_Camellia_encrypt: | |
180 | + xor 0($key),@S[1] | |
181 | + xor 4($key),@S[0] # ^=key[0-3] | |
182 | + xor 8($key),@S[3] | |
183 | + xor 12($key),@S[2] | |
184 | +.align 16 | |
185 | +.Leloop: | |
186 | + mov 16($key),$t1 # prefetch key[4-5] | |
187 | + mov 20($key),$t0 | |
188 | + | |
189 | +___ | |
190 | + for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); } | |
191 | +$code.=<<___; | |
192 | + lea 16*4($key),$key | |
193 | + cmp $keyend,$key | |
194 | + mov 8($key),$t3 # prefetch key[2-3] | |
195 | + mov 12($key),$t2 | |
196 | + je .Ledone | |
197 | + | |
198 | + and @S[0],$t0 | |
199 | + or @S[3],$t3 | |
200 | + rol \$1,$t0 | |
201 | + xor $t3,@S[2] # s2^=s3|key[3]; | |
202 | + xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1); | |
203 | + and @S[2],$t2 | |
204 | + or @S[1],$t1 | |
205 | + rol \$1,$t2 | |
206 | + xor $t1,@S[0] # s0^=s1|key[1]; | |
207 | + xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1); | |
208 | + jmp .Leloop | |
209 | + | |
210 | +.align 16 | |
211 | +.Ledone: | |
212 | + xor @S[2],$t0 # SwapHalf | |
213 | + xor @S[3],$t1 | |
214 | + xor @S[0],$t2 | |
215 | + xor @S[1],$t3 | |
216 | + | |
217 | + mov $t0,@S[0] | |
218 | + mov $t1,@S[1] | |
219 | + mov $t2,@S[2] | |
220 | + mov $t3,@S[3] | |
221 | + | |
222 | + .byte 0xf3,0xc3 # rep ret | |
223 | +.size _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt | |
224 | + | |
225 | +# V1.x API | |
226 | +.globl Camellia_DecryptBlock | |
227 | +.type Camellia_DecryptBlock,\@abi-omnipotent | |
228 | +.align 16 | |
229 | +Camellia_DecryptBlock: | |
230 | + movl \$128,%eax | |
231 | + subl $arg0d,%eax | |
232 | + movl \$3,$arg0d | |
233 | + adcl \$0,$arg0d # keyBitLength==128?3:4 | |
234 | + jmp .Ldec_rounds | |
235 | +.size Camellia_DecryptBlock,.-Camellia_DecryptBlock | |
236 | +# V2 | |
237 | +.globl Camellia_DecryptBlock_Rounds | |
238 | +.type Camellia_DecryptBlock_Rounds,\@function,4 | |
239 | +.align 16 | |
240 | +.Ldec_rounds: | |
241 | +Camellia_DecryptBlock_Rounds: | |
242 | + push %rbx | |
243 | + push %rbp | |
244 | + push %r13 | |
245 | + push %r14 | |
246 | + push %r15 | |
247 | +.Ldec_prologue: | |
248 | + | |
249 | + #mov %rsi,$inp # put away arguments | |
250 | + mov %rcx,$out | |
251 | + mov %rdx,$keyend | |
252 | + | |
253 | + shl \$6,%edi # process grandRounds | |
254 | + lea .LCamellia_SBOX(%rip),$Tbl | |
255 | + lea ($keyend,%rdi),$key | |
256 | + | |
257 | + mov 0(%rsi),@S[0] # load plaintext | |
258 | + mov 4(%rsi),@S[1] | |
259 | + mov 8(%rsi),@S[2] | |
260 | + bswap @S[0] | |
261 | + mov 12(%rsi),@S[3] | |
262 | + bswap @S[1] | |
263 | + bswap @S[2] | |
264 | + bswap @S[3] | |
265 | + | |
266 | + call _x86_64_Camellia_decrypt | |
267 | + | |
268 | + bswap @S[0] | |
269 | + bswap @S[1] | |
270 | + bswap @S[2] | |
271 | + mov @S[0],0($out) | |
272 | + bswap @S[3] | |
273 | + mov @S[1],4($out) | |
274 | + mov @S[2],8($out) | |
275 | + mov @S[3],12($out) | |
276 | + | |
277 | + mov 0(%rsp),%r15 | |
278 | + mov 8(%rsp),%r14 | |
279 | + mov 16(%rsp),%r13 | |
280 | + mov 24(%rsp),%rbp | |
281 | + mov 32(%rsp),%rbx | |
282 | + lea 40(%rsp),%rsp | |
283 | +.Ldec_epilogue: | |
284 | + ret | |
285 | +.size Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds | |
286 | + | |
287 | +.type _x86_64_Camellia_decrypt,\@abi-omnipotent | |
288 | +.align 16 | |
289 | +_x86_64_Camellia_decrypt: | |
290 | + xor 0($key),@S[1] | |
291 | + xor 4($key),@S[0] # ^=key[0-3] | |
292 | + xor 8($key),@S[3] | |
293 | + xor 12($key),@S[2] | |
294 | +.align 16 | |
295 | +.Ldloop: | |
296 | + mov -8($key),$t1 # prefetch key[4-5] | |
297 | + mov -4($key),$t0 | |
298 | + | |
299 | +___ | |
300 | + for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); } | |
301 | +$code.=<<___; | |
302 | + lea -16*4($key),$key | |
303 | + cmp $keyend,$key | |
304 | + mov 0($key),$t3 # prefetch key[2-3] | |
305 | + mov 4($key),$t2 | |
306 | + je .Lddone | |
307 | + | |
308 | + and @S[0],$t0 | |
309 | + or @S[3],$t3 | |
310 | + rol \$1,$t0 | |
311 | + xor $t3,@S[2] # s2^=s3|key[3]; | |
312 | + xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1); | |
313 | + and @S[2],$t2 | |
314 | + or @S[1],$t1 | |
315 | + rol \$1,$t2 | |
316 | + xor $t1,@S[0] # s0^=s1|key[1]; | |
317 | + xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1); | |
318 | + | |
319 | + jmp .Ldloop | |
320 | + | |
321 | +.align 16 | |
322 | +.Lddone: | |
323 | + xor @S[2],$t2 | |
324 | + xor @S[3],$t3 | |
325 | + xor @S[0],$t0 | |
326 | + xor @S[1],$t1 | |
327 | + | |
328 | + mov $t2,@S[0] # SwapHalf | |
329 | + mov $t3,@S[1] | |
330 | + mov $t0,@S[2] | |
331 | + mov $t1,@S[3] | |
332 | + | |
333 | + .byte 0xf3,0xc3 # rep ret | |
334 | +.size _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt | |
335 | +___ | |
336 | + | |
337 | +sub _saveround { | |
338 | +my ($rnd,$key,@T)=@_; | |
339 | +my $bias=int(@T[0])?shift(@T):0; | |
340 | + | |
341 | + if ($#T==3) { | |
342 | + $code.=<<___; | |
343 | + mov @T[1],`$bias+$rnd*8+0`($key) | |
344 | + mov @T[0],`$bias+$rnd*8+4`($key) | |
345 | + mov @T[3],`$bias+$rnd*8+8`($key) | |
346 | + mov @T[2],`$bias+$rnd*8+12`($key) | |
347 | +___ | |
348 | + } else { | |
349 | + $code.=" mov @T[0],`$bias+$rnd*8+0`($key)\n"; | |
350 | + $code.=" mov @T[1],`$bias+$rnd*8+8`($key)\n" if ($#T>=1); | |
351 | + } | |
352 | +} | |
353 | + | |
354 | +sub _loadround { | |
355 | +my ($rnd,$key,@T)=@_; | |
356 | +my $bias=int(@T[0])?shift(@T):0; | |
357 | + | |
358 | +$code.=" mov `$bias+$rnd*8+0`($key),@T[0]\n"; | |
359 | +$code.=" mov `$bias+$rnd*8+8`($key),@T[1]\n" if ($#T>=1); | |
360 | +} | |
361 | + | |
362 | +# shld is very slow on Intel EM64T family. Even on AMD it limits | |
363 | +# instruction decode rate [because it's VectorPath] and consequently | |
364 | +# performance... This shld-based variant uses %r11 as scratch; Camellia_Ekeygen below calls _rotl128 instead. | |
365 | +sub __rotl128 { | |
366 | +my ($i0,$i1,$rot)=@_; | |
367 | + | |
368 | + if ($rot) { | |
369 | + $code.=<<___; | |
370 | + mov $i0,%r11 | |
371 | + shld \$$rot,$i1,$i0 | |
372 | + shld \$$rot,%r11,$i1 | |
373 | +___ | |
374 | + } | |
375 | +} | |
376 | + | |
377 | +# ... Implementing 128-bit rotate without shld gives 80% better | |
378 | +# performance on EM64T, +15% on AMD64 and only ~7% degradation on | |
379 | +# Core2. This is therefore preferred. Rotates $i0:$i1 left by $rot bits [nothing is emitted for 0]; clobbers %r9 and %r11. | |
380 | +sub _rotl128 { | |
381 | +my ($i0,$i1,$rot)=@_; | |
382 | + | |
383 | + if ($rot) { | |
384 | + $code.=<<___; | |
385 | + mov $i0,%r11 | |
386 | + shl \$$rot,$i0 | |
387 | + mov $i1,%r9 | |
388 | + shr \$`64-$rot`,%r9 | |
389 | + shr \$`64-$rot`,%r11 | |
390 | + or %r9,$i0 | |
391 | + shl \$$rot,$i1 | |
392 | + or %r11,$i1 | |
393 | +___ | |
394 | + } | |
395 | +} | |
396 | + | |
397 | +{ my $step=0; | |
398 | + | |
399 | +$code.=<<___; | |
400 | +.globl Camellia_Ekeygen | |
401 | +.type Camellia_Ekeygen,\@function,3 | |
402 | +.align 16 | |
403 | +Camellia_Ekeygen: | |
404 | + push %rbx | |
405 | + push %rbp | |
406 | + push %r13 | |
407 | + push %r14 | |
408 | + push %r15 | |
409 | +.Lkey_prologue: | |
410 | + | |
411 | + mov %rdi,$keyend # put away arguments, keyBitLength | |
412 | + mov %rdx,$out # keyTable | |
413 | + | |
414 | + mov 0(%rsi),@S[0] # load 0-127 bits | |
415 | + mov 4(%rsi),@S[1] | |
416 | + mov 8(%rsi),@S[2] | |
417 | + mov 12(%rsi),@S[3] | |
418 | + | |
419 | + bswap @S[0] | |
420 | + bswap @S[1] | |
421 | + bswap @S[2] | |
422 | + bswap @S[3] | |
423 | +___ | |
424 | + &_saveround (0,$out,@S); # KL<<<0 | |
425 | +$code.=<<___; | |
426 | + cmp \$128,$keyend # check keyBitLength | |
427 | + je .L1st128 | |
428 | + | |
429 | + mov 16(%rsi),@S[0] # load 128-191 bits | |
430 | + mov 20(%rsi),@S[1] | |
431 | + cmp \$192,$keyend | |
432 | + je .L1st192 | |
433 | + mov 24(%rsi),@S[2] # load 192-255 bits | |
434 | + mov 28(%rsi),@S[3] | |
435 | + jmp .L1st256 | |
436 | +.L1st192: | |
437 | + mov @S[0],@S[2] | |
438 | + mov @S[1],@S[3] | |
439 | + not @S[2] | |
440 | + not @S[3] | |
441 | +.L1st256: | |
442 | + bswap @S[0] | |
443 | + bswap @S[1] | |
444 | + bswap @S[2] | |
445 | + bswap @S[3] | |
446 | +___ | |
447 | + &_saveround (4,$out,@S); # temp storage for KR! | |
448 | +$code.=<<___; | |
449 | + xor 0($out),@S[1] # KR^KL | |
450 | + xor 4($out),@S[0] | |
451 | + xor 8($out),@S[3] | |
452 | + xor 12($out),@S[2] | |
453 | + | |
454 | +.L1st128: | |
455 | + lea .LCamellia_SIGMA(%rip),$key | |
456 | + lea .LCamellia_SBOX(%rip),$Tbl | |
457 | + | |
458 | + mov 0($key),$t1 | |
459 | + mov 4($key),$t0 | |
460 | +___ | |
461 | + &Camellia_Feistel($step++); | |
462 | + &Camellia_Feistel($step++); | |
463 | +$code.=<<___; | |
464 | + xor 0($out),@S[1] # ^KL | |
465 | + xor 4($out),@S[0] | |
466 | + xor 8($out),@S[3] | |
467 | + xor 12($out),@S[2] | |
468 | +___ | |
469 | + &Camellia_Feistel($step++); | |
470 | + &Camellia_Feistel($step++); | |
471 | +$code.=<<___; | |
472 | + cmp \$128,$keyend | |
473 | + jne .L2nd256 | |
474 | + | |
475 | + lea 128($out),$out # size optimization | |
476 | + shl \$32,%r8 # @S[0]|| | |
477 | + shl \$32,%r10 # @S[2]|| | |
478 | + or %r9,%r8 # ||@S[1] | |
479 | + or %r11,%r10 # ||@S[3] | |
480 | +___ | |
481 | + &_loadround (0,$out,-128,"%rax","%rbx"); # KL [the -128 bias undoes the preceding lea 128($out)] | |
482 | + &_saveround (2,$out,-128,"%r8","%r10"); # KA<<<0 | |
483 | + &_rotl128 ("%rax","%rbx",15); | |
484 | + &_saveround (4,$out,-128,"%rax","%rbx"); # KL<<<15 | |
485 | + &_rotl128 ("%r8","%r10",15); | |
486 | + &_saveround (6,$out,-128,"%r8","%r10"); # KA<<<15 | |
487 | + &_rotl128 ("%r8","%r10",15); # 15+15=30 | |
488 | + &_saveround (8,$out,-128,"%r8","%r10"); # KA<<<30 | |
489 | + &_rotl128 ("%rax","%rbx",30); # 15+30=45 | |
490 | + &_saveround (10,$out,-128,"%rax","%rbx"); # KL<<<45 | |
491 | + &_rotl128 ("%r8","%r10",15); # 30+15=45 | |
492 | + &_saveround (12,$out,-128,"%r8"); # KA<<<45, high 64 bits only | |
493 | + &_rotl128 ("%rax","%rbx",15); # 45+15=60 | |
494 | + &_saveround (13,$out,-128,"%rbx"); # KL<<<60, low 64 bits only | |
495 | + &_rotl128 ("%r8","%r10",15); # 45+15=60 | |
496 | + &_saveround (14,$out,-128,"%r8","%r10"); # KA<<<60 | |
497 | + &_rotl128 ("%rax","%rbx",17); # 60+17=77 | |
498 | + &_saveround (16,$out,-128,"%rax","%rbx"); # KL<<<77 | |
499 | + &_rotl128 ("%rax","%rbx",17); # 77+17=94 | |
500 | + &_saveround (18,$out,-128,"%rax","%rbx"); # KL<<<94 | |
501 | + &_rotl128 ("%r8","%r10",34); # 60+34=94 | |
502 | + &_saveround (20,$out,-128,"%r8","%r10"); # KA<<<94 | |
503 | + &_rotl128 ("%rax","%rbx",17); # 94+17=111 | |
504 | + &_saveround (22,$out,-128,"%rax","%rbx"); # KL<<<111 | |
505 | + &_rotl128 ("%r8","%r10",17); # 94+17=111 | |
506 | + &_saveround (24,$out,-128,"%r8","%r10"); # KA<<<111 | |
507 | +$code.=<<___; | |
508 | + mov \$3,%eax | |
509 | + jmp .Ldone | |
510 | +.align 16 | |
511 | +.L2nd256: | |
512 | +___ | |
513 | + &_saveround (6,$out,@S); # temp storage for KA! | |
514 | +$code.=<<___; | |
515 | + xor `4*8+0`($out),@S[1] # KA^KR | |
516 | + xor `4*8+4`($out),@S[0] | |
517 | + xor `5*8+0`($out),@S[3] | |
518 | + xor `5*8+4`($out),@S[2] | |
519 | +___ | |
520 | + &Camellia_Feistel($step++); | |
521 | + &Camellia_Feistel($step++); | |
522 | + | |
523 | + &_loadround (0,$out,"%rax","%rbx"); # KL | |
524 | + &_loadround (4,$out,"%rcx","%rdx"); # KR | |
525 | + &_loadround (6,$out,"%r14","%r15"); # KA | |
526 | +$code.=<<___; | |
527 | + lea 128($out),$out # size optimization | |
528 | + shl \$32,%r8 # @S[0]|| | |
529 | + shl \$32,%r10 # @S[2]|| | |
530 | + or %r9,%r8 # ||@S[1] | |
531 | + or %r11,%r10 # ||@S[3] | |
532 | +___ | |
533 | + &_saveround (2,$out,-128,"%r8","%r10"); # KB<<<0 | |
534 | + &_rotl128 ("%rcx","%rdx",15); | |
535 | + &_saveround (4,$out,-128,"%rcx","%rdx"); # KR<<<15 | |
536 | + &_rotl128 ("%r14","%r15",15); | |
537 | + &_saveround (6,$out,-128,"%r14","%r15"); # KA<<<15 | |
538 | + &_rotl128 ("%rcx","%rdx",15); # 15+15=30 | |
539 | + &_saveround (8,$out,-128,"%rcx","%rdx"); # KR<<<30 | |
540 | + &_rotl128 ("%r8","%r10",30); | |
541 | + &_saveround (10,$out,-128,"%r8","%r10"); # KB<<<30 | |
542 | + &_rotl128 ("%rax","%rbx",45); | |
543 | + &_saveround (12,$out,-128,"%rax","%rbx"); # KL<<<45 | |
544 | + &_rotl128 ("%r14","%r15",30); # 15+30=45 | |
545 | + &_saveround (14,$out,-128,"%r14","%r15"); # KA<<<45 | |
546 | + &_rotl128 ("%rax","%rbx",15); # 45+15=60 | |
547 | + &_saveround (16,$out,-128,"%rax","%rbx"); # KL<<<60 | |
548 | + &_rotl128 ("%rcx","%rdx",30); # 30+30=60 | |
549 | + &_saveround (18,$out,-128,"%rcx","%rdx"); # KR<<<60 | |
550 | + &_rotl128 ("%r8","%r10",30); # 30+30=60 | |
551 | + &_saveround (20,$out,-128,"%r8","%r10"); # KB<<<60 | |
552 | + &_rotl128 ("%rax","%rbx",17); # 60+17=77 | |
553 | + &_saveround (22,$out,-128,"%rax","%rbx"); # KL<<<77 | |
554 | + &_rotl128 ("%r14","%r15",32); # 45+32=77 | |
555 | + &_saveround (24,$out,-128,"%r14","%r15"); # KA<<<77 | |
556 | + &_rotl128 ("%rcx","%rdx",34); # 60+34=94 | |
557 | + &_saveround (26,$out,-128,"%rcx","%rdx"); # KR<<<94 | |
558 | + &_rotl128 ("%r14","%r15",17); # 77+17=94 | |
559 | + &_saveround (28,$out,-128,"%r14","%r15"); # KA<<<94 | |
560 | + &_rotl128 ("%rax","%rbx",34); # 77+34=111 | |
561 | + &_saveround (30,$out,-128,"%rax","%rbx"); # KL<<<111 | |
562 | + &_rotl128 ("%r8","%r10",51); # 60+51=111 | |
563 | + &_saveround (32,$out,-128,"%r8","%r10"); # KB<<<111 | |
564 | +$code.=<<___; | |
565 | + mov \$4,%eax | |
566 | +.Ldone: | |
567 | + mov 0(%rsp),%r15 | |
568 | + mov 8(%rsp),%r14 | |
569 | + mov 16(%rsp),%r13 | |
570 | + mov 24(%rsp),%rbp | |
571 | + mov 32(%rsp),%rbx | |
572 | + lea 40(%rsp),%rsp | |
573 | +.Lkey_epilogue: | |
574 | + ret | |
575 | +.size Camellia_Ekeygen,.-Camellia_Ekeygen | |
576 | +___ | |
577 | +} | |
578 | + | |
# 256-entry byte substitution table used to build the lookup tables that are
# emitted under .LCamellia_SBOX.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Each helper maps an index 0..255 to one 32-bit table word and returns it
# formatted as "0x%08x".  The digits in the sub name describe the four byte
# lanes of that word, most significant first:
#   0 - zero byte
#   1 - SBOX[i]
#   2 - SBOX[i] rotated left by one bit
#   3 - SBOX[i] rotated right by one bit
#   4 - SBOX[i rotated left by one bit]
# Elements are read with $SBOX[...]; the previous @SBOX[...] form was a
# one-element slice that only worked by accident of scalar assignment.

# Word layout: v,v,v,0 with v = SBOX[i].
sub S1110 {
	my $i=shift;
	my $v=$SBOX[$i];
	return sprintf("0x%08x",$v<<24|$v<<16|$v<<8);
}

# Word layout: v,v,0,v with v = SBOX[rotl8(i,1)].
sub S4404 {
	my $i=shift;
	my $v=$SBOX[($i<<1|$i>>7)&0xff];
	return sprintf("0x%08x",$v<<24|$v<<16|$v);
}

# Word layout: 0,v,v,v with v = rotl8(SBOX[i],1).
sub S0222 {
	my $i=shift;
	my $v=$SBOX[$i];
	$v=($v<<1|$v>>7)&0xff;
	return sprintf("0x%08x",$v<<16|$v<<8|$v);
}

# Word layout: v,0,v,v with v = rotr8(SBOX[i],1).
sub S3033 {
	my $i=shift;
	my $v=$SBOX[$i];
	$v=($v>>1|$v<<7)&0xff;
	return sprintf("0x%08x",$v<<24|$v<<8|$v);
}
601 | + | |
# Emit the read-only data section: the 64-byte-aligned .LCamellia_SIGMA block
# (three rows of constants plus one all-zero row of .long values) followed by
# the .LCamellia_SBOX lookup tables.
# NOTE(review): the SIGMA rows are presumably the Camellia key-schedule
# constants stored as pairs of 32-bit words -- confirm the word order against
# the key-setup code that reads them.
602 | +$code.=<<___; | |
603 | +.align 64 | |
604 | +.LCamellia_SIGMA: | |
605 | +.long 0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858 | |
606 | +.long 0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5 | |
607 | +.long 0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2 | |
608 | +.long 0, 0, 0, 0 | |
609 | +.LCamellia_SBOX: | |
610 | +___ | |
611 | +# tables are interleaved, remember? | |
# data_word appends one ".long a,b,..." line to $code.  Each loop below emits
# 256 lines of two 32-bit words: first the S1110/S4404 pair for every index,
# then the S0222/S3033 pair, i.e. 2 * 256 * 8 = 4096 bytes in total.
612 | +sub data_word { $code.=".long\t".join(',',@_)."\n"; } | |
613 | +for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); } | |
614 | +for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); } | |
615 | + | |
616 | +# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out, | |
617 | +# size_t length, const CAMELLIA_KEY *key, | |
618 | +# unsigned char *ivp,const int enc); | |
#
# Overview of the generated routine (as implemented by the code below):
# - length == 0: returns immediately through .Lcbc_abort, before any
#   register is pushed.
# - Otherwise %rbx,%rbp,%r12-%r15 are pushed, %rsp is saved in %rbp and a
#   64-byte aligned scratch frame is carved out; its position is further
#   adjusted relative to the key schedule address using a 0x3C0 mask.
# - The whole 4096-byte .LCamellia_SBOX table is read once (32 iterations,
#   128 bytes per iteration) before any data is processed.
# - enc != 0 (.Lcbc_eloop): each 16-byte block is XORed with the chaining
#   value, byte-swapped, encrypted and written to out.  A trailing partial
#   block (length & 15) is copied into the zeroed IV scratch buffer by
#   .Lcbc_enc_tail and run through the loop one more time, so a full
#   16-byte block is written for it.
# - enc == 0 (.LCBC_DECRYPT): length is rounded up to a multiple of 16 for
#   the input end pointer; each block is decrypted and XORed with the
#   previous ciphertext block.  For a trailing partial block only
#   length & 15 bytes of the last plaintext block are copied to out
#   (.Lcbc_dec_tail).
# - On both paths the final chaining value is stored back through ivp
#   before .Lcbc_done restores the saved registers and %rsp.
619 | +{ | |
# Scratch frame slots, addressed relative to the adjusted %rsp:
620 | +$_key="0(%rsp)"; | |
621 | +$_end="8(%rsp)"; # inp+len&~15 | |
622 | +$_res="16(%rsp)"; # len&15 | |
# 16-byte buffer (offsets 24..39) holding the IV / a padded partial block.
# Inside the pushfq...popfq windows %rsp is 8 bytes lower, which is why the
# code there addresses this buffer as 8+$ivec.
623 | +$ivec="24(%rsp)"; | |
# Copy of the ivp argument (%r8), reloaded when the IV is written back.
624 | +$_ivp="40(%rsp)"; | |
# %rsp as it was right after the six register pushes (kept in %rbp during
# setup); .Lcbc_done reloads the saved registers relative to this value.
625 | +$_rsp="48(%rsp)"; | |
626 | + | |
627 | +$code.=<<___; | |
628 | +.globl Camellia_cbc_encrypt | |
629 | +.type Camellia_cbc_encrypt,\@function,6 | |
630 | +.align 16 | |
631 | +Camellia_cbc_encrypt: | |
632 | + cmp \$0,%rdx | |
633 | + je .Lcbc_abort | |
634 | + push %rbx | |
635 | + push %rbp | |
636 | + push %r12 | |
637 | + push %r13 | |
638 | + push %r14 | |
639 | + push %r15 | |
640 | +.Lcbc_prologue: | |
641 | + | |
642 | + mov %rsp,%rbp | |
643 | + sub \$64,%rsp | |
644 | + and \$-64,%rsp | |
645 | + | |
646 | + # place stack frame just "above mod 1024" the key schedule, | |
647 | + # this ensures that cache associativity suffices | |
648 | + lea -64-63(%rcx),%r10 | |
649 | + sub %rsp,%r10 | |
650 | + neg %r10 | |
651 | + and \$0x3C0,%r10 | |
652 | + sub %r10,%rsp | |
653 | + #add \$8,%rsp # 8 is reserved for callee's ra | |
654 | + | |
655 | + mov %rdi,$inp # inp argument | |
656 | + mov %rsi,$out # out argument | |
657 | + mov %r8,%rbx # ivp argument | |
658 | + mov %rcx,$key # key argument | |
659 | + mov 272(%rcx),$keyend # grandRounds | |
660 | + | |
661 | + mov %r8,$_ivp | |
662 | + mov %rbp,$_rsp | |
663 | + | |
664 | +.Lcbc_body: | |
665 | + lea .LCamellia_SBOX(%rip),$Tbl | |
666 | + | |
667 | + mov \$32,%ecx | |
668 | +.align 4 | |
669 | +.Lcbc_prefetch_sbox: | |
670 | + mov 0($Tbl),%rax | |
671 | + mov 32($Tbl),%rsi | |
672 | + mov 64($Tbl),%rdi | |
673 | + mov 96($Tbl),%r11 | |
674 | + lea 128($Tbl),$Tbl | |
675 | + loop .Lcbc_prefetch_sbox | |
676 | + sub \$4096,$Tbl | |
677 | + shl \$6,$keyend | |
678 | + mov %rdx,%rcx # len argument | |
679 | + lea ($key,$keyend),$keyend | |
680 | + | |
681 | + cmp \$0,%r9d # enc argument | |
682 | + je .LCBC_DECRYPT | |
683 | + | |
684 | + and \$-16,%rdx | |
685 | + and \$15,%rcx # length residue | |
686 | + lea ($inp,%rdx),%rdx | |
687 | + mov $key,$_key | |
688 | + mov %rdx,$_end | |
689 | + mov %rcx,$_res | |
690 | + | |
691 | + cmp $inp,%rdx | |
692 | + mov 0(%rbx),@S[0] # load IV | |
693 | + mov 4(%rbx),@S[1] | |
694 | + mov 8(%rbx),@S[2] | |
695 | + mov 12(%rbx),@S[3] | |
696 | + je .Lcbc_enc_tail | |
697 | + jmp .Lcbc_eloop | |
698 | + | |
699 | +.align 16 | |
700 | +.Lcbc_eloop: | |
701 | + xor 0($inp),@S[0] | |
702 | + xor 4($inp),@S[1] | |
703 | + xor 8($inp),@S[2] | |
704 | + bswap @S[0] | |
705 | + xor 12($inp),@S[3] | |
706 | + bswap @S[1] | |
707 | + bswap @S[2] | |
708 | + bswap @S[3] | |
709 | + | |
710 | + call _x86_64_Camellia_encrypt | |
711 | + | |
712 | + mov $_key,$key # "rewind" the key | |
713 | + bswap @S[0] | |
714 | + mov $_end,%rdx | |
715 | + bswap @S[1] | |
716 | + mov $_res,%rcx | |
717 | + bswap @S[2] | |
718 | + mov @S[0],0($out) | |
719 | + bswap @S[3] | |
720 | + mov @S[1],4($out) | |
721 | + mov @S[2],8($out) | |
722 | + lea 16($inp),$inp | |
723 | + mov @S[3],12($out) | |
724 | + cmp %rdx,$inp | |
725 | + lea 16($out),$out | |
726 | + jne .Lcbc_eloop | |
727 | + | |
728 | + cmp \$0,%rcx | |
729 | + jne .Lcbc_enc_tail | |
730 | + | |
731 | + mov $_ivp,$out | |
732 | + mov @S[0],0($out) # write out IV residue | |
733 | + mov @S[1],4($out) | |
734 | + mov @S[2],8($out) | |
735 | + mov @S[3],12($out) | |
736 | + jmp .Lcbc_done | |
737 | + | |
738 | +.align 16 | |
739 | +.Lcbc_enc_tail: | |
740 | + xor %rax,%rax | |
741 | + mov %rax,0+$ivec | |
742 | + mov %rax,8+$ivec | |
743 | + mov %rax,$_res | |
744 | + | |
745 | +.Lcbc_enc_pushf: | |
746 | + pushfq | |
747 | + cld | |
748 | + mov $inp,%rsi | |
749 | + lea 8+$ivec,%rdi | |
750 | + .long 0x9066A4F3 # rep movsb | |
751 | + popfq | |
752 | +.Lcbc_enc_popf: | |
753 | + | |
754 | + lea $ivec,$inp | |
755 | + lea 16+$ivec,%rax | |
756 | + mov %rax,$_end | |
757 | + jmp .Lcbc_eloop # one more time | |
758 | + | |
759 | +.align 16 | |
760 | +.LCBC_DECRYPT: | |
761 | + xchg $key,$keyend | |
762 | + add \$15,%rdx | |
763 | + and \$15,%rcx # length residue | |
764 | + and \$-16,%rdx | |
765 | + mov $key,$_key | |
766 | + lea ($inp,%rdx),%rdx | |
767 | + mov %rdx,$_end | |
768 | + mov %rcx,$_res | |
769 | + | |
770 | + mov (%rbx),%rax # load IV | |
771 | + mov 8(%rbx),%rbx | |
772 | + jmp .Lcbc_dloop | |
773 | +.align 16 | |
774 | +.Lcbc_dloop: | |
775 | + mov 0($inp),@S[0] | |
776 | + mov 4($inp),@S[1] | |
777 | + mov 8($inp),@S[2] | |
778 | + bswap @S[0] | |
779 | + mov 12($inp),@S[3] | |
780 | + bswap @S[1] | |
781 | + mov %rax,0+$ivec # save IV to temporary storage | |
782 | + bswap @S[2] | |
783 | + mov %rbx,8+$ivec | |
784 | + bswap @S[3] | |
785 | + | |
786 | + call _x86_64_Camellia_decrypt | |
787 | + | |
788 | + mov $_key,$key # "rewind" the key | |
789 | + mov $_end,%rdx | |
790 | + mov $_res,%rcx | |
791 | + | |
792 | + bswap @S[0] | |
793 | + mov ($inp),%rax # load IV for next iteration | |
794 | + bswap @S[1] | |
795 | + mov 8($inp),%rbx | |
796 | + bswap @S[2] | |
797 | + xor 0+$ivec,@S[0] | |
798 | + bswap @S[3] | |
799 | + xor 4+$ivec,@S[1] | |
800 | + xor 8+$ivec,@S[2] | |
801 | + lea 16($inp),$inp | |
802 | + xor 12+$ivec,@S[3] | |
803 | + cmp %rdx,$inp | |
804 | + je .Lcbc_ddone | |
805 | + | |
806 | + mov @S[0],0($out) | |
807 | + mov @S[1],4($out) | |
808 | + mov @S[2],8($out) | |
809 | + mov @S[3],12($out) | |
810 | + | |
811 | + lea 16($out),$out | |
812 | + jmp .Lcbc_dloop | |
813 | + | |
814 | +.align 16 | |
815 | +.Lcbc_ddone: | |
816 | + mov $_ivp,%rdx | |
817 | + cmp \$0,%rcx | |
818 | + jne .Lcbc_dec_tail | |
819 | + | |
820 | + mov @S[0],0($out) | |
821 | + mov @S[1],4($out) | |
822 | + mov @S[2],8($out) | |
823 | + mov @S[3],12($out) | |
824 | + | |
825 | + mov %rax,(%rdx) # write out IV residue | |
826 | + mov %rbx,8(%rdx) | |
827 | + jmp .Lcbc_done | |
828 | +.align 16 | |
829 | +.Lcbc_dec_tail: | |
830 | + mov @S[0],0+$ivec | |
831 | + mov @S[1],4+$ivec | |
832 | + mov @S[2],8+$ivec | |
833 | + mov @S[3],12+$ivec | |
834 | + | |
835 | +.Lcbc_dec_pushf: | |
836 | + pushfq | |
837 | + cld | |
838 | + lea 8+$ivec,%rsi | |
839 | + lea ($out),%rdi | |
840 | + .long 0x9066A4F3 # rep movsb | |
841 | + popfq | |
842 | +.Lcbc_dec_popf: | |
843 | + | |
844 | + mov %rax,(%rdx) # write out IV residue | |
845 | + mov %rbx,8(%rdx) | |
846 | + jmp .Lcbc_done | |
847 | + | |
848 | +.align 16 | |
849 | +.Lcbc_done: | |
850 | + mov $_rsp,%rcx | |
851 | + mov 0(%rcx),%r15 | |
852 | + mov 8(%rcx),%r14 | |
853 | + mov 16(%rcx),%r13 | |
854 | + mov 24(%rcx),%r12 | |
855 | + mov 32(%rcx),%rbp | |
856 | + mov 40(%rcx),%rbx | |
857 | + lea 48(%rcx),%rsp | |
858 | +.Lcbc_abort: | |
859 | + ret | |
860 | +.size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt | |
861 | + | |
862 | +.asciz "Camellia for x86_64 by <appro@openssl.org>" | |
863 | +___ | |
864 | +} | |
865 | + | |
866 | +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | |
867 | +# CONTEXT *context,DISPATCHER_CONTEXT *disp) | |
#
# Only generated when $win64 is set.  Two handlers are emitted; both patch
# the CONTEXT record so that it describes the caller of the interrupted
# routine, copy it (1232/8 quadwords) to disp->ContextRecord, call
# RtlVirtualUnwind and return 1 (ExceptionContinueSearch) through the shared
# .Lcommon_seh_exit tail that lives inside cbc_se_handler.
#
# common_se_handler: HandlerData[0] and HandlerData[1] are image-relative
#   addresses of the routine's prologue and epilogue labels.  When Rip lies
#   between them, %r15,%r14,%r13,%rbp,%rbx are reloaded from 0..32 bytes
#   above context->Rsp and the frame is treated as 40 bytes larger.
#
# cbc_se_handler: hard-wired to the labels of Camellia_cbc_encrypt.
#   - Rip before .Lcbc_prologue or at/after .Lcbc_abort: no non-volatile
#     registers are restored.
#   - Rip before .Lcbc_body: the six pushed registers are read from just
#     below the address held in context->Rax.
#   - Otherwise the saved stack pointer is fetched from offset 48 of the
#     frame; while Rip is strictly after a pushf label and before the
#     matching popf label the frame address is first advanced by 8 to skip
#     the pushed flags.
#
# NOTE(review): both handlers read Rdi/Rsi from 8 and 16 bytes above the
# recovered stack pointer and, ahead of the prologue label, take that
# pointer from context->Rax.  This presumably matches the Win64 entry
# sequence added by the perlasm translator -- confirm against the
# translator script.
868 | +if ($win64) { | |
869 | +$rec="%rcx"; | |
870 | +$frame="%rdx"; | |
871 | +$context="%r8"; | |
872 | +$disp="%r9"; | |
873 | + | |
874 | +$code.=<<___; | |
875 | +.extern __imp_RtlVirtualUnwind | |
876 | +.type common_se_handler,\@abi-omnipotent | |
877 | +.align 16 | |
878 | +common_se_handler: | |
879 | + push %rsi | |
880 | + push %rdi | |
881 | + push %rbx | |
882 | + push %rbp | |
883 | + push %r12 | |
884 | + push %r13 | |
885 | + push %r14 | |
886 | + push %r15 | |
887 | + pushfq | |
888 | + lea -64(%rsp),%rsp | |
889 | + | |
890 | + mov 120($context),%rax # pull context->Rax | |
891 | + mov 248($context),%rbx # pull context->Rip | |
892 | + | |
893 | + mov 8($disp),%rsi # disp->ImageBase | |
894 | + mov 56($disp),%r11 # disp->HandlerData | |
895 | + | |
896 | + mov 0(%r11),%r10d # HandlerData[0] | |
897 | + lea (%rsi,%r10),%r10 # prologue label | |
898 | + cmp %r10,%rbx # context->Rip<prologue label | |
899 | + jb .Lin_prologue | |
900 | + | |
901 | + mov 152($context),%rax # pull context->Rsp | |
902 | + | |
903 | + mov 4(%r11),%r10d # HandlerData[1] | |
904 | + lea (%rsi,%r10),%r10 # epilogue label | |
905 | + cmp %r10,%rbx # context->Rip>=epilogue label | |
906 | + jae .Lin_prologue | |
907 | + | |
908 | + lea 40(%rax),%rax | |
909 | + mov -8(%rax),%rbx | |
910 | + mov -16(%rax),%rbp | |
911 | + mov -24(%rax),%r13 | |
912 | + mov -32(%rax),%r14 | |
913 | + mov -40(%rax),%r15 | |
914 | + mov %rbx,144($context) # restore context->Rbx | |
915 | + mov %rbp,160($context) # restore context->Rbp | |
916 | + mov %r13,224($context) # restore context->R13 | |
917 | + mov %r14,232($context) # restore context->R14 | |
918 | + mov %r15,240($context) # restore context->R15 | |
919 | + | |
920 | +.Lin_prologue: | |
921 | + mov 8(%rax),%rdi | |
922 | + mov 16(%rax),%rsi | |
923 | + mov %rax,152($context) # restore context->Rsp | |
924 | + mov %rsi,168($context) # restore context->Rsi | |
925 | + mov %rdi,176($context) # restore context->Rdi | |
926 | + | |
927 | + jmp .Lcommon_seh_exit | |
928 | +.size common_se_handler,.-common_se_handler | |
929 | + | |
930 | +.type cbc_se_handler,\@abi-omnipotent | |
931 | +.align 16 | |
932 | +cbc_se_handler: | |
933 | + push %rsi | |
934 | + push %rdi | |
935 | + push %rbx | |
936 | + push %rbp | |
937 | + push %r12 | |
938 | + push %r13 | |
939 | + push %r14 | |
940 | + push %r15 | |
941 | + pushfq | |
942 | + lea -64(%rsp),%rsp | |
943 | + | |
944 | + mov 120($context),%rax # pull context->Rax | |
945 | + mov 248($context),%rbx # pull context->Rip | |
946 | + | |
947 | + lea .Lcbc_prologue(%rip),%r10 | |
948 | + cmp %r10,%rbx # context->Rip<.Lcbc_prologue | |
949 | + jb .Lin_cbc_prologue | |
950 | + | |
951 | + lea .Lcbc_body(%rip),%r10 | |
952 | + cmp %r10,%rbx # context->Rip<.Lcbc_body | |
953 | + jb .Lin_cbc_frame_setup | |
954 | + | |
955 | + mov 152($context),%rax # pull context->Rsp | |
956 | + | |
957 | + lea .Lcbc_abort(%rip),%r10 | |
958 | + cmp %r10,%rbx # context->Rip>=.Lcbc_abort | |
959 | + jae .Lin_cbc_prologue | |
960 | + | |
961 | + # handle pushf/popf in Camellia_cbc_encrypt | |
962 | + lea .Lcbc_enc_pushf(%rip),%r10 | |
963 | + cmp %r10,%rbx # context->Rip<=.Lcbc_enc_pushf | |
964 | + jbe .Lin_cbc_no_flag | |
965 | + lea 8(%rax),%rax | |
966 | + lea .Lcbc_enc_popf(%rip),%r10 | |
967 | + cmp %r10,%rbx # context->Rip<.Lcbc_enc_popf | |
968 | + jb .Lin_cbc_no_flag | |
969 | + lea -8(%rax),%rax | |
970 | + lea .Lcbc_dec_pushf(%rip),%r10 | |
971 | + cmp %r10,%rbx # context->Rip<=.Lcbc_dec_pushf | |
972 | + jbe .Lin_cbc_no_flag | |
973 | + lea 8(%rax),%rax | |
974 | + lea .Lcbc_dec_popf(%rip),%r10 | |
975 | + cmp %r10,%rbx # context->Rip<.Lcbc_dec_popf | |
976 | + jb .Lin_cbc_no_flag | |
977 | + lea -8(%rax),%rax | |
978 | + | |
979 | +.Lin_cbc_no_flag: | |
980 | + mov 48(%rax),%rax # $_rsp | |
981 | + lea 48(%rax),%rax | |
982 | + | |
983 | +.Lin_cbc_frame_setup: | |
984 | + mov -8(%rax),%rbx | |
985 | + mov -16(%rax),%rbp | |
986 | + mov -24(%rax),%r12 | |
987 | + mov -32(%rax),%r13 | |
988 | + mov -40(%rax),%r14 | |
989 | + mov -48(%rax),%r15 | |
990 | + mov %rbx,144($context) # restore context->Rbx | |
991 | + mov %rbp,160($context) # restore context->Rbp | |
992 | + mov %r12,216($context) # restore context->R12 | |
993 | + mov %r13,224($context) # restore context->R13 | |
994 | + mov %r14,232($context) # restore context->R14 | |
995 | + mov %r15,240($context) # restore context->R15 | |
996 | + | |
997 | +.Lin_cbc_prologue: | |
998 | + mov 8(%rax),%rdi | |
999 | + mov 16(%rax),%rsi | |
1000 | + mov %rax,152($context) # restore context->Rsp | |
1001 | + mov %rsi,168($context) # restore context->Rsi | |
1002 | + mov %rdi,176($context) # restore context->Rdi | |
1003 | + | |
1004 | +.align 4 | |
1005 | +.Lcommon_seh_exit: | |
1006 | + | |
1007 | + mov 40($disp),%rdi # disp->ContextRecord | |
1008 | + mov $context,%rsi # context | |
1009 | + mov \$`1232/8`,%ecx # sizeof(CONTEXT) | |
1010 | + .long 0xa548f3fc # cld; rep movsq | |
1011 | + | |
1012 | + mov $disp,%rsi | |
1013 | + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | |
1014 | + mov 8(%rsi),%rdx # arg2, disp->ImageBase | |
1015 | + mov 0(%rsi),%r8 # arg3, disp->ControlPc | |
1016 | + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | |
1017 | + mov 40(%rsi),%r10 # disp->ContextRecord | |
1018 | + lea 56(%rsi),%r11 # &disp->HandlerData | |
1019 | + lea 24(%rsi),%r12 # &disp->EstablisherFrame | |
1020 | + mov %r10,32(%rsp) # arg5 | |
1021 | + mov %r11,40(%rsp) # arg6 | |
1022 | + mov %r12,48(%rsp) # arg7 | |
1023 | + mov %rcx,56(%rsp) # arg8, (NULL) | |
1024 | + call *__imp_RtlVirtualUnwind(%rip) | |
1025 | + | |
1026 | + mov \$1,%eax # ExceptionContinueSearch | |
1027 | + lea 64(%rsp),%rsp | |
1028 | + popfq | |
1029 | + pop %r15 | |
1030 | + pop %r14 | |
1031 | + pop %r13 | |
1032 | + pop %r12 | |
1033 | + pop %rbp | |
1034 | + pop %rbx | |
1035 | + pop %rdi | |
1036 | + pop %rsi | |
1037 | + ret | |
1038 | +.size cbc_se_handler,.-cbc_se_handler | |
1039 | + | |
1040 | +.section .pdata | |
1041 | +.align 4 | |
1042 | + .rva .LSEH_begin_Camellia_EncryptBlock_Rounds | |
1043 | + .rva .LSEH_end_Camellia_EncryptBlock_Rounds | |
1044 | + .rva .LSEH_info_Camellia_EncryptBlock_Rounds | |
1045 | + | |
1046 | + .rva .LSEH_begin_Camellia_DecryptBlock_Rounds | |
1047 | + .rva .LSEH_end_Camellia_DecryptBlock_Rounds | |
1048 | + .rva .LSEH_info_Camellia_DecryptBlock_Rounds | |
1049 | + | |
1050 | + .rva .LSEH_begin_Camellia_Ekeygen | |
1051 | + .rva .LSEH_end_Camellia_Ekeygen | |
1052 | + .rva .LSEH_info_Camellia_Ekeygen | |
1053 | + | |
1054 | + .rva .LSEH_begin_Camellia_cbc_encrypt | |
1055 | + .rva .LSEH_end_Camellia_cbc_encrypt | |
1056 | + .rva .LSEH_info_Camellia_cbc_encrypt | |
1057 | + | |
1058 | +.section .xdata | |
1059 | +.align 8 | |
1060 | +.LSEH_info_Camellia_EncryptBlock_Rounds: | |
1061 | + .byte 9,0,0,0 | |
1062 | + .rva common_se_handler | |
1063 | + .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[] | |
1064 | +.LSEH_info_Camellia_DecryptBlock_Rounds: | |
1065 | + .byte 9,0,0,0 | |
1066 | + .rva common_se_handler | |
1067 | + .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[] | |
1068 | +.LSEH_info_Camellia_Ekeygen: | |
1069 | + .byte 9,0,0,0 | |
1070 | + .rva common_se_handler | |
1071 | + .rva .Lkey_prologue,.Lkey_epilogue # HandlerData[] | |
1072 | +.LSEH_info_Camellia_cbc_encrypt: | |
1073 | + .byte 9,0,0,0 | |
1074 | + .rva cbc_se_handler | |
1075 | +___ | |
1076 | +} | |
1077 | + | |
# Replace every `expr` span in the accumulated assembly with the result of
# evaluating expr as Perl (e.g. `4*8+0` becomes 32), then write the
# generated text to STDOUT and close it.
1078 | +$code =~ s/\`([^\`]*)\`/eval $1/gem; | |
1079 | +print $code; | |
1080 | +close STDOUT; |
@@ -0,0 +1,4 @@ | ||
1 | +lib | |
2 | +Makefile.save | |
3 | +*.flc | |
4 | +semantic.cache |
@@ -0,0 +1,94 @@ | ||
1 | +#!/usr/bin/env perl | |
2 | + | |
# First command-line argument selects the flavour; the next one is handed
# unchanged to ppc-xlate.pl further down.
$flavour = shift;

# Look for ppc-xlate.pl next to this script, then in its perlasm/
# subdirectory; $dir is the directory part of $0 including the separator.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# 'or' is required here: with '||' the die was attached to the (always true)
# command string instead of to open's return value, so a failed open went
# unnoticed.
open STDOUT,"| $^X $xlate $flavour ".shift(@ARGV) or die "can't call $xlate: $!";

# Pick the mnemonics that depend on the register width of the flavour.
if ($flavour=~/64/) {
    $CMPLI="cmpldi";
    $SHRLI="srdi";
    $SIGNX="extsw";
} else {
    $CMPLI="cmplwi";
    $SHRLI="srwi";
    $SIGNX="mr";
}
21 | + | |
# PowerPC routines generated from the heredoc that starts here (behaviour as
# visible in the instructions):
#   .OPENSSL_cpuid_setup - returns immediately.
#   .OPENSSL_wipe_cpu    - zeroes r0 and r4-r12 and returns the stack
#                          pointer (r1) in r3.
#   .OPENSSL_atomic_add  - lwarx/stwcx. retry loop that adds r4 to the word
#                          at address r3 and returns the new value passed
#                          through $SIGNX (extsw for 64-bit flavours, mr
#                          otherwise).
#   .OPENSSL_rdtsc       - reads the time base with mftb into r3 and mftbu
#                          into r4.
22 | +$code=<<___; | |
23 | +.machine "any" | |
24 | +.text | |
25 | + | |
26 | +.globl .OPENSSL_cpuid_setup | |
27 | +.align 4 | |
28 | +.OPENSSL_cpuid_setup: | |
29 | + blr | |
30 | + | |
31 | +.globl .OPENSSL_wipe_cpu | |
32 | +.align 4 | |
33 | +.OPENSSL_wipe_cpu: | |
34 | + xor r0,r0,r0 | |
35 | + mr r3,r1 | |
36 | + xor r4,r4,r4 | |
37 | + xor r5,r5,r5 | |
38 | + xor r6,r6,r6 | |
39 | + xor r7,r7,r7 | |
40 | + xor r8,r8,r8 | |
41 | + xor r9,r9,r9 | |
42 | + xor r10,r10,r10 | |
43 | + xor r11,r11,r11 | |
44 | + xor r12,r12,r12 | |
45 | + blr | |
46 | + | |
47 | +.globl .OPENSSL_atomic_add | |
48 | +.align 4 | |
49 | +.OPENSSL_atomic_add: | |
50 | +Loop: lwarx r5,0,r3 | |
51 | + add r0,r4,r5 | |
52 | + stwcx. r0,0,r3 | |
53 | + bne- Loop | |
54 | + $SIGNX r3,r0 | |
55 | + blr | |
56 | + | |
57 | +.globl .OPENSSL_rdtsc | |
58 | +.align 4 | |
59 | +.OPENSSL_rdtsc: | |
60 | + mftb r3 | |
61 | + mftbu r4 | |
62 | + blr | |
63 | + | |
# OPENSSL_cleanse: zero r4 bytes starting at address r3.
# Lengths below 7 are cleared byte by byte.  Longer buffers are first
# byte-aligned to a 4-byte boundary, then cleared a word at a time, and
# any 1-3 leftover bytes go back through the byte loop.
.globl	.OPENSSL_cleanse
.align	4
.OPENSSL_cleanse:
	$CMPLI	r4,7
	li	r0,0
	bge	Lot
# Zero length must return here: loading 0 into the count register
# would make the byte loop below store a byte and then keep running
# until the counter wraps around.
	$CMPLI	r4,0
	beqlr-
Little:	mtctr	r4
	stb	r0,0(r3)
	addi	r3,r3,1
	bdnz-	\$-8
	blr
# At least 7 bytes: store single bytes until r3 is 4-byte aligned.
# This consumes at most 3 bytes, so at least one full word remains.
Lot:	andi.	r5,r3,3
	beq	Laligned
	stb	r0,0(r3)
	subi	r4,r4,1
	addi	r3,r3,1
	b	Lot
# Word loop: r5 is the remaining length divided by 4.
Laligned:
	$SHRLI	r5,r4,2
	mtctr	r5
	stw	r0,0(r3)
	addi	r3,r3,4
	bdnz-	\$-8
	andi.	r4,r4,3
	bne	Little
	blr
90 | +___ | |
91 | + | |
# Expand any `expr` spans in $code by evaluating them as Perl, then send the
# result to STDOUT (the pipe to the translator opened earlier in this
# script) and close it.
92 | +$code =~ s/\`([^\`]*)\`/eval $1/gem; | |
93 | +print $code; | |
94 | +close STDOUT; |
@@ -0,0 +1,90 @@ | ||
1 | +.text | |
2 | + | |
# OPENSSL_cpuid_setup: placeholder that returns to the caller immediately.
3 | +.globl OPENSSL_cpuid_setup | |
4 | +.type OPENSSL_cpuid_setup,@function | |
5 | +.align 16 | |
6 | +OPENSSL_cpuid_setup: | |
7 | + br %r14 # reserved for future | |
8 | +.size OPENSSL_cpuid_setup,.-OPENSSL_cpuid_setup | |
9 | + | |
# OPENSSL_s390x_facilities: clears %r0, executes the hand-encoded STFLE
# instruction with 16(%r15) as its operand and returns the doubleword found
# at 16(%r15) in %r2.
10 | +.globl OPENSSL_s390x_facilities | |
11 | +.type OPENSSL_s390x_facilities,@function | |
12 | +.align 16 | |
13 | +OPENSSL_s390x_facilities: | |
14 | + lghi %r0,0 | |
15 | + .long 0xb2b0f010 # stfle 16(%r15) | |
16 | + lg %r2,16(%r15) | |
17 | + br %r14 | |
18 | +.size OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities | |
19 | + | |
# OPENSSL_rdtsc: stores the clock value at 16(%r15) with STCK and returns
# that doubleword in %r2.
20 | +.globl OPENSSL_rdtsc | |
21 | +.type OPENSSL_rdtsc,@function | |
22 | +.align 16 | |
23 | +OPENSSL_rdtsc: | |
24 | + stck 16(%r15) | |
25 | + lg %r2,16(%r15) | |
26 | + br %r14 | |
27 | +.size OPENSSL_rdtsc,.-OPENSSL_rdtsc | |
28 | + | |
# OPENSSL_atomic_add: adds %r3 to the 32-bit word at address %r2 using a
# compare-and-swap retry loop.  %r1 holds the value last seen in memory and
# %r0 the candidate sum; the loop repeats while CS reports a mismatch.
# Returns the new value, sign-extended to 64 bits, in %r2.
29 | +.globl OPENSSL_atomic_add | |
30 | +.type OPENSSL_atomic_add,@function | |
31 | +.align 16 | |
32 | +OPENSSL_atomic_add: | |
33 | + l %r1,0(%r2) | |
34 | +.Lspin: lr %r0,%r1 | |
35 | + ar %r0,%r3 | |
36 | + cs %r1,%r0,0(%r2) | |
37 | + brc 4,.Lspin | |
38 | + lgfr %r2,%r0 # OpenSSL expects the new value | |
39 | + br %r14 | |
40 | +.size OPENSSL_atomic_add,.-OPENSSL_atomic_add | |
41 | + | |
# OPENSSL_wipe_cpu: zeroes %r0, %r1, %r3, %r4 and floating-point registers
# %f0-%f7, and returns the current stack pointer (%r15) in %r2.
42 | +.globl OPENSSL_wipe_cpu | |
43 | +.type OPENSSL_wipe_cpu,@function | |
44 | +.align 16 | |
45 | +OPENSSL_wipe_cpu: | |
46 | + xgr %r0,%r0 | |
47 | + xgr %r1,%r1 | |
48 | + lgr %r2,%r15 | |
49 | + xgr %r3,%r3 | |
50 | + xgr %r4,%r4 | |
51 | + lzdr %f0 | |
52 | + lzdr %f1 | |
53 | + lzdr %f2 | |
54 | + lzdr %f3 | |
55 | + lzdr %f4 | |
56 | + lzdr %f5 | |
57 | + lzdr %f6 | |
58 | + lzdr %f7 | |
59 | + br %r14 | |
60 | +.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu | |
61 | + | |
# OPENSSL_cleanse: zero %r3 bytes starting at address %r2.
# Lengths up to 15 are cleared byte by byte.  Longer buffers are first
# byte-aligned to an 8-byte boundary, then cleared a doubleword at a time,
# and any 1-7 leftover bytes go back through the byte loop.
.globl	OPENSSL_cleanse
.type	OPENSSL_cleanse,@function
.align	16
OPENSSL_cleanse:
	lghi	%r4,15
	lghi	%r0,0
	clgr	%r3,%r4
	jh	.Lot
	clgr	%r3,%r0		# Zero length: return now, otherwise the
	bcr	8,%r14		# count below would step from 0 to -1
.Little:
	stc	%r0,0(%r2)
	la	%r2,1(%r2)
	brctg	%r3,.Little
	br	%r14
.align	4
.Lot:	tmll	%r2,7		# Align to 8 bytes; uses at most 7 of the
	jz	.Laligned	# more than 15 bytes available
	stc	%r0,0(%r2)
	la	%r2,1(%r2)
	brctg	%r3,.Lot
.Laligned:
	srlg	%r4,%r3,3	# Number of whole doublewords left
.Loop:	stg	%r0,0(%r2)
	la	%r2,8(%r2)
	brctg	%r4,.Loop
	lghi	%r4,7
	ngr	%r3,%r4		# Remaining 0-7 bytes
	jnz	.Little
	br	%r14
.size	OPENSSL_cleanse,.-OPENSSL_cleanse
@@ -0,0 +1,154 @@ | ||
1 | +#include <stdio.h> | |
2 | +#include <stdlib.h> | |
3 | +#include <string.h> | |
4 | +#include <sys/time.h> | |
5 | +#include <openssl/bn.h> | |
6 | + | |
7 | +#define SPARCV9_TICK_PRIVILEGED (1<<0) | |
8 | +#define SPARCV9_PREFER_FPU (1<<1) | |
9 | +#define SPARCV9_VIS1 (1<<2) | |
10 | +#define SPARCV9_VIS2 (1<<3) /* reserved */ | |
11 | +#define SPARCV9_FMADD (1<<4) /* reserved for SPARC64 V */ | |
12 | +static int OPENSSL_sparcv9cap_P=SPARCV9_TICK_PRIVILEGED; | |
13 | + | |
14 | +int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num) | |
15 | + { | |
16 | + int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); | |
17 | + int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); | |
18 | + | |
19 | + if ((OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) == | |
20 | + (SPARCV9_PREFER_FPU|SPARCV9_VIS1)) | |
21 | + return bn_mul_mont_fpu(rp,ap,bp,np,n0,num); | |
22 | + else | |
23 | + return bn_mul_mont_int(rp,ap,bp,np,n0,num); | |
24 | + } | |
25 | + | |
26 | +unsigned long OPENSSL_rdtsc(void) | |
27 | + { | |
28 | + unsigned long _sparcv9_rdtick(void); | |
29 | + | |
30 | + if (OPENSSL_sparcv9cap_P&SPARCV9_TICK_PRIVILEGED) | |
31 | +#if defined(__sun) && defined(__SVR4) | |
32 | + return gethrtime(); | |
33 | +#else | |
34 | + return 0; | |
35 | +#endif | |
36 | + else | |
37 | + return _sparcv9_rdtick(); | |
38 | + } | |
39 | + | |
40 | +#if defined(__sun) && defined(__SVR4) | |
41 | + | |
42 | +#include <dlfcn.h> | |
43 | +#include <libdevinfo.h> | |
44 | +#include <sys/systeminfo.h> | |
45 | + | |
46 | +typedef di_node_t (*di_init_t)(const char *,uint_t); | |
47 | +typedef void (*di_fini_t)(di_node_t); | |
48 | +typedef char * (*di_node_name_t)(di_node_t); | |
49 | +typedef int (*di_walk_node_t)(di_node_t,uint_t,di_node_name_t,int (*)(di_node_t,di_node_name_t)); | |
50 | + | |
51 | +#define DLLINK(h,name) (name=(name##_t)dlsym((h),#name)) | |
52 | + | |
53 | +static int walk_nodename(di_node_t node, di_node_name_t di_node_name) | |
54 | + { | |
55 | + char *name = (*di_node_name)(node); | |
56 | + | |
57 | + /* This is expected to catch all UltraSPARC flavors prior T1 */ | |
58 | + if (!strcmp (name,"SUNW,UltraSPARC") || | |
59 | + !strncmp(name,"SUNW,UltraSPARC-I",17)) /* covers II,III,IV */ | |
60 | + { | |
61 | + OPENSSL_sparcv9cap_P |= SPARCV9_PREFER_FPU|SPARCV9_VIS1; | |
62 | + | |
63 | + /* %tick is privileged only on UltraSPARC-I/II, but not IIe */ | |
64 | + if (name[14]!='\0' && name[17]!='\0' && name[18]!='\0') | |
65 | + OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED; | |
66 | + | |
67 | + return DI_WALK_TERMINATE; | |
68 | + } | |
69 | + /* This is expected to catch remaining UltraSPARCs, such as T1 */ | |
70 | + else if (!strncmp(name,"SUNW,UltraSPARC",15)) | |
71 | + { | |
72 | + OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED; | |
73 | + | |
74 | + return DI_WALK_TERMINATE; | |
75 | + } | |
76 | + | |
77 | + return DI_WALK_CONTINUE; | |
78 | + } | |
79 | + | |
80 | +void OPENSSL_cpuid_setup(void) | |
81 | + { | |
82 | + void *h; | |
83 | + char *e,si[256]; | |
84 | + static int trigger=0; | |
85 | + | |
86 | + if (trigger) return; | |
87 | + trigger=1; | |
88 | + | |
89 | + if ((e=getenv("OPENSSL_sparcv9cap"))) | |
90 | + { | |
91 | + OPENSSL_sparcv9cap_P=strtoul(e,NULL,0); | |
92 | + return; | |
93 | + } | |
94 | + | |
95 | + if (sysinfo(SI_MACHINE,si,sizeof(si))>0) | |
96 | + { | |
97 | + if (strcmp(si,"sun4v")) | |
98 | + /* FPU is preferred for all CPUs, but US-T1/2 */ | |
99 | + OPENSSL_sparcv9cap_P |= SPARCV9_PREFER_FPU; | |
100 | + } | |
101 | + | |
102 | + if (sysinfo(SI_ISALIST,si,sizeof(si))>0) | |
103 | + { | |
104 | + if (strstr(si,"+vis")) | |
105 | + OPENSSL_sparcv9cap_P |= SPARCV9_VIS1; | |
106 | + if (strstr(si,"+vis2")) | |
107 | + { | |
108 | + OPENSSL_sparcv9cap_P |= SPARCV9_VIS2; | |
109 | + OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED; | |
110 | + return; | |
111 | + } | |
112 | + } | |
113 | + | |
114 | + if ((h = dlopen("libdevinfo.so.1",RTLD_LAZY))) do | |
115 | + { | |
116 | + di_init_t di_init; | |
117 | + di_fini_t di_fini; | |
118 | + di_walk_node_t di_walk_node; | |
119 | + di_node_name_t di_node_name; | |
120 | + di_node_t root_node; | |
121 | + | |
122 | + if (!DLLINK(h,di_init)) break; | |
123 | + if (!DLLINK(h,di_fini)) break; | |
124 | + if (!DLLINK(h,di_walk_node)) break; | |
125 | + if (!DLLINK(h,di_node_name)) break; | |
126 | + | |
127 | + if ((root_node = (*di_init)("/",DINFOSUBTREE))!=DI_NODE_NIL) | |
128 | + { | |
129 | + (*di_walk_node)(root_node,DI_WALK_SIBFIRST, | |
130 | + di_node_name,walk_nodename); | |
131 | + (*di_fini)(root_node); | |
132 | + } | |
133 | + } while(0); | |
134 | + | |
135 | + if (h) dlclose(h); | |
136 | + } | |
137 | + | |
138 | +#else | |
139 | + | |
140 | +void OPENSSL_cpuid_setup(void) | |
141 | + { | |
142 | + char *e; | |
143 | + | |
144 | + if ((e=getenv("OPENSSL_sparcv9cap"))) | |
145 | + { | |
146 | + OPENSSL_sparcv9cap_P=strtoul(e,NULL,0); | |
147 | + return; | |
148 | + } | |
149 | + | |
150 | + /* For now we assume that the rest supports UltraSPARC-I* only */ | |
151 | + OPENSSL_sparcv9cap_P |= SPARCV9_PREFER_FPU|SPARCV9_VIS1; | |
152 | + } | |
153 | + | |
154 | +#endif |