• R/O
  • HTTP
  • SSH
  • HTTPS

Commit

Tags
No Tags

Frequently used words (click to add to your profile)

java, c++, android, linux, c#, windows, objective-c, cocoa, 誰得, qt, python, php, ruby, game, gui, bathyscaphe, c, 計画中(planning stage), 翻訳, omegat, framework, twitter, dom, test, vb.net, directx, ゲームエンジン, btron, arduino, previewer

TLS/SSL and crypto library


Commit MetaInfo

Revisión: 67956bda58a1692d67a9ec0c75390a29e5ce27cd (tree)
Tiempo: 2009-03-25 21:08:15
Autor: cvs2svn <cvs2svn>
Committer: cvs2svn

Log Message

This commit was manufactured by cvs2svn to create branch
'BRANCH_OpenSSL_0_9_8k'.

Cambiar Resumen

Diferencia incremental

--- /dev/null
+++ b/apps/genpkey.c
@@ -0,0 +1,440 @@
1+/* apps/genpkey.c */
2+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
3+ * project 2006
4+ */
5+/* ====================================================================
6+ * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
7+ *
8+ * Redistribution and use in source and binary forms, with or without
9+ * modification, are permitted provided that the following conditions
10+ * are met:
11+ *
12+ * 1. Redistributions of source code must retain the above copyright
13+ * notice, this list of conditions and the following disclaimer.
14+ *
15+ * 2. Redistributions in binary form must reproduce the above copyright
16+ * notice, this list of conditions and the following disclaimer in
17+ * the documentation and/or other materials provided with the
18+ * distribution.
19+ *
20+ * 3. All advertising materials mentioning features or use of this
21+ * software must display the following acknowledgment:
22+ * "This product includes software developed by the OpenSSL Project
23+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
24+ *
25+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
26+ * endorse or promote products derived from this software without
27+ * prior written permission. For written permission, please contact
28+ * licensing@OpenSSL.org.
29+ *
30+ * 5. Products derived from this software may not be called "OpenSSL"
31+ * nor may "OpenSSL" appear in their names without prior written
32+ * permission of the OpenSSL Project.
33+ *
34+ * 6. Redistributions of any form whatsoever must retain the following
35+ * acknowledgment:
36+ * "This product includes software developed by the OpenSSL Project
37+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
38+ *
39+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
40+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
41+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
42+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
43+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
45+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
46+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
48+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
49+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
50+ * OF THE POSSIBILITY OF SUCH DAMAGE.
51+ * ====================================================================
52+ *
53+ * This product includes cryptographic software written by Eric Young
54+ * (eay@cryptsoft.com). This product includes software written by Tim
55+ * Hudson (tjh@cryptsoft.com).
56+ *
57+ */
58+#include <stdio.h>
59+#include <string.h>
60+#include "apps.h"
61+#include <openssl/pem.h>
62+#include <openssl/err.h>
63+#include <openssl/evp.h>
64+#ifndef OPENSSL_NO_ENGINE
65+#include <openssl/engine.h>
66+#endif
67+
68+static int init_keygen_file(BIO *err, EVP_PKEY_CTX **pctx,
69+ const char *file, ENGINE *e);
70+static int genpkey_cb(EVP_PKEY_CTX *ctx);
71+
72+#define PROG genpkey_main
73+
74+int MAIN(int, char **);
75+
76+int MAIN(int argc, char **argv)
77+ {
78+ ENGINE *e = NULL;
79+ char **args, *outfile = NULL;
80+ char *passarg = NULL;
81+ BIO *in = NULL, *out = NULL;
82+ const EVP_CIPHER *cipher = NULL;
83+ int outformat;
84+ int text = 0;
85+ EVP_PKEY *pkey=NULL;
86+ EVP_PKEY_CTX *ctx = NULL;
87+ char *pass = NULL;
88+ int badarg = 0;
89+ int ret = 1, rv;
90+
91+ int do_param = 0;
92+
93+ if (bio_err == NULL)
94+ bio_err = BIO_new_fp (stderr, BIO_NOCLOSE);
95+
96+ if (!load_config(bio_err, NULL))
97+ goto end;
98+
99+ outformat=FORMAT_PEM;
100+
101+ ERR_load_crypto_strings();
102+ OpenSSL_add_all_algorithms();
103+ args = argv + 1;
104+ while (!badarg && *args && *args[0] == '-')
105+ {
106+ if (!strcmp(*args,"-outform"))
107+ {
108+ if (args[1])
109+ {
110+ args++;
111+ outformat=str2fmt(*args);
112+ }
113+ else badarg = 1;
114+ }
115+ else if (!strcmp(*args,"-pass"))
116+ {
117+ if (!args[1]) goto bad;
118+ passarg= *(++args);
119+ }
120+#ifndef OPENSSL_NO_ENGINE
121+ else if (strcmp(*args,"-engine") == 0)
122+ {
123+ if (!args[1])
124+ goto bad;
125+ e = setup_engine(bio_err, *(++args), 0);
126+ }
127+#endif
128+ else if (!strcmp (*args, "-paramfile"))
129+ {
130+ if (!args[1])
131+ goto bad;
132+ args++;
133+ if (do_param == 1)
134+ goto bad;
135+ if (!init_keygen_file(bio_err, &ctx, *args, e))
136+ goto end;
137+ }
138+ else if (!strcmp (*args, "-out"))
139+ {
140+ if (args[1])
141+ {
142+ args++;
143+ outfile = *args;
144+ }
145+ else badarg = 1;
146+ }
147+ else if (strcmp(*args,"-algorithm") == 0)
148+ {
149+ if (!args[1])
150+ goto bad;
151+ if (!init_gen_str(bio_err, &ctx, *(++args),e, do_param))
152+ goto end;
153+ }
154+ else if (strcmp(*args,"-pkeyopt") == 0)
155+ {
156+ if (!args[1])
157+ goto bad;
158+ if (!ctx)
159+ {
160+ BIO_puts(bio_err, "No keytype specified\n");
161+ goto bad;
162+ }
163+ else if (pkey_ctrl_string(ctx, *(++args)) <= 0)
164+ {
165+ BIO_puts(bio_err, "parameter setting error\n");
166+ ERR_print_errors(bio_err);
167+ goto end;
168+ }
169+ }
170+ else if (strcmp(*args,"-genparam") == 0)
171+ {
172+ if (ctx)
173+ goto bad;
174+ do_param = 1;
175+ }
176+ else if (strcmp(*args,"-text") == 0)
177+ text=1;
178+ else
179+ {
180+ cipher = EVP_get_cipherbyname(*args + 1);
181+ if (!cipher)
182+ {
183+ BIO_printf(bio_err, "Unknown cipher %s\n",
184+ *args + 1);
185+ badarg = 1;
186+ }
187+ if (do_param == 1)
188+ badarg = 1;
189+ }
190+ args++;
191+ }
192+
193+ if (!ctx)
194+ badarg = 1;
195+
196+ if (badarg)
197+ {
198+ bad:
199+ BIO_printf(bio_err, "Usage: genpkey [options]\n");
200+ BIO_printf(bio_err, "where options may be\n");
201+ BIO_printf(bio_err, "-out file output file\n");
202+ BIO_printf(bio_err, "-outform X output format (DER or PEM)\n");
203+ BIO_printf(bio_err, "-pass arg output file pass phrase source\n");
204+ BIO_printf(bio_err, "-<cipher> use cipher <cipher> to encrypt the key\n");
205+#ifndef OPENSSL_NO_ENGINE
206+ BIO_printf(bio_err, "-engine e use engine e, possibly a hardware device.\n");
207+#endif
208+ BIO_printf(bio_err, "-paramfile file parameters file\n");
209+ BIO_printf(bio_err, "-algorithm alg the public key algorithm\n");
210+ BIO_printf(bio_err, "-pkeyopt opt:value set the public key algorithm option <opt>\n"
211+ " to value <value>\n");
212+ BIO_printf(bio_err, "-genparam generate parameters, not key\n");
213+ BIO_printf(bio_err, "-text print the in text\n");
214+ BIO_printf(bio_err, "NB: options order may be important! See the manual page.\n");
215+ goto end;
216+ }
217+
218+ if (!app_passwd(bio_err, passarg, NULL, &pass, NULL))
219+ {
220+ BIO_puts(bio_err, "Error getting password\n");
221+ goto end;
222+ }
223+
224+ if (outfile)
225+ {
226+ if (!(out = BIO_new_file (outfile, "wb")))
227+ {
228+ BIO_printf(bio_err,
229+ "Can't open output file %s\n", outfile);
230+ goto end;
231+ }
232+ }
233+ else
234+ {
235+ out = BIO_new_fp (stdout, BIO_NOCLOSE);
236+#ifdef OPENSSL_SYS_VMS
237+ {
238+ BIO *tmpbio = BIO_new(BIO_f_linebuffer());
239+ out = BIO_push(tmpbio, out);
240+ }
241+#endif
242+ }
243+
244+ EVP_PKEY_CTX_set_cb(ctx, genpkey_cb);
245+ EVP_PKEY_CTX_set_app_data(ctx, bio_err);
246+
247+ if (do_param)
248+ {
249+ if (EVP_PKEY_paramgen(ctx, &pkey) <= 0)
250+ {
251+ BIO_puts(bio_err, "Error generating parameters\n");
252+ ERR_print_errors(bio_err);
253+ goto end;
254+ }
255+ }
256+ else
257+ {
258+ if (EVP_PKEY_keygen(ctx, &pkey) <= 0)
259+ {
260+ BIO_puts(bio_err, "Error generating key\n");
261+ ERR_print_errors(bio_err);
262+ goto end;
263+ }
264+ }
265+
266+ if (do_param)
267+ rv = PEM_write_bio_Parameters(out, pkey);
268+ else if (outformat == FORMAT_PEM)
269+ rv = PEM_write_bio_PrivateKey(out, pkey, cipher, NULL, 0,
270+ NULL, pass);
271+ else if (outformat == FORMAT_ASN1)
272+ rv = i2d_PrivateKey_bio(out, pkey);
273+ else
274+ {
275+ BIO_printf(bio_err, "Bad format specified for key\n");
276+ goto end;
277+ }
278+
279+ if (rv <= 0)
280+ {
281+ BIO_puts(bio_err, "Error writing key\n");
282+ ERR_print_errors(bio_err);
283+ }
284+
285+ if (text)
286+ {
287+ if (do_param)
288+ rv = EVP_PKEY_print_params(out, pkey, 0, NULL);
289+ else
290+ rv = EVP_PKEY_print_private(out, pkey, 0, NULL);
291+
292+ if (rv <= 0)
293+ {
294+ BIO_puts(bio_err, "Error printing key\n");
295+ ERR_print_errors(bio_err);
296+ }
297+ }
298+
299+ ret = 0;
300+
301+ end:
302+ if (pkey)
303+ EVP_PKEY_free(pkey);
304+ if (ctx)
305+ EVP_PKEY_CTX_free(ctx);
306+ if (out)
307+ BIO_free_all(out);
308+ BIO_free(in);
309+ if (pass)
310+ OPENSSL_free(pass);
311+
312+ return ret;
313+ }
314+
315+static int init_keygen_file(BIO *err, EVP_PKEY_CTX **pctx,
316+ const char *file, ENGINE *e)
317+ {
318+ BIO *pbio;
319+ EVP_PKEY *pkey = NULL;
320+ EVP_PKEY_CTX *ctx = NULL;
321+ if (*pctx)
322+ {
323+ BIO_puts(err, "Parameters already set!\n");
324+ return 0;
325+ }
326+
327+ pbio = BIO_new_file(file, "r");
328+ if (!pbio)
329+ {
330+ BIO_printf(err, "Can't open parameter file %s\n", file);
331+ return 0;
332+ }
333+
334+ pkey = PEM_read_bio_Parameters(pbio, NULL);
335+ BIO_free(pbio);
336+
337+ if (!pkey)
338+ {
339+ BIO_printf(bio_err, "Error reading parameter file %s\n", file);
340+ return 0;
341+ }
342+
343+ ctx = EVP_PKEY_CTX_new(pkey, e);
344+ if (!ctx)
345+ goto err;
346+ if (EVP_PKEY_keygen_init(ctx) <= 0)
347+ goto err;
348+ EVP_PKEY_free(pkey);
349+ *pctx = ctx;
350+ return 1;
351+
352+ err:
353+ BIO_puts(err, "Error initializing context\n");
354+ ERR_print_errors(err);
355+ if (ctx)
356+ EVP_PKEY_CTX_free(ctx);
357+ if (pkey)
358+ EVP_PKEY_free(pkey);
359+ return 0;
360+
361+ }
362+
363+int init_gen_str(BIO *err, EVP_PKEY_CTX **pctx,
364+ const char *algname, ENGINE *e, int do_param)
365+ {
366+ EVP_PKEY_CTX *ctx = NULL;
367+ const EVP_PKEY_ASN1_METHOD *ameth;
368+ ENGINE *tmpeng = NULL;
369+ int pkey_id;
370+
371+ if (*pctx)
372+ {
373+ BIO_puts(err, "Algorithm already set!\n");
374+ return 0;
375+ }
376+
377+ ameth = EVP_PKEY_asn1_find_str(&tmpeng, algname, -1);
378+
379+#ifndef OPENSSL_NO_ENGINE
380+ if (!ameth && e)
381+ ameth = ENGINE_get_pkey_asn1_meth_str(e, algname, -1);
382+#endif
383+
384+ if (!ameth)
385+ {
386+ BIO_printf(bio_err, "Algorithm %s not found\n", algname);
387+ return 0;
388+ }
389+
390+ ERR_clear_error();
391+
392+ EVP_PKEY_asn1_get0_info(&pkey_id, NULL, NULL, NULL, NULL, ameth);
393+#ifndef OPENSSL_NO_ENGINE
394+ if (tmpeng)
395+ ENGINE_finish(tmpeng);
396+#endif
397+ ctx = EVP_PKEY_CTX_new_id(pkey_id, e);
398+
399+ if (!ctx)
400+ goto err;
401+ if (do_param)
402+ {
403+ if (EVP_PKEY_paramgen_init(ctx) <= 0)
404+ goto err;
405+ }
406+ else
407+ {
408+ if (EVP_PKEY_keygen_init(ctx) <= 0)
409+ goto err;
410+ }
411+
412+ *pctx = ctx;
413+ return 1;
414+
415+ err:
416+ BIO_printf(err, "Error initializing %s context\n", algname);
417+ ERR_print_errors(err);
418+ if (ctx)
419+ EVP_PKEY_CTX_free(ctx);
420+ return 0;
421+
422+ }
423+
424+static int genpkey_cb(EVP_PKEY_CTX *ctx)
425+ {
426+ char c='*';
427+ BIO *b = EVP_PKEY_CTX_get_app_data(ctx);
428+ int p;
429+ p = EVP_PKEY_CTX_get_keygen_info(ctx, 0);
430+ if (p == 0) c='.';
431+ if (p == 1) c='+';
432+ if (p == 2) c='*';
433+ if (p == 3) c='\n';
434+ BIO_write(b,&c,1);
435+ (void)BIO_flush(b);
436+#ifdef LINT
437+ p=n;
438+#endif
439+ return 1;
440+ }
--- /dev/null
+++ b/apps/pkey.c
@@ -0,0 +1,284 @@
1+/* apps/pkey.c */
2+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
3+ * project 2006
4+ */
5+/* ====================================================================
6+ * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
7+ *
8+ * Redistribution and use in source and binary forms, with or without
9+ * modification, are permitted provided that the following conditions
10+ * are met:
11+ *
12+ * 1. Redistributions of source code must retain the above copyright
13+ * notice, this list of conditions and the following disclaimer.
14+ *
15+ * 2. Redistributions in binary form must reproduce the above copyright
16+ * notice, this list of conditions and the following disclaimer in
17+ * the documentation and/or other materials provided with the
18+ * distribution.
19+ *
20+ * 3. All advertising materials mentioning features or use of this
21+ * software must display the following acknowledgment:
22+ * "This product includes software developed by the OpenSSL Project
23+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
24+ *
25+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
26+ * endorse or promote products derived from this software without
27+ * prior written permission. For written permission, please contact
28+ * licensing@OpenSSL.org.
29+ *
30+ * 5. Products derived from this software may not be called "OpenSSL"
31+ * nor may "OpenSSL" appear in their names without prior written
32+ * permission of the OpenSSL Project.
33+ *
34+ * 6. Redistributions of any form whatsoever must retain the following
35+ * acknowledgment:
36+ * "This product includes software developed by the OpenSSL Project
37+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
38+ *
39+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
40+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
41+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
42+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
43+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
45+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
46+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
48+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
49+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
50+ * OF THE POSSIBILITY OF SUCH DAMAGE.
51+ * ====================================================================
52+ *
53+ * This product includes cryptographic software written by Eric Young
54+ * (eay@cryptsoft.com). This product includes software written by Tim
55+ * Hudson (tjh@cryptsoft.com).
56+ *
57+ */
58+#include <stdio.h>
59+#include <string.h>
60+#include "apps.h"
61+#include <openssl/pem.h>
62+#include <openssl/err.h>
63+#include <openssl/evp.h>
64+
65+#define PROG pkey_main
66+
67+int MAIN(int, char **);
68+
69+int MAIN(int argc, char **argv)
70+ {
71+ ENGINE *e = NULL;
72+ char **args, *infile = NULL, *outfile = NULL;
73+ char *passargin = NULL, *passargout = NULL;
74+ BIO *in = NULL, *out = NULL;
75+ const EVP_CIPHER *cipher = NULL;
76+ int informat, outformat;
77+ int pubin = 0, pubout = 0, pubtext = 0, text = 0, noout = 0;
78+ EVP_PKEY *pkey=NULL;
79+ char *passin = NULL, *passout = NULL;
80+ int badarg = 0;
81+#ifndef OPENSSL_NO_ENGINE
82+ char *engine=NULL;
83+#endif
84+ int ret = 1;
85+
86+ if (bio_err == NULL)
87+ bio_err = BIO_new_fp (stderr, BIO_NOCLOSE);
88+
89+ if (!load_config(bio_err, NULL))
90+ goto end;
91+
92+ informat=FORMAT_PEM;
93+ outformat=FORMAT_PEM;
94+
95+ ERR_load_crypto_strings();
96+ OpenSSL_add_all_algorithms();
97+ args = argv + 1;
98+ while (!badarg && *args && *args[0] == '-')
99+ {
100+ if (!strcmp(*args,"-inform"))
101+ {
102+ if (args[1])
103+ {
104+ args++;
105+ informat=str2fmt(*args);
106+ }
107+ else badarg = 1;
108+ }
109+ else if (!strcmp(*args,"-outform"))
110+ {
111+ if (args[1])
112+ {
113+ args++;
114+ outformat=str2fmt(*args);
115+ }
116+ else badarg = 1;
117+ }
118+ else if (!strcmp(*args,"-passin"))
119+ {
120+ if (!args[1]) goto bad;
121+ passargin= *(++args);
122+ }
123+ else if (!strcmp(*args,"-passout"))
124+ {
125+ if (!args[1]) goto bad;
126+ passargout= *(++args);
127+ }
128+#ifndef OPENSSL_NO_ENGINE
129+ else if (strcmp(*args,"-engine") == 0)
130+ {
131+ if (!args[1]) goto bad;
132+ engine= *(++args);
133+ }
134+#endif
135+ else if (!strcmp (*args, "-in"))
136+ {
137+ if (args[1])
138+ {
139+ args++;
140+ infile = *args;
141+ }
142+ else badarg = 1;
143+ }
144+ else if (!strcmp (*args, "-out"))
145+ {
146+ if (args[1])
147+ {
148+ args++;
149+ outfile = *args;
150+ }
151+ else badarg = 1;
152+ }
153+ else if (strcmp(*args,"-pubin") == 0)
154+ {
155+ pubin=1;
156+ pubout=1;
157+ pubtext=1;
158+ }
159+ else if (strcmp(*args,"-pubout") == 0)
160+ pubout=1;
161+ else if (strcmp(*args,"-text_pub") == 0)
162+ {
163+ pubtext=1;
164+ text=1;
165+ }
166+ else if (strcmp(*args,"-text") == 0)
167+ text=1;
168+ else if (strcmp(*args,"-noout") == 0)
169+ noout=1;
170+ else
171+ {
172+ cipher = EVP_get_cipherbyname(*args + 1);
173+ if (!cipher)
174+ {
175+ BIO_printf(bio_err, "Unknown cipher %s\n",
176+ *args + 1);
177+ badarg = 1;
178+ }
179+ }
180+ args++;
181+ }
182+
183+ if (badarg)
184+ {
185+ bad:
186+ BIO_printf(bio_err, "Usage pkey [options]\n");
187+ BIO_printf(bio_err, "where options are\n");
188+ BIO_printf(bio_err, "-in file input file\n");
189+ BIO_printf(bio_err, "-inform X input format (DER or PEM)\n");
190+ BIO_printf(bio_err, "-passin arg input file pass phrase source\n");
191+ BIO_printf(bio_err, "-outform X output format (DER or PEM)\n");
192+ BIO_printf(bio_err, "-out file output file\n");
193+ BIO_printf(bio_err, "-passout arg output file pass phrase source\n");
194+#ifndef OPENSSL_NO_ENGINE
195+ BIO_printf(bio_err, "-engine e use engine e, possibly a hardware device.\n");
196+#endif
197+ return 1;
198+ }
199+
200+#ifndef OPENSSL_NO_ENGINE
201+ e = setup_engine(bio_err, engine, 0);
202+#endif
203+
204+ if (!app_passwd(bio_err, passargin, passargout, &passin, &passout))
205+ {
206+ BIO_printf(bio_err, "Error getting passwords\n");
207+ goto end;
208+ }
209+
210+ if (outfile)
211+ {
212+ if (!(out = BIO_new_file (outfile, "wb")))
213+ {
214+ BIO_printf(bio_err,
215+ "Can't open output file %s\n", outfile);
216+ goto end;
217+ }
218+ }
219+ else
220+ {
221+ out = BIO_new_fp (stdout, BIO_NOCLOSE);
222+#ifdef OPENSSL_SYS_VMS
223+ {
224+ BIO *tmpbio = BIO_new(BIO_f_linebuffer());
225+ out = BIO_push(tmpbio, out);
226+ }
227+#endif
228+ }
229+
230+ if (pubin)
231+ pkey = load_pubkey(bio_err, infile, informat, 1,
232+ passin, e, "Public Key");
233+ else
234+ pkey = load_key(bio_err, infile, informat, 1,
235+ passin, e, "key");
236+ if (!pkey)
237+ goto end;
238+
239+ if (!noout)
240+ {
241+ if (outformat == FORMAT_PEM)
242+ {
243+ if (pubout)
244+ PEM_write_bio_PUBKEY(out,pkey);
245+ else
246+ PEM_write_bio_PrivateKey(out, pkey, cipher,
247+ NULL, 0, NULL, passout);
248+ }
249+ else if (outformat == FORMAT_ASN1)
250+ {
251+ if (pubout)
252+ i2d_PUBKEY_bio(out, pkey);
253+ else
254+ i2d_PrivateKey_bio(out, pkey);
255+ }
256+ else
257+ {
258+ BIO_printf(bio_err, "Bad format specified for key\n");
259+ goto end;
260+ }
261+
262+ }
263+
264+ if (text)
265+ {
266+ if (pubtext)
267+ EVP_PKEY_print_public(out, pkey, 0, NULL);
268+ else
269+ EVP_PKEY_print_private(out, pkey, 0, NULL);
270+ }
271+
272+ ret = 0;
273+
274+ end:
275+ EVP_PKEY_free(pkey);
276+ BIO_free_all(out);
277+ BIO_free(in);
278+ if (passin)
279+ OPENSSL_free(passin);
280+ if (passout)
281+ OPENSSL_free(passout);
282+
283+ return ret;
284+ }
--- /dev/null
+++ b/apps/pkeyparam.c
@@ -0,0 +1,201 @@
1+/* apps/pkeyparam.c */
2+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
3+ * project 2006
4+ */
5+/* ====================================================================
6+ * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
7+ *
8+ * Redistribution and use in source and binary forms, with or without
9+ * modification, are permitted provided that the following conditions
10+ * are met:
11+ *
12+ * 1. Redistributions of source code must retain the above copyright
13+ * notice, this list of conditions and the following disclaimer.
14+ *
15+ * 2. Redistributions in binary form must reproduce the above copyright
16+ * notice, this list of conditions and the following disclaimer in
17+ * the documentation and/or other materials provided with the
18+ * distribution.
19+ *
20+ * 3. All advertising materials mentioning features or use of this
21+ * software must display the following acknowledgment:
22+ * "This product includes software developed by the OpenSSL Project
23+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
24+ *
25+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
26+ * endorse or promote products derived from this software without
27+ * prior written permission. For written permission, please contact
28+ * licensing@OpenSSL.org.
29+ *
30+ * 5. Products derived from this software may not be called "OpenSSL"
31+ * nor may "OpenSSL" appear in their names without prior written
32+ * permission of the OpenSSL Project.
33+ *
34+ * 6. Redistributions of any form whatsoever must retain the following
35+ * acknowledgment:
36+ * "This product includes software developed by the OpenSSL Project
37+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
38+ *
39+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
40+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
41+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
42+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
43+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
45+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
46+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
48+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
49+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
50+ * OF THE POSSIBILITY OF SUCH DAMAGE.
51+ * ====================================================================
52+ *
53+ * This product includes cryptographic software written by Eric Young
54+ * (eay@cryptsoft.com). This product includes software written by Tim
55+ * Hudson (tjh@cryptsoft.com).
56+ *
57+ */
58+#include <stdio.h>
59+#include <string.h>
60+#include "apps.h"
61+#include <openssl/pem.h>
62+#include <openssl/err.h>
63+#include <openssl/evp.h>
64+
65+#define PROG pkeyparam_main
66+
67+int MAIN(int, char **);
68+
69+int MAIN(int argc, char **argv)
70+ {
71+ char **args, *infile = NULL, *outfile = NULL;
72+ BIO *in = NULL, *out = NULL;
73+ int text = 0, noout = 0;
74+ EVP_PKEY *pkey=NULL;
75+ int badarg = 0;
76+#ifndef OPENSSL_NO_ENGINE
77+ ENGINE *e = NULL;
78+ char *engine=NULL;
79+#endif
80+ int ret = 1;
81+
82+ if (bio_err == NULL)
83+ bio_err = BIO_new_fp (stderr, BIO_NOCLOSE);
84+
85+ if (!load_config(bio_err, NULL))
86+ goto end;
87+
88+ ERR_load_crypto_strings();
89+ OpenSSL_add_all_algorithms();
90+ args = argv + 1;
91+ while (!badarg && *args && *args[0] == '-')
92+ {
93+ if (!strcmp (*args, "-in"))
94+ {
95+ if (args[1])
96+ {
97+ args++;
98+ infile = *args;
99+ }
100+ else badarg = 1;
101+ }
102+ else if (!strcmp (*args, "-out"))
103+ {
104+ if (args[1])
105+ {
106+ args++;
107+ outfile = *args;
108+ }
109+ else badarg = 1;
110+ }
111+#ifndef OPENSSL_NO_ENGINE
112+ else if (strcmp(*args,"-engine") == 0)
113+ {
114+ if (!args[1]) goto bad;
115+ engine= *(++args);
116+ }
117+#endif
118+
119+ else if (strcmp(*args,"-text") == 0)
120+ text=1;
121+ else if (strcmp(*args,"-noout") == 0)
122+ noout=1;
123+ args++;
124+ }
125+
126+ if (badarg)
127+ {
128+#ifndef OPENSSL_NO_ENGINE
129+ bad:
130+#endif
131+ BIO_printf(bio_err, "Usage pkeyparam [options]\n");
132+ BIO_printf(bio_err, "where options are\n");
133+ BIO_printf(bio_err, "-in file input file\n");
134+ BIO_printf(bio_err, "-out file output file\n");
135+ BIO_printf(bio_err, "-text print parameters as text\n");
136+ BIO_printf(bio_err, "-noout don't output encoded parameters\n");
137+#ifndef OPENSSL_NO_ENGINE
138+ BIO_printf(bio_err, "-engine e use engine e, possibly a hardware device.\n");
139+#endif
140+ return 1;
141+ }
142+
143+#ifndef OPENSSL_NO_ENGINE
144+ e = setup_engine(bio_err, engine, 0);
145+#endif
146+
147+ if (infile)
148+ {
149+ if (!(in = BIO_new_file (infile, "r")))
150+ {
151+ BIO_printf(bio_err,
152+ "Can't open input file %s\n", infile);
153+ goto end;
154+ }
155+ }
156+ else
157+ in = BIO_new_fp (stdin, BIO_NOCLOSE);
158+
159+ if (outfile)
160+ {
161+ if (!(out = BIO_new_file (outfile, "w")))
162+ {
163+ BIO_printf(bio_err,
164+ "Can't open output file %s\n", outfile);
165+ goto end;
166+ }
167+ }
168+ else
169+ {
170+ out = BIO_new_fp (stdout, BIO_NOCLOSE);
171+#ifdef OPENSSL_SYS_VMS
172+ {
173+ BIO *tmpbio = BIO_new(BIO_f_linebuffer());
174+ out = BIO_push(tmpbio, out);
175+ }
176+#endif
177+ }
178+
179+ pkey = PEM_read_bio_Parameters(in, NULL);
180+ if (!pkey)
181+ {
182+ BIO_printf(bio_err, "Error reading paramters\n");
183+ ERR_print_errors(bio_err);
184+ goto end;
185+ }
186+
187+ if (!noout)
188+ PEM_write_bio_Parameters(out,pkey);
189+
190+ if (text)
191+ EVP_PKEY_print_params(out, pkey, 0, NULL);
192+
193+ ret = 0;
194+
195+ end:
196+ EVP_PKEY_free(pkey);
197+ BIO_free_all(out);
198+ BIO_free(in);
199+
200+ return ret;
201+ }
--- /dev/null
+++ b/apps/pkeyutl.c
@@ -0,0 +1,570 @@
1+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
2+ * project 2006.
3+ */
4+/* ====================================================================
5+ * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
6+ *
7+ * Redistribution and use in source and binary forms, with or without
8+ * modification, are permitted provided that the following conditions
9+ * are met:
10+ *
11+ * 1. Redistributions of source code must retain the above copyright
12+ * notice, this list of conditions and the following disclaimer.
13+ *
14+ * 2. Redistributions in binary form must reproduce the above copyright
15+ * notice, this list of conditions and the following disclaimer in
16+ * the documentation and/or other materials provided with the
17+ * distribution.
18+ *
19+ * 3. All advertising materials mentioning features or use of this
20+ * software must display the following acknowledgment:
21+ * "This product includes software developed by the OpenSSL Project
22+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
23+ *
24+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
25+ * endorse or promote products derived from this software without
26+ * prior written permission. For written permission, please contact
27+ * licensing@OpenSSL.org.
28+ *
29+ * 5. Products derived from this software may not be called "OpenSSL"
30+ * nor may "OpenSSL" appear in their names without prior written
31+ * permission of the OpenSSL Project.
32+ *
33+ * 6. Redistributions of any form whatsoever must retain the following
34+ * acknowledgment:
35+ * "This product includes software developed by the OpenSSL Project
36+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
37+ *
38+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
39+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
41+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
42+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
44+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
45+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
46+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
47+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
48+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
49+ * OF THE POSSIBILITY OF SUCH DAMAGE.
50+ * ====================================================================
51+ *
52+ * This product includes cryptographic software written by Eric Young
53+ * (eay@cryptsoft.com). This product includes software written by Tim
54+ * Hudson (tjh@cryptsoft.com).
55+ *
56+ */
57+
58+
59+#include "apps.h"
60+#include <string.h>
61+#include <openssl/err.h>
62+#include <openssl/pem.h>
63+#include <openssl/evp.h>
64+
65+#define KEY_PRIVKEY 1
66+#define KEY_PUBKEY 2
67+#define KEY_CERT 3
68+
69+static void usage(void);
70+
71+#undef PROG
72+
73+#define PROG pkeyutl_main
74+
75+static EVP_PKEY_CTX *init_ctx(int *pkeysize,
76+ char *keyfile, int keyform, int key_type,
77+ char *passargin, int pkey_op, ENGINE *e);
78+
79+static int setup_peer(BIO *err, EVP_PKEY_CTX *ctx, int peerform,
80+ const char *file);
81+
82+static int do_keyop(EVP_PKEY_CTX *ctx, int pkey_op,
83+ unsigned char *out, size_t *poutlen,
84+ unsigned char *in, size_t inlen);
85+
86+int MAIN(int argc, char **);
87+
88+int MAIN(int argc, char **argv)
89+{
90+ BIO *in = NULL, *out = NULL;
91+ char *infile = NULL, *outfile = NULL, *sigfile = NULL;
92+ ENGINE *e = NULL;
93+ int pkey_op = EVP_PKEY_OP_SIGN, key_type = KEY_PRIVKEY;
94+ int keyform = FORMAT_PEM, peerform = FORMAT_PEM;
95+ char badarg = 0, rev = 0;
96+ char hexdump = 0, asn1parse = 0;
97+ EVP_PKEY_CTX *ctx = NULL;
98+ char *passargin = NULL;
99+ int keysize = -1;
100+
101+ unsigned char *buf_in = NULL, *buf_out = NULL, *sig = NULL;
102+ size_t buf_outlen;
103+ int buf_inlen = 0, siglen = -1;
104+
105+ int ret = 1, rv = -1;
106+
107+ argc--;
108+ argv++;
109+
110+ if(!bio_err) bio_err = BIO_new_fp(stderr, BIO_NOCLOSE);
111+
112+ if (!load_config(bio_err, NULL))
113+ goto end;
114+ ERR_load_crypto_strings();
115+ OpenSSL_add_all_algorithms();
116+
117+ while(argc >= 1)
118+ {
119+ if (!strcmp(*argv,"-in"))
120+ {
121+ if (--argc < 1) badarg = 1;
122+ infile= *(++argv);
123+ }
124+ else if (!strcmp(*argv,"-out"))
125+ {
126+ if (--argc < 1) badarg = 1;
127+ outfile= *(++argv);
128+ }
129+ else if (!strcmp(*argv,"-sigfile"))
130+ {
131+ if (--argc < 1) badarg = 1;
132+ sigfile= *(++argv);
133+ }
134+ else if(!strcmp(*argv, "-inkey"))
135+ {
136+ if (--argc < 1)
137+ badarg = 1;
138+ else
139+ {
140+ ctx = init_ctx(&keysize,
141+ *(++argv), keyform, key_type,
142+ passargin, pkey_op, e);
143+ if (!ctx)
144+ {
145+ BIO_puts(bio_err,
146+ "Error initializing context\n");
147+ ERR_print_errors(bio_err);
148+ badarg = 1;
149+ }
150+ }
151+ }
152+ else if (!strcmp(*argv,"-peerkey"))
153+ {
154+ if (--argc < 1)
155+ badarg = 1;
156+ else if (!setup_peer(bio_err, ctx, peerform, *(++argv)))
157+ badarg = 1;
158+ }
159+ else if (!strcmp(*argv,"-passin"))
160+ {
161+ if (--argc < 1) badarg = 1;
162+ passargin= *(++argv);
163+ }
164+ else if (strcmp(*argv,"-peerform") == 0)
165+ {
166+ if (--argc < 1) badarg = 1;
167+ peerform=str2fmt(*(++argv));
168+ }
169+ else if (strcmp(*argv,"-keyform") == 0)
170+ {
171+ if (--argc < 1) badarg = 1;
172+ keyform=str2fmt(*(++argv));
173+ }
174+#ifndef OPENSSL_NO_ENGINE
175+ else if(!strcmp(*argv, "-engine"))
176+ {
177+ if (--argc < 1)
178+ badarg = 1;
179+ else
180+ e = setup_engine(bio_err, *(++argv), 0);
181+ }
182+#endif
183+ else if(!strcmp(*argv, "-pubin"))
184+ key_type = KEY_PUBKEY;
185+ else if(!strcmp(*argv, "-certin"))
186+ key_type = KEY_CERT;
187+ else if(!strcmp(*argv, "-asn1parse"))
188+ asn1parse = 1;
189+ else if(!strcmp(*argv, "-hexdump"))
190+ hexdump = 1;
191+ else if(!strcmp(*argv, "-sign"))
192+ pkey_op = EVP_PKEY_OP_SIGN;
193+ else if(!strcmp(*argv, "-verify"))
194+ pkey_op = EVP_PKEY_OP_VERIFY;
195+ else if(!strcmp(*argv, "-verifyrecover"))
196+ pkey_op = EVP_PKEY_OP_VERIFYRECOVER;
197+ else if(!strcmp(*argv, "-rev"))
198+ rev = 1;
199+ else if(!strcmp(*argv, "-encrypt"))
200+ pkey_op = EVP_PKEY_OP_ENCRYPT;
201+ else if(!strcmp(*argv, "-decrypt"))
202+ pkey_op = EVP_PKEY_OP_DECRYPT;
203+ else if(!strcmp(*argv, "-derive"))
204+ pkey_op = EVP_PKEY_OP_DERIVE;
205+ else if (strcmp(*argv,"-pkeyopt") == 0)
206+ {
207+ if (--argc < 1)
208+ badarg = 1;
209+ else if (!ctx)
210+ {
211+ BIO_puts(bio_err,
212+ "-pkeyopt command before -inkey\n");
213+ badarg = 1;
214+ }
215+ else if (pkey_ctrl_string(ctx, *(++argv)) <= 0)
216+ {
217+ BIO_puts(bio_err, "parameter setting error\n");
218+ ERR_print_errors(bio_err);
219+ goto end;
220+ }
221+ }
222+ else badarg = 1;
223+ if(badarg)
224+ {
225+ usage();
226+ goto end;
227+ }
228+ argc--;
229+ argv++;
230+ }
231+
232+ if (!ctx)
233+ {
234+ usage();
235+ goto end;
236+ }
237+
238+ if (sigfile && (pkey_op != EVP_PKEY_OP_VERIFY))
239+ {
240+ BIO_puts(bio_err, "Signature file specified for non verify\n");
241+ goto end;
242+ }
243+
244+ if (!sigfile && (pkey_op == EVP_PKEY_OP_VERIFY))
245+ {
246+ BIO_puts(bio_err, "No signature file specified for verify\n");
247+ goto end;
248+ }
249+
250+/* FIXME: seed PRNG only if needed */
251+ app_RAND_load_file(NULL, bio_err, 0);
252+
253+ if (pkey_op != EVP_PKEY_OP_DERIVE)
254+ {
255+ if(infile)
256+ {
257+ if(!(in = BIO_new_file(infile, "rb")))
258+ {
259+ BIO_puts(bio_err,
260+ "Error Opening Input File\n");
261+ ERR_print_errors(bio_err);
262+ goto end;
263+ }
264+ }
265+ else
266+ in = BIO_new_fp(stdin, BIO_NOCLOSE);
267+ }
268+
269+ if(outfile)
270+ {
271+ if(!(out = BIO_new_file(outfile, "wb")))
272+ {
273+ BIO_printf(bio_err, "Error Creating Output File\n");
274+ ERR_print_errors(bio_err);
275+ goto end;
276+ }
277+ }
278+ else
279+ {
280+ out = BIO_new_fp(stdout, BIO_NOCLOSE);
281+#ifdef OPENSSL_SYS_VMS
282+ {
283+ BIO *tmpbio = BIO_new(BIO_f_linebuffer());
284+ out = BIO_push(tmpbio, out);
285+ }
286+#endif
287+ }
288+
289+ if (sigfile)
290+ {
291+ BIO *sigbio = BIO_new_file(sigfile, "rb");
292+ if (!sigbio)
293+ {
294+ BIO_printf(bio_err, "Can't open signature file %s\n",
295+ sigfile);
296+ goto end;
297+ }
298+ siglen = bio_to_mem(&sig, keysize * 10, sigbio);
299+ BIO_free(sigbio);
300+ if (siglen <= 0)
301+ {
302+ BIO_printf(bio_err, "Error reading signature data\n");
303+ goto end;
304+ }
305+ }
306+
307+ if (in)
308+ {
309+ /* Read the input data */
310+ buf_inlen = bio_to_mem(&buf_in, keysize * 10, in);
311+ if(buf_inlen <= 0)
312+ {
313+ BIO_printf(bio_err, "Error reading input Data\n");
314+ exit(1);
315+ }
316+ if(rev)
317+ {
318+ size_t i;
319+ unsigned char ctmp;
320+ size_t l = (size_t)buf_inlen;
321+ for(i = 0; i < l/2; i++)
322+ {
323+ ctmp = buf_in[i];
324+ buf_in[i] = buf_in[l - 1 - i];
325+ buf_in[l - 1 - i] = ctmp;
326+ }
327+ }
328+ }
329+
330+ if(pkey_op == EVP_PKEY_OP_VERIFY)
331+ {
332+ rv = EVP_PKEY_verify(ctx, sig, (size_t)siglen,
333+ buf_in, (size_t)buf_inlen);
334+ if (rv == 0)
335+ BIO_puts(out, "Signature Verification Failure\n");
336+ else if (rv == 1)
337+ BIO_puts(out, "Signature Verified Successfully\n");
338+ if (rv >= 0)
339+ goto end;
340+ }
341+ else
342+ {
343+ rv = do_keyop(ctx, pkey_op, NULL, (size_t *)&buf_outlen,
344+ buf_in, (size_t)buf_inlen);
345+ if (rv > 0)
346+ {
347+ buf_out = OPENSSL_malloc(buf_outlen);
348+ if (!buf_out)
349+ rv = -1;
350+ else
351+ rv = do_keyop(ctx, pkey_op,
352+ buf_out, (size_t *)&buf_outlen,
353+ buf_in, (size_t)buf_inlen);
354+ }
355+ }
356+
357+ if(rv <= 0)
358+ {
359+ BIO_printf(bio_err, "Public Key operation error\n");
360+ ERR_print_errors(bio_err);
361+ goto end;
362+ }
363+ ret = 0;
364+ if(asn1parse)
365+ {
366+ if(!ASN1_parse_dump(out, buf_out, buf_outlen, 1, -1))
367+ ERR_print_errors(bio_err);
368+ }
369+ else if(hexdump)
370+ BIO_dump(out, (char *)buf_out, buf_outlen);
371+ else
372+ BIO_write(out, buf_out, buf_outlen);
373+
374+ end:
375+ if (ctx)
376+ EVP_PKEY_CTX_free(ctx);
377+ BIO_free(in);
378+ BIO_free_all(out);
379+ if (buf_in)
380+ OPENSSL_free(buf_in);
381+ if (buf_out)
382+ OPENSSL_free(buf_out);
383+ if (sig)
384+ OPENSSL_free(sig);
385+ return ret;
386+}
387+
388+static void usage()
389+{
390+ BIO_printf(bio_err, "Usage: pkeyutl [options]\n");
391+ BIO_printf(bio_err, "-in file input file\n");
392+ BIO_printf(bio_err, "-out file output file\n");
393+ BIO_printf(bio_err, "-signature file signature file (verify operation only)\n");
394+ BIO_printf(bio_err, "-inkey file input key\n");
395+ BIO_printf(bio_err, "-keyform arg private key format - default PEM\n");
396+ BIO_printf(bio_err, "-pubin input is a public key\n");
397+ BIO_printf(bio_err, "-certin input is a certificate carrying a public key\n");
398+ BIO_printf(bio_err, "-pkeyopt X:Y public key options\n");
399+ BIO_printf(bio_err, "-sign sign with private key\n");
400+ BIO_printf(bio_err, "-verify verify with public key\n");
401+ BIO_printf(bio_err, "-verifyrecover verify with public key, recover original data\n");
402+ BIO_printf(bio_err, "-encrypt encrypt with public key\n");
403+ BIO_printf(bio_err, "-decrypt decrypt with private key\n");
404+ BIO_printf(bio_err, "-derive derive shared secret\n");
405+ BIO_printf(bio_err, "-hexdump hex dump output\n");
406+#ifndef OPENSSL_NO_ENGINE
407+ BIO_printf(bio_err, "-engine e use engine e, possibly a hardware device.\n");
408+#endif
409+ BIO_printf(bio_err, "-passin arg pass phrase source\n");
410+
411+}
412+
413+static EVP_PKEY_CTX *init_ctx(int *pkeysize,
414+ char *keyfile, int keyform, int key_type,
415+ char *passargin, int pkey_op, ENGINE *e)
416+ {
417+ EVP_PKEY *pkey = NULL;
418+ EVP_PKEY_CTX *ctx = NULL;
419+ char *passin = NULL;
420+ int rv = -1;
421+ X509 *x;
422+ if(((pkey_op == EVP_PKEY_OP_SIGN) || (pkey_op == EVP_PKEY_OP_DECRYPT)
423+ || (pkey_op == EVP_PKEY_OP_DERIVE))
424+ && (key_type != KEY_PRIVKEY))
425+ {
426+ BIO_printf(bio_err, "A private key is needed for this operation\n");
427+ goto end;
428+ }
429+ if(!app_passwd(bio_err, passargin, NULL, &passin, NULL))
430+ {
431+ BIO_printf(bio_err, "Error getting password\n");
432+ goto end;
433+ }
434+ switch(key_type)
435+ {
436+ case KEY_PRIVKEY:
437+ pkey = load_key(bio_err, keyfile, keyform, 0,
438+ passin, e, "Private Key");
439+ break;
440+
441+ case KEY_PUBKEY:
442+ pkey = load_pubkey(bio_err, keyfile, keyform, 0,
443+ NULL, e, "Public Key");
444+ break;
445+
446+ case KEY_CERT:
447+ x = load_cert(bio_err, keyfile, keyform,
448+ NULL, e, "Certificate");
449+ if(x)
450+ {
451+ pkey = X509_get_pubkey(x);
452+ X509_free(x);
453+ }
454+ break;
455+
456+ }
457+
458+ *pkeysize = EVP_PKEY_size(pkey);
459+
460+ if (!pkey)
461+ goto end;
462+
463+ ctx = EVP_PKEY_CTX_new(pkey, e);
464+
465+ EVP_PKEY_free(pkey);
466+
467+ if (!ctx)
468+ goto end;
469+
470+ switch(pkey_op)
471+ {
472+ case EVP_PKEY_OP_SIGN:
473+ rv = EVP_PKEY_sign_init(ctx);
474+ break;
475+
476+ case EVP_PKEY_OP_VERIFY:
477+ rv = EVP_PKEY_verify_init(ctx);
478+ break;
479+
480+ case EVP_PKEY_OP_VERIFYRECOVER:
481+ rv = EVP_PKEY_verify_recover_init(ctx);
482+ break;
483+
484+ case EVP_PKEY_OP_ENCRYPT:
485+ rv = EVP_PKEY_encrypt_init(ctx);
486+ break;
487+
488+ case EVP_PKEY_OP_DECRYPT:
489+ rv = EVP_PKEY_decrypt_init(ctx);
490+ break;
491+
492+ case EVP_PKEY_OP_DERIVE:
493+ rv = EVP_PKEY_derive_init(ctx);
494+ break;
495+ }
496+
497+ if (rv <= 0)
498+ {
499+ EVP_PKEY_CTX_free(ctx);
500+ ctx = NULL;
501+ }
502+
503+ end:
504+
505+ if (passin)
506+ OPENSSL_free(passin);
507+
508+ return ctx;
509+
510+
511+ }
512+
513+static int setup_peer(BIO *err, EVP_PKEY_CTX *ctx, int peerform,
514+ const char *file)
515+ {
516+ EVP_PKEY *peer = NULL;
517+ int ret;
518+ if (!ctx)
519+ {
520+ BIO_puts(err, "-peerkey command before -inkey\n");
521+ return 0;
522+ }
523+
524+ peer = load_pubkey(bio_err, file, peerform, 0, NULL, NULL, "Peer Key");
525+
526+ if (!peer)
527+ {
528+ BIO_printf(bio_err, "Error reading peer key %s\n", file);
529+ ERR_print_errors(err);
530+ return 0;
531+ }
532+
533+ ret = EVP_PKEY_derive_set_peer(ctx, peer);
534+
535+ EVP_PKEY_free(peer);
536+ if (ret <= 0)
537+ ERR_print_errors(err);
538+ return ret;
539+ }
540+
541+static int do_keyop(EVP_PKEY_CTX *ctx, int pkey_op,
542+ unsigned char *out, size_t *poutlen,
543+ unsigned char *in, size_t inlen)
544+ {
545+ int rv = 0;
546+ switch(pkey_op)
547+ {
548+ case EVP_PKEY_OP_VERIFYRECOVER:
549+ rv = EVP_PKEY_verify_recover(ctx, out, poutlen, in, inlen);
550+ break;
551+
552+ case EVP_PKEY_OP_SIGN:
553+ rv = EVP_PKEY_sign(ctx, out, poutlen, in, inlen);
554+ break;
555+
556+ case EVP_PKEY_OP_ENCRYPT:
557+ rv = EVP_PKEY_encrypt(ctx, out, poutlen, in, inlen);
558+ break;
559+
560+ case EVP_PKEY_OP_DECRYPT:
561+ rv = EVP_PKEY_decrypt(ctx, out, poutlen, in, inlen);
562+ break;
563+
564+ case EVP_PKEY_OP_DERIVE:
565+ rv = EVP_PKEY_derive(ctx, out, poutlen);
566+ break;
567+
568+ }
569+ return rv;
570+ }
--- /dev/null
+++ b/apps/ts.c
@@ -0,0 +1,1144 @@
1+/* apps/ts.c */
2+/* Written by Zoltan Glozik (zglozik@stones.com) for the OpenSSL
3+ * project 2002.
4+ */
5+/* ====================================================================
6+ * Copyright (c) 2001 The OpenSSL Project. All rights reserved.
7+ *
8+ * Redistribution and use in source and binary forms, with or without
9+ * modification, are permitted provided that the following conditions
10+ * are met:
11+ *
12+ * 1. Redistributions of source code must retain the above copyright
13+ * notice, this list of conditions and the following disclaimer.
14+ *
15+ * 2. Redistributions in binary form must reproduce the above copyright
16+ * notice, this list of conditions and the following disclaimer in
17+ * the documentation and/or other materials provided with the
18+ * distribution.
19+ *
20+ * 3. All advertising materials mentioning features or use of this
21+ * software must display the following acknowledgment:
22+ * "This product includes software developed by the OpenSSL Project
23+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
24+ *
25+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
26+ * endorse or promote products derived from this software without
27+ * prior written permission. For written permission, please contact
28+ * licensing@OpenSSL.org.
29+ *
30+ * 5. Products derived from this software may not be called "OpenSSL"
31+ * nor may "OpenSSL" appear in their names without prior written
32+ * permission of the OpenSSL Project.
33+ *
34+ * 6. Redistributions of any form whatsoever must retain the following
35+ * acknowledgment:
36+ * "This product includes software developed by the OpenSSL Project
37+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
38+ *
39+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
40+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
41+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
42+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
43+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
45+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
46+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
48+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
49+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
50+ * OF THE POSSIBILITY OF SUCH DAMAGE.
51+ * ====================================================================
52+ *
53+ * This product includes cryptographic software written by Eric Young
54+ * (eay@cryptsoft.com). This product includes software written by Tim
55+ * Hudson (tjh@cryptsoft.com).
56+ *
57+ */
58+
59+#include <stdio.h>
60+#include <stdlib.h>
61+#include <string.h>
62+#include "apps.h"
63+#include <openssl/bio.h>
64+#include <openssl/err.h>
65+#include <openssl/pem.h>
66+#include <openssl/rand.h>
67+#include <openssl/ts.h>
68+#include <openssl/bn.h>
69+
70+#undef PROG
71+#define PROG ts_main
72+
73+/* Length of the nonce of the request in bits (must be a multiple of 8). */
74+#define NONCE_LENGTH 64
75+
76+/* Macro definitions for the configuration file. */
77+#define ENV_OID_FILE "oid_file"
78+
79+/* Local function declarations. */
80+
81+static ASN1_OBJECT *txt2obj(const char *oid);
82+static CONF *load_config_file(const char *configfile);
83+
84+/* Query related functions. */
85+static int query_command(const char *data, char *digest,
86+ const EVP_MD *md, const char *policy, int no_nonce,
87+ int cert, const char *in, const char *out, int text);
88+static BIO *BIO_open_with_default(const char *file, const char *mode,
89+ FILE *default_fp);
90+static TS_REQ *create_query(BIO *data_bio, char *digest, const EVP_MD *md,
91+ const char *policy, int no_nonce, int cert);
92+static int create_digest(BIO *input, char *digest,
93+ const EVP_MD *md, unsigned char **md_value);
94+static ASN1_INTEGER *create_nonce(int bits);
95+
96+/* Reply related functions. */
97+static int reply_command(CONF *conf, char *section, char *engine,
98+ char *queryfile, char *passin, char *inkey,
99+ char *signer, char *chain, const char *policy,
100+ char *in, int token_in, char *out, int token_out,
101+ int text);
102+static TS_RESP *read_PKCS7(BIO *in_bio);
103+static TS_RESP *create_response(CONF *conf, const char *section, char *engine,
104+ char *queryfile, char *passin, char *inkey,
105+ char *signer, char *chain, const char *policy);
106+static ASN1_INTEGER * MS_CALLBACK serial_cb(TS_RESP_CTX *ctx, void *data);
107+static ASN1_INTEGER *next_serial(const char *serialfile);
108+static int save_ts_serial(const char *serialfile, ASN1_INTEGER *serial);
109+
110+/* Verify related functions. */
111+static int verify_command(char *data, char *digest, char *queryfile,
112+ char *in, int token_in,
113+ char *ca_path, char *ca_file, char *untrusted);
114+static TS_VERIFY_CTX *create_verify_ctx(char *data, char *digest,
115+ char *queryfile,
116+ char *ca_path, char *ca_file,
117+ char *untrusted);
118+static X509_STORE *create_cert_store(char *ca_path, char *ca_file);
119+static int MS_CALLBACK verify_cb(int ok, X509_STORE_CTX *ctx);
120+
121+/* Main function definition. */
122+int MAIN(int, char **);
123+
124+int MAIN(int argc, char **argv)
125+ {
126+ int ret = 1;
127+ char *configfile = NULL;
128+ char *section = NULL;
129+ CONF *conf = NULL;
130+ enum mode {
131+ CMD_NONE, CMD_QUERY, CMD_REPLY, CMD_VERIFY
132+ } mode = CMD_NONE;
133+ char *data = NULL;
134+ char *digest = NULL;
135+ const EVP_MD *md = NULL;
136+ char *rnd = NULL;
137+ char *policy = NULL;
138+ int no_nonce = 0;
139+ int cert = 0;
140+ char *in = NULL;
141+ char *out = NULL;
142+ int text = 0;
143+ char *queryfile = NULL;
144+ char *passin = NULL; /* Password source. */
145+ char *password =NULL; /* Password itself. */
146+ char *inkey = NULL;
147+ char *signer = NULL;
148+ char *chain = NULL;
149+ char *ca_path = NULL;
150+ char *ca_file = NULL;
151+ char *untrusted = NULL;
152+ char *engine = NULL;
153+ /* Input is ContentInfo instead of TimeStampResp. */
154+ int token_in = 0;
155+ /* Output is ContentInfo instead of TimeStampResp. */
156+ int token_out = 0;
157+ int free_bio_err = 0;
158+
159+ ERR_load_crypto_strings();
160+ apps_startup();
161+
162+ if (bio_err == NULL && (bio_err = BIO_new(BIO_s_file())) != NULL)
163+ {
164+ free_bio_err = 1;
165+ BIO_set_fp(bio_err, stderr, BIO_NOCLOSE | BIO_FP_TEXT);
166+ }
167+
168+ for (argc--, argv++; argc > 0; argc--, argv++)
169+ {
170+ if (strcmp(*argv, "-config") == 0)
171+ {
172+ if (argc-- < 1) goto usage;
173+ configfile = *++argv;
174+ }
175+ else if (strcmp(*argv, "-section") == 0)
176+ {
177+ if (argc-- < 1) goto usage;
178+ section = *++argv;
179+ }
180+ else if (strcmp(*argv, "-query") == 0)
181+ {
182+ if (mode != CMD_NONE) goto usage;
183+ mode = CMD_QUERY;
184+ }
185+ else if (strcmp(*argv, "-data") == 0)
186+ {
187+ if (argc-- < 1) goto usage;
188+ data = *++argv;
189+ }
190+ else if (strcmp(*argv, "-digest") == 0)
191+ {
192+ if (argc-- < 1) goto usage;
193+ digest = *++argv;
194+ }
195+ else if (strcmp(*argv, "-rand") == 0)
196+ {
197+ if (argc-- < 1) goto usage;
198+ rnd = *++argv;
199+ }
200+ else if (strcmp(*argv, "-policy") == 0)
201+ {
202+ if (argc-- < 1) goto usage;
203+ policy = *++argv;
204+ }
205+ else if (strcmp(*argv, "-no_nonce") == 0)
206+ {
207+ no_nonce = 1;
208+ }
209+ else if (strcmp(*argv, "-cert") == 0)
210+ {
211+ cert = 1;
212+ }
213+ else if (strcmp(*argv, "-in") == 0)
214+ {
215+ if (argc-- < 1) goto usage;
216+ in = *++argv;
217+ }
218+ else if (strcmp(*argv, "-token_in") == 0)
219+ {
220+ token_in = 1;
221+ }
222+ else if (strcmp(*argv, "-out") == 0)
223+ {
224+ if (argc-- < 1) goto usage;
225+ out = *++argv;
226+ }
227+ else if (strcmp(*argv, "-token_out") == 0)
228+ {
229+ token_out = 1;
230+ }
231+ else if (strcmp(*argv, "-text") == 0)
232+ {
233+ text = 1;
234+ }
235+ else if (strcmp(*argv, "-reply") == 0)
236+ {
237+ if (mode != CMD_NONE) goto usage;
238+ mode = CMD_REPLY;
239+ }
240+ else if (strcmp(*argv, "-queryfile") == 0)
241+ {
242+ if (argc-- < 1) goto usage;
243+ queryfile = *++argv;
244+ }
245+ else if (strcmp(*argv, "-passin") == 0)
246+ {
247+ if (argc-- < 1) goto usage;
248+ passin = *++argv;
249+ }
250+ else if (strcmp(*argv, "-inkey") == 0)
251+ {
252+ if (argc-- < 1) goto usage;
253+ inkey = *++argv;
254+ }
255+ else if (strcmp(*argv, "-signer") == 0)
256+ {
257+ if (argc-- < 1) goto usage;
258+ signer = *++argv;
259+ }
260+ else if (strcmp(*argv, "-chain") == 0)
261+ {
262+ if (argc-- < 1) goto usage;
263+ chain = *++argv;
264+ }
265+ else if (strcmp(*argv, "-verify") == 0)
266+ {
267+ if (mode != CMD_NONE) goto usage;
268+ mode = CMD_VERIFY;
269+ }
270+ else if (strcmp(*argv, "-CApath") == 0)
271+ {
272+ if (argc-- < 1) goto usage;
273+ ca_path = *++argv;
274+ }
275+ else if (strcmp(*argv, "-CAfile") == 0)
276+ {
277+ if (argc-- < 1) goto usage;
278+ ca_file = *++argv;
279+ }
280+ else if (strcmp(*argv, "-untrusted") == 0)
281+ {
282+ if (argc-- < 1) goto usage;
283+ untrusted = *++argv;
284+ }
285+ else if (strcmp(*argv, "-engine") == 0)
286+ {
287+ if (argc-- < 1) goto usage;
288+ engine = *++argv;
289+ }
290+ else if ((md = EVP_get_digestbyname(*argv + 1)) != NULL)
291+ {
292+ /* empty. */
293+ }
294+ else
295+ goto usage;
296+ }
297+
298+ /* Seed the random number generator if it is going to be used. */
299+ if (mode == CMD_QUERY && !no_nonce)
300+ {
301+ if (!app_RAND_load_file(NULL, bio_err, 1) && rnd == NULL)
302+ BIO_printf(bio_err, "warning, not much extra random "
303+ "data, consider using the -rand option\n");
304+ if (rnd != NULL)
305+ BIO_printf(bio_err,"%ld semi-random bytes loaded\n",
306+ app_RAND_load_files(rnd));
307+ }
308+
309+ /* Get the password if required. */
310+ if(mode == CMD_REPLY && passin &&
311+ !app_passwd(bio_err, passin, NULL, &password, NULL))
312+ {
313+ BIO_printf(bio_err,"Error getting password.\n");
314+ goto cleanup;
315+ }
316+
317+ /* Check consistency of parameters and execute
318+ the appropriate function. */
319+ switch (mode)
320+ {
321+ case CMD_NONE:
322+ goto usage;
323+ case CMD_QUERY:
324+ /* Data file and message imprint cannot be specified
325+ at the same time. */
326+ ret = data != NULL && digest != NULL;
327+ if (ret) goto usage;
328+ /* Load the config file for possible policy OIDs. */
329+ conf = load_config_file(configfile);
330+ ret = !query_command(data, digest, md, policy, no_nonce, cert,
331+ in, out, text);
332+ break;
333+ case CMD_REPLY:
334+ conf = load_config_file(configfile);
335+ if (in == NULL)
336+ {
337+ ret = !(queryfile != NULL && conf != NULL && !token_in);
338+ if (ret) goto usage;
339+ }
340+ else
341+ {
342+ /* 'in' and 'queryfile' are exclusive. */
343+ ret = !(queryfile == NULL);
344+ if (ret) goto usage;
345+ }
346+
347+ ret = !reply_command(conf, section, engine, queryfile,
348+ password, inkey, signer, chain, policy,
349+ in, token_in, out, token_out, text);
350+ break;
351+ case CMD_VERIFY:
352+ ret = !(((queryfile && !data && !digest)
353+ || (!queryfile && data && !digest)
354+ || (!queryfile && !data && digest))
355+ && in != NULL);
356+ if (ret) goto usage;
357+
358+ ret = !verify_command(data, digest, queryfile, in, token_in,
359+ ca_path, ca_file, untrusted);
360+ }
361+
362+ goto cleanup;
363+
364+ usage:
365+ BIO_printf(bio_err, "usage:\n"
366+ "ts -query [-rand file%cfile%c...] [-config configfile] "
367+ "[-data file_to_hash] [-digest digest_bytes]"
368+ "[-md2|-md4|-md5|-sha|-sha1|-mdc2|-ripemd160] "
369+ "[-policy object_id] [-no_nonce] [-cert] "
370+ "[-in request.tsq] [-out request.tsq] [-text]\n",
371+ LIST_SEPARATOR_CHAR, LIST_SEPARATOR_CHAR);
372+ BIO_printf(bio_err, "or\n"
373+ "ts -reply [-config configfile] [-section tsa_section] "
374+ "[-queryfile request.tsq] [-passin password] "
375+ "[-signer tsa_cert.pem] [-inkey private_key.pem] "
376+ "[-chain certs_file.pem] [-policy object_id] "
377+ "[-in response.tsr] [-token_in] "
378+ "[-out response.tsr] [-token_out] [-text] [-engine id]\n");
379+ BIO_printf(bio_err, "or\n"
380+ "ts -verify [-data file_to_hash] [-digest digest_bytes] "
381+ "[-queryfile request.tsq] "
382+ "-in response.tsr [-token_in] "
383+ "-CApath ca_path -CAfile ca_file.pem "
384+ "-untrusted cert_file.pem\n");
385+ cleanup:
386+ /* Clean up. */
387+ app_RAND_write_file(NULL, bio_err);
388+ NCONF_free(conf);
389+ OPENSSL_free(password);
390+ OBJ_cleanup();
391+ if (free_bio_err)
392+ {
393+ BIO_free_all(bio_err);
394+ bio_err = NULL;
395+ }
396+
397+ OPENSSL_EXIT(ret);
398+ }
399+
400+/*
401+ * Configuration file-related function definitions.
402+ */
403+
404+static ASN1_OBJECT *txt2obj(const char *oid)
405+ {
406+ ASN1_OBJECT *oid_obj = NULL;
407+
408+ if (!(oid_obj = OBJ_txt2obj(oid, 0)))
409+ BIO_printf(bio_err, "cannot convert %s to OID\n", oid);
410+
411+ return oid_obj;
412+ }
413+
414+static CONF *load_config_file(const char *configfile)
415+ {
416+ CONF *conf = NULL;
417+ long errorline = -1;
418+
419+ if (!configfile) configfile = getenv("OPENSSL_CONF");
420+ if (!configfile) configfile = getenv("SSLEAY_CONF");
421+
422+ if (configfile &&
423+ (!(conf = NCONF_new(NULL)) ||
424+ NCONF_load(conf, configfile, &errorline) <= 0))
425+ {
426+ if (errorline <= 0)
427+ BIO_printf(bio_err, "error loading the config file "
428+ "'%s'\n", configfile);
429+ else
430+ BIO_printf(bio_err, "error on line %ld of config file "
431+ "'%s'\n", errorline, configfile);
432+ }
433+
434+ if (conf != NULL)
435+ {
436+ const char *p;
437+
438+ BIO_printf(bio_err,"Using configuration from %s\n", configfile);
439+ p = NCONF_get_string(conf, NULL, ENV_OID_FILE);
440+ if (p != NULL)
441+ {
442+ BIO *oid_bio = BIO_new_file(p, "r");
443+ if (!oid_bio)
444+ ERR_print_errors(bio_err);
445+ else
446+ {
447+ OBJ_create_objects(oid_bio);
448+ BIO_free_all(oid_bio);
449+ }
450+ }
451+ else
452+ ERR_clear_error();
453+ if(!add_oid_section(bio_err, conf))
454+ ERR_print_errors(bio_err);
455+ }
456+ return conf;
457+ }
458+
459+/*
460+ * Query-related method definitions.
461+ */
462+
463+static int query_command(const char *data, char *digest, const EVP_MD *md,
464+ const char *policy, int no_nonce,
465+ int cert, const char *in, const char *out, int text)
466+ {
467+ int ret = 0;
468+ TS_REQ *query = NULL;
469+ BIO *in_bio = NULL;
470+ BIO *data_bio = NULL;
471+ BIO *out_bio = NULL;
472+
473+ /* Build query object either from file or from scratch. */
474+ if (in != NULL)
475+ {
476+ if ((in_bio = BIO_new_file(in, "rb")) == NULL) goto end;
477+ query = d2i_TS_REQ_bio(in_bio, NULL);
478+ }
479+ else
480+ {
481+ /* Open the file if no explicit digest bytes were specified. */
482+ if (!digest
483+ && !(data_bio = BIO_open_with_default(data, "rb", stdin)))
484+ goto end;
485+ /* Creating the query object. */
486+ query = create_query(data_bio, digest, md,
487+ policy, no_nonce, cert);
488+ /* Saving the random number generator state. */
489+ }
490+ if (query == NULL) goto end;
491+
492+ /* Write query either in ASN.1 or in text format. */
493+ if ((out_bio = BIO_open_with_default(out, "wb", stdout)) == NULL)
494+ goto end;
495+ if (text)
496+ {
497+ /* Text output. */
498+ if (!TS_REQ_print_bio(out_bio, query))
499+ goto end;
500+ }
501+ else
502+ {
503+ /* ASN.1 output. */
504+ if (!i2d_TS_REQ_bio(out_bio, query))
505+ goto end;
506+ }
507+
508+ ret = 1;
509+
510+ end:
511+ ERR_print_errors(bio_err);
512+
513+ /* Clean up. */
514+ BIO_free_all(in_bio);
515+ BIO_free_all(data_bio);
516+ BIO_free_all(out_bio);
517+ TS_REQ_free(query);
518+
519+ return ret;
520+ }
521+
522+static BIO *BIO_open_with_default(const char *file, const char *mode,
523+ FILE *default_fp)
524+ {
525+ return file == NULL ?
526+ BIO_new_fp(default_fp, BIO_NOCLOSE)
527+ : BIO_new_file(file, mode);
528+ }
529+
530+static TS_REQ *create_query(BIO *data_bio, char *digest, const EVP_MD *md,
531+ const char *policy, int no_nonce, int cert)
532+ {
533+ int ret = 0;
534+ TS_REQ *ts_req = NULL;
535+ int len;
536+ TS_MSG_IMPRINT *msg_imprint = NULL;
537+ X509_ALGOR *algo = NULL;
538+ unsigned char *data = NULL;
539+ ASN1_OBJECT *policy_obj = NULL;
540+ ASN1_INTEGER *nonce_asn1 = NULL;
541+
542+ /* Setting default message digest. */
543+ if (!md && !(md = EVP_get_digestbyname("sha1"))) goto err;
544+
545+ /* Creating request object. */
546+ if (!(ts_req = TS_REQ_new())) goto err;
547+
548+ /* Setting version. */
549+ if (!TS_REQ_set_version(ts_req, 1)) goto err;
550+
551+ /* Creating and adding MSG_IMPRINT object. */
552+ if (!(msg_imprint = TS_MSG_IMPRINT_new())) goto err;
553+
554+ /* Adding algorithm. */
555+ if (!(algo = X509_ALGOR_new())) goto err;
556+ if (!(algo->algorithm = OBJ_nid2obj(EVP_MD_type(md)))) goto err;
557+ if (!(algo->parameter = ASN1_TYPE_new())) goto err;
558+ algo->parameter->type = V_ASN1_NULL;
559+ if (!TS_MSG_IMPRINT_set_algo(msg_imprint, algo)) goto err;
560+
561+ /* Adding message digest. */
562+ if ((len = create_digest(data_bio, digest, md, &data)) == 0)
563+ goto err;
564+ if (!TS_MSG_IMPRINT_set_msg(msg_imprint, data, len)) goto err;
565+
566+ if (!TS_REQ_set_msg_imprint(ts_req, msg_imprint)) goto err;
567+
568+ /* Setting policy if requested. */
569+ if (policy && !(policy_obj = txt2obj(policy))) goto err;
570+ if (policy_obj && !TS_REQ_set_policy_id(ts_req, policy_obj)) goto err;
571+
572+ /* Setting nonce if requested. */
573+ if (!no_nonce && !(nonce_asn1 = create_nonce(NONCE_LENGTH))) goto err;
574+ if (nonce_asn1 && !TS_REQ_set_nonce(ts_req, nonce_asn1)) goto err;
575+
576+ /* Setting certificate request flag if requested. */
577+ if (!TS_REQ_set_cert_req(ts_req, cert)) goto err;
578+
579+ ret = 1;
580+ err:
581+ if (!ret)
582+ {
583+ TS_REQ_free(ts_req);
584+ ts_req = NULL;
585+ BIO_printf(bio_err, "could not create query\n");
586+ }
587+ TS_MSG_IMPRINT_free(msg_imprint);
588+ X509_ALGOR_free(algo);
589+ OPENSSL_free(data);
590+ ASN1_OBJECT_free(policy_obj);
591+ ASN1_INTEGER_free(nonce_asn1);
592+ return ts_req;
593+ }
594+
595+static int create_digest(BIO *input, char *digest, const EVP_MD *md,
596+ unsigned char **md_value)
597+ {
598+ int md_value_len;
599+
600+ md_value_len = EVP_MD_size(md);
601+ if (md_value_len < 0)
602+ goto err;
603+ if (input)
604+ {
605+ /* Digest must be computed from an input file. */
606+ EVP_MD_CTX md_ctx;
607+ unsigned char buffer[4096];
608+ int length;
609+
610+ *md_value = OPENSSL_malloc(md_value_len);
611+ if (*md_value == 0) goto err;
612+
613+ EVP_DigestInit(&md_ctx, md);
614+ while ((length = BIO_read(input, buffer, sizeof(buffer))) > 0)
615+ {
616+ EVP_DigestUpdate(&md_ctx, buffer, length);
617+ }
618+ EVP_DigestFinal(&md_ctx, *md_value, NULL);
619+ }
620+ else
621+ {
622+ /* Digest bytes are specified with digest. */
623+ long digest_len;
624+ *md_value = string_to_hex(digest, &digest_len);
625+ if (!*md_value || md_value_len != digest_len)
626+ {
627+ OPENSSL_free(*md_value);
628+ *md_value = NULL;
629+ BIO_printf(bio_err, "bad digest, %d bytes "
630+ "must be specified\n", md_value_len);
631+ goto err;
632+ }
633+ }
634+
635+ return md_value_len;
636+ err:
637+ return 0;
638+ }
639+
640+static ASN1_INTEGER *create_nonce(int bits)
641+ {
642+ unsigned char buf[20];
643+ ASN1_INTEGER *nonce = NULL;
644+ int len = (bits - 1) / 8 + 1;
645+ int i;
646+
647+ /* Generating random byte sequence. */
648+ if (len > (int)sizeof(buf)) goto err;
649+ if (!RAND_bytes(buf, len)) goto err;
650+
651+ /* Find the first non-zero byte and creating ASN1_INTEGER object. */
652+ for (i = 0; i < len && !buf[i]; ++i);
653+ if (!(nonce = ASN1_INTEGER_new())) goto err;
654+ OPENSSL_free(nonce->data);
655+ /* Allocate at least one byte. */
656+ nonce->length = len - i;
657+ if (!(nonce->data = OPENSSL_malloc(nonce->length + 1))) goto err;
658+ memcpy(nonce->data, buf + i, nonce->length);
659+
660+ return nonce;
661+ err:
662+ BIO_printf(bio_err, "could not create nonce\n");
663+ ASN1_INTEGER_free(nonce);
664+ return NULL;
665+ }
666+/*
667+ * Reply-related method definitions.
668+ */
669+
670+static int reply_command(CONF *conf, char *section, char *engine,
671+ char *queryfile, char *passin, char *inkey,
672+ char *signer, char *chain, const char *policy,
673+ char *in, int token_in,
674+ char *out, int token_out, int text)
675+ {
676+ int ret = 0;
677+ TS_RESP *response = NULL;
678+ BIO *in_bio = NULL;
679+ BIO *query_bio = NULL;
680+ BIO *inkey_bio = NULL;
681+ BIO *signer_bio = NULL;
682+ BIO *out_bio = NULL;
683+
684+ /* Build response object either from response or query. */
685+ if (in != NULL)
686+ {
687+ if ((in_bio = BIO_new_file(in, "rb")) == NULL) goto end;
688+ if (token_in)
689+ {
690+ /* We have a ContentInfo (PKCS7) object, add
691+ 'granted' status info around it. */
692+ response = read_PKCS7(in_bio);
693+ }
694+ else
695+ {
696+ /* We have a ready-made TS_RESP object. */
697+ response = d2i_TS_RESP_bio(in_bio, NULL);
698+ }
699+ }
700+ else
701+ {
702+ response = create_response(conf, section, engine, queryfile,
703+ passin, inkey, signer, chain,
704+ policy);
705+ if (response)
706+ BIO_printf(bio_err, "Response has been generated.\n");
707+ else
708+ BIO_printf(bio_err, "Response is not generated.\n");
709+ }
710+ if (response == NULL) goto end;
711+
712+ /* Write response either in ASN.1 or text format. */
713+ if ((out_bio = BIO_open_with_default(out, "wb", stdout)) == NULL)
714+ goto end;
715+ if (text)
716+ {
717+ /* Text output. */
718+ if (token_out)
719+ {
720+ TS_TST_INFO *tst_info = TS_RESP_get_tst_info(response);
721+ if (!TS_TST_INFO_print_bio(out_bio, tst_info)) goto end;
722+ }
723+ else
724+ {
725+ if (!TS_RESP_print_bio(out_bio, response)) goto end;
726+ }
727+ }
728+ else
729+ {
730+ /* ASN.1 DER output. */
731+ if (token_out)
732+ {
733+ PKCS7 *token = TS_RESP_get_token(response);
734+ if (!i2d_PKCS7_bio(out_bio, token)) goto end;
735+ }
736+ else
737+ {
738+ if (!i2d_TS_RESP_bio(out_bio, response)) goto end;
739+ }
740+ }
741+
742+ ret = 1;
743+
744+ end:
745+ ERR_print_errors(bio_err);
746+
747+ /* Clean up. */
748+ BIO_free_all(in_bio);
749+ BIO_free_all(query_bio);
750+ BIO_free_all(inkey_bio);
751+ BIO_free_all(signer_bio);
752+ BIO_free_all(out_bio);
753+ TS_RESP_free(response);
754+
755+ return ret;
756+ }
757+
758+/* Reads a PKCS7 token and adds default 'granted' status info to it. */
759+static TS_RESP *read_PKCS7(BIO *in_bio)
760+ {
761+ int ret = 0;
762+ PKCS7 *token = NULL;
763+ TS_TST_INFO *tst_info = NULL;
764+ TS_RESP *resp = NULL;
765+ TS_STATUS_INFO *si = NULL;
766+
767+ /* Read PKCS7 object and extract the signed time stamp info. */
768+ if (!(token = d2i_PKCS7_bio(in_bio, NULL))) goto end;
769+ if (!(tst_info = PKCS7_to_TS_TST_INFO(token))) goto end;
770+
771+ /* Creating response object. */
772+ if (!(resp = TS_RESP_new())) goto end;
773+
774+ /* Create granted status info. */
775+ if (!(si = TS_STATUS_INFO_new())) goto end;
776+ if (!(ASN1_INTEGER_set(si->status, TS_STATUS_GRANTED))) goto end;
777+ if (!TS_RESP_set_status_info(resp, si)) goto end;
778+
779+ /* Setting encapsulated token. */
780+ TS_RESP_set_tst_info(resp, token, tst_info);
781+ token = NULL; /* Ownership is lost. */
782+ tst_info = NULL; /* Ownership is lost. */
783+
784+ ret = 1;
785+ end:
786+ PKCS7_free(token);
787+ TS_TST_INFO_free(tst_info);
788+ if (!ret)
789+ {
790+ TS_RESP_free(resp);
791+ resp = NULL;
792+ }
793+ TS_STATUS_INFO_free(si);
794+ return resp;
795+ }
796+
797+static TS_RESP *create_response(CONF *conf, const char *section, char *engine,
798+ char *queryfile, char *passin, char *inkey,
799+ char *signer, char *chain, const char *policy)
800+ {
801+ int ret = 0;
802+ TS_RESP *response = NULL;
803+ BIO *query_bio = NULL;
804+ TS_RESP_CTX *resp_ctx = NULL;
805+
806+ if (!(query_bio = BIO_new_file(queryfile, "rb")))
807+ goto end;
808+
809+ /* Getting TSA configuration section. */
810+ if (!(section = TS_CONF_get_tsa_section(conf, section)))
811+ goto end;
812+
813+ /* Setting up response generation context. */
814+ if (!(resp_ctx = TS_RESP_CTX_new())) goto end;
815+
816+ /* Setting serial number provider callback. */
817+ if (!TS_CONF_set_serial(conf, section, serial_cb, resp_ctx)) goto end;
818+#ifndef OPENSSL_NO_ENGINE
819+ /* Setting default OpenSSL engine. */
820+ if (!TS_CONF_set_crypto_device(conf, section, engine)) goto end;
821+#endif
822+
823+ /* Setting TSA signer certificate. */
824+ if (!TS_CONF_set_signer_cert(conf, section, signer, resp_ctx)) goto end;
825+
826+ /* Setting TSA signer certificate chain. */
827+ if (!TS_CONF_set_certs(conf, section, chain, resp_ctx)) goto end;
828+
829+ /* Setting TSA signer private key. */
830+ if (!TS_CONF_set_signer_key(conf, section, inkey, passin, resp_ctx))
831+ goto end;
832+
833+ /* Setting default policy OID. */
834+ if (!TS_CONF_set_def_policy(conf, section, policy, resp_ctx)) goto end;
835+
836+ /* Setting acceptable policy OIDs. */
837+ if (!TS_CONF_set_policies(conf, section, resp_ctx)) goto end;
838+
839+ /* Setting the acceptable one-way hash algorithms. */
840+ if (!TS_CONF_set_digests(conf, section, resp_ctx)) goto end;
841+
842+ /* Setting guaranteed time stamp accuracy. */
843+ if (!TS_CONF_set_accuracy(conf, section, resp_ctx)) goto end;
844+
845+ /* Setting the precision of the time. */
846+ if (!TS_CONF_set_clock_precision_digits(conf, section, resp_ctx))
847+ goto end;
848+
849+ /* Setting the ordering flaf if requested. */
850+ if (!TS_CONF_set_ordering(conf, section, resp_ctx)) goto end;
851+
852+ /* Setting the TSA name required flag if requested. */
853+ if (!TS_CONF_set_tsa_name(conf, section, resp_ctx)) goto end;
854+
855+ /* Setting the ESS cert id chain flag if requested. */
856+ if (!TS_CONF_set_ess_cert_id_chain(conf, section, resp_ctx)) goto end;
857+
858+ /* Creating the response. */
859+ if (!(response = TS_RESP_create_response(resp_ctx, query_bio)))
860+ goto end;
861+
862+ ret = 1;
863+ end:
864+ if (!ret)
865+ {
866+ TS_RESP_free(response);
867+ response = NULL;
868+ }
869+ TS_RESP_CTX_free(resp_ctx);
870+ BIO_free_all(query_bio);
871+
872+ return response;
873+ }
874+
875+static ASN1_INTEGER * MS_CALLBACK serial_cb(TS_RESP_CTX *ctx, void *data)
876+ {
877+ const char *serial_file = (const char *) data;
878+ ASN1_INTEGER *serial = next_serial(serial_file);
879+
880+ if (!serial)
881+ {
882+ TS_RESP_CTX_set_status_info(ctx, TS_STATUS_REJECTION,
883+ "Error during serial number "
884+ "generation.");
885+ TS_RESP_CTX_add_failure_info(ctx,
886+ TS_INFO_ADD_INFO_NOT_AVAILABLE);
887+ }
888+ else
889+ save_ts_serial(serial_file, serial);
890+
891+ return serial;
892+ }
893+
894+static ASN1_INTEGER *next_serial(const char *serialfile)
895+ {
896+ int ret = 0;
897+ BIO *in = NULL;
898+ ASN1_INTEGER *serial = NULL;
899+ BIGNUM *bn = NULL;
900+
901+ if (!(serial = ASN1_INTEGER_new())) goto err;
902+
903+ if (!(in = BIO_new_file(serialfile, "r")))
904+ {
905+ ERR_clear_error();
906+ BIO_printf(bio_err, "Warning: could not open file %s for "
907+ "reading, using serial number: 1\n", serialfile);
908+ if (!ASN1_INTEGER_set(serial, 1)) goto err;
909+ }
910+ else
911+ {
912+ char buf[1024];
913+ if (!a2i_ASN1_INTEGER(in, serial, buf, sizeof(buf)))
914+ {
915+ BIO_printf(bio_err, "unable to load number from %s\n",
916+ serialfile);
917+ goto err;
918+ }
919+ if (!(bn = ASN1_INTEGER_to_BN(serial, NULL))) goto err;
920+ ASN1_INTEGER_free(serial);
921+ serial = NULL;
922+ if (!BN_add_word(bn, 1)) goto err;
923+ if (!(serial = BN_to_ASN1_INTEGER(bn, NULL))) goto err;
924+ }
925+ ret = 1;
926+ err:
927+ if (!ret)
928+ {
929+ ASN1_INTEGER_free(serial);
930+ serial = NULL;
931+ }
932+ BIO_free_all(in);
933+ BN_free(bn);
934+ return serial;
935+ }
936+
937+static int save_ts_serial(const char *serialfile, ASN1_INTEGER *serial)
938+ {
939+ int ret = 0;
940+ BIO *out = NULL;
941+
942+ if (!(out = BIO_new_file(serialfile, "w"))) goto err;
943+ if (i2a_ASN1_INTEGER(out, serial) <= 0) goto err;
944+ if (BIO_puts(out, "\n") <= 0) goto err;
945+ ret = 1;
946+ err:
947+ if (!ret)
948+ BIO_printf(bio_err, "could not save serial number to %s\n",
949+ serialfile);
950+ BIO_free_all(out);
951+ return ret;
952+ }
953+
954+/*
955+ * Verify-related method definitions.
956+ */
957+
958+static int verify_command(char *data, char *digest, char *queryfile,
959+ char *in, int token_in,
960+ char *ca_path, char *ca_file, char *untrusted)
961+ {
962+ BIO *in_bio = NULL;
963+ PKCS7 *token = NULL;
964+ TS_RESP *response = NULL;
965+ TS_VERIFY_CTX *verify_ctx = NULL;
966+ int ret = 0;
967+
968+ /* Decode the token (PKCS7) or response (TS_RESP) files. */
969+ if (!(in_bio = BIO_new_file(in, "rb"))) goto end;
970+ if (token_in)
971+ {
972+ if (!(token = d2i_PKCS7_bio(in_bio, NULL))) goto end;
973+ }
974+ else
975+ {
976+ if (!(response = d2i_TS_RESP_bio(in_bio, NULL))) goto end;
977+ }
978+
979+ if (!(verify_ctx = create_verify_ctx(data, digest, queryfile,
980+ ca_path, ca_file, untrusted)))
981+ goto end;
982+
983+ /* Checking the token or response against the request. */
984+ ret = token_in ?
985+ TS_RESP_verify_token(verify_ctx, token) :
986+ TS_RESP_verify_response(verify_ctx, response);
987+
988+ end:
989+ printf("Verification: ");
990+ if (ret)
991+ printf("OK\n");
992+ else
993+ {
994+ printf("FAILED\n");
995+ /* Print errors, if there are any. */
996+ ERR_print_errors(bio_err);
997+ }
998+
999+ /* Clean up. */
1000+ BIO_free_all(in_bio);
1001+ PKCS7_free(token);
1002+ TS_RESP_free(response);
1003+ TS_VERIFY_CTX_free(verify_ctx);
1004+ return ret;
1005+ }
1006+
1007+static TS_VERIFY_CTX *create_verify_ctx(char *data, char *digest,
1008+ char *queryfile,
1009+ char *ca_path, char *ca_file,
1010+ char *untrusted)
1011+ {
1012+ TS_VERIFY_CTX *ctx = NULL;
1013+ BIO *input = NULL;
1014+ TS_REQ *request = NULL;
1015+ int ret = 0;
1016+
1017+ if (data != NULL || digest != NULL)
1018+ {
1019+ if (!(ctx = TS_VERIFY_CTX_new())) goto err;
1020+ ctx->flags = TS_VFY_VERSION | TS_VFY_SIGNER;
1021+ if (data != NULL)
1022+ {
1023+ ctx->flags |= TS_VFY_DATA;
1024+ if (!(ctx->data = BIO_new_file(data, "rb"))) goto err;
1025+ }
1026+ else if (digest != NULL)
1027+ {
1028+ long imprint_len;
1029+ ctx->flags |= TS_VFY_IMPRINT;
1030+ if (!(ctx->imprint = string_to_hex(digest,
1031+ &imprint_len)))
1032+ {
1033+ BIO_printf(bio_err, "invalid digest string\n");
1034+ goto err;
1035+ }
1036+ ctx->imprint_len = imprint_len;
1037+ }
1038+
1039+ }
1040+ else if (queryfile != NULL)
1041+ {
1042+ /* The request has just to be read, decoded and converted to
1043+ a verify context object. */
1044+ if (!(input = BIO_new_file(queryfile, "rb"))) goto err;
1045+ if (!(request = d2i_TS_REQ_bio(input, NULL))) goto err;
1046+ if (!(ctx = TS_REQ_to_TS_VERIFY_CTX(request, NULL))) goto err;
1047+ }
1048+ else
1049+ return NULL;
1050+
1051+ /* Add the signature verification flag and arguments. */
1052+ ctx->flags |= TS_VFY_SIGNATURE;
1053+
1054+ /* Initialising the X509_STORE object. */
1055+ if (!(ctx->store = create_cert_store(ca_path, ca_file))) goto err;
1056+
1057+ /* Loading untrusted certificates. */
1058+ if (untrusted && !(ctx->certs = TS_CONF_load_certs(untrusted)))
1059+ goto err;
1060+
1061+ ret = 1;
1062+ err:
1063+ if (!ret)
1064+ {
1065+ TS_VERIFY_CTX_free(ctx);
1066+ ctx = NULL;
1067+ }
1068+ BIO_free_all(input);
1069+ TS_REQ_free(request);
1070+ return ctx;
1071+ }
1072+
1073+static X509_STORE *create_cert_store(char *ca_path, char *ca_file)
1074+ {
1075+ X509_STORE *cert_ctx = NULL;
1076+ X509_LOOKUP *lookup = NULL;
1077+ int i;
1078+
1079+ /* Creating the X509_STORE object. */
1080+ cert_ctx = X509_STORE_new();
1081+
1082+ /* Setting the callback for certificate chain verification. */
1083+ X509_STORE_set_verify_cb_func(cert_ctx, verify_cb);
1084+
1085+ /* Adding a trusted certificate directory source. */
1086+ if (ca_path)
1087+ {
1088+ lookup = X509_STORE_add_lookup(cert_ctx,
1089+ X509_LOOKUP_hash_dir());
1090+ if (lookup == NULL)
1091+ {
1092+ BIO_printf(bio_err, "memory allocation failure\n");
1093+ goto err;
1094+ }
1095+ i = X509_LOOKUP_add_dir(lookup, ca_path, X509_FILETYPE_PEM);
1096+ if (!i)
1097+ {
1098+ BIO_printf(bio_err, "Error loading directory %s\n",
1099+ ca_path);
1100+ goto err;
1101+ }
1102+ }
1103+
1104+ /* Adding a trusted certificate file source. */
1105+ if (ca_file)
1106+ {
1107+ lookup = X509_STORE_add_lookup(cert_ctx, X509_LOOKUP_file());
1108+ if (lookup == NULL)
1109+ {
1110+ BIO_printf(bio_err, "memory allocation failure\n");
1111+ goto err;
1112+ }
1113+ i = X509_LOOKUP_load_file(lookup, ca_file, X509_FILETYPE_PEM);
1114+ if (!i)
1115+ {
1116+ BIO_printf(bio_err, "Error loading file %s\n", ca_file);
1117+ goto err;
1118+ }
1119+ }
1120+
1121+ return cert_ctx;
1122+ err:
1123+ X509_STORE_free(cert_ctx);
1124+ return NULL;
1125+ }
1126+
1127+static int MS_CALLBACK verify_cb(int ok, X509_STORE_CTX *ctx)
1128+ {
1129+ /*
1130+ char buf[256];
1131+
1132+ if (!ok)
1133+ {
1134+ X509_NAME_oneline(X509_get_subject_name(ctx->current_cert),
1135+ buf, sizeof(buf));
1136+ printf("%s\n", buf);
1137+ printf("error %d at %d depth lookup: %s\n",
1138+ ctx->error, ctx->error_depth,
1139+ X509_verify_cert_error_string(ctx->error));
1140+ }
1141+ */
1142+
1143+ return ok;
1144+ }
--- /dev/null
+++ b/apps/tsget
@@ -0,0 +1,195 @@
1+#!/usr/bin/perl -w
2+# Written by Zoltan Glozik <zglozik@stones.com>.
3+# Copyright (c) 2002 The OpenTSA Project. All rights reserved.
4+$::version = '$Id: tsget,v 1.1 2006/02/12 23:11:21 ulf Exp $';
5+
6+use strict;
7+use IO::Handle;
8+use Getopt::Std;
9+use File::Basename;
10+use WWW::Curl::easy;
11+
12+use vars qw(%options);
13+
# Read callback for the request body.  Returns the next chunk, at most
# $maxlength bytes, of the buffer referenced by $state->{data} and advances
# $state->{bytes} past it; returns "" once the buffer is exhausted.
sub read_body {
    my ($maxlength, $state) = @_;

    my $remaining = length(${$state->{data}}) - $state->{bytes};
    return "" if $remaining <= 0;

    my $chunk_len = $remaining > $maxlength ? $maxlength : $remaining;
    my $chunk = substr(${$state->{data}}, $state->{bytes}, $chunk_len);
    $state->{bytes} += $chunk_len;
    return $chunk;
}
27+
# Write callback for the reply body: appends the received chunk to the
# scalar referenced by the second argument and reports its length.
sub write_body {
    my ($chunk, $sink) = @_;
    $$sink .= $chunk;
    return length $chunk;
}
34+
# Create a Curl handle for the given URL, configured to POST a time stamp
# query.  SSL and randomness related settings are taken from %options.
sub create_curl {
    my $url = shift;

    my $curl = WWW::Curl::easy::new();

    # Error handling and identification.
    $curl->setopt(CURLOPT_VERBOSE, 1) if $options{d};
    $curl->setopt(CURLOPT_FAILONERROR, 1);
    $curl->setopt(CURLOPT_USERAGENT,
                  "OpenTSA tsget.pl/" . (split / /, $::version)[2]);

    # The request is uploaded through read_body as a POST.
    $curl->setopt(CURLOPT_UPLOAD, 1);
    $curl->setopt(CURLOPT_CUSTOMREQUEST, "POST");
    $curl->setopt(CURLOPT_HTTPHEADER,
                  ["Content-Type: application/timestamp-query",
                   "Accept: application/timestamp-reply"]);
    $curl->setopt(CURLOPT_READFUNCTION, \&read_body);
    # Headers are accepted but not stored.
    $curl->setopt(CURLOPT_HEADERFUNCTION, sub { return length($_[0]); });

    # The reply body is collected by write_body.
    $curl->setopt(CURLOPT_WRITEFUNCTION, \&write_body);

    # Fixed SSL settings.
    $curl->setopt(CURLOPT_SSLKEYTYPE, "PEM");
    $curl->setopt(CURLOPT_SSL_VERIFYPEER, 1);   # Verify server's certificate.
    $curl->setopt(CURLOPT_SSL_VERIFYHOST, 2);   # Check server's CN.

    # Settings that apply only when the matching option was given.
    my @optional = (
        [CURLOPT_SSLKEY,       'k'],
        [CURLOPT_SSLKEYPASSWD, 'p'],
        [CURLOPT_SSLCERT,      'c'],
        [CURLOPT_CAINFO,       'C'],
        [CURLOPT_CAPATH,       'P'],
        [CURLOPT_RANDOM_FILE,  'r'],
        [CURLOPT_EGDSOCKET,    'g'],
    );
    foreach my $pair (@optional) {
        my ($curlopt, $flag) = @{$pair};
        $curl->setopt($curlopt, $options{$flag}) if defined($options{$flag});
    }

    # Destination.
    $curl->setopt(CURLOPT_URL, $url);

    return $curl;
}
76+
# Send the request referenced by $body through $curl.  Returns the reply
# body and an error string; the error string is undefined unless the
# transfer failed or the reply has an unexpected content type.
sub get_timestamp {
    my ($curl, $body) = @_;
    my $ts_body;
    local $::error_buf;

    # Curl error messages end up in $::error_buf.
    $curl->setopt(CURLOPT_ERRORBUFFER, "::error_buf");

    # State consumed by the read callback, and the upload size.
    $curl->setopt(CURLOPT_INFILE, {data => $body, bytes => 0});
    $curl->setopt(CURLOPT_INFILESIZE, length(${$body}));

    # Destination handed to the write callback.
    $curl->setopt(CURLOPT_FILE, \$ts_body);

    # Send the request...
    my $status = $curl->perform();
    my $problem;
    if ($status == 0) {
        my $ct = $curl->getinfo(CURLINFO_CONTENT_TYPE);
        $problem = "unexpected content type returned: $ct"
            if lc($ct) ne "application/timestamp-reply";
    } else {
        my $http_code = $curl->getinfo(CURLINFO_HTTP_CODE);
        $problem = "could not get timestamp";
        $problem .= ", http code: $http_code" unless $http_code == 0;
        $problem .= ", curl code: $status";
        $problem .= " ($::error_buf)" if defined($::error_buf);
    }
    return ($ts_body, $problem);
}
112+
# Print usage information to STDERR and exit with status 1.
sub usage {
    print STDERR
        "usage: $0 -h <server_url> [-e <extension>] [-o <output>] "
      . "[-v] [-d] [-k <private_key.pem>] [-p <key_password>] "
      . "[-c <client_cert.pem>] [-C <CA_certs.pem>] [-P <CA_path>] "
      . "[-r <file:file...>] [-g <EGD_socket>] [<request>]...\n";
    exit 1;
}
122+
123+# ----------------------------------------------------------------------
124+# Main program
125+# ----------------------------------------------------------------------
126+
# Getting command-line options.  Options from the TSGET environment
# variable are parsed first; the real command line is parsed afterwards
# into the same hash, so its values take precedence.
my $getopt_arg = "h:e:o:vdk:p:c:C:P:r:g:";
if (exists $ENV{TSGET}) {
    my @old_argv = @ARGV;
    @ARGV = split /\s+/, $ENV{TSGET};
    getopts($getopt_arg, \%options) or usage;
    @ARGV = @old_argv;
}
getopts($getopt_arg, \%options) or usage;

# Checking argument consistency: a server URL is mandatory, reading from
# STDIN needs an explicit output, and one output cannot hold several
# replies.
if (!exists($options{h}) || (@ARGV == 0 && !exists($options{o}))
    || (@ARGV > 1 && exists($options{o}))) {
    print STDERR "Inconsistent command line options.\n";
    usage;
}
# Setting defaults.
@ARGV = ("-") unless @ARGV != 0;
$options{e} = ".tsr" unless defined($options{e});

# Processing requests.
my $curl = create_curl $options{h};
undef $/;   # For reading whole files.
REQUEST: foreach (@ARGV) {
    my $input = $_;
    my ($base, $path) = fileparse($input, '\.[^.]*');
    my $output_base = $base . $options{e};
    my $output = defined($options{o}) ? $options{o} : $path . $output_base;

    STDERR->printflush("$input: ") if $options{v};
    # Read request.  Queries and replies are binary (DER), so every handle
    # is switched to binary mode.
    my $body;
    if ($input eq "-") {
        # Read the request from STDIN.
        binmode STDIN;
        $body = <STDIN>;
    } else {
        # Read the request from file.  The three-argument open keeps the
        # file name from being interpreted as part of the mode.
        open INPUT, "<", $input
            or warn("$input: could not open input file: $!\n"), next REQUEST;
        binmode INPUT;
        $body = <INPUT>;
        close INPUT
            or warn("$input: could not close input file: $!\n"), next REQUEST;
    }

    # Send request.
    STDERR->printflush("sending request") if $options{v};

    my ($ts_body, $error) = get_timestamp $curl, \$body;
    if (defined($error)) {
        die "$input: fatal error: $error\n";
    }
    STDERR->printflush(", reply received") if $options{v};

    # Write response.
    if ($output eq "-") {
        # Write to STDOUT.
        binmode STDOUT;
        print $ts_body;
    } else {
        # Write to file.
        open OUTPUT, ">", $output
            or warn("$output: could not open output file: $!\n"), next REQUEST;
        binmode OUTPUT;
        print OUTPUT $ts_body;
        close OUTPUT
            or warn("$output: could not close output file: $!\n"), next REQUEST;
    }
    STDERR->printflush(", $output written.\n") if $options{v};
}
$curl->cleanup();
WWW::Curl::easy::global_cleanup();
--- /dev/null
+++ b/crypto/aes/aes_x86core.c
@@ -0,0 +1,1063 @@
1+/* crypto/aes/aes_x86core.c -*- mode:C; c-file-style: "eay" -*- */
2+/**
3+ * rijndael-alg-fst.c
4+ *
5+ * @version 3.0 (December 2000)
6+ *
7+ * Optimised ANSI C code for the Rijndael cipher (now AES)
8+ *
9+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
10+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
11+ * @author Paulo Barreto <paulo.barreto@terra.com.br>
12+ *
13+ * This code is hereby placed in the public domain.
14+ *
15+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
16+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
19+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+ */
27+
28+/*
29+ * This is an experimental x86[_64] derivative. It assumes little-endian
30+ * byte order and expects the CPU to sustain unaligned memory references.
31+ * It is used as a playground for cache-timing attack mitigations and
32+ * serves as the reference C implementation for the x86[_64] assembler.
33+ *
34+ * <appro@fy.chalmers.se>
35+ */
36+
37+
38+#ifndef AES_DEBUG
39+# ifndef NDEBUG
40+# define NDEBUG
41+# endif
42+#endif
43+#include <assert.h>
44+
45+#include <stdlib.h>
46+#include <openssl/aes.h>
47+#include "aes_locl.h"
48+
49+/*
50+ * These two parameters control which table, 256-byte or 2KB, is
51+ * referenced in outer and respectively inner rounds.
52+ */
53+#define AES_COMPACT_IN_OUTER_ROUNDS
54+#ifdef AES_COMPACT_IN_OUTER_ROUNDS
55+/* AES_COMPACT_IN_OUTER_ROUNDS costs ~30% in performance, while
56+ * adding AES_COMPACT_IN_INNER_ROUNDS reduces benchmark *further*
57+ * by factor of ~2. */
58+# undef AES_COMPACT_IN_INNER_ROUNDS
59+#endif
60+
61+#if 1 /* Real implementation enabled; the #else branch turns prefetch256() into an empty macro. */
62+static void prefetch256(const void *table) /* Reads one word from every 32-byte stride of a 256-byte table; presumably to pull the whole table into cache before the data-dependent lookups. */
63+{
64+ volatile unsigned long *t=(void *)table,ret; /* both t's target and ret are volatile-qualified, so each load and the final store must actually be performed */
65+ unsigned long sum;
66+ int i;
67+
68+ /* 32 is common least cache-line size */
69+ for (sum=0,i=0;i<256/sizeof(t[0]);i+=32/sizeof(t[0])) sum ^= t[i]; /* index advances 32 bytes per step: 8 loads cover the 256 bytes */
70+
71+ ret = sum; /* sink for the XOR of the loaded words; the value is not used afterwards */
72+}
73+#else
74+# define prefetch256(t)
75+#endif
76+
77+#undef GETU32 /* Discard any earlier definition of GETU32 before redefining it below. */
78+#define GETU32(p) (*((u32*)(p))) /* Loads a u32 directly through a cast pointer: native byte order, no alignment handling. */
79+ /* u64 and the U64() literal-suffix macro are selected per compiler/ABI below. */
80+#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
81+typedef unsigned __int64 u64;
82+#define U64(C) C##UI64
83+#elif defined(__arch64__)
84+typedef unsigned long u64;
85+#define U64(C) C##UL
86+#else
87+typedef unsigned long long u64;
88+#define U64(C) C##ULL
89+#endif
90+ /* ROTATE(a,n): left-rotate of a by n bits, via _lrotl on MSVC/ICC or the x86 "roll" instruction on GCC. It stays undefined on other compilers/targets; users of ROTATE test defined(ROTATE) and fall back to shifts. */
91+#undef ROTATE
92+#if defined(_MSC_VER) || defined(__ICC)
93+# define ROTATE(a,n) _lrotl(a,n)
94+#elif defined(__GNUC__) && __GNUC__>=2
95+# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
96+# define ROTATE(a,n) ({ register unsigned int ret; \
97+ asm ( \
98+ "roll %1,%0" \
99+ : "=r"(ret) \
100+ : "I"(n), "0"(a) \
101+ : "cc"); \
102+ ret; \
103+ })
104+# endif
105+#endif
106+/* Each 64-bit Te entry stores the 32-bit table word twice, so the byte-rotated variants below can all be read from the one table at byte offsets 0, 3, 2 and 1.
107+Te [x] = S [x].[02, 01, 01, 03, 02, 01, 01, 03];
108+Te0[x] = S [x].[02, 01, 01, 03];
109+Te1[x] = S [x].[03, 02, 01, 01];
110+Te2[x] = S [x].[01, 03, 02, 01];
111+Te3[x] = S [x].[01, 01, 03, 02];
112+*/
113+#define Te0 (u32)((u64*)((u8*)Te+0)) /* Te0[i] parses as (u32)(((u64*)...)[i]): a u64 load at byte offset 8*i, truncated to its low 32 bits */
114+#define Te1 (u32)((u64*)((u8*)Te+3)) /* NOTE(review): with a non-zero byte offset the u64 load for i == 255 reaches past the end of Te[256]; confirm this over-read is acceptable */
115+#define Te2 (u32)((u64*)((u8*)Te+2))
116+#define Te3 (u32)((u64*)((u8*)Te+1))
117+/* Same doubled-word layout for Td; the Td0..Td3 views are taken the same way.
118+Td [x] = Si[x].[0e, 09, 0d, 0b, 0e, 09, 0d, 0b];
119+Td0[x] = Si[x].[0e, 09, 0d, 0b];
120+Td1[x] = Si[x].[0b, 0e, 09, 0d];
121+Td2[x] = Si[x].[0d, 0b, 0e, 09];
122+Td3[x] = Si[x].[09, 0d, 0b, 0e];
123+Td4[x] = Si[x].[01];
124+*/
125+#define Td0 (u32)((u64*)((u8*)Td+0)) /* unaligned u64 load from Td at byte offset 8*i, truncated to 32 bits */
126+#define Td1 (u32)((u64*)((u8*)Td+3)) /* NOTE(review): as with Te, the load for i == 255 at a non-zero offset extends beyond Td[256]; confirm */
127+#define Td2 (u32)((u64*)((u8*)Td+2))
128+#define Td3 (u32)((u64*)((u8*)Td+1))
129+
130+static const u64 Te[256] = {
131+ U64(0xa56363c6a56363c6), U64(0x847c7cf8847c7cf8),
132+ U64(0x997777ee997777ee), U64(0x8d7b7bf68d7b7bf6),
133+ U64(0x0df2f2ff0df2f2ff), U64(0xbd6b6bd6bd6b6bd6),
134+ U64(0xb16f6fdeb16f6fde), U64(0x54c5c59154c5c591),
135+ U64(0x5030306050303060), U64(0x0301010203010102),
136+ U64(0xa96767cea96767ce), U64(0x7d2b2b567d2b2b56),
137+ U64(0x19fefee719fefee7), U64(0x62d7d7b562d7d7b5),
138+ U64(0xe6abab4de6abab4d), U64(0x9a7676ec9a7676ec),
139+ U64(0x45caca8f45caca8f), U64(0x9d82821f9d82821f),
140+ U64(0x40c9c98940c9c989), U64(0x877d7dfa877d7dfa),
141+ U64(0x15fafaef15fafaef), U64(0xeb5959b2eb5959b2),
142+ U64(0xc947478ec947478e), U64(0x0bf0f0fb0bf0f0fb),
143+ U64(0xecadad41ecadad41), U64(0x67d4d4b367d4d4b3),
144+ U64(0xfda2a25ffda2a25f), U64(0xeaafaf45eaafaf45),
145+ U64(0xbf9c9c23bf9c9c23), U64(0xf7a4a453f7a4a453),
146+ U64(0x967272e4967272e4), U64(0x5bc0c09b5bc0c09b),
147+ U64(0xc2b7b775c2b7b775), U64(0x1cfdfde11cfdfde1),
148+ U64(0xae93933dae93933d), U64(0x6a26264c6a26264c),
149+ U64(0x5a36366c5a36366c), U64(0x413f3f7e413f3f7e),
150+ U64(0x02f7f7f502f7f7f5), U64(0x4fcccc834fcccc83),
151+ U64(0x5c3434685c343468), U64(0xf4a5a551f4a5a551),
152+ U64(0x34e5e5d134e5e5d1), U64(0x08f1f1f908f1f1f9),
153+ U64(0x937171e2937171e2), U64(0x73d8d8ab73d8d8ab),
154+ U64(0x5331316253313162), U64(0x3f15152a3f15152a),
155+ U64(0x0c0404080c040408), U64(0x52c7c79552c7c795),
156+ U64(0x6523234665232346), U64(0x5ec3c39d5ec3c39d),
157+ U64(0x2818183028181830), U64(0xa1969637a1969637),
158+ U64(0x0f05050a0f05050a), U64(0xb59a9a2fb59a9a2f),
159+ U64(0x0907070e0907070e), U64(0x3612122436121224),
160+ U64(0x9b80801b9b80801b), U64(0x3de2e2df3de2e2df),
161+ U64(0x26ebebcd26ebebcd), U64(0x6927274e6927274e),
162+ U64(0xcdb2b27fcdb2b27f), U64(0x9f7575ea9f7575ea),
163+ U64(0x1b0909121b090912), U64(0x9e83831d9e83831d),
164+ U64(0x742c2c58742c2c58), U64(0x2e1a1a342e1a1a34),
165+ U64(0x2d1b1b362d1b1b36), U64(0xb26e6edcb26e6edc),
166+ U64(0xee5a5ab4ee5a5ab4), U64(0xfba0a05bfba0a05b),
167+ U64(0xf65252a4f65252a4), U64(0x4d3b3b764d3b3b76),
168+ U64(0x61d6d6b761d6d6b7), U64(0xceb3b37dceb3b37d),
169+ U64(0x7b2929527b292952), U64(0x3ee3e3dd3ee3e3dd),
170+ U64(0x712f2f5e712f2f5e), U64(0x9784841397848413),
171+ U64(0xf55353a6f55353a6), U64(0x68d1d1b968d1d1b9),
172+ U64(0x0000000000000000), U64(0x2cededc12cededc1),
173+ U64(0x6020204060202040), U64(0x1ffcfce31ffcfce3),
174+ U64(0xc8b1b179c8b1b179), U64(0xed5b5bb6ed5b5bb6),
175+ U64(0xbe6a6ad4be6a6ad4), U64(0x46cbcb8d46cbcb8d),
176+ U64(0xd9bebe67d9bebe67), U64(0x4b3939724b393972),
177+ U64(0xde4a4a94de4a4a94), U64(0xd44c4c98d44c4c98),
178+ U64(0xe85858b0e85858b0), U64(0x4acfcf854acfcf85),
179+ U64(0x6bd0d0bb6bd0d0bb), U64(0x2aefefc52aefefc5),
180+ U64(0xe5aaaa4fe5aaaa4f), U64(0x16fbfbed16fbfbed),
181+ U64(0xc5434386c5434386), U64(0xd74d4d9ad74d4d9a),
182+ U64(0x5533336655333366), U64(0x9485851194858511),
183+ U64(0xcf45458acf45458a), U64(0x10f9f9e910f9f9e9),
184+ U64(0x0602020406020204), U64(0x817f7ffe817f7ffe),
185+ U64(0xf05050a0f05050a0), U64(0x443c3c78443c3c78),
186+ U64(0xba9f9f25ba9f9f25), U64(0xe3a8a84be3a8a84b),
187+ U64(0xf35151a2f35151a2), U64(0xfea3a35dfea3a35d),
188+ U64(0xc0404080c0404080), U64(0x8a8f8f058a8f8f05),
189+ U64(0xad92923fad92923f), U64(0xbc9d9d21bc9d9d21),
190+ U64(0x4838387048383870), U64(0x04f5f5f104f5f5f1),
191+ U64(0xdfbcbc63dfbcbc63), U64(0xc1b6b677c1b6b677),
192+ U64(0x75dadaaf75dadaaf), U64(0x6321214263212142),
193+ U64(0x3010102030101020), U64(0x1affffe51affffe5),
194+ U64(0x0ef3f3fd0ef3f3fd), U64(0x6dd2d2bf6dd2d2bf),
195+ U64(0x4ccdcd814ccdcd81), U64(0x140c0c18140c0c18),
196+ U64(0x3513132635131326), U64(0x2fececc32fececc3),
197+ U64(0xe15f5fbee15f5fbe), U64(0xa2979735a2979735),
198+ U64(0xcc444488cc444488), U64(0x3917172e3917172e),
199+ U64(0x57c4c49357c4c493), U64(0xf2a7a755f2a7a755),
200+ U64(0x827e7efc827e7efc), U64(0x473d3d7a473d3d7a),
201+ U64(0xac6464c8ac6464c8), U64(0xe75d5dbae75d5dba),
202+ U64(0x2b1919322b191932), U64(0x957373e6957373e6),
203+ U64(0xa06060c0a06060c0), U64(0x9881811998818119),
204+ U64(0xd14f4f9ed14f4f9e), U64(0x7fdcdca37fdcdca3),
205+ U64(0x6622224466222244), U64(0x7e2a2a547e2a2a54),
206+ U64(0xab90903bab90903b), U64(0x8388880b8388880b),
207+ U64(0xca46468cca46468c), U64(0x29eeeec729eeeec7),
208+ U64(0xd3b8b86bd3b8b86b), U64(0x3c1414283c141428),
209+ U64(0x79dedea779dedea7), U64(0xe25e5ebce25e5ebc),
210+ U64(0x1d0b0b161d0b0b16), U64(0x76dbdbad76dbdbad),
211+ U64(0x3be0e0db3be0e0db), U64(0x5632326456323264),
212+ U64(0x4e3a3a744e3a3a74), U64(0x1e0a0a141e0a0a14),
213+ U64(0xdb494992db494992), U64(0x0a06060c0a06060c),
214+ U64(0x6c2424486c242448), U64(0xe45c5cb8e45c5cb8),
215+ U64(0x5dc2c29f5dc2c29f), U64(0x6ed3d3bd6ed3d3bd),
216+ U64(0xefacac43efacac43), U64(0xa66262c4a66262c4),
217+ U64(0xa8919139a8919139), U64(0xa4959531a4959531),
218+ U64(0x37e4e4d337e4e4d3), U64(0x8b7979f28b7979f2),
219+ U64(0x32e7e7d532e7e7d5), U64(0x43c8c88b43c8c88b),
220+ U64(0x5937376e5937376e), U64(0xb76d6ddab76d6dda),
221+ U64(0x8c8d8d018c8d8d01), U64(0x64d5d5b164d5d5b1),
222+ U64(0xd24e4e9cd24e4e9c), U64(0xe0a9a949e0a9a949),
223+ U64(0xb46c6cd8b46c6cd8), U64(0xfa5656acfa5656ac),
224+ U64(0x07f4f4f307f4f4f3), U64(0x25eaeacf25eaeacf),
225+ U64(0xaf6565caaf6565ca), U64(0x8e7a7af48e7a7af4),
226+ U64(0xe9aeae47e9aeae47), U64(0x1808081018080810),
227+ U64(0xd5baba6fd5baba6f), U64(0x887878f0887878f0),
228+ U64(0x6f25254a6f25254a), U64(0x722e2e5c722e2e5c),
229+ U64(0x241c1c38241c1c38), U64(0xf1a6a657f1a6a657),
230+ U64(0xc7b4b473c7b4b473), U64(0x51c6c69751c6c697),
231+ U64(0x23e8e8cb23e8e8cb), U64(0x7cdddda17cdddda1),
232+ U64(0x9c7474e89c7474e8), U64(0x211f1f3e211f1f3e),
233+ U64(0xdd4b4b96dd4b4b96), U64(0xdcbdbd61dcbdbd61),
234+ U64(0x868b8b0d868b8b0d), U64(0x858a8a0f858a8a0f),
235+ U64(0x907070e0907070e0), U64(0x423e3e7c423e3e7c),
236+ U64(0xc4b5b571c4b5b571), U64(0xaa6666ccaa6666cc),
237+ U64(0xd8484890d8484890), U64(0x0503030605030306),
238+ U64(0x01f6f6f701f6f6f7), U64(0x120e0e1c120e0e1c),
239+ U64(0xa36161c2a36161c2), U64(0x5f35356a5f35356a),
240+ U64(0xf95757aef95757ae), U64(0xd0b9b969d0b9b969),
241+ U64(0x9186861791868617), U64(0x58c1c19958c1c199),
242+ U64(0x271d1d3a271d1d3a), U64(0xb99e9e27b99e9e27),
243+ U64(0x38e1e1d938e1e1d9), U64(0x13f8f8eb13f8f8eb),
244+ U64(0xb398982bb398982b), U64(0x3311112233111122),
245+ U64(0xbb6969d2bb6969d2), U64(0x70d9d9a970d9d9a9),
246+ U64(0x898e8e07898e8e07), U64(0xa7949433a7949433),
247+ U64(0xb69b9b2db69b9b2d), U64(0x221e1e3c221e1e3c),
248+ U64(0x9287871592878715), U64(0x20e9e9c920e9e9c9),
249+ U64(0x49cece8749cece87), U64(0xff5555aaff5555aa),
250+ U64(0x7828285078282850), U64(0x7adfdfa57adfdfa5),
251+ U64(0x8f8c8c038f8c8c03), U64(0xf8a1a159f8a1a159),
252+ U64(0x8089890980898909), U64(0x170d0d1a170d0d1a),
253+ U64(0xdabfbf65dabfbf65), U64(0x31e6e6d731e6e6d7),
254+ U64(0xc6424284c6424284), U64(0xb86868d0b86868d0),
255+ U64(0xc3414182c3414182), U64(0xb0999929b0999929),
256+ U64(0x772d2d5a772d2d5a), U64(0x110f0f1e110f0f1e),
257+ U64(0xcbb0b07bcbb0b07b), U64(0xfc5454a8fc5454a8),
258+ U64(0xd6bbbb6dd6bbbb6d), U64(0x3a16162c3a16162c)
259+};
260+
261+static const u8 Te4[256] = {
262+ 0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U,
263+ 0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U,
264+ 0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U,
265+ 0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U,
266+ 0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU,
267+ 0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U,
268+ 0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU,
269+ 0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U,
270+ 0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U,
271+ 0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U,
272+ 0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU,
273+ 0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU,
274+ 0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U,
275+ 0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U,
276+ 0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U,
277+ 0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U,
278+ 0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U,
279+ 0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U,
280+ 0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U,
281+ 0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU,
282+ 0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU,
283+ 0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U,
284+ 0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U,
285+ 0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U,
286+ 0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U,
287+ 0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU,
288+ 0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU,
289+ 0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU,
290+ 0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U,
291+ 0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU,
292+ 0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U,
293+ 0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U
294+};
295+
296+static const u64 Td[256] = {
297+ U64(0x50a7f45150a7f451), U64(0x5365417e5365417e),
298+ U64(0xc3a4171ac3a4171a), U64(0x965e273a965e273a),
299+ U64(0xcb6bab3bcb6bab3b), U64(0xf1459d1ff1459d1f),
300+ U64(0xab58faacab58faac), U64(0x9303e34b9303e34b),
301+ U64(0x55fa302055fa3020), U64(0xf66d76adf66d76ad),
302+ U64(0x9176cc889176cc88), U64(0x254c02f5254c02f5),
303+ U64(0xfcd7e54ffcd7e54f), U64(0xd7cb2ac5d7cb2ac5),
304+ U64(0x8044352680443526), U64(0x8fa362b58fa362b5),
305+ U64(0x495ab1de495ab1de), U64(0x671bba25671bba25),
306+ U64(0x980eea45980eea45), U64(0xe1c0fe5de1c0fe5d),
307+ U64(0x02752fc302752fc3), U64(0x12f04c8112f04c81),
308+ U64(0xa397468da397468d), U64(0xc6f9d36bc6f9d36b),
309+ U64(0xe75f8f03e75f8f03), U64(0x959c9215959c9215),
310+ U64(0xeb7a6dbfeb7a6dbf), U64(0xda595295da595295),
311+ U64(0x2d83bed42d83bed4), U64(0xd3217458d3217458),
312+ U64(0x2969e0492969e049), U64(0x44c8c98e44c8c98e),
313+ U64(0x6a89c2756a89c275), U64(0x78798ef478798ef4),
314+ U64(0x6b3e58996b3e5899), U64(0xdd71b927dd71b927),
315+ U64(0xb64fe1beb64fe1be), U64(0x17ad88f017ad88f0),
316+ U64(0x66ac20c966ac20c9), U64(0xb43ace7db43ace7d),
317+ U64(0x184adf63184adf63), U64(0x82311ae582311ae5),
318+ U64(0x6033519760335197), U64(0x457f5362457f5362),
319+ U64(0xe07764b1e07764b1), U64(0x84ae6bbb84ae6bbb),
320+ U64(0x1ca081fe1ca081fe), U64(0x942b08f9942b08f9),
321+ U64(0x5868487058684870), U64(0x19fd458f19fd458f),
322+ U64(0x876cde94876cde94), U64(0xb7f87b52b7f87b52),
323+ U64(0x23d373ab23d373ab), U64(0xe2024b72e2024b72),
324+ U64(0x578f1fe3578f1fe3), U64(0x2aab55662aab5566),
325+ U64(0x0728ebb20728ebb2), U64(0x03c2b52f03c2b52f),
326+ U64(0x9a7bc5869a7bc586), U64(0xa50837d3a50837d3),
327+ U64(0xf2872830f2872830), U64(0xb2a5bf23b2a5bf23),
328+ U64(0xba6a0302ba6a0302), U64(0x5c8216ed5c8216ed),
329+ U64(0x2b1ccf8a2b1ccf8a), U64(0x92b479a792b479a7),
330+ U64(0xf0f207f3f0f207f3), U64(0xa1e2694ea1e2694e),
331+ U64(0xcdf4da65cdf4da65), U64(0xd5be0506d5be0506),
332+ U64(0x1f6234d11f6234d1), U64(0x8afea6c48afea6c4),
333+ U64(0x9d532e349d532e34), U64(0xa055f3a2a055f3a2),
334+ U64(0x32e18a0532e18a05), U64(0x75ebf6a475ebf6a4),
335+ U64(0x39ec830b39ec830b), U64(0xaaef6040aaef6040),
336+ U64(0x069f715e069f715e), U64(0x51106ebd51106ebd),
337+ U64(0xf98a213ef98a213e), U64(0x3d06dd963d06dd96),
338+ U64(0xae053eddae053edd), U64(0x46bde64d46bde64d),
339+ U64(0xb58d5491b58d5491), U64(0x055dc471055dc471),
340+ U64(0x6fd406046fd40604), U64(0xff155060ff155060),
341+ U64(0x24fb981924fb9819), U64(0x97e9bdd697e9bdd6),
342+ U64(0xcc434089cc434089), U64(0x779ed967779ed967),
343+ U64(0xbd42e8b0bd42e8b0), U64(0x888b8907888b8907),
344+ U64(0x385b19e7385b19e7), U64(0xdbeec879dbeec879),
345+ U64(0x470a7ca1470a7ca1), U64(0xe90f427ce90f427c),
346+ U64(0xc91e84f8c91e84f8), U64(0x0000000000000000),
347+ U64(0x8386800983868009), U64(0x48ed2b3248ed2b32),
348+ U64(0xac70111eac70111e), U64(0x4e725a6c4e725a6c),
349+ U64(0xfbff0efdfbff0efd), U64(0x5638850f5638850f),
350+ U64(0x1ed5ae3d1ed5ae3d), U64(0x27392d3627392d36),
351+ U64(0x64d90f0a64d90f0a), U64(0x21a65c6821a65c68),
352+ U64(0xd1545b9bd1545b9b), U64(0x3a2e36243a2e3624),
353+ U64(0xb1670a0cb1670a0c), U64(0x0fe757930fe75793),
354+ U64(0xd296eeb4d296eeb4), U64(0x9e919b1b9e919b1b),
355+ U64(0x4fc5c0804fc5c080), U64(0xa220dc61a220dc61),
356+ U64(0x694b775a694b775a), U64(0x161a121c161a121c),
357+ U64(0x0aba93e20aba93e2), U64(0xe52aa0c0e52aa0c0),
358+ U64(0x43e0223c43e0223c), U64(0x1d171b121d171b12),
359+ U64(0x0b0d090e0b0d090e), U64(0xadc78bf2adc78bf2),
360+ U64(0xb9a8b62db9a8b62d), U64(0xc8a91e14c8a91e14),
361+ U64(0x8519f1578519f157), U64(0x4c0775af4c0775af),
362+ U64(0xbbdd99eebbdd99ee), U64(0xfd607fa3fd607fa3),
363+ U64(0x9f2601f79f2601f7), U64(0xbcf5725cbcf5725c),
364+ U64(0xc53b6644c53b6644), U64(0x347efb5b347efb5b),
365+ U64(0x7629438b7629438b), U64(0xdcc623cbdcc623cb),
366+ U64(0x68fcedb668fcedb6), U64(0x63f1e4b863f1e4b8),
367+ U64(0xcadc31d7cadc31d7), U64(0x1085634210856342),
368+ U64(0x4022971340229713), U64(0x2011c6842011c684),
369+ U64(0x7d244a857d244a85), U64(0xf83dbbd2f83dbbd2),
370+ U64(0x1132f9ae1132f9ae), U64(0x6da129c76da129c7),
371+ U64(0x4b2f9e1d4b2f9e1d), U64(0xf330b2dcf330b2dc),
372+ U64(0xec52860dec52860d), U64(0xd0e3c177d0e3c177),
373+ U64(0x6c16b32b6c16b32b), U64(0x99b970a999b970a9),
374+ U64(0xfa489411fa489411), U64(0x2264e9472264e947),
375+ U64(0xc48cfca8c48cfca8), U64(0x1a3ff0a01a3ff0a0),
376+ U64(0xd82c7d56d82c7d56), U64(0xef903322ef903322),
377+ U64(0xc74e4987c74e4987), U64(0xc1d138d9c1d138d9),
378+ U64(0xfea2ca8cfea2ca8c), U64(0x360bd498360bd498),
379+ U64(0xcf81f5a6cf81f5a6), U64(0x28de7aa528de7aa5),
380+ U64(0x268eb7da268eb7da), U64(0xa4bfad3fa4bfad3f),
381+ U64(0xe49d3a2ce49d3a2c), U64(0x0d9278500d927850),
382+ U64(0x9bcc5f6a9bcc5f6a), U64(0x62467e5462467e54),
383+ U64(0xc2138df6c2138df6), U64(0xe8b8d890e8b8d890),
384+ U64(0x5ef7392e5ef7392e), U64(0xf5afc382f5afc382),
385+ U64(0xbe805d9fbe805d9f), U64(0x7c93d0697c93d069),
386+ U64(0xa92dd56fa92dd56f), U64(0xb31225cfb31225cf),
387+ U64(0x3b99acc83b99acc8), U64(0xa77d1810a77d1810),
388+ U64(0x6e639ce86e639ce8), U64(0x7bbb3bdb7bbb3bdb),
389+ U64(0x097826cd097826cd), U64(0xf418596ef418596e),
390+ U64(0x01b79aec01b79aec), U64(0xa89a4f83a89a4f83),
391+ U64(0x656e95e6656e95e6), U64(0x7ee6ffaa7ee6ffaa),
392+ U64(0x08cfbc2108cfbc21), U64(0xe6e815efe6e815ef),
393+ U64(0xd99be7bad99be7ba), U64(0xce366f4ace366f4a),
394+ U64(0xd4099fead4099fea), U64(0xd67cb029d67cb029),
395+ U64(0xafb2a431afb2a431), U64(0x31233f2a31233f2a),
396+ U64(0x3094a5c63094a5c6), U64(0xc066a235c066a235),
397+ U64(0x37bc4e7437bc4e74), U64(0xa6ca82fca6ca82fc),
398+ U64(0xb0d090e0b0d090e0), U64(0x15d8a73315d8a733),
399+ U64(0x4a9804f14a9804f1), U64(0xf7daec41f7daec41),
400+ U64(0x0e50cd7f0e50cd7f), U64(0x2ff691172ff69117),
401+ U64(0x8dd64d768dd64d76), U64(0x4db0ef434db0ef43),
402+ U64(0x544daacc544daacc), U64(0xdf0496e4df0496e4),
403+ U64(0xe3b5d19ee3b5d19e), U64(0x1b886a4c1b886a4c),
404+ U64(0xb81f2cc1b81f2cc1), U64(0x7f5165467f516546),
405+ U64(0x04ea5e9d04ea5e9d), U64(0x5d358c015d358c01),
406+ U64(0x737487fa737487fa), U64(0x2e410bfb2e410bfb),
407+ U64(0x5a1d67b35a1d67b3), U64(0x52d2db9252d2db92),
408+ U64(0x335610e9335610e9), U64(0x1347d66d1347d66d),
409+ U64(0x8c61d79a8c61d79a), U64(0x7a0ca1377a0ca137),
410+ U64(0x8e14f8598e14f859), U64(0x893c13eb893c13eb),
411+ U64(0xee27a9ceee27a9ce), U64(0x35c961b735c961b7),
412+ U64(0xede51ce1ede51ce1), U64(0x3cb1477a3cb1477a),
413+ U64(0x59dfd29c59dfd29c), U64(0x3f73f2553f73f255),
414+ U64(0x79ce141879ce1418), U64(0xbf37c773bf37c773),
415+ U64(0xeacdf753eacdf753), U64(0x5baafd5f5baafd5f),
416+ U64(0x146f3ddf146f3ddf), U64(0x86db447886db4478),
417+ U64(0x81f3afca81f3afca), U64(0x3ec468b93ec468b9),
418+ U64(0x2c3424382c342438), U64(0x5f40a3c25f40a3c2),
419+ U64(0x72c31d1672c31d16), U64(0x0c25e2bc0c25e2bc),
420+ U64(0x8b493c288b493c28), U64(0x41950dff41950dff),
421+ U64(0x7101a8397101a839), U64(0xdeb30c08deb30c08),
422+ U64(0x9ce4b4d89ce4b4d8), U64(0x90c1566490c15664),
423+ U64(0x6184cb7b6184cb7b), U64(0x70b632d570b632d5),
424+ U64(0x745c6c48745c6c48), U64(0x4257b8d04257b8d0)
425+};
426+static const u8 Td4[256] = {
427+ 0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
428+ 0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
429+ 0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
430+ 0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
431+ 0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
432+ 0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
433+ 0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
434+ 0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
435+ 0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
436+ 0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
437+ 0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
438+ 0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
439+ 0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
440+ 0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
441+ 0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
442+ 0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
443+ 0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
444+ 0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
445+ 0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
446+ 0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
447+ 0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
448+ 0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
449+ 0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
450+ 0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
451+ 0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
452+ 0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
453+ 0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
454+ 0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
455+ 0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
456+ 0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
457+ 0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
458+ 0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU
459+};
460+
461+static const u32 rcon[] = {
462+ 0x00000001U, 0x00000002U, 0x00000004U, 0x00000008U,
463+ 0x00000010U, 0x00000020U, 0x00000040U, 0x00000080U,
464+ 0x0000001bU, 0x00000036U, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
465+};
466+
467+/**
468+ * Expand the cipher key into the encryption key schedule: sets key->rounds to 10, 12 or 14 for 128-, 192- or 256-bit keys and fills key->rd_key with the round keys.
469+ * Returns 0 on success, -1 if userKey or key is NULL, -2 if bits is not 128, 192 or 256. */
470+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
471+ AES_KEY *key) {
472+ /* Te4[] yields u8 values that promote to signed int; each one is cast to u32 before shifting so that << 24 of a byte >= 0x80 is well defined. */
473+ u32 *rk;
474+ int i = 0;
475+ u32 temp;
476+
477+ if (!userKey || !key)
478+ return -1;
479+ if (bits != 128 && bits != 192 && bits != 256)
480+ return -2;
481+
482+ rk = key->rd_key;
483+
484+ if (bits==128)
485+ key->rounds = 10;
486+ else if (bits==192)
487+ key->rounds = 12;
488+ else
489+ key->rounds = 14;
490+ /* the first key words are copied verbatim, loaded in native byte order */
491+ rk[0] = GETU32(userKey );
492+ rk[1] = GETU32(userKey + 4);
493+ rk[2] = GETU32(userKey + 8);
494+ rk[3] = GETU32(userKey + 12);
495+ if (bits == 128) { /* 10 iterations, each deriving the next four words from the previous four */
496+ while (1) {
497+ temp = rk[3];
498+ rk[4] = rk[0] ^ /* S-box applied to the bytes of temp rotated by one byte, then XORed with rcon[i] */
499+ ((u32)Te4[(temp >> 8) & 0xff] ) ^
500+ ((u32)Te4[(temp >> 16) & 0xff] << 8) ^
501+ ((u32)Te4[(temp >> 24) ] << 16) ^
502+ ((u32)Te4[(temp ) & 0xff] << 24) ^
503+ rcon[i];
504+ rk[5] = rk[1] ^ rk[4];
505+ rk[6] = rk[2] ^ rk[5];
506+ rk[7] = rk[3] ^ rk[6];
507+ if (++i == 10) {
508+ return 0;
509+ }
510+ rk += 4;
511+ }
512+ }
513+ rk[4] = GETU32(userKey + 16);
514+ rk[5] = GETU32(userKey + 20);
515+ if (bits == 192) { /* six words per iteration; the 8th iteration stops after four words */
516+ while (1) {
517+ temp = rk[ 5];
518+ rk[ 6] = rk[ 0] ^
519+ ((u32)Te4[(temp >> 8) & 0xff] ) ^
520+ ((u32)Te4[(temp >> 16) & 0xff] << 8) ^
521+ ((u32)Te4[(temp >> 24) ] << 16) ^
522+ ((u32)Te4[(temp ) & 0xff] << 24) ^
523+ rcon[i];
524+ rk[ 7] = rk[ 1] ^ rk[ 6];
525+ rk[ 8] = rk[ 2] ^ rk[ 7];
526+ rk[ 9] = rk[ 3] ^ rk[ 8];
527+ if (++i == 8) {
528+ return 0;
529+ }
530+ rk[10] = rk[ 4] ^ rk[ 9];
531+ rk[11] = rk[ 5] ^ rk[10];
532+ rk += 6;
533+ }
534+ }
535+ rk[6] = GETU32(userKey + 24);
536+ rk[7] = GETU32(userKey + 28);
537+ if (bits == 256) { /* eight words per iteration; the 7th iteration stops after four words */
538+ while (1) {
539+ temp = rk[ 7];
540+ rk[ 8] = rk[ 0] ^
541+ ((u32)Te4[(temp >> 8) & 0xff] ) ^
542+ ((u32)Te4[(temp >> 16) & 0xff] << 8) ^
543+ ((u32)Te4[(temp >> 24) ] << 16) ^
544+ ((u32)Te4[(temp ) & 0xff] << 24) ^
545+ rcon[i];
546+ rk[ 9] = rk[ 1] ^ rk[ 8];
547+ rk[10] = rk[ 2] ^ rk[ 9];
548+ rk[11] = rk[ 3] ^ rk[10];
549+ if (++i == 7) {
550+ return 0;
551+ }
552+ temp = rk[11];
553+ rk[12] = rk[ 4] ^ /* second half: S-box on each byte in place, no byte rotation and no rcon */
554+ ((u32)Te4[(temp ) & 0xff] ) ^
555+ ((u32)Te4[(temp >> 8) & 0xff] << 8) ^
556+ ((u32)Te4[(temp >> 16) & 0xff] << 16) ^
557+ ((u32)Te4[(temp >> 24) ] << 24);
558+ rk[13] = rk[ 5] ^ rk[12];
559+ rk[14] = rk[ 6] ^ rk[13];
560+ rk[15] = rk[ 7] ^ rk[14];
561+
562+ rk += 8;
563+ }
564+ }
565+ return 0;
566+}
567+
568+/**
569+ * Expand the cipher key into the decryption key schedule: builds the encryption schedule, reverses the order of the round keys, then applies the inverse MixColumn transform to every round key except the first and the last.
570+ * Returns 0 on success, or the negative status reported by AES_set_encrypt_key. */
571+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
572+ AES_KEY *key) {
573+
574+ u32 *rk;
575+ int i, j, status;
576+ u32 temp;
577+
578+ /* first, start with an encryption schedule */
579+ status = AES_set_encrypt_key(userKey, bits, key);
580+ if (status < 0)
581+ return status;
582+
583+ rk = key->rd_key;
584+
585+ /* invert the order of the round keys: */
586+ for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) { /* swap 4-word round keys from both ends towards the middle */
587+ temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;
588+ temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
589+ temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
590+ temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
591+ }
592+ /* apply the inverse MixColumn transform to all round keys but the first and the last: */
593+ for (i = 1; i < (key->rounds); i++) {
594+ rk += 4;
595+#if 1
596+ for (j = 0; j < 4; j++) { /* arithmetic variant: works on the four bytes packed in each word, no table lookups */
597+ u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
598+
599+ tp1 = rk[j];
600+ m = tp1 & 0x80808080; /* top bit of each byte: marks the bytes that need the 0x1b reduction after doubling */
601+ tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
602+ ((m - (m >> 7)) & 0x1b1b1b1b); /* tp2 = each byte of tp1 doubled in GF(2^8) */
603+ m = tp2 & 0x80808080;
604+ tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
605+ ((m - (m >> 7)) & 0x1b1b1b1b); /* tp4 = tp1 times 4 */
606+ m = tp4 & 0x80808080;
607+ tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
608+ ((m - (m >> 7)) & 0x1b1b1b1b); /* tp8 = tp1 times 8 */
609+ tp9 = tp8 ^ tp1; /* times 0x09 */
610+ tpb = tp9 ^ tp2; /* times 0x0b */
611+ tpd = tp9 ^ tp4; /* times 0x0d */
612+ tpe = tp8 ^ tp4 ^ tp2; /* times 0x0e */
613+#if defined(ROTATE)
614+ rk[j] = tpe ^ ROTATE(tpd,16) ^
615+ ROTATE(tp9,8) ^ ROTATE(tpb,24);
616+#else
617+ rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^ /* same rotations by 16, 8 and 24 spelled out with shift pairs */
618+ (tp9 >> 24) ^ (tp9 << 8) ^
619+ (tpb >> 8) ^ (tpb << 24);
620+#endif
621+ }
622+#else /* table-driven alternative; never compiled because of the "#if 1" above */
623+ rk[0] =
624+ Td0[Te2[(rk[0] ) & 0xff] & 0xff] ^
625+ Td1[Te2[(rk[0] >> 8) & 0xff] & 0xff] ^
626+ Td2[Te2[(rk[0] >> 16) & 0xff] & 0xff] ^
627+ Td3[Te2[(rk[0] >> 24) ] & 0xff];
628+ rk[1] =
629+ Td0[Te2[(rk[1] ) & 0xff] & 0xff] ^
630+ Td1[Te2[(rk[1] >> 8) & 0xff] & 0xff] ^
631+ Td2[Te2[(rk[1] >> 16) & 0xff] & 0xff] ^
632+ Td3[Te2[(rk[1] >> 24) ] & 0xff];
633+ rk[2] =
634+ Td0[Te2[(rk[2] ) & 0xff] & 0xff] ^
635+ Td1[Te2[(rk[2] >> 8) & 0xff] & 0xff] ^
636+ Td2[Te2[(rk[2] >> 16) & 0xff] & 0xff] ^
637+ Td3[Te2[(rk[2] >> 24) ] & 0xff];
638+ rk[3] =
639+ Td0[Te2[(rk[3] ) & 0xff] & 0xff] ^
640+ Td1[Te2[(rk[3] >> 8) & 0xff] & 0xff] ^
641+ Td2[Te2[(rk[3] >> 16) & 0xff] & 0xff] ^
642+ Td3[Te2[(rk[3] >> 24) ] & 0xff];
643+#endif
644+ }
645+ return 0;
646+}
647+
648+/*
649+ * Encrypt a single 16-byte block from in to out with the key schedule in key
650+ * in and out can overlap: the whole input is loaded into s0..s3 before any output word is stored
651+ */
652+void AES_encrypt(const unsigned char *in, unsigned char *out,
653+ const AES_KEY *key) {
654+ /* Te4[] yields u8 values that promote to signed int; each one is cast to u32 before shifting so that << 24 of a byte >= 0x80 is well defined. */
655+ const u32 *rk;
656+ u32 s0, s1, s2, s3, t[4];
657+ int r;
658+
659+ assert(in && out && key); /* compiled out unless AES_DEBUG is defined, since NDEBUG is forced before <assert.h> is included */
660+ rk = key->rd_key;
661+
662+ /*
663+ * map byte array block to cipher state
664+ * and add initial round key:
665+ */
666+ s0 = GETU32(in ) ^ rk[0];
667+ s1 = GETU32(in + 4) ^ rk[1];
668+ s2 = GETU32(in + 8) ^ rk[2];
669+ s3 = GETU32(in + 12) ^ rk[3];
670+ /* first round: either the 256-byte Te4 table plus word arithmetic, or the Te0..Te3 views */
671+#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
672+ prefetch256(Te4);
673+
674+ t[0] = (u32)Te4[(s0 ) & 0xff] ^
675+ (u32)Te4[(s1 >> 8) & 0xff] << 8 ^
676+ (u32)Te4[(s2 >> 16) & 0xff] << 16 ^
677+ (u32)Te4[(s3 >> 24) ] << 24;
678+ t[1] = (u32)Te4[(s1 ) & 0xff] ^
679+ (u32)Te4[(s2 >> 8) & 0xff] << 8 ^
680+ (u32)Te4[(s3 >> 16) & 0xff] << 16 ^
681+ (u32)Te4[(s0 >> 24) ] << 24;
682+ t[2] = (u32)Te4[(s2 ) & 0xff] ^
683+ (u32)Te4[(s3 >> 8) & 0xff] << 8 ^
684+ (u32)Te4[(s0 >> 16) & 0xff] << 16 ^
685+ (u32)Te4[(s1 >> 24) ] << 24;
686+ t[3] = (u32)Te4[(s3 ) & 0xff] ^
687+ (u32)Te4[(s0 >> 8) & 0xff] << 8 ^
688+ (u32)Te4[(s1 >> 16) & 0xff] << 16 ^
689+ (u32)Te4[(s2 >> 24) ] << 24;
690+
691+ /* now do the linear transform using words */
692+ { int i;
693+ u32 r0, r1, r2;
694+
695+ for (i = 0; i < 4; i++) {
696+ r0 = t[i];
697+ r1 = r0 & 0x80808080; /* top bit of each byte: marks the bytes that need the 0x1b reduction after doubling */
698+ r2 = ((r0 & 0x7f7f7f7f) << 1) ^
699+ ((r1 - (r1 >> 7)) & 0x1b1b1b1b); /* r2 = each byte of r0 doubled in GF(2^8) */
700+#if defined(ROTATE)
701+ t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
702+ ROTATE(r0,16) ^ ROTATE(r0,8);
703+#else
704+ t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^ /* the same rotations spelled out with shift pairs */
705+ (r0 << 16) ^ (r0 >> 16) ^
706+ (r0 << 8) ^ (r0 >> 24);
707+#endif
708+ t[i] ^= rk[4+i]; /* add the first round key (words 4..7) */
709+ }
710+ }
711+#else
712+ t[0] = Te0[(s0 ) & 0xff] ^
713+ Te1[(s1 >> 8) & 0xff] ^
714+ Te2[(s2 >> 16) & 0xff] ^
715+ Te3[(s3 >> 24) ] ^
716+ rk[4];
717+ t[1] = Te0[(s1 ) & 0xff] ^
718+ Te1[(s2 >> 8) & 0xff] ^
719+ Te2[(s3 >> 16) & 0xff] ^
720+ Te3[(s0 >> 24) ] ^
721+ rk[5];
722+ t[2] = Te0[(s2 ) & 0xff] ^
723+ Te1[(s3 >> 8) & 0xff] ^
724+ Te2[(s0 >> 16) & 0xff] ^
725+ Te3[(s1 >> 24) ] ^
726+ rk[6];
727+ t[3] = Te0[(s3 ) & 0xff] ^
728+ Te1[(s0 >> 8) & 0xff] ^
729+ Te2[(s1 >> 16) & 0xff] ^
730+ Te3[(s2 >> 24) ] ^
731+ rk[7];
732+#endif
733+ s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
734+
735+ /*
736+ * Nr - 2 full rounds:
737+ */
738+ for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) { /* rk starts at word 8 and advances one round key per iteration */
739+#if defined(AES_COMPACT_IN_INNER_ROUNDS)
740+ t[0] = (u32)Te4[(s0 ) & 0xff] ^
741+ (u32)Te4[(s1 >> 8) & 0xff] << 8 ^
742+ (u32)Te4[(s2 >> 16) & 0xff] << 16 ^
743+ (u32)Te4[(s3 >> 24) ] << 24;
744+ t[1] = (u32)Te4[(s1 ) & 0xff] ^
745+ (u32)Te4[(s2 >> 8) & 0xff] << 8 ^
746+ (u32)Te4[(s3 >> 16) & 0xff] << 16 ^
747+ (u32)Te4[(s0 >> 24) ] << 24;
748+ t[2] = (u32)Te4[(s2 ) & 0xff] ^
749+ (u32)Te4[(s3 >> 8) & 0xff] << 8 ^
750+ (u32)Te4[(s0 >> 16) & 0xff] << 16 ^
751+ (u32)Te4[(s1 >> 24) ] << 24;
752+ t[3] = (u32)Te4[(s3 ) & 0xff] ^
753+ (u32)Te4[(s0 >> 8) & 0xff] << 8 ^
754+ (u32)Te4[(s1 >> 16) & 0xff] << 16 ^
755+ (u32)Te4[(s2 >> 24) ] << 24;
756+
757+ /* now do the linear transform using words */
758+ { int i;
759+ u32 r0, r1, r2;
760+
761+ for (i = 0; i < 4; i++) {
762+ r0 = t[i];
763+ r1 = r0 & 0x80808080;
764+ r2 = ((r0 & 0x7f7f7f7f) << 1) ^
765+ ((r1 - (r1 >> 7)) & 0x1b1b1b1b);
766+#if defined(ROTATE)
767+ t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
768+ ROTATE(r0,16) ^ ROTATE(r0,8);
769+#else
770+ t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
771+ (r0 << 16) ^ (r0 >> 16) ^
772+ (r0 << 8) ^ (r0 >> 24);
773+#endif
774+ t[i] ^= rk[i];
775+ }
776+ }
777+#else
778+ t[0] = Te0[(s0 ) & 0xff] ^
779+ Te1[(s1 >> 8) & 0xff] ^
780+ Te2[(s2 >> 16) & 0xff] ^
781+ Te3[(s3 >> 24) ] ^
782+ rk[0];
783+ t[1] = Te0[(s1 ) & 0xff] ^
784+ Te1[(s2 >> 8) & 0xff] ^
785+ Te2[(s3 >> 16) & 0xff] ^
786+ Te3[(s0 >> 24) ] ^
787+ rk[1];
788+ t[2] = Te0[(s2 ) & 0xff] ^
789+ Te1[(s3 >> 8) & 0xff] ^
790+ Te2[(s0 >> 16) & 0xff] ^
791+ Te3[(s1 >> 24) ] ^
792+ rk[2];
793+ t[3] = Te0[(s3 ) & 0xff] ^
794+ Te1[(s0 >> 8) & 0xff] ^
795+ Te2[(s1 >> 16) & 0xff] ^
796+ Te3[(s2 >> 24) ] ^
797+ rk[3];
798+#endif
799+ s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
800+ }
801+ /*
802+ * apply last round and
803+ * map cipher state to byte array block:
804+ */
805+#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
806+ prefetch256(Te4);
807+
808+ *(u32*)(out+0) =
809+ (u32)Te4[(s0 ) & 0xff] ^
810+ (u32)Te4[(s1 >> 8) & 0xff] << 8 ^
811+ (u32)Te4[(s2 >> 16) & 0xff] << 16 ^
812+ (u32)Te4[(s3 >> 24) ] << 24 ^
813+ rk[0];
814+ *(u32*)(out+4) =
815+ (u32)Te4[(s1 ) & 0xff] ^
816+ (u32)Te4[(s2 >> 8) & 0xff] << 8 ^
817+ (u32)Te4[(s3 >> 16) & 0xff] << 16 ^
818+ (u32)Te4[(s0 >> 24) ] << 24 ^
819+ rk[1];
820+ *(u32*)(out+8) =
821+ (u32)Te4[(s2 ) & 0xff] ^
822+ (u32)Te4[(s3 >> 8) & 0xff] << 8 ^
823+ (u32)Te4[(s0 >> 16) & 0xff] << 16 ^
824+ (u32)Te4[(s1 >> 24) ] << 24 ^
825+ rk[2];
826+ *(u32*)(out+12) =
827+ (u32)Te4[(s3 ) & 0xff] ^
828+ (u32)Te4[(s0 >> 8) & 0xff] << 8 ^
829+ (u32)Te4[(s1 >> 16) & 0xff] << 16 ^
830+ (u32)Te4[(s2 >> 24) ] << 24 ^
831+ rk[3];
832+#else
833+ *(u32*)(out+0) =
834+ (Te2[(s0 ) & 0xff] & 0x000000ffU) ^
835+ (Te3[(s1 >> 8) & 0xff] & 0x0000ff00U) ^
836+ (Te0[(s2 >> 16) & 0xff] & 0x00ff0000U) ^
837+ (Te1[(s3 >> 24) ] & 0xff000000U) ^
838+ rk[0];
839+ *(u32*)(out+4) =
840+ (Te2[(s1 ) & 0xff] & 0x000000ffU) ^
841+ (Te3[(s2 >> 8) & 0xff] & 0x0000ff00U) ^
842+ (Te0[(s3 >> 16) & 0xff] & 0x00ff0000U) ^
843+ (Te1[(s0 >> 24) ] & 0xff000000U) ^
844+ rk[1];
845+ *(u32*)(out+8) =
846+ (Te2[(s2 ) & 0xff] & 0x000000ffU) ^
847+ (Te3[(s3 >> 8) & 0xff] & 0x0000ff00U) ^
848+ (Te0[(s0 >> 16) & 0xff] & 0x00ff0000U) ^
849+ (Te1[(s1 >> 24) ] & 0xff000000U) ^
850+ rk[2];
851+ *(u32*)(out+12) =
852+ (Te2[(s3 ) & 0xff] & 0x000000ffU) ^
853+ (Te3[(s0 >> 8) & 0xff] & 0x0000ff00U) ^
854+ (Te0[(s1 >> 16) & 0xff] & 0x00ff0000U) ^
855+ (Te1[(s2 >> 24) ] & 0xff000000U) ^
856+ rk[3];
857+#endif
858+}
859+
860+/*
861+ * Decrypt a single block: the state is loaded from in, run through key->rounds rounds using the round-key words in key->rd_key, and stored to out.
862+ * in and out can overlap (all four input words are read into s0..s3 before anything is stored to out)
863+ */
864+void AES_decrypt(const unsigned char *in, unsigned char *out,
865+ const AES_KEY *key) {
866+
867+ const u32 *rk; /* walks through the round-key words */
868+ u32 s0, s1, s2, s3, t[4]; /* s0..s3: current state; t[]: state being computed for the next round */
869+ int r; /* counts the remaining full rounds */
870+
871+ assert(in && out && key); /* all three pointers must be non-NULL */
872+ rk = key->rd_key;
873+
874+ /*
875+ * map byte array block to cipher state
876+ * and add initial round key:
877+ */
878+ s0 = GETU32(in ) ^ rk[0];
879+ s1 = GETU32(in + 4) ^ rk[1];
880+ s2 = GETU32(in + 8) ^ rk[2];
881+ s3 = GETU32(in + 12) ^ rk[3];
882+ /* first round, consuming rk[4..7]: either Td4 lookups followed by a word-wise linear transform, or direct Td0..Td3 word lookups */
883+#if defined(AES_COMPACT_IN_OUTER_ROUNDS)
884+ prefetch256(Td4); /* helper defined elsewhere; presumably warms the cache for Td4 - TODO confirm */
885+
886+ t[0] = Td4[(s0 ) & 0xff] ^
887+ Td4[(s3 >> 8) & 0xff] << 8 ^
888+ Td4[(s2 >> 16) & 0xff] << 16 ^
889+ Td4[(s1 >> 24) ] << 24;
890+ t[1] = Td4[(s1 ) & 0xff] ^
891+ Td4[(s0 >> 8) & 0xff] << 8 ^
892+ Td4[(s3 >> 16) & 0xff] << 16 ^
893+ Td4[(s2 >> 24) ] << 24;
894+ t[2] = Td4[(s2 ) & 0xff] ^
895+ Td4[(s1 >> 8) & 0xff] << 8 ^
896+ Td4[(s0 >> 16) & 0xff] << 16 ^
897+ Td4[(s3 >> 24) ] << 24;
898+ t[3] = Td4[(s3 ) & 0xff] ^
899+ Td4[(s2 >> 8) & 0xff] << 8 ^
900+ Td4[(s1 >> 16) & 0xff] << 16 ^
901+ Td4[(s0 >> 24) ] << 24;
902+
903+ /* now do the linear transform using words: the four bytes packed in each t[i] are processed in parallel */
904+ { int i;
905+ u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
906+
907+ for (i = 0; i < 4; i++) {
908+ tp1 = t[i];
909+ m = tp1 & 0x80808080; /* top bit of every byte */
910+ tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
911+ ((m - (m >> 7)) & 0x1b1b1b1b); /* tp2: every byte of tp1 doubled, 0x1b XORed into bytes whose top bit was set */
912+ m = tp2 & 0x80808080;
913+ tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
914+ ((m - (m >> 7)) & 0x1b1b1b1b); /* tp4: tp2 doubled the same way */
915+ m = tp4 & 0x80808080;
916+ tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
917+ ((m - (m >> 7)) & 0x1b1b1b1b); /* tp8: tp4 doubled the same way */
918+ tp9 = tp8 ^ tp1; /* 9 * tp1 (8x ^ 1x), byte-wise */
919+ tpb = tp9 ^ tp2; /* 0xb * tp1 (8x ^ 2x ^ 1x) */
920+ tpd = tp9 ^ tp4; /* 0xd * tp1 (8x ^ 4x ^ 1x) */
921+ tpe = tp8 ^ tp4 ^ tp2; /* 0xe * tp1 (8x ^ 4x ^ 2x) */
922+#if defined(ROTATE)
923+ t[i] = tpe ^ ROTATE(tpd,16) ^
924+ ROTATE(tp9,8) ^ ROTATE(tpb,24);
925+#else
926+ t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^ /* with a 32-bit u32 these shift pairs rotate tpd by 16, */
927+ (tp9 >> 24) ^ (tp9 << 8) ^ /* tp9 left by 8 and */
928+ (tpb >> 8) ^ (tpb << 24); /* tpb left by 24 */
929+#endif
930+ t[i] ^= rk[4+i]; /* mix in round-key words rk[4..7] */
931+ }
932+ }
933+#else
934+ t[0] = Td0[(s0 ) & 0xff] ^
935+ Td1[(s3 >> 8) & 0xff] ^
936+ Td2[(s2 >> 16) & 0xff] ^
937+ Td3[(s1 >> 24) ] ^
938+ rk[4];
939+ t[1] = Td0[(s1 ) & 0xff] ^
940+ Td1[(s0 >> 8) & 0xff] ^
941+ Td2[(s3 >> 16) & 0xff] ^
942+ Td3[(s2 >> 24) ] ^
943+ rk[5];
944+ t[2] = Td0[(s2 ) & 0xff] ^
945+ Td1[(s1 >> 8) & 0xff] ^
946+ Td2[(s0 >> 16) & 0xff] ^
947+ Td3[(s3 >> 24) ] ^
948+ rk[6];
949+ t[3] = Td0[(s3 ) & 0xff] ^
950+ Td1[(s2 >> 8) & 0xff] ^
951+ Td2[(s1 >> 16) & 0xff] ^
952+ Td3[(s0 >> 24) ] ^
953+ rk[7];
954+#endif
955+ s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3]; /* the freshly computed words become the current state */
956+
957+ /*
958+ * Nr - 2 full rounds: rk starts at word 8 of the schedule and advances by four words per round
959+ */
960+ for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
961+#if defined(AES_COMPACT_IN_INNER_ROUNDS)
962+ t[0] = Td4[(s0 ) & 0xff] ^
963+ Td4[(s3 >> 8) & 0xff] << 8 ^
964+ Td4[(s2 >> 16) & 0xff] << 16 ^
965+ Td4[(s1 >> 24) ] << 24;
966+ t[1] = Td4[(s1 ) & 0xff] ^
967+ Td4[(s0 >> 8) & 0xff] << 8 ^
968+ Td4[(s3 >> 16) & 0xff] << 16 ^
969+ Td4[(s2 >> 24) ] << 24;
970+ t[2] = Td4[(s2 ) & 0xff] ^
971+ Td4[(s1 >> 8) & 0xff] << 8 ^
972+ Td4[(s0 >> 16) & 0xff] << 16 ^
973+ Td4[(s3 >> 24) ] << 24;
974+ t[3] = Td4[(s3 ) & 0xff] ^
975+ Td4[(s2 >> 8) & 0xff] << 8 ^
976+ Td4[(s1 >> 16) & 0xff] << 16 ^
977+ Td4[(s0 >> 24) ] << 24;
978+
979+ /* now do the linear transform using words (same computation as in the first round, but with rk[i]) */
980+ { int i;
981+ u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
982+
983+ for (i = 0; i < 4; i++) {
984+ tp1 = t[i];
985+ m = tp1 & 0x80808080; /* top bit of every byte */
986+ tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
987+ ((m - (m >> 7)) & 0x1b1b1b1b); /* byte-wise doubling with 0x1b folded in where the top bit was set */
988+ m = tp2 & 0x80808080;
989+ tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
990+ ((m - (m >> 7)) & 0x1b1b1b1b);
991+ m = tp4 & 0x80808080;
992+ tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
993+ ((m - (m >> 7)) & 0x1b1b1b1b);
994+ tp9 = tp8 ^ tp1; /* 9 * tp1 */
995+ tpb = tp9 ^ tp2; /* 0xb * tp1 */
996+ tpd = tp9 ^ tp4; /* 0xd * tp1 */
997+ tpe = tp8 ^ tp4 ^ tp2; /* 0xe * tp1 */
998+#if defined(ROTATE)
999+ t[i] = tpe ^ ROTATE(tpd,16) ^
1000+ ROTATE(tp9,8) ^ ROTATE(tpb,24);
1001+#else
1002+ t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
1003+ (tp9 >> 24) ^ (tp9 << 8) ^
1004+ (tpb >> 8) ^ (tpb << 24);
1005+#endif
1006+ t[i] ^= rk[i]; /* mix in this round's key word */
1007+ }
1008+ }
1009+#else
1010+ t[0] = Td0[(s0 ) & 0xff] ^
1011+ Td1[(s3 >> 8) & 0xff] ^
1012+ Td2[(s2 >> 16) & 0xff] ^
1013+ Td3[(s1 >> 24) ] ^
1014+ rk[0];
1015+ t[1] = Td0[(s1 ) & 0xff] ^
1016+ Td1[(s0 >> 8) & 0xff] ^
1017+ Td2[(s3 >> 16) & 0xff] ^
1018+ Td3[(s2 >> 24) ] ^
1019+ rk[1];
1020+ t[2] = Td0[(s2 ) & 0xff] ^
1021+ Td1[(s1 >> 8) & 0xff] ^
1022+ Td2[(s0 >> 16) & 0xff] ^
1023+ Td3[(s3 >> 24) ] ^
1024+ rk[2];
1025+ t[3] = Td0[(s3 ) & 0xff] ^
1026+ Td1[(s2 >> 8) & 0xff] ^
1027+ Td2[(s1 >> 16) & 0xff] ^
1028+ Td3[(s0 >> 24) ] ^
1029+ rk[3];
1030+#endif
1031+ s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
1032+ }
1033+ /*
1034+ * apply last round (Td4 lookups and the final round key only, no linear transform) and
1035+ * map cipher state to byte array block: NOTE(review): each word is stored through a u32 pointer cast of out, which presumably relies on the target tolerating such stores and on native byte order matching GETU32 - confirm
1036+ */
1037+ prefetch256(Td4);
1038+
1039+ *(u32*)(out+0) =
1040+ (Td4[(s0 ) & 0xff]) ^
1041+ (Td4[(s3 >> 8) & 0xff] << 8) ^
1042+ (Td4[(s2 >> 16) & 0xff] << 16) ^
1043+ (Td4[(s1 >> 24) ] << 24) ^
1044+ rk[0];
1045+ *(u32*)(out+4) =
1046+ (Td4[(s1 ) & 0xff]) ^
1047+ (Td4[(s0 >> 8) & 0xff] << 8) ^
1048+ (Td4[(s3 >> 16) & 0xff] << 16) ^
1049+ (Td4[(s2 >> 24) ] << 24) ^
1050+ rk[1];
1051+ *(u32*)(out+8) =
1052+ (Td4[(s2 ) & 0xff]) ^
1053+ (Td4[(s1 >> 8) & 0xff] << 8) ^
1054+ (Td4[(s0 >> 16) & 0xff] << 16) ^
1055+ (Td4[(s3 >> 24) ] << 24) ^
1056+ rk[2];
1057+ *(u32*)(out+12) =
1058+ (Td4[(s3 ) & 0xff]) ^
1059+ (Td4[(s2 >> 8) & 0xff] << 8) ^
1060+ (Td4[(s1 >> 16) & 0xff] << 16) ^
1061+ (Td4[(s0 >> 24) ] << 24) ^
1062+ rk[3];
1063+}
--- /dev/null
+++ b/crypto/aes/asm/aes-armv4.pl
@@ -0,0 +1,1030 @@
1+#!/usr/bin/env perl
2+
3+# ====================================================================
4+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5+# project. The module is, however, dual licensed under OpenSSL and
6+# CRYPTOGAMS licenses depending on where you obtain it. For further
7+# details see http://www.openssl.org/~appro/cryptogams/.
8+# ====================================================================
9+
10+# AES for ARMv4
11+
12+# January 2007.
13+#
14+# Code uses a single 1K S-box and is >2 times faster than code generated
15+# by gcc-3.4.1. This is thanks to a unique feature of the ARMv4 ISA, which
16+# allows merging a logical or arithmetic operation with a shift or rotate
17+# in one instruction and emitting a combined result every cycle. The module
18+# is endian-neutral. The performance is ~42 cycles/byte for a 128-bit
19+# key.
20+
21+# May 2007.
22+#
23+# AES_set_[en|de]crypt_key is added.
24+
25+$s0="r0";
26+$s1="r1";
27+$s2="r2";
28+$s3="r3";
29+$t1="r4";
30+$t2="r5";
31+$t3="r6";
32+$i1="r7";
33+$i2="r8";
34+$i3="r9";
35+
36+$tbl="r10";
37+$key="r11";
38+$rounds="r12";
39+
40+$code=<<___;
41+.text
42+.code 32
43+
44+.type AES_Te,%object
45+.align 5
46+AES_Te:
47+.word 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
48+.word 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
49+.word 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
50+.word 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
51+.word 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
52+.word 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
53+.word 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
54+.word 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
55+.word 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
56+.word 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
57+.word 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
58+.word 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
59+.word 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
60+.word 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
61+.word 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
62+.word 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
63+.word 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
64+.word 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
65+.word 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
66+.word 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
67+.word 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
68+.word 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
69+.word 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
70+.word 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
71+.word 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
72+.word 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
73+.word 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
74+.word 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
75+.word 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
76+.word 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
77+.word 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
78+.word 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
79+.word 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
80+.word 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
81+.word 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
82+.word 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
83+.word 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
84+.word 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
85+.word 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
86+.word 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
87+.word 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
88+.word 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
89+.word 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
90+.word 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
91+.word 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
92+.word 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
93+.word 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
94+.word 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
95+.word 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
96+.word 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
97+.word 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
98+.word 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
99+.word 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
100+.word 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
101+.word 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
102+.word 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
103+.word 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
104+.word 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
105+.word 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
106+.word 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
107+.word 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
108+.word 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
109+.word 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
110+.word 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
111+@ Te4[256]
112+.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
113+.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
114+.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
115+.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
116+.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
117+.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
118+.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
119+.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
120+.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
121+.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
122+.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
123+.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
124+.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
125+.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
126+.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
127+.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
128+.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
129+.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
130+.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
131+.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
132+.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
133+.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
134+.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
135+.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
136+.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
137+.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
138+.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
139+.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
140+.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
141+.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
142+.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
143+.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
144+@ rcon[]
145+.word 0x01000000, 0x02000000, 0x04000000, 0x08000000
146+.word 0x10000000, 0x20000000, 0x40000000, 0x80000000
147+.word 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
148+.size AES_Te,.-AES_Te
149+
150+@ void AES_encrypt(const unsigned char *in, unsigned char *out,
151+@ const AES_KEY *key) {
152+.global AES_encrypt @ public entry: loads the block into four words, calls _armv4_AES_encrypt, stores the result
153+.type AES_encrypt,%function
154+.align 5
155+AES_encrypt:
156+ sub r3,pc,#8 @ AES_encrypt
157+ stmdb sp!,{r1,r4-r12,lr} @ r1 (out) is saved too and popped separately after the rounds
158+ mov $rounds,r0 @ inp
159+ mov $key,r2 @ key
160+ sub $tbl,r3,#AES_encrypt-AES_Te @ Te
161+ @ each state word is assembled byte by byte; the byte at the lowest offset lands in bits 31..24
162+ ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
163+ ldrb $t1,[$rounds,#2] @ manner...
164+ ldrb $t2,[$rounds,#1]
165+ ldrb $t3,[$rounds,#0]
166+ orr $s0,$s0,$t1,lsl#8
167+ orr $s0,$s0,$t2,lsl#16
168+ orr $s0,$s0,$t3,lsl#24
169+ ldrb $s1,[$rounds,#7]
170+ ldrb $t1,[$rounds,#6]
171+ ldrb $t2,[$rounds,#5]
172+ ldrb $t3,[$rounds,#4]
173+ orr $s1,$s1,$t1,lsl#8
174+ orr $s1,$s1,$t2,lsl#16
175+ orr $s1,$s1,$t3,lsl#24
176+ ldrb $s2,[$rounds,#11]
177+ ldrb $t1,[$rounds,#10]
178+ ldrb $t2,[$rounds,#9]
179+ ldrb $t3,[$rounds,#8]
180+ orr $s2,$s2,$t1,lsl#8
181+ orr $s2,$s2,$t2,lsl#16
182+ orr $s2,$s2,$t3,lsl#24
183+ ldrb $s3,[$rounds,#15]
184+ ldrb $t1,[$rounds,#14]
185+ ldrb $t2,[$rounds,#13]
186+ ldrb $t3,[$rounds,#12]
187+ orr $s3,$s3,$t1,lsl#8
188+ orr $s3,$s3,$t2,lsl#16
189+ orr $s3,$s3,$t3,lsl#24
190+ @ run all rounds on the four state words; the result comes back in the same registers
191+ bl _armv4_AES_encrypt
192+
193+ ldr $rounds,[sp],#4 @ pop out
194+ mov $t1,$s0,lsr#24 @ write output in endian-neutral
195+ mov $t2,$s0,lsr#16 @ manner...
196+ mov $t3,$s0,lsr#8
197+ strb $t1,[$rounds,#0]
198+ strb $t2,[$rounds,#1]
199+ strb $t3,[$rounds,#2]
200+ strb $s0,[$rounds,#3]
201+ mov $t1,$s1,lsr#24
202+ mov $t2,$s1,lsr#16
203+ mov $t3,$s1,lsr#8
204+ strb $t1,[$rounds,#4]
205+ strb $t2,[$rounds,#5]
206+ strb $t3,[$rounds,#6]
207+ strb $s1,[$rounds,#7]
208+ mov $t1,$s2,lsr#24
209+ mov $t2,$s2,lsr#16
210+ mov $t3,$s2,lsr#8
211+ strb $t1,[$rounds,#8]
212+ strb $t2,[$rounds,#9]
213+ strb $t3,[$rounds,#10]
214+ strb $s2,[$rounds,#11]
215+ mov $t1,$s3,lsr#24
216+ mov $t2,$s3,lsr#16
217+ mov $t3,$s3,lsr#8
218+ strb $t1,[$rounds,#12]
219+ strb $t2,[$rounds,#13]
220+ strb $t3,[$rounds,#14]
221+ strb $s3,[$rounds,#15]
222+ @ restore the remaining saved registers and return
223+ ldmia sp!,{r4-r12,lr}
224+ tst lr,#1
225+ moveq pc,lr @ be binary compatible with V4, yet
226+ bx lr @ interoperable with Thumb ISA:-)
227+.size AES_encrypt,.-AES_encrypt
228+
229+.type _armv4_AES_encrypt,%function
230+.align 2
231+_armv4_AES_encrypt: @ in: state words in s0-s3, tbl = table base, key = schedule; out: encrypted state in s0-s3
232+ str lr,[sp,#-4]! @ push lr
233+ ldr $t1,[$key],#16 @ first four round-key words; key advances by 16 bytes
234+ ldr $t2,[$key,#-12]
235+ ldr $t3,[$key,#-8]
236+ ldr $i1,[$key,#-4]
237+ ldr $rounds,[$key,#240-16] @ round count, read from byte offset 240 of the key structure
238+ eor $s0,$s0,$t1
239+ eor $s1,$s1,$t2
240+ eor $s2,$s2,$t3
241+ eor $s3,$s3,$i1
242+ sub $rounds,$rounds,#1 @ the loop below runs rounds-1 times
243+ mov lr,#255 @ lr is free after the push and serves as the byte mask
244+ @ full round: every lookup uses the same table base; the eor steps apply ror#8/16/24 to get what the comments call Te1..Te3
245+.Lenc_loop:
246+ and $i2,lr,$s0,lsr#8
247+ and $i3,lr,$s0,lsr#16
248+ and $i1,lr,$s0
249+ mov $s0,$s0,lsr#24
250+ ldr $t1,[$tbl,$i1,lsl#2] @ Te3[s0>>0]
251+ ldr $s0,[$tbl,$s0,lsl#2] @ Te0[s0>>24]
252+ ldr $t2,[$tbl,$i2,lsl#2] @ Te2[s0>>8]
253+ ldr $t3,[$tbl,$i3,lsl#2] @ Te1[s0>>16]
254+
255+ and $i1,lr,$s1,lsr#16 @ i0
256+ and $i2,lr,$s1
257+ and $i3,lr,$s1,lsr#8
258+ mov $s1,$s1,lsr#24
259+ ldr $i1,[$tbl,$i1,lsl#2] @ Te1[s1>>16]
260+ ldr $s1,[$tbl,$s1,lsl#2] @ Te0[s1>>24]
261+ ldr $i2,[$tbl,$i2,lsl#2] @ Te3[s1>>0]
262+ ldr $i3,[$tbl,$i3,lsl#2] @ Te2[s1>>8]
263+ eor $s0,$s0,$i1,ror#8
264+ eor $s1,$s1,$t1,ror#24
265+ eor $t2,$t2,$i2,ror#8
266+ eor $t3,$t3,$i3,ror#8
267+
268+ and $i1,lr,$s2,lsr#8 @ i0
269+ and $i2,lr,$s2,lsr#16 @ i1
270+ and $i3,lr,$s2
271+ mov $s2,$s2,lsr#24
272+ ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8]
273+ ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
274+ ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
275+ ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0]
276+ eor $s0,$s0,$i1,ror#16
277+ eor $s1,$s1,$i2,ror#8
278+ eor $s2,$s2,$t2,ror#16
279+ eor $t3,$t3,$i3,ror#16
280+
281+ and $i1,lr,$s3 @ i0
282+ and $i2,lr,$s3,lsr#8 @ i1
283+ and $i3,lr,$s3,lsr#16 @ i2
284+ mov $s3,$s3,lsr#24
285+ ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0]
286+ ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
287+ ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16]
288+ ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
289+ eor $s0,$s0,$i1,ror#24
290+ eor $s1,$s1,$i2,ror#16
291+ eor $s2,$s2,$i3,ror#8
292+ eor $s3,$s3,$t3,ror#8
293+ @ add this round's four key words and step key to the next group
294+ ldr $t1,[$key],#16
295+ ldr $t2,[$key,#-12]
296+ ldr $t3,[$key,#-8]
297+ ldr $i1,[$key,#-4]
298+ eor $s0,$s0,$t1
299+ eor $s1,$s1,$t2
300+ eor $s2,$s2,$t3
301+ eor $s3,$s3,$i1
302+
303+ subs $rounds,$rounds,#1
304+ bne .Lenc_loop
305+ @ last round: single-byte loads taken from offset 2 of each table entry, combined with shifts instead of rotations
306+ add $tbl,$tbl,#2
307+
308+ and $i1,lr,$s0
309+ and $i2,lr,$s0,lsr#8
310+ and $i3,lr,$s0,lsr#16
311+ mov $s0,$s0,lsr#24
312+ ldrb $t1,[$tbl,$i1,lsl#2] @ Te4[s0>>0]
313+ ldrb $s0,[$tbl,$s0,lsl#2] @ Te4[s0>>24]
314+ ldrb $t2,[$tbl,$i2,lsl#2] @ Te4[s0>>8]
315+ ldrb $t3,[$tbl,$i3,lsl#2] @ Te4[s0>>16]
316+
317+ and $i1,lr,$s1,lsr#16 @ i0
318+ and $i2,lr,$s1
319+ and $i3,lr,$s1,lsr#8
320+ mov $s1,$s1,lsr#24
321+ ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s1>>16]
322+ ldrb $s1,[$tbl,$s1,lsl#2] @ Te4[s1>>24]
323+ ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s1>>0]
324+ ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s1>>8]
325+ eor $s0,$i1,$s0,lsl#8
326+ eor $s1,$t1,$s1,lsl#24
327+ eor $t2,$i2,$t2,lsl#8
328+ eor $t3,$i3,$t3,lsl#8
329+
330+ and $i1,lr,$s2,lsr#8 @ i0
331+ and $i2,lr,$s2,lsr#16 @ i1
332+ and $i3,lr,$s2
333+ mov $s2,$s2,lsr#24
334+ ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8]
335+ ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
336+ ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
337+ ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0]
338+ eor $s0,$i1,$s0,lsl#8
339+ eor $s1,$s1,$i2,lsl#16
340+ eor $s2,$t2,$s2,lsl#24
341+ eor $t3,$i3,$t3,lsl#8
342+
343+ and $i1,lr,$s3 @ i0
344+ and $i2,lr,$s3,lsr#8 @ i1
345+ and $i3,lr,$s3,lsr#16 @ i2
346+ mov $s3,$s3,lsr#24
347+ ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0]
348+ ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
349+ ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16]
350+ ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
351+ eor $s0,$i1,$s0,lsl#8
352+ eor $s1,$s1,$i2,lsl#8
353+ eor $s2,$s2,$i3,lsl#16
354+ eor $s3,$t3,$s3,lsl#24
355+
356+ ldr lr,[sp],#4 @ pop lr
357+ ldr $t1,[$key,#0] @ final four round-key words (key is not advanced here)
358+ ldr $t2,[$key,#4]
359+ ldr $t3,[$key,#8]
360+ ldr $i1,[$key,#12]
361+ eor $s0,$s0,$t1
362+ eor $s1,$s1,$t2
363+ eor $s2,$s2,$t3
364+ eor $s3,$s3,$i1
365+
366+ sub $tbl,$tbl,#2 @ undo the +2 so tbl is back at the table base
367+ mov pc,lr @ return
368+.size _armv4_AES_encrypt,.-_armv4_AES_encrypt
369+
370+.global AES_set_encrypt_key @ r0 = key bytes (inp), r1 = bits, r2 = key structure; returns 0 in r0 on success, -1 on bad arguments
371+.type AES_set_encrypt_key,%function
372+.align 5
373+AES_set_encrypt_key:
374+ sub r3,pc,#8 @ AES_set_encrypt_key
375+ teq r0,#0 @ null input pointer is rejected
376+ moveq r0,#-1
377+ beq .Labrt
378+ teq r2,#0 @ null key-structure pointer is rejected
379+ moveq r0,#-1
380+ beq .Labrt
381+ @ only 128, 192 and 256 bits are accepted; nothing has been pushed yet on the abort paths
382+ teq r1,#128
383+ beq .Lok
384+ teq r1,#192
385+ beq .Lok
386+ teq r1,#256
387+ movne r0,#-1
388+ bne .Labrt
389+
390+.Lok: stmdb sp!,{r4-r12,lr}
391+ sub $tbl,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4
392+
393+ mov $rounds,r0 @ inp
394+ mov lr,r1 @ bits
395+ mov $key,r2 @ key
396+ @ the first 16 input bytes become round-key words 0..3
397+ ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
398+ ldrb $t1,[$rounds,#2] @ manner...
399+ ldrb $t2,[$rounds,#1]
400+ ldrb $t3,[$rounds,#0]
401+ orr $s0,$s0,$t1,lsl#8
402+ orr $s0,$s0,$t2,lsl#16
403+ orr $s0,$s0,$t3,lsl#24
404+ ldrb $s1,[$rounds,#7]
405+ ldrb $t1,[$rounds,#6]
406+ ldrb $t2,[$rounds,#5]
407+ ldrb $t3,[$rounds,#4]
408+ orr $s1,$s1,$t1,lsl#8
409+ orr $s1,$s1,$t2,lsl#16
410+ orr $s1,$s1,$t3,lsl#24
411+ ldrb $s2,[$rounds,#11]
412+ ldrb $t1,[$rounds,#10]
413+ ldrb $t2,[$rounds,#9]
414+ ldrb $t3,[$rounds,#8]
415+ orr $s2,$s2,$t1,lsl#8
416+ orr $s2,$s2,$t2,lsl#16
417+ orr $s2,$s2,$t3,lsl#24
418+ ldrb $s3,[$rounds,#15]
419+ ldrb $t1,[$rounds,#14]
420+ ldrb $t2,[$rounds,#13]
421+ ldrb $t3,[$rounds,#12]
422+ orr $s3,$s3,$t1,lsl#8
423+ orr $s3,$s3,$t2,lsl#16
424+ orr $s3,$s3,$t3,lsl#24
425+ str $s0,[$key],#16
426+ str $s1,[$key,#-12]
427+ str $s2,[$key,#-8]
428+ str $s3,[$key,#-4]
429+ @ 128-bit case: round count 10 is stored at byte offset 240 of the structure (key has already advanced by 16)
430+ teq lr,#128
431+ bne .Lnot128
432+ mov $rounds,#10
433+ str $rounds,[$key,#240-16]
434+ add $t3,$tbl,#256 @ rcon
435+ mov lr,#255 @ byte mask; the bits value is no longer needed on this path
436+ @ each pass: substitute the bytes of the last word, move them by one byte position, xor rcon, then chain the xor through four words
437+.L128_loop:
438+ and $t2,lr,$s3,lsr#24
439+ and $i1,lr,$s3,lsr#16
440+ and $i2,lr,$s3,lsr#8
441+ and $i3,lr,$s3
442+ ldrb $t2,[$tbl,$t2]
443+ ldrb $i1,[$tbl,$i1]
444+ ldrb $i2,[$tbl,$i2]
445+ ldrb $i3,[$tbl,$i3]
446+ ldr $t1,[$t3],#4 @ rcon[i++]
447+ orr $t2,$t2,$i1,lsl#24
448+ orr $t2,$t2,$i2,lsl#16
449+ orr $t2,$t2,$i3,lsl#8
450+ eor $t2,$t2,$t1
451+ eor $s0,$s0,$t2 @ rk[4]=rk[0]^...
452+ eor $s1,$s1,$s0 @ rk[5]=rk[1]^rk[4]
453+ eor $s2,$s2,$s1 @ rk[6]=rk[2]^rk[5]
454+ eor $s3,$s3,$s2 @ rk[7]=rk[3]^rk[6]
455+ str $s0,[$key],#16
456+ str $s1,[$key,#-12]
457+ str $s2,[$key,#-8]
458+ str $s3,[$key,#-4]
459+
460+ subs $rounds,$rounds,#1
461+ bne .L128_loop
462+ sub r2,$key,#176 @ r2 = start of the key structure again (16 + 10*16 bytes were stepped over)
463+ b .Ldone
464+ @ 192- and 256-bit keys: input bytes 16..23 become key words 4 and 5
465+.Lnot128:
466+ ldrb $i2,[$rounds,#19]
467+ ldrb $t1,[$rounds,#18]
468+ ldrb $t2,[$rounds,#17]
469+ ldrb $t3,[$rounds,#16]
470+ orr $i2,$i2,$t1,lsl#8
471+ orr $i2,$i2,$t2,lsl#16
472+ orr $i2,$i2,$t3,lsl#24
473+ ldrb $i3,[$rounds,#23]
474+ ldrb $t1,[$rounds,#22]
475+ ldrb $t2,[$rounds,#21]
476+ ldrb $t3,[$rounds,#20]
477+ orr $i3,$i3,$t1,lsl#8
478+ orr $i3,$i3,$t2,lsl#16
479+ orr $i3,$i3,$t3,lsl#24
480+ str $i2,[$key],#8
481+ str $i3,[$key,#-4]
482+ @ 192-bit case: round count 12 stored at offset 240 (key has advanced by 24), then 8 loop passes
483+ teq lr,#192
484+ bne .Lnot192
485+ mov $rounds,#12
486+ str $rounds,[$key,#240-24]
487+ add $t3,$tbl,#256 @ rcon
488+ mov lr,#255
489+ mov $rounds,#8 @ the register is reused as the loop counter
490+
491+.L192_loop:
492+ and $t2,lr,$i3,lsr#24
493+ and $i1,lr,$i3,lsr#16
494+ and $i2,lr,$i3,lsr#8
495+ and $i3,lr,$i3
496+ ldrb $t2,[$tbl,$t2]
497+ ldrb $i1,[$tbl,$i1]
498+ ldrb $i2,[$tbl,$i2]
499+ ldrb $i3,[$tbl,$i3]
500+ ldr $t1,[$t3],#4 @ rcon[i++]
501+ orr $t2,$t2,$i1,lsl#24
502+ orr $t2,$t2,$i2,lsl#16
503+ orr $t2,$t2,$i3,lsl#8
504+ eor $i3,$t2,$t1
505+ eor $s0,$s0,$i3 @ rk[6]=rk[0]^...
506+ eor $s1,$s1,$s0 @ rk[7]=rk[1]^rk[6]
507+ eor $s2,$s2,$s1 @ rk[8]=rk[2]^rk[7]
508+ eor $s3,$s3,$s2 @ rk[9]=rk[3]^rk[8]
509+ str $s0,[$key],#24
510+ str $s1,[$key,#-20]
511+ str $s2,[$key,#-16]
512+ str $s3,[$key,#-12]
513+
514+ subs $rounds,$rounds,#1
515+ subeq r2,$key,#216 @ on the last pass: r2 = start of the key structure again (24 + 8*24 bytes)
516+ beq .Ldone
517+ @ not the last pass: also produce the two remaining words of this six-word group
518+ ldr $i1,[$key,#-32]
519+ ldr $i2,[$key,#-28]
520+ eor $i1,$i1,$s3 @ rk[10]=rk[4]^rk[9]
521+ eor $i3,$i2,$i1 @ rk[11]=rk[5]^rk[10]
522+ str $i1,[$key,#-8]
523+ str $i3,[$key,#-4]
524+ b .L192_loop
525+ @ 256-bit keys: input bytes 24..31 become key words 6 and 7
526+.Lnot192:
527+ ldrb $i2,[$rounds,#27]
528+ ldrb $t1,[$rounds,#26]
529+ ldrb $t2,[$rounds,#25]
530+ ldrb $t3,[$rounds,#24]
531+ orr $i2,$i2,$t1,lsl#8
532+ orr $i2,$i2,$t2,lsl#16
533+ orr $i2,$i2,$t3,lsl#24
534+ ldrb $i3,[$rounds,#31]
535+ ldrb $t1,[$rounds,#30]
536+ ldrb $t2,[$rounds,#29]
537+ ldrb $t3,[$rounds,#28]
538+ orr $i3,$i3,$t1,lsl#8
539+ orr $i3,$i3,$t2,lsl#16
540+ orr $i3,$i3,$t3,lsl#24
541+ str $i2,[$key],#8
542+ str $i3,[$key,#-4]
543+ @ round count 14 stored at offset 240 (key has advanced by 32), then 7 loop passes
544+ mov $rounds,#14
545+ str $rounds,[$key,#240-32]
546+ add $t3,$tbl,#256 @ rcon
547+ mov lr,#255
548+ mov $rounds,#7 @ the register is reused as the loop counter
549+
550+.L256_loop:
551+ and $t2,lr,$i3,lsr#24
552+ and $i1,lr,$i3,lsr#16
553+ and $i2,lr,$i3,lsr#8
554+ and $i3,lr,$i3
555+ ldrb $t2,[$tbl,$t2]
556+ ldrb $i1,[$tbl,$i1]
557+ ldrb $i2,[$tbl,$i2]
558+ ldrb $i3,[$tbl,$i3]
559+ ldr $t1,[$t3],#4 @ rcon[i++]
560+ orr $t2,$t2,$i1,lsl#24
561+ orr $t2,$t2,$i2,lsl#16
562+ orr $t2,$t2,$i3,lsl#8
563+ eor $i3,$t2,$t1
564+ eor $s0,$s0,$i3 @ rk[8]=rk[0]^...
565+ eor $s1,$s1,$s0 @ rk[9]=rk[1]^rk[8]
566+ eor $s2,$s2,$s1 @ rk[10]=rk[2]^rk[9]
567+ eor $s3,$s3,$s2 @ rk[11]=rk[3]^rk[10]
568+ str $s0,[$key],#32
569+ str $s1,[$key,#-28]
570+ str $s2,[$key,#-24]
571+ str $s3,[$key,#-20]
572+
573+ subs $rounds,$rounds,#1
574+ subeq r2,$key,#256 @ on the last pass: r2 = start of the key structure again (32 + 7*32 bytes)
575+ beq .Ldone
576+ @ second half of the pass: byte-substitute the last new word in place (no byte move, no rcon) and chain through the previous group's upper four words
577+ and $t2,lr,$s3
578+ and $i1,lr,$s3,lsr#8
579+ and $i2,lr,$s3,lsr#16
580+ and $i3,lr,$s3,lsr#24
581+ ldrb $t2,[$tbl,$t2]
582+ ldrb $i1,[$tbl,$i1]
583+ ldrb $i2,[$tbl,$i2]
584+ ldrb $i3,[$tbl,$i3]
585+ orr $t2,$t2,$i1,lsl#8
586+ orr $t2,$t2,$i2,lsl#16
587+ orr $t2,$t2,$i3,lsl#24
588+
589+ ldr $t1,[$key,#-48]
590+ ldr $i1,[$key,#-44]
591+ ldr $i2,[$key,#-40]
592+ ldr $i3,[$key,#-36]
593+ eor $t1,$t1,$t2 @ rk[12]=rk[4]^...
594+ eor $i1,$i1,$t1 @ rk[13]=rk[5]^rk[12]
595+ eor $i2,$i2,$i1 @ rk[14]=rk[6]^rk[13]
596+ eor $i3,$i3,$i2 @ rk[15]=rk[7]^rk[14]
597+ str $t1,[$key,#-16]
598+ str $i1,[$key,#-12]
599+ str $i2,[$key,#-8]
600+ str $i3,[$key,#-4]
601+ b .L256_loop
602+ @ common exit: .Ldone sets the success code and restores registers; .Labrt is entered with r0 already set and nothing to pop
603+.Ldone: mov r0,#0
604+ ldmia sp!,{r4-r12,lr}
605+.Labrt: tst lr,#1
606+ moveq pc,lr @ be binary compatible with V4, yet
607+ bx lr @ interoperable with Thumb ISA:-)
608+.size AES_set_encrypt_key,.-AES_set_encrypt_key
609+
610+.global AES_set_decrypt_key @ builds the encryption schedule, reverses the order of its 16-byte round keys, then transforms the inner ones word by word
611+.type AES_set_decrypt_key,%function
612+.align 5
613+AES_set_decrypt_key:
614+ str lr,[sp,#-4]! @ push lr
615+ bl AES_set_encrypt_key
616+ teq r0,#0 @ non-zero means the key setup failed
617+ ldrne lr,[sp],#4 @ pop lr
618+ bne .Labrt
619+ @ lr stays on the stack and is popped together with r4-r12 at the end
620+ stmdb sp!,{r4-r12}
621+
622+ ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2,
623+ mov $key,r2 @ which is AES_KEY *key
624+ mov $i1,r2
625+ add $i2,r2,$rounds,lsl#4
626+ @ swap 16-byte round keys from both ends, one pointer moving up and one down, until the two pointers are equal
627+.Linv: ldr $s0,[$i1]
628+ ldr $s1,[$i1,#4]
629+ ldr $s2,[$i1,#8]
630+ ldr $s3,[$i1,#12]
631+ ldr $t1,[$i2]
632+ ldr $t2,[$i2,#4]
633+ ldr $t3,[$i2,#8]
634+ ldr $i3,[$i2,#12]
635+ str $s0,[$i2],#-16
636+ str $s1,[$i2,#16+4]
637+ str $s2,[$i2,#16+8]
638+ str $s3,[$i2,#16+12]
639+ str $t1,[$i1],#16
640+ str $t2,[$i1,#-12]
641+ str $t3,[$i1,#-8]
642+ str $i3,[$i1,#-4]
643+ teq $i1,$i2
644+ bne .Linv
645+___
646+$mask80=$i1;
647+$mask1b=$i2;
648+$mask7f=$i3;
649+$code.=<<___;
650+ ldr $s0,[$key,#16]! @ prefetch tp1
651+ mov $mask80,#0x80
652+ mov $mask1b,#0x1b
653+ orr $mask80,$mask80,#0x8000
654+ orr $mask1b,$mask1b,#0x1b00
655+ orr $mask80,$mask80,$mask80,lsl#16 @ 0x80808080
656+ orr $mask1b,$mask1b,$mask1b,lsl#16 @ 0x1b1b1b1b
657+ sub $rounds,$rounds,#1
658+ mvn $mask7f,$mask80 @ 0x7f7f7f7f
659+ mov $rounds,$rounds,lsl#2 @ (rounds-1)*4
660+ @ one key word per pass, starting at word 4: tp2, tp4, tp8 are successive byte-wise doublings of tp1 with 0x1b folded in
661+.Lmix: and $t1,$s0,$mask80
662+ and $s1,$s0,$mask7f
663+ sub $t1,$t1,$t1,lsr#7
664+ and $t1,$t1,$mask1b
665+ eor $s1,$t1,$s1,lsl#1 @ tp2
666+
667+ and $t1,$s1,$mask80
668+ and $s2,$s1,$mask7f
669+ sub $t1,$t1,$t1,lsr#7
670+ and $t1,$t1,$mask1b
671+ eor $s2,$t1,$s2,lsl#1 @ tp4
672+
673+ and $t1,$s2,$mask80
674+ and $s3,$s2,$mask7f
675+ sub $t1,$t1,$t1,lsr#7
676+ and $t1,$t1,$mask1b
677+ eor $s3,$t1,$s3,lsl#1 @ tp8
678+
679+ eor $t1,$s1,$s2
680+ eor $t2,$s0,$s3 @ tp9
681+ eor $t1,$t1,$s3 @ tpe
682+ eor $t1,$t1,$s1,ror#24
683+ eor $t1,$t1,$t2,ror#24 @ ^= ROTATE(tpb=tp9^tp2,8)
684+ eor $t1,$t1,$s2,ror#16
685+ eor $t1,$t1,$t2,ror#16 @ ^= ROTATE(tpd=tp9^tp4,16)
686+ eor $t1,$t1,$t2,ror#8 @ ^= ROTATE(tp9,24)
687+
688+ ldr $s0,[$key,#4] @ prefetch tp1
689+ str $t1,[$key],#4
690+ subs $rounds,$rounds,#1
691+ bne .Lmix
692+
693+ mov r0,#0 @ success
694+ ldmia sp!,{r4-r12,lr}
695+ tst lr,#1
696+ moveq pc,lr @ be binary compatible with V4, yet
697+ bx lr @ interoperable with Thumb ISA:-)
698+.size AES_set_decrypt_key,.-AES_set_decrypt_key
699+
700+.type AES_Td,%object
701+.align 5
702+AES_Td:
703+.word 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
704+.word 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
705+.word 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
706+.word 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
707+.word 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
708+.word 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
709+.word 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
710+.word 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
711+.word 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
712+.word 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
713+.word 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
714+.word 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
715+.word 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
716+.word 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
717+.word 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
718+.word 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
719+.word 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
720+.word 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
721+.word 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
722+.word 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
723+.word 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
724+.word 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
725+.word 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
726+.word 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
727+.word 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
728+.word 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
729+.word 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
730+.word 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
731+.word 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
732+.word 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
733+.word 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
734+.word 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
735+.word 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
736+.word 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
737+.word 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
738+.word 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
739+.word 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
740+.word 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
741+.word 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
742+.word 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
743+.word 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
744+.word 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
745+.word 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
746+.word 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
747+.word 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
748+.word 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
749+.word 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
750+.word 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
751+.word 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
752+.word 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
753+.word 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
754+.word 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
755+.word 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
756+.word 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
757+.word 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
758+.word 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
759+.word 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
760+.word 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
761+.word 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
762+.word 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
763+.word 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
764+.word 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
765+.word 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
766+.word 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
767+@ Td4[256]
768+.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
769+.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
770+.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
771+.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
772+.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
773+.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
774+.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
775+.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
776+.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
777+.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
778+.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
779+.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
780+.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
781+.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
782+.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
783+.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
784+.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
785+.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
786+.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
787+.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
788+.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
789+.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
790+.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
791+.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
792+.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
793+.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
794+.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
795+.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
796+.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
797+.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
798+.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
799+.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
800+.size AES_Td,.-AES_Td
801+
802+@ void AES_decrypt(const unsigned char *in, unsigned char *out,
803+@ const AES_KEY *key) {
804+.global AES_decrypt
805+.type AES_decrypt,%function
806+.align 5
807+AES_decrypt: @ public entry: decrypts the 16 bytes at in and writes 16 bytes to out
808+ sub r3,pc,#8 @ AES_decrypt
809+ stmdb sp!,{r1,r4-r12,lr} @ push the out pointer (r1) together with r4-r12 and lr
810+ mov $rounds,r0 @ inp
811+ mov $key,r2 @ key
812+ sub $tbl,r3,#AES_decrypt-AES_Td @ Td
813+
814+ ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
815+ ldrb $t1,[$rounds,#2] @ manner...
816+ ldrb $t2,[$rounds,#1]
817+ ldrb $t3,[$rounds,#0]
818+ orr $s0,$s0,$t1,lsl#8
819+ orr $s0,$s0,$t2,lsl#16
820+ orr $s0,$s0,$t3,lsl#24
821+ ldrb $s1,[$rounds,#7]
822+ ldrb $t1,[$rounds,#6]
823+ ldrb $t2,[$rounds,#5]
824+ ldrb $t3,[$rounds,#4]
825+ orr $s1,$s1,$t1,lsl#8
826+ orr $s1,$s1,$t2,lsl#16
827+ orr $s1,$s1,$t3,lsl#24
828+ ldrb $s2,[$rounds,#11]
829+ ldrb $t1,[$rounds,#10]
830+ ldrb $t2,[$rounds,#9]
831+ ldrb $t3,[$rounds,#8]
832+ orr $s2,$s2,$t1,lsl#8
833+ orr $s2,$s2,$t2,lsl#16
834+ orr $s2,$s2,$t3,lsl#24
835+ ldrb $s3,[$rounds,#15]
836+ ldrb $t1,[$rounds,#14]
837+ ldrb $t2,[$rounds,#13]
838+ ldrb $t3,[$rounds,#12]
839+ orr $s3,$s3,$t1,lsl#8
840+ orr $s3,$s3,$t2,lsl#16
841+ orr $s3,$s3,$t3,lsl#24
842+
843+ bl _armv4_AES_decrypt @ transform the four state words s0-s3 in place
844+
845+ ldr $rounds,[sp],#4 @ pop out
846+ mov $t1,$s0,lsr#24 @ write output in endian-neutral
847+ mov $t2,$s0,lsr#16 @ manner...
848+ mov $t3,$s0,lsr#8
849+ strb $t1,[$rounds,#0]
850+ strb $t2,[$rounds,#1]
851+ strb $t3,[$rounds,#2]
852+ strb $s0,[$rounds,#3]
853+ mov $t1,$s1,lsr#24
854+ mov $t2,$s1,lsr#16
855+ mov $t3,$s1,lsr#8
856+ strb $t1,[$rounds,#4]
857+ strb $t2,[$rounds,#5]
858+ strb $t3,[$rounds,#6]
859+ strb $s1,[$rounds,#7]
860+ mov $t1,$s2,lsr#24
861+ mov $t2,$s2,lsr#16
862+ mov $t3,$s2,lsr#8
863+ strb $t1,[$rounds,#8]
864+ strb $t2,[$rounds,#9]
865+ strb $t3,[$rounds,#10]
866+ strb $s2,[$rounds,#11]
867+ mov $t1,$s3,lsr#24
868+ mov $t2,$s3,lsr#16
869+ mov $t3,$s3,lsr#8
870+ strb $t1,[$rounds,#12]
871+ strb $t2,[$rounds,#13]
872+ strb $t3,[$rounds,#14]
873+ strb $s3,[$rounds,#15]
874+
875+ ldmia sp!,{r4-r12,lr} @ restore the remaining registers pushed on entry
876+ tst lr,#1 @ bit 0 of lr set: the return address is Thumb code
877+ moveq pc,lr @ be binary compatible with V4, yet
878+ bx lr @ interoperable with Thumb ISA:-)
879+.size AES_decrypt,.-AES_decrypt
880+
881+.type _armv4_AES_decrypt,%function
882+.align 2
883+_armv4_AES_decrypt: @ internal core: state in s0-s3, key schedule pointer in key, table pointer in tbl
884+ str lr,[sp,#-4]! @ push lr
885+ ldr $t1,[$key],#16 @ first round key; key pointer advances by 16
886+ ldr $t2,[$key,#-12]
887+ ldr $t3,[$key,#-8]
888+ ldr $i1,[$key,#-4]
889+ ldr $rounds,[$key,#240-16] @ round count, stored at offset 240 of the key structure
890+ eor $s0,$s0,$t1 @ initial round-key addition
891+ eor $s1,$s1,$t2
892+ eor $s2,$s2,$t3
893+ eor $s3,$s3,$i1
894+ sub $rounds,$rounds,#1 @ the last round is done separately after the loop
895+ mov lr,#255 @ lr serves as the byte mask from here on
896+
897+.Ldec_loop:
898+ and $i1,lr,$s0,lsr#16
899+ and $i2,lr,$s0,lsr#8
900+ and $i3,lr,$s0
901+ mov $s0,$s0,lsr#24
902+ ldr $t1,[$tbl,$i1,lsl#2] @ Td1[s0>>16]
903+ ldr $s0,[$tbl,$s0,lsl#2] @ Td0[s0>>24]
904+ ldr $t2,[$tbl,$i2,lsl#2] @ Td2[s0>>8]
905+ ldr $t3,[$tbl,$i3,lsl#2] @ Td3[s0>>0]
906+
907+ and $i1,lr,$s1 @ i0
908+ and $i2,lr,$s1,lsr#16
909+ and $i3,lr,$s1,lsr#8
910+ mov $s1,$s1,lsr#24
911+ ldr $i1,[$tbl,$i1,lsl#2] @ Td3[s1>>0]
912+ ldr $s1,[$tbl,$s1,lsl#2] @ Td0[s1>>24]
913+ ldr $i2,[$tbl,$i2,lsl#2] @ Td1[s1>>16]
914+ ldr $i3,[$tbl,$i3,lsl#2] @ Td2[s1>>8]
915+ eor $s0,$s0,$i1,ror#24
916+ eor $s1,$s1,$t1,ror#8
917+ eor $t2,$i2,$t2,ror#8
918+ eor $t3,$i3,$t3,ror#8
919+
920+ and $i1,lr,$s2,lsr#8 @ i0
921+ and $i2,lr,$s2 @ i1
922+ and $i3,lr,$s2,lsr#16
923+ mov $s2,$s2,lsr#24
924+ ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8]
925+ ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
926+ ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
927+ ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16]
928+ eor $s0,$s0,$i1,ror#16
929+ eor $s1,$s1,$i2,ror#24
930+ eor $s2,$s2,$t2,ror#8
931+ eor $t3,$i3,$t3,ror#8
932+
933+ and $i1,lr,$s3,lsr#16 @ i0
934+ and $i2,lr,$s3,lsr#8 @ i1
935+ and $i3,lr,$s3 @ i2
936+ mov $s3,$s3,lsr#24
937+ ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16]
938+ ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
939+ ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0]
940+ ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
941+ eor $s0,$s0,$i1,ror#8
942+ eor $s1,$s1,$i2,ror#16
943+ eor $s2,$s2,$i3,ror#24
944+ eor $s3,$s3,$t3,ror#8
945+
946+ ldr $t1,[$key],#16 @ next round key; key pointer advances by 16
947+ ldr $t2,[$key,#-12]
948+ ldr $t3,[$key,#-8]
949+ ldr $i1,[$key,#-4]
950+ eor $s0,$s0,$t1
951+ eor $s1,$s1,$t2
952+ eor $s2,$s2,$t3
953+ eor $s3,$s3,$i1
954+
955+ subs $rounds,$rounds,#1
956+ bne .Ldec_loop
957+
958+ add $tbl,$tbl,#1024 @ step over the 1024 bytes of word entries to reach Td4
959+
960+ ldr $t1,[$tbl,#0] @ prefetch Td4
961+ ldr $t2,[$tbl,#32]
962+ ldr $t3,[$tbl,#64]
963+ ldr $i1,[$tbl,#96]
964+ ldr $i2,[$tbl,#128]
965+ ldr $i3,[$tbl,#160]
966+ ldr $t1,[$tbl,#192]
967+ ldr $t2,[$tbl,#224]
968+
969+ and $i1,lr,$s0,lsr#16
970+ and $i2,lr,$s0,lsr#8
971+ and $i3,lr,$s0
972+ ldrb $s0,[$tbl,$s0,lsr#24] @ Td4[s0>>24]
973+ ldrb $t1,[$tbl,$i1] @ Td4[s0>>16]
974+ ldrb $t2,[$tbl,$i2] @ Td4[s0>>8]
975+ ldrb $t3,[$tbl,$i3] @ Td4[s0>>0]
976+
977+ and $i1,lr,$s1 @ i0
978+ and $i2,lr,$s1,lsr#16
979+ and $i3,lr,$s1,lsr#8
980+ ldrb $i1,[$tbl,$i1] @ Td4[s1>>0]
981+ ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24]
982+ ldrb $i2,[$tbl,$i2] @ Td4[s1>>16]
983+ ldrb $i3,[$tbl,$i3] @ Td4[s1>>8]
984+ eor $s0,$i1,$s0,lsl#24
985+ eor $s1,$t1,$s1,lsl#8
986+ eor $t2,$t2,$i2,lsl#8
987+ eor $t3,$t3,$i3,lsl#8
988+
989+ and $i1,lr,$s2,lsr#8 @ i0
990+ and $i2,lr,$s2 @ i1
991+ and $i3,lr,$s2,lsr#16
992+ ldrb $i1,[$tbl,$i1] @ Td4[s2>>8]
993+ ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
994+ ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
995+ ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
996+ eor $s0,$s0,$i1,lsl#8
997+ eor $s1,$i2,$s1,lsl#16
998+ eor $s2,$t2,$s2,lsl#16
999+ eor $t3,$t3,$i3,lsl#16
1000+
1001+ and $i1,lr,$s3,lsr#16 @ i0
1002+ and $i2,lr,$s3,lsr#8 @ i1
1003+ and $i3,lr,$s3 @ i2
1004+ ldrb $i1,[$tbl,$i1] @ Td4[s3>>16]
1005+ ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
1006+ ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
1007+ ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
1008+ eor $s0,$s0,$i1,lsl#16
1009+ eor $s1,$s1,$i2,lsl#8
1010+ eor $s2,$i3,$s2,lsl#8
1011+ eor $s3,$t3,$s3,lsl#24
1012+
1013+ ldr lr,[sp],#4 @ pop lr
1014+ ldr $t1,[$key,#0] @ last round key
1015+ ldr $t2,[$key,#4]
1016+ ldr $t3,[$key,#8]
1017+ ldr $i1,[$key,#12]
1018+ eor $s0,$s0,$t1
1019+ eor $s1,$s1,$t2
1020+ eor $s2,$s2,$t3
1021+ eor $s3,$s3,$i1
1022+
1023+ sub $tbl,$tbl,#1024 @ undo the earlier +1024 so tbl again points at the table start
1024+ mov pc,lr @ return
1025+.size _armv4_AES_decrypt,.-_armv4_AES_decrypt
1026+.asciz "AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
1027+___
1028+
1029+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # emit "bx lr" as a raw instruction word to make it possible to compile with -march=armv4
1030+print $code; # write out the accumulated assembly text
--- /dev/null
+++ b/crypto/aes/asm/aes-ppc.pl
@@ -0,0 +1,1176 @@
1+#!/usr/bin/env perl
2+
3+# ====================================================================
4+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5+# project. The module is, however, dual licensed under OpenSSL and
6+# CRYPTOGAMS licenses depending on where you obtain it. For further
7+# details see http://www.openssl.org/~appro/cryptogams/.
8+# ====================================================================
9+
10+# Needs more work: key setup, page boundaries, CBC routine...
11+#
12+# ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
13+# 128-bit key, which is ~40% better than 64-bit code generated by gcc
14+# 4.0. But these are not the ones currently used! Their "compact"
15+# counterparts are, for security reasons. ppc_AES_encrypt_compact runs
16+# at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact -
17+# at 1/3 of ppc_AES_decrypt.
18+
19+$flavour = shift; # first argument: target flavour, must contain "64" or "32"
20+
21+if ($flavour =~ /64/) { # 64-bit: 8-byte slots, doubleword store/load mnemonics
22+ $SIZE_T =8;
23+ $STU ="stdu";
24+ $POP ="ld";
25+ $PUSH ="std";
26+} elsif ($flavour =~ /32/) { # 32-bit: 4-byte slots, word store/load mnemonics
27+ $SIZE_T =4;
28+ $STU ="stwu";
29+ $POP ="lwz";
30+ $PUSH ="stw";
31+} else { die "nonsense $flavour"; }
32+
33+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's path
34+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
35+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
36+die "can't locate ppc-xlate.pl";
37+
38+open(STDOUT,"| $^X $xlate $flavour ".shift) || die "can't call $xlate: $!"; # parenthesised so that || tests open's result, not the (always true) command string
39+
40+$FRAME=32*$SIZE_T; # stack frame size used by the entry points: 32 slots of SIZE_T bytes
41+
42+sub _data_word()
43+{ # append each argument to $code as a pair of identical .long words, stopping at the first undefined one
44+ foreach my $word (@_) { last unless defined($word); $code.=sprintf("\t.long\t0x%08x,0x%08x\n",$word,$word); }
45+}
46+
47+$sp="r1"; # stack pointer
48+$toc="r2"; # saved and restored by the entry points
49+$inp="r3"; # input block pointer
50+$out="r4"; # output block pointer
51+$key="r5"; # key schedule pointer
52+
53+$Tbl0="r3"; # table base; shares r3 with $inp, which is read before LAES_Te/LAES_Td overwrite it
54+$Tbl1="r6";
55+$Tbl2="r7";
56+$Tbl3="r2"; # shares r2 with $toc (reassigned below for 32-bit builds)
57+
58+$s0="r8"; # $s0-$s3: the four 32-bit state words
59+$s1="r9";
60+$s2="r10";
61+$s3="r11";
62+
63+$t0="r12"; # $t0-$t3: round-key words / temporaries
64+$t1="r13";
65+$t2="r14";
66+$t3="r15";
67+
68+$acc00="r16"; # $acc00-$acc15: per-byte scratch registers
69+$acc01="r17";
70+$acc02="r18";
71+$acc03="r19";
72+
73+$acc04="r20";
74+$acc05="r21";
75+$acc06="r22";
76+$acc07="r23";
77+
78+$acc08="r24";
79+$acc09="r25";
80+$acc10="r26";
81+$acc11="r27";
82+
83+$acc12="r28";
84+$acc13="r29";
85+$acc14="r30";
86+$acc15="r31";
87+
88+# stay away from TLS pointer
89+if ($SIZE_T==8) { die if ($t1 ne "r13"); $t1="r0"; } # 64-bit: $t1 is moved from r13 to r0
90+else { die if ($Tbl3 ne "r2"); $Tbl3=$t0; $t0="r0"; } # 32-bit: $Tbl3 takes over $t0's register (r12) and $t0 moves to r0
91+$mask80=$Tbl2; # the compact routines reuse $Tbl2's register for the 0x80808080 mask
92+$mask1b=$Tbl3; # ... and $Tbl3's register for the 0x1b1b1b1b mask
93+
94+$code.=<<___;
95+.machine "any"
96+.text
97+
98+.align 7
99+LAES_Te: # position-independent helper: returns the address of the encryption tables in Tbl0
100+ mflr r0 # keep the caller's return address in r0
101+ bcl 20,31,\$+4 # branch-and-link to the next instruction, so lr holds its address
102+ mflr $Tbl0 ; vvvvv "distance" between . and 1st data entry
103+ addi $Tbl0,$Tbl0,`128-8`
104+ mtlr r0 # put the caller's return address back
105+ blr
106+ .space `32-24` # pad the 24-byte stub (six instructions) to 32 bytes
107+LAES_Td: # same technique, returns the address of the decryption tables in Tbl0
108+ mflr r0
109+ bcl 20,31,\$+4
110+ mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry
111+ addi $Tbl0,$Tbl0,`128-8-32+2048+256` # also skips the 2048-byte Te word table and the 256-byte Te4 byte table
112+ mtlr r0
113+ blr
114+ .space `128-32-24` # pad so that the table data starts 128 bytes after LAES_Te
115+___
116+&_data_word(
117+ 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
118+ 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
119+ 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
120+ 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
121+ 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
122+ 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
123+ 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
124+ 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
125+ 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
126+ 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
127+ 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
128+ 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
129+ 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
130+ 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
131+ 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
132+ 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
133+ 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
134+ 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
135+ 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
136+ 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
137+ 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
138+ 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
139+ 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
140+ 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
141+ 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
142+ 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
143+ 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
144+ 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
145+ 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
146+ 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
147+ 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
148+ 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
149+ 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
150+ 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
151+ 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
152+ 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
153+ 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
154+ 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
155+ 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
156+ 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
157+ 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
158+ 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
159+ 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
160+ 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
161+ 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
162+ 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
163+ 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
164+ 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
165+ 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
166+ 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
167+ 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
168+ 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
169+ 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
170+ 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
171+ 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
172+ 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
173+ 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
174+ 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
175+ 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
176+ 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
177+ 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
178+ 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
179+ 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
180+ 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
181+$code.=<<___;
182+.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
183+.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
184+.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
185+.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
186+.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
187+.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
188+.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
189+.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
190+.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
191+.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
192+.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
193+.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
194+.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
195+.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
196+.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
197+.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
198+.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
199+.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
200+.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
201+.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
202+.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
203+.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
204+.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
205+.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
206+.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
207+.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
208+.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
209+.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
210+.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
211+.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
212+.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
213+.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
214+___
215+&_data_word(
216+ 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
217+ 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
218+ 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
219+ 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
220+ 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
221+ 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
222+ 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
223+ 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
224+ 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
225+ 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
226+ 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
227+ 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
228+ 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
229+ 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
230+ 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
231+ 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
232+ 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
233+ 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
234+ 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
235+ 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
236+ 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
237+ 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
238+ 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
239+ 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
240+ 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
241+ 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
242+ 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
243+ 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
244+ 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
245+ 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
246+ 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
247+ 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
248+ 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
249+ 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
250+ 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
251+ 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
252+ 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
253+ 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
254+ 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
255+ 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
256+ 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
257+ 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
258+ 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
259+ 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
260+ 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
261+ 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
262+ 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
263+ 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
264+ 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
265+ 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
266+ 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
267+ 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
268+ 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
269+ 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
270+ 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
271+ 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
272+ 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
273+ 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
274+ 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
275+ 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
276+ 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
277+ 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
278+ 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
279+ 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
280+$code.=<<___;
281+.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
282+.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
283+.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
284+.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
285+.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
286+.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
287+.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
288+.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
289+.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
290+.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
291+.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
292+.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
293+.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
294+.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
295+.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
296+.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
297+.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
298+.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
299+.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
300+.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
301+.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
302+.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
303+.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
304+.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
305+.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
306+.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
307+.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
308+.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
309+.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
310+.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
311+.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
312+.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
313+
314+
315+.globl .AES_encrypt
316+.align 7
317+.AES_encrypt: # public entry: reads 16 bytes via the input pointer, writes 16 bytes via the output pointer
318+ mflr r0
319+ $STU $sp,-$FRAME($sp) # allocate the stack frame
320+
321+ $PUSH r0,`$FRAME-$SIZE_T*21`($sp) # save the return address (in r0), then r2 and r13-r31
322+ $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
323+ $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
324+ $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
325+ $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
326+ $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
327+ $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
328+ $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
329+ $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
330+ $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
331+ $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
332+ $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
333+ $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
334+ $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
335+ $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
336+ $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
337+ $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
338+ $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
339+ $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
340+ $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
341+ $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
342+
343+ lwz $s0,0($inp) # load the input block as four 32-bit words
344+ lwz $s1,4($inp)
345+ lwz $s2,8($inp)
346+ lwz $s3,12($inp)
347+ bl LAES_Te # Tbl0 = address of the encryption tables (reuses the input pointer register)
348+ bl Lppc_AES_encrypt_compact # the compact variant is the one called here
349+ stw $s0,0($out) # store the four result words
350+ stw $s1,4($out)
351+ stw $s2,8($out)
352+ stw $s3,12($out)
353+
354+ $POP r0,`$FRAME-$SIZE_T*21`($sp) # reload everything saved on entry
355+ $POP $toc,`$FRAME-$SIZE_T*20`($sp)
356+ $POP r13,`$FRAME-$SIZE_T*19`($sp)
357+ $POP r14,`$FRAME-$SIZE_T*18`($sp)
358+ $POP r15,`$FRAME-$SIZE_T*17`($sp)
359+ $POP r16,`$FRAME-$SIZE_T*16`($sp)
360+ $POP r17,`$FRAME-$SIZE_T*15`($sp)
361+ $POP r18,`$FRAME-$SIZE_T*14`($sp)
362+ $POP r19,`$FRAME-$SIZE_T*13`($sp)
363+ $POP r20,`$FRAME-$SIZE_T*12`($sp)
364+ $POP r21,`$FRAME-$SIZE_T*11`($sp)
365+ $POP r22,`$FRAME-$SIZE_T*10`($sp)
366+ $POP r23,`$FRAME-$SIZE_T*9`($sp)
367+ $POP r24,`$FRAME-$SIZE_T*8`($sp)
368+ $POP r25,`$FRAME-$SIZE_T*7`($sp)
369+ $POP r26,`$FRAME-$SIZE_T*6`($sp)
370+ $POP r27,`$FRAME-$SIZE_T*5`($sp)
371+ $POP r28,`$FRAME-$SIZE_T*4`($sp)
372+ $POP r29,`$FRAME-$SIZE_T*3`($sp)
373+ $POP r30,`$FRAME-$SIZE_T*2`($sp)
374+ $POP r31,`$FRAME-$SIZE_T*1`($sp)
375+ mtlr r0
376+ addi $sp,$sp,$FRAME # release the stack frame
377+ blr
378+
379+.align 4
380+Lppc_AES_encrypt: # table-driven variant: state in s0-s3, key pointer in key, table base in Tbl0
381+ lwz $acc00,240($key) # round count, stored at offset 240 of the key structure
382+ lwz $t0,0($key)
383+ lwz $t1,4($key)
384+ lwz $t2,8($key)
385+ lwz $t3,12($key)
386+ addi $Tbl1,$Tbl0,3 # Tbl1-Tbl3 address the same table at byte offsets 3, 2 and 1
387+ addi $Tbl2,$Tbl0,2
388+ addi $Tbl3,$Tbl0,1
389+ addi $acc00,$acc00,-1 # the loop runs rounds-1 times; the last round follows it
390+ addi $key,$key,16
391+ xor $s0,$s0,$t0 # initial round-key addition
392+ xor $s1,$s1,$t1
393+ xor $s2,$s2,$t2
394+ xor $s3,$s3,$t3
395+ mtctr $acc00
396+.align 4
397+Lenc_loop:
398+ rlwinm $acc00,$s0,`32-24+3`,21,28 # top byte of s0 scaled by 8 (each table entry is two identical words)
399+ rlwinm $acc01,$s1,`32-24+3`,21,28
400+ lwz $t0,0($key)
401+ lwz $t1,4($key)
402+ rlwinm $acc02,$s2,`32-24+3`,21,28
403+ rlwinm $acc03,$s3,`32-24+3`,21,28
404+ lwz $t2,8($key)
405+ lwz $t3,12($key)
406+ rlwinm $acc04,$s1,`32-16+3`,21,28
407+ rlwinm $acc05,$s2,`32-16+3`,21,28
408+ lwzx $acc00,$Tbl0,$acc00
409+ lwzx $acc01,$Tbl0,$acc01
410+ rlwinm $acc06,$s3,`32-16+3`,21,28
411+ rlwinm $acc07,$s0,`32-16+3`,21,28
412+ lwzx $acc02,$Tbl0,$acc02
413+ lwzx $acc03,$Tbl0,$acc03
414+ rlwinm $acc08,$s2,`32-8+3`,21,28
415+ rlwinm $acc09,$s3,`32-8+3`,21,28
416+ lwzx $acc04,$Tbl1,$acc04
417+ lwzx $acc05,$Tbl1,$acc05
418+ rlwinm $acc10,$s0,`32-8+3`,21,28
419+ rlwinm $acc11,$s1,`32-8+3`,21,28
420+ lwzx $acc06,$Tbl1,$acc06
421+ lwzx $acc07,$Tbl1,$acc07
422+ rlwinm $acc12,$s3,`0+3`,21,28
423+ rlwinm $acc13,$s0,`0+3`,21,28
424+ lwzx $acc08,$Tbl2,$acc08
425+ lwzx $acc09,$Tbl2,$acc09
426+ rlwinm $acc14,$s1,`0+3`,21,28
427+ rlwinm $acc15,$s2,`0+3`,21,28
428+ lwzx $acc10,$Tbl2,$acc10
429+ lwzx $acc11,$Tbl2,$acc11
430+ xor $t0,$t0,$acc00 # fold the table words into the round-key words
431+ xor $t1,$t1,$acc01
432+ lwzx $acc12,$Tbl3,$acc12
433+ lwzx $acc13,$Tbl3,$acc13
434+ xor $t2,$t2,$acc02
435+ xor $t3,$t3,$acc03
436+ lwzx $acc14,$Tbl3,$acc14
437+ lwzx $acc15,$Tbl3,$acc15
438+ xor $t0,$t0,$acc04
439+ xor $t1,$t1,$acc05
440+ xor $t2,$t2,$acc06
441+ xor $t3,$t3,$acc07
442+ xor $t0,$t0,$acc08
443+ xor $t1,$t1,$acc09
444+ xor $t2,$t2,$acc10
445+ xor $t3,$t3,$acc11
446+ xor $s0,$t0,$acc12 # new state words for the next round
447+ xor $s1,$t1,$acc13
448+ xor $s2,$t2,$acc14
449+ xor $s3,$t3,$acc15
450+ addi $key,$key,16
451+ bdnz- Lenc_loop
452+
453+ addi $Tbl2,$Tbl0,2048 # Tbl2 = byte table located 2048 bytes after the word table start
454+ nop
455+ lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4
456+ lwz $acc09,`2048+32`($Tbl0)
457+ lwz $acc10,`2048+64`($Tbl0)
458+ lwz $acc11,`2048+96`($Tbl0)
459+ lwz $acc08,`2048+128`($Tbl0)
460+ lwz $acc09,`2048+160`($Tbl0)
461+ lwz $acc10,`2048+192`($Tbl0)
462+ lwz $acc11,`2048+224`($Tbl0)
463+ rlwinm $acc00,$s0,`32-24`,24,31 # last round: plain byte indices into the byte table
464+ rlwinm $acc01,$s1,`32-24`,24,31
465+ lwz $t0,0($key)
466+ lwz $t1,4($key)
467+ rlwinm $acc02,$s2,`32-24`,24,31
468+ rlwinm $acc03,$s3,`32-24`,24,31
469+ lwz $t2,8($key)
470+ lwz $t3,12($key)
471+ rlwinm $acc04,$s1,`32-16`,24,31
472+ rlwinm $acc05,$s2,`32-16`,24,31
473+ lbzx $acc00,$Tbl2,$acc00
474+ lbzx $acc01,$Tbl2,$acc01
475+ rlwinm $acc06,$s3,`32-16`,24,31
476+ rlwinm $acc07,$s0,`32-16`,24,31
477+ lbzx $acc02,$Tbl2,$acc02
478+ lbzx $acc03,$Tbl2,$acc03
479+ rlwinm $acc08,$s2,`32-8`,24,31
480+ rlwinm $acc09,$s3,`32-8`,24,31
481+ lbzx $acc04,$Tbl2,$acc04
482+ lbzx $acc05,$Tbl2,$acc05
483+ rlwinm $acc10,$s0,`32-8`,24,31
484+ rlwinm $acc11,$s1,`32-8`,24,31
485+ lbzx $acc06,$Tbl2,$acc06
486+ lbzx $acc07,$Tbl2,$acc07
487+ rlwinm $acc12,$s3,`0`,24,31
488+ rlwinm $acc13,$s0,`0`,24,31
489+ lbzx $acc08,$Tbl2,$acc08
490+ lbzx $acc09,$Tbl2,$acc09
491+ rlwinm $acc14,$s1,`0`,24,31
492+ rlwinm $acc15,$s2,`0`,24,31
493+ lbzx $acc10,$Tbl2,$acc10
494+ lbzx $acc11,$Tbl2,$acc11
495+ rlwinm $s0,$acc00,24,0,7 # reassemble the state words from the looked-up bytes
496+ rlwinm $s1,$acc01,24,0,7
497+ lbzx $acc12,$Tbl2,$acc12
498+ lbzx $acc13,$Tbl2,$acc13
499+ rlwinm $s2,$acc02,24,0,7
500+ rlwinm $s3,$acc03,24,0,7
501+ lbzx $acc14,$Tbl2,$acc14
502+ lbzx $acc15,$Tbl2,$acc15
503+ rlwimi $s0,$acc04,16,8,15
504+ rlwimi $s1,$acc05,16,8,15
505+ rlwimi $s2,$acc06,16,8,15
506+ rlwimi $s3,$acc07,16,8,15
507+ rlwimi $s0,$acc08,8,16,23
508+ rlwimi $s1,$acc09,8,16,23
509+ rlwimi $s2,$acc10,8,16,23
510+ rlwimi $s3,$acc11,8,16,23
511+ or $s0,$s0,$acc12
512+ or $s1,$s1,$acc13
513+ or $s2,$s2,$acc14
514+ or $s3,$s3,$acc15
515+ xor $s0,$s0,$t0 # add the last round key
516+ xor $s1,$s1,$t1
517+ xor $s2,$s2,$t2
518+ xor $s3,$s3,$t3
519+ blr
520+
521+.align 4
522+Lppc_AES_encrypt_compact: # compact variant: only byte lookups in the table 2048 bytes past Tbl0
523+ lwz $acc00,240($key) # round count, stored at offset 240 of the key structure
524+ lwz $t0,0($key)
525+ lwz $t1,4($key)
526+ lwz $t2,8($key)
527+ lwz $t3,12($key)
528+ addi $Tbl1,$Tbl0,2048 # Tbl1 = byte table that follows the 2048-byte word table
529+ lis $mask80,0x8080 # upper half of 0x80808080, completed by the ori below
530+ lis $mask1b,0x1b1b # upper half of 0x1b1b1b1b, completed by the ori below
531+ addi $key,$key,16
532+ ori $mask80,$mask80,0x8080
533+ ori $mask1b,$mask1b,0x1b1b
534+ mtctr $acc00 # loop counter = number of rounds
535+.align 4
536+Lenc_compact_loop:
537+ xor $s0,$s0,$t0 # add the current round key
538+ xor $s1,$s1,$t1
539+ xor $s2,$s2,$t2
540+ xor $s3,$s3,$t3
541+ rlwinm $acc00,$s0,`32-24`,24,31 # extract state bytes as indices into the byte table
542+ rlwinm $acc01,$s1,`32-24`,24,31
543+ rlwinm $acc02,$s2,`32-24`,24,31
544+ rlwinm $acc03,$s3,`32-24`,24,31
545+ lbzx $acc00,$Tbl1,$acc00
546+ lbzx $acc01,$Tbl1,$acc01
547+ rlwinm $acc04,$s1,`32-16`,24,31
548+ rlwinm $acc05,$s2,`32-16`,24,31
549+ lbzx $acc02,$Tbl1,$acc02
550+ lbzx $acc03,$Tbl1,$acc03
551+ rlwinm $acc06,$s3,`32-16`,24,31
552+ rlwinm $acc07,$s0,`32-16`,24,31
553+ lbzx $acc04,$Tbl1,$acc04
554+ lbzx $acc05,$Tbl1,$acc05
555+ rlwinm $acc08,$s2,`32-8`,24,31
556+ rlwinm $acc09,$s3,`32-8`,24,31
557+ lbzx $acc06,$Tbl1,$acc06
558+ lbzx $acc07,$Tbl1,$acc07
559+ rlwinm $acc10,$s0,`32-8`,24,31
560+ rlwinm $acc11,$s1,`32-8`,24,31
561+ lbzx $acc08,$Tbl1,$acc08
562+ lbzx $acc09,$Tbl1,$acc09
563+ rlwinm $acc12,$s3,`0`,24,31
564+ rlwinm $acc13,$s0,`0`,24,31
565+ lbzx $acc10,$Tbl1,$acc10
566+ lbzx $acc11,$Tbl1,$acc11
567+ rlwinm $acc14,$s1,`0`,24,31
568+ rlwinm $acc15,$s2,`0`,24,31
569+ lbzx $acc12,$Tbl1,$acc12
570+ lbzx $acc13,$Tbl1,$acc13
571+ rlwinm $s0,$acc00,24,0,7 # reassemble the state words from the looked-up bytes
572+ rlwinm $s1,$acc01,24,0,7
573+ lbzx $acc14,$Tbl1,$acc14
574+ lbzx $acc15,$Tbl1,$acc15
575+ rlwinm $s2,$acc02,24,0,7
576+ rlwinm $s3,$acc03,24,0,7
577+ rlwimi $s0,$acc04,16,8,15
578+ rlwimi $s1,$acc05,16,8,15
579+ rlwimi $s2,$acc06,16,8,15
580+ rlwimi $s3,$acc07,16,8,15
581+ rlwimi $s0,$acc08,8,16,23
582+ rlwimi $s1,$acc09,8,16,23
583+ rlwimi $s2,$acc10,8,16,23
584+ rlwimi $s3,$acc11,8,16,23
585+ lwz $t0,0($key) # fetch the next round key
586+ lwz $t1,4($key)
587+ or $s0,$s0,$acc12
588+ or $s1,$s1,$acc13
589+ lwz $t2,8($key)
590+ lwz $t3,12($key)
591+ or $s2,$s2,$acc14
592+ or $s3,$s3,$acc15
593+
594+ addi $key,$key,16
595+ bdz Lenc_compact_done # counter exhausted: the last round skips the mixing step below
596+
597+ and $acc00,$s0,$mask80 # r1=r0&0x80808080
598+ and $acc01,$s1,$mask80
599+ and $acc02,$s2,$mask80
600+ and $acc03,$s3,$mask80
601+ srwi $acc04,$acc00,7 # r1>>7
602+ srwi $acc05,$acc01,7
603+ srwi $acc06,$acc02,7
604+ srwi $acc07,$acc03,7
605+ andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
606+ andc $acc09,$s1,$mask80
607+ andc $acc10,$s2,$mask80
608+ andc $acc11,$s3,$mask80
609+ sub $acc00,$acc00,$acc04 # r1-(r1>>7)
610+ sub $acc01,$acc01,$acc05
611+ sub $acc02,$acc02,$acc06
612+ sub $acc03,$acc03,$acc07
613+ add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
614+ add $acc09,$acc09,$acc09
615+ add $acc10,$acc10,$acc10
616+ add $acc11,$acc11,$acc11
617+ and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
618+ and $acc01,$acc01,$mask1b
619+ and $acc02,$acc02,$mask1b
620+ and $acc03,$acc03,$mask1b
621+ xor $acc00,$acc00,$acc08 # r2
622+ xor $acc01,$acc01,$acc09
623+ xor $acc02,$acc02,$acc10
624+ xor $acc03,$acc03,$acc11
625+
626+ rotlwi $acc12,$s0,16 # ROTATE(r0,16)
627+ rotlwi $acc13,$s1,16
628+ rotlwi $acc14,$s2,16
629+ rotlwi $acc15,$s3,16
630+ xor $s0,$s0,$acc00 # r0^r2
631+ xor $s1,$s1,$acc01
632+ xor $s2,$s2,$acc02
633+ xor $s3,$s3,$acc03
634+ rotrwi $s0,$s0,24 # ROTATE(r2^r0,24)
635+ rotrwi $s1,$s1,24
636+ rotrwi $s2,$s2,24
637+ rotrwi $s3,$s3,24
638+ xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2
639+ xor $s1,$s1,$acc01
640+ xor $s2,$s2,$acc02
641+ xor $s3,$s3,$acc03
642+ rotlwi $acc08,$acc12,8 # ROTATE(r0,24)
643+ rotlwi $acc09,$acc13,8
644+ rotlwi $acc10,$acc14,8
645+ rotlwi $acc11,$acc15,8
646+ xor $s0,$s0,$acc12 # ...^ROTATE(r0,16)
647+ xor $s1,$s1,$acc13
648+ xor $s2,$s2,$acc14
649+ xor $s3,$s3,$acc15
650+ xor $s0,$s0,$acc08 # ...^ROTATE(r0,24)
651+ xor $s1,$s1,$acc09
652+ xor $s2,$s2,$acc10
653+ xor $s3,$s3,$acc11
654+
655+ b Lenc_compact_loop
656+.align 4
657+Lenc_compact_done:
658+ xor $s0,$s0,$t0 # add the last round key
659+ xor $s1,$s1,$t1
660+ xor $s2,$s2,$t2
661+ xor $s3,$s3,$t3
662+ blr
663+
664+.globl .AES_decrypt
665+.align 7
666+.AES_decrypt: # public entry: reads 16 bytes via the input pointer, writes 16 bytes via the output pointer
667+ mflr r0
668+ $STU $sp,-$FRAME($sp) # allocate the stack frame
669+
670+ $PUSH r0,`$FRAME-$SIZE_T*21`($sp) # save the return address (in r0), then r2 and r13-r31
671+ $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
672+ $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
673+ $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
674+ $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
675+ $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
676+ $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
677+ $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
678+ $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
679+ $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
680+ $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
681+ $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
682+ $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
683+ $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
684+ $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
685+ $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
686+ $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
687+ $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
688+ $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
689+ $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
690+ $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
691+
692+ lwz $s0,0($inp) # load the input block as four 32-bit words
693+ lwz $s1,4($inp)
694+ lwz $s2,8($inp)
695+ lwz $s3,12($inp)
696+ bl LAES_Td # Tbl0 = address of the decryption tables (reuses the input pointer register)
697+ bl Lppc_AES_decrypt_compact # the compact variant is the one called here
698+ stw $s0,0($out) # store the four result words
699+ stw $s1,4($out)
700+ stw $s2,8($out)
701+ stw $s3,12($out)
702+
703+ $POP r0,`$FRAME-$SIZE_T*21`($sp) # reload everything saved on entry
704+ $POP $toc,`$FRAME-$SIZE_T*20`($sp)
705+ $POP r13,`$FRAME-$SIZE_T*19`($sp)
706+ $POP r14,`$FRAME-$SIZE_T*18`($sp)
707+ $POP r15,`$FRAME-$SIZE_T*17`($sp)
708+ $POP r16,`$FRAME-$SIZE_T*16`($sp)
709+ $POP r17,`$FRAME-$SIZE_T*15`($sp)
710+ $POP r18,`$FRAME-$SIZE_T*14`($sp)
711+ $POP r19,`$FRAME-$SIZE_T*13`($sp)
712+ $POP r20,`$FRAME-$SIZE_T*12`($sp)
713+ $POP r21,`$FRAME-$SIZE_T*11`($sp)
714+ $POP r22,`$FRAME-$SIZE_T*10`($sp)
715+ $POP r23,`$FRAME-$SIZE_T*9`($sp)
716+ $POP r24,`$FRAME-$SIZE_T*8`($sp)
717+ $POP r25,`$FRAME-$SIZE_T*7`($sp)
718+ $POP r26,`$FRAME-$SIZE_T*6`($sp)
719+ $POP r27,`$FRAME-$SIZE_T*5`($sp)
720+ $POP r28,`$FRAME-$SIZE_T*4`($sp)
721+ $POP r29,`$FRAME-$SIZE_T*3`($sp)
722+ $POP r30,`$FRAME-$SIZE_T*2`($sp)
723+ $POP r31,`$FRAME-$SIZE_T*1`($sp)
724+ mtlr r0
725+ addi $sp,$sp,$FRAME # release the stack frame
726+ blr
727+
728+.align 4
729+Lppc_AES_decrypt: # table-driven decrypt core: state in s0..s3, key pointer in key, table base in Tbl0
730+ lwz $acc00,240($key) # loop count is derived from the word at key+240
731+ lwz $t0,0($key) # first round key
732+ lwz $t1,4($key)
733+ lwz $t2,8($key)
734+ lwz $t3,12($key)
735+ addi $Tbl1,$Tbl0,3 # Tbl1..Tbl3 address the same table at byte offsets 3, 2 and 1
736+ addi $Tbl2,$Tbl0,2
737+ addi $Tbl3,$Tbl0,1
738+ addi $acc00,$acc00,-1 # one iteration fewer: the last round is done after the loop
739+ addi $key,$key,16
740+ xor $s0,$s0,$t0 # state ^= first round key
741+ xor $s1,$s1,$t1
742+ xor $s2,$s2,$t2
743+ xor $s3,$s3,$t3
744+ mtctr $acc00 # CTR = number of loop iterations
745+.align 4
746+Ldec_loop:
747+ rlwinm $acc00,$s0,`32-24+3`,21,28 # table offset = (s0 >> 24) * 8
748+ rlwinm $acc01,$s1,`32-24+3`,21,28
749+ lwz $t0,0($key) # next round key, loaded early and interleaved with the index math
750+ lwz $t1,4($key)
751+ rlwinm $acc02,$s2,`32-24+3`,21,28
752+ rlwinm $acc03,$s3,`32-24+3`,21,28
753+ lwz $t2,8($key)
754+ lwz $t3,12($key)
755+ rlwinm $acc04,$s3,`32-16+3`,21,28 # table offset = ((s3 >> 16) & 0xff) * 8
756+ rlwinm $acc05,$s0,`32-16+3`,21,28
757+ lwzx $acc00,$Tbl0,$acc00 # word lookup at byte offset 0 of the entry
758+ lwzx $acc01,$Tbl0,$acc01
759+ rlwinm $acc06,$s1,`32-16+3`,21,28
760+ rlwinm $acc07,$s2,`32-16+3`,21,28
761+ lwzx $acc02,$Tbl0,$acc02
762+ lwzx $acc03,$Tbl0,$acc03
763+ rlwinm $acc08,$s2,`32-8+3`,21,28 # table offset = ((s2 >> 8) & 0xff) * 8
764+ rlwinm $acc09,$s3,`32-8+3`,21,28
765+ lwzx $acc04,$Tbl1,$acc04 # word lookup at byte offset 3 of the entry
766+ lwzx $acc05,$Tbl1,$acc05
767+ rlwinm $acc10,$s0,`32-8+3`,21,28
768+ rlwinm $acc11,$s1,`32-8+3`,21,28
769+ lwzx $acc06,$Tbl1,$acc06
770+ lwzx $acc07,$Tbl1,$acc07
771+ rlwinm $acc12,$s1,`0+3`,21,28 # table offset = (s1 & 0xff) * 8
772+ rlwinm $acc13,$s2,`0+3`,21,28
773+ lwzx $acc08,$Tbl2,$acc08 # word lookup at byte offset 2 of the entry
774+ lwzx $acc09,$Tbl2,$acc09
775+ rlwinm $acc14,$s3,`0+3`,21,28
776+ rlwinm $acc15,$s0,`0+3`,21,28
777+ lwzx $acc10,$Tbl2,$acc10
778+ lwzx $acc11,$Tbl2,$acc11
779+ xor $t0,$t0,$acc00 # fold the four lookups per column into the round key words
780+ xor $t1,$t1,$acc01
781+ lwzx $acc12,$Tbl3,$acc12 # word lookup at byte offset 1 of the entry
782+ lwzx $acc13,$Tbl3,$acc13
783+ xor $t2,$t2,$acc02
784+ xor $t3,$t3,$acc03
785+ lwzx $acc14,$Tbl3,$acc14
786+ lwzx $acc15,$Tbl3,$acc15
787+ xor $t0,$t0,$acc04
788+ xor $t1,$t1,$acc05
789+ xor $t2,$t2,$acc06
790+ xor $t3,$t3,$acc07
791+ xor $t0,$t0,$acc08
792+ xor $t1,$t1,$acc09
793+ xor $t2,$t2,$acc10
794+ xor $t3,$t3,$acc11
795+ xor $s0,$t0,$acc12 # new state words for the next iteration
796+ xor $s1,$t1,$acc13
797+ xor $s2,$t2,$acc14
798+ xor $s3,$t3,$acc15
799+ addi $key,$key,16 # advance to the following round key
800+ bdnz- Ldec_loop # decrement CTR and loop while it is non-zero
801+
802+ addi $Tbl2,$Tbl0,2048 # Tbl2 now addresses the byte table 2048 bytes past Tbl0
803+ nop
804+ lwz $acc08,`2048+0`($Tbl0) # prefetch Td4: touch one word in each 32-byte chunk of the 256-byte table
805+ lwz $acc09,`2048+32`($Tbl0)
806+ lwz $acc10,`2048+64`($Tbl0)
807+ lwz $acc11,`2048+96`($Tbl0)
808+ lwz $acc08,`2048+128`($Tbl0)
809+ lwz $acc09,`2048+160`($Tbl0)
810+ lwz $acc10,`2048+192`($Tbl0)
811+ lwz $acc11,`2048+224`($Tbl0)
812+ rlwinm $acc00,$s0,`32-24`,24,31 # last round: plain byte index s0 >> 24
813+ rlwinm $acc01,$s1,`32-24`,24,31
814+ lwz $t0,0($key) # last round key
815+ lwz $t1,4($key)
816+ rlwinm $acc02,$s2,`32-24`,24,31
817+ rlwinm $acc03,$s3,`32-24`,24,31
818+ lwz $t2,8($key)
819+ lwz $t3,12($key)
820+ rlwinm $acc04,$s3,`32-16`,24,31
821+ rlwinm $acc05,$s0,`32-16`,24,31
822+ lbzx $acc00,$Tbl2,$acc00 # single-byte lookups in the byte table
823+ lbzx $acc01,$Tbl2,$acc01
824+ rlwinm $acc06,$s1,`32-16`,24,31
825+ rlwinm $acc07,$s2,`32-16`,24,31
826+ lbzx $acc02,$Tbl2,$acc02
827+ lbzx $acc03,$Tbl2,$acc03
828+ rlwinm $acc08,$s2,`32-8`,24,31
829+ rlwinm $acc09,$s3,`32-8`,24,31
830+ lbzx $acc04,$Tbl2,$acc04
831+ lbzx $acc05,$Tbl2,$acc05
832+ rlwinm $acc10,$s0,`32-8`,24,31
833+ rlwinm $acc11,$s1,`32-8`,24,31
834+ lbzx $acc06,$Tbl2,$acc06
835+ lbzx $acc07,$Tbl2,$acc07
836+ rlwinm $acc12,$s1,`0`,24,31
837+ rlwinm $acc13,$s2,`0`,24,31
838+ lbzx $acc08,$Tbl2,$acc08
839+ lbzx $acc09,$Tbl2,$acc09
840+ rlwinm $acc14,$s3,`0`,24,31
841+ rlwinm $acc15,$s0,`0`,24,31
842+ lbzx $acc10,$Tbl2,$acc10
843+ lbzx $acc11,$Tbl2,$acc11
844+ rlwinm $s0,$acc00,24,0,7 # reassemble each word: this byte becomes the most significant one
845+ rlwinm $s1,$acc01,24,0,7
846+ lbzx $acc12,$Tbl2,$acc12
847+ lbzx $acc13,$Tbl2,$acc13
848+ rlwinm $s2,$acc02,24,0,7
849+ rlwinm $s3,$acc03,24,0,7
850+ lbzx $acc14,$Tbl2,$acc14
851+ lbzx $acc15,$Tbl2,$acc15
852+ rlwimi $s0,$acc04,16,8,15 # insert the second byte
853+ rlwimi $s1,$acc05,16,8,15
854+ rlwimi $s2,$acc06,16,8,15
855+ rlwimi $s3,$acc07,16,8,15
856+ rlwimi $s0,$acc08,8,16,23 # insert the third byte
857+ rlwimi $s1,$acc09,8,16,23
858+ rlwimi $s2,$acc10,8,16,23
859+ rlwimi $s3,$acc11,8,16,23
860+ or $s0,$s0,$acc12 # the least significant byte needs no shift
861+ or $s1,$s1,$acc13
862+ or $s2,$s2,$acc14
863+ or $s3,$s3,$acc15
864+ xor $s0,$s0,$t0 # state ^= last round key
865+ xor $s1,$s1,$t1
866+ xor $s2,$s2,$t2
867+ xor $s3,$s3,$t3
868+ blr # return with the result in s0..s3
869+
870+.align 4
871+Lppc_AES_decrypt_compact: # compact decrypt core: state in s0..s3, key pointer in key, table base in Tbl0
872+ lwz $acc00,240($key) # loop count is read from key+240
873+ lwz $t0,0($key) # first round key
874+ lwz $t1,4($key)
875+ lwz $t2,8($key)
876+ lwz $t3,12($key)
877+ addi $Tbl1,$Tbl0,2048 # Tbl1 = byte table located 2048 bytes past Tbl0
878+ lis $mask80,0x8080 # upper halves of the 0x80808080 and 0x1b1b1b1b masks
879+ lis $mask1b,0x1b1b
880+ addi $key,$key,16
881+ ori $mask80,$mask80,0x8080 # lower halves complete the masks
882+ ori $mask1b,$mask1b,0x1b1b
883+___
884+$code.=<<___ if ($SIZE_T==8);
885+ insrdi $mask80,$mask80,32,0 # copy each mask into the upper 32 bits as well
886+ insrdi $mask1b,$mask1b,32,0
887+___
888+$code.=<<___;
889+ mtctr $acc00 # CTR counts the rounds
890+.align 4
891+Ldec_compact_loop:
892+ xor $s0,$s0,$t0 # add the current round key
893+ xor $s1,$s1,$t1
894+ xor $s2,$s2,$t2
895+ xor $s3,$s3,$t3
896+ rlwinm $acc00,$s0,`32-24`,24,31 # byte index = s0 >> 24
897+ rlwinm $acc01,$s1,`32-24`,24,31
898+ rlwinm $acc02,$s2,`32-24`,24,31
899+ rlwinm $acc03,$s3,`32-24`,24,31
900+ lbzx $acc00,$Tbl1,$acc00 # single-byte table lookups
901+ lbzx $acc01,$Tbl1,$acc01
902+ rlwinm $acc04,$s3,`32-16`,24,31 # byte index = (s3 >> 16) & 0xff
903+ rlwinm $acc05,$s0,`32-16`,24,31
904+ lbzx $acc02,$Tbl1,$acc02
905+ lbzx $acc03,$Tbl1,$acc03
906+ rlwinm $acc06,$s1,`32-16`,24,31
907+ rlwinm $acc07,$s2,`32-16`,24,31
908+ lbzx $acc04,$Tbl1,$acc04
909+ lbzx $acc05,$Tbl1,$acc05
910+ rlwinm $acc08,$s2,`32-8`,24,31 # byte index = (s2 >> 8) & 0xff
911+ rlwinm $acc09,$s3,`32-8`,24,31
912+ lbzx $acc06,$Tbl1,$acc06
913+ lbzx $acc07,$Tbl1,$acc07
914+ rlwinm $acc10,$s0,`32-8`,24,31
915+ rlwinm $acc11,$s1,`32-8`,24,31
916+ lbzx $acc08,$Tbl1,$acc08
917+ lbzx $acc09,$Tbl1,$acc09
918+ rlwinm $acc12,$s1,`0`,24,31 # byte index = s1 & 0xff
919+ rlwinm $acc13,$s2,`0`,24,31
920+ lbzx $acc10,$Tbl1,$acc10
921+ lbzx $acc11,$Tbl1,$acc11
922+ rlwinm $acc14,$s3,`0`,24,31
923+ rlwinm $acc15,$s0,`0`,24,31
924+ lbzx $acc12,$Tbl1,$acc12
925+ lbzx $acc13,$Tbl1,$acc13
926+ rlwinm $s0,$acc00,24,0,7 # reassemble each word, most significant byte first
927+ rlwinm $s1,$acc01,24,0,7
928+ lbzx $acc14,$Tbl1,$acc14
929+ lbzx $acc15,$Tbl1,$acc15
930+ rlwinm $s2,$acc02,24,0,7
931+ rlwinm $s3,$acc03,24,0,7
932+ rlwimi $s0,$acc04,16,8,15 # insert the second byte
933+ rlwimi $s1,$acc05,16,8,15
934+ rlwimi $s2,$acc06,16,8,15
935+ rlwimi $s3,$acc07,16,8,15
936+ rlwimi $s0,$acc08,8,16,23 # insert the third byte
937+ rlwimi $s1,$acc09,8,16,23
938+ rlwimi $s2,$acc10,8,16,23
939+ rlwimi $s3,$acc11,8,16,23
940+ lwz $t0,0($key) # fetch the next round key while the words are completed
941+ lwz $t1,4($key)
942+ or $s0,$s0,$acc12 # the least significant byte needs no shift
943+ or $s1,$s1,$acc13
944+ lwz $t2,8($key)
945+ lwz $t3,12($key)
946+ or $s2,$s2,$acc14
947+ or $s3,$s3,$acc15
948+
949+ addi $key,$key,16
950+ bdz Ldec_compact_done # CTR exhausted: skip the r2/r4/r8 mixing below and finish
951+___
952+$code.=<<___ if ($SIZE_T==8);
953+ # vectorized permutation improves decrypt performance by 10%
954+ insrdi $s0,$s1,32,0 # pack s1 above s0 and s3 above s2 so two words are processed per instruction
955+ insrdi $s2,$s3,32,0
956+
957+ and $acc00,$s0,$mask80 # r1=r0&0x80808080
958+ and $acc02,$s2,$mask80
959+ srdi $acc04,$acc00,7 # r1>>7
960+ srdi $acc06,$acc02,7
961+ andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
962+ andc $acc10,$s2,$mask80
963+ sub $acc00,$acc00,$acc04 # r1-(r1>>7)
964+ sub $acc02,$acc02,$acc06
965+ add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
966+ add $acc10,$acc10,$acc10
967+ and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
968+ and $acc02,$acc02,$mask1b
969+ xor $acc00,$acc00,$acc08 # r2
970+ xor $acc02,$acc02,$acc10
971+
972+ and $acc04,$acc00,$mask80 # r1=r2&0x80808080
973+ and $acc06,$acc02,$mask80
974+ srdi $acc08,$acc04,7 # r1>>7
975+ srdi $acc10,$acc06,7
976+ andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
977+ andc $acc14,$acc02,$mask80
978+ sub $acc04,$acc04,$acc08 # r1-(r1>>7)
979+ sub $acc06,$acc06,$acc10
980+ add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1
981+ add $acc14,$acc14,$acc14
982+ and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
983+ and $acc06,$acc06,$mask1b
984+ xor $acc04,$acc04,$acc12 # r4
985+ xor $acc06,$acc06,$acc14
986+
987+ and $acc08,$acc04,$mask80 # r1=r4&0x80808080
988+ and $acc10,$acc06,$mask80
989+ srdi $acc12,$acc08,7 # r1>>7
990+ srdi $acc14,$acc10,7
991+ sub $acc08,$acc08,$acc12 # r1-(r1>>7)
992+ sub $acc10,$acc10,$acc14
993+ andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f
994+ andc $acc14,$acc06,$mask80
995+ add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1
996+ add $acc14,$acc14,$acc14
997+ and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
998+ and $acc10,$acc10,$mask1b
999+ xor $acc08,$acc08,$acc12 # r8
1000+ xor $acc10,$acc10,$acc14
1001+
1002+ xor $acc00,$acc00,$s0 # r2^r0
1003+ xor $acc02,$acc02,$s2
1004+ xor $acc04,$acc04,$s0 # r4^r0
1005+ xor $acc06,$acc06,$s2
1006+
1007+ extrdi $acc01,$acc00,32,0 # unpack: the upper 32 bits go to the odd-numbered accumulators
1008+ extrdi $acc03,$acc02,32,0
1009+ extrdi $acc05,$acc04,32,0
1010+ extrdi $acc07,$acc06,32,0
1011+ extrdi $acc09,$acc08,32,0
1012+ extrdi $acc11,$acc10,32,0
1013+___
1014+$code.=<<___ if ($SIZE_T==4);
1015+ and $acc00,$s0,$mask80 # r1=r0&0x80808080
1016+ and $acc01,$s1,$mask80
1017+ and $acc02,$s2,$mask80
1018+ and $acc03,$s3,$mask80
1019+ srwi $acc04,$acc00,7 # r1>>7
1020+ srwi $acc05,$acc01,7
1021+ srwi $acc06,$acc02,7
1022+ srwi $acc07,$acc03,7
1023+ andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
1024+ andc $acc09,$s1,$mask80
1025+ andc $acc10,$s2,$mask80
1026+ andc $acc11,$s3,$mask80
1027+ sub $acc00,$acc00,$acc04 # r1-(r1>>7)
1028+ sub $acc01,$acc01,$acc05
1029+ sub $acc02,$acc02,$acc06
1030+ sub $acc03,$acc03,$acc07
1031+ add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
1032+ add $acc09,$acc09,$acc09
1033+ add $acc10,$acc10,$acc10
1034+ add $acc11,$acc11,$acc11
1035+ and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1036+ and $acc01,$acc01,$mask1b
1037+ and $acc02,$acc02,$mask1b
1038+ and $acc03,$acc03,$mask1b
1039+ xor $acc00,$acc00,$acc08 # r2
1040+ xor $acc01,$acc01,$acc09
1041+ xor $acc02,$acc02,$acc10
1042+ xor $acc03,$acc03,$acc11
1043+
1044+ and $acc04,$acc00,$mask80 # r1=r2&0x80808080
1045+ and $acc05,$acc01,$mask80
1046+ and $acc06,$acc02,$mask80
1047+ and $acc07,$acc03,$mask80
1048+ srwi $acc08,$acc04,7 # r1>>7
1049+ srwi $acc09,$acc05,7
1050+ srwi $acc10,$acc06,7
1051+ srwi $acc11,$acc07,7
1052+ andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
1053+ andc $acc13,$acc01,$mask80
1054+ andc $acc14,$acc02,$mask80
1055+ andc $acc15,$acc03,$mask80
1056+ sub $acc04,$acc04,$acc08 # r1-(r1>>7)
1057+ sub $acc05,$acc05,$acc09
1058+ sub $acc06,$acc06,$acc10
1059+ sub $acc07,$acc07,$acc11
1060+ add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1
1061+ add $acc13,$acc13,$acc13
1062+ add $acc14,$acc14,$acc14
1063+ add $acc15,$acc15,$acc15
1064+ and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1065+ and $acc05,$acc05,$mask1b
1066+ and $acc06,$acc06,$mask1b
1067+ and $acc07,$acc07,$mask1b
1068+ xor $acc04,$acc04,$acc12 # r4
1069+ xor $acc05,$acc05,$acc13
1070+ xor $acc06,$acc06,$acc14
1071+ xor $acc07,$acc07,$acc15
1072+
1073+ and $acc08,$acc04,$mask80 # r1=r4&0x80808080
1074+ and $acc09,$acc05,$mask80
1075+ and $acc10,$acc06,$mask80
1076+ and $acc11,$acc07,$mask80
1077+ srwi $acc12,$acc08,7 # r1>>7
1078+ srwi $acc13,$acc09,7
1079+ srwi $acc14,$acc10,7
1080+ srwi $acc15,$acc11,7
1081+ sub $acc08,$acc08,$acc12 # r1-(r1>>7)
1082+ sub $acc09,$acc09,$acc13
1083+ sub $acc10,$acc10,$acc14
1084+ sub $acc11,$acc11,$acc15
1085+ andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f
1086+ andc $acc13,$acc05,$mask80
1087+ andc $acc14,$acc06,$mask80
1088+ andc $acc15,$acc07,$mask80
1089+ add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1
1090+ add $acc13,$acc13,$acc13
1091+ add $acc14,$acc14,$acc14
1092+ add $acc15,$acc15,$acc15
1093+ and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1094+ and $acc09,$acc09,$mask1b
1095+ and $acc10,$acc10,$mask1b
1096+ and $acc11,$acc11,$mask1b
1097+ xor $acc08,$acc08,$acc12 # r8
1098+ xor $acc09,$acc09,$acc13
1099+ xor $acc10,$acc10,$acc14
1100+ xor $acc11,$acc11,$acc15
1101+
1102+ xor $acc00,$acc00,$s0 # r2^r0
1103+ xor $acc01,$acc01,$s1
1104+ xor $acc02,$acc02,$s2
1105+ xor $acc03,$acc03,$s3
1106+ xor $acc04,$acc04,$s0 # r4^r0
1107+ xor $acc05,$acc05,$s1
1108+ xor $acc06,$acc06,$s2
1109+ xor $acc07,$acc07,$s3
1110+___
1111+$code.=<<___;
1112+ rotrwi $s0,$s0,8 # = ROTATE(r0,8)
1113+ rotrwi $s1,$s1,8
1114+ rotrwi $s2,$s2,8
1115+ rotrwi $s3,$s3,8
1116+ xor $s0,$s0,$acc00 # ^= r2^r0
1117+ xor $s1,$s1,$acc01
1118+ xor $s2,$s2,$acc02
1119+ xor $s3,$s3,$acc03
1120+ xor $acc00,$acc00,$acc08 # accumulators 00..03 become r8^r2^r0 for the rotate by 24 below
1121+ xor $acc01,$acc01,$acc09
1122+ xor $acc02,$acc02,$acc10
1123+ xor $acc03,$acc03,$acc11
1124+ xor $s0,$s0,$acc04 # ^= r4^r0
1125+ xor $s1,$s1,$acc05
1126+ xor $s2,$s2,$acc06
1127+ xor $s3,$s3,$acc07
1128+ rotrwi $acc00,$acc00,24
1129+ rotrwi $acc01,$acc01,24
1130+ rotrwi $acc02,$acc02,24
1131+ rotrwi $acc03,$acc03,24
1132+ xor $acc04,$acc04,$acc08 # accumulators 04..07 become r8^r4^r0 for the rotate by 16 below
1133+ xor $acc05,$acc05,$acc09
1134+ xor $acc06,$acc06,$acc10
1135+ xor $acc07,$acc07,$acc11
1136+ xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
1137+ xor $s1,$s1,$acc09
1138+ xor $s2,$s2,$acc10
1139+ xor $s3,$s3,$acc11
1140+ rotrwi $acc04,$acc04,16
1141+ rotrwi $acc05,$acc05,16
1142+ rotrwi $acc06,$acc06,16
1143+ rotrwi $acc07,$acc07,16
1144+ xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24)
1145+ xor $s1,$s1,$acc01
1146+ xor $s2,$s2,$acc02
1147+ xor $s3,$s3,$acc03
1148+ rotrwi $acc08,$acc08,8
1149+ rotrwi $acc09,$acc09,8
1150+ rotrwi $acc10,$acc10,8
1151+ rotrwi $acc11,$acc11,8
1152+ xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16)
1153+ xor $s1,$s1,$acc05
1154+ xor $s2,$s2,$acc06
1155+ xor $s3,$s3,$acc07
1156+ xor $s0,$s0,$acc08 # ^= ROTATE(r8,8)
1157+ xor $s1,$s1,$acc09
1158+ xor $s2,$s2,$acc10
1159+ xor $s3,$s3,$acc11
1160+
1161+ b Ldec_compact_loop # next round; the round key is added at the loop top
1162+.align 4
1163+Ldec_compact_done:
1164+ xor $s0,$s0,$t0 # add the final round key loaded before the loop exit
1165+ xor $s1,$s1,$t1
1166+ xor $s2,$s2,$t2
1167+ xor $s3,$s3,$t3
1168+ blr # return with the result in s0..s3
1169+.long 0
1170+.asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
1171+.align 7
1172+___
1173+
1174+$code =~ s/\`([^\`]*)\`/eval $1/gem; # replace every backquoted expression in the template with its evaluated value
1175+print $code; # write the generated assembly to STDOUT
1176+close STDOUT; # close the output stream explicitly
--- /dev/null
+++ b/crypto/aes/asm/aes-s390x.pl
@@ -0,0 +1,1333 @@
1+#!/usr/bin/env perl
2+
3+# ====================================================================
4+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5+# project. The module is, however, dual licensed under OpenSSL and
6+# CRYPTOGAMS licenses depending on where you obtain it. For further
7+# details see http://www.openssl.org/~appro/cryptogams/.
8+# ====================================================================
9+
10+# AES for s390x.
11+
12+# April 2007.
13+#
14+# Software performance improvement over gcc-generated code is ~70% and
15+# in absolute terms is ~73 cycles per byte processed with 128-bit key.
16+# You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
17+# *strictly* in-order execution and issued instruction [in this case
18+# load value from memory is critical] has to complete before execution
19+# flow proceeds. S-boxes are compressed to 2KB[+256B].
20+#
21+# As for hardware acceleration support. It's basically a "teaser," as
22+# it can and should be improved in several ways. Most notably support
23+# for CBC is not utilized, nor multiple blocks are ever processed.
24+# Then software key schedule can be postponed till hardware support
25+# detection... Performance improvement over assembler is reportedly
26+# ~2.5x, but can reach >8x [naturally on larger chunks] if proper
27+# support is implemented.
28+
29+# May 2007.
30+#
31+# Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
32+# for 128-bit keys, if hardware support is detected.
33+
34+# January 2009.
35+#
36+# Add support for hardware AES192/256 and reschedule instructions to
37+# minimize/avoid Address Generation Interlock hazard and to favour
38+# dual-issue z10 pipeline. This gave ~25% improvement on z10 and
39+# almost 50% on z9. The gain is smaller on z10, because being dual-
40+# issue z10 makes it impossible to eliminate the interlock condition:
41+# critical path is not long enough. Yet it spends ~24 cycles per byte
42+# processed with 128-bit key.
43+#
44+# Unlike previous version hardware support detection takes place only
45+# at the moment of key schedule setup, which is denoted in key->rounds.
46+# This is done, because deferred key setup can't be made MT-safe, not
47+# for key lengths longer than 128 bits.
48+#
49+# Add AES_cbc_encrypt, which gives incredible performance improvement,
50+# it was measured to be ~6.6x. It's less than previously mentioned 8x,
51+# because software implementation was optimized.
52+
53+$softonly=0; # 0 = also emit the hardware (km) fast paths; non-zero = emit the software code only
54+
55+$t0="%r0"; $mask="%r0"; # t0 and mask share one register
56+$t1="%r1"; # scratch
57+$t2="%r2"; $inp="%r2"; # t2 shares the register of the input pointer
58+$t3="%r3"; $out="%r3"; $bits="%r3"; # t3, out and bits all share one register
59+$key="%r4"; # key schedule pointer
60+$i1="%r5"; # i1..i3: scratch index registers
61+$i2="%r6";
62+$i3="%r7";
63+$s0="%r8"; # s0..s3: the four 32-bit state words
64+$s1="%r9";
65+$s2="%r10";
66+$s3="%r11";
67+$tbl="%r12"; # lookup table base
68+$rounds="%r13"; # round counter
69+$ra="%r14"; # return address
70+$sp="%r15"; # stack pointer
71+
72+sub _data_word() # append each argument to the code buffer as one .long directive holding the value twice
73+{ my $i;
74+ while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } # consumes the argument list; stops at the first undefined value
75+}
76+
77+$code=<<___;
78+.text
79+
80+.type AES_Te,\@object
81+.align 256
82+AES_Te:
83+___
84+&_data_word(
85+ 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
86+ 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
87+ 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
88+ 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
89+ 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
90+ 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
91+ 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
92+ 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
93+ 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
94+ 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
95+ 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
96+ 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
97+ 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
98+ 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
99+ 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
100+ 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
101+ 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
102+ 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
103+ 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
104+ 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
105+ 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
106+ 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
107+ 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
108+ 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
109+ 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
110+ 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
111+ 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
112+ 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
113+ 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
114+ 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
115+ 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
116+ 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
117+ 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
118+ 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
119+ 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
120+ 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
121+ 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
122+ 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
123+ 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
124+ 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
125+ 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
126+ 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
127+ 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
128+ 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
129+ 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
130+ 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
131+ 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
132+ 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
133+ 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
134+ 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
135+ 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
136+ 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
137+ 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
138+ 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
139+ 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
140+ 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
141+ 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
142+ 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
143+ 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
144+ 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
145+ 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
146+ 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
147+ 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
148+ 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
149+$code.=<<___;
150+# Te4[256]
151+.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
152+.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
153+.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
154+.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
155+.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
156+.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
157+.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
158+.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
159+.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
160+.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
161+.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
162+.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
163+.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
164+.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
165+.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
166+.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
167+.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
168+.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
169+.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
170+.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
171+.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
172+.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
173+.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
174+.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
175+.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
176+.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
177+.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
178+.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
179+.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
180+.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
181+.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
182+.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
183+# rcon[]
184+.long 0x01000000, 0x02000000, 0x04000000, 0x08000000
185+.long 0x10000000, 0x20000000, 0x40000000, 0x80000000
186+.long 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
187+.align 256
188+.size AES_Te,.-AES_Te
189+
190+# void AES_encrypt(const unsigned char *inp, unsigned char *out,
191+# const AES_KEY *key) {
192+.globl AES_encrypt
193+.type AES_encrypt,\@function
194+AES_encrypt:
195+___
196+$code.=<<___ if (!$softonly);
197+ l %r0,240($key) # r0 = word at key+240 (key->rounds, which also flags hardware support per the file header)
198+ lhi %r1,16
199+ clr %r0,%r1 # unsigned compare of that word with 16
200+ jl .Lesoft # below 16: take the software path
201+
202+ la %r1,0($key) # r1 = key address for the km instruction below
203+ #la %r2,0($inp)
204+ la %r4,0($out) # r4 = output pointer, first operand of km
205+ lghi %r3,16 # single block length
206+ .long 0xb92e0042 # km %r4,%r2
207+ brc 1,.-4 # can this happen?
208+ br %r14 # hardware path done: return
209+.align 64
210+.Lesoft:
211+___
212+$code.=<<___;
213+ stmg %r3,$ra,24($sp) # save r3 through r14 starting at sp+24
214+
215+ llgf $s0,0($inp) # load the input block as four zero-extended 32-bit words
216+ llgf $s1,4($inp)
217+ llgf $s2,8($inp)
218+ llgf $s3,12($inp)
219+
220+ larl $tbl,AES_Te # tbl = address of the AES_Te table
221+ bras $ra,_s390x_AES_encrypt # call the software encrypt core
222+
223+ lg $out,24($sp) # reload out: r3 was saved at sp+24 and doubles as scratch t3 in the core
224+ st $s0,0($out) # store the four result words
225+ st $s1,4($out)
226+ st $s2,8($out)
227+ st $s3,12($out)
228+
229+ lmg %r6,$ra,48($sp) # restore r6 through r14 (r6 was stored at sp+48)
230+ br $ra # return to the caller
231+.size AES_encrypt,.-AES_encrypt
232+
233+.type _s390x_AES_encrypt,\@function
234+.align 16
235+_s390x_AES_encrypt: # software core: state in s0..s3, key schedule pointer in key, table base in tbl
236+ stg $ra,152($sp) # stash the return address: ra is reused as a scratch index register below
237+ x $s0,0($key) # state ^= first round key
238+ x $s1,4($key)
239+ x $s2,8($key)
240+ x $s3,12($key)
241+ l $rounds,240($key) # round count from key+240
242+ llill $mask,`0xff<<3` # mask selects one byte already scaled by 8 (the table entry stride)
243+ aghi $rounds,-1 # the last round is done separately after the loop
244+ j .Lenc_loop
245+.align 16
246+.Lenc_loop:
247+ sllg $t1,$s0,`0+3` # shift each byte of s0 so that it lands scaled by 8 under mask
248+ srlg $t2,$s0,`8-3`
249+ srlg $t3,$s0,`16-3`
250+ srl $s0,`24-3`
251+ nr $s0,$mask
252+ ngr $t1,$mask
253+ nr $t2,$mask
254+ nr $t3,$mask
255+
256+ srlg $i1,$s1,`16-3` # i0
257+ sllg $i2,$s1,`0+3`
258+ srlg $i3,$s1,`8-3`
259+ srl $s1,`24-3`
260+ nr $i1,$mask
261+ nr $s1,$mask
262+ ngr $i2,$mask
263+ nr $i3,$mask
264+
265+ l $s0,0($s0,$tbl) # Te0[s0>>24]
266+ l $t1,1($t1,$tbl) # Te3[s0>>0]
267+ l $t2,2($t2,$tbl) # Te2[s0>>8]
268+ l $t3,3($t3,$tbl) # Te1[s0>>16]
269+
270+ x $s0,3($i1,$tbl) # Te1[s1>>16]
271+ l $s1,0($s1,$tbl) # Te0[s1>>24]
272+ x $t2,1($i2,$tbl) # Te3[s1>>0]
273+ x $t3,2($i3,$tbl) # Te2[s1>>8]
274+
275+ srlg $i1,$s2,`8-3` # i0
276+ srlg $i2,$s2,`16-3` # i1
277+ nr $i1,$mask
278+ nr $i2,$mask
279+ sllg $i3,$s2,`0+3`
280+ srl $s2,`24-3`
281+ nr $s2,$mask
282+ ngr $i3,$mask
283+
284+ xr $s1,$t1
285+ srlg $ra,$s3,`8-3` # i1
286+ sllg $t1,$s3,`0+3` # i0
287+ nr $ra,$mask
288+ la $key,16($key) # advance to the next round key
289+ ngr $t1,$mask
290+
291+ x $s0,2($i1,$tbl) # Te2[s2>>8]
292+ x $s1,3($i2,$tbl) # Te1[s2>>16]
293+ l $s2,0($s2,$tbl) # Te0[s2>>24]
294+ x $t3,1($i3,$tbl) # Te3[s2>>0]
295+
296+ srlg $i3,$s3,`16-3` # i2
297+ xr $s2,$t2
298+ srl $s3,`24-3`
299+ nr $i3,$mask
300+ nr $s3,$mask
301+
302+ x $s0,0($key) # fold in the round key words
303+ x $s1,4($key)
304+ x $s2,8($key)
305+ x $t3,12($key)
306+
307+ x $s0,1($t1,$tbl) # Te3[s3>>0]
308+ x $s1,2($ra,$tbl) # Te2[s3>>8]
309+ x $s2,3($i3,$tbl) # Te1[s3>>16]
310+ l $s3,0($s3,$tbl) # Te0[s3>>24]
311+ xr $s3,$t3
312+
313+ brct $rounds,.Lenc_loop # decrement rounds and loop while it is non-zero
314+ .align 16
315+
316+ sllg $t1,$s0,`0+3` # last round: same index extraction, but byte lookups (llgc) instead of word lookups
317+ srlg $t2,$s0,`8-3`
318+ ngr $t1,$mask
319+ srlg $t3,$s0,`16-3`
320+ srl $s0,`24-3`
321+ nr $s0,$mask
322+ nr $t2,$mask
323+ nr $t3,$mask
324+
325+ srlg $i1,$s1,`16-3` # i0
326+ sllg $i2,$s1,`0+3`
327+ ngr $i2,$mask
328+ srlg $i3,$s1,`8-3`
329+ srl $s1,`24-3`
330+ nr $i1,$mask
331+ nr $s1,$mask
332+ nr $i3,$mask
333+
334+ llgc $s0,2($s0,$tbl) # Te4[s0>>24]
335+ llgc $t1,2($t1,$tbl) # Te4[s0>>0]
336+ sll $s0,24
337+ llgc $t2,2($t2,$tbl) # Te4[s0>>8]
338+ llgc $t3,2($t3,$tbl) # Te4[s0>>16]
339+ sll $t2,8
340+ sll $t3,16
341+
342+ llgc $i1,2($i1,$tbl) # Te4[s1>>16]
343+ llgc $s1,2($s1,$tbl) # Te4[s1>>24]
344+ llgc $i2,2($i2,$tbl) # Te4[s1>>0]
345+ llgc $i3,2($i3,$tbl) # Te4[s1>>8]
346+ sll $i1,16
347+ sll $s1,24
348+ sll $i3,8
349+ or $s0,$i1
350+ or $s1,$t1
351+ or $t2,$i2
352+ or $t3,$i3
353+
354+ srlg $i1,$s2,`8-3` # i0
355+ srlg $i2,$s2,`16-3` # i1
356+ nr $i1,$mask
357+ nr $i2,$mask
358+ sllg $i3,$s2,`0+3`
359+ srl $s2,`24-3`
360+ ngr $i3,$mask
361+ nr $s2,$mask
362+
363+ sllg $t1,$s3,`0+3` # i0
364+ srlg $ra,$s3,`8-3` # i1
365+ ngr $t1,$mask
366+
367+ llgc $i1,2($i1,$tbl) # Te4[s2>>8]
368+ llgc $i2,2($i2,$tbl) # Te4[s2>>16]
369+ sll $i1,8
370+ llgc $s2,2($s2,$tbl) # Te4[s2>>24]
371+ llgc $i3,2($i3,$tbl) # Te4[s2>>0]
372+ sll $i2,16
373+ nr $ra,$mask
374+ sll $s2,24
375+ or $s0,$i1
376+ or $s1,$i2
377+ or $s2,$t2
378+ or $t3,$i3
379+
380+ srlg $i3,$s3,`16-3` # i2
381+ srl $s3,`24-3`
382+ nr $i3,$mask
383+ nr $s3,$mask
384+
385+ l $t0,16($key) # first two words of the last round key (mask in the same register is no longer needed)
386+ l $t2,20($key)
387+
388+ llgc $i1,2($t1,$tbl) # Te4[s3>>0]
389+ llgc $i2,2($ra,$tbl) # Te4[s3>>8]
390+ llgc $i3,2($i3,$tbl) # Te4[s3>>16]
391+ llgc $s3,2($s3,$tbl) # Te4[s3>>24]
392+ sll $i2,8
393+ sll $i3,16
394+ sll $s3,24
395+ or $s0,$i1
396+ or $s1,$i2
397+ or $s2,$i3
398+ or $s3,$t3
399+
400+ lg $ra,152($sp) # recover the return address stashed on entry
401+ xr $s0,$t0 # state ^= last round key
402+ xr $s1,$t2
403+ x $s2,24($key)
404+ x $s3,28($key)
405+
406+ br $ra # return with the result in s0..s3
407+.size _s390x_AES_encrypt,.-_s390x_AES_encrypt
408+___
409+
410+$code.=<<___;
411+.type AES_Td,\@object
412+.align 256
413+AES_Td:
414+___
415+&_data_word(
416+ 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
417+ 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
418+ 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
419+ 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
420+ 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
421+ 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
422+ 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
423+ 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
424+ 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
425+ 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
426+ 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
427+ 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
428+ 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
429+ 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
430+ 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
431+ 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
432+ 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
433+ 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
434+ 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
435+ 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
436+ 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
437+ 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
438+ 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
439+ 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
440+ 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
441+ 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
442+ 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
443+ 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
444+ 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
445+ 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
446+ 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
447+ 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
448+ 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
449+ 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
450+ 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
451+ 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
452+ 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
453+ 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
454+ 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
455+ 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
456+ 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
457+ 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
458+ 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
459+ 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
460+ 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
461+ 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
462+ 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
463+ 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
464+ 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
465+ 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
466+ 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
467+ 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
468+ 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
469+ 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
470+ 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
471+ 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
472+ 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
473+ 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
474+ 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
475+ 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
476+ 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
477+ 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
478+ 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
479+ 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
480+$code.=<<___;
481+# Td4[256]
482+.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
483+.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
484+.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
485+.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
486+.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
487+.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
488+.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
489+.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
490+.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
491+.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
492+.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
493+.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
494+.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
495+.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
496+.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
497+.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
498+.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
499+.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
500+.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
501+.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
502+.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
503+.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
504+.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
505+.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
506+.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
507+.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
508+.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
509+.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
510+.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
511+.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
512+.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
513+.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
514+.size AES_Td,.-AES_Td
515+
516+# void AES_decrypt(const unsigned char *inp, unsigned char *out,
517+# const AES_KEY *key) {
518+.globl AES_decrypt
519+.type AES_decrypt,\@function
520+AES_decrypt: # decrypt one 16-byte block from inp to out
521+___
522+$code.=<<___ if (!$softonly); # hardware (km) fast path, left out of software-only builds
523+ l %r0,240($key) # key+240: round count (software key) or km function code (hardware key)
524+ lhi %r1,16
525+ clr %r0,%r1
526+ jl .Ldsoft # below 16: a software key schedule, use the table code
527+
528+ la %r1,0($key) # %r1 = km parameter block, i.e. the key itself
529+ #la %r2,0($inp)
530+ la %r4,0($out)
531+ lghi %r3,16 # single block length
532+ .long 0xb92e0042 # km %r4,%r2
533+ brc 1,.-4 # can this happen?
534+ br %r14 # hardware path done, return
535+.align 64
536+.Ldsoft:
537+___
538+$code.=<<___;
539+ stmg %r3,$ra,24($sp) # save %r3 and up; %r3 lands at 24(sp), %r6 at 48(sp)
540+
541+ llgf $s0,0($inp) # load the input block as four 32-bit words
542+ llgf $s1,4($inp)
543+ llgf $s2,8($inp)
544+ llgf $s3,12($inp)
545+
546+ larl $tbl,AES_Td # table base used by the decrypt rounds
547+ bras $ra,_s390x_AES_decrypt
548+
549+ lg $out,24($sp) # reload the %r3 value saved by the stmg above
550+ st $s0,0($out) # store the four result words
551+ st $s1,4($out)
552+ st $s2,8($out)
553+ st $s3,12($out)
554+
555+ lmg %r6,$ra,48($sp) # restore %r6 and up from the save area
556+ br $ra
557+.size AES_decrypt,.-AES_decrypt
558+
559+.type _s390x_AES_decrypt,\@function
560+.align 16
561+_s390x_AES_decrypt: # block core: state words in, decrypted words out, table base and key pointer preset by caller
562+ stg $ra,152($sp) # park return address; ra is reused as scratch below
563+ x $s0,0($key) # state ^= first round key (four words)
564+ x $s1,4($key)
565+ x $s2,8($key)
566+ x $s3,12($key)
567+ l $rounds,240($key) # round count stored at key+240
568+ llill $mask,`0xff<<3` # byte mask pre-shifted by 3: table indices are scaled by 8
569+ aghi $rounds,-1 # the last round is handled separately after the loop
570+ j .Ldec_loop
571+.align 16
572+.Ldec_loop: # one full round per pass: 16 table look-ups plus round key
573+ srlg $t1,$s0,`16-3`
574+ srlg $t2,$s0,`8-3`
575+ sllg $t3,$s0,`0+3`
576+ srl $s0,`24-3`
577+ nr $s0,$mask
578+ nr $t1,$mask
579+ nr $t2,$mask
580+ ngr $t3,$mask
581+
582+ sllg $i1,$s1,`0+3` # i0
583+ srlg $i2,$s1,`16-3`
584+ srlg $i3,$s1,`8-3`
585+ srl $s1,`24-3`
586+ ngr $i1,$mask
587+ nr $s1,$mask
588+ nr $i2,$mask
589+ nr $i3,$mask
590+
591+ l $s0,0($s0,$tbl) # Td0[s0>>24]
592+ l $t1,3($t1,$tbl) # Td1[s0>>16]
593+ l $t2,2($t2,$tbl) # Td2[s0>>8]
594+ l $t3,1($t3,$tbl) # Td3[s0>>0]
595+
596+ x $s0,1($i1,$tbl) # Td3[s1>>0]
597+ l $s1,0($s1,$tbl) # Td0[s1>>24]
598+ x $t2,3($i2,$tbl) # Td1[s1>>16]
599+ x $t3,2($i3,$tbl) # Td2[s1>>8]
600+
601+ srlg $i1,$s2,`8-3` # i0
602+ sllg $i2,$s2,`0+3` # i1
603+ srlg $i3,$s2,`16-3`
604+ srl $s2,`24-3`
605+ nr $i1,$mask
606+ ngr $i2,$mask
607+ nr $s2,$mask
608+ nr $i3,$mask
609+
610+ xr $s1,$t1
611+ srlg $ra,$s3,`8-3` # i1
612+ srlg $t1,$s3,`16-3` # i0
613+ nr $ra,$mask
614+ la $key,16($key) # advance to the next round key
615+ nr $t1,$mask
616+
617+ x $s0,2($i1,$tbl) # Td2[s2>>8]
618+ x $s1,1($i2,$tbl) # Td3[s2>>0]
619+ l $s2,0($s2,$tbl) # Td0[s2>>24]
620+ x $t3,3($i3,$tbl) # Td1[s2>>16]
621+
622+ sllg $i3,$s3,`0+3` # i2
623+ srl $s3,`24-3`
624+ ngr $i3,$mask
625+ nr $s3,$mask
626+
627+ xr $s2,$t2
628+ x $s0,0($key) # mix in this round's key words
629+ x $s1,4($key)
630+ x $s2,8($key)
631+ x $t3,12($key)
632+
633+ x $s0,3($t1,$tbl) # Td1[s3>>16]
634+ x $s1,2($ra,$tbl) # Td2[s3>>8]
635+ x $s2,1($i3,$tbl) # Td3[s3>>0]
636+ l $s3,0($s3,$tbl) # Td0[s3>>24]
637+ xr $s3,$t3
638+
639+ brct $rounds,.Ldec_loop # decrement round counter, loop while non-zero
640+ .align 16
641+ # last round: byte-wise look-ups in Td4, located at table base + 2048
642+ l $t1,`2048+0`($tbl) # prefetch Td4
643+ l $t2,`2048+64`($tbl)
644+ l $t3,`2048+128`($tbl)
645+ l $i1,`2048+192`($tbl)
646+ llill $mask,0xff # plain byte mask: Td4 entries are single bytes
647+
648+ srlg $i3,$s0,24 # i0
649+ srlg $t1,$s0,16
650+ srlg $t2,$s0,8
651+ nr $s0,$mask # i3
652+ nr $t1,$mask
653+
654+ srlg $i1,$s1,24
655+ nr $t2,$mask
656+ srlg $i2,$s1,16
657+ srlg $ra,$s1,8
658+ nr $s1,$mask # i0
659+ nr $i2,$mask
660+ nr $ra,$mask
661+
662+ llgc $i3,2048($i3,$tbl) # Td4[s0>>24]
663+ llgc $t1,2048($t1,$tbl) # Td4[s0>>16]
664+ llgc $t2,2048($t2,$tbl) # Td4[s0>>8]
665+ sll $t1,16
666+ llgc $t3,2048($s0,$tbl) # Td4[s0>>0]
667+ sllg $s0,$i3,24
668+ sll $t2,8
669+
670+ llgc $s1,2048($s1,$tbl) # Td4[s1>>0]
671+ llgc $i1,2048($i1,$tbl) # Td4[s1>>24]
672+ llgc $i2,2048($i2,$tbl) # Td4[s1>>16]
673+ sll $i1,24
674+ llgc $i3,2048($ra,$tbl) # Td4[s1>>8]
675+ sll $i2,16
676+ sll $i3,8
677+ or $s0,$s1
678+ or $t1,$i1
679+ or $t2,$i2
680+ or $t3,$i3
681+
682+ srlg $i1,$s2,8 # i0
683+ srlg $i2,$s2,24
684+ srlg $i3,$s2,16
685+ nr $s2,$mask # i1
686+ nr $i1,$mask
687+ nr $i3,$mask
688+ llgc $i1,2048($i1,$tbl) # Td4[s2>>8]
689+ llgc $s1,2048($s2,$tbl) # Td4[s2>>0]
690+ llgc $i2,2048($i2,$tbl) # Td4[s2>>24]
691+ llgc $i3,2048($i3,$tbl) # Td4[s2>>16]
692+ sll $i1,8
693+ sll $i2,24
694+ or $s0,$i1
695+ sll $i3,16
696+ or $t2,$i2
697+ or $t3,$i3
698+
699+ srlg $i1,$s3,16 # i0
700+ srlg $i2,$s3,8 # i1
701+ srlg $i3,$s3,24
702+ nr $s3,$mask # i2
703+ nr $i1,$mask
704+ nr $i2,$mask
705+
706+ lg $ra,152($sp) # recover the return address parked on entry
707+ or $s1,$t1
708+ l $t0,16($key) # final round key: key pointer still points at the previous one
709+ l $t1,20($key)
710+
711+ llgc $i1,2048($i1,$tbl) # Td4[s3>>16]
712+ llgc $i2,2048($i2,$tbl) # Td4[s3>>8]
713+ sll $i1,16
714+ llgc $s2,2048($s3,$tbl) # Td4[s3>>0]
715+ llgc $s3,2048($i3,$tbl) # Td4[s3>>24]
716+ sll $i2,8
717+ sll $s3,24
718+ or $s0,$i1
719+ or $s1,$i2
720+ or $s2,$t2
721+ or $s3,$t3
722+
723+ xr $s0,$t0 # apply the final round key
724+ xr $s1,$t1
725+ x $s2,24($key)
726+ x $s3,28($key)
727+
728+ br $ra
729+.size _s390x_AES_decrypt,.-_s390x_AES_decrypt
730+___
730+___
731+
732+$code.=<<___;
733+# int AES_set_encrypt_key(const unsigned char *in, int bits,
734+# AES_KEY *key) {
735+.globl AES_set_encrypt_key
736+.type AES_set_encrypt_key,\@function
737+.align 16
738+AES_set_encrypt_key: # returns 0 on success, -1 for a NULL pointer, -2 for unsupported bits
739+ lghi $t0,0 # NULL checks on in and key
740+ clgr $inp,$t0
741+ je .Lminus1
742+ clgr $key,$t0
743+ je .Lminus1
744+
745+ lghi $t0,128 # accept only 128, 192 or 256 bits
746+ clr $bits,$t0
747+ je .Lproceed
748+ lghi $t0,192
749+ clr $bits,$t0
750+ je .Lproceed
751+ lghi $t0,256
752+ clr $bits,$t0
753+ je .Lproceed
754+ lghi %r2,-2 # any other key size: return -2
755+ br %r14
756+
757+.align 16
758+.Lproceed:
759+___
760+$code.=<<___ if (!$softonly); # probe for hardware AES, left out of software-only builds
761+ # convert bits to km code, [128,192,256]->[18,19,20]
762+ lhi %r5,-128
763+ lhi %r0,18
764+ ar %r5,$bits
765+ srl %r5,6
766+ ar %r5,%r0
767+
768+ lghi %r0,0 # query capability vector
769+ la %r1,16($sp) # query result is written at 16(sp)
770+ .long 0xb92f0042 # kmc %r4,%r2
771+
772+ llihh %r1,0x8000 # %r1 = leftmost bit only
773+ srlg %r1,%r1,0(%r5) # shift it right by the function code
774+ ng %r1,16($sp) # test that bit in the capability vector
775+ jz .Lekey_internal # not available: expand the key in software
776+
777+ lmg %r0,%r1,0($inp) # just copy 128 bits...
778+ stmg %r0,%r1,0($key)
779+ lhi %r0,192
780+ cr $bits,%r0
781+ jl 1f
782+ lg %r1,16($inp) # 192 and 256 bits: copy bytes 16..23 (lg/stg keep the condition code)
783+ stg %r1,16($key)
784+ je 1f
785+ lg %r1,24($inp) # 256 bits only: copy bytes 24..31
786+ stg %r1,24($key)
787+1: st $bits,236($key) # save bits
788+ st %r5,240($key) # save km code
789+ lghi %r2,0
790+ br %r14
791+___
792+$code.=<<___;
793+.align 16
794+.Lekey_internal:
795+ stmg %r6,%r13,48($sp) # all non-volatile regs
796+
797+ larl $tbl,AES_Te+2048 # table base -> Te4 bytes; rcon is read at base+256 below
798+
799+ llgf $s0,0($inp) # first four key words are copied verbatim
800+ llgf $s1,4($inp)
801+ llgf $s2,8($inp)
802+ llgf $s3,12($inp)
803+ st $s0,0($key)
804+ st $s1,4($key)
805+ st $s2,8($key)
806+ st $s3,12($key)
807+ lghi $t0,128
808+ cr $bits,$t0
809+ jne .Lnot128
810+
811+ llill $mask,0xff
812+ lghi $t3,0 # i=0
813+ lghi $rounds,10 # 10 rounds; also the loop count
814+ st $rounds,240($key)
815+
816+ llgfr $t2,$s3 # temp=rk[3]
817+ srlg $i1,$s3,8
818+ srlg $i2,$s3,16
819+ srlg $i3,$s3,24
820+ nr $t2,$mask
821+ nr $i1,$mask
822+ nr $i2,$mask
823+
824+.align 16
825+.L128_loop: # each pass derives the next four key words
826+ la $t2,0($t2,$tbl)
827+ la $i1,0($i1,$tbl)
828+ la $i2,0($i2,$tbl)
829+ la $i3,0($i3,$tbl)
830+ icm $t2,2,0($t2) # Te4[rk[3]>>0]<<8
831+ icm $t2,4,0($i1) # Te4[rk[3]>>8]<<16
832+ icm $t2,8,0($i2) # Te4[rk[3]>>16]<<24
833+ icm $t2,1,0($i3) # Te4[rk[3]>>24]
834+ x $t2,256($t3,$tbl) # rcon[i]
835+ xr $s0,$t2 # rk[4]=rk[0]^...
836+ xr $s1,$s0 # rk[5]=rk[1]^rk[4]
837+ xr $s2,$s1 # rk[6]=rk[2]^rk[5]
838+ xr $s3,$s2 # rk[7]=rk[3]^rk[6]
839+
840+ llgfr $t2,$s3 # temp=rk[3]
841+ srlg $i1,$s3,8
842+ srlg $i2,$s3,16
843+ nr $t2,$mask
844+ nr $i1,$mask
845+ srlg $i3,$s3,24
846+ nr $i2,$mask
847+
848+ st $s0,16($key)
849+ st $s1,20($key)
850+ st $s2,24($key)
851+ st $s3,28($key)
852+ la $key,16($key) # key+=4
853+ la $t3,4($t3) # i++
854+ brct $rounds,.L128_loop
855+ lghi %r2,0 # success
856+ lmg %r6,%r13,48($sp)
857+ br $ra
858+
859+.align 16
860+.Lnot128:
861+ llgf $t0,16($inp) # 192 and 256 bits: copy key words 4 and 5
862+ llgf $t1,20($inp)
863+ st $t0,16($key)
864+ st $t1,20($key)
865+ lghi $t0,192
866+ cr $bits,$t0
867+ jne .Lnot192
868+
869+ llill $mask,0xff
870+ lghi $t3,0 # i=0
871+ lghi $rounds,12 # 12 rounds recorded in the key
872+ st $rounds,240($key)
873+ lghi $rounds,8 # loop count for the expansion below
874+
875+ srlg $i1,$t1,8
876+ srlg $i2,$t1,16
877+ srlg $i3,$t1,24
878+ nr $t1,$mask
879+ nr $i1,$mask
880+ nr $i2,$mask
881+
882+.align 16
883+.L192_loop:
884+ la $t1,0($t1,$tbl)
885+ la $i1,0($i1,$tbl)
886+ la $i2,0($i2,$tbl)
887+ la $i3,0($i3,$tbl)
888+ icm $t1,2,0($t1) # Te4[rk[5]>>0]<<8
889+ icm $t1,4,0($i1) # Te4[rk[5]>>8]<<16
890+ icm $t1,8,0($i2) # Te4[rk[5]>>16]<<24
891+ icm $t1,1,0($i3) # Te4[rk[5]>>24]
892+ x $t1,256($t3,$tbl) # rcon[i]
893+ xr $s0,$t1 # rk[6]=rk[0]^...
894+ xr $s1,$s0 # rk[7]=rk[1]^rk[6]
895+ xr $s2,$s1 # rk[8]=rk[2]^rk[7]
896+ xr $s3,$s2 # rk[9]=rk[3]^rk[8]
897+
898+ st $s0,24($key)
899+ st $s1,28($key)
900+ st $s2,32($key)
901+ st $s3,36($key)
902+ brct $rounds,.L192_continue # more words needed: continue, else fall through and return
903+ lghi %r2,0
904+ lmg %r6,%r13,48($sp)
905+ br $ra
906+
907+.align 16
908+.L192_continue:
909+ lgr $t1,$s3
910+ x $t1,16($key) # rk[10]=rk[4]^rk[9]
911+ st $t1,40($key)
912+ x $t1,20($key) # rk[11]=rk[5]^rk[10]
913+ st $t1,44($key)
914+
915+ srlg $i1,$t1,8
916+ srlg $i2,$t1,16
917+ srlg $i3,$t1,24
918+ nr $t1,$mask
919+ nr $i1,$mask
920+ nr $i2,$mask
921+
922+ la $key,24($key) # key+=6
923+ la $t3,4($t3) # i++
924+ j .L192_loop
925+
926+.align 16
927+.Lnot192:
928+ llgf $t0,24($inp) # 256 bits: copy key words 6 and 7
929+ llgf $t1,28($inp)
930+ st $t0,24($key)
931+ st $t1,28($key)
932+ llill $mask,0xff
933+ lghi $t3,0 # i=0
934+ lghi $rounds,14 # 14 rounds recorded in the key
935+ st $rounds,240($key)
936+ lghi $rounds,7 # loop count for the expansion below
937+
938+ srlg $i1,$t1,8
939+ srlg $i2,$t1,16
940+ srlg $i3,$t1,24
941+ nr $t1,$mask
942+ nr $i1,$mask
943+ nr $i2,$mask
944+
945+.align 16
946+.L256_loop:
947+ la $t1,0($t1,$tbl)
948+ la $i1,0($i1,$tbl)
949+ la $i2,0($i2,$tbl)
950+ la $i3,0($i3,$tbl)
951+ icm $t1,2,0($t1) # Te4[rk[7]>>0]<<8
952+ icm $t1,4,0($i1) # Te4[rk[7]>>8]<<16
953+ icm $t1,8,0($i2) # Te4[rk[7]>>16]<<24
954+ icm $t1,1,0($i3) # Te4[rk[7]>>24]
955+ x $t1,256($t3,$tbl) # rcon[i]
956+ xr $s0,$t1 # rk[8]=rk[0]^...
957+ xr $s1,$s0 # rk[9]=rk[1]^rk[8]
958+ xr $s2,$s1 # rk[10]=rk[2]^rk[9]
959+ xr $s3,$s2 # rk[11]=rk[3]^rk[10]
960+ st $s0,32($key)
961+ st $s1,36($key)
962+ st $s2,40($key)
963+ st $s3,44($key)
964+ brct $rounds,.L256_continue # more words needed: continue, else fall through and return
965+ lghi %r2,0
966+ lmg %r6,%r13,48($sp)
967+ br $ra
968+
969+.align 16
970+.L256_continue:
971+ lgr $t1,$s3 # temp=rk[11]
972+ srlg $i1,$s3,8
973+ srlg $i2,$s3,16
974+ srlg $i3,$s3,24
975+ nr $t1,$mask
976+ nr $i1,$mask
977+ nr $i2,$mask
978+ la $t1,0($t1,$tbl)
979+ la $i1,0($i1,$tbl)
980+ la $i2,0($i2,$tbl)
981+ la $i3,0($i3,$tbl)
982+ llgc $t1,0($t1) # Te4[rk[11]>>0]
983+ icm $t1,2,0($i1) # Te4[rk[11]>>8]<<8
984+ icm $t1,4,0($i2) # Te4[rk[11]>>16]<<16
985+ icm $t1,8,0($i3) # Te4[rk[11]>>24]<<24
986+ x $t1,16($key) # rk[12]=rk[4]^...
987+ st $t1,48($key)
988+ x $t1,20($key) # rk[13]=rk[5]^rk[12]
989+ st $t1,52($key)
990+ x $t1,24($key) # rk[14]=rk[6]^rk[13]
991+ st $t1,56($key)
992+ x $t1,28($key) # rk[15]=rk[7]^rk[14]
993+ st $t1,60($key)
994+
995+ srlg $i1,$t1,8
996+ srlg $i2,$t1,16
997+ srlg $i3,$t1,24
998+ nr $t1,$mask
999+ nr $i1,$mask
1000+ nr $i2,$mask
1001+
1002+ la $key,32($key) # key+=8
1003+ la $t3,4($t3) # i++
1004+ j .L256_loop
1005+
1006+.Lminus1:
1007+ lghi %r2,-1 # NULL argument: return -1
1008+ br $ra
1009+.size AES_set_encrypt_key,.-AES_set_encrypt_key
1010+
1011+# int AES_set_decrypt_key(const unsigned char *in, int bits,
1012+# AES_KEY *key) {
1013+.globl AES_set_decrypt_key
1014+.type AES_set_decrypt_key,\@function
1015+.align 16
1016+AES_set_decrypt_key: # returns 0, or the non-zero code from AES_set_encrypt_key
1017+ stg $key,32($sp) # I rely on AES_set_encrypt_key to
1018+ stg $ra,112($sp) # save non-volatile registers!
1019+ bras $ra,AES_set_encrypt_key # build the encryption schedule first
1020+ lg $key,32($sp) # key pointer is advanced by the software expansion, so reload it
1021+ lg $ra,112($sp)
1022+ ltgr %r2,%r2
1023+ bnzr $ra # non-zero result: return it to the caller unchanged
1024+___
1025+$code.=<<___ if (!$softonly); # hardware key handling, left out of software-only builds
1026+ l $t0,240($key)
1027+ lhi $t1,16
1028+ cr $t0,$t1
1029+ jl .Lgo # below 16: a software schedule that must be inverted
1030+ oill $t0,0x80 # set "decrypt" bit
1031+ st $t0,240($key)
1032+ br $ra
1033+
1034+.align 16
1035+.Ldkey_internal: # NOTE(review): no branch to this label is visible in this chunk - confirm whether it is still used
1036+ stg $key,32($sp)
1037+ stg $ra,40($sp)
1038+ bras $ra,.Lekey_internal
1039+ lg $key,32($sp)
1040+ lg $ra,40($sp)
1041+___
1042+$code.=<<___;
1043+
1044+.Lgo: llgf $rounds,240($key) # step 1: reverse the order of the 16-byte round keys in place
1045+ la $i1,0($key) # i1 -> first round key
1046+ sllg $i2,$rounds,4
1047+ la $i2,0($i2,$key) # i2 -> last round key (key + 16*rounds)
1048+ srl $rounds,1 # rounds/2 swaps
1049+ lghi $t1,-16
1050+
1051+.align 16
1052+.Linv: lmg $s0,$s1,0($i1) # swap the 16 bytes at i1 with the 16 bytes at i2
1053+ lmg $s2,$s3,0($i2)
1054+ stmg $s0,$s1,0($i2)
1055+ stmg $s2,$s3,0($i1)
1056+ la $i1,16($i1)
1057+ la $i2,0($t1,$i2) # i2 -= 16
1058+ brct $rounds,.Linv
1059+___
1060+$mask80=$i1; # reuse the index registers as byte-mask constants for .Lmix
1061+$mask1b=$i2;
1062+$maskfe=$i3;
1063+$code.=<<___;
1064+ llgf $rounds,240($key)
1065+ aghi $rounds,-1
1066+ sll $rounds,2 # (rounds-1)*4
1067+ llilh $mask80,0x8080 # build 0x80808080, 0x1b1b1b1b and 0xfefefefe
1068+ llilh $mask1b,0x1b1b
1069+ llilh $maskfe,0xfefe
1070+ oill $mask80,0x8080
1071+ oill $mask1b,0x1b1b
1072+ oill $maskfe,0xfefe
1073+ # step 2: rewrite key words from offset 16 on, (rounds-1)*4 of them, one 32-bit word per pass
1074+.align 16
1075+.Lmix: l $s0,16($key) # tp1
1076+ lr $s1,$s0 # next 8 instructions: double each byte of tp1, folding in 0x1b where its top bit was set
1077+ ngr $s1,$mask80
1078+ srlg $t1,$s1,7
1079+ slr $s1,$t1
1080+ nr $s1,$mask1b
1081+ sllg $t1,$s0,1
1082+ nr $t1,$maskfe
1083+ xr $s1,$t1 # tp2
1084+
1085+ lr $s2,$s1 # same doubling applied to tp2
1086+ ngr $s2,$mask80
1087+ srlg $t1,$s2,7
1088+ slr $s2,$t1
1089+ nr $s2,$mask1b
1090+ sllg $t1,$s1,1
1091+ nr $t1,$maskfe
1092+ xr $s2,$t1 # tp4
1093+
1094+ lr $s3,$s2 # same doubling applied to tp4
1095+ ngr $s3,$mask80
1096+ srlg $t1,$s3,7
1097+ slr $s3,$t1
1098+ nr $s3,$mask1b
1099+ sllg $t1,$s2,1
1100+ nr $t1,$maskfe
1101+ xr $s3,$t1 # tp8
1102+
1103+ xr $s1,$s0 # tp2^tp1
1104+ xr $s2,$s0 # tp4^tp1
1105+ rll $s0,$s0,24 # = ROTATE(tp1,8)
1106+ xr $s2,$s3 # ^=tp8
1107+ xr $s0,$s1 # ^=tp2^tp1
1108+ xr $s1,$s3 # tp2^tp1^tp8
1109+ xr $s0,$s2 # ^=tp4^tp1^tp8
1110+ rll $s1,$s1,8
1111+ rll $s2,$s2,16
1112+ xr $s0,$s1 # ^= ROTATE(tp8^tp2^tp1,24)
1113+ rll $s3,$s3,24
1114+ xr $s0,$s2 # ^= ROTATE(tp8^tp4^tp1,16)
1115+ xr $s0,$s3 # ^= ROTATE(tp8,8)
1116+
1117+ st $s0,16($key) # store the transformed word back in place
1118+ la $key,4($key) # next word
1119+ brct $rounds,.Lmix
1120+
1121+ lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key!
1122+ lghi %r2,0
1123+ br $ra
1124+.size AES_set_decrypt_key,.-AES_set_decrypt_key
1125+___
1126+
1127+#void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
1128+# size_t length, const AES_KEY *key,
1129+# unsigned char *ivec, const int enc)
1130+{ # block scope: the register aliases below apply to AES_cbc_encrypt only
1131+my $inp="%r2";
1132+my $out="%r4"; # length and out are swapped
1133+my $len="%r3";
1134+my $key="%r5";
1135+my $ivp="%r6";
1136+
1137+$code.=<<___;
1138+.globl AES_cbc_encrypt
1139+.type AES_cbc_encrypt,\@function
1140+.align 16
1141+AES_cbc_encrypt:
1142+ xgr %r3,%r4 # flip %r3 and %r4, out and len
1143+ xgr %r4,%r3
1144+ xgr %r3,%r4
1145+___
1146+$code.=<<___ if (!$softonly); # hardware (kmc) path, left out of software-only builds
1147+ lhi %r0,16
1148+ cl %r0,240($key)
1149+ jh .Lcbc_software # value at key+240 below 16: software key schedule
1150+
1151+ lg %r0,0($ivp) # copy ivec
1152+ lg %r1,8($ivp)
1153+ stmg %r0,%r1,16($sp)
1154+ lmg %r0,%r1,0($key) # copy key, cover 256 bit
1155+ stmg %r0,%r1,32($sp)
1156+ lmg %r0,%r1,16($key)
1157+ stmg %r0,%r1,48($sp)
1158+ l %r0,240($key) # load kmc code
1159+ lghi $key,15 # res=len%16, len-=res;
1160+ ngr $key,$len
1161+ slgr $len,$key
1162+ la %r1,16($sp) # parameter block - ivec || key
1163+ jz .Lkmc_truncated # condition code is still from slgr: no whole block to process
1164+ .long 0xb92f0042 # kmc %r4,%r2
1165+ brc 1,.-4 # pay attention to "partial completion"
1166+ ltr $key,$key
1167+ jnz .Lkmc_truncated # res != 0: handle the partial last block
1168+.Lkmc_done:
1169+ lmg %r0,%r1,16($sp) # copy ivec to caller
1170+ stg %r0,0($ivp)
1171+ stg %r1,8($ivp)
1172+ br $ra
1173+.align 16
1174+.Lkmc_truncated:
1175+ ahi $key,-1 # it's the way it's encoded in mvc
1176+ tmll %r0,0x80 # bit 0x80 of the function code marks a decrypt key
1177+ jnz .Lkmc_truncated_dec
1178+ lghi %r1,0 # encrypt: zero a 16-byte buffer at 128(sp)
1179+ stg %r1,128($sp)
1180+ stg %r1,136($sp)
1181+ bras %r1,1f # %r1 = address of the mvc template on the next line
1182+ mvc 128(1,$sp),0($inp)
1183+1: ex $key,0(%r1) # run the mvc with length res: copy the tail into the buffer
1184+ la %r1,16($sp) # restore parameter block
1185+ la $inp,128($sp)
1186+ lghi $len,16
1187+ .long 0xb92f0042 # kmc %r4,%r2
1188+ j .Lkmc_done
1189+.align 16
1190+.Lkmc_truncated_dec:
1191+ stg $out,64($sp) # decrypt: process one block into 128(sp), then copy res bytes to out
1192+ la $out,128($sp)
1193+ lghi $len,16
1194+ .long 0xb92f0042 # kmc %r4,%r2
1195+ lg $out,64($sp)
1196+ bras %r1,2f
1197+ mvc 0(1,$out),128($sp)
1198+2: ex $key,0(%r1)
1199+ j .Lkmc_done
1200+.align 16
1201+.Lcbc_software:
1202+___
1203+$code.=<<___;
1204+ stmg $key,$ra,40($sp) # save %r5 and up; %r6 (ivp) lands at 48(sp)
1205+ lhi %r0,0
1206+ cl %r0,164($sp) # word at 164(sp) is tested against zero; presumably the enc argument - confirm against the ABI
1207+ je .Lcbc_decrypt # zero selects decryption
1208+
1209+ larl $tbl,AES_Te
1210+
1211+ llgf $s0,0($ivp) # chaining value starts as the IV
1212+ llgf $s1,4($ivp)
1213+ llgf $s2,8($ivp)
1214+ llgf $s3,12($ivp)
1215+
1216+ lghi $t0,16
1217+ slgr $len,$t0
1218+ brc 4,.Lcbc_enc_tail # if borrow
1219+.Lcbc_enc_loop:
1220+ stmg $inp,$out,16($sp) # save inp, len, out (%r2-%r4) across the call
1221+ x $s0,0($inp) # chaining value ^= input block
1222+ x $s1,4($inp)
1223+ x $s2,8($inp)
1224+ x $s3,12($inp)
1225+ lgr %r4,$key # key pointer goes to %r4 for the block routine (out was saved above)
1226+
1227+ bras $ra,_s390x_AES_encrypt
1228+
1229+ lmg $inp,$key,16($sp) # restore inp, len, out and key (%r2-%r5)
1230+ st $s0,0($out) # write the output block; it stays in s0-s3 as the next chaining value
1231+ st $s1,4($out)
1232+ st $s2,8($out)
1233+ st $s3,12($out)
1234+
1235+ la $inp,16($inp)
1236+ la $out,16($out)
1237+ lghi $t0,16
1238+ ltgr $len,$len
1239+ jz .Lcbc_enc_done # remaining length is zero: finished
1240+ slgr $len,$t0
1241+ brc 4,.Lcbc_enc_tail # if borrow
1242+ j .Lcbc_enc_loop
1243+.align 16
1244+.Lcbc_enc_done:
1245+ lg $ivp,48($sp) # reload ivp from its save slot
1246+ st $s0,0($ivp) # last output block becomes the caller's new IV
1247+ st $s1,4($ivp)
1248+ st $s2,8($ivp)
1249+ st $s3,12($ivp)
1250+
1251+ lmg %r7,$ra,56($sp) # restore the remaining saved registers
1252+ br $ra
1253+
1254+.align 16
1255+.Lcbc_enc_tail:
1256+ aghi $len,15 # len is (remainder - 16) here; +15 gives remainder-1, the mvc length encoding
1257+ lghi $t0,0 # zero a 16-byte buffer at 128(sp)
1258+ stg $t0,128($sp)
1259+ stg $t0,136($sp)
1260+ bras $t1,3f # t1 = address of the mvc template on the next line
1261+ mvc 128(1,$sp),0($inp)
1262+3: ex $len,0($t1) # run the mvc with the computed length: copy the tail into the buffer
1263+ lghi $len,0 # nothing is left after this padded block
1264+ la $inp,128($sp)
1265+ j .Lcbc_enc_loop
1266+
1267+.align 16
1268+.Lcbc_decrypt:
1269+ larl $tbl,AES_Td
1270+
1271+ lg $t0,0($ivp) # keep the running IV at 128(sp)
1272+ lg $t1,8($ivp)
1273+ stmg $t0,$t1,128($sp)
1274+
1275+.Lcbc_dec_loop:
1276+ stmg $inp,$out,16($sp) # save inp, len, out (%r2-%r4) across the call
1277+ llgf $s0,0($inp)
1278+ llgf $s1,4($inp)
1279+ llgf $s2,8($inp)
1280+ llgf $s3,12($inp)
1281+ lgr %r4,$key # key pointer goes to %r4 for the block routine (out was saved above)
1282+
1283+ bras $ra,_s390x_AES_decrypt
1284+
1285+ lmg $inp,$key,16($sp) # restore inp, len, out and key (%r2-%r5)
1286+ sllg $s0,$s0,32 # pack s0:s1 and s2:s3 into two 64-bit values
1287+ sllg $s2,$s2,32
1288+ lr $s0,$s1
1289+ lr $s2,$s3
1290+
1291+ lg $t0,0($inp) # current input block, kept as the next IV
1292+ lg $t1,8($inp)
1293+ xg $s0,128($sp) # output = decrypted block ^ running IV
1294+ xg $s2,136($sp)
1295+ lghi $s1,16
1296+ slgr $len,$s1
1297+ brc 4,.Lcbc_dec_tail # if borrow
1298+ brc 2,.Lcbc_dec_done # if zero
1299+ stg $s0,0($out)
1300+ stg $s2,8($out)
1301+ stmg $t0,$t1,128($sp) # input block becomes the running IV
1302+
1303+ la $inp,16($inp)
1304+ la $out,16($out)
1305+ j .Lcbc_dec_loop
1306+
1307+.Lcbc_dec_done:
1308+ stg $s0,0($out)
1309+ stg $s2,8($out)
1310+.Lcbc_dec_exit:
1311+ lmg $ivp,$ra,48($sp) # restore saved registers, ivp first
1312+ stmg $t0,$t1,0($ivp) # last input block is handed back as the new IV
1313+
1314+ br $ra
1315+
1316+.align 16
1317+.Lcbc_dec_tail:
1318+ aghi $len,15 # len is (remainder - 16) here; +15 gives remainder-1, the mvc length encoding
1319+ stg $s0,128($sp) # park the full output block, then copy only the remainder to out
1320+ stg $s2,136($sp)
1321+ bras $s1,4f
1322+ mvc 0(1,$out),128($sp)
1323+4: ex $len,0($s1)
1324+ j .Lcbc_dec_exit
1325+.size AES_cbc_encrypt,.-AES_cbc_encrypt
1326+___
1327+}
1328+$code.=<<___;
1329+.string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
1330+___
1331+
1332+$code =~ s/\`([^\`]*)\`/eval $1/gem; # replace every backquoted expression in the text with its evaluated value
1333+print $code; # write the generated assembly to standard output
--- /dev/null
+++ b/crypto/aes/asm/aes-sparcv9.pl
@@ -0,0 +1,1181 @@
1+#!/usr/bin/env perl
2+#
3+# ====================================================================
4+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5+# project. Rights for redistribution and usage in source and binary
6+# forms are granted according to the OpenSSL license.
7+# ====================================================================
8+#
9+# Version 1.1
10+#
11+# The major reason for undertaking this effort was to mitigate the hazard
12+# of cache-timing attacks. This is [currently and initially!] addressed in
13+# two ways. 1. S-boxes are compressed from 5KB to 2KB+256B size each.
14+# 2. References to them are scheduled for L2 cache latency, meaning
15+# that the tables don't have to reside in L1 cache. Once again, this
16+# is an initial draft and one should expect more countermeasures to
17+# be implemented...
18+#
19+# Version 1.1 prefetches T[ed]4 in order to mitigate attack on last
20+# round.
21+#
22+# Even though performance was not the primary goal [on the contrary,
23+# extra shifts "induced" by compressed S-box and longer loop epilogue
24+# "induced" by scheduling for L2 have negative effect on performance],
25+# the code turned out to run in ~23 cycles per processed byte en-/
26+# decrypted with 128-bit key. This is a pretty good result for code
27+# with the mentioned qualities and UltraSPARC core. Compared to Sun C
28+# generated code my encrypt procedure runs just a few percent faster,
29+# while decrypt one - whole 50% faster [yes, Sun C failed to generate
30+# optimal decrypt procedure]. Compared to GNU C generated code both
31+# procedures are more than 60% faster:-)
32+
33+$bits=32; # default: 32-bit build
34+for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } # -m64 or -xarch=v9 on the command line selects 64-bit
35+if ($bits==64) { $bias=2047; $frame=192; } # 64-bit: stack bias 2047, frame size 192
36+else { $bias=0; $frame=112; } # 32-bit: no bias, frame size 112
37+$locals=16; # extra bytes reserved beyond the frame by the save in _sparcv9_AES_encrypt
38+
39+$acc0="%l0"; # acc0-acc15 receive the sixteen table look-ups of a round
40+$acc1="%o0";
41+$acc2="%o1";
42+$acc3="%o2";
43+
44+$acc4="%l1";
45+$acc5="%o3";
46+$acc6="%o4";
47+$acc7="%o5";
48+
49+$acc8="%l2";
50+$acc9="%o7";
51+$acc10="%g1";
52+$acc11="%g2";
53+
54+$acc12="%l3";
55+$acc13="%g3";
56+$acc14="%g4";
57+$acc15="%g5";
58+
59+$t0="%l4"; # t0-t3: round-key words, then the alternate copy of the state
60+$t1="%l5";
61+$t2="%l6";
62+$t3="%l7";
63+
64+$s0="%i0"; # s0-s3: the four 32-bit state words
65+$s1="%i1";
66+$s2="%i2";
67+$s3="%i3";
68+$tbl="%i4"; # base address of the look-up table
69+$key="%i5"; # pointer to the key schedule; the round count is read from key+240
70+$rounds="%i7"; # aliases with return address, which is off-loaded to stack
71+
72+sub _data_word() # append each argument to $code as a ".long w,w" line, i.e. every table word is emitted twice
73+{ my $i;
74+ while(defined($i=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$i,$i; } # stops at the first undefined argument
75+}
76+
77+$code.=<<___ if ($bits==64);
78+.register %g2,#scratch
79+.register %g3,#scratch
80+___
81+$code.=<<___;
82+.section ".text",#alloc,#execinstr
83+
84+.align 256
85+AES_Te:
86+___
87+&_data_word(
88+ 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
89+ 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
90+ 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
91+ 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
92+ 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
93+ 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
94+ 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
95+ 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
96+ 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
97+ 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
98+ 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
99+ 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
100+ 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
101+ 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
102+ 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
103+ 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
104+ 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
105+ 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
106+ 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
107+ 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
108+ 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
109+ 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
110+ 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
111+ 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
112+ 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
113+ 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
114+ 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
115+ 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
116+ 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
117+ 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
118+ 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
119+ 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
120+ 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
121+ 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
122+ 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
123+ 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
124+ 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
125+ 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
126+ 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
127+ 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
128+ 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
129+ 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
130+ 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
131+ 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
132+ 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
133+ 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
134+ 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
135+ 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
136+ 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
137+ 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
138+ 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
139+ 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
140+ 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
141+ 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
142+ 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
143+ 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
144+ 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
145+ 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
146+ 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
147+ 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
148+ 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
149+ 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
150+ 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
151+ 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
152+$code.=<<___;
153+ .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
154+ .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
155+ .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
156+ .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
157+ .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
158+ .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
159+ .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
160+ .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
161+ .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
162+ .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
163+ .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
164+ .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
165+ .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
166+ .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
167+ .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
168+ .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
169+ .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
170+ .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
171+ .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
172+ .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
173+ .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
174+ .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
175+ .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
176+ .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
177+ .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
178+ .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
179+ .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
180+ .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
181+ .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
182+ .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
183+ .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
184+ .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
185+.type AES_Te,#object
186+.size AES_Te,(.-AES_Te)
187+
188+.align 64
189+.skip 16
190+_sparcv9_AES_encrypt:
191+ save %sp,-$frame-$locals,%sp
192+ stx %i7,[%sp+$bias+$frame+0] ! off-load return address
193+ ld [$key+240],$rounds
194+ ld [$key+0],$t0
195+ ld [$key+4],$t1 !
196+ ld [$key+8],$t2
197+ srl $rounds,1,$rounds
198+ xor $t0,$s0,$s0
199+ ld [$key+12],$t3
200+ srl $s0,21,$acc0
201+ xor $t1,$s1,$s1
202+ ld [$key+16],$t0
203+ srl $s1,13,$acc1 !
204+ xor $t2,$s2,$s2
205+ ld [$key+20],$t1
206+ xor $t3,$s3,$s3
207+ ld [$key+24],$t2
208+ and $acc0,2040,$acc0
209+ ld [$key+28],$t3
210+ nop
211+.Lenc_loop:
212+ srl $s2,5,$acc2 !
213+ and $acc1,2040,$acc1
214+ ldx [$tbl+$acc0],$acc0
215+ sll $s3,3,$acc3
216+ and $acc2,2040,$acc2
217+ ldx [$tbl+$acc1],$acc1
218+ srl $s1,21,$acc4
219+ and $acc3,2040,$acc3
220+ ldx [$tbl+$acc2],$acc2 !
221+ srl $s2,13,$acc5
222+ and $acc4,2040,$acc4
223+ ldx [$tbl+$acc3],$acc3
224+ srl $s3,5,$acc6
225+ and $acc5,2040,$acc5
226+ ldx [$tbl+$acc4],$acc4
227+ fmovs %f0,%f0
228+ sll $s0,3,$acc7 !
229+ and $acc6,2040,$acc6
230+ ldx [$tbl+$acc5],$acc5
231+ srl $s2,21,$acc8
232+ and $acc7,2040,$acc7
233+ ldx [$tbl+$acc6],$acc6
234+ srl $s3,13,$acc9
235+ and $acc8,2040,$acc8
236+ ldx [$tbl+$acc7],$acc7 !
237+ srl $s0,5,$acc10
238+ and $acc9,2040,$acc9
239+ ldx [$tbl+$acc8],$acc8
240+ sll $s1,3,$acc11
241+ and $acc10,2040,$acc10
242+ ldx [$tbl+$acc9],$acc9
243+ fmovs %f0,%f0
244+ srl $s3,21,$acc12 !
245+ and $acc11,2040,$acc11
246+ ldx [$tbl+$acc10],$acc10
247+ srl $s0,13,$acc13
248+ and $acc12,2040,$acc12
249+ ldx [$tbl+$acc11],$acc11
250+ srl $s1,5,$acc14
251+ and $acc13,2040,$acc13
252+ ldx [$tbl+$acc12],$acc12 !
253+ sll $s2,3,$acc15
254+ and $acc14,2040,$acc14
255+ ldx [$tbl+$acc13],$acc13
256+ and $acc15,2040,$acc15
257+ add $key,32,$key
258+ ldx [$tbl+$acc14],$acc14
259+ fmovs %f0,%f0
260+ subcc $rounds,1,$rounds !
261+ ldx [$tbl+$acc15],$acc15
262+ bz,a,pn %icc,.Lenc_last
263+ add $tbl,2048,$rounds
264+
265+ srlx $acc1,8,$acc1
266+ xor $acc0,$t0,$t0
267+ ld [$key+0],$s0
268+ fmovs %f0,%f0
269+ srlx $acc2,16,$acc2 !
270+ xor $acc1,$t0,$t0
271+ ld [$key+4],$s1
272+ srlx $acc3,24,$acc3
273+ xor $acc2,$t0,$t0
274+ ld [$key+8],$s2
275+ srlx $acc5,8,$acc5
276+ xor $acc3,$t0,$t0
277+ ld [$key+12],$s3 !
278+ srlx $acc6,16,$acc6
279+ xor $acc4,$t1,$t1
280+ fmovs %f0,%f0
281+ srlx $acc7,24,$acc7
282+ xor $acc5,$t1,$t1
283+ srlx $acc9,8,$acc9
284+ xor $acc6,$t1,$t1
285+ srlx $acc10,16,$acc10 !
286+ xor $acc7,$t1,$t1
287+ srlx $acc11,24,$acc11
288+ xor $acc8,$t2,$t2
289+ srlx $acc13,8,$acc13
290+ xor $acc9,$t2,$t2
291+ srlx $acc14,16,$acc14
292+ xor $acc10,$t2,$t2
293+ srlx $acc15,24,$acc15 !
294+ xor $acc11,$t2,$t2
295+ xor $acc12,$acc14,$acc14
296+ xor $acc13,$t3,$t3
297+ srl $t0,21,$acc0
298+ xor $acc14,$t3,$t3
299+ srl $t1,13,$acc1
300+ xor $acc15,$t3,$t3
301+
302+ and $acc0,2040,$acc0 !
303+ srl $t2,5,$acc2
304+ and $acc1,2040,$acc1
305+ ldx [$tbl+$acc0],$acc0
306+ sll $t3,3,$acc3
307+ and $acc2,2040,$acc2
308+ ldx [$tbl+$acc1],$acc1
309+ fmovs %f0,%f0
310+ srl $t1,21,$acc4 !
311+ and $acc3,2040,$acc3
312+ ldx [$tbl+$acc2],$acc2
313+ srl $t2,13,$acc5
314+ and $acc4,2040,$acc4
315+ ldx [$tbl+$acc3],$acc3
316+ srl $t3,5,$acc6
317+ and $acc5,2040,$acc5
318+ ldx [$tbl+$acc4],$acc4 !
319+ sll $t0,3,$acc7
320+ and $acc6,2040,$acc6
321+ ldx [$tbl+$acc5],$acc5
322+ srl $t2,21,$acc8
323+ and $acc7,2040,$acc7
324+ ldx [$tbl+$acc6],$acc6
325+ fmovs %f0,%f0
326+ srl $t3,13,$acc9 !
327+ and $acc8,2040,$acc8
328+ ldx [$tbl+$acc7],$acc7
329+ srl $t0,5,$acc10
330+ and $acc9,2040,$acc9
331+ ldx [$tbl+$acc8],$acc8
332+ sll $t1,3,$acc11
333+ and $acc10,2040,$acc10
334+ ldx [$tbl+$acc9],$acc9 !
335+ srl $t3,21,$acc12
336+ and $acc11,2040,$acc11
337+ ldx [$tbl+$acc10],$acc10
338+ srl $t0,13,$acc13
339+ and $acc12,2040,$acc12
340+ ldx [$tbl+$acc11],$acc11
341+ fmovs %f0,%f0
342+ srl $t1,5,$acc14 !
343+ and $acc13,2040,$acc13
344+ ldx [$tbl+$acc12],$acc12
345+ sll $t2,3,$acc15
346+ and $acc14,2040,$acc14
347+ ldx [$tbl+$acc13],$acc13
348+ srlx $acc1,8,$acc1
349+ and $acc15,2040,$acc15
350+ ldx [$tbl+$acc14],$acc14 !
351+
352+ srlx $acc2,16,$acc2
353+ xor $acc0,$s0,$s0
354+ ldx [$tbl+$acc15],$acc15
355+ srlx $acc3,24,$acc3
356+ xor $acc1,$s0,$s0
357+ ld [$key+16],$t0
358+ fmovs %f0,%f0
359+ srlx $acc5,8,$acc5 !
360+ xor $acc2,$s0,$s0
361+ ld [$key+20],$t1
362+ srlx $acc6,16,$acc6
363+ xor $acc3,$s0,$s0
364+ ld [$key+24],$t2
365+ srlx $acc7,24,$acc7
366+ xor $acc4,$s1,$s1
367+ ld [$key+28],$t3 !
368+ srlx $acc9,8,$acc9
369+ xor $acc5,$s1,$s1
370+ ldx [$tbl+2048+0],%g0 ! prefetch te4
371+ srlx $acc10,16,$acc10
372+ xor $acc6,$s1,$s1
373+ ldx [$tbl+2048+32],%g0 ! prefetch te4
374+ srlx $acc11,24,$acc11
375+ xor $acc7,$s1,$s1
376+ ldx [$tbl+2048+64],%g0 ! prefetch te4
377+ srlx $acc13,8,$acc13
378+ xor $acc8,$s2,$s2
379+ ldx [$tbl+2048+96],%g0 ! prefetch te4
380+ srlx $acc14,16,$acc14 !
381+ xor $acc9,$s2,$s2
382+ ldx [$tbl+2048+128],%g0 ! prefetch te4
383+ srlx $acc15,24,$acc15
384+ xor $acc10,$s2,$s2
385+ ldx [$tbl+2048+160],%g0 ! prefetch te4
386+ srl $s0,21,$acc0
387+ xor $acc11,$s2,$s2
388+ ldx [$tbl+2048+192],%g0 ! prefetch te4
389+ xor $acc12,$acc14,$acc14
390+ xor $acc13,$s3,$s3
391+ ldx [$tbl+2048+224],%g0 ! prefetch te4
392+ srl $s1,13,$acc1 !
393+ xor $acc14,$s3,$s3
394+ xor $acc15,$s3,$s3
395+ ba .Lenc_loop
396+ and $acc0,2040,$acc0
397+
398+.align 32
399+.Lenc_last:
400+ srlx $acc1,8,$acc1 !
401+ xor $acc0,$t0,$t0
402+ ld [$key+0],$s0
403+ srlx $acc2,16,$acc2
404+ xor $acc1,$t0,$t0
405+ ld [$key+4],$s1
406+ srlx $acc3,24,$acc3
407+ xor $acc2,$t0,$t0
408+ ld [$key+8],$s2 !
409+ srlx $acc5,8,$acc5
410+ xor $acc3,$t0,$t0
411+ ld [$key+12],$s3
412+ srlx $acc6,16,$acc6
413+ xor $acc4,$t1,$t1
414+ srlx $acc7,24,$acc7
415+ xor $acc5,$t1,$t1
416+ srlx $acc9,8,$acc9 !
417+ xor $acc6,$t1,$t1
418+ srlx $acc10,16,$acc10
419+ xor $acc7,$t1,$t1
420+ srlx $acc11,24,$acc11
421+ xor $acc8,$t2,$t2
422+ srlx $acc13,8,$acc13
423+ xor $acc9,$t2,$t2
424+ srlx $acc14,16,$acc14 !
425+ xor $acc10,$t2,$t2
426+ srlx $acc15,24,$acc15
427+ xor $acc11,$t2,$t2
428+ xor $acc12,$acc14,$acc14
429+ xor $acc13,$t3,$t3
430+ srl $t0,24,$acc0
431+ xor $acc14,$t3,$t3
432+ srl $t1,16,$acc1 !
433+ xor $acc15,$t3,$t3
434+
435+ srl $t2,8,$acc2
436+ and $acc1,255,$acc1
437+ ldub [$rounds+$acc0],$acc0
438+ srl $t1,24,$acc4
439+ and $acc2,255,$acc2
440+ ldub [$rounds+$acc1],$acc1
441+ srl $t2,16,$acc5 !
442+ and $t3,255,$acc3
443+ ldub [$rounds+$acc2],$acc2
444+ ldub [$rounds+$acc3],$acc3
445+ srl $t3,8,$acc6
446+ and $acc5,255,$acc5
447+ ldub [$rounds+$acc4],$acc4
448+ fmovs %f0,%f0
449+ srl $t2,24,$acc8 !
450+ and $acc6,255,$acc6
451+ ldub [$rounds+$acc5],$acc5
452+ srl $t3,16,$acc9
453+ and $t0,255,$acc7
454+ ldub [$rounds+$acc6],$acc6
455+ ldub [$rounds+$acc7],$acc7
456+ fmovs %f0,%f0
457+ srl $t0,8,$acc10 !
458+ and $acc9,255,$acc9
459+ ldub [$rounds+$acc8],$acc8
460+ srl $t3,24,$acc12
461+ and $acc10,255,$acc10
462+ ldub [$rounds+$acc9],$acc9
463+ srl $t0,16,$acc13
464+ and $t1,255,$acc11
465+ ldub [$rounds+$acc10],$acc10 !
466+ srl $t1,8,$acc14
467+ and $acc13,255,$acc13
468+ ldub [$rounds+$acc11],$acc11
469+ ldub [$rounds+$acc12],$acc12
470+ and $acc14,255,$acc14
471+ ldub [$rounds+$acc13],$acc13
472+ and $t2,255,$acc15
473+ ldub [$rounds+$acc14],$acc14 !
474+
475+ sll $acc0,24,$acc0
476+ xor $acc3,$s0,$s0
477+ ldub [$rounds+$acc15],$acc15
478+ sll $acc1,16,$acc1
479+ xor $acc0,$s0,$s0
480+ ldx [%sp+$bias+$frame+0],%i7 ! restore return address
481+ fmovs %f0,%f0
482+ sll $acc2,8,$acc2 !
483+ xor $acc1,$s0,$s0
484+ sll $acc4,24,$acc4
485+ xor $acc2,$s0,$s0
486+ sll $acc5,16,$acc5
487+ xor $acc7,$s1,$s1
488+ sll $acc6,8,$acc6
489+ xor $acc4,$s1,$s1
490+ sll $acc8,24,$acc8 !
491+ xor $acc5,$s1,$s1
492+ sll $acc9,16,$acc9
493+ xor $acc11,$s2,$s2
494+ sll $acc10,8,$acc10
495+ xor $acc6,$s1,$s1
496+ sll $acc12,24,$acc12
497+ xor $acc8,$s2,$s2
498+ sll $acc13,16,$acc13 !
499+ xor $acc9,$s2,$s2
500+ sll $acc14,8,$acc14
501+ xor $acc10,$s2,$s2
502+ xor $acc12,$acc14,$acc14
503+ xor $acc13,$s3,$s3
504+ xor $acc14,$s3,$s3
505+ xor $acc15,$s3,$s3
506+
507+ ret
508+ restore
509+.type _sparcv9_AES_encrypt,#function
510+.size _sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt)
511+
512+.align 32
513+.globl AES_encrypt
514+AES_encrypt:
515+ or %o0,%o1,%g1
516+ andcc %g1,3,%g0
517+ bnz,pn %xcc,.Lunaligned_enc
518+ save %sp,-$frame,%sp
519+
520+ ld [%i0+0],%o0
521+ ld [%i0+4],%o1
522+ ld [%i0+8],%o2
523+ ld [%i0+12],%o3
524+
525+1: call .+8
526+ add %o7,AES_Te-1b,%o4
527+ call _sparcv9_AES_encrypt
528+ mov %i2,%o5
529+
530+ st %o0,[%i1+0]
531+ st %o1,[%i1+4]
532+ st %o2,[%i1+8]
533+ st %o3,[%i1+12]
534+
535+ ret
536+ restore
537+
538+.align 32
539+.Lunaligned_enc:
540+ ldub [%i0+0],%l0
541+ ldub [%i0+1],%l1
542+ ldub [%i0+2],%l2
543+
544+ sll %l0,24,%l0
545+ ldub [%i0+3],%l3
546+ sll %l1,16,%l1
547+ ldub [%i0+4],%l4
548+ sll %l2,8,%l2
549+ or %l1,%l0,%l0
550+ ldub [%i0+5],%l5
551+ sll %l4,24,%l4
552+ or %l3,%l2,%l2
553+ ldub [%i0+6],%l6
554+ sll %l5,16,%l5
555+ or %l0,%l2,%o0
556+ ldub [%i0+7],%l7
557+
558+ sll %l6,8,%l6
559+ or %l5,%l4,%l4
560+ ldub [%i0+8],%l0
561+ or %l7,%l6,%l6
562+ ldub [%i0+9],%l1
563+ or %l4,%l6,%o1
564+ ldub [%i0+10],%l2
565+
566+ sll %l0,24,%l0
567+ ldub [%i0+11],%l3
568+ sll %l1,16,%l1
569+ ldub [%i0+12],%l4
570+ sll %l2,8,%l2
571+ or %l1,%l0,%l0
572+ ldub [%i0+13],%l5
573+ sll %l4,24,%l4
574+ or %l3,%l2,%l2
575+ ldub [%i0+14],%l6
576+ sll %l5,16,%l5
577+ or %l0,%l2,%o2
578+ ldub [%i0+15],%l7
579+
580+ sll %l6,8,%l6
581+ or %l5,%l4,%l4
582+ or %l7,%l6,%l6
583+ or %l4,%l6,%o3
584+
585+1: call .+8
586+ add %o7,AES_Te-1b,%o4
587+ call _sparcv9_AES_encrypt
588+ mov %i2,%o5
589+
590+ srl %o0,24,%l0
591+ srl %o0,16,%l1
592+ stb %l0,[%i1+0]
593+ srl %o0,8,%l2
594+ stb %l1,[%i1+1]
595+ stb %l2,[%i1+2]
596+ srl %o1,24,%l4
597+ stb %o0,[%i1+3]
598+
599+ srl %o1,16,%l5
600+ stb %l4,[%i1+4]
601+ srl %o1,8,%l6
602+ stb %l5,[%i1+5]
603+ stb %l6,[%i1+6]
604+ srl %o2,24,%l0
605+ stb %o1,[%i1+7]
606+
607+ srl %o2,16,%l1
608+ stb %l0,[%i1+8]
609+ srl %o2,8,%l2
610+ stb %l1,[%i1+9]
611+ stb %l2,[%i1+10]
612+ srl %o3,24,%l4
613+ stb %o2,[%i1+11]
614+
615+ srl %o3,16,%l5
616+ stb %l4,[%i1+12]
617+ srl %o3,8,%l6
618+ stb %l5,[%i1+13]
619+ stb %l6,[%i1+14]
620+ stb %o3,[%i1+15]
621+
622+ ret
623+ restore
624+.type AES_encrypt,#function
625+.size AES_encrypt,(.-AES_encrypt)
626+
627+___
628+
629+$code.=<<___;
630+.align 256
631+AES_Td:
632+___
633+&_data_word(
634+ 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
635+ 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
636+ 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
637+ 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
638+ 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
639+ 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
640+ 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
641+ 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
642+ 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
643+ 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
644+ 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
645+ 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
646+ 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
647+ 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
648+ 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
649+ 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
650+ 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
651+ 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
652+ 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
653+ 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
654+ 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
655+ 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
656+ 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
657+ 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
658+ 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
659+ 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
660+ 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
661+ 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
662+ 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
663+ 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
664+ 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
665+ 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
666+ 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
667+ 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
668+ 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
669+ 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
670+ 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
671+ 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
672+ 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
673+ 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
674+ 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
675+ 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
676+ 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
677+ 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
678+ 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
679+ 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
680+ 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
681+ 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
682+ 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
683+ 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
684+ 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
685+ 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
686+ 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
687+ 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
688+ 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
689+ 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
690+ 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
691+ 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
692+ 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
693+ 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
694+ 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
695+ 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
696+ 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
697+ 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
698+$code.=<<___;
699+ .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
700+ .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
701+ .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
702+ .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
703+ .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
704+ .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
705+ .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
706+ .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
707+ .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
708+ .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
709+ .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
710+ .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
711+ .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
712+ .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
713+ .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
714+ .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
715+ .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
716+ .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
717+ .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
718+ .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
719+ .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
720+ .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
721+ .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
722+ .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
723+ .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
724+ .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
725+ .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
726+ .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
727+ .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
728+ .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
729+ .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
730+ .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
731+.type AES_Td,#object
732+.size AES_Td,(.-AES_Td)
733+
734+.align 64
735+.skip 16
736+_sparcv9_AES_decrypt:
737+ save %sp,-$frame-$locals,%sp
738+ stx %i7,[%sp+$bias+$frame+0] ! off-load return address
739+ ld [$key+240],$rounds
740+ ld [$key+0],$t0
741+ ld [$key+4],$t1 !
742+ ld [$key+8],$t2
743+ ld [$key+12],$t3
744+ srl $rounds,1,$rounds
745+ xor $t0,$s0,$s0
746+ ld [$key+16],$t0
747+ xor $t1,$s1,$s1
748+ ld [$key+20],$t1
749+ srl $s0,21,$acc0 !
750+ xor $t2,$s2,$s2
751+ ld [$key+24],$t2
752+ xor $t3,$s3,$s3
753+ and $acc0,2040,$acc0
754+ ld [$key+28],$t3
755+ srl $s3,13,$acc1
756+ nop
757+.Ldec_loop:
758+ srl $s2,5,$acc2 !
759+ and $acc1,2040,$acc1
760+ ldx [$tbl+$acc0],$acc0
761+ sll $s1,3,$acc3
762+ and $acc2,2040,$acc2
763+ ldx [$tbl+$acc1],$acc1
764+ srl $s1,21,$acc4
765+ and $acc3,2040,$acc3
766+ ldx [$tbl+$acc2],$acc2 !
767+ srl $s0,13,$acc5
768+ and $acc4,2040,$acc4
769+ ldx [$tbl+$acc3],$acc3
770+ srl $s3,5,$acc6
771+ and $acc5,2040,$acc5
772+ ldx [$tbl+$acc4],$acc4
773+ fmovs %f0,%f0
774+ sll $s2,3,$acc7 !
775+ and $acc6,2040,$acc6
776+ ldx [$tbl+$acc5],$acc5
777+ srl $s2,21,$acc8
778+ and $acc7,2040,$acc7
779+ ldx [$tbl+$acc6],$acc6
780+ srl $s1,13,$acc9
781+ and $acc8,2040,$acc8
782+ ldx [$tbl+$acc7],$acc7 !
783+ srl $s0,5,$acc10
784+ and $acc9,2040,$acc9
785+ ldx [$tbl+$acc8],$acc8
786+ sll $s3,3,$acc11
787+ and $acc10,2040,$acc10
788+ ldx [$tbl+$acc9],$acc9
789+ fmovs %f0,%f0
790+ srl $s3,21,$acc12 !
791+ and $acc11,2040,$acc11
792+ ldx [$tbl+$acc10],$acc10
793+ srl $s2,13,$acc13
794+ and $acc12,2040,$acc12
795+ ldx [$tbl+$acc11],$acc11
796+ srl $s1,5,$acc14
797+ and $acc13,2040,$acc13
798+ ldx [$tbl+$acc12],$acc12 !
799+ sll $s0,3,$acc15
800+ and $acc14,2040,$acc14
801+ ldx [$tbl+$acc13],$acc13
802+ and $acc15,2040,$acc15
803+ add $key,32,$key
804+ ldx [$tbl+$acc14],$acc14
805+ fmovs %f0,%f0
806+ subcc $rounds,1,$rounds !
807+ ldx [$tbl+$acc15],$acc15
808+ bz,a,pn %icc,.Ldec_last
809+ add $tbl,2048,$rounds
810+
811+ srlx $acc1,8,$acc1
812+ xor $acc0,$t0,$t0
813+ ld [$key+0],$s0
814+ fmovs %f0,%f0
815+ srlx $acc2,16,$acc2 !
816+ xor $acc1,$t0,$t0
817+ ld [$key+4],$s1
818+ srlx $acc3,24,$acc3
819+ xor $acc2,$t0,$t0
820+ ld [$key+8],$s2
821+ srlx $acc5,8,$acc5
822+ xor $acc3,$t0,$t0
823+ ld [$key+12],$s3 !
824+ srlx $acc6,16,$acc6
825+ xor $acc4,$t1,$t1
826+ fmovs %f0,%f0
827+ srlx $acc7,24,$acc7
828+ xor $acc5,$t1,$t1
829+ srlx $acc9,8,$acc9
830+ xor $acc6,$t1,$t1
831+ srlx $acc10,16,$acc10 !
832+ xor $acc7,$t1,$t1
833+ srlx $acc11,24,$acc11
834+ xor $acc8,$t2,$t2
835+ srlx $acc13,8,$acc13
836+ xor $acc9,$t2,$t2
837+ srlx $acc14,16,$acc14
838+ xor $acc10,$t2,$t2
839+ srlx $acc15,24,$acc15 !
840+ xor $acc11,$t2,$t2
841+ xor $acc12,$acc14,$acc14
842+ xor $acc13,$t3,$t3
843+ srl $t0,21,$acc0
844+ xor $acc14,$t3,$t3
845+ xor $acc15,$t3,$t3
846+ srl $t3,13,$acc1
847+
848+ and $acc0,2040,$acc0 !
849+ srl $t2,5,$acc2
850+ and $acc1,2040,$acc1
851+ ldx [$tbl+$acc0],$acc0
852+ sll $t1,3,$acc3
853+ and $acc2,2040,$acc2
854+ ldx [$tbl+$acc1],$acc1
855+ fmovs %f0,%f0
856+ srl $t1,21,$acc4 !
857+ and $acc3,2040,$acc3
858+ ldx [$tbl+$acc2],$acc2
859+ srl $t0,13,$acc5
860+ and $acc4,2040,$acc4
861+ ldx [$tbl+$acc3],$acc3
862+ srl $t3,5,$acc6
863+ and $acc5,2040,$acc5
864+ ldx [$tbl+$acc4],$acc4 !
865+ sll $t2,3,$acc7
866+ and $acc6,2040,$acc6
867+ ldx [$tbl+$acc5],$acc5
868+ srl $t2,21,$acc8
869+ and $acc7,2040,$acc7
870+ ldx [$tbl+$acc6],$acc6
871+ fmovs %f0,%f0
872+ srl $t1,13,$acc9 !
873+ and $acc8,2040,$acc8
874+ ldx [$tbl+$acc7],$acc7
875+ srl $t0,5,$acc10
876+ and $acc9,2040,$acc9
877+ ldx [$tbl+$acc8],$acc8
878+ sll $t3,3,$acc11
879+ and $acc10,2040,$acc10
880+ ldx [$tbl+$acc9],$acc9 !
881+ srl $t3,21,$acc12
882+ and $acc11,2040,$acc11
883+ ldx [$tbl+$acc10],$acc10
884+ srl $t2,13,$acc13
885+ and $acc12,2040,$acc12
886+ ldx [$tbl+$acc11],$acc11
887+ fmovs %f0,%f0
888+ srl $t1,5,$acc14 !
889+ and $acc13,2040,$acc13
890+ ldx [$tbl+$acc12],$acc12
891+ sll $t0,3,$acc15
892+ and $acc14,2040,$acc14
893+ ldx [$tbl+$acc13],$acc13
894+ srlx $acc1,8,$acc1
895+ and $acc15,2040,$acc15
896+ ldx [$tbl+$acc14],$acc14 !
897+
898+ srlx $acc2,16,$acc2
899+ xor $acc0,$s0,$s0
900+ ldx [$tbl+$acc15],$acc15
901+ srlx $acc3,24,$acc3
902+ xor $acc1,$s0,$s0
903+ ld [$key+16],$t0
904+ fmovs %f0,%f0
905+ srlx $acc5,8,$acc5 !
906+ xor $acc2,$s0,$s0
907+ ld [$key+20],$t1
908+ srlx $acc6,16,$acc6
909+ xor $acc3,$s0,$s0
910+ ld [$key+24],$t2
911+ srlx $acc7,24,$acc7
912+ xor $acc4,$s1,$s1
913+ ld [$key+28],$t3 !
914+ srlx $acc9,8,$acc9
915+ xor $acc5,$s1,$s1
916+ ldx [$tbl+2048+0],%g0 ! prefetch td4
917+ srlx $acc10,16,$acc10
918+ xor $acc6,$s1,$s1
919+ ldx [$tbl+2048+32],%g0 ! prefetch td4
920+ srlx $acc11,24,$acc11
921+ xor $acc7,$s1,$s1
922+ ldx [$tbl+2048+64],%g0 ! prefetch td4
923+ srlx $acc13,8,$acc13
924+ xor $acc8,$s2,$s2
925+ ldx [$tbl+2048+96],%g0 ! prefetch td4
926+ srlx $acc14,16,$acc14 !
927+ xor $acc9,$s2,$s2
928+ ldx [$tbl+2048+128],%g0 ! prefetch td4
929+ srlx $acc15,24,$acc15
930+ xor $acc10,$s2,$s2
931+ ldx [$tbl+2048+160],%g0 ! prefetch td4
932+ srl $s0,21,$acc0
933+ xor $acc11,$s2,$s2
934+ ldx [$tbl+2048+192],%g0 ! prefetch td4
935+ xor $acc12,$acc14,$acc14
936+ xor $acc13,$s3,$s3
937+ ldx [$tbl+2048+224],%g0 ! prefetch td4
938+ and $acc0,2040,$acc0 !
939+ xor $acc14,$s3,$s3
940+ xor $acc15,$s3,$s3
941+ ba .Ldec_loop
942+ srl $s3,13,$acc1
943+
944+.align 32
945+.Ldec_last:
946+ srlx $acc1,8,$acc1 !
947+ xor $acc0,$t0,$t0
948+ ld [$key+0],$s0
949+ srlx $acc2,16,$acc2
950+ xor $acc1,$t0,$t0
951+ ld [$key+4],$s1
952+ srlx $acc3,24,$acc3
953+ xor $acc2,$t0,$t0
954+ ld [$key+8],$s2 !
955+ srlx $acc5,8,$acc5
956+ xor $acc3,$t0,$t0
957+ ld [$key+12],$s3
958+ srlx $acc6,16,$acc6
959+ xor $acc4,$t1,$t1
960+ srlx $acc7,24,$acc7
961+ xor $acc5,$t1,$t1
962+ srlx $acc9,8,$acc9 !
963+ xor $acc6,$t1,$t1
964+ srlx $acc10,16,$acc10
965+ xor $acc7,$t1,$t1
966+ srlx $acc11,24,$acc11
967+ xor $acc8,$t2,$t2
968+ srlx $acc13,8,$acc13
969+ xor $acc9,$t2,$t2
970+ srlx $acc14,16,$acc14 !
971+ xor $acc10,$t2,$t2
972+ srlx $acc15,24,$acc15
973+ xor $acc11,$t2,$t2
974+ xor $acc12,$acc14,$acc14
975+ xor $acc13,$t3,$t3
976+ srl $t0,24,$acc0
977+ xor $acc14,$t3,$t3
978+ xor $acc15,$t3,$t3 !
979+ srl $t3,16,$acc1
980+
981+ srl $t2,8,$acc2
982+ and $acc1,255,$acc1
983+ ldub [$rounds+$acc0],$acc0
984+ srl $t1,24,$acc4
985+ and $acc2,255,$acc2
986+ ldub [$rounds+$acc1],$acc1
987+ srl $t0,16,$acc5 !
988+ and $t1,255,$acc3
989+ ldub [$rounds+$acc2],$acc2
990+ ldub [$rounds+$acc3],$acc3
991+ srl $t3,8,$acc6
992+ and $acc5,255,$acc5
993+ ldub [$rounds+$acc4],$acc4
994+ fmovs %f0,%f0
995+ srl $t2,24,$acc8 !
996+ and $acc6,255,$acc6
997+ ldub [$rounds+$acc5],$acc5
998+ srl $t1,16,$acc9
999+ and $t2,255,$acc7
1000+ ldub [$rounds+$acc6],$acc6
1001+ ldub [$rounds+$acc7],$acc7
1002+ fmovs %f0,%f0
1003+ srl $t0,8,$acc10 !
1004+ and $acc9,255,$acc9
1005+ ldub [$rounds+$acc8],$acc8
1006+ srl $t3,24,$acc12
1007+ and $acc10,255,$acc10
1008+ ldub [$rounds+$acc9],$acc9
1009+ srl $t2,16,$acc13
1010+ and $t3,255,$acc11
1011+ ldub [$rounds+$acc10],$acc10 !
1012+ srl $t1,8,$acc14
1013+ and $acc13,255,$acc13
1014+ ldub [$rounds+$acc11],$acc11
1015+ ldub [$rounds+$acc12],$acc12
1016+ and $acc14,255,$acc14
1017+ ldub [$rounds+$acc13],$acc13
1018+ and $t0,255,$acc15
1019+ ldub [$rounds+$acc14],$acc14 !
1020+
1021+ sll $acc0,24,$acc0
1022+ xor $acc3,$s0,$s0
1023+ ldub [$rounds+$acc15],$acc15
1024+ sll $acc1,16,$acc1
1025+ xor $acc0,$s0,$s0
1026+ ldx [%sp+$bias+$frame+0],%i7 ! restore return address
1027+ fmovs %f0,%f0
1028+ sll $acc2,8,$acc2 !
1029+ xor $acc1,$s0,$s0
1030+ sll $acc4,24,$acc4
1031+ xor $acc2,$s0,$s0
1032+ sll $acc5,16,$acc5
1033+ xor $acc7,$s1,$s1
1034+ sll $acc6,8,$acc6
1035+ xor $acc4,$s1,$s1
1036+ sll $acc8,24,$acc8 !
1037+ xor $acc5,$s1,$s1
1038+ sll $acc9,16,$acc9
1039+ xor $acc11,$s2,$s2
1040+ sll $acc10,8,$acc10
1041+ xor $acc6,$s1,$s1
1042+ sll $acc12,24,$acc12
1043+ xor $acc8,$s2,$s2
1044+ sll $acc13,16,$acc13 !
1045+ xor $acc9,$s2,$s2
1046+ sll $acc14,8,$acc14
1047+ xor $acc10,$s2,$s2
1048+ xor $acc12,$acc14,$acc14
1049+ xor $acc13,$s3,$s3
1050+ xor $acc14,$s3,$s3
1051+ xor $acc15,$s3,$s3
1052+
1053+ ret
1054+ restore
1055+.type _sparcv9_AES_decrypt,#function
1056+.size _sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt)
1057+
1058+.align 32
1059+.globl AES_decrypt
1060+AES_decrypt:
1061+ or %o0,%o1,%g1
1062+ andcc %g1,3,%g0
1063+ bnz,pn %xcc,.Lunaligned_dec
1064+ save %sp,-$frame,%sp
1065+
1066+ ld [%i0+0],%o0
1067+ ld [%i0+4],%o1
1068+ ld [%i0+8],%o2
1069+ ld [%i0+12],%o3
1070+
1071+1: call .+8
1072+ add %o7,AES_Td-1b,%o4
1073+ call _sparcv9_AES_decrypt
1074+ mov %i2,%o5
1075+
1076+ st %o0,[%i1+0]
1077+ st %o1,[%i1+4]
1078+ st %o2,[%i1+8]
1079+ st %o3,[%i1+12]
1080+
1081+ ret
1082+ restore
1083+
1084+.align 32
1085+.Lunaligned_dec:
1086+ ldub [%i0+0],%l0
1087+ ldub [%i0+1],%l1
1088+ ldub [%i0+2],%l2
1089+
1090+ sll %l0,24,%l0
1091+ ldub [%i0+3],%l3
1092+ sll %l1,16,%l1
1093+ ldub [%i0+4],%l4
1094+ sll %l2,8,%l2
1095+ or %l1,%l0,%l0
1096+ ldub [%i0+5],%l5
1097+ sll %l4,24,%l4
1098+ or %l3,%l2,%l2
1099+ ldub [%i0+6],%l6
1100+ sll %l5,16,%l5
1101+ or %l0,%l2,%o0
1102+ ldub [%i0+7],%l7
1103+
1104+ sll %l6,8,%l6
1105+ or %l5,%l4,%l4
1106+ ldub [%i0+8],%l0
1107+ or %l7,%l6,%l6
1108+ ldub [%i0+9],%l1
1109+ or %l4,%l6,%o1
1110+ ldub [%i0+10],%l2
1111+
1112+ sll %l0,24,%l0
1113+ ldub [%i0+11],%l3
1114+ sll %l1,16,%l1
1115+ ldub [%i0+12],%l4
1116+ sll %l2,8,%l2
1117+ or %l1,%l0,%l0
1118+ ldub [%i0+13],%l5
1119+ sll %l4,24,%l4
1120+ or %l3,%l2,%l2
1121+ ldub [%i0+14],%l6
1122+ sll %l5,16,%l5
1123+ or %l0,%l2,%o2
1124+ ldub [%i0+15],%l7
1125+
1126+ sll %l6,8,%l6
1127+ or %l5,%l4,%l4
1128+ or %l7,%l6,%l6
1129+ or %l4,%l6,%o3
1130+
1131+1: call .+8
1132+ add %o7,AES_Td-1b,%o4
1133+ call _sparcv9_AES_decrypt
1134+ mov %i2,%o5
1135+
1136+ srl %o0,24,%l0
1137+ srl %o0,16,%l1
1138+ stb %l0,[%i1+0]
1139+ srl %o0,8,%l2
1140+ stb %l1,[%i1+1]
1141+ stb %l2,[%i1+2]
1142+ srl %o1,24,%l4
1143+ stb %o0,[%i1+3]
1144+
1145+ srl %o1,16,%l5
1146+ stb %l4,[%i1+4]
1147+ srl %o1,8,%l6
1148+ stb %l5,[%i1+5]
1149+ stb %l6,[%i1+6]
1150+ srl %o2,24,%l0
1151+ stb %o1,[%i1+7]
1152+
1153+ srl %o2,16,%l1
1154+ stb %l0,[%i1+8]
1155+ srl %o2,8,%l2
1156+ stb %l1,[%i1+9]
1157+ stb %l2,[%i1+10]
1158+ srl %o3,24,%l4
1159+ stb %o2,[%i1+11]
1160+
1161+ srl %o3,16,%l5
1162+ stb %l4,[%i1+12]
1163+ srl %o3,8,%l6
1164+ stb %l5,[%i1+13]
1165+ stb %l6,[%i1+14]
1166+ stb %o3,[%i1+15]
1167+
1168+ ret
1169+ restore
1170+.type AES_decrypt,#function
1171+.size AES_decrypt,(.-AES_decrypt)
1172+___
1173+
# The fmovs instructions stand in for floating-point no-ops. They were
# originally added to meet specific instruction alignment requirements
# and so maximize ILP. UltraSPARC T1, a.k.a. Niagara, has a shared FPU,
# where FP nops can have an undesired effect, so strip them from the
# generated code and give up a fraction of a percent in performance...
$code =~ s/fmovs.*$//gem;

print $code;
--- /dev/null
+++ b/crypto/alphacpuid.s
@@ -0,0 +1,124 @@
1+.text
2+
3+.set noat
4+
.globl	OPENSSL_cpuid_setup
.ent	OPENSSL_cpuid_setup
OPENSSL_cpuid_setup:		# nothing is probed here: the routine returns at once
	.frame	$30,0,$26
	.prologue 0
	ret	($26)
.end	OPENSSL_cpuid_setup
12+
.globl	OPENSSL_wipe_cpu
.ent	OPENSSL_wipe_cpu
OPENSSL_wipe_cpu:		# zero a set of integer and FP registers, then return $sp in $0
	.frame	$30,0,$26
	.prologue 0
	clr	$1		# integer registers $1-$8
	clr	$2
	clr	$3
	clr	$4
	clr	$5
	clr	$6
	clr	$7
	clr	$8
	clr	$16		# integer registers $16-$25
	clr	$17
	clr	$18
	clr	$19
	clr	$20
	clr	$21
	clr	$22
	clr	$23
	clr	$24
	clr	$25
	clr	$27
	clr	$at		# usable directly because of .set noat
	clr	$29
	fclr	$f0		# floating-point registers $f0, $f1
	fclr	$f1
	fclr	$f10		# floating-point registers $f10-$f30
	fclr	$f11
	fclr	$f12
	fclr	$f13
	fclr	$f14
	fclr	$f15
	fclr	$f16
	fclr	$f17
	fclr	$f18
	fclr	$f19
	fclr	$f20
	fclr	$f21
	fclr	$f22
	fclr	$f23
	fclr	$f24
	fclr	$f25
	fclr	$f26
	fclr	$f27
	fclr	$f28
	fclr	$f29
	fclr	$f30
	mov	$sp,$0		# leave the current stack pointer in $0
	ret	($26)
.end	OPENSSL_wipe_cpu
65+
.globl	OPENSSL_atomic_add
.ent	OPENSSL_atomic_add
OPENSSL_atomic_add:		# add $17 to the 32-bit word at ($16) with a load-locked/store-conditional loop
	.frame	$30,0,$26
	.prologue 0
1:	ldl_l	$0,($16)	# load-locked: $0 = current value
	addl	$0,$17,$1	# $1 = current value + addend
	stl_c	$1,($16)	# store-conditional: $1 is overwritten with the success flag
	beq	$1,1b		# flag is zero when the store did not happen: try again
	addl	$0,$17,$0	# recompute the sum, since $1 no longer holds it
	ret	($26)
.end	OPENSSL_atomic_add
78+
.globl	OPENSSL_rdtsc
.ent	OPENSSL_rdtsc
OPENSSL_rdtsc:			# hand back the value of the processor cycle counter
	.frame	$30,0,$26
	.prologue 0
	rpcc	$0		# read process cycle counter into $0
	ret	($26)
.end	OPENSSL_rdtsc
87+
.globl	OPENSSL_cleanse
.ent	OPENSSL_cleanse
OPENSSL_cleanse:		# zero $17 bytes of memory starting at address $16
	.frame	$30,0,$26
	.prologue 0
	beq	$17,.Ldone	# zero length: nothing to wipe, and the counters below must not underflow
	and	$16,7,$0	# $0 = misalignment of the pointer within its quadword
	bic	$17,7,$at	# $at = length rounded down to whole quadwords
	beq	$at,.Little	# fewer than 8 bytes in total: byte path only
	beq	$0,.Laligned	# pointer already aligned: go straight to quadword stores

.Little:
	subq	$0,8,$0		# $0 = -(bytes from the pointer up to the next quadword boundary)
	ldq_u	$1,0($16)	# fetch the aligned quadword that contains the pointer
	mov	$16,$2		# remember where that quadword has to be written back
.Lalign:
	mskbl	$1,$16,$1	# clear the byte selected by the low 3 bits of $16
	lda	$16,1($16)	# pointer++
	subq	$17,1,$17	# length--
	addq	$0,1,$0		# one byte closer to the boundary
	beq	$17,.Lout	# ran out of bytes
	bne	$0,.Lalign	# stop at the boundary so that $1 always matches the pointer
.Lout:	stq_u	$1,0($2)	# write the partially cleared quadword back
	beq	$17,.Ldone
	bic	$17,7,$at
	beq	$at,.Little	# less than a quadword left: finish byte-wise ($0 is 0 here)

.Laligned:
	stq	$31,0($16)	# $31 always reads as zero
	subq	$17,8,$17
	lda	$16,8($16)
	bic	$17,7,$at
	bne	$at,.Laligned	# keep going while a whole quadword remains
	bne	$17,.Little	# 1..7 trailing bytes ($0 is 0 here)
.Ldone: ret	($26)
.end	OPENSSL_cleanse
--- /dev/null
+++ b/crypto/asn1/ameth_lib.c
@@ -0,0 +1,446 @@
1+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
2+ * project 2006.
3+ */
4+/* ====================================================================
5+ * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
6+ *
7+ * Redistribution and use in source and binary forms, with or without
8+ * modification, are permitted provided that the following conditions
9+ * are met:
10+ *
11+ * 1. Redistributions of source code must retain the above copyright
12+ * notice, this list of conditions and the following disclaimer.
13+ *
14+ * 2. Redistributions in binary form must reproduce the above copyright
15+ * notice, this list of conditions and the following disclaimer in
16+ * the documentation and/or other materials provided with the
17+ * distribution.
18+ *
19+ * 3. All advertising materials mentioning features or use of this
20+ * software must display the following acknowledgment:
21+ * "This product includes software developed by the OpenSSL Project
22+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
23+ *
24+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
25+ * endorse or promote products derived from this software without
26+ * prior written permission. For written permission, please contact
27+ * licensing@OpenSSL.org.
28+ *
29+ * 5. Products derived from this software may not be called "OpenSSL"
30+ * nor may "OpenSSL" appear in their names without prior written
31+ * permission of the OpenSSL Project.
32+ *
33+ * 6. Redistributions of any form whatsoever must retain the following
34+ * acknowledgment:
35+ * "This product includes software developed by the OpenSSL Project
36+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
37+ *
38+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
39+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
41+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
42+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
44+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
45+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
46+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
47+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
48+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
49+ * OF THE POSSIBILITY OF SUCH DAMAGE.
50+ * ====================================================================
51+ *
52+ * This product includes cryptographic software written by Eric Young
53+ * (eay@cryptsoft.com). This product includes software written by Tim
54+ * Hudson (tjh@cryptsoft.com).
55+ *
56+ */
57+
58+#include <stdio.h>
59+#include "cryptlib.h"
60+#include <openssl/asn1t.h>
61+#include <openssl/x509.h>
62+#ifndef OPENSSL_NO_ENGINE
63+#include <openssl/engine.h>
64+#endif
65+#include "asn1_locl.h"
66+
67+extern const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[];
68+extern const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[];
69+extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth;
70+extern const EVP_PKEY_ASN1_METHOD eckey_asn1_meth;
71+extern const EVP_PKEY_ASN1_METHOD hmac_asn1_meth;
72+
73+/* Keep this sorted in type order !! */
74+static const EVP_PKEY_ASN1_METHOD *standard_methods[] =
75+ {
76+#ifndef OPENSSL_NO_RSA
77+ &rsa_asn1_meths[0],
78+ &rsa_asn1_meths[1],
79+#endif
80+#ifndef OPENSSL_NO_DH
81+ &dh_asn1_meth,
82+#endif
83+#ifndef OPENSSL_NO_DSA
84+ &dsa_asn1_meths[0],
85+ &dsa_asn1_meths[1],
86+ &dsa_asn1_meths[2],
87+ &dsa_asn1_meths[3],
88+ &dsa_asn1_meths[4],
89+#endif
90+#ifndef OPENSSL_NO_EC
91+ &eckey_asn1_meth,
92+#endif
93+ &hmac_asn1_meth
94+ };
95+
96+typedef int sk_cmp_fn_type(const char * const *a, const char * const *b);
97+DECLARE_STACK_OF(EVP_PKEY_ASN1_METHOD)
98+static STACK_OF(EVP_PKEY_ASN1_METHOD) *app_methods = NULL;
99+
100+
101+
#ifdef TEST
/* Debug aid, built only with -DTEST: print every built-in method in
 * table order together with its pkey_id and short name, so the sort
 * order of standard_methods[] can be checked by eye.
 */
int main(void)
	{
	size_t i;
	const size_t num = sizeof(standard_methods)/sizeof(standard_methods[0]);

	for (i = 0; i < num; i++)
		fprintf(stderr, "Number %d id=%d (%s)\n", (int)i,
			standard_methods[i]->pkey_id,
			OBJ_nid2sn(standard_methods[i]->pkey_id));
	return 0;
	}
#endif
114+
115+DECLARE_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_ASN1_METHOD *,
116+ const EVP_PKEY_ASN1_METHOD *, ameth);
117+
118+static int ameth_cmp(const EVP_PKEY_ASN1_METHOD * const *a,
119+ const EVP_PKEY_ASN1_METHOD * const *b)
120+ {
121+ return ((*a)->pkey_id - (*b)->pkey_id);
122+ }
123+
124+IMPLEMENT_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_ASN1_METHOD *,
125+ const EVP_PKEY_ASN1_METHOD *, ameth);
126+
127+int EVP_PKEY_asn1_get_count(void)
128+ {
129+ int num = sizeof(standard_methods)/sizeof(EVP_PKEY_ASN1_METHOD *);
130+ if (app_methods)
131+ num += sk_EVP_PKEY_ASN1_METHOD_num(app_methods);
132+ return num;
133+ }
134+
135+const EVP_PKEY_ASN1_METHOD *EVP_PKEY_asn1_get0(int idx)
136+ {
137+ int num = sizeof(standard_methods)/sizeof(EVP_PKEY_ASN1_METHOD *);
138+ if (idx < 0)
139+ return NULL;
140+ if (idx < num)
141+ return standard_methods[idx];
142+ idx -= num;
143+ return sk_EVP_PKEY_ASN1_METHOD_value(app_methods, idx);
144+ }
145+
146+static const EVP_PKEY_ASN1_METHOD *pkey_asn1_find(int type)
147+ {
148+ EVP_PKEY_ASN1_METHOD tmp;
149+ const EVP_PKEY_ASN1_METHOD *t = &tmp, **ret;
150+ tmp.pkey_id = type;
151+ if (app_methods)
152+ {
153+ int idx;
154+ idx = sk_EVP_PKEY_ASN1_METHOD_find(app_methods, &tmp);
155+ if (idx >= 0)
156+ return sk_EVP_PKEY_ASN1_METHOD_value(app_methods, idx);
157+ }
158+ ret = OBJ_bsearch_ameth(&t, standard_methods,
159+ sizeof(standard_methods)
160+ /sizeof(EVP_PKEY_ASN1_METHOD *));
161+ if (!ret || !*ret)
162+ return NULL;
163+ return *ret;
164+ }
165+
166+/* Find an implementation of an ASN1 algorithm. If 'pe' is not NULL
167+ * also search through engines and set *pe to a functional reference
168+ * to the engine implementing 'type' or NULL if no engine implements
169+ * it.
170+ */
171+
172+const EVP_PKEY_ASN1_METHOD *EVP_PKEY_asn1_find(ENGINE **pe, int type)
173+ {
174+ const EVP_PKEY_ASN1_METHOD *t;
175+ ENGINE *e;
176+
177+ for (;;)
178+ {
179+ t = pkey_asn1_find(type);
180+ if (!t || !(t->pkey_flags & ASN1_PKEY_ALIAS))
181+ break;
182+ type = t->pkey_base_id;
183+ }
184+ if (pe)
185+ {
186+#ifndef OPENSSL_NO_ENGINE
187+ /* type will contain the final unaliased type */
188+ e = ENGINE_get_pkey_asn1_meth_engine(type);
189+ if (e)
190+ {
191+ *pe = e;
192+ return ENGINE_get_pkey_asn1_meth(e, type);
193+ }
194+#endif
195+ *pe = NULL;
196+ }
197+ return t;
198+ }
199+
200+const EVP_PKEY_ASN1_METHOD *EVP_PKEY_asn1_find_str(ENGINE **pe,
201+ const char *str, int len)
202+ {
203+ int i;
204+ const EVP_PKEY_ASN1_METHOD *ameth;
205+ if (len == -1)
206+ len = strlen(str);
207+ if (pe)
208+ {
209+#ifndef OPENSSL_NO_ENGINE
210+ ENGINE *e;
211+ ameth = ENGINE_pkey_asn1_find_str(&e, str, len);
212+ if (ameth)
213+ {
214+ /* Convert structural into
215+ * functional reference
216+ */
217+ if (!ENGINE_init(e))
218+ ameth = NULL;
219+ ENGINE_free(e);
220+ *pe = e;
221+ return ameth;
222+ }
223+#endif
224+ *pe = NULL;
225+ }
226+ for (i = 0; i < EVP_PKEY_asn1_get_count(); i++)
227+ {
228+ ameth = EVP_PKEY_asn1_get0(i);
229+ if (ameth->pkey_flags & ASN1_PKEY_ALIAS)
230+ continue;
231+ if (((int)strlen(ameth->pem_str) == len) &&
232+ !strncasecmp(ameth->pem_str, str, len))
233+ return ameth;
234+ }
235+ return NULL;
236+ }
237+
238+int EVP_PKEY_asn1_add0(const EVP_PKEY_ASN1_METHOD *ameth)
239+ {
240+ if (app_methods == NULL)
241+ {
242+ app_methods = sk_EVP_PKEY_ASN1_METHOD_new(ameth_cmp);
243+ if (!app_methods)
244+ return 0;
245+ }
246+ if (!sk_EVP_PKEY_ASN1_METHOD_push(app_methods, ameth))
247+ return 0;
248+ sk_EVP_PKEY_ASN1_METHOD_sort(app_methods);
249+ return 1;
250+ }
251+
252+int EVP_PKEY_asn1_add_alias(int to, int from)
253+ {
254+ EVP_PKEY_ASN1_METHOD *ameth;
255+ ameth = EVP_PKEY_asn1_new(from, ASN1_PKEY_ALIAS, NULL, NULL);
256+ if (!ameth)
257+ return 0;
258+ ameth->pkey_base_id = to;
259+ return EVP_PKEY_asn1_add0(ameth);
260+ }
261+
262+int EVP_PKEY_asn1_get0_info(int *ppkey_id, int *ppkey_base_id, int *ppkey_flags,
263+ const char **pinfo, const char **ppem_str,
264+ const EVP_PKEY_ASN1_METHOD *ameth)
265+ {
266+ if (!ameth)
267+ return 0;
268+ if (ppkey_id)
269+ *ppkey_id = ameth->pkey_id;
270+ if (ppkey_base_id)
271+ *ppkey_base_id = ameth->pkey_base_id;
272+ if (ppkey_flags)
273+ *ppkey_flags = ameth->pkey_flags;
274+ if (pinfo)
275+ *pinfo = ameth->info;
276+ if (ppem_str)
277+ *ppem_str = ameth->pem_str;
278+ return 1;
279+ }
280+
281+const EVP_PKEY_ASN1_METHOD* EVP_PKEY_get0_asn1(EVP_PKEY *pkey)
282+ {
283+ return pkey->ameth;
284+ }
285+
286+EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags,
287+ const char *pem_str, const char *info)
288+ {
289+ EVP_PKEY_ASN1_METHOD *ameth;
290+ ameth = OPENSSL_malloc(sizeof(EVP_PKEY_ASN1_METHOD));
291+ if (!ameth)
292+ return NULL;
293+
294+ ameth->pkey_id = id;
295+ ameth->pkey_base_id = id;
296+ ameth->pkey_flags = flags | ASN1_PKEY_DYNAMIC;
297+
298+ if (info)
299+ {
300+ ameth->info = BUF_strdup(info);
301+ if (!ameth->info)
302+ goto err;
303+ }
304+
305+ if (pem_str)
306+ {
307+ ameth->pem_str = BUF_strdup(pem_str);
308+ if (!ameth->pem_str)
309+ goto err;
310+ }
311+
312+ ameth->pub_decode = 0;
313+ ameth->pub_encode = 0;
314+ ameth->pub_cmp = 0;
315+ ameth->pub_print = 0;
316+
317+ ameth->priv_decode = 0;
318+ ameth->priv_encode = 0;
319+ ameth->priv_print = 0;
320+
321+ ameth->old_priv_encode = 0;
322+ ameth->old_priv_decode = 0;
323+
324+ ameth->pkey_size = 0;
325+ ameth->pkey_bits = 0;
326+
327+ ameth->param_decode = 0;
328+ ameth->param_encode = 0;
329+ ameth->param_missing = 0;
330+ ameth->param_copy = 0;
331+ ameth->param_cmp = 0;
332+ ameth->param_print = 0;
333+
334+ ameth->pkey_free = 0;
335+ ameth->pkey_ctrl = 0;
336+
337+ return ameth;
338+
339+ err:
340+
341+ EVP_PKEY_asn1_free(ameth);
342+ return NULL;
343+
344+ }
345+
346+void EVP_PKEY_asn1_copy(EVP_PKEY_ASN1_METHOD *dst,
347+ const EVP_PKEY_ASN1_METHOD *src)
348+ {
349+
350+ dst->pub_decode = src->pub_decode;
351+ dst->pub_encode = src->pub_encode;
352+ dst->pub_cmp = src->pub_cmp;
353+ dst->pub_print = src->pub_print;
354+
355+ dst->priv_decode = src->priv_decode;
356+ dst->priv_encode = src->priv_encode;
357+ dst->priv_print = src->priv_print;
358+
359+ dst->old_priv_encode = src->old_priv_encode;
360+ dst->old_priv_decode = src->old_priv_decode;
361+
362+ dst->pkey_size = src->pkey_size;
363+ dst->pkey_bits = src->pkey_bits;
364+
365+ dst->param_decode = src->param_decode;
366+ dst->param_encode = src->param_encode;
367+ dst->param_missing = src->param_missing;
368+ dst->param_copy = src->param_copy;
369+ dst->param_cmp = src->param_cmp;
370+ dst->param_print = src->param_print;
371+
372+ dst->pkey_free = src->pkey_free;
373+ dst->pkey_ctrl = src->pkey_ctrl;
374+
375+ }
376+
377+void EVP_PKEY_asn1_free(EVP_PKEY_ASN1_METHOD *ameth)
378+ {
379+ if (ameth && (ameth->pkey_flags & ASN1_PKEY_DYNAMIC))
380+ {
381+ if (ameth->pem_str)
382+ OPENSSL_free(ameth->pem_str);
383+ if (ameth->info)
384+ OPENSSL_free(ameth->info);
385+ OPENSSL_free(ameth);
386+ }
387+ }
388+
389+void EVP_PKEY_asn1_set_public(EVP_PKEY_ASN1_METHOD *ameth,
390+ int (*pub_decode)(EVP_PKEY *pk, X509_PUBKEY *pub),
391+ int (*pub_encode)(X509_PUBKEY *pub, const EVP_PKEY *pk),
392+ int (*pub_cmp)(const EVP_PKEY *a, const EVP_PKEY *b),
393+ int (*pub_print)(BIO *out, const EVP_PKEY *pkey, int indent,
394+ ASN1_PCTX *pctx),
395+ int (*pkey_size)(const EVP_PKEY *pk),
396+ int (*pkey_bits)(const EVP_PKEY *pk))
397+ {
398+ ameth->pub_decode = pub_decode;
399+ ameth->pub_encode = pub_encode;
400+ ameth->pub_cmp = pub_cmp;
401+ ameth->pub_print = pub_print;
402+ ameth->pkey_size = pkey_size;
403+ ameth->pkey_bits = pkey_bits;
404+ }
405+
406+void EVP_PKEY_asn1_set_private(EVP_PKEY_ASN1_METHOD *ameth,
407+ int (*priv_decode)(EVP_PKEY *pk, PKCS8_PRIV_KEY_INFO *p8inf),
408+ int (*priv_encode)(PKCS8_PRIV_KEY_INFO *p8, const EVP_PKEY *pk),
409+ int (*priv_print)(BIO *out, const EVP_PKEY *pkey, int indent,
410+ ASN1_PCTX *pctx))
411+ {
412+ ameth->priv_decode = priv_decode;
413+ ameth->priv_encode = priv_encode;
414+ ameth->priv_print = priv_print;
415+ }
416+
417+void EVP_PKEY_asn1_set_param(EVP_PKEY_ASN1_METHOD *ameth,
418+ int (*param_decode)(EVP_PKEY *pkey,
419+ const unsigned char **pder, int derlen),
420+ int (*param_encode)(const EVP_PKEY *pkey, unsigned char **pder),
421+ int (*param_missing)(const EVP_PKEY *pk),
422+ int (*param_copy)(EVP_PKEY *to, const EVP_PKEY *from),
423+ int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b),
424+ int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent,
425+ ASN1_PCTX *pctx))
426+ {
427+ ameth->param_decode = param_decode;
428+ ameth->param_encode = param_encode;
429+ ameth->param_missing = param_missing;
430+ ameth->param_copy = param_copy;
431+ ameth->param_cmp = param_cmp;
432+ ameth->param_print = param_print;
433+ }
434+
435+void EVP_PKEY_asn1_set_free(EVP_PKEY_ASN1_METHOD *ameth,
436+ void (*pkey_free)(EVP_PKEY *pkey))
437+ {
438+ ameth->pkey_free = pkey_free;
439+ }
440+
441+void EVP_PKEY_asn1_set_ctrl(EVP_PKEY_ASN1_METHOD *ameth,
442+ int (*pkey_ctrl)(EVP_PKEY *pkey, int op,
443+ long arg1, void *arg2))
444+ {
445+ ameth->pkey_ctrl = pkey_ctrl;
446+ }
--- /dev/null
+++ b/crypto/asn1/asn1_locl.h
@@ -0,0 +1,134 @@
1+/* asn1_locl.h */
2+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
3+ * project 2006.
4+ */
5+/* ====================================================================
6+ * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
7+ *
8+ * Redistribution and use in source and binary forms, with or without
9+ * modification, are permitted provided that the following conditions
10+ * are met:
11+ *
12+ * 1. Redistributions of source code must retain the above copyright
13+ * notice, this list of conditions and the following disclaimer.
14+ *
15+ * 2. Redistributions in binary form must reproduce the above copyright
16+ * notice, this list of conditions and the following disclaimer in
17+ * the documentation and/or other materials provided with the
18+ * distribution.
19+ *
20+ * 3. All advertising materials mentioning features or use of this
21+ * software must display the following acknowledgment:
22+ * "This product includes software developed by the OpenSSL Project
23+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
24+ *
25+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
26+ * endorse or promote products derived from this software without
27+ * prior written permission. For written permission, please contact
28+ * licensing@OpenSSL.org.
29+ *
30+ * 5. Products derived from this software may not be called "OpenSSL"
31+ * nor may "OpenSSL" appear in their names without prior written
32+ * permission of the OpenSSL Project.
33+ *
34+ * 6. Redistributions of any form whatsoever must retain the following
35+ * acknowledgment:
36+ * "This product includes software developed by the OpenSSL Project
37+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
38+ *
39+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
40+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
41+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
42+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
43+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
45+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
46+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
48+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
49+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
50+ * OF THE POSSIBILITY OF SUCH DAMAGE.
51+ * ====================================================================
52+ *
53+ * This product includes cryptographic software written by Eric Young
54+ * (eay@cryptsoft.com). This product includes software written by Tim
55+ * Hudson (tjh@cryptsoft.com).
56+ *
57+ */
58+
59+/* Internal ASN1 structures and functions: not for application use */
60+
/*
 * ASN1 print context structure.
 *
 * Holds five independent flag words. NOTE(review): judging by the field
 * names these select general, name, certificate, OID and string printing
 * options respectively -- confirm against the users of ASN1_PCTX.
 */
struct asn1_pctx_st {
    unsigned long flags;
    unsigned long nm_flags;
    unsigned long cert_flags;
    unsigned long oid_flags;
    unsigned long str_flags;
} /* ASN1_PCTX */ ;
71+
72+/* ASN1 public key method structure */
73+
74+struct evp_pkey_asn1_method_st
75+ {
76+ int pkey_id;
77+ int pkey_base_id;
78+ unsigned long pkey_flags;
79+
80+ char *pem_str;
81+ char *info;
82+
83+ int (*pub_decode)(EVP_PKEY *pk, X509_PUBKEY *pub);
84+ int (*pub_encode)(X509_PUBKEY *pub, const EVP_PKEY *pk);
85+ int (*pub_cmp)(const EVP_PKEY *a, const EVP_PKEY *b);
86+ int (*pub_print)(BIO *out, const EVP_PKEY *pkey, int indent,
87+ ASN1_PCTX *pctx);
88+
89+ int (*priv_decode)(EVP_PKEY *pk, PKCS8_PRIV_KEY_INFO *p8inf);
90+ int (*priv_encode)(PKCS8_PRIV_KEY_INFO *p8, const EVP_PKEY *pk);
91+ int (*priv_print)(BIO *out, const EVP_PKEY *pkey, int indent,
92+ ASN1_PCTX *pctx);
93+
94+ int (*pkey_size)(const EVP_PKEY *pk);
95+ int (*pkey_bits)(const EVP_PKEY *pk);
96+
97+ int (*param_decode)(EVP_PKEY *pkey,
98+ const unsigned char **pder, int derlen);
99+ int (*param_encode)(const EVP_PKEY *pkey, unsigned char **pder);
100+ int (*param_missing)(const EVP_PKEY *pk);
101+ int (*param_copy)(EVP_PKEY *to, const EVP_PKEY *from);
102+ int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b);
103+ int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent,
104+ ASN1_PCTX *pctx);
105+
106+ void (*pkey_free)(EVP_PKEY *pkey);
107+ int (*pkey_ctrl)(EVP_PKEY *pkey, int op, long arg1, void *arg2);
108+
109+ /* Legacy functions for old PEM */
110+
111+ int (*old_priv_decode)(EVP_PKEY *pkey,
112+ const unsigned char **pder, int derlen);
113+ int (*old_priv_encode)(const EVP_PKEY *pkey, unsigned char **pder);
114+
115+ } /* EVP_PKEY_ASN1_METHOD */;
116+
117+/* Method to handle CRL access.
118+ * In general a CRL could be very large (several Mb) and can consume large
119+ * amounts of resources if stored in memory by multiple processes.
120+ * This method allows general CRL operations to be redirected to more
121+ * efficient callbacks: for example a CRL entry database.
122+ */
123+
124+#define X509_CRL_METHOD_DYNAMIC 1
125+
126+struct x509_crl_method_st
127+ {
128+ int flags;
129+ int (*crl_init)(X509_CRL *crl);
130+ int (*crl_free)(X509_CRL *crl);
131+ int (*crl_lookup)(X509_CRL *crl, X509_REVOKED **ret,
132+ ASN1_INTEGER *ser, X509_NAME *issuer);
133+ int (*crl_verify)(X509_CRL *crl, EVP_PKEY *pk);
134+ };
--- /dev/null
+++ b/crypto/asn1/bio_asn1.c
@@ -0,0 +1,495 @@
1+/* bio_asn1.c */
2+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
3+ * project.
4+ */
5+/* ====================================================================
6+ * Copyright (c) 2006 The OpenSSL Project. All rights reserved.
7+ *
8+ * Redistribution and use in source and binary forms, with or without
9+ * modification, are permitted provided that the following conditions
10+ * are met:
11+ *
12+ * 1. Redistributions of source code must retain the above copyright
13+ * notice, this list of conditions and the following disclaimer.
14+ *
15+ * 2. Redistributions in binary form must reproduce the above copyright
16+ * notice, this list of conditions and the following disclaimer in
17+ * the documentation and/or other materials provided with the
18+ * distribution.
19+ *
20+ * 3. All advertising materials mentioning features or use of this
21+ * software must display the following acknowledgment:
22+ * "This product includes software developed by the OpenSSL Project
23+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
24+ *
25+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
26+ * endorse or promote products derived from this software without
27+ * prior written permission. For written permission, please contact
28+ * licensing@OpenSSL.org.
29+ *
30+ * 5. Products derived from this software may not be called "OpenSSL"
31+ * nor may "OpenSSL" appear in their names without prior written
32+ * permission of the OpenSSL Project.
33+ *
34+ * 6. Redistributions of any form whatsoever must retain the following
35+ * acknowledgment:
36+ * "This product includes software developed by the OpenSSL Project
37+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
38+ *
39+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
40+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
41+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
42+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
43+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
45+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
46+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
48+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
49+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
50+ * OF THE POSSIBILITY OF SUCH DAMAGE.
51+ * ====================================================================
52+ *
53+ * This product includes cryptographic software written by Eric Young
54+ * (eay@cryptsoft.com). This product includes software written by Tim
55+ * Hudson (tjh@cryptsoft.com).
56+ *
57+ */
58+
59+/* Experimental ASN1 BIO. When written through the data is converted
60+ * to an ASN1 string type: default is OCTET STRING. Additional functions
61+ * can be provided to add prefix and suffix data.
62+ */
63+
64+#include <string.h>
65+#include <openssl/bio.h>
66+#include <openssl/asn1.h>
67+
/*
 * Size of the header buffer allocated for each ASN1 BIO.
 * Must be large enough for the biggest tag+length encoding.
 */
#define DEFAULT_ASN1_BUF_SIZE   20
70+
/* States of the ASN1 BIO write/flush state machine, in declaration order. */
typedef enum {
    ASN1_STATE_START,           /* nothing written yet; run prefix setup */
    ASN1_STATE_PRE_COPY,        /* writing prefix data to the next BIO */
    ASN1_STATE_HEADER,          /* build tag+length header for a chunk */
    ASN1_STATE_HEADER_COPY,     /* writing the header to the next BIO */
    ASN1_STATE_DATA_COPY,       /* writing chunk contents to the next BIO */
    ASN1_STATE_POST_COPY,       /* writing suffix data during flush */
    ASN1_STATE_DONE             /* suffix written; flush passes through */
} asn1_bio_state_t;
81+
82+typedef struct BIO_ASN1_EX_FUNCS_st
83+ {
84+ asn1_ps_func *ex_func;
85+ asn1_ps_func *ex_free_func;
86+ } BIO_ASN1_EX_FUNCS;
87+
88+typedef struct BIO_ASN1_BUF_CTX_t
89+ {
90+ /* Internal state */
91+ asn1_bio_state_t state;
92+ /* Internal buffer */
93+ unsigned char *buf;
94+ /* Size of buffer */
95+ int bufsize;
96+ /* Current position in buffer */
97+ int bufpos;
98+ /* Current buffer length */
99+ int buflen;
100+ /* Amount of data to copy */
101+ int copylen;
102+ /* Class and tag to use */
103+ int asn1_class, asn1_tag;
104+ asn1_ps_func *prefix, *prefix_free, *suffix, *suffix_free;
105+ /* Extra buffer for prefix and suffix data */
106+ unsigned char *ex_buf;
107+ int ex_len;
108+ int ex_pos;
109+ void *ex_arg;
110+ } BIO_ASN1_BUF_CTX;
111+
112+
113+static int asn1_bio_write(BIO *h, const char *buf,int num);
114+static int asn1_bio_read(BIO *h, char *buf, int size);
115+static int asn1_bio_puts(BIO *h, const char *str);
116+static int asn1_bio_gets(BIO *h, char *str, int size);
117+static long asn1_bio_ctrl(BIO *h, int cmd, long arg1, void *arg2);
118+static int asn1_bio_new(BIO *h);
119+static int asn1_bio_free(BIO *data);
120+static long asn1_bio_callback_ctrl(BIO *h, int cmd, bio_info_cb *fp);
121+
122+static int asn1_bio_init(BIO_ASN1_BUF_CTX *ctx, int size);
123+static int asn1_bio_flush_ex(BIO *b, BIO_ASN1_BUF_CTX *ctx,
124+ asn1_ps_func *cleanup, asn1_bio_state_t next);
125+static int asn1_bio_setup_ex(BIO *b, BIO_ASN1_BUF_CTX *ctx,
126+ asn1_ps_func *setup,
127+ asn1_bio_state_t ex_state,
128+ asn1_bio_state_t other_state);
129+
130+static BIO_METHOD methods_asn1=
131+ {
132+ BIO_TYPE_ASN1,
133+ "asn1",
134+ asn1_bio_write,
135+ asn1_bio_read,
136+ asn1_bio_puts,
137+ asn1_bio_gets,
138+ asn1_bio_ctrl,
139+ asn1_bio_new,
140+ asn1_bio_free,
141+ asn1_bio_callback_ctrl,
142+ };
143+
144+BIO_METHOD *BIO_f_asn1(void)
145+ {
146+ return(&methods_asn1);
147+ }
148+
149+
150+static int asn1_bio_new(BIO *b)
151+ {
152+ BIO_ASN1_BUF_CTX *ctx;
153+ ctx = OPENSSL_malloc(sizeof(BIO_ASN1_BUF_CTX));
154+ if (!ctx)
155+ return 0;
156+ if (!asn1_bio_init(ctx, DEFAULT_ASN1_BUF_SIZE))
157+ return 0;
158+ b->init = 1;
159+ b->ptr = (char *)ctx;
160+ b->flags = 0;
161+ return 1;
162+ }
163+
164+static int asn1_bio_init(BIO_ASN1_BUF_CTX *ctx, int size)
165+ {
166+ ctx->buf = OPENSSL_malloc(size);
167+ if (!ctx->buf)
168+ return 0;
169+ ctx->bufsize = size;
170+ ctx->bufpos = 0;
171+ ctx->buflen = 0;
172+ ctx->copylen = 0;
173+ ctx->asn1_class = V_ASN1_UNIVERSAL;
174+ ctx->asn1_tag = V_ASN1_OCTET_STRING;
175+ ctx->ex_buf = 0;
176+ ctx->ex_pos = 0;
177+ ctx->ex_len = 0;
178+ ctx->state = ASN1_STATE_START;
179+ return 1;
180+ }
181+
182+static int asn1_bio_free(BIO *b)
183+ {
184+ BIO_ASN1_BUF_CTX *ctx;
185+ ctx = (BIO_ASN1_BUF_CTX *) b->ptr;
186+ if (ctx == NULL)
187+ return 0;
188+ if (ctx->buf)
189+ OPENSSL_free(ctx->buf);
190+ OPENSSL_free(ctx);
191+ b->init = 0;
192+ b->ptr = NULL;
193+ b->flags = 0;
194+ return 1;
195+ }
196+
197+static int asn1_bio_write(BIO *b, const char *in , int inl)
198+ {
199+ BIO_ASN1_BUF_CTX *ctx;
200+ int wrmax, wrlen, ret;
201+ unsigned char *p;
202+ if (!in || (inl < 0) || (b->next_bio == NULL))
203+ return 0;
204+ ctx = (BIO_ASN1_BUF_CTX *) b->ptr;
205+ if (ctx == NULL)
206+ return 0;
207+
208+ wrlen = 0;
209+ ret = -1;
210+
211+ for(;;)
212+ {
213+ switch (ctx->state)
214+ {
215+
216+ /* Setup prefix data, call it */
217+ case ASN1_STATE_START:
218+ if (!asn1_bio_setup_ex(b, ctx, ctx->prefix,
219+ ASN1_STATE_PRE_COPY, ASN1_STATE_HEADER))
220+ return 0;
221+ break;
222+
223+ /* Copy any pre data first */
224+ case ASN1_STATE_PRE_COPY:
225+
226+ ret = asn1_bio_flush_ex(b, ctx, ctx->prefix_free,
227+ ASN1_STATE_HEADER);
228+
229+ if (ret <= 0)
230+ goto done;
231+
232+ break;
233+
234+ case ASN1_STATE_HEADER:
235+ ctx->buflen =
236+ ASN1_object_size(0, inl, ctx->asn1_tag) - inl;
237+ OPENSSL_assert(ctx->buflen <= ctx->bufsize);
238+ p = ctx->buf;
239+ ASN1_put_object(&p, 0, inl,
240+ ctx->asn1_tag, ctx->asn1_class);
241+ ctx->copylen = inl;
242+ ctx->state = ASN1_STATE_HEADER_COPY;
243+
244+ break;
245+
246+ case ASN1_STATE_HEADER_COPY:
247+ ret = BIO_write(b->next_bio,
248+ ctx->buf + ctx->bufpos, ctx->buflen);
249+ if (ret <= 0)
250+ goto done;
251+
252+ ctx->buflen -= ret;
253+ if (ctx->buflen)
254+ ctx->bufpos += ret;
255+ else
256+ {
257+ ctx->bufpos = 0;
258+ ctx->state = ASN1_STATE_DATA_COPY;
259+ }
260+
261+ break;
262+
263+ case ASN1_STATE_DATA_COPY:
264+
265+ if (inl > ctx->copylen)
266+ wrmax = ctx->copylen;
267+ else
268+ wrmax = inl;
269+ ret = BIO_write(b->next_bio, in, wrmax);
270+ if (ret <= 0)
271+ break;
272+ wrlen += ret;
273+ ctx->copylen -= ret;
274+ in += ret;
275+ inl -= ret;
276+
277+ if (ctx->copylen == 0)
278+ ctx->state = ASN1_STATE_HEADER;
279+
280+ if (inl == 0)
281+ goto done;
282+
283+ break;
284+
285+ default:
286+ BIO_clear_retry_flags(b);
287+ return 0;
288+
289+ }
290+
291+ }
292+
293+ done:
294+ BIO_clear_retry_flags(b);
295+ BIO_copy_next_retry(b);
296+
297+ return (wrlen > 0) ? wrlen : ret;
298+
299+ }
300+
301+static int asn1_bio_flush_ex(BIO *b, BIO_ASN1_BUF_CTX *ctx,
302+ asn1_ps_func *cleanup, asn1_bio_state_t next)
303+ {
304+ int ret;
305+ if (ctx->ex_len <= 0)
306+ return 1;
307+ for(;;)
308+ {
309+ ret = BIO_write(b->next_bio, ctx->ex_buf + ctx->ex_pos,
310+ ctx->ex_len);
311+ if (ret <= 0)
312+ break;
313+ ctx->ex_len -= ret;
314+ if (ctx->ex_len > 0)
315+ ctx->ex_pos += ret;
316+ else
317+ {
318+ if(cleanup)
319+ cleanup(b, &ctx->ex_buf, &ctx->ex_len,
320+ &ctx->ex_arg);
321+ ctx->state = next;
322+ ctx->ex_pos = 0;
323+ break;
324+ }
325+ }
326+ return ret;
327+ }
328+
329+static int asn1_bio_setup_ex(BIO *b, BIO_ASN1_BUF_CTX *ctx,
330+ asn1_ps_func *setup,
331+ asn1_bio_state_t ex_state,
332+ asn1_bio_state_t other_state)
333+ {
334+ if (setup && !setup(b, &ctx->ex_buf, &ctx->ex_len, &ctx->ex_arg))
335+ {
336+ BIO_clear_retry_flags(b);
337+ return 0;
338+ }
339+ if (ctx->ex_len > 0)
340+ ctx->state = ex_state;
341+ else
342+ ctx->state = other_state;
343+ return 1;
344+ }
345+
346+static int asn1_bio_read(BIO *b, char *in , int inl)
347+ {
348+ if (!b->next_bio)
349+ return 0;
350+ return BIO_read(b->next_bio, in , inl);
351+ }
352+
353+static int asn1_bio_puts(BIO *b, const char *str)
354+ {
355+ return asn1_bio_write(b, str, strlen(str));
356+ }
357+
358+static int asn1_bio_gets(BIO *b, char *str, int size)
359+ {
360+ if (!b->next_bio)
361+ return 0;
362+ return BIO_gets(b->next_bio, str , size);
363+ }
364+
365+static long asn1_bio_callback_ctrl(BIO *b, int cmd, bio_info_cb *fp)
366+ {
367+ if (b->next_bio == NULL) return(0);
368+ return BIO_callback_ctrl(b->next_bio,cmd,fp);
369+ }
370+
371+static long asn1_bio_ctrl(BIO *b, int cmd, long arg1, void *arg2)
372+ {
373+ BIO_ASN1_BUF_CTX *ctx;
374+ BIO_ASN1_EX_FUNCS *ex_func;
375+ long ret = 1;
376+ ctx = (BIO_ASN1_BUF_CTX *) b->ptr;
377+ if (ctx == NULL)
378+ return 0;
379+ switch(cmd)
380+ {
381+
382+ case BIO_C_SET_PREFIX:
383+ ex_func = arg2;
384+ ctx->prefix = ex_func->ex_func;
385+ ctx->prefix_free = ex_func->ex_free_func;
386+ break;
387+
388+ case BIO_C_GET_PREFIX:
389+ ex_func = arg2;
390+ ex_func->ex_func = ctx->prefix;
391+ ex_func->ex_free_func = ctx->prefix_free;
392+ break;
393+
394+ case BIO_C_SET_SUFFIX:
395+ ex_func = arg2;
396+ ctx->suffix = ex_func->ex_func;
397+ ctx->suffix_free = ex_func->ex_free_func;
398+ break;
399+
400+ case BIO_C_GET_SUFFIX:
401+ ex_func = arg2;
402+ ex_func->ex_func = ctx->suffix;
403+ ex_func->ex_free_func = ctx->suffix_free;
404+ break;
405+
406+ case BIO_C_SET_EX_ARG:
407+ ctx->ex_arg = arg2;
408+ break;
409+
410+ case BIO_C_GET_EX_ARG:
411+ *(void **)arg2 = ctx->ex_arg;
412+ break;
413+
414+ case BIO_CTRL_FLUSH:
415+ if (!b->next_bio)
416+ return 0;
417+
418+ /* Call post function if possible */
419+ if (ctx->state == ASN1_STATE_HEADER)
420+ {
421+ if (!asn1_bio_setup_ex(b, ctx, ctx->suffix,
422+ ASN1_STATE_POST_COPY, ASN1_STATE_DONE))
423+ return 0;
424+ }
425+
426+ if (ctx->state == ASN1_STATE_POST_COPY)
427+ {
428+ ret = asn1_bio_flush_ex(b, ctx, ctx->suffix_free,
429+ ASN1_STATE_DONE);
430+ if (ret <= 0)
431+ return ret;
432+ }
433+
434+ if (ctx->state == ASN1_STATE_DONE)
435+ return BIO_ctrl(b->next_bio, cmd, arg1, arg2);
436+ else
437+ {
438+ BIO_clear_retry_flags(b);
439+ return 0;
440+ }
441+ break;
442+
443+
444+ default:
445+ if (!b->next_bio)
446+ return 0;
447+ return BIO_ctrl(b->next_bio, cmd, arg1, arg2);
448+
449+ }
450+
451+ return ret;
452+ }
453+
454+static int asn1_bio_set_ex(BIO *b, int cmd,
455+ asn1_ps_func *ex_func, asn1_ps_func *ex_free_func)
456+ {
457+ BIO_ASN1_EX_FUNCS extmp;
458+ extmp.ex_func = ex_func;
459+ extmp.ex_free_func = ex_free_func;
460+ return BIO_ctrl(b, cmd, 0, &extmp);
461+ }
462+
463+static int asn1_bio_get_ex(BIO *b, int cmd,
464+ asn1_ps_func **ex_func, asn1_ps_func **ex_free_func)
465+ {
466+ BIO_ASN1_EX_FUNCS extmp;
467+ int ret;
468+ ret = BIO_ctrl(b, cmd, 0, &extmp);
469+ if (ret > 0)
470+ {
471+ *ex_func = extmp.ex_func;
472+ *ex_free_func = extmp.ex_free_func;
473+ }
474+ return ret;
475+ }
476+
477+int BIO_asn1_set_prefix(BIO *b, asn1_ps_func *prefix, asn1_ps_func *prefix_free)
478+ {
479+ return asn1_bio_set_ex(b, BIO_C_SET_PREFIX, prefix, prefix_free);
480+ }
481+
482+int BIO_asn1_get_prefix(BIO *b, asn1_ps_func **pprefix, asn1_ps_func **pprefix_free)
483+ {
484+ return asn1_bio_get_ex(b, BIO_C_GET_PREFIX, pprefix, pprefix_free);
485+ }
486+
487+int BIO_asn1_set_suffix(BIO *b, asn1_ps_func *suffix, asn1_ps_func *suffix_free)
488+ {
489+ return asn1_bio_set_ex(b, BIO_C_SET_SUFFIX, suffix, suffix_free);
490+ }
491+
492+int BIO_asn1_get_suffix(BIO *b, asn1_ps_func **psuffix, asn1_ps_func **psuffix_free)
493+ {
494+ return asn1_bio_get_ex(b, BIO_C_GET_SUFFIX, psuffix, psuffix_free);
495+ }
--- /dev/null
+++ b/crypto/asn1/bio_ndef.c
@@ -0,0 +1,246 @@
1+/* bio_ndef.c */
2+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
3+ * project.
4+ */
5+/* ====================================================================
6+ * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
7+ *
8+ * Redistribution and use in source and binary forms, with or without
9+ * modification, are permitted provided that the following conditions
10+ * are met:
11+ *
12+ * 1. Redistributions of source code must retain the above copyright
13+ * notice, this list of conditions and the following disclaimer.
14+ *
15+ * 2. Redistributions in binary form must reproduce the above copyright
16+ * notice, this list of conditions and the following disclaimer in
17+ * the documentation and/or other materials provided with the
18+ * distribution.
19+ *
20+ * 3. All advertising materials mentioning features or use of this
21+ * software must display the following acknowledgment:
22+ * "This product includes software developed by the OpenSSL Project
23+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
24+ *
25+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
26+ * endorse or promote products derived from this software without
27+ * prior written permission. For written permission, please contact
28+ * licensing@OpenSSL.org.
29+ *
30+ * 5. Products derived from this software may not be called "OpenSSL"
31+ * nor may "OpenSSL" appear in their names without prior written
32+ * permission of the OpenSSL Project.
33+ *
34+ * 6. Redistributions of any form whatsoever must retain the following
35+ * acknowledgment:
36+ * "This product includes software developed by the OpenSSL Project
37+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
38+ *
39+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
40+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
41+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
42+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
43+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
45+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
46+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
48+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
49+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
50+ * OF THE POSSIBILITY OF SUCH DAMAGE.
51+ * ====================================================================
52+ *
53+ */
54+
55+#include <openssl/asn1.h>
56+#include <openssl/asn1t.h>
57+#include <openssl/bio.h>
58+#include <openssl/err.h>
59+
60+#ifndef OPENSSL_SYSNAME_NETWARE
61+#include <memory.h>
62+#endif
63+#include <stdio.h>
64+
65+/* Experimental NDEF ASN1 BIO support routines */
66+
67+/* The usage is quite simple, initialize an ASN1 structure,
68+ * get a BIO from it then any data written through the BIO
69+ * will end up translated to appropriate format on the fly.
70+ * The data is streamed out and does *not* need to be
71+ * all held in memory at once.
72+ *
73+ * When the BIO is flushed the output is finalized and any
74+ * signatures etc written out.
75+ *
76+ * The BIO is a 'proper' BIO and can handle non blocking I/O
77+ * correctly.
78+ *
79+ * The usage is simple. The implementation is *not*...
80+ */
81+
82+/* BIO support data stored in the ASN1 BIO ex_arg */
83+
84+typedef struct ndef_aux_st
85+ {
86+ /* ASN1 structure this BIO refers to */
87+ ASN1_VALUE *val;
88+ const ASN1_ITEM *it;
89+ /* Top of the BIO chain */
90+ BIO *ndef_bio;
91+ /* Output BIO */
92+ BIO *out;
93+ /* Boundary where content is inserted */
94+ unsigned char **boundary;
95+ /* DER buffer start */
96+ unsigned char *derbuf;
97+ } NDEF_SUPPORT;
98+
99+static int ndef_prefix(BIO *b, unsigned char **pbuf, int *plen, void *parg);
100+static int ndef_prefix_free(BIO *b, unsigned char **pbuf, int *plen, void *parg);
101+static int ndef_suffix(BIO *b, unsigned char **pbuf, int *plen, void *parg);
102+static int ndef_suffix_free(BIO *b, unsigned char **pbuf, int *plen, void *parg);
103+
104+BIO *BIO_new_NDEF(BIO *out, ASN1_VALUE *val, const ASN1_ITEM *it)
105+ {
106+ NDEF_SUPPORT *ndef_aux = NULL;
107+ BIO *asn_bio = NULL;
108+ const ASN1_AUX *aux = it->funcs;
109+ ASN1_STREAM_ARG sarg;
110+
111+ if (!aux || !aux->asn1_cb)
112+ {
113+ ASN1err(ASN1_F_BIO_NEW_NDEF, ASN1_R_STREAMING_NOT_SUPPORTED);
114+ return NULL;
115+ }
116+ ndef_aux = OPENSSL_malloc(sizeof(NDEF_SUPPORT));
117+ asn_bio = BIO_new(BIO_f_asn1());
118+
119+ /* ASN1 bio needs to be next to output BIO */
120+
121+ out = BIO_push(asn_bio, out);
122+
123+ if (!ndef_aux || !asn_bio || !out)
124+ goto err;
125+
126+ BIO_asn1_set_prefix(asn_bio, ndef_prefix, ndef_prefix_free);
127+ BIO_asn1_set_suffix(asn_bio, ndef_suffix, ndef_suffix_free);
128+
129+ /* Now let callback prepend any digest, cipher etc BIOs
130+ * ASN1 structure needs.
131+ */
132+
133+ sarg.out = out;
134+ sarg.ndef_bio = NULL;
135+ sarg.boundary = NULL;
136+
137+ if (aux->asn1_cb(ASN1_OP_STREAM_PRE, &val, it, &sarg) <= 0)
138+ goto err;
139+
140+ ndef_aux->val = val;
141+ ndef_aux->it = it;
142+ ndef_aux->ndef_bio = sarg.ndef_bio;
143+ ndef_aux->boundary = sarg.boundary;
144+ ndef_aux->out = out;
145+
146+ BIO_ctrl(asn_bio, BIO_C_SET_EX_ARG, 0, ndef_aux);
147+
148+ return sarg.ndef_bio;
149+
150+ err:
151+ if (asn_bio)
152+ BIO_free(asn_bio);
153+ if (ndef_aux)
154+ OPENSSL_free(ndef_aux);
155+ return NULL;
156+ }
157+
158+static int ndef_prefix(BIO *b, unsigned char **pbuf, int *plen, void *parg)
159+ {
160+ NDEF_SUPPORT *ndef_aux;
161+ unsigned char *p;
162+ int derlen;
163+
164+ if (!parg)
165+ return 0;
166+
167+ ndef_aux = *(NDEF_SUPPORT **)parg;
168+
169+ derlen = ASN1_item_ndef_i2d(ndef_aux->val, NULL, ndef_aux->it);
170+ p = OPENSSL_malloc(derlen);
171+ ndef_aux->derbuf = p;
172+ *pbuf = p;
173+ derlen = ASN1_item_ndef_i2d(ndef_aux->val, &p, ndef_aux->it);
174+
175+ if (!*ndef_aux->boundary)
176+ return 0;
177+
178+ *plen = *ndef_aux->boundary - *pbuf;
179+
180+ return 1;
181+ }
182+
183+static int ndef_prefix_free(BIO *b, unsigned char **pbuf, int *plen, void *parg)
184+ {
185+ NDEF_SUPPORT *ndef_aux;
186+
187+ if (!parg)
188+ return 0;
189+
190+ ndef_aux = *(NDEF_SUPPORT **)parg;
191+
192+ if (ndef_aux->derbuf)
193+ OPENSSL_free(ndef_aux->derbuf);
194+
195+ ndef_aux->derbuf = NULL;
196+ *pbuf = NULL;
197+ *plen = 0;
198+ return 1;
199+ }
200+
201+static int ndef_suffix_free(BIO *b, unsigned char **pbuf, int *plen, void *parg)
202+ {
203+ NDEF_SUPPORT **pndef_aux = (NDEF_SUPPORT **)parg;
204+ if (!ndef_prefix_free(b, pbuf, plen, parg))
205+ return 0;
206+ OPENSSL_free(*pndef_aux);
207+ *pndef_aux = NULL;
208+ return 1;
209+ }
210+
211+static int ndef_suffix(BIO *b, unsigned char **pbuf, int *plen, void *parg)
212+ {
213+ NDEF_SUPPORT *ndef_aux;
214+ unsigned char *p;
215+ int derlen;
216+ const ASN1_AUX *aux;
217+ ASN1_STREAM_ARG sarg;
218+
219+ if (!parg)
220+ return 0;
221+
222+ ndef_aux = *(NDEF_SUPPORT **)parg;
223+
224+ aux = ndef_aux->it->funcs;
225+
226+ /* Finalize structures */
227+ sarg.ndef_bio = ndef_aux->ndef_bio;
228+ sarg.out = ndef_aux->out;
229+ sarg.boundary = ndef_aux->boundary;
230+ if (aux->asn1_cb(ASN1_OP_STREAM_POST,
231+ &ndef_aux->val, ndef_aux->it, &sarg) <= 0)
232+ return 0;
233+
234+ derlen = ASN1_item_ndef_i2d(ndef_aux->val, NULL, ndef_aux->it);
235+ p = OPENSSL_malloc(derlen);
236+ ndef_aux->derbuf = p;
237+ *pbuf = p;
238+ derlen = ASN1_item_ndef_i2d(ndef_aux->val, &p, ndef_aux->it);
239+
240+ if (!*ndef_aux->boundary)
241+ return 0;
242+ *pbuf = *ndef_aux->boundary;
243+ *plen = derlen - (*ndef_aux->boundary - ndef_aux->derbuf);
244+
245+ return 1;
246+ }
--- /dev/null
+++ b/crypto/asn1/x_nx509.c
@@ -0,0 +1,72 @@
1+/* x_nx509.c */
2+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
3+ * project 2005.
4+ */
5+/* ====================================================================
6+ * Copyright (c) 2005 The OpenSSL Project. All rights reserved.
7+ *
8+ * Redistribution and use in source and binary forms, with or without
9+ * modification, are permitted provided that the following conditions
10+ * are met:
11+ *
12+ * 1. Redistributions of source code must retain the above copyright
13+ * notice, this list of conditions and the following disclaimer.
14+ *
15+ * 2. Redistributions in binary form must reproduce the above copyright
16+ * notice, this list of conditions and the following disclaimer in
17+ * the documentation and/or other materials provided with the
18+ * distribution.
19+ *
20+ * 3. All advertising materials mentioning features or use of this
21+ * software must display the following acknowledgment:
22+ * "This product includes software developed by the OpenSSL Project
23+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
24+ *
25+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
26+ * endorse or promote products derived from this software without
27+ * prior written permission. For written permission, please contact
28+ * licensing@OpenSSL.org.
29+ *
30+ * 5. Products derived from this software may not be called "OpenSSL"
31+ * nor may "OpenSSL" appear in their names without prior written
32+ * permission of the OpenSSL Project.
33+ *
34+ * 6. Redistributions of any form whatsoever must retain the following
35+ * acknowledgment:
36+ * "This product includes software developed by the OpenSSL Project
37+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
38+ *
39+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
40+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
41+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
42+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
43+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
45+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
46+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
48+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
49+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
50+ * OF THE POSSIBILITY OF SUCH DAMAGE.
51+ * ====================================================================
52+ *
53+ * This product includes cryptographic software written by Eric Young
54+ * (eay@cryptsoft.com). This product includes software written by Tim
55+ * Hudson (tjh@cryptsoft.com).
56+ *
57+ */
58+
59+#include <stddef.h>
60+#include <openssl/x509.h>
61+#include <openssl/asn1.h>
62+#include <openssl/asn1t.h>
63+
64+/* Old netscape certificate wrapper format: a SEQUENCE of an OCTET STRING header followed by an optional X509 certificate */
65+
66+ASN1_SEQUENCE(NETSCAPE_X509) = {
67+ ASN1_SIMPLE(NETSCAPE_X509, header, ASN1_OCTET_STRING),
68+ ASN1_OPT(NETSCAPE_X509, cert, X509)
69+} ASN1_SEQUENCE_END(NETSCAPE_X509)
70+
71+IMPLEMENT_ASN1_FUNCTIONS(NETSCAPE_X509)
72+
--- /dev/null
+++ b/crypto/bn/asm/alpha-mont.pl
@@ -0,0 +1,317 @@
1+#!/usr/bin/env perl
2+#
3+# ====================================================================
4+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5+# project. The module is, however, dual licensed under OpenSSL and
6+# CRYPTOGAMS licenses depending on where you obtain it. For further
7+# details see http://www.openssl.org/~appro/cryptogams/.
8+# ====================================================================
9+#
10+# On 21264 RSA sign performance improves by 70/35/20/15 percent for
11+# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
12+# instructed to '-tune host' code with in-line assembler. Other
13+# benchmarks improve by 15-20%. To anchor it to something else, the
14+# code provides approximately the same performance per GHz as AMD64.
15+# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
16+# difference.
17+
18+# int bn_mul_mont(
19+$rp="a0"; # BN_ULONG *rp,
20+$ap="a1"; # const BN_ULONG *ap,
21+$bp="a2"; # const BN_ULONG *bp,
22+$np="a3"; # const BN_ULONG *np,
23+$n0="a4"; # const BN_ULONG *n0,
24+$num="a5"; # int num);
25+
26+$lo0="t0"; # low word of the ap[]*bp[i] running sum
27+$hi0="t1"; # high word (carry) of the ap[]*bp[i] running sum
28+$lo1="t2"; # low word of the np[]*m1 running sum
29+$hi1="t3"; # high word (carry) of the np[]*m1 running sum
30+$aj="t4"; # &ap[j], then the ap[j] value loaded from it
31+$bi="t5"; # &bp[i], then the bp[i] value loaded from it
32+$nj="t6"; # &np[j], then the np[j] value loaded from it
33+$tp="t7"; # cursor into the temporary vector kept on the stack
34+$alo="t8"; # low word of ap[j]*bp[i]
35+$ahi="t9"; # high word of ap[j]*bp[i]
36+$nlo="t10"; # low word of np[j]*m1
37+$nhi="t11"; # high word of np[j]*m1
38+$tj="t12"; # tp[j] value; also borrowed for loop-condition results
39+$i="s3"; # outer loop index (s3 is spilled in the prologue)
40+$j="s4"; # inner loop index (s4 is spilled in the prologue)
41+$m1="s5"; # multiplier for np[]: low word of the sum times n0
42+
43+$code=<<___;
44+#include <asm.h>
45+#include <regdef.h>
46+
47+.text
48+
49+.set noat
50+.set noreorder
51+
52+.globl bn_mul_mont
53+.align 5
54+.ent bn_mul_mont
55+bn_mul_mont:
56+ lda sp,-40(sp)
57+ stq ra,0(sp)
58+ stq s3,8(sp)
59+ stq s4,16(sp)
60+ stq s5,24(sp)
61+ stq fp,32(sp)
62+ mov sp,fp
63+ .mask 0x0400f000,-40
64+ .frame fp,40,ra
65+ .prologue 0
66+
67+ .align 4
68+ .set reorder
69+ sextl $num,$num
70+ mov 0,v0
71+ cmplt $num,4,AT
72+ bne AT,.Lexit
73+
74+ ldq $hi0,0($ap) # ap[0]
75+ s8addq $num,16,AT
76+ ldq $aj,8($ap)
77+ subq sp,AT,sp
78+ ldq $bi,0($bp) # bp[0]
79+ mov -4096,AT
80+ ldq $n0,0($n0)
81+ and sp,AT,sp
82+
83+ mulq $hi0,$bi,$lo0
84+ ldq $hi1,0($np) # np[0]
85+ umulh $hi0,$bi,$hi0
86+ ldq $nj,8($np)
87+
88+ mulq $lo0,$n0,$m1
89+
90+ mulq $hi1,$m1,$lo1
91+ umulh $hi1,$m1,$hi1
92+
93+ addq $lo1,$lo0,$lo1
94+ cmpult $lo1,$lo0,AT
95+ addq $hi1,AT,$hi1
96+
97+ mulq $aj,$bi,$alo
98+ mov 2,$j
99+ umulh $aj,$bi,$ahi
100+ mov sp,$tp
101+
102+ mulq $nj,$m1,$nlo
103+ s8addq $j,$ap,$aj
104+ umulh $nj,$m1,$nhi
105+ s8addq $j,$np,$nj
106+.align 4
107+.L1st:
108+ .set noreorder
109+ ldq $aj,($aj)
110+ addl $j,1,$j
111+ ldq $nj,($nj)
112+ lda $tp,8($tp)
113+
114+ addq $alo,$hi0,$lo0
115+ mulq $aj,$bi,$alo
116+ cmpult $lo0,$hi0,AT
117+ addq $nlo,$hi1,$lo1
118+
119+ mulq $nj,$m1,$nlo
120+ addq $ahi,AT,$hi0
121+ cmpult $lo1,$hi1,v0
122+ cmplt $j,$num,$tj
123+
124+ umulh $aj,$bi,$ahi
125+ addq $nhi,v0,$hi1
126+ addq $lo1,$lo0,$lo1
127+ s8addq $j,$ap,$aj
128+
129+ umulh $nj,$m1,$nhi
130+ cmpult $lo1,$lo0,v0
131+ addq $hi1,v0,$hi1
132+ s8addq $j,$np,$nj
133+
134+ stq $lo1,-8($tp)
135+ nop
136+ unop
137+ bne $tj,.L1st
138+ .set reorder
139+
140+ addq $alo,$hi0,$lo0
141+ addq $nlo,$hi1,$lo1
142+ cmpult $lo0,$hi0,AT
143+ cmpult $lo1,$hi1,v0
144+ addq $ahi,AT,$hi0
145+ addq $nhi,v0,$hi1
146+
147+ addq $lo1,$lo0,$lo1
148+ cmpult $lo1,$lo0,v0
149+ addq $hi1,v0,$hi1
150+
151+ stq $lo1,0($tp)
152+
153+ addq $hi1,$hi0,$hi1
154+ cmpult $hi1,$hi0,AT
155+ stq $hi1,8($tp)
156+ stq AT,16($tp)
157+
158+ mov 1,$i
159+.align 4
160+.Louter:
161+ s8addq $i,$bp,$bi
162+ ldq $hi0,($ap)
163+ ldq $aj,8($ap)
164+ ldq $bi,($bi)
165+ ldq $hi1,($np)
166+ ldq $nj,8($np)
167+ ldq $tj,(sp)
168+
169+ mulq $hi0,$bi,$lo0
170+ umulh $hi0,$bi,$hi0
171+
172+ addq $lo0,$tj,$lo0
173+ cmpult $lo0,$tj,AT
174+ addq $hi0,AT,$hi0
175+
176+ mulq $lo0,$n0,$m1
177+
178+ mulq $hi1,$m1,$lo1
179+ umulh $hi1,$m1,$hi1
180+
181+ addq $lo1,$lo0,$lo1
182+ cmpult $lo1,$lo0,AT
183+ mov 2,$j
184+ addq $hi1,AT,$hi1
185+
186+ mulq $aj,$bi,$alo
187+ mov sp,$tp
188+ umulh $aj,$bi,$ahi
189+
190+ mulq $nj,$m1,$nlo
191+ s8addq $j,$ap,$aj
192+ umulh $nj,$m1,$nhi
193+.align 4
194+.Linner:
195+ .set noreorder
196+ ldq $tj,8($tp) #L0
197+ nop #U1
198+ ldq $aj,($aj) #L1
199+ s8addq $j,$np,$nj #U0
200+
201+ ldq $nj,($nj) #L0
202+ nop #U1
203+ addq $alo,$hi0,$lo0 #L1
204+ lda $tp,8($tp)
205+
206+ mulq $aj,$bi,$alo #U1
207+ cmpult $lo0,$hi0,AT #L0
208+ addq $nlo,$hi1,$lo1 #L1
209+ addl $j,1,$j
210+
211+ mulq $nj,$m1,$nlo #U1
212+ addq $ahi,AT,$hi0 #L0
213+ addq $lo0,$tj,$lo0 #L1
214+ cmpult $lo1,$hi1,v0 #U0
215+
216+ umulh $aj,$bi,$ahi #U1
217+ cmpult $lo0,$tj,AT #L0
218+ addq $lo1,$lo0,$lo1 #L1
219+ addq $nhi,v0,$hi1 #U0
220+
221+ umulh $nj,$m1,$nhi #U1
222+ s8addq $j,$ap,$aj #L0
223+ cmpult $lo1,$lo0,v0 #L1
224+ cmplt $j,$num,$tj #U0 # borrow $tj
225+
226+ addq $hi0,AT,$hi0 #L0
227+ addq $hi1,v0,$hi1 #U1
228+ stq $lo1,-8($tp) #L1
229+ bne $tj,.Linner #U0
230+ .set reorder
231+
232+ ldq $tj,8($tp)
233+ addq $alo,$hi0,$lo0
234+ addq $nlo,$hi1,$lo1
235+ cmpult $lo0,$hi0,AT
236+ cmpult $lo1,$hi1,v0
237+ addq $ahi,AT,$hi0
238+ addq $nhi,v0,$hi1
239+
240+ addq $lo0,$tj,$lo0
241+ cmpult $lo0,$tj,AT
242+ addq $hi0,AT,$hi0
243+
244+ ldq $tj,16($tp)
245+ addq $lo1,$lo0,$j
246+ cmpult $j,$lo0,v0
247+ addq $hi1,v0,$hi1
248+
249+ addq $hi1,$hi0,$lo1
250+ stq $j,($tp)
251+ cmpult $lo1,$hi0,$hi1
252+ addq $lo1,$tj,$lo1
253+ cmpult $lo1,$tj,AT
254+ addl $i,1,$i
255+ addq $hi1,AT,$hi1
256+ stq $lo1,8($tp)
257+ cmplt $i,$num,$tj # borrow $tj
258+ stq $hi1,16($tp)
259+ bne $tj,.Louter
260+
261+ s8addq $num,sp,$tj # &tp[num]
262+ mov $rp,$bp # put rp aside
263+ mov sp,$tp
264+ mov sp,$ap
265+ mov 0,$hi0 # clear borrow bit
266+
267+.align 4
268+.Lsub: ldq $lo0,($tp)
269+ ldq $lo1,($np)
270+ lda $tp,8($tp)
271+ lda $np,8($np)
272+ subq $lo0,$lo1,$lo1 # tp[i]-np[i]
273+ cmpult $lo0,$lo1,AT
274+ subq $lo1,$hi0,$lo0
275+ cmpult $lo1,$lo0,$hi0
276+ or $hi0,AT,$hi0
277+ stq $lo0,($rp)
278+ cmpult $tp,$tj,v0
279+ lda $rp,8($rp)
280+ bne v0,.Lsub
281+
282+ subq $hi1,$hi0,$hi0 # handle upmost overflow bit
283+ mov sp,$tp
284+ mov $bp,$rp # restore rp
285+
286+ and sp,$hi0,$ap
287+ bic $bp,$hi0,$bp
288+ bis $bp,$ap,$ap # ap=borrow?tp:rp
289+
290+.align 4
291+.Lcopy: ldq $aj,($ap) # copy or in-place refresh
292+ lda $tp,8($tp)
293+ lda $rp,8($rp)
294+ lda $ap,8($ap)
295+ stq zero,-8($tp) # zap tp
296+ cmpult $tp,$tj,AT
297+ stq $aj,-8($rp)
298+ bne AT,.Lcopy
299+ mov 1,v0
300+
301+.Lexit:
302+ .set noreorder
303+ mov fp,sp
304+ /*ldq ra,0(sp)*/
305+ ldq s3,8(sp)
306+ ldq s4,16(sp)
307+ ldq s5,24(sp)
308+ ldq fp,32(sp)
309+ lda sp,40(sp)
310+ ret (ra)
311+.end bn_mul_mont
312+.rdata
313+.asciiz "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
314+___
315+
316+print $code;
317+close STDOUT;
--- /dev/null
+++ b/crypto/bn/asm/armv4-mont.pl
@@ -0,0 +1,200 @@
1+#!/usr/bin/env perl
2+
3+# ====================================================================
4+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5+# project. The module is, however, dual licensed under OpenSSL and
6+# CRYPTOGAMS licenses depending on where you obtain it. For further
7+# details see http://www.openssl.org/~appro/cryptogams/.
8+# ====================================================================
9+
10+# January 2007.
11+
12+# Montgomery multiplication for ARMv4.
13+#
14+# Performance improvement naturally varies among CPU implementations
15+# and compilers. The code was observed to provide +65-35% improvement
16+# [depending on key length, less for longer keys] on ARM920T, and
17+# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
18+# base and compiler generated code with in-lined umull and even umlal
19+# instructions. The latter means that this code didn't really have an
20+# "advantage" of utilizing some "secret" instruction.
21+#
22+# The code is interoperable with Thumb ISA and is rather compact, less
23+# than 1/2KB. Windows CE port would be trivial, as it's exclusively
24+# about decorations, ABI and instruction syntax are identical.
25+
26+$num="r0"; # r0 is rp on entry (pushed to the stack); it is reloaded with num and then holds &tp[num-1]
27+$ap="r1"; # walks ap[]; rewound at the top of every outer iteration
28+$bp="r2"; $bi="r2"; $rp="r2"; # r2 is shared: bp on entry, the bp[i] value inside the loops, rp in the final passes
29+$np="r3"; # walks np[]; rewound together with ap
30+$tp="r4"; # cursor into tp[] on the stack; doubles as the bp cursor between inner loops
31+$aj="r5"; # ap[j] value
32+$nj="r6"; # np[j] value
33+$tj="r7"; # tp[j] value / scratch
34+$n0="r8"; # n0 value; overwritten with tp[0]*n0 during an iteration and restored from the stack
35+########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
36+$alo="r10"; # sl, gcc uses it to keep @GOT
37+$ahi="r11"; # fp
38+$nlo="r12"; # ip
39+########### # r13 is stack pointer
40+$nhi="r14"; # lr
41+########### # r15 is program counter
42+
43+#### argument block layout relative to &tp[num-1], a.k.a. $num: above it sit tp[num], the 10 saved registers {r4-r12,lr}, the pushed {r0,r2} and then the stacked arguments
44+$_rp="$num,#12*4"; # pushed r0, i.e. rp
45+# ap permanently resides in r1
46+$_bp="$num,#13*4"; # pushed r2, i.e. bp; reused as the save slot for the running bp cursor
47+# np permanently resides in r3
48+$_n0="$num,#14*4"; # stacked &n0 argument; overwritten with the n0 value
49+$_num="$num,#15*4"; $_bpend=$_num; # stacked num argument; its slot is reused for the end-of-bp sentinel
50+
51+$code=<<___;
52+.text
53+
54+.global bn_mul_mont
55+.type bn_mul_mont,%function
56+
57+.align 2
58+bn_mul_mont:
59+ stmdb sp!,{r0,r2} @ sp points at argument block
60+ ldr $num,[sp,#3*4] @ load num
61+ cmp $num,#2
62+ movlt r0,#0
63+ addlt sp,sp,#2*4
64+ blt .Labrt
65+
66+ stmdb sp!,{r4-r12,lr} @ save 10 registers
67+
68+ mov $num,$num,lsl#2 @ rescale $num for byte count
69+ sub sp,sp,$num @ alloca(4*num)
70+ sub sp,sp,#4 @ +extra dword
71+ sub $num,$num,#4 @ "num=num-1"
72+ add $tp,$bp,$num @ &bp[num-1]
73+
74+ add $num,sp,$num @ $num to point at &tp[num-1]
75+ ldr $n0,[$_n0] @ &n0
76+ ldr $bi,[$bp] @ bp[0]
77+ ldr $aj,[$ap],#4 @ ap[0],ap++
78+ ldr $nj,[$np],#4 @ np[0],np++
79+ ldr $n0,[$n0] @ *n0
80+ str $tp,[$_bpend] @ save &bp[num]
81+
82+ umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
83+ str $n0,[$_n0] @ save n0 value
84+ mul $n0,$alo,$n0 @ "tp[0]"*n0
85+ mov $nlo,#0
86+ umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
87+ mov $tp,sp
88+
89+.L1st:
90+ ldr $aj,[$ap],#4 @ ap[j],ap++
91+ mov $alo,$ahi
92+ mov $ahi,#0
93+ umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
94+ ldr $nj,[$np],#4 @ np[j],np++
95+ mov $nhi,#0
96+ umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
97+ adds $nlo,$nlo,$alo
98+ str $nlo,[$tp],#4 @ tp[j-1]=,tp++
99+ adc $nlo,$nhi,#0
100+ cmp $tp,$num
101+ bne .L1st
102+
103+ adds $nlo,$nlo,$ahi
104+ mov $nhi,#0
105+ adc $nhi,$nhi,#0
106+ ldr $tp,[$_bp] @ restore bp
107+ str $nlo,[$num] @ tp[num-1]=
108+ ldr $n0,[$_n0] @ restore n0
109+ str $nhi,[$num,#4] @ tp[num]=
110+
111+.Louter:
112+ sub $tj,$num,sp @ "original" $num-1 value
113+ sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
114+ sub $np,$np,$tj @ "rewind" np to &np[1]
115+ ldr $bi,[$tp,#4]! @ *(++bp)
116+ ldr $aj,[$ap,#-4] @ ap[0]
117+ ldr $nj,[$np,#-4] @ np[0]
118+ ldr $alo,[sp] @ tp[0]
119+ ldr $tj,[sp,#4] @ tp[1]
120+
121+ mov $ahi,#0
122+ umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
123+ str $tp,[$_bp] @ save bp
124+ mul $n0,$alo,$n0
125+ mov $nlo,#0
126+ umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
127+ mov $tp,sp
128+
129+.Linner:
130+ ldr $aj,[$ap],#4 @ ap[j],ap++
131+ adds $alo,$ahi,$tj @ +=tp[j]
132+ mov $ahi,#0
133+ umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
134+ ldr $nj,[$np],#4 @ np[j],np++
135+ mov $nhi,#0
136+ umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
137+ ldr $tj,[$tp,#8] @ tp[j+1]
138+ adc $ahi,$ahi,#0
139+ adds $nlo,$nlo,$alo
140+ str $nlo,[$tp],#4 @ tp[j-1]=,tp++
141+ adc $nlo,$nhi,#0
142+ cmp $tp,$num
143+ bne .Linner
144+
145+ adds $nlo,$nlo,$ahi
146+ mov $nhi,#0
147+ adc $nhi,$nhi,#0
148+ adds $nlo,$nlo,$tj
149+ adc $nhi,$nhi,#0
150+ ldr $tp,[$_bp] @ restore bp
151+ ldr $tj,[$_bpend] @ restore &bp[num]
152+ str $nlo,[$num] @ tp[num-1]=
153+ ldr $n0,[$_n0] @ restore n0
154+ str $nhi,[$num,#4] @ tp[num]=
155+
156+ cmp $tp,$tj
157+ bne .Louter
158+
159+ ldr $rp,[$_rp] @ pull rp
160+ add $num,$num,#4 @ $num to point at &tp[num]
161+ sub $aj,$num,sp @ "original" num value
162+ mov $tp,sp @ "rewind" $tp
163+ mov $ap,$tp @ "borrow" $ap
164+ sub $np,$np,$aj @ "rewind" $np to &np[0]
165+
166+ subs $tj,$tj,$tj @ "clear" carry flag
167+.Lsub: ldr $tj,[$tp],#4
168+ ldr $nj,[$np],#4
169+ sbcs $tj,$tj,$nj @ tp[j]-np[j]
170+ str $tj,[$rp],#4 @ rp[j]=
171+ teq $tp,$num @ preserve carry
172+ bne .Lsub
173+ sbcs $nhi,$nhi,#0 @ upmost carry
174+ mov $tp,sp @ "rewind" $tp
175+ sub $rp,$rp,$aj @ "rewind" $rp
176+
177+ and $ap,$tp,$nhi
178+ bic $np,$rp,$nhi
179+ orr $ap,$ap,$np @ ap=borrow?tp:rp
180+
181+.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
182+ str sp,[$tp],#4 @ zap tp
183+ str $tj,[$rp],#4
184+ cmp $tp,$num
185+ bne .Lcopy
186+
187+ add sp,$num,#4 @ skip over tp[num+1]
188+ ldmia sp!,{r4-r12,lr} @ restore registers
189+ add sp,sp,#2*4 @ skip over {r0,r2}
190+ mov r0,#1
191+.Labrt: tst lr,#1
192+ moveq pc,lr @ be binary compatible with V4, yet
193+ bx lr @ interoperable with Thumb ISA:-)
194+.size bn_mul_mont,.-bn_mul_mont
195+.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
196+___
197+
198+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
199+print $code;
200+close STDOUT;
--- /dev/null
+++ b/crypto/bn/asm/mips3-mont.pl
@@ -0,0 +1,327 @@
1+#!/usr/bin/env perl
2+#
3+# ====================================================================
4+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5+# project. The module is, however, dual licensed under OpenSSL and
6+# CRYPTOGAMS licenses depending on where you obtain it. For further
7+# details see http://www.openssl.org/~appro/cryptogams/.
8+# ====================================================================
9+
10+# This module doesn't present direct interest for OpenSSL, because it
11+# doesn't provide better performance for longer keys. While 512-bit
12+# RSA private key operations are 40% faster, 1024-bit ones are hardly
13+# faster at all, while longer key operations are slower by up to 20%.
14+# It might be of interest to embedded system developers though, as
15+# it's smaller than 1KB, yet offers ~3x improvement over compiler
16+# generated code.
17+#
18+# The module targets N32 and N64 MIPS ABIs and currently is a bit
19+# IRIX-centric, i.e. is likely to require adaptation for other OSes.
20+
21+# int bn_mul_mont(
22+$rp="a0"; # BN_ULONG *rp,
23+$ap="a1"; # const BN_ULONG *ap,
24+$bp="a2"; # const BN_ULONG *bp,
25+$np="a3"; # const BN_ULONG *np,
26+$n0="a4"; # const BN_ULONG *n0,
27+$num="a5"; # int num);
28+
29+$lo0="a6"; # low word of the ap[]*bp[i] running sum
30+$hi0="a7"; # high word (carry) of the ap[]*bp[i] running sum
31+$lo1="v0"; # low word of the np[]*m1 running sum
32+$hi1="v1"; # high word (carry) of the np[]*m1 running sum
33+$aj="t0"; # &ap[j], then the ap[j] value loaded from it
34+$bi="t1"; # &bp[i], then the bp[i] value loaded from it
35+$nj="t2"; # &np[j], then the np[j] value loaded from it
36+$tp="t3"; # cursor into the temporary vector kept on the stack
37+$alo="s0"; # low word of ap[j]*bp[i]
38+$ahi="s1"; # high word of ap[j]*bp[i]
39+$nlo="s2"; # low word of np[j]*m1
40+$nhi="s3"; # high word of np[j]*m1
41+$tj="s4"; # tp[j] value
42+$i="s5"; # outer loop position as a byte offset into bp[] (starts at 8, steps by 8)
43+$j="s6"; # inner loop position as a byte offset into ap[]/np[] (starts at 16, steps by 8)
44+$fp="t8"; # copy of sp taken right after the 64-byte save area is reserved; used to restore sp on exit
45+$m1="t9"; # multiplier for np[]: low word of the sum times n0
46+
47+$FRAME=8*(2+8); # NOTE(review): not referenced by the assembly below, which hard-codes a 64-byte save area
48+
49+$code=<<___;
50+#include <asm.h>
51+#include <regdef.h>
52+
53+.text
54+
55+.set noat
56+.set reorder
57+
58+.align 5
59+.globl bn_mul_mont
60+.ent bn_mul_mont
61+bn_mul_mont:
62+ .set noreorder
63+ PTR_SUB sp,64
64+ move $fp,sp
65+ .frame $fp,64,ra
66+ slt AT,$num,4
67+ li v0,0
68+ beqzl AT,.Lproceed
69+ nop
70+ jr ra
71+ PTR_ADD sp,$fp,64
72+ .set reorder
73+.align 5
74+.Lproceed:
75+ ld $n0,0($n0)
76+ ld $bi,0($bp) # bp[0]
77+ ld $aj,0($ap) # ap[0]
78+ ld $nj,0($np) # np[0]
79+ PTR_SUB sp,16 # place for two extra words
80+ sll $num,3
81+ li AT,-4096
82+ PTR_SUB sp,$num
83+ and sp,AT
84+
85+ sd s0,0($fp)
86+ sd s1,8($fp)
87+ sd s2,16($fp)
88+ sd s3,24($fp)
89+ sd s4,32($fp)
90+ sd s5,40($fp)
91+ sd s6,48($fp)
92+ sd s7,56($fp)
93+
94+ dmultu $aj,$bi
95+ ld $alo,8($ap)
96+ ld $nlo,8($np)
97+ mflo $lo0
98+ mfhi $hi0
99+ dmultu $lo0,$n0
100+ mflo $m1
101+
102+ dmultu $alo,$bi
103+ mflo $alo
104+ mfhi $ahi
105+
106+ dmultu $nj,$m1
107+ mflo $lo1
108+ mfhi $hi1
109+ dmultu $nlo,$m1
110+ daddu $lo1,$lo0
111+ sltu AT,$lo1,$lo0
112+ daddu $hi1,AT
113+ mflo $nlo
114+ mfhi $nhi
115+
116+ move $tp,sp
117+ li $j,16
118+.align 4
119+.L1st:
120+ .set noreorder
121+ PTR_ADD $aj,$ap,$j
122+ ld $aj,($aj)
123+ PTR_ADD $nj,$np,$j
124+ ld $nj,($nj)
125+
126+ dmultu $aj,$bi
127+ daddu $lo0,$alo,$hi0
128+ daddu $lo1,$nlo,$hi1
129+ sltu AT,$lo0,$hi0
130+ sltu s7,$lo1,$hi1
131+ daddu $hi0,$ahi,AT
132+ daddu $hi1,$nhi,s7
133+ mflo $alo
134+ mfhi $ahi
135+
136+ daddu $lo1,$lo0
137+ sltu AT,$lo1,$lo0
138+ dmultu $nj,$m1
139+ daddu $hi1,AT
140+ addu $j,8
141+ sd $lo1,($tp)
142+ sltu s7,$j,$num
143+ mflo $nlo
144+ mfhi $nhi
145+
146+ bnez s7,.L1st
147+ PTR_ADD $tp,8
148+ .set reorder
149+
150+ daddu $lo0,$alo,$hi0
151+ sltu AT,$lo0,$hi0
152+ daddu $hi0,$ahi,AT
153+
154+ daddu $lo1,$nlo,$hi1
155+ sltu s7,$lo1,$hi1
156+ daddu $hi1,$nhi,s7
157+ daddu $lo1,$lo0
158+ sltu AT,$lo1,$lo0
159+ daddu $hi1,AT
160+
161+ sd $lo1,($tp)
162+
163+ daddu $hi1,$hi0
164+ sltu AT,$hi1,$hi0
165+ sd $hi1,8($tp)
166+ sd AT,16($tp)
167+
168+ li $i,8
169+.align 4
170+.Louter:
171+ PTR_ADD $bi,$bp,$i
172+ ld $bi,($bi)
173+ ld $aj,($ap)
174+ ld $alo,8($ap)
175+ ld $tj,(sp)
176+
177+ dmultu $aj,$bi
178+ ld $nj,($np)
179+ ld $nlo,8($np)
180+ mflo $lo0
181+ mfhi $hi0
182+ daddu $lo0,$tj
183+ dmultu $lo0,$n0
184+ sltu AT,$lo0,$tj
185+ daddu $hi0,AT
186+ mflo $m1
187+
188+ dmultu $alo,$bi
189+ mflo $alo
190+ mfhi $ahi
191+
192+ dmultu $nj,$m1
193+ mflo $lo1
194+ mfhi $hi1
195+
196+ dmultu $nlo,$m1
197+ daddu $lo1,$lo0
198+ sltu AT,$lo1,$lo0
199+ daddu $hi1,AT
200+ mflo $nlo
201+ mfhi $nhi
202+
203+ move $tp,sp
204+ li $j,16
205+ ld $tj,8($tp)
206+.align 4
207+.Linner:
208+ .set noreorder
209+ PTR_ADD $aj,$ap,$j
210+ ld $aj,($aj)
211+ PTR_ADD $nj,$np,$j
212+ ld $nj,($nj)
213+
214+ dmultu $aj,$bi
215+ daddu $lo0,$alo,$hi0
216+ daddu $lo1,$nlo,$hi1
217+ sltu AT,$lo0,$hi0
218+ sltu s7,$lo1,$hi1
219+ daddu $hi0,$ahi,AT
220+ daddu $hi1,$nhi,s7
221+ mflo $alo
222+ mfhi $ahi
223+
224+ daddu $lo0,$tj
225+ addu $j,8
226+ dmultu $nj,$m1
227+ sltu AT,$lo0,$tj
228+ daddu $lo1,$lo0
229+ daddu $hi0,AT
230+ sltu s7,$lo1,$lo0
231+ ld $tj,16($tp)
232+ daddu $hi1,s7
233+ sltu AT,$j,$num
234+ mflo $nlo
235+ mfhi $nhi
236+ sd $lo1,($tp)
237+ bnez AT,.Linner
238+ PTR_ADD $tp,8
239+ .set reorder
240+
241+ daddu $lo0,$alo,$hi0
242+ sltu AT,$lo0,$hi0
243+ daddu $hi0,$ahi,AT
244+ daddu $lo0,$tj
245+ sltu s7,$lo0,$tj
246+ daddu $hi0,s7
247+
248+ ld $tj,16($tp)
249+ daddu $lo1,$nlo,$hi1
250+ sltu AT,$lo1,$hi1
251+ daddu $hi1,$nhi,AT
252+ daddu $lo1,$lo0
253+ sltu s7,$lo1,$lo0
254+ daddu $hi1,s7
255+ sd $lo1,($tp)
256+
257+ daddu $lo1,$hi1,$hi0
258+ sltu $hi1,$lo1,$hi0
259+ daddu $lo1,$tj
260+ sltu AT,$lo1,$tj
261+ daddu $hi1,AT
262+ sd $lo1,8($tp)
263+ sd $hi1,16($tp)
264+
265+ addu $i,8
266+ sltu s7,$i,$num
267+ bnez s7,.Louter
268+
269+ .set noreorder
270+ PTR_ADD $tj,sp,$num # &tp[num]
271+ move $tp,sp
272+ move $ap,sp
273+ li $hi0,0 # clear borrow bit
274+
275+.align 4
276+.Lsub: ld $lo0,($tp)
277+ ld $lo1,($np)
278+ PTR_ADD $tp,8
279+ PTR_ADD $np,8
280+ dsubu $lo1,$lo0,$lo1 # tp[i]-np[i]
281+ sgtu AT,$lo1,$lo0
282+ dsubu $lo0,$lo1,$hi0
283+ sgtu $hi0,$lo0,$lo1
284+ sd $lo0,($rp)
285+ or $hi0,AT
286+ sltu AT,$tp,$tj
287+ bnez AT,.Lsub
288+ PTR_ADD $rp,8
289+
290+ dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit
291+ move $tp,sp
292+ PTR_SUB $rp,$num # restore rp
293+ not $hi1,$hi0
294+
295+ and $ap,$hi0,sp
296+ and $bp,$hi1,$rp
297+ or $ap,$ap,$bp # ap=borrow?tp:rp
298+
299+.align 4
300+.Lcopy: ld $aj,($ap)
301+ PTR_ADD $ap,8
302+ PTR_ADD $tp,8
303+ sd zero,-8($tp)
304+ sltu AT,$tp,$tj
305+ sd $aj,($rp)
306+ bnez AT,.Lcopy
307+ PTR_ADD $rp,8
308+
309+ ld s0,0($fp)
310+ ld s1,8($fp)
311+ ld s2,16($fp)
312+ ld s3,24($fp)
313+ ld s4,32($fp)
314+ ld s5,40($fp)
315+ ld s6,48($fp)
316+ ld s7,56($fp)
317+ li v0,1
318+ jr ra
319+ PTR_ADD sp,$fp,64
320+ .set reorder
321+END(bn_mul_mont)
322+.rdata
323+.asciiz "Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>"
324+___
325+
326+print $code;
327+close STDOUT;
--- /dev/null
+++ b/crypto/bn/asm/ppc-mont.pl
@@ -0,0 +1,323 @@
1+#!/usr/bin/env perl
2+
3+# ====================================================================
4+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5+# project. The module is, however, dual licensed under OpenSSL and
6+# CRYPTOGAMS licenses depending on where you obtain it. For further
7+# details see http://www.openssl.org/~appro/cryptogams/.
8+# ====================================================================
9+
10+# April 2006
11+
12+# "Teaser" Montgomery multiplication module for PowerPC. It's possible
13+# to gain a bit more by modulo-scheduling outer loop, then dedicated
14+# squaring procedure should give further 20% and code can be adapted
15+# for 32-bit application running on 64-bit CPU. As for the latter,
16+# it won't be able to achieve "native" 64-bit performance, because in
17+# 32-bit application context every addc instruction will have to be
18+# expanded as addc, twice right shift by 32 and finally adde, etc.
19+# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
20+# for 64-bit application running on PPC970/G5 is:
21+#
22+# 512-bit +65%
23+# 1024-bit +35%
24+# 2048-bit +18%
25+# 4096-bit +4%
26+
$flavour = shift;	# first command-line argument selects the 32- or 64-bit variant

# Word size, pointer size and extra stack allowance for the chosen flavour.
# "32" is tested before "64", exactly as before, and anything else is fatal.
if    ($flavour =~ /32/)	{ $BITS=32; $SIZE_T=4; $RZONE=224; }
elsif ($flavour =~ /64/)	{ $BITS=64; $SIZE_T=8; $RZONE=288; }
else				{ die "nonsense $flavour"; }

$BNSZ=	$BITS/8;	# size in bytes of one word of the big-number arrays
$FRAME=	$SIZE_T*16;

# Mnemonics for the chosen word size, listed in this order:
#   load, load and update, load indexed,
#   store, store and update, store indexed, store indexed and update,
#   unsigned multiply low, unsigned multiply high,
#   unsigned compare, unsigned shift right by immediate
($LD,$LDU,$LDX,$ST,$STU,$STX,$STUX,$UMULL,$UMULH,$UCMP,$SHRI) =
	($BITS==32)
	? ("lwz","lwzu","lwzx","stw","stwu","stwx","stwux","mullw","mulhwu","cmplw","srwi")
	: ("ld","ldu","ldx","std","stdu","stdx","stdux","mulld","mulhdu","cmpld","srdi");

# Register save/restore use the plain store/load of the native width.
$PUSH=	$ST;
$POP=	$LD;
71+
# Locate the ppc-xlate.pl translator: first next to this script, then in
# ../../perlasm relative to it.  $dir is the directory part of $0
# (including the trailing separator).
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe everything printed below through the translator; the next
# command-line argument is passed on to it after the flavour.
# The call is parenthesised so that a failed open() is what triggers die:
# in the unparenthesised form `"...".shift || die` the || applied to the
# always-true command string and the failure went unnoticed.
open(STDOUT,"| $^X $xlate $flavour ".shift(@ARGV)) || die "can't call $xlate: $!";
78+
79+$sp="r1"; # stack pointer
80+$toc="r2"; # not referenced by the assembly in this module
81+$rp="r3"; $ovf="r3"; # r3 carries rp on entry; once rp is copied away it serves as $ovf (upmost overflow bit)
82+$ap="r4";
83+$bp="r5";
84+$np="r6";
85+$n0="r7"; # pointer on entry, replaced by the n0[0] value it points to
86+$num="r8"; # word count; temporarily reduced by 2 to serve as the loop counter
87+$rp="r9"; # $rp is reassigned: the code copies r3 here on entry
88+$aj="r10"; # ap[j] value
89+$nj="r11"; # np[j] value
90+$tj="r12"; # tp[j] value / scratch
91+# non-volatile registers
92+$i="r14"; # outer loop position as a byte offset into bp[]
93+$j="r15"; # inner loop position as a byte offset into ap[]/np[]
94+$tp="r16"; # cursor into the temporary vector at $sp+$FRAME
95+$m0="r17"; # bp[i] value
96+$m1="r18"; # multiplier for np[]: tp[0]*n0
97+$lo0="r19"; # low word of the ap[]*bp[i] running sum
98+$hi0="r20"; # high word (carry) of the ap[]*bp[i] running sum
99+$lo1="r21"; # low word of the np[]*m1 running sum
100+$hi1="r22"; # high word (carry) of the np[]*m1 running sum
101+$alo="r23"; # low word of ap[j]*bp[i]
102+$ahi="r24"; # high word of ap[j]*bp[i]
103+$nlo="r25"; # low word of np[j]*m1
104+#
105+$nhi="r0"; # high word of np[j]*m1; r0 is not among the registers saved in the prologue
106+
107+$code=<<___;
108+.machine "any"
109+.text
110+
111+.globl .bn_mul_mont
112+.align 4
113+.bn_mul_mont:
114+ cmpwi $num,4
115+ mr $rp,r3 ; $rp is reassigned
116+ li r3,0
117+ bltlr
118+
119+ slwi $num,$num,`log($BNSZ)/log(2)`
120+ li $tj,-4096
121+ addi $ovf,$num,`$FRAME+$RZONE`
122+ subf $ovf,$ovf,$sp ; $sp-$ovf
123+ and $ovf,$ovf,$tj ; minimize TLB usage
124+ subf $ovf,$sp,$ovf ; $ovf-$sp
125+ srwi $num,$num,`log($BNSZ)/log(2)`
126+ $STUX $sp,$sp,$ovf
127+
128+ $PUSH r14,`4*$SIZE_T`($sp)
129+ $PUSH r15,`5*$SIZE_T`($sp)
130+ $PUSH r16,`6*$SIZE_T`($sp)
131+ $PUSH r17,`7*$SIZE_T`($sp)
132+ $PUSH r18,`8*$SIZE_T`($sp)
133+ $PUSH r19,`9*$SIZE_T`($sp)
134+ $PUSH r20,`10*$SIZE_T`($sp)
135+ $PUSH r21,`11*$SIZE_T`($sp)
136+ $PUSH r22,`12*$SIZE_T`($sp)
137+ $PUSH r23,`13*$SIZE_T`($sp)
138+ $PUSH r24,`14*$SIZE_T`($sp)
139+ $PUSH r25,`15*$SIZE_T`($sp)
140+
141+ $LD $n0,0($n0) ; pull n0[0] value
142+ addi $num,$num,-2 ; adjust $num for counter register
143+
144+ $LD $m0,0($bp) ; m0=bp[0]
145+ $LD $aj,0($ap) ; ap[0]
146+ addi $tp,$sp,$FRAME
147+ $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
148+ $UMULH $hi0,$aj,$m0
149+
150+ $LD $aj,$BNSZ($ap) ; ap[1]
151+ $LD $nj,0($np) ; np[0]
152+
153+ $UMULL $m1,$lo0,$n0 ; "tp[0]"*n0
154+
155+ $UMULL $alo,$aj,$m0 ; ap[1]*bp[0]
156+ $UMULH $ahi,$aj,$m0
157+
158+ $UMULL $lo1,$nj,$m1 ; np[0]*m1
159+ $UMULH $hi1,$nj,$m1
160+ $LD $nj,$BNSZ($np) ; np[1]
161+ addc $lo1,$lo1,$lo0
162+ addze $hi1,$hi1
163+
164+ $UMULL $nlo,$nj,$m1 ; np[1]*m1
165+ $UMULH $nhi,$nj,$m1
166+
167+ mtctr $num
168+ li $j,`2*$BNSZ`
169+.align 4
170+L1st:
171+ $LDX $aj,$ap,$j ; ap[j]
172+ addc $lo0,$alo,$hi0
173+ $LDX $nj,$np,$j ; np[j]
174+ addze $hi0,$ahi
175+ $UMULL $alo,$aj,$m0 ; ap[j]*bp[0]
176+ addc $lo1,$nlo,$hi1
177+ $UMULH $ahi,$aj,$m0
178+ addze $hi1,$nhi
179+ $UMULL $nlo,$nj,$m1 ; np[j]*m1
180+ addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
181+ $UMULH $nhi,$nj,$m1
182+ addze $hi1,$hi1
183+ $ST $lo1,0($tp) ; tp[j-1]
184+
185+ addi $j,$j,$BNSZ ; j++
186+ addi $tp,$tp,$BNSZ ; tp++
187+ bdnz- L1st
188+;L1st
189+ addc $lo0,$alo,$hi0
190+ addze $hi0,$ahi
191+
192+ addc $lo1,$nlo,$hi1
193+ addze $hi1,$nhi
194+ addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
195+ addze $hi1,$hi1
196+ $ST $lo1,0($tp) ; tp[j-1]
197+
198+ li $ovf,0
199+ addc $hi1,$hi1,$hi0
200+ addze $ovf,$ovf ; upmost overflow bit
201+ $ST $hi1,$BNSZ($tp)
202+
203+ li $i,$BNSZ
204+.align 4
205+Louter:
206+ $LDX $m0,$bp,$i ; m0=bp[i]
207+ $LD $aj,0($ap) ; ap[0]
208+ addi $tp,$sp,$FRAME
209+ $LD $tj,$FRAME($sp) ; tp[0]
210+ $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
211+ $UMULH $hi0,$aj,$m0
212+ $LD $aj,$BNSZ($ap) ; ap[1]
213+ $LD $nj,0($np) ; np[0]
214+ addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0]
215+ $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
216+ addze $hi0,$hi0
217+ $UMULL $m1,$lo0,$n0 ; tp[0]*n0
218+ $UMULH $ahi,$aj,$m0
219+ $UMULL $lo1,$nj,$m1 ; np[0]*m1
220+ $UMULH $hi1,$nj,$m1
221+ $LD $nj,$BNSZ($np) ; np[1]
222+ addc $lo1,$lo1,$lo0
223+ $UMULL $nlo,$nj,$m1 ; np[1]*m1
224+ addze $hi1,$hi1
225+ $UMULH $nhi,$nj,$m1
226+
227+ mtctr $num
228+ li $j,`2*$BNSZ`
229+.align 4
230+Linner:
231+ $LDX $aj,$ap,$j ; ap[j]
232+ addc $lo0,$alo,$hi0
233+ $LD $tj,$BNSZ($tp) ; tp[j]
234+ addze $hi0,$ahi
235+ $LDX $nj,$np,$j ; np[j]
236+ addc $lo1,$nlo,$hi1
237+ $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
238+ addze $hi1,$nhi
239+ $UMULH $ahi,$aj,$m0
240+ addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
241+ $UMULL $nlo,$nj,$m1 ; np[j]*m1
242+ addze $hi0,$hi0
243+ $UMULH $nhi,$nj,$m1
244+ addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
245+ addi $j,$j,$BNSZ ; j++
246+ addze $hi1,$hi1
247+ $ST $lo1,0($tp) ; tp[j-1]
248+ addi $tp,$tp,$BNSZ ; tp++
249+ bdnz- Linner
250+;Linner
251+ $LD $tj,$BNSZ($tp) ; tp[j]
252+ addc $lo0,$alo,$hi0
253+ addze $hi0,$ahi
254+ addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
255+ addze $hi0,$hi0
256+
257+ addc $lo1,$nlo,$hi1
258+ addze $hi1,$nhi
259+ addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
260+ addze $hi1,$hi1
261+ $ST $lo1,0($tp) ; tp[j-1]
262+
263+ addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA]
264+ li $ovf,0
265+ adde $hi1,$hi1,$hi0
266+ addze $ovf,$ovf
267+ $ST $hi1,$BNSZ($tp)
268+;
269+ slwi $tj,$num,`log($BNSZ)/log(2)`
270+ $UCMP $i,$tj
271+ addi $i,$i,$BNSZ
272+ ble- Louter
273+
274+ addi $num,$num,2 ; restore $num
275+ subfc $j,$j,$j ; j=0 and "clear" XER[CA]
276+ addi $tp,$sp,$FRAME
277+ mtctr $num
278+
279+.align 4
280+Lsub: $LDX $tj,$tp,$j
281+ $LDX $nj,$np,$j
282+ subfe $aj,$nj,$tj ; tp[j]-np[j]
283+ $STX $aj,$rp,$j
284+ addi $j,$j,$BNSZ
285+ bdnz- Lsub
286+
287+ li $j,0
288+ mtctr $num
289+ subfe $ovf,$j,$ovf ; handle upmost overflow bit
290+ and $ap,$tp,$ovf
291+ andc $np,$rp,$ovf
292+ or $ap,$ap,$np ; ap=borrow?tp:rp
293+
294+.align 4
295+Lcopy: ; copy or in-place refresh
296+ $LDX $tj,$ap,$j
297+ $STX $tj,$rp,$j
298+ $STX $j,$tp,$j ; zap at once
299+ addi $j,$j,$BNSZ
300+ bdnz- Lcopy
301+
302+ $POP r14,`4*$SIZE_T`($sp)
303+ $POP r15,`5*$SIZE_T`($sp)
304+ $POP r16,`6*$SIZE_T`($sp)
305+ $POP r17,`7*$SIZE_T`($sp)
306+ $POP r18,`8*$SIZE_T`($sp)
307+ $POP r19,`9*$SIZE_T`($sp)
308+ $POP r20,`10*$SIZE_T`($sp)
309+ $POP r21,`11*$SIZE_T`($sp)
310+ $POP r22,`12*$SIZE_T`($sp)
311+ $POP r23,`13*$SIZE_T`($sp)
312+ $POP r24,`14*$SIZE_T`($sp)
313+ $POP r25,`15*$SIZE_T`($sp)
314+ $POP $sp,0($sp)
315+ li r3,1
316+ blr
317+ .long 0
318+.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
319+___
320+
321+$code =~ s/\`([^\`]*)\`/eval $1/gem;
322+print $code;
323+close STDOUT;
--- /dev/null
+++ b/crypto/bn/asm/ppc64-mont.pl
@@ -0,0 +1,918 @@
1+#!/usr/bin/env perl
2+
3+# ====================================================================
4+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5+# project. The module is, however, dual licensed under OpenSSL and
6+# CRYPTOGAMS licenses depending on where you obtain it. For further
7+# details see http://www.openssl.org/~appro/cryptogams/.
8+# ====================================================================
9+
10+# December 2007
11+
12+# The reason for undertaking this effort is basically as follows. Even though
13+# Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI
14+# performance was observed to be less than impressive, essentially as
15+# fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
16+# Well, it's not surprising that IBM had to make some sacrifices to
17+# boost the clock frequency that much, but no overall improvement?
18+# Having observed how much difference did switching to FPU make on
19+# UltraSPARC, playing same stunt on Power 6 appeared appropriate...
20+# Unfortunately the resulting performance improvement is not as
21+# impressive, ~30%, and in absolute terms is still very far from what
22+# one would expect from 4.7GHz CPU. There is a chance that I'm doing
23+# something wrong, but in the lack of assembler level micro-profiling
24+# data or at least decent platform guide I can't tell... Or better
25+# results might be achieved with VMX... Anyway, this module provides
26+# *worse* performance on other PowerPC implementations, ~40-15% slower
27+# on PPC970 depending on key length and ~40% slower on Power 5 for all
28+# key lengths. As it's obviously inappropriate as "best all-round"
29+# alternative, it has to be complemented with run-time CPU family
30+# detection. Oh! It should also be noted that unlike on other PowerPC
31+# implementations, IALU ppc-mont.pl module performs *suboptimally* on
32+# >=1024-bit key lengths on Power 6. It should also be noted that
33+# *everything* said so far applies to 64-bit builds! As far as 32-bit
34+# application executed on 64-bit CPU goes, this module is likely to
35+# become preferred choice, because it's easy to adapt it for such
36+# case and *is* faster than 32-bit ppc-mont.pl on *all* processors.
37+
38+# February 2008
39+
40+# Micro-profiling assisted optimization results in ~15% improvement
41+# over original ppc64-mont.pl version, or overall ~50% improvement
42+# over ppc.pl module on Power 6. If compared to ppc-mont.pl on same
43+# Power 6 CPU, this module is 5-150% faster depending on key length,
44+# [hereafter] more for longer keys. But if compared to ppc-mont.pl
45+# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
46+# in absolute terms, but it's apparently the way Power 6 is...
47+
48+$flavour = shift; # first command-line argument; must match /32/ or /64/
49+
50+if ($flavour =~ /32/) {
51+ $SIZE_T=4; # bytes per saved-GPR slot; used for the N*$SIZE_T frame offsets
52+ $RZONE= 224; # extra bytes added to the stack allocation size; presumably the ABI red zone - confirm
53+ $FRAME= $SIZE_T*12+8*12; # 12 $SIZE_T-sized slots plus 12 8-byte FPR save slots
54+ $fname= "bn_mul_mont_ppc64"; # symbol name the template would export
55+
56+ $STUX= "stwux"; # store indexed and update
57+ $PUSH= "stw"; # instruction used to save a GPR
58+ $POP= "lwz"; # instruction used to restore a GPR
59+ die "not implemented yet"; # the 32-bit flavour always aborts here
60+} elsif ($flavour =~ /64/) {
61+ $SIZE_T=8; # bytes per saved-GPR slot; used for the N*$SIZE_T frame offsets
62+ $RZONE= 288; # extra bytes added to the stack allocation size; presumably the ABI red zone - confirm
63+ $FRAME= $SIZE_T*12+8*12; # 12 $SIZE_T-sized slots plus 12 8-byte FPR save slots
64+ $fname= "bn_mul_mont"; # symbol name the template exports
65+
66+ # same as above, but 64-bit mnemonics...
67+ $STUX= "stdux"; # store indexed and update
68+ $PUSH= "std"; # instruction used to save a GPR
69+ $POP= "ld"; # instruction used to restore a GPR
70+} else { die "nonsense $flavour"; } # neither 32 nor 64 was requested
71+
72+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of the path this script was invoked with
73+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
74+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
75+die "can't locate ppc-xlate.pl";
76+# Pipe all output through ppc-xlate.pl; "or" (not "||") so that die guards the open itself.
77+open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!";
78+
79+$FRAME=($FRAME+63)&~63; # round the save area up to a multiple of 64 bytes
80+$TRANSFER=16*8; # 16 8-byte slots of the gpr<->fpr transfer zone, addressed as $FRAME+N off $sp
81+
82+$carry="r0"; # carry propagated between the 16-bit-spaced partial sums
83+$sp="r1"; # stack pointer
84+$toc="r2"; # not referenced by the template below; presumably the TOC pointer - confirm
85+$rp="r3"; $ovf="r3"; # r3: incoming rp, then reused for the top overflow word and the return value
86+$ap="r4";
87+$bp="r5";
88+$np="r6";
89+$n0="r7"; # pointer on entry; replaced by the value it points to
90+$num="r8"; # word count on entry; scaled to bytes (num*=8) in the prologue
91+$rp="r9"; # $rp is reassigned
92+$tp="r10"; # cursor into the temporary result vector on the stack
93+$j="r11"; # inner-loop trip count loaded into CTR
94+$i="r12"; # byte offset of the current bp[i] in the outer loop
95+# non-volatile registers
96+$nap_d="r14"; # interleaved ap and np in double format
97+$a0="r15"; # ap[0]
98+$t0="r16"; # temporary registers
99+$t1="r17";
100+$t2="r18";
101+$t3="r19";
102+$t4="r20";
103+$t5="r21";
104+$t6="r22";
105+$t7="r23";
106+
107+# PPC offers enough register bank capacity to unroll inner loops twice
108+#
109+# ..A3A2A1A0
110+# dcba
111+# -----------
112+# A0a
113+# A0b
114+# A0c
115+# A0d
116+# A1a
117+# A1b
118+# A1c
119+# A1d
120+# A2a
121+# A2b
122+# A2c
123+# A2d
124+# A3a
125+# A3b
126+# A3c
127+# A3d
128+# ..a
129+# ..b
130+#
131+$ba="f0"; $bb="f1"; $bc="f2"; $bd="f3"; # bp[i] split into 4x16-bit values, converted to double
132+$na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; # (ap[0]*bp[i]+tp[0])*n0 split into 4x16-bit values, converted to double
133+$dota="f8"; $dotb="f9"; # top partial products carried over into the next iteration's T0a/T0b
134+$A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; # a[j] and a[j+1] as 32-bit word pairs in double format
135+$N0="f14"; $N1="f15"; $N2="f16"; $N3="f17"; # n[j] and n[j+1] as 32-bit word pairs in double format
136+$T0a="f18"; $T0b="f19"; # T0..T3: partial-sum accumulators, converted back to integers with fctid
137+$T1a="f20"; $T1b="f21";
138+$T2a="f22"; $T2b="f23";
139+$T3a="f24"; $T3b="f25";
140+
141+# sp----------->+-------------------------------+
142+# | saved sp |
143+# +-------------------------------+
144+# | |
145+# +-------------------------------+
146+# | 10 saved gpr, r14-r23 |
147+# . .
148+# . .
149+# +12*size_t +-------------------------------+
150+# | 12 saved fpr, f14-f25 |
151+# . .
152+# . .
153+# +12*8 +-------------------------------+
154+# | padding to 64 byte boundary |
155+# . .
156+# +X +-------------------------------+
157+# | 16 gpr<->fpr transfer zone |
158+# . .
159+# . .
160+# +16*8 +-------------------------------+
161+# | __int64 tmp[-1] |
162+# +-------------------------------+
163+# | __int64 tmp[num] |
164+# . .
165+# . .
166+# . .
167+# +(num+1)*8 +-------------------------------+
168+# | padding to 64 byte boundary |
169+# . .
170+# +X +-------------------------------+
171+# | double nap_d[4*num] |
172+# . .
173+# . .
174+# . .
175+# +-------------------------------+
176+
177+$code=<<___;
178+.machine "any"
179+.text
180+
181+.globl .$fname
182+.align 5
183+.$fname:
184+ cmpwi $num,4
185+ mr $rp,r3 ; $rp is reassigned
186+ li r3,0 ; possible "not handled" return code
187+ bltlr-
188+ andi. r0,$num,1 ; $num has to be even
189+ bnelr-
190+
191+ slwi $num,$num,3 ; num*=8
192+ li $i,-4096
193+ slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num
194+ add $tp,$tp,$num ; place for tp[num+1]
195+ addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
196+ subf $tp,$tp,$sp ; $sp-$tp
197+ and $tp,$tp,$i ; minimize TLB usage
198+ subf $tp,$sp,$tp ; $tp-$sp
199+ $STUX $sp,$sp,$tp ; alloca
200+
201+ $PUSH r14,`2*$SIZE_T`($sp)
202+ $PUSH r15,`3*$SIZE_T`($sp)
203+ $PUSH r16,`4*$SIZE_T`($sp)
204+ $PUSH r17,`5*$SIZE_T`($sp)
205+ $PUSH r18,`6*$SIZE_T`($sp)
206+ $PUSH r19,`7*$SIZE_T`($sp)
207+ $PUSH r20,`8*$SIZE_T`($sp)
208+ $PUSH r21,`9*$SIZE_T`($sp)
209+ $PUSH r22,`10*$SIZE_T`($sp)
210+ $PUSH r23,`11*$SIZE_T`($sp)
211+ stfd f14,`12*$SIZE_T+0`($sp)
212+ stfd f15,`12*$SIZE_T+8`($sp)
213+ stfd f16,`12*$SIZE_T+16`($sp)
214+ stfd f17,`12*$SIZE_T+24`($sp)
215+ stfd f18,`12*$SIZE_T+32`($sp)
216+ stfd f19,`12*$SIZE_T+40`($sp)
217+ stfd f20,`12*$SIZE_T+48`($sp)
218+ stfd f21,`12*$SIZE_T+56`($sp)
219+ stfd f22,`12*$SIZE_T+64`($sp)
220+ stfd f23,`12*$SIZE_T+72`($sp)
221+ stfd f24,`12*$SIZE_T+80`($sp)
222+ stfd f25,`12*$SIZE_T+88`($sp)
223+
224+ ld $a0,0($ap) ; pull ap[0] value
225+ ld $n0,0($n0) ; pull n0[0] value
226+ ld $t3,0($bp) ; bp[0]
227+
228+ addi $tp,$sp,`$FRAME+$TRANSFER+8+64`
229+ li $i,-64
230+ add $nap_d,$tp,$num
231+ and $nap_d,$nap_d,$i ; align to 64 bytes
232+
233+ mulld $t7,$a0,$t3 ; ap[0]*bp[0]
234+ ; nap_d is off by 1, because it's used with stfdu/lfdu
235+ addi $nap_d,$nap_d,-8
236+ srwi $j,$num,`3+1` ; counter register, num/2
237+ mulld $t7,$t7,$n0 ; tp[0]*n0
238+ addi $j,$j,-1
239+ addi $tp,$sp,`$FRAME+$TRANSFER-8`
240+ li $carry,0
241+ mtctr $j
242+
243+ ; transfer bp[0] to FPU as 4x16-bit values
244+ extrdi $t0,$t3,16,48
245+ extrdi $t1,$t3,16,32
246+ extrdi $t2,$t3,16,16
247+ extrdi $t3,$t3,16,0
248+ std $t0,`$FRAME+0`($sp)
249+ std $t1,`$FRAME+8`($sp)
250+ std $t2,`$FRAME+16`($sp)
251+ std $t3,`$FRAME+24`($sp)
252+ ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
253+ extrdi $t4,$t7,16,48
254+ extrdi $t5,$t7,16,32
255+ extrdi $t6,$t7,16,16
256+ extrdi $t7,$t7,16,0
257+ std $t4,`$FRAME+32`($sp)
258+ std $t5,`$FRAME+40`($sp)
259+ std $t6,`$FRAME+48`($sp)
260+ std $t7,`$FRAME+56`($sp)
261+ lwz $t0,4($ap) ; load a[j] as 32-bit word pair
262+ lwz $t1,0($ap)
263+ lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
264+ lwz $t3,8($ap)
265+ lwz $t4,4($np) ; load n[j] as 32-bit word pair
266+ lwz $t5,0($np)
267+ lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
268+ lwz $t7,8($np)
269+ lfd $ba,`$FRAME+0`($sp)
270+ lfd $bb,`$FRAME+8`($sp)
271+ lfd $bc,`$FRAME+16`($sp)
272+ lfd $bd,`$FRAME+24`($sp)
273+ lfd $na,`$FRAME+32`($sp)
274+ lfd $nb,`$FRAME+40`($sp)
275+ lfd $nc,`$FRAME+48`($sp)
276+ lfd $nd,`$FRAME+56`($sp)
277+ std $t0,`$FRAME+64`($sp)
278+ std $t1,`$FRAME+72`($sp)
279+ std $t2,`$FRAME+80`($sp)
280+ std $t3,`$FRAME+88`($sp)
281+ std $t4,`$FRAME+96`($sp)
282+ std $t5,`$FRAME+104`($sp)
283+ std $t6,`$FRAME+112`($sp)
284+ std $t7,`$FRAME+120`($sp)
285+ fcfid $ba,$ba
286+ fcfid $bb,$bb
287+ fcfid $bc,$bc
288+ fcfid $bd,$bd
289+ fcfid $na,$na
290+ fcfid $nb,$nb
291+ fcfid $nc,$nc
292+ fcfid $nd,$nd
293+
294+ lfd $A0,`$FRAME+64`($sp)
295+ lfd $A1,`$FRAME+72`($sp)
296+ lfd $A2,`$FRAME+80`($sp)
297+ lfd $A3,`$FRAME+88`($sp)
298+ lfd $N0,`$FRAME+96`($sp)
299+ lfd $N1,`$FRAME+104`($sp)
300+ lfd $N2,`$FRAME+112`($sp)
301+ lfd $N3,`$FRAME+120`($sp)
302+ fcfid $A0,$A0
303+ fcfid $A1,$A1
304+ fcfid $A2,$A2
305+ fcfid $A3,$A3
306+ fcfid $N0,$N0
307+ fcfid $N1,$N1
308+ fcfid $N2,$N2
309+ fcfid $N3,$N3
310+ addi $ap,$ap,16
311+ addi $np,$np,16
312+
313+ fmul $T1a,$A1,$ba
314+ fmul $T1b,$A1,$bb
315+ stfd $A0,8($nap_d) ; save a[j] in double format
316+ stfd $A1,16($nap_d)
317+ fmul $T2a,$A2,$ba
318+ fmul $T2b,$A2,$bb
319+ stfd $A2,24($nap_d) ; save a[j+1] in double format
320+ stfd $A3,32($nap_d)
321+ fmul $T3a,$A3,$ba
322+ fmul $T3b,$A3,$bb
323+ stfd $N0,40($nap_d) ; save n[j] in double format
324+ stfd $N1,48($nap_d)
325+ fmul $T0a,$A0,$ba
326+ fmul $T0b,$A0,$bb
327+ stfd $N2,56($nap_d) ; save n[j+1] in double format
328+ stfdu $N3,64($nap_d)
329+
330+ fmadd $T1a,$A0,$bc,$T1a
331+ fmadd $T1b,$A0,$bd,$T1b
332+ fmadd $T2a,$A1,$bc,$T2a
333+ fmadd $T2b,$A1,$bd,$T2b
334+ fmadd $T3a,$A2,$bc,$T3a
335+ fmadd $T3b,$A2,$bd,$T3b
336+ fmul $dota,$A3,$bc
337+ fmul $dotb,$A3,$bd
338+
339+ fmadd $T1a,$N1,$na,$T1a
340+ fmadd $T1b,$N1,$nb,$T1b
341+ fmadd $T2a,$N2,$na,$T2a
342+ fmadd $T2b,$N2,$nb,$T2b
343+ fmadd $T3a,$N3,$na,$T3a
344+ fmadd $T3b,$N3,$nb,$T3b
345+ fmadd $T0a,$N0,$na,$T0a
346+ fmadd $T0b,$N0,$nb,$T0b
347+
348+ fmadd $T1a,$N0,$nc,$T1a
349+ fmadd $T1b,$N0,$nd,$T1b
350+ fmadd $T2a,$N1,$nc,$T2a
351+ fmadd $T2b,$N1,$nd,$T2b
352+ fmadd $T3a,$N2,$nc,$T3a
353+ fmadd $T3b,$N2,$nd,$T3b
354+ fmadd $dota,$N3,$nc,$dota
355+ fmadd $dotb,$N3,$nd,$dotb
356+
357+ fctid $T0a,$T0a
358+ fctid $T0b,$T0b
359+ fctid $T1a,$T1a
360+ fctid $T1b,$T1b
361+ fctid $T2a,$T2a
362+ fctid $T2b,$T2b
363+ fctid $T3a,$T3a
364+ fctid $T3b,$T3b
365+
366+ stfd $T0a,`$FRAME+0`($sp)
367+ stfd $T0b,`$FRAME+8`($sp)
368+ stfd $T1a,`$FRAME+16`($sp)
369+ stfd $T1b,`$FRAME+24`($sp)
370+ stfd $T2a,`$FRAME+32`($sp)
371+ stfd $T2b,`$FRAME+40`($sp)
372+ stfd $T3a,`$FRAME+48`($sp)
373+ stfd $T3b,`$FRAME+56`($sp)
374+
375+.align 5
376+L1st:
377+ lwz $t0,4($ap) ; load a[j] as 32-bit word pair
378+ lwz $t1,0($ap)
379+ lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
380+ lwz $t3,8($ap)
381+ lwz $t4,4($np) ; load n[j] as 32-bit word pair
382+ lwz $t5,0($np)
383+ lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
384+ lwz $t7,8($np)
385+ std $t0,`$FRAME+64`($sp)
386+ std $t1,`$FRAME+72`($sp)
387+ std $t2,`$FRAME+80`($sp)
388+ std $t3,`$FRAME+88`($sp)
389+ std $t4,`$FRAME+96`($sp)
390+ std $t5,`$FRAME+104`($sp)
391+ std $t6,`$FRAME+112`($sp)
392+ std $t7,`$FRAME+120`($sp)
393+ ld $t0,`$FRAME+0`($sp)
394+ ld $t1,`$FRAME+8`($sp)
395+ ld $t2,`$FRAME+16`($sp)
396+ ld $t3,`$FRAME+24`($sp)
397+ ld $t4,`$FRAME+32`($sp)
398+ ld $t5,`$FRAME+40`($sp)
399+ ld $t6,`$FRAME+48`($sp)
400+ ld $t7,`$FRAME+56`($sp)
401+ lfd $A0,`$FRAME+64`($sp)
402+ lfd $A1,`$FRAME+72`($sp)
403+ lfd $A2,`$FRAME+80`($sp)
404+ lfd $A3,`$FRAME+88`($sp)
405+ lfd $N0,`$FRAME+96`($sp)
406+ lfd $N1,`$FRAME+104`($sp)
407+ lfd $N2,`$FRAME+112`($sp)
408+ lfd $N3,`$FRAME+120`($sp)
409+ fcfid $A0,$A0
410+ fcfid $A1,$A1
411+ fcfid $A2,$A2
412+ fcfid $A3,$A3
413+ fcfid $N0,$N0
414+ fcfid $N1,$N1
415+ fcfid $N2,$N2
416+ fcfid $N3,$N3
417+ addi $ap,$ap,16
418+ addi $np,$np,16
419+
420+ fmul $T1a,$A1,$ba
421+ fmul $T1b,$A1,$bb
422+ fmul $T2a,$A2,$ba
423+ fmul $T2b,$A2,$bb
424+ stfd $A0,8($nap_d) ; save a[j] in double format
425+ stfd $A1,16($nap_d)
426+ fmul $T3a,$A3,$ba
427+ fmul $T3b,$A3,$bb
428+ fmadd $T0a,$A0,$ba,$dota
429+ fmadd $T0b,$A0,$bb,$dotb
430+ stfd $A2,24($nap_d) ; save a[j+1] in double format
431+ stfd $A3,32($nap_d)
432+
433+ fmadd $T1a,$A0,$bc,$T1a
434+ fmadd $T1b,$A0,$bd,$T1b
435+ fmadd $T2a,$A1,$bc,$T2a
436+ fmadd $T2b,$A1,$bd,$T2b
437+ stfd $N0,40($nap_d) ; save n[j] in double format
438+ stfd $N1,48($nap_d)
439+ fmadd $T3a,$A2,$bc,$T3a
440+ fmadd $T3b,$A2,$bd,$T3b
441+ add $t0,$t0,$carry ; can not overflow
442+ fmul $dota,$A3,$bc
443+ fmul $dotb,$A3,$bd
444+ stfd $N2,56($nap_d) ; save n[j+1] in double format
445+ stfdu $N3,64($nap_d)
446+ srdi $carry,$t0,16
447+ add $t1,$t1,$carry
448+ srdi $carry,$t1,16
449+
450+ fmadd $T1a,$N1,$na,$T1a
451+ fmadd $T1b,$N1,$nb,$T1b
452+ insrdi $t0,$t1,16,32
453+ fmadd $T2a,$N2,$na,$T2a
454+ fmadd $T2b,$N2,$nb,$T2b
455+ add $t2,$t2,$carry
456+ fmadd $T3a,$N3,$na,$T3a
457+ fmadd $T3b,$N3,$nb,$T3b
458+ srdi $carry,$t2,16
459+ fmadd $T0a,$N0,$na,$T0a
460+ fmadd $T0b,$N0,$nb,$T0b
461+ insrdi $t0,$t2,16,16
462+ add $t3,$t3,$carry
463+ srdi $carry,$t3,16
464+
465+ fmadd $T1a,$N0,$nc,$T1a
466+ fmadd $T1b,$N0,$nd,$T1b
467+ insrdi $t0,$t3,16,0 ; 0..63 bits
468+ fmadd $T2a,$N1,$nc,$T2a
469+ fmadd $T2b,$N1,$nd,$T2b
470+ add $t4,$t4,$carry
471+ fmadd $T3a,$N2,$nc,$T3a
472+ fmadd $T3b,$N2,$nd,$T3b
473+ srdi $carry,$t4,16
474+ fmadd $dota,$N3,$nc,$dota
475+ fmadd $dotb,$N3,$nd,$dotb
476+ add $t5,$t5,$carry
477+ srdi $carry,$t5,16
478+ insrdi $t4,$t5,16,32
479+
480+ fctid $T0a,$T0a
481+ fctid $T0b,$T0b
482+ add $t6,$t6,$carry
483+ fctid $T1a,$T1a
484+ fctid $T1b,$T1b
485+ srdi $carry,$t6,16
486+ fctid $T2a,$T2a
487+ fctid $T2b,$T2b
488+ insrdi $t4,$t6,16,16
489+ fctid $T3a,$T3a
490+ fctid $T3b,$T3b
491+ add $t7,$t7,$carry
492+ insrdi $t4,$t7,16,0 ; 64..127 bits
493+ srdi $carry,$t7,16 ; upper 33 bits
494+
495+ stfd $T0a,`$FRAME+0`($sp)
496+ stfd $T0b,`$FRAME+8`($sp)
497+ stfd $T1a,`$FRAME+16`($sp)
498+ stfd $T1b,`$FRAME+24`($sp)
499+ stfd $T2a,`$FRAME+32`($sp)
500+ stfd $T2b,`$FRAME+40`($sp)
501+ stfd $T3a,`$FRAME+48`($sp)
502+ stfd $T3b,`$FRAME+56`($sp)
503+ std $t0,8($tp) ; tp[j-1]
504+ stdu $t4,16($tp) ; tp[j]
505+ bdnz- L1st
506+
507+ fctid $dota,$dota
508+ fctid $dotb,$dotb
509+
510+ ld $t0,`$FRAME+0`($sp)
511+ ld $t1,`$FRAME+8`($sp)
512+ ld $t2,`$FRAME+16`($sp)
513+ ld $t3,`$FRAME+24`($sp)
514+ ld $t4,`$FRAME+32`($sp)
515+ ld $t5,`$FRAME+40`($sp)
516+ ld $t6,`$FRAME+48`($sp)
517+ ld $t7,`$FRAME+56`($sp)
518+ stfd $dota,`$FRAME+64`($sp)
519+ stfd $dotb,`$FRAME+72`($sp)
520+
521+ add $t0,$t0,$carry ; can not overflow
522+ srdi $carry,$t0,16
523+ add $t1,$t1,$carry
524+ srdi $carry,$t1,16
525+ insrdi $t0,$t1,16,32
526+ add $t2,$t2,$carry
527+ srdi $carry,$t2,16
528+ insrdi $t0,$t2,16,16
529+ add $t3,$t3,$carry
530+ srdi $carry,$t3,16
531+ insrdi $t0,$t3,16,0 ; 0..63 bits
532+ add $t4,$t4,$carry
533+ srdi $carry,$t4,16
534+ add $t5,$t5,$carry
535+ srdi $carry,$t5,16
536+ insrdi $t4,$t5,16,32
537+ add $t6,$t6,$carry
538+ srdi $carry,$t6,16
539+ insrdi $t4,$t6,16,16
540+ add $t7,$t7,$carry
541+ insrdi $t4,$t7,16,0 ; 64..127 bits
542+ srdi $carry,$t7,16 ; upper 33 bits
543+ ld $t6,`$FRAME+64`($sp)
544+ ld $t7,`$FRAME+72`($sp)
545+
546+ std $t0,8($tp) ; tp[j-1]
547+ stdu $t4,16($tp) ; tp[j]
548+
549+ add $t6,$t6,$carry ; can not overflow
550+ srdi $carry,$t6,16
551+ add $t7,$t7,$carry
552+ insrdi $t6,$t7,48,0
553+ srdi $ovf,$t7,48
554+ std $t6,8($tp) ; tp[num-1]
555+
556+ slwi $t7,$num,2
557+ subf $nap_d,$t7,$nap_d ; rewind pointer
558+
559+ li $i,8 ; i=1
560+.align 5
561+Louter:
562+ ldx $t3,$bp,$i ; bp[i]
563+ ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
564+ mulld $t7,$a0,$t3 ; ap[0]*bp[i]
565+
566+ addi $tp,$sp,`$FRAME+$TRANSFER`
567+ add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0]
568+ li $carry,0
569+ mulld $t7,$t7,$n0 ; tp[0]*n0
570+ mtctr $j
571+
572+ ; transfer bp[i] to FPU as 4x16-bit values
573+ extrdi $t0,$t3,16,48
574+ extrdi $t1,$t3,16,32
575+ extrdi $t2,$t3,16,16
576+ extrdi $t3,$t3,16,0
577+ std $t0,`$FRAME+0`($sp)
578+ std $t1,`$FRAME+8`($sp)
579+ std $t2,`$FRAME+16`($sp)
580+ std $t3,`$FRAME+24`($sp)
581+ ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
582+ extrdi $t4,$t7,16,48
583+ extrdi $t5,$t7,16,32
584+ extrdi $t6,$t7,16,16
585+ extrdi $t7,$t7,16,0
586+ std $t4,`$FRAME+32`($sp)
587+ std $t5,`$FRAME+40`($sp)
588+ std $t6,`$FRAME+48`($sp)
589+ std $t7,`$FRAME+56`($sp)
590+
591+ lfd $A0,8($nap_d) ; load a[j] in double format
592+ lfd $A1,16($nap_d)
593+ lfd $A2,24($nap_d) ; load a[j+1] in double format
594+ lfd $A3,32($nap_d)
595+ lfd $N0,40($nap_d) ; load n[j] in double format
596+ lfd $N1,48($nap_d)
597+ lfd $N2,56($nap_d) ; load n[j+1] in double format
598+ lfdu $N3,64($nap_d)
599+
600+ lfd $ba,`$FRAME+0`($sp)
601+ lfd $bb,`$FRAME+8`($sp)
602+ lfd $bc,`$FRAME+16`($sp)
603+ lfd $bd,`$FRAME+24`($sp)
604+ lfd $na,`$FRAME+32`($sp)
605+ lfd $nb,`$FRAME+40`($sp)
606+ lfd $nc,`$FRAME+48`($sp)
607+ lfd $nd,`$FRAME+56`($sp)
608+
609+ fcfid $ba,$ba
610+ fcfid $bb,$bb
611+ fcfid $bc,$bc
612+ fcfid $bd,$bd
613+ fcfid $na,$na
614+ fcfid $nb,$nb
615+ fcfid $nc,$nc
616+ fcfid $nd,$nd
617+
618+ fmul $T1a,$A1,$ba
619+ fmul $T1b,$A1,$bb
620+ fmul $T2a,$A2,$ba
621+ fmul $T2b,$A2,$bb
622+ fmul $T3a,$A3,$ba
623+ fmul $T3b,$A3,$bb
624+ fmul $T0a,$A0,$ba
625+ fmul $T0b,$A0,$bb
626+
627+ fmadd $T1a,$A0,$bc,$T1a
628+ fmadd $T1b,$A0,$bd,$T1b
629+ fmadd $T2a,$A1,$bc,$T2a
630+ fmadd $T2b,$A1,$bd,$T2b
631+ fmadd $T3a,$A2,$bc,$T3a
632+ fmadd $T3b,$A2,$bd,$T3b
633+ fmul $dota,$A3,$bc
634+ fmul $dotb,$A3,$bd
635+
636+ fmadd $T1a,$N1,$na,$T1a
637+ fmadd $T1b,$N1,$nb,$T1b
638+ lfd $A0,8($nap_d) ; load a[j] in double format
639+ lfd $A1,16($nap_d)
640+ fmadd $T2a,$N2,$na,$T2a
641+ fmadd $T2b,$N2,$nb,$T2b
642+ lfd $A2,24($nap_d) ; load a[j+1] in double format
643+ lfd $A3,32($nap_d)
644+ fmadd $T3a,$N3,$na,$T3a
645+ fmadd $T3b,$N3,$nb,$T3b
646+ fmadd $T0a,$N0,$na,$T0a
647+ fmadd $T0b,$N0,$nb,$T0b
648+
649+ fmadd $T1a,$N0,$nc,$T1a
650+ fmadd $T1b,$N0,$nd,$T1b
651+ fmadd $T2a,$N1,$nc,$T2a
652+ fmadd $T2b,$N1,$nd,$T2b
653+ fmadd $T3a,$N2,$nc,$T3a
654+ fmadd $T3b,$N2,$nd,$T3b
655+ fmadd $dota,$N3,$nc,$dota
656+ fmadd $dotb,$N3,$nd,$dotb
657+
658+ fctid $T0a,$T0a
659+ fctid $T0b,$T0b
660+ fctid $T1a,$T1a
661+ fctid $T1b,$T1b
662+ fctid $T2a,$T2a
663+ fctid $T2b,$T2b
664+ fctid $T3a,$T3a
665+ fctid $T3b,$T3b
666+
667+ stfd $T0a,`$FRAME+0`($sp)
668+ stfd $T0b,`$FRAME+8`($sp)
669+ stfd $T1a,`$FRAME+16`($sp)
670+ stfd $T1b,`$FRAME+24`($sp)
671+ stfd $T2a,`$FRAME+32`($sp)
672+ stfd $T2b,`$FRAME+40`($sp)
673+ stfd $T3a,`$FRAME+48`($sp)
674+ stfd $T3b,`$FRAME+56`($sp)
675+
676+.align 5
677+Linner:
678+ fmul $T1a,$A1,$ba
679+ fmul $T1b,$A1,$bb
680+ fmul $T2a,$A2,$ba
681+ fmul $T2b,$A2,$bb
682+ lfd $N0,40($nap_d) ; load n[j] in double format
683+ lfd $N1,48($nap_d)
684+ fmul $T3a,$A3,$ba
685+ fmul $T3b,$A3,$bb
686+ fmadd $T0a,$A0,$ba,$dota
687+ fmadd $T0b,$A0,$bb,$dotb
688+ lfd $N2,56($nap_d) ; load n[j+1] in double format
689+ lfdu $N3,64($nap_d)
690+
691+ fmadd $T1a,$A0,$bc,$T1a
692+ fmadd $T1b,$A0,$bd,$T1b
693+ fmadd $T2a,$A1,$bc,$T2a
694+ fmadd $T2b,$A1,$bd,$T2b
695+ lfd $A0,8($nap_d) ; load a[j] in double format
696+ lfd $A1,16($nap_d)
697+ fmadd $T3a,$A2,$bc,$T3a
698+ fmadd $T3b,$A2,$bd,$T3b
699+ fmul $dota,$A3,$bc
700+ fmul $dotb,$A3,$bd
701+ lfd $A2,24($nap_d) ; load a[j+1] in double format
702+ lfd $A3,32($nap_d)
703+
704+ fmadd $T1a,$N1,$na,$T1a
705+ fmadd $T1b,$N1,$nb,$T1b
706+ ld $t0,`$FRAME+0`($sp)
707+ ld $t1,`$FRAME+8`($sp)
708+ fmadd $T2a,$N2,$na,$T2a
709+ fmadd $T2b,$N2,$nb,$T2b
710+ ld $t2,`$FRAME+16`($sp)
711+ ld $t3,`$FRAME+24`($sp)
712+ fmadd $T3a,$N3,$na,$T3a
713+ fmadd $T3b,$N3,$nb,$T3b
714+ add $t0,$t0,$carry ; can not overflow
715+ ld $t4,`$FRAME+32`($sp)
716+ ld $t5,`$FRAME+40`($sp)
717+ fmadd $T0a,$N0,$na,$T0a
718+ fmadd $T0b,$N0,$nb,$T0b
719+ srdi $carry,$t0,16
720+ add $t1,$t1,$carry
721+ srdi $carry,$t1,16
722+ ld $t6,`$FRAME+48`($sp)
723+ ld $t7,`$FRAME+56`($sp)
724+
725+ fmadd $T1a,$N0,$nc,$T1a
726+ fmadd $T1b,$N0,$nd,$T1b
727+ insrdi $t0,$t1,16,32
728+ ld $t1,8($tp) ; tp[j]
729+ fmadd $T2a,$N1,$nc,$T2a
730+ fmadd $T2b,$N1,$nd,$T2b
731+ add $t2,$t2,$carry
732+ fmadd $T3a,$N2,$nc,$T3a
733+ fmadd $T3b,$N2,$nd,$T3b
734+ srdi $carry,$t2,16
735+ insrdi $t0,$t2,16,16
736+ fmadd $dota,$N3,$nc,$dota
737+ fmadd $dotb,$N3,$nd,$dotb
738+ add $t3,$t3,$carry
739+ ldu $t2,16($tp) ; tp[j+1]
740+ srdi $carry,$t3,16
741+ insrdi $t0,$t3,16,0 ; 0..63 bits
742+ add $t4,$t4,$carry
743+
744+ fctid $T0a,$T0a
745+ fctid $T0b,$T0b
746+ srdi $carry,$t4,16
747+ fctid $T1a,$T1a
748+ fctid $T1b,$T1b
749+ add $t5,$t5,$carry
750+ fctid $T2a,$T2a
751+ fctid $T2b,$T2b
752+ srdi $carry,$t5,16
753+ insrdi $t4,$t5,16,32
754+ fctid $T3a,$T3a
755+ fctid $T3b,$T3b
756+ add $t6,$t6,$carry
757+ srdi $carry,$t6,16
758+ insrdi $t4,$t6,16,16
759+
760+ stfd $T0a,`$FRAME+0`($sp)
761+ stfd $T0b,`$FRAME+8`($sp)
762+ add $t7,$t7,$carry
763+ addc $t3,$t0,$t1
764+ stfd $T1a,`$FRAME+16`($sp)
765+ stfd $T1b,`$FRAME+24`($sp)
766+ insrdi $t4,$t7,16,0 ; 64..127 bits
767+ srdi $carry,$t7,16 ; upper 33 bits
768+ stfd $T2a,`$FRAME+32`($sp)
769+ stfd $T2b,`$FRAME+40`($sp)
770+ adde $t5,$t4,$t2
771+ stfd $T3a,`$FRAME+48`($sp)
772+ stfd $T3b,`$FRAME+56`($sp)
773+ addze $carry,$carry
774+ std $t3,-16($tp) ; tp[j-1]
775+ std $t5,-8($tp) ; tp[j]
776+ bdnz- Linner
777+
778+ fctid $dota,$dota
779+ fctid $dotb,$dotb
780+ ld $t0,`$FRAME+0`($sp)
781+ ld $t1,`$FRAME+8`($sp)
782+ ld $t2,`$FRAME+16`($sp)
783+ ld $t3,`$FRAME+24`($sp)
784+ ld $t4,`$FRAME+32`($sp)
785+ ld $t5,`$FRAME+40`($sp)
786+ ld $t6,`$FRAME+48`($sp)
787+ ld $t7,`$FRAME+56`($sp)
788+ stfd $dota,`$FRAME+64`($sp)
789+ stfd $dotb,`$FRAME+72`($sp)
790+
791+ add $t0,$t0,$carry ; can not overflow
792+ srdi $carry,$t0,16
793+ add $t1,$t1,$carry
794+ srdi $carry,$t1,16
795+ insrdi $t0,$t1,16,32
796+ add $t2,$t2,$carry
797+ ld $t1,8($tp) ; tp[j]
798+ srdi $carry,$t2,16
799+ insrdi $t0,$t2,16,16
800+ add $t3,$t3,$carry
801+ ldu $t2,16($tp) ; tp[j+1]
802+ srdi $carry,$t3,16
803+ insrdi $t0,$t3,16,0 ; 0..63 bits
804+ add $t4,$t4,$carry
805+ srdi $carry,$t4,16
806+ add $t5,$t5,$carry
807+ srdi $carry,$t5,16
808+ insrdi $t4,$t5,16,32
809+ add $t6,$t6,$carry
810+ srdi $carry,$t6,16
811+ insrdi $t4,$t6,16,16
812+ add $t7,$t7,$carry
813+ insrdi $t4,$t7,16,0 ; 64..127 bits
814+ srdi $carry,$t7,16 ; upper 33 bits
815+ ld $t6,`$FRAME+64`($sp)
816+ ld $t7,`$FRAME+72`($sp)
817+
818+ addc $t3,$t0,$t1
819+ adde $t5,$t4,$t2
820+ addze $carry,$carry
821+
822+ std $t3,-16($tp) ; tp[j-1]
823+ std $t5,-8($tp) ; tp[j]
824+
825+ add $carry,$carry,$ovf ; comsume upmost overflow
826+ add $t6,$t6,$carry ; can not overflow
827+ srdi $carry,$t6,16
828+ add $t7,$t7,$carry
829+ insrdi $t6,$t7,48,0
830+ srdi $ovf,$t7,48
831+ std $t6,0($tp) ; tp[num-1]
832+
833+ slwi $t7,$num,2
834+ addi $i,$i,8
835+ subf $nap_d,$t7,$nap_d ; rewind pointer
836+ cmpw $i,$num
837+ blt- Louter
838+
839+ subf $np,$num,$np ; rewind np
840+ addi $j,$j,1 ; restore counter
841+ subfc $i,$i,$i ; j=0 and "clear" XER[CA]
842+ addi $tp,$sp,`$FRAME+$TRANSFER+8`
843+ addi $t4,$sp,`$FRAME+$TRANSFER+16`
844+ addi $t5,$np,8
845+ addi $t6,$rp,8
846+ mtctr $j
847+
848+.align 4
849+Lsub: ldx $t0,$tp,$i
850+ ldx $t1,$np,$i
851+ ldx $t2,$t4,$i
852+ ldx $t3,$t5,$i
853+ subfe $t0,$t1,$t0 ; tp[j]-np[j]
854+ subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1]
855+ stdx $t0,$rp,$i
856+ stdx $t2,$t6,$i
857+ addi $i,$i,16
858+ bdnz- Lsub
859+
860+ li $i,0
861+ subfe $ovf,$i,$ovf ; handle upmost overflow bit
862+ and $ap,$tp,$ovf
863+ andc $np,$rp,$ovf
864+ or $ap,$ap,$np ; ap=borrow?tp:rp
865+ addi $t7,$ap,8
866+ mtctr $j
867+
868+.align 4
869+Lcopy: ; copy or in-place refresh
870+ ldx $t0,$ap,$i
871+ ldx $t1,$t7,$i
872+ std $i,8($nap_d) ; zap nap_d
873+ std $i,16($nap_d)
874+ std $i,24($nap_d)
875+ std $i,32($nap_d)
876+ std $i,40($nap_d)
877+ std $i,48($nap_d)
878+ std $i,56($nap_d)
879+ stdu $i,64($nap_d)
880+ stdx $t0,$rp,$i
881+ stdx $t1,$t6,$i
882+ stdx $i,$tp,$i ; zap tp at once
883+ stdx $i,$t4,$i
884+ addi $i,$i,16
885+ bdnz- Lcopy
886+
887+ $POP r14,`2*$SIZE_T`($sp)
888+ $POP r15,`3*$SIZE_T`($sp)
889+ $POP r16,`4*$SIZE_T`($sp)
890+ $POP r17,`5*$SIZE_T`($sp)
891+ $POP r18,`6*$SIZE_T`($sp)
892+ $POP r19,`7*$SIZE_T`($sp)
893+ $POP r20,`8*$SIZE_T`($sp)
894+ $POP r21,`9*$SIZE_T`($sp)
895+ $POP r22,`10*$SIZE_T`($sp)
896+ $POP r23,`11*$SIZE_T`($sp)
897+ lfd f14,`12*$SIZE_T+0`($sp)
898+ lfd f15,`12*$SIZE_T+8`($sp)
899+ lfd f16,`12*$SIZE_T+16`($sp)
900+ lfd f17,`12*$SIZE_T+24`($sp)
901+ lfd f18,`12*$SIZE_T+32`($sp)
902+ lfd f19,`12*$SIZE_T+40`($sp)
903+ lfd f20,`12*$SIZE_T+48`($sp)
904+ lfd f21,`12*$SIZE_T+56`($sp)
905+ lfd f22,`12*$SIZE_T+64`($sp)
906+ lfd f23,`12*$SIZE_T+72`($sp)
907+ lfd f24,`12*$SIZE_T+80`($sp)
908+ lfd f25,`12*$SIZE_T+88`($sp)
909+ $POP $sp,0($sp)
910+ li r3,1 ; signal "handled"
911+ blr
912+ .long 0
913+.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
914+___
915+
916+$code =~ s/\`([^\`]*)\`/eval $1/gem; # evaluate backtick-quoted Perl expressions embedded in the template
917+print $code; # send the expanded assembly down the ppc-xlate.pl pipe
918+close STDOUT or die "error closing STDOUT: $!"; # closing a pipe also reports a failing translator
--- /dev/null
+++ b/crypto/bn/asm/s390x-mont.pl
@@ -0,0 +1,225 @@
1+#!/usr/bin/env perl
2+
3+# ====================================================================
4+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5+# project. The module is, however, dual licensed under OpenSSL and
6+# CRYPTOGAMS licenses depending on where you obtain it. For further
7+# details see http://www.openssl.org/~appro/cryptogams/.
8+# ====================================================================
9+
10+# April 2007.
11+#
12+# Performance improvement over vanilla C code varies from 85% to 45%
13+# depending on key length and benchmark. Unfortunately in this context
14+# these are not very impressive results [for code that utilizes "wide"
15+# 64x64=128-bit multiplication, which is not commonly available to C
16+# programmers], at least hand-coded bn_asm.c replacement is known to
17+# provide 30-40% better results for longest keys. Well, on second
18+# thought it's not very surprising, because z-CPUs are single-issue
19+# and _strictly_ in-order execution, while bn_mul_mont is more or less
20+# dependent on CPU ability to pipe-line instructions and have several
21+# of them "in-flight" at the same time. I mean while other methods,
22+# for example Karatsuba, aim to minimize amount of multiplications at
23+# the cost of an increase in other operations, bn_mul_mont aims to neatly
24+# "overlap" multiplications and the other operations [and on most
25+# platforms even minimize the amount of the other operations, in
26+# particular references to memory]. But it's possible to improve this
27+# module performance by implementing dedicated squaring code-path and
28+# possibly by unrolling loops...
29+
30+# January 2009.
31+#
32+# Reschedule to minimize/avoid Address Generation Interlock hazard,
33+# make inner loops counter-based.
34+
35+$mn0="%r0"; # tp[0]*n0, the multiplier applied to np[j] (m1 in the comments below)
36+$num="%r1"; # word count, loaded from 164($sp) at function entry
37+
38+# int bn_mul_mont(
39+$rp="%r2"; # BN_ULONG *rp,
40+$ap="%r3"; # const BN_ULONG *ap,
41+$bp="%r4"; # const BN_ULONG *bp,
42+$np="%r5"; # const BN_ULONG *np,
43+$n0="%r6"; # const BN_ULONG *n0,
44+#$num="160(%r15)" # int num);
45+
46+$bi="%r2"; # zaps rp
47+$j="%r7"; # byte offset of the current word (advanced by 8 per iteration)
48+
49+$ahi="%r8"; # high word of ap[j]*bp[i]
50+$alo="%r9"; # low word of ap[j]*bp[i]
51+$nhi="%r10"; # high word of np[j]*m1
52+$nlo="%r11"; # low word of np[j]*m1
53+$AHI="%r12"; # running high word / carry of the ap*bp column
54+$NHI="%r13"; # running high word / carry of the np*m1 column
55+$count="%r14"; # loop counter decremented by brct
56+$sp="%r15"; # stack pointer
57+
58+$code.=<<___;
59+.text
60+.globl bn_mul_mont
61+.type bn_mul_mont,\@function
62+bn_mul_mont:
63+ lgf $num,164($sp) # pull $num
64+ sla $num,3 # $num to enumerate bytes
65+ la $bp,0($num,$bp)
66+
67+ stg %r2,16($sp)
68+
69+ cghi $num,16 #
70+ lghi %r2,0 #
71+ blr %r14 # if($num<16) return 0;
72+ cghi $num,128 #
73+ bhr %r14 # if($num>128) return 0;
74+
75+ stmg %r3,%r15,24($sp)
76+
77+ lghi $rp,-160-8 # leave room for carry bit
78+ lcgr $j,$num # -$num
79+ lgr %r0,$sp
80+ la $rp,0($rp,$sp)
81+ la $sp,0($j,$rp) # alloca
82+ stg %r0,0($sp) # back chain
83+
84+ sra $num,3 # restore $num
85+ la $bp,0($j,$bp) # restore $bp
86+ ahi $num,-1 # adjust $num for inner loop
87+ lg $n0,0($n0) # pull n0
88+
89+ lg $bi,0($bp)
90+ lg $alo,0($ap)
91+ mlgr $ahi,$bi # ap[0]*bp[0]
92+ lgr $AHI,$ahi
93+
94+ lgr $mn0,$alo # "tp[0]"*n0
95+ msgr $mn0,$n0
96+
97+ lg $nlo,0($np) #
98+ mlgr $nhi,$mn0 # np[0]*m1
99+ algr $nlo,$alo # +="tp[0]"
100+ lghi $NHI,0
101+ alcgr $NHI,$nhi
102+
103+ la $j,8(%r0) # j=1
104+ lr $count,$num
105+
106+.align 16
107+.L1st:
108+ lg $alo,0($j,$ap)
109+ mlgr $ahi,$bi # ap[j]*bp[0]
110+ algr $alo,$AHI
111+ lghi $AHI,0
112+ alcgr $AHI,$ahi
113+
114+ lg $nlo,0($j,$np)
115+ mlgr $nhi,$mn0 # np[j]*m1
116+ algr $nlo,$NHI
117+ lghi $NHI,0
118+ alcgr $nhi,$NHI # +="tp[j]"
119+ algr $nlo,$alo
120+ alcgr $NHI,$nhi
121+
122+ stg $nlo,160-8($j,$sp) # tp[j-1]=
123+ la $j,8($j) # j++
124+ brct $count,.L1st
125+
126+ algr $NHI,$AHI
127+ lghi $AHI,0
128+ alcgr $AHI,$AHI # upmost overflow bit
129+ stg $NHI,160-8($j,$sp)
130+ stg $AHI,160($j,$sp)
131+ la $bp,8($bp) # bp++
132+
133+.Louter:
134+ lg $bi,0($bp) # bp[i]
135+ lg $alo,0($ap)
136+ mlgr $ahi,$bi # ap[0]*bp[i]
137+ alg $alo,160($sp) # +=tp[0]
138+ lghi $AHI,0
139+ alcgr $AHI,$ahi
140+
141+ lgr $mn0,$alo
142+ msgr $mn0,$n0 # tp[0]*n0
143+
144+ lg $nlo,0($np) # np[0]
145+ mlgr $nhi,$mn0 # np[0]*m1
146+ algr $nlo,$alo # +="tp[0]"
147+ lghi $NHI,0
148+ alcgr $NHI,$nhi
149+
150+ la $j,8(%r0) # j=1
151+ lr $count,$num
152+
153+.align 16
154+.Linner:
155+ lg $alo,0($j,$ap)
156+ mlgr $ahi,$bi # ap[j]*bp[i]
157+ algr $alo,$AHI
158+ lghi $AHI,0
159+ alcgr $ahi,$AHI
160+ alg $alo,160($j,$sp)# +=tp[j]
161+ alcgr $AHI,$ahi
162+
163+ lg $nlo,0($j,$np)
164+ mlgr $nhi,$mn0 # np[j]*m1
165+ algr $nlo,$NHI
166+ lghi $NHI,0
167+ alcgr $nhi,$NHI
168+ algr $nlo,$alo # +="tp[j]"
169+ alcgr $NHI,$nhi
170+
171+ stg $nlo,160-8($j,$sp) # tp[j-1]=
172+ la $j,8($j) # j++
173+ brct $count,.Linner
174+
175+ algr $NHI,$AHI
176+ lghi $AHI,0
177+ alcgr $AHI,$AHI
178+ alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit
179+ lghi $ahi,0
180+ alcgr $AHI,$ahi # new upmost overflow bit
181+ stg $NHI,160-8($j,$sp)
182+ stg $AHI,160($j,$sp)
183+
184+ la $bp,8($bp) # bp++
185+ clg $bp,160+8+32($j,$sp) # compare to &bp[num]
186+ jne .Louter
187+
188+ lg $rp,160+8+16($j,$sp) # reincarnate rp
189+ la $ap,160($sp)
190+ ahi $num,1 # restore $num, incidentally clears "borrow"
191+
192+ la $j,0(%r0)
193+ lr $count,$num
194+.Lsub: lg $alo,0($j,$ap)
195+ slbg $alo,0($j,$np)
196+ stg $alo,0($j,$rp)
197+ la $j,8($j)
198+ brct $count,.Lsub
199+ lghi $ahi,0
200+ slbgr $AHI,$ahi # handle upmost carry
201+
202+ ngr $ap,$AHI
203+ lghi $np,-1
204+ xgr $np,$AHI
205+ ngr $np,$rp
206+ ogr $ap,$np # ap=borrow?tp:rp
207+
208+ la $j,0(%r0)
209+ lgr $count,$num
210+.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
211+ stg $j,160($j,$sp) # zap tp
212+ stg $alo,0($j,$rp)
213+ la $j,8($j)
214+ brct $count,.Lcopy
215+
216+ la %r1,160+8+48($j,$sp)
217+ lmg %r6,%r15,0(%r1)
218+ lghi %r2,1 # signal "processed"
219+ br %r14
220+.size bn_mul_mont,.-bn_mul_mont
221+.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
222+___
223+
224+print $code; # emit the assembly template on STDOUT
225+close STDOUT or die "error closing STDOUT: $!"; # a failed flush/close must not go unnoticed
--- /dev/null
+++ b/crypto/bn/asm/s390x.S
@@ -0,0 +1,678 @@
1+.ident "s390x.S, version 1.0"
2+// ====================================================================
3+// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
4+// project.
5+//
6+// Rights for redistribution and usage in source and binary forms are
7+// granted according to the OpenSSL license. Warranty of any kind is
8+// disclaimed.
9+// ====================================================================
10+
11+.text
12+
13+#define zero %r0
14+
15+// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5); rp[i]+=ap[i]*w for each of the len words, returns the final carry word
16+.globl bn_mul_add_words
17+.type bn_mul_add_words,@function
18+.align 4
19+bn_mul_add_words:
20+ lghi zero,0 // zero = 0
21+ la %r1,0(%r2) // put rp aside
22+ lghi %r2,0 // i=0;
23+ ltgfr %r4,%r4 // sign-extend len to 64 bits and set the condition code
24+ bler %r14 // if (len<=0) return 0;
25+
26+ stmg %r6,%r10,48(%r15) // save %r6-%r10 at 48(%r15), reloaded at .Lend_madd
27+ lghi %r8,0 // carry = 0
28+ srag %r10,%r4,2 // cnt=len/4
29+ jz .Loop1_madd // len<4: only the one-word loop runs
30+
31+.Loop4_madd: // four words per pass; the carry word alternates between %r8 and %r6
32+ lg %r7,0(%r2,%r3) // ap[i]
33+ mlgr %r6,%r5 // *=w
34+ algr %r7,%r8 // +=carry
35+ alcgr %r6,zero // propagate carry-out into the high word
36+ alg %r7,0(%r2,%r1) // +=rp[i]
37+ alcgr %r6,zero // propagate carry-out into the high word
38+ stg %r7,0(%r2,%r1) // rp[i]=
39+
40+ lg %r9,8(%r2,%r3) // ap[i+1], incoming carry is %r6, outgoing is %r8
41+ mlgr %r8,%r5
42+ algr %r9,%r6
43+ alcgr %r8,zero
44+ alg %r9,8(%r2,%r1)
45+ alcgr %r8,zero
46+ stg %r9,8(%r2,%r1)
47+
48+ lg %r7,16(%r2,%r3) // ap[i+2], incoming carry is %r8, outgoing is %r6
49+ mlgr %r6,%r5
50+ algr %r7,%r8
51+ alcgr %r6,zero
52+ alg %r7,16(%r2,%r1)
53+ alcgr %r6,zero
54+ stg %r7,16(%r2,%r1)
55+
56+ lg %r9,24(%r2,%r3) // ap[i+3], incoming carry is %r6, outgoing is %r8
57+ mlgr %r8,%r5
58+ algr %r9,%r6
59+ alcgr %r8,zero
60+ alg %r9,24(%r2,%r1)
61+ alcgr %r8,zero
62+ stg %r9,24(%r2,%r1)
63+
64+ la %r2,32(%r2) // i+=4
65+ brct %r10,.Loop4_madd
66+
67+ lghi %r10,3
68+ nr %r4,%r10 // cnt=len%4
69+ jz .Lend_madd // no leftover words
70+
71+.Loop1_madd: // one word per pass, carry kept in %r8
72+ lg %r7,0(%r2,%r3) // ap[i]
73+ mlgr %r6,%r5 // *=w
74+ algr %r7,%r8 // +=carry
75+ alcgr %r6,zero
76+ alg %r7,0(%r2,%r1) // +=rp[i]
77+ alcgr %r6,zero
78+ stg %r7,0(%r2,%r1) // rp[i]=
79+
80+ lgr %r8,%r6 // high word becomes the next carry
81+ la %r2,8(%r2) // i++
82+ brct %r4,.Loop1_madd
83+
84+.Lend_madd:
85+ lgr %r2,%r8 // return the final carry
86+ lmg %r6,%r10,48(%r15) // restore %r6-%r10
87+ br %r14
88+.size bn_mul_add_words,.-bn_mul_add_words
89+
90+// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5); rp[i]=ap[i]*w plus running carry for each of the len words, returns the final carry word
91+.globl bn_mul_words
92+.type bn_mul_words,@function
93+.align 4
94+bn_mul_words:
95+ lghi zero,0 // zero = 0
96+ la %r1,0(%r2) // put rp aside
97+ lghi %r2,0 // i=0;
98+ ltgfr %r4,%r4 // sign-extend len to 64 bits and set the condition code
99+ bler %r14 // if (len<=0) return 0;
100+
101+ stmg %r6,%r10,48(%r15) // save %r6-%r10 at 48(%r15), reloaded at .Lend_mul
102+ lghi %r8,0 // carry = 0
103+ srag %r10,%r4,2 // cnt=len/4
104+ jz .Loop1_mul // len<4: only the one-word loop runs
105+
106+.Loop4_mul: // four words per pass; the carry word alternates between %r8 and %r6
107+ lg %r7,0(%r2,%r3) // ap[i]
108+ mlgr %r6,%r5 // *=w
109+ algr %r7,%r8 // +=carry
110+ alcgr %r6,zero // propagate carry-out into the high word
111+ stg %r7,0(%r2,%r1) // rp[i]=
112+
113+ lg %r9,8(%r2,%r3) // ap[i+1], incoming carry is %r6, outgoing is %r8
114+ mlgr %r8,%r5
115+ algr %r9,%r6
116+ alcgr %r8,zero
117+ stg %r9,8(%r2,%r1)
118+
119+ lg %r7,16(%r2,%r3) // ap[i+2], incoming carry is %r8, outgoing is %r6
120+ mlgr %r6,%r5
121+ algr %r7,%r8
122+ alcgr %r6,zero
123+ stg %r7,16(%r2,%r1)
124+
125+ lg %r9,24(%r2,%r3) // ap[i+3], incoming carry is %r6, outgoing is %r8
126+ mlgr %r8,%r5
127+ algr %r9,%r6
128+ alcgr %r8,zero
129+ stg %r9,24(%r2,%r1)
130+
131+ la %r2,32(%r2) // i+=4
132+ brct %r10,.Loop4_mul
133+
134+ lghi %r10,3
135+ nr %r4,%r10 // cnt=len%4
136+ jz .Lend_mul // no leftover words
137+
138+.Loop1_mul: // one word per pass, carry kept in %r8
139+ lg %r7,0(%r2,%r3) // ap[i]
140+ mlgr %r6,%r5 // *=w
141+ algr %r7,%r8 // +=carry
142+ alcgr %r6,zero
143+ stg %r7,0(%r2,%r1) // rp[i]=
144+
145+ lgr %r8,%r6 // high word becomes the next carry
146+ la %r2,8(%r2) // i++
147+ brct %r4,.Loop1_mul
148+
149+.Lend_mul:
150+ lgr %r2,%r8 // return the final carry
151+ lmg %r6,%r10,48(%r15) // restore %r6-%r10
152+ br %r14
153+.size bn_mul_words,.-bn_mul_words
154+
155+// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r3,int r4); squares each of the len words at r3 and stores the 128-bit results as low/high word pairs at r2
156+.globl bn_sqr_words
157+.type bn_sqr_words,@function
158+.align 4
159+bn_sqr_words:
160+ ltgfr %r4,%r4 // sign-extend len to 64 bits and set the condition code
161+ bler %r14 // if (len<=0) return
162+
163+ stmg %r6,%r7,48(%r15) // save %r6-%r7 at 48(%r15), reloaded at .Lend_sqr
164+ srag %r1,%r4,2 // cnt=len/4
165+ jz .Loop1_sqr // len<4: only the one-word loop runs
166+
167+.Loop4_sqr: // four input words, eight output words per pass
168+ lg %r7,0(%r3)
169+ mlgr %r6,%r7 // %r6:%r7 = word*word
170+ stg %r7,0(%r2) // low half
171+ stg %r6,8(%r2) // high half
172+
173+ lg %r7,8(%r3)
174+ mlgr %r6,%r7
175+ stg %r7,16(%r2)
176+ stg %r6,24(%r2)
177+
178+ lg %r7,16(%r3)
179+ mlgr %r6,%r7
180+ stg %r7,32(%r2)
181+ stg %r6,40(%r2)
182+
183+ lg %r7,24(%r3)
184+ mlgr %r6,%r7
185+ stg %r7,48(%r2)
186+ stg %r6,56(%r2)
187+
188+ la %r3,32(%r3) // advance input by 4 words
189+ la %r2,64(%r2) // advance output by 8 words
190+ brct %r1,.Loop4_sqr
191+
192+ lghi %r1,3
193+ nr %r4,%r1 // cnt=len%4
194+ jz .Lend_sqr // no leftover words
195+
196+.Loop1_sqr: // one input word, two output words per pass
197+ lg %r7,0(%r3)
198+ mlgr %r6,%r7
199+ stg %r7,0(%r2)
200+ stg %r6,8(%r2)
201+
202+ la %r3,8(%r3)
203+ la %r2,16(%r2)
204+ brct %r4,.Loop1_sqr
205+
206+.Lend_sqr:
207+ lmg %r6,%r7,48(%r15) // restore %r6-%r7
208+ br %r14
209+.size bn_sqr_words,.-bn_sqr_words
210+
211+// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d); returns the quotient of the 128-bit value h:l divided by d
212+.globl bn_div_words
213+.type bn_div_words,@function
214+.align 4
215+bn_div_words:
216+ dlgr %r2,%r4 // divide the pair %r2:%r3 (h:l) by d; remainder -> %r2, quotient -> %r3
217+ lgr %r2,%r3 // return the quotient
218+ br %r14
219+.size bn_div_words,.-bn_div_words
220+
221+// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5); rp[i]=ap[i]+bp[i] with carry chained across all len words, returns the final carry (0 or 1)
222+.globl bn_add_words
223+.type bn_add_words,@function
224+.align 4
225+bn_add_words:
226+ la %r1,0(%r2) // put rp aside
227+ lghi %r2,0 // i=0
228+ ltgfr %r5,%r5 // sign-extend len to 64 bits and set the condition code
229+ bler %r14 // if (len<=0) return 0;
230+
231+ stg %r6,48(%r15) // save %r6, reloaded at .Lexit_add
232+ lghi %r6,3
233+ nr %r6,%r5 // len%4
234+ sra %r5,2 // len/4, use sra because it sets condition code
235+ jz .Loop1_add // carry is incidentally cleared if branch taken
236+ algr %r2,%r2 // clear carry
237+
238+.Loop4_add: // four words per pass; the carry lives in the condition code throughout
239+ lg %r0,0(%r2,%r3)
240+ alcg %r0,0(%r2,%r4) // ap[i]+bp[i]+carry
241+ stg %r0,0(%r2,%r1)
242+ lg %r0,8(%r2,%r3)
243+ alcg %r0,8(%r2,%r4)
244+ stg %r0,8(%r2,%r1)
245+ lg %r0,16(%r2,%r3)
246+ alcg %r0,16(%r2,%r4)
247+ stg %r0,16(%r2,%r1)
248+ lg %r0,24(%r2,%r3)
249+ alcg %r0,24(%r2,%r4)
250+ stg %r0,24(%r2,%r1)
251+
252+ la %r2,32(%r2) // i+=4
253+ brct %r5,.Loop4_add
254+
255+ la %r6,1(%r6) // see if len%4 is zero ...
256+ brct %r6,.Loop1_add // without touching condition code:-)
257+
258+.Lexit_add:
259+ lghi %r2,0
260+ alcgr %r2,%r2 // %r2 = 0+0+carry, i.e. the return value
261+ lg %r6,48(%r15) // restore %r6
262+ br %r14
263+
264+.Loop1_add: // one word per pass for the len%4 leftover words
265+ lg %r0,0(%r2,%r3)
266+ alcg %r0,0(%r2,%r4)
267+ stg %r0,0(%r2,%r1)
268+
269+ la %r2,8(%r2) // i++
270+ brct %r6,.Loop1_add
271+
272+ j .Lexit_add
273+.size bn_add_words,.-bn_add_words
274+
275+// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5); rp[i]=ap[i]-bp[i] with borrow chained across all len words, returns the final borrow (0 or 1)
276+.globl bn_sub_words
277+.type bn_sub_words,@function
278+.align 4
279+bn_sub_words:
280+ la %r1,0(%r2) // put rp aside
281+ lghi %r2,0 // i=0
282+ ltgfr %r5,%r5 // sign-extend len to 64 bits and set the condition code
283+ bler %r14 // if (len<=0) return 0;
284+
285+ stg %r6,48(%r15) // save %r6, reloaded at .Lexit_sub
286+ lghi %r6,3
287+ nr %r6,%r5 // len%4
288+ sra %r5,2 // len/4, use sra because it sets condition code
289+ jnz .Loop4_sub // borrow is incidentally cleared if branch taken
290+ slgr %r2,%r2 // clear borrow
291+
292+.Loop1_sub: // one word per pass for the len%4 leftover words
293+ lg %r0,0(%r2,%r3)
294+ slbg %r0,0(%r2,%r4) // ap[i]-bp[i]-borrow
295+ stg %r0,0(%r2,%r1)
296+
297+ la %r2,8(%r2) // i++
298+ brct %r6,.Loop1_sub
299+ j .Lexit_sub
300+
301+.Loop4_sub: // four words per pass; the borrow lives in the condition code throughout
302+ lg %r0,0(%r2,%r3)
303+ slbg %r0,0(%r2,%r4)
304+ stg %r0,0(%r2,%r1)
305+ lg %r0,8(%r2,%r3)
306+ slbg %r0,8(%r2,%r4)
307+ stg %r0,8(%r2,%r1)
308+ lg %r0,16(%r2,%r3)
309+ slbg %r0,16(%r2,%r4)
310+ stg %r0,16(%r2,%r1)
311+ lg %r0,24(%r2,%r3)
312+ slbg %r0,24(%r2,%r4)
313+ stg %r0,24(%r2,%r1)
314+
315+ la %r2,32(%r2) // i+=4
316+ brct %r5,.Loop4_sub
317+
318+ la %r6,1(%r6) // see if len%4 is zero ...
319+ brct %r6,.Loop1_sub // without touching condition code:-)
320+
321+.Lexit_sub:
322+ lghi %r2,0
323+ slbgr %r2,%r2 // %r2 = 0-0-borrow, i.e. 0 or -1
324+ lcgr %r2,%r2 // negate so the return value is 0 or 1
325+ lg %r6,48(%r15) // restore %r6
326+ br %r14
327+.size bn_sub_words,.-bn_sub_words
328+// c1,c2,c3: the three registers the comba routines below rotate through as a three-word column accumulator.
329+#define c1 %r1
330+#define c2 %r5
331+#define c3 %r8
332+// mul_add_c: (c3:c2:c1) += a[ai]*b[bi], with a at %r3 and b at %r4; clobbers %r6 and %r7 and needs the zero register to hold 0.
333+#define mul_add_c(ai,bi,c1,c2,c3) \
334+ lg %r7,ai*8(%r3); \
335+ mlg %r6,bi*8(%r4); \
336+ algr c1,%r7; \
337+ alcgr c2,%r6; \
338+ alcgr c3,zero
339+
340+// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4); 8x8-word product of the arrays at r3 and r4, written column by column to the 16 words at r2
341+.globl bn_mul_comba8
342+.type bn_mul_comba8,@function
343+.align 4
344+bn_mul_comba8:
345+ stmg %r6,%r8,48(%r15) // save %r6-%r8 at 48(%r15), reloaded before returning
346+
347+ lghi c1,0
348+ lghi c2,0
349+ lghi c3,0
350+ lghi zero,0 // mul_add_c adds this register to absorb carries
351+
352+ mul_add_c(0,0,c1,c2,c3); // column 0
353+ stg c1,0*8(%r2)
354+ lghi c1,0 // the stored word's register becomes the new top word of the accumulator
355+
356+ mul_add_c(0,1,c2,c3,c1); // column 1
357+ mul_add_c(1,0,c2,c3,c1);
358+ stg c2,1*8(%r2)
359+ lghi c2,0
360+
361+ mul_add_c(2,0,c3,c1,c2); // column 2
362+ mul_add_c(1,1,c3,c1,c2);
363+ mul_add_c(0,2,c3,c1,c2);
364+ stg c3,2*8(%r2)
365+ lghi c3,0
366+
367+ mul_add_c(0,3,c1,c2,c3); // column 3
368+ mul_add_c(1,2,c1,c2,c3);
369+ mul_add_c(2,1,c1,c2,c3);
370+ mul_add_c(3,0,c1,c2,c3);
371+ stg c1,3*8(%r2)
372+ lghi c1,0
373+
374+ mul_add_c(4,0,c2,c3,c1); // column 4
375+ mul_add_c(3,1,c2,c3,c1);
376+ mul_add_c(2,2,c2,c3,c1);
377+ mul_add_c(1,3,c2,c3,c1);
378+ mul_add_c(0,4,c2,c3,c1);
379+ stg c2,4*8(%r2)
380+ lghi c2,0
381+
382+ mul_add_c(0,5,c3,c1,c2); // column 5
383+ mul_add_c(1,4,c3,c1,c2);
384+ mul_add_c(2,3,c3,c1,c2);
385+ mul_add_c(3,2,c3,c1,c2);
386+ mul_add_c(4,1,c3,c1,c2);
387+ mul_add_c(5,0,c3,c1,c2);
388+ stg c3,5*8(%r2)
389+ lghi c3,0
390+
391+ mul_add_c(6,0,c1,c2,c3); // column 6
392+ mul_add_c(5,1,c1,c2,c3);
393+ mul_add_c(4,2,c1,c2,c3);
394+ mul_add_c(3,3,c1,c2,c3);
395+ mul_add_c(2,4,c1,c2,c3);
396+ mul_add_c(1,5,c1,c2,c3);
397+ mul_add_c(0,6,c1,c2,c3);
398+ stg c1,6*8(%r2)
399+ lghi c1,0
400+
401+ mul_add_c(0,7,c2,c3,c1); // column 7
402+ mul_add_c(1,6,c2,c3,c1);
403+ mul_add_c(2,5,c2,c3,c1);
404+ mul_add_c(3,4,c2,c3,c1);
405+ mul_add_c(4,3,c2,c3,c1);
406+ mul_add_c(5,2,c2,c3,c1);
407+ mul_add_c(6,1,c2,c3,c1);
408+ mul_add_c(7,0,c2,c3,c1);
409+ stg c2,7*8(%r2)
410+ lghi c2,0
411+
412+ mul_add_c(7,1,c3,c1,c2); // column 8
413+ mul_add_c(6,2,c3,c1,c2);
414+ mul_add_c(5,3,c3,c1,c2);
415+ mul_add_c(4,4,c3,c1,c2);
416+ mul_add_c(3,5,c3,c1,c2);
417+ mul_add_c(2,6,c3,c1,c2);
418+ mul_add_c(1,7,c3,c1,c2);
419+ stg c3,8*8(%r2)
420+ lghi c3,0
421+
422+ mul_add_c(2,7,c1,c2,c3); // column 9
423+ mul_add_c(3,6,c1,c2,c3);
424+ mul_add_c(4,5,c1,c2,c3);
425+ mul_add_c(5,4,c1,c2,c3);
426+ mul_add_c(6,3,c1,c2,c3);
427+ mul_add_c(7,2,c1,c2,c3);
428+ stg c1,9*8(%r2)
429+ lghi c1,0
430+
431+ mul_add_c(7,3,c2,c3,c1); // column 10
432+ mul_add_c(6,4,c2,c3,c1);
433+ mul_add_c(5,5,c2,c3,c1);
434+ mul_add_c(4,6,c2,c3,c1);
435+ mul_add_c(3,7,c2,c3,c1);
436+ stg c2,10*8(%r2)
437+ lghi c2,0
438+
439+ mul_add_c(4,7,c3,c1,c2); // column 11
440+ mul_add_c(5,6,c3,c1,c2);
441+ mul_add_c(6,5,c3,c1,c2);
442+ mul_add_c(7,4,c3,c1,c2);
443+ stg c3,11*8(%r2)
444+ lghi c3,0
445+
446+ mul_add_c(7,5,c1,c2,c3); // column 12
447+ mul_add_c(6,6,c1,c2,c3);
448+ mul_add_c(5,7,c1,c2,c3);
449+ stg c1,12*8(%r2)
450+ lghi c1,0
451+
452+
453+ mul_add_c(6,7,c2,c3,c1); // column 13
454+ mul_add_c(7,6,c2,c3,c1);
455+ stg c2,13*8(%r2)
456+ lghi c2,0
457+
458+ mul_add_c(7,7,c3,c1,c2); // column 14; its high word is the last result word
459+ stg c3,14*8(%r2)
460+ stg c1,15*8(%r2)
461+
462+ lmg %r6,%r8,48(%r15) // restore %r6-%r8
463+ br %r14
464+.size bn_mul_comba8,.-bn_mul_comba8
465+
466+// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4); 4x4-word product of the arrays at r3 and r4, written column by column to the 8 words at r2
467+.globl bn_mul_comba4
468+.type bn_mul_comba4,@function
469+.align 4
470+bn_mul_comba4:
471+ stmg %r6,%r8,48(%r15) // save %r6-%r8 at 48(%r15), reloaded before returning
472+
473+ lghi c1,0
474+ lghi c2,0
475+ lghi c3,0
476+ lghi zero,0 // mul_add_c adds this register to absorb carries
477+
478+ mul_add_c(0,0,c1,c2,c3); // column 0
479+ stg c1,0*8(%r2) // result word 0 goes to the output array at %r2, not to the input at %r3
480+ lghi c1,0 // the stored word's register becomes the new top word of the accumulator
481+
482+ mul_add_c(0,1,c2,c3,c1); // column 1
483+ mul_add_c(1,0,c2,c3,c1);
484+ stg c2,1*8(%r2)
485+ lghi c2,0
486+
487+ mul_add_c(2,0,c3,c1,c2); // column 2
488+ mul_add_c(1,1,c3,c1,c2);
489+ mul_add_c(0,2,c3,c1,c2);
490+ stg c3,2*8(%r2)
491+ lghi c3,0
492+
493+ mul_add_c(0,3,c1,c2,c3); // column 3
494+ mul_add_c(1,2,c1,c2,c3);
495+ mul_add_c(2,1,c1,c2,c3);
496+ mul_add_c(3,0,c1,c2,c3);
497+ stg c1,3*8(%r2)
498+ lghi c1,0
499+
500+ mul_add_c(3,1,c2,c3,c1); // column 4
501+ mul_add_c(2,2,c2,c3,c1);
502+ mul_add_c(1,3,c2,c3,c1);
503+ stg c2,4*8(%r2)
504+ lghi c2,0
505+
506+ mul_add_c(2,3,c3,c1,c2); // column 5
507+ mul_add_c(3,2,c3,c1,c2);
508+ stg c3,5*8(%r2)
509+ lghi c3,0
510+
511+ mul_add_c(3,3,c1,c2,c3); // column 6; its high word is the last result word
512+ stg c1,6*8(%r2)
513+ stg c2,7*8(%r2)
514+
515+ lmg %r6,%r8,48(%r15) // restore %r6-%r8 (must be a load, matching the stmg in the prologue)
516+ br %r14
517+.size bn_mul_comba4,.-bn_mul_comba4
518+// sqr_add_c: (c3:c2:c1) += a[ai]*a[ai], with a at %r3; clobbers %r6 and %r7 and needs the zero register to hold 0.
519+#define sqr_add_c(ai,c1,c2,c3) \
520+ lg %r7,ai*8(%r3); \
521+ mlgr %r6,%r7; \
522+ algr c1,%r7; \
523+ alcgr c2,%r6; \
524+ alcgr c3,zero
525+// sqr_add_c2: (c3:c2:c1) += 2*a[ai]*a[aj]; the product is formed once and added twice.
526+#define sqr_add_c2(ai,aj,c1,c2,c3) \
527+ lg %r7,ai*8(%r3); \
528+ mlg %r6,aj*8(%r3); \
529+ algr c1,%r7; \
530+ alcgr c2,%r6; \
531+ alcgr c3,zero; \
532+ algr c1,%r7; \
533+ alcgr c2,%r6; \
534+ alcgr c3,zero
535+
536+// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3); square of the 8-word array at r3, written column by column to the 16 words at r2
537+.globl bn_sqr_comba8
538+.type bn_sqr_comba8,@function
539+.align 4
540+bn_sqr_comba8:
541+ stmg %r6,%r8,48(%r15) // save %r6-%r8 at 48(%r15), reloaded before returning
542+
543+ lghi c1,0
544+ lghi c2,0
545+ lghi c3,0
546+ lghi zero,0 // the sqr_add_c macros add this register to absorb carries
547+
548+ sqr_add_c(0,c1,c2,c3); // column 0
549+ stg c1,0*8(%r2)
550+ lghi c1,0 // the stored word's register becomes the new top word of the accumulator
551+
552+ sqr_add_c2(1,0,c2,c3,c1); // column 1
553+ stg c2,1*8(%r2)
554+ lghi c2,0
555+
556+ sqr_add_c(1,c3,c1,c2); // column 2
557+ sqr_add_c2(2,0,c3,c1,c2);
558+ stg c3,2*8(%r2)
559+ lghi c3,0
560+
561+ sqr_add_c2(3,0,c1,c2,c3); // column 3
562+ sqr_add_c2(2,1,c1,c2,c3);
563+ stg c1,3*8(%r2)
564+ lghi c1,0
565+
566+ sqr_add_c(2,c2,c3,c1); // column 4
567+ sqr_add_c2(3,1,c2,c3,c1);
568+ sqr_add_c2(4,0,c2,c3,c1);
569+ stg c2,4*8(%r2)
570+ lghi c2,0
571+
572+ sqr_add_c2(5,0,c3,c1,c2); // column 5
573+ sqr_add_c2(4,1,c3,c1,c2);
574+ sqr_add_c2(3,2,c3,c1,c2);
575+ stg c3,5*8(%r2)
576+ lghi c3,0
577+
578+ sqr_add_c(3,c1,c2,c3); // column 6
579+ sqr_add_c2(4,2,c1,c2,c3);
580+ sqr_add_c2(5,1,c1,c2,c3);
581+ sqr_add_c2(6,0,c1,c2,c3);
582+ stg c1,6*8(%r2)
583+ lghi c1,0
584+
585+ sqr_add_c2(7,0,c2,c3,c1); // column 7
586+ sqr_add_c2(6,1,c2,c3,c1);
587+ sqr_add_c2(5,2,c2,c3,c1);
588+ sqr_add_c2(4,3,c2,c3,c1);
589+ stg c2,7*8(%r2)
590+ lghi c2,0
591+
592+ sqr_add_c(4,c3,c1,c2); // column 8
593+ sqr_add_c2(5,3,c3,c1,c2);
594+ sqr_add_c2(6,2,c3,c1,c2);
595+ sqr_add_c2(7,1,c3,c1,c2);
596+ stg c3,8*8(%r2)
597+ lghi c3,0
598+
599+ sqr_add_c2(7,2,c1,c2,c3); // column 9
600+ sqr_add_c2(6,3,c1,c2,c3);
601+ sqr_add_c2(5,4,c1,c2,c3);
602+ stg c1,9*8(%r2)
603+ lghi c1,0
604+
605+ sqr_add_c(5,c2,c3,c1); // column 10
606+ sqr_add_c2(6,4,c2,c3,c1);
607+ sqr_add_c2(7,3,c2,c3,c1);
608+ stg c2,10*8(%r2)
609+ lghi c2,0
610+
611+ sqr_add_c2(7,4,c3,c1,c2); // column 11
612+ sqr_add_c2(6,5,c3,c1,c2);
613+ stg c3,11*8(%r2)
614+ lghi c3,0
615+
616+ sqr_add_c(6,c1,c2,c3); // column 12
617+ sqr_add_c2(7,5,c1,c2,c3);
618+ stg c1,12*8(%r2)
619+ lghi c1,0
620+
621+ sqr_add_c2(7,6,c2,c3,c1); // column 13
622+ stg c2,13*8(%r2)
623+ lghi c2,0
624+
625+ sqr_add_c(7,c3,c1,c2); // column 14; its high word is the last result word
626+ stg c3,14*8(%r2)
627+ stg c1,15*8(%r2)
628+
629+ lmg %r6,%r8,48(%r15) // restore %r6-%r8
630+ br %r14
631+.size bn_sqr_comba8,.-bn_sqr_comba8
632+
633+// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3); square of the 4-word array at r3, written column by column to the 8 words at r2
634+.globl bn_sqr_comba4
635+.type bn_sqr_comba4,@function
636+.align 4
637+bn_sqr_comba4:
638+ stmg %r6,%r8,48(%r15) // save %r6-%r8 at 48(%r15), reloaded before returning
639+
640+ lghi c1,0
641+ lghi c2,0
642+ lghi c3,0
643+ lghi zero,0 // the sqr_add_c macros add this register to absorb carries
644+
645+ sqr_add_c(0,c1,c2,c3); // column 0
646+ stg c1,0*8(%r2)
647+ lghi c1,0 // the stored word's register becomes the new top word of the accumulator
648+
649+ sqr_add_c2(1,0,c2,c3,c1); // column 1
650+ stg c2,1*8(%r2)
651+ lghi c2,0
652+
653+ sqr_add_c(1,c3,c1,c2); // column 2
654+ sqr_add_c2(2,0,c3,c1,c2);
655+ stg c3,2*8(%r2)
656+ lghi c3,0
657+
658+ sqr_add_c2(3,0,c1,c2,c3); // column 3
659+ sqr_add_c2(2,1,c1,c2,c3);
660+ stg c1,3*8(%r2)
661+ lghi c1,0
662+
663+ sqr_add_c(2,c2,c3,c1); // column 4
664+ sqr_add_c2(3,1,c2,c3,c1);
665+ stg c2,4*8(%r2)
666+ lghi c2,0
667+
668+ sqr_add_c2(3,2,c3,c1,c2); // column 5
669+ stg c3,5*8(%r2)
670+ lghi c3,0
671+
672+ sqr_add_c(3,c1,c2,c3); // column 6; its high word is the last result word
673+ stg c1,6*8(%r2)
674+ stg c2,7*8(%r2)
675+
676+ lmg %r6,%r8,48(%r15) // restore %r6-%r8
677+ br %r14
678+.size bn_sqr_comba4,.-bn_sqr_comba4
--- /dev/null
+++ b/crypto/bn/asm/sparcv9-mont.pl
@@ -0,0 +1,606 @@
1+#!/usr/bin/env perl
2+
3+# ====================================================================
4+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5+# project. The module is, however, dual licensed under OpenSSL and
6+# CRYPTOGAMS licenses depending on where you obtain it. For further
7+# details see http://www.openssl.org/~appro/cryptogams/.
8+# ====================================================================
9+
10+# December 2005
11+#
12+# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
13+# for undertaken effort are multiple. First of all, UltraSPARC is not
14+# the whole SPARCv9 universe and other VIS-free implementations deserve
15+# optimized code as much. Secondly, newly introduced UltraSPARC T1,
16+# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
17+# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
18+# several integrated RSA/DSA accelerator circuits accessible through
19+# kernel driver [only(*)], but having decent user-land software
20+# implementation is important too. Finally, reasons like desire to
21+# experiment with dedicated squaring procedure. Yes, this module
22+# implements one, because it was easiest to draft it in SPARCv9
23+# instructions...
24+
25+# (*) Engine accessing the driver in question is on my TODO list.
26+# For reference, accelerator is estimated to give 6 to 10 times
27+# improvement on single-threaded RSA sign. It should be noted
28+# that 6-10x improvement coefficient does not actually mean
29+# something extraordinary in terms of absolute [single-threaded]
30+# performance, as SPARCv9 instruction set is by all means least
31+# suitable for high performance crypto among other 64 bit
32+# platforms. 6-10x factor simply places T1 in same performance
33+# domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
34+# appear impressive at all, but it's the sign operation which is
35+# far more critical/interesting.
36+
37+# You might notice that inner loops are modulo-scheduled:-) This has
38+# essentially negligible impact on UltraSPARC performance, it's
39+# Fujitsu SPARC64 V users who should notice and hopefully appreciate
40+# the advantage... Currently this module surpasses sparcv9a-mont.pl
41+# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
42+# module still has hidden potential [see TODO list there], which is
43+# estimated to be larger than 20%...
44+
45+# int bn_mul_mont(
46+$rp="%i0"; # BN_ULONG *rp,
47+$ap="%i1"; # const BN_ULONG *ap,
48+$bp="%i2"; # const BN_ULONG *bp,
49+$np="%i3"; # const BN_ULONG *np,
50+$n0="%i4"; # const BN_ULONG *n0,
51+$num="%i5"; # int num);
52+# Default to 32-bit output; -m64 or -xarch=v9 among the arguments selects 64-bit.
53+$bits=32;
54+for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
55+if ($bits==64) { $bias=2047; $frame=192; } # 64-bit: %sp offset 2047, frame size 192
56+else { $bias=0; $frame=128; } # 32-bit: no %sp offset, frame size 128
57+# Register names substituted into the generated assembly.
58+$car0="%o0";
59+$car1="%o1";
60+$car2="%o2"; # 1 bit
61+$acc0="%o3";
62+$acc1="%o4";
63+$mask="%g1"; # 32 bits, what a waste...
64+$tmp0="%g4";
65+$tmp1="%g5";
66+
67+$i="%l0";
68+$j="%l1";
69+$mul0="%l2";
70+$mul1="%l3";
71+$tp="%l4";
72+$apj="%l5";
73+$npj="%l6";
74+$tpj="%l7";
75+
76+$fname="bn_mul_mont_int"; # symbol name given to the generated function
77+
78+$code=<<___;
79+.section ".text",#alloc,#execinstr
80+
81+.global $fname
82+.align 32
83+$fname:
84+ cmp %o5,4 ! 128 bits minimum
85+ bge,pt %icc,.Lenter
86+ sethi %hi(0xffffffff),$mask
87+ retl
88+ clr %o0
89+.align 32
90+.Lenter:
91+ save %sp,-$frame,%sp
92+ sll $num,2,$num ! num*=4
93+ or $mask,%lo(0xffffffff),$mask
94+ ld [$n0],$n0
95+ cmp $ap,$bp
96+ and $num,$mask,$num
97+ ld [$bp],$mul0 ! bp[0]
98+ nop
99+
100+ add %sp,$bias,%o7 ! real top of stack
101+ ld [$ap],$car0 ! ap[0] ! redundant in squaring context
102+ sub %o7,$num,%o7
103+ ld [$ap+4],$apj ! ap[1]
104+ and %o7,-1024,%o7
105+ ld [$np],$car1 ! np[0]
106+ sub %o7,$bias,%sp ! alloca
107+ ld [$np+4],$npj ! np[1]
108+ be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
109+ mov 12,$j
110+
111+ mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
112+ mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
113+ and $car0,$mask,$acc0
114+ add %sp,$bias+$frame,$tp
115+ ld [$ap+8],$apj !prologue!
116+
117+ mulx $n0,$acc0,$mul1 ! "t[0]"*n0
118+ and $mul1,$mask,$mul1
119+
120+ mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
121+ mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
122+ srlx $car0,32,$car0
123+ add $acc0,$car1,$car1
124+ ld [$np+8],$npj !prologue!
125+ srlx $car1,32,$car1
126+ mov $tmp0,$acc0 !prologue!
127+
128+.L1st:
129+ mulx $apj,$mul0,$tmp0
130+ mulx $npj,$mul1,$tmp1
131+ add $acc0,$car0,$car0
132+ ld [$ap+$j],$apj ! ap[j]
133+ and $car0,$mask,$acc0
134+ add $acc1,$car1,$car1
135+ ld [$np+$j],$npj ! np[j]
136+ srlx $car0,32,$car0
137+ add $acc0,$car1,$car1
138+ add $j,4,$j ! j++
139+ mov $tmp0,$acc0
140+ st $car1,[$tp]
141+ cmp $j,$num
142+ mov $tmp1,$acc1
143+ srlx $car1,32,$car1
144+ bl %icc,.L1st
145+ add $tp,4,$tp ! tp++
146+!.L1st
147+
148+ mulx $apj,$mul0,$tmp0 !epilogue!
149+ mulx $npj,$mul1,$tmp1
150+ add $acc0,$car0,$car0
151+ and $car0,$mask,$acc0
152+ add $acc1,$car1,$car1
153+ srlx $car0,32,$car0
154+ add $acc0,$car1,$car1
155+ st $car1,[$tp]
156+ srlx $car1,32,$car1
157+
158+ add $tmp0,$car0,$car0
159+ and $car0,$mask,$acc0
160+ add $tmp1,$car1,$car1
161+ srlx $car0,32,$car0
162+ add $acc0,$car1,$car1
163+ st $car1,[$tp+4]
164+ srlx $car1,32,$car1
165+
166+ add $car0,$car1,$car1
167+ st $car1,[$tp+8]
168+ srlx $car1,32,$car2
169+
170+ mov 4,$i ! i++
171+ ld [$bp+4],$mul0 ! bp[1]
172+.Louter:
173+ add %sp,$bias+$frame,$tp
174+ ld [$ap],$car0 ! ap[0]
175+ ld [$ap+4],$apj ! ap[1]
176+ ld [$np],$car1 ! np[0]
177+ ld [$np+4],$npj ! np[1]
178+ ld [$tp],$tmp1 ! tp[0]
179+ ld [$tp+4],$tpj ! tp[1]
180+ mov 12,$j
181+
182+ mulx $car0,$mul0,$car0
183+ mulx $apj,$mul0,$tmp0 !prologue!
184+ add $tmp1,$car0,$car0
185+ ld [$ap+8],$apj !prologue!
186+ and $car0,$mask,$acc0
187+
188+ mulx $n0,$acc0,$mul1
189+ and $mul1,$mask,$mul1
190+
191+ mulx $car1,$mul1,$car1
192+ mulx $npj,$mul1,$acc1 !prologue!
193+ srlx $car0,32,$car0
194+ add $acc0,$car1,$car1
195+ ld [$np+8],$npj !prologue!
196+ srlx $car1,32,$car1
197+ mov $tmp0,$acc0 !prologue!
198+
199+.Linner:
200+ mulx $apj,$mul0,$tmp0
201+ mulx $npj,$mul1,$tmp1
202+ add $tpj,$car0,$car0
203+ ld [$ap+$j],$apj ! ap[j]
204+ add $acc0,$car0,$car0
205+ add $acc1,$car1,$car1
206+ ld [$np+$j],$npj ! np[j]
207+ and $car0,$mask,$acc0
208+ ld [$tp+8],$tpj ! tp[j]
209+ srlx $car0,32,$car0
210+ add $acc0,$car1,$car1
211+ add $j,4,$j ! j++
212+ mov $tmp0,$acc0
213+ st $car1,[$tp] ! tp[j-1]
214+ srlx $car1,32,$car1
215+ mov $tmp1,$acc1
216+ cmp $j,$num
217+ bl %icc,.Linner
218+ add $tp,4,$tp ! tp++
219+!.Linner
220+
221+ mulx $apj,$mul0,$tmp0 !epilogue!
222+ mulx $npj,$mul1,$tmp1
223+ add $tpj,$car0,$car0
224+ add $acc0,$car0,$car0
225+ ld [$tp+8],$tpj ! tp[j]
226+ and $car0,$mask,$acc0
227+ add $acc1,$car1,$car1
228+ srlx $car0,32,$car0
229+ add $acc0,$car1,$car1
230+ st $car1,[$tp] ! tp[j-1]
231+ srlx $car1,32,$car1
232+
233+ add $tpj,$car0,$car0
234+ add $tmp0,$car0,$car0
235+ and $car0,$mask,$acc0
236+ add $tmp1,$car1,$car1
237+ add $acc0,$car1,$car1
238+ st $car1,[$tp+4] ! tp[j-1]
239+ srlx $car0,32,$car0
240+ add $i,4,$i ! i++
241+ srlx $car1,32,$car1
242+
243+ add $car0,$car1,$car1
244+ cmp $i,$num
245+ add $car2,$car1,$car1
246+ st $car1,[$tp+8]
247+
248+ srlx $car1,32,$car2
249+ bl,a %icc,.Louter
250+ ld [$bp+$i],$mul0 ! bp[i]
251+!.Louter
252+
253+ add $tp,12,$tp
254+
255+.Ltail:
256+ add $np,$num,$np
257+ add $rp,$num,$rp
258+ mov $tp,$ap
259+ sub %g0,$num,%o7 ! k=-num
260+ ba .Lsub
261+ subcc %g0,%g0,%g0 ! clear %icc.c
262+.align 16
263+.Lsub:
264+ ld [$tp+%o7],%o0
265+ ld [$np+%o7],%o1
266+ subccc %o0,%o1,%o1 ! tp[j]-np[j]
267+ add $rp,%o7,$i
268+ add %o7,4,%o7
269+ brnz %o7,.Lsub
270+ st %o1,[$i]
271+ subc $car2,0,$car2 ! handle upmost overflow bit
272+ and $tp,$car2,$ap
273+ andn $rp,$car2,$np
274+ or $ap,$np,$ap
275+ sub %g0,$num,%o7
276+
277+.Lcopy:
278+ ld [$ap+%o7],%o0 ! copy or in-place refresh
279+ st %g0,[$tp+%o7] ! zap tp
280+ st %o0,[$rp+%o7]
281+ add %o7,4,%o7
282+ brnz %o7,.Lcopy
283+ nop
284+ mov 1,%i0
285+ ret
286+ restore
287+___
288+
289+########
290+######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
291+######## code without following dedicated squaring procedure.
292+########
293+$sbit="%i2"; # re-use $bp! The squaring code below never references $bp, so its register is free to hold the shifted-out bit.
294+
295+$code.=<<___;
296+.align 32
297+.Lbn_sqr_mont:
298+ mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
299+ mulx $apj,$mul0,$tmp0 !prologue!
300+ and $car0,$mask,$acc0
301+ add %sp,$bias+$frame,$tp
302+ ld [$ap+8],$apj !prologue!
303+
304+ mulx $n0,$acc0,$mul1 ! "t[0]"*n0
305+ srlx $car0,32,$car0
306+ and $mul1,$mask,$mul1
307+
308+ mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
309+ mulx $npj,$mul1,$acc1 !prologue!
310+ and $car0,1,$sbit
311+ ld [$np+8],$npj !prologue!
312+ srlx $car0,1,$car0
313+ add $acc0,$car1,$car1
314+ srlx $car1,32,$car1
315+ mov $tmp0,$acc0 !prologue!
316+
317+.Lsqr_1st:
318+ mulx $apj,$mul0,$tmp0
319+ mulx $npj,$mul1,$tmp1
320+ add $acc0,$car0,$car0 ! ap[j]*a0+c0
321+ add $acc1,$car1,$car1
322+ ld [$ap+$j],$apj ! ap[j]
323+ and $car0,$mask,$acc0
324+ ld [$np+$j],$npj ! np[j]
325+ srlx $car0,32,$car0
326+ add $acc0,$acc0,$acc0
327+ or $sbit,$acc0,$acc0
328+ mov $tmp1,$acc1
329+ srlx $acc0,32,$sbit
330+ add $j,4,$j ! j++
331+ and $acc0,$mask,$acc0
332+ cmp $j,$num
333+ add $acc0,$car1,$car1
334+ st $car1,[$tp]
335+ mov $tmp0,$acc0
336+ srlx $car1,32,$car1
337+ bl %icc,.Lsqr_1st
338+ add $tp,4,$tp ! tp++
339+!.Lsqr_1st
340+
341+ mulx $apj,$mul0,$tmp0 ! epilogue
342+ mulx $npj,$mul1,$tmp1
343+ add $acc0,$car0,$car0 ! ap[j]*a0+c0
344+ add $acc1,$car1,$car1
345+ and $car0,$mask,$acc0
346+ srlx $car0,32,$car0
347+ add $acc0,$acc0,$acc0
348+ or $sbit,$acc0,$acc0
349+ srlx $acc0,32,$sbit
350+ and $acc0,$mask,$acc0
351+ add $acc0,$car1,$car1
352+ st $car1,[$tp]
353+ srlx $car1,32,$car1
354+
355+ add $tmp0,$car0,$car0 ! ap[j]*a0+c0
356+ add $tmp1,$car1,$car1
357+ and $car0,$mask,$acc0
358+ srlx $car0,32,$car0
359+ add $acc0,$acc0,$acc0
360+ or $sbit,$acc0,$acc0
361+ srlx $acc0,32,$sbit
362+ and $acc0,$mask,$acc0
363+ add $acc0,$car1,$car1
364+ st $car1,[$tp+4]
365+ srlx $car1,32,$car1
366+
367+ add $car0,$car0,$car0
368+ or $sbit,$car0,$car0
369+ add $car0,$car1,$car1
370+ st $car1,[$tp+8]
371+ srlx $car1,32,$car2
372+
373+ ld [%sp+$bias+$frame],$tmp0 ! tp[0]
374+ ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
375+ ld [%sp+$bias+$frame+8],$tpj ! tp[2]
376+ ld [$ap+4],$mul0 ! ap[1]
377+ ld [$ap+8],$apj ! ap[2]
378+ ld [$np],$car1 ! np[0]
379+ ld [$np+4],$npj ! np[1]
380+ mulx $n0,$tmp0,$mul1
381+
382+ mulx $mul0,$mul0,$car0
383+ and $mul1,$mask,$mul1
384+
385+ mulx $car1,$mul1,$car1
386+ mulx $npj,$mul1,$acc1
387+ add $tmp0,$car1,$car1
388+ and $car0,$mask,$acc0
389+ ld [$np+8],$npj ! np[2]
390+ srlx $car1,32,$car1
391+ add $tmp1,$car1,$car1
392+ srlx $car0,32,$car0
393+ add $acc0,$car1,$car1
394+ and $car0,1,$sbit
395+ add $acc1,$car1,$car1
396+ srlx $car0,1,$car0
397+ mov 12,$j
398+ st $car1,[%sp+$bias+$frame] ! tp[0]=
399+ srlx $car1,32,$car1
400+ add %sp,$bias+$frame+4,$tp
401+
402+.Lsqr_2nd:
403+ mulx $apj,$mul0,$acc0
404+ mulx $npj,$mul1,$acc1
405+ add $acc0,$car0,$car0
406+ add $tpj,$car1,$car1
407+ ld [$ap+$j],$apj ! ap[j]
408+ and $car0,$mask,$acc0
409+ ld [$np+$j],$npj ! np[j]
410+ srlx $car0,32,$car0
411+ add $acc1,$car1,$car1
412+ ld [$tp+8],$tpj ! tp[j]
413+ add $acc0,$acc0,$acc0
414+ add $j,4,$j ! j++
415+ or $sbit,$acc0,$acc0
416+ srlx $acc0,32,$sbit
417+ and $acc0,$mask,$acc0
418+ cmp $j,$num
419+ add $acc0,$car1,$car1
420+ st $car1,[$tp] ! tp[j-1]
421+ srlx $car1,32,$car1
422+ bl %icc,.Lsqr_2nd
423+ add $tp,4,$tp ! tp++
424+!.Lsqr_2nd
425+
426+ mulx $apj,$mul0,$acc0
427+ mulx $npj,$mul1,$acc1
428+ add $acc0,$car0,$car0
429+ add $tpj,$car1,$car1
430+ and $car0,$mask,$acc0
431+ srlx $car0,32,$car0
432+ add $acc1,$car1,$car1
433+ add $acc0,$acc0,$acc0
434+ or $sbit,$acc0,$acc0
435+ srlx $acc0,32,$sbit
436+ and $acc0,$mask,$acc0
437+ add $acc0,$car1,$car1
438+ st $car1,[$tp] ! tp[j-1]
439+ srlx $car1,32,$car1
440+
441+ add $car0,$car0,$car0
442+ or $sbit,$car0,$car0
443+ add $car0,$car1,$car1
444+ add $car2,$car1,$car1
445+ st $car1,[$tp+4]
446+ srlx $car1,32,$car2
447+
448+ ld [%sp+$bias+$frame],$tmp1 ! tp[0]
449+ ld [%sp+$bias+$frame+4],$tpj ! tp[1]
450+ ld [$ap+8],$mul0 ! ap[2]
451+ ld [$np],$car1 ! np[0]
452+ ld [$np+4],$npj ! np[1]
453+ mulx $n0,$tmp1,$mul1
454+ and $mul1,$mask,$mul1
455+ mov 8,$i
456+
457+ mulx $mul0,$mul0,$car0
458+ mulx $car1,$mul1,$car1
459+ and $car0,$mask,$acc0
460+ add $tmp1,$car1,$car1
461+ srlx $car0,32,$car0
462+ add %sp,$bias+$frame,$tp
463+ srlx $car1,32,$car1
464+ and $car0,1,$sbit
465+ srlx $car0,1,$car0
466+ mov 4,$j
467+
468+.Lsqr_outer:
469+.Lsqr_inner1:
470+ mulx $npj,$mul1,$acc1
471+ add $tpj,$car1,$car1
472+ add $j,4,$j
473+ ld [$tp+8],$tpj
474+ cmp $j,$i
475+ add $acc1,$car1,$car1
476+ ld [$np+$j],$npj
477+ st $car1,[$tp]
478+ srlx $car1,32,$car1
479+ bl %icc,.Lsqr_inner1
480+ add $tp,4,$tp
481+!.Lsqr_inner1
482+
483+ add $j,4,$j
484+ ld [$ap+$j],$apj ! ap[j]
485+ mulx $npj,$mul1,$acc1
486+ add $tpj,$car1,$car1
487+ ld [$np+$j],$npj ! np[j]
488+ add $acc0,$car1,$car1
489+ ld [$tp+8],$tpj ! tp[j]
490+ add $acc1,$car1,$car1
491+ st $car1,[$tp]
492+ srlx $car1,32,$car1
493+
494+ add $j,4,$j
495+ cmp $j,$num
496+ be,pn %icc,.Lsqr_no_inner2
497+ add $tp,4,$tp
498+
499+.Lsqr_inner2:
500+ mulx $apj,$mul0,$acc0
501+ mulx $npj,$mul1,$acc1
502+ add $tpj,$car1,$car1
503+ add $acc0,$car0,$car0
504+ ld [$ap+$j],$apj ! ap[j]
505+ and $car0,$mask,$acc0
506+ ld [$np+$j],$npj ! np[j]
507+ srlx $car0,32,$car0
508+ add $acc0,$acc0,$acc0
509+ ld [$tp+8],$tpj ! tp[j]
510+ or $sbit,$acc0,$acc0
511+ add $j,4,$j ! j++
512+ srlx $acc0,32,$sbit
513+ and $acc0,$mask,$acc0
514+ cmp $j,$num
515+ add $acc0,$car1,$car1
516+ add $acc1,$car1,$car1
517+ st $car1,[$tp] ! tp[j-1]
518+ srlx $car1,32,$car1
519+ bl %icc,.Lsqr_inner2
520+ add $tp,4,$tp ! tp++
521+
522+.Lsqr_no_inner2:
523+ mulx $apj,$mul0,$acc0
524+ mulx $npj,$mul1,$acc1
525+ add $tpj,$car1,$car1
526+ add $acc0,$car0,$car0
527+ and $car0,$mask,$acc0
528+ srlx $car0,32,$car0
529+ add $acc0,$acc0,$acc0
530+ or $sbit,$acc0,$acc0
531+ srlx $acc0,32,$sbit
532+ and $acc0,$mask,$acc0
533+ add $acc0,$car1,$car1
534+ add $acc1,$car1,$car1
535+ st $car1,[$tp] ! tp[j-1]
536+ srlx $car1,32,$car1
537+
538+ add $car0,$car0,$car0
539+ or $sbit,$car0,$car0
540+ add $car0,$car1,$car1
541+ add $car2,$car1,$car1
542+ st $car1,[$tp+4]
543+ srlx $car1,32,$car2
544+
545+ add $i,4,$i ! i++
546+ ld [%sp+$bias+$frame],$tmp1 ! tp[0]
547+ ld [%sp+$bias+$frame+4],$tpj ! tp[1]
548+ ld [$ap+$i],$mul0 ! ap[j]
549+ ld [$np],$car1 ! np[0]
550+ ld [$np+4],$npj ! np[1]
551+ mulx $n0,$tmp1,$mul1
552+ and $mul1,$mask,$mul1
553+ add $i,4,$tmp0
554+
555+ mulx $mul0,$mul0,$car0
556+ mulx $car1,$mul1,$car1
557+ and $car0,$mask,$acc0
558+ add $tmp1,$car1,$car1
559+ srlx $car0,32,$car0
560+ add %sp,$bias+$frame,$tp
561+ srlx $car1,32,$car1
562+ and $car0,1,$sbit
563+ srlx $car0,1,$car0
564+
565+ cmp $tmp0,$num ! i<num-1
566+ bl %icc,.Lsqr_outer
567+ mov 4,$j
568+
569+.Lsqr_last:
570+ mulx $npj,$mul1,$acc1
571+ add $tpj,$car1,$car1
572+ add $j,4,$j
573+ ld [$tp+8],$tpj
574+ cmp $j,$i
575+ add $acc1,$car1,$car1
576+ ld [$np+$j],$npj
577+ st $car1,[$tp]
578+ srlx $car1,32,$car1
579+ bl %icc,.Lsqr_last
580+ add $tp,4,$tp
581+!.Lsqr_last
582+
583+ mulx $npj,$mul1,$acc1
584+ add $tpj,$car1,$car1
585+ add $acc0,$car1,$car1
586+ add $acc1,$car1,$car1
587+ st $car1,[$tp]
588+ srlx $car1,32,$car1
589+
590+ add $car0,$car0,$car0 ! recover $car0
591+ or $sbit,$car0,$car0
592+ add $car0,$car1,$car1
593+ add $car2,$car1,$car1
594+ st $car1,[$tp+4]
595+ srlx $car1,32,$car2
596+
597+ ba .Ltail
598+ add $tp,8,$tp
599+.type $fname,#function
600+.size $fname,(.-$fname)
601+.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
602+.align 32
603+___
604+$code =~ s/\`([^\`]*)\`/eval($1)/gem; # replace every backtick-quoted Perl expression in $code with its evaluated result
605+print $code; # emit the finished assembly on STDOUT
606+close STDOUT;
--- /dev/null
+++ b/crypto/bn/asm/sparcv9a-mont.pl
@@ -0,0 +1,882 @@
1+#!/usr/bin/env perl
2+
3+# ====================================================================
4+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5+# project. The module is, however, dual licensed under OpenSSL and
6+# CRYPTOGAMS licenses depending on where you obtain it. For further
7+# details see http://www.openssl.org/~appro/cryptogams/.
8+# ====================================================================
9+
10+# October 2005
11+#
12+# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
13+# Because unlike integer multiplier, which simply stalls whole CPU,
14+# FPU is fully pipelined and can effectively emit 48 bit partial
15+# product every cycle. Why not blended SPARC v9? One can argue that
16+# making this module dependent on UltraSPARC VIS extension limits its
17+# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
18+# implementations from compatibility matrix. But the rest, whole Sun
19+# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
20+# VIS extension instructions used in this module. This is considered
21+# good enough to not care about HAL SPARC64 users [if any] who have
22+# integer-only pure SPARCv9 module to "fall down" to.
23+
24+# USI&II cores currently exhibit uniform 2x improvement [over pre-
25+# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
26+# performance improves few percents for shorter keys and worsens few
27+# percents for longer keys. This is because USIII integer multiplier
28+# is >3x faster than USI&II one, which is harder to match [but see
29+# TODO list below]. It should also be noted that SPARC64 V features
30+# out-of-order execution, which *might* mean that integer multiplier
31+# is pipelined, which in turn *might* be impossible to match... On
32+# additional note, SPARC64 V implements FP Multiply-Add instruction,
33+# which is perfectly usable in this context... In other words, as far
34+# as Fujitsu SPARC64 V goes, talk to the author:-)
35+
36+# The implementation implies following "non-natural" limitations on
37+# input arguments:
38+# - num may not be less than 4;
39+# - num has to be even;
40+# Failure to meet either condition has no fatal effects, simply
41+# doesn't give any performance gain.
42+
43+# TODO:
44+# - modulo-schedule inner loop for better performance (on in-order
45+# execution core such as UltraSPARC this shall result in further
46+# noticeable(!) improvement);
47+# - dedicated squaring procedure[?];
48+
49+######################################################################
50+# November 2006
51+#
52+# Modulo-scheduled inner loops allow to interleave floating point and
53+# integer instructions and minimize Read-After-Write penalties. This
54+# results in *further* 20-50% performance improvement [depending on
55+# key length, more for longer keys] on USI&II cores and 30-80% - on
56+# USIII&IV.
57+
58+$fname="bn_mul_mont_fpu"; # name of the generated routine (label, .type, .size)
59+$bits=32; # 32-bit ABI unless told otherwise
60+for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } # -m64 or -xarch=v9 selects the 64-bit ABI
61+
62+if ($bits==64) {
63+ $bias=2047; # added to %sp in every stack reference below
64+ $frame=192; # bytes between %sp+$bias and the scratch area
65+} else {
66+ $bias=0;
67+ $frame=128; # 96 rounded up to largest known cache-line
68+}
69+$locals=64; # scratch area above $frame; offsets 0..55 are used (6 transfer slots + saved %asi)
70+
71+# In order to provide for 32-/64-bit ABI duality, I keep integers wider
72+# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
73+# exclusively for pointers, indexes and other small values...
74+# int bn_mul_mont(
75+$rp="%i0"; # BN_ULONG *rp,
76+$ap="%i1"; # const BN_ULONG *ap,
77+$bp="%i2"; # const BN_ULONG *bp,
78+$np="%i3"; # const BN_ULONG *np,
79+$n0="%i4"; # const BN_ULONG *n0,
80+$num="%i5"; # int num);
81+
82+$tp="%l0"; # t[num]
83+$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
84+$ap_h="%l2"; # to these four vectors as double-precision FP values.
85+$np_l="%l3"; # This way a bunch of fxtods are eliminated in second
86+$np_h="%l4"; # loop and L1-cache aliasing is minimized...
87+$i="%l5"; # outer index: negative byte offset from the end of bp, counts up to 0
88+$j="%l6"; # inner index: negative byte offset from the vectors' ends, counts up to 0
89+$mask="%l7"; # 16-bit mask, 0xffff
90+
91+$n0="%g4"; # reassigned(!) to "64-bit" register
92+$carry="%i4"; # %i4 reused(!) for a carry bit
93+
94+# FP register naming chart
95+#
96+# ..HILO
97+# dcba
98+# --------
99+# LOa
100+# LOb
101+# LOc
102+# LOd
103+# HIa
104+# HIb
105+# HIc
106+# HId
107+# ..a
108+# ..b
109+$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6"; # b[i] as four 16-bit pieces, "a" least significant
110+$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14"; # (ap[0]*b[i]+t[0])*n0 as four 16-bit pieces
111+$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19"; # a[j] halves; the odd register takes the 32-bit load
112+$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23"; # n[j] halves, same arrangement
113+
114+$dota="%f24"; $dotb="%f26"; # ahi*bc+nhi*nc and ahi*bd+nhi*nd, carried into the next j iteration
115+
116+$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38"; # 16 partial products: {a,n}{lo,hi} times pieces a..d
117+$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
118+$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
119+$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
120+
121+$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
122+
123+$code=<<___;
124+.section ".text",#alloc,#execinstr
125+
126+.global $fname
127+.align 32
128+$fname:
129+ save %sp,-$frame-$locals,%sp
130+
131+ cmp $num,4 ! num is counted in 32-bit words here
132+ bl,a,pn %icc,.Lret ! fewer than 4 words: bail out
133+ clr %i0 ! return 0 (annulled unless the branch is taken)
134+ andcc $num,1,%g0 ! $num has to be even...
135+ bnz,a,pn %icc,.Lret
136+ clr %i0 ! signal "unsupported input value"
137+
138+ srl $num,1,$num ! num/=2, i.e. count 64-bit limbs from now on
139+ sethi %hi(0xffff),$mask
140+ ld [%i4+0],$n0 ! $n0 reassigned, remember?
141+ or $mask,%lo(0xffff),$mask
142+ ld [%i4+4],%o0
143+ sllx %o0,32,%o0
144+ or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
145+
146+ sll $num,3,$num ! num*=8
147+
148+ add %sp,$bias,%o0 ! real top of stack
149+ sll $num,2,%o1
150+ add %o1,$num,%o1 ! %o1=num*5
151+ sub %o0,%o1,%o0
152+ and %o0,-2048,%o0 ! optimize TLB utilization
153+ sub %o0,$bias,%sp ! alloca(5*num*8)
154+
155+ rd %asi,%o7 ! save %asi
156+ add %sp,$bias+$frame+$locals,$tp ! tp sits right above the scratch area
157+ add $tp,$num,$ap_l ! step over tp[num]
158+ add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
159+ add $ap_l,$num,$ap_h
160+ add $ap_h,$num,$np_l
161+ add $np_l,$num,$np_h
162+
163+ wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
164+
165+ add $rp,$num,$rp ! readjust input pointers to point
166+ add $ap,$num,$ap ! at the ends too...
167+ add $bp,$num,$bp
168+ add $np,$num,$np
169+
170+ stx %o7,[%sp+$bias+$frame+48] ! save %asi
171+
172+ sub %g0,$num,$i ! i=-num
173+ sub %g0,$num,$j ! j=-num
175+ add $ap,$j,%o3 ! %o3=&ap[0]
176+ add $bp,$i,%o4 ! %o4=&bp[0]
177+
178+ ld [%o3+4],%g1 ! ap[0], upper 32 bits
179+ ld [%o3+0],%o0
180+ ld [%o4+4],%g5 ! bp[0], upper 32 bits
181+ sllx %g1,32,%g1
182+ ld [%o4+0],%o1
183+ sllx %g5,32,%g5
184+ or %g1,%o0,%o0
185+ or %g5,%o1,%o1
186+
187+ add $np,$j,%o5 ! %o5=&np[0]
188+
189+ mulx %o1,%o0,%o0 ! ap[0]*bp[0]
190+ mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
191+ stx %o0,[%sp+$bias+$frame+0] ! parked on the stack for the 16-bit loads below
192+
193+ ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
194+ fzeros $alo
195+ ld [%o3+4],$ahi_
196+ fzeros $ahi
197+ ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
198+ fzeros $nlo
199+ ld [%o5+4],$nhi_
200+ fzeros $nhi
201+
202+ ! transfer b[i] to FPU as 4x16-bit values
203+ ldda [%o4+2]%asi,$ba
204+ fxtod $alo,$alo
205+ ldda [%o4+0]%asi,$bb
206+ fxtod $ahi,$ahi
207+ ldda [%o4+6]%asi,$bc
208+ fxtod $nlo,$nlo
209+ ldda [%o4+4]%asi,$bd
210+ fxtod $nhi,$nhi
211+
212+ ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
213+ ldda [%sp+$bias+$frame+6]%asi,$na
214+ fxtod $ba,$ba
215+ ldda [%sp+$bias+$frame+4]%asi,$nb
216+ fxtod $bb,$bb
217+ ldda [%sp+$bias+$frame+2]%asi,$nc
218+ fxtod $bc,$bc
219+ ldda [%sp+$bias+$frame+0]%asi,$nd
220+ fxtod $bd,$bd
221+
222+ std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
223+ fxtod $na,$na
224+ std $ahi,[$ap_h+$j]
225+ fxtod $nb,$nb
226+ std $nlo,[$np_l+$j] ! save smashed np[j] in double format
227+ fxtod $nc,$nc
228+ std $nhi,[$np_h+$j]
229+ fxtod $nd,$nd
230+
231+ fmuld $alo,$ba,$aloa
232+ fmuld $nlo,$na,$nloa
233+ fmuld $alo,$bb,$alob
234+ fmuld $nlo,$nb,$nlob
235+ fmuld $alo,$bc,$aloc
236+ faddd $aloa,$nloa,$nloa
237+ fmuld $nlo,$nc,$nloc
238+ fmuld $alo,$bd,$alod
239+ faddd $alob,$nlob,$nlob
240+ fmuld $nlo,$nd,$nlod
241+ fmuld $ahi,$ba,$ahia
242+ faddd $aloc,$nloc,$nloc
243+ fmuld $nhi,$na,$nhia
244+ fmuld $ahi,$bb,$ahib
245+ faddd $alod,$nlod,$nlod
246+ fmuld $nhi,$nb,$nhib
247+ fmuld $ahi,$bc,$ahic
248+ faddd $ahia,$nhia,$nhia
249+ fmuld $nhi,$nc,$nhic
250+ fmuld $ahi,$bd,$ahid
251+ faddd $ahib,$nhib,$nhib
252+ fmuld $nhi,$nd,$nhid
253+
254+ faddd $ahic,$nhic,$dota ! $nhic
255+ faddd $ahid,$nhid,$dotb ! $nhid
256+
257+ faddd $nloc,$nhia,$nloc
258+ faddd $nlod,$nhib,$nlod
259+
260+ fdtox $nloa,$nloa
261+ fdtox $nlob,$nlob
262+ fdtox $nloc,$nloc
263+ fdtox $nlod,$nlod
264+
265+ std $nloa,[%sp+$bias+$frame+0] ! hand the four column sums over to the integer unit
266+ add $j,8,$j ! j++ (next 64-bit limb)
267+ std $nlob,[%sp+$bias+$frame+8]
268+ add $ap,$j,%o4
269+ std $nloc,[%sp+$bias+$frame+16]
270+ add $np,$j,%o5
271+ std $nlod,[%sp+$bias+$frame+24]
272+
273+ ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
274+ fzeros $alo
275+ ld [%o4+4],$ahi_
276+ fzeros $ahi
277+ ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
278+ fzeros $nlo
279+ ld [%o5+4],$nhi_
280+ fzeros $nhi
281+
282+ fxtod $alo,$alo
283+ fxtod $ahi,$ahi
284+ fxtod $nlo,$nlo
285+ fxtod $nhi,$nhi
286+
287+ ldx [%sp+$bias+$frame+0],%o0 ! column sums stored for the previous limb
288+ fmuld $alo,$ba,$aloa
289+ ldx [%sp+$bias+$frame+8],%o1
290+ fmuld $nlo,$na,$nloa
291+ ldx [%sp+$bias+$frame+16],%o2
292+ fmuld $alo,$bb,$alob
293+ ldx [%sp+$bias+$frame+24],%o3
294+ fmuld $nlo,$nb,$nlob
295+
296+ srlx %o0,16,%o7
297+ std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
298+ fmuld $alo,$bc,$aloc
299+ add %o7,%o1,%o1
300+ std $ahi,[$ap_h+$j]
301+ faddd $aloa,$nloa,$nloa
302+ fmuld $nlo,$nc,$nloc
303+ srlx %o1,16,%o7
304+ std $nlo,[$np_l+$j] ! save smashed np[j] in double format
305+ fmuld $alo,$bd,$alod
306+ add %o7,%o2,%o2
307+ std $nhi,[$np_h+$j]
308+ faddd $alob,$nlob,$nlob
309+ fmuld $nlo,$nd,$nlod
310+ srlx %o2,16,%o7
311+ fmuld $ahi,$ba,$ahia
312+ add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
313+ faddd $aloc,$nloc,$nloc
314+ fmuld $nhi,$na,$nhia
315+ !and %o0,$mask,%o0
316+ !and %o1,$mask,%o1
317+ !and %o2,$mask,%o2
318+ !sllx %o1,16,%o1
319+ !sllx %o2,32,%o2
320+ !sllx %o3,48,%o7
321+ !or %o1,%o0,%o0
322+ !or %o2,%o0,%o0
323+ !or %o7,%o0,%o0 ! 64-bit result
324+ srlx %o3,16,%g1 ! 34-bit carry
325+ fmuld $ahi,$bb,$ahib
326+
327+ faddd $alod,$nlod,$nlod
328+ fmuld $nhi,$nb,$nhib
329+ fmuld $ahi,$bc,$ahic
330+ faddd $ahia,$nhia,$nhia
331+ fmuld $nhi,$nc,$nhic
332+ fmuld $ahi,$bd,$ahid
333+ faddd $ahib,$nhib,$nhib
334+ fmuld $nhi,$nd,$nhid
335+
336+ faddd $dota,$nloa,$nloa
337+ faddd $dotb,$nlob,$nlob
338+ faddd $ahic,$nhic,$dota ! $nhic
339+ faddd $ahid,$nhid,$dotb ! $nhid
340+
341+ faddd $nloc,$nhia,$nloc
342+ faddd $nlod,$nhib,$nlod
343+
344+ fdtox $nloa,$nloa
345+ fdtox $nlob,$nlob
346+ fdtox $nloc,$nloc
347+ fdtox $nlod,$nlod
348+
349+ std $nloa,[%sp+$bias+$frame+0]
350+ std $nlob,[%sp+$bias+$frame+8]
351+ addcc $j,8,$j ! j++, sets Z when the last limb has been reached
352+ std $nloc,[%sp+$bias+$frame+16]
353+ bz,pn %icc,.L1stskip
354+ std $nlod,[%sp+$bias+$frame+24]
356+.align 32 ! incidentally already aligned !
357+.L1st: ! first-pass loop: FP side multiplies limb j, integer side folds the sums stored for limb j-1
358+ add $ap,$j,%o4
359+ add $np,$j,%o5
360+ ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
361+ fzeros $alo
362+ ld [%o4+4],$ahi_
363+ fzeros $ahi
364+ ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
365+ fzeros $nlo
366+ ld [%o5+4],$nhi_
367+ fzeros $nhi
368+
369+ fxtod $alo,$alo
370+ fxtod $ahi,$ahi
371+ fxtod $nlo,$nlo
372+ fxtod $nhi,$nhi
373+
374+ ldx [%sp+$bias+$frame+0],%o0 ! column sums stored by the previous iteration
375+ fmuld $alo,$ba,$aloa
376+ ldx [%sp+$bias+$frame+8],%o1
377+ fmuld $nlo,$na,$nloa
378+ ldx [%sp+$bias+$frame+16],%o2
379+ fmuld $alo,$bb,$alob
380+ ldx [%sp+$bias+$frame+24],%o3
381+ fmuld $nlo,$nb,$nlob
382+
383+ srlx %o0,16,%o7
384+ std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
385+ fmuld $alo,$bc,$aloc
386+ add %o7,%o1,%o1
387+ std $ahi,[$ap_h+$j]
388+ faddd $aloa,$nloa,$nloa
389+ fmuld $nlo,$nc,$nloc
390+ srlx %o1,16,%o7
391+ std $nlo,[$np_l+$j] ! save smashed np[j] in double format
392+ fmuld $alo,$bd,$alod
393+ add %o7,%o2,%o2
394+ std $nhi,[$np_h+$j]
395+ faddd $alob,$nlob,$nlob
396+ fmuld $nlo,$nd,$nlod
397+ srlx %o2,16,%o7
398+ fmuld $ahi,$ba,$ahia
399+ add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
400+ and %o0,$mask,%o0
401+ faddd $aloc,$nloc,$nloc
402+ fmuld $nhi,$na,$nhia
403+ and %o1,$mask,%o1
404+ and %o2,$mask,%o2
405+ fmuld $ahi,$bb,$ahib
406+ sllx %o1,16,%o1
407+ faddd $alod,$nlod,$nlod
408+ fmuld $nhi,$nb,$nhib
409+ sllx %o2,32,%o2
410+ fmuld $ahi,$bc,$ahic
411+ sllx %o3,48,%o7
412+ or %o1,%o0,%o0
413+ faddd $ahia,$nhia,$nhia
414+ fmuld $nhi,$nc,$nhic
415+ or %o2,%o0,%o0
416+ fmuld $ahi,$bd,$ahid
417+ or %o7,%o0,%o0 ! 64-bit result
418+ faddd $ahib,$nhib,$nhib
419+ fmuld $nhi,$nd,$nhid
420+ addcc %g1,%o0,%o0 ! add the carry left by the previous limb
421+ faddd $dota,$nloa,$nloa
422+ srlx %o3,16,%g1 ! 34-bit carry
423+ faddd $dotb,$nlob,$nlob
424+ bcs,a %xcc,.+8 ! the addcc above overflowed 64 bits...
425+ add %g1,1,%g1 ! ...so bump the carry (annulled otherwise)
426+
427+ stx %o0,[$tp] ! tp[j-1]=
428+
429+ faddd $ahic,$nhic,$dota ! $nhic
430+ faddd $ahid,$nhid,$dotb ! $nhid
431+
432+ faddd $nloc,$nhia,$nloc
433+ faddd $nlod,$nhib,$nlod
434+
435+ fdtox $nloa,$nloa
436+ fdtox $nlob,$nlob
437+ fdtox $nloc,$nloc
438+ fdtox $nlod,$nlod
439+
440+ std $nloa,[%sp+$bias+$frame+0] ! leave the sums for the next iteration
441+ std $nlob,[%sp+$bias+$frame+8]
442+ std $nloc,[%sp+$bias+$frame+16]
443+ std $nlod,[%sp+$bias+$frame+24]
444+
445+ addcc $j,8,$j
446+ bnz,pt %icc,.L1st
447+ add $tp,8,$tp ! advance tp (delay slot, executed either way)
449+.L1stskip: ! drain the first pass: fold the last stored sums and the two dot terms into tp
450+ fdtox $dota,$dota
451+ fdtox $dotb,$dotb
452+
453+ ldx [%sp+$bias+$frame+0],%o0
454+ ldx [%sp+$bias+$frame+8],%o1
455+ ldx [%sp+$bias+$frame+16],%o2
456+ ldx [%sp+$bias+$frame+24],%o3
457+
458+ srlx %o0,16,%o7
459+ std $dota,[%sp+$bias+$frame+32]
460+ add %o7,%o1,%o1
461+ std $dotb,[%sp+$bias+$frame+40]
462+ srlx %o1,16,%o7
463+ add %o7,%o2,%o2
464+ srlx %o2,16,%o7
465+ add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
466+ and %o0,$mask,%o0
467+ and %o1,$mask,%o1
468+ and %o2,$mask,%o2
469+ sllx %o1,16,%o1
470+ sllx %o2,32,%o2
471+ sllx %o3,48,%o7
472+ or %o1,%o0,%o0
473+ or %o2,%o0,%o0
474+ or %o7,%o0,%o0 ! 64-bit result
475+ ldx [%sp+$bias+$frame+32],%o4 ! dot terms, now as integers
476+ addcc %g1,%o0,%o0
477+ ldx [%sp+$bias+$frame+40],%o5
478+ srlx %o3,16,%g1 ! 34-bit carry
479+ bcs,a %xcc,.+8
480+ add %g1,1,%g1
481+
482+ stx %o0,[$tp] ! tp[j-1]=
483+ add $tp,8,$tp
484+
485+ srlx %o4,16,%o7
486+ add %o7,%o5,%o5
487+ and %o4,$mask,%o4
488+ sllx %o5,16,%o7
489+ or %o7,%o4,%o4
490+ addcc %g1,%o4,%o4
491+ srlx %o5,48,%g1
492+ bcs,a %xcc,.+8
493+ add %g1,1,%g1
494+
495+ mov %g1,$carry ! carry out of the top limb, kept for the next pass
496+ stx %o4,[$tp] ! tp[num-1]=
497+
498+ ba .Louter
499+ add $i,8,$i ! i++ (delay slot)
500+.align 32
501+.Louter: ! one pass per remaining limb of bp; a[] and n[] come from the smashed vectors saved in the first pass
502+ sub %g0,$num,$j ! j=-num
503+ add %sp,$bias+$frame+$locals,$tp
504+
505+ add $ap,$j,%o3 ! %o3=&ap[0]
506+ add $bp,$i,%o4 ! %o4=&bp[i]
507+
508+ ld [%o3+4],%g1 ! ap[0], upper 32 bits
509+ ld [%o3+0],%o0
510+ ld [%o4+4],%g5 ! bp[i], upper 32 bits
511+ sllx %g1,32,%g1
512+ ld [%o4+0],%o1
513+ sllx %g5,32,%g5
514+ or %g1,%o0,%o0
515+ or %g5,%o1,%o1
516+
517+ ldx [$tp],%o2 ! tp[0]
518+ mulx %o1,%o0,%o0
519+ addcc %o2,%o0,%o0
520+ mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
521+ stx %o0,[%sp+$bias+$frame+0]
522+
523+ ! transfer b[i] to FPU as 4x16-bit values
524+ ldda [%o4+2]%asi,$ba
525+ ldda [%o4+0]%asi,$bb
526+ ldda [%o4+6]%asi,$bc
527+ ldda [%o4+4]%asi,$bd
528+
529+ ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
530+ ldda [%sp+$bias+$frame+6]%asi,$na
531+ fxtod $ba,$ba
532+ ldda [%sp+$bias+$frame+4]%asi,$nb
533+ fxtod $bb,$bb
534+ ldda [%sp+$bias+$frame+2]%asi,$nc
535+ fxtod $bc,$bc
536+ ldda [%sp+$bias+$frame+0]%asi,$nd
537+ fxtod $bd,$bd
538+ ldd [$ap_l+$j],$alo ! load a[j] in double format
539+ fxtod $na,$na
540+ ldd [$ap_h+$j],$ahi
541+ fxtod $nb,$nb
542+ ldd [$np_l+$j],$nlo ! load n[j] in double format
543+ fxtod $nc,$nc
544+ ldd [$np_h+$j],$nhi
545+ fxtod $nd,$nd
546+
547+ fmuld $alo,$ba,$aloa
548+ fmuld $nlo,$na,$nloa
549+ fmuld $alo,$bb,$alob
550+ fmuld $nlo,$nb,$nlob
551+ fmuld $alo,$bc,$aloc
552+ faddd $aloa,$nloa,$nloa
553+ fmuld $nlo,$nc,$nloc
554+ fmuld $alo,$bd,$alod
555+ faddd $alob,$nlob,$nlob
556+ fmuld $nlo,$nd,$nlod
557+ fmuld $ahi,$ba,$ahia
558+ faddd $aloc,$nloc,$nloc
559+ fmuld $nhi,$na,$nhia
560+ fmuld $ahi,$bb,$ahib
561+ faddd $alod,$nlod,$nlod
562+ fmuld $nhi,$nb,$nhib
563+ fmuld $ahi,$bc,$ahic
564+ faddd $ahia,$nhia,$nhia
565+ fmuld $nhi,$nc,$nhic
566+ fmuld $ahi,$bd,$ahid
567+ faddd $ahib,$nhib,$nhib
568+ fmuld $nhi,$nd,$nhid
569+
570+ faddd $ahic,$nhic,$dota ! $nhic
571+ faddd $ahid,$nhid,$dotb ! $nhid
572+
573+ faddd $nloc,$nhia,$nloc
574+ faddd $nlod,$nhib,$nlod
575+
576+ fdtox $nloa,$nloa
577+ fdtox $nlob,$nlob
578+ fdtox $nloc,$nloc
579+ fdtox $nlod,$nlod
580+
581+ std $nloa,[%sp+$bias+$frame+0]
582+ std $nlob,[%sp+$bias+$frame+8]
583+ std $nloc,[%sp+$bias+$frame+16]
584+ add $j,8,$j ! j++ (next 64-bit limb)
585+ std $nlod,[%sp+$bias+$frame+24]
586+
587+ ldd [$ap_l+$j],$alo ! load a[j] in double format
588+ ldd [$ap_h+$j],$ahi
589+ ldd [$np_l+$j],$nlo ! load n[j] in double format
590+ ldd [$np_h+$j],$nhi
591+
592+ fmuld $alo,$ba,$aloa
593+ fmuld $nlo,$na,$nloa
594+ fmuld $alo,$bb,$alob
595+ fmuld $nlo,$nb,$nlob
596+ fmuld $alo,$bc,$aloc
597+ ldx [%sp+$bias+$frame+0],%o0 ! column sums stored for the previous limb
598+ faddd $aloa,$nloa,$nloa
599+ fmuld $nlo,$nc,$nloc
600+ ldx [%sp+$bias+$frame+8],%o1
601+ fmuld $alo,$bd,$alod
602+ ldx [%sp+$bias+$frame+16],%o2
603+ faddd $alob,$nlob,$nlob
604+ fmuld $nlo,$nd,$nlod
605+ ldx [%sp+$bias+$frame+24],%o3
606+ fmuld $ahi,$ba,$ahia
607+
608+ srlx %o0,16,%o7
609+ faddd $aloc,$nloc,$nloc
610+ fmuld $nhi,$na,$nhia
611+ add %o7,%o1,%o1
612+ fmuld $ahi,$bb,$ahib
613+ srlx %o1,16,%o7
614+ faddd $alod,$nlod,$nlod
615+ fmuld $nhi,$nb,$nhib
616+ add %o7,%o2,%o2
617+ fmuld $ahi,$bc,$ahic
618+ srlx %o2,16,%o7
619+ faddd $ahia,$nhia,$nhia
620+ fmuld $nhi,$nc,$nhic
621+ add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
622+ ! why?
623+ and %o0,$mask,%o0
624+ fmuld $ahi,$bd,$ahid
625+ and %o1,$mask,%o1
626+ and %o2,$mask,%o2
627+ faddd $ahib,$nhib,$nhib
628+ fmuld $nhi,$nd,$nhid
629+ sllx %o1,16,%o1
630+ faddd $dota,$nloa,$nloa
631+ sllx %o2,32,%o2
632+ faddd $dotb,$nlob,$nlob
633+ sllx %o3,48,%o7
634+ or %o1,%o0,%o0
635+ faddd $ahic,$nhic,$dota ! $nhic
636+ or %o2,%o0,%o0
637+ faddd $ahid,$nhid,$dotb ! $nhid
638+ or %o7,%o0,%o0 ! 64-bit result
639+ ldx [$tp],%o7 ! tp[0]
640+ faddd $nloc,$nhia,$nloc
641+ addcc %o7,%o0,%o0 ! only the carry out is used; the sum itself is never stored
642+ ! end-of-why?
643+ faddd $nlod,$nhib,$nlod
644+ srlx %o3,16,%g1 ! 34-bit carry
645+ fdtox $nloa,$nloa
646+ bcs,a %xcc,.+8
647+ add %g1,1,%g1
648+
649+ fdtox $nlob,$nlob
650+ fdtox $nloc,$nloc
651+ fdtox $nlod,$nlod
652+
653+ std $nloa,[%sp+$bias+$frame+0]
654+ std $nlob,[%sp+$bias+$frame+8]
655+ addcc $j,8,$j ! j++, sets Z when the last limb has been reached
656+ std $nloc,[%sp+$bias+$frame+16]
657+ bz,pn %icc,.Linnerskip
658+ std $nlod,[%sp+$bias+$frame+24]
659+
660+ ba .Linner
661+ nop
662+.align 32
663+.Linner: ! like .L1st, but a[j],n[j] are reloaded as doubles and tp[j] is added into the result
664+ ldd [$ap_l+$j],$alo ! load a[j] in double format
665+ ldd [$ap_h+$j],$ahi
666+ ldd [$np_l+$j],$nlo ! load n[j] in double format
667+ ldd [$np_h+$j],$nhi
668+
669+ fmuld $alo,$ba,$aloa
670+ fmuld $nlo,$na,$nloa
671+ fmuld $alo,$bb,$alob
672+ fmuld $nlo,$nb,$nlob
673+ fmuld $alo,$bc,$aloc
674+ ldx [%sp+$bias+$frame+0],%o0 ! column sums stored by the previous iteration
675+ faddd $aloa,$nloa,$nloa
676+ fmuld $nlo,$nc,$nloc
677+ ldx [%sp+$bias+$frame+8],%o1
678+ fmuld $alo,$bd,$alod
679+ ldx [%sp+$bias+$frame+16],%o2
680+ faddd $alob,$nlob,$nlob
681+ fmuld $nlo,$nd,$nlod
682+ ldx [%sp+$bias+$frame+24],%o3
683+ fmuld $ahi,$ba,$ahia
684+
685+ srlx %o0,16,%o7
686+ faddd $aloc,$nloc,$nloc
687+ fmuld $nhi,$na,$nhia
688+ add %o7,%o1,%o1
689+ fmuld $ahi,$bb,$ahib
690+ srlx %o1,16,%o7
691+ faddd $alod,$nlod,$nlod
692+ fmuld $nhi,$nb,$nhib
693+ add %o7,%o2,%o2
694+ fmuld $ahi,$bc,$ahic
695+ srlx %o2,16,%o7
696+ faddd $ahia,$nhia,$nhia
697+ fmuld $nhi,$nc,$nhic
698+ add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
699+ and %o0,$mask,%o0
700+ fmuld $ahi,$bd,$ahid
701+ and %o1,$mask,%o1
702+ and %o2,$mask,%o2
703+ faddd $ahib,$nhib,$nhib
704+ fmuld $nhi,$nd,$nhid
705+ sllx %o1,16,%o1
706+ faddd $dota,$nloa,$nloa
707+ sllx %o2,32,%o2
708+ faddd $dotb,$nlob,$nlob
709+ sllx %o3,48,%o7
710+ or %o1,%o0,%o0
711+ faddd $ahic,$nhic,$dota ! $nhic
712+ or %o2,%o0,%o0
713+ faddd $ahid,$nhid,$dotb ! $nhid
714+ or %o7,%o0,%o0 ! 64-bit result
715+ faddd $nloc,$nhia,$nloc
716+ addcc %g1,%o0,%o0 ! add the carry left by the previous limb
717+ ldx [$tp+8],%o7 ! tp[j]
718+ faddd $nlod,$nhib,$nlod
719+ srlx %o3,16,%g1 ! 34-bit carry
720+ fdtox $nloa,$nloa
721+ bcs,a %xcc,.+8
722+ add %g1,1,%g1
723+ fdtox $nlob,$nlob
724+ addcc %o7,%o0,%o0 ! +tp[j]
725+ fdtox $nloc,$nloc
726+ bcs,a %xcc,.+8
727+ add %g1,1,%g1
728+
729+ stx %o0,[$tp] ! tp[j-1]
730+ fdtox $nlod,$nlod
731+
732+ std $nloa,[%sp+$bias+$frame+0] ! leave the sums for the next iteration
733+ std $nlob,[%sp+$bias+$frame+8]
734+ std $nloc,[%sp+$bias+$frame+16]
735+ addcc $j,8,$j
736+ std $nlod,[%sp+$bias+$frame+24]
737+ bnz,pt %icc,.Linner
738+ add $tp,8,$tp ! advance tp (delay slot, executed either way)
739+
740+.Linnerskip: ! drain one outer pass: last stored sums, dot terms and the saved top carry go into tp
741+ fdtox $dota,$dota
742+ fdtox $dotb,$dotb
743+
744+ ldx [%sp+$bias+$frame+0],%o0
745+ ldx [%sp+$bias+$frame+8],%o1
746+ ldx [%sp+$bias+$frame+16],%o2
747+ ldx [%sp+$bias+$frame+24],%o3
748+
749+ srlx %o0,16,%o7
750+ std $dota,[%sp+$bias+$frame+32]
751+ add %o7,%o1,%o1
752+ std $dotb,[%sp+$bias+$frame+40]
753+ srlx %o1,16,%o7
754+ add %o7,%o2,%o2
755+ srlx %o2,16,%o7
756+ add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
757+ and %o0,$mask,%o0
758+ and %o1,$mask,%o1
759+ and %o2,$mask,%o2
760+ sllx %o1,16,%o1
761+ sllx %o2,32,%o2
762+ sllx %o3,48,%o7
763+ or %o1,%o0,%o0
764+ or %o2,%o0,%o0
765+ ldx [%sp+$bias+$frame+32],%o4 ! dot terms, now as integers
766+ or %o7,%o0,%o0 ! 64-bit result
767+ ldx [%sp+$bias+$frame+40],%o5
768+ addcc %g1,%o0,%o0
769+ ldx [$tp+8],%o7 ! tp[j]
770+ srlx %o3,16,%g1 ! 34-bit carry
771+ bcs,a %xcc,.+8
772+ add %g1,1,%g1
773+
774+ addcc %o7,%o0,%o0 ! +tp[j]
775+ bcs,a %xcc,.+8
776+ add %g1,1,%g1
777+
778+ stx %o0,[$tp] ! tp[j-1]
779+ add $tp,8,$tp
780+
781+ srlx %o4,16,%o7
782+ add %o7,%o5,%o5
783+ and %o4,$mask,%o4
784+ sllx %o5,16,%o7
785+ or %o7,%o4,%o4
786+ addcc %g1,%o4,%o4
787+ srlx %o5,48,%g1
788+ bcs,a %xcc,.+8
789+ add %g1,1,%g1
790+
791+ addcc $carry,%o4,%o4 ! add the top carry saved by the previous pass
792+ stx %o4,[$tp] ! tp[num-1]
793+ mov %g1,$carry ! new top carry
794+ bcs,a %xcc,.+8
795+ add $carry,1,$carry
796+
797+ addcc $i,8,$i ! i++, sets Z after the last limb of bp
798+ bnz %icc,.Louter
799+ nop
800+
801+ add $tp,8,$tp ! adjust tp to point at the end
802+ orn %g0,%g0,%g4 ! %g4=all ones
803+ sub %g0,$num,%o7 ! n=-num
804+ ba .Lsub
805+ subcc %g0,%g0,%g0 ! clear %icc.c
806+
807+.align 32
808+.Lsub: ! rp[]=tp[]-np[], 32 bits at a time with the borrow chained through %icc.c
809+ ldx [$tp+%o7],%o0
810+ add $np,%o7,%g1
811+ ld [%g1+0],%o2
812+ ld [%g1+4],%o3
813+ srlx %o0,32,%o1 ! upper half of the tp limb
814+ subccc %o0,%o2,%o2
815+ add $rp,%o7,%g1
816+ subccc %o1,%o3,%o3
817+ st %o2,[%g1+0]
818+ add %o7,8,%o7
819+ brnz,pt %o7,.Lsub ! branch on register: leaves the borrow in %icc.c alone
820+ st %o3,[%g1+4]
821+ subc $carry,0,%g4 ! %g4=top carry minus final borrow, the select mask used by .Lcopy
822+ sub %g0,$num,%o7 ! n=-num
823+ ba .Lcopy
824+ nop
825+
826+.align 32
827+.Lcopy: ! rp[]=(tp[] AND mask) OR (rp[] AND NOT mask), wiping tp[] on the way
828+ ldx [$tp+%o7],%o0
829+ add $rp,%o7,%g1
830+ ld [%g1+0],%o2
831+ ld [%g1+4],%o3
832+ stx %g0,[$tp+%o7] ! zap tp
833+ and %o0,%g4,%o0
834+ srlx %o0,32,%o1
835+ andn %o2,%g4,%o2
836+ andn %o3,%g4,%o3
837+ or %o2,%o0,%o0
838+ or %o3,%o1,%o1
839+ st %o0,[%g1+0]
840+ add %o7,8,%o7
841+ brnz,pt %o7,.Lcopy
842+ st %o1,[%g1+4]
843+ sub %g0,$num,%o7 ! n=-num
844+
845+.Lzap: ! wipe the four smashed copies of ap and np
846+ stx %g0,[$ap_l+%o7]
847+ stx %g0,[$ap_h+%o7]
848+ stx %g0,[$np_l+%o7]
849+ stx %g0,[$np_h+%o7]
850+ add %o7,8,%o7
851+ brnz,pt %o7,.Lzap
852+ nop
853+
854+ ldx [%sp+$bias+$frame+48],%o7 ! value of %asi saved on entry
855+ wr %g0,%o7,%asi ! restore %asi
856+
857+ mov 1,%i0 ! return 1: the multiplication was performed
858+.Lret:
859+ ret
860+ restore
861+.type $fname,#function
862+.size $fname,(.-$fname)
863+.asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
864+.align 32
865+___
866+
867+$code =~ s/\`([^\`]*)\`/eval($1)/gem; # evaluate any backquoted Perl expression embedded in the template
868+
869+# Below substitution makes it possible to compile without demanding
870+# VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
871+# dare to do this, because VIS capability is detected at run-time now
872+# and this routine is not called on CPU not capable to execute it. Do
873+# note that fzeros is not the only VIS dependency! Another dependency
874+# is implicit and is just _a_ numerical value loaded to %asi register,
875+# which assembler can't recognize as VIS specific...
876+$code =~ s/fzeros\s+%f([0-9]+)/
877+ sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
878+ /gem; # emits the raw opcode word, register number shifted into bit 25 and up
879+
880+print $code;
881+# flush
882+close STDOUT;
--- /dev/null
+++ b/crypto/bn/asm/via-mont.pl
@@ -0,0 +1,242 @@
1+#!/usr/bin/env perl
2+#
3+# ====================================================================
4+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5+# project. The module is, however, dual licensed under OpenSSL and
6+# CRYPTOGAMS licenses depending on where you obtain it. For further
7+# details see http://www.openssl.org/~appro/cryptogams/.
8+# ====================================================================
9+#
10+# Wrapper around 'rep montmul', VIA-specific instruction accessing
11+# PadLock Montgomery Multiplier. The wrapper is designed as drop-in
12+# replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9].
13+#
14+# Below are interleaved outputs from 'openssl speed rsa dsa' for 4
15+# different software configurations on 1.5GHz VIA Esther processor.
16+# Lines marked with "software integer" denote performance of hand-
17+# coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2"
18+# refers to hand-coded SSE2 Montgomery multiplication procedure found in
19+# OpenSSL 0.9.9. "Hardware VIA SDK" refers to padlock_pmm routine from
20+# Padlock SDK 2.0.1 available for download from VIA, which naturally
21+# utilizes the magic 'repz montmul' instruction. And finally "hardware
22+# this" refers to *this* implementation which also uses 'repz montmul'
23+#
24+# sign verify sign/s verify/s
25+# rsa 512 bits 0.001720s 0.000140s 581.4 7149.7 software integer
26+# rsa 512 bits 0.000690s 0.000086s 1450.3 11606.0 software SSE2
27+# rsa 512 bits 0.006136s 0.000201s 163.0 4974.5 hardware VIA SDK
28+# rsa 512 bits 0.000712s 0.000050s 1404.9 19858.5 hardware this
29+#
30+# rsa 1024 bits 0.008518s 0.000413s 117.4 2420.8 software integer
31+# rsa 1024 bits 0.004275s 0.000277s 233.9 3609.7 software SSE2
32+# rsa 1024 bits 0.012136s 0.000260s 82.4 3844.5 hardware VIA SDK
33+# rsa 1024 bits 0.002522s 0.000116s 396.5 8650.9 hardware this
34+#
35+# rsa 2048 bits 0.050101s 0.001371s 20.0 729.6 software integer
36+# rsa 2048 bits 0.030273s 0.001008s 33.0 991.9 software SSE2
37+# rsa 2048 bits 0.030833s 0.000976s 32.4 1025.1 hardware VIA SDK
38+# rsa 2048 bits 0.011879s 0.000342s 84.2 2921.7 hardware this
39+#
40+# rsa 4096 bits 0.327097s 0.004859s 3.1 205.8 software integer
41+# rsa 4096 bits 0.229318s 0.003859s 4.4 259.2 software SSE2
42+# rsa 4096 bits 0.233953s 0.003274s 4.3 305.4 hardware VIA SDK
43+# rsa 4096 bits 0.070493s 0.001166s 14.2 857.6 hardware this
44+#
45+# dsa 512 bits 0.001342s 0.001651s 745.2 605.7 software integer
46+# dsa 512 bits 0.000844s 0.000987s 1185.3 1013.1 software SSE2
47+# dsa 512 bits 0.001902s 0.002247s 525.6 444.9 hardware VIA SDK
48+# dsa 512 bits 0.000458s 0.000524s 2182.2 1909.1 hardware this
49+#
50+# dsa 1024 bits 0.003964s 0.004926s 252.3 203.0 software integer
51+# dsa 1024 bits 0.002686s 0.003166s 372.3 315.8 software SSE2
52+# dsa 1024 bits 0.002397s 0.002823s 417.1 354.3 hardware VIA SDK
53+# dsa 1024 bits 0.000978s 0.001170s 1022.2 855.0 hardware this
54+#
55+# dsa 2048 bits 0.013280s 0.016518s 75.3 60.5 software integer
56+# dsa 2048 bits 0.009911s 0.011522s 100.9 86.8 software SSE2
57+# dsa 2048 bits 0.009542s 0.011763s 104.8 85.0 hardware VIA SDK
58+# dsa 2048 bits 0.002884s 0.003352s 346.8 298.3 hardware this
59+#
60+# To give you some other reference point here is output for 2.4GHz P4
61+# running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software
62+# SSE2" in above terms.
63+#
64+# rsa 512 bits 0.000407s 0.000047s 2454.2 21137.0
65+# rsa 1024 bits 0.002426s 0.000141s 412.1 7100.0
66+# rsa 2048 bits 0.015046s 0.000491s 66.5 2034.9
67+# rsa 4096 bits 0.109770s 0.002379s 9.1 420.3
68+# dsa 512 bits 0.000438s 0.000525s 2281.1 1904.1
69+# dsa 1024 bits 0.001346s 0.001595s 742.7 627.0
70+# dsa 2048 bits 0.004745s 0.005582s 210.7 179.1
71+#
72+# Conclusions:
73+# - VIA SDK leaves a *lot* of room for improvement (which this
74+# implementation successfully fills:-);
75+# - 'rep montmul' gives up to >3x performance improvement depending on
76+# key length;
77+# - in terms of absolute performance it delivers approximately as much
78+# as modern out-of-order 32-bit cores [again, for longer keys].
79+
80+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's path
81+push(@INC,"${dir}","${dir}../../perlasm"); # look for modules there and in ../../perlasm
82+require "x86asm.pl";
83+
84+&asm_init($ARGV[0],"via-mont.pl");
85+
86+# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
87+$func="bn_mul_mont_padlock";
88+
89+$pad=16*1; # amount of reserved bytes on top of every vector
90+
91+# stack layout
92+$mZeroPrime=&DWP(0,"esp"); # these are specified by VIA; this slot receives *n0
93+$A=&DWP(4,"esp"); # pointer to the padded copy of ap
94+$B=&DWP(8,"esp"); # pointer to the padded copy of bp
95+$T=&DWP(12,"esp"); # pointer to padded tp
96+$M=&DWP(16,"esp"); # pointer to the padded copy of np
97+$scratch=&DWP(20,"esp"); # pointer to the 32 byte scratch area
98+$rp=&DWP(24,"esp"); # these are mine
99+$sp=&DWP(28,"esp"); # caller's esp, restored on the way out
100+# &DWP(32,"esp") # 32 byte scratch area
101+# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num]
102+# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num]
103+# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num]
104+# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num]
105+# Note that SDK suggests to unconditionally allocate 2K per vector. This
106+# has quite an impact on performance. It naturally depends on key length,
107+# but to give an example 1024 bit private RSA key operations suffer >30%
108+# penalty. I allocate only as much as actually required...
109+
110+&function_begin($func);
111+ &xor ("eax","eax"); # return value stays 0 on every early exit to "leave"
112+ &mov ("ecx",&wparam(5)); # num
113+ # meet VIA's limitations for num [note that the specification
114+ # expresses them in bits, while we work with amount of 32-bit words]
115+ &test ("ecx",3);
116+ &jnz (&label("leave")); # num % 4 != 0
117+ &cmp ("ecx",8);
118+ &jb (&label("leave")); # num < 8
119+ &cmp ("ecx",1024);
120+ &ja (&label("leave")); # num > 1024
121+
122+ &pushf (); # flags are restored by the popf at the end
123+ &cld (); # string instructions below must run forward
124+
125+ &mov ("edi",&wparam(0)); # rp
126+ &mov ("eax",&wparam(1)); # ap
127+ &mov ("ebx",&wparam(2)); # bp
128+ &mov ("edx",&wparam(3)); # np
129+ &mov ("esi",&wparam(4)); # n0
130+ &mov ("esi",&DWP(0,"esi")); # *n0
131+
132+ &lea ("ecx",&DWP($pad,"","ecx",4)); # ecx becomes vector size in bytes
133+ &lea ("ebp",&DWP(64,"","ecx",4)); # allocate 4 vectors + 64 bytes
134+ &neg ("ebp");
135+ &add ("ebp","esp"); # ebp=esp-allocation size
136+ &and ("ebp",-64); # align to cache-line
137+ &xchg ("ebp","esp"); # alloca, ebp now holds the caller's esp
138+
139+ &mov ($rp,"edi"); # save rp
140+ &mov ($sp,"ebp"); # save esp
141+
142+ &mov ($mZeroPrime,"esi");
143+ &lea ("esi",&DWP(64,"esp")); # tp
144+ &mov ($T,"esi");
145+ &lea ("edi",&DWP(32,"esp")); # scratch area
146+ &mov ($scratch,"edi");
147+ &mov ("esi","eax"); # esi=ap, source of the first copy below
148+
149+ &lea ("ebp",&DWP(-$pad,"ecx"));
150+ &shr ("ebp",2); # restore original num value in ebp
151+
152+ &xor ("eax","eax"); # eax=0, the fill value for every rep stosl
153+
154+ &mov ("ecx","ebp");
155+ &lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
156+ &data_byte(0xf3,0xab); # rep stosl, bzero
157+
158+ &mov ("ecx","ebp");
159+ &lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
160+ &mov ($A,"edi");
161+ &data_byte(0xf3,0xa5); # rep movsl, memcpy
162+ &mov ("ecx",$pad/4);
163+ &data_byte(0xf3,0xab); # rep stosl, bzero pad
164+ # edi points at the end of padded ap copy...
165+
166+ &mov ("ecx","ebp");
167+ &mov ("esi","ebx"); # source is bp
168+ &mov ($B,"edi");
169+ &data_byte(0xf3,0xa5); # rep movsl, memcpy
170+ &mov ("ecx",$pad/4);
171+ &data_byte(0xf3,0xab); # rep stosl, bzero pad
172+ # edi points at the end of padded bp copy...
173+
174+ &mov ("ecx","ebp");
175+ &mov ("esi","edx"); # source is np
176+ &mov ($M,"edi");
177+ &data_byte(0xf3,0xa5); # rep movsl, memcpy
178+ &mov ("ecx",$pad/4);
179+ &data_byte(0xf3,0xab); # rep stosl, bzero pad
180+ # edi points at the end of padded np copy...
181+
182+ # let magic happen...
183+ &mov ("ecx","ebp");
184+ &mov ("esi","esp"); # esi=the VIA-specified block at the bottom of the frame
185+ &shl ("ecx",5); # convert word counter to bit counter
186+ &align (4);
187+ &data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
188+
189+ &mov ("ecx","ebp"); # loop count for "sub" is num
190+ &lea ("esi",&DWP(64,"esp")); # tp
191+ # edi still points at the end of padded np copy...
192+ &neg ("ebp");
193+ &lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind"
194+ &mov ("edi",$rp); # restore rp
195+ &xor ("edx","edx"); # i=0 and clear CF
196+
197+&set_label("sub",8);
198+ &mov ("eax",&DWP(0,"esi","edx",4));
199+ &sbb ("eax",&DWP(0,"ebp","edx",4));
200+ &mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i]
201+ &lea ("edx",&DWP(1,"edx")); # i++
202+ &loop (&label("sub")); # doesn't affect CF!
203+
204+ &mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit
205+ &sbb ("eax",0); # minus the final borrow: eax is the select mask
206+ &and ("esi","eax");
207+ &not ("eax");
208+ &mov ("ebp","edi");
209+ &and ("ebp","eax");
210+ &or ("esi","ebp"); # tp=carry?tp:rp
211+
212+ &mov ("ecx","edx"); # num
213+ &xor ("edx","edx"); # i=0
214+
215+&set_label("copy",8);
216+ &mov ("eax",&DWP(0,"esi","edx",4));
217+ &mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp (overwritten with the loop counter)
218+ &mov (&DWP(0,"edi","edx",4),"eax");
219+ &lea ("edx",&DWP(1,"edx")); # i++
220+ &loop (&label("copy"));
221+
222+ &mov ("ebp",$sp); # fetch the caller's esp before the frame is wiped
223+ &xor ("eax","eax");
224+
225+ &mov ("ecx",64/4);
226+ &mov ("edi","esp"); # zap frame including scratch area
227+ &data_byte(0xf3,0xab); # rep stosl, bzero
228+
229+ # zap copies of ap, bp and np
230+ &lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
231+ &lea ("ecx",&DWP(3*$pad/4,"edx","edx",2)); # 3*(num+$pad/4) dwords, edx is num here
232+ &data_byte(0xf3,0xab); # rep stosl, bzero
233+
234+ &mov ("esp","ebp"); # release the frame
235+ &inc ("eax"); # signal "done"
236+ &popf ();
237+&set_label("leave");
238+&function_end($func);
239+
240+&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
241+
242+&asm_finish();
--- /dev/null
+++ b/crypto/bn/asm/x86-mont.pl
@@ -0,0 +1,591 @@
1+#!/usr/bin/env perl
2+
3+# ====================================================================
4+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5+# project. The module is, however, dual licensed under OpenSSL and
6+# CRYPTOGAMS licenses depending on where you obtain it. For further
7+# details see http://www.openssl.org/~appro/cryptogams/.
8+# ====================================================================
9+
10+# October 2005
11+#
12+# This is a "teaser" code, as it can be improved in several ways...
13+# First of all non-SSE2 path should be implemented (yes, for now it
14+# performs Montgomery multiplication/convolution only on SSE2-capable
15+# CPUs such as P4, others fall down to original code). Then inner loop
16+# can be unrolled and modulo-scheduled to improve ILP and possibly
17+# moved to 128-bit XMM register bank (though it would require input
18+# rearrangement and/or increase bus bandwidth utilization). Dedicated
19+# squaring procedure should give further performance improvement...
20+# Yet, for being draft, the code improves rsa512 *sign* benchmark by
21+# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
22+
23+# December 2006
24+#
25+# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
26+# Integer-only code [being equipped with dedicated squaring procedure]
27+# gives ~40% on rsa512 sign benchmark...
28+
29+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # $dir = directory part of this script's path
30+push(@INC,"${dir}","${dir}../../perlasm"); # look for modules next to the script and in ../../perlasm
31+require "x86asm.pl";
32+
33+&asm_init($ARGV[0],$0);
34+
35+$sse2=0; # SSE2 code path is generated only on request...
36+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } # ...i.e. when -DOPENSSL_IA32_SSE2 appears among the arguments
37+
38+&external_label("OPENSSL_ia32cap_P") if ($sse2); # symbol is referenced only inside the SSE2 path
39+
40+&function_begin("bn_mul_mont");
41+ # Arguments, in order: rp, ap, bp, np, n0, num. eax stays 0 when num<4 (nothing is done).
42+$i="edx";
43+$j="ecx";
44+$ap="esi"; $tp="esi"; # overlapping variables!!!
45+$rp="edi"; $bp="edi"; # overlapping variables!!!
46+$np="ebp";
47+$num="ebx";
48+
49+$_num=&DWP(4*0,"esp"); # stack top layout
50+$_rp=&DWP(4*1,"esp");
51+$_ap=&DWP(4*2,"esp");
52+$_bp=&DWP(4*3,"esp");
53+$_np=&DWP(4*4,"esp");
54+$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp"); # same slot, addressed as dword and as qword
55+$_sp=&DWP(4*6,"esp");
56+$_bpend=&DWP(4*7,"esp");
57+$frame=32; # size of above frame rounded up to 16n
58+
59+ &xor ("eax","eax"); # eax=0 is what "just_leave" is reached with when num<4
60+ &mov ("edi",&wparam(5)); # int num
61+ &cmp ("edi",4);
62+ &jl (&label("just_leave"));
63+
64+ &lea ("esi",&wparam(0)); # put aside pointer to argument block
65+ &lea ("edx",&wparam(1)); # address of the ap argument slot; NOTE(review): old comment said "load ap", but lea does not fetch the pointer - confirm intent
66+ &mov ("ebp","esp"); # saved stack pointer!
67+ &add ("edi",2); # extra two words on top of tp
68+ &neg ("edi");
69+ &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
70+ &neg ("edi"); # edi is num+2 again
71+
72+ # minimize cache contention by arranging 2K window between stack
73+ # pointer and ap argument [np is also position sensitive vector,
74+ # but it's assumed to be near ap, as it's allocated at ~same
75+ # time].
76+ &mov ("eax","esp");
77+ &sub ("eax","edx");
78+ &and ("eax",2047);
79+ &sub ("esp","eax"); # this aligns sp and ap modulo 2048
80+
81+ &xor ("edx","esp");
82+ &and ("edx",2048);
83+ &xor ("edx",2048);
84+ &sub ("esp","edx"); # this splits them apart modulo 4096
85+
86+ &and ("esp",-64); # align to cache line
87+
88+ ################################# load argument block...
89+ &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
90+ &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
91+ &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
92+ &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
93+ &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
94+ #&mov ("edi",&DWP(5*4,"esi"));# int num
95+
96+ &mov ("esi",&DWP(0,"esi")); # pull n0[0]
97+ &mov ($_rp,"eax"); # ... save a copy of argument block
98+ &mov ($_ap,"ebx");
99+ &mov ($_bp,"ecx");
100+ &mov ($_np,"edx");
101+ &mov ($_n0,"esi");
102+ &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling (edi holds num+2 here)
103+ #&mov ($_num,$num); # redundant as $num is not reused
104+ &mov ($_sp,"ebp"); # saved stack pointer!
105+
106+if($sse2) { # SSE2 path; emitted only when $sse2 was set from the command line
107+$acc0="mm0"; # mmx register bank layout
108+$acc1="mm1";
109+$car0="mm2";
110+$car1="mm3";
111+$mul0="mm4";
112+$mul1="mm5";
113+$temp="mm6";
114+$mask="mm7";
115+
116+ &picmeup("eax","OPENSSL_ia32cap_P");
117+ &bt (&DWP(0,"eax"),26); # run-time test of bit 26 (presumably the SSE2 feature flag)
118+ &jnc (&label("non_sse2")); # bit clear: continue with the integer-only code
119+
120+ &mov ("eax",-1);
121+ &movd ($mask,"eax"); # mask 32 lower bits
122+
123+ &mov ($ap,$_ap); # load input pointers
124+ &mov ($bp,$_bp);
125+ &mov ($np,$_np);
126+
127+ &xor ($i,$i); # i=0
128+ &xor ($j,$j); # j=0
129+
130+ &movd ($mul0,&DWP(0,$bp)); # bp[0]
131+ &movd ($mul1,&DWP(0,$ap)); # ap[0]
132+ &movd ($car1,&DWP(0,$np)); # np[0]
133+
134+ &pmuludq($mul1,$mul0); # ap[0]*bp[0]
135+ &movq ($car0,$mul1);
136+ &movq ($acc0,$mul1); # I wish movd worked for
137+ &pand ($acc0,$mask); # inter-register transfers
138+
139+ &pmuludq($mul1,$_n0q); # *=n0
140+
141+ &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
142+ &paddq ($car1,$acc0);
143+
144+ &movd ($acc1,&DWP(4,$np)); # np[1]
145+ &movd ($acc0,&DWP(4,$ap)); # ap[1]
146+
147+ &psrlq ($car0,32);
148+ &psrlq ($car1,32);
149+
150+ &inc ($j); # j++
151+&set_label("1st",16);
152+ &pmuludq($acc0,$mul0); # ap[j]*bp[0]
153+ &pmuludq($acc1,$mul1); # np[j]*m1
154+ &paddq ($car0,$acc0); # +=c0
155+ &paddq ($car1,$acc1); # +=c1
156+
157+ &movq ($acc0,$car0);
158+ &pand ($acc0,$mask);
159+ &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
160+ &paddq ($car1,$acc0); # +=ap[j]*bp[0];
161+ &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
162+ &psrlq ($car0,32);
163+ &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
164+ &psrlq ($car1,32);
165+
166+ &lea ($j,&DWP(1,$j));
167+ &cmp ($j,$num);
168+ &jl (&label("1st"));
169+
170+ &pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
171+ &pmuludq($acc1,$mul1); # np[num-1]*m1
172+ &paddq ($car0,$acc0); # +=c0
173+ &paddq ($car1,$acc1); # +=c1
174+
175+ &movq ($acc0,$car0);
176+ &pand ($acc0,$mask);
177+ &paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
178+ &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
179+
180+ &psrlq ($car0,32);
181+ &psrlq ($car1,32);
182+
183+ &paddq ($car1,$car0);
184+ &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
185+
186+ &inc ($i); # i++
187+&set_label("outer");
188+ &xor ($j,$j); # j=0
189+
190+ &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
191+ &movd ($mul1,&DWP(0,$ap)); # ap[0]
192+ &movd ($temp,&DWP($frame,"esp")); # tp[0]
193+ &movd ($car1,&DWP(0,$np)); # np[0]
194+ &pmuludq($mul1,$mul0); # ap[0]*bp[i]
195+
196+ &paddq ($mul1,$temp); # +=tp[0]
197+ &movq ($acc0,$mul1);
198+ &movq ($car0,$mul1);
199+ &pand ($acc0,$mask);
200+
201+ &pmuludq($mul1,$_n0q); # *=n0
202+
203+ &pmuludq($car1,$mul1);
204+ &paddq ($car1,$acc0);
205+
206+ &movd ($temp,&DWP($frame+4,"esp")); # tp[1]
207+ &movd ($acc1,&DWP(4,$np)); # np[1]
208+ &movd ($acc0,&DWP(4,$ap)); # ap[1]
209+
210+ &psrlq ($car0,32);
211+ &psrlq ($car1,32);
212+ &paddq ($car0,$temp); # +=tp[1]
213+
214+ &inc ($j); # j++
215+ &dec ($num); # $num doubles as the inner-loop down-counter; restored from $j after the loop
216+&set_label("inner");
217+ &pmuludq($acc0,$mul0); # ap[j]*bp[i]
218+ &pmuludq($acc1,$mul1); # np[j]*m1
219+ &paddq ($car0,$acc0); # +=c0
220+ &paddq ($car1,$acc1); # +=c1
221+
222+ &movq ($acc0,$car0);
223+ &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
224+ &pand ($acc0,$mask);
225+ &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
226+ &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
227+ &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
228+ &psrlq ($car0,32);
229+ &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
230+ &psrlq ($car1,32);
231+ &paddq ($car0,$temp); # +=tp[j+1]
232+
233+ &dec ($num);
234+ &lea ($j,&DWP(1,$j)); # j++
235+ &jnz (&label("inner")); # ZF comes from the dec above; lea does not touch flags
236+
237+ &mov ($num,$j); # restore $num (=num-1) from the loop index
238+ &pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
239+ &pmuludq($acc1,$mul1); # np[num-1]*m1
240+ &paddq ($car0,$acc0); # +=c0
241+ &paddq ($car1,$acc1); # +=c1
242+
243+ &movq ($acc0,$car0);
244+ &pand ($acc0,$mask);
245+ &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
246+ &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
247+ &psrlq ($car0,32);
248+ &psrlq ($car1,32);
249+
250+ &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
251+ &paddq ($car1,$car0);
252+ &paddq ($car1,$temp);
253+ &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
254+
255+ &lea ($i,&DWP(1,$i)); # i++
256+ &cmp ($i,$num);
257+ &jle (&label("outer")); # $num holds num-1, so this iterates while i<=num-1
258+
259+ &emms (); # done with mmx bank
260+ &jmp (&label("common_tail"));
261+
262+&set_label("non_sse2",16);
263+}
264+
265+if (0) { # disabled bail-out; the else branch below is what gets generated
266+ &mov ("esp",$_sp);
267+ &xor ("eax","eax"); # signal "not fast enough [yet]"
268+ &jmp (&label("just_leave"));
269+ # While the below code provides competitive performance for
270+ # all key lengths on modern Intel cores, it's still more
271+ # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
272+ # means compared to the original integer-only assembler.
273+ # 512-bit RSA sign is better by ~40%, but that's about all
274+ # one can say about all CPUs...
275+} else {
276+$inp="esi"; # integer path uses these registers differently
277+$word="edi";
278+$carry="ebp";
279+
280+ &mov ($inp,$_ap);
281+ &lea ($carry,&DWP(1,$num)); # $num holds num-1, so this is the original num
282+ &mov ($word,$_bp);
283+ &xor ($j,$j); # j=0
284+ &mov ("edx",$inp);
285+ &and ($carry,1); # see if num is even
286+ &sub ("edx",$word); # see if ap==bp
287+ &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
288+ &or ($carry,"edx"); # zero only when num is even and ap==bp
289+ &mov ($word,&DWP(0,$word)); # bp[0]
290+ &jz (&label("bn_sqr_mont")); # ZF is still from the or above (lea/mov leave flags alone)
291+ &mov ($_bpend,"eax");
292+ &mov ("eax",&DWP(0,$inp));
293+ &xor ("edx","edx");
294+
295+&set_label("mull",16);
296+ &mov ($carry,"edx");
297+ &mul ($word); # ap[j]*bp[0]
298+ &add ($carry,"eax");
299+ &lea ($j,&DWP(1,$j));
300+ &adc ("edx",0);
301+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
302+ &cmp ($j,$num);
303+ &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
304+ &jl (&label("mull"));
305+
306+ &mov ($carry,"edx");
307+ &mul ($word); # ap[num-1]*bp[0]
308+ &mov ($word,$_n0);
309+ &add ("eax",$carry);
310+ &mov ($inp,$_np);
311+ &adc ("edx",0);
312+ &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
313+
314+ &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
315+ &xor ($j,$j);
316+ &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
317+ &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
318+
319+ &mov ("eax",&DWP(0,$inp)); # np[0]
320+ &mul ($word); # np[0]*m
321+ &add ("eax",&DWP($frame,"esp")); # +=tp[0]
322+ &mov ("eax",&DWP(4,$inp)); # np[1]; the low sum word is overwritten, only its carry is used
323+ &adc ("edx",0);
324+ &inc ($j);
325+
326+ &jmp (&label("2ndmadd"));
327+
328+&set_label("1stmadd",16);
329+ &mov ($carry,"edx");
330+ &mul ($word); # ap[j]*bp[i]
331+ &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
332+ &lea ($j,&DWP(1,$j));
333+ &adc ("edx",0);
334+ &add ($carry,"eax");
335+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
336+ &adc ("edx",0);
337+ &cmp ($j,$num);
338+ &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
339+ &jl (&label("1stmadd"));
340+
341+ &mov ($carry,"edx");
342+ &mul ($word); # ap[num-1]*bp[i]
343+ &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
344+ &mov ($word,$_n0);
345+ &adc ("edx",0);
346+ &mov ($inp,$_np);
347+ &add ($carry,"eax");
348+ &adc ("edx",0);
349+ &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
350+
351+ &xor ($j,$j);
352+ &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
353+ &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
354+ &adc ($j,0);
355+ &mov ("eax",&DWP(0,$inp)); # np[0]
356+ &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
357+ &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
358+
359+ &mul ($word); # np[0]*m
360+ &add ("eax",&DWP($frame,"esp")); # +=tp[0]
361+ &mov ("eax",&DWP(4,$inp)); # np[1]
362+ &adc ("edx",0);
363+ &mov ($j,1);
364+
365+&set_label("2ndmadd",16);
366+ &mov ($carry,"edx");
367+ &mul ($word); # np[j]*m
368+ &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
369+ &lea ($j,&DWP(1,$j));
370+ &adc ("edx",0);
371+ &add ($carry,"eax");
372+ &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
373+ &adc ("edx",0);
374+ &cmp ($j,$num);
375+ &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
376+ &jl (&label("2ndmadd"));
377+
378+ &mov ($carry,"edx");
379+ &mul ($word); # np[j]*m
380+ &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
381+ &adc ("edx",0);
382+ &add ($carry,"eax");
383+ &adc ("edx",0);
384+ &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
385+
386+ &xor ("eax","eax");
387+ &mov ($j,$_bp); # &bp[i]
388+ &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
389+ &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
390+ &lea ($j,&DWP(4,$j));
391+ &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
392+ &cmp ($j,$_bpend);
393+ &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
394+ &je (&label("common_tail")); # all words of bp consumed
395+
396+ &mov ($word,&DWP(0,$j)); # bp[i+1]
397+ &mov ($inp,$_ap);
398+ &mov ($_bp,$j); # &bp[++i]
399+ &xor ($j,$j);
400+ &xor ("edx","edx");
401+ &mov ("eax",&DWP(0,$inp));
402+ &jmp (&label("1stmadd"));
403+
404+&set_label("bn_sqr_mont",16);
405+$sbit=$num; # $sbit shares $num's register, hence the save to $_num below
406+ &mov ($_num,$num);
407+ &mov ($_bp,$j); # i=0 (the $_bp slot serves as the outer index in this path)
408+
409+ &mov ("eax",$word); # ap[0]
410+ &mul ($word); # ap[0]*ap[0]
411+ &mov (&DWP($frame,"esp"),"eax"); # tp[0]=
412+ &mov ($sbit,"edx");
413+ &shr ("edx",1);
414+ &and ($sbit,1);
415+ &inc ($j);
416+&set_label("sqr",16);
417+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
418+ &mov ($carry,"edx");
419+ &mul ($word); # ap[j]*ap[0]
420+ &add ("eax",$carry);
421+ &lea ($j,&DWP(1,$j));
422+ &adc ("edx",0);
423+ &lea ($carry,&DWP(0,$sbit,"eax",2)); # 2*eax plus the bit carried over in $sbit
424+ &shr ("eax",31);
425+ &cmp ($j,$_num);
426+ &mov ($sbit,"eax"); # top bit of eax is carried into the next word
427+ &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
428+ &jl (&label("sqr"));
429+
430+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
431+ &mov ($carry,"edx");
432+ &mul ($word); # ap[num-1]*ap[0]
433+ &add ("eax",$carry);
434+ &mov ($word,$_n0);
435+ &adc ("edx",0);
436+ &mov ($inp,$_np);
437+ &lea ($carry,&DWP(0,$sbit,"eax",2));
438+ &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
439+ &shr ("eax",31);
440+ &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
441+
442+ &lea ($carry,&DWP(0,"eax","edx",2));
443+ &mov ("eax",&DWP(0,$inp)); # np[0]
444+ &shr ("edx",31);
445+ &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
446+ &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
447+
448+ &mul ($word); # np[0]*m
449+ &add ("eax",&DWP($frame,"esp")); # +=tp[0]
450+ &mov ($num,$j);
451+ &adc ("edx",0);
452+ &mov ("eax",&DWP(4,$inp)); # np[1]
453+ &mov ($j,1);
454+
455+&set_label("3rdmadd",16);
456+ &mov ($carry,"edx");
457+ &mul ($word); # np[j]*m
458+ &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
459+ &adc ("edx",0);
460+ &add ($carry,"eax");
461+ &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
462+ &adc ("edx",0);
463+ &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
464+
465+ &mov ($carry,"edx");
466+ &mul ($word); # np[j+1]*m
467+ &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
468+ &lea ($j,&DWP(2,$j));
469+ &adc ("edx",0);
470+ &add ($carry,"eax");
471+ &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
472+ &adc ("edx",0);
473+ &cmp ($j,$num);
474+ &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
475+ &jl (&label("3rdmadd"));
476+
477+ &mov ($carry,"edx");
478+ &mul ($word); # np[j]*m
479+ &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
480+ &adc ("edx",0);
481+ &add ($carry,"eax");
482+ &adc ("edx",0);
483+ &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
484+
485+ &mov ($j,$_bp); # i
486+ &xor ("eax","eax");
487+ &mov ($inp,$_ap);
488+ &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
489+ &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
490+ &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
491+ &cmp ($j,$num);
492+ &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
493+ &je (&label("common_tail")); # outer index reached $num: squaring finished
494+
495+ &mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
496+ &lea ($j,&DWP(1,$j));
497+ &mov ("eax",$word);
498+ &mov ($_bp,$j); # ++i
499+ &mul ($word); # ap[i]*ap[i]
500+ &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
501+ &adc ("edx",0);
502+ &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
503+ &xor ($carry,$carry);
504+ &cmp ($j,$num);
505+ &lea ($j,&DWP(1,$j));
506+ &je (&label("sqrlast")); # flags from the cmp above; lea does not alter them
507+
508+ &mov ($sbit,"edx"); # zaps $num
509+ &shr ("edx",1);
510+ &and ($sbit,1);
511+&set_label("sqradd",16);
512+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
513+ &mov ($carry,"edx");
514+ &mul ($word); # ap[j]*ap[i]
515+ &add ("eax",$carry);
516+ &lea ($carry,&DWP(0,"eax","eax")); # 2*eax without touching CF
517+ &adc ("edx",0);
518+ &shr ("eax",31);
519+ &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
520+ &lea ($j,&DWP(1,$j));
521+ &adc ("eax",0);
522+ &add ($carry,$sbit);
523+ &adc ("eax",0);
524+ &cmp ($j,$_num);
525+ &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
526+ &mov ($sbit,"eax");
527+ &jle (&label("sqradd"));
528+
529+ &mov ($carry,"edx");
530+ &lea ("edx",&DWP(0,$sbit,"edx",2));
531+ &shr ($carry,31);
532+&set_label("sqrlast");
533+ &mov ($word,$_n0);
534+ &mov ($inp,$_np);
535+ &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
536+
537+ &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
538+ &mov ("eax",&DWP(0,$inp)); # np[0]
539+ &adc ($carry,0);
540+ &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
541+ &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
542+
543+ &mul ($word); # np[0]*m
544+ &add ("eax",&DWP($frame,"esp")); # +=tp[0]
545+ &lea ($num,&DWP(-1,$j)); # re-derive $num from $j (it was zapped by $sbit)
546+ &adc ("edx",0);
547+ &mov ($j,1);
548+ &mov ("eax",&DWP(4,$inp)); # np[1]
549+
550+ &jmp (&label("3rdmadd"));
551+}
552+
553+&set_label("common_tail",16);
554+ &mov ($np,$_np); # load modulus pointer
555+ &mov ($rp,$_rp); # load result pointer
556+ &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
557+
558+ &mov ("eax",&DWP(0,$tp)); # tp[0]
559+ &mov ($j,$num); # j=num-1
560+ &xor ($i,$i); # i=0 and clear CF!
561+
562+&set_label("sub",16);
563+ &sbb ("eax",&DWP(0,$np,$i,4));
564+ &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
565+ &dec ($j); # doesn't affect CF!
566+ &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
567+ &lea ($i,&DWP(1,$i)); # i++
568+ &jge (&label("sub")); # sign flags are still from dec $j; mov/lea leave flags alone
569+
570+ &sbb ("eax",0); # handle upmost overflow bit
571+ &and ($tp,"eax"); # eax is used as a mask to pick one of the two pointers without a branch
572+ &not ("eax");
573+ &mov ($np,$rp);
574+ &and ($np,"eax");
575+ &or ($tp,$np); # tp=carry?tp:rp
576+
577+&set_label("copy",16); # copy or in-place refresh
578+ &mov ("eax",&DWP(0,$tp,$num,4));
579+ &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
580+ &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector (overwritten with the value left in $j by the sub loop)
581+ &dec ($num);
582+ &jge (&label("copy")); # walks from index num-1 down to 0
583+
584+ &mov ("esp",$_sp); # pull saved stack pointer
585+ &mov ("eax",1); # eax=1 on the completed path, versus 0 on the early "just_leave" exit
586+&set_label("just_leave");
587+&function_end("bn_mul_mont");
588+
589+&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
590+
591+&asm_finish();
--- /dev/null
+++ b/crypto/camellia/.cvsignore
@@ -0,0 +1,3 @@
1+lib
2+Makefile.save
3+cmll-*.s
--- /dev/null
+++ b/crypto/camellia/asm/cmll-x86.pl
@@ -0,0 +1,1138 @@
1+#!/usr/bin/env perl
2+
3+# ====================================================================
4+# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
5+#
6+# This module may be used under the terms of either the GNU General
7+# Public License version 2 or later, the GNU Lesser General Public
8+# License version 2.1 or later, the Mozilla Public License version
9+# 1.1 or the BSD License. The exact terms of either license are
10+# distributed along with this module. For further details see
11+# http://www.openssl.org/~appro/camellia/.
12+# ====================================================================
13+
14+# Performance in cycles per processed byte (less is better) in
15+# 'openssl speed ...' benchmark:
16+#
17+# AMD K8 Core2 PIII P4
18+# -evp camellia-128-ecb 21.5 22.8 27.0 28.9
19+# + over gcc 3.4.6 +90/11% +70/10% +53/4% +160/64%
20+# + over icc 8.0 +48/19% +21/15% +21/17% +55/37%
21+#
22+# camellia-128-cbc 17.3 21.1 23.9 25.9
23+#
24+# 128-bit key setup 196 280 256 240 cycles/key
25+# + over gcc 3.4.6 +30/0% +17/11% +11/0% +63/40%
26+# + over icc 8.0 +18/3% +10/0% +10/3% +21/10%
27+#
28+# Pairs of numbers in "+" rows represent performance improvement over
29+# compiler generated position-independent code, PIC, and non-PIC
30+# respectively. PIC results are of greater relevance, as this module
31+# is position-independent, i.e. suitable for a shared library or PIE.
32+# Position independence "costs" one register, which is why compilers
33+# are so close with non-PIC results, they have an extra register to
34+# spare. CBC results are better than ECB ones thanks to "zero-copy"
35+# private _x86_* interface, and are ~30-40% better than with compiler
36+# generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
37+# same CPU (where applicable).
38+
39+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # $dir = directory part of this script's path
40+push(@INC,"${dir}","${dir}../../perlasm"); # look for modules next to the script and in ../../perlasm
41+require "x86asm.pl";
42+
43+$OPENSSL=1; # when set, the Camellia_encrypt/Camellia_decrypt entry points are generated too
44+
45+&asm_init($ARGV[0],"cmll-586.pl",$ARGV[$#ARGV] eq "386");
46+
47+@T=("eax","ebx","ecx","edx"); # registers carrying the four 32-bit words of the block
48+$idx="esi";
49+$key="edi";
50+$Tbl="ebp";
51+
52+# stack frame layout in _x86_Camellia_* routines, frame is allocated
53+# by caller
54+$__ra=&DWP(0,"esp"); # return address
55+$__s0=&DWP(4,"esp"); # s0 backing store
56+$__s1=&DWP(8,"esp"); # s1 backing store
57+$__s2=&DWP(12,"esp"); # s2 backing store
58+$__s3=&DWP(16,"esp"); # s3 backing store
59+$__end=&DWP(20,"esp"); # pointer to end/start of key schedule
60+
61+# stack frame layout in Camellia_[en|de]crypt routines, which differs from
62+# above by 4 and overlaps by pointer to end/start of key schedule
63+$_end=&DWP(16,"esp"); # same memory as $__end once the callee's return address is pushed
64+$_esp=&DWP(20,"esp"); # caller's saved %esp
65+
66+# const unsigned int Camellia_SBOX[4][256];
67+# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
68+# and [2][] - with [3][]. This is done to optimize code size.
69+$SBOX1_1110=0; # Camellia_SBOX[0]
70+$SBOX4_4404=4; # Camellia_SBOX[1]
71+$SBOX2_0222=2048; # Camellia_SBOX[2]
72+$SBOX3_3033=2052; # Camellia_SBOX[3]
73+&static_label("Camellia_SIGMA");
74+&static_label("Camellia_SBOX");
75+
76+sub Camellia_Feistel { # emits one Feistel round; args: round index, key offset seed, stack frame bias
77+my $i=@_[0]; # round index within the unrolled group
78+my $seed=defined(@_[1])?@_[1]:0; # byte offset of the first round key relative to $key (default 0)
79+my $scale=$seed<0?-8:8; # a negative seed makes the round keys be read at descending offsets
80+my $frame=defined(@_[2])?@_[2]:0; # byte bias applied to the s[0-3] stack slots (default 0)
81+my $j=($i&1)*2; # odd rounds use the register assignment rotated by two
82+my $t0=@T[($j)%4],$t1=@T[($j+1)%4],$t2=@T[($j+2)%4],$t3=@T[($j+3)%4]; # NOTE(review): "my" binds only $t0 here; $t1-$t3 are assigned as package globals
83+
84+ &xor ($t0,$idx); # t0^=key[0]
85+ &xor ($t1,&DWP($seed+$i*$scale+4,$key)); # t1^=key[1]
86+ &movz ($idx,&HB($t0)); # (t0>>8)&0xff
87+ &mov ($t3,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t3=SBOX3_3033[0]
88+ &movz ($idx,&LB($t0)); # (t0>>0)&0xff
89+ &xor ($t3,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t3^=SBOX4_4404[0]
90+ &shr ($t0,16);
91+ &movz ($idx,&LB($t1)); # (t1>>0)&0xff
92+ &mov ($t2,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t2=SBOX1_1110[1]
93+ &movz ($idx,&HB($t0)); # (t0>>24)&0xff
94+ &xor ($t3,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t3^=SBOX1_1110[0]
95+ &movz ($idx,&HB($t1)); # (t1>>8)&0xff
96+ &xor ($t2,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t2^=SBOX4_4404[1]
97+ &shr ($t1,16);
98+ &movz ($t0,&LB($t0)); # (t0>>16)&0xff
99+ &xor ($t3,&DWP($SBOX2_0222,$Tbl,$t0,8)); # t3^=SBOX2_0222[0]
100+ &movz ($idx,&HB($t1)); # (t1>>24)&0xff
101+ &mov ($t0,&DWP($frame+4*(($j+3)%4),"esp")); # prefetch "s3"
102+ &xor ($t2,$t3); # t2^=t3
103+ &rotr ($t3,8); # t3=RightRotate(t3,8)
104+ &xor ($t2,&DWP($SBOX2_0222,$Tbl,$idx,8)); # t2^=SBOX2_0222[1]
105+ &movz ($idx,&LB($t1)); # (t1>>16)&0xff
106+ &mov ($t1,&DWP($frame+4*(($j+2)%4),"esp")); # prefetch "s2"
107+ &xor ($t3,$t0); # t3^=s3
108+ &xor ($t2,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t2^=SBOX3_3033[1]
109+ &mov ($idx,&DWP($seed+($i+1)*$scale,$key)); # prefetch key[i+1]
110+ &xor ($t3,$t2); # t3^=t2
111+ &mov (&DWP($frame+4*(($j+3)%4),"esp"),$t3); # s3=t3
112+ &xor ($t2,$t1); # t2^=s2
113+ &mov (&DWP($frame+4*(($j+2)%4),"esp"),$t2); # s2=t2
114+}
115+
116+# void Camellia_EncryptBlock_Rounds(
117+# int grandRounds,
118+# const Byte plaintext[],
119+# const KEY_TABLE_TYPE keyTable,
120+# Byte ciphertext[])
121+&function_begin("Camellia_EncryptBlock_Rounds");
122+ &mov ("eax",&wparam(0)); # load grandRounds
123+ &mov ($idx,&wparam(1)); # load plaintext pointer
124+ &mov ($key,&wparam(2)); # load key schedule pointer
125+
126+ &mov ("ebx","esp"); # remember caller's %esp, stored in the frame below
127+ &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
128+ &and ("esp",-64);
129+
130+ # place stack frame just "above mod 1024" the key schedule
131+ # this ensures that cache associativity of 2 suffices
132+ &lea ("ecx",&DWP(-64-63,$key));
133+ &sub ("ecx","esp");
134+ &neg ("ecx");
135+ &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
136+ &sub ("esp","ecx");
137+ &add ("esp",4); # 4 is reserved for callee's return address
138+
139+ &shl ("eax",6); # grandRounds*64
140+ &lea ("eax",&DWP(0,$key,"eax")); # keyEnd=key+grandRounds*64
141+ &mov ($_esp,"ebx"); # save %esp
142+ &mov ($_end,"eax"); # save keyEnd
143+
144+ &call (&label("pic_point")); # call/pop pair obtains the current address for position-independent addressing
145+ &set_label("pic_point");
146+ &blindpop($Tbl);
147+ &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); # $Tbl=&Camellia_SBOX
148+
149+ &mov (@T[0],&DWP(0,$idx)); # load plaintext
150+ &mov (@T[1],&DWP(4,$idx));
151+ &mov (@T[2],&DWP(8,$idx));
152+ &bswap (@T[0]); # byte-swap each loaded word
153+ &mov (@T[3],&DWP(12,$idx));
154+ &bswap (@T[1]);
155+ &bswap (@T[2]);
156+ &bswap (@T[3]);
157+
158+ &call ("_x86_Camellia_encrypt");
159+
160+ &mov ("esp",$_esp); # back on the caller's stack, so &wparam() is valid again
161+ &bswap (@T[0]); # swap the result words back before storing
162+ &mov ($idx,&wparam(3)); # load ciphertext pointer
163+ &bswap (@T[1]);
164+ &bswap (@T[2]);
165+ &bswap (@T[3]);
166+ &mov (&DWP(0,$idx),@T[0]); # write ciphertext
167+ &mov (&DWP(4,$idx),@T[1]);
168+ &mov (&DWP(8,$idx),@T[2]);
169+ &mov (&DWP(12,$idx),@T[3]);
170+&function_end("Camellia_EncryptBlock_Rounds");
171+# V1.x API: first argument is keyBitLength; it is rewritten to grandRounds and control passes to Camellia_EncryptBlock_Rounds
172+&function_begin_B("Camellia_EncryptBlock");
173+ &mov ("eax",128);
174+ &sub ("eax",&wparam(0)); # load keyBitLength; 128-keyBitLength sets CF when keyBitLength>128
175+ &mov ("eax",3); # mov leaves CF intact
176+ &adc ("eax",0); # keyBitLength==128?3:4
177+ &mov (&wparam(0),"eax"); # overwrite the first argument in place with grandRounds
178+ &jmp (&label("Camellia_EncryptBlock_Rounds")); # jump, not call: the argument block is reused as is
179+&function_end_B("Camellia_EncryptBlock");
180+
181+if ($OPENSSL) {
182+# void Camellia_encrypt(
183+# const unsigned char *in,
184+# unsigned char *out,
185+# const CAMELLIA_KEY *key)
186+&function_begin("Camellia_encrypt");
187+ &mov ($idx,&wparam(0)); # load plaintext pointer
188+ &mov ($key,&wparam(2)); # load key schedule pointer
189+
190+ &mov ("ebx","esp"); # remember caller's %esp, stored in the frame below
191+ &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
192+ &and ("esp",-64);
193+ &mov ("eax",&DWP(272,$key)); # load grandRounds counter
194+
195+ # place stack frame just "above mod 1024" the key schedule
196+ # this ensures that cache associativity of 2 suffices
197+ &lea ("ecx",&DWP(-64-63,$key));
198+ &sub ("ecx","esp");
199+ &neg ("ecx");
200+ &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
201+ &sub ("esp","ecx");
202+ &add ("esp",4); # 4 is reserved for callee's return address
203+
204+ &shl ("eax",6); # grandRounds*64
205+ &lea ("eax",&DWP(0,$key,"eax")); # keyEnd=key+grandRounds*64
206+ &mov ($_esp,"ebx"); # save %esp
207+ &mov ($_end,"eax"); # save keyEnd
208+
209+ &call (&label("pic_point")); # call/pop pair obtains the current address for position-independent addressing
210+ &set_label("pic_point");
211+ &blindpop($Tbl);
212+ &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); # $Tbl=&Camellia_SBOX
213+
214+ &mov (@T[0],&DWP(0,$idx)); # load plaintext
215+ &mov (@T[1],&DWP(4,$idx));
216+ &mov (@T[2],&DWP(8,$idx));
217+ &bswap (@T[0]); # byte-swap each loaded word
218+ &mov (@T[3],&DWP(12,$idx));
219+ &bswap (@T[1]);
220+ &bswap (@T[2]);
221+ &bswap (@T[3]);
222+
223+ &call ("_x86_Camellia_encrypt");
224+
225+ &mov ("esp",$_esp); # back on the caller's stack, so &wparam() is valid again
226+ &bswap (@T[0]); # swap the result words back before storing
227+ &mov ($idx,&wparam(1)); # load ciphertext pointer
228+ &bswap (@T[1]);
229+ &bswap (@T[2]);
230+ &bswap (@T[3]);
231+ &mov (&DWP(0,$idx),@T[0]); # write ciphertext
232+ &mov (&DWP(4,$idx),@T[1]);
233+ &mov (&DWP(8,$idx),@T[2]);
234+ &mov (&DWP(12,$idx),@T[3]);
235+&function_end("Camellia_encrypt");
236+}
237+
238+&function_begin_B("_x86_Camellia_encrypt");
239+ &xor (@T[0],&DWP(0,$key)); # ^=key[0-3]
240+ &xor (@T[1],&DWP(4,$key));
241+ &xor (@T[2],&DWP(8,$key));
242+ &xor (@T[3],&DWP(12,$key));
243+ &mov ($idx,&DWP(16,$key)); # prefetch key[4]
244+
245+ &mov ($__s0,@T[0]); # save s[0-3]
246+ &mov ($__s1,@T[1]);
247+ &mov ($__s2,@T[2]);
248+ &mov ($__s3,@T[3]);
249+
250+&set_label("loop",16);
251+ for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); } # six unrolled rounds; keys start at offset 16, frame bias 4 skips the return address slot
252+
253+ &add ($key,16*4); # advance $key by 64 bytes
254+ &cmp ($key,$__end);
255+ &je (&label("done")); # reached keyEnd stored by the caller
256+
257+ # @T[0-1] are preloaded, $idx is preloaded with key[0]
258+ &and ($idx,@T[0]);
259+ &mov (@T[3],$__s3);
260+ &rotl ($idx,1);
261+ &mov (@T[2],@T[3]);
262+ &xor (@T[1],$idx);
263+ &or (@T[2],&DWP(12,$key));
264+ &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[0],1);
265+ &xor (@T[2],$__s2);
266+
267+ &mov ($idx,&DWP(4,$key));
268+ &mov ($__s2,@T[2]); # s2^=s3|key[3];
269+ &or ($idx,@T[1]);
270+ &and (@T[2],&DWP(8,$key));
271+ &xor (@T[0],$idx);
272+ &rotl (@T[2],1);
273+ &mov ($__s0,@T[0]); # s0^=s1|key[1];
274+ &xor (@T[3],@T[2]);
275+ &mov ($idx,&DWP(16,$key)); # prefetch key[4]
276+ &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[2],1);
277+ &jmp (&label("loop"));
278+
279+&set_label("done",8);
280+ &mov (@T[2],@T[0]); # SwapHalf
281+ &mov (@T[3],@T[1]);
282+ &mov (@T[0],$__s2);
283+ &mov (@T[1],$__s3);
284+ &xor (@T[0],$idx); # $idx is preloaded with key[0]
285+ &xor (@T[1],&DWP(4,$key));
286+ &xor (@T[2],&DWP(8,$key));
287+ &xor (@T[3],&DWP(12,$key));
288+ &ret (); # result is left in @T[0-3]
289+&function_end_B("_x86_Camellia_encrypt");
290+
291+# void Camellia_DecryptBlock_Rounds(
292+# int grandRounds,
293+# const Byte ciphertext[],
294+# const KEY_TABLE_TYPE keyTable,
295+# Byte plaintext[])
296+&function_begin("Camellia_DecryptBlock_Rounds");
297+ &mov ("eax",&wparam(0)); # load grandRounds
298+ &mov ($idx,&wparam(1)); # load ciphertext pointer
299+ &mov ($key,&wparam(2)); # load key schedule pointer
300+
301+ &mov ("ebx","esp"); # remember caller's %esp, stored in the frame below
302+ &sub ("esp",7*4); # place for s[0-3],keyStart,esp and ra
303+ &and ("esp",-64);
304+
305+ # place stack frame just "above mod 1024" the key schedule
306+ # this ensures that cache associativity of 2 suffices
307+ &lea ("ecx",&DWP(-64-63,$key));
308+ &sub ("ecx","esp");
309+ &neg ("ecx");
310+ &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
311+ &sub ("esp","ecx");
312+ &add ("esp",4); # 4 is reserved for callee's return address
313+
314+ &shl ("eax",6); # grandRounds*64
315+ &mov (&DWP(4*4,"esp"),$key); # save keyStart
316+ &lea ($key,&DWP(0,$key,"eax")); # $key=keyStart+grandRounds*64, i.e. the far end of the schedule
317+ &mov (&DWP(5*4,"esp"),"ebx");# save %esp
318+
319+ &call (&label("pic_point")); # call/pop pair obtains the current address for position-independent addressing
320+ &set_label("pic_point");
321+ &blindpop($Tbl);
322+ &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); # $Tbl=&Camellia_SBOX
323+
324+ &mov (@T[0],&DWP(0,$idx)); # load ciphertext
325+ &mov (@T[1],&DWP(4,$idx));
326+ &mov (@T[2],&DWP(8,$idx));
327+ &bswap (@T[0]); # byte-swap each loaded word
328+ &mov (@T[3],&DWP(12,$idx));
329+ &bswap (@T[1]);
330+ &bswap (@T[2]);
331+ &bswap (@T[3]);
332+
333+ &call ("_x86_Camellia_decrypt");
334+
335+ &mov ("esp",&DWP(5*4,"esp")); # restore caller's %esp saved above
336+ &bswap (@T[0]); # swap the result words back before storing
337+ &mov ($idx,&wparam(3)); # load plaintext pointer
338+ &bswap (@T[1]);
339+ &bswap (@T[2]);
340+ &bswap (@T[3]);
341+ &mov (&DWP(0,$idx),@T[0]); # write plaintext
342+ &mov (&DWP(4,$idx),@T[1]);
343+ &mov (&DWP(8,$idx),@T[2]);
344+ &mov (&DWP(12,$idx),@T[3]);
345+&function_end("Camellia_DecryptBlock_Rounds");
346+# V1.x API: first argument is keyBitLength; it is rewritten to grandRounds and control passes to Camellia_DecryptBlock_Rounds
347+&function_begin_B("Camellia_DecryptBlock");
348+ &mov ("eax",128);
349+ &sub ("eax",&wparam(0)); # load keyBitLength; 128-keyBitLength sets CF when keyBitLength>128
350+ &mov ("eax",3); # mov leaves CF intact
351+ &adc ("eax",0); # keyBitLength==128?3:4
352+ &mov (&wparam(0),"eax"); # overwrite the first argument in place with grandRounds
353+ &jmp (&label("Camellia_DecryptBlock_Rounds")); # jump, not call: the argument block is reused as is
354+&function_end_B("Camellia_DecryptBlock");
355+
356+if ($OPENSSL) {
357+# void Camellia_decrypt(
358+# const unsigned char *in,
359+# unsigned char *out,
360+# const CAMELLIA_KEY *key)
361+&function_begin("Camellia_decrypt");
362+ &mov ($idx,&wparam(0)); # load ciphertext pointer
363+ &mov ($key,&wparam(2)); # load key schedule pointer
364+
365+ &mov ("ebx","esp"); # remember caller's %esp, stored in the frame below
366+ &sub ("esp",7*4); # place for s[0-3],keyStart,esp and ra
367+ &and ("esp",-64);
368+ &mov ("eax",&DWP(272,$key)); # load grandRounds counter
369+
370+ # place stack frame just "above mod 1024" the key schedule
371+ # this ensures that cache associativity of 2 suffices
372+ &lea ("ecx",&DWP(-64-63,$key));
373+ &sub ("ecx","esp");
374+ &neg ("ecx");
375+ &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
376+ &sub ("esp","ecx");
377+ &add ("esp",4); # 4 is reserved for callee's return address
378+
379+ &shl ("eax",6); # grandRounds*64
380+ &mov (&DWP(4*4,"esp"),$key); # save keyStart
381+ &lea ($key,&DWP(0,$key,"eax")); # $key=keyStart+grandRounds*64, i.e. the far end of the schedule
382+ &mov (&DWP(5*4,"esp"),"ebx");# save %esp
383+
384+ &call (&label("pic_point")); # call/pop pair obtains the current address for position-independent addressing
385+ &set_label("pic_point");
386+ &blindpop($Tbl);
387+ &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl)); # $Tbl=&Camellia_SBOX
388+
389+ &mov (@T[0],&DWP(0,$idx)); # load ciphertext
390+ &mov (@T[1],&DWP(4,$idx));
391+ &mov (@T[2],&DWP(8,$idx));
392+ &bswap (@T[0]); # byte-swap each loaded word
393+ &mov (@T[3],&DWP(12,$idx));
394+ &bswap (@T[1]);
395+ &bswap (@T[2]);
396+ &bswap (@T[3]);
397+
398+ &call ("_x86_Camellia_decrypt");
399+
400+ &mov ("esp",&DWP(5*4,"esp")); # restore caller's %esp saved above
401+ &bswap (@T[0]); # swap the result words back before storing
402+ &mov ($idx,&wparam(1)); # load plaintext pointer
403+ &bswap (@T[1]);
404+ &bswap (@T[2]);
405+ &bswap (@T[3]);
406+ &mov (&DWP(0,$idx),@T[0]); # write plaintext
407+ &mov (&DWP(4,$idx),@T[1]);
408+ &mov (&DWP(8,$idx),@T[2]);
409+ &mov (&DWP(12,$idx),@T[3]);
410+&function_end("Camellia_decrypt");
411+}
412+
413+&function_begin_B("_x86_Camellia_decrypt"); # internal routine: block state in @T[0-3], $key is walked downwards until it equals $__end, result returned in @T[0-3]
414+ &xor (@T[0],&DWP(0,$key)); # ^=key[0-3]
415+ &xor (@T[1],&DWP(4,$key));
416+ &xor (@T[2],&DWP(8,$key));
417+ &xor (@T[3],&DWP(12,$key));
418+ &mov ($idx,&DWP(-8,$key)); # prefetch key[-2]
419+
420+ &mov ($__s0,@T[0]); # save s[0-3]
421+ &mov ($__s1,@T[1]);
422+ &mov ($__s2,@T[2]);
423+ &mov ($__s3,@T[3]);
424+
425+&set_label("loop",16);
426+ for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); } # six Feistel rounds are emitted back to back at generation time
427+
428+ &sub ($key,16*4); # step the key pointer down by 64 bytes
429+ &cmp ($key,$__end);
430+ &je (&label("done")); # reached $__end: skip the and/or/rotate key mixing below and finish
431+
432+ # @T[0-1] are preloaded, $idx is preloaded with key[2]
433+ &and ($idx,@T[0]);
434+ &mov (@T[3],$__s3);
435+ &rotl ($idx,1);
436+ &mov (@T[2],@T[3]);
437+ &xor (@T[1],$idx);
438+ &or (@T[2],&DWP(4,$key));
439+ &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[0],1);
440+ &xor (@T[2],$__s2);
441+
442+ &mov ($idx,&DWP(12,$key));
443+ &mov ($__s2,@T[2]); # s2^=s3|key[3];
444+ &or ($idx,@T[1]);
445+ &and (@T[2],&DWP(0,$key));
446+ &xor (@T[0],$idx);
447+ &rotl (@T[2],1);
448+ &mov ($__s0,@T[0]); # s0^=s1|key[1];
449+ &xor (@T[3],@T[2]);
450+ &mov ($idx,&DWP(-8,$key)); # prefetch key[4]
451+ &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[2],1);
452+ &jmp (&label("loop")); # next group of six rounds
453+
454+&set_label("done",8);
455+ &mov (@T[2],@T[0]); # SwapHalf
456+ &mov (@T[3],@T[1]);
457+ &mov (@T[0],$__s2);
458+ &mov (@T[1],$__s3);
459+ &xor (@T[2],$idx); # $idx is preloaded with key[2]
460+ &xor (@T[3],&DWP(12,$key));
461+ &xor (@T[0],&DWP(0,$key));
462+ &xor (@T[1],&DWP(4,$key));
463+ &ret ();
464+&function_end_B("_x86_Camellia_decrypt");
465+
466+# shld is very slow on Intel P4 family. Even on AMD it limits
467+# instruction decode rate [because it's VectorPath] and consequently
468+# performance. PIII, PM and Core[2] seem to be the only ones which
469+# execute this code ~7% faster...
470+sub __rotl128 { # shld-based variant: rotate the 128-bit value held in four registers left by $rot bits and store selected words
471+ my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_; # $i0..$i3: registers, most significant first; $rnd: destination slot in 8-byte units; @T: registers to store, in order
472+
473+ $rnd *= 2; # convert the 8-byte slot number into a 4-byte word index
474+ if ($rot) {
475+ &mov ($idx,$i0); # keep the original top word: its high bits wrap around into $i3
476+ &shld ($i0,$i1,$rot);
477+ &shld ($i1,$i2,$rot);
478+ &shld ($i2,$i3,$rot);
479+ &shld ($i3,$idx,$rot);
480+ }
481+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]); # a word is stored (and $rnd advanced) only if its register is next in @T
482+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
483+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
484+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
485+}
486+
487+# ... Implementing 128-bit rotate without shld gives >3x performance
488+# improvement on P4, only ~7% degradation on other Intel CPUs and
489+# not worse performance on AMD. This is therefore preferred.
490+sub _rotl128 { # shl/shr/or variant: rotate the 128-bit value held in four registers left by $rot bits and store selected words; clobbers $Tbl and $idx
491+ my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_; # $i0..$i3: registers, most significant first; $rnd: destination slot in 8-byte units; @T: registers to store, in order
492+
493+ $rnd *= 2; # convert the 8-byte slot number into a 4-byte word index
494+ if ($rot) { # the 32-$rot shift counts below only make sense for 0<$rot<32
495+ &mov ($Tbl,$i0); # keep the original top word: its high bits wrap around into $i3
496+ &shl ($i0,$rot);
497+ &mov ($idx,$i1);
498+ &shr ($idx,32-$rot);
499+ &shl ($i1,$rot);
500+ &or ($i0,$idx); # i0=(i0<<rot)|(i1>>(32-rot))
501+ &mov ($idx,$i2);
502+ &shl ($i2,$rot);
503+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]); # a word is stored (and $rnd advanced) only if its register is next in @T
504+ &shr ($idx,32-$rot);
505+ &or ($i1,$idx); # i1=(i1<<rot)|(i2>>(32-rot))
506+ &shr ($Tbl,32-$rot);
507+ &mov ($idx,$i3);
508+ &shr ($idx,32-$rot);
509+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
510+ &shl ($i3,$rot);
511+ &or ($i2,$idx); # i2=(i2<<rot)|(i3>>(32-rot))
512+ &or ($i3,$Tbl); # i3=(i3<<rot)|(original i0>>(32-rot))
513+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
514+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
515+ } else { # zero rotation: nothing to shift, just store the requested words
516+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
517+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
518+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
519+ &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
520+ }
521+}
522+
523+sub _saveround { # store up to four registers into 8-byte slot $rnd relative to $key
524+my ($rnd,$key,@T)=@_;
525+my $bias=int(@T[0])?shift(@T):0; # optional leading numeric displacement (callers pass -128); register names evaluate to 0 under int()
526+
527+ &mov (&DWP($bias+$rnd*8+0,$key),@T[0]);
528+ &mov (&DWP($bias+$rnd*8+4,$key),@T[1]) if ($#T>=1); # remaining words are stored only if that many registers were passed
529+ &mov (&DWP($bias+$rnd*8+8,$key),@T[2]) if ($#T>=2);
530+ &mov (&DWP($bias+$rnd*8+12,$key),@T[3]) if ($#T>=3);
531+}
532+
533+sub _loadround { # load up to four registers from 8-byte slot $rnd relative to $key (inverse of _saveround)
534+my ($rnd,$key,@T)=@_;
535+my $bias=int(@T[0])?shift(@T):0; # optional leading numeric displacement (callers pass -128); register names evaluate to 0 under int()
536+
537+ &mov (@T[0],&DWP($bias+$rnd*8+0,$key));
538+ &mov (@T[1],&DWP($bias+$rnd*8+4,$key)) if ($#T>=1); # remaining words are loaded only if that many registers were passed
539+ &mov (@T[2],&DWP($bias+$rnd*8+8,$key)) if ($#T>=2);
540+ &mov (@T[3],&DWP($bias+$rnd*8+12,$key)) if ($#T>=3);
541+}
542+
543+# void Camellia_Ekeygen(
544+# const int keyBitLength,
545+# const Byte *rawKey,
546+# KEY_TABLE_TYPE keyTable)
547+&function_begin("Camellia_Ekeygen");
548+{ my $step=0;
549+
550+ &stack_push(4); # place for s[0-3]
551+
552+ &mov ($Tbl,&wparam(0)); # load arguments
553+ &mov ($idx,&wparam(1));
554+ &mov ($key,&wparam(2));
555+
556+ &mov (@T[0],&DWP(0,$idx)); # load 0-127 bits
557+ &mov (@T[1],&DWP(4,$idx));
558+ &mov (@T[2],&DWP(8,$idx));
559+ &mov (@T[3],&DWP(12,$idx));
560+
561+ &bswap (@T[0]);
562+ &bswap (@T[1]);
563+ &bswap (@T[2]);
564+ &bswap (@T[3]);
565+
566+ &_saveround (0,$key,@T); # KL<<<0
567+
568+ &cmp ($Tbl,128);
569+ &je (&label("1st128"));
570+
571+ &mov (@T[0],&DWP(16,$idx)); # load 128-191 bits
572+ &mov (@T[1],&DWP(20,$idx));
573+ &cmp ($Tbl,192);
574+ &je (&label("1st192"));
575+ &mov (@T[2],&DWP(24,$idx)); # load 192-255 bits
576+ &mov (@T[3],&DWP(28,$idx));
577+ &jmp (&label("1st256"));
578+&set_label("1st192",4);
579+ &mov (@T[2],@T[0]);
580+ &mov (@T[3],@T[1]);
581+ &not (@T[2]);
582+ &not (@T[3]);
583+&set_label("1st256",4);
584+ &bswap (@T[0]);
585+ &bswap (@T[1]);
586+ &bswap (@T[2]);
587+ &bswap (@T[3]);
588+
589+ &_saveround (4,$key,@T); # temporary storage for KR!
590+
591+ &xor (@T[0],&DWP(0*8+0,$key)); # KR^KL
592+ &xor (@T[1],&DWP(0*8+4,$key));
593+ &xor (@T[2],&DWP(1*8+0,$key));
594+ &xor (@T[3],&DWP(1*8+4,$key));
595+
596+&set_label("1st128",4);
597+ &call (&label("pic_point"));
598+ &set_label("pic_point");
599+ &blindpop($Tbl);
600+ &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
601+ &lea ($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl));
602+
603+ &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[0]
604+ &mov (&swtmp(0),@T[0]); # save s[0-3]
605+ &mov (&swtmp(1),@T[1]);
606+ &mov (&swtmp(2),@T[2]);
607+ &mov (&swtmp(3),@T[3]);
608+ &Camellia_Feistel($step++);
609+ &Camellia_Feistel($step++);
610+ &mov (@T[2],&swtmp(2));
611+ &mov (@T[3],&swtmp(3));
612+
613+ &mov ($idx,&wparam(2));
614+ &xor (@T[0],&DWP(0*8+0,$idx)); # ^KL
615+ &xor (@T[1],&DWP(0*8+4,$idx));
616+ &xor (@T[2],&DWP(1*8+0,$idx));
617+ &xor (@T[3],&DWP(1*8+4,$idx));
618+
619+ &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[4]
620+ &mov (&swtmp(0),@T[0]); # save s[0-3]
621+ &mov (&swtmp(1),@T[1]);
622+ &mov (&swtmp(2),@T[2]);
623+ &mov (&swtmp(3),@T[3]);
624+ &Camellia_Feistel($step++);
625+ &Camellia_Feistel($step++);
626+ &mov (@T[2],&swtmp(2));
627+ &mov (@T[3],&swtmp(3));
628+
629+ &mov ($idx,&wparam(0));
630+ &cmp ($idx,128);
631+ &jne (&label("2nd256"));
632+
633+ &mov ($key,&wparam(2));
634+ &lea ($key,&DWP(128,$key)); # size optimization
635+
636+ ####### process KA
637+ &_saveround (2,$key,-128,@T); # KA<<<0
638+ &_rotl128 (@T,15,6,@T); # KA<<<15
639+ &_rotl128 (@T,15,8,@T); # KA<<<(15+15=30)
640+ &_rotl128 (@T,15,12,@T[0],@T[1]); # KA<<<(30+15=45)
641+ &_rotl128 (@T,15,14,@T); # KA<<<(45+15=60)
642+ push (@T,shift(@T)); # rotl128(@T,32);
643+ &_rotl128 (@T,2,20,@T); # KA<<<(60+32+2=94)
644+ &_rotl128 (@T,17,24,@T); # KA<<<(94+17=111)
645+
646+ ####### process KL
647+ &_loadround (0,$key,-128,@T); # load KL
648+ &_rotl128 (@T,15,4,@T); # KL<<<15
649+ &_rotl128 (@T,30,10,@T); # KL<<<(15+30=45)
650+ &_rotl128 (@T,15,13,@T[2],@T[3]); # KL<<<(45+15=60)
651+ &_rotl128 (@T,17,16,@T); # KL<<<(60+17=77)
652+ &_rotl128 (@T,17,18,@T); # KL<<<(77+17=94)
653+ &_rotl128 (@T,17,22,@T); # KL<<<(94+17=111)
654+
655+ while (@T[0] ne "eax") # restore order
656+ { unshift (@T,pop(@T)); }
657+
658+ &mov ("eax",3); # 3 grandRounds
659+ &jmp (&label("done"));
660+
661+&set_label("2nd256",16);
662+ &mov ($idx,&wparam(2));
663+ &_saveround (6,$idx,@T); # temporary storage for KA!
664+
665+ &xor (@T[0],&DWP(4*8+0,$idx)); # KA^KR
666+ &xor (@T[1],&DWP(4*8+4,$idx));
667+ &xor (@T[2],&DWP(5*8+0,$idx));
668+ &xor (@T[3],&DWP(5*8+4,$idx));
669+
670+ &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[8]
671+ &mov (&swtmp(0),@T[0]); # save s[0-3]
672+ &mov (&swtmp(1),@T[1]);
673+ &mov (&swtmp(2),@T[2]);
674+ &mov (&swtmp(3),@T[3]);
675+ &Camellia_Feistel($step++);
676+ &Camellia_Feistel($step++);
677+ &mov (@T[2],&swtmp(2));
678+ &mov (@T[3],&swtmp(3));
679+
680+ &mov ($key,&wparam(2));
681+ &lea ($key,&DWP(128,$key)); # size optimization
682+
683+ ####### process KB
684+ &_saveround (2,$key,-128,@T); # KB<<<0
685+ &_rotl128 (@T,30,10,@T); # KB<<<30
686+ &_rotl128 (@T,30,20,@T); # KB<<<(30+30=60)
687+ push (@T,shift(@T)); # rotl128(@T,32);
688+ &_rotl128 (@T,19,32,@T); # KB<<<(60+32+19=111)
689+
690+ ####### process KR
691+ &_loadround (4,$key,-128,@T); # load KR
692+ &_rotl128 (@T,15,4,@T); # KR<<<15
693+ &_rotl128 (@T,15,8,@T); # KR<<<(15+15=30)
694+ &_rotl128 (@T,30,18,@T); # KR<<<(30+30=60)
695+ push (@T,shift(@T)); # rotl128(@T,32);
696+ &_rotl128 (@T,2,26,@T); # KR<<<(60+32+2=94)
697+
698+ ####### process KA
699+ &_loadround (6,$key,-128,@T); # load KA
700+ &_rotl128 (@T,15,6,@T); # KA<<<15
701+ &_rotl128 (@T,30,14,@T); # KA<<<(15+30=45)
702+ push (@T,shift(@T)); # rotl128(@T,32);
703+ &_rotl128 (@T,0,24,@T); # KA<<<(45+32+0=77)
704+ &_rotl128 (@T,17,28,@T); # KA<<<(77+17=94)
705+
706+ ####### process KL
707+ &_loadround (0,$key,-128,@T); # load KL
708+ push (@T,shift(@T)); # rotl128(@T,32);
709+ &_rotl128 (@T,13,12,@T); # KL<<<(32+13=45)
710+ &_rotl128 (@T,15,16,@T); # KL<<<(45+15=60)
711+ &_rotl128 (@T,17,22,@T); # KL<<<(60+17=77)
712+ push (@T,shift(@T)); # rotl128(@T,32);
713+ &_rotl128 (@T,2,30,@T); # KL<<<(77+32+2=111)
714+
715+ while (@T[0] ne "eax") # restore order
716+ { unshift (@T,pop(@T)); }
717+
718+ &mov ("eax",4); # 4 grandRounds
719+&set_label("done");
720+ &lea ("edx",&DWP(272-128,$key)); # end of key schedule
721+ &stack_pop(4);
722+}
723+&function_end("Camellia_Ekeygen");
724+
725+if ($OPENSSL) {
726+# int Camellia_set_key (
727+# const unsigned char *userKey,
728+# int bits,
729+# CAMELLIA_KEY *key)
730+&function_begin_B("Camellia_set_key"); # returns 0 on success, -1 if userKey or key is NULL, -2 if bits is not 128, 192 or 256
731+ &push ("ebx");
732+ &mov ("ecx",&wparam(0)); # pull arguments
733+ &mov ("ebx",&wparam(1));
734+ &mov ("edx",&wparam(2));
735+
736+ &mov ("eax",-1); # return value for a NULL pointer argument
737+ &test ("ecx","ecx");
738+ &jz (&label("done")); # userKey==NULL?
739+ &test ("edx","edx");
740+ &jz (&label("done")); # key==NULL?
741+
742+ &mov ("eax",-2); # return value for an unsupported key length
743+ &cmp ("ebx",256);
744+ &je (&label("arg_ok")); # bits==256?
745+ &cmp ("ebx",192);
746+ &je (&label("arg_ok")); # bits==192?
747+ &cmp ("ebx",128);
748+ &jne (&label("done")); # bits!=128?
749+&set_label("arg_ok",4);
750+
751+ &push ("edx"); # push arguments
752+ &push ("ecx"); # pushed last-to-first: Camellia_Ekeygen(keyBitLength,rawKey,keyTable)
753+ &push ("ebx");
754+ &call ("Camellia_Ekeygen");
755+ &stack_pop(3); # presumably discards the three pushed arguments - confirm in x86asm.pl
756+
757+ # eax holds grandRounds and edx points at where to put it
758+ &mov (&DWP(0,"edx"),"eax");
759+ &xor ("eax","eax"); # success: return 0
760+&set_label("done",4);
761+ &pop ("ebx");
762+ &ret ();
763+&function_end_B("Camellia_set_key");
764+}
765+
766+@SBOX=(
767+112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
768+ 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
769+134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
770+166,225, 57,202,213, 71, 93, 61,217, 1, 90,214, 81, 86,108, 77,
771+139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
772+223, 76,203,194, 52,126,118, 5,109,183,169, 49,209, 23, 4,215,
773+ 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
774+254, 68,207,178,195,181,122,145, 36, 8,232,168, 96,252,105, 80,
775+170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
776+ 16,196, 0, 72,163,247,117,219,138, 3,230,218, 9, 63,221,148,
777+135, 92,131, 2,205, 74,144, 51,115,103,246,243,157,127,191,226,
778+ 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
779+233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
780+120,152, 6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
781+114, 7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
782+ 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
783+
784+sub S1110 { my $i=shift; $i=@SBOX[$i]; return $i<<24|$i<<16|$i<<8; } # SBOX[i] in bytes 3, 2 and 1; byte 0 is zero
785+sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; return $i<<24|$i<<16|$i; } # SBOX[i rotated left by 1 within 8 bits] in bytes 3, 2 and 0
786+sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; return $i<<16|$i<<8|$i; } # SBOX[i] rotated left by 1 within 8 bits, in bytes 2, 1 and 0
787+sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; return $i<<24|$i<<8|$i; } # SBOX[i] rotated right by 1 within 8 bits, in bytes 3, 1 and 0
788+
789+&set_label("Camellia_SIGMA",64);
790+&data_word(
791+ 0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2,
792+ 0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c,
793+ 0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd,
794+ 0, 0, 0, 0);
795+&set_label("Camellia_SBOX",64);
796+# tables are interleaved, remember?
797+for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
798+for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }
799+
800+# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
801+# size_t length, const CAMELLIA_KEY *key,
802+# unsigned char *ivp,const int enc);
803+{
804+# stack frame layout
805+# -4(%esp) # return address 0(%esp)
806+# 0(%esp) # s0 4(%esp)
807+# 4(%esp) # s1 8(%esp)
808+# 8(%esp) # s2 12(%esp)
809+# 12(%esp) # s3 16(%esp)
810+# 16(%esp) # end of key schedule 20(%esp)
811+# 20(%esp) # %esp backup
812+my $_inp=&DWP(24,"esp"); #copy of wparam(0)
813+my $_out=&DWP(28,"esp"); #copy of wparam(1)
814+my $_len=&DWP(32,"esp"); #copy of wparam(2)
815+my $_key=&DWP(36,"esp"); #copy of wparam(3)
816+my $_ivp=&DWP(40,"esp"); #copy of wparam(4)
817+my $ivec=&DWP(44,"esp"); #ivec[16]
818+my $_tmp=&DWP(44,"esp"); #volatile variable [yes, aliases with ivec]
819+my ($s0,$s1,$s2,$s3) = @T;
820+
821+&function_begin("Camellia_cbc_encrypt");
822+ &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len
823+ &cmp ($s2,0);
824+ &je (&label("enc_out"));
825+
826+ &pushf ();
827+ &cld ();
828+
829+ &mov ($s0,&wparam(0)); # load inp
830+ &mov ($s1,&wparam(1)); # load out
831+ #&mov ($s2,&wparam(2)); # load len
832+ &mov ($s3,&wparam(3)); # load key
833+ &mov ($Tbl,&wparam(4)); # load ivp
834+
835+ # allocate aligned stack frame...
836+ &lea ($idx,&DWP(-64,"esp"));
837+ &and ($idx,-64);
838+
839+ # place stack frame just "above mod 1024" the key schedule
840+ # this ensures that cache associativity of 2 suffices
841+ &lea ($key,&DWP(-64-63,$s3));
842+ &sub ($key,$idx);
843+ &neg ($key);
844+ &and ($key,0x3C0); # modulo 1024, but aligned to cache-line
845+ &sub ($idx,$key);
846+
847+ &mov ($key,&wparam(5)); # load enc
848+
849+ &exch ("esp",$idx);
850+ &add ("esp",4); # reserve for return address!
851+ &mov ($_esp,$idx); # save %esp
852+
853+ &mov ($_inp,$s0); # save copy of inp
854+ &mov ($_out,$s1); # save copy of out
855+ &mov ($_len,$s2); # save copy of len
856+ &mov ($_key,$s3); # save copy of key
857+ &mov ($_ivp,$Tbl); # save copy of ivp
858+
859+ &call (&label("pic_point")); # make it PIC!
860+ &set_label("pic_point");
861+ &blindpop($Tbl);
862+ &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
863+
864+ &mov ($idx,32);
865+ &set_label("prefetch_sbox",4);
866+ &mov ($s0,&DWP(0,$Tbl));
867+ &mov ($s1,&DWP(32,$Tbl));
868+ &mov ($s2,&DWP(64,$Tbl));
869+ &mov ($s3,&DWP(96,$Tbl));
870+ &lea ($Tbl,&DWP(128,$Tbl));
871+ &dec ($idx);
872+ &jnz (&label("prefetch_sbox"));
873+ &mov ($s0,$_key);
874+ &sub ($Tbl,4096);
875+ &mov ($idx,$_inp);
876+ &mov ($s3,&DWP(272,$s0)); # load grandRounds
877+
878+ &cmp ($key,0);
879+ &je (&label("DECRYPT"));
880+
881+ &mov ($s2,$_len);
882+ &mov ($key,$_ivp);
883+ &shl ($s3,6);
884+ &lea ($s3,&DWP(0,$s0,$s3));
885+ &mov ($_end,$s3);
886+
887+ &test ($s2,0xFFFFFFF0);
888+ &jz (&label("enc_tail")); # short input...
889+
890+ &mov ($s0,&DWP(0,$key)); # load iv
891+ &mov ($s1,&DWP(4,$key));
892+
893+ &set_label("enc_loop",4);
894+ &mov ($s2,&DWP(8,$key));
895+ &mov ($s3,&DWP(12,$key));
896+
897+ &xor ($s0,&DWP(0,$idx)); # xor input data
898+ &xor ($s1,&DWP(4,$idx));
899+ &xor ($s2,&DWP(8,$idx));
900+ &bswap ($s0);
901+ &xor ($s3,&DWP(12,$idx));
902+ &bswap ($s1);
903+ &mov ($key,$_key); # load key
904+ &bswap ($s2);
905+ &bswap ($s3);
906+
907+ &call ("_x86_Camellia_encrypt");
908+
909+ &mov ($idx,$_inp); # load inp
910+ &mov ($key,$_out); # load out
911+
912+ &bswap ($s0);
913+ &bswap ($s1);
914+ &bswap ($s2);
915+ &mov (&DWP(0,$key),$s0); # save output data
916+ &bswap ($s3);
917+ &mov (&DWP(4,$key),$s1);
918+ &mov (&DWP(8,$key),$s2);
919+ &mov (&DWP(12,$key),$s3);
920+
921+ &mov ($s2,$_len); # load len
922+
923+ &lea ($idx,&DWP(16,$idx));
924+ &mov ($_inp,$idx); # save inp
925+
926+ &lea ($s3,&DWP(16,$key));
927+ &mov ($_out,$s3); # save out
928+
929+ &sub ($s2,16);
930+ &test ($s2,0xFFFFFFF0);
931+ &mov ($_len,$s2); # save len
932+ &jnz (&label("enc_loop"));
933+ &test ($s2,15);
934+ &jnz (&label("enc_tail"));
935+ &mov ($idx,$_ivp); # load ivp
936+ &mov ($s2,&DWP(8,$key)); # restore last dwords
937+ &mov ($s3,&DWP(12,$key));
938+ &mov (&DWP(0,$idx),$s0); # save ivec
939+ &mov (&DWP(4,$idx),$s1);
940+ &mov (&DWP(8,$idx),$s2);
941+ &mov (&DWP(12,$idx),$s3);
942+
943+ &mov ("esp",$_esp);
944+ &popf ();
945+ &set_label("enc_out");
946+ &function_end_A();
947+ &pushf (); # kludge, never executed
948+
949+ &set_label("enc_tail",4);
950+ &mov ($s0,$key eq "edi" ? $key : "");
951+ &mov ($key,$_out); # load out
952+ &push ($s0); # push ivp
953+ &mov ($s1,16);
954+ &sub ($s1,$s2);
955+ &cmp ($key,$idx); # compare with inp
956+ &je (&label("enc_in_place"));
957+ &align (4);
958+ &data_word(0xA4F3F689); # rep movsb # copy input
959+ &jmp (&label("enc_skip_in_place"));
960+ &set_label("enc_in_place");
961+ &lea ($key,&DWP(0,$key,$s2));
962+ &set_label("enc_skip_in_place");
963+ &mov ($s2,$s1);
964+ &xor ($s0,$s0);
965+ &align (4);
966+ &data_word(0xAAF3F689); # rep stosb # zero tail
967+ &pop ($key); # pop ivp
968+
969+ &mov ($idx,$_out); # output as input
970+ &mov ($s0,&DWP(0,$key));
971+ &mov ($s1,&DWP(4,$key));
972+ &mov ($_len,16); # len=16
973+ &jmp (&label("enc_loop")); # one more spin...
974+
975+#----------------------------- DECRYPT -----------------------------#
976+&set_label("DECRYPT",16);
977+ &shl ($s3,6);
978+ &lea ($s3,&DWP(0,$s0,$s3));
979+ &mov ($_end,$s0);
980+ &mov ($_key,$s3);
981+
982+ &cmp ($idx,$_out);
983+ &je (&label("dec_in_place")); # in-place processing...
984+
985+ &mov ($key,$_ivp); # load ivp
986+ &mov ($_tmp,$key);
987+
988+ &set_label("dec_loop",4);
989+ &mov ($s0,&DWP(0,$idx)); # read input
990+ &mov ($s1,&DWP(4,$idx));
991+ &mov ($s2,&DWP(8,$idx));
992+ &bswap ($s0);
993+ &mov ($s3,&DWP(12,$idx));
994+ &bswap ($s1);
995+ &mov ($key,$_key); # load key
996+ &bswap ($s2);
997+ &bswap ($s3);
998+
999+ &call ("_x86_Camellia_decrypt");
1000+
1001+ &mov ($key,$_tmp); # load ivp
1002+ &mov ($idx,$_len); # load len
1003+
1004+ &bswap ($s0);
1005+ &bswap ($s1);
1006+ &bswap ($s2);
1007+ &xor ($s0,&DWP(0,$key)); # xor iv
1008+ &bswap ($s3);
1009+ &xor ($s1,&DWP(4,$key));
1010+ &xor ($s2,&DWP(8,$key));
1011+ &xor ($s3,&DWP(12,$key));
1012+
1013+ &sub ($idx,16);
1014+ &jc (&label("dec_partial"));
1015+ &mov ($_len,$idx); # save len
1016+ &mov ($idx,$_inp); # load inp
1017+ &mov ($key,$_out); # load out
1018+
1019+ &mov (&DWP(0,$key),$s0); # write output
1020+ &mov (&DWP(4,$key),$s1);
1021+ &mov (&DWP(8,$key),$s2);
1022+ &mov (&DWP(12,$key),$s3);
1023+
1024+ &mov ($_tmp,$idx); # save ivp
1025+ &lea ($idx,&DWP(16,$idx));
1026+ &mov ($_inp,$idx); # save inp
1027+
1028+ &lea ($key,&DWP(16,$key));
1029+ &mov ($_out,$key); # save out
1030+
1031+ &jnz (&label("dec_loop"));
1032+ &mov ($key,$_tmp); # load temp ivp
1033+ &set_label("dec_end");
1034+ &mov ($idx,$_ivp); # load user ivp
1035+ &mov ($s0,&DWP(0,$key)); # load iv
1036+ &mov ($s1,&DWP(4,$key));
1037+ &mov ($s2,&DWP(8,$key));
1038+ &mov ($s3,&DWP(12,$key));
1039+ &mov (&DWP(0,$idx),$s0); # copy back to user
1040+ &mov (&DWP(4,$idx),$s1);
1041+ &mov (&DWP(8,$idx),$s2);
1042+ &mov (&DWP(12,$idx),$s3);
1043+ &jmp (&label("dec_out"));
1044+
1045+ &set_label("dec_partial",4);
1046+ &lea ($key,$ivec);
1047+ &mov (&DWP(0,$key),$s0); # dump output to stack
1048+ &mov (&DWP(4,$key),$s1);
1049+ &mov (&DWP(8,$key),$s2);
1050+ &mov (&DWP(12,$key),$s3);
1051+ &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$idx));
1052+ &mov ($idx eq "esi" ? $idx : "",$key);
1053+ &mov ($key eq "edi" ? $key : "",$_out); # load out
1054+ &data_word(0xA4F3F689); # rep movsb # copy output
1055+ &mov ($key,$_inp); # use inp as temp ivp
1056+ &jmp (&label("dec_end"));
1057+
1058+ &set_label("dec_in_place",4);
1059+ &set_label("dec_in_place_loop");
1060+ &lea ($key,$ivec);
1061+ &mov ($s0,&DWP(0,$idx)); # read input
1062+ &mov ($s1,&DWP(4,$idx));
1063+ &mov ($s2,&DWP(8,$idx));
1064+ &mov ($s3,&DWP(12,$idx));
1065+
1066+ &mov (&DWP(0,$key),$s0); # copy to temp
1067+ &mov (&DWP(4,$key),$s1);
1068+ &mov (&DWP(8,$key),$s2);
1069+ &bswap ($s0);
1070+ &mov (&DWP(12,$key),$s3);
1071+ &bswap ($s1);
1072+ &mov ($key,$_key); # load key
1073+ &bswap ($s2);
1074+ &bswap ($s3);
1075+
1076+ &call ("_x86_Camellia_decrypt");
1077+
1078+ &mov ($key,$_ivp); # load ivp
1079+ &mov ($idx,$_out); # load out
1080+
1081+ &bswap ($s0);
1082+ &bswap ($s1);
1083+ &bswap ($s2);
1084+ &xor ($s0,&DWP(0,$key)); # xor iv
1085+ &bswap ($s3);
1086+ &xor ($s1,&DWP(4,$key));
1087+ &xor ($s2,&DWP(8,$key));
1088+ &xor ($s3,&DWP(12,$key));
1089+
1090+ &mov (&DWP(0,$idx),$s0); # write output
1091+ &mov (&DWP(4,$idx),$s1);
1092+ &mov (&DWP(8,$idx),$s2);
1093+ &mov (&DWP(12,$idx),$s3);
1094+
1095+ &lea ($idx,&DWP(16,$idx));
1096+ &mov ($_out,$idx); # save out
1097+
1098+ &lea ($idx,$ivec);
1099+ &mov ($s0,&DWP(0,$idx)); # read temp
1100+ &mov ($s1,&DWP(4,$idx));
1101+ &mov ($s2,&DWP(8,$idx));
1102+ &mov ($s3,&DWP(12,$idx));
1103+
1104+ &mov (&DWP(0,$key),$s0); # copy iv
1105+ &mov (&DWP(4,$key),$s1);
1106+ &mov (&DWP(8,$key),$s2);
1107+ &mov (&DWP(12,$key),$s3);
1108+
1109+ &mov ($idx,$_inp); # load inp
1110+
1111+ &lea ($idx,&DWP(16,$idx));
1112+ &mov ($_inp,$idx); # save inp
1113+
1114+ &mov ($s2,$_len); # load len
1115+ &sub ($s2,16);
1116+ &jc (&label("dec_in_place_partial"));
1117+ &mov ($_len,$s2); # save len
1118+ &jnz (&label("dec_in_place_loop"));
1119+ &jmp (&label("dec_out"));
1120+
1121+ &set_label("dec_in_place_partial",4);
1122+ # one can argue if this is actually required...
1123+ &mov ($key eq "edi" ? $key : "",$_out);
1124+ &lea ($idx eq "esi" ? $idx : "",$ivec);
1125+ &lea ($key,&DWP(0,$key,$s2));
1126+ &lea ($idx,&DWP(16,$idx,$s2));
1127+ &neg ($s2 eq "ecx" ? $s2 : "");
1128+ &data_word(0xA4F3F689); # rep movsb # restore tail
1129+
1130+ &set_label("dec_out",4);
1131+ &mov ("esp",$_esp);
1132+ &popf ();
1133+&function_end("Camellia_cbc_encrypt");
1134+}
1135+
1136+&asciz("Camellia for x86 by <appro@openssl.org>");
1137+
1138+&asm_finish();
--- /dev/null
+++ b/crypto/camellia/asm/cmll-x86_64.pl
@@ -0,0 +1,1080 @@
1+#!/usr/bin/env perl
2+
3+# ====================================================================
4+# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
5+#
6+# This module may be used under the terms of either the GNU General
7+# Public License version 2 or later, the GNU Lesser General Public
8+# License version 2.1 or later, the Mozilla Public License version
9+# 1.1 or the BSD License. The exact terms of either license are
10+# distributed along with this module. For further details see
11+# http://www.openssl.org/~appro/camellia/.
12+# ====================================================================
13+
14+# Performance in cycles per processed byte (less is better) in
15+# 'openssl speed ...' benchmark:
16+#
17+# AMD64 Core2 EM64T
18+# -evp camellia-128-ecb 16.7 21.0 22.7
19+# + over gcc 3.4.6 +25% +5% 0%
20+#
21+# camellia-128-cbc 15.7 20.4 21.1
22+#
23+# 128-bit key setup 128 216 205 cycles/key
24+# + over gcc 3.4.6 +54% +39% +15%
25+#
26+# Numbers in "+" rows represent performance improvement over compiler
27+# generated code. Key setup timings are impressive on AMD and Core2
28+# thanks to 64-bit operations being covertly deployed. Improvement on
29+# EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
30+# apparently emulates some of 64-bit operations in [32-bit] microcode.
31+
32+$flavour = shift; # first argument: assembler flavour, or the output file name when it contains a dot
33+$output = shift; # second argument: output file name
34+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
35+
36+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); # Win64 ABI for nasm/masm/mingw64 flavours or a .asm output
37+
38+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory this script lives in
39+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
40+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
41+die "can't locate x86_64-xlate.pl";
42+
43+open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!"; # everything printed from here on is piped through the translator; fail loudly if the pipe cannot be set up
44+
45+sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/; $r; } # high-byte alias of a legacy register, e.g. %eax -> %ah
46+sub lo { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}l/; # low-byte alias of a register: %eax -> %al
47+ $r =~ s/%[er]([sd]i)/%${1}l/; # %esi -> %sil, %edi -> %dil
48+ $r =~ s/%(r[0-9]+)[d]?/%${1}b/; $r; } # %r8 or %r8d -> %r8b
49+
50+$t0="%eax";$t1="%ebx";$t2="%ecx";$t3="%edx";
51+@S=("%r8d","%r9d","%r10d","%r11d");
52+$i0="%esi";
53+$i1="%edi";
54+$Tbl="%rbp"; # size optimization
55+$inp="%r12";
56+$out="%r13";
57+$key="%r14";
58+$keyend="%r15";
59+$arg0d=$win64?"%ecx":"%edi";
60+
61+# const unsigned int Camellia_SBOX[4][256];
62+# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
63+# and [2][] - with [3][]. This is done to minimize code size.
64+$SBOX1_1110=0; # Camellia_SBOX[0]
65+$SBOX4_4404=4; # Camellia_SBOX[1]
66+$SBOX2_0222=2048; # Camellia_SBOX[2]
67+$SBOX3_3033=2052; # Camellia_SBOX[3]
68+
69+sub Camellia_Feistel { # append the assembly for one Feistel round to $code
70+my $i=@_[0]; # round index; the next round's key is fetched from offset $seed+($i+1)*$scale
71+my $seed=defined(@_[1])?@_[1]:0; # byte offset added to every key fetch (0 when omitted)
72+my $scale=$seed<0?-8:8; # a negative seed walks the key schedule downwards, 8 bytes per round
73+my $j=($i&1)*2; # odd rounds use the state registers rotated by two positions
74+my $s0=@S[($j)%4],$s1=@S[($j+1)%4],$s2=@S[($j+2)%4],$s3=@S[($j+3)%4]; # NOTE: "my" binds only $s0 here; $s1..$s3 are assigned as package globals
75+
76+$code.=<<___; # expects $t0/$t1 to hold this round's key words and leaves the next round's key words in them
77+ xor $s0,$t0 # t0^=key[0]
78+ xor $s1,$t1 # t1^=key[1]
79+ movz `&hi("$t0")`,$i0 # (t0>>8)&0xff
80+ movz `&lo("$t1")`,$i1 # (t1>>0)&0xff
81+ mov $SBOX3_3033($Tbl,$i0,8),$t3 # t3=SBOX3_3033[0]
82+ mov $SBOX1_1110($Tbl,$i1,8),$t2 # t2=SBOX1_1110[1]
83+ movz `&lo("$t0")`,$i0 # (t0>>0)&0xff
84+ shr \$16,$t0
85+ movz `&hi("$t1")`,$i1 # (t1>>8)&0xff
86+ xor $SBOX4_4404($Tbl,$i0,8),$t3 # t3^=SBOX4_4404[0]
87+ shr \$16,$t1
88+ xor $SBOX4_4404($Tbl,$i1,8),$t2 # t2^=SBOX4_4404[1]
89+ movz `&hi("$t0")`,$i0 # (t0>>24)&0xff
90+ movz `&lo("$t1")`,$i1 # (t1>>16)&0xff
91+ xor $SBOX1_1110($Tbl,$i0,8),$t3 # t3^=SBOX1_1110[0]
92+ xor $SBOX3_3033($Tbl,$i1,8),$t2 # t2^=SBOX3_3033[1]
93+ movz `&lo("$t0")`,$i0 # (t0>>16)&0xff
94+ movz `&hi("$t1")`,$i1 # (t1>>24)&0xff
95+ xor $SBOX2_0222($Tbl,$i0,8),$t3 # t3^=SBOX2_0222[0]
96+ xor $SBOX2_0222($Tbl,$i1,8),$t2 # t2^=SBOX2_0222[1]
97+ mov `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
98+ mov `$seed+($i+1)*$scale+4`($key),$t0
99+ xor $t3,$t2 # t2^=t3
100+ ror \$8,$t3 # t3=RightRotate(t3,8)
101+ xor $t2,$s2
102+ xor $t2,$s3
103+ xor $t3,$s3
104+___
105+}
106+
107+# void Camellia_EncryptBlock_Rounds(
108+# int grandRounds,
109+# const Byte plaintext[],
110+# const KEY_TABLE_TYPE keyTable,
111+# Byte ciphertext[])
112+$code=<<___;
113+.text
114+
115+# V1.x API
116+.globl Camellia_EncryptBlock
117+.type Camellia_EncryptBlock,\@abi-omnipotent
118+.align 16
119+Camellia_EncryptBlock:
120+ movl \$128,%eax
121+ subl $arg0d,%eax
122+ movl \$3,$arg0d
123+ adcl \$0,$arg0d # keyBitLength==128?3:4
124+ jmp .Lenc_rounds
125+.size Camellia_EncryptBlock,.-Camellia_EncryptBlock
126+# V2
127+.globl Camellia_EncryptBlock_Rounds
128+.type Camellia_EncryptBlock_Rounds,\@function,4
129+.align 16
130+.Lenc_rounds:
131+Camellia_EncryptBlock_Rounds:
132+ push %rbx
133+ push %rbp
134+ push %r13
135+ push %r14
136+ push %r15
137+.Lenc_prologue:
138+
139+ #mov %rsi,$inp # put away arguments
140+ mov %rcx,$out
141+ mov %rdx,$key
142+
143+ shl \$6,%edi # process grandRounds
144+ lea .LCamellia_SBOX(%rip),$Tbl
145+ lea ($key,%rdi),$keyend
146+
147+ mov 0(%rsi),@S[0] # load plaintext
148+ mov 4(%rsi),@S[1]
149+ mov 8(%rsi),@S[2]
150+ bswap @S[0]
151+ mov 12(%rsi),@S[3]
152+ bswap @S[1]
153+ bswap @S[2]
154+ bswap @S[3]
155+
156+ call _x86_64_Camellia_encrypt
157+
158+ bswap @S[0]
159+ bswap @S[1]
160+ bswap @S[2]
161+ mov @S[0],0($out)
162+ bswap @S[3]
163+ mov @S[1],4($out)
164+ mov @S[2],8($out)
165+ mov @S[3],12($out)
166+
167+ mov 0(%rsp),%r15
168+ mov 8(%rsp),%r14
169+ mov 16(%rsp),%r13
170+ mov 24(%rsp),%rbp
171+ mov 32(%rsp),%rbx
172+ lea 40(%rsp),%rsp
173+.Lenc_epilogue:
174+ ret
175+.size Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
176+
177+.type _x86_64_Camellia_encrypt,\@abi-omnipotent
178+.align 16
179+_x86_64_Camellia_encrypt:
180+ xor 0($key),@S[1]
181+ xor 4($key),@S[0] # ^=key[0-3]
182+ xor 8($key),@S[3]
183+ xor 12($key),@S[2]
184+.align 16
185+.Leloop:
186+ mov 16($key),$t1 # prefetch key[4-5]
187+ mov 20($key),$t0
188+
189+___
190+ for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
191+$code.=<<___;
192+ lea 16*4($key),$key
193+ cmp $keyend,$key
194+ mov 8($key),$t3 # prefetch key[2-3]
195+ mov 12($key),$t2
196+ je .Ledone
197+
198+ and @S[0],$t0
199+ or @S[3],$t3
200+ rol \$1,$t0
201+ xor $t3,@S[2] # s2^=s3|key[3];
202+ xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
203+ and @S[2],$t2
204+ or @S[1],$t1
205+ rol \$1,$t2
206+ xor $t1,@S[0] # s0^=s1|key[1];
207+ xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
208+ jmp .Leloop
209+
210+.align 16
211+.Ledone:
212+ xor @S[2],$t0 # SwapHalf
213+ xor @S[3],$t1
214+ xor @S[0],$t2
215+ xor @S[1],$t3
216+
217+ mov $t0,@S[0]
218+ mov $t1,@S[1]
219+ mov $t2,@S[2]
220+ mov $t3,@S[3]
221+
222+ .byte 0xf3,0xc3 # rep ret
223+.size _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
224+
225+# V1.x API
226+.globl Camellia_DecryptBlock
227+.type Camellia_DecryptBlock,\@abi-omnipotent
228+.align 16
229+Camellia_DecryptBlock:
230+ movl \$128,%eax
231+ subl $arg0d,%eax
232+ movl \$3,$arg0d
233+ adcl \$0,$arg0d # keyBitLength==128?3:4
234+ jmp .Ldec_rounds
235+.size Camellia_DecryptBlock,.-Camellia_DecryptBlock
236+# V2
237+.globl Camellia_DecryptBlock_Rounds
238+.type Camellia_DecryptBlock_Rounds,\@function,4
239+.align 16
240+.Ldec_rounds:
241+Camellia_DecryptBlock_Rounds:
242+ push %rbx
243+ push %rbp
244+ push %r13
245+ push %r14
246+ push %r15
247+.Ldec_prologue:
248+
249+ #mov %rsi,$inp # put away arguments
250+ mov %rcx,$out
251+ mov %rdx,$keyend
252+
253+ shl \$6,%edi # process grandRounds
254+ lea .LCamellia_SBOX(%rip),$Tbl
255+ lea ($keyend,%rdi),$key
256+
257+ mov 0(%rsi),@S[0] # load plaintext
258+ mov 4(%rsi),@S[1]
259+ mov 8(%rsi),@S[2]
260+ bswap @S[0]
261+ mov 12(%rsi),@S[3]
262+ bswap @S[1]
263+ bswap @S[2]
264+ bswap @S[3]
265+
266+ call _x86_64_Camellia_decrypt
267+
268+ bswap @S[0]
269+ bswap @S[1]
270+ bswap @S[2]
271+ mov @S[0],0($out)
272+ bswap @S[3]
273+ mov @S[1],4($out)
274+ mov @S[2],8($out)
275+ mov @S[3],12($out)
276+
277+ mov 0(%rsp),%r15
278+ mov 8(%rsp),%r14
279+ mov 16(%rsp),%r13
280+ mov 24(%rsp),%rbp
281+ mov 32(%rsp),%rbx
282+ lea 40(%rsp),%rsp
283+.Ldec_epilogue:
284+ ret
285+.size Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
286+
287+.type _x86_64_Camellia_decrypt,\@abi-omnipotent
288+.align 16
289+_x86_64_Camellia_decrypt:
290+ xor 0($key),@S[1]
291+ xor 4($key),@S[0] # ^=key[0-3]
292+ xor 8($key),@S[3]
293+ xor 12($key),@S[2]
294+.align 16
295+.Ldloop:
296+ mov -8($key),$t1 # prefetch key[4-5]
297+ mov -4($key),$t0
298+
299+___
300+ for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
301+$code.=<<___;
302+ lea -16*4($key),$key
303+ cmp $keyend,$key
304+ mov 0($key),$t3 # prefetch key[2-3]
305+ mov 4($key),$t2
306+ je .Lddone
307+
308+ and @S[0],$t0
309+ or @S[3],$t3
310+ rol \$1,$t0
311+ xor $t3,@S[2] # s2^=s3|key[3];
312+ xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
313+ and @S[2],$t2
314+ or @S[1],$t1
315+ rol \$1,$t2
316+ xor $t1,@S[0] # s0^=s1|key[1];
317+ xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
318+
319+ jmp .Ldloop
320+
321+.align 16
322+.Lddone:
323+ xor @S[2],$t2
324+ xor @S[3],$t3
325+ xor @S[0],$t0
326+ xor @S[1],$t1
327+
328+ mov $t2,@S[0] # SwapHalf
329+ mov $t3,@S[1]
330+ mov $t0,@S[2]
331+ mov $t1,@S[3]
332+
333+ .byte 0xf3,0xc3 # rep ret
334+.size _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
335+___
336+
# _saveround($rnd,$key,[$bias,]@T): append store instructions to $code that
# write the registers in @T to the key table addressed through register $key.
# $rnd selects the 8-byte slot; the byte offset is $bias+$rnd*8 (+0/+4/+8/+12).
# The offsets are written in backticks and left for a later eval pass over
# $code to turn into numbers.
337+sub _saveround {
338+my ($rnd,$key,@T)=@_;
339+my $bias=int(@T[0])?shift(@T):0; # a leading non-zero number is the bias; register names evaluate to 0
340+
341+ if ($#T==3) { # four registers: the two of each pair are stored in swapped order
342+ $code.=<<___;
343+ mov @T[1],`$bias+$rnd*8+0`($key)
344+ mov @T[0],`$bias+$rnd*8+4`($key)
345+ mov @T[3],`$bias+$rnd*8+8`($key)
346+ mov @T[2],`$bias+$rnd*8+12`($key)
347+___
348+ } else { # one or two registers, stored 8 bytes apart
349+ $code.=" mov @T[0],`$bias+$rnd*8+0`($key)\n";
350+ $code.=" mov @T[1],`$bias+$rnd*8+8`($key)\n" if ($#T>=1);
351+ }
352+}
353+
# _loadround($rnd,$key,[$bias,]@T): append load instructions to $code that
# read from offset $bias+$rnd*8 off register $key into @T[0] and, when a
# second register is given, from 8 bytes further into @T[1].
354+sub _loadround {
355+my ($rnd,$key,@T)=@_;
356+my $bias=int(@T[0])?shift(@T):0; # a leading non-zero number is the bias; register names evaluate to 0
357+
358+$code.=" mov `$bias+$rnd*8+0`($key),@T[0]\n";
359+$code.=" mov `$bias+$rnd*8+8`($key),@T[1]\n" if ($#T>=1);
360+}
361+
362+# shld is very slow on Intel EM64T family. Even on AMD it limits
363+# instruction decode rate [because it's VectorPath] and consequently
364+# performance...
#
# __rotl128($i0,$i1,$rot): append the shld-based form of a 128-bit left
# rotate of the register pair $i0:$i1 by $rot bits. %r11 holds a copy of
# the original $i0. Nothing is emitted when $rot is 0. No call to this
# variant is visible in this part of the file.
365+sub __rotl128 {
366+my ($i0,$i1,$rot)=@_;
367+
368+ if ($rot) {
369+ $code.=<<___;
370+ mov $i0,%r11
371+ shld \$$rot,$i1,$i0
372+ shld \$$rot,%r11,$i1
373+___
374+ }
375+}
376+
377+# ... Implementing 128-bit rotate without shld gives 80% better
378+# performance on EM64T, +15% on AMD64 and only ~7% degradation on
379+# Core2. This is therefore preferred.
#
# _rotl128($i0,$i1,$rot): append a 128-bit left rotate of the register pair
# $i0:$i1 by $rot bits built from shl/shr/or. %r9 and %r11 are overwritten
# as scratch, so neither may be passed as $i0 or $i1. Nothing is emitted
# when $rot is 0. The 64-$rot shift count is left in backticks for a later
# eval pass over $code; the visible callers pass $rot between 15 and 51.
380+sub _rotl128 {
381+my ($i0,$i1,$rot)=@_;
382+
383+ if ($rot) {
384+ $code.=<<___;
385+ mov $i0,%r11
386+ shl \$$rot,$i0
387+ mov $i1,%r9
388+ shr \$`64-$rot`,%r9
389+ shr \$`64-$rot`,%r11
390+ or %r9,$i0
391+ shl \$$rot,$i1
392+ or %r11,$i1
393+___
394+ }
395+}
396+
397+{ my $step=0;
398+
399+$code.=<<___;
400+.globl Camellia_Ekeygen
401+.type Camellia_Ekeygen,\@function,3
402+.align 16
403+Camellia_Ekeygen:
404+ push %rbx
405+ push %rbp
406+ push %r13
407+ push %r14
408+ push %r15
409+.Lkey_prologue:
410+
411+ mov %rdi,$keyend # put away arguments, keyBitLength
412+ mov %rdx,$out # keyTable
413+
414+ mov 0(%rsi),@S[0] # load 0-127 bits
415+ mov 4(%rsi),@S[1]
416+ mov 8(%rsi),@S[2]
417+ mov 12(%rsi),@S[3]
418+
419+ bswap @S[0]
420+ bswap @S[1]
421+ bswap @S[2]
422+ bswap @S[3]
423+___
424+ &_saveround (0,$out,@S); # KL<<<0
425+$code.=<<___;
426+ cmp \$128,$keyend # check keyBitLength
427+ je .L1st128
428+
429+ mov 16(%rsi),@S[0] # load 128-191 bits
430+ mov 20(%rsi),@S[1]
431+ cmp \$192,$keyend
432+ je .L1st192
433+ mov 24(%rsi),@S[2] # load 192-255 bits
434+ mov 28(%rsi),@S[3]
435+ jmp .L1st256
436+.L1st192:
437+ mov @S[0],@S[2]
438+ mov @S[1],@S[3]
439+ not @S[2]
440+ not @S[3]
441+.L1st256:
442+ bswap @S[0]
443+ bswap @S[1]
444+ bswap @S[2]
445+ bswap @S[3]
446+___
447+ &_saveround (4,$out,@S); # temp storage for KR!
448+$code.=<<___;
449+ xor 0($out),@S[1] # KR^KL
450+ xor 4($out),@S[0]
451+ xor 8($out),@S[3]
452+ xor 12($out),@S[2]
453+
454+.L1st128:
455+ lea .LCamellia_SIGMA(%rip),$key
456+ lea .LCamellia_SBOX(%rip),$Tbl
457+
458+ mov 0($key),$t1
459+ mov 4($key),$t0
460+___
461+ &Camellia_Feistel($step++);
462+ &Camellia_Feistel($step++);
463+$code.=<<___;
464+ xor 0($out),@S[1] # ^KL
465+ xor 4($out),@S[0]
466+ xor 8($out),@S[3]
467+ xor 12($out),@S[2]
468+___
469+ &Camellia_Feistel($step++);
470+ &Camellia_Feistel($step++);
471+$code.=<<___;
472+ cmp \$128,$keyend
473+ jne .L2nd256
474+
475+ lea 128($out),$out # size optimization
476+ shl \$32,%r8 # @S[0]||
477+ shl \$32,%r10 # @S[2]||
478+ or %r9,%r8 # ||@S[1]
479+ or %r11,%r10 # ||@S[3]
480+___
481+ &_loadround (0,$out,-128,"%rax","%rbx"); # KL
482+ &_saveround (2,$out,-128,"%r8","%r10"); # KA<<<0
483+ &_rotl128 ("%rax","%rbx",15);
484+ &_saveround (4,$out,-128,"%rax","%rbx"); # KL<<<15
485+ &_rotl128 ("%r8","%r10",15);
486+ &_saveround (6,$out,-128,"%r8","%r10"); # KA<<<15
487+ &_rotl128 ("%r8","%r10",15); # 15+15=30
488+ &_saveround (8,$out,-128,"%r8","%r10"); # KA<<<30
489+ &_rotl128 ("%rax","%rbx",30); # 15+30=45
490+ &_saveround (10,$out,-128,"%rax","%rbx"); # KL<<<45
491+ &_rotl128 ("%r8","%r10",15); # 30+15=45
492+ &_saveround (12,$out,-128,"%r8"); # KA<<<45
493+ &_rotl128 ("%rax","%rbx",15); # 45+15=60
494+ &_saveround (13,$out,-128,"%rbx"); # KL<<<60
495+ &_rotl128 ("%r8","%r10",15); # 45+15=60
496+ &_saveround (14,$out,-128,"%r8","%r10"); # KA<<<60
497+ &_rotl128 ("%rax","%rbx",17); # 60+17=77
498+ &_saveround (16,$out,-128,"%rax","%rbx"); # KL<<<77
499+ &_rotl128 ("%rax","%rbx",17); # 77+17=94
500+ &_saveround (18,$out,-128,"%rax","%rbx"); # KL<<<94
501+ &_rotl128 ("%r8","%r10",34); # 60+34=94
502+ &_saveround (20,$out,-128,"%r8","%r10"); # KA<<<94
503+ &_rotl128 ("%rax","%rbx",17); # 94+17=111
504+ &_saveround (22,$out,-128,"%rax","%rbx"); # KL<<<111
505+ &_rotl128 ("%r8","%r10",17); # 94+17=111
506+ &_saveround (24,$out,-128,"%r8","%r10"); # KA<<<111
507+$code.=<<___;
508+ mov \$3,%eax
509+ jmp .Ldone
510+.align 16
511+.L2nd256:
512+___
513+ &_saveround (6,$out,@S); # temp storage for KA!
514+$code.=<<___;
515+ xor `4*8+0`($out),@S[1] # KA^KR
516+ xor `4*8+4`($out),@S[0]
517+ xor `5*8+0`($out),@S[3]
518+ xor `5*8+4`($out),@S[2]
519+___
520+ &Camellia_Feistel($step++);
521+ &Camellia_Feistel($step++);
522+
523+ &_loadround (0,$out,"%rax","%rbx"); # KL
524+ &_loadround (4,$out,"%rcx","%rdx"); # KR
525+ &_loadround (6,$out,"%r14","%r15"); # KA
526+$code.=<<___;
527+ lea 128($out),$out # size optimization
528+ shl \$32,%r8 # @S[0]||
529+ shl \$32,%r10 # @S[2]||
530+ or %r9,%r8 # ||@S[1]
531+ or %r11,%r10 # ||@S[3]
532+___
# Key schedule for keys longer than 128 bits: each _rotl128 advances one of
# the 128-bit values held in register pairs (KL, KR, KA, KB per the comments)
# and the following _saveround stores it; comments give the running rotation.
533+ &_saveround (2,$out,-128,"%r8","%r10"); # KB<<<0
534+ &_rotl128 ("%rcx","%rdx",15);
535+ &_saveround (4,$out,-128,"%rcx","%rdx"); # KR<<<15
536+ &_rotl128 ("%r14","%r15",15);
537+ &_saveround (6,$out,-128,"%r14","%r15"); # KA<<<15
538+ &_rotl128 ("%rcx","%rdx",15); # 15+15=30
539+ &_saveround (8,$out,-128,"%rcx","%rdx"); # KR<<<30
540+ &_rotl128 ("%r8","%r10",30);
541+ &_saveround (10,$out,-128,"%r8","%r10"); # KB<<<30
542+ &_rotl128 ("%rax","%rbx",45);
543+ &_saveround (12,$out,-128,"%rax","%rbx"); # KL<<<45
544+ &_rotl128 ("%r14","%r15",30); # 15+30=45
545+ &_saveround (14,$out,-128,"%r14","%r15"); # KA<<<45
546+ &_rotl128 ("%rax","%rbx",15); # 45+15=60
547+ &_saveround (16,$out,-128,"%rax","%rbx"); # KL<<<60
548+ &_rotl128 ("%rcx","%rdx",30); # 30+30=60
549+ &_saveround (18,$out,-128,"%rcx","%rdx"); # KR<<<60
550+ &_rotl128 ("%r8","%r10",30); # 30+30=60
551+ &_saveround (20,$out,-128,"%r8","%r10"); # KB<<<60
552+ &_rotl128 ("%rax","%rbx",17); # 60+17=77
553+ &_saveround (22,$out,-128,"%rax","%rbx"); # KL<<<77
554+ &_rotl128 ("%r14","%r15",32); # 45+32=77
555+ &_saveround (24,$out,-128,"%r14","%r15"); # KA<<<77
556+ &_rotl128 ("%rcx","%rdx",34); # 60+34=94
557+ &_saveround (26,$out,-128,"%rcx","%rdx"); # KR<<<94
558+ &_rotl128 ("%r14","%r15",17); # 77+17=94
559+ &_saveround (28,$out,-128,"%r14","%r15"); # KA<<<94
560+ &_rotl128 ("%rax","%rbx",34); # 77+34=111
561+ &_saveround (30,$out,-128,"%rax","%rbx"); # KL<<<111
562+ &_rotl128 ("%r8","%r10",51); # 60+51=111
563+ &_saveround (32,$out,-128,"%r8","%r10"); # KB<<<111
564+$code.=<<___;
565+ mov \$4,%eax
566+.Ldone:
567+ mov 0(%rsp),%r15
568+ mov 8(%rsp),%r14
569+ mov 16(%rsp),%r13
570+ mov 24(%rsp),%rbp
571+ mov 32(%rsp),%rbx
572+ lea 40(%rsp),%rsp
573+.Lkey_epilogue:
574+ ret
575+.size Camellia_Ekeygen,.-Camellia_Ekeygen
576+___
577+}
578+
579+@SBOX=(
580+112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
581+ 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
582+134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
583+166,225, 57,202,213, 71, 93, 61,217, 1, 90,214, 81, 86,108, 77,
584+139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
585+223, 76,203,194, 52,126,118, 5,109,183,169, 49,209, 23, 4,215,
586+ 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
587+254, 68,207,178,195,181,122,145, 36, 8,232,168, 96,252,105, 80,
588+170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
589+ 16,196, 0, 72,163,247,117,219,138, 3,230,218, 9, 63,221,148,
590+135, 92,131, 2,205, 74,144, 51,115,103,246,243,157,127,191,226,
591+ 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
592+233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
593+120,152, 6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
594+114, 7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
595+ 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
596+
# Table-word generators. Each takes a byte index $i and returns one 32-bit
# word formatted as "0x%08x"; the name shows which byte lanes are filled.
# S1110: SBOX[$i] in the three most significant bytes, lowest byte zero.
597+sub S1110 { my $i=shift; $i=@SBOX[$i]; $i=$i<<24|$i<<16|$i<<8; sprintf("0x%08x",$i); }
# S4404: $i is rotated left by one bit (within 8 bits) before the lookup;
# the looked-up byte fills bytes 3, 2 and 0, byte 1 is zero.
598+sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; $i=$i<<24|$i<<16|$i; sprintf("0x%08x",$i); }
# S0222: SBOX[$i] rotated left by one bit fills bytes 2, 1 and 0, top byte zero.
599+sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; $i=$i<<16|$i<<8|$i; sprintf("0x%08x",$i); }
# S3033: SBOX[$i] rotated right by one bit fills bytes 3, 1 and 0, byte 2 zero.
600+sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; $i=$i<<24|$i<<8|$i; sprintf("0x%08x",$i); }
601+
602+$code.=<<___;
603+.align 64
604+.LCamellia_SIGMA:
605+.long 0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
606+.long 0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
607+.long 0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
608+.long 0, 0, 0, 0
609+.LCamellia_SBOX:
610+___
611+# tables are interleaved, remember?
612+sub data_word { $code.=".long\t".join(',',@_)."\n"; }
613+for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
614+for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }
615+
616+# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
617+# size_t length, const CAMELLIA_KEY *key,
618+# unsigned char *ivp,const int enc);
619+{
620+$_key="0(%rsp)";
621+$_end="8(%rsp)"; # inp+len&~15
622+$_res="16(%rsp)"; # len&15
623+$ivec="24(%rsp)";
624+$_ivp="40(%rsp)";
625+$_rsp="48(%rsp)";
626+
627+$code.=<<___;
628+.globl Camellia_cbc_encrypt
629+.type Camellia_cbc_encrypt,\@function,6
630+.align 16
631+Camellia_cbc_encrypt:
632+ cmp \$0,%rdx
633+ je .Lcbc_abort
634+ push %rbx
635+ push %rbp
636+ push %r12
637+ push %r13
638+ push %r14
639+ push %r15
640+.Lcbc_prologue:
641+
642+ mov %rsp,%rbp
643+ sub \$64,%rsp
644+ and \$-64,%rsp
645+
646+ # place stack frame just "above mod 1024" the key schedule,
647+ # this ensures that cache associativity suffices
648+ lea -64-63(%rcx),%r10
649+ sub %rsp,%r10
650+ neg %r10
651+ and \$0x3C0,%r10
652+ sub %r10,%rsp
653+ #add \$8,%rsp # 8 is reserved for callee's ra
654+
655+ mov %rdi,$inp # inp argument
656+ mov %rsi,$out # out argument
657+ mov %r8,%rbx # ivp argument
658+ mov %rcx,$key # key argument
659+ mov 272(%rcx),$keyend # grandRounds
660+
661+ mov %r8,$_ivp
662+ mov %rbp,$_rsp
663+
664+.Lcbc_body:
665+ lea .LCamellia_SBOX(%rip),$Tbl
666+
667+ mov \$32,%ecx
668+.align 4
669+.Lcbc_prefetch_sbox:
670+ mov 0($Tbl),%rax
671+ mov 32($Tbl),%rsi
672+ mov 64($Tbl),%rdi
673+ mov 96($Tbl),%r11
674+ lea 128($Tbl),$Tbl
675+ loop .Lcbc_prefetch_sbox
676+ sub \$4096,$Tbl
677+ shl \$6,$keyend
678+ mov %rdx,%rcx # len argument
679+ lea ($key,$keyend),$keyend
680+
681+ cmp \$0,%r9d # enc argument
682+ je .LCBC_DECRYPT
683+
684+ and \$-16,%rdx
685+ and \$15,%rcx # length residue
686+ lea ($inp,%rdx),%rdx
687+ mov $key,$_key
688+ mov %rdx,$_end
689+ mov %rcx,$_res
690+
691+ cmp $inp,%rdx
692+ mov 0(%rbx),@S[0] # load IV
693+ mov 4(%rbx),@S[1]
694+ mov 8(%rbx),@S[2]
695+ mov 12(%rbx),@S[3]
696+ je .Lcbc_enc_tail
697+ jmp .Lcbc_eloop
698+
699+.align 16
700+.Lcbc_eloop:
701+ xor 0($inp),@S[0]
702+ xor 4($inp),@S[1]
703+ xor 8($inp),@S[2]
704+ bswap @S[0]
705+ xor 12($inp),@S[3]
706+ bswap @S[1]
707+ bswap @S[2]
708+ bswap @S[3]
709+
710+ call _x86_64_Camellia_encrypt
711+
712+ mov $_key,$key # "rewind" the key
713+ bswap @S[0]
714+ mov $_end,%rdx
715+ bswap @S[1]
716+ mov $_res,%rcx
717+ bswap @S[2]
718+ mov @S[0],0($out)
719+ bswap @S[3]
720+ mov @S[1],4($out)
721+ mov @S[2],8($out)
722+ lea 16($inp),$inp
723+ mov @S[3],12($out)
724+ cmp %rdx,$inp
725+ lea 16($out),$out
726+ jne .Lcbc_eloop
727+
728+ cmp \$0,%rcx
729+ jne .Lcbc_enc_tail
730+
731+ mov $_ivp,$out
732+ mov @S[0],0($out) # write out IV residue
733+ mov @S[1],4($out)
734+ mov @S[2],8($out)
735+ mov @S[3],12($out)
736+ jmp .Lcbc_done
737+
738+.align 16
739+.Lcbc_enc_tail:
740+ xor %rax,%rax
741+ mov %rax,0+$ivec
742+ mov %rax,8+$ivec
743+ mov %rax,$_res
744+
745+.Lcbc_enc_pushf:
746+ pushfq
747+ cld
748+ mov $inp,%rsi
749+ lea 8+$ivec,%rdi
750+ .long 0x9066A4F3 # rep movsb
751+ popfq
752+.Lcbc_enc_popf:
753+
754+ lea $ivec,$inp
755+ lea 16+$ivec,%rax
756+ mov %rax,$_end
757+ jmp .Lcbc_eloop # one more time
758+
759+.align 16
760+.LCBC_DECRYPT:
761+ xchg $key,$keyend
762+ add \$15,%rdx
763+ and \$15,%rcx # length residue
764+ and \$-16,%rdx
765+ mov $key,$_key
766+ lea ($inp,%rdx),%rdx
767+ mov %rdx,$_end
768+ mov %rcx,$_res
769+
770+ mov (%rbx),%rax # load IV
771+ mov 8(%rbx),%rbx
772+ jmp .Lcbc_dloop
773+.align 16
774+.Lcbc_dloop:
775+ mov 0($inp),@S[0]
776+ mov 4($inp),@S[1]
777+ mov 8($inp),@S[2]
778+ bswap @S[0]
779+ mov 12($inp),@S[3]
780+ bswap @S[1]
781+ mov %rax,0+$ivec # save IV to temporary storage
782+ bswap @S[2]
783+ mov %rbx,8+$ivec
784+ bswap @S[3]
785+
786+ call _x86_64_Camellia_decrypt
787+
788+ mov $_key,$key # "rewind" the key
789+ mov $_end,%rdx
790+ mov $_res,%rcx
791+
792+ bswap @S[0]
793+ mov ($inp),%rax # load IV for next iteration
794+ bswap @S[1]
795+ mov 8($inp),%rbx
796+ bswap @S[2]
797+ xor 0+$ivec,@S[0]
798+ bswap @S[3]
799+ xor 4+$ivec,@S[1]
800+ xor 8+$ivec,@S[2]
801+ lea 16($inp),$inp
802+ xor 12+$ivec,@S[3]
803+ cmp %rdx,$inp
804+ je .Lcbc_ddone
805+
806+ mov @S[0],0($out)
807+ mov @S[1],4($out)
808+ mov @S[2],8($out)
809+ mov @S[3],12($out)
810+
811+ lea 16($out),$out
812+ jmp .Lcbc_dloop
813+
814+.align 16
815+.Lcbc_ddone:
816+ mov $_ivp,%rdx
817+ cmp \$0,%rcx
818+ jne .Lcbc_dec_tail
819+
820+ mov @S[0],0($out)
821+ mov @S[1],4($out)
822+ mov @S[2],8($out)
823+ mov @S[3],12($out)
824+
825+ mov %rax,(%rdx) # write out IV residue
826+ mov %rbx,8(%rdx)
827+ jmp .Lcbc_done
828+.align 16
829+.Lcbc_dec_tail:
830+ mov @S[0],0+$ivec
831+ mov @S[1],4+$ivec
832+ mov @S[2],8+$ivec
833+ mov @S[3],12+$ivec
834+
835+.Lcbc_dec_pushf:
836+ pushfq
837+ cld
838+ lea 8+$ivec,%rsi
839+ lea ($out),%rdi
840+ .long 0x9066A4F3 # rep movsb
841+ popfq
842+.Lcbc_dec_popf:
843+
844+ mov %rax,(%rdx) # write out IV residue
845+ mov %rbx,8(%rdx)
846+ jmp .Lcbc_done
847+
848+.align 16
849+.Lcbc_done:
850+ mov $_rsp,%rcx
851+ mov 0(%rcx),%r15
852+ mov 8(%rcx),%r14
853+ mov 16(%rcx),%r13
854+ mov 24(%rcx),%r12
855+ mov 32(%rcx),%rbp
856+ mov 40(%rcx),%rbx
857+ lea 48(%rcx),%rsp
858+.Lcbc_abort:
859+ ret
860+.size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
861+
862+.asciz "Camellia for x86_64 by <appro@openssl.org>"
863+___
864+}
865+
866+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
867+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
868+if ($win64) {
869+$rec="%rcx";
870+$frame="%rdx";
871+$context="%r8";
872+$disp="%r9";
873+
874+$code.=<<___;
875+.extern __imp_RtlVirtualUnwind
876+.type common_se_handler,\@abi-omnipotent
877+.align 16
878+common_se_handler:
879+ push %rsi
880+ push %rdi
881+ push %rbx
882+ push %rbp
883+ push %r12
884+ push %r13
885+ push %r14
886+ push %r15
887+ pushfq
888+ lea -64(%rsp),%rsp
889+
890+ mov 120($context),%rax # pull context->Rax
891+ mov 248($context),%rbx # pull context->Rip
892+
893+ mov 8($disp),%rsi # disp->ImageBase
894+ mov 56($disp),%r11 # disp->HandlerData
895+
896+ mov 0(%r11),%r10d # HandlerData[0]
897+ lea (%rsi,%r10),%r10 # prologue label
898+ cmp %r10,%rbx # context->Rip<prologue label
899+ jb .Lin_prologue
900+
901+ mov 152($context),%rax # pull context->Rsp
902+
903+ mov 4(%r11),%r10d # HandlerData[1]
904+ lea (%rsi,%r10),%r10 # epilogue label
905+ cmp %r10,%rbx # context->Rip>=epilogue label
906+ jae .Lin_prologue
907+
908+ lea 40(%rax),%rax
909+ mov -8(%rax),%rbx
910+ mov -16(%rax),%rbp
911+ mov -24(%rax),%r13
912+ mov -32(%rax),%r14
913+ mov -40(%rax),%r15
914+ mov %rbx,144($context) # restore context->Rbx
915+ mov %rbp,160($context) # restore context->Rbp
916+ mov %r13,224($context) # restore context->R13
917+ mov %r14,232($context) # restore context->R14
918+ mov %r15,240($context) # restore context->R15
919+
920+.Lin_prologue:
921+ mov 8(%rax),%rdi
922+ mov 16(%rax),%rsi
923+ mov %rax,152($context) # restore context->Rsp
924+ mov %rsi,168($context) # restore context->Rsi
925+ mov %rdi,176($context) # restore context->Rdi
926+
927+ jmp .Lcommon_seh_exit
928+.size common_se_handler,.-common_se_handler
929+
930+.type cbc_se_handler,\@abi-omnipotent
931+.align 16
932+cbc_se_handler:
933+ push %rsi
934+ push %rdi
935+ push %rbx
936+ push %rbp
937+ push %r12
938+ push %r13
939+ push %r14
940+ push %r15
941+ pushfq
942+ lea -64(%rsp),%rsp
943+
944+ mov 120($context),%rax # pull context->Rax
945+ mov 248($context),%rbx # pull context->Rip
946+
947+ lea .Lcbc_prologue(%rip),%r10
948+ cmp %r10,%rbx # context->Rip<.Lcbc_prologue
949+ jb .Lin_cbc_prologue
950+
951+ lea .Lcbc_body(%rip),%r10
952+ cmp %r10,%rbx # context->Rip<.Lcbc_body
953+ jb .Lin_cbc_frame_setup
954+
955+ mov 152($context),%rax # pull context->Rsp
956+
957+ lea .Lcbc_abort(%rip),%r10
958+ cmp %r10,%rbx # context->Rip>=.Lcbc_abort
959+ jae .Lin_cbc_prologue
960+
961+ # handle pushf/popf in Camellia_cbc_encrypt
962+ lea .Lcbc_enc_pushf(%rip),%r10
963+ cmp %r10,%rbx # context->Rip<=.Lcbc_enc_pushf
964+ jbe .Lin_cbc_no_flag
965+ lea 8(%rax),%rax
966+ lea .Lcbc_enc_popf(%rip),%r10
967+ cmp %r10,%rbx # context->Rip<.Lcbc_enc_popf
968+ jb .Lin_cbc_no_flag
969+ lea -8(%rax),%rax
970+ lea .Lcbc_dec_pushf(%rip),%r10
971+ cmp %r10,%rbx # context->Rip<=.Lcbc_dec_pushf
972+ jbe .Lin_cbc_no_flag
973+ lea 8(%rax),%rax
974+ lea .Lcbc_dec_popf(%rip),%r10
975+ cmp %r10,%rbx # context->Rip<.Lcbc_dec_popf
976+ jb .Lin_cbc_no_flag
977+ lea -8(%rax),%rax
978+
979+.Lin_cbc_no_flag:
980+ mov 48(%rax),%rax # $_rsp
981+ lea 48(%rax),%rax
982+
983+.Lin_cbc_frame_setup:
984+ mov -8(%rax),%rbx
985+ mov -16(%rax),%rbp
986+ mov -24(%rax),%r12
987+ mov -32(%rax),%r13
988+ mov -40(%rax),%r14
989+ mov -48(%rax),%r15
990+ mov %rbx,144($context) # restore context->Rbx
991+ mov %rbp,160($context) # restore context->Rbp
992+ mov %r12,216($context) # restore context->R12
993+ mov %r13,224($context) # restore context->R13
994+ mov %r14,232($context) # restore context->R14
995+ mov %r15,240($context) # restore context->R15
996+
997+.Lin_cbc_prologue:
998+ mov 8(%rax),%rdi
999+ mov 16(%rax),%rsi
1000+ mov %rax,152($context) # restore context->Rsp
1001+ mov %rsi,168($context) # restore context->Rsi
1002+ mov %rdi,176($context) # restore context->Rdi
1003+
1004+.align 4
1005+.Lcommon_seh_exit:
1006+
1007+ mov 40($disp),%rdi # disp->ContextRecord
1008+ mov $context,%rsi # context
1009+ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1010+ .long 0xa548f3fc # cld; rep movsq
1011+
1012+ mov $disp,%rsi
1013+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1014+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
1015+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
1016+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1017+ mov 40(%rsi),%r10 # disp->ContextRecord
1018+ lea 56(%rsi),%r11 # &disp->HandlerData
1019+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
1020+ mov %r10,32(%rsp) # arg5
1021+ mov %r11,40(%rsp) # arg6
1022+ mov %r12,48(%rsp) # arg7
1023+ mov %rcx,56(%rsp) # arg8, (NULL)
1024+ call *__imp_RtlVirtualUnwind(%rip)
1025+
1026+ mov \$1,%eax # ExceptionContinueSearch
1027+ lea 64(%rsp),%rsp
1028+ popfq
1029+ pop %r15
1030+ pop %r14
1031+ pop %r13
1032+ pop %r12
1033+ pop %rbp
1034+ pop %rbx
1035+ pop %rdi
1036+ pop %rsi
1037+ ret
1038+.size cbc_se_handler,.-cbc_se_handler
1039+
1040+.section .pdata
1041+.align 4
1042+ .rva .LSEH_begin_Camellia_EncryptBlock_Rounds
1043+ .rva .LSEH_end_Camellia_EncryptBlock_Rounds
1044+ .rva .LSEH_info_Camellia_EncryptBlock_Rounds
1045+
1046+ .rva .LSEH_begin_Camellia_DecryptBlock_Rounds
1047+ .rva .LSEH_end_Camellia_DecryptBlock_Rounds
1048+ .rva .LSEH_info_Camellia_DecryptBlock_Rounds
1049+
1050+ .rva .LSEH_begin_Camellia_Ekeygen
1051+ .rva .LSEH_end_Camellia_Ekeygen
1052+ .rva .LSEH_info_Camellia_Ekeygen
1053+
1054+ .rva .LSEH_begin_Camellia_cbc_encrypt
1055+ .rva .LSEH_end_Camellia_cbc_encrypt
1056+ .rva .LSEH_info_Camellia_cbc_encrypt
1057+
1058+.section .xdata
1059+.align 8
1060+.LSEH_info_Camellia_EncryptBlock_Rounds:
1061+ .byte 9,0,0,0
1062+ .rva common_se_handler
1063+ .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[]
1064+.LSEH_info_Camellia_DecryptBlock_Rounds:
1065+ .byte 9,0,0,0
1066+ .rva common_se_handler
1067+ .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
1068+.LSEH_info_Camellia_Ekeygen:
1069+ .byte 9,0,0,0
1070+ .rva common_se_handler
1071+ .rva .Lkey_prologue,.Lkey_epilogue # HandlerData[]
1072+.LSEH_info_Camellia_cbc_encrypt:
1073+ .byte 9,0,0,0
1074+ .rva cbc_se_handler
1075+___
1076+}
1077+
1078+$code =~ s/\`([^\`]*)\`/eval $1/gem;
1079+print $code;
1080+close STDOUT;
--- /dev/null
+++ b/crypto/cms/.cvsignore
@@ -0,0 +1,4 @@
1+lib
2+Makefile.save
3+*.flc
4+semantic.cache
--- /dev/null
+++ b/crypto/ppccpuid.pl
@@ -0,0 +1,94 @@
1+#!/usr/bin/env perl
2+
# Command line: <flavour> [<output>]. Both are passed on to ppc-xlate.pl,
# which receives everything this script prints through a pipe on STDOUT.
$flavour = shift;

# Look for the translator next to this script first, then under perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# The failure check has to apply to open's return value. Written as
# 'open STDOUT,"...".shift || die', the || bound to the command string,
# which is always true, so a failed open was never reported.
open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!";

# Pick the mnemonics that differ between 64-bit and 32-bit flavours:
# compare-logical-immediate, shift-right-immediate and the instruction used
# to produce the atomic_add return value (sign-extend word vs. plain move).
if ($flavour=~/64/) {
    $CMPLI="cmpldi";
    $SHRLI="srdi";
    $SIGNX="extsw";
} else {
    $CMPLI="cmplwi";
    $SHRLI="srwi";
    $SIGNX="mr";
}
21+
22+$code=<<___;
23+.machine "any"
24+.text
25+
26+.globl .OPENSSL_cpuid_setup
27+.align 4
28+.OPENSSL_cpuid_setup:
29+ blr
30+
31+.globl .OPENSSL_wipe_cpu
32+.align 4
33+.OPENSSL_wipe_cpu:
34+ xor r0,r0,r0
35+ mr r3,r1
36+ xor r4,r4,r4
37+ xor r5,r5,r5
38+ xor r6,r6,r6
39+ xor r7,r7,r7
40+ xor r8,r8,r8
41+ xor r9,r9,r9
42+ xor r10,r10,r10
43+ xor r11,r11,r11
44+ xor r12,r12,r12
45+ blr
46+
47+.globl .OPENSSL_atomic_add
48+.align 4
49+.OPENSSL_atomic_add:
50+Loop: lwarx r5,0,r3
51+ add r0,r4,r5
52+ stwcx. r0,0,r3
53+ bne- Loop
54+ $SIGNX r3,r0
55+ blr
56+
57+.globl .OPENSSL_rdtsc
58+.align 4
59+.OPENSSL_rdtsc:
60+ mftb r3
61+ mftbu r4
62+ blr
63+
# OPENSSL_cleanse: store zero bytes over the buffer at r3, length in r4.
# Short buffers are wiped bytewise; longer ones are byte-aligned to a
# 4-byte boundary, wiped by words, and the remainder is wiped bytewise.
.globl	.OPENSSL_cleanse
.align	4
.OPENSSL_cleanse:
	$CMPLI	r4,7
	li	r0,0
	bge	Lot
	$CMPLI	r4,0
	beqlr-			# len==0: return, else CTR=0 would make bdnz wrap
Little:	mtctr	r4
	stb	r0,0(r3)
	addi	r3,r3,1
	bdnz-	\$-8
	blr
Lot:	andi.	r5,r3,3		# byte stores until the pointer is 4-byte aligned
	beq	Laligned
	stb	r0,0(r3)
	subi	r4,r4,1
	addi	r3,r3,1
	b	Lot
Laligned:
	$SHRLI	r5,r4,2		# word count; at least 1 because len was 7 or more
	mtctr	r5
	stw	r0,0(r3)
	addi	r3,r3,4
	bdnz-	\$-8
	andi.	r4,r4,3		# leftover bytes go through the bytewise loop
	bne	Little
	blr
90+___
91+
92+$code =~ s/\`([^\`]*)\`/eval $1/gem;
93+print $code;
94+close STDOUT;
--- /dev/null
+++ b/crypto/s390xcpuid.S
@@ -0,0 +1,90 @@
1+.text
2+
# OPENSSL_cpuid_setup: placeholder that returns to the caller immediately.
3+.globl OPENSSL_cpuid_setup
4+.type OPENSSL_cpuid_setup,@function
5+.align 16
6+OPENSSL_cpuid_setup:
7+ br %r14 # reserved for future
8+.size OPENSSL_cpuid_setup,.-OPENSSL_cpuid_setup
9+
# OPENSSL_s390x_facilities: execute STFLE (hand-encoded as a .long) with the
# result stored at 16(%r15), then return that doubleword in %r2.
10+.globl OPENSSL_s390x_facilities
11+.type OPENSSL_s390x_facilities,@function
12+.align 16
13+OPENSSL_s390x_facilities:
14+ lghi %r0,0 # presumably selects a single doubleword of facility bits -- confirm against the STFLE definition
15+ .long 0xb2b0f010 # stfle 16(%r15)
16+ lg %r2,16(%r15)
17+ br %r14
18+.size OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities
19+
# OPENSSL_rdtsc: store the clock value with STCK at 16(%r15) and return
# that doubleword in %r2.
20+.globl OPENSSL_rdtsc
21+.type OPENSSL_rdtsc,@function
22+.align 16
23+OPENSSL_rdtsc:
24+ stck 16(%r15)
25+ lg %r2,16(%r15)
26+ br %r14
27+.size OPENSSL_rdtsc,.-OPENSSL_rdtsc
28+
# OPENSSL_atomic_add: add %r3 to the 32-bit word at 0(%r2) with a
# compare-and-swap retry loop and return the new value, sign-extended, in %r2.
29+.globl OPENSSL_atomic_add
30+.type OPENSSL_atomic_add,@function
31+.align 16
32+OPENSSL_atomic_add:
33+ l %r1,0(%r2) # current value
34+.Lspin: lr %r0,%r1
35+ ar %r0,%r3 # candidate new value
36+ cs %r1,%r0,0(%r2) # store %r0 only if memory still equals %r1
37+ brc 4,.Lspin # retry when the swap did not take place
38+ lgfr %r2,%r0 # OpenSSL expects the new value
39+ br %r14
40+.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
41+
# OPENSSL_wipe_cpu: zero general registers %r0, %r1, %r3, %r4 and
# floating-point registers %f0-%f7; %r2 receives the value of %r15.
42+.globl OPENSSL_wipe_cpu
43+.type OPENSSL_wipe_cpu,@function
44+.align 16
45+OPENSSL_wipe_cpu:
46+ xgr %r0,%r0
47+ xgr %r1,%r1
48+ lgr %r2,%r15 # presumably the return value: the caller's stack pointer
49+ xgr %r3,%r3
50+ xgr %r4,%r4
51+ lzdr %f0
52+ lzdr %f1
53+ lzdr %f2
54+ lzdr %f3
55+ lzdr %f4
56+ lzdr %f5
57+ lzdr %f6
58+ lzdr %f7
59+ br %r14
60+.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
61+
# OPENSSL_cleanse: store zero bytes over the buffer at %r2, length in %r3.
# Up to 15 bytes are wiped bytewise; longer buffers are byte-aligned to an
# 8-byte boundary, wiped by doublewords, and the remainder wiped bytewise.
.globl	OPENSSL_cleanse
.type	OPENSSL_cleanse,@function
.align	16
OPENSSL_cleanse:
	lghi	%r4,15
	lghi	%r0,0
	clgr	%r3,%r4
	jh	.Lot
	clgr	%r3,%r0		# len==0?
	bcr	8,%r14		# yes: return, brctg on a zero count would not stop
.Little:
	stc	%r0,0(%r2)
	la	%r2,1(%r2)
	brctg	%r3,.Little
	br	%r14
.align	4
.Lot:	tmll	%r2,7		# byte stores until the pointer is 8-byte aligned
	jz	.Laligned
	stc	%r0,0(%r2)
	la	%r2,1(%r2)
	brctg	%r3,.Lot
.Laligned:
	srlg	%r4,%r3,3	# doubleword count; at least 1 because len was above 15
.Loop:	stg	%r0,0(%r2)
	la	%r2,8(%r2)
	brctg	%r4,.Loop
	lghi	%r4,7
	ngr	%r3,%r4		# leftover bytes go through the bytewise loop
	jnz	.Little
	br	%r14
.size	OPENSSL_cleanse,.-OPENSSL_cleanse
--- /dev/null
+++ b/crypto/sparcv9cap.c
@@ -0,0 +1,154 @@
1+#include <stdio.h>
2+#include <stdlib.h>
3+#include <string.h>
4+#include <sys/time.h>
5+#include <openssl/bn.h>
6+
7+#define SPARCV9_TICK_PRIVILEGED (1<<0)
8+#define SPARCV9_PREFER_FPU (1<<1)
9+#define SPARCV9_VIS1 (1<<2)
10+#define SPARCV9_VIS2 (1<<3) /* reserved */
11+#define SPARCV9_FMADD (1<<4) /* reserved for SPARC64 V */
12+static int OPENSSL_sparcv9cap_P=SPARCV9_TICK_PRIVILEGED;
13+
14+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
15+ {
16+ int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
17+ int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
18+
19+ if ((OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) ==
20+ (SPARCV9_PREFER_FPU|SPARCV9_VIS1))
21+ return bn_mul_mont_fpu(rp,ap,bp,np,n0,num);
22+ else
23+ return bn_mul_mont_int(rp,ap,bp,np,n0,num);
24+ }
25+
26+unsigned long OPENSSL_rdtsc(void)
27+ {
28+ unsigned long _sparcv9_rdtick(void);
29+
30+ if (OPENSSL_sparcv9cap_P&SPARCV9_TICK_PRIVILEGED)
31+#if defined(__sun) && defined(__SVR4)
32+ return gethrtime();
33+#else
34+ return 0;
35+#endif
36+ else
37+ return _sparcv9_rdtick();
38+ }
39+
40+#if defined(__sun) && defined(__SVR4)
41+
42+#include <dlfcn.h>
43+#include <libdevinfo.h>
44+#include <sys/systeminfo.h>
45+
46+typedef di_node_t (*di_init_t)(const char *,uint_t);
47+typedef void (*di_fini_t)(di_node_t);
48+typedef char * (*di_node_name_t)(di_node_t);
49+typedef int (*di_walk_node_t)(di_node_t,uint_t,di_node_name_t,int (*)(di_node_t,di_node_name_t));
50+
51+#define DLLINK(h,name) (name=(name##_t)dlsym((h),#name))
52+
53+static int walk_nodename(di_node_t node, di_node_name_t di_node_name)
54+ {
55+ char *name = (*di_node_name)(node);
56+
57+ /* This is expected to catch all UltraSPARC flavors prior T1 */
58+ if (!strcmp (name,"SUNW,UltraSPARC") ||
59+ !strncmp(name,"SUNW,UltraSPARC-I",17)) /* covers II,III,IV */
60+ {
61+ OPENSSL_sparcv9cap_P |= SPARCV9_PREFER_FPU|SPARCV9_VIS1;
62+
63+ /* %tick is privileged only on UltraSPARC-I/II, but not IIe */
64+ if (name[14]!='\0' && name[17]!='\0' && name[18]!='\0')
65+ OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED;
66+
67+ return DI_WALK_TERMINATE;
68+ }
69+ /* This is expected to catch remaining UltraSPARCs, such as T1 */
70+ else if (!strncmp(name,"SUNW,UltraSPARC",15))
71+ {
72+ OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED;
73+
74+ return DI_WALK_TERMINATE;
75+ }
76+
77+ return DI_WALK_CONTINUE;
78+ }
79+
80+void OPENSSL_cpuid_setup(void)
81+ {
82+ void *h;
83+ char *e,si[256];
84+ static int trigger=0;
85+
86+ if (trigger) return;
87+ trigger=1;
88+
89+ if ((e=getenv("OPENSSL_sparcv9cap")))
90+ {
91+ OPENSSL_sparcv9cap_P=strtoul(e,NULL,0);
92+ return;
93+ }
94+
95+ if (sysinfo(SI_MACHINE,si,sizeof(si))>0)
96+ {
97+ if (strcmp(si,"sun4v"))
98+ /* FPU is preferred for all CPUs, but US-T1/2 */
99+ OPENSSL_sparcv9cap_P |= SPARCV9_PREFER_FPU;
100+ }
101+
102+ if (sysinfo(SI_ISALIST,si,sizeof(si))>0)
103+ {
104+ if (strstr(si,"+vis"))
105+ OPENSSL_sparcv9cap_P |= SPARCV9_VIS1;
106+ if (strstr(si,"+vis2"))
107+ {
108+ OPENSSL_sparcv9cap_P |= SPARCV9_VIS2;
109+ OPENSSL_sparcv9cap_P &= ~SPARCV9_TICK_PRIVILEGED;
110+ return;
111+ }
112+ }
113+
114+ if ((h = dlopen("libdevinfo.so.1",RTLD_LAZY))) do
115+ {
116+ di_init_t di_init;
117+ di_fini_t di_fini;
118+ di_walk_node_t di_walk_node;
119+ di_node_name_t di_node_name;
120+ di_node_t root_node;
121+
122+ if (!DLLINK(h,di_init)) break;
123+ if (!DLLINK(h,di_fini)) break;
124+ if (!DLLINK(h,di_walk_node)) break;
125+ if (!DLLINK(h,di_node_name)) break;
126+
127+ if ((root_node = (*di_init)("/",DINFOSUBTREE))!=DI_NODE_NIL)
128+ {
129+ (*di_walk_node)(root_node,DI_WALK_SIBFIRST,
130+ di_node_name,walk_nodename);
131+ (*di_fini)(root_node);
132+ }
133+ } while(0);
134+
135+ if (h) dlclose(h);
136+ }
137+
138+#else
139+
140+void OPENSSL_cpuid_setup(void)
141+ {
142+ char *e;
143+
144+ if ((e=getenv("OPENSSL_sparcv9cap")))
145+ {
146+ OPENSSL_sparcv9cap_P=strtoul(e,NULL,0);
147+ return;
148+ }
149+
150+ /* For now we assume that the rest supports UltraSPARC-I* only */
151+ OPENSSL_sparcv9cap_P |= SPARCV9_PREFER_FPU|SPARCV9_VIS1;
152+ }
153+
154+#endif