Coverage Report

Created: 2026-05-30 09:47

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/tmp/bitcoin/src/crypto/sha256_arm_shani.cpp
Line
Count
Source
1
// Copyright (c) 2022-present The Bitcoin Core developers
2
// Distributed under the MIT software license, see the accompanying
3
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4
//
5
// Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-arm.c,
6
// Written and placed in public domain by Jeffrey Walton.
7
// Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and
8
// Barry O'Rourke for the mbedTLS project.
9
// Variant specialized for 64-byte inputs added by Pieter Wuille.
10
11
#ifdef ENABLE_ARM_SHANI
12
13
#include <array>
14
#include <cstdint>
15
#include <cstddef>
16
#include <arm_neon.h>
17
18
namespace {
// Table of 64 32-bit constants (the standard SHA-256 round constants).
// The transforms in this file read it four entries at a time with
// vld1q_u32(&K[4*i]) and add the result to a message vector before each
// group of four rounds. alignas gives the array the alignment of a
// uint32x4_t vector.
19
alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> K =
20
{
21
    0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
22
    0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
23
    0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
24
    0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
25
    0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
26
    0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
27
    0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
28
    0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
29
    0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
30
    0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
31
    0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
32
    0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
33
    0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
34
    0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
35
    0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
36
    0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
37
};
38
}
39
40
namespace sha256_arm_shani {
/**
 * Run the SHA-256 compression function over `blocks` consecutive 64-byte
 * chunks using the ARM SHA-256 NEON intrinsics, updating the state in place.
 *
 * @param s      Eight 32-bit state words. s[0..3] and s[4..7] are loaded into
 *               two vectors on entry and written back on exit.
 * @param chunk  Input bytes; 64 bytes are read per iteration, so blocks * 64
 *               bytes must be readable.
 * @param blocks Number of 64-byte chunks to process. With 0 the loop body is
 *               skipped and the state is stored back unchanged.
 */
41
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
42
50.3M
{
43
50.3M
    uint32x4_t STATE0, STATE1, ABCD_SAVE, EFGH_SAVE;
44
50.3M
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
45
50.3M
    uint32x4_t TMP0, TMP2;
46
47
    // Load state: s[0..3] into STATE0 and s[4..7] into STATE1
48
50.3M
    STATE0 = vld1q_u32(&s[0]);
49
50.3M
    STATE1 = vld1q_u32(&s[4]);
50
51
220M
    while (blocks--)
52
170M
    {
53
        // Save the state as it was at the start of this chunk; it is added back after round 64
54
170M
        ABCD_SAVE = STATE0;
55
170M
        EFGH_SAVE = STATE1;
56
57
        // Load the 64-byte chunk as four vectors and byte-swap every 32-bit word (convert to Big Endian)
58
170M
        MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 0)));
59
170M
        MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 16)));
60
170M
        MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 32)));
61
170M
        MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 48)));
62
170M
        chunk += 64;
63
64
        // The original implementation preloaded the message-plus-constant sum, which was 1-3% slower.
65
        // The addition is now the first step of each quad round, saving one Q Neon register:
66
        // "TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));"
67
68
        // Rounds 1-4
        // Pattern shared by rounds 1-48: TMP0 = message vector + four K constants;
        // TMP2 keeps the pre-round STATE0, because vsha256h2q_u32 takes it as an
        // operand after STATE0 has already been overwritten by vsha256hq_u32;
        // the vsha256su0q_u32/vsha256su1q_u32 pair then replaces the message
        // vector that was just consumed with newly derived schedule words.
69
170M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));
70
170M
        TMP2 = STATE0;
71
170M
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
72
170M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
73
170M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
74
170M
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
75
76
        // Rounds 5-8
77
170M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[4]));
78
170M
        TMP2 = STATE0;
79
170M
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
80
170M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
81
170M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
82
170M
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
83
84
        // Rounds 9-12
85
170M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[8]));
86
170M
        TMP2 = STATE0;
87
170M
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
88
170M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
89
170M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
90
170M
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
91
92
        // Rounds 13-16
93
170M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[12]));
94
170M
        TMP2 = STATE0;
95
170M
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
96
170M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
97
170M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
98
170M
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
99
100
        // Rounds 17-20
101
170M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[16]));
102
170M
        TMP2 = STATE0;
103
170M
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
104
170M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
105
170M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
106
170M
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
107
108
        // Rounds 21-24
109
170M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[20]));
110
170M
        TMP2 = STATE0;
111
170M
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
112
170M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
113
170M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
114
170M
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
115
116
        // Rounds 25-28
117
170M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[24]));
118
170M
        TMP2 = STATE0;
119
170M
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
120
170M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
121
170M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
122
170M
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
123
124
        // Rounds 29-32
125
170M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[28]));
126
170M
        TMP2 = STATE0;
127
170M
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
128
170M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
129
170M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
130
170M
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
131
132
        // Rounds 33-36
133
170M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[32]));
134
170M
        TMP2 = STATE0;
135
170M
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
136
170M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
137
170M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
138
170M
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
139
140
        // Rounds 37-40
141
170M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[36]));
142
170M
        TMP2 = STATE0;
143
170M
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
144
170M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
145
170M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
146
170M
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
147
148
        // Rounds 41-44
149
170M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[40]));
150
170M
        TMP2 = STATE0;
151
170M
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
152
170M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
153
170M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
154
170M
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
155
156
        // Rounds 45-48
157
170M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[44]));
158
170M
        TMP2 = STATE0;
159
170M
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
160
170M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
161
170M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
162
170M
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
163
164
        // Rounds 49-52
        // From here to round 64 the message vectors are only consumed;
        // no further vsha256su0q_u32/vsha256su1q_u32 updates are issued.
165
170M
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[48]));
166
170M
        TMP2 = STATE0;
167
170M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
168
170M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
169
170
        // Rounds 53-56
171
170M
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[52]));
172
170M
        TMP2 = STATE0;
173
170M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
174
170M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
175
176
        // Rounds 57-60
177
170M
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[56]));
178
170M
        TMP2 = STATE0;
179
170M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
180
170M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
181
182
        // Rounds 61-64
183
170M
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[60]));
184
170M
        TMP2 = STATE0;
185
170M
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
186
170M
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
187
188
        // Update state: add the saved chunk-start state to the output of the 64 rounds
189
170M
        STATE0 = vaddq_u32(STATE0, ABCD_SAVE);
190
170M
        STATE1 = vaddq_u32(STATE1, EFGH_SAVE);
191
170M
    }
192
193
    // Save final state back into s[0..3] and s[4..7]
194
50.3M
    vst1q_u32(&s[0], STATE0);
195
50.3M
    vst1q_u32(&s[4], STATE1);
196
50.3M
}
197
}
198
199
namespace sha256d64_arm_shani {
200
void Transform_2way(unsigned char* output, const unsigned char* input)
201
174k
{
202
    /* Initial state. */
203
174k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> INIT = {
204
174k
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
205
174k
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
206
174k
    };
207
208
    /* Precomputed message schedule for the 2nd transform. */
209
174k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> MIDS = {
210
174k
        0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
211
174k
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
212
174k
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
213
174k
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374,
214
174k
        0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254,
215
174k
        0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa,
216
174k
        0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7,
217
174k
        0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0,
218
174k
        0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd,
219
174k
        0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16,
220
174k
        0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537,
221
174k
        0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37,
222
174k
        0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7,
223
174k
        0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890,
224
174k
        0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c,
225
174k
        0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76
226
174k
    };
227
228
    /* A few precomputed message schedule values for the 3rd transform. */
229
174k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 12> FINS = {
230
174k
        0x5807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
231
174k
        0x80000000, 0x00000000, 0x00000000, 0x00000000,
232
174k
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf274
233
174k
    };
234
235
    /* Padding processed in the 3rd transform (byteswapped). */
236
174k
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> FINAL = {0x80000000, 0, 0, 0, 0, 0, 0, 0x100};
237
238
174k
    uint32x4_t STATE0A, STATE0B, STATE1A, STATE1B, ABCD_SAVEA, ABCD_SAVEB, EFGH_SAVEA, EFGH_SAVEB;
239
174k
    uint32x4_t MSG0A, MSG0B, MSG1A, MSG1B, MSG2A, MSG2B, MSG3A, MSG3B;
240
174k
    uint32x4_t TMP0A, TMP0B, TMP2A, TMP2B, TMP;
241
242
    // Transform 1: Load state
243
174k
    STATE0A = vld1q_u32(&INIT[0]);
244
174k
    STATE0B = STATE0A;
245
174k
    STATE1A = vld1q_u32(&INIT[4]);
246
174k
    STATE1B = STATE1A;
247
248
    // Transform 1: Load and convert input chunk to Big Endian
249
174k
    MSG0A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 0)));
250
174k
    MSG1A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16)));
251
174k
    MSG2A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32)));
252
174k
    MSG3A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48)));
253
174k
    MSG0B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 64)));
254
174k
    MSG1B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 80)));
255
174k
    MSG2B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 96)));
256
174k
    MSG3B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 112)));
257
258
    // Transform 1: Rounds 1-4
259
174k
    TMP = vld1q_u32(&K[0]);
260
174k
    TMP0A = vaddq_u32(MSG0A, TMP);
261
174k
    TMP0B = vaddq_u32(MSG0B, TMP);
262
174k
    TMP2A = STATE0A;
263
174k
    TMP2B = STATE0B;
264
174k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
265
174k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
266
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
267
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
268
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
269
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
270
174k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
271
174k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
272
273
    // Transform 1: Rounds 5-8
274
174k
    TMP = vld1q_u32(&K[4]);
275
174k
    TMP0A = vaddq_u32(MSG1A, TMP);
276
174k
    TMP0B = vaddq_u32(MSG1B, TMP);
277
174k
    TMP2A = STATE0A;
278
174k
    TMP2B = STATE0B;
279
174k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
280
174k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
281
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
282
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
283
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
284
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
285
174k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
286
174k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
287
288
    // Transform 1: Rounds 9-12
289
174k
    TMP = vld1q_u32(&K[8]);
290
174k
    TMP0A = vaddq_u32(MSG2A, TMP);
291
174k
    TMP0B = vaddq_u32(MSG2B, TMP);
292
174k
    TMP2A = STATE0A;
293
174k
    TMP2B = STATE0B;
294
174k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
295
174k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
296
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
297
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
298
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
299
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
300
174k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
301
174k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
302
303
    // Transform 1: Rounds 13-16
304
174k
    TMP = vld1q_u32(&K[12]);
305
174k
    TMP0A = vaddq_u32(MSG3A, TMP);
306
174k
    TMP0B = vaddq_u32(MSG3B, TMP);
307
174k
    TMP2A = STATE0A;
308
174k
    TMP2B = STATE0B;
309
174k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
310
174k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
311
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
312
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
313
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
314
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
315
174k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
316
174k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
317
318
    // Transform 1: Rounds 17-20
319
174k
    TMP = vld1q_u32(&K[16]);
320
174k
    TMP0A = vaddq_u32(MSG0A, TMP);
321
174k
    TMP0B = vaddq_u32(MSG0B, TMP);
322
174k
    TMP2A = STATE0A;
323
174k
    TMP2B = STATE0B;
324
174k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
325
174k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
326
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
327
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
328
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
329
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
330
174k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
331
174k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
332
333
    // Transform 1: Rounds 21-24
334
174k
    TMP = vld1q_u32(&K[20]);
335
174k
    TMP0A = vaddq_u32(MSG1A, TMP);
336
174k
    TMP0B = vaddq_u32(MSG1B, TMP);
337
174k
    TMP2A = STATE0A;
338
174k
    TMP2B = STATE0B;
339
174k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
340
174k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
341
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
342
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
343
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
344
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
345
174k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
346
174k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
347
348
    // Transform 1: Rounds 25-28
349
174k
    TMP = vld1q_u32(&K[24]);
350
174k
    TMP0A = vaddq_u32(MSG2A, TMP);
351
174k
    TMP0B = vaddq_u32(MSG2B, TMP);
352
174k
    TMP2A = STATE0A;
353
174k
    TMP2B = STATE0B;
354
174k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
355
174k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
356
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
357
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
358
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
359
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
360
174k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
361
174k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
362
363
    // Transform 1: Rounds 29-32
364
174k
    TMP = vld1q_u32(&K[28]);
365
174k
    TMP0A = vaddq_u32(MSG3A, TMP);
366
174k
    TMP0B = vaddq_u32(MSG3B, TMP);
367
174k
    TMP2A = STATE0A;
368
174k
    TMP2B = STATE0B;
369
174k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
370
174k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
371
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
372
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
373
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
374
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
375
174k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
376
174k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
377
378
    // Transform 1: Rounds 33-36
379
174k
    TMP = vld1q_u32(&K[32]);
380
174k
    TMP0A = vaddq_u32(MSG0A, TMP);
381
174k
    TMP0B = vaddq_u32(MSG0B, TMP);
382
174k
    TMP2A = STATE0A;
383
174k
    TMP2B = STATE0B;
384
174k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
385
174k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
386
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
387
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
388
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
389
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
390
174k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
391
174k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
392
393
    // Transform 1: Rounds 37-40
394
174k
    TMP = vld1q_u32(&K[36]);
395
174k
    TMP0A = vaddq_u32(MSG1A, TMP);
396
174k
    TMP0B = vaddq_u32(MSG1B, TMP);
397
174k
    TMP2A = STATE0A;
398
174k
    TMP2B = STATE0B;
399
174k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
400
174k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
401
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
402
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
403
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
404
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
405
174k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
406
174k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
407
408
    // Transform 1: Rounds 41-44
409
174k
    TMP = vld1q_u32(&K[40]);
410
174k
    TMP0A = vaddq_u32(MSG2A, TMP);
411
174k
    TMP0B = vaddq_u32(MSG2B, TMP);
412
174k
    TMP2A = STATE0A;
413
174k
    TMP2B = STATE0B;
414
174k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
415
174k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
416
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
417
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
418
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
419
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
420
174k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
421
174k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
422
423
    // Transform 1: Rounds 45-48
424
174k
    TMP = vld1q_u32(&K[44]);
425
174k
    TMP0A = vaddq_u32(MSG3A, TMP);
426
174k
    TMP0B = vaddq_u32(MSG3B, TMP);
427
174k
    TMP2A = STATE0A;
428
174k
    TMP2B = STATE0B;
429
174k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
430
174k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
431
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
432
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
433
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
434
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
435
174k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
436
174k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
437
438
    // Transform 1: Rounds 49-52
439
174k
    TMP = vld1q_u32(&K[48]);
440
174k
    TMP0A = vaddq_u32(MSG0A, TMP);
441
174k
    TMP0B = vaddq_u32(MSG0B, TMP);
442
174k
    TMP2A = STATE0A;
443
174k
    TMP2B = STATE0B;
444
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
445
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
446
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
447
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
448
449
    // Transform 1: Rounds 53-56
450
174k
    TMP = vld1q_u32(&K[52]);
451
174k
    TMP0A = vaddq_u32(MSG1A, TMP);
452
174k
    TMP0B = vaddq_u32(MSG1B, TMP);
453
174k
    TMP2A = STATE0A;
454
174k
    TMP2B = STATE0B;
455
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
456
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
457
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
458
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
459
460
    // Transform 1: Rounds 57-60
461
174k
    TMP = vld1q_u32(&K[56]);
462
174k
    TMP0A = vaddq_u32(MSG2A, TMP);
463
174k
    TMP0B = vaddq_u32(MSG2B, TMP);
464
174k
    TMP2A = STATE0A;
465
174k
    TMP2B = STATE0B;
466
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
467
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
468
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
469
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
470
471
    // Transform 1: Rounds 61-64
472
174k
    TMP = vld1q_u32(&K[60]);
473
174k
    TMP0A = vaddq_u32(MSG3A, TMP);
474
174k
    TMP0B = vaddq_u32(MSG3B, TMP);
475
174k
    TMP2A = STATE0A;
476
174k
    TMP2B = STATE0B;
477
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
478
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
479
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
480
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
481
482
    // Transform 1: Update state
483
174k
    TMP = vld1q_u32(&INIT[0]);
484
174k
    STATE0A = vaddq_u32(STATE0A, TMP);
485
174k
    STATE0B = vaddq_u32(STATE0B, TMP);
486
174k
    TMP = vld1q_u32(&INIT[4]);
487
174k
    STATE1A = vaddq_u32(STATE1A, TMP);
488
174k
    STATE1B = vaddq_u32(STATE1B, TMP);
489
490
    // Transform 2: Save state
491
174k
    ABCD_SAVEA = STATE0A;
492
174k
    ABCD_SAVEB = STATE0B;
493
174k
    EFGH_SAVEA = STATE1A;
494
174k
    EFGH_SAVEB = STATE1B;
495
496
    // Transform 2: Rounds 1-4
497
174k
    TMP = vld1q_u32(&MIDS[0]);
498
174k
    TMP2A = STATE0A;
499
174k
    TMP2B = STATE0B;
500
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
501
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
502
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
503
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
504
505
    // Transform 2: Rounds 5-8
506
174k
    TMP = vld1q_u32(&MIDS[4]);
507
174k
    TMP2A = STATE0A;
508
174k
    TMP2B = STATE0B;
509
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
510
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
511
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
512
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
513
514
    // Transform 2: Rounds 9-12
515
174k
    TMP = vld1q_u32(&MIDS[8]);
516
174k
    TMP2A = STATE0A;
517
174k
    TMP2B = STATE0B;
518
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
519
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
520
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
521
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
522
523
    // Transform 2: Rounds 13-16
524
174k
    TMP = vld1q_u32(&MIDS[12]);
525
174k
    TMP2A = STATE0A;
526
174k
    TMP2B = STATE0B;
527
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
528
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
529
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
530
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
531
532
    // Transform 2: Rounds 17-20
533
174k
    TMP = vld1q_u32(&MIDS[16]);
534
174k
    TMP2A = STATE0A;
535
174k
    TMP2B = STATE0B;
536
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
537
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
538
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
539
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
540
541
    // Transform 2: Rounds 21-24
542
174k
    TMP = vld1q_u32(&MIDS[20]);
543
174k
    TMP2A = STATE0A;
544
174k
    TMP2B = STATE0B;
545
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
546
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
547
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
548
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
549
550
    // Transform 2: Rounds 25-28
551
174k
    TMP = vld1q_u32(&MIDS[24]);
552
174k
    TMP2A = STATE0A;
553
174k
    TMP2B = STATE0B;
554
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
555
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
556
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
557
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
558
559
    // Transform 2: Rounds 29-32
560
174k
    TMP = vld1q_u32(&MIDS[28]);
561
174k
    TMP2A = STATE0A;
562
174k
    TMP2B = STATE0B;
563
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
564
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
565
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
566
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
567
568
    // Transform 2: Rounds 33-36
569
174k
    TMP = vld1q_u32(&MIDS[32]);
570
174k
    TMP2A = STATE0A;
571
174k
    TMP2B = STATE0B;
572
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
573
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
574
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
575
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
576
577
    // Transform 2: Rounds 37-40
578
174k
    TMP = vld1q_u32(&MIDS[36]);
579
174k
    TMP2A = STATE0A;
580
174k
    TMP2B = STATE0B;
581
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
582
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
583
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
584
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
585
586
    // Transform 2: Rounds 41-44
587
174k
    TMP = vld1q_u32(&MIDS[40]);
588
174k
    TMP2A = STATE0A;
589
174k
    TMP2B = STATE0B;
590
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
591
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
592
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
593
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
594
595
    // Transform 2: Rounds 45-48
596
174k
    TMP = vld1q_u32(&MIDS[44]);
597
174k
    TMP2A = STATE0A;
598
174k
    TMP2B = STATE0B;
599
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
600
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
601
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
602
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
603
604
    // Transform 2: Rounds 49-52
605
174k
    TMP = vld1q_u32(&MIDS[48]);
606
174k
    TMP2A = STATE0A;
607
174k
    TMP2B = STATE0B;
608
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
609
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
610
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
611
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
612
613
    // Transform 2: Rounds 53-56
614
174k
    TMP = vld1q_u32(&MIDS[52]);
615
174k
    TMP2A = STATE0A;
616
174k
    TMP2B = STATE0B;
617
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
618
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
619
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
620
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
621
622
    // Transform 2: Rounds 57-60
623
174k
    TMP = vld1q_u32(&MIDS[56]);
624
174k
    TMP2A = STATE0A;
625
174k
    TMP2B = STATE0B;
626
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
627
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
628
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
629
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
630
631
    // Transform 2: Rounds 61-64
632
174k
    TMP = vld1q_u32(&MIDS[60]);
633
174k
    TMP2A = STATE0A;
634
174k
    TMP2B = STATE0B;
635
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
636
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
637
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
638
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
639
640
    // Transform 2: Update state
641
174k
    STATE0A = vaddq_u32(STATE0A, ABCD_SAVEA);
642
174k
    STATE0B = vaddq_u32(STATE0B, ABCD_SAVEB);
643
174k
    STATE1A = vaddq_u32(STATE1A, EFGH_SAVEA);
644
174k
    STATE1B = vaddq_u32(STATE1B, EFGH_SAVEB);
645
646
    // Transform 3: Pad previous output
647
174k
    MSG0A = STATE0A;
648
174k
    MSG0B = STATE0B;
649
174k
    MSG1A = STATE1A;
650
174k
    MSG1B = STATE1B;
651
174k
    MSG2A = vld1q_u32(&FINAL[0]);
652
174k
    MSG2B = MSG2A;
653
174k
    MSG3A = vld1q_u32(&FINAL[4]);
654
174k
    MSG3B = MSG3A;
655
656
    // Transform 3: Load state
657
174k
    STATE0A = vld1q_u32(&INIT[0]);
658
174k
    STATE0B = STATE0A;
659
174k
    STATE1A = vld1q_u32(&INIT[4]);
660
174k
    STATE1B = STATE1A;
661
662
    // Transform 3: Rounds 1-4
663
174k
    TMP = vld1q_u32(&K[0]);
664
174k
    TMP0A = vaddq_u32(MSG0A, TMP);
665
174k
    TMP0B = vaddq_u32(MSG0B, TMP);
666
174k
    TMP2A = STATE0A;
667
174k
    TMP2B = STATE0B;
668
174k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
669
174k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
670
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
671
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
672
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
673
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
674
174k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
675
174k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
676
677
    // Transform 3: Rounds 5-8
678
174k
    TMP = vld1q_u32(&K[4]);
679
174k
    TMP0A = vaddq_u32(MSG1A, TMP);
680
174k
    TMP0B = vaddq_u32(MSG1B, TMP);
681
174k
    TMP2A = STATE0A;
682
174k
    TMP2B = STATE0B;
683
174k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
684
174k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
685
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
686
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
687
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
688
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
689
174k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
690
174k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
691
692
    // Transform 3: Rounds 9-12
693
174k
    TMP = vld1q_u32(&FINS[0]);
694
174k
    TMP2A = STATE0A;
695
174k
    TMP2B = STATE0B;
696
174k
    MSG2A = vld1q_u32(&FINS[4]);
697
174k
    MSG2B = MSG2A;
698
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
699
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
700
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
701
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
702
174k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
703
174k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
704
705
    // Transform 3: Rounds 13-16
706
174k
    TMP = vld1q_u32(&FINS[8]);
707
174k
    TMP2A = STATE0A;
708
174k
    TMP2B = STATE0B;
709
174k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
710
174k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
711
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
712
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
713
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
714
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
715
174k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
716
174k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
717
718
    // Transform 3: Rounds 17-20
719
174k
    TMP = vld1q_u32(&K[16]);
720
174k
    TMP0A = vaddq_u32(MSG0A, TMP);
721
174k
    TMP0B = vaddq_u32(MSG0B, TMP);
722
174k
    TMP2A = STATE0A;
723
174k
    TMP2B = STATE0B;
724
174k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
725
174k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
726
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
727
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
728
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
729
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
730
174k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
731
174k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
732
733
    // Transform 3: Rounds 21-24
734
174k
    TMP = vld1q_u32(&K[20]);
735
174k
    TMP0A = vaddq_u32(MSG1A, TMP);
736
174k
    TMP0B = vaddq_u32(MSG1B, TMP);
737
174k
    TMP2A = STATE0A;
738
174k
    TMP2B = STATE0B;
739
174k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
740
174k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
741
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
742
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
743
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
744
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
745
174k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
746
174k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
747
748
    // Transform 3: Rounds 25-28
749
174k
    TMP = vld1q_u32(&K[24]);
750
174k
    TMP0A = vaddq_u32(MSG2A, TMP);
751
174k
    TMP0B = vaddq_u32(MSG2B, TMP);
752
174k
    TMP2A = STATE0A;
753
174k
    TMP2B = STATE0B;
754
174k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
755
174k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
756
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
757
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
758
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
759
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
760
174k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
761
174k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
762
763
    // Transform 3: Rounds 29-32
764
174k
    TMP = vld1q_u32(&K[28]);
765
174k
    TMP0A = vaddq_u32(MSG3A, TMP);
766
174k
    TMP0B = vaddq_u32(MSG3B, TMP);
767
174k
    TMP2A = STATE0A;
768
174k
    TMP2B = STATE0B;
769
174k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
770
174k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
771
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
772
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
773
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
774
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
775
174k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
776
174k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
777
778
    // Transform 3: Rounds 33-36
779
174k
    TMP = vld1q_u32(&K[32]);
780
174k
    TMP0A = vaddq_u32(MSG0A, TMP);
781
174k
    TMP0B = vaddq_u32(MSG0B, TMP);
782
174k
    TMP2A = STATE0A;
783
174k
    TMP2B = STATE0B;
784
174k
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
785
174k
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
786
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
787
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
788
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
789
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
790
174k
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
791
174k
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);
792
793
    // Transform 3: Rounds 37-40
794
174k
    TMP = vld1q_u32(&K[36]);
795
174k
    TMP0A = vaddq_u32(MSG1A, TMP);
796
174k
    TMP0B = vaddq_u32(MSG1B, TMP);
797
174k
    TMP2A = STATE0A;
798
174k
    TMP2B = STATE0B;
799
174k
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
800
174k
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
801
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
802
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
803
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
804
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
805
174k
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
806
174k
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);
807
808
    // Transform 3: Rounds 41-44
809
174k
    TMP = vld1q_u32(&K[40]);
810
174k
    TMP0A = vaddq_u32(MSG2A, TMP);
811
174k
    TMP0B = vaddq_u32(MSG2B, TMP);
812
174k
    TMP2A = STATE0A;
813
174k
    TMP2B = STATE0B;
814
174k
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
815
174k
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
816
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
817
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
818
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
819
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
820
174k
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
821
174k
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);
822
823
    // Transform 3: Rounds 45-48
824
174k
    TMP = vld1q_u32(&K[44]);
825
174k
    TMP0A = vaddq_u32(MSG3A, TMP);
826
174k
    TMP0B = vaddq_u32(MSG3B, TMP);
827
174k
    TMP2A = STATE0A;
828
174k
    TMP2B = STATE0B;
829
174k
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
830
174k
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
831
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
832
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
833
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
834
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
835
174k
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
836
174k
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);
837
838
    // Transform 3: Rounds 49-52
839
174k
    TMP = vld1q_u32(&K[48]);
840
174k
    TMP0A = vaddq_u32(MSG0A, TMP);
841
174k
    TMP0B = vaddq_u32(MSG0B, TMP);
842
174k
    TMP2A = STATE0A;
843
174k
    TMP2B = STATE0B;
844
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
845
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
846
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
847
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
848
849
    // Transform 3: Rounds 53-56
850
174k
    TMP = vld1q_u32(&K[52]);
851
174k
    TMP0A = vaddq_u32(MSG1A, TMP);
852
174k
    TMP0B = vaddq_u32(MSG1B, TMP);
853
174k
    TMP2A = STATE0A;
854
174k
    TMP2B = STATE0B;
855
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
856
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
857
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
858
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
859
860
    // Transform 3: Rounds 57-60
861
174k
    TMP = vld1q_u32(&K[56]);
862
174k
    TMP0A = vaddq_u32(MSG2A, TMP);
863
174k
    TMP0B = vaddq_u32(MSG2B, TMP);
864
174k
    TMP2A = STATE0A;
865
174k
    TMP2B = STATE0B;
866
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
867
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
868
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
869
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
870
871
    // Transform 3: Rounds 61-64
872
174k
    TMP = vld1q_u32(&K[60]);
873
174k
    TMP0A = vaddq_u32(MSG3A, TMP);
874
174k
    TMP0B = vaddq_u32(MSG3B, TMP);
875
174k
    TMP2A = STATE0A;
876
174k
    TMP2B = STATE0B;
877
174k
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
878
174k
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
879
174k
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
880
174k
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
881
882
    // Transform 3: Update state
883
174k
    TMP = vld1q_u32(&INIT[0]);
884
174k
    STATE0A = vaddq_u32(STATE0A, TMP);
885
174k
    STATE0B = vaddq_u32(STATE0B, TMP);
886
174k
    TMP = vld1q_u32(&INIT[4]);
887
174k
    STATE1A = vaddq_u32(STATE1A, TMP);
888
174k
    STATE1B = vaddq_u32(STATE1B, TMP);
889
890
    // Store result
891
174k
    vst1q_u8(output, vrev32q_u8(vreinterpretq_u8_u32(STATE0A)));
892
174k
    vst1q_u8(output + 16, vrev32q_u8(vreinterpretq_u8_u32(STATE1A)));
893
174k
    vst1q_u8(output + 32, vrev32q_u8(vreinterpretq_u8_u32(STATE0B)));
894
174k
    vst1q_u8(output + 48, vrev32q_u8(vreinterpretq_u8_u32(STATE1B)));
895
174k
}
896
}
897
898
#endif