diff options
author | pineappleEA <pineaea@gmail.com> | 2024-02-09 20:23:03 +0100 |
---|---|---|
committer | pineappleEA <pineaea@gmail.com> | 2024-02-09 20:23:03 +0100 |
commit | b48b6e3b79c09c46384627ce7ae47a81f77187b8 (patch) | |
tree | bdcb00d2a9fba0a7580fcfe5ebf282814eb48841 | |
parent | 5a87b5c4005aa344b56ef9cd7eeb2d11d8c03d93 (diff) |
early-access version 4125EA-4125
71 files changed, 10736 insertions, 566 deletions
@@ -1,7 +1,7 @@ | |||
1 | yuzu emulator early access | 1 | yuzu emulator early access |
2 | ============= | 2 | ============= |
3 | 3 | ||
4 | This is the source code for early-access 4124. | 4 | This is the source code for early-access 4125. |
5 | 5 | ||
6 | ## Legal Notice | 6 | ## Legal Notice |
7 | 7 | ||
diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index 836aed14f..9693edcb5 100755 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt | |||
@@ -314,3 +314,10 @@ endif() | |||
314 | if (NOT TARGET SimpleIni::SimpleIni) | 314 | if (NOT TARGET SimpleIni::SimpleIni) |
315 | add_subdirectory(simpleini) | 315 | add_subdirectory(simpleini) |
316 | endif() | 316 | endif() |
317 | |||
318 | # sse2neon | ||
319 | if (ARCHITECTURE_arm64 AND NOT TARGET sse2neon) | ||
320 | add_library(sse2neon INTERFACE) | ||
321 | target_include_directories(sse2neon INTERFACE sse2neon) | ||
322 | endif() | ||
323 | |||
diff --git a/externals/sse2neon/sse2neon.h b/externals/sse2neon/sse2neon.h new file mode 100755 index 000000000..56254b5f9 --- /dev/null +++ b/externals/sse2neon/sse2neon.h | |||
@@ -0,0 +1,9282 @@ | |||
1 | #ifndef SSE2NEON_H | ||
2 | #define SSE2NEON_H | ||
3 | |||
4 | /* | ||
5 | * sse2neon is freely redistributable under the MIT License. | ||
6 | * | ||
7 | * Copyright (c) 2015-2024 SSE2NEON Contributors. | ||
8 | * | ||
9 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
10 | * of this software and associated documentation files (the "Software"), to deal | ||
11 | * in the Software without restriction, including without limitation the rights | ||
12 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
13 | * copies of the Software, and to permit persons to whom the Software is | ||
14 | * furnished to do so, subject to the following conditions: | ||
15 | * | ||
16 | * The above copyright notice and this permission notice shall be included in | ||
17 | * all copies or substantial portions of the Software. | ||
18 | * | ||
19 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
20 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
21 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
22 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
23 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
24 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
25 | * SOFTWARE. | ||
26 | */ | ||
27 | |||
28 | // This header file provides a simple API translation layer | ||
29 | // between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions | ||
30 | // | ||
31 | // Contributors to this work are: | ||
32 | // John W. Ratcliff <jratcliffscarab@gmail.com> | ||
33 | // Brandon Rowlett <browlett@nvidia.com> | ||
34 | // Ken Fast <kfast@gdeb.com> | ||
35 | // Eric van Beurden <evanbeurden@nvidia.com> | ||
36 | // Alexander Potylitsin <apotylitsin@nvidia.com> | ||
37 | // Hasindu Gamaarachchi <hasindu2008@gmail.com> | ||
38 | // Jim Huang <jserv@ccns.ncku.edu.tw> | ||
39 | // Mark Cheng <marktwtn@gmail.com> | ||
40 | // Malcolm James MacLeod <malcolm@gulden.com> | ||
41 | // Devin Hussey (easyaspi314) <husseydevin@gmail.com> | ||
42 | // Sebastian Pop <spop@amazon.com> | ||
43 | // Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com> | ||
44 | // Danila Kutenin <danilak@google.com> | ||
45 | // François Turban (JishinMaster) <francois.turban@gmail.com> | ||
46 | // Pei-Hsuan Hung <afcidk@gmail.com> | ||
47 | // Yang-Hao Yuan <yuanyanghau@gmail.com> | ||
48 | // Syoyo Fujita <syoyo@lighttransport.com> | ||
49 | // Brecht Van Lommel <brecht@blender.org> | ||
50 | // Jonathan Hue <jhue@adobe.com> | ||
51 | // Cuda Chen <clh960524@gmail.com> | ||
52 | // Aymen Qader <aymen.qader@arm.com> | ||
53 | // Anthony Roberts <anthony.roberts@linaro.org> | ||
54 | |||
55 | /* Tunable configurations */ | ||
56 | |||
57 | /* Enable precise implementation of math operations | ||
58 | * This would slow down the computation a bit, but gives consistent result with | ||
59 | * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result) | ||
60 | */ | ||
61 | /* _mm_min|max_ps|ss|pd|sd */ | ||
62 | #ifndef SSE2NEON_PRECISE_MINMAX | ||
63 | #define SSE2NEON_PRECISE_MINMAX (0) | ||
64 | #endif | ||
65 | /* _mm_rcp_ps and _mm_div_ps */ | ||
66 | #ifndef SSE2NEON_PRECISE_DIV | ||
67 | #define SSE2NEON_PRECISE_DIV (0) | ||
68 | #endif | ||
69 | /* _mm_sqrt_ps and _mm_rsqrt_ps */ | ||
70 | #ifndef SSE2NEON_PRECISE_SQRT | ||
71 | #define SSE2NEON_PRECISE_SQRT (0) | ||
72 | #endif | ||
73 | /* _mm_dp_pd */ | ||
74 | #ifndef SSE2NEON_PRECISE_DP | ||
75 | #define SSE2NEON_PRECISE_DP (0) | ||
76 | #endif | ||
77 | |||
78 | /* Enable inclusion of windows.h on MSVC platforms | ||
79 | * This makes _mm_clflush functional on windows, as there is no builtin. | ||
80 | */ | ||
81 | #ifndef SSE2NEON_INCLUDE_WINDOWS_H | ||
82 | #define SSE2NEON_INCLUDE_WINDOWS_H (0) | ||
83 | #endif | ||
84 | |||
85 | /* compiler specific definitions */ | ||
86 | #if defined(__GNUC__) || defined(__clang__) | ||
87 | #pragma push_macro("FORCE_INLINE") | ||
88 | #pragma push_macro("ALIGN_STRUCT") | ||
89 | #define FORCE_INLINE static inline __attribute__((always_inline)) | ||
90 | #define ALIGN_STRUCT(x) __attribute__((aligned(x))) | ||
91 | #define _sse2neon_likely(x) __builtin_expect(!!(x), 1) | ||
92 | #define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0) | ||
93 | #elif defined(_MSC_VER) | ||
94 | #if _MSVC_TRADITIONAL | ||
95 | #error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead. | ||
96 | #endif | ||
97 | #ifndef FORCE_INLINE | ||
98 | #define FORCE_INLINE static inline | ||
99 | #endif | ||
100 | #ifndef ALIGN_STRUCT | ||
101 | #define ALIGN_STRUCT(x) __declspec(align(x)) | ||
102 | #endif | ||
103 | #define _sse2neon_likely(x) (x) | ||
104 | #define _sse2neon_unlikely(x) (x) | ||
105 | #else | ||
106 | #pragma message("Macro name collisions may happen with unsupported compilers.") | ||
107 | #endif | ||
108 | |||
109 | #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10 | ||
110 | #warning "GCC versions earlier than 10 are not supported." | ||
111 | #endif | ||
112 | |||
113 | /* C language does not allow initializing a variable with a function call. */ | ||
114 | #ifdef __cplusplus | ||
115 | #define _sse2neon_const static const | ||
116 | #else | ||
117 | #define _sse2neon_const const | ||
118 | #endif | ||
119 | |||
120 | #include <stdint.h> | ||
121 | #include <stdlib.h> | ||
122 | |||
123 | #if defined(_WIN32) | ||
124 | /* Definitions for _mm_{malloc,free} are provided by <malloc.h> | ||
125 | * from both MinGW-w64 and MSVC. | ||
126 | */ | ||
127 | #define SSE2NEON_ALLOC_DEFINED | ||
128 | #endif | ||
129 | |||
130 | /* If using MSVC */ | ||
131 | #ifdef _MSC_VER | ||
132 | #include <intrin.h> | ||
133 | #if SSE2NEON_INCLUDE_WINDOWS_H | ||
134 | #include <processthreadsapi.h> | ||
135 | #include <windows.h> | ||
136 | #endif | ||
137 | |||
138 | #if !defined(__cplusplus) | ||
139 | #error SSE2NEON only supports C++ compilation with this compiler | ||
140 | #endif | ||
141 | |||
142 | #ifdef SSE2NEON_ALLOC_DEFINED | ||
143 | #include <malloc.h> | ||
144 | #endif | ||
145 | |||
146 | #if (defined(_M_AMD64) || defined(__x86_64__)) || \ | ||
147 | (defined(_M_ARM64) || defined(__arm64__)) | ||
148 | #define SSE2NEON_HAS_BITSCAN64 | ||
149 | #endif | ||
150 | #endif | ||
151 | |||
152 | #if defined(__GNUC__) || defined(__clang__) | ||
153 | #define _sse2neon_define0(type, s, body) \ | ||
154 | __extension__({ \ | ||
155 | type _a = (s); \ | ||
156 | body \ | ||
157 | }) | ||
158 | #define _sse2neon_define1(type, s, body) \ | ||
159 | __extension__({ \ | ||
160 | type _a = (s); \ | ||
161 | body \ | ||
162 | }) | ||
163 | #define _sse2neon_define2(type, a, b, body) \ | ||
164 | __extension__({ \ | ||
165 | type _a = (a), _b = (b); \ | ||
166 | body \ | ||
167 | }) | ||
168 | #define _sse2neon_return(ret) (ret) | ||
169 | #else | ||
170 | #define _sse2neon_define0(type, a, body) [=](type _a) { body }(a) | ||
171 | #define _sse2neon_define1(type, a, body) [](type _a) { body }(a) | ||
172 | #define _sse2neon_define2(type, a, b, body) \ | ||
173 | [](type _a, type _b) { body }((a), (b)) | ||
174 | #define _sse2neon_return(ret) return ret | ||
175 | #endif | ||
176 | |||
177 | #define _sse2neon_init(...) \ | ||
178 | { \ | ||
179 | __VA_ARGS__ \ | ||
180 | } | ||
181 | |||
182 | /* Compiler barrier */ | ||
183 | #if defined(_MSC_VER) | ||
184 | #define SSE2NEON_BARRIER() _ReadWriteBarrier() | ||
185 | #else | ||
186 | #define SSE2NEON_BARRIER() \ | ||
187 | do { \ | ||
188 | __asm__ __volatile__("" ::: "memory"); \ | ||
189 | (void) 0; \ | ||
190 | } while (0) | ||
191 | #endif | ||
192 | |||
193 | /* Memory barriers | ||
194 | * __atomic_thread_fence does not include a compiler barrier; instead, | ||
195 | * the barrier is part of __atomic_load/__atomic_store's "volatile-like" | ||
196 | * semantics. | ||
197 | */ | ||
198 | #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) | ||
199 | #include <stdatomic.h> | ||
200 | #endif | ||
201 | |||
202 | FORCE_INLINE void _sse2neon_smp_mb(void) | ||
203 | { | ||
204 | SSE2NEON_BARRIER(); | ||
205 | #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ | ||
206 | !defined(__STDC_NO_ATOMICS__) | ||
207 | atomic_thread_fence(memory_order_seq_cst); | ||
208 | #elif defined(__GNUC__) || defined(__clang__) | ||
209 | __atomic_thread_fence(__ATOMIC_SEQ_CST); | ||
210 | #else /* MSVC */ | ||
211 | __dmb(_ARM64_BARRIER_ISH); | ||
212 | #endif | ||
213 | } | ||
214 | |||
215 | /* Architecture-specific build options */ | ||
216 | /* FIXME: #pragma GCC push_options is only available on GCC */ | ||
217 | #if defined(__GNUC__) | ||
218 | #if defined(__arm__) && __ARM_ARCH == 7 | ||
219 | /* According to ARM C Language Extensions Architecture specification, | ||
220 | * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON) | ||
221 | * architecture supported. | ||
222 | */ | ||
223 | #if !defined(__ARM_NEON) || !defined(__ARM_NEON__) | ||
224 | #error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." | ||
225 | #endif | ||
226 | #if !defined(__clang__) | ||
227 | #pragma GCC push_options | ||
228 | #pragma GCC target("fpu=neon") | ||
229 | #endif | ||
230 | #elif defined(__aarch64__) || defined(_M_ARM64) | ||
231 | #if !defined(__clang__) && !defined(_MSC_VER) | ||
232 | #pragma GCC push_options | ||
233 | #pragma GCC target("+simd") | ||
234 | #endif | ||
235 | #elif __ARM_ARCH == 8 | ||
236 | #if !defined(__ARM_NEON) || !defined(__ARM_NEON__) | ||
237 | #error \ | ||
238 | "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON." | ||
239 | #endif | ||
240 | #if !defined(__clang__) && !defined(_MSC_VER) | ||
241 | #pragma GCC push_options | ||
242 | #endif | ||
243 | #else | ||
244 | #error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A." | ||
245 | #endif | ||
246 | #endif | ||
247 | |||
248 | #include <arm_neon.h> | ||
249 | #if (!defined(__aarch64__) && !defined(_M_ARM64)) && (__ARM_ARCH == 8) | ||
250 | #if defined __has_include && __has_include(<arm_acle.h>) | ||
251 | #include <arm_acle.h> | ||
252 | #endif | ||
253 | #endif | ||
254 | |||
255 | /* Apple Silicon cache lines are double of what is commonly used by Intel, AMD | ||
256 | * and other Arm microarchitectures use. | ||
257 | * From sysctl -a on Apple M1: | ||
258 | * hw.cachelinesize: 128 | ||
259 | */ | ||
260 | #if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__)) | ||
261 | #define SSE2NEON_CACHELINE_SIZE 128 | ||
262 | #else | ||
263 | #define SSE2NEON_CACHELINE_SIZE 64 | ||
264 | #endif | ||
265 | |||
266 | /* Rounding functions require either Aarch64 instructions or libm fallback */ | ||
267 | #if !defined(__aarch64__) && !defined(_M_ARM64) | ||
268 | #include <math.h> | ||
269 | #endif | ||
270 | |||
271 | /* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only | ||
272 | * or even not accessible in user mode. | ||
273 | * To write or access to these registers in user mode, | ||
274 | * we have to perform syscall instead. | ||
275 | */ | ||
276 | #if (!defined(__aarch64__) && !defined(_M_ARM64)) | ||
277 | #include <sys/time.h> | ||
278 | #endif | ||
279 | |||
280 | /* "__has_builtin" can be used to query support for built-in functions | ||
281 | * provided by gcc/clang and other compilers that support it. | ||
282 | */ | ||
283 | #ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */ | ||
284 | /* Compatibility with gcc <= 9 */ | ||
285 | #if defined(__GNUC__) && (__GNUC__ <= 9) | ||
286 | #define __has_builtin(x) HAS##x | ||
287 | #define HAS__builtin_popcount 1 | ||
288 | #define HAS__builtin_popcountll 1 | ||
289 | |||
290 | // __builtin_shuffle introduced in GCC 4.7.0 | ||
291 | #if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7)) | ||
292 | #define HAS__builtin_shuffle 1 | ||
293 | #else | ||
294 | #define HAS__builtin_shuffle 0 | ||
295 | #endif | ||
296 | |||
297 | #define HAS__builtin_shufflevector 0 | ||
298 | #define HAS__builtin_nontemporal_store 0 | ||
299 | #else | ||
300 | #define __has_builtin(x) 0 | ||
301 | #endif | ||
302 | #endif | ||
303 | |||
304 | /** | ||
305 | * MACRO for shuffle parameter for _mm_shuffle_ps(). | ||
306 | * Argument fp3 is a digit[0123] that represents the fp from argument "b" | ||
307 | * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same | ||
308 | * for fp2 in result. fp1 is a digit[0123] that represents the fp from | ||
309 | * argument "a" of mm_shuffle_ps that will be places in fp1 of result. | ||
310 | * fp0 is the same for fp0 of result. | ||
311 | */ | ||
312 | #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ | ||
313 | (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) | ||
314 | |||
315 | #if __has_builtin(__builtin_shufflevector) | ||
316 | #define _sse2neon_shuffle(type, a, b, ...) \ | ||
317 | __builtin_shufflevector(a, b, __VA_ARGS__) | ||
318 | #elif __has_builtin(__builtin_shuffle) | ||
319 | #define _sse2neon_shuffle(type, a, b, ...) \ | ||
320 | __extension__({ \ | ||
321 | type tmp = {__VA_ARGS__}; \ | ||
322 | __builtin_shuffle(a, b, tmp); \ | ||
323 | }) | ||
324 | #endif | ||
325 | |||
326 | #ifdef _sse2neon_shuffle | ||
327 | #define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__) | ||
328 | #define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__) | ||
329 | #define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__) | ||
330 | #define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__) | ||
331 | #define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__) | ||
332 | #define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__) | ||
333 | #endif | ||
334 | |||
335 | /* Rounding mode macros. */ | ||
336 | #define _MM_FROUND_TO_NEAREST_INT 0x00 | ||
337 | #define _MM_FROUND_TO_NEG_INF 0x01 | ||
338 | #define _MM_FROUND_TO_POS_INF 0x02 | ||
339 | #define _MM_FROUND_TO_ZERO 0x03 | ||
340 | #define _MM_FROUND_CUR_DIRECTION 0x04 | ||
341 | #define _MM_FROUND_NO_EXC 0x08 | ||
342 | #define _MM_FROUND_RAISE_EXC 0x00 | ||
343 | #define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) | ||
344 | #define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) | ||
345 | #define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) | ||
346 | #define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) | ||
347 | #define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) | ||
348 | #define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) | ||
349 | #define _MM_ROUND_NEAREST 0x0000 | ||
350 | #define _MM_ROUND_DOWN 0x2000 | ||
351 | #define _MM_ROUND_UP 0x4000 | ||
352 | #define _MM_ROUND_TOWARD_ZERO 0x6000 | ||
353 | /* Flush zero mode macros. */ | ||
354 | #define _MM_FLUSH_ZERO_MASK 0x8000 | ||
355 | #define _MM_FLUSH_ZERO_ON 0x8000 | ||
356 | #define _MM_FLUSH_ZERO_OFF 0x0000 | ||
357 | /* Denormals are zeros mode macros. */ | ||
358 | #define _MM_DENORMALS_ZERO_MASK 0x0040 | ||
359 | #define _MM_DENORMALS_ZERO_ON 0x0040 | ||
360 | #define _MM_DENORMALS_ZERO_OFF 0x0000 | ||
361 | |||
362 | /* indicate immediate constant argument in a given range */ | ||
363 | #define __constrange(a, b) const | ||
364 | |||
365 | /* A few intrinsics accept traditional data types like ints or floats, but | ||
366 | * most operate on data types that are specific to SSE. | ||
367 | * If a vector type ends in d, it contains doubles, and if it does not have | ||
368 | * a suffix, it contains floats. An integer vector type can contain any type | ||
369 | * of integer, from chars to shorts to unsigned long longs. | ||
370 | */ | ||
371 | typedef int64x1_t __m64; | ||
372 | typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ | ||
373 | // On ARM 32-bit architecture, the float64x2_t is not supported. | ||
374 | // The data type __m128d should be represented in a different way for related | ||
375 | // intrinsic conversion. | ||
376 | #if defined(__aarch64__) || defined(_M_ARM64) | ||
377 | typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ | ||
378 | #else | ||
379 | typedef float32x4_t __m128d; | ||
380 | #endif | ||
381 | typedef int64x2_t __m128i; /* 128-bit vector containing integers */ | ||
382 | |||
383 | // __int64 is defined in the Intrinsics Guide which maps to different datatype | ||
384 | // in different data model | ||
385 | #if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) | ||
386 | #if (defined(__x86_64__) || defined(__i386__)) | ||
387 | #define __int64 long long | ||
388 | #else | ||
389 | #define __int64 int64_t | ||
390 | #endif | ||
391 | #endif | ||
392 | |||
393 | /* type-safe casting between types */ | ||
394 | |||
395 | #define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) | ||
396 | #define vreinterpretq_m128_f32(x) (x) | ||
397 | #define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) | ||
398 | |||
399 | #define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) | ||
400 | #define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) | ||
401 | #define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) | ||
402 | #define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) | ||
403 | |||
404 | #define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) | ||
405 | #define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) | ||
406 | #define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) | ||
407 | #define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) | ||
408 | |||
409 | #define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) | ||
410 | #define vreinterpretq_f32_m128(x) (x) | ||
411 | #define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) | ||
412 | |||
413 | #define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) | ||
414 | #define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) | ||
415 | #define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) | ||
416 | #define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) | ||
417 | |||
418 | #define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) | ||
419 | #define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) | ||
420 | #define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) | ||
421 | #define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) | ||
422 | |||
423 | #define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) | ||
424 | #define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) | ||
425 | #define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) | ||
426 | #define vreinterpretq_m128i_s64(x) (x) | ||
427 | |||
428 | #define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) | ||
429 | #define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) | ||
430 | #define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) | ||
431 | #define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) | ||
432 | |||
433 | #define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x) | ||
434 | #define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x) | ||
435 | |||
436 | #define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) | ||
437 | #define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) | ||
438 | #define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) | ||
439 | #define vreinterpretq_s64_m128i(x) (x) | ||
440 | |||
441 | #define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) | ||
442 | #define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) | ||
443 | #define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) | ||
444 | #define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) | ||
445 | |||
446 | #define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) | ||
447 | #define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) | ||
448 | #define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) | ||
449 | #define vreinterpret_m64_s64(x) (x) | ||
450 | |||
451 | #define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) | ||
452 | #define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) | ||
453 | #define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) | ||
454 | #define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) | ||
455 | |||
456 | #define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) | ||
457 | #define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) | ||
458 | #define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) | ||
459 | |||
460 | #define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) | ||
461 | #define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x) | ||
462 | #define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x) | ||
463 | #define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) | ||
464 | |||
465 | #define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) | ||
466 | #define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) | ||
467 | #define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) | ||
468 | #define vreinterpret_s64_m64(x) (x) | ||
469 | |||
470 | #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) | ||
471 | |||
472 | #if defined(__aarch64__) || defined(_M_ARM64) | ||
473 | #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) | ||
474 | #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) | ||
475 | |||
476 | #define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x) | ||
477 | |||
478 | #define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x) | ||
479 | #define vreinterpretq_m128d_f64(x) (x) | ||
480 | |||
481 | #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) | ||
482 | |||
483 | #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x) | ||
484 | #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x) | ||
485 | |||
486 | #define vreinterpretq_f64_m128d(x) (x) | ||
487 | #define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x) | ||
488 | #else | ||
489 | #define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x) | ||
490 | #define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x) | ||
491 | |||
492 | #define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x) | ||
493 | #define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x) | ||
494 | |||
495 | #define vreinterpretq_m128d_f32(x) (x) | ||
496 | |||
497 | #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x) | ||
498 | |||
499 | #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x) | ||
500 | #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x) | ||
501 | |||
502 | #define vreinterpretq_f32_m128d(x) (x) | ||
503 | #endif | ||
504 | |||
505 | // A struct is defined in this header file called 'SIMDVec' which can be used | ||
506 | // by applications which attempt to access the contents of an __m128 struct | ||
507 | // directly. It is important to note that accessing the __m128 struct directly | ||
508 | // is bad coding practice by Microsoft: @see: | ||
509 | // https://learn.microsoft.com/en-us/cpp/cpp/m128 | ||
510 | // | ||
511 | // However, some legacy source code may try to access the contents of an __m128 | ||
512 | // struct directly so the developer can use the SIMDVec as an alias for it. Any | ||
513 | // casting must be done manually by the developer, as you cannot cast or | ||
514 | // otherwise alias the base NEON data type for intrinsic operations. | ||
515 | // | ||
516 | // union intended to allow direct access to an __m128 variable using the names | ||
517 | // that the MSVC compiler provides. This union should really only be used when | ||
518 | // trying to access the members of the vector as integer values. GCC/clang | ||
519 | // allow native access to the float members through a simple array access | ||
520 | // operator (in C since 4.6, in C++ since 4.8). | ||
521 | // | ||
522 | // Ideally direct accesses to SIMD vectors should not be used since it can cause | ||
523 | // a performance hit. If it really is needed however, the original __m128 | ||
524 | // variable can be aliased with a pointer to this union and used to access | ||
525 | // individual components. The use of this union should be hidden behind a macro | ||
526 | // that is used throughout the codebase to access the members instead of always | ||
527 | // declaring this type of variable. | ||
528 | typedef union ALIGN_STRUCT(16) SIMDVec { | ||
529 | float m128_f32[4]; // as floats - DON'T USE. Added for convenience. | ||
530 | int8_t m128_i8[16]; // as signed 8-bit integers. | ||
531 | int16_t m128_i16[8]; // as signed 16-bit integers. | ||
532 | int32_t m128_i32[4]; // as signed 32-bit integers. | ||
533 | int64_t m128_i64[2]; // as signed 64-bit integers. | ||
534 | uint8_t m128_u8[16]; // as unsigned 8-bit integers. | ||
535 | uint16_t m128_u16[8]; // as unsigned 16-bit integers. | ||
536 | uint32_t m128_u32[4]; // as unsigned 32-bit integers. | ||
537 | uint64_t m128_u64[2]; // as unsigned 64-bit integers. | ||
538 | } SIMDVec; | ||
539 | |||
540 | // casting using SIMDVec | ||
541 | #define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) | ||
542 | #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) | ||
543 | #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) | ||
544 | |||
545 | /* SSE macros */ | ||
546 | #define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode | ||
547 | #define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode | ||
548 | #define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode | ||
549 | #define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode | ||
550 | |||
551 | // Function declaration | ||
552 | // SSE | ||
553 | FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void); | ||
554 | FORCE_INLINE __m128 _mm_move_ss(__m128, __m128); | ||
555 | FORCE_INLINE __m128 _mm_or_ps(__m128, __m128); | ||
556 | FORCE_INLINE __m128 _mm_set_ps1(float); | ||
557 | FORCE_INLINE __m128 _mm_setzero_ps(void); | ||
558 | // SSE2 | ||
559 | FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i); | ||
560 | FORCE_INLINE __m128i _mm_castps_si128(__m128); | ||
561 | FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i); | ||
562 | FORCE_INLINE __m128i _mm_cvtps_epi32(__m128); | ||
563 | FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d); | ||
564 | FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i); | ||
565 | FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int); | ||
566 | FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t); | ||
567 | FORCE_INLINE __m128d _mm_set_pd(double, double); | ||
568 | FORCE_INLINE __m128i _mm_set1_epi32(int); | ||
569 | FORCE_INLINE __m128i _mm_setzero_si128(void); | ||
570 | // SSE4.1 | ||
571 | FORCE_INLINE __m128d _mm_ceil_pd(__m128d); | ||
572 | FORCE_INLINE __m128 _mm_ceil_ps(__m128); | ||
573 | FORCE_INLINE __m128d _mm_floor_pd(__m128d); | ||
574 | FORCE_INLINE __m128 _mm_floor_ps(__m128); | ||
575 | FORCE_INLINE __m128d _mm_round_pd(__m128d, int); | ||
576 | FORCE_INLINE __m128 _mm_round_ps(__m128, int); | ||
577 | // SSE4.2 | ||
578 | FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t); | ||
579 | |||
580 | /* Backwards compatibility for compilers with lack of specific type support */ | ||
581 | |||
582 | // Older gcc does not define vld1q_u8_x4 type | ||
583 | #if defined(__GNUC__) && !defined(__clang__) && \ | ||
584 | ((__GNUC__ <= 13 && defined(__arm__)) || \ | ||
585 | (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \ | ||
586 | (__GNUC__ <= 9 && defined(__aarch64__))) | ||
587 | FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) | ||
588 | { | ||
589 | uint8x16x4_t ret; | ||
590 | ret.val[0] = vld1q_u8(p + 0); | ||
591 | ret.val[1] = vld1q_u8(p + 16); | ||
592 | ret.val[2] = vld1q_u8(p + 32); | ||
593 | ret.val[3] = vld1q_u8(p + 48); | ||
594 | return ret; | ||
595 | } | ||
596 | #else | ||
597 | // Wraps vld1q_u8_x4 | ||
598 | FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) | ||
599 | { | ||
600 | return vld1q_u8_x4(p); | ||
601 | } | ||
602 | #endif | ||
603 | |||
604 | #if !defined(__aarch64__) && !defined(_M_ARM64) | ||
605 | /* emulate vaddv u8 variant */ | ||
606 | FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8) | ||
607 | { | ||
608 | const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8))); | ||
609 | return vget_lane_u8(vreinterpret_u8_u64(v1), 0); | ||
610 | } | ||
611 | #else | ||
612 | // Wraps vaddv_u8 | ||
613 | FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8) | ||
614 | { | ||
615 | return vaddv_u8(v8); | ||
616 | } | ||
617 | #endif | ||
618 | |||
619 | #if !defined(__aarch64__) && !defined(_M_ARM64) | ||
620 | /* emulate vaddvq u8 variant */ | ||
621 | FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a) | ||
622 | { | ||
623 | uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a)); | ||
624 | uint8_t res = 0; | ||
625 | for (int i = 0; i < 8; ++i) | ||
626 | res += tmp[i]; | ||
627 | return res; | ||
628 | } | ||
629 | #else | ||
630 | // Wraps vaddvq_u8 | ||
631 | FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a) | ||
632 | { | ||
633 | return vaddvq_u8(a); | ||
634 | } | ||
635 | #endif | ||
636 | |||
637 | #if !defined(__aarch64__) && !defined(_M_ARM64) | ||
638 | /* emulate vaddvq u16 variant */ | ||
639 | FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a) | ||
640 | { | ||
641 | uint32x4_t m = vpaddlq_u16(a); | ||
642 | uint64x2_t n = vpaddlq_u32(m); | ||
643 | uint64x1_t o = vget_low_u64(n) + vget_high_u64(n); | ||
644 | |||
645 | return vget_lane_u32((uint32x2_t) o, 0); | ||
646 | } | ||
647 | #else | ||
648 | // Wraps vaddvq_u16 | ||
649 | FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a) | ||
650 | { | ||
651 | return vaddvq_u16(a); | ||
652 | } | ||
653 | #endif | ||
654 | |||
655 | /* Function Naming Conventions | ||
656 | * The naming convention of SSE intrinsics is straightforward. A generic SSE | ||
657 | * intrinsic function is given as follows: | ||
658 | * _mm_<name>_<data_type> | ||
659 | * | ||
660 | * The parts of this format are given as follows: | ||
661 | * 1. <name> describes the operation performed by the intrinsic | ||
662 | * 2. <data_type> identifies the data type of the function's primary arguments | ||
663 | * | ||
664 | * This last part, <data_type>, is a little complicated. It identifies the | ||
665 | * content of the input values, and can be set to any of the following values: | ||
666 | * + ps - vectors contain floats (ps stands for packed single-precision) | ||
667 | * + pd - vectors contain doubles (pd stands for packed double-precision) | ||
668 | * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit | ||
669 | * signed integers | ||
670 | * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit | ||
671 | * unsigned integers | ||
672 | * + si128 - unspecified 128-bit vector or 256-bit vector | ||
673 | * + m128/m128i/m128d - identifies input vector types when they are different | ||
674 | * than the type of the returned vector | ||
675 | * | ||
676 | * For example, _mm_setzero_ps. The _mm implies that the function returns | ||
677 | * a 128-bit vector. The _ps at the end implies that the argument vectors | ||
678 | * contain floats. | ||
679 | * | ||
680 | * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8) | ||
681 | * // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits | ||
682 | * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); | ||
683 | * // Set packed 8-bit integers | ||
684 | * // 128 bits, 16 chars, per 8 bits | ||
685 | * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11, | ||
686 | * 4, 5, 12, 13, 6, 7, 14, 15); | ||
687 | * // Shuffle packed 8-bit integers | ||
688 | * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb | ||
689 | */ | ||
690 | |||
/* Constants for use with _mm_prefetch. Values mirror the x86 <xmmintrin.h>
 * definitions so code compiled against this header stays source-compatible. */
enum _mm_hint {
    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
};
698 | |||
// The bit field mapping to the FPCR (floating-point control register).
// Per the ARM architecture: bits [23:22] are RMode (rounding mode) and
// bit 24 is FZ (flush-to-zero); the res* fields are reserved padding.
typedef struct {
    uint16_t res0;
    uint8_t res1 : 6;
    uint8_t bit22 : 1; // RMode low bit
    uint8_t bit23 : 1; // RMode high bit
    uint8_t bit24 : 1; // FZ (flush-to-zero)
    uint8_t res2 : 7;
#if defined(__aarch64__) || defined(_M_ARM64)
    // AArch64's FPCR is accessed as a 64-bit register, hence the extra word.
    uint32_t res3;
#endif
} fpcr_bitfield;
711 | |||
712 | // Takes the upper 64 bits of a and places it in the low end of the result | ||
713 | // Takes the lower 64 bits of b and places it into the high end of the result. | ||
714 | FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) | ||
715 | { | ||
716 | float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); | ||
717 | float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); | ||
718 | return vreinterpretq_m128_f32(vcombine_f32(a32, b10)); | ||
719 | } | ||
720 | |||
721 | // takes the lower two 32-bit values from a and swaps them and places in high | ||
722 | // end of result takes the higher two 32 bit values from b and swaps them and | ||
723 | // places in low end of result. | ||
724 | FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) | ||
725 | { | ||
726 | float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); | ||
727 | float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); | ||
728 | return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); | ||
729 | } | ||
730 | |||
731 | FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) | ||
732 | { | ||
733 | float32x2_t a21 = vget_high_f32( | ||
734 | vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); | ||
735 | float32x2_t b03 = vget_low_f32( | ||
736 | vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); | ||
737 | return vreinterpretq_m128_f32(vcombine_f32(a21, b03)); | ||
738 | } | ||
739 | |||
740 | FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) | ||
741 | { | ||
742 | float32x2_t a03 = vget_low_f32( | ||
743 | vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); | ||
744 | float32x2_t b21 = vget_high_f32( | ||
745 | vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); | ||
746 | return vreinterpretq_m128_f32(vcombine_f32(a03, b21)); | ||
747 | } | ||
748 | |||
749 | FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) | ||
750 | { | ||
751 | float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); | ||
752 | float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); | ||
753 | return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); | ||
754 | } | ||
755 | |||
756 | FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) | ||
757 | { | ||
758 | float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); | ||
759 | float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); | ||
760 | return vreinterpretq_m128_f32(vcombine_f32(a01, b10)); | ||
761 | } | ||
762 | |||
763 | FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) | ||
764 | { | ||
765 | float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); | ||
766 | float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); | ||
767 | return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); | ||
768 | } | ||
769 | |||
770 | // keeps the low 64 bits of b in the low and puts the high 64 bits of a in the | ||
771 | // high | ||
772 | FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) | ||
773 | { | ||
774 | float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); | ||
775 | float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); | ||
776 | return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); | ||
777 | } | ||
778 | |||
779 | FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) | ||
780 | { | ||
781 | float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); | ||
782 | float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); | ||
783 | return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); | ||
784 | } | ||
785 | |||
786 | FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) | ||
787 | { | ||
788 | float32x2_t a22 = | ||
789 | vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); | ||
790 | float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); | ||
791 | return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); | ||
792 | } | ||
793 | |||
794 | FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) | ||
795 | { | ||
796 | float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); | ||
797 | float32x2_t b22 = | ||
798 | vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); | ||
799 | return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); | ||
800 | } | ||
801 | |||
802 | FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) | ||
803 | { | ||
804 | float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); | ||
805 | float32x2_t a22 = | ||
806 | vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); | ||
807 | float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ | ||
808 | float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); | ||
809 | return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); | ||
810 | } | ||
811 | |||
812 | FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) | ||
813 | { | ||
814 | float32x2_t a33 = | ||
815 | vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); | ||
816 | float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); | ||
817 | return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); | ||
818 | } | ||
819 | |||
820 | FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) | ||
821 | { | ||
822 | float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); | ||
823 | float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); | ||
824 | float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); | ||
825 | float32x2_t b20 = vset_lane_f32(b2, b00, 1); | ||
826 | return vreinterpretq_m128_f32(vcombine_f32(a10, b20)); | ||
827 | } | ||
828 | |||
829 | FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) | ||
830 | { | ||
831 | float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); | ||
832 | float32_t b2 = vgetq_lane_f32(b, 2); | ||
833 | float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); | ||
834 | float32x2_t b20 = vset_lane_f32(b2, b00, 1); | ||
835 | return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); | ||
836 | } | ||
837 | |||
838 | FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) | ||
839 | { | ||
840 | float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); | ||
841 | float32_t b2 = vgetq_lane_f32(b, 2); | ||
842 | float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); | ||
843 | float32x2_t b20 = vset_lane_f32(b2, b00, 1); | ||
844 | return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); | ||
845 | } | ||
846 | |||
// For MSVC, we check only if it is ARM64, as every single ARM64 processor
// supported by WoA has crypto extensions. If this changes in the future,
// this can be verified via the runtime-only method of:
// IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)
#if (defined(_M_ARM64) && !defined(__clang__)) || \
    (defined(__ARM_FEATURE_CRYPTO) && \
     (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64)))
// Wraps vmull_p64: 64x64 -> 128-bit carry-less (polynomial) multiply via the
// crypto extension's PMULL instruction.
FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
#if defined(_MSC_VER)
    // MSVC's vmull_p64 takes __n64 operands rather than poly64_t scalars.
    __n64 a1 = {a}, b1 = {b};
    return vreinterpretq_u64_p128(vmull_p64(a1, b1));
#else
    return vreinterpretq_u64_p128(vmull_p64(a, b));
#endif
}
#else // ARMv7 polyfill
// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
//
// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
// 64-bit->128-bit polynomial multiply.
//
// It needs some work and is somewhat slow, but it is still faster than all
// known scalar methods.
//
// Algorithm adapted to C from
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
// from "Fast Software Polynomial Multiplication on ARM Processors Using the
// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
// (https://hal.inria.fr/hal-01506572)
static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    poly8x8_t a = vreinterpret_p8_u64(_a);
    poly8x8_t b = vreinterpret_p8_u64(_b);

    // Masks used to zero the tails of the partial products below.
    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
                                    vcreate_u8(0x00000000ffffffff));
    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
                                    vcreate_u8(0x0000000000000000));

    // Do the multiplies, rotating with vext to get all combinations
    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
    uint8x16_t e =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
    uint8x16_t f =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
    uint8x16_t g =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
    uint8x16_t h =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
    uint8x16_t i =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
    uint8x16_t j =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
    uint8x16_t k =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // K = A0 * B4

    // Add cross products (addition in GF(2) is XOR)
    uint8x16_t l = veorq_u8(e, f);  // L = E + F
    uint8x16_t m = veorq_u8(g, h);  // M = G + H
    uint8x16_t n = veorq_u8(i, j);  // N = I + J

    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
    // instructions.
#if defined(__aarch64__)
    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
#else
    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
#endif
    // t0 = (L) (P0 + P1) << 8
    // t1 = (M) (P2 + P3) << 16
    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);

    // t2 = (N) (P4 + P5) << 24
    // t3 = (K) (P6 + P7) << 32
    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);

    // De-interleave
#if defined(__aarch64__)
    uint8x16_t t0 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t1 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t2 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
    uint8x16_t t3 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
#else
    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
#endif
    // Shift the cross products (byte-granular shifts expressed as vext)
    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32

    // Accumulate the products (XOR-fold everything into the final result)
    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
    uint8x16_t mix = veorq_u8(d, cross1);
    uint8x16_t r = veorq_u8(mix, cross2);
    return vreinterpretq_u64_u8(r);
}
#endif  // ARMv7 polyfill
972 | |||
// Generic PSHUFD emulation: each 2-bit field of imm selects a source lane.
// imm must be a compile-time constant (vgetq/vsetq_lane need immediates, and
// the shift/mask expressions constant-fold away).
//
// C equivalent:
//   __m128i _mm_shuffle_epi32_default(__m128i a,
//                                     __constrange(0, 255) int imm) {
//       __m128i ret;
//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
//       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
//       return ret;
//   }
#define _mm_shuffle_epi32_default(a, imm)                                   \
    vreinterpretq_m128i_s32(vsetq_lane_s32(                                 \
        vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3),     \
        vsetq_lane_s32(                                                     \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
            vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a),       \
                                          ((imm) >> 2) & 0x3),              \
                           vmovq_n_s32(vgetq_lane_s32(                      \
                               vreinterpretq_s32_m128i(a), (imm) & (0x3))), \
                           1),                                              \
            2),                                                             \
        3))
993 | |||
994 | // Takes the upper 64 bits of a and places it in the low end of the result | ||
995 | // Takes the lower 64 bits of a and places it into the high end of the result. | ||
996 | FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a) | ||
997 | { | ||
998 | int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); | ||
999 | int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); | ||
1000 | return vreinterpretq_m128i_s32(vcombine_s32(a32, a10)); | ||
1001 | } | ||
1002 | |||
1003 | // takes the lower two 32-bit values from a and swaps them and places in low end | ||
1004 | // of result takes the higher two 32 bit values from a and swaps them and places | ||
1005 | // in high end of result. | ||
1006 | FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a) | ||
1007 | { | ||
1008 | int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); | ||
1009 | int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a))); | ||
1010 | return vreinterpretq_m128i_s32(vcombine_s32(a01, a23)); | ||
1011 | } | ||
1012 | |||
1013 | // rotates the least significant 32 bits into the most significant 32 bits, and | ||
1014 | // shifts the rest down | ||
1015 | FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a) | ||
1016 | { | ||
1017 | return vreinterpretq_m128i_s32( | ||
1018 | vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1)); | ||
1019 | } | ||
1020 | |||
1021 | // rotates the most significant 32 bits into the least significant 32 bits, and | ||
1022 | // shifts the rest up | ||
1023 | FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) | ||
1024 | { | ||
1025 | return vreinterpretq_m128i_s32( | ||
1026 | vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3)); | ||
1027 | } | ||
1028 | |||
1029 | // gets the lower 64 bits of a, and places it in the upper 64 bits | ||
1030 | // gets the lower 64 bits of a and places it in the lower 64 bits | ||
1031 | FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) | ||
1032 | { | ||
1033 | int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); | ||
1034 | return vreinterpretq_m128i_s32(vcombine_s32(a10, a10)); | ||
1035 | } | ||
1036 | |||
1037 | // gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the | ||
1038 | // lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits | ||
1039 | FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) | ||
1040 | { | ||
1041 | int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); | ||
1042 | int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); | ||
1043 | return vreinterpretq_m128i_s32(vcombine_s32(a01, a10)); | ||
1044 | } | ||
1045 | |||
1046 | // gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the | ||
1047 | // upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and | ||
1048 | // places it in the lower 64 bits | ||
1049 | FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) | ||
1050 | { | ||
1051 | int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); | ||
1052 | return vreinterpretq_m128i_s32(vcombine_s32(a01, a01)); | ||
1053 | } | ||
1054 | |||
1055 | FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) | ||
1056 | { | ||
1057 | int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1); | ||
1058 | int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); | ||
1059 | return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); | ||
1060 | } | ||
1061 | |||
1062 | FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) | ||
1063 | { | ||
1064 | int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); | ||
1065 | int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); | ||
1066 | return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); | ||
1067 | } | ||
1068 | |||
1069 | FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) | ||
1070 | { | ||
1071 | int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); | ||
1072 | int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); | ||
1073 | return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); | ||
1074 | } | ||
1075 | |||
// Broadcast the 32-bit lane selected by imm (a compile-time constant) across
// all four lanes. AArch64 has a single-instruction lane dup; the ARMv7
// fallback extracts the lane to a scalar and re-broadcasts it.
#if defined(__aarch64__) || defined(_M_ARM64)
#define _mm_shuffle_epi32_splat(a, imm) \
    vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm)))
#else
#define _mm_shuffle_epi32_splat(a, imm) \
    vreinterpretq_m128i_s32(            \
        vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))))
#endif
1084 | |||
// NEON does not support a general purpose permute intrinsic.
// Shuffle single-precision (32-bit) floating-point elements in a using the
// control in imm8, and store the results in dst. imm must be a compile-time
// constant: each 2-bit field picks one source lane (low two fields from a,
// high two fields from b).
//
// C equivalent:
//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
//                                 __constrange(0, 255) int imm) {
//       __m128 ret;
//       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
//       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
//       return ret;
//   }
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
#define _mm_shuffle_ps_default(a, b, imm)                                  \
    vreinterpretq_m128_f32(vsetq_lane_f32(                                 \
        vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3),     \
        vsetq_lane_f32(                                                    \
            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
            vsetq_lane_f32(                                                \
                vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
                vmovq_n_f32(                                               \
                    vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))), \
                1),                                                        \
            2),                                                            \
        3))
1111 | |||
// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
// Store the results in the low 64 bits of dst, with the high 64 bits being
// copied from a to dst. Each 2-bit field of imm selects one of the four low
// source lanes; the four high lanes pass through unchanged.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
#define _mm_shufflelo_epi16_function(a, imm)                                  \
    _sse2neon_define1(                                                        \
        __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a);              \
        int16x4_t lowBits = vget_low_s16(ret);                                \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
                             1);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
                             2);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
                             3);                                              \
        _sse2neon_return(vreinterpretq_m128i_s16(ret));)
1128 | |||
// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
// Store the results in the high 64 bits of dst, with the low 64 bits being
// copied from a to dst. Each 2-bit field of imm selects one of the four high
// source lanes; the four low lanes pass through unchanged.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
#define _mm_shufflehi_epi16_function(a, imm)                                   \
    _sse2neon_define1(                                                         \
        __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a);               \
        int16x4_t highBits = vget_high_s16(ret);                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
                             5);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
                             6);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
                             7);                                               \
        _sse2neon_return(vreinterpretq_m128i_s16(ret));)
1145 | |||
1146 | /* MMX */ | ||
1147 | |||
// _mm_empty is a no-op on ARM: there is no shared MMX/x87 register state to
// clear, so the body is intentionally empty.
FORCE_INLINE void _mm_empty(void) {}
1150 | |||
1151 | /* SSE */ | ||
1152 | |||
1153 | // Add packed single-precision (32-bit) floating-point elements in a and b, and | ||
1154 | // store the results in dst. | ||
1155 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps | ||
1156 | FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) | ||
1157 | { | ||
1158 | return vreinterpretq_m128_f32( | ||
1159 | vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); | ||
1160 | } | ||
1161 | |||
1162 | // Add the lower single-precision (32-bit) floating-point element in a and b, | ||
1163 | // store the result in the lower element of dst, and copy the upper 3 packed | ||
1164 | // elements from a to the upper elements of dst. | ||
1165 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss | ||
1166 | FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) | ||
1167 | { | ||
1168 | float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); | ||
1169 | float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); | ||
1170 | // the upper values in the result must be the remnants of <a>. | ||
1171 | return vreinterpretq_m128_f32(vaddq_f32(a, value)); | ||
1172 | } | ||
1173 | |||
1174 | // Compute the bitwise AND of packed single-precision (32-bit) floating-point | ||
1175 | // elements in a and b, and store the results in dst. | ||
1176 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps | ||
1177 | FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) | ||
1178 | { | ||
1179 | return vreinterpretq_m128_s32( | ||
1180 | vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); | ||
1181 | } | ||
1182 | |||
1183 | // Compute the bitwise NOT of packed single-precision (32-bit) floating-point | ||
1184 | // elements in a and then AND with b, and store the results in dst. | ||
1185 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps | ||
1186 | FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) | ||
1187 | { | ||
1188 | return vreinterpretq_m128_s32( | ||
1189 | vbicq_s32(vreinterpretq_s32_m128(b), | ||
1190 | vreinterpretq_s32_m128(a))); // *NOTE* argument swap | ||
1191 | } | ||
1192 | |||
1193 | // Average packed unsigned 16-bit integers in a and b, and store the results in | ||
1194 | // dst. | ||
1195 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16 | ||
1196 | FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) | ||
1197 | { | ||
1198 | return vreinterpret_m64_u16( | ||
1199 | vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b))); | ||
1200 | } | ||
1201 | |||
1202 | // Average packed unsigned 8-bit integers in a and b, and store the results in | ||
1203 | // dst. | ||
1204 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8 | ||
1205 | FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) | ||
1206 | { | ||
1207 | return vreinterpret_m64_u8( | ||
1208 | vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); | ||
1209 | } | ||
1210 | |||
1211 | // Compare packed single-precision (32-bit) floating-point elements in a and b | ||
1212 | // for equality, and store the results in dst. | ||
1213 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps | ||
1214 | FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) | ||
1215 | { | ||
1216 | return vreinterpretq_m128_u32( | ||
1217 | vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); | ||
1218 | } | ||
1219 | |||
1220 | // Compare the lower single-precision (32-bit) floating-point elements in a and | ||
1221 | // b for equality, store the result in the lower element of dst, and copy the | ||
1222 | // upper 3 packed elements from a to the upper elements of dst. | ||
1223 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss | ||
1224 | FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) | ||
1225 | { | ||
1226 | return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); | ||
1227 | } | ||
1228 | |||
1229 | // Compare packed single-precision (32-bit) floating-point elements in a and b | ||
1230 | // for greater-than-or-equal, and store the results in dst. | ||
1231 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps | ||
1232 | FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) | ||
1233 | { | ||
1234 | return vreinterpretq_m128_u32( | ||
1235 | vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); | ||
1236 | } | ||
1237 | |||
1238 | // Compare the lower single-precision (32-bit) floating-point elements in a and | ||
1239 | // b for greater-than-or-equal, store the result in the lower element of dst, | ||
1240 | // and copy the upper 3 packed elements from a to the upper elements of dst. | ||
1241 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss | ||
1242 | FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) | ||
1243 | { | ||
1244 | return _mm_move_ss(a, _mm_cmpge_ps(a, b)); | ||
1245 | } | ||
1246 | |||
1247 | // Compare packed single-precision (32-bit) floating-point elements in a and b | ||
1248 | // for greater-than, and store the results in dst. | ||
1249 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps | ||
1250 | FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) | ||
1251 | { | ||
1252 | return vreinterpretq_m128_u32( | ||
1253 | vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); | ||
1254 | } | ||
1255 | |||
1256 | // Compare the lower single-precision (32-bit) floating-point elements in a and | ||
1257 | // b for greater-than, store the result in the lower element of dst, and copy | ||
1258 | // the upper 3 packed elements from a to the upper elements of dst. | ||
1259 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss | ||
1260 | FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) | ||
1261 | { | ||
1262 | return _mm_move_ss(a, _mm_cmpgt_ps(a, b)); | ||
1263 | } | ||
1264 | |||
1265 | // Compare packed single-precision (32-bit) floating-point elements in a and b | ||
1266 | // for less-than-or-equal, and store the results in dst. | ||
1267 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps | ||
1268 | FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) | ||
1269 | { | ||
1270 | return vreinterpretq_m128_u32( | ||
1271 | vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); | ||
1272 | } | ||
1273 | |||
1274 | // Compare the lower single-precision (32-bit) floating-point elements in a and | ||
1275 | // b for less-than-or-equal, store the result in the lower element of dst, and | ||
1276 | // copy the upper 3 packed elements from a to the upper elements of dst. | ||
1277 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss | ||
1278 | FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) | ||
1279 | { | ||
1280 | return _mm_move_ss(a, _mm_cmple_ps(a, b)); | ||
1281 | } | ||
1282 | |||
1283 | // Compare packed single-precision (32-bit) floating-point elements in a and b | ||
1284 | // for less-than, and store the results in dst. | ||
1285 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps | ||
1286 | FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) | ||
1287 | { | ||
1288 | return vreinterpretq_m128_u32( | ||
1289 | vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); | ||
1290 | } | ||
1291 | |||
1292 | // Compare the lower single-precision (32-bit) floating-point elements in a and | ||
1293 | // b for less-than, store the result in the lower element of dst, and copy the | ||
1294 | // upper 3 packed elements from a to the upper elements of dst. | ||
1295 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss | ||
1296 | FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) | ||
1297 | { | ||
1298 | return _mm_move_ss(a, _mm_cmplt_ps(a, b)); | ||
1299 | } | ||
1300 | |||
1301 | // Compare packed single-precision (32-bit) floating-point elements in a and b | ||
1302 | // for not-equal, and store the results in dst. | ||
1303 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps | ||
1304 | FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) | ||
1305 | { | ||
1306 | return vreinterpretq_m128_u32(vmvnq_u32( | ||
1307 | vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); | ||
1308 | } | ||
1309 | |||
1310 | // Compare the lower single-precision (32-bit) floating-point elements in a and | ||
1311 | // b for not-equal, store the result in the lower element of dst, and copy the | ||
1312 | // upper 3 packed elements from a to the upper elements of dst. | ||
1313 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss | ||
1314 | FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) | ||
1315 | { | ||
1316 | return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); | ||
1317 | } | ||
1318 | |||
1319 | // Compare packed single-precision (32-bit) floating-point elements in a and b | ||
1320 | // for not-greater-than-or-equal, and store the results in dst. | ||
1321 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps | ||
1322 | FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) | ||
1323 | { | ||
1324 | return vreinterpretq_m128_u32(vmvnq_u32( | ||
1325 | vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); | ||
1326 | } | ||
1327 | |||
1328 | // Compare the lower single-precision (32-bit) floating-point elements in a and | ||
1329 | // b for not-greater-than-or-equal, store the result in the lower element of | ||
1330 | // dst, and copy the upper 3 packed elements from a to the upper elements of | ||
1331 | // dst. | ||
1332 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss | ||
1333 | FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) | ||
1334 | { | ||
1335 | return _mm_move_ss(a, _mm_cmpnge_ps(a, b)); | ||
1336 | } | ||
1337 | |||
1338 | // Compare packed single-precision (32-bit) floating-point elements in a and b | ||
1339 | // for not-greater-than, and store the results in dst. | ||
1340 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps | ||
1341 | FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) | ||
1342 | { | ||
1343 | return vreinterpretq_m128_u32(vmvnq_u32( | ||
1344 | vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); | ||
1345 | } | ||
1346 | |||
1347 | // Compare the lower single-precision (32-bit) floating-point elements in a and | ||
1348 | // b for not-greater-than, store the result in the lower element of dst, and | ||
1349 | // copy the upper 3 packed elements from a to the upper elements of dst. | ||
1350 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss | ||
1351 | FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) | ||
1352 | { | ||
1353 | return _mm_move_ss(a, _mm_cmpngt_ps(a, b)); | ||
1354 | } | ||
1355 | |||
1356 | // Compare packed single-precision (32-bit) floating-point elements in a and b | ||
1357 | // for not-less-than-or-equal, and store the results in dst. | ||
1358 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps | ||
1359 | FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) | ||
1360 | { | ||
1361 | return vreinterpretq_m128_u32(vmvnq_u32( | ||
1362 | vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); | ||
1363 | } | ||
1364 | |||
1365 | // Compare the lower single-precision (32-bit) floating-point elements in a and | ||
1366 | // b for not-less-than-or-equal, store the result in the lower element of dst, | ||
1367 | // and copy the upper 3 packed elements from a to the upper elements of dst. | ||
1368 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss | ||
1369 | FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) | ||
1370 | { | ||
1371 | return _mm_move_ss(a, _mm_cmpnle_ps(a, b)); | ||
1372 | } | ||
1373 | |||
1374 | // Compare packed single-precision (32-bit) floating-point elements in a and b | ||
1375 | // for not-less-than, and store the results in dst. | ||
1376 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps | ||
1377 | FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) | ||
1378 | { | ||
1379 | return vreinterpretq_m128_u32(vmvnq_u32( | ||
1380 | vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); | ||
1381 | } | ||
1382 | |||
1383 | // Compare the lower single-precision (32-bit) floating-point elements in a and | ||
1384 | // b for not-less-than, store the result in the lower element of dst, and copy | ||
1385 | // the upper 3 packed elements from a to the upper elements of dst. | ||
1386 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss | ||
1387 | FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) | ||
1388 | { | ||
1389 | return _mm_move_ss(a, _mm_cmpnlt_ps(a, b)); | ||
1390 | } | ||
1391 | |||
// Compare packed single-precision (32-bit) floating-point elements in a and b
// to see if neither is NaN, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps
//
// See also:
// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
{
    // Note: NEON does not have ordered compare builtin
    // Need to compare a eq a and b eq b to check for NaN
    // (x == x yields an all-zero lane only when x is NaN)
    // Do AND of results to get final
    uint32x4_t ceqaa =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t ceqbb =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
}
1410 | |||
1411 | // Compare the lower single-precision (32-bit) floating-point elements in a and | ||
1412 | // b to see if neither is NaN, store the result in the lower element of dst, and | ||
1413 | // copy the upper 3 packed elements from a to the upper elements of dst. | ||
1414 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss | ||
1415 | FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) | ||
1416 | { | ||
1417 | return _mm_move_ss(a, _mm_cmpord_ps(a, b)); | ||
1418 | } | ||
1419 | |||
// Compare packed single-precision (32-bit) floating-point elements in a and b
// to see if either is NaN, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps
FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
{
    // x == x yields an all-zero lane only when x is NaN; the inverted AND is
    // therefore all-ones when either input lane is NaN.
    uint32x4_t f32a =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t f32b =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
}
1431 | |||
1432 | // Compare the lower single-precision (32-bit) floating-point elements in a and | ||
1433 | // b to see if either is NaN, store the result in the lower element of dst, and | ||
1434 | // copy the upper 3 packed elements from a to the upper elements of dst. | ||
1435 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss | ||
1436 | FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) | ||
1437 | { | ||
1438 | return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); | ||
1439 | } | ||
1440 | |||
// Compare the lower single-precision (32-bit) floating-point element in a and b
// for equality, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss
// NOTE(review): vceqq_f32 produces an all-zero lane when either operand is
// NaN, so this returns 0 for unordered inputs — confirm callers do not depend
// on x86 COMISS flag behavior for NaN.
FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
{
    uint32x4_t a_eq_b =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    // Lane 0 is all-ones on a match; mask down to a 0/1 boolean.
    return vgetq_lane_u32(a_eq_b, 0) & 0x1;
}
1450 | |||
// Compare the lower single-precision (32-bit) floating-point element in a and b
// for greater-than-or-equal, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss
FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
{
    uint32x4_t a_ge_b =
        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    // Lane 0 is all-ones when a >= b; mask down to a 0/1 boolean.
    return vgetq_lane_u32(a_ge_b, 0) & 0x1;
}
1460 | |||
// Compare the lower single-precision (32-bit) floating-point element in a and b
// for greater-than, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss
FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
{
    uint32x4_t a_gt_b =
        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    // Lane 0 is all-ones when a > b; mask down to a 0/1 boolean.
    return vgetq_lane_u32(a_gt_b, 0) & 0x1;
}
1470 | |||
// Compare the lower single-precision (32-bit) floating-point element in a and b
// for less-than-or-equal, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss
FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
{
    uint32x4_t a_le_b =
        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    // Lane 0 is all-ones when a <= b; mask down to a 0/1 boolean.
    return vgetq_lane_u32(a_le_b, 0) & 0x1;
}
1480 | |||
// Compare the lower single-precision (32-bit) floating-point element in a and b
// for less-than, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss
FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
{
    uint32x4_t a_lt_b =
        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    // Lane 0 is all-ones when a < b; mask down to a 0/1 boolean.
    return vgetq_lane_u32(a_lt_b, 0) & 0x1;
}
1490 | |||
// Compare the lower single-precision (32-bit) floating-point element in a and b
// for not-equal, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss
// NOTE(review): defined as the logical negation of _mm_comieq_ss, so NaN
// inputs yield 1 here — confirm this matches callers' expectations, since x86
// COMISS treats unordered operands specially.
FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
{
    return !_mm_comieq_ss(a, b);
}
1498 | |||
1499 | // Convert packed signed 32-bit integers in b to packed single-precision | ||
1500 | // (32-bit) floating-point elements, store the results in the lower 2 elements | ||
1501 | // of dst, and copy the upper 2 packed elements from a to the upper elements of | ||
1502 | // dst. | ||
1503 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps | ||
1504 | FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) | ||
1505 | { | ||
1506 | return vreinterpretq_m128_f32( | ||
1507 | vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), | ||
1508 | vget_high_f32(vreinterpretq_f32_m128(a)))); | ||
1509 | } | ||
1510 | |||
// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_DIRECTED_ROUNDING)
    // vrndiq rounds to integral using the current FP rounding mode; the
    // subsequent conversion is then exact, and only the low two lanes are kept.
    return vreinterpret_m64_s32(
        vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
#else
    // Without directed-rounding support, round via _mm_round_ps first.
    return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
#endif
}
1525 | |||
1526 | // Convert the signed 32-bit integer b to a single-precision (32-bit) | ||
1527 | // floating-point element, store the result in the lower element of dst, and | ||
1528 | // copy the upper 3 packed elements from a to the upper elements of dst. | ||
1529 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss | ||
1530 | FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) | ||
1531 | { | ||
1532 | return vreinterpretq_m128_f32( | ||
1533 | vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); | ||
1534 | } | ||
1535 | |||
// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_DIRECTED_ROUNDING)
    // Round to integral in the current FP rounding mode, then extract lane 0.
    return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
                          0);
#else
    // Fallback: round via _mm_round_ps, then truncate the already-integral
    // value with a plain cast.
    float32_t data = vgetq_lane_f32(
        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
    return (int32_t) data;
#endif
}
1551 | |||
1552 | // Convert packed 16-bit integers in a to packed single-precision (32-bit) | ||
1553 | // floating-point elements, and store the results in dst. | ||
1554 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps | ||
1555 | FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) | ||
1556 | { | ||
1557 | return vreinterpretq_m128_f32( | ||
1558 | vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a)))); | ||
1559 | } | ||
1560 | |||
1561 | // Convert packed 32-bit integers in b to packed single-precision (32-bit) | ||
1562 | // floating-point elements, store the results in the lower 2 elements of dst, | ||
1563 | // and copy the upper 2 packed elements from a to the upper elements of dst. | ||
1564 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps | ||
1565 | FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) | ||
1566 | { | ||
1567 | return vreinterpretq_m128_f32( | ||
1568 | vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), | ||
1569 | vget_high_f32(vreinterpretq_f32_m128(a)))); | ||
1570 | } | ||
1571 | |||
1572 | // Convert packed signed 32-bit integers in a to packed single-precision | ||
1573 | // (32-bit) floating-point elements, store the results in the lower 2 elements | ||
1574 | // of dst, then convert the packed signed 32-bit integers in b to | ||
1575 | // single-precision (32-bit) floating-point element, and store the results in | ||
1576 | // the upper 2 elements of dst. | ||
1577 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps | ||
1578 | FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) | ||
1579 | { | ||
1580 | return vreinterpretq_m128_f32(vcvtq_f32_s32( | ||
1581 | vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); | ||
1582 | } | ||
1583 | |||
1584 | // Convert the lower packed 8-bit integers in a to packed single-precision | ||
1585 | // (32-bit) floating-point elements, and store the results in dst. | ||
1586 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps | ||
1587 | FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) | ||
1588 | { | ||
1589 | return vreinterpretq_m128_f32(vcvtq_f32_s32( | ||
1590 | vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); | ||
1591 | } | ||
1592 | |||
1593 | // Convert packed single-precision (32-bit) floating-point elements in a to | ||
1594 | // packed 16-bit integers, and store the results in dst. Note: this intrinsic | ||
1595 | // will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and | ||
1596 | // 0x7FFFFFFF. | ||
1597 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16 | ||
1598 | FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) | ||
1599 | { | ||
1600 | return vreinterpret_m64_s16( | ||
1601 | vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a)))); | ||
1602 | } | ||
1603 | |||
// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
// Alias of _mm_cvt_ps2pi (same SSE operation, older naming).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32
#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
1608 | |||
1609 | // Convert packed single-precision (32-bit) floating-point elements in a to | ||
1610 | // packed 8-bit integers, and store the results in lower 4 elements of dst. | ||
1611 | // Note: this intrinsic will generate 0x7F, rather than 0x80, for input values | ||
1612 | // between 0x7F and 0x7FFFFFFF. | ||
1613 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8 | ||
1614 | FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a) | ||
1615 | { | ||
1616 | return vreinterpret_m64_s8(vqmovn_s16( | ||
1617 | vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0)))); | ||
1618 | } | ||
1619 | |||
1620 | // Convert packed unsigned 16-bit integers in a to packed single-precision | ||
1621 | // (32-bit) floating-point elements, and store the results in dst. | ||
1622 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps | ||
1623 | FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) | ||
1624 | { | ||
1625 | return vreinterpretq_m128_f32( | ||
1626 | vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); | ||
1627 | } | ||
1628 | |||
1629 | // Convert the lower packed unsigned 8-bit integers in a to packed | ||
1630 | // single-precision (32-bit) floating-point elements, and store the results in | ||
1631 | // dst. | ||
1632 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps | ||
1633 | FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) | ||
1634 | { | ||
1635 | return vreinterpretq_m128_f32(vcvtq_f32_u32( | ||
1636 | vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); | ||
1637 | } | ||
1638 | |||
// Convert the signed 32-bit integer b to a single-precision (32-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
// Alias of _mm_cvt_si2ss (same SSE operation, older naming).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss
#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
1644 | |||
1645 | // Convert the signed 64-bit integer b to a single-precision (32-bit) | ||
1646 | // floating-point element, store the result in the lower element of dst, and | ||
1647 | // copy the upper 3 packed elements from a to the upper elements of dst. | ||
1648 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss | ||
1649 | FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b) | ||
1650 | { | ||
1651 | return vreinterpretq_m128_f32( | ||
1652 | vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); | ||
1653 | } | ||
1654 | |||
1655 | // Copy the lower single-precision (32-bit) floating-point element of a to dst. | ||
1656 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32 | ||
1657 | FORCE_INLINE float _mm_cvtss_f32(__m128 a) | ||
1658 | { | ||
1659 | return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); | ||
1660 | } | ||
1661 | |||
// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer, and store the result in dst.
// Alias of _mm_cvt_ss2si (same SSE operation, older naming).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32
#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
1666 | |||
// Convert the lower single-precision (32-bit) floating-point element in a to a
// 64-bit integer, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_DIRECTED_ROUNDING)
    // Round lane 0 to integral in the current FP rounding mode, then widen.
    return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
#else
    // Fallback: round via _mm_round_ps, then cast the integral value.
    float32_t data = vgetq_lane_f32(
        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
    return (int64_t) data;
#endif
}
1681 | |||
1682 | // Convert packed single-precision (32-bit) floating-point elements in a to | ||
1683 | // packed 32-bit integers with truncation, and store the results in dst. | ||
1684 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi | ||
1685 | FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a) | ||
1686 | { | ||
1687 | return vreinterpret_m64_s32( | ||
1688 | vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))); | ||
1689 | } | ||
1690 | |||
1691 | // Convert the lower single-precision (32-bit) floating-point element in a to a | ||
1692 | // 32-bit integer with truncation, and store the result in dst. | ||
1693 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si | ||
1694 | FORCE_INLINE int _mm_cvtt_ss2si(__m128 a) | ||
1695 | { | ||
1696 | return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0); | ||
1697 | } | ||
1698 | |||
// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers with truncation, and store the results in dst.
// Alias of _mm_cvtt_ps2pi (same SSE operation, older naming).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
1703 | |||
// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer with truncation, and store the result in dst.
// Alias of _mm_cvtt_ss2si (same SSE operation, older naming).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
1708 | |||
1709 | // Convert the lower single-precision (32-bit) floating-point element in a to a | ||
1710 | // 64-bit integer with truncation, and store the result in dst. | ||
1711 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64 | ||
1712 | FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a) | ||
1713 | { | ||
1714 | return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); | ||
1715 | } | ||
1716 | |||
// Divide packed single-precision (32-bit) floating-point elements in a by
// packed elements in b, and store the results in dst.
// Due to ARMv7-A NEON's lack of a precise division intrinsic, we implement
// division by multiplying a by b's reciprocal before using the Newton-Raphson
// method to approximate the results.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) && !SSE2NEON_PRECISE_DIV
    // AArch64 provides a true vector divide instruction.
    return vreinterpretq_m128_f32(
        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    // Reciprocal estimate, refined twice with vrecpsq (Newton-Raphson step).
    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
    // Additional Newton-Raphson iteration for accuracy
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
#endif
}
1736 | |||
1737 | // Divide the lower single-precision (32-bit) floating-point element in a by the | ||
1738 | // lower single-precision (32-bit) floating-point element in b, store the result | ||
1739 | // in the lower element of dst, and copy the upper 3 packed elements from a to | ||
1740 | // the upper elements of dst. | ||
1741 | // Warning: ARMv7-A does not produce the same result compared to Intel and not | ||
1742 | // IEEE-compliant. | ||
1743 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss | ||
1744 | FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) | ||
1745 | { | ||
1746 | float32_t value = | ||
1747 | vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0); | ||
1748 | return vreinterpretq_m128_f32( | ||
1749 | vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); | ||
1750 | } | ||
1751 | |||
// Extract a 16-bit integer from a, selected with imm8, and store the result in
// the lower element of dst. imm must be a compile-time constant lane index
// (0-3); the extracted lane is zero-extended into the int result.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
#define _mm_extract_pi16(a, imm) \
    (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
1757 | |||
// Free aligned memory that was allocated with _mm_malloc.
// Only defined when the user has not supplied their own allocator via
// SSE2NEON_ALLOC_DEFINED; this default pairs with a plain malloc-based
// _mm_malloc, so free() is sufficient.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
#if !defined(SSE2NEON_ALLOC_DEFINED)
FORCE_INLINE void _mm_free(void *addr)
{
    free(addr);
}
#endif
1766 | |||
// Read the AArch64 FPCR (floating-point control register), which holds the
// rounding-mode and flush-to-zero bits consulted by the MXCSR emulation below.
FORCE_INLINE uint64_t _sse2neon_get_fpcr(void)
{
    uint64_t value;
#if defined(_MSC_VER)
    // MSVC has no inline asm on ARM64; use the compiler intrinsic instead.
    value = _ReadStatusReg(ARM64_FPCR);
#else
    __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */
#endif
    return value;
}
1777 | |||
// Write the AArch64 FPCR (floating-point control register); counterpart to
// _sse2neon_get_fpcr, used to change rounding/flush-to-zero behavior.
FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
{
#if defined(_MSC_VER)
    // MSVC has no inline asm on ARM64; use the compiler intrinsic instead.
    _WriteStatusReg(ARM64_FPCR, value);
#else
    __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */
#endif
}
1786 | |||
// Macro: Get the flush zero bits from the MXCSR control and status register.
// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
// _MM_FLUSH_ZERO_OFF
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
{
    // Overlay the raw control-register value with the fpcr_bitfield view so
    // individual bits can be inspected by name.
    union {
        fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
        uint64_t value;  // AArch64 FPCR is a 64-bit system register
#else
        uint32_t value;  // ARMv7 FPSCR is 32-bit
#endif
    } r;

#if defined(__aarch64__) || defined(_M_ARM64)
    r.value = _sse2neon_get_fpcr();
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    // bit24 is the flush-to-zero (FZ) control bit.
    return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
}
1810 | |||
// Macro: Get the rounding mode bits from the MXCSR control and status register.
// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
{
    // Overlay the raw control-register value with the fpcr_bitfield view so
    // individual bits can be inspected by name.
    union {
        fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
        uint64_t value;  // AArch64 FPCR is a 64-bit system register
#else
        uint32_t value;  // ARMv7 FPSCR is 32-bit
#endif
    } r;

#if defined(__aarch64__) || defined(_M_ARM64)
    r.value = _sse2neon_get_fpcr();
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    // bits 23:22 are the RMode field; map the four encodings to SSE constants.
    if (r.field.bit22) {
        return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
    } else {
        return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
    }
}
1838 | |||
// Copy a to dst, and insert the 16-bit integer i into dst at the location
// specified by imm8. imm must be a compile-time constant lane index (0-3).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
#define _mm_insert_pi16(a, b, imm) \
    vreinterpret_m64_s16(vset_lane_s16((b), vreinterpret_s16_m64(a), (imm)))
1844 | |||
1845 | // Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point | ||
1846 | // elements) from memory into dst. mem_addr must be aligned on a 16-byte | ||
1847 | // boundary or a general-protection exception may be generated. | ||
1848 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps | ||
1849 | FORCE_INLINE __m128 _mm_load_ps(const float *p) | ||
1850 | { | ||
1851 | return vreinterpretq_m128_f32(vld1q_f32(p)); | ||
1852 | } | ||
1853 | |||
// Load a single-precision (32-bit) floating-point element from memory into all
// elements of dst.
//
//   dst[31:0] := MEM[mem_addr+31:mem_addr]
//   dst[63:32] := MEM[mem_addr+31:mem_addr]
//   dst[95:64] := MEM[mem_addr+31:mem_addr]
//   dst[127:96] := MEM[mem_addr+31:mem_addr]
//
// Alias of _mm_load1_ps (identical broadcast-load semantics).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
#define _mm_load_ps1 _mm_load1_ps
1864 | |||
1865 | // Load a single-precision (32-bit) floating-point element from memory into the | ||
1866 | // lower of dst, and zero the upper 3 elements. mem_addr does not need to be | ||
1867 | // aligned on any particular boundary. | ||
1868 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss | ||
1869 | FORCE_INLINE __m128 _mm_load_ss(const float *p) | ||
1870 | { | ||
1871 | return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0)); | ||
1872 | } | ||
1873 | |||
1874 | // Load a single-precision (32-bit) floating-point element from memory into all | ||
1875 | // elements of dst. | ||
1876 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps | ||
1877 | FORCE_INLINE __m128 _mm_load1_ps(const float *p) | ||
1878 | { | ||
1879 | return vreinterpretq_m128_f32(vld1q_dup_f32(p)); | ||
1880 | } | ||
1881 | |||
// Load 2 single-precision (32-bit) floating-point elements from memory into the
// upper 2 elements of dst, and copy the lower 2 elements from a to dst.
// mem_addr does not need to be aligned on any particular boundary.
// NOTE(review): passes `a` directly to vget_low_f32, which presumably relies
// on __m128 being a typedef of float32x4_t in this build — confirm against the
// type definitions earlier in the header.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi
FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
{
    return vreinterpretq_m128_f32(
        vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
}
1891 | |||
1892 | // Load 2 single-precision (32-bit) floating-point elements from memory into the | ||
1893 | // lower 2 elements of dst, and copy the upper 2 elements from a to dst. | ||
1894 | // mem_addr does not need to be aligned on any particular boundary. | ||
1895 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi | ||
1896 | FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) | ||
1897 | { | ||
1898 | return vreinterpretq_m128_f32( | ||
1899 | vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); | ||
1900 | } | ||
1901 | |||
1902 | // Load 4 single-precision (32-bit) floating-point elements from memory into dst | ||
1903 | // in reverse order. mem_addr must be aligned on a 16-byte boundary or a | ||
1904 | // general-protection exception may be generated. | ||
1905 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps | ||
1906 | FORCE_INLINE __m128 _mm_loadr_ps(const float *p) | ||
1907 | { | ||
1908 | float32x4_t v = vrev64q_f32(vld1q_f32(p)); | ||
1909 | return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); | ||
1910 | } | ||
1911 | |||
1912 | // Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point | ||
1913 | // elements) from memory into dst. mem_addr does not need to be aligned on any | ||
1914 | // particular boundary. | ||
1915 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps | ||
1916 | FORCE_INLINE __m128 _mm_loadu_ps(const float *p) | ||
1917 | { | ||
1918 | // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are | ||
1919 | // equivalent for neon | ||
1920 | return vreinterpretq_m128_f32(vld1q_f32(p)); | ||
1921 | } | ||
1922 | |||
1923 | // Load unaligned 16-bit integer from memory into the first element of dst. | ||
1924 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16 | ||
1925 | FORCE_INLINE __m128i _mm_loadu_si16(const void *p) | ||
1926 | { | ||
1927 | return vreinterpretq_m128i_s16( | ||
1928 | vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0)); | ||
1929 | } | ||
1930 | |||
1931 | // Load unaligned 64-bit integer from memory into the first element of dst. | ||
1932 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64 | ||
1933 | FORCE_INLINE __m128i _mm_loadu_si64(const void *p) | ||
1934 | { | ||
1935 | return vreinterpretq_m128i_s64( | ||
1936 | vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0))); | ||
1937 | } | ||
1938 | |||
1939 | // Allocate size bytes of memory, aligned to the alignment specified in align, | ||
1940 | // and return a pointer to the allocated memory. _mm_free should be used to free | ||
1941 | // memory that is allocated with _mm_malloc. | ||
1942 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc | ||
1943 | #if !defined(SSE2NEON_ALLOC_DEFINED) | ||
1944 | FORCE_INLINE void *_mm_malloc(size_t size, size_t align) | ||
1945 | { | ||
1946 | void *ptr; | ||
1947 | if (align == 1) | ||
1948 | return malloc(size); | ||
1949 | if (align == 2 || (sizeof(void *) == 8 && align == 4)) | ||
1950 | align = sizeof(void *); | ||
1951 | if (!posix_memalign(&ptr, align, size)) | ||
1952 | return ptr; | ||
1953 | return NULL; | ||
1954 | } | ||
1955 | #endif | ||
1956 | |||
1957 | // Conditionally store 8-bit integer elements from a into memory using mask | ||
1958 | // (elements are not stored when the highest bit is not set in the corresponding | ||
1959 | // element) and a non-temporal memory hint. | ||
1960 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64 | ||
1961 | FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr) | ||
1962 | { | ||
1963 | int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7); | ||
1964 | __m128 b = _mm_load_ps((const float *) mem_addr); | ||
1965 | int8x8_t masked = | ||
1966 | vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a), | ||
1967 | vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b)))); | ||
1968 | vst1_s8((int8_t *) mem_addr, masked); | ||
1969 | } | ||
1970 | |||
// Conditionally store 8-bit integer elements from a into memory using mask
// (elements are not stored when the highest bit is not set in the corresponding
// element) and a non-temporal memory hint.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq
// MMX-era alias of _mm_maskmove_si64.
#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
1976 | |||
1977 | // Compare packed signed 16-bit integers in a and b, and store packed maximum | ||
1978 | // values in dst. | ||
1979 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16 | ||
1980 | FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) | ||
1981 | { | ||
1982 | return vreinterpret_m64_s16( | ||
1983 | vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); | ||
1984 | } | ||
1985 | |||
1986 | // Compare packed single-precision (32-bit) floating-point elements in a and b, | ||
1987 | // and store packed maximum values in dst. dst does not follow the IEEE Standard | ||
1988 | // for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or | ||
1989 | // signed-zero values. | ||
1990 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps | ||
1991 | FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) | ||
1992 | { | ||
1993 | #if SSE2NEON_PRECISE_MINMAX | ||
1994 | float32x4_t _a = vreinterpretq_f32_m128(a); | ||
1995 | float32x4_t _b = vreinterpretq_f32_m128(b); | ||
1996 | return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b)); | ||
1997 | #else | ||
1998 | return vreinterpretq_m128_f32( | ||
1999 | vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); | ||
2000 | #endif | ||
2001 | } | ||
2002 | |||
2003 | // Compare packed unsigned 8-bit integers in a and b, and store packed maximum | ||
2004 | // values in dst. | ||
2005 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8 | ||
2006 | FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) | ||
2007 | { | ||
2008 | return vreinterpret_m64_u8( | ||
2009 | vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); | ||
2010 | } | ||
2011 | |||
2012 | // Compare the lower single-precision (32-bit) floating-point elements in a and | ||
2013 | // b, store the maximum value in the lower element of dst, and copy the upper 3 | ||
2014 | // packed elements from a to the upper element of dst. dst does not follow the | ||
2015 | // IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when | ||
2016 | // inputs are NaN or signed-zero values. | ||
2017 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss | ||
2018 | FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) | ||
2019 | { | ||
2020 | float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); | ||
2021 | return vreinterpretq_m128_f32( | ||
2022 | vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); | ||
2023 | } | ||
2024 | |||
2025 | // Compare packed signed 16-bit integers in a and b, and store packed minimum | ||
2026 | // values in dst. | ||
2027 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16 | ||
2028 | FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) | ||
2029 | { | ||
2030 | return vreinterpret_m64_s16( | ||
2031 | vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); | ||
2032 | } | ||
2033 | |||
2034 | // Compare packed single-precision (32-bit) floating-point elements in a and b, | ||
2035 | // and store packed minimum values in dst. dst does not follow the IEEE Standard | ||
2036 | // for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or | ||
2037 | // signed-zero values. | ||
2038 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps | ||
2039 | FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) | ||
2040 | { | ||
2041 | #if SSE2NEON_PRECISE_MINMAX | ||
2042 | float32x4_t _a = vreinterpretq_f32_m128(a); | ||
2043 | float32x4_t _b = vreinterpretq_f32_m128(b); | ||
2044 | return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b)); | ||
2045 | #else | ||
2046 | return vreinterpretq_m128_f32( | ||
2047 | vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); | ||
2048 | #endif | ||
2049 | } | ||
2050 | |||
2051 | // Compare packed unsigned 8-bit integers in a and b, and store packed minimum | ||
2052 | // values in dst. | ||
2053 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8 | ||
2054 | FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) | ||
2055 | { | ||
2056 | return vreinterpret_m64_u8( | ||
2057 | vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); | ||
2058 | } | ||
2059 | |||
2060 | // Compare the lower single-precision (32-bit) floating-point elements in a and | ||
2061 | // b, store the minimum value in the lower element of dst, and copy the upper 3 | ||
2062 | // packed elements from a to the upper element of dst. dst does not follow the | ||
2063 | // IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when | ||
2064 | // inputs are NaN or signed-zero values. | ||
2065 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss | ||
2066 | FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) | ||
2067 | { | ||
2068 | float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); | ||
2069 | return vreinterpretq_m128_f32( | ||
2070 | vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); | ||
2071 | } | ||
2072 | |||
2073 | // Move the lower single-precision (32-bit) floating-point element from b to the | ||
2074 | // lower element of dst, and copy the upper 3 packed elements from a to the | ||
2075 | // upper elements of dst. | ||
2076 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss | ||
2077 | FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) | ||
2078 | { | ||
2079 | return vreinterpretq_m128_f32( | ||
2080 | vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0), | ||
2081 | vreinterpretq_f32_m128(a), 0)); | ||
2082 | } | ||
2083 | |||
2084 | // Move the upper 2 single-precision (32-bit) floating-point elements from b to | ||
2085 | // the lower 2 elements of dst, and copy the upper 2 elements from a to the | ||
2086 | // upper 2 elements of dst. | ||
2087 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps | ||
2088 | FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b) | ||
2089 | { | ||
2090 | #if defined(aarch64__) | ||
2091 | return vreinterpretq_m128_u64( | ||
2092 | vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a))); | ||
2093 | #else | ||
2094 | float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); | ||
2095 | float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); | ||
2096 | return vreinterpretq_m128_f32(vcombine_f32(b32, a32)); | ||
2097 | #endif | ||
2098 | } | ||
2099 | |||
2100 | // Move the lower 2 single-precision (32-bit) floating-point elements from b to | ||
2101 | // the upper 2 elements of dst, and copy the lower 2 elements from a to the | ||
2102 | // lower 2 elements of dst. | ||
2103 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps | ||
2104 | FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) | ||
2105 | { | ||
2106 | float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A)); | ||
2107 | float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B)); | ||
2108 | return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); | ||
2109 | } | ||
2110 | |||
// Create mask from the most significant bit of each 8-bit element in a, and
// store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
FORCE_INLINE int _mm_movemask_pi8(__m64 a)
{
    uint8x8_t input = vreinterpret_u8_m64(a);
#if defined(__aarch64__) || defined(_M_ARM64)
    // Isolate each byte's MSB at bit 0, shift lane i's bit to position i,
    // then horizontally add to collapse the 8 bits into one scalar.
    static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    uint8x8_t tmp = vshr_n_u8(input, 7);
    return vaddv_u8(vshl_u8(tmp, vld1_s8(shift)));
#else
    // Refer the implementation of `_mm_movemask_epi8`
    // Pairwise shift-accumulate folds the 8 sign bits into two nibbles.
    uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
    uint32x2_t paired16 =
        vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
    uint8x8_t paired32 =
        vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
    // Low nibble sits in byte 0, high nibble in byte 4.
    return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
#endif
}
2131 | |||
// Set each bit of mask dst based on the most significant bit of the
// corresponding packed single-precision (32-bit) floating-point element in a.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
FORCE_INLINE int _mm_movemask_ps(__m128 a)
{
    uint32x4_t input = vreinterpretq_u32_m128(a);
#if defined(__aarch64__) || defined(_M_ARM64)
    // Move lane i's sign bit to bit position i, then horizontally add the
    // four lanes to form the 4-bit mask.
    static const int32_t shift[4] = {0, 1, 2, 3};
    uint32x4_t tmp = vshrq_n_u32(input, 31);
    return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)));
#else
    // Uses the exact same method as _mm_movemask_epi8, see that for details.
    // Shift out everything but the sign bits with a 32-bit unsigned shift
    // right.
    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
    // Merge the two pairs together with a 64-bit unsigned shift right + add.
    uint8x16_t paired =
        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
    // Extract the result: bits 0-1 from byte 0, bits 2-3 from byte 8.
    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
#endif
}
2154 | |||
2155 | // Multiply packed single-precision (32-bit) floating-point elements in a and b, | ||
2156 | // and store the results in dst. | ||
2157 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps | ||
2158 | FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) | ||
2159 | { | ||
2160 | return vreinterpretq_m128_f32( | ||
2161 | vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); | ||
2162 | } | ||
2163 | |||
2164 | // Multiply the lower single-precision (32-bit) floating-point element in a and | ||
2165 | // b, store the result in the lower element of dst, and copy the upper 3 packed | ||
2166 | // elements from a to the upper elements of dst. | ||
2167 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss | ||
2168 | FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) | ||
2169 | { | ||
2170 | return _mm_move_ss(a, _mm_mul_ps(a, b)); | ||
2171 | } | ||
2172 | |||
2173 | // Multiply the packed unsigned 16-bit integers in a and b, producing | ||
2174 | // intermediate 32-bit integers, and store the high 16 bits of the intermediate | ||
2175 | // integers in dst. | ||
2176 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16 | ||
2177 | FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) | ||
2178 | { | ||
2179 | return vreinterpret_m64_u16(vshrn_n_u32( | ||
2180 | vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16)); | ||
2181 | } | ||
2182 | |||
2183 | // Compute the bitwise OR of packed single-precision (32-bit) floating-point | ||
2184 | // elements in a and b, and store the results in dst. | ||
2185 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps | ||
2186 | FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) | ||
2187 | { | ||
2188 | return vreinterpretq_m128_s32( | ||
2189 | vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); | ||
2190 | } | ||
2191 | |||
// MMX-era _m_p* intrinsics: each is a thin alias of the equivalent _mm_*
// implementation defined earlier in this file.

// Average packed unsigned 8-bit integers in a and b, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb
#define _m_pavgb(a, b) _mm_avg_pu8(a, b)

// Average packed unsigned 16-bit integers in a and b, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw
#define _m_pavgw(a, b) _mm_avg_pu16(a, b)

// Extract a 16-bit integer from a, selected with imm8, and store the result in
// the lower element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw
#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)

// Copy a to dst, and insert the 16-bit integer i into dst at the location
// specified by imm8.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pinsrw
#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)

// Compare packed signed 16-bit integers in a and b, and store packed maximum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw
#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)

// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub
#define _m_pmaxub(a, b) _mm_max_pu8(a, b)

// Compare packed signed 16-bit integers in a and b, and store packed minimum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
#define _m_pminsw(a, b) _mm_min_pi16(a, b)

// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
#define _m_pminub(a, b) _mm_min_pu8(a, b)

// Create mask from the most significant bit of each 8-bit element in a, and
// store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
#define _m_pmovmskb(a) _mm_movemask_pi8(a)

// Multiply the packed unsigned 16-bit integers in a and b, producing
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
// integers in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2242 | |||
// Fetch the line of data from memory that contains address p to a location in
// the cache hierarchy specified by the locality hint i.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
FORCE_INLINE void _mm_prefetch(char const *p, int i)
{
    // NOTE(review): i is consumed by the switch below, so this cast appears
    // redundant -- possibly retained for builds where the switch folds away;
    // confirm before removing.
    (void) i;
#if defined(_MSC_VER)
    // MSVC ARM64: __prefetch2's second argument encodes the target cache
    // level / streaming behavior for each SSE hint.
    switch (i) {
    case _MM_HINT_NTA:
        __prefetch2(p, 1);
        break;
    case _MM_HINT_T0:
        __prefetch2(p, 0);
        break;
    case _MM_HINT_T1:
        __prefetch2(p, 2);
        break;
    case _MM_HINT_T2:
        __prefetch2(p, 4);
        break;
    }
#else
    // GCC/Clang: __builtin_prefetch(addr, rw, locality) with rw=0 (read) and
    // locality 3 = keep in all levels ... 0 = non-temporal, matching the
    // decreasing persistence of T0 > T1 > T2 > NTA.
    switch (i) {
    case _MM_HINT_NTA:
        __builtin_prefetch(p, 0, 0);
        break;
    case _MM_HINT_T0:
        __builtin_prefetch(p, 0, 3);
        break;
    case _MM_HINT_T1:
        __builtin_prefetch(p, 0, 2);
        break;
    case _MM_HINT_T2:
        __builtin_prefetch(p, 0, 1);
        break;
    }
#endif
}
2281 | |||
// Compute the absolute differences of packed unsigned 8-bit integers in a and
// b, then horizontally sum each consecutive 8 differences to produce four
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
// 16 bits of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_psadbw
// MMX-era alias of _mm_sad_pu8.
#define _m_psadbw(a, b) _mm_sad_pu8(a, b)

// Shuffle 16-bit integers in a using the control in imm8, and store the results
// in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
// MMX-era alias of _mm_shuffle_pi16.
#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
2293 | |||
2294 | // Compute the approximate reciprocal of packed single-precision (32-bit) | ||
2295 | // floating-point elements in a, and store the results in dst. The maximum | ||
2296 | // relative error for this approximation is less than 1.5*2^-12. | ||
2297 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps | ||
2298 | FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) | ||
2299 | { | ||
2300 | float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in)); | ||
2301 | recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); | ||
2302 | #if SSE2NEON_PRECISE_DIV | ||
2303 | // Additional Netwon-Raphson iteration for accuracy | ||
2304 | recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); | ||
2305 | #endif | ||
2306 | return vreinterpretq_m128_f32(recip); | ||
2307 | } | ||
2308 | |||
2309 | // Compute the approximate reciprocal of the lower single-precision (32-bit) | ||
2310 | // floating-point element in a, store the result in the lower element of dst, | ||
2311 | // and copy the upper 3 packed elements from a to the upper elements of dst. The | ||
2312 | // maximum relative error for this approximation is less than 1.5*2^-12. | ||
2313 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss | ||
2314 | FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) | ||
2315 | { | ||
2316 | return _mm_move_ss(a, _mm_rcp_ps(a)); | ||
2317 | } | ||
2318 | |||
// Compute the approximate reciprocal square root of packed single-precision
// (32-bit) floating-point elements in a, and store the results in dst. The
// maximum relative error for this approximation is less than 1.5*2^-12.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps
FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
{
    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));

    // Generate masks for detecting whether input has any 0.0f/-0.0f
    // (which becomes positive/negative infinity by IEEE-754 arithmetic rules).
    // Note: the masks are computed from the raw estimate BEFORE refinement,
    // because the Newton-Raphson step below turns inf into NaN.
    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
    const uint32x4_t neg_inf = vdupq_n_u32(0xFF800000);
    const uint32x4_t has_pos_zero =
        vceqq_u32(pos_inf, vreinterpretq_u32_f32(out));
    const uint32x4_t has_neg_zero =
        vceqq_u32(neg_inf, vreinterpretq_u32_f32(out));

    // One Newton-Raphson refinement of the hardware estimate.
    out = vmulq_f32(
        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
#if SSE2NEON_PRECISE_SQRT
    // Additional Newton-Raphson iteration for accuracy
    out = vmulq_f32(
        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
#endif

    // Set output vector element to infinity/negative-infinity if
    // the corresponding input vector element is 0.0f/-0.0f.
    out = vbslq_f32(has_pos_zero, (float32x4_t) pos_inf, out);
    out = vbslq_f32(has_neg_zero, (float32x4_t) neg_inf, out);

    return vreinterpretq_m128_f32(out);
}
2351 | |||
2352 | // Compute the approximate reciprocal square root of the lower single-precision | ||
2353 | // (32-bit) floating-point element in a, store the result in the lower element | ||
2354 | // of dst, and copy the upper 3 packed elements from a to the upper elements of | ||
2355 | // dst. | ||
2356 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss | ||
2357 | FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) | ||
2358 | { | ||
2359 | return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0); | ||
2360 | } | ||
2361 | |||
2362 | // Compute the absolute differences of packed unsigned 8-bit integers in a and | ||
2363 | // b, then horizontally sum each consecutive 8 differences to produce four | ||
2364 | // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low | ||
2365 | // 16 bits of dst. | ||
2366 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8 | ||
2367 | FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) | ||
2368 | { | ||
2369 | uint64x1_t t = vpaddl_u32(vpaddl_u16( | ||
2370 | vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))))); | ||
2371 | return vreinterpret_m64_u16( | ||
2372 | vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0)); | ||
2373 | } | ||
2374 | |||
// Macro: Set the flush zero bits of the MXCSR control and status register to
// the value in unsigned 32-bit integer a. The flush zero may contain any of the
// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
{
    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
    // regardless of the value of the FZ bit.
    // View the control register both as raw bits and as a bitfield: FPCR is
    // 64-bit on AArch64, FPSCR is 32-bit on AArch32.
    union {
        fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__) || defined(_M_ARM64)
    r.value = _sse2neon_get_fpcr();
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    // Bit 24 of FPCR/FPSCR is the flush-to-zero (FZ) control bit.
    r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;

#if defined(__aarch64__) || defined(_M_ARM64)
    _sse2neon_set_fpcr(r.value);
#else
    // NOTE(review): the whole union (not r.value) is passed; the register
    // contents are identical on AArch32, but r.value would be clearer.
    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
}
2406 | |||
2407 | // Set packed single-precision (32-bit) floating-point elements in dst with the | ||
2408 | // supplied values. | ||
2409 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps | ||
2410 | FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) | ||
2411 | { | ||
2412 | float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; | ||
2413 | return vreinterpretq_m128_f32(vld1q_f32(data)); | ||
2414 | } | ||
2415 | |||
2416 | // Broadcast single-precision (32-bit) floating-point value a to all elements of | ||
2417 | // dst. | ||
2418 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1 | ||
2419 | FORCE_INLINE __m128 _mm_set_ps1(float _w) | ||
2420 | { | ||
2421 | return vreinterpretq_m128_f32(vdupq_n_f32(_w)); | ||
2422 | } | ||
2423 | |||
// Macro: Set the rounding mode bits of the MXCSR control and status register to
// the value in unsigned 32-bit integer a. The rounding mode may contain any of
// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
// _MM_ROUND_TOWARD_ZERO
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
{
    // View the control register both as raw bits and as a bitfield: FPCR is
    // 64-bit on AArch64, FPSCR is 32-bit on AArch32.
    union {
        fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__) || defined(_M_ARM64)
    r.value = _sse2neon_get_fpcr();
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    // Bits 23:22 form the RMode field: 00 = nearest, 01 = toward +inf (UP),
    // 10 = toward -inf (DOWN), 11 = toward zero.
    switch (rounding) {
    case _MM_ROUND_TOWARD_ZERO:
        r.field.bit22 = 1;
        r.field.bit23 = 1;
        break;
    case _MM_ROUND_DOWN:
        r.field.bit22 = 0;
        r.field.bit23 = 1;
        break;
    case _MM_ROUND_UP:
        r.field.bit22 = 1;
        r.field.bit23 = 0;
        break;
    default: //_MM_ROUND_NEAREST
        r.field.bit22 = 0;
        r.field.bit23 = 0;
    }

#if defined(__aarch64__) || defined(_M_ARM64)
    _sse2neon_set_fpcr(r.value);
#else
    // NOTE(review): the whole union (not r.value) is passed; the register
    // contents are identical on AArch32, but r.value would be clearer.
    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
}
2470 | |||
2471 | // Copy single-precision (32-bit) floating-point element a to the lower element | ||
2472 | // of dst, and zero the upper 3 elements. | ||
2473 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss | ||
2474 | FORCE_INLINE __m128 _mm_set_ss(float a) | ||
2475 | { | ||
2476 | return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0)); | ||
2477 | } | ||
2478 | |||
2479 | // Broadcast single-precision (32-bit) floating-point value a to all elements of | ||
2480 | // dst. | ||
2481 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps | ||
2482 | FORCE_INLINE __m128 _mm_set1_ps(float _w) | ||
2483 | { | ||
2484 | return vreinterpretq_m128_f32(vdupq_n_f32(_w)); | ||
2485 | } | ||
2486 | |||
// Set the MXCSR control and status register with the value in unsigned 32-bit
// integer a.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
FORCE_INLINE void _mm_setcsr(unsigned int a)
{
    // Only the rounding-control portion of a is honored; all other MXCSR
    // fields (exception flags/masks, FZ, DAZ) are ignored.
    _MM_SET_ROUNDING_MODE(a);
}
2495 | |||
// Get the unsigned 32-bit value of the MXCSR control and status register.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr
// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
FORCE_INLINE unsigned int _mm_getcsr(void)
{
    // Only the rounding-control bits are reported; other MXCSR fields are
    // not emulated.
    return _MM_GET_ROUNDING_MODE();
}
2503 | |||
2504 | // Set packed single-precision (32-bit) floating-point elements in dst with the | ||
2505 | // supplied values in reverse order. | ||
2506 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps | ||
2507 | FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) | ||
2508 | { | ||
2509 | float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; | ||
2510 | return vreinterpretq_m128_f32(vld1q_f32(data)); | ||
2511 | } | ||
2512 | |||
2513 | // Return vector of type __m128 with all elements set to zero. | ||
2514 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps | ||
2515 | FORCE_INLINE __m128 _mm_setzero_ps(void) | ||
2516 | { | ||
2517 | return vreinterpretq_m128_f32(vdupq_n_f32(0)); | ||
2518 | } | ||
2519 | |||
// Shuffle 16-bit integers in a using the control in imm8, and store the results
// in dst.
// Each 2-bit field of imm selects which source lane feeds the corresponding
// destination lane (dst lane i <- a lane ((imm >> 2*i) & 3)).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
#ifdef _sse2neon_shuffle
#define _mm_shuffle_pi16(a, imm) \
    vreinterpret_m64_s16(vshuffle_s16( \
        vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
        ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)))
#else
// Generic fallback: extract each selected lane with vget_lane_s16 and place it
// with vset_lane_s16; lane 0 is seeded via vmov_n_s16 to initialize `ret`.
#define _mm_shuffle_pi16(a, imm) \
    _sse2neon_define1( \
        __m64, a, int16x4_t ret; \
        ret = vmov_n_s16( \
            vget_lane_s16(vreinterpret_s16_m64(_a), (imm) & (0x3))); \
        ret = vset_lane_s16( \
            vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 2) & 0x3), ret, \
            1); \
        ret = vset_lane_s16( \
            vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 4) & 0x3), ret, \
            2); \
        ret = vset_lane_s16( \
            vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 6) & 0x3), ret, \
            3); \
        _sse2neon_return(vreinterpret_m64_s16(ret));)
#endif
2545 | |||
// Perform a serializing operation on all store-to-memory instructions that were
// issued prior to this instruction. Guarantees that every store instruction
// that precedes, in program order, is globally visible before any store
// instruction which follows the fence in program order.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence
FORCE_INLINE void _mm_sfence(void)
{
    // NEON has no store-only fence; a full data memory barrier is emitted,
    // which is stronger than (and therefore satisfies) SFENCE semantics.
    _sse2neon_smp_mb();
}
2555 | |||
// Perform a serializing operation on all load-from-memory and store-to-memory
// instructions that were issued prior to this instruction. Guarantees that
// every memory access that precedes, in program order, the memory fence
// instruction is globally visible before any memory instruction which follows
// the fence in program order.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence
FORCE_INLINE void _mm_mfence(void)
{
    // Full barrier: directly matches MFENCE's load+store ordering guarantee.
    _sse2neon_smp_mb();
}
2566 | |||
// Perform a serializing operation on all load-from-memory instructions that
// were issued prior to this instruction. Guarantees that every load instruction
// that precedes, in program order, is globally visible before any load
// instruction which follows the fence in program order.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence
FORCE_INLINE void _mm_lfence(void)
{
    // NEON has no load-only fence; a full barrier is used, which is stronger
    // than LFENCE requires.
    _sse2neon_smp_mb();
}
2576 | |||
// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
// int imm)
// dst lanes 0-1 are selected from a, lanes 2-3 from b, by the four 2-bit
// fields of imm (low field first).
#ifdef _sse2neon_shuffle
#define _mm_shuffle_ps(a, b, imm) \
    __extension__({ \
        float32x4_t _input1 = vreinterpretq_f32_m128(a); \
        float32x4_t _input2 = vreinterpretq_f32_m128(b); \
        float32x4_t _shuf = \
            vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
                          (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
        vreinterpretq_m128_f32(_shuf); \
    })
#else // generic
// Generic path: dispatch on common shuffle masks so each maps to a short,
// efficient NEON sequence; anything else falls back to the per-lane default.
#define _mm_shuffle_ps(a, b, imm) \
    _sse2neon_define2( \
        __m128, a, b, __m128 ret; switch (imm) { \
            case _MM_SHUFFLE(1, 0, 3, 2): \
                ret = _mm_shuffle_ps_1032(_a, _b); \
                break; \
            case _MM_SHUFFLE(2, 3, 0, 1): \
                ret = _mm_shuffle_ps_2301(_a, _b); \
                break; \
            case _MM_SHUFFLE(0, 3, 2, 1): \
                ret = _mm_shuffle_ps_0321(_a, _b); \
                break; \
            case _MM_SHUFFLE(2, 1, 0, 3): \
                ret = _mm_shuffle_ps_2103(_a, _b); \
                break; \
            case _MM_SHUFFLE(1, 0, 1, 0): \
                ret = _mm_movelh_ps(_a, _b); \
                break; \
            case _MM_SHUFFLE(1, 0, 0, 1): \
                ret = _mm_shuffle_ps_1001(_a, _b); \
                break; \
            case _MM_SHUFFLE(0, 1, 0, 1): \
                ret = _mm_shuffle_ps_0101(_a, _b); \
                break; \
            case _MM_SHUFFLE(3, 2, 1, 0): \
                ret = _mm_shuffle_ps_3210(_a, _b); \
                break; \
            case _MM_SHUFFLE(0, 0, 1, 1): \
                ret = _mm_shuffle_ps_0011(_a, _b); \
                break; \
            case _MM_SHUFFLE(0, 0, 2, 2): \
                ret = _mm_shuffle_ps_0022(_a, _b); \
                break; \
            case _MM_SHUFFLE(2, 2, 0, 0): \
                ret = _mm_shuffle_ps_2200(_a, _b); \
                break; \
            case _MM_SHUFFLE(3, 2, 0, 2): \
                ret = _mm_shuffle_ps_3202(_a, _b); \
                break; \
            case _MM_SHUFFLE(3, 2, 3, 2): \
                ret = _mm_movehl_ps(_b, _a); \
                break; \
            case _MM_SHUFFLE(1, 1, 3, 3): \
                ret = _mm_shuffle_ps_1133(_a, _b); \
                break; \
            case _MM_SHUFFLE(2, 0, 1, 0): \
                ret = _mm_shuffle_ps_2010(_a, _b); \
                break; \
            case _MM_SHUFFLE(2, 0, 0, 1): \
                ret = _mm_shuffle_ps_2001(_a, _b); \
                break; \
            case _MM_SHUFFLE(2, 0, 3, 2): \
                ret = _mm_shuffle_ps_2032(_a, _b); \
                break; \
            default: \
                ret = _mm_shuffle_ps_default(_a, _b, (imm)); \
                break; \
        } _sse2neon_return(ret);)
#endif
2649 | |||
// Compute the square root of packed single-precision (32-bit) floating-point
// elements in a, and store the results in dst.
// Due to ARMv7-A NEON's lack of a precise square root intrinsic, we implement
// square root by multiplying input in with its reciprocal square root before
// using the Newton-Raphson method to approximate the results.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) && !SSE2NEON_PRECISE_SQRT
    // AArch64 provides an exact vector sqrt instruction.
    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
#else
    // ARMv7-A path: start from the reciprocal sqrt estimate.
    float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));

    // Test for vrsqrteq_f32(0) -> positive infinity case.
    // Change to zero, so that s * 1/sqrt(s) result is zero too.
    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
    const uint32x4_t div_by_zero =
        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
    recip = vreinterpretq_f32_u32(
        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));

    // First Newton-Raphson refinement step (vrsqrtsq_f32 computes the
    // correction factor (3 - x*recip^2) / 2).
    recip = vmulq_f32(
        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
        recip);
    // Additional Newton-Raphson iteration for accuracy
    recip = vmulq_f32(
        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
        recip);

    // sqrt(s) = s * 1/sqrt(s)
    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
#endif
}
2683 | |||
2684 | // Compute the square root of the lower single-precision (32-bit) floating-point | ||
2685 | // element in a, store the result in the lower element of dst, and copy the | ||
2686 | // upper 3 packed elements from a to the upper elements of dst. | ||
2687 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss | ||
2688 | FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) | ||
2689 | { | ||
2690 | float32_t value = | ||
2691 | vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0); | ||
2692 | return vreinterpretq_m128_f32( | ||
2693 | vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0)); | ||
2694 | } | ||
2695 | |||
2696 | // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point | ||
2697 | // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary | ||
2698 | // or a general-protection exception may be generated. | ||
2699 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps | ||
2700 | FORCE_INLINE void _mm_store_ps(float *p, __m128 a) | ||
2701 | { | ||
2702 | vst1q_f32(p, vreinterpretq_f32_m128(a)); | ||
2703 | } | ||
2704 | |||
2705 | // Store the lower single-precision (32-bit) floating-point element from a into | ||
2706 | // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte | ||
2707 | // boundary or a general-protection exception may be generated. | ||
2708 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1 | ||
2709 | FORCE_INLINE void _mm_store_ps1(float *p, __m128 a) | ||
2710 | { | ||
2711 | float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); | ||
2712 | vst1q_f32(p, vdupq_n_f32(a0)); | ||
2713 | } | ||
2714 | |||
2715 | // Store the lower single-precision (32-bit) floating-point element from a into | ||
2716 | // memory. mem_addr does not need to be aligned on any particular boundary. | ||
2717 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss | ||
2718 | FORCE_INLINE void _mm_store_ss(float *p, __m128 a) | ||
2719 | { | ||
2720 | vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0); | ||
2721 | } | ||
2722 | |||
// Store the lower single-precision (32-bit) floating-point element from a into
// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
// _mm_store1_ps and _mm_store_ps1 are two names for the same operation.
#define _mm_store1_ps _mm_store_ps1
2728 | |||
2729 | // Store the upper 2 single-precision (32-bit) floating-point elements from a | ||
2730 | // into memory. | ||
2731 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi | ||
2732 | FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) | ||
2733 | { | ||
2734 | *p = vreinterpret_m64_f32(vget_high_f32(a)); | ||
2735 | } | ||
2736 | |||
2737 | // Store the lower 2 single-precision (32-bit) floating-point elements from a | ||
2738 | // into memory. | ||
2739 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi | ||
2740 | FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) | ||
2741 | { | ||
2742 | *p = vreinterpret_m64_f32(vget_low_f32(a)); | ||
2743 | } | ||
2744 | |||
2745 | // Store 4 single-precision (32-bit) floating-point elements from a into memory | ||
2746 | // in reverse order. mem_addr must be aligned on a 16-byte boundary or a | ||
2747 | // general-protection exception may be generated. | ||
2748 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps | ||
2749 | FORCE_INLINE void _mm_storer_ps(float *p, __m128 a) | ||
2750 | { | ||
2751 | float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a)); | ||
2752 | float32x4_t rev = vextq_f32(tmp, tmp, 2); | ||
2753 | vst1q_f32(p, rev); | ||
2754 | } | ||
2755 | |||
2756 | // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point | ||
2757 | // elements) from a into memory. mem_addr does not need to be aligned on any | ||
2758 | // particular boundary. | ||
2759 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps | ||
2760 | FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) | ||
2761 | { | ||
2762 | vst1q_f32(p, vreinterpretq_f32_m128(a)); | ||
2763 | } | ||
2764 | |||
2765 | // Stores 16-bits of integer data a at the address p. | ||
2766 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16 | ||
2767 | FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a) | ||
2768 | { | ||
2769 | vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0); | ||
2770 | } | ||
2771 | |||
2772 | // Stores 64-bits of integer data a at the address p. | ||
2773 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64 | ||
2774 | FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a) | ||
2775 | { | ||
2776 | vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0); | ||
2777 | } | ||
2778 | |||
2779 | // Store 64-bits of integer data from a into memory using a non-temporal memory | ||
2780 | // hint. | ||
2781 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi | ||
2782 | FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a) | ||
2783 | { | ||
2784 | vst1_s64((int64_t *) p, vreinterpret_s64_m64(a)); | ||
2785 | } | ||
2786 | |||
2787 | // Store 128-bits (composed of 4 packed single-precision (32-bit) floating- | ||
2788 | // point elements) from a into memory using a non-temporal memory hint. | ||
2789 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps | ||
2790 | FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) | ||
2791 | { | ||
2792 | #if __has_builtin(__builtin_nontemporal_store) | ||
2793 | __builtin_nontemporal_store(a, (float32x4_t *) p); | ||
2794 | #else | ||
2795 | vst1q_f32(p, vreinterpretq_f32_m128(a)); | ||
2796 | #endif | ||
2797 | } | ||
2798 | |||
2799 | // Subtract packed single-precision (32-bit) floating-point elements in b from | ||
2800 | // packed single-precision (32-bit) floating-point elements in a, and store the | ||
2801 | // results in dst. | ||
2802 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps | ||
2803 | FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) | ||
2804 | { | ||
2805 | return vreinterpretq_m128_f32( | ||
2806 | vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); | ||
2807 | } | ||
2808 | |||
2809 | // Subtract the lower single-precision (32-bit) floating-point element in b from | ||
2810 | // the lower single-precision (32-bit) floating-point element in a, store the | ||
2811 | // result in the lower element of dst, and copy the upper 3 packed elements from | ||
2812 | // a to the upper elements of dst. | ||
2813 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss | ||
2814 | FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) | ||
2815 | { | ||
2816 | return _mm_move_ss(a, _mm_sub_ps(a, b)); | ||
2817 | } | ||
2818 | |||
// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
// transposed matrix in these vectors (row0 now contains column 0, etc.).
// Implementation: vtrnq_f32 interleaves 2x2 sub-blocks of (row0,row1) and
// (row2,row3); recombining low/high halves of those results yields the full
// 4x4 transpose. Each rowN argument is evaluated more than once.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
    do {                                                  \
        float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
        float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
        row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
                            vget_low_f32(ROW23.val[0]));  \
        row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
                            vget_low_f32(ROW23.val[1]));  \
        row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
                            vget_high_f32(ROW23.val[0])); \
        row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
                            vget_high_f32(ROW23.val[1])); \
    } while (0)
2836 | |||
// according to the documentation, these intrinsics behave the same as the
// non-'u' versions. We'll just alias them here.
// (On x86 the 'u' variants differ only in QNaN exception signaling, which this
// emulation does not model.)
#define _mm_ucomieq_ss _mm_comieq_ss
#define _mm_ucomige_ss _mm_comige_ss
#define _mm_ucomigt_ss _mm_comigt_ss
#define _mm_ucomile_ss _mm_comile_ss
#define _mm_ucomilt_ss _mm_comilt_ss
#define _mm_ucomineq_ss _mm_comineq_ss
2845 | |||
// Return vector of type __m128i with undefined elements.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128
FORCE_INLINE __m128i _mm_undefined_si128(void)
{
    // `a` is intentionally left uninitialized (the intrinsic's contract is
    // "undefined contents"); the pragmas silence the resulting warning.
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
    __m128i a;
#if defined(_MSC_VER)
    // MSVC rejects returning an uninitialized value here, so zero it.
    a = _mm_setzero_si128();
#endif
    return a;
    // The pop directive after `return` is still processed by the
    // preprocessor even though no statement can execute here.
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}
2863 | |||
// Return vector of type __m128 with undefined elements.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps
FORCE_INLINE __m128 _mm_undefined_ps(void)
{
    // Same pattern as _mm_undefined_si128: deliberately uninitialized on
    // GCC/clang (warning suppressed), zero-initialized on MSVC.
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
    __m128 a;
#if defined(_MSC_VER)
    a = _mm_setzero_ps();
#endif
    return a;
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}
2881 | |||
2882 | // Unpack and interleave single-precision (32-bit) floating-point elements from | ||
2883 | // the high half a and b, and store the results in dst. | ||
2884 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps | ||
2885 | FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) | ||
2886 | { | ||
2887 | #if defined(__aarch64__) || defined(_M_ARM64) | ||
2888 | return vreinterpretq_m128_f32( | ||
2889 | vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); | ||
2890 | #else | ||
2891 | float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); | ||
2892 | float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); | ||
2893 | float32x2x2_t result = vzip_f32(a1, b1); | ||
2894 | return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); | ||
2895 | #endif | ||
2896 | } | ||
2897 | |||
2898 | // Unpack and interleave single-precision (32-bit) floating-point elements from | ||
2899 | // the low half of a and b, and store the results in dst. | ||
2900 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps | ||
2901 | FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) | ||
2902 | { | ||
2903 | #if defined(__aarch64__) || defined(_M_ARM64) | ||
2904 | return vreinterpretq_m128_f32( | ||
2905 | vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); | ||
2906 | #else | ||
2907 | float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); | ||
2908 | float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); | ||
2909 | float32x2x2_t result = vzip_f32(a1, b1); | ||
2910 | return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); | ||
2911 | #endif | ||
2912 | } | ||
2913 | |||
2914 | // Compute the bitwise XOR of packed single-precision (32-bit) floating-point | ||
2915 | // elements in a and b, and store the results in dst. | ||
2916 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps | ||
2917 | FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) | ||
2918 | { | ||
2919 | return vreinterpretq_m128_s32( | ||
2920 | veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); | ||
2921 | } | ||
2922 | |||
2923 | /* SSE2 */ | ||
2924 | |||
2925 | // Add packed 16-bit integers in a and b, and store the results in dst. | ||
2926 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16 | ||
2927 | FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) | ||
2928 | { | ||
2929 | return vreinterpretq_m128i_s16( | ||
2930 | vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); | ||
2931 | } | ||
2932 | |||
2933 | // Add packed 32-bit integers in a and b, and store the results in dst. | ||
2934 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32 | ||
2935 | FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) | ||
2936 | { | ||
2937 | return vreinterpretq_m128i_s32( | ||
2938 | vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); | ||
2939 | } | ||
2940 | |||
2941 | // Add packed 64-bit integers in a and b, and store the results in dst. | ||
2942 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64 | ||
2943 | FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) | ||
2944 | { | ||
2945 | return vreinterpretq_m128i_s64( | ||
2946 | vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); | ||
2947 | } | ||
2948 | |||
2949 | // Add packed 8-bit integers in a and b, and store the results in dst. | ||
2950 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8 | ||
2951 | FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) | ||
2952 | { | ||
2953 | return vreinterpretq_m128i_s8( | ||
2954 | vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); | ||
2955 | } | ||
2956 | |||
// Add packed double-precision (64-bit) floating-point elements in a and b, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // AArch64 has native f64x2 arithmetic.
    return vreinterpretq_m128d_f64(
        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // ARMv7-A has no double-precision NEON: pun the vectors as double pairs,
    // add in scalar, and reload the bits as the (f32-backed) __m128d.
    // NOTE(review): this pointer pun assumes __m128d has the exact size and
    // layout of double[2] — matches the file's other ARMv7 fallbacks.
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] + db[0];
    c[1] = da[1] + db[1];
    return vld1q_f32((float32_t *) c);
#endif
}
2974 | |||
// Add the lower double-precision (64-bit) floating-point element in a and b,
// store the result in the lower element of dst, and copy the upper element from
// a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Packed add, then merge: lane 0 from the sum, lane 1 from `a`.
    return _mm_move_sd(a, _mm_add_pd(a, b));
#else
    // ARMv7-A scalar fallback (see _mm_add_pd for the layout assumption):
    // only element 0 is summed, element 1 passes through from `a`.
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] + db[0];
    c[1] = da[1];
    return vld1q_f32((float32_t *) c);
#endif
}
2992 | |||
2993 | // Add 64-bit integers a and b, and store the result in dst. | ||
2994 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64 | ||
2995 | FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) | ||
2996 | { | ||
2997 | return vreinterpret_m64_s64( | ||
2998 | vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); | ||
2999 | } | ||
3000 | |||
3001 | // Add packed signed 16-bit integers in a and b using saturation, and store the | ||
3002 | // results in dst. | ||
3003 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16 | ||
3004 | FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) | ||
3005 | { | ||
3006 | return vreinterpretq_m128i_s16( | ||
3007 | vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); | ||
3008 | } | ||
3009 | |||
3010 | // Add packed signed 8-bit integers in a and b using saturation, and store the | ||
3011 | // results in dst. | ||
3012 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8 | ||
3013 | FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) | ||
3014 | { | ||
3015 | return vreinterpretq_m128i_s8( | ||
3016 | vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); | ||
3017 | } | ||
3018 | |||
3019 | // Add packed unsigned 16-bit integers in a and b using saturation, and store | ||
3020 | // the results in dst. | ||
3021 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16 | ||
3022 | FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) | ||
3023 | { | ||
3024 | return vreinterpretq_m128i_u16( | ||
3025 | vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); | ||
3026 | } | ||
3027 | |||
3028 | // Add packed unsigned 8-bit integers in a and b using saturation, and store the | ||
3029 | // results in dst. | ||
3030 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8 | ||
3031 | FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) | ||
3032 | { | ||
3033 | return vreinterpretq_m128i_u8( | ||
3034 | vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); | ||
3035 | } | ||
3036 | |||
3037 | // Compute the bitwise AND of packed double-precision (64-bit) floating-point | ||
3038 | // elements in a and b, and store the results in dst. | ||
3039 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd | ||
3040 | FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) | ||
3041 | { | ||
3042 | return vreinterpretq_m128d_s64( | ||
3043 | vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); | ||
3044 | } | ||
3045 | |||
3046 | // Compute the bitwise AND of 128 bits (representing integer data) in a and b, | ||
3047 | // and store the result in dst. | ||
3048 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128 | ||
3049 | FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) | ||
3050 | { | ||
3051 | return vreinterpretq_m128i_s32( | ||
3052 | vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); | ||
3053 | } | ||
3054 | |||
3055 | // Compute the bitwise NOT of packed double-precision (64-bit) floating-point | ||
3056 | // elements in a and then AND with b, and store the results in dst. | ||
3057 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd | ||
3058 | FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) | ||
3059 | { | ||
3060 | // *NOTE* argument swap | ||
3061 | return vreinterpretq_m128d_s64( | ||
3062 | vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); | ||
3063 | } | ||
3064 | |||
3065 | // Compute the bitwise NOT of 128 bits (representing integer data) in a and then | ||
3066 | // AND with b, and store the result in dst. | ||
3067 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128 | ||
3068 | FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) | ||
3069 | { | ||
3070 | return vreinterpretq_m128i_s32( | ||
3071 | vbicq_s32(vreinterpretq_s32_m128i(b), | ||
3072 | vreinterpretq_s32_m128i(a))); // *NOTE* argument swap | ||
3073 | } | ||
3074 | |||
3075 | // Average packed unsigned 16-bit integers in a and b, and store the results in | ||
3076 | // dst. | ||
3077 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16 | ||
3078 | FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) | ||
3079 | { | ||
3080 | return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), | ||
3081 | vreinterpretq_u16_m128i(b)); | ||
3082 | } | ||
3083 | |||
3084 | // Average packed unsigned 8-bit integers in a and b, and store the results in | ||
3085 | // dst. | ||
3086 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8 | ||
3087 | FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) | ||
3088 | { | ||
3089 | return vreinterpretq_m128i_u8( | ||
3090 | vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); | ||
3091 | } | ||
3092 | |||
// Shift a left by imm8 bytes while shifting in zeros, and store the results in
// dst. Alias: _mm_bslli_si128 and _mm_slli_si128 are the same operation.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
3097 | |||
// Shift a right by imm8 bytes while shifting in zeros, and store the results in
// dst. Alias: _mm_bsrli_si128 and _mm_srli_si128 are the same operation.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
3102 | |||
3103 | // Cast vector of type __m128d to type __m128. This intrinsic is only used for | ||
3104 | // compilation and does not generate any instructions, thus it has zero latency. | ||
3105 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps | ||
3106 | FORCE_INLINE __m128 _mm_castpd_ps(__m128d a) | ||
3107 | { | ||
3108 | return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a)); | ||
3109 | } | ||
3110 | |||
3111 | // Cast vector of type __m128d to type __m128i. This intrinsic is only used for | ||
3112 | // compilation and does not generate any instructions, thus it has zero latency. | ||
3113 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128 | ||
3114 | FORCE_INLINE __m128i _mm_castpd_si128(__m128d a) | ||
3115 | { | ||
3116 | return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a)); | ||
3117 | } | ||
3118 | |||
3119 | // Cast vector of type __m128 to type __m128d. This intrinsic is only used for | ||
3120 | // compilation and does not generate any instructions, thus it has zero latency. | ||
3121 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd | ||
3122 | FORCE_INLINE __m128d _mm_castps_pd(__m128 a) | ||
3123 | { | ||
3124 | return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a)); | ||
3125 | } | ||
3126 | |||
3127 | // Cast vector of type __m128 to type __m128i. This intrinsic is only used for | ||
3128 | // compilation and does not generate any instructions, thus it has zero latency. | ||
3129 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128 | ||
3130 | FORCE_INLINE __m128i _mm_castps_si128(__m128 a) | ||
3131 | { | ||
3132 | return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a)); | ||
3133 | } | ||
3134 | |||
3135 | // Cast vector of type __m128i to type __m128d. This intrinsic is only used for | ||
3136 | // compilation and does not generate any instructions, thus it has zero latency. | ||
3137 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd | ||
3138 | FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) | ||
3139 | { | ||
3140 | #if defined(__aarch64__) || defined(_M_ARM64) | ||
3141 | return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a)); | ||
3142 | #else | ||
3143 | return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a)); | ||
3144 | #endif | ||
3145 | } | ||
3146 | |||
3147 | // Cast vector of type __m128i to type __m128. This intrinsic is only used for | ||
3148 | // compilation and does not generate any instructions, thus it has zero latency. | ||
3149 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps | ||
3150 | FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) | ||
3151 | { | ||
3152 | return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a)); | ||
3153 | } | ||
3154 | |||
// Invalidate and flush the cache line that contains p from all levels of the
// cache hierarchy.
// NOTE(review): every fallback below flushes the *instruction* cache, not the
// data cache that x86 CLFLUSH targets — presumably aimed at JIT/self-modifying
// code use cases; confirm before relying on data-cache flush semantics.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
#if defined(__APPLE__)
#include <libkern/OSCacheControl.h>
#endif
FORCE_INLINE void _mm_clflush(void const *p)
{
    // Suppresses "unused parameter" when none of the #elif branches matches
    // (e.g. MSVC without SSE2NEON_INCLUDE_WINDOWS_H), making this a no-op.
    (void) p;

    /* sys_icache_invalidate is supported since macOS 10.5.
     * However, it does not work on non-jailbroken iOS devices, although the
     * compilation is successful.
     */
#if defined(__APPLE__)
    sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
#elif defined(__GNUC__) || defined(__clang__)
    uintptr_t ptr = (uintptr_t) p;
    __builtin___clear_cache((char *) ptr,
                            (char *) ptr + SSE2NEON_CACHELINE_SIZE);
#elif (_MSC_VER) && SSE2NEON_INCLUDE_WINDOWS_H
    FlushInstructionCache(GetCurrentProcess(), p, SSE2NEON_CACHELINE_SIZE);
#endif
}
3179 | |||
3180 | // Compare packed 16-bit integers in a and b for equality, and store the results | ||
3181 | // in dst. | ||
3182 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16 | ||
3183 | FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) | ||
3184 | { | ||
3185 | return vreinterpretq_m128i_u16( | ||
3186 | vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); | ||
3187 | } | ||
3188 | |||
3189 | // Compare packed 32-bit integers in a and b for equality, and store the results | ||
3190 | // in dst. | ||
3191 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32 | ||
3192 | FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) | ||
3193 | { | ||
3194 | return vreinterpretq_m128i_u32( | ||
3195 | vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); | ||
3196 | } | ||
3197 | |||
3198 | // Compare packed 8-bit integers in a and b for equality, and store the results | ||
3199 | // in dst. | ||
3200 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8 | ||
3201 | FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) | ||
3202 | { | ||
3203 | return vreinterpretq_m128i_u8( | ||
3204 | vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); | ||
3205 | } | ||
3206 | |||
3207 | // Compare packed double-precision (64-bit) floating-point elements in a and b | ||
3208 | // for equality, and store the results in dst. | ||
3209 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd | ||
3210 | FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b) | ||
3211 | { | ||
3212 | #if defined(__aarch64__) || defined(_M_ARM64) | ||
3213 | return vreinterpretq_m128d_u64( | ||
3214 | vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); | ||
3215 | #else | ||
3216 | // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) | ||
3217 | uint32x4_t cmp = | ||
3218 | vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); | ||
3219 | uint32x4_t swapped = vrev64q_u32(cmp); | ||
3220 | return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped)); | ||
3221 | #endif | ||
3222 | } | ||
3223 | |||
3224 | // Compare the lower double-precision (64-bit) floating-point elements in a and | ||
3225 | // b for equality, store the result in the lower element of dst, and copy the | ||
3226 | // upper element from a to the upper element of dst. | ||
3227 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd | ||
3228 | FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b) | ||
3229 | { | ||
3230 | return _mm_move_sd(a, _mm_cmpeq_pd(a, b)); | ||
3231 | } | ||
3232 | |||
// Compare packed double-precision (64-bit) floating-point elements in a and b
// for greater-than-or-equal, and store the results in dst. Each 64-bit lane
// of dst is all-ones when the comparison holds, all-zeros otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(
        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // Scalar fallback: extract each 64-bit lane, reinterpret as double,
    // compare, and build the all-ones/all-zeros mask by hand.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    // (*(double *) &x) is the file-wide idiom for bit-casting a lane.
    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3253 | |||
// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for greater-than-or-equal, store the result in the lower element of dst,
// and copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmpge_pd(a, b));
#else
    // expand "_mm_cmpge_pd()" to reduce unnecessary operations:
    // only the low lane is compared; the high lane of a passes through.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3274 | |||
3275 | // Compare packed signed 16-bit integers in a and b for greater-than, and store | ||
3276 | // the results in dst. | ||
3277 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16 | ||
3278 | FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) | ||
3279 | { | ||
3280 | return vreinterpretq_m128i_u16( | ||
3281 | vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); | ||
3282 | } | ||
3283 | |||
3284 | // Compare packed signed 32-bit integers in a and b for greater-than, and store | ||
3285 | // the results in dst. | ||
3286 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32 | ||
3287 | FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) | ||
3288 | { | ||
3289 | return vreinterpretq_m128i_u32( | ||
3290 | vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); | ||
3291 | } | ||
3292 | |||
3293 | // Compare packed signed 8-bit integers in a and b for greater-than, and store | ||
3294 | // the results in dst. | ||
3295 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8 | ||
3296 | FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) | ||
3297 | { | ||
3298 | return vreinterpretq_m128i_u8( | ||
3299 | vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); | ||
3300 | } | ||
3301 | |||
// Compare packed double-precision (64-bit) floating-point elements in a and b
// for greater-than, and store the results in dst. Each 64-bit lane of dst is
// all-ones when the comparison holds, all-zeros otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(
        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // Scalar fallback: extract each 64-bit lane, reinterpret as double,
    // compare, and build the all-ones/all-zeros mask by hand.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3322 | |||
// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for greater-than, store the result in the lower element of dst, and copy
// the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
#else
    // expand "_mm_cmpgt_pd()" to reduce unnecessary operations:
    // only the low lane is compared; the high lane of a passes through.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3343 | |||
// Compare packed double-precision (64-bit) floating-point elements in a and b
// for less-than-or-equal, and store the results in dst. Each 64-bit lane of
// dst is all-ones when the comparison holds, all-zeros otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(
        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // Scalar fallback: extract each 64-bit lane, reinterpret as double,
    // compare, and build the all-ones/all-zeros mask by hand.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3364 | |||
// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for less-than-or-equal, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmple_pd(a, b));
#else
    // expand "_mm_cmple_pd()" to reduce unnecessary operations:
    // only the low lane is compared; the high lane of a passes through.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3385 | |||
3386 | // Compare packed signed 16-bit integers in a and b for less-than, and store the | ||
3387 | // results in dst. Note: This intrinsic emits the pcmpgtw instruction with the | ||
3388 | // order of the operands switched. | ||
3389 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16 | ||
3390 | FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) | ||
3391 | { | ||
3392 | return vreinterpretq_m128i_u16( | ||
3393 | vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); | ||
3394 | } | ||
3395 | |||
3396 | // Compare packed signed 32-bit integers in a and b for less-than, and store the | ||
3397 | // results in dst. Note: This intrinsic emits the pcmpgtd instruction with the | ||
3398 | // order of the operands switched. | ||
3399 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32 | ||
3400 | FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) | ||
3401 | { | ||
3402 | return vreinterpretq_m128i_u32( | ||
3403 | vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); | ||
3404 | } | ||
3405 | |||
3406 | // Compare packed signed 8-bit integers in a and b for less-than, and store the | ||
3407 | // results in dst. Note: This intrinsic emits the pcmpgtb instruction with the | ||
3408 | // order of the operands switched. | ||
3409 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8 | ||
3410 | FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) | ||
3411 | { | ||
3412 | return vreinterpretq_m128i_u8( | ||
3413 | vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); | ||
3414 | } | ||
3415 | |||
// Compare packed double-precision (64-bit) floating-point elements in a and b
// for less-than, and store the results in dst. Each 64-bit lane of dst is
// all-ones when the comparison holds, all-zeros otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(
        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // Scalar fallback: extract each 64-bit lane, reinterpret as double,
    // compare, and build the all-ones/all-zeros mask by hand.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3436 | |||
// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for less-than, store the result in the lower element of dst, and copy the
// upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmplt_pd(a, b));
#else
    // Only the low lane is compared; the high lane of a passes through.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3456 | |||
// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-equal, and store the results in dst. Implemented as the bitwise
// complement of the equality mask.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
#else
    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi), then invert the mask.
    uint32x4_t cmp =
        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
    uint32x4_t swapped = vrev64q_u32(cmp);
    return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
#endif
}
3473 | |||
3474 | // Compare the lower double-precision (64-bit) floating-point elements in a and | ||
3475 | // b for not-equal, store the result in the lower element of dst, and copy the | ||
3476 | // upper element from a to the upper element of dst. | ||
3477 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd | ||
3478 | FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b) | ||
3479 | { | ||
3480 | return _mm_move_sd(a, _mm_cmpneq_pd(a, b)); | ||
3481 | } | ||
3482 | |||
// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-greater-than-or-equal, and store the results in dst. Note that
// NGE is the logical negation of GE, so NaN operands yield all-ones.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // XOR with all-ones inverts the GE mask.
    return vreinterpretq_m128d_u64(veorq_u64(
        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
        vdupq_n_u64(UINT64_MAX)));
#else
    // Scalar fallback: negate a per-lane double comparison.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] =
        !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] =
        !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3506 | |||
3507 | // Compare the lower double-precision (64-bit) floating-point elements in a and | ||
3508 | // b for not-greater-than-or-equal, store the result in the lower element of | ||
3509 | // dst, and copy the upper element from a to the upper element of dst. | ||
3510 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd | ||
3511 | FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b) | ||
3512 | { | ||
3513 | return _mm_move_sd(a, _mm_cmpnge_pd(a, b)); | ||
3514 | } | ||
3515 | |||
// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-greater-than, and store the results in dst. Note that NGT is the
// logical negation of GT, so NaN operands yield all-ones.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd
FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // XOR with all-ones inverts the GT mask.
    return vreinterpretq_m128d_u64(veorq_u64(
        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
        vdupq_n_u64(UINT64_MAX)));
#else
    // Scalar fallback: negate a per-lane double comparison.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] =
        !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] =
        !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3539 | |||
3540 | // Compare the lower double-precision (64-bit) floating-point elements in a and | ||
3541 | // b for not-greater-than, store the result in the lower element of dst, and | ||
3542 | // copy the upper element from a to the upper element of dst. | ||
3543 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd | ||
3544 | FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b) | ||
3545 | { | ||
3546 | return _mm_move_sd(a, _mm_cmpngt_pd(a, b)); | ||
3547 | } | ||
3548 | |||
// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-less-than-or-equal, and store the results in dst. Note that NLE is
// the logical negation of LE, so NaN operands yield all-ones.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // XOR with all-ones inverts the LE mask.
    return vreinterpretq_m128d_u64(veorq_u64(
        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
        vdupq_n_u64(UINT64_MAX)));
#else
    // Scalar fallback: negate a per-lane double comparison.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] =
        !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] =
        !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3572 | |||
3573 | // Compare the lower double-precision (64-bit) floating-point elements in a and | ||
3574 | // b for not-less-than-or-equal, store the result in the lower element of dst, | ||
3575 | // and copy the upper element from a to the upper element of dst. | ||
3576 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd | ||
3577 | FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b) | ||
3578 | { | ||
3579 | return _mm_move_sd(a, _mm_cmpnle_pd(a, b)); | ||
3580 | } | ||
3581 | |||
// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-less-than, and store the results in dst. Note that NLT is the
// logical negation of LT, so NaN operands yield all-ones.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // XOR with all-ones inverts the LT mask.
    return vreinterpretq_m128d_u64(veorq_u64(
        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
        vdupq_n_u64(UINT64_MAX)));
#else
    // Scalar fallback: negate a per-lane double comparison.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] =
        !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] =
        !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3605 | |||
3606 | // Compare the lower double-precision (64-bit) floating-point elements in a and | ||
3607 | // b for not-less-than, store the result in the lower element of dst, and copy | ||
3608 | // the upper element from a to the upper element of dst. | ||
3609 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd | ||
3610 | FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b) | ||
3611 | { | ||
3612 | return _mm_move_sd(a, _mm_cmpnlt_pd(a, b)); | ||
3613 | } | ||
3614 | |||
// Compare packed double-precision (64-bit) floating-point elements in a and b
// to see if neither is NaN, and store the results in dst. A lane is all-ones
// when both inputs are ordered (non-NaN), all-zeros otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Excluding NaNs, any two floating point numbers can be compared:
    // x == x is false only when x is NaN.
    uint64x2_t not_nan_a =
        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
    uint64x2_t not_nan_b =
        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
    return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
#else
    // Scalar fallback using the same x == x NaN test per lane.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
            (*(double *) &b0) == (*(double *) &b0))
               ? ~UINT64_C(0)
               : UINT64_C(0);
    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
            (*(double *) &b1) == (*(double *) &b1))
               ? ~UINT64_C(0)
               : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3645 | |||
// Compare the lower double-precision (64-bit) floating-point elements in a and
// b to see if neither is NaN, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmpord_pd(a, b));
#else
    // x == x is false only when x is NaN; only the low lane is tested and
    // the high lane of a passes through.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t d[2];
    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
            (*(double *) &b0) == (*(double *) &b0))
               ? ~UINT64_C(0)
               : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3668 | |||
// Compare packed double-precision (64-bit) floating-point elements in a and b
// to see if either is NaN, and store the results in dst. A lane is all-ones
// when at least one input is NaN (the complement of _mm_cmpord_pd).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Two NaNs are not equal in comparison operation: x == x is false only
    // for NaN, so invert the AND of the two self-equality masks.
    uint64x2_t not_nan_a =
        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
    uint64x2_t not_nan_b =
        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
    return vreinterpretq_m128d_s32(
        vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
#else
    // Scalar fallback using the same x == x NaN test per lane, inverted.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
            (*(double *) &b0) == (*(double *) &b0))
               ? UINT64_C(0)
               : ~UINT64_C(0);
    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
            (*(double *) &b1) == (*(double *) &b1))
               ? UINT64_C(0)
               : ~UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3700 | |||
// Compare the lower double-precision (64-bit) floating-point elements in a and
// b to see if either is NaN, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
#else
    // x == x is false only when x is NaN; only the low lane is tested
    // (inverted vs. _mm_cmpord_sd) and the high lane of a passes through.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t d[2];
    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
            (*(double *) &b0) == (*(double *) &b0))
               ? UINT64_C(0)
               : ~UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
3723 | |||
3724 | // Compare the lower double-precision (64-bit) floating-point element in a and b | ||
3725 | // for greater-than-or-equal, and return the boolean result (0 or 1). | ||
3726 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd | ||
3727 | FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) | ||
3728 | { | ||
3729 | #if defined(__aarch64__) || defined(_M_ARM64) | ||
3730 | return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1; | ||
3731 | #else | ||
3732 | uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); | ||
3733 | uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); | ||
3734 | |||
3735 | return (*(double *) &a0 >= *(double *) &b0); | ||
3736 | #endif | ||
3737 | } | ||
3738 | |||
// Compare the lower double-precision (64-bit) floating-point element in a and b
// for greater-than, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Native f64 compare: lane 0 is all-ones on true; mask down to 0/1.
    return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
#else
    // ARMv7 fallback: extract the low 64 bits and compare as scalar doubles.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));

    return (*(double *) &a0 > *(double *) &b0);
#endif
}
3753 | |||
// Compare the lower double-precision (64-bit) floating-point element in a and b
// for less-than-or-equal, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Native f64 compare: lane 0 is all-ones on true; mask down to 0/1.
    return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
#else
    // ARMv7 fallback: extract the low 64 bits and compare as scalar doubles.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));

    return (*(double *) &a0 <= *(double *) &b0);
#endif
}
3768 | |||
// Compare the lower double-precision (64-bit) floating-point element in a and b
// for less-than, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Native f64 compare: lane 0 is all-ones on true; mask down to 0/1.
    return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
#else
    // ARMv7 fallback: extract the low 64 bits and compare as scalar doubles.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));

    return (*(double *) &a0 < *(double *) &b0);
#endif
}
3783 | |||
// Compare the lower double-precision (64-bit) floating-point element in a and b
// for equality, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
#else
    // ARMv7 fallback compares raw 32-bit words instead of doubles.
    // NOTE(review): vceqq_u32(x, x) is an *integer* self-compare and is
    // always all-ones, so the not-NaN masks below appear ineffective, and
    // only bit 0 of the low 64-bit lane (i.e. the low 32-bit word compare)
    // feeds the result — verify against upstream sse2neon before relying on
    // NaN/equality semantics on ARMv7.
    uint32x4_t a_not_nan =
        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
    uint32x4_t b_not_nan =
        vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
    uint32x4_t a_eq_b =
        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
    uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
                                       vreinterpretq_u64_u32(a_eq_b));
    return vgetq_lane_u64(and_results, 0) & 0x1;
#endif
}
3804 | |||
3805 | // Compare the lower double-precision (64-bit) floating-point element in a and b | ||
3806 | // for not-equal, and return the boolean result (0 or 1). | ||
3807 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd | ||
3808 | FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b) | ||
3809 | { | ||
3810 | return !_mm_comieq_sd(a, b); | ||
3811 | } | ||
3812 | |||
// Convert packed signed 32-bit integers in a to packed double-precision
// (64-bit) floating-point elements, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Widen the two low s32 lanes to s64, then convert to f64 in one shot.
    return vreinterpretq_m128d_f64(
        vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
#else
    // ARMv7 fallback: convert lane-by-lane in scalar code.
    double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
    double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
    return _mm_set_pd(a1, a0);
#endif
}
3827 | |||
3828 | // Convert packed signed 32-bit integers in a to packed single-precision | ||
3829 | // (32-bit) floating-point elements, and store the results in dst. | ||
3830 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps | ||
3831 | FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) | ||
3832 | { | ||
3833 | return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); | ||
3834 | } | ||
3835 | |||
// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
{
    // vrnd32xq_f64 not supported on clang
#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
    // FRINT32X rounds to a value representable in 32 bits using the current
    // rounding mode, so the narrow from s64 to s32 below cannot overflow.
    float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));
    int64x2_t integers = vcvtq_s64_f64(rounded);
    return vreinterpretq_m128i_s32(
        vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
#else
    // Fallback: round per the current SSE rounding mode, then convert each
    // lane in scalar code; upper two s32 lanes are zeroed per the SSE spec.
    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
    double d0 = ((double *) &rnd)[0];
    double d1 = ((double *) &rnd)[1];
    return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
#endif
}
3854 | |||
3855 | // Convert packed double-precision (64-bit) floating-point elements in a to | ||
3856 | // packed 32-bit integers, and store the results in dst. | ||
3857 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32 | ||
3858 | FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a) | ||
3859 | { | ||
3860 | __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); | ||
3861 | double d0 = ((double *) &rnd)[0]; | ||
3862 | double d1 = ((double *) &rnd)[1]; | ||
3863 | int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1}; | ||
3864 | return vreinterpret_m64_s32(vld1_s32(data)); | ||
3865 | } | ||
3866 | |||
// Convert packed double-precision (64-bit) floating-point elements in a to
// packed single-precision (32-bit) floating-point elements, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Narrow both f64 lanes to f32; upper two lanes are zeroed per SSE spec.
    float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
    return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
#else
    // ARMv7 fallback: read the doubles out of the vector and narrow in scalar.
    float a0 = (float) ((double *) &a)[0];
    float a1 = (float) ((double *) &a)[1];
    return _mm_set_ps(0, 0, a1, a0);
#endif
}
3882 | |||
// Convert packed signed 32-bit integers in a to packed double-precision
// (64-bit) floating-point elements, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Widen both s32 lanes to s64 then convert to f64 in one instruction.
    return vreinterpretq_m128d_f64(
        vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
#else
    // ARMv7 fallback: convert each lane in scalar code.
    double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
    double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
    return _mm_set_pd(a1, a0);
#endif
}
3897 | |||
// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
// does not support! It is supported on ARMv8-A however.
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
{
#if defined(__ARM_FEATURE_FRINT)
    // FRINT32X honors the current rounding mode directly.
    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
#elif (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_DIRECTED_ROUNDING)
    // Map each SSE rounding mode to the matching directed-rounding convert.
    switch (_MM_GET_ROUNDING_MODE()) {
    case _MM_ROUND_NEAREST:
        return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
    case _MM_ROUND_DOWN:
        return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
    case _MM_ROUND_UP:
        return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
    default: // _MM_ROUND_TOWARD_ZERO
        return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
    }
#else
    // ARMv7 fallback: only truncating converts exist, so round-to-nearest-even
    // is emulated below by computing both a "round half away" result and a
    // "round to even" result, then selecting the even one exactly when the
    // fractional part is +/-0.5. The statement order here is load-bearing.
    float *f = (float *) &a;
    switch (_MM_GET_ROUNDING_MODE()) {
    case _MM_ROUND_NEAREST: {
        uint32x4_t signmask = vdupq_n_u32(0x80000000);
        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
        int32x4_t r_trunc = vcvtq_s32_f32(
            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
        float32x4_t delta = vsubq_f32(
            vreinterpretq_f32_m128(a),
            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
        uint32x4_t is_delta_half =
            vceqq_f32(delta, half); /* delta == +/- 0.5 */
        return vreinterpretq_m128i_s32(
            vbslq_s32(is_delta_half, r_even, r_normal));
    }
    case _MM_ROUND_DOWN:
        return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
                             floorf(f[0]));
    case _MM_ROUND_UP:
        return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
                             ceilf(f[0]));
    default: // _MM_ROUND_TOWARD_ZERO
        return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
                             (int32_t) f[0]);
    }
#endif
}
3954 | |||
// Convert packed single-precision (32-bit) floating-point elements in a to
// packed double-precision (64-bit) floating-point elements, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Widen the two low f32 lanes to f64 with a single instruction.
    return vreinterpretq_m128d_f64(
        vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
#else
    // ARMv7 fallback: widen lane-by-lane in scalar code.
    double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
    double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
    return _mm_set_pd(a1, a0);
#endif
}
3970 | |||
// Copy the lower double-precision (64-bit) floating-point element of a to dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
#else
    // ARMv7 stores __m128d as raw bits; read lane 0 back as a double.
    return ((double *) &a)[0];
#endif
}
3981 | |||
// Convert the lower double-precision (64-bit) floating-point element in a to a
// 32-bit integer, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // vrndiq_f64 rounds using the current FP rounding mode, matching SSE's
    // "current rounding direction" semantics for this intrinsic.
    return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
#else
    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
    double ret = ((double *) &rnd)[0];
    return (int32_t) ret;
#endif
}
3995 | |||
// Convert the lower double-precision (64-bit) floating-point element in a to a
// 64-bit integer, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Round per the current FP rounding mode, then convert lane 0.
    return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
#else
    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
    double ret = ((double *) &rnd)[0];
    return (int64_t) ret;
#endif
}
4009 | |||
// Convert the lower double-precision (64-bit) floating-point element in a to a
// 64-bit integer, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
// The "_si64x" spelling is an MSVC-style alias for the same conversion.
#define _mm_cvtsd_si64x _mm_cvtsd_si64
4014 | |||
// Convert the lower double-precision (64-bit) floating-point element in b to a
// single-precision (32-bit) floating-point element, store the result in the
// lower element of dst, and copy the upper 3 packed elements from a to the
// upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Narrow b's lane 0 to f32 and splice it into lane 0 of a.
    return vreinterpretq_m128_f32(vsetq_lane_f32(
        vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
        vreinterpretq_f32_m128(a), 0));
#else
    // ARMv7 fallback: read b's low double via raw bits and narrow in scalar.
    return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
                                                 vreinterpretq_f32_m128(a), 0));
#endif
}
4031 | |||
4032 | // Copy the lower 32-bit integer in a to dst. | ||
4033 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32 | ||
4034 | FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) | ||
4035 | { | ||
4036 | return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); | ||
4037 | } | ||
4038 | |||
4039 | // Copy the lower 64-bit integer in a to dst. | ||
4040 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64 | ||
4041 | FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) | ||
4042 | { | ||
4043 | return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); | ||
4044 | } | ||
4045 | |||
// Copy the lower 64-bit integer in a to dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
// The "_si64x" spelling is an MSVC-style alias for the same extraction.
#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4049 | |||
// Convert the signed 32-bit integer b to a double-precision (64-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
#else
    // ARMv7: convert in scalar, then insert the raw 64 bits of the double
    // into lane 0 (type-punning via pointer cast, as elsewhere in this file).
    double bf = (double) b;
    return vreinterpretq_m128d_s64(
        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
#endif
}
4065 | |||
// NOTE: a duplicate, token-identical "#define _mm_cvtsi128_si64x(a)
// _mm_cvtsi128_si64(a)" used to appear here; it is already defined a few
// lines above, so the redundant redefinition has been removed. The macro
// remains available from the earlier definition.
4069 | |||
4070 | // Copy 32-bit integer a to the lower elements of dst, and zero the upper | ||
4071 | // elements of dst. | ||
4072 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128 | ||
4073 | FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) | ||
4074 | { | ||
4075 | return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); | ||
4076 | } | ||
4077 | |||
// Convert the signed 64-bit integer b to a double-precision (64-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
#else
    // ARMv7: convert in scalar, then insert the raw 64 bits of the double
    // into lane 0.
    double bf = (double) b;
    return vreinterpretq_m128d_s64(
        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
#endif
}
4093 | |||
4094 | // Copy 64-bit integer a to the lower element of dst, and zero the upper | ||
4095 | // element. | ||
4096 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128 | ||
4097 | FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) | ||
4098 | { | ||
4099 | return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0)); | ||
4100 | } | ||
4101 | |||
// Copy 64-bit integer a to the lower element of dst, and zero the upper
// element.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
// The "_si64x" spelling is an MSVC-style alias for the same operation.
#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
4106 | |||
// Convert the signed 64-bit integer b to a double-precision (64-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
// The "_si64x" spelling is an MSVC-style alias for the same conversion.
#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
4112 | |||
// Convert the lower single-precision (32-bit) floating-point element in b to a
// double-precision (64-bit) floating-point element, store the result in the
// lower element of dst, and copy the upper element from a to the upper element
// of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
{
    // Widen b's lane 0 to double in scalar, then insert into lane 0 of a.
    double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
#else
    // ARMv7: insert the raw 64 bits of the double into lane 0.
    return vreinterpretq_m128d_s64(
        vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
#endif
}
4129 | |||
4130 | // Convert packed double-precision (64-bit) floating-point elements in a to | ||
4131 | // packed 32-bit integers with truncation, and store the results in dst. | ||
4132 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32 | ||
4133 | FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) | ||
4134 | { | ||
4135 | double a0 = ((double *) &a)[0]; | ||
4136 | double a1 = ((double *) &a)[1]; | ||
4137 | return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0); | ||
4138 | } | ||
4139 | |||
4140 | // Convert packed double-precision (64-bit) floating-point elements in a to | ||
4141 | // packed 32-bit integers with truncation, and store the results in dst. | ||
4142 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32 | ||
4143 | FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a) | ||
4144 | { | ||
4145 | double a0 = ((double *) &a)[0]; | ||
4146 | double a1 = ((double *) &a)[1]; | ||
4147 | int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1}; | ||
4148 | return vreinterpret_m64_s32(vld1_s32(data)); | ||
4149 | } | ||
4150 | |||
4151 | // Convert packed single-precision (32-bit) floating-point elements in a to | ||
4152 | // packed 32-bit integers with truncation, and store the results in dst. | ||
4153 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32 | ||
4154 | FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) | ||
4155 | { | ||
4156 | return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); | ||
4157 | } | ||
4158 | |||
4159 | // Convert the lower double-precision (64-bit) floating-point element in a to a | ||
4160 | // 32-bit integer with truncation, and store the result in dst. | ||
4161 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32 | ||
4162 | FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a) | ||
4163 | { | ||
4164 | double ret = *((double *) &a); | ||
4165 | return (int32_t) ret; | ||
4166 | } | ||
4167 | |||
// Convert the lower double-precision (64-bit) floating-point element in a to a
// 64-bit integer with truncation, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // vcvtq_s64_f64 truncates toward zero, matching SSE2 "tt" semantics.
    return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
#else
    // ARMv7 fallback: read lane 0 as a double and truncate via a C cast.
    double ret = *((double *) &a);
    return (int64_t) ret;
#endif
}
4180 | |||
// Convert the lower double-precision (64-bit) floating-point element in a to a
// 64-bit integer with truncation, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x
// The "_si64x" spelling is an MSVC-style alias for the same conversion.
#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4185 | |||
// Divide packed double-precision (64-bit) floating-point elements in a by
// packed elements in b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // ARMv7 has no f64 NEON divide: divide in scalar, then reload the raw
    // bits of the two doubles as the float32x4 representation of __m128d.
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] / db[0];
    c[1] = da[1] / db[1];
    return vld1q_f32((float32_t *) c);
#endif
}
4203 | |||
// Divide the lower double-precision (64-bit) floating-point element in a by the
// lower double-precision (64-bit) floating-point element in b, store the result
// in the lower element of dst, and copy the upper element from a to the upper
// element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Divide both lanes, then restore a's upper lane into the result.
    float64x2_t tmp =
        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
    return vreinterpretq_m128d_f64(
        vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
#else
    // ARMv7: full-width divide, then merge a's upper lane via _mm_move_sd.
    return _mm_move_sd(a, _mm_div_pd(a, b));
#endif
}
4220 | |||
// Extract a 16-bit integer from a, selected with imm8, and store the result in
// the lower element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16
// Implemented as a macro because the lane index must be a compile-time
// constant for vgetq_lane_u16.
// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
#define _mm_extract_epi16(a, imm) \
    vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
4227 | |||
// Copy a to dst, and insert the 16-bit integer i into dst at the location
// specified by imm8.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16
// Implemented as a macro because the lane index must be a compile-time
// constant for vsetq_lane_s16.
// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
//                                       __constrange(0,8) int imm)
#define _mm_insert_epi16(a, b, imm) \
    vreinterpretq_m128i_s16(        \
        vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm)))
4236 | |||
// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
// elements) from memory into dst. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
FORCE_INLINE __m128d _mm_load_pd(const double *p)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(vld1q_f64(p));
#else
    // ARMv7 stores __m128d as raw bits in a float32x4_t: copy the 16 bytes
    // through a float view without interpreting them as doubles.
    const float *fp = (const float *) p;
    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
    return vreinterpretq_m128d_f32(vld1q_f32(data));
#endif
}
4251 | |||
// Load a double-precision (64-bit) floating-point element from memory into both
// elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
// Alias: identical semantics to _mm_load1_pd.
#define _mm_load_pd1 _mm_load1_pd
4256 | |||
// Load a double-precision (64-bit) floating-point element from memory into the
// lower of dst, and zero the upper element. mem_addr does not need to be
// aligned on any particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
FORCE_INLINE __m128d _mm_load_sd(const double *p)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
#else
    // ARMv7: copy the 8 bytes of *p through a float view (raw-bit __m128d
    // representation) and zero the upper 64 bits.
    const float *fp = (const float *) p;
    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
    return vreinterpretq_m128d_f32(vld1q_f32(data));
#endif
}
4271 | |||
4272 | // Load 128-bits of integer data from memory into dst. mem_addr must be aligned | ||
4273 | // on a 16-byte boundary or a general-protection exception may be generated. | ||
4274 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128 | ||
4275 | FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) | ||
4276 | { | ||
4277 | return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); | ||
4278 | } | ||
4279 | |||
// Load a double-precision (64-bit) floating-point element from memory into both
// elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
FORCE_INLINE __m128d _mm_load1_pd(const double *p)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
#else
    // ARMv7: duplicate the raw 64 bits of *p into both lanes.
    return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
#endif
}
4291 | |||
// Load a double-precision (64-bit) floating-point element from memory into the
// upper element of dst, and copy the lower element from a to dst. mem_addr does
// not need to be aligned on any particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Keep a's low half; load *p into the high half.
    return vreinterpretq_m128d_f64(
        vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
#else
    // ARMv7: same combine on the raw-bit float32 representation.
    return vreinterpretq_m128d_f32(vcombine_f32(
        vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
#endif
}
4306 | |||
4307 | // Load 64-bit integer from memory into the first element of dst. | ||
4308 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64 | ||
4309 | FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) | ||
4310 | { | ||
4311 | /* Load the lower 64 bits of the value pointed to by p into the | ||
4312 | * lower 64 bits of the result, zeroing the upper 64 bits of the result. | ||
4313 | */ | ||
4314 | return vreinterpretq_m128i_s32( | ||
4315 | vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0))); | ||
4316 | } | ||
4317 | |||
// Load a double-precision (64-bit) floating-point element from memory into the
// lower element of dst, and copy the upper element from a to dst. mem_addr does
// not need to be aligned on any particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Load *p into the low half; keep a's high half.
    return vreinterpretq_m128d_f64(
        vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
#else
    // ARMv7: same combine on the raw-bit float32 representation.
    return vreinterpretq_m128d_f32(
        vcombine_f32(vld1_f32((const float *) p),
                     vget_high_f32(vreinterpretq_f32_m128d(a))));
#endif
}
4333 | |||
// Load 2 double-precision (64-bit) floating-point elements from memory into dst
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
// general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // vextq_f64(v, v, 1) rotates the two 64-bit lanes, swapping them.
    float64x2_t v = vld1q_f64(p);
    return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
#else
    // ARMv7: identical lane swap performed on the raw 64-bit integer view.
    int64x2_t v = vld1q_s64((const int64_t *) p);
    return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
#endif
}
4348 | |||
// Loads two double-precision from unaligned memory, floating-point values.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
// NEON vld1q has no alignment requirement, so the unaligned load can simply
// delegate to the aligned variant.
FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
{
    return _mm_load_pd(p);
}
4355 | |||
4356 | // Load 128-bits of integer data from memory into dst. mem_addr does not need to | ||
4357 | // be aligned on any particular boundary. | ||
4358 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128 | ||
4359 | FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) | ||
4360 | { | ||
4361 | return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); | ||
4362 | } | ||
4363 | |||
4364 | // Load unaligned 32-bit integer from memory into the first element of dst. | ||
4365 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32 | ||
4366 | FORCE_INLINE __m128i _mm_loadu_si32(const void *p) | ||
4367 | { | ||
4368 | return vreinterpretq_m128i_s32( | ||
4369 | vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0)); | ||
4370 | } | ||
4371 | |||
// Multiply packed signed 16-bit integers in a and b, producing intermediate
// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
// 32-bit integers, and pack the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
{
    // Widening multiply of the low four 16-bit lanes -> four 32-bit products.
    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
                              vget_low_s16(vreinterpretq_s16_m128i(b)));
#if defined(__aarch64__) || defined(_M_ARM64)
    // AArch64: multiply the high halves directly, then one pairwise add
    // combines adjacent products across both halves.
    int32x4_t high =
        vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));

    return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
#else
    // ARMv7 lacks vpaddq; pairwise-add each half separately (64-bit vpadd)
    // and recombine. Lane order matches the AArch64 path.
    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
                               vget_high_s16(vreinterpretq_s16_m128i(b)));

    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));

    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
#endif
}
4395 | |||
// Conditionally store 8-bit integer elements from a into memory using mask
// (elements are not stored when the highest bit is not set in the corresponding
// element) and a non-temporal memory hint. mem_addr does not need to be aligned
// on any particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
// NOTE(review): implemented as read-modify-write — the full 16 bytes at
// mem_addr are loaded, blended, and stored back, so all 16 bytes must be
// readable and writable even where the mask is clear.
FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
{
    // Arithmetic shift by 7 replicates each byte's sign bit: 0x00 or 0xFF.
    int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
    __m128 b = _mm_load_ps((const float *) mem_addr);
    // Bitwise select: take bytes from a where the mask byte is 0xFF,
    // otherwise keep the existing memory contents.
    int8x16_t masked =
        vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
                 vreinterpretq_s8_m128(b));
    vst1q_s8((int8_t *) mem_addr, masked);
}
4410 | |||
4411 | // Compare packed signed 16-bit integers in a and b, and store packed maximum | ||
4412 | // values in dst. | ||
4413 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16 | ||
4414 | FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) | ||
4415 | { | ||
4416 | return vreinterpretq_m128i_s16( | ||
4417 | vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); | ||
4418 | } | ||
4419 | |||
4420 | // Compare packed unsigned 8-bit integers in a and b, and store packed maximum | ||
4421 | // values in dst. | ||
4422 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8 | ||
4423 | FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) | ||
4424 | { | ||
4425 | return vreinterpretq_m128i_u8( | ||
4426 | vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); | ||
4427 | } | ||
4428 | |||
// Compare packed double-precision (64-bit) floating-point elements in a and b,
// and store packed maximum values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
#if SSE2NEON_PRECISE_MINMAX
    // Explicit (a > b) select reproduces x86 operand-ordering semantics
    // (b is returned when the compare is false, e.g. for NaN inputs),
    // which vmaxq_f64 does not guarantee — TODO confirm against maxpd spec.
    float64x2_t _a = vreinterpretq_f64_m128d(a);
    float64x2_t _b = vreinterpretq_f64_m128d(b);
    return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
#else
    // Fast path: native vector max.
    return vreinterpretq_m128d_f64(
        vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#endif
#else
    // ARMv7 has no f64 vectors: extract both 64-bit payloads, compare as
    // doubles via type punning, and keep the winning bit patterns.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
    d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
4455 | |||
// Compare the lower double-precision (64-bit) floating-point elements in a and
// b, store the maximum value in the lower element of dst, and copy the upper
// element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Vector max, then splice: low lane from the max, high lane from a.
    return _mm_move_sd(a, _mm_max_pd(a, b));
#else
    // ARMv7 scalar fallback: compare the low doubles, keep a's high double.
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
#endif
}
4471 | |||
4472 | // Compare packed signed 16-bit integers in a and b, and store packed minimum | ||
4473 | // values in dst. | ||
4474 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16 | ||
4475 | FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) | ||
4476 | { | ||
4477 | return vreinterpretq_m128i_s16( | ||
4478 | vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); | ||
4479 | } | ||
4480 | |||
4481 | // Compare packed unsigned 8-bit integers in a and b, and store packed minimum | ||
4482 | // values in dst. | ||
4483 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8 | ||
4484 | FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) | ||
4485 | { | ||
4486 | return vreinterpretq_m128i_u8( | ||
4487 | vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); | ||
4488 | } | ||
4489 | |||
// Compare packed double-precision (64-bit) floating-point elements in a and b,
// and store packed minimum values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
#if SSE2NEON_PRECISE_MINMAX
    // Explicit (a < b) select reproduces x86 operand-ordering semantics
    // (b is returned when the compare is false, e.g. for NaN inputs),
    // which vminq_f64 does not guarantee — TODO confirm against minpd spec.
    float64x2_t _a = vreinterpretq_f64_m128d(a);
    float64x2_t _b = vreinterpretq_f64_m128d(b);
    return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
#else
    // Fast path: native vector min.
    return vreinterpretq_m128d_f64(
        vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#endif
#else
    // ARMv7 has no f64 vectors: extract both 64-bit payloads, compare as
    // doubles via type punning, and keep the winning bit patterns.
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
    d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
4515 | |||
// Compare the lower double-precision (64-bit) floating-point elements in a and
// b, store the minimum value in the lower element of dst, and copy the upper
// element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Vector min, then splice: low lane from the min, high lane from a.
    return _mm_move_sd(a, _mm_min_pd(a, b));
#else
    // ARMv7 scalar fallback: compare the low doubles, keep a's high double.
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
#endif
}
4531 | |||
4532 | // Copy the lower 64-bit integer in a to the lower element of dst, and zero the | ||
4533 | // upper element. | ||
4534 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64 | ||
4535 | FORCE_INLINE __m128i _mm_move_epi64(__m128i a) | ||
4536 | { | ||
4537 | return vreinterpretq_m128i_s64( | ||
4538 | vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); | ||
4539 | } | ||
4540 | |||
4541 | // Move the lower double-precision (64-bit) floating-point element from b to the | ||
4542 | // lower element of dst, and copy the upper element from a to the upper element | ||
4543 | // of dst. | ||
4544 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd | ||
4545 | FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) | ||
4546 | { | ||
4547 | return vreinterpretq_m128d_f32( | ||
4548 | vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)), | ||
4549 | vget_high_f32(vreinterpretq_f32_m128d(a)))); | ||
4550 | } | ||
4551 | |||
// Create mask from the most significant bit of each 8-bit element in a, and
// store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8
//
// Strategy: isolate each byte's sign bit, then use three shift-right-and-
// accumulate (vsra) steps at increasing widths (16/32/64 bit) to funnel the
// 8 sign bits of each half into that half's lowest byte. The shift amounts
// (7, 14, 28) place each partial mask next to its neighbour before the add,
// which acts as a bitwise OR because the set bits never overlap. The result
// bit order shown below assumes little endian.
FORCE_INLINE int _mm_movemask_epi8(__m128i a)
{
    uint8x16_t input = vreinterpretq_u8_m128i(a);

    // Each byte becomes 0x00 or 0x01 (its former sign bit).
    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));

    // Pair adjacent bytes: each 16-bit lane's low byte now holds 2 mask bits.
    uint32x4_t paired16 =
        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));

    // Pair adjacent 16-bit lanes: each 32-bit lane's low byte holds 4 bits.
    uint64x2_t paired32 =
        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));

    // Pair adjacent 32-bit lanes: each 64-bit lane's low byte holds 8 bits.
    uint8x16_t paired64 =
        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));

    // Byte 0 carries mask bits 0-7, byte 8 carries mask bits 8-15.
    return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
}
4634 | |||
4635 | // Set each bit of mask dst based on the most significant bit of the | ||
4636 | // corresponding packed double-precision (64-bit) floating-point element in a. | ||
4637 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd | ||
4638 | FORCE_INLINE int _mm_movemask_pd(__m128d a) | ||
4639 | { | ||
4640 | uint64x2_t input = vreinterpretq_u64_m128d(a); | ||
4641 | uint64x2_t high_bits = vshrq_n_u64(input, 63); | ||
4642 | return (int) (vgetq_lane_u64(high_bits, 0) | | ||
4643 | (vgetq_lane_u64(high_bits, 1) << 1)); | ||
4644 | } | ||
4645 | |||
4646 | // Copy the lower 64-bit integer in a to dst. | ||
4647 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64 | ||
4648 | FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) | ||
4649 | { | ||
4650 | return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a))); | ||
4651 | } | ||
4652 | |||
4653 | // Copy the 64-bit integer a to the lower element of dst, and zero the upper | ||
4654 | // element. | ||
4655 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64 | ||
4656 | FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) | ||
4657 | { | ||
4658 | return vreinterpretq_m128i_s64( | ||
4659 | vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0))); | ||
4660 | } | ||
4661 | |||
4662 | // Multiply the low unsigned 32-bit integers from each packed 64-bit element in | ||
4663 | // a and b, and store the unsigned 64-bit results in dst. | ||
4664 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32 | ||
4665 | FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) | ||
4666 | { | ||
4667 | // vmull_u32 upcasts instead of masking, so we downcast. | ||
4668 | uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a)); | ||
4669 | uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b)); | ||
4670 | return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo)); | ||
4671 | } | ||
4672 | |||
// Multiply packed double-precision (64-bit) floating-point elements in a and b,
// and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Native double-precision vector multiply.
    return vreinterpretq_m128d_f64(
        vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // ARMv7 fallback: reinterpret the vectors as two doubles each, multiply
    // in scalar code, and reload the result bit pattern.
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] * db[0];
    c[1] = da[1] * db[1];
    return vld1q_f32((float32_t *) c);
#endif
}
4690 | |||
4691 | // Multiply the lower double-precision (64-bit) floating-point element in a and | ||
4692 | // b, store the result in the lower element of dst, and copy the upper element | ||
4693 | // from a to the upper element of dst. | ||
4694 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd | ||
4695 | FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) | ||
4696 | { | ||
4697 | return _mm_move_sd(a, _mm_mul_pd(a, b)); | ||
4698 | } | ||
4699 | |||
4700 | // Multiply the low unsigned 32-bit integers from a and b, and store the | ||
4701 | // unsigned 64-bit result in dst. | ||
4702 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32 | ||
4703 | FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) | ||
4704 | { | ||
4705 | return vreinterpret_m64_u64(vget_low_u64( | ||
4706 | vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b)))); | ||
4707 | } | ||
4708 | |||
// Multiply the packed signed 16-bit integers in a and b, producing intermediate
// 32-bit integers, and store the high 16 bits of the intermediate integers in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16
FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
{
    /* FIXME: issue with large values because of result saturation */
    // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
    // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
    // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
    // Widening multiplies of both halves give full 32-bit products.
    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
    // Unzip the 16-bit halves of each 32-bit product; val[1] collects the
    // odd (high) 16-bit halves, i.e. the desired mulhi results.
    uint16x8x2_t r =
        vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
    return vreinterpretq_m128i_u16(r.val[1]);
}
4729 | |||
// Multiply the packed unsigned 16-bit integers in a and b, producing
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
// integers in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
{
    // Widening multiply of the low halves -> four full 32-bit products.
    uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
    uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
    uint32x4_t ab3210 = vmull_u16(a3210, b3210);
#if defined(__aarch64__) || defined(_M_ARM64)
    // AArch64: multiply high halves in one instruction, then vuzp2 keeps the
    // odd (high) 16-bit halves of each 32-bit product.
    uint32x4_t ab7654 =
        vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
    uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
                              vreinterpretq_u16_u32(ab7654));
    return vreinterpretq_m128i_u16(r);
#else
    // ARMv7: split, multiply the high halves, and unzip; val[1] holds the
    // high 16 bits of every product.
    uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
    uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
    uint32x4_t ab7654 = vmull_u16(a7654, b7654);
    uint16x8x2_t r =
        vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
    return vreinterpretq_m128i_u16(r.val[1]);
#endif
}
4754 | |||
4755 | // Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit | ||
4756 | // integers, and store the low 16 bits of the intermediate integers in dst. | ||
4757 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16 | ||
4758 | FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) | ||
4759 | { | ||
4760 | return vreinterpretq_m128i_s16( | ||
4761 | vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); | ||
4762 | } | ||
4763 | |||
4764 | // Compute the bitwise OR of packed double-precision (64-bit) floating-point | ||
4765 | // elements in a and b, and store the results in dst. | ||
4766 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd | ||
4767 | FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) | ||
4768 | { | ||
4769 | return vreinterpretq_m128d_s64( | ||
4770 | vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); | ||
4771 | } | ||
4772 | |||
4773 | // Compute the bitwise OR of 128 bits (representing integer data) in a and b, | ||
4774 | // and store the result in dst. | ||
4775 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128 | ||
4776 | FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) | ||
4777 | { | ||
4778 | return vreinterpretq_m128i_s32( | ||
4779 | vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); | ||
4780 | } | ||
4781 | |||
4782 | // Convert packed signed 16-bit integers from a and b to packed 8-bit integers | ||
4783 | // using signed saturation, and store the results in dst. | ||
4784 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16 | ||
4785 | FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) | ||
4786 | { | ||
4787 | return vreinterpretq_m128i_s8( | ||
4788 | vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), | ||
4789 | vqmovn_s16(vreinterpretq_s16_m128i(b)))); | ||
4790 | } | ||
4791 | |||
4792 | // Convert packed signed 32-bit integers from a and b to packed 16-bit integers | ||
4793 | // using signed saturation, and store the results in dst. | ||
4794 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32 | ||
4795 | FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) | ||
4796 | { | ||
4797 | return vreinterpretq_m128i_s16( | ||
4798 | vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)), | ||
4799 | vqmovn_s32(vreinterpretq_s32_m128i(b)))); | ||
4800 | } | ||
4801 | |||
4802 | // Convert packed signed 16-bit integers from a and b to packed 8-bit integers | ||
4803 | // using unsigned saturation, and store the results in dst. | ||
4804 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16 | ||
4805 | FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) | ||
4806 | { | ||
4807 | return vreinterpretq_m128i_u8( | ||
4808 | vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), | ||
4809 | vqmovun_s16(vreinterpretq_s16_m128i(b)))); | ||
4810 | } | ||
4811 | |||
// Pause the processor. This is typically used in spin-wait loops and depending
// on the x86 processor typical values are in the 40-100 cycle range. The
// 'yield' instruction isn't a good fit because it's effectively a nop on most
// Arm cores. Experience with several databases has shown an 'isb' is a
// reasonable approximation.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
FORCE_INLINE void _mm_pause(void)
{
#if defined(_MSC_VER)
    // MSVC has no inline asm on ARM64; use the intrinsic barrier instead.
    __isb(_ARM64_BARRIER_SY);
#else
    __asm__ __volatile__("isb\n");
#endif
}
4826 | |||
4827 | // Compute the absolute differences of packed unsigned 8-bit integers in a and | ||
4828 | // b, then horizontally sum each consecutive 8 differences to produce two | ||
4829 | // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low | ||
4830 | // 16 bits of 64-bit elements in dst. | ||
4831 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8 | ||
4832 | FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) | ||
4833 | { | ||
4834 | uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b)); | ||
4835 | return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t))); | ||
4836 | } | ||
4837 | |||
4838 | // Set packed 16-bit integers in dst with the supplied values. | ||
4839 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16 | ||
4840 | FORCE_INLINE __m128i _mm_set_epi16(short i7, | ||
4841 | short i6, | ||
4842 | short i5, | ||
4843 | short i4, | ||
4844 | short i3, | ||
4845 | short i2, | ||
4846 | short i1, | ||
4847 | short i0) | ||
4848 | { | ||
4849 | int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; | ||
4850 | return vreinterpretq_m128i_s16(vld1q_s16(data)); | ||
4851 | } | ||
4852 | |||
4853 | // Set packed 32-bit integers in dst with the supplied values. | ||
4854 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32 | ||
4855 | FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) | ||
4856 | { | ||
4857 | int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3}; | ||
4858 | return vreinterpretq_m128i_s32(vld1q_s32(data)); | ||
4859 | } | ||
4860 | |||
4861 | // Set packed 64-bit integers in dst with the supplied values. | ||
4862 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64 | ||
4863 | FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2) | ||
4864 | { | ||
4865 | return _mm_set_epi64x(vget_lane_s64(i1, 0), vget_lane_s64(i2, 0)); | ||
4866 | } | ||
4867 | |||
4868 | // Set packed 64-bit integers in dst with the supplied values. | ||
4869 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x | ||
4870 | FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2) | ||
4871 | { | ||
4872 | return vreinterpretq_m128i_s64( | ||
4873 | vcombine_s64(vcreate_s64(i2), vcreate_s64(i1))); | ||
4874 | } | ||
4875 | |||
// Set packed 8-bit integers in dst with the supplied values; b0 lands in the
// lowest lane, b15 in the highest.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
                                  signed char b14,
                                  signed char b13,
                                  signed char b12,
                                  signed char b11,
                                  signed char b10,
                                  signed char b9,
                                  signed char b8,
                                  signed char b7,
                                  signed char b6,
                                  signed char b5,
                                  signed char b4,
                                  signed char b3,
                                  signed char b2,
                                  signed char b1,
                                  signed char b0)
{
    // Arguments arrive most-significant first; the array is written in
    // memory (lane) order, lowest lane first.
    int8_t ALIGN_STRUCT(16)
        data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
                    (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
                    (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
    return (__m128i) vld1q_s8(data);
}
4902 | |||
// Set packed double-precision (64-bit) floating-point elements in dst with the
// supplied values; e0 lands in the lower element.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
{
    double ALIGN_STRUCT(16) data[2] = {e0, e1};
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
#else
    // ARMv7: load the same 16 bytes as four f32 lanes (bit-exact).
    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
#endif
}
4915 | |||
// Broadcast double-precision (64-bit) floating-point value a to all elements of
// dst. Alias of _mm_set1_pd, matching the x86 intrinsic's definition.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1
#define _mm_set_pd1 _mm_set1_pd
4920 | |||
// Copy double-precision (64-bit) floating-point element a to the lower element
// of dst, and zero the upper element.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
FORCE_INLINE __m128d _mm_set_sd(double a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Insert a into lane 0 of a zeroed f64 vector.
    return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
#else
    // ARMv7: reuse _mm_set_pd with a zero upper element.
    return _mm_set_pd(0, a);
#endif
}
4932 | |||
4933 | // Broadcast 16-bit integer a to all elements of dst. | ||
4934 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16 | ||
4935 | FORCE_INLINE __m128i _mm_set1_epi16(short w) | ||
4936 | { | ||
4937 | return vreinterpretq_m128i_s16(vdupq_n_s16(w)); | ||
4938 | } | ||
4939 | |||
4940 | // Broadcast 32-bit integer a to all elements of dst. | ||
4941 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32 | ||
4942 | FORCE_INLINE __m128i _mm_set1_epi32(int _i) | ||
4943 | { | ||
4944 | return vreinterpretq_m128i_s32(vdupq_n_s32(_i)); | ||
4945 | } | ||
4946 | |||
4947 | // Broadcast 64-bit integer a to all elements of dst. | ||
4948 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64 | ||
4949 | FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) | ||
4950 | { | ||
4951 | return vreinterpretq_m128i_s64(vdupq_lane_s64(_i, 0)); | ||
4952 | } | ||
4953 | |||
4954 | // Broadcast 64-bit integer a to all elements of dst. | ||
4955 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x | ||
4956 | FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) | ||
4957 | { | ||
4958 | return vreinterpretq_m128i_s64(vdupq_n_s64(_i)); | ||
4959 | } | ||
4960 | |||
4961 | // Broadcast 8-bit integer a to all elements of dst. | ||
4962 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8 | ||
4963 | FORCE_INLINE __m128i _mm_set1_epi8(signed char w) | ||
4964 | { | ||
4965 | return vreinterpretq_m128i_s8(vdupq_n_s8(w)); | ||
4966 | } | ||
4967 | |||
// Broadcast double-precision (64-bit) floating-point value d to both elements
// of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
FORCE_INLINE __m128d _mm_set1_pd(double d)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(vdupq_n_f64(d));
#else
    // ARMv7: duplicate the raw 64-bit pattern of d (bit-exact broadcast).
    return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
#endif
}
4979 | |||
4980 | // Set packed 16-bit integers in dst with the supplied values in reverse order. | ||
4981 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16 | ||
4982 | FORCE_INLINE __m128i _mm_setr_epi16(short w0, | ||
4983 | short w1, | ||
4984 | short w2, | ||
4985 | short w3, | ||
4986 | short w4, | ||
4987 | short w5, | ||
4988 | short w6, | ||
4989 | short w7) | ||
4990 | { | ||
4991 | int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; | ||
4992 | return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); | ||
4993 | } | ||
4994 | |||
4995 | // Set packed 32-bit integers in dst with the supplied values in reverse order. | ||
4996 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32 | ||
4997 | FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) | ||
4998 | { | ||
4999 | int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; | ||
5000 | return vreinterpretq_m128i_s32(vld1q_s32(data)); | ||
5001 | } | ||
5002 | |||
5003 | // Set packed 64-bit integers in dst with the supplied values in reverse order. | ||
5004 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64 | ||
5005 | FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) | ||
5006 | { | ||
5007 | return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); | ||
5008 | } | ||
5009 | |||
5010 | // Set packed 8-bit integers in dst with the supplied values in reverse order. | ||
5011 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8 | ||
5012 | FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, | ||
5013 | signed char b1, | ||
5014 | signed char b2, | ||
5015 | signed char b3, | ||
5016 | signed char b4, | ||
5017 | signed char b5, | ||
5018 | signed char b6, | ||
5019 | signed char b7, | ||
5020 | signed char b8, | ||
5021 | signed char b9, | ||
5022 | signed char b10, | ||
5023 | signed char b11, | ||
5024 | signed char b12, | ||
5025 | signed char b13, | ||
5026 | signed char b14, | ||
5027 | signed char b15) | ||
5028 | { | ||
5029 | int8_t ALIGN_STRUCT(16) | ||
5030 | data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, | ||
5031 | (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, | ||
5032 | (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, | ||
5033 | (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; | ||
5034 | return (__m128i) vld1q_s8(data); | ||
5035 | } | ||
5036 | |||
// Set packed double-precision (64-bit) floating-point elements in dst with the
// supplied values in reverse order.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd
FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
{
    // Simply _mm_set_pd with the arguments swapped: e1 -> low lane, e0 -> high.
    return _mm_set_pd(e0, e1);
}
5044 | |||
// Return vector of type __m128d with all elements set to zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
FORCE_INLINE __m128d _mm_setzero_pd(void)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(vdupq_n_f64(0));
#else
    // ARMv7: no float64x2_t; an all-zero f32 vector has the same bit pattern.
    return vreinterpretq_m128d_f32(vdupq_n_f32(0));
#endif
}
5055 | |||
5056 | // Return vector of type __m128i with all elements set to zero. | ||
5057 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128 | ||
5058 | FORCE_INLINE __m128i _mm_setzero_si128(void) | ||
5059 | { | ||
5060 | return vreinterpretq_m128i_s32(vdupq_n_s32(0)); | ||
5061 | } | ||
5062 | |||
// Shuffle 32-bit integers in a using the control in imm8, and store the results
// in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32
// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
//                                        __constrange(0,255) int imm)
// With compiler shuffle support (_sse2neon_shuffle) the control byte is lowered
// directly to a vector shuffle; otherwise common immediates dispatch to
// hand-written helpers and the rest fall back to the generic routine.
#if defined(_sse2neon_shuffle)
#define _mm_shuffle_epi32(a, imm) \
    __extension__({ \
        int32x4_t _input = vreinterpretq_s32_m128i(a); \
        int32x4_t _shuf = \
            vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
                          ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
        vreinterpretq_m128i_s32(_shuf); \
    })
#else  // generic
#define _mm_shuffle_epi32(a, imm) \
    _sse2neon_define1( \
        __m128i, a, __m128i ret; switch (imm) { \
            case _MM_SHUFFLE(1, 0, 3, 2): \
                ret = _mm_shuffle_epi_1032(_a); \
                break; \
            case _MM_SHUFFLE(2, 3, 0, 1): \
                ret = _mm_shuffle_epi_2301(_a); \
                break; \
            case _MM_SHUFFLE(0, 3, 2, 1): \
                ret = _mm_shuffle_epi_0321(_a); \
                break; \
            case _MM_SHUFFLE(2, 1, 0, 3): \
                ret = _mm_shuffle_epi_2103(_a); \
                break; \
            case _MM_SHUFFLE(1, 0, 1, 0): \
                ret = _mm_shuffle_epi_1010(_a); \
                break; \
            case _MM_SHUFFLE(1, 0, 0, 1): \
                ret = _mm_shuffle_epi_1001(_a); \
                break; \
            case _MM_SHUFFLE(0, 1, 0, 1): \
                ret = _mm_shuffle_epi_0101(_a); \
                break; \
            case _MM_SHUFFLE(2, 2, 1, 1): \
                ret = _mm_shuffle_epi_2211(_a); \
                break; \
            case _MM_SHUFFLE(0, 1, 2, 2): \
                ret = _mm_shuffle_epi_0122(_a); \
                break; \
            case _MM_SHUFFLE(3, 3, 3, 2): \
                ret = _mm_shuffle_epi_3332(_a); \
                break; \
            case _MM_SHUFFLE(0, 0, 0, 0): \
                ret = _mm_shuffle_epi32_splat(_a, 0); \
                break; \
            case _MM_SHUFFLE(1, 1, 1, 1): \
                ret = _mm_shuffle_epi32_splat(_a, 1); \
                break; \
            case _MM_SHUFFLE(2, 2, 2, 2): \
                ret = _mm_shuffle_epi32_splat(_a, 2); \
                break; \
            case _MM_SHUFFLE(3, 3, 3, 3): \
                ret = _mm_shuffle_epi32_splat(_a, 3); \
                break; \
            default: \
                ret = _mm_shuffle_epi32_default(_a, (imm)); \
                break; \
        } _sse2neon_return(ret);)
#endif
5128 | |||
// Shuffle double-precision (64-bit) floating-point elements using the control
// in imm8, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd
// imm8 bit 0 selects the lane taken from a (low result lane); bit 1 selects
// the lane taken from b (high result lane).
#ifdef _sse2neon_shuffle
#define _mm_shuffle_pd(a, b, imm8) \
    vreinterpretq_m128d_s64( \
        vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
                      imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))
#else
#define _mm_shuffle_pd(a, b, imm8) \
    _mm_castsi128_pd(_mm_set_epi64x( \
        vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
        vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
#endif
5143 | |||
// Shuffle the upper four 16-bit lanes of a by imm while keeping the lower four
// lanes unchanged (SSE2 _mm_shufflehi_epi16).
// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
//                                          __constrange(0,255) int imm)
#if defined(_sse2neon_shuffle)
#define _mm_shufflehi_epi16(a, imm) \
    __extension__({ \
        int16x8_t _input = vreinterpretq_s16_m128i(a); \
        int16x8_t _shuf = \
            vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
                          (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
                          (((imm) >> 6) & 0x3) + 4); \
        vreinterpretq_m128i_s16(_shuf); \
    })
#else  // generic
#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
#endif
5159 | |||
// Shuffle the lower four 16-bit lanes of a by imm while keeping the upper four
// lanes unchanged (SSE2 _mm_shufflelo_epi16).
// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
//                                          __constrange(0,255) int imm)
#if defined(_sse2neon_shuffle)
#define _mm_shufflelo_epi16(a, imm) \
    __extension__({ \
        int16x8_t _input = vreinterpretq_s16_m128i(a); \
        int16x8_t _shuf = vshuffleq_s16( \
            _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
            (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
        vreinterpretq_m128i_s16(_shuf); \
    })
#else  // generic
#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
#endif
5174 | |||
5175 | // Shift packed 16-bit integers in a left by count while shifting in zeros, and | ||
5176 | // store the results in dst. | ||
5177 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16 | ||
5178 | FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) | ||
5179 | { | ||
5180 | uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); | ||
5181 | if (_sse2neon_unlikely(c & ~15)) | ||
5182 | return _mm_setzero_si128(); | ||
5183 | |||
5184 | int16x8_t vc = vdupq_n_s16((int16_t) c); | ||
5185 | return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); | ||
5186 | } | ||
5187 | |||
5188 | // Shift packed 32-bit integers in a left by count while shifting in zeros, and | ||
5189 | // store the results in dst. | ||
5190 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32 | ||
5191 | FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) | ||
5192 | { | ||
5193 | uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); | ||
5194 | if (_sse2neon_unlikely(c & ~31)) | ||
5195 | return _mm_setzero_si128(); | ||
5196 | |||
5197 | int32x4_t vc = vdupq_n_s32((int32_t) c); | ||
5198 | return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); | ||
5199 | } | ||
5200 | |||
5201 | // Shift packed 64-bit integers in a left by count while shifting in zeros, and | ||
5202 | // store the results in dst. | ||
5203 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64 | ||
5204 | FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) | ||
5205 | { | ||
5206 | uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); | ||
5207 | if (_sse2neon_unlikely(c & ~63)) | ||
5208 | return _mm_setzero_si128(); | ||
5209 | |||
5210 | int64x2_t vc = vdupq_n_s64((int64_t) c); | ||
5211 | return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); | ||
5212 | } | ||
5213 | |||
5214 | // Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and | ||
5215 | // store the results in dst. | ||
5216 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16 | ||
5217 | FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm) | ||
5218 | { | ||
5219 | if (_sse2neon_unlikely(imm & ~15)) | ||
5220 | return _mm_setzero_si128(); | ||
5221 | return vreinterpretq_m128i_s16( | ||
5222 | vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm))); | ||
5223 | } | ||
5224 | |||
5225 | // Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and | ||
5226 | // store the results in dst. | ||
5227 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32 | ||
5228 | FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) | ||
5229 | { | ||
5230 | if (_sse2neon_unlikely(imm & ~31)) | ||
5231 | return _mm_setzero_si128(); | ||
5232 | return vreinterpretq_m128i_s32( | ||
5233 | vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm))); | ||
5234 | } | ||
5235 | |||
5236 | // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and | ||
5237 | // store the results in dst. | ||
5238 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64 | ||
5239 | FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) | ||
5240 | { | ||
5241 | if (_sse2neon_unlikely(imm & ~63)) | ||
5242 | return _mm_setzero_si128(); | ||
5243 | return vreinterpretq_m128i_s64( | ||
5244 | vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm))); | ||
5245 | } | ||
5246 | |||
// Shift a left by imm8 bytes while shifting in zeros, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
// vextq_s8 needs a compile-time-constant lane index, so the clamped
// (16 - imm) expression keeps it in range even though the out-of-range cases
// were already handled by the preceding branches.
#define _mm_slli_si128(a, imm) \
    _sse2neon_define1( \
        __m128i, a, int8x16_t ret; \
        if (_sse2neon_unlikely(imm == 0)) ret = vreinterpretq_s8_m128i(_a); \
        else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \
        else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a), \
                            ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \
        _sse2neon_return(vreinterpretq_m128i_s8(ret));)
5258 | |||
// Compute the square root of packed double-precision (64-bit) floating-point
// elements in a, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
#else
    // ARMv7 has no double-precision vector sqrt: compute each lane with libm
    // and repack (a1 is the high lane in _mm_set_pd's argument order).
    double a0 = sqrt(((double *) &a)[0]);
    double a1 = sqrt(((double *) &a)[1]);
    return _mm_set_pd(a1, a0);
#endif
}
5272 | |||
// Compute the square root of the lower double-precision (64-bit) floating-point
// element in b, store the result in the lower element of dst, and copy the
// upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Take the low lane of sqrt(b) and the high lane of a.
    return _mm_move_sd(a, _mm_sqrt_pd(b));
#else
    // Scalar fallback: high lane of a, libm sqrt of b's low lane.
    return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
#endif
}
5285 | |||
5286 | // Shift packed 16-bit integers in a right by count while shifting in sign bits, | ||
5287 | // and store the results in dst. | ||
5288 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16 | ||
5289 | FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) | ||
5290 | { | ||
5291 | int64_t c = vgetq_lane_s64(count, 0); | ||
5292 | if (_sse2neon_unlikely(c & ~15)) | ||
5293 | return _mm_cmplt_epi16(a, _mm_setzero_si128()); | ||
5294 | return vreinterpretq_m128i_s16( | ||
5295 | vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c))); | ||
5296 | } | ||
5297 | |||
5298 | // Shift packed 32-bit integers in a right by count while shifting in sign bits, | ||
5299 | // and store the results in dst. | ||
5300 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32 | ||
5301 | FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) | ||
5302 | { | ||
5303 | int64_t c = vgetq_lane_s64(count, 0); | ||
5304 | if (_sse2neon_unlikely(c & ~31)) | ||
5305 | return _mm_cmplt_epi32(a, _mm_setzero_si128()); | ||
5306 | return vreinterpretq_m128i_s32( | ||
5307 | vshlq_s32((int32x4_t) a, vdupq_n_s32((int) -c))); | ||
5308 | } | ||
5309 | |||
5310 | // Shift packed 16-bit integers in a right by imm8 while shifting in sign | ||
5311 | // bits, and store the results in dst. | ||
5312 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16 | ||
5313 | FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) | ||
5314 | { | ||
5315 | const int count = (imm & ~15) ? 15 : imm; | ||
5316 | return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count)); | ||
5317 | } | ||
5318 | |||
// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
// and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32
// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
// imm == 0 is the identity; 1..31 uses a NEON left shift by a negative count;
// imm >= 32 saturates every lane to its sign bit (arithmetic shift by 31).
#define _mm_srai_epi32(a, imm) \
    _sse2neon_define0( \
        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) == 0)) { \
            ret = _a; \
        } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \
            ret = vreinterpretq_m128i_s32( \
                vshlq_s32(vreinterpretq_s32_m128i(_a), vdupq_n_s32(-(imm)))); \
        } else { \
            ret = vreinterpretq_m128i_s32( \
                vshrq_n_s32(vreinterpretq_s32_m128i(_a), 31)); \
        } _sse2neon_return(ret);)
5334 | |||
5335 | // Shift packed 16-bit integers in a right by count while shifting in zeros, and | ||
5336 | // store the results in dst. | ||
5337 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16 | ||
5338 | FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) | ||
5339 | { | ||
5340 | uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); | ||
5341 | if (_sse2neon_unlikely(c & ~15)) | ||
5342 | return _mm_setzero_si128(); | ||
5343 | |||
5344 | int16x8_t vc = vdupq_n_s16(-(int16_t) c); | ||
5345 | return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); | ||
5346 | } | ||
5347 | |||
5348 | // Shift packed 32-bit integers in a right by count while shifting in zeros, and | ||
5349 | // store the results in dst. | ||
5350 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32 | ||
5351 | FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) | ||
5352 | { | ||
5353 | uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); | ||
5354 | if (_sse2neon_unlikely(c & ~31)) | ||
5355 | return _mm_setzero_si128(); | ||
5356 | |||
5357 | int32x4_t vc = vdupq_n_s32(-(int32_t) c); | ||
5358 | return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); | ||
5359 | } | ||
5360 | |||
5361 | // Shift packed 64-bit integers in a right by count while shifting in zeros, and | ||
5362 | // store the results in dst. | ||
5363 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64 | ||
5364 | FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) | ||
5365 | { | ||
5366 | uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); | ||
5367 | if (_sse2neon_unlikely(c & ~63)) | ||
5368 | return _mm_setzero_si128(); | ||
5369 | |||
5370 | int64x2_t vc = vdupq_n_s64(-(int64_t) c); | ||
5371 | return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); | ||
5372 | } | ||
5373 | |||
// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
// Counts outside [0, 15] zero the result; otherwise an unsigned vshl by a
// negative count performs the logical right shift.
#define _mm_srli_epi16(a, imm) \
    _sse2neon_define0( \
        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) { \
            ret = _mm_setzero_si128(); \
        } else { \
            ret = vreinterpretq_m128i_u16( \
                vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm)))); \
        } _sse2neon_return(ret);)
5385 | |||
// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32
// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
// Counts outside [0, 31] zero the result; otherwise an unsigned vshl by a
// negative count performs the logical right shift.
#define _mm_srli_epi32(a, imm) \
    _sse2neon_define0( \
        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~31)) { \
            ret = _mm_setzero_si128(); \
        } else { \
            ret = vreinterpretq_m128i_u32( \
                vshlq_u32(vreinterpretq_u32_m128i(_a), vdupq_n_s32(-(imm)))); \
        } _sse2neon_return(ret);)
5398 | |||
// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64
// Counts outside [0, 63] zero the result; otherwise an unsigned vshl by a
// negative count performs the logical right shift.
#define _mm_srli_epi64(a, imm) \
    _sse2neon_define0( \
        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~63)) { \
            ret = _mm_setzero_si128(); \
        } else { \
            ret = vreinterpretq_m128i_u64( \
                vshlq_u64(vreinterpretq_u64_m128i(_a), vdupq_n_s64(-(imm)))); \
        } _sse2neon_return(ret);)
5410 | |||
// Shift a right by imm8 bytes while shifting in zeros, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128
// vextq_s8 needs a compile-time-constant index, so the (imm > 15 ? 0 : imm)
// clamp keeps it in range even though that case already returned zeros.
#define _mm_srli_si128(a, imm) \
    _sse2neon_define1( \
        __m128i, a, int8x16_t ret; \
        if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \
        else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \
                            (imm > 15 ? 0 : imm)); \
        _sse2neon_return(vreinterpretq_m128i_s8(ret));)
5421 | |||
// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
// or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
#else
    // ARMv7: store the same 128 bits via the f32 view of the register.
    vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
#endif
}
5434 | |||
// Store the lower double-precision (64-bit) floating-point element from a into
// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Duplicate the low double into both lanes, then store 128 bits.
    float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
    vst1q_f64((float64_t *) mem_addr,
              vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
#else
    // ARMv7: the low 64 bits are the low f32 pair; duplicate and store.
    float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
    vst1q_f32((float32_t *) mem_addr,
              vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
#endif
}
5451 | |||
// Store the lower double-precision (64-bit) floating-point element from a into
// memory. mem_addr does not need to be aligned on any particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
#else
    // ARMv7: store the raw low 64 bits via the u64 view.
    vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
#endif
}
5463 | |||
5464 | // Store 128-bits of integer data from a into memory. mem_addr must be aligned | ||
5465 | // on a 16-byte boundary or a general-protection exception may be generated. | ||
5466 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128 | ||
5467 | FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) | ||
5468 | { | ||
5469 | vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); | ||
5470 | } | ||
5471 | |||
// Store the lower double-precision (64-bit) floating-point element from a into
// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd
// Alias: identical semantics to _mm_store_pd1.
#define _mm_store1_pd _mm_store_pd1
5477 | |||
// Store the upper double-precision (64-bit) floating-point element from a into
// memory.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
#else
    // ARMv7: the upper double is the high f32 pair of the register.
    vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
#endif
}
5489 | |||
5490 | // Store 64-bit integer from the first element of a into memory. | ||
5491 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64 | ||
5492 | FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) | ||
5493 | { | ||
5494 | vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b))); | ||
5495 | } | ||
5496 | |||
// Store the lower double-precision (64-bit) floating-point element from a into
// memory.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
#else
    // ARMv7: the lower double is the low f32 pair of the register.
    vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
#endif
}
5508 | |||
5509 | // Store 2 double-precision (64-bit) floating-point elements from a into memory | ||
5510 | // in reverse order. mem_addr must be aligned on a 16-byte boundary or a | ||
5511 | // general-protection exception may be generated. | ||
5512 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd | ||
5513 | FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) | ||
5514 | { | ||
5515 | float32x4_t f = vreinterpretq_f32_m128d(a); | ||
5516 | _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2))); | ||
5517 | } | ||
5518 | |||
// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
// elements) from a into memory. mem_addr does not need to be aligned on any
// particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd
FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
{
    // NEON vst1q has no alignment requirement, so the aligned store is reused.
    _mm_store_pd(mem_addr, a);
}
5527 | |||
5528 | // Store 128-bits of integer data from a into memory. mem_addr does not need to | ||
5529 | // be aligned on any particular boundary. | ||
5530 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128 | ||
5531 | FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) | ||
5532 | { | ||
5533 | vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); | ||
5534 | } | ||
5535 | |||
5536 | // Store 32-bit integer from the first element of a into memory. mem_addr does | ||
5537 | // not need to be aligned on any particular boundary. | ||
5538 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32 | ||
5539 | FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a) | ||
5540 | { | ||
5541 | vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0); | ||
5542 | } | ||
5543 | |||
// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
// elements) from a into memory using a non-temporal memory hint. mem_addr must
// be aligned on a 16-byte boundary or a general-protection exception may be
// generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd
FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
{
#if __has_builtin(__builtin_nontemporal_store)
    __builtin_nontemporal_store(a, (__m128d *) p);
#elif defined(__aarch64__) || defined(_M_ARM64)
    // No non-temporal hint available: plain store preserves correctness.
    vst1q_f64(p, vreinterpretq_f64_m128d(a));
#else
    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
#endif
}
5559 | |||
// Store 128-bits of integer data from a into memory using a non-temporal memory
// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection
// exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128
FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
{
#if __has_builtin(__builtin_nontemporal_store)
    __builtin_nontemporal_store(a, p);
#else
    // No non-temporal hint available: plain store preserves correctness.
    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
#endif
}
5572 | |||
5573 | // Store 32-bit integer a into memory using a non-temporal hint to minimize | ||
5574 | // cache pollution. If the cache line containing address mem_addr is already in | ||
5575 | // the cache, the cache will be updated. | ||
5576 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32 | ||
5577 | FORCE_INLINE void _mm_stream_si32(int *p, int a) | ||
5578 | { | ||
5579 | vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0); | ||
5580 | } | ||
5581 | |||
5582 | // Store 64-bit integer a into memory using a non-temporal hint to minimize | ||
5583 | // cache pollution. If the cache line containing address mem_addr is already in | ||
5584 | // the cache, the cache will be updated. | ||
5585 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64 | ||
5586 | FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a) | ||
5587 | { | ||
5588 | vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a)); | ||
5589 | } | ||
5590 | |||
5591 | // Subtract packed 16-bit integers in b from packed 16-bit integers in a, and | ||
5592 | // store the results in dst. | ||
5593 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16 | ||
5594 | FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) | ||
5595 | { | ||
5596 | return vreinterpretq_m128i_s16( | ||
5597 | vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); | ||
5598 | } | ||
5599 | |||
5600 | // Subtract packed 32-bit integers in b from packed 32-bit integers in a, and | ||
5601 | // store the results in dst. | ||
5602 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32 | ||
5603 | FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) | ||
5604 | { | ||
5605 | return vreinterpretq_m128i_s32( | ||
5606 | vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); | ||
5607 | } | ||
5608 | |||
5609 | // Subtract packed 64-bit integers in b from packed 64-bit integers in a, and | ||
5610 | // store the results in dst. | ||
5611 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64 | ||
5612 | FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) | ||
5613 | { | ||
5614 | return vreinterpretq_m128i_s64( | ||
5615 | vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); | ||
5616 | } | ||
5617 | |||
5618 | // Subtract packed 8-bit integers in b from packed 8-bit integers in a, and | ||
5619 | // store the results in dst. | ||
5620 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8 | ||
5621 | FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) | ||
5622 | { | ||
5623 | return vreinterpretq_m128i_s8( | ||
5624 | vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); | ||
5625 | } | ||
5626 | |||
// Subtract packed double-precision (64-bit) floating-point elements in b from
// packed double-precision (64-bit) floating-point elements in a, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // ARMv7 scalar fallback: subtract lane-wise in C, then reload the 128-bit
    // result through the f32 view of the register.
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] - db[0];
    c[1] = da[1] - db[1];
    return vld1q_f32((float32_t *) c);
#endif
}
5645 | |||
// Subtract the lower double-precision (64-bit) floating-point element in b from
// the lower double-precision (64-bit) floating-point element in a, store the
// result in the lower element of dst, and copy the upper element from a to the
// upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd
FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
{
    // Full-width subtract, then _mm_move_sd keeps only lane 0 of the result
    // and restores lane 1 from a.
    return _mm_move_sd(a, _mm_sub_pd(a, b));
}
5655 | |||
// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64
FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
{
    // Single wrapping 64-bit subtract on the D-register view (x86 PSUBQ, MMX).
    return vreinterpret_m64_s64(
        vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
}
5663 | |||
// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a
// using saturation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16
FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
{
    // vqsubq_s16: subtract with signed saturation (x86 PSUBSW).
    return vreinterpretq_m128i_s16(
        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a
// using saturation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8
FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
{
    // vqsubq_s8: subtract with signed saturation (x86 PSUBSB).
    return vreinterpretq_m128i_s8(
        vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit
// integers in a using saturation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16
FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
{
    // vqsubq_u16: subtract with unsigned saturation, clamping at 0
    // (x86 PSUBUSW).
    return vreinterpretq_m128i_u16(
        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}

// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit
// integers in a using saturation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8
FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
{
    // vqsubq_u8: subtract with unsigned saturation, clamping at 0
    // (x86 PSUBUSB).
    return vreinterpretq_m128i_u8(
        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}
5699 | |||
// The unordered compare variants (UCOMISD) are aliased to the ordered ones
// (COMISD): this port does not model SSE floating-point exception signalling,
// which is the only place the two x86 instruction families differ.
#define _mm_ucomieq_sd _mm_comieq_sd
#define _mm_ucomige_sd _mm_comige_sd
#define _mm_ucomigt_sd _mm_comigt_sd
#define _mm_ucomile_sd _mm_comile_sd
#define _mm_ucomilt_sd _mm_comilt_sd
#define _mm_ucomineq_sd _mm_comineq_sd
5706 | |||
// Return vector of type __m128d with undefined elements.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
FORCE_INLINE __m128d _mm_undefined_pd(void)
{
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
    // Deliberately left uninitialized: the intrinsic's contract is "any
    // value", so no work is spent producing one.
    __m128d a;
#if defined(_MSC_VER)
    // MSVC would warn/error on returning an uninitialized local, so zero it.
    a = _mm_setzero_pd();
#endif
    return a;
// NOTE: the pop sits after the return, but pragmas act lexically at compile
// time, so the diagnostic state is still restored for the rest of the file.
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}
5724 | |||
// Unpack and interleave 16-bit integers from the high half of a and b, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // vzip2q interleaves the upper halves of both operands in one step.
    return vreinterpretq_m128i_s16(
        vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
#else
    // ARMv7: zip the two high halves, then recombine the D registers.
    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
    int16x4x2_t result = vzip_s16(a1, b1);
    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
#endif
}

// Unpack and interleave 32-bit integers from the high half of a and b, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s32(
        vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
#else
    // ARMv7: zip the two high halves, then recombine the D registers.
    int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
    int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
    int32x2x2_t result = vzip_s32(a1, b1);
    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
#endif
}

// Unpack and interleave 64-bit integers from the high half of a and b, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s64(
        vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
#else
    // With one 64-bit lane per half there is nothing to interleave: just
    // combine the two high halves.
    int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
    int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
    return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
#endif
}

// Unpack and interleave 8-bit integers from the high half of a and b, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s8(
        vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
#else
    // ARMv7: grab the high halves (via the s16 view, then reinterpret to
    // bytes), zip them, and recombine.
    int8x8_t a1 =
        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
    int8x8_t b1 =
        vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
    int8x8x2_t result = vzip_s8(a1, b1);
    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
#endif
}

// Unpack and interleave double-precision (64-bit) floating-point elements from
// the high half of a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // ARMv7: a pure lane move, so operate on the s64 bit-pattern view.
    return vreinterpretq_m128d_s64(
        vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
                     vget_high_s64(vreinterpretq_s64_m128d(b))));
#endif
}
5804 | |||
// Unpack and interleave 16-bit integers from the low half of a and b, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // vzip1q interleaves the lower halves of both operands in one step.
    return vreinterpretq_m128i_s16(
        vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
#else
    // ARMv7: zip the two low halves, then recombine the D registers.
    int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
    int16x4x2_t result = vzip_s16(a1, b1);
    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
#endif
}

// Unpack and interleave 32-bit integers from the low half of a and b, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s32(
        vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
#else
    // ARMv7: zip the two low halves, then recombine the D registers.
    int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
    int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
    int32x2x2_t result = vzip_s32(a1, b1);
    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
#endif
}

// Unpack and interleave 64-bit integers from the low half of a and b, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s64(
        vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
#else
    // With one 64-bit lane per half there is nothing to interleave: just
    // combine the two low halves.
    int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
    int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
    return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
#endif
}

// Unpack and interleave 8-bit integers from the low half of a and b, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s8(
        vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
#else
    // ARMv7: grab the low halves (via the s16 view, then reinterpret to
    // bytes), zip them, and recombine.
    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
    int8x8x2_t result = vzip_s8(a1, b1);
    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
#endif
}

// Unpack and interleave double-precision (64-bit) floating-point elements from
// the low half of a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // ARMv7: a pure lane move, so operate on the s64 bit-pattern view.
    return vreinterpretq_m128d_s64(
        vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
                     vget_low_s64(vreinterpretq_s64_m128d(b))));
#endif
}
5882 | |||
// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
// elements in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd
FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
{
    // XOR is element-size agnostic; operate on the 64-bit integer view of the
    // same bits.
    return vreinterpretq_m128d_s64(
        veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
}

// Compute the bitwise XOR of 128 bits (representing integer data) in a and b,
// and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
{
    // Same operation through the 32-bit lane view; the choice of lane width
    // does not affect a bitwise XOR.
    return vreinterpretq_m128i_s32(
        veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}
5900 | |||
5901 | /* SSE3 */ | ||
5902 | |||
5903 | // Alternatively add and subtract packed double-precision (64-bit) | ||
5904 | // floating-point elements in a to/from packed elements in b, and store the | ||
5905 | // results in dst. | ||
5906 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd | ||
5907 | FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b) | ||
5908 | { | ||
5909 | _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f); | ||
5910 | #if defined(__aarch64__) || defined(_M_ARM64) | ||
5911 | return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a), | ||
5912 | vreinterpretq_f64_m128d(b), | ||
5913 | vreinterpretq_f64_m128d(mask))); | ||
5914 | #else | ||
5915 | return _mm_add_pd(_mm_mul_pd(b, mask), a); | ||
5916 | #endif | ||
5917 | } | ||
5918 | |||
// Alternatively add and subtract packed single-precision (32-bit)
// floating-point elements in a to/from packed elements in b, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps
FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
{
    // Even lanes get -1.0 (subtract), odd lanes +1.0 (add).
    _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_FMA) /* VFPv4+ */
    // Single fused multiply-add: a + mask * b.
    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
                                            vreinterpretq_f32_m128(mask),
                                            vreinterpretq_f32_m128(b)));
#else
    return _mm_add_ps(_mm_mul_ps(b, mask), a);
#endif
}
5935 | |||
// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
// elements in a and b, and pack the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // vpaddq_f64 sums adjacent pairs: {a0+a1, b0+b1}.
    return vreinterpretq_m128d_f64(
        vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // ARMv7 scalar fallback: compute the two pair sums in double precision
    // and reload the byte image as the result vector.
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[] = {da[0] + da[1], db[0] + db[1]};
    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
#endif
}
5951 | |||
// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
// elements in a and b, and pack the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // vpaddq_f32 sums adjacent pairs across the full 128-bit registers.
    return vreinterpretq_m128_f32(
        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    // ARMv7 only has 64-bit pairwise adds: split each operand into halves,
    // pairwise-add, then recombine {a0+a1, a2+a3, b0+b1, b2+b3}.
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(
        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
#endif
}
5969 | |||
// Horizontally subtract adjacent pairs of double-precision (64-bit)
// floating-point elements in a and b, and pack the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // De-interleave even lanes (a0,b0) and odd lanes (a1,b1), then subtract:
    // {a0-a1, b0-b1}.
    float64x2_t a = vreinterpretq_f64_m128d(_a);
    float64x2_t b = vreinterpretq_f64_m128d(_b);
    return vreinterpretq_m128d_f64(
        vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
#else
    // ARMv7 scalar fallback: compute the pair differences in double
    // precision and reload the byte image as the result vector.
    double *da = (double *) &_a;
    double *db = (double *) &_b;
    double c[] = {da[0] - da[1], db[0] - db[1]};
    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
#endif
}
5987 | |||
// Horizontally subtract adjacent pairs of single-precision (32-bit)
// floating-point elements in a and b, and pack the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps
FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
{
    float32x4_t a = vreinterpretq_f32_m128(_a);
    float32x4_t b = vreinterpretq_f32_m128(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    // Even lanes minus odd lanes: {a0-a1, a2-a3, b0-b1, b2-b3}.
    return vreinterpretq_m128_f32(
        vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
#else
    // ARMv7: vuzpq de-interleaves into even/odd lane vectors, then subtract.
    float32x4x2_t c = vuzpq_f32(a, b);
    return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
#endif
}
6003 | |||
// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
// may perform better than _mm_loadu_si128 when the data crosses a cache line
// boundary.
// NEON loads tolerate unaligned addresses, so the plain unaligned load is
// already the right mapping; there is no separate "lddqu" instruction here.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128
#define _mm_lddqu_si128 _mm_loadu_si128

// Load a double-precision (64-bit) floating-point element from memory into both
// elements of dst.
// Identical semantics to _mm_load1_pd, hence the alias.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd
#define _mm_loaddup_pd _mm_load1_pd
6014 | |||
// Duplicate the low double-precision (64-bit) floating-point element from a,
// and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Broadcast lane 0 across both f64 lanes.
    return vreinterpretq_m128d_f64(
        vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
#else
    // ARMv7: a pure lane copy, so broadcast the 64-bit bit pattern via the
    // u64 view instead of touching float64.
    return vreinterpretq_m128d_u64(
        vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
#endif
}
6028 | |||
// Duplicate odd-indexed single-precision (32-bit) floating-point elements
// from a, and store the results in dst: {a1, a1, a3, a3}.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // vtrn2q of a with itself duplicates the odd lanes into the even slots.
    return vreinterpretq_m128_f32(
        vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
#elif defined(_sse2neon_shuffle)
    // Compiler shuffle builtin with the lane pattern {1, 1, 3, 3}.
    return vreinterpretq_m128_f32(vshuffleq_s32(
        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
#else
    // Generic fallback: extract the odd lanes and rebuild via memory.
    float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
    float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
    float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
    return vreinterpretq_m128_f32(vld1q_f32(data));
#endif
}

// Duplicate even-indexed single-precision (32-bit) floating-point elements
// from a, and store the results in dst: {a0, a0, a2, a2}.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // vtrn1q of a with itself duplicates the even lanes into the odd slots.
    return vreinterpretq_m128_f32(
        vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
#elif defined(_sse2neon_shuffle)
    // Compiler shuffle builtin with the lane pattern {0, 0, 2, 2}.
    return vreinterpretq_m128_f32(vshuffleq_s32(
        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
#else
    // Generic fallback: extract the even lanes and rebuild via memory.
    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
    float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
    float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
    return vreinterpretq_m128_f32(vld1q_f32(data));
#endif
}
6066 | |||
6067 | /* SSSE3 */ | ||
6068 | |||
// Compute the absolute value of packed signed 16-bit integers in a, and store
// the unsigned results in dst.
// Note: vabs (like x86 PABS*) wraps on the most-negative input, so
// abs(INT16_MIN) keeps the 0x8000 bit pattern — matching x86.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16
FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
{
    return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
}

// Compute the absolute value of packed signed 32-bit integers in a, and store
// the unsigned results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32
FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
{
    return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
}

// Compute the absolute value of packed signed 8-bit integers in a, and store
// the unsigned results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8
FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
{
    return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
}

// Compute the absolute value of packed signed 16-bit integers in a, and store
// the unsigned results in dst. (64-bit MMX variant.)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16
FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
{
    return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
}

// Compute the absolute value of packed signed 32-bit integers in a, and store
// the unsigned results in dst. (64-bit MMX variant.)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32
FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
{
    return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
}

// Compute the absolute value of packed signed 8-bit integers in a, and store
// the unsigned results in dst. (64-bit MMX variant.)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8
FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
{
    return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
}
6116 | |||
// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
// the result right by imm8 bytes, and store the low 16 bytes in dst.
// Case analysis (same in both macro variants below):
//   imm & ~31 (i.e. imm >= 32 or negative) -> all zeros;
//   16 <= imm < 32                          -> only bytes of a remain, so it
//                                              reduces to a >> (imm - 16);
//   imm < 16                                -> VEXT on the {b, a} pair.
// The redundant-looking ternaries keep the vextq_u8/_mm_srli_si128 immediate
// inside its legal range on the branches that are compiled but not taken.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
// GCC needs the statement-expression form so `imm` remains a compile-time
// constant inside vextq_u8.
#if defined(__GNUC__) && !defined(__clang__)
#define _mm_alignr_epi8(a, b, imm)                                            \
    __extension__({                                                           \
        uint8x16_t _a = vreinterpretq_u8_m128i(a);                            \
        uint8x16_t _b = vreinterpretq_u8_m128i(b);                            \
        __m128i ret;                                                          \
        if (_sse2neon_unlikely((imm) & ~31))                                  \
            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0));                      \
        else if (imm >= 16)                                                   \
            ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0);                \
        else                                                                  \
            ret =                                                             \
                vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \
        ret;                                                                  \
    })

// Other compilers: same logic through the _sse2neon_define2 helper.
#else
#define _mm_alignr_epi8(a, b, imm)                                            \
    _sse2neon_define2(                                                        \
        __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a);           \
        uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret;             \
        if (_sse2neon_unlikely((imm) & ~31)) ret =                            \
            vreinterpretq_m128i_u8(vdupq_n_u8(0));                            \
        else if (imm >= 16) ret =                                             \
            _mm_srli_si128(_a, imm >= 16 ? imm - 16 : 0);                     \
        else ret =                                                            \
            vreinterpretq_m128i_u8(vextq_u8(__b, __a, imm < 16 ? imm : 0));   \
        _sse2neon_return(ret);)

#endif
6150 | |||
// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
// the result right by imm8 bytes, and store the low 8 bytes in dst.
// Case analysis: imm >= 16 -> zero; 8 <= imm < 16 -> only bytes of a remain,
// extracted against a zero vector; imm < 8 -> VEXT on the {b, a} pair.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8
#define _mm_alignr_pi8(a, b, imm)                                           \
    _sse2neon_define2(                                                      \
        __m64, a, b, __m64 ret; if (_sse2neon_unlikely((imm) >= 16)) {      \
            ret = vreinterpret_m64_s8(vdup_n_s8(0));                        \
        } else {                                                            \
            uint8x8_t tmp_low;                                              \
            uint8x8_t tmp_high;                                             \
            if ((imm) >= 8) {                                               \
                const int idx = (imm) -8;                                   \
                tmp_low = vreinterpret_u8_m64(_a);                          \
                tmp_high = vdup_n_u8(0);                                    \
                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
            } else {                                                        \
                const int idx = (imm);                                      \
                tmp_low = vreinterpret_u8_m64(_b);                          \
                tmp_high = vreinterpret_u8_m64(_a);                         \
                ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
            }                                                               \
        } _sse2neon_return(ret);)
6173 | |||
// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
// signed 16-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16
FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
{
    int16x8_t a = vreinterpretq_s16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    // Full-width pairwise add (wrapping, like x86 PHADDW).
    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
#else
    // ARMv7 pairwise add is 64-bit only: process each half, then recombine.
    return vreinterpretq_m128i_s16(
        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
#endif
}

// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
// signed 32-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32
FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
{
    int32x4_t a = vreinterpretq_s32_m128i(_a);
    int32x4_t b = vreinterpretq_s32_m128i(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    // Full-width pairwise add (wrapping, like x86 PHADDD).
    return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
#else
    // ARMv7 pairwise add is 64-bit only: process each half, then recombine.
    return vreinterpretq_m128i_s32(
        vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
                     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
#endif
}
6205 | |||
// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
// signed 16-bit results in dst. (64-bit MMX variant.)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16
FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
{
    // vpadd_s16 maps directly: {a0+a1, a2+a3, b0+b1, b2+b3}.
    return vreinterpret_m64_s16(
        vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
}

// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
// signed 32-bit results in dst. (64-bit MMX variant.)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32
FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
{
    // vpadd_s32 maps directly: {a0+a1, b0+b1}.
    return vreinterpret_m64_s32(
        vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
}
6223 | |||
6224 | // Horizontally add adjacent pairs of signed 16-bit integers in a and b using | ||
6225 | // saturation, and pack the signed 16-bit results in dst. | ||
6226 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16 | ||
6227 | FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) | ||
6228 | { | ||
6229 | #if defined(__aarch64__) || defined(_M_ARM64) | ||
6230 | int16x8_t a = vreinterpretq_s16_m128i(_a); | ||
6231 | int16x8_t b = vreinterpretq_s16_m128i(_b); | ||
6232 | return vreinterpretq_s64_s16( | ||
6233 | vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); | ||
6234 | #else | ||
6235 | int32x4_t a = vreinterpretq_s32_m128i(_a); | ||
6236 | int32x4_t b = vreinterpretq_s32_m128i(_b); | ||
6237 | // Interleave using vshrn/vmovn | ||
6238 | // [a0|a2|a4|a6|b0|b2|b4|b6] | ||
6239 | // [a1|a3|a5|a7|b1|b3|b5|b7] | ||
6240 | int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); | ||
6241 | int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); | ||
6242 | // Saturated add | ||
6243 | return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357)); | ||
6244 | #endif | ||
6245 | } | ||
6246 | |||
6247 | // Horizontally add adjacent pairs of signed 16-bit integers in a and b using | ||
6248 | // saturation, and pack the signed 16-bit results in dst. | ||
6249 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16 | ||
6250 | FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b) | ||
6251 | { | ||
6252 | int16x4_t a = vreinterpret_s16_m64(_a); | ||
6253 | int16x4_t b = vreinterpret_s16_m64(_b); | ||
6254 | #if defined(__aarch64__) || defined(_M_ARM64) | ||
6255 | return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); | ||
6256 | #else | ||
6257 | int16x4x2_t res = vuzp_s16(a, b); | ||
6258 | return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1])); | ||
6259 | #endif | ||
6260 | } | ||
6261 | |||
// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
// the signed 16-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16
FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
{
    int16x8_t va = vreinterpretq_s16_m128i(_a);
    int16x8_t vb = vreinterpretq_s16_m128i(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    // De-interleave the even (minuend) and odd (subtrahend) lanes of the
    // concatenated inputs, then subtract element-wise.
    int16x8_t evens = vuzp1q_s16(va, vb);
    int16x8_t odds = vuzp2q_s16(va, vb);
    return vreinterpretq_m128i_s16(vsubq_s16(evens, odds));
#else
    // ARMv7: vuzpq yields both de-interleaved halves in one struct.
    int16x8x2_t unzipped = vuzpq_s16(va, vb);
    return vreinterpretq_m128i_s16(vsubq_s16(unzipped.val[0], unzipped.val[1]));
#endif
}
6277 | |||
// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
// the signed 32-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32
FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
{
    int32x4_t va = vreinterpretq_s32_m128i(_a);
    int32x4_t vb = vreinterpretq_s32_m128i(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    // De-interleave even/odd lanes, then subtract element-wise.
    int32x4_t evens = vuzp1q_s32(va, vb);
    int32x4_t odds = vuzp2q_s32(va, vb);
    return vreinterpretq_m128i_s32(vsubq_s32(evens, odds));
#else
    // ARMv7: vuzpq yields both de-interleaved halves in one struct.
    int32x4x2_t unzipped = vuzpq_s32(va, vb);
    return vreinterpretq_m128i_s32(vsubq_s32(unzipped.val[0], unzipped.val[1]));
#endif
}
6293 | |||
// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
// the signed 16-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16
FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
{
    int16x4_t va = vreinterpret_s16_m64(_a);
    int16x4_t vb = vreinterpret_s16_m64(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    // De-interleave even/odd lanes, then subtract element-wise.
    int16x4_t evens = vuzp1_s16(va, vb);
    int16x4_t odds = vuzp2_s16(va, vb);
    return vreinterpret_m64_s16(vsub_s16(evens, odds));
#else
    // ARMv7: vuzp yields both de-interleaved halves in one struct.
    int16x4x2_t unzipped = vuzp_s16(va, vb);
    return vreinterpret_m64_s16(vsub_s16(unzipped.val[0], unzipped.val[1]));
#endif
}
6308 | |||
// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
// the signed 32-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32
FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
{
    int32x2_t va = vreinterpret_s32_m64(_a);
    int32x2_t vb = vreinterpret_s32_m64(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    // De-interleave even/odd lanes, then subtract element-wise.
    int32x2_t evens = vuzp1_s32(va, vb);
    int32x2_t odds = vuzp2_s32(va, vb);
    return vreinterpret_m64_s32(vsub_s32(evens, odds));
#else
    // ARMv7: vuzp yields both de-interleaved halves in one struct.
    int32x2x2_t unzipped = vuzp_s32(va, vb);
    return vreinterpret_m64_s32(vsub_s32(unzipped.val[0], unzipped.val[1]));
#endif
}
6323 | |||
// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
// using saturation, and pack the signed 16-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16
FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
{
    int16x8_t va = vreinterpretq_s16_m128i(_a);
    int16x8_t vb = vreinterpretq_s16_m128i(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    // De-interleave even/odd lanes, then subtract with saturation.
    int16x8_t evens = vuzp1q_s16(va, vb);
    int16x8_t odds = vuzp2q_s16(va, vb);
    return vreinterpretq_m128i_s16(vqsubq_s16(evens, odds));
#else
    // ARMv7: vuzpq yields both de-interleaved halves in one struct.
    int16x8x2_t unzipped = vuzpq_s16(va, vb);
    return vreinterpretq_m128i_s16(
        vqsubq_s16(unzipped.val[0], unzipped.val[1]));
#endif
}
6339 | |||
// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
// using saturation, and pack the signed 16-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16
FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
{
    int16x4_t va = vreinterpret_s16_m64(_a);
    int16x4_t vb = vreinterpret_s16_m64(_b);
#if defined(__aarch64__) || defined(_M_ARM64)
    // De-interleave even/odd lanes, then subtract with saturation.
    int16x4_t evens = vuzp1_s16(va, vb);
    int16x4_t odds = vuzp2_s16(va, vb);
    return vreinterpret_m64_s16(vqsub_s16(evens, odds));
#else
    // ARMv7: vuzp yields both de-interleaved halves in one struct.
    int16x4x2_t unzipped = vuzp_s16(va, vb);
    return vreinterpret_m64_s16(vqsub_s16(unzipped.val[0], unzipped.val[1]));
#endif
}
6354 | |||
// Vertically multiply each unsigned 8-bit integer from a with the corresponding
// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
// and pack the saturated results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    uint8x16_t a = vreinterpretq_u8_m128i(_a);
    int8x16_t b = vreinterpretq_s8_m128i(_b);
    // Widen both halves to 16 bits — a zero-extended (unsigned), b
    // sign-extended — and multiply element-wise.
    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
                             vmovl_s8(vget_low_s8(b)));
    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
                             vmovl_s8(vget_high_s8(b)));
    // De-interleave even/odd products across tl/th and add with saturation.
    return vreinterpretq_m128i_s16(
        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
#else
    // This would be much simpler if x86 would choose to zero extend OR sign
    // extend, not both. This could probably be optimized better.
    uint16x8_t a = vreinterpretq_u16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);

    // Zero extend a: odd source bytes via logical shift right, even source
    // bytes by clearing the high byte of each 16-bit lane.
    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));

    // Sign extend by shifting left then shifting right.
    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
    int16x8_t b_odd = vshrq_n_s16(b, 8);

    // multiply the even/odd pairs separately
    int16x8_t prod1 = vmulq_s16(a_even, b_even);
    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);

    // saturated add of the paired products
    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
#endif
}
6393 | |||
// Vertically multiply each unsigned 8-bit integer from a with the corresponding
// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
// pack the saturated results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16
FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
{
    uint16x4_t a = vreinterpret_u16_m64(_a);
    int16x4_t b = vreinterpret_s16_m64(_b);

    // Zero extend a: odd source bytes via logical shift right, even source
    // bytes by masking the low byte of each 16-bit lane.
    int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
    int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));

    // Sign extend by shifting left then shifting right.
    int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
    int16x4_t b_odd = vshr_n_s16(b, 8);

    // multiply the even/odd pairs separately
    int16x4_t prod1 = vmul_s16(a_even, b_even);
    int16x4_t prod2 = vmul_s16(a_odd, b_odd);

    // saturated add of the paired products
    return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
}
6419 | |||
// Multiply packed signed 16-bit integers in a and b, producing intermediate
// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
// the packed 16-bit integers in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16
FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
{
    // The single-instruction vqrdmulhq_s16 is not a valid shortcut here:
    // it saturates, which differs from the x86 semantics in corner cases.
    int16x8_t sa = vreinterpretq_s16_m128i(a);
    int16x8_t sb = vreinterpretq_s16_m128i(b);

    // Widening multiply of the low and high halves.
    int32x4_t prod_lo = vmull_s16(vget_low_s16(sa), vget_low_s16(sb));
    int32x4_t prod_hi = vmull_s16(vget_high_s16(sa), vget_high_s16(sb));

    // Rounding narrowing shift right: (int16_t)((prod + 16384) >> 15),
    // then join the two halves back together.
    return vreinterpretq_m128i_s16(vcombine_s16(vrshrn_n_s32(prod_lo, 15),
                                                vrshrn_n_s32(prod_hi, 15)));
}
6443 | |||
// Multiply packed signed 16-bit integers in a and b, producing intermediate
// signed 32-bit integers. Truncate each intermediate integer to the 18 most
// significant bits, round by adding 1, and store bits [16:1] to dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16
FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
{
    // Widen to 32-bit products, then round-shift back down:
    // dst = (int16_t)((a * b + 16384) >> 15)
    int32x4_t widened =
        vmull_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b));
    return vreinterpret_m64_s16(vrshrn_n_s32(widened, 15));
}
6456 | |||
// Shuffle packed 8-bit integers in a according to shuffle control mask in the
// corresponding 8-bit element of b, and store the results in dst.
// A control byte with bit 7 set zeroes the output byte; otherwise the low
// 4 bits index into a.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
{
    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
    // Keep bit 7 (zeroing flag) plus the low 4 index bits; any index >= 16
    // makes the NEON table lookup return 0, matching the x86 zeroing rule.
    uint8x16_t idx_masked =
        vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
#if defined(__aarch64__) || defined(_M_ARM64)
    // Single 16-byte table lookup.
    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
#elif defined(__GNUC__)
    int8x16_t ret;
    // ARMv7 vtbl only takes 8-byte tables, so do two lookups over the
    // {low, high} D-register pair. %e and %f represent the even and odd
    // D registers respectively.
    __asm__ __volatile__(
        "vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
        "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
        : [ret] "=&w"(ret)
        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
    return vreinterpretq_m128i_s8(ret);
#else
    // use this line if testing on aarch64
    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
    return vreinterpretq_m128i_s8(
        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
#endif
}
6486 | |||
// Shuffle packed 8-bit integers in a according to shuffle control mask in the
// corresponding 8-bit element of b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
{
    // Keep bit 7 (the zeroing flag) and the low 3 index bits of each control
    // byte (mask 0x87). When bit 7 is set the masked index is >= 8, i.e. out
    // of range for the 8-byte table, so vtbl1_s8 yields 0 — which implements
    // the x86 "clear the lane when bit 7 is set" rule.
    const int8x8_t controlMask =
        vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
    int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
    return vreinterpret_m64_s8(res);
}
6497 | |||
// Negate packed 16-bit integers in a when the corresponding signed
// 16-bit integer in b is negative, and store the results in dst.
// Lanes of dst are zeroed out when the corresponding lane in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16
FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
{
    int16x8_t va = vreinterpretq_s16_m128i(_a);
    int16x8_t vb = vreinterpretq_s16_m128i(_b);

    // Arithmetic shift right broadcasts each lane's sign bit — cheaper than
    // vclt. negMask lane = 0xFFFF when b < 0, else 0.
    uint16x8_t negMask = vreinterpretq_u16_s16(vshrq_n_s16(vb, 15));
    // zeroMask lane = 0xFFFF when b == 0, else 0.
#if defined(__aarch64__) || defined(_M_ARM64)
    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(vb));
#else
    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(vb, vdupq_n_s16(0)));
#endif

    // Select -a where b is negative and a elsewhere, then clear the lanes
    // where b is zero (masked & ~zeroMask).
    int16x8_t signApplied = vbslq_s16(negMask, vnegq_s16(va), va);
    return vreinterpretq_m128i_s16(vbicq_s16(signApplied, zeroMask));
}
6525 | |||
// Negate packed 32-bit integers in a when the corresponding signed
// 32-bit integer in b is negative, and store the results in dst.
// Lanes of dst are zeroed out when the corresponding lane in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32
FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
{
    int32x4_t va = vreinterpretq_s32_m128i(_a);
    int32x4_t vb = vreinterpretq_s32_m128i(_b);

    // Arithmetic shift right broadcasts each lane's sign bit — cheaper than
    // vclt. negMask lane = 0xFFFFFFFF when b < 0, else 0.
    uint32x4_t negMask = vreinterpretq_u32_s32(vshrq_n_s32(vb, 31));

    // zeroMask lane = 0xFFFFFFFF when b == 0, else 0.
#if defined(__aarch64__) || defined(_M_ARM64)
    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(vb));
#else
    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(vb, vdupq_n_s32(0)));
#endif

    // Select -a where b is negative and a elsewhere, then clear the lanes
    // where b is zero (masked & ~zeroMask).
    int32x4_t signApplied = vbslq_s32(negMask, vnegq_s32(va), va);
    return vreinterpretq_m128i_s32(vbicq_s32(signApplied, zeroMask));
}
6554 | |||
// Negate packed 8-bit integers in a when the corresponding signed
// 8-bit integer in b is negative, and store the results in dst.
// Lanes of dst are zeroed out when the corresponding lane in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8
FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
{
    int8x16_t va = vreinterpretq_s8_m128i(_a);
    int8x16_t vb = vreinterpretq_s8_m128i(_b);

    // Arithmetic shift right broadcasts each lane's sign bit — cheaper than
    // vclt. negMask lane = 0xFF when b < 0, else 0.
    uint8x16_t negMask = vreinterpretq_u8_s8(vshrq_n_s8(vb, 7));

    // zeroMask lane = 0xFF when b == 0, else 0.
#if defined(__aarch64__) || defined(_M_ARM64)
    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(vb));
#else
    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(vb, vdupq_n_s8(0)));
#endif

    // Select -a where b is negative and a elsewhere, then clear the lanes
    // where b is zero (masked & ~zeroMask).
    int8x16_t signApplied = vbslq_s8(negMask, vnegq_s8(va), va);
    return vreinterpretq_m128i_s8(vbicq_s8(signApplied, zeroMask));
}
6584 | |||
// Negate packed 16-bit integers in a when the corresponding signed 16-bit
// integer in b is negative, and store the results in dst. Elements in dst are
// zeroed out when the corresponding element in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
{
    int16x4_t a = vreinterpret_s16_m64(_a);
    int16x4_t b = vreinterpret_s16_m64(_b);

    // signed shift right broadcasts the sign bit: faster than vclt
    // (b < 0) ? 0xFFFF : 0
    uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));

    // (b == 0) ? 0xFFFF : 0
#if defined(__aarch64__) || defined(_M_ARM64)
    int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
#else
    int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
#endif

    // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a')
    // based on ltMask
    int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
    // res = masked & (~zeroMask) — clears the lanes where b was zero
    int16x4_t res = vbic_s16(masked, zeroMask);

    return vreinterpret_m64_s16(res);
}
6613 | |||
// Negate packed 32-bit integers in a when the corresponding signed 32-bit
// integer in b is negative, and store the results in dst. Elements in dst are
// zeroed out when the corresponding element in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
{
    int32x2_t a = vreinterpret_s32_m64(_a);
    int32x2_t b = vreinterpret_s32_m64(_b);

    // signed shift right broadcasts the sign bit: faster than vclt
    // (b < 0) ? 0xFFFFFFFF : 0
    uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));

    // (b == 0) ? 0xFFFFFFFF : 0
#if defined(__aarch64__) || defined(_M_ARM64)
    int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
#else
    int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
#endif

    // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a')
    // based on ltMask
    int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
    // res = masked & (~zeroMask) — clears the lanes where b was zero
    int32x2_t res = vbic_s32(masked, zeroMask);

    return vreinterpret_m64_s32(res);
}
6642 | |||
// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
// in b is negative, and store the results in dst. Elements in dst are zeroed
// out when the corresponding element in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
{
    int8x8_t a = vreinterpret_s8_m64(_a);
    int8x8_t b = vreinterpret_s8_m64(_b);

    // signed shift right broadcasts the sign bit: faster than vclt
    // (b < 0) ? 0xFF : 0
    uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));

    // (b == 0) ? 0xFF : 0
#if defined(__aarch64__) || defined(_M_ARM64)
    int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
#else
    int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
#endif

    // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a')
    // based on ltMask
    int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
    // res = masked & (~zeroMask) — clears the lanes where b was zero
    int8x8_t res = vbic_s8(masked, zeroMask);

    return vreinterpret_m64_s8(res);
}
6671 | |||
6672 | /* SSE4.1 */ | ||
6673 | |||
// Blend packed 16-bit integers from a and b using control mask imm8, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16
// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
//                                      __constrange(0,255) int imm)
// Implemented as a macro so `imm` stays a compile-time constant expression.
// Bit i of imm selects lane i from b (bit set) or a (bit clear): each bit is
// expanded into a full-width all-ones/all-zeros lane that drives vbslq_u16.
#define _mm_blend_epi16(a, b, imm) \
    _sse2neon_define2( \
        __m128i, a, b, \
        const uint16_t _mask[8] = \
            _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \
                           ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \
                           ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \
                           ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \
                           ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \
                           ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \
                           ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \
                           ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0); \
        uint16x8_t _mask_vec = vld1q_u16(_mask); \
        uint16x8_t __a = vreinterpretq_u16_m128i(_a); \
        uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \
            vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, __b, __a)));)
6695 | |||
// Blend packed double-precision (64-bit) floating-point elements from a and b
// using control mask imm8, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
// Implemented as a macro so `imm` stays a compile-time constant expression.
// Bits 0 and 1 of imm select the low and high lane from b (bit set) or a
// (bit clear) via a full-width vbslq_u64 selection mask.
#define _mm_blend_pd(a, b, imm) \
    _sse2neon_define2( \
        __m128d, a, b, \
        const uint64_t _mask[2] = \
            _sse2neon_init(((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \
                           ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)); \
        uint64x2_t _mask_vec = vld1q_u64(_mask); \
        uint64x2_t __a = vreinterpretq_u64_m128d(_a); \
        uint64x2_t __b = vreinterpretq_u64_m128d(_b); _sse2neon_return( \
            vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, __b, __a)));)
6709 | |||
// Blend packed single-precision (32-bit) floating-point elements from a and b
// using mask, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
{
    // Expand the low four bits of imm8 into per-lane all-ones/all-zeros
    // selection masks.
    const uint32_t ALIGN_STRUCT(16)
        maskBits[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
                       ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
                       ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
                       ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
    uint32x4_t select = vld1q_u32(maskBits);
    // Each lane comes from b where its mask bit is set, from a otherwise.
    return vreinterpretq_m128_f32(vbslq_f32(select, vreinterpretq_f32_m128(_b),
                                            vreinterpretq_f32_m128(_a)));
}
6725 | |||
// Blend packed 8-bit integers from a and b using mask, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8
FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
{
    // Arithmetic shift right by 7 replicates each byte's sign bit across the
    // whole byte, turning the mask MSBs into full-width select masks.
    uint8x16_t select =
        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
    // Take the byte from b where the mask byte's MSB was set, else from a.
    return vreinterpretq_m128i_u8(vbslq_u8(select, vreinterpretq_u8_m128i(_b),
                                           vreinterpretq_u8_m128i(_a)));
}
6738 | |||
// Blend packed double-precision (64-bit) floating-point elements from a and b
// using mask, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd
FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
{
    // Broadcast each 64-bit lane's sign bit into a full-width select mask.
    uint64x2_t select =
        vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(vbslq_f64(
        select, vreinterpretq_f64_m128d(_b), vreinterpretq_f64_m128d(_a)));
#else
    // ARMv7 has no float64x2_t; blend the raw 64-bit lane bits instead.
    return vreinterpretq_m128d_u64(vbslq_u64(
        select, vreinterpretq_u64_m128d(_b), vreinterpretq_u64_m128d(_a)));
#endif
}
6756 | |||
// Blend packed single-precision (32-bit) floating-point elements from a and b
// using mask, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps
FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
{
    // Broadcast each 32-bit lane's sign bit into a full-width select mask.
    uint32x4_t select =
        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
    // Take the lane from b where the mask lane's MSB was set, else from a.
    return vreinterpretq_m128_f32(vbslq_f32(select, vreinterpretq_f32_m128(_b),
                                            vreinterpretq_f32_m128(_a)));
}
6769 | |||
// Round the packed double-precision (64-bit) floating-point elements in a up
// to an integer value, and store the results as packed double-precision
// floating-point elements in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // vrndpq_f64 rounds toward +infinity in a single instruction.
    return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
#else
    // ARMv7 has no float64x2_t: fall back to scalar libm ceil() on each lane
    // (_mm_set_pd takes the high element first, hence f[1] before f[0]).
    double *f = (double *) &a;
    return _mm_set_pd(ceil(f[1]), ceil(f[0]));
#endif
}
6783 | |||
// Round the packed single-precision (32-bit) floating-point elements in a up to
// an integer value, and store the results as packed single-precision
// floating-point elements in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_DIRECTED_ROUNDING)
    // vrndpq_f32 rounds toward +infinity in a single instruction.
    return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
#else
    // No directed-rounding support: fall back to scalar libm ceilf() per lane
    // (_mm_set_ps takes the highest element first, hence f[3] ... f[0]).
    float *f = (float *) &a;
    return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
#endif
}
6798 | |||
// Round the lower double-precision (64-bit) floating-point element in b up to
// an integer value, store the result as a double-precision floating-point
// element in the lower element of dst, and copy the upper element from a to the
// upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
{
    // Ceil both lanes of b, then keep only its low lane and a's high lane.
    __m128d rounded = _mm_ceil_pd(b);
    return _mm_move_sd(a, rounded);
}
6808 | |||
// Round the lower single-precision (32-bit) floating-point element in b up to
// an integer value, store the result as a single-precision floating-point
// element in the lower element of dst, and copy the upper 3 packed elements
// from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
{
    // Ceil all lanes of b, then keep only its low lane and a's upper lanes.
    __m128 rounded = _mm_ceil_ps(b);
    return _mm_move_ss(a, rounded);
}
6818 | |||
// Compare packed 64-bit integers in a and b for equality, and store the results
// in dst
FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_u64(
        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
#else
    // ARMv7 lacks vceqq_u64. Compare as 32-bit lanes, then AND each lane with
    // its 64-bit partner: a 64-bit lane is equal iff both 32-bit halves are.
    uint32x4_t halfEq =
        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
    uint32x4_t partnerEq = vrev64q_u32(halfEq);
    return vreinterpretq_m128i_u32(vandq_u32(halfEq, partnerEq));
#endif
}
6835 | |||
// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32
FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
{
    // Widen the low four 16-bit lanes to 32 bits with sign extension.
    int16x4_t low = vget_low_s16(vreinterpretq_s16_m128i(a));
    return vreinterpretq_m128i_s32(vmovl_s16(low));
}
6844 | |||
// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64
FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
{
    // Two successive sign-extending widens on the low lanes: s16 -> s32 -> s64.
    int32x4_t widened = vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
    return vreinterpretq_m128i_s64(vmovl_s32(vget_low_s32(widened)));
}
6855 | |||
// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64
FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
{
    // Widen the low two 32-bit lanes to 64 bits with sign extension.
    int32x2_t low = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s64(vmovl_s32(low));
}
6864 | |||
// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
{
    // Widen the low eight 8-bit lanes to 16 bits with sign extension.
    int8x8_t low = vget_low_s8(vreinterpretq_s8_m128i(a));
    return vreinterpretq_m128i_s16(vmovl_s8(low));
}
6874 | |||
// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32
FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
{
    // Two successive sign-extending widens on the low lanes: s8 -> s16 -> s32.
    int16x8_t w16 = vmovl_s8(vget_low_s8(vreinterpretq_s8_m128i(a)));
    return vreinterpretq_m128i_s32(vmovl_s16(vget_low_s16(w16)));
}
6885 | |||
// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit
// integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64
FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
{
    // Chain of sign-extending widens on the low lanes: s8 -> s16 -> s32 -> s64.
    int16x8_t w16 = vmovl_s8(vget_low_s8(vreinterpretq_s8_m128i(a)));
    int32x4_t w32 = vmovl_s16(vget_low_s16(w16));
    return vreinterpretq_m128i_s64(vmovl_s32(vget_low_s32(w32)));
}
6897 | |||
6898 | // Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, | ||
6899 | // and store the results in dst. | ||
6900 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32 | ||
6901 | FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) | ||
6902 | { | ||
6903 | return vreinterpretq_m128i_u32( | ||
6904 | vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a)))); | ||
6905 | } | ||
6906 | |||
6907 | // Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, | ||
6908 | // and store the results in dst. | ||
6909 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64 | ||
6910 | FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) | ||
6911 | { | ||
6912 | uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */ | ||
6913 | uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ | ||
6914 | uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ | ||
6915 | return vreinterpretq_m128i_u64(u64x2); | ||
6916 | } | ||
6917 | |||
6918 | // Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, | ||
6919 | // and store the results in dst. | ||
6920 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64 | ||
6921 | FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) | ||
6922 | { | ||
6923 | return vreinterpretq_m128i_u64( | ||
6924 | vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a)))); | ||
6925 | } | ||
6926 | |||
6927 | // Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, | ||
6928 | // and store the results in dst. | ||
6929 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16 | ||
6930 | FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) | ||
6931 | { | ||
6932 | uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */ | ||
6933 | uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */ | ||
6934 | return vreinterpretq_m128i_u16(u16x8); | ||
6935 | } | ||
6936 | |||
6937 | // Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, | ||
6938 | // and store the results in dst. | ||
6939 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32 | ||
6940 | FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) | ||
6941 | { | ||
6942 | uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ | ||
6943 | uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ | ||
6944 | uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ | ||
6945 | return vreinterpretq_m128i_u32(u32x4); | ||
6946 | } | ||
6947 | |||
6948 | // Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed | ||
6949 | // 64-bit integers, and store the results in dst. | ||
6950 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64 | ||
6951 | FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) | ||
6952 | { | ||
6953 | uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */ | ||
6954 | uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */ | ||
6955 | uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ | ||
6956 | uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ | ||
6957 | return vreinterpretq_m128i_u64(u64x2); | ||
6958 | } | ||
6959 | |||
// Conditionally multiply the packed double-precision (64-bit) floating-point
// elements in a and b using the high 4 bits in imm8, sum the four products, and
// conditionally store the sum in dst using the low 4 bits of imm8.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
{
    // Generate mask value from constant immediate bit value
    // imm bits 0-1 choose which result lanes receive the sum; imm bits 4-5
    // choose which input lanes participate in the product.
    const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
    const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
#if !SSE2NEON_PRECISE_DP
    const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
    const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
#endif
    // Conditional multiplication
#if !SSE2NEON_PRECISE_DP
    // Fast path: multiply both lanes unconditionally and zero the
    // unselected products with a bitwise mask afterwards.
    __m128d mul = _mm_mul_pd(a, b);
    const __m128d mulMask =
        _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
    __m128d tmp = _mm_and_pd(mul, mulMask);
#else
    // Precise path: only multiply the lanes actually selected by imm, so an
    // unselected lane can never contribute to the sum.
#if defined(__aarch64__) || defined(_M_ARM64)
    double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
                             : 0;
    double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
                             : 0;
#else
    double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
    double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
#endif
    __m128d tmp = _mm_set_pd(d1, d0);
#endif
    // Sum the products
#if defined(__aarch64__) || defined(_M_ARM64)
    // Horizontal pairwise add of the two lanes of tmp.
    double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
#else
    double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
#endif
    // Conditionally store the sum
    // Broadcast the sum to both lanes, then keep only the lanes selected by
    // imm bits 0-1 (the rest become zero).
    const __m128d sumMask =
        _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
    __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
    return res;
}
7005 | |||
// Conditionally multiply the packed single-precision (32-bit) floating-point
// elements in a and b using the high 4 bits in imm8, sum the four products,
// and conditionally store the sum in dst using the low 4 bits of imm.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
{
    float32x4_t elementwise_prod = _mm_mul_ps(a, b);

#if defined(__aarch64__) || defined(_M_ARM64)
    /* shortcuts */
    // All lanes selected in and out: a single horizontal add suffices.
    if (imm == 0xFF) {
        return _mm_set1_ps(vaddvq_f32(elementwise_prod));
    }

    // All output lanes selected: zero the unselected products (imm bits
    // 4-7), then horizontally add and broadcast.
    if ((imm & 0x0F) == 0x0F) {
        if (!(imm & (1 << 4)))
            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 0);
        if (!(imm & (1 << 5)))
            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 1);
        if (!(imm & (1 << 6)))
            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 2);
        if (!(imm & (1 << 7)))
            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 3);

        return _mm_set1_ps(vaddvq_f32(elementwise_prod));
    }
#endif

    // General path: accumulate only the products selected by imm bits 4-7.
    float s = 0.0f;

    if (imm & (1 << 4))
        s += vgetq_lane_f32(elementwise_prod, 0);
    if (imm & (1 << 5))
        s += vgetq_lane_f32(elementwise_prod, 1);
    if (imm & (1 << 6))
        s += vgetq_lane_f32(elementwise_prod, 2);
    if (imm & (1 << 7))
        s += vgetq_lane_f32(elementwise_prod, 3);

    // Store the sum only into the result lanes selected by imm bits 0-3;
    // the remaining lanes are zeroed.
    const float32_t res[4] = {
        (imm & 0x1) ? s : 0.0f,
        (imm & 0x2) ? s : 0.0f,
        (imm & 0x4) ? s : 0.0f,
        (imm & 0x8) ? s : 0.0f,
    };
    return vreinterpretq_m128_f32(vld1q_f32(res));
}
7053 | |||
// Extract a 32-bit integer from a, selected with imm8, and store the result in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32
// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
// NOTE: imm must be a compile-time constant in [0, 3] — vgetq_lane_s32
// requires a constant lane index.
#define _mm_extract_epi32(a, imm) \
    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
7060 | |||
// Extract a 64-bit integer from a, selected with imm8, and store the result in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64
// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
// NOTE: imm must be a compile-time constant in [0, 1] — vgetq_lane_s64
// requires a constant lane index.
#define _mm_extract_epi64(a, imm) \
    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
7067 | |||
// Extract an 8-bit integer from a, selected with imm8, and store the result in
// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a,
// __constrange(0,16) int imm)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
// NOTE: imm must be a compile-time constant in [0, 15]; the byte is read as
// unsigned (u8 lane), so it is zero-extended into the int result.
#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
7073 | |||
// Extracts the selected single-precision (32-bit) floating-point from a.
// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
// NOTE: returns the raw bit pattern of the selected float as an int (the
// vector is reinterpreted as s32 lanes before extraction); imm must be a
// compile-time constant in [0, 3].
#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
7077 | |||
7078 | // Round the packed double-precision (64-bit) floating-point elements in a down | ||
7079 | // to an integer value, and store the results as packed double-precision | ||
7080 | // floating-point elements in dst. | ||
7081 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd | ||
7082 | FORCE_INLINE __m128d _mm_floor_pd(__m128d a) | ||
7083 | { | ||
7084 | #if defined(__aarch64__) || defined(_M_ARM64) | ||
7085 | return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a))); | ||
7086 | #else | ||
7087 | double *f = (double *) &a; | ||
7088 | return _mm_set_pd(floor(f[1]), floor(f[0])); | ||
7089 | #endif | ||
7090 | } | ||
7091 | |||
7092 | // Round the packed single-precision (32-bit) floating-point elements in a down | ||
7093 | // to an integer value, and store the results as packed single-precision | ||
7094 | // floating-point elements in dst. | ||
7095 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps | ||
7096 | FORCE_INLINE __m128 _mm_floor_ps(__m128 a) | ||
7097 | { | ||
7098 | #if (defined(__aarch64__) || defined(_M_ARM64)) || \ | ||
7099 | defined(__ARM_FEATURE_DIRECTED_ROUNDING) | ||
7100 | return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); | ||
7101 | #else | ||
7102 | float *f = (float *) &a; | ||
7103 | return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0])); | ||
7104 | #endif | ||
7105 | } | ||
7106 | |||
7107 | // Round the lower double-precision (64-bit) floating-point element in b down to | ||
7108 | // an integer value, store the result as a double-precision floating-point | ||
7109 | // element in the lower element of dst, and copy the upper element from a to the | ||
7110 | // upper element of dst. | ||
7111 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd | ||
7112 | FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b) | ||
7113 | { | ||
7114 | return _mm_move_sd(a, _mm_floor_pd(b)); | ||
7115 | } | ||
7116 | |||
7117 | // Round the lower single-precision (32-bit) floating-point element in b down to | ||
7118 | // an integer value, store the result as a single-precision floating-point | ||
7119 | // element in the lower element of dst, and copy the upper 3 packed elements | ||
7120 | // from a to the upper elements of dst. | ||
7121 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss | ||
7122 | FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) | ||
7123 | { | ||
7124 | return _mm_move_ss(a, _mm_floor_ps(b)); | ||
7125 | } | ||
7126 | |||
// Copy a to dst, and insert the 32-bit integer i into dst at the location
// specified by imm8.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32
// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
//                                       __constrange(0,4) int imm)
// NOTE: imm must be a compile-time constant in [0, 3] — vsetq_lane_s32
// requires a constant lane index.
#define _mm_insert_epi32(a, b, imm) \
    vreinterpretq_m128i_s32( \
        vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm)))
7135 | |||
// Copy a to dst, and insert the 64-bit integer i into dst at the location
// specified by imm8.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64
// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
//                                       __constrange(0,2) int imm)
// NOTE: imm must be a compile-time constant in [0, 1] — vsetq_lane_s64
// requires a constant lane index.
#define _mm_insert_epi64(a, b, imm) \
    vreinterpretq_m128i_s64( \
        vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm)))
7144 | |||
// Copy a to dst, and insert the lower 8-bit integer from i into dst at the
// location specified by imm8.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8
// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
//                                      __constrange(0,16) int imm)
// NOTE: imm must be a compile-time constant in [0, 15] — vsetq_lane_s8
// requires a constant lane index.
#define _mm_insert_epi8(a, b, imm) \
    vreinterpretq_m128i_s8(vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm)))
7152 | |||
// Copy a to tmp, then insert a single-precision (32-bit) floating-point
// element from b into tmp using the control in imm8. Store tmp to dst using
// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
// imm8 layout: bits 7:6 select the source lane of b, bits 5:4 select the
// destination lane in a, bits 3:0 are a zeroing mask applied afterwards
// (set bit => that result lane is forced to 0.0f).
#define _mm_insert_ps(a, b, imm8) \
    _sse2neon_define2( \
        __m128, a, b, \
        float32x4_t tmp1 = \
            vsetq_lane_f32(vgetq_lane_f32(_b, (imm8 >> 6) & 0x3), \
                           vreinterpretq_f32_m128(_a), 0); \
        float32x4_t tmp2 = \
            vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), \
                           vreinterpretq_f32_m128(_a), ((imm8 >> 4) & 0x3)); \
        const uint32_t data[4] = \
            _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
                           ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
                           ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
                           ((imm8) & (1 << 3)) ? UINT32_MAX : 0); \
        uint32x4_t mask = vld1q_u32(data); \
        float32x4_t all_zeros = vdupq_n_f32(0); \
        \
        _sse2neon_return(vreinterpretq_m128_f32( \
            vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));)
7176 | |||
7177 | // Compare packed signed 32-bit integers in a and b, and store packed maximum | ||
7178 | // values in dst. | ||
7179 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32 | ||
7180 | FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b) | ||
7181 | { | ||
7182 | return vreinterpretq_m128i_s32( | ||
7183 | vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); | ||
7184 | } | ||
7185 | |||
7186 | // Compare packed signed 8-bit integers in a and b, and store packed maximum | ||
7187 | // values in dst. | ||
7188 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8 | ||
7189 | FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) | ||
7190 | { | ||
7191 | return vreinterpretq_m128i_s8( | ||
7192 | vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); | ||
7193 | } | ||
7194 | |||
7195 | // Compare packed unsigned 16-bit integers in a and b, and store packed maximum | ||
7196 | // values in dst. | ||
7197 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16 | ||
7198 | FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b) | ||
7199 | { | ||
7200 | return vreinterpretq_m128i_u16( | ||
7201 | vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); | ||
7202 | } | ||
7203 | |||
7204 | // Compare packed unsigned 32-bit integers in a and b, and store packed maximum | ||
7205 | // values in dst. | ||
7206 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32 | ||
7207 | FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) | ||
7208 | { | ||
7209 | return vreinterpretq_m128i_u32( | ||
7210 | vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); | ||
7211 | } | ||
7212 | |||
7213 | // Compare packed signed 32-bit integers in a and b, and store packed minimum | ||
7214 | // values in dst. | ||
7215 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32 | ||
7216 | FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b) | ||
7217 | { | ||
7218 | return vreinterpretq_m128i_s32( | ||
7219 | vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); | ||
7220 | } | ||
7221 | |||
7222 | // Compare packed signed 8-bit integers in a and b, and store packed minimum | ||
7223 | // values in dst. | ||
7224 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8 | ||
7225 | FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b) | ||
7226 | { | ||
7227 | return vreinterpretq_m128i_s8( | ||
7228 | vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); | ||
7229 | } | ||
7230 | |||
7231 | // Compare packed unsigned 16-bit integers in a and b, and store packed minimum | ||
7232 | // values in dst. | ||
7233 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16 | ||
7234 | FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b) | ||
7235 | { | ||
7236 | return vreinterpretq_m128i_u16( | ||
7237 | vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); | ||
7238 | } | ||
7239 | |||
// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32
FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
{
    // Lane-wise unsigned minimum maps directly onto NEON vminq_u32.
    return vreinterpretq_m128i_u32(
        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
}
7248 | |||
// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
// in a, store the minimum and index in dst, and zero the remaining bits in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
{
    __m128i dst;
    uint16_t min, idx = 0;
#if defined(__aarch64__) || defined(_M_ARM64)
    // Find the minimum value
    min = vminvq_u16(vreinterpretq_u16_m128i(a));

    // Get the index of the minimum value
    // vornq computes idx | ~cmeq: lanes equal to the minimum keep their
    // index, all other lanes become 0xFFFF, so the horizontal minimum
    // yields the lowest matching index.
    static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
    uint16x8_t minv = vdupq_n_u16(min);
    uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));
    idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
#else
    // Find the minimum value
    // Reduce 8 lanes to 1 via three rounds of pairwise minimum.
    __m64 tmp;
    tmp = vreinterpret_m64_u16(
        vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
                 vget_high_u16(vreinterpretq_u16_m128i(a))));
    tmp = vreinterpret_m64_u16(
        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
    tmp = vreinterpret_m64_u16(
        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
    min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
    // Get the index of the minimum value
    // Shift a right by one lane per iteration so lane 0 walks over every
    // element; the first match gives the lowest index.
    int i;
    for (i = 0; i < 8; i++) {
        if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
            idx = (uint16_t) i;
            break;
        }
        a = _mm_srli_si128(a, 2);
    }
#endif
    // Generate result
    // Result layout: lane 0 = minimum, lane 1 = its index, the rest zero.
    dst = _mm_setzero_si128();
    dst = vreinterpretq_m128i_u16(
        vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
    dst = vreinterpretq_m128i_u16(
        vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
    return dst;
}
7294 | |||
// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
// 8-bit integers in a compared to those in b, and store the 16-bit results in
// dst. Eight SADs are performed using one quadruplet from b and eight
// quadruplets from a. One quadruplet is selected from b starting at on the
// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
// integers selected from a starting at the offset specified in imm8.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8
FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
{
    uint8x16_t _a, _b;

    // imm bit 2 selects the byte offset in a (0 or 4).
    switch (imm & 0x4) {
    case 0:
        // do nothing
        _a = vreinterpretq_u8_m128i(a);
        break;
    case 4:
        // Rotate a by one 32-bit word so processing starts at byte 4.
        _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
                                            vreinterpretq_u32_m128i(a), 1));
        break;
    default:
#if defined(__GNUC__) || defined(__clang__)
        __builtin_unreachable();
#elif defined(_MSC_VER)
        __assume(0);
#endif
        break;
    }

    // imm bits 0-1 select which 32-bit quadruplet of b to broadcast to all
    // four words of _b.
    switch (imm & 0x3) {
    case 0:
        _b = vreinterpretq_u8_u32(
            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
        break;
    case 1:
        _b = vreinterpretq_u8_u32(
            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
        break;
    case 2:
        _b = vreinterpretq_u8_u32(
            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
        break;
    case 3:
        _b = vreinterpretq_u8_u32(
            vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
        break;
    default:
#if defined(__GNUC__) || defined(__clang__)
        __builtin_unreachable();
#elif defined(_MSC_VER)
        __assume(0);
#endif
        break;
    }

    // cXY holds widened absolute differences |a[i+k] - b[k]| whose pairwise
    // sums feed result lanes X and Y; each vabdl_u8 covers one of the four
    // byte offsets of a.
    int16x8_t c04, c15, c26, c37;
    uint8x8_t low_b = vget_low_u8(_b);
    c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));
    uint8x16_t _a_1 = vextq_u8(_a, _a, 1);
    c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));
    uint8x16_t _a_2 = vextq_u8(_a, _a, 2);
    c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
    uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
    c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
#if defined(__aarch64__) || defined(_M_ARM64)
    // |0|4|2|6|
    c04 = vpaddq_s16(c04, c26);
    // |1|5|3|7|
    c15 = vpaddq_s16(c15, c37);

    // Interleave with 32-bit transposes, then one more pairwise add to
    // finish the four-term sums in result order 0..7.
    int32x4_t trn1_c =
        vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
    int32x4_t trn2_c =
        vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
    return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
                                              vreinterpretq_s16_s32(trn2_c)));
#else
    // ARMv7 fallback: finish the reduction with 64-bit pairwise adds.
    int16x4_t c01, c23, c45, c67;
    c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
    c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
    c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
    c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));

    return vreinterpretq_m128i_s16(
        vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
#endif
}
7382 | |||
// Multiply the low signed 32-bit integers from each packed 64-bit element in
// a and b, and store the signed 64-bit results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32
FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
{
    // vmull_s32 upcasts instead of masking, so we downcast.
    // vmovn_s64 keeps the low 32 bits of each 64-bit lane — exactly the
    // inputs _mm_mul_epi32 reads — then vmull_s32 widens the products back
    // to 64 bits.
    int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
    int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
    return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
}
7393 | |||
7394 | // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit | ||
7395 | // integers, and store the low 32 bits of the intermediate integers in dst. | ||
7396 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32 | ||
7397 | FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) | ||
7398 | { | ||
7399 | return vreinterpretq_m128i_s32( | ||
7400 | vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); | ||
7401 | } | ||
7402 | |||
7403 | // Convert packed signed 32-bit integers from a and b to packed 16-bit integers | ||
7404 | // using unsigned saturation, and store the results in dst. | ||
7405 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32 | ||
7406 | FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) | ||
7407 | { | ||
7408 | return vreinterpretq_m128i_u16( | ||
7409 | vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)), | ||
7410 | vqmovun_s32(vreinterpretq_s32_m128i(b)))); | ||
7411 | } | ||
7412 | |||
// Round the packed double-precision (64-bit) floating-point elements in a using
// the rounding parameter, and store the results as packed double-precision
// floating-point elements in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // AArch64 has directed-rounding instructions for every mode.
    switch (rounding) {
    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
        return _mm_floor_pd(a);
    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
        return _mm_ceil_pd(a);
    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
    default: //_MM_FROUND_CUR_DIRECTION
        // FRINTI rounds using the current FPCR rounding mode.
        return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
    }
#else
    double *v_double = (double *) &a;

    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
        (rounding == _MM_FROUND_CUR_DIRECTION &&
         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
        // Emulate round-half-to-even on the absolute value, then restore
        // the sign at the end.
        double res[2], tmp;
        for (int i = 0; i < 2; i++) {
            tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
            double roundDown = floor(tmp);  // Round down value
            double roundUp = ceil(tmp);     // Round up value
            double diffDown = tmp - roundDown;
            double diffUp = roundUp - tmp;
            if (diffDown < diffUp) {
                /* If it's closer to the round down value, then use it */
                res[i] = roundDown;
            } else if (diffDown > diffUp) {
                /* If it's closer to the round up value, then use it */
                res[i] = roundUp;
            } else {
                /* If it's equidistant between round up and round down value,
                 * pick the one which is an even number */
                double half = roundDown / 2;
                if (half != floor(half)) {
                    /* If the round down value is odd, return the round up value
                     */
                    res[i] = roundUp;
                } else {
                    /* If the round up value is odd, return the round down value
                     */
                    res[i] = roundDown;
                }
            }
            res[i] = (v_double[i] < 0) ? -res[i] : res[i];
        }
        return _mm_set_pd(res[1], res[0]);
    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
        return _mm_floor_pd(a);
    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
        return _mm_ceil_pd(a);
    }
    // Remaining modes: truncate toward zero (floor for positive values,
    // ceil for negative values).
    return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
                      v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
#endif
}
7481 | |||
// Round the packed single-precision (32-bit) floating-point elements in a using
// the rounding parameter, and store the results as packed single-precision
// floating-point elements in dst.
// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_DIRECTED_ROUNDING)
    // Directed-rounding instructions cover every mode directly.
    switch (rounding) {
    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
        return _mm_floor_ps(a);
    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
        return _mm_ceil_ps(a);
    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
    default: //_MM_FROUND_CUR_DIRECTION
        // FRINTI rounds using the current FPCR rounding mode.
        return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
    }
#else
    float *v_float = (float *) &a;

    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
        (rounding == _MM_FROUND_CUR_DIRECTION &&
         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
        // Emulate round-half-to-even: compute both the usual [a + 0.5]
        // result and the nearest even integer, then select the even one
        // only where the fraction is exactly +/- 0.5.
        uint32x4_t signmask = vdupq_n_u32(0x80000000);
        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
        int32x4_t r_trunc = vcvtq_s32_f32(
            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
        float32x4_t delta = vsubq_f32(
            vreinterpretq_f32_m128(a),
            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
        uint32x4_t is_delta_half =
            vceqq_f32(delta, half); /* delta == +/- 0.5 */
        return vreinterpretq_m128_f32(
            vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
        return _mm_floor_ps(a);
    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
        return _mm_ceil_ps(a);
    }
    // Remaining modes: truncate toward zero (floor for positive values,
    // ceil for negative values).
    return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
                      v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
                      v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
                      v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
#endif
}
7541 | |||
7542 | // Round the lower double-precision (64-bit) floating-point element in b using | ||
7543 | // the rounding parameter, store the result as a double-precision floating-point | ||
7544 | // element in the lower element of dst, and copy the upper element from a to the | ||
7545 | // upper element of dst. | ||
7546 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd | ||
7547 | FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding) | ||
7548 | { | ||
7549 | return _mm_move_sd(a, _mm_round_pd(b, rounding)); | ||
7550 | } | ||
7551 | |||
7552 | // Round the lower single-precision (32-bit) floating-point element in b using | ||
7553 | // the rounding parameter, store the result as a single-precision floating-point | ||
7554 | // element in the lower element of dst, and copy the upper 3 packed elements | ||
7555 | // from a to the upper elements of dst. Rounding is done according to the | ||
7556 | // rounding[3:0] parameter, which can be one of: | ||
7557 | // (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and | ||
7558 | // suppress exceptions | ||
7559 | // (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and | ||
7560 | // suppress exceptions | ||
7561 | // (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress | ||
7562 | // exceptions | ||
7563 | // (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress | ||
7564 | // exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see | ||
7565 | // _MM_SET_ROUNDING_MODE | ||
7566 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss | ||
7567 | FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding) | ||
7568 | { | ||
7569 | return _mm_move_ss(a, _mm_round_ps(b, rounding)); | ||
7570 | } | ||
7571 | |||
// Load 128-bits of integer data from memory into dst using a non-temporal
// memory hint. mem_addr must be aligned on a 16-byte boundary or a
// general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128
FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
{
#if __has_builtin(__builtin_nontemporal_store)
    // NOTE(review): the feature test probes __builtin_nontemporal_store while
    // the code calls __builtin_nontemporal_load — presumably both builtins are
    // provided together by the compilers this targets; confirm.
    return __builtin_nontemporal_load(p);
#else
    // No non-temporal builtin: fall back to a plain 128-bit load (the hint
    // is dropped, correctness is unaffected).
    return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
#endif
}
7584 | |||
7585 | // Compute the bitwise NOT of a and then AND with a 128-bit vector containing | ||
7586 | // all 1's, and return 1 if the result is zero, otherwise return 0. | ||
7587 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones | ||
7588 | FORCE_INLINE int _mm_test_all_ones(__m128i a) | ||
7589 | { | ||
7590 | return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == | ||
7591 | ~(uint64_t) 0; | ||
7592 | } | ||
7593 | |||
7594 | // Compute the bitwise AND of 128 bits (representing integer data) in a and | ||
7595 | // mask, and return 1 if the result is zero, otherwise return 0. | ||
7596 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros | ||
7597 | FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) | ||
7598 | { | ||
7599 | int64x2_t a_and_mask = | ||
7600 | vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask)); | ||
7601 | return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)); | ||
7602 | } | ||
7603 | |||
// Compute the bitwise AND of 128 bits (representing integer data) in a and
// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
// otherwise return 0.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero
// Note: Argument names may be wrong in the Intel intrinsics guide.
FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
{
    uint64x2_t v = vreinterpretq_u64_m128i(a);
    uint64x2_t m = vreinterpretq_u64_m128i(mask);

    // find ones (set-bits) and zeros (clear-bits) under clip mask
    uint64x2_t ones = vandq_u64(m, v);
    uint64x2_t zeros = vbicq_u64(m, v);

    // If both 128-bit variables are populated (non-zero) then return 1.
    // For comparison purposes, first compact each var down to 32-bits.
    // vqmovn saturates, so any non-zero 64-bit lane stays non-zero after
    // narrowing; vpmax then leaves lane 0 = max(ones), lane 1 = max(zeros).
    uint32x2_t reduced = vpmax_u32(vqmovn_u64(ones), vqmovn_u64(zeros));

    // if folding minimum is non-zero then both vars must be non-zero
    return (vget_lane_u32(vpmin_u32(reduced, reduced), 0) != 0);
}
7627 | |||
7628 | // Compute the bitwise AND of 128 bits (representing integer data) in a and b, | ||
7629 | // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the | ||
7630 | // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, | ||
7631 | // otherwise set CF to 0. Return the CF value. | ||
7632 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128 | ||
7633 | FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) | ||
7634 | { | ||
7635 | int64x2_t s64 = | ||
7636 | vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)); | ||
7637 | return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); | ||
7638 | } | ||
7639 | |||
// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
// otherwise return 0.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128
// Identical semantics to _mm_test_mix_ones_zeros, so simply delegate to it.
#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
7647 | |||
7648 | // Compute the bitwise AND of 128 bits (representing integer data) in a and b, | ||
7649 | // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the | ||
7650 | // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, | ||
7651 | // otherwise set CF to 0. Return the ZF value. | ||
7652 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128 | ||
7653 | FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) | ||
7654 | { | ||
7655 | int64x2_t s64 = | ||
7656 | vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); | ||
7657 | return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); | ||
7658 | } | ||
7659 | |||
/* SSE4.2 */

// Bit-position lookup tables used by the PCMPESTR*/PCMPISTR* emulation below
// to turn per-lane compare results into integer bit masks (bit j set when
// lane j matched).
static const uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = {
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
};
static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = {
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
};

/* specify the source data format */
#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */
#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */
#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */
#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */

/* specify the comparison operation */
#define _SIDD_CMP_EQUAL_ANY 0x00 /* compare equal any: strchr */
#define _SIDD_CMP_RANGES 0x04 /* compare ranges */
#define _SIDD_CMP_EQUAL_EACH 0x08 /* compare equal each: strcmp */
#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */

/* specify the polarity */
#define _SIDD_POSITIVE_POLARITY 0x00
#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */
#define _SIDD_MASKED_NEGATIVE_POLARITY \
    0x30 /* negate results only before end of string */

/* specify the output selection in _mm_cmpXstri */
#define _SIDD_LEAST_SIGNIFICANT 0x00
#define _SIDD_MOST_SIGNIFICANT 0x40

/* specify the output selection in _mm_cmpXstrm */
#define _SIDD_BIT_MASK 0x00
#define _SIDD_UNIT_MASK 0x40

/* Pattern Matching for C macros.
 * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms
 */

/* catenate */
#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__
#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)

// Preprocessor if-then-else: SSE2NEON_IIF(c)(t, f) expands to t when the
// token c is 1 and to f when it is 0.
#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)
/* run the 2nd parameter */
#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__
/* run the 1st parameter */
#define SSE2NEON_IIF_1(t, ...) t

// Boolean complement of a 0/1 token.
#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b)
#define SSE2NEON_COMPL_0 1
#define SSE2NEON_COMPL_1 0

// Token-level decrement for the loop counters used by SSE2NEON_REPEAT
// (supports counts 1..16, i.e. up to 16 vector lanes).
#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x)
#define SSE2NEON_DEC_1 0
#define SSE2NEON_DEC_2 1
#define SSE2NEON_DEC_3 2
#define SSE2NEON_DEC_4 3
#define SSE2NEON_DEC_5 4
#define SSE2NEON_DEC_6 5
#define SSE2NEON_DEC_7 6
#define SSE2NEON_DEC_8 7
#define SSE2NEON_DEC_9 8
#define SSE2NEON_DEC_10 9
#define SSE2NEON_DEC_11 10
#define SSE2NEON_DEC_12 11
#define SSE2NEON_DEC_13 12
#define SSE2NEON_DEC_14 13
#define SSE2NEON_DEC_15 14
#define SSE2NEON_DEC_16 15

/* detection */
#define SSE2NEON_CHECK_N(x, n, ...) n
#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )
#define SSE2NEON_PROBE(x) x, 1,

#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))
#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~)

// Normalize any token to 0/1: SSE2NEON_BOOL(0) -> 0, anything else -> 1.
#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x))
#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))

#define SSE2NEON_EAT(...)
#define SSE2NEON_EXPAND(...) __VA_ARGS__
#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)

/* recursion */
/* deferred expression */
#define SSE2NEON_EMPTY()
#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()
#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()
// NOTE(review): identical re-definition of SSE2NEON_EXPAND from above —
// legal (token-for-token match) but redundant.
#define SSE2NEON_EXPAND(...) __VA_ARGS__

// 3 * 3 * 3 = 27 rescans of the argument list — enough to fully expand the
// bounded "recursion" produced by SSE2NEON_REPEAT below.
#define SSE2NEON_EVAL(...) \
    SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))
#define SSE2NEON_EVAL1(...) \
    SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))
#define SSE2NEON_EVAL2(...) \
    SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))
#define SSE2NEON_EVAL3(...) __VA_ARGS__

// Expand `macro(i, ...)` for i = count-1 down to 0; must be driven through
// SSE2NEON_EVAL to force the deferred expansions.
#define SSE2NEON_REPEAT(count, macro, ...) \
    SSE2NEON_WHEN(count) \
    (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()( \
        SSE2NEON_DEC(count), macro, \
        __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \
                                              __VA_ARGS__))
#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT
7770 | |||
// Lane geometry for the two PCMPxSTRx element types: bytes are 8-bit x 16
// lanes, words are 16-bit x 8 lanes.
#define SSE2NEON_SIZE_OF_byte 8
#define SSE2NEON_NUMBER_OF_LANES_byte 16
#define SSE2NEON_SIZE_OF_word 16
#define SSE2NEON_NUMBER_OF_LANES_word 8

// Fill row i of the compare matrix: mtx[i] = per-lane (a == b[i]).
#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type) \
    mtx[i] = vreinterpretq_m128i_##type(vceqq_##type( \
        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \
        vreinterpretq_##type##_m128i(a)));

// Broadcast lane i of b into the temporary vec_b[i].
#define SSE2NEON_FILL_LANE(i, type) \
    vec_b[i] = \
        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));

// Build the range-compare matrix for _SIDD_CMP_RANGES: b holds (low, high)
// pairs; row i of mtx records, per lane, whether a's lane lies in the range
// anchored at b[i]. `mask` keeps only alternate result lanes so the >= and
// <= halves of each pair can later be combined.
#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size, \
                       number_of_lanes, byte_or_word) \
    do { \
        SSE2NEON_CAT( \
            data_type_prefix, \
            SSE2NEON_CAT(size, \
                         SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \
        vec_b[number_of_lanes]; \
        __m128i mask = SSE2NEON_IIF(byte_or_word)( \
            vreinterpretq_m128i_u16(vdupq_n_u16(0xff)), \
            vreinterpretq_m128i_u32(vdupq_n_u32(0xffff))); \
        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE, \
                                      SSE2NEON_CAT(type_prefix, size))) \
        for (int i = 0; i < number_of_lanes; i++) { \
            mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u, \
                                  size)(SSE2NEON_CAT(vbslq_u, size)( \
                SSE2NEON_CAT(vreinterpretq_u, \
                             SSE2NEON_CAT(size, _m128i))(mask), \
                SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))( \
                    vec_b[i], \
                    SSE2NEON_CAT( \
                        vreinterpretq_, \
                        SSE2NEON_CAT(type_prefix, \
                                     SSE2NEON_CAT(size, _m128i(a))))), \
                SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))( \
                    vec_b[i], \
                    SSE2NEON_CAT( \
                        vreinterpretq_, \
                        SSE2NEON_CAT(type_prefix, \
                                     SSE2NEON_CAT(size, _m128i(a))))))); \
        } \
    } while (0)

// Build the equality matrix for the EQUAL_ANY / EQUAL_EACH / EQUAL_ORDERED
// aggregations: one SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE row per lane of b.
#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes) \
    do { \
        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, \
                                      SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \
                                      SSE2NEON_CAT(u, size))) \
    } while (0)
7824 | |||
// Generate _sse2neon_cmp_{byte,word}_equal_any(): build the equality matrix
// with PCMPSTR_EQ, then reduce it with the matching
// _sse2neon_aggregate_equal_any_{8x16,16x8} routine.
#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \
    static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
                                                int lb) \
    { \
        __m128i mtx[16]; \
        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \
        return SSE2NEON_CAT( \
            _sse2neon_aggregate_equal_any_, \
            SSE2NEON_CAT( \
                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
                                             type))))(la, lb, mtx); \
    }

// Generate _sse2neon_cmp_{u,s}{byte,word}_ranges(): build the range matrix
// with PCMPSTR_RANGES, then reduce with the matching aggregate_ranges_*
// routine.
#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \
    static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
                                                 int lb) \
    { \
        __m128i mtx[16]; \
        PCMPSTR_RANGES( \
            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \
        return SSE2NEON_CAT( \
            _sse2neon_aggregate_ranges_, \
            SSE2NEON_CAT( \
                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
                                             type))))(la, lb, mtx); \
    }

// Generate _sse2neon_cmp_{byte,word}_equal_ordered(): equality matrix plus
// the substring-search (equal ordered) aggregation, which also needs the
// lane count as its first argument.
#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type) \
    static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \
                                                    __m128i b, int lb) \
    { \
        __m128i mtx[16]; \
        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \
        return SSE2NEON_CAT( \
            _sse2neon_aggregate_equal_ordered_, \
            SSE2NEON_CAT( \
                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
                SSE2NEON_CAT(x, \
                             SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \
            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx); \
    }
7871 | |||
// EQUAL_ANY aggregation, 8-bit x 16 lanes: bit j of the result is set when
// any of the first `la` characters of a equals character j of b.
static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
{
    int res = 0;
    // Valid-lane mask for a: bits 0..la-1.
    int m = (1 << la) - 1;
    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
    for (int j = 0; j < lb; j++) {
        // Keep only matches within a's valid length, reduce each lane to
        // 0/1, then OR-reduce the row into result bit j.
        mtx[j] = vreinterpretq_m128i_u8(
            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
        mtx[j] = vreinterpretq_m128i_u8(
            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
        int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
        res |= (tmp << j);
    }
    return res;
}

// EQUAL_ANY aggregation, 16-bit x 8 lanes; same scheme as the 8x16 variant.
static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
{
    int res = 0;
    int m = (1 << la) - 1;
    uint16x8_t vec =
        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
    for (int j = 0; j < lb; j++) {
        mtx[j] = vreinterpretq_m128i_u16(
            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
        mtx[j] = vreinterpretq_m128i_u16(
            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
        int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
        res |= (tmp << j);
    }
    return res;
}

/* clang-format off */
#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
    prefix##IMPL(byte) \
    prefix##IMPL(word)
/* clang-format on */

// Instantiate _sse2neon_cmp_byte_equal_any and _sse2neon_cmp_word_equal_any.
SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
7915 | |||
// RANGES aggregation, 16-bit x 8 lanes: bit j of the result is set when
// character j of the haystack falls inside any (low, high) pair of the
// needle. Adjacent lanes of a row hold the >=low and <=high halves, so they
// are ANDed pairwise before the OR-reduction.
static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
{
    int res = 0;
    int m = (1 << la) - 1;
    uint16x8_t vec =
        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
    for (int j = 0; j < lb; j++) {
        mtx[j] = vreinterpretq_m128i_u16(
            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
        mtx[j] = vreinterpretq_m128i_u16(
            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
        // Shift each 32-bit pair so the "high bound" lane lines up with the
        // "low bound" lane, then AND the two conditions together.
        __m128i tmp = vreinterpretq_m128i_u32(
            vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
        uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
                                       vreinterpretq_u32_m128i(tmp));
#if defined(__aarch64__) || defined(_M_ARM64)
        int t = vaddvq_u32(vec_res) ? 1 : 0;
#else
        // ARMv7 has no across-vector add: pairwise-widen then sum the two
        // 64-bit halves (non-zero iff any lane matched).
        uint64x2_t sumh = vpaddlq_u32(vec_res);
        int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
#endif
        res |= (t << j);
    }
    return res;
}

// RANGES aggregation, 8-bit x 16 lanes; same pairwise-AND scheme as above
// but on 16-bit pairs of byte lanes.
static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
{
    int res = 0;
    int m = (1 << la) - 1;
    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
    for (int j = 0; j < lb; j++) {
        mtx[j] = vreinterpretq_m128i_u8(
            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
        mtx[j] = vreinterpretq_m128i_u8(
            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
        __m128i tmp = vreinterpretq_m128i_u16(
            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
        uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
                                       vreinterpretq_u16_m128i(tmp));
        int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
        res |= (t << j);
    }
    return res;
}
7964 | |||
// Instantiate the four range-compare entry points
// (_sse2neon_cmp_{u,s}{byte,word}_ranges); the IS_BYTE/IS_WORD tokens feed
// SSE2NEON_IIF inside PCMPSTR_RANGES and are undefined again afterwards.
#define SSE2NEON_CMP_RANGES_IS_BYTE 1
#define SSE2NEON_CMP_RANGES_IS_WORD 0

/* clang-format off */
#define SSE2NEON_GENERATE_CMP_RANGES(prefix) \
    prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \
    prefix##IMPL(byte, int, s, prefix##IS_BYTE) \
    prefix##IMPL(word, uint, u, prefix##IS_WORD) \
    prefix##IMPL(word, int, s, prefix##IS_WORD)
/* clang-format on */

SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)

#undef SSE2NEON_CMP_RANGES_IS_BYTE
#undef SSE2NEON_CMP_RANGES_IS_WORD
7980 | |||
// EQUAL_EACH (string compare) aggregation, byte variant: lane-wise equality
// of a and b, with lanes beyond the string lengths forced per the PCMPxSTRx
// rules — lanes past both strings compare equal, lanes past exactly one
// string compare unequal.
static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
{
    uint8x16_t mtx =
        vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
    // m0: lanes past the shorter string but within the longer one (forced
    // to "not equal"); m1/tb: lanes past the end of a / b respectively.
    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
    int m1 = 0x10000 - (1 << la);
    int tb = 0x10000 - (1 << lb);
    uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
    uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
    vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
    vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
    vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
    vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
    vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
    tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
    tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);

    res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
    res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
    res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);
    res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);
    // Project each lane onto its bit position and sum to form the bit mask.
    res_lo = vand_u8(res_lo, vec_mask);
    res_hi = vand_u8(res_hi, vec_mask);

    int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
    return res;
}

// EQUAL_EACH aggregation, word variant; same forcing rules on 8 lanes.
static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
{
    uint16x8_t mtx =
        vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
    int m1 = 0x100 - (1 << la);
    int tb = 0x100 - (1 << lb);
    uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
    uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
    uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
    uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask);
    mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx);
    mtx = vbslq_u16(vec1, tmp, mtx);
    mtx = vandq_u16(mtx, vec_mask);
    return _sse2neon_vaddvq_u16(mtx);
}
8025 | |||
#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1
#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0

// Generate the EQUAL_ORDERED (substring search) aggregation for 8-bit x 16
// lanes or 16-bit x 8 lanes: lanes at or past la are forced to "match",
// rows past lb are synthesized, then result bit i is set when the needle
// matches the haystack starting at position i (a diagonal AND walk over the
// match matrix).
#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \
    static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \
        int bound, int la, int lb, __m128i mtx[16]) \
    { \
        int res = 0; \
        int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la); \
        uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \
            vld1_u##size(_sse2neon_cmpestr_mask##size##b), \
            vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \
        uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \
            vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \
                             vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
            vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \
        uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
        uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \
        for (int j = 0; j < lb; j++) { \
            mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \
                vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \
        } \
        for (int j = lb; j < bound; j++) { \
            mtx[j] = vreinterpretq_m128i_u##size( \
                vbslq_u##size(vec1, vec_minusone, vec_zero)); \
        } \
        unsigned SSE2NEON_IIF(data_type)(char, short) *ptr = \
            (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx; \
        for (int i = 0; i < bound; i++) { \
            int val = 1; \
            for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \
                val &= ptr[k * bound + j]; \
            res += val << i; \
        } \
        return res; \
    }

/* clang-format off */
#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \
    prefix##IMPL(8, 16, prefix##IS_UBYTE) \
    prefix##IMPL(16, 8, prefix##IS_UWORD)
/* clang-format on */

// Instantiate _sse2neon_aggregate_equal_ordered_{8x16,16x8}.
SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_)

#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE
#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD

/* clang-format off */
#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
    prefix##IMPL(byte) \
    prefix##IMPL(word)
/* clang-format on */

// Instantiate _sse2neon_cmp_{byte,word}_equal_ordered.
SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)
8081 | |||
// X-macro list mapping each (imm8 & 0x0f) combination of source format and
// compare operation to its implementation. Signed and unsigned variants
// share a function wherever signedness cannot affect the result (equality).
#define SSE2NEON_CMPESTR_LIST \
    _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \
    _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \
    _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \
    _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \
    _(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \
    _(CMP_UWORD_RANGES, cmp_uword_ranges) \
    _(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \
    _(CMP_SWORD_RANGES, cmp_sword_ranges) \
    _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \
    _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \
    _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \
    _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \
    _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
    _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
    _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
    _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)

// The enumerators take values 0..15 in list order, matching imm8[3:0].
enum {
#define _(name, func_suffix) name,
    SSE2NEON_CMPESTR_LIST
#undef _
};
// Dispatch table indexed by (imm8 & 0x0f); rows are in the same order as
// the enum above.
typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
#define _(name, func_suffix) _sse2neon_##func_suffix,
    SSE2NEON_CMPESTR_LIST
#undef _
};
8111 | |||
8112 | FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound) | ||
8113 | { | ||
8114 | switch (imm8 & 0x30) { | ||
8115 | case _SIDD_NEGATIVE_POLARITY: | ||
8116 | res ^= 0xffffffff; | ||
8117 | break; | ||
8118 | case _SIDD_MASKED_NEGATIVE_POLARITY: | ||
8119 | res ^= (1 << lb) - 1; | ||
8120 | break; | ||
8121 | default: | ||
8122 | break; | ||
8123 | } | ||
8124 | |||
8125 | return res & ((bound == 8) ? 0xFF : 0xFFFF); | ||
8126 | } | ||
8127 | |||
8128 | FORCE_INLINE int _sse2neon_clz(unsigned int x) | ||
8129 | { | ||
8130 | #ifdef _MSC_VER | ||
8131 | unsigned long cnt = 0; | ||
8132 | if (_BitScanReverse(&cnt, x)) | ||
8133 | return 31 - cnt; | ||
8134 | return 32; | ||
8135 | #else | ||
8136 | return x != 0 ? __builtin_clz(x) : 32; | ||
8137 | #endif | ||
8138 | } | ||
8139 | |||
8140 | FORCE_INLINE int _sse2neon_ctz(unsigned int x) | ||
8141 | { | ||
8142 | #ifdef _MSC_VER | ||
8143 | unsigned long cnt = 0; | ||
8144 | if (_BitScanForward(&cnt, x)) | ||
8145 | return cnt; | ||
8146 | return 32; | ||
8147 | #else | ||
8148 | return x != 0 ? __builtin_ctz(x) : 32; | ||
8149 | #endif | ||
8150 | } | ||
8151 | |||
// Count trailing zero bits in a 64-bit value; returns 64 when x == 0.
FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
{
#ifdef _MSC_VER
    unsigned long cnt;
#if defined(SSE2NEON_HAS_BITSCAN64)
    if (_BitScanForward64(&cnt, x))
        return (int) (cnt);
#else
    // No 64-bit bit-scan intrinsic: scan the low 32 bits first, then the
    // high 32 bits with the result offset by 32.
    if (_BitScanForward(&cnt, (unsigned long) (x)))
        return (int) cnt;
    if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
        return (int) (cnt + 32);
#endif /* SSE2NEON_HAS_BITSCAN64 */
    return 64;
#else /* assume GNU compatible compilers */
    return x != 0 ? __builtin_ctzll(x) : 64;
#endif
}
8170 | |||
// Return the smaller of x and y. The whole expansion is parenthesized so the
// macro composes safely inside larger expressions — the previous form
// `(x) < (y) ? (x) : (y)` mis-parses when embedded, e.g.
// `n - SSE2NEON_MIN(a, b)` became `(n - a) < b ? a : b`. Current in-file
// uses (plain assignments) are unaffected by this change.
#define SSE2NEON_MIN(x, y) ((x) < (y) ? (x) : (y))
8172 | |||
8173 | #define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \ | ||
8174 | const int var = (imm & 0x01) ? 8 : 16 | ||
8175 | |||
8176 | #define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \ | ||
8177 | int tmp1 = la ^ (la >> 31); \ | ||
8178 | la = tmp1 - (la >> 31); \ | ||
8179 | int tmp2 = lb ^ (lb >> 31); \ | ||
8180 | lb = tmp2 - (lb >> 31); \ | ||
8181 | la = SSE2NEON_MIN(la, bound); \ | ||
8182 | lb = SSE2NEON_MIN(lb, bound) | ||
8183 | |||
8184 | // Compare all pairs of character in string a and b, | ||
8185 | // then aggregate the result. | ||
8186 | // As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the | ||
8187 | // length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of | ||
8188 | // string a and b. | ||
8189 | #define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \ | ||
8190 | SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \ | ||
8191 | SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \ | ||
8192 | int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \ | ||
8193 | r2 = _sse2neon_sido_negative(r2, lb, imm8, bound) | ||
8194 | |||
// Turn the result mask r2 into a PCMP*STRI index: imm8 bit 6 selects the
// most-significant set bit, otherwise the least-significant; an empty mask
// yields `bound` (the "not found" index).
#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8)          \
    return (r2 == 0) ? bound                                     \
                     : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \
                                      : _sse2neon_ctz(r2))
8199 | |||
// Turn the result mask r2 into a PCMP*STRM __m128i named `dst` and return it.
// imm8 bit 6 set: expand each r2 bit into an all-ones element (word or byte
// depending on `bound`); clear: store r2 zero-extended into the low lane(s).
#define SSE2NEON_CMPSTR_GENERATE_MASK(dst)                                     \
    __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0));                       \
    if (imm8 & 0x40) {                                                         \
        if (bound == 8) {                                                      \
            /* 8 x 16-bit lanes: test each bit of r2 against a lane mask */    \
            uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2),                        \
                                       vld1q_u16(_sse2neon_cmpestr_mask16b));  \
            dst = vreinterpretq_m128i_u16(vbslq_u16(                           \
                tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst)));          \
        } else {                                                               \
            /* 16 x 8-bit lanes: low byte of r2 drives lanes 0-7, high byte    \
             * drives lanes 8-15 */                                            \
            uint8x16_t vec_r2 =                                                \
                vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8));                \
            uint8x16_t tmp =                                                   \
                vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b));          \
            dst = vreinterpretq_m128i_u8(                                      \
                vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst)));   \
        }                                                                      \
    } else {                                                                   \
        if (bound == 16) {                                                     \
            dst = vreinterpretq_m128i_u16(                                     \
                vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
        } else {                                                               \
            dst = vreinterpretq_m128i_u8(                                      \
                vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0));     \
        }                                                                      \
    }                                                                          \
    return dst
8226 | |||
8227 | // Compare packed strings in a and b with lengths la and lb using the control | ||
8228 | // in imm8, and returns 1 if b did not contain a null character and the | ||
8229 | // resulting mask was zero, and 0 otherwise. | ||
8230 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra | ||
8231 | FORCE_INLINE int _mm_cmpestra(__m128i a, | ||
8232 | int la, | ||
8233 | __m128i b, | ||
8234 | int lb, | ||
8235 | const int imm8) | ||
8236 | { | ||
8237 | int lb_cpy = lb; | ||
8238 | SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); | ||
8239 | return !r2 & (lb_cpy > bound); | ||
8240 | } | ||
8241 | |||
// Compare packed strings in a and b with lengths la and lb using the control in
// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
// (This is the CFlag of the x86 PCMPESTR* instructions.)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc
FORCE_INLINE int _mm_cmpestrc(__m128i a,
                              int la,
                              __m128i b,
                              int lb,
                              const int imm8)
{
    // COMP_AGG declares `bound` and `r2` (aggregated match mask) locally.
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
    return r2 != 0;
}
8254 | |||
// Compare packed strings in a and b with lengths la and lb using the control
// in imm8, and store the generated index in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri
FORCE_INLINE int _mm_cmpestri(__m128i a,
                              int la,
                              __m128i b,
                              int lb,
                              const int imm8)
{
    // COMP_AGG declares `bound`/`r2`; GENERATE_INDEX expands to the return.
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
}
8267 | |||
// Compare packed strings in a and b with lengths la and lb using the control
// in imm8, and store the generated mask in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm
FORCE_INLINE __m128i
_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
{
    // COMP_AGG declares `bound`/`r2`; GENERATE_MASK expands to the return.
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
}
8277 | |||
// Compare packed strings in a and b with lengths la and lb using the control in
// imm8, and returns bit 0 of the resulting bit mask.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro
FORCE_INLINE int _mm_cmpestro(__m128i a,
                              int la,
                              __m128i b,
                              int lb,
                              const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
    return r2 & 1;
}
8290 | |||
8291 | // Compare packed strings in a and b with lengths la and lb using the control in | ||
8292 | // imm8, and returns 1 if any character in a was null, and 0 otherwise. | ||
8293 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs | ||
8294 | FORCE_INLINE int _mm_cmpestrs(__m128i a, | ||
8295 | int la, | ||
8296 | __m128i b, | ||
8297 | int lb, | ||
8298 | const int imm8) | ||
8299 | { | ||
8300 | (void) a; | ||
8301 | (void) b; | ||
8302 | (void) lb; | ||
8303 | SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); | ||
8304 | return la <= (bound - 1); | ||
8305 | } | ||
8306 | |||
8307 | // Compare packed strings in a and b with lengths la and lb using the control in | ||
8308 | // imm8, and returns 1 if any character in b was null, and 0 otherwise. | ||
8309 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz | ||
8310 | FORCE_INLINE int _mm_cmpestrz(__m128i a, | ||
8311 | int la, | ||
8312 | __m128i b, | ||
8313 | int lb, | ||
8314 | const int imm8) | ||
8315 | { | ||
8316 | (void) a; | ||
8317 | (void) b; | ||
8318 | (void) la; | ||
8319 | SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); | ||
8320 | return lb <= (bound - 1); | ||
8321 | } | ||
8322 | |||
// Compute the implicit length of the string in register `str`: the index of
// the first zero element (clamped to the element count when there is none).
// vshrn narrows the per-element equality mask into a 64-bit scalar carrying
// 8 bits per 16-bit lane (or 4 bits per 8-bit lane), so the element index is
// ctz(matches) >> 3 (respectively >> 2).
#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8)                         \
    do {                                                                 \
        if (imm8 & 0x01) {                                               \
            uint16x8_t equal_mask_##str =                                \
                vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
            uint64_t matches_##str =                                     \
                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
            len = _sse2neon_ctzll(matches_##str) >> 3;                   \
        } else {                                                         \
            uint16x8_t equal_mask_##str = vreinterpretq_u16_u8(          \
                vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0)));   \
            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4);      \
            uint64_t matches_##str =                                     \
                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0);        \
            len = _sse2neon_ctzll(matches_##str) >> 2;                   \
        }                                                                \
    } while (0)
8341 | |||
// PCMPISTR* length pair: declare la/lb and fill them with the implicit
// (null-terminated) lengths of both string registers.
#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \
    int la, lb;                                  \
    do {                                         \
        SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);   \
        SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);   \
    } while (0)
8348 | |||
// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and returns 1 if b did not contain a null character and the resulting
// mask was zero, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra
FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
    // lb is b's computed length; lb >= bound means no terminator was found.
    return !r2 & (lb >= bound);
}
8358 | |||
// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc
FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
    return r2 != 0;
}
8367 | |||
// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and store the generated index in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri
FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
{
    // COMP_AGG declares `bound`/`r2`; GENERATE_INDEX expands to the return.
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
}
8376 | |||
// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and store the generated mask in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm
FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8)
{
    // COMP_AGG declares `bound`/`r2`; GENERATE_MASK expands to the return.
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
}
8385 | |||
// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and returns bit 0 of the resulting bit mask.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro
FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
    return r2 & 1;
}
8394 | |||
// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and returns 1 if any character in a was null, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs
FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
{
    (void) b;
    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
    int la;
    // la < bound iff a terminating zero element was found inside the register.
    SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);
    return la <= (bound - 1);
}
8406 | |||
// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and returns 1 if any character in b was null, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz
FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
{
    (void) a;
    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
    int lb;
    // lb < bound iff a terminating zero element was found inside the register.
    SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);
    return lb <= (bound - 1);
}
8418 | |||
// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
// in b for greater than.
FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_u64(
        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
#else
    // ARMv7 has no 64-bit compare: a > b iff (b - a) is negative. The
    // saturating subtract keeps the correct sign even when b - a would
    // overflow, and the arithmetic shift by 63 broadcasts the sign bit,
    // producing all-ones (true) or all-zeros (false) per lane.
    return vreinterpretq_m128i_s64(vshrq_n_s64(
        vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
        63));
#endif
}
8432 | |||
// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 16-bit integer v, and stores the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16
FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    // Inline asm rather than __crc32ch() to avoid requiring <arm_acle.h>.
    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
    (defined(_M_ARM64) && !defined(__clang__))
    crc = __crc32ch(crc, v);
#else
    // No CRC32 instructions: fold the two bytes through the byte variant.
    crc = _mm_crc32_u8(crc, v & 0xff);
    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
#endif
    return crc;
}
8451 | |||
// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 32-bit integer v, and stores the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    // Inline asm rather than __crc32cw() to avoid requiring <arm_acle.h>.
    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
    (defined(_M_ARM64) && !defined(__clang__))
    crc = __crc32cw(crc, v);
#else
    // No CRC32 instructions: fold the two half-words through the u16 variant.
    crc = _mm_crc32_u16(crc, v & 0xffff);
    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
#endif
    return crc;
}
8470 | |||
// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 64-bit integer v, and stores the result in dst.
// (As on x86, only the low 32 bits of the returned value are meaningful.)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    // Inline asm rather than __crc32cd() to avoid requiring <arm_acle.h>.
    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#elif (defined(_M_ARM64) && !defined(__clang__))
    crc = __crc32cd((uint32_t) crc, v);
#else
    // No CRC32 instructions: fold the two words through the u32 variant.
    crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
    crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
#endif
    return crc;
}
8488 | |||
// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 8-bit integer v, and stores the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    // Inline asm rather than __crc32cb() to avoid requiring <arm_acle.h>.
    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
    (defined(_M_ARM64) && !defined(__clang__))
    crc = __crc32cb(crc, v);
#else
    crc ^= v;
#if defined(__ARM_FEATURE_CRYPTO)
    // Adapted from: https://mary.rs/lab/crc32/
    // Barrett reduction via the polynomial-multiply (PMULL) instruction.
    uint64x2_t orig =
        vcombine_u64(vcreate_u64((uint64_t) (crc) << 24), vcreate_u64(0x0));
    uint64x2_t tmp = orig;

    // Polynomial P(x) of CRC32C
    uint64_t p = 0x105EC76F1;
    // Barrett Reduction (in bit-reflected form) constant mu_{64} = \lfloor
    // 2^{64} / P(x) \rfloor = 0x11f91caf6
    // NOTE(review): the literal below (0x1dea713f1) is presumably the
    // bit-reflected form of the quoted constant — verify against a CRC32C
    // reference implementation.
    uint64_t mu = 0x1dea713f1;

    // Multiply by mu_{64}
    tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(mu));
    // Divide by 2^{64} (mask away the unnecessary bits)
    tmp =
        vandq_u64(tmp, vcombine_u64(vcreate_u64(0xFFFFFFFF), vcreate_u64(0x0)));
    // Multiply by P(x) (shifted left by 1 for alignment reasons)
    tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(p));
    // Subtract original from result
    tmp = veorq_u64(tmp, orig);

    // Extract the 'lower' (in bit-reflected sense) 32 bits
    crc = vgetq_lane_u32(vreinterpretq_u32_u64(tmp), 1);
#else  // Fall back to the generic table lookup approach
    // Adapted from: https://create.stephan-brumme.com/crc32/
    // Apply half-byte comparison algorithm for the best ratio between
    // performance and lookup table.

    // The lookup table just needs to store every 16th entry
    // of the standard look-up table.
    static const uint32_t crc32_half_byte_tbl[] = {
        0x00000000, 0x105ec76f, 0x20bd8ede, 0x30e349b1, 0x417b1dbc, 0x5125dad3,
        0x61c69362, 0x7198540d, 0x82f63b78, 0x92a8fc17, 0xa24bb5a6, 0xb21572c9,
        0xc38d26c4, 0xd3d3e1ab, 0xe330a81a, 0xf36e6f75,
    };

    // Process the byte one nibble (4 bits) at a time.
    crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F];
    crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F];
#endif
#endif
    return crc;
}
8547 | |||
8548 | /* AES */ | ||
8549 | |||
8550 | #if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__)) | ||
8551 | /* clang-format off */ | ||
/* Forward AES S-box (FIPS-197 SubBytes table) as an X macro: `w` wraps each
 * byte, so the same 256-entry sequence can be emitted as raw bytes or as
 * precomputed T-table words (see SSE2NEON_AES_U0..U3 below). */
#define SSE2NEON_AES_SBOX(w)                                           \
    {                                                                  \
        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
        w(0xb0), w(0x54), w(0xbb), w(0x16) \
    }
/* Inverse AES S-box (FIPS-197 InvSubBytes table), same X-macro scheme as
 * SSE2NEON_AES_SBOX above. */
#define SSE2NEON_AES_RSBOX(w)                                          \
    {                                                                  \
        w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
        w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
        w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
        w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
        w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
        w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
        w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
        w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
        w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
        w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
        w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
        w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
        w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
        w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
        w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
        w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
        w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
        w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
        w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
        w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
        w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
        w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
        w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
        w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
        w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
        w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
        w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
        w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
        w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
        w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
        w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
        w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
        w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
        w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
        w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
        w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
        w(0x55), w(0x21), w(0x0c), w(0x7d) \
    }
8632 | /* clang-format on */ | ||
8633 | |||
/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
/* Instantiate the S-box X macros with the identity wrapper to materialize
 * the plain 256-byte forward and inverse substitution tables. */
#define SSE2NEON_AES_H0(x) (x)
static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
#undef SSE2NEON_AES_H0
8639 | |||
/* x_time function and matrix multiply function */
#if !defined(__aarch64__) && !defined(_M_ARM64)
// xtime: multiply by 2 in GF(2^8), reducing modulo the AES polynomial 0x11b.
#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
// Multiply x by a constant y (only bits 0-4 of y used) in GF(2^8) by summing
// the xtime powers selected by y's bits.
#define SSE2NEON_MULTIPLY(x, y)                                  \
    (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^           \
     ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^              \
     ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
     ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
#endif
8649 | |||
// In the absence of crypto extensions, implement aesenc using regular NEON
// intrinsics instead. See:
// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
// for more information.
// Performs one full AES encryption round: ShiftRows, SubBytes, MixColumns,
// AddRoundKey.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Byte permutation implementing ShiftRows on the column-major AES state.
    static const uint8_t shift_rows[] = {
        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
    };
    // Byte permutation rotating each 32-bit word right by 8 bits.
    static const uint8_t ror32by8[] = {
        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    /* shift rows */
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

    /* sub bytes */
    // Here, we separate the whole 256-bytes table into 4 64-bytes tables, and
    // look up each of the table. After each lookup, we load the next table
    // which locates at the next 64-bytes. In the meantime, the index in the
    // table would be smaller than it was, so the index parameters of
    // `vqtbx4q_u8()` need to be added the same constant as the loaded tables.
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
    // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))'
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);

    /* mix columns */
    // xtime(v): per-byte multiply by 2 in GF(2^8), reducing by 0x1b.
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

    /* add round key */
    return vreinterpretq_m128i_u8(w) ^ RoundKey;

#else /* ARMv7-A implementation for a table-based AES */
#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                 \
    (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
     ((uint32_t) (b1) << 8) | (uint32_t) (b0))
// multiplying 'x' by 2 in GF(2^8)
#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
// multiplying 'x' by 3 in GF(2^8)
#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
#define SSE2NEON_AES_U0(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
#define SSE2NEON_AES_U1(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
#define SSE2NEON_AES_U2(p) \
    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
#define SSE2NEON_AES_U3(p) \
    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))

    // this generates a table containing every possible permutation of
    // shift_rows() and sub_bytes() with mix_columns().
    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
    };
#undef SSE2NEON_AES_B2W
#undef SSE2NEON_AES_F2
#undef SSE2NEON_AES_F3
#undef SSE2NEON_AES_U0
#undef SSE2NEON_AES_U1
#undef SSE2NEON_AES_U2
#undef SSE2NEON_AES_U3

    uint32_t x0 = _mm_cvtsi128_si32(a);  // get a[31:0]
    uint32_t x1 =
        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));  // get a[63:32]
    uint32_t x2 =
        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA));  // get a[95:64]
    uint32_t x3 =
        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));  // get a[127:96]

    // finish the modulo addition step in mix_columns()
    __m128i out = _mm_set_epi32(
        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));

    return _mm_xor_si128(out, RoundKey);
#endif
}
8748 | |||
// Perform one round of an AES decryption flow on data (state) in a using the
// round key in RoundKey, and store the result in dst.
// (InvShiftRows, InvSubBytes, InvMixColumns, AddRoundKey.)
// NOTE(review): this guard tests only __aarch64__, while _mm_aesenc_si128
// above also accepts _M_ARM64; on MSVC ARM64 this falls into the fallback,
// which uses SSE2NEON_MULTIPLY — a macro that is NOT defined when _M_ARM64
// is set (see its guard above). Verify the MSVC ARM64 build configuration.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
    // Byte permutation implementing InvShiftRows on the AES state.
    static const uint8_t inv_shift_rows[] = {
        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
    };
    // Byte permutation rotating each 32-bit word right by 8 bits.
    static const uint8_t ror32by8[] = {
        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    // inverse shift rows
    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));

    // inverse sub bytes
    // Same split-table lookup scheme as _mm_aesenc_si128, using the
    // inverse S-box.
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);

    // inverse mix columns
    // multiplying 'v' by 4 in GF(2^8)
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
    v ^= w;
    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);

    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
                                 0x1b);  // multiplying 'v' by 2 in GF(2^8)
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

    // add round key
    return vreinterpretq_m128i_u8(w) ^ RoundKey;

#else /* ARMv7-A NEON implementation */
    /* FIXME: optimized for NEON */
    // Scalar fallback: inverse shift-rows + inverse sub-bytes into v[][],
    // then inverse mix-columns via GF(2^8) multiplies by 0x0e/0x0b/0x0d/0x09.
    uint8_t i, e, f, g, h, v[4][4];
    uint8_t *_a = (uint8_t *) &a;
    for (i = 0; i < 16; ++i) {
        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
    }

    // inverse mix columns
    for (i = 0; i < 4; ++i) {
        e = v[i][0];
        f = v[i][1];
        g = v[i][2];
        h = v[i][3];

        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
    }

    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
#endif
}
8819 | |||
8820 | // Perform the last round of an AES encryption flow on data (state) in a using | ||
8821 | // the round key in RoundKey, and store the result in dst. | ||
8822 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128 | ||
8823 | FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) | ||
8824 | { | ||
8825 | #if defined(__aarch64__) | ||
8826 | static const uint8_t shift_rows[] = { | ||
8827 | 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3, | ||
8828 | 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb, | ||
8829 | }; | ||
8830 | |||
8831 | uint8x16_t v; | ||
8832 | uint8x16_t w = vreinterpretq_u8_m128i(a); | ||
8833 | |||
8834 | // shift rows | ||
8835 | w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); | ||
8836 | |||
8837 | // sub bytes | ||
8838 | v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w); | ||
8839 | v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40); | ||
8840 | v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80); | ||
8841 | v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0); | ||
8842 | |||
8843 | // add round key | ||
8844 | return vreinterpretq_m128i_u8(v) ^ RoundKey; | ||
8845 | |||
8846 | #else /* ARMv7-A implementation */ | ||
8847 | uint8_t v[16] = { | ||
8848 | _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)], | ||
8849 | _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)], | ||
8850 | _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)], | ||
8851 | _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)], | ||
8852 | _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)], | ||
8853 | _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)], | ||
8854 | _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)], | ||
8855 | _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)], | ||
8856 | _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)], | ||
8857 | _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)], | ||
8858 | _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)], | ||
8859 | _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)], | ||
8860 | _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)], | ||
8861 | _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)], | ||
8862 | _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)], | ||
8863 | _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)], | ||
8864 | }; | ||
8865 | |||
8866 | return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey; | ||
8867 | #endif | ||
8868 | } | ||
8869 | |||
8870 | // Perform the last round of an AES decryption flow on data (state) in a using | ||
8871 | // the round key in RoundKey, and store the result in dst. | ||
8872 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128 | ||
8873 | FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) | ||
8874 | { | ||
8875 | #if defined(__aarch64__) | ||
8876 | static const uint8_t inv_shift_rows[] = { | ||
8877 | 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb, | ||
8878 | 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3, | ||
8879 | }; | ||
8880 | |||
8881 | uint8x16_t v; | ||
8882 | uint8x16_t w = vreinterpretq_u8_m128i(a); | ||
8883 | |||
8884 | // inverse shift rows | ||
8885 | w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows)); | ||
8886 | |||
8887 | // inverse sub bytes | ||
8888 | v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w); | ||
8889 | v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40); | ||
8890 | v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80); | ||
8891 | v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0); | ||
8892 | |||
8893 | // add round key | ||
8894 | return vreinterpretq_m128i_u8(v) ^ RoundKey; | ||
8895 | |||
8896 | #else /* ARMv7-A NEON implementation */ | ||
8897 | /* FIXME: optimized for NEON */ | ||
8898 | uint8_t v[4][4]; | ||
8899 | uint8_t *_a = (uint8_t *) &a; | ||
8900 | for (int i = 0; i < 16; ++i) { | ||
8901 | v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]]; | ||
8902 | } | ||
8903 | |||
8904 | return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey; | ||
8905 | #endif | ||
8906 | } | ||
8907 | |||
8908 | // Perform the InvMixColumns transformation on a and store the result in dst. | ||
8909 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128 | ||
8910 | FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) | ||
8911 | { | ||
8912 | #if defined(__aarch64__) | ||
8913 | static const uint8_t ror32by8[] = { | ||
8914 | 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, | ||
8915 | 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, | ||
8916 | }; | ||
8917 | uint8x16_t v = vreinterpretq_u8_m128i(a); | ||
8918 | uint8x16_t w; | ||
8919 | |||
8920 | // multiplying 'v' by 4 in GF(2^8) | ||
8921 | w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); | ||
8922 | w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b); | ||
8923 | v ^= w; | ||
8924 | v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w); | ||
8925 | |||
8926 | // multiplying 'v' by 2 in GF(2^8) | ||
8927 | w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); | ||
8928 | w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); | ||
8929 | w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); | ||
8930 | return vreinterpretq_m128i_u8(w); | ||
8931 | |||
8932 | #else /* ARMv7-A NEON implementation */ | ||
8933 | uint8_t i, e, f, g, h, v[4][4]; | ||
8934 | vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a)); | ||
8935 | for (i = 0; i < 4; ++i) { | ||
8936 | e = v[i][0]; | ||
8937 | f = v[i][1]; | ||
8938 | g = v[i][2]; | ||
8939 | h = v[i][3]; | ||
8940 | |||
8941 | v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^ | ||
8942 | SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09); | ||
8943 | v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^ | ||
8944 | SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d); | ||
8945 | v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^ | ||
8946 | SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b); | ||
8947 | v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^ | ||
8948 | SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e); | ||
8949 | } | ||
8950 | |||
8951 | return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)); | ||
8952 | #endif | ||
8953 | } | ||
8954 | |||
8955 | // Assist in expanding the AES cipher key by computing steps towards generating | ||
8956 | // a round key for encryption cipher using data from a and an 8-bit round | ||
8957 | // constant specified in imm8, and store the result in dst. | ||
8958 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 | ||
8959 | // | ||
8960 | // Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. | ||
8961 | // This instruction generates a round key for AES encryption. See | ||
8962 | // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ | ||
8963 | // for details. | ||
8964 | FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) | ||
8965 | { | ||
8966 | #if defined(__aarch64__) | ||
8967 | uint8x16_t _a = vreinterpretq_u8_m128i(a); | ||
8968 | uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a); | ||
8969 | v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40); | ||
8970 | v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80); | ||
8971 | v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0); | ||
8972 | |||
8973 | uint32x4_t v_u32 = vreinterpretq_u32_u8(v); | ||
8974 | uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24)); | ||
8975 | uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon)); | ||
8976 | |||
8977 | return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v)); | ||
8978 | |||
8979 | #else /* ARMv7-A NEON implementation */ | ||
8980 | uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); | ||
8981 | uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); | ||
8982 | for (int i = 0; i < 4; ++i) { | ||
8983 | ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]]; | ||
8984 | ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]]; | ||
8985 | } | ||
8986 | return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, | ||
8987 | ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); | ||
8988 | #endif | ||
8989 | } | ||
8990 | #undef SSE2NEON_AES_SBOX | ||
8991 | #undef SSE2NEON_AES_RSBOX | ||
8992 | |||
8993 | #if defined(__aarch64__) | ||
8994 | #undef SSE2NEON_XT | ||
8995 | #undef SSE2NEON_MULTIPLY | ||
8996 | #endif | ||
8997 | |||
8998 | #else /* __ARM_FEATURE_CRYPTO */ | ||
8999 | // Implements equivalent of 'aesenc' by combining AESE (with an empty key) and | ||
9000 | // AESMC and then manually applying the real key as an xor operation. This | ||
9001 | // unfortunately means an additional xor op; the compiler should be able to | ||
9002 | // optimize this away for repeated calls however. See | ||
9003 | // https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a | ||
9004 | // for more details. | ||
9005 | FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) | ||
9006 | { | ||
9007 | return vreinterpretq_m128i_u8(veorq_u8( | ||
9008 | vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), | ||
9009 | vreinterpretq_u8_m128i(b))); | ||
9010 | } | ||
9011 | |||
9012 | // Perform one round of an AES decryption flow on data (state) in a using the | ||
9013 | // round key in RoundKey, and store the result in dst. | ||
9014 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128 | ||
9015 | FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) | ||
9016 | { | ||
9017 | return vreinterpretq_m128i_u8(veorq_u8( | ||
9018 | vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), | ||
9019 | vreinterpretq_u8_m128i(RoundKey))); | ||
9020 | } | ||
9021 | |||
9022 | // Perform the last round of an AES encryption flow on data (state) in a using | ||
9023 | // the round key in RoundKey, and store the result in dst. | ||
9024 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128 | ||
9025 | FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) | ||
9026 | { | ||
9027 | return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8( | ||
9028 | vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), | ||
9029 | RoundKey); | ||
9030 | } | ||
9031 | |||
9032 | // Perform the last round of an AES decryption flow on data (state) in a using | ||
9033 | // the round key in RoundKey, and store the result in dst. | ||
9034 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128 | ||
9035 | FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) | ||
9036 | { | ||
9037 | return vreinterpretq_m128i_u8( | ||
9038 | veorq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)), | ||
9039 | vreinterpretq_u8_m128i(RoundKey))); | ||
9040 | } | ||
9041 | |||
9042 | // Perform the InvMixColumns transformation on a and store the result in dst. | ||
9043 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128 | ||
9044 | FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) | ||
9045 | { | ||
9046 | return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a))); | ||
9047 | } | ||
9048 | |||
9049 | // Assist in expanding the AES cipher key by computing steps towards generating | ||
9050 | // a round key for encryption cipher using data from a and an 8-bit round | ||
9051 | // constant specified in imm8, and store the result in dst." | ||
9052 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 | ||
9053 | FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) | ||
9054 | { | ||
9055 | // AESE does ShiftRows and SubBytes on A | ||
9056 | uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); | ||
9057 | |||
9058 | #ifndef _MSC_VER | ||
9059 | uint8x16_t dest = { | ||
9060 | // Undo ShiftRows step from AESE and extract X1 and X3 | ||
9061 | u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) | ||
9062 | u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) | ||
9063 | u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) | ||
9064 | u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) | ||
9065 | }; | ||
9066 | uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon}; | ||
9067 | return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); | ||
9068 | #else | ||
9069 | // We have to do this hack because MSVC is strictly adhering to the CPP | ||
9070 | // standard, in particular C++03 8.5.1 sub-section 15, which states that | ||
9071 | // unions must be initialized by their first member type. | ||
9072 | |||
9073 | // As per the Windows ARM64 ABI, it is always little endian, so this works | ||
9074 | __n128 dest{ | ||
9075 | ((uint64_t) u8.n128_u8[0x4] << 0) | ((uint64_t) u8.n128_u8[0x1] << 8) | | ||
9076 | ((uint64_t) u8.n128_u8[0xE] << 16) | | ||
9077 | ((uint64_t) u8.n128_u8[0xB] << 24) | | ||
9078 | ((uint64_t) u8.n128_u8[0x1] << 32) | | ||
9079 | ((uint64_t) u8.n128_u8[0xE] << 40) | | ||
9080 | ((uint64_t) u8.n128_u8[0xB] << 48) | | ||
9081 | ((uint64_t) u8.n128_u8[0x4] << 56), | ||
9082 | ((uint64_t) u8.n128_u8[0xC] << 0) | ((uint64_t) u8.n128_u8[0x9] << 8) | | ||
9083 | ((uint64_t) u8.n128_u8[0x6] << 16) | | ||
9084 | ((uint64_t) u8.n128_u8[0x3] << 24) | | ||
9085 | ((uint64_t) u8.n128_u8[0x9] << 32) | | ||
9086 | ((uint64_t) u8.n128_u8[0x6] << 40) | | ||
9087 | ((uint64_t) u8.n128_u8[0x3] << 48) | | ||
9088 | ((uint64_t) u8.n128_u8[0xC] << 56)}; | ||
9089 | |||
9090 | dest.n128_u32[1] = dest.n128_u32[1] ^ rcon; | ||
9091 | dest.n128_u32[3] = dest.n128_u32[3] ^ rcon; | ||
9092 | |||
9093 | return dest; | ||
9094 | #endif | ||
9095 | } | ||
9096 | #endif | ||
9097 | |||
9098 | /* Others */ | ||
9099 | |||
9100 | // Perform a carry-less multiplication of two 64-bit integers, selected from a | ||
9101 | // and b according to imm8, and store the results in dst. | ||
9102 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128 | ||
9103 | FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) | ||
9104 | { | ||
9105 | uint64x2_t a = vreinterpretq_u64_m128i(_a); | ||
9106 | uint64x2_t b = vreinterpretq_u64_m128i(_b); | ||
9107 | switch (imm & 0x11) { | ||
9108 | case 0x00: | ||
9109 | return vreinterpretq_m128i_u64( | ||
9110 | _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b))); | ||
9111 | case 0x01: | ||
9112 | return vreinterpretq_m128i_u64( | ||
9113 | _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b))); | ||
9114 | case 0x10: | ||
9115 | return vreinterpretq_m128i_u64( | ||
9116 | _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b))); | ||
9117 | case 0x11: | ||
9118 | return vreinterpretq_m128i_u64( | ||
9119 | _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b))); | ||
9120 | default: | ||
9121 | abort(); | ||
9122 | } | ||
9123 | } | ||
9124 | |||
9125 | FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void) | ||
9126 | { | ||
9127 | union { | ||
9128 | fpcr_bitfield field; | ||
9129 | #if defined(__aarch64__) || defined(_M_ARM64) | ||
9130 | uint64_t value; | ||
9131 | #else | ||
9132 | uint32_t value; | ||
9133 | #endif | ||
9134 | } r; | ||
9135 | |||
9136 | #if defined(__aarch64__) || defined(_M_ARM64) | ||
9137 | r.value = _sse2neon_get_fpcr(); | ||
9138 | #else | ||
9139 | __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ | ||
9140 | #endif | ||
9141 | |||
9142 | return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF; | ||
9143 | } | ||
9144 | |||
9145 | // Count the number of bits set to 1 in unsigned 32-bit integer a, and | ||
9146 | // return that count in dst. | ||
9147 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32 | ||
9148 | FORCE_INLINE int _mm_popcnt_u32(unsigned int a) | ||
9149 | { | ||
9150 | #if defined(__aarch64__) || defined(_M_ARM64) | ||
9151 | #if __has_builtin(__builtin_popcount) | ||
9152 | return __builtin_popcount(a); | ||
9153 | #elif defined(_MSC_VER) | ||
9154 | return _CountOneBits(a); | ||
9155 | #else | ||
9156 | return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); | ||
9157 | #endif | ||
9158 | #else | ||
9159 | uint32_t count = 0; | ||
9160 | uint8x8_t input_val, count8x8_val; | ||
9161 | uint16x4_t count16x4_val; | ||
9162 | uint32x2_t count32x2_val; | ||
9163 | |||
9164 | input_val = vld1_u8((uint8_t *) &a); | ||
9165 | count8x8_val = vcnt_u8(input_val); | ||
9166 | count16x4_val = vpaddl_u8(count8x8_val); | ||
9167 | count32x2_val = vpaddl_u16(count16x4_val); | ||
9168 | |||
9169 | vst1_u32(&count, count32x2_val); | ||
9170 | return count; | ||
9171 | #endif | ||
9172 | } | ||
9173 | |||
9174 | // Count the number of bits set to 1 in unsigned 64-bit integer a, and | ||
9175 | // return that count in dst. | ||
9176 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64 | ||
9177 | FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) | ||
9178 | { | ||
9179 | #if defined(__aarch64__) || defined(_M_ARM64) | ||
9180 | #if __has_builtin(__builtin_popcountll) | ||
9181 | return __builtin_popcountll(a); | ||
9182 | #elif defined(_MSC_VER) | ||
9183 | return _CountOneBits64(a); | ||
9184 | #else | ||
9185 | return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); | ||
9186 | #endif | ||
9187 | #else | ||
9188 | uint64_t count = 0; | ||
9189 | uint8x8_t input_val, count8x8_val; | ||
9190 | uint16x4_t count16x4_val; | ||
9191 | uint32x2_t count32x2_val; | ||
9192 | uint64x1_t count64x1_val; | ||
9193 | |||
9194 | input_val = vld1_u8((uint8_t *) &a); | ||
9195 | count8x8_val = vcnt_u8(input_val); | ||
9196 | count16x4_val = vpaddl_u8(count8x8_val); | ||
9197 | count32x2_val = vpaddl_u16(count16x4_val); | ||
9198 | count64x1_val = vpaddl_u32(count32x2_val); | ||
9199 | vst1_u64(&count, count64x1_val); | ||
9200 | return count; | ||
9201 | #endif | ||
9202 | } | ||
9203 | |||
9204 | FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) | ||
9205 | { | ||
9206 | // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, | ||
9207 | // regardless of the value of the FZ bit. | ||
9208 | union { | ||
9209 | fpcr_bitfield field; | ||
9210 | #if defined(__aarch64__) || defined(_M_ARM64) | ||
9211 | uint64_t value; | ||
9212 | #else | ||
9213 | uint32_t value; | ||
9214 | #endif | ||
9215 | } r; | ||
9216 | |||
9217 | #if defined(__aarch64__) || defined(_M_ARM64) | ||
9218 | r.value = _sse2neon_get_fpcr(); | ||
9219 | #else | ||
9220 | __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ | ||
9221 | #endif | ||
9222 | |||
9223 | r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON; | ||
9224 | |||
9225 | #if defined(__aarch64__) || defined(_M_ARM64) | ||
9226 | _sse2neon_set_fpcr(r.value); | ||
9227 | #else | ||
9228 | __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ | ||
9229 | #endif | ||
9230 | } | ||
9231 | |||
9232 | // Return the current 64-bit value of the processor's time-stamp counter. | ||
9233 | // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc | ||
9234 | FORCE_INLINE uint64_t _rdtsc(void) | ||
9235 | { | ||
9236 | #if defined(__aarch64__) || defined(_M_ARM64) | ||
9237 | uint64_t val; | ||
9238 | |||
9239 | /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the | ||
9240 | * system counter is at least 56 bits wide; from Armv8.6, the counter | ||
9241 | * must be 64 bits wide. So the system counter could be less than 64 | ||
9242 | * bits wide and it is attributed with the flag 'cap_user_time_short' | ||
9243 | * is true. | ||
9244 | */ | ||
9245 | #if defined(_MSC_VER) | ||
9246 | val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2)); | ||
9247 | #else | ||
9248 | __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val)); | ||
9249 | #endif | ||
9250 | |||
9251 | return val; | ||
9252 | #else | ||
9253 | uint32_t pmccntr, pmuseren, pmcntenset; | ||
9254 | // Read the user mode Performance Monitoring Unit (PMU) | ||
9255 | // User Enable Register (PMUSERENR) access permissions. | ||
9256 | __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren)); | ||
9257 | if (pmuseren & 1) { // Allows reading PMUSERENR for user mode code. | ||
9258 | __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset)); | ||
9259 | if (pmcntenset & 0x80000000UL) { // Is it counting? | ||
9260 | __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr)); | ||
9261 | // The counter is set up to count every 64th cycle | ||
9262 | return (uint64_t) (pmccntr) << 6; | ||
9263 | } | ||
9264 | } | ||
9265 | |||
9266 | // Fallback to syscall as we can't enable PMUSERENR in user mode. | ||
9267 | struct timeval tv; | ||
9268 | gettimeofday(&tv, NULL); | ||
9269 | return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec; | ||
9270 | #endif | ||
9271 | } | ||
9272 | |||
9273 | #if defined(__GNUC__) || defined(__clang__) | ||
9274 | #pragma pop_macro("ALIGN_STRUCT") | ||
9275 | #pragma pop_macro("FORCE_INLINE") | ||
9276 | #endif | ||
9277 | |||
9278 | #if defined(__GNUC__) && !defined(__clang__) | ||
9279 | #pragma GCC pop_options | ||
9280 | #endif | ||
9281 | |||
9282 | #endif | ||
diff --git a/src/android/app/src/main/AndroidManifest.xml b/src/android/app/src/main/AndroidManifest.xml index f011bd696..7890b30ca 100755 --- a/src/android/app/src/main/AndroidManifest.xml +++ b/src/android/app/src/main/AndroidManifest.xml | |||
@@ -12,8 +12,6 @@ SPDX-License-Identifier: GPL-3.0-or-later | |||
12 | <uses-feature android:name="android.hardware.vulkan.version" android:version="0x401000" android:required="true" /> | 12 | <uses-feature android:name="android.hardware.vulkan.version" android:version="0x401000" android:required="true" /> |
13 | 13 | ||
14 | <uses-permission android:name="android.permission.INTERNET" /> | 14 | <uses-permission android:name="android.permission.INTERNET" /> |
15 | <uses-permission android:name="android.permission.FOREGROUND_SERVICE" /> | ||
16 | <uses-permission android:name="android.permission.FOREGROUND_SERVICE_SPECIAL_USE" /> | ||
17 | <uses-permission android:name="android.permission.NFC" /> | 15 | <uses-permission android:name="android.permission.NFC" /> |
18 | <uses-permission android:name="android.permission.POST_NOTIFICATIONS" /> | 16 | <uses-permission android:name="android.permission.POST_NOTIFICATIONS" /> |
19 | 17 | ||
@@ -80,10 +78,6 @@ SPDX-License-Identifier: GPL-3.0-or-later | |||
80 | android:resource="@xml/nfc_tech_filter" /> | 78 | android:resource="@xml/nfc_tech_filter" /> |
81 | </activity> | 79 | </activity> |
82 | 80 | ||
83 | <service android:name="org.yuzu.yuzu_emu.utils.ForegroundService" android:foregroundServiceType="specialUse"> | ||
84 | <property android:name="android.app.PROPERTY_SPECIAL_USE_FGS_SUBTYPE" android:value="Keep emulation running in background"/> | ||
85 | </service> | ||
86 | |||
87 | <provider | 81 | <provider |
88 | android:name=".features.DocumentProvider" | 82 | android:name=".features.DocumentProvider" |
89 | android:authorities="${applicationId}.user" | 83 | android:authorities="${applicationId}.user" |
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/YuzuApplication.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/YuzuApplication.kt index d114bd53d..76778c10a 100755 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/YuzuApplication.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/YuzuApplication.kt | |||
@@ -17,17 +17,6 @@ fun Context.getPublicFilesDir(): File = getExternalFilesDir(null) ?: filesDir | |||
17 | 17 | ||
18 | class YuzuApplication : Application() { | 18 | class YuzuApplication : Application() { |
19 | private fun createNotificationChannels() { | 19 | private fun createNotificationChannels() { |
20 | val emulationChannel = NotificationChannel( | ||
21 | getString(R.string.emulation_notification_channel_id), | ||
22 | getString(R.string.emulation_notification_channel_name), | ||
23 | NotificationManager.IMPORTANCE_LOW | ||
24 | ) | ||
25 | emulationChannel.description = getString( | ||
26 | R.string.emulation_notification_channel_description | ||
27 | ) | ||
28 | emulationChannel.setSound(null, null) | ||
29 | emulationChannel.vibrationPattern = null | ||
30 | |||
31 | val noticeChannel = NotificationChannel( | 20 | val noticeChannel = NotificationChannel( |
32 | getString(R.string.notice_notification_channel_id), | 21 | getString(R.string.notice_notification_channel_id), |
33 | getString(R.string.notice_notification_channel_name), | 22 | getString(R.string.notice_notification_channel_name), |
@@ -39,7 +28,6 @@ class YuzuApplication : Application() { | |||
39 | // Register the channel with the system; you can't change the importance | 28 | // Register the channel with the system; you can't change the importance |
40 | // or other notification behaviors after this | 29 | // or other notification behaviors after this |
41 | val notificationManager = getSystemService(NotificationManager::class.java) | 30 | val notificationManager = getSystemService(NotificationManager::class.java) |
42 | notificationManager.createNotificationChannel(emulationChannel) | ||
43 | notificationManager.createNotificationChannel(noticeChannel) | 31 | notificationManager.createNotificationChannel(noticeChannel) |
44 | } | 32 | } |
45 | 33 | ||
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/activities/EmulationActivity.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/activities/EmulationActivity.kt index 564aaf305..7a8d03610 100755 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/activities/EmulationActivity.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/activities/EmulationActivity.kt | |||
@@ -4,7 +4,6 @@ | |||
4 | package org.yuzu.yuzu_emu.activities | 4 | package org.yuzu.yuzu_emu.activities |
5 | 5 | ||
6 | import android.annotation.SuppressLint | 6 | import android.annotation.SuppressLint |
7 | import android.app.Activity | ||
8 | import android.app.PendingIntent | 7 | import android.app.PendingIntent |
9 | import android.app.PictureInPictureParams | 8 | import android.app.PictureInPictureParams |
10 | import android.app.RemoteAction | 9 | import android.app.RemoteAction |
@@ -45,7 +44,6 @@ import org.yuzu.yuzu_emu.features.settings.model.IntSetting | |||
45 | import org.yuzu.yuzu_emu.features.settings.model.Settings | 44 | import org.yuzu.yuzu_emu.features.settings.model.Settings |
46 | import org.yuzu.yuzu_emu.model.EmulationViewModel | 45 | import org.yuzu.yuzu_emu.model.EmulationViewModel |
47 | import org.yuzu.yuzu_emu.model.Game | 46 | import org.yuzu.yuzu_emu.model.Game |
48 | import org.yuzu.yuzu_emu.utils.ForegroundService | ||
49 | import org.yuzu.yuzu_emu.utils.InputHandler | 47 | import org.yuzu.yuzu_emu.utils.InputHandler |
50 | import org.yuzu.yuzu_emu.utils.Log | 48 | import org.yuzu.yuzu_emu.utils.Log |
51 | import org.yuzu.yuzu_emu.utils.MemoryUtil | 49 | import org.yuzu.yuzu_emu.utils.MemoryUtil |
@@ -74,11 +72,6 @@ class EmulationActivity : AppCompatActivity(), SensorEventListener { | |||
74 | 72 | ||
75 | private val emulationViewModel: EmulationViewModel by viewModels() | 73 | private val emulationViewModel: EmulationViewModel by viewModels() |
76 | 74 | ||
77 | override fun onDestroy() { | ||
78 | stopForegroundService(this) | ||
79 | super.onDestroy() | ||
80 | } | ||
81 | |||
82 | override fun onCreate(savedInstanceState: Bundle?) { | 75 | override fun onCreate(savedInstanceState: Bundle?) { |
83 | Log.gameLaunched = true | 76 | Log.gameLaunched = true |
84 | ThemeHelper.setTheme(this) | 77 | ThemeHelper.setTheme(this) |
@@ -125,10 +118,6 @@ class EmulationActivity : AppCompatActivity(), SensorEventListener { | |||
125 | .apply() | 118 | .apply() |
126 | } | 119 | } |
127 | } | 120 | } |
128 | |||
129 | // Start a foreground service to prevent the app from getting killed in the background | ||
130 | val startIntent = Intent(this, ForegroundService::class.java) | ||
131 | startForegroundService(startIntent) | ||
132 | } | 121 | } |
133 | 122 | ||
134 | override fun onKeyDown(keyCode: Int, event: KeyEvent): Boolean { | 123 | override fun onKeyDown(keyCode: Int, event: KeyEvent): Boolean { |
@@ -481,12 +470,6 @@ class EmulationActivity : AppCompatActivity(), SensorEventListener { | |||
481 | activity.startActivity(launcher) | 470 | activity.startActivity(launcher) |
482 | } | 471 | } |
483 | 472 | ||
484 | fun stopForegroundService(activity: Activity) { | ||
485 | val startIntent = Intent(activity, ForegroundService::class.java) | ||
486 | startIntent.action = ForegroundService.ACTION_STOP | ||
487 | activity.startForegroundService(startIntent) | ||
488 | } | ||
489 | |||
490 | private fun areCoordinatesOutside(view: View?, x: Float, y: Float): Boolean { | 473 | private fun areCoordinatesOutside(view: View?, x: Float, y: Float): Boolean { |
491 | if (view == null) { | 474 | if (view == null) { |
492 | return true | 475 | return true |
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt index 86bd33672..664478472 100755 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/model/BooleanSetting.kt | |||
@@ -25,7 +25,8 @@ enum class BooleanSetting(override val key: String) : AbstractBooleanSetting { | |||
25 | HAPTIC_FEEDBACK("haptic_feedback"), | 25 | HAPTIC_FEEDBACK("haptic_feedback"), |
26 | SHOW_PERFORMANCE_OVERLAY("show_performance_overlay"), | 26 | SHOW_PERFORMANCE_OVERLAY("show_performance_overlay"), |
27 | SHOW_INPUT_OVERLAY("show_input_overlay"), | 27 | SHOW_INPUT_OVERLAY("show_input_overlay"), |
28 | TOUCHSCREEN("touchscreen"); | 28 | TOUCHSCREEN("touchscreen"), |
29 | SHOW_THERMAL_OVERLAY("show_thermal_overlay"); | ||
29 | 30 | ||
30 | override fun getBoolean(needsGlobal: Boolean): Boolean = | 31 | override fun getBoolean(needsGlobal: Boolean): Boolean = |
31 | NativeConfig.getBoolean(key, needsGlobal) | 32 | NativeConfig.getBoolean(key, needsGlobal) |
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragment.kt index d7ab0b5d9..6f6e7be10 100755 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragment.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/features/settings/ui/SettingsFragment.kt | |||
@@ -8,7 +8,6 @@ import android.os.Bundle | |||
8 | import android.view.LayoutInflater | 8 | import android.view.LayoutInflater |
9 | import android.view.View | 9 | import android.view.View |
10 | import android.view.ViewGroup | 10 | import android.view.ViewGroup |
11 | import android.view.ViewGroup.MarginLayoutParams | ||
12 | import androidx.core.view.ViewCompat | 11 | import androidx.core.view.ViewCompat |
13 | import androidx.core.view.WindowInsetsCompat | 12 | import androidx.core.view.WindowInsetsCompat |
14 | import androidx.core.view.updatePadding | 13 | import androidx.core.view.updatePadding |
@@ -27,6 +26,7 @@ import org.yuzu.yuzu_emu.R | |||
27 | import org.yuzu.yuzu_emu.databinding.FragmentSettingsBinding | 26 | import org.yuzu.yuzu_emu.databinding.FragmentSettingsBinding |
28 | import org.yuzu.yuzu_emu.features.settings.model.Settings | 27 | import org.yuzu.yuzu_emu.features.settings.model.Settings |
29 | import org.yuzu.yuzu_emu.model.SettingsViewModel | 28 | import org.yuzu.yuzu_emu.model.SettingsViewModel |
29 | import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins | ||
30 | 30 | ||
31 | class SettingsFragment : Fragment() { | 31 | class SettingsFragment : Fragment() { |
32 | private lateinit var presenter: SettingsFragmentPresenter | 32 | private lateinit var presenter: SettingsFragmentPresenter |
@@ -125,18 +125,10 @@ class SettingsFragment : Fragment() { | |||
125 | val leftInsets = barInsets.left + cutoutInsets.left | 125 | val leftInsets = barInsets.left + cutoutInsets.left |
126 | val rightInsets = barInsets.right + cutoutInsets.right | 126 | val rightInsets = barInsets.right + cutoutInsets.right |
127 | 127 | ||
128 | val mlpSettingsList = binding.listSettings.layoutParams as MarginLayoutParams | 128 | binding.listSettings.updateMargins(left = leftInsets, right = rightInsets) |
129 | mlpSettingsList.leftMargin = leftInsets | 129 | binding.listSettings.updatePadding(bottom = barInsets.bottom) |
130 | mlpSettingsList.rightMargin = rightInsets | 130 | |
131 | binding.listSettings.layoutParams = mlpSettingsList | 131 | binding.appbarSettings.updateMargins(left = leftInsets, right = rightInsets) |
132 | binding.listSettings.updatePadding( | ||
133 | bottom = barInsets.bottom | ||
134 | ) | ||
135 | |||
136 | val mlpAppBar = binding.appbarSettings.layoutParams as MarginLayoutParams | ||
137 | mlpAppBar.leftMargin = leftInsets | ||
138 | mlpAppBar.rightMargin = rightInsets | ||
139 | binding.appbarSettings.layoutParams = mlpAppBar | ||
140 | windowInsets | 132 | windowInsets |
141 | } | 133 | } |
142 | } | 134 | } |
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AboutFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AboutFragment.kt index 5ab38ffda..ff4f0e5df 100755 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AboutFragment.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AboutFragment.kt | |||
@@ -13,7 +13,6 @@ import android.os.Bundle | |||
13 | import android.view.LayoutInflater | 13 | import android.view.LayoutInflater |
14 | import android.view.View | 14 | import android.view.View |
15 | import android.view.ViewGroup | 15 | import android.view.ViewGroup |
16 | import android.view.ViewGroup.MarginLayoutParams | ||
17 | import android.widget.Toast | 16 | import android.widget.Toast |
18 | import androidx.core.view.ViewCompat | 17 | import androidx.core.view.ViewCompat |
19 | import androidx.core.view.WindowInsetsCompat | 18 | import androidx.core.view.WindowInsetsCompat |
@@ -26,6 +25,7 @@ import org.yuzu.yuzu_emu.BuildConfig | |||
26 | import org.yuzu.yuzu_emu.R | 25 | import org.yuzu.yuzu_emu.R |
27 | import org.yuzu.yuzu_emu.databinding.FragmentAboutBinding | 26 | import org.yuzu.yuzu_emu.databinding.FragmentAboutBinding |
28 | import org.yuzu.yuzu_emu.model.HomeViewModel | 27 | import org.yuzu.yuzu_emu.model.HomeViewModel |
28 | import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins | ||
29 | 29 | ||
30 | class AboutFragment : Fragment() { | 30 | class AboutFragment : Fragment() { |
31 | private var _binding: FragmentAboutBinding? = null | 31 | private var _binding: FragmentAboutBinding? = null |
@@ -114,15 +114,8 @@ class AboutFragment : Fragment() { | |||
114 | val leftInsets = barInsets.left + cutoutInsets.left | 114 | val leftInsets = barInsets.left + cutoutInsets.left |
115 | val rightInsets = barInsets.right + cutoutInsets.right | 115 | val rightInsets = barInsets.right + cutoutInsets.right |
116 | 116 | ||
117 | val mlpToolbar = binding.toolbarAbout.layoutParams as MarginLayoutParams | 117 | binding.toolbarAbout.updateMargins(left = leftInsets, right = rightInsets) |
118 | mlpToolbar.leftMargin = leftInsets | 118 | binding.scrollAbout.updateMargins(left = leftInsets, right = rightInsets) |
119 | mlpToolbar.rightMargin = rightInsets | ||
120 | binding.toolbarAbout.layoutParams = mlpToolbar | ||
121 | |||
122 | val mlpScrollAbout = binding.scrollAbout.layoutParams as MarginLayoutParams | ||
123 | mlpScrollAbout.leftMargin = leftInsets | ||
124 | mlpScrollAbout.rightMargin = rightInsets | ||
125 | binding.scrollAbout.layoutParams = mlpScrollAbout | ||
126 | 119 | ||
127 | binding.contentAbout.updatePadding(bottom = barInsets.bottom) | 120 | binding.contentAbout.updatePadding(bottom = barInsets.bottom) |
128 | 121 | ||
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AddonsFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AddonsFragment.kt index adb65812c..f5647fa95 100755 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AddonsFragment.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AddonsFragment.kt | |||
@@ -31,6 +31,7 @@ import org.yuzu.yuzu_emu.model.AddonViewModel | |||
31 | import org.yuzu.yuzu_emu.model.HomeViewModel | 31 | import org.yuzu.yuzu_emu.model.HomeViewModel |
32 | import org.yuzu.yuzu_emu.utils.AddonUtil | 32 | import org.yuzu.yuzu_emu.utils.AddonUtil |
33 | import org.yuzu.yuzu_emu.utils.FileUtil.copyFilesTo | 33 | import org.yuzu.yuzu_emu.utils.FileUtil.copyFilesTo |
34 | import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins | ||
34 | import java.io.File | 35 | import java.io.File |
35 | 36 | ||
36 | class AddonsFragment : Fragment() { | 37 | class AddonsFragment : Fragment() { |
@@ -202,27 +203,19 @@ class AddonsFragment : Fragment() { | |||
202 | val leftInsets = barInsets.left + cutoutInsets.left | 203 | val leftInsets = barInsets.left + cutoutInsets.left |
203 | val rightInsets = barInsets.right + cutoutInsets.right | 204 | val rightInsets = barInsets.right + cutoutInsets.right |
204 | 205 | ||
205 | val mlpToolbar = binding.toolbarAddons.layoutParams as ViewGroup.MarginLayoutParams | 206 | binding.toolbarAddons.updateMargins(left = leftInsets, right = rightInsets) |
206 | mlpToolbar.leftMargin = leftInsets | 207 | binding.listAddons.updateMargins(left = leftInsets, right = rightInsets) |
207 | mlpToolbar.rightMargin = rightInsets | ||
208 | binding.toolbarAddons.layoutParams = mlpToolbar | ||
209 | |||
210 | val mlpAddonsList = binding.listAddons.layoutParams as ViewGroup.MarginLayoutParams | ||
211 | mlpAddonsList.leftMargin = leftInsets | ||
212 | mlpAddonsList.rightMargin = rightInsets | ||
213 | binding.listAddons.layoutParams = mlpAddonsList | ||
214 | binding.listAddons.updatePadding( | 208 | binding.listAddons.updatePadding( |
215 | bottom = barInsets.bottom + | 209 | bottom = barInsets.bottom + |
216 | resources.getDimensionPixelSize(R.dimen.spacing_bottom_list_fab) | 210 | resources.getDimensionPixelSize(R.dimen.spacing_bottom_list_fab) |
217 | ) | 211 | ) |
218 | 212 | ||
219 | val fabSpacing = resources.getDimensionPixelSize(R.dimen.spacing_fab) | 213 | val fabSpacing = resources.getDimensionPixelSize(R.dimen.spacing_fab) |
220 | val mlpFab = | 214 | binding.buttonInstall.updateMargins( |
221 | binding.buttonInstall.layoutParams as ViewGroup.MarginLayoutParams | 215 | left = leftInsets + fabSpacing, |
222 | mlpFab.leftMargin = leftInsets + fabSpacing | 216 | right = rightInsets + fabSpacing, |
223 | mlpFab.rightMargin = rightInsets + fabSpacing | 217 | bottom = barInsets.bottom + fabSpacing |
224 | mlpFab.bottomMargin = barInsets.bottom + fabSpacing | 218 | ) |
225 | binding.buttonInstall.layoutParams = mlpFab | ||
226 | 219 | ||
227 | windowInsets | 220 | windowInsets |
228 | } | 221 | } |
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AppletLauncherFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AppletLauncherFragment.kt index 1f66b440d..73ca40484 100755 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AppletLauncherFragment.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/AppletLauncherFragment.kt | |||
@@ -21,6 +21,7 @@ import org.yuzu.yuzu_emu.databinding.FragmentAppletLauncherBinding | |||
21 | import org.yuzu.yuzu_emu.model.Applet | 21 | import org.yuzu.yuzu_emu.model.Applet |
22 | import org.yuzu.yuzu_emu.model.AppletInfo | 22 | import org.yuzu.yuzu_emu.model.AppletInfo |
23 | import org.yuzu.yuzu_emu.model.HomeViewModel | 23 | import org.yuzu.yuzu_emu.model.HomeViewModel |
24 | import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins | ||
24 | 25 | ||
25 | class AppletLauncherFragment : Fragment() { | 26 | class AppletLauncherFragment : Fragment() { |
26 | private var _binding: FragmentAppletLauncherBinding? = null | 27 | private var _binding: FragmentAppletLauncherBinding? = null |
@@ -95,16 +96,8 @@ class AppletLauncherFragment : Fragment() { | |||
95 | val leftInsets = barInsets.left + cutoutInsets.left | 96 | val leftInsets = barInsets.left + cutoutInsets.left |
96 | val rightInsets = barInsets.right + cutoutInsets.right | 97 | val rightInsets = barInsets.right + cutoutInsets.right |
97 | 98 | ||
98 | val mlpAppBar = binding.toolbarApplets.layoutParams as ViewGroup.MarginLayoutParams | 99 | binding.toolbarApplets.updateMargins(left = leftInsets, right = rightInsets) |
99 | mlpAppBar.leftMargin = leftInsets | 100 | binding.listApplets.updateMargins(left = leftInsets, right = rightInsets) |
100 | mlpAppBar.rightMargin = rightInsets | ||
101 | binding.toolbarApplets.layoutParams = mlpAppBar | ||
102 | |||
103 | val mlpListApplets = | ||
104 | binding.listApplets.layoutParams as ViewGroup.MarginLayoutParams | ||
105 | mlpListApplets.leftMargin = leftInsets | ||
106 | mlpListApplets.rightMargin = rightInsets | ||
107 | binding.listApplets.layoutParams = mlpListApplets | ||
108 | 101 | ||
109 | binding.listApplets.updatePadding(bottom = barInsets.bottom) | 102 | binding.listApplets.updatePadding(bottom = barInsets.bottom) |
110 | 103 | ||
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/DriverManagerFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/DriverManagerFragment.kt index bf017cd7c..41cff46c1 100755 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/DriverManagerFragment.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/DriverManagerFragment.kt | |||
@@ -34,6 +34,7 @@ import org.yuzu.yuzu_emu.model.HomeViewModel | |||
34 | import org.yuzu.yuzu_emu.utils.FileUtil | 34 | import org.yuzu.yuzu_emu.utils.FileUtil |
35 | import org.yuzu.yuzu_emu.utils.GpuDriverHelper | 35 | import org.yuzu.yuzu_emu.utils.GpuDriverHelper |
36 | import org.yuzu.yuzu_emu.utils.NativeConfig | 36 | import org.yuzu.yuzu_emu.utils.NativeConfig |
37 | import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins | ||
37 | import java.io.File | 38 | import java.io.File |
38 | import java.io.IOException | 39 | import java.io.IOException |
39 | 40 | ||
@@ -141,23 +142,15 @@ class DriverManagerFragment : Fragment() { | |||
141 | val leftInsets = barInsets.left + cutoutInsets.left | 142 | val leftInsets = barInsets.left + cutoutInsets.left |
142 | val rightInsets = barInsets.right + cutoutInsets.right | 143 | val rightInsets = barInsets.right + cutoutInsets.right |
143 | 144 | ||
144 | val mlpAppBar = binding.toolbarDrivers.layoutParams as ViewGroup.MarginLayoutParams | 145 | binding.toolbarDrivers.updateMargins(left = leftInsets, right = rightInsets) |
145 | mlpAppBar.leftMargin = leftInsets | 146 | binding.listDrivers.updateMargins(left = leftInsets, right = rightInsets) |
146 | mlpAppBar.rightMargin = rightInsets | ||
147 | binding.toolbarDrivers.layoutParams = mlpAppBar | ||
148 | |||
149 | val mlplistDrivers = binding.listDrivers.layoutParams as ViewGroup.MarginLayoutParams | ||
150 | mlplistDrivers.leftMargin = leftInsets | ||
151 | mlplistDrivers.rightMargin = rightInsets | ||
152 | binding.listDrivers.layoutParams = mlplistDrivers | ||
153 | 147 | ||
154 | val fabSpacing = resources.getDimensionPixelSize(R.dimen.spacing_fab) | 148 | val fabSpacing = resources.getDimensionPixelSize(R.dimen.spacing_fab) |
155 | val mlpFab = | 149 | binding.buttonInstall.updateMargins( |
156 | binding.buttonInstall.layoutParams as ViewGroup.MarginLayoutParams | 150 | left = leftInsets + fabSpacing, |
157 | mlpFab.leftMargin = leftInsets + fabSpacing | 151 | right = rightInsets + fabSpacing, |
158 | mlpFab.rightMargin = rightInsets + fabSpacing | 152 | bottom = barInsets.bottom + fabSpacing |
159 | mlpFab.bottomMargin = barInsets.bottom + fabSpacing | 153 | ) |
160 | binding.buttonInstall.layoutParams = mlpFab | ||
161 | 154 | ||
162 | binding.listDrivers.updatePadding( | 155 | binding.listDrivers.updatePadding( |
163 | bottom = barInsets.bottom + | 156 | bottom = barInsets.bottom + |
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EarlyAccessFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EarlyAccessFragment.kt index dbc16da4a..0534b68ce 100755 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EarlyAccessFragment.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EarlyAccessFragment.kt | |||
@@ -19,6 +19,7 @@ import com.google.android.material.transition.MaterialSharedAxis | |||
19 | import org.yuzu.yuzu_emu.R | 19 | import org.yuzu.yuzu_emu.R |
20 | import org.yuzu.yuzu_emu.databinding.FragmentEarlyAccessBinding | 20 | import org.yuzu.yuzu_emu.databinding.FragmentEarlyAccessBinding |
21 | import org.yuzu.yuzu_emu.model.HomeViewModel | 21 | import org.yuzu.yuzu_emu.model.HomeViewModel |
22 | import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins | ||
22 | 23 | ||
23 | class EarlyAccessFragment : Fragment() { | 24 | class EarlyAccessFragment : Fragment() { |
24 | private var _binding: FragmentEarlyAccessBinding? = null | 25 | private var _binding: FragmentEarlyAccessBinding? = null |
@@ -73,10 +74,7 @@ class EarlyAccessFragment : Fragment() { | |||
73 | val leftInsets = barInsets.left + cutoutInsets.left | 74 | val leftInsets = barInsets.left + cutoutInsets.left |
74 | val rightInsets = barInsets.right + cutoutInsets.right | 75 | val rightInsets = barInsets.right + cutoutInsets.right |
75 | 76 | ||
76 | val mlpAppBar = binding.appbarEa.layoutParams as ViewGroup.MarginLayoutParams | 77 | binding.appbarEa.updateMargins(left = leftInsets, right = rightInsets) |
77 | mlpAppBar.leftMargin = leftInsets | ||
78 | mlpAppBar.rightMargin = rightInsets | ||
79 | binding.appbarEa.layoutParams = mlpAppBar | ||
80 | 78 | ||
81 | binding.scrollEa.updatePadding( | 79 | binding.scrollEa.updatePadding( |
82 | left = leftInsets, | 80 | left = leftInsets, |
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EmulationFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EmulationFragment.kt index 937b8faf1..44af896da 100755 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EmulationFragment.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/EmulationFragment.kt | |||
@@ -13,6 +13,7 @@ import android.net.Uri | |||
13 | import android.os.Bundle | 13 | import android.os.Bundle |
14 | import android.os.Handler | 14 | import android.os.Handler |
15 | import android.os.Looper | 15 | import android.os.Looper |
16 | import android.os.PowerManager | ||
16 | import android.os.SystemClock | 17 | import android.os.SystemClock |
17 | import android.view.* | 18 | import android.view.* |
18 | import android.widget.TextView | 19 | import android.widget.TextView |
@@ -23,6 +24,7 @@ import androidx.core.content.res.ResourcesCompat | |||
23 | import androidx.core.graphics.Insets | 24 | import androidx.core.graphics.Insets |
24 | import androidx.core.view.ViewCompat | 25 | import androidx.core.view.ViewCompat |
25 | import androidx.core.view.WindowInsetsCompat | 26 | import androidx.core.view.WindowInsetsCompat |
27 | import androidx.core.view.updatePadding | ||
26 | import androidx.drawerlayout.widget.DrawerLayout | 28 | import androidx.drawerlayout.widget.DrawerLayout |
27 | import androidx.drawerlayout.widget.DrawerLayout.DrawerListener | 29 | import androidx.drawerlayout.widget.DrawerLayout.DrawerListener |
28 | import androidx.fragment.app.Fragment | 30 | import androidx.fragment.app.Fragment |
@@ -38,7 +40,6 @@ import androidx.window.layout.WindowLayoutInfo | |||
38 | import com.google.android.material.dialog.MaterialAlertDialogBuilder | 40 | import com.google.android.material.dialog.MaterialAlertDialogBuilder |
39 | import com.google.android.material.slider.Slider | 41 | import com.google.android.material.slider.Slider |
40 | import kotlinx.coroutines.Dispatchers | 42 | import kotlinx.coroutines.Dispatchers |
41 | import kotlinx.coroutines.flow.collect | ||
42 | import kotlinx.coroutines.flow.collectLatest | 43 | import kotlinx.coroutines.flow.collectLatest |
43 | import kotlinx.coroutines.launch | 44 | import kotlinx.coroutines.launch |
44 | import org.yuzu.yuzu_emu.HomeNavigationDirections | 45 | import org.yuzu.yuzu_emu.HomeNavigationDirections |
@@ -64,6 +65,7 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback { | |||
64 | private lateinit var emulationState: EmulationState | 65 | private lateinit var emulationState: EmulationState |
65 | private var emulationActivity: EmulationActivity? = null | 66 | private var emulationActivity: EmulationActivity? = null |
66 | private var perfStatsUpdater: (() -> Unit)? = null | 67 | private var perfStatsUpdater: (() -> Unit)? = null |
68 | private var thermalStatsUpdater: (() -> Unit)? = null | ||
67 | 69 | ||
68 | private var _binding: FragmentEmulationBinding? = null | 70 | private var _binding: FragmentEmulationBinding? = null |
69 | private val binding get() = _binding!! | 71 | private val binding get() = _binding!! |
@@ -77,6 +79,8 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback { | |||
77 | 79 | ||
78 | private var isInFoldableLayout = false | 80 | private var isInFoldableLayout = false |
79 | 81 | ||
82 | private lateinit var powerManager: PowerManager | ||
83 | |||
80 | override fun onAttach(context: Context) { | 84 | override fun onAttach(context: Context) { |
81 | super.onAttach(context) | 85 | super.onAttach(context) |
82 | if (context is EmulationActivity) { | 86 | if (context is EmulationActivity) { |
@@ -102,6 +106,8 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback { | |||
102 | super.onCreate(savedInstanceState) | 106 | super.onCreate(savedInstanceState) |
103 | updateOrientation() | 107 | updateOrientation() |
104 | 108 | ||
109 | powerManager = requireContext().getSystemService(Context.POWER_SERVICE) as PowerManager | ||
110 | |||
105 | val intentUri: Uri? = requireActivity().intent.data | 111 | val intentUri: Uri? = requireActivity().intent.data |
106 | var intentGame: Game? = null | 112 | var intentGame: Game? = null |
107 | if (intentUri != null) { | 113 | if (intentUri != null) { |
@@ -394,8 +400,9 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback { | |||
394 | 400 | ||
395 | emulationState.updateSurface() | 401 | emulationState.updateSurface() |
396 | 402 | ||
397 | // Setup overlay | 403 | // Setup overlays |
398 | updateShowFpsOverlay() | 404 | updateShowFpsOverlay() |
405 | updateThermalOverlay() | ||
399 | } | 406 | } |
400 | } | 407 | } |
401 | } | 408 | } |
@@ -553,6 +560,38 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback { | |||
553 | } | 560 | } |
554 | } | 561 | } |
555 | 562 | ||
563 | private fun updateThermalOverlay() { | ||
564 | if (BooleanSetting.SHOW_THERMAL_OVERLAY.getBoolean()) { | ||
565 | thermalStatsUpdater = { | ||
566 | if (emulationViewModel.emulationStarted.value && | ||
567 | !emulationViewModel.isEmulationStopping.value | ||
568 | ) { | ||
569 | val thermalStatus = when (powerManager.currentThermalStatus) { | ||
570 | PowerManager.THERMAL_STATUS_LIGHT -> "😥" | ||
571 | PowerManager.THERMAL_STATUS_MODERATE -> "🥵" | ||
572 | PowerManager.THERMAL_STATUS_SEVERE -> "🔥" | ||
573 | PowerManager.THERMAL_STATUS_CRITICAL, | ||
574 | PowerManager.THERMAL_STATUS_EMERGENCY, | ||
575 | PowerManager.THERMAL_STATUS_SHUTDOWN -> "☢️" | ||
576 | |||
577 | else -> "🙂" | ||
578 | } | ||
579 | if (_binding != null) { | ||
580 | binding.showThermalsText.text = thermalStatus | ||
581 | } | ||
582 | thermalStatsUpdateHandler.postDelayed(thermalStatsUpdater!!, 1000) | ||
583 | } | ||
584 | } | ||
585 | thermalStatsUpdateHandler.post(thermalStatsUpdater!!) | ||
586 | binding.showThermalsText.visibility = View.VISIBLE | ||
587 | } else { | ||
588 | if (thermalStatsUpdater != null) { | ||
589 | thermalStatsUpdateHandler.removeCallbacks(thermalStatsUpdater!!) | ||
590 | } | ||
591 | binding.showThermalsText.visibility = View.GONE | ||
592 | } | ||
593 | } | ||
594 | |||
556 | @SuppressLint("SourceLockedOrientationActivity") | 595 | @SuppressLint("SourceLockedOrientationActivity") |
557 | private fun updateOrientation() { | 596 | private fun updateOrientation() { |
558 | emulationActivity?.let { | 597 | emulationActivity?.let { |
@@ -641,6 +680,8 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback { | |||
641 | popup.menu.apply { | 680 | popup.menu.apply { |
642 | findItem(R.id.menu_toggle_fps).isChecked = | 681 | findItem(R.id.menu_toggle_fps).isChecked = |
643 | BooleanSetting.SHOW_PERFORMANCE_OVERLAY.getBoolean() | 682 | BooleanSetting.SHOW_PERFORMANCE_OVERLAY.getBoolean() |
683 | findItem(R.id.thermal_indicator).isChecked = | ||
684 | BooleanSetting.SHOW_THERMAL_OVERLAY.getBoolean() | ||
644 | findItem(R.id.menu_rel_stick_center).isChecked = | 685 | findItem(R.id.menu_rel_stick_center).isChecked = |
645 | BooleanSetting.JOYSTICK_REL_CENTER.getBoolean() | 686 | BooleanSetting.JOYSTICK_REL_CENTER.getBoolean() |
646 | findItem(R.id.menu_dpad_slide).isChecked = BooleanSetting.DPAD_SLIDE.getBoolean() | 687 | findItem(R.id.menu_dpad_slide).isChecked = BooleanSetting.DPAD_SLIDE.getBoolean() |
@@ -660,6 +701,13 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback { | |||
660 | true | 701 | true |
661 | } | 702 | } |
662 | 703 | ||
704 | R.id.thermal_indicator -> { | ||
705 | it.isChecked = !it.isChecked | ||
706 | BooleanSetting.SHOW_THERMAL_OVERLAY.setBoolean(it.isChecked) | ||
707 | updateThermalOverlay() | ||
708 | true | ||
709 | } | ||
710 | |||
663 | R.id.menu_edit_overlay -> { | 711 | R.id.menu_edit_overlay -> { |
664 | binding.drawerLayout.close() | 712 | binding.drawerLayout.close() |
665 | binding.surfaceInputOverlay.requestFocus() | 713 | binding.surfaceInputOverlay.requestFocus() |
@@ -850,7 +898,7 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback { | |||
850 | right = cutInsets.right | 898 | right = cutInsets.right |
851 | } | 899 | } |
852 | 900 | ||
853 | v.setPadding(left, cutInsets.top, right, 0) | 901 | v.updatePadding(left = left, top = cutInsets.top, right = right) |
854 | windowInsets | 902 | windowInsets |
855 | } | 903 | } |
856 | } | 904 | } |
@@ -1003,5 +1051,6 @@ class EmulationFragment : Fragment(), SurfaceHolder.Callback { | |||
1003 | 1051 | ||
1004 | companion object { | 1052 | companion object { |
1005 | private val perfStatsUpdateHandler = Handler(Looper.myLooper()!!) | 1053 | private val perfStatsUpdateHandler = Handler(Looper.myLooper()!!) |
1054 | private val thermalStatsUpdateHandler = Handler(Looper.myLooper()!!) | ||
1006 | } | 1055 | } |
1007 | } | 1056 | } |
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GameFoldersFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GameFoldersFragment.kt index 341a37fdb..5c558b1a5 100755 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GameFoldersFragment.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GameFoldersFragment.kt | |||
@@ -26,6 +26,7 @@ import org.yuzu.yuzu_emu.databinding.FragmentFoldersBinding | |||
26 | import org.yuzu.yuzu_emu.model.GamesViewModel | 26 | import org.yuzu.yuzu_emu.model.GamesViewModel |
27 | import org.yuzu.yuzu_emu.model.HomeViewModel | 27 | import org.yuzu.yuzu_emu.model.HomeViewModel |
28 | import org.yuzu.yuzu_emu.ui.main.MainActivity | 28 | import org.yuzu.yuzu_emu.ui.main.MainActivity |
29 | import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins | ||
29 | 30 | ||
30 | class GameFoldersFragment : Fragment() { | 31 | class GameFoldersFragment : Fragment() { |
31 | private var _binding: FragmentFoldersBinding? = null | 32 | private var _binding: FragmentFoldersBinding? = null |
@@ -100,23 +101,16 @@ class GameFoldersFragment : Fragment() { | |||
100 | val leftInsets = barInsets.left + cutoutInsets.left | 101 | val leftInsets = barInsets.left + cutoutInsets.left |
101 | val rightInsets = barInsets.right + cutoutInsets.right | 102 | val rightInsets = barInsets.right + cutoutInsets.right |
102 | 103 | ||
103 | val mlpToolbar = binding.toolbarFolders.layoutParams as ViewGroup.MarginLayoutParams | 104 | binding.toolbarFolders.updateMargins(left = leftInsets, right = rightInsets) |
104 | mlpToolbar.leftMargin = leftInsets | ||
105 | mlpToolbar.rightMargin = rightInsets | ||
106 | binding.toolbarFolders.layoutParams = mlpToolbar | ||
107 | 105 | ||
108 | val fabSpacing = resources.getDimensionPixelSize(R.dimen.spacing_fab) | 106 | val fabSpacing = resources.getDimensionPixelSize(R.dimen.spacing_fab) |
109 | val mlpFab = | 107 | binding.buttonAdd.updateMargins( |
110 | binding.buttonAdd.layoutParams as ViewGroup.MarginLayoutParams | 108 | left = leftInsets + fabSpacing, |
111 | mlpFab.leftMargin = leftInsets + fabSpacing | 109 | right = rightInsets + fabSpacing, |
112 | mlpFab.rightMargin = rightInsets + fabSpacing | 110 | bottom = barInsets.bottom + fabSpacing |
113 | mlpFab.bottomMargin = barInsets.bottom + fabSpacing | 111 | ) |
114 | binding.buttonAdd.layoutParams = mlpFab | 112 | |
115 | 113 | binding.listFolders.updateMargins(left = leftInsets, right = rightInsets) | |
116 | val mlpListFolders = binding.listFolders.layoutParams as ViewGroup.MarginLayoutParams | ||
117 | mlpListFolders.leftMargin = leftInsets | ||
118 | mlpListFolders.rightMargin = rightInsets | ||
119 | binding.listFolders.layoutParams = mlpListFolders | ||
120 | 114 | ||
121 | binding.listFolders.updatePadding( | 115 | binding.listFolders.updatePadding( |
122 | bottom = barInsets.bottom + | 116 | bottom = barInsets.bottom + |
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GameInfoFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GameInfoFragment.kt index 5aa3f453f..dbd56e84f 100755 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GameInfoFragment.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GameInfoFragment.kt | |||
@@ -27,6 +27,7 @@ import org.yuzu.yuzu_emu.databinding.FragmentGameInfoBinding | |||
27 | import org.yuzu.yuzu_emu.model.GameVerificationResult | 27 | import org.yuzu.yuzu_emu.model.GameVerificationResult |
28 | import org.yuzu.yuzu_emu.model.HomeViewModel | 28 | import org.yuzu.yuzu_emu.model.HomeViewModel |
29 | import org.yuzu.yuzu_emu.utils.GameMetadata | 29 | import org.yuzu.yuzu_emu.utils.GameMetadata |
30 | import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins | ||
30 | 31 | ||
31 | class GameInfoFragment : Fragment() { | 32 | class GameInfoFragment : Fragment() { |
32 | private var _binding: FragmentGameInfoBinding? = null | 33 | private var _binding: FragmentGameInfoBinding? = null |
@@ -122,11 +123,13 @@ class GameInfoFragment : Fragment() { | |||
122 | titleId = R.string.verify_success, | 123 | titleId = R.string.verify_success, |
123 | descriptionId = R.string.operation_completed_successfully | 124 | descriptionId = R.string.operation_completed_successfully |
124 | ) | 125 | ) |
126 | |||
125 | GameVerificationResult.Failed -> | 127 | GameVerificationResult.Failed -> |
126 | MessageDialogFragment.newInstance( | 128 | MessageDialogFragment.newInstance( |
127 | titleId = R.string.verify_failure, | 129 | titleId = R.string.verify_failure, |
128 | descriptionId = R.string.verify_failure_description | 130 | descriptionId = R.string.verify_failure_description |
129 | ) | 131 | ) |
132 | |||
130 | GameVerificationResult.NotImplemented -> | 133 | GameVerificationResult.NotImplemented -> |
131 | MessageDialogFragment.newInstance( | 134 | MessageDialogFragment.newInstance( |
132 | titleId = R.string.verify_no_result, | 135 | titleId = R.string.verify_no_result, |
@@ -165,15 +168,8 @@ class GameInfoFragment : Fragment() { | |||
165 | val leftInsets = barInsets.left + cutoutInsets.left | 168 | val leftInsets = barInsets.left + cutoutInsets.left |
166 | val rightInsets = barInsets.right + cutoutInsets.right | 169 | val rightInsets = barInsets.right + cutoutInsets.right |
167 | 170 | ||
168 | val mlpToolbar = binding.toolbarInfo.layoutParams as ViewGroup.MarginLayoutParams | 171 | binding.toolbarInfo.updateMargins(left = leftInsets, right = rightInsets) |
169 | mlpToolbar.leftMargin = leftInsets | 172 | binding.scrollInfo.updateMargins(left = leftInsets, right = rightInsets) |
170 | mlpToolbar.rightMargin = rightInsets | ||
171 | binding.toolbarInfo.layoutParams = mlpToolbar | ||
172 | |||
173 | val mlpScrollAbout = binding.scrollInfo.layoutParams as ViewGroup.MarginLayoutParams | ||
174 | mlpScrollAbout.leftMargin = leftInsets | ||
175 | mlpScrollAbout.rightMargin = rightInsets | ||
176 | binding.scrollInfo.layoutParams = mlpScrollAbout | ||
177 | 173 | ||
178 | binding.contentInfo.updatePadding(bottom = barInsets.bottom) | 174 | binding.contentInfo.updatePadding(bottom = barInsets.bottom) |
179 | 175 | ||
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GamePropertiesFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GamePropertiesFragment.kt index 582df0133..d14b2c634 100755 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GamePropertiesFragment.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/GamePropertiesFragment.kt | |||
@@ -46,6 +46,7 @@ import org.yuzu.yuzu_emu.utils.FileUtil | |||
46 | import org.yuzu.yuzu_emu.utils.GameIconUtils | 46 | import org.yuzu.yuzu_emu.utils.GameIconUtils |
47 | import org.yuzu.yuzu_emu.utils.GpuDriverHelper | 47 | import org.yuzu.yuzu_emu.utils.GpuDriverHelper |
48 | import org.yuzu.yuzu_emu.utils.MemoryUtil | 48 | import org.yuzu.yuzu_emu.utils.MemoryUtil |
49 | import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins | ||
49 | import java.io.BufferedOutputStream | 50 | import java.io.BufferedOutputStream |
50 | import java.io.File | 51 | import java.io.File |
51 | 52 | ||
@@ -320,46 +321,25 @@ class GamePropertiesFragment : Fragment() { | |||
320 | 321 | ||
321 | val smallLayout = resources.getBoolean(R.bool.small_layout) | 322 | val smallLayout = resources.getBoolean(R.bool.small_layout) |
322 | if (smallLayout) { | 323 | if (smallLayout) { |
323 | val mlpListAll = | 324 | binding.listAll.updateMargins(left = leftInsets, right = rightInsets) |
324 | binding.listAll.layoutParams as ViewGroup.MarginLayoutParams | ||
325 | mlpListAll.leftMargin = leftInsets | ||
326 | mlpListAll.rightMargin = rightInsets | ||
327 | binding.listAll.layoutParams = mlpListAll | ||
328 | } else { | 325 | } else { |
329 | if (ViewCompat.getLayoutDirection(binding.root) == | 326 | if (ViewCompat.getLayoutDirection(binding.root) == |
330 | ViewCompat.LAYOUT_DIRECTION_LTR | 327 | ViewCompat.LAYOUT_DIRECTION_LTR |
331 | ) { | 328 | ) { |
332 | val mlpListAll = | 329 | binding.listAll.updateMargins(right = rightInsets) |
333 | binding.listAll.layoutParams as ViewGroup.MarginLayoutParams | 330 | binding.iconLayout!!.updateMargins(top = barInsets.top, left = leftInsets) |
334 | mlpListAll.rightMargin = rightInsets | ||
335 | binding.listAll.layoutParams = mlpListAll | ||
336 | |||
337 | val mlpIconLayout = | ||
338 | binding.iconLayout!!.layoutParams as ViewGroup.MarginLayoutParams | ||
339 | mlpIconLayout.topMargin = barInsets.top | ||
340 | mlpIconLayout.leftMargin = leftInsets | ||
341 | binding.iconLayout!!.layoutParams = mlpIconLayout | ||
342 | } else { | 331 | } else { |
343 | val mlpListAll = | 332 | binding.listAll.updateMargins(left = leftInsets) |
344 | binding.listAll.layoutParams as ViewGroup.MarginLayoutParams | 333 | binding.iconLayout!!.updateMargins(top = barInsets.top, right = rightInsets) |
345 | mlpListAll.leftMargin = leftInsets | ||
346 | binding.listAll.layoutParams = mlpListAll | ||
347 | |||
348 | val mlpIconLayout = | ||
349 | binding.iconLayout!!.layoutParams as ViewGroup.MarginLayoutParams | ||
350 | mlpIconLayout.topMargin = barInsets.top | ||
351 | mlpIconLayout.rightMargin = rightInsets | ||
352 | binding.iconLayout!!.layoutParams = mlpIconLayout | ||
353 | } | 334 | } |
354 | } | 335 | } |
355 | 336 | ||
356 | val fabSpacing = resources.getDimensionPixelSize(R.dimen.spacing_fab) | 337 | val fabSpacing = resources.getDimensionPixelSize(R.dimen.spacing_fab) |
357 | val mlpFab = | 338 | binding.buttonStart.updateMargins( |
358 | binding.buttonStart.layoutParams as ViewGroup.MarginLayoutParams | 339 | left = leftInsets + fabSpacing, |
359 | mlpFab.leftMargin = leftInsets + fabSpacing | 340 | right = rightInsets + fabSpacing, |
360 | mlpFab.rightMargin = rightInsets + fabSpacing | 341 | bottom = barInsets.bottom + fabSpacing |
361 | mlpFab.bottomMargin = barInsets.bottom + fabSpacing | 342 | ) |
362 | binding.buttonStart.layoutParams = mlpFab | ||
363 | 343 | ||
364 | binding.layoutAll.updatePadding( | 344 | binding.layoutAll.updatePadding( |
365 | top = barInsets.top, | 345 | top = barInsets.top, |
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/HomeSettingsFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/HomeSettingsFragment.kt index 1f3578b22..87e130d3e 100755 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/HomeSettingsFragment.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/HomeSettingsFragment.kt | |||
@@ -12,7 +12,6 @@ import android.provider.DocumentsContract | |||
12 | import android.view.LayoutInflater | 12 | import android.view.LayoutInflater |
13 | import android.view.View | 13 | import android.view.View |
14 | import android.view.ViewGroup | 14 | import android.view.ViewGroup |
15 | import android.view.ViewGroup.MarginLayoutParams | ||
16 | import android.widget.Toast | 15 | import android.widget.Toast |
17 | import androidx.appcompat.app.AppCompatActivity | 16 | import androidx.appcompat.app.AppCompatActivity |
18 | import androidx.core.app.ActivityCompat | 17 | import androidx.core.app.ActivityCompat |
@@ -44,6 +43,7 @@ import org.yuzu.yuzu_emu.ui.main.MainActivity | |||
44 | import org.yuzu.yuzu_emu.utils.FileUtil | 43 | import org.yuzu.yuzu_emu.utils.FileUtil |
45 | import org.yuzu.yuzu_emu.utils.GpuDriverHelper | 44 | import org.yuzu.yuzu_emu.utils.GpuDriverHelper |
46 | import org.yuzu.yuzu_emu.utils.Log | 45 | import org.yuzu.yuzu_emu.utils.Log |
46 | import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins | ||
47 | 47 | ||
48 | class HomeSettingsFragment : Fragment() { | 48 | class HomeSettingsFragment : Fragment() { |
49 | private var _binding: FragmentHomeSettingsBinding? = null | 49 | private var _binding: FragmentHomeSettingsBinding? = null |
@@ -408,10 +408,7 @@ class HomeSettingsFragment : Fragment() { | |||
408 | bottom = barInsets.bottom | 408 | bottom = barInsets.bottom |
409 | ) | 409 | ) |
410 | 410 | ||
411 | val mlpScrollSettings = binding.scrollViewSettings.layoutParams as MarginLayoutParams | 411 | binding.scrollViewSettings.updateMargins(left = leftInsets, right = rightInsets) |
412 | mlpScrollSettings.leftMargin = leftInsets | ||
413 | mlpScrollSettings.rightMargin = rightInsets | ||
414 | binding.scrollViewSettings.layoutParams = mlpScrollSettings | ||
415 | 412 | ||
416 | binding.linearLayoutSettings.updatePadding(bottom = spacingNavigation) | 413 | binding.linearLayoutSettings.updatePadding(bottom = spacingNavigation) |
417 | 414 | ||
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/InstallableFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/InstallableFragment.kt index 7df8e6bf4..63112dc6f 100755 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/InstallableFragment.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/InstallableFragment.kt | |||
@@ -34,6 +34,7 @@ import org.yuzu.yuzu_emu.model.TaskState | |||
34 | import org.yuzu.yuzu_emu.ui.main.MainActivity | 34 | import org.yuzu.yuzu_emu.ui.main.MainActivity |
35 | import org.yuzu.yuzu_emu.utils.DirectoryInitialization | 35 | import org.yuzu.yuzu_emu.utils.DirectoryInitialization |
36 | import org.yuzu.yuzu_emu.utils.FileUtil | 36 | import org.yuzu.yuzu_emu.utils.FileUtil |
37 | import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins | ||
37 | import java.io.BufferedOutputStream | 38 | import java.io.BufferedOutputStream |
38 | import java.io.File | 39 | import java.io.File |
39 | import java.math.BigInteger | 40 | import java.math.BigInteger |
@@ -172,16 +173,8 @@ class InstallableFragment : Fragment() { | |||
172 | val leftInsets = barInsets.left + cutoutInsets.left | 173 | val leftInsets = barInsets.left + cutoutInsets.left |
173 | val rightInsets = barInsets.right + cutoutInsets.right | 174 | val rightInsets = barInsets.right + cutoutInsets.right |
174 | 175 | ||
175 | val mlpAppBar = binding.toolbarInstallables.layoutParams as ViewGroup.MarginLayoutParams | 176 | binding.toolbarInstallables.updateMargins(left = leftInsets, right = rightInsets) |
176 | mlpAppBar.leftMargin = leftInsets | 177 | binding.listInstallables.updateMargins(left = leftInsets, right = rightInsets) |
177 | mlpAppBar.rightMargin = rightInsets | ||
178 | binding.toolbarInstallables.layoutParams = mlpAppBar | ||
179 | |||
180 | val mlpScrollAbout = | ||
181 | binding.listInstallables.layoutParams as ViewGroup.MarginLayoutParams | ||
182 | mlpScrollAbout.leftMargin = leftInsets | ||
183 | mlpScrollAbout.rightMargin = rightInsets | ||
184 | binding.listInstallables.layoutParams = mlpScrollAbout | ||
185 | 178 | ||
186 | binding.listInstallables.updatePadding(bottom = barInsets.bottom) | 179 | binding.listInstallables.updatePadding(bottom = barInsets.bottom) |
187 | 180 | ||
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/LicensesFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/LicensesFragment.kt index b6e9129f7..f17f621f8 100755 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/LicensesFragment.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/LicensesFragment.kt | |||
@@ -7,7 +7,6 @@ import android.os.Bundle | |||
7 | import android.view.LayoutInflater | 7 | import android.view.LayoutInflater |
8 | import android.view.View | 8 | import android.view.View |
9 | import android.view.ViewGroup | 9 | import android.view.ViewGroup |
10 | import android.view.ViewGroup.MarginLayoutParams | ||
11 | import androidx.appcompat.app.AppCompatActivity | 10 | import androidx.appcompat.app.AppCompatActivity |
12 | import androidx.core.view.ViewCompat | 11 | import androidx.core.view.ViewCompat |
13 | import androidx.core.view.WindowInsetsCompat | 12 | import androidx.core.view.WindowInsetsCompat |
@@ -22,6 +21,7 @@ import org.yuzu.yuzu_emu.adapters.LicenseAdapter | |||
22 | import org.yuzu.yuzu_emu.databinding.FragmentLicensesBinding | 21 | import org.yuzu.yuzu_emu.databinding.FragmentLicensesBinding |
23 | import org.yuzu.yuzu_emu.model.HomeViewModel | 22 | import org.yuzu.yuzu_emu.model.HomeViewModel |
24 | import org.yuzu.yuzu_emu.model.License | 23 | import org.yuzu.yuzu_emu.model.License |
24 | import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins | ||
25 | 25 | ||
26 | class LicensesFragment : Fragment() { | 26 | class LicensesFragment : Fragment() { |
27 | private var _binding: FragmentLicensesBinding? = null | 27 | private var _binding: FragmentLicensesBinding? = null |
@@ -122,15 +122,8 @@ class LicensesFragment : Fragment() { | |||
122 | val leftInsets = barInsets.left + cutoutInsets.left | 122 | val leftInsets = barInsets.left + cutoutInsets.left |
123 | val rightInsets = barInsets.right + cutoutInsets.right | 123 | val rightInsets = barInsets.right + cutoutInsets.right |
124 | 124 | ||
125 | val mlpAppBar = binding.appbarLicenses.layoutParams as MarginLayoutParams | 125 | binding.appbarLicenses.updateMargins(left = leftInsets, right = rightInsets) |
126 | mlpAppBar.leftMargin = leftInsets | 126 | binding.listLicenses.updateMargins(left = leftInsets, right = rightInsets) |
127 | mlpAppBar.rightMargin = rightInsets | ||
128 | binding.appbarLicenses.layoutParams = mlpAppBar | ||
129 | |||
130 | val mlpScrollAbout = binding.listLicenses.layoutParams as MarginLayoutParams | ||
131 | mlpScrollAbout.leftMargin = leftInsets | ||
132 | mlpScrollAbout.rightMargin = rightInsets | ||
133 | binding.listLicenses.layoutParams = mlpScrollAbout | ||
134 | 127 | ||
135 | binding.listLicenses.updatePadding(bottom = barInsets.bottom) | 128 | binding.listLicenses.updatePadding(bottom = barInsets.bottom) |
136 | 129 | ||
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/SettingsSearchFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/SettingsSearchFragment.kt index f95d545bf..a135b80b4 100755 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/SettingsSearchFragment.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/fragments/SettingsSearchFragment.kt | |||
@@ -29,6 +29,7 @@ import org.yuzu.yuzu_emu.features.settings.model.view.SettingsItem | |||
29 | import org.yuzu.yuzu_emu.features.settings.ui.SettingsAdapter | 29 | import org.yuzu.yuzu_emu.features.settings.ui.SettingsAdapter |
30 | import org.yuzu.yuzu_emu.model.SettingsViewModel | 30 | import org.yuzu.yuzu_emu.model.SettingsViewModel |
31 | import org.yuzu.yuzu_emu.utils.NativeConfig | 31 | import org.yuzu.yuzu_emu.utils.NativeConfig |
32 | import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins | ||
32 | 33 | ||
33 | class SettingsSearchFragment : Fragment() { | 34 | class SettingsSearchFragment : Fragment() { |
34 | private var _binding: FragmentSettingsSearchBinding? = null | 35 | private var _binding: FragmentSettingsSearchBinding? = null |
@@ -174,15 +175,14 @@ class SettingsSearchFragment : Fragment() { | |||
174 | bottom = barInsets.bottom | 175 | bottom = barInsets.bottom |
175 | ) | 176 | ) |
176 | 177 | ||
177 | val mlpSettingsList = binding.settingsList.layoutParams as ViewGroup.MarginLayoutParams | 178 | binding.settingsList.updateMargins( |
178 | mlpSettingsList.leftMargin = leftInsets + sideMargin | 179 | left = leftInsets + sideMargin, |
179 | mlpSettingsList.rightMargin = rightInsets + sideMargin | 180 | right = rightInsets + sideMargin |
180 | binding.settingsList.layoutParams = mlpSettingsList | 181 | ) |
181 | 182 | binding.divider.updateMargins( | |
182 | val mlpDivider = binding.divider.layoutParams as ViewGroup.MarginLayoutParams | 183 | left = leftInsets + sideMargin, |
183 | mlpDivider.leftMargin = leftInsets + sideMargin | 184 | right = rightInsets + sideMargin |
184 | mlpDivider.rightMargin = rightInsets + sideMargin | 185 | ) |
185 | binding.divider.layoutParams = mlpDivider | ||
186 | 186 | ||
187 | windowInsets | 187 | windowInsets |
188 | } | 188 | } |
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/GamesFragment.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/GamesFragment.kt index 54380323e..23ca49b53 100755 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/GamesFragment.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/GamesFragment.kt | |||
@@ -8,7 +8,6 @@ import android.os.Bundle | |||
8 | import android.view.LayoutInflater | 8 | import android.view.LayoutInflater |
9 | import android.view.View | 9 | import android.view.View |
10 | import android.view.ViewGroup | 10 | import android.view.ViewGroup |
11 | import android.view.ViewGroup.MarginLayoutParams | ||
12 | import androidx.appcompat.app.AppCompatActivity | 11 | import androidx.appcompat.app.AppCompatActivity |
13 | import androidx.core.view.ViewCompat | 12 | import androidx.core.view.ViewCompat |
14 | import androidx.core.view.WindowInsetsCompat | 13 | import androidx.core.view.WindowInsetsCompat |
@@ -27,6 +26,7 @@ import org.yuzu.yuzu_emu.databinding.FragmentGamesBinding | |||
27 | import org.yuzu.yuzu_emu.layout.AutofitGridLayoutManager | 26 | import org.yuzu.yuzu_emu.layout.AutofitGridLayoutManager |
28 | import org.yuzu.yuzu_emu.model.GamesViewModel | 27 | import org.yuzu.yuzu_emu.model.GamesViewModel |
29 | import org.yuzu.yuzu_emu.model.HomeViewModel | 28 | import org.yuzu.yuzu_emu.model.HomeViewModel |
29 | import org.yuzu.yuzu_emu.utils.ViewUtils.updateMargins | ||
30 | 30 | ||
31 | class GamesFragment : Fragment() { | 31 | class GamesFragment : Fragment() { |
32 | private var _binding: FragmentGamesBinding? = null | 32 | private var _binding: FragmentGamesBinding? = null |
@@ -169,15 +169,16 @@ class GamesFragment : Fragment() { | |||
169 | 169 | ||
170 | val leftInsets = barInsets.left + cutoutInsets.left | 170 | val leftInsets = barInsets.left + cutoutInsets.left |
171 | val rightInsets = barInsets.right + cutoutInsets.right | 171 | val rightInsets = barInsets.right + cutoutInsets.right |
172 | val mlpSwipe = binding.swipeRefresh.layoutParams as MarginLayoutParams | 172 | val left: Int |
173 | val right: Int | ||
173 | if (ViewCompat.getLayoutDirection(view) == ViewCompat.LAYOUT_DIRECTION_LTR) { | 174 | if (ViewCompat.getLayoutDirection(view) == ViewCompat.LAYOUT_DIRECTION_LTR) { |
174 | mlpSwipe.leftMargin = leftInsets + spacingNavigationRail | 175 | left = leftInsets + spacingNavigationRail |
175 | mlpSwipe.rightMargin = rightInsets | 176 | right = rightInsets |
176 | } else { | 177 | } else { |
177 | mlpSwipe.leftMargin = leftInsets | 178 | left = leftInsets |
178 | mlpSwipe.rightMargin = rightInsets + spacingNavigationRail | 179 | right = rightInsets + spacingNavigationRail |
179 | } | 180 | } |
180 | binding.swipeRefresh.layoutParams = mlpSwipe | 181 | binding.swipeRefresh.updateMargins(left = left, right = right) |
181 | 182 | ||
182 | binding.noticeText.updatePadding(bottom = spacingNavigation) | 183 | binding.noticeText.updatePadding(bottom = spacingNavigation) |
183 | 184 | ||
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/main/MainActivity.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/main/MainActivity.kt index b3967d294..4df4ac4c6 100755 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/main/MainActivity.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/ui/main/MainActivity.kt | |||
@@ -34,7 +34,6 @@ import kotlinx.coroutines.launch | |||
34 | import org.yuzu.yuzu_emu.HomeNavigationDirections | 34 | import org.yuzu.yuzu_emu.HomeNavigationDirections |
35 | import org.yuzu.yuzu_emu.NativeLibrary | 35 | import org.yuzu.yuzu_emu.NativeLibrary |
36 | import org.yuzu.yuzu_emu.R | 36 | import org.yuzu.yuzu_emu.R |
37 | import org.yuzu.yuzu_emu.activities.EmulationActivity | ||
38 | import org.yuzu.yuzu_emu.databinding.ActivityMainBinding | 37 | import org.yuzu.yuzu_emu.databinding.ActivityMainBinding |
39 | import org.yuzu.yuzu_emu.features.settings.model.Settings | 38 | import org.yuzu.yuzu_emu.features.settings.model.Settings |
40 | import org.yuzu.yuzu_emu.fragments.AddGameFolderDialogFragment | 39 | import org.yuzu.yuzu_emu.fragments.AddGameFolderDialogFragment |
@@ -177,9 +176,6 @@ class MainActivity : AppCompatActivity(), ThemeProvider { | |||
177 | } | 176 | } |
178 | } | 177 | } |
179 | 178 | ||
180 | // Dismiss previous notifications (should not happen unless a crash occurred) | ||
181 | EmulationActivity.stopForegroundService(this) | ||
182 | |||
183 | setInsets() | 179 | setInsets() |
184 | } | 180 | } |
185 | 181 | ||
@@ -298,11 +294,6 @@ class MainActivity : AppCompatActivity(), ThemeProvider { | |||
298 | super.onResume() | 294 | super.onResume() |
299 | } | 295 | } |
300 | 296 | ||
301 | override fun onDestroy() { | ||
302 | EmulationActivity.stopForegroundService(this) | ||
303 | super.onDestroy() | ||
304 | } | ||
305 | |||
306 | private fun setInsets() = | 297 | private fun setInsets() = |
307 | ViewCompat.setOnApplyWindowInsetsListener( | 298 | ViewCompat.setOnApplyWindowInsetsListener( |
308 | binding.root | 299 | binding.root |
diff --git a/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/ViewUtils.kt b/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/ViewUtils.kt index f9a3e4126..ffbfa9337 100755 --- a/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/ViewUtils.kt +++ b/src/android/app/src/main/java/org/yuzu/yuzu_emu/utils/ViewUtils.kt | |||
@@ -4,6 +4,7 @@ | |||
4 | package org.yuzu.yuzu_emu.utils | 4 | package org.yuzu.yuzu_emu.utils |
5 | 5 | ||
6 | import android.view.View | 6 | import android.view.View |
7 | import android.view.ViewGroup | ||
7 | 8 | ||
8 | object ViewUtils { | 9 | object ViewUtils { |
9 | fun showView(view: View, length: Long = 300) { | 10 | fun showView(view: View, length: Long = 300) { |
@@ -32,4 +33,28 @@ object ViewUtils { | |||
32 | view.visibility = View.INVISIBLE | 33 | view.visibility = View.INVISIBLE |
33 | }.start() | 34 | }.start() |
34 | } | 35 | } |
36 | |||
37 | fun View.updateMargins( | ||
38 | left: Int = -1, | ||
39 | top: Int = -1, | ||
40 | right: Int = -1, | ||
41 | bottom: Int = -1 | ||
42 | ) { | ||
43 | val layoutParams = this.layoutParams as ViewGroup.MarginLayoutParams | ||
44 | layoutParams.apply { | ||
45 | if (left != -1) { | ||
46 | leftMargin = left | ||
47 | } | ||
48 | if (top != -1) { | ||
49 | topMargin = top | ||
50 | } | ||
51 | if (right != -1) { | ||
52 | rightMargin = right | ||
53 | } | ||
54 | if (bottom != -1) { | ||
55 | bottomMargin = bottom | ||
56 | } | ||
57 | } | ||
58 | this.layoutParams = layoutParams | ||
59 | } | ||
35 | } | 60 | } |
diff --git a/src/android/app/src/main/jni/CMakeLists.txt b/src/android/app/src/main/jni/CMakeLists.txt index abc6055ab..20b319c12 100755 --- a/src/android/app/src/main/jni/CMakeLists.txt +++ b/src/android/app/src/main/jni/CMakeLists.txt | |||
@@ -2,14 +2,8 @@ | |||
2 | # SPDX-License-Identifier: GPL-3.0-or-later | 2 | # SPDX-License-Identifier: GPL-3.0-or-later |
3 | 3 | ||
4 | add_library(yuzu-android SHARED | 4 | add_library(yuzu-android SHARED |
5 | android_common/android_common.cpp | ||
6 | android_common/android_common.h | ||
7 | applets/software_keyboard.cpp | ||
8 | applets/software_keyboard.h | ||
9 | emu_window/emu_window.cpp | 5 | emu_window/emu_window.cpp |
10 | emu_window/emu_window.h | 6 | emu_window/emu_window.h |
11 | id_cache.cpp | ||
12 | id_cache.h | ||
13 | native.cpp | 7 | native.cpp |
14 | native.h | 8 | native.h |
15 | native_config.cpp | 9 | native_config.cpp |
diff --git a/src/android/app/src/main/jni/android_settings.h b/src/android/app/src/main/jni/android_settings.h index cf93304da..4a3bc8e53 100755 --- a/src/android/app/src/main/jni/android_settings.h +++ b/src/android/app/src/main/jni/android_settings.h | |||
@@ -60,6 +60,8 @@ struct Values { | |||
60 | Settings::Category::Overlay}; | 60 | Settings::Category::Overlay}; |
61 | Settings::Setting<bool> show_performance_overlay{linkage, true, "show_performance_overlay", | 61 | Settings::Setting<bool> show_performance_overlay{linkage, true, "show_performance_overlay", |
62 | Settings::Category::Overlay}; | 62 | Settings::Category::Overlay}; |
63 | Settings::Setting<bool> show_thermal_overlay{linkage, false, "show_thermal_overlay", | ||
64 | Settings::Category::Overlay}; | ||
63 | Settings::Setting<bool> show_input_overlay{linkage, true, "show_input_overlay", | 65 | Settings::Setting<bool> show_input_overlay{linkage, true, "show_input_overlay", |
64 | Settings::Category::Overlay}; | 66 | Settings::Category::Overlay}; |
65 | Settings::Setting<bool> touchscreen{linkage, true, "touchscreen", Settings::Category::Overlay}; | 67 | Settings::Setting<bool> touchscreen{linkage, true, "touchscreen", Settings::Category::Overlay}; |
diff --git a/src/android/app/src/main/jni/emu_window/emu_window.cpp b/src/android/app/src/main/jni/emu_window/emu_window.cpp index c4f631924..c927cddda 100755 --- a/src/android/app/src/main/jni/emu_window/emu_window.cpp +++ b/src/android/app/src/main/jni/emu_window/emu_window.cpp | |||
@@ -3,6 +3,7 @@ | |||
3 | 3 | ||
4 | #include <android/native_window_jni.h> | 4 | #include <android/native_window_jni.h> |
5 | 5 | ||
6 | #include "common/android/id_cache.h" | ||
6 | #include "common/logging/log.h" | 7 | #include "common/logging/log.h" |
7 | #include "input_common/drivers/touch_screen.h" | 8 | #include "input_common/drivers/touch_screen.h" |
8 | #include "input_common/drivers/virtual_amiibo.h" | 9 | #include "input_common/drivers/virtual_amiibo.h" |
@@ -60,7 +61,8 @@ void EmuWindow_Android::OnRemoveNfcTag() { | |||
60 | 61 | ||
61 | void EmuWindow_Android::OnFrameDisplayed() { | 62 | void EmuWindow_Android::OnFrameDisplayed() { |
62 | if (!m_first_frame) { | 63 | if (!m_first_frame) { |
63 | EmulationSession::GetInstance().OnEmulationStarted(); | 64 | Common::Android::RunJNIOnFiber<void>( |
65 | [&](JNIEnv* env) { EmulationSession::GetInstance().OnEmulationStarted(); }); | ||
64 | m_first_frame = true; | 66 | m_first_frame = true; |
65 | } | 67 | } |
66 | } | 68 | } |
diff --git a/src/android/app/src/main/jni/game_metadata.cpp b/src/android/app/src/main/jni/game_metadata.cpp index 8f0da1413..c33763b47 100755 --- a/src/android/app/src/main/jni/game_metadata.cpp +++ b/src/android/app/src/main/jni/game_metadata.cpp | |||
@@ -1,13 +1,12 @@ | |||
1 | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project | 1 | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project |
2 | // SPDX-License-Identifier: GPL-2.0-or-later | 2 | // SPDX-License-Identifier: GPL-2.0-or-later |
3 | 3 | ||
4 | #include "common/android/android_common.h" | ||
4 | #include "core/core.h" | 5 | #include "core/core.h" |
5 | #include "core/file_sys/fs_filesystem.h" | 6 | #include "core/file_sys/fs_filesystem.h" |
6 | #include "core/file_sys/patch_manager.h" | 7 | #include "core/file_sys/patch_manager.h" |
7 | #include "core/loader/loader.h" | 8 | #include "core/loader/loader.h" |
8 | #include "core/loader/nro.h" | 9 | #include "core/loader/nro.h" |
9 | #include "jni.h" | ||
10 | #include "jni/android_common/android_common.h" | ||
11 | #include "native.h" | 10 | #include "native.h" |
12 | 11 | ||
13 | struct RomMetadata { | 12 | struct RomMetadata { |
@@ -79,7 +78,7 @@ extern "C" { | |||
79 | jboolean Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getIsValid(JNIEnv* env, jobject obj, | 78 | jboolean Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getIsValid(JNIEnv* env, jobject obj, |
80 | jstring jpath) { | 79 | jstring jpath) { |
81 | const auto file = EmulationSession::GetInstance().System().GetFilesystem()->OpenFile( | 80 | const auto file = EmulationSession::GetInstance().System().GetFilesystem()->OpenFile( |
82 | GetJString(env, jpath), FileSys::OpenMode::Read); | 81 | Common::Android::GetJString(env, jpath), FileSys::OpenMode::Read); |
83 | if (!file) { | 82 | if (!file) { |
84 | return false; | 83 | return false; |
85 | } | 84 | } |
@@ -104,27 +103,31 @@ jboolean Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getIsValid(JNIEnv* env, jobj | |||
104 | 103 | ||
105 | jstring Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getTitle(JNIEnv* env, jobject obj, | 104 | jstring Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getTitle(JNIEnv* env, jobject obj, |
106 | jstring jpath) { | 105 | jstring jpath) { |
107 | return ToJString(env, GetRomMetadata(GetJString(env, jpath)).title); | 106 | return Common::Android::ToJString( |
107 | env, GetRomMetadata(Common::Android::GetJString(env, jpath)).title); | ||
108 | } | 108 | } |
109 | 109 | ||
110 | jstring Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getProgramId(JNIEnv* env, jobject obj, | 110 | jstring Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getProgramId(JNIEnv* env, jobject obj, |
111 | jstring jpath) { | 111 | jstring jpath) { |
112 | return ToJString(env, std::to_string(GetRomMetadata(GetJString(env, jpath)).programId)); | 112 | return Common::Android::ToJString( |
113 | env, std::to_string(GetRomMetadata(Common::Android::GetJString(env, jpath)).programId)); | ||
113 | } | 114 | } |
114 | 115 | ||
115 | jstring Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getDeveloper(JNIEnv* env, jobject obj, | 116 | jstring Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getDeveloper(JNIEnv* env, jobject obj, |
116 | jstring jpath) { | 117 | jstring jpath) { |
117 | return ToJString(env, GetRomMetadata(GetJString(env, jpath)).developer); | 118 | return Common::Android::ToJString( |
119 | env, GetRomMetadata(Common::Android::GetJString(env, jpath)).developer); | ||
118 | } | 120 | } |
119 | 121 | ||
120 | jstring Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getVersion(JNIEnv* env, jobject obj, | 122 | jstring Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getVersion(JNIEnv* env, jobject obj, |
121 | jstring jpath, jboolean jreload) { | 123 | jstring jpath, jboolean jreload) { |
122 | return ToJString(env, GetRomMetadata(GetJString(env, jpath), jreload).version); | 124 | return Common::Android::ToJString( |
125 | env, GetRomMetadata(Common::Android::GetJString(env, jpath), jreload).version); | ||
123 | } | 126 | } |
124 | 127 | ||
125 | jbyteArray Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getIcon(JNIEnv* env, jobject obj, | 128 | jbyteArray Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getIcon(JNIEnv* env, jobject obj, |
126 | jstring jpath) { | 129 | jstring jpath) { |
127 | auto icon_data = GetRomMetadata(GetJString(env, jpath)).icon; | 130 | auto icon_data = GetRomMetadata(Common::Android::GetJString(env, jpath)).icon; |
128 | jbyteArray icon = env->NewByteArray(static_cast<jsize>(icon_data.size())); | 131 | jbyteArray icon = env->NewByteArray(static_cast<jsize>(icon_data.size())); |
129 | env->SetByteArrayRegion(icon, 0, env->GetArrayLength(icon), | 132 | env->SetByteArrayRegion(icon, 0, env->GetArrayLength(icon), |
130 | reinterpret_cast<jbyte*>(icon_data.data())); | 133 | reinterpret_cast<jbyte*>(icon_data.data())); |
@@ -133,7 +136,8 @@ jbyteArray Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getIcon(JNIEnv* env, jobje | |||
133 | 136 | ||
134 | jboolean Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getIsHomebrew(JNIEnv* env, jobject obj, | 137 | jboolean Java_org_yuzu_yuzu_1emu_utils_GameMetadata_getIsHomebrew(JNIEnv* env, jobject obj, |
135 | jstring jpath) { | 138 | jstring jpath) { |
136 | return static_cast<jboolean>(GetRomMetadata(GetJString(env, jpath)).isHomebrew); | 139 | return static_cast<jboolean>( |
140 | GetRomMetadata(Common::Android::GetJString(env, jpath)).isHomebrew); | ||
137 | } | 141 | } |
138 | 142 | ||
139 | void Java_org_yuzu_yuzu_1emu_utils_GameMetadata_resetMetadata(JNIEnv* env, jobject obj) { | 143 | void Java_org_yuzu_yuzu_1emu_utils_GameMetadata_resetMetadata(JNIEnv* env, jobject obj) { |
diff --git a/src/android/app/src/main/jni/native.cpp b/src/android/app/src/main/jni/native.cpp index 654510129..4acc60956 100755 --- a/src/android/app/src/main/jni/native.cpp +++ b/src/android/app/src/main/jni/native.cpp | |||
@@ -20,6 +20,8 @@ | |||
20 | #include <frontend_common/content_manager.h> | 20 | #include <frontend_common/content_manager.h> |
21 | #include <jni.h> | 21 | #include <jni.h> |
22 | 22 | ||
23 | #include "common/android/android_common.h" | ||
24 | #include "common/android/id_cache.h" | ||
23 | #include "common/detached_tasks.h" | 25 | #include "common/detached_tasks.h" |
24 | #include "common/dynamic_library.h" | 26 | #include "common/dynamic_library.h" |
25 | #include "common/fs/path_util.h" | 27 | #include "common/fs/path_util.h" |
@@ -57,8 +59,6 @@ | |||
57 | #include "hid_core/frontend/emulated_controller.h" | 59 | #include "hid_core/frontend/emulated_controller.h" |
58 | #include "hid_core/hid_core.h" | 60 | #include "hid_core/hid_core.h" |
59 | #include "hid_core/hid_types.h" | 61 | #include "hid_core/hid_types.h" |
60 | #include "jni/android_common/android_common.h" | ||
61 | #include "jni/id_cache.h" | ||
62 | #include "jni/native.h" | 62 | #include "jni/native.h" |
63 | #include "video_core/renderer_base.h" | 63 | #include "video_core/renderer_base.h" |
64 | #include "video_core/renderer_vulkan/renderer_vulkan.h" | 64 | #include "video_core/renderer_vulkan/renderer_vulkan.h" |
@@ -228,7 +228,7 @@ Core::SystemResultStatus EmulationSession::InitializeEmulation(const std::string | |||
228 | std::make_unique<EmuWindow_Android>(&m_input_subsystem, m_native_window, m_vulkan_library); | 228 | std::make_unique<EmuWindow_Android>(&m_input_subsystem, m_native_window, m_vulkan_library); |
229 | 229 | ||
230 | // Initialize system. | 230 | // Initialize system. |
231 | jauto android_keyboard = std::make_unique<SoftwareKeyboard::AndroidKeyboard>(); | 231 | jauto android_keyboard = std::make_unique<Common::Android::SoftwareKeyboard::AndroidKeyboard>(); |
232 | m_software_keyboard = android_keyboard.get(); | 232 | m_software_keyboard = android_keyboard.get(); |
233 | m_system.SetShuttingDown(false); | 233 | m_system.SetShuttingDown(false); |
234 | m_system.ApplySettings(); | 234 | m_system.ApplySettings(); |
@@ -411,37 +411,39 @@ void EmulationSession::OnGamepadDisconnectEvent([[maybe_unused]] int index) { | |||
411 | controller->Disconnect(); | 411 | controller->Disconnect(); |
412 | } | 412 | } |
413 | 413 | ||
414 | SoftwareKeyboard::AndroidKeyboard* EmulationSession::SoftwareKeyboard() { | 414 | Common::Android::SoftwareKeyboard::AndroidKeyboard* EmulationSession::SoftwareKeyboard() { |
415 | return m_software_keyboard; | 415 | return m_software_keyboard; |
416 | } | 416 | } |
417 | 417 | ||
418 | void EmulationSession::LoadDiskCacheProgress(VideoCore::LoadCallbackStage stage, int progress, | 418 | void EmulationSession::LoadDiskCacheProgress(VideoCore::LoadCallbackStage stage, int progress, |
419 | int max) { | 419 | int max) { |
420 | JNIEnv* env = IDCache::GetEnvForThread(); | 420 | JNIEnv* env = Common::Android::GetEnvForThread(); |
421 | env->CallStaticVoidMethod(IDCache::GetDiskCacheProgressClass(), | 421 | env->CallStaticVoidMethod(Common::Android::GetDiskCacheProgressClass(), |
422 | IDCache::GetDiskCacheLoadProgress(), static_cast<jint>(stage), | 422 | Common::Android::GetDiskCacheLoadProgress(), static_cast<jint>(stage), |
423 | static_cast<jint>(progress), static_cast<jint>(max)); | 423 | static_cast<jint>(progress), static_cast<jint>(max)); |
424 | } | 424 | } |
425 | 425 | ||
426 | void EmulationSession::OnEmulationStarted() { | 426 | void EmulationSession::OnEmulationStarted() { |
427 | JNIEnv* env = IDCache::GetEnvForThread(); | 427 | JNIEnv* env = Common::Android::GetEnvForThread(); |
428 | env->CallStaticVoidMethod(IDCache::GetNativeLibraryClass(), IDCache::GetOnEmulationStarted()); | 428 | env->CallStaticVoidMethod(Common::Android::GetNativeLibraryClass(), |
429 | Common::Android::GetOnEmulationStarted()); | ||
429 | } | 430 | } |
430 | 431 | ||
431 | void EmulationSession::OnEmulationStopped(Core::SystemResultStatus result) { | 432 | void EmulationSession::OnEmulationStopped(Core::SystemResultStatus result) { |
432 | JNIEnv* env = IDCache::GetEnvForThread(); | 433 | JNIEnv* env = Common::Android::GetEnvForThread(); |
433 | env->CallStaticVoidMethod(IDCache::GetNativeLibraryClass(), IDCache::GetOnEmulationStopped(), | 434 | env->CallStaticVoidMethod(Common::Android::GetNativeLibraryClass(), |
434 | static_cast<jint>(result)); | 435 | Common::Android::GetOnEmulationStopped(), static_cast<jint>(result)); |
435 | } | 436 | } |
436 | 437 | ||
437 | void EmulationSession::ChangeProgram(std::size_t program_index) { | 438 | void EmulationSession::ChangeProgram(std::size_t program_index) { |
438 | JNIEnv* env = IDCache::GetEnvForThread(); | 439 | JNIEnv* env = Common::Android::GetEnvForThread(); |
439 | env->CallStaticVoidMethod(IDCache::GetNativeLibraryClass(), IDCache::GetOnProgramChanged(), | 440 | env->CallStaticVoidMethod(Common::Android::GetNativeLibraryClass(), |
441 | Common::Android::GetOnProgramChanged(), | ||
440 | static_cast<jint>(program_index)); | 442 | static_cast<jint>(program_index)); |
441 | } | 443 | } |
442 | 444 | ||
443 | u64 EmulationSession::GetProgramId(JNIEnv* env, jstring jprogramId) { | 445 | u64 EmulationSession::GetProgramId(JNIEnv* env, jstring jprogramId) { |
444 | auto program_id_string = GetJString(env, jprogramId); | 446 | auto program_id_string = Common::Android::GetJString(env, jprogramId); |
445 | try { | 447 | try { |
446 | return std::stoull(program_id_string); | 448 | return std::stoull(program_id_string); |
447 | } catch (...) { | 449 | } catch (...) { |
@@ -491,7 +493,7 @@ void Java_org_yuzu_yuzu_1emu_NativeLibrary_surfaceDestroyed(JNIEnv* env, jobject | |||
491 | 493 | ||
492 | void Java_org_yuzu_yuzu_1emu_NativeLibrary_setAppDirectory(JNIEnv* env, jobject instance, | 494 | void Java_org_yuzu_yuzu_1emu_NativeLibrary_setAppDirectory(JNIEnv* env, jobject instance, |
493 | [[maybe_unused]] jstring j_directory) { | 495 | [[maybe_unused]] jstring j_directory) { |
494 | Common::FS::SetAppDirectory(GetJString(env, j_directory)); | 496 | Common::FS::SetAppDirectory(Common::Android::GetJString(env, j_directory)); |
495 | } | 497 | } |
496 | 498 | ||
497 | int Java_org_yuzu_yuzu_1emu_NativeLibrary_installFileToNand(JNIEnv* env, jobject instance, | 499 | int Java_org_yuzu_yuzu_1emu_NativeLibrary_installFileToNand(JNIEnv* env, jobject instance, |
@@ -501,21 +503,22 @@ int Java_org_yuzu_yuzu_1emu_NativeLibrary_installFileToNand(JNIEnv* env, jobject | |||
501 | jlambdaClass, "invoke", "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;"); | 503 | jlambdaClass, "invoke", "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;"); |
502 | const auto callback = [env, jcallback, jlambdaInvokeMethod](size_t max, size_t progress) { | 504 | const auto callback = [env, jcallback, jlambdaInvokeMethod](size_t max, size_t progress) { |
503 | auto jwasCancelled = env->CallObjectMethod(jcallback, jlambdaInvokeMethod, | 505 | auto jwasCancelled = env->CallObjectMethod(jcallback, jlambdaInvokeMethod, |
504 | ToJDouble(env, max), ToJDouble(env, progress)); | 506 | Common::Android::ToJDouble(env, max), |
505 | return GetJBoolean(env, jwasCancelled); | 507 | Common::Android::ToJDouble(env, progress)); |
508 | return Common::Android::GetJBoolean(env, jwasCancelled); | ||
506 | }; | 509 | }; |
507 | 510 | ||
508 | return static_cast<int>( | 511 | return static_cast<int>( |
509 | ContentManager::InstallNSP(EmulationSession::GetInstance().System(), | 512 | ContentManager::InstallNSP(EmulationSession::GetInstance().System(), |
510 | *EmulationSession::GetInstance().System().GetFilesystem(), | 513 | *EmulationSession::GetInstance().System().GetFilesystem(), |
511 | GetJString(env, j_file), callback)); | 514 | Common::Android::GetJString(env, j_file), callback)); |
512 | } | 515 | } |
513 | 516 | ||
514 | jboolean Java_org_yuzu_yuzu_1emu_NativeLibrary_doesUpdateMatchProgram(JNIEnv* env, jobject jobj, | 517 | jboolean Java_org_yuzu_yuzu_1emu_NativeLibrary_doesUpdateMatchProgram(JNIEnv* env, jobject jobj, |
515 | jstring jprogramId, | 518 | jstring jprogramId, |
516 | jstring jupdatePath) { | 519 | jstring jupdatePath) { |
517 | u64 program_id = EmulationSession::GetProgramId(env, jprogramId); | 520 | u64 program_id = EmulationSession::GetProgramId(env, jprogramId); |
518 | std::string updatePath = GetJString(env, jupdatePath); | 521 | std::string updatePath = Common::Android::GetJString(env, jupdatePath); |
519 | std::shared_ptr<FileSys::NSP> nsp = std::make_shared<FileSys::NSP>( | 522 | std::shared_ptr<FileSys::NSP> nsp = std::make_shared<FileSys::NSP>( |
520 | EmulationSession::GetInstance().System().GetFilesystem()->OpenFile( | 523 | EmulationSession::GetInstance().System().GetFilesystem()->OpenFile( |
521 | updatePath, FileSys::OpenMode::Read)); | 524 | updatePath, FileSys::OpenMode::Read)); |
@@ -538,8 +541,10 @@ void JNICALL Java_org_yuzu_yuzu_1emu_NativeLibrary_initializeGpuDriver(JNIEnv* e | |||
538 | jstring custom_driver_name, | 541 | jstring custom_driver_name, |
539 | jstring file_redirect_dir) { | 542 | jstring file_redirect_dir) { |
540 | EmulationSession::GetInstance().InitializeGpuDriver( | 543 | EmulationSession::GetInstance().InitializeGpuDriver( |
541 | GetJString(env, hook_lib_dir), GetJString(env, custom_driver_dir), | 544 | Common::Android::GetJString(env, hook_lib_dir), |
542 | GetJString(env, custom_driver_name), GetJString(env, file_redirect_dir)); | 545 | Common::Android::GetJString(env, custom_driver_dir), |
546 | Common::Android::GetJString(env, custom_driver_name), | ||
547 | Common::Android::GetJString(env, file_redirect_dir)); | ||
543 | } | 548 | } |
544 | 549 | ||
545 | [[maybe_unused]] static bool CheckKgslPresent() { | 550 | [[maybe_unused]] static bool CheckKgslPresent() { |
@@ -566,7 +571,7 @@ jobjectArray Java_org_yuzu_yuzu_1emu_utils_GpuDriverHelper_getSystemDriverInfo( | |||
566 | JNIEnv* env, jobject j_obj, jobject j_surf, jstring j_hook_lib_dir) { | 571 | JNIEnv* env, jobject j_obj, jobject j_surf, jstring j_hook_lib_dir) { |
567 | const char* file_redirect_dir_{}; | 572 | const char* file_redirect_dir_{}; |
568 | int featureFlags{}; | 573 | int featureFlags{}; |
569 | std::string hook_lib_dir = GetJString(env, j_hook_lib_dir); | 574 | std::string hook_lib_dir = Common::Android::GetJString(env, j_hook_lib_dir); |
570 | auto handle = adrenotools_open_libvulkan(RTLD_NOW, featureFlags, nullptr, hook_lib_dir.c_str(), | 575 | auto handle = adrenotools_open_libvulkan(RTLD_NOW, featureFlags, nullptr, hook_lib_dir.c_str(), |
571 | nullptr, nullptr, file_redirect_dir_, nullptr); | 576 | nullptr, nullptr, file_redirect_dir_, nullptr); |
572 | auto driver_library = std::make_shared<Common::DynamicLibrary>(handle); | 577 | auto driver_library = std::make_shared<Common::DynamicLibrary>(handle); |
@@ -587,9 +592,10 @@ jobjectArray Java_org_yuzu_yuzu_1emu_utils_GpuDriverHelper_getSystemDriverInfo( | |||
587 | fmt::format("{}.{}.{}", VK_API_VERSION_MAJOR(driver_version), | 592 | fmt::format("{}.{}.{}", VK_API_VERSION_MAJOR(driver_version), |
588 | VK_API_VERSION_MINOR(driver_version), VK_API_VERSION_PATCH(driver_version)); | 593 | VK_API_VERSION_MINOR(driver_version), VK_API_VERSION_PATCH(driver_version)); |
589 | 594 | ||
590 | jobjectArray j_driver_info = | 595 | jobjectArray j_driver_info = env->NewObjectArray( |
591 | env->NewObjectArray(2, IDCache::GetStringClass(), ToJString(env, version_string)); | 596 | 2, Common::Android::GetStringClass(), Common::Android::ToJString(env, version_string)); |
592 | env->SetObjectArrayElement(j_driver_info, 1, ToJString(env, device.GetDriverName())); | 597 | env->SetObjectArrayElement(j_driver_info, 1, |
598 | Common::Android::ToJString(env, device.GetDriverName())); | ||
593 | return j_driver_info; | 599 | return j_driver_info; |
594 | } | 600 | } |
595 | 601 | ||
@@ -742,15 +748,15 @@ jdoubleArray Java_org_yuzu_yuzu_1emu_NativeLibrary_getPerfStats(JNIEnv* env, jcl | |||
742 | 748 | ||
743 | jstring Java_org_yuzu_yuzu_1emu_NativeLibrary_getCpuBackend(JNIEnv* env, jclass clazz) { | 749 | jstring Java_org_yuzu_yuzu_1emu_NativeLibrary_getCpuBackend(JNIEnv* env, jclass clazz) { |
744 | if (Settings::IsNceEnabled()) { | 750 | if (Settings::IsNceEnabled()) { |
745 | return ToJString(env, "NCE"); | 751 | return Common::Android::ToJString(env, "NCE"); |
746 | } | 752 | } |
747 | 753 | ||
748 | return ToJString(env, "JIT"); | 754 | return Common::Android::ToJString(env, "JIT"); |
749 | } | 755 | } |
750 | 756 | ||
751 | jstring Java_org_yuzu_yuzu_1emu_NativeLibrary_getGpuDriver(JNIEnv* env, jobject jobj) { | 757 | jstring Java_org_yuzu_yuzu_1emu_NativeLibrary_getGpuDriver(JNIEnv* env, jobject jobj) { |
752 | return ToJString(env, | 758 | return Common::Android::ToJString( |
753 | EmulationSession::GetInstance().System().GPU().Renderer().GetDeviceVendor()); | 759 | env, EmulationSession::GetInstance().System().GPU().Renderer().GetDeviceVendor()); |
754 | } | 760 | } |
755 | 761 | ||
756 | void Java_org_yuzu_yuzu_1emu_NativeLibrary_applySettings(JNIEnv* env, jobject jobj) { | 762 | void Java_org_yuzu_yuzu_1emu_NativeLibrary_applySettings(JNIEnv* env, jobject jobj) { |
@@ -764,13 +770,14 @@ void Java_org_yuzu_yuzu_1emu_NativeLibrary_logSettings(JNIEnv* env, jobject jobj | |||
764 | void Java_org_yuzu_yuzu_1emu_NativeLibrary_run(JNIEnv* env, jobject jobj, jstring j_path, | 770 | void Java_org_yuzu_yuzu_1emu_NativeLibrary_run(JNIEnv* env, jobject jobj, jstring j_path, |
765 | jint j_program_index, | 771 | jint j_program_index, |
766 | jboolean j_frontend_initiated) { | 772 | jboolean j_frontend_initiated) { |
767 | const std::string path = GetJString(env, j_path); | 773 | const std::string path = Common::Android::GetJString(env, j_path); |
768 | 774 | ||
769 | const Core::SystemResultStatus result{ | 775 | const Core::SystemResultStatus result{ |
770 | RunEmulation(path, j_program_index, j_frontend_initiated)}; | 776 | RunEmulation(path, j_program_index, j_frontend_initiated)}; |
771 | if (result != Core::SystemResultStatus::Success) { | 777 | if (result != Core::SystemResultStatus::Success) { |
772 | env->CallStaticVoidMethod(IDCache::GetNativeLibraryClass(), | 778 | env->CallStaticVoidMethod(Common::Android::GetNativeLibraryClass(), |
773 | IDCache::GetExitEmulationActivity(), static_cast<int>(result)); | 779 | Common::Android::GetExitEmulationActivity(), |
780 | static_cast<int>(result)); | ||
774 | } | 781 | } |
775 | } | 782 | } |
776 | 783 | ||
@@ -781,7 +788,7 @@ void Java_org_yuzu_yuzu_1emu_NativeLibrary_logDeviceInfo(JNIEnv* env, jclass cla | |||
781 | 788 | ||
782 | void Java_org_yuzu_yuzu_1emu_NativeLibrary_submitInlineKeyboardText(JNIEnv* env, jclass clazz, | 789 | void Java_org_yuzu_yuzu_1emu_NativeLibrary_submitInlineKeyboardText(JNIEnv* env, jclass clazz, |
783 | jstring j_text) { | 790 | jstring j_text) { |
784 | const std::u16string input = Common::UTF8ToUTF16(GetJString(env, j_text)); | 791 | const std::u16string input = Common::UTF8ToUTF16(Common::Android::GetJString(env, j_text)); |
785 | EmulationSession::GetInstance().SoftwareKeyboard()->SubmitInlineKeyboardText(input); | 792 | EmulationSession::GetInstance().SoftwareKeyboard()->SubmitInlineKeyboardText(input); |
786 | } | 793 | } |
787 | 794 | ||
@@ -815,16 +822,16 @@ jstring Java_org_yuzu_yuzu_1emu_NativeLibrary_getAppletLaunchPath(JNIEnv* env, j | |||
815 | auto bis_system = | 822 | auto bis_system = |
816 | EmulationSession::GetInstance().System().GetFileSystemController().GetSystemNANDContents(); | 823 | EmulationSession::GetInstance().System().GetFileSystemController().GetSystemNANDContents(); |
817 | if (!bis_system) { | 824 | if (!bis_system) { |
818 | return ToJString(env, ""); | 825 | return Common::Android::ToJString(env, ""); |
819 | } | 826 | } |
820 | 827 | ||
821 | auto applet_nca = | 828 | auto applet_nca = |
822 | bis_system->GetEntry(static_cast<u64>(jid), FileSys::ContentRecordType::Program); | 829 | bis_system->GetEntry(static_cast<u64>(jid), FileSys::ContentRecordType::Program); |
823 | if (!applet_nca) { | 830 | if (!applet_nca) { |
824 | return ToJString(env, ""); | 831 | return Common::Android::ToJString(env, ""); |
825 | } | 832 | } |
826 | 833 | ||
827 | return ToJString(env, applet_nca->GetFullPath()); | 834 | return Common::Android::ToJString(env, applet_nca->GetFullPath()); |
828 | } | 835 | } |
829 | 836 | ||
830 | void Java_org_yuzu_yuzu_1emu_NativeLibrary_setCurrentAppletId(JNIEnv* env, jclass clazz, | 837 | void Java_org_yuzu_yuzu_1emu_NativeLibrary_setCurrentAppletId(JNIEnv* env, jclass clazz, |
@@ -857,7 +864,7 @@ jboolean Java_org_yuzu_yuzu_1emu_NativeLibrary_isFirmwareAvailable(JNIEnv* env, | |||
857 | jobjectArray Java_org_yuzu_yuzu_1emu_NativeLibrary_getPatchesForFile(JNIEnv* env, jobject jobj, | 864 | jobjectArray Java_org_yuzu_yuzu_1emu_NativeLibrary_getPatchesForFile(JNIEnv* env, jobject jobj, |
858 | jstring jpath, | 865 | jstring jpath, |
859 | jstring jprogramId) { | 866 | jstring jprogramId) { |
860 | const auto path = GetJString(env, jpath); | 867 | const auto path = Common::Android::GetJString(env, jpath); |
861 | const auto vFile = | 868 | const auto vFile = |
862 | Core::GetGameFileFromPath(EmulationSession::GetInstance().System().GetFilesystem(), path); | 869 | Core::GetGameFileFromPath(EmulationSession::GetInstance().System().GetFilesystem(), path); |
863 | if (vFile == nullptr) { | 870 | if (vFile == nullptr) { |
@@ -875,14 +882,15 @@ jobjectArray Java_org_yuzu_yuzu_1emu_NativeLibrary_getPatchesForFile(JNIEnv* env | |||
875 | 882 | ||
876 | auto patches = pm.GetPatches(update_raw); | 883 | auto patches = pm.GetPatches(update_raw); |
877 | jobjectArray jpatchArray = | 884 | jobjectArray jpatchArray = |
878 | env->NewObjectArray(patches.size(), IDCache::GetPatchClass(), nullptr); | 885 | env->NewObjectArray(patches.size(), Common::Android::GetPatchClass(), nullptr); |
879 | int i = 0; | 886 | int i = 0; |
880 | for (const auto& patch : patches) { | 887 | for (const auto& patch : patches) { |
881 | jobject jpatch = env->NewObject( | 888 | jobject jpatch = env->NewObject( |
882 | IDCache::GetPatchClass(), IDCache::GetPatchConstructor(), patch.enabled, | 889 | Common::Android::GetPatchClass(), Common::Android::GetPatchConstructor(), patch.enabled, |
883 | ToJString(env, patch.name), ToJString(env, patch.version), | 890 | Common::Android::ToJString(env, patch.name), |
884 | static_cast<jint>(patch.type), ToJString(env, std::to_string(patch.program_id)), | 891 | Common::Android::ToJString(env, patch.version), static_cast<jint>(patch.type), |
885 | ToJString(env, std::to_string(patch.title_id))); | 892 | Common::Android::ToJString(env, std::to_string(patch.program_id)), |
893 | Common::Android::ToJString(env, std::to_string(patch.title_id))); | ||
886 | env->SetObjectArrayElement(jpatchArray, i, jpatch); | 894 | env->SetObjectArrayElement(jpatchArray, i, jpatch); |
887 | ++i; | 895 | ++i; |
888 | } | 896 | } |
@@ -906,7 +914,7 @@ void Java_org_yuzu_yuzu_1emu_NativeLibrary_removeMod(JNIEnv* env, jobject jobj, | |||
906 | jstring jname) { | 914 | jstring jname) { |
907 | auto program_id = EmulationSession::GetProgramId(env, jprogramId); | 915 | auto program_id = EmulationSession::GetProgramId(env, jprogramId); |
908 | ContentManager::RemoveMod(EmulationSession::GetInstance().System().GetFileSystemController(), | 916 | ContentManager::RemoveMod(EmulationSession::GetInstance().System().GetFileSystemController(), |
909 | program_id, GetJString(env, jname)); | 917 | program_id, Common::Android::GetJString(env, jname)); |
910 | } | 918 | } |
911 | 919 | ||
912 | jobjectArray Java_org_yuzu_yuzu_1emu_NativeLibrary_verifyInstalledContents(JNIEnv* env, | 920 | jobjectArray Java_org_yuzu_yuzu_1emu_NativeLibrary_verifyInstalledContents(JNIEnv* env, |
@@ -917,17 +925,18 @@ jobjectArray Java_org_yuzu_yuzu_1emu_NativeLibrary_verifyInstalledContents(JNIEn | |||
917 | jlambdaClass, "invoke", "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;"); | 925 | jlambdaClass, "invoke", "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;"); |
918 | const auto callback = [env, jcallback, jlambdaInvokeMethod](size_t max, size_t progress) { | 926 | const auto callback = [env, jcallback, jlambdaInvokeMethod](size_t max, size_t progress) { |
919 | auto jwasCancelled = env->CallObjectMethod(jcallback, jlambdaInvokeMethod, | 927 | auto jwasCancelled = env->CallObjectMethod(jcallback, jlambdaInvokeMethod, |
920 | ToJDouble(env, max), ToJDouble(env, progress)); | 928 | Common::Android::ToJDouble(env, max), |
921 | return GetJBoolean(env, jwasCancelled); | 929 | Common::Android::ToJDouble(env, progress)); |
930 | return Common::Android::GetJBoolean(env, jwasCancelled); | ||
922 | }; | 931 | }; |
923 | 932 | ||
924 | auto& session = EmulationSession::GetInstance(); | 933 | auto& session = EmulationSession::GetInstance(); |
925 | std::vector<std::string> result = ContentManager::VerifyInstalledContents( | 934 | std::vector<std::string> result = ContentManager::VerifyInstalledContents( |
926 | session.System(), *session.GetContentProvider(), callback); | 935 | session.System(), *session.GetContentProvider(), callback); |
927 | jobjectArray jresult = | 936 | jobjectArray jresult = env->NewObjectArray(result.size(), Common::Android::GetStringClass(), |
928 | env->NewObjectArray(result.size(), IDCache::GetStringClass(), ToJString(env, "")); | 937 | Common::Android::ToJString(env, "")); |
929 | for (size_t i = 0; i < result.size(); ++i) { | 938 | for (size_t i = 0; i < result.size(); ++i) { |
930 | env->SetObjectArrayElement(jresult, i, ToJString(env, result[i])); | 939 | env->SetObjectArrayElement(jresult, i, Common::Android::ToJString(env, result[i])); |
931 | } | 940 | } |
932 | return jresult; | 941 | return jresult; |
933 | } | 942 | } |
@@ -939,19 +948,20 @@ jint Java_org_yuzu_yuzu_1emu_NativeLibrary_verifyGameContents(JNIEnv* env, jobje | |||
939 | jlambdaClass, "invoke", "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;"); | 948 | jlambdaClass, "invoke", "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;"); |
940 | const auto callback = [env, jcallback, jlambdaInvokeMethod](size_t max, size_t progress) { | 949 | const auto callback = [env, jcallback, jlambdaInvokeMethod](size_t max, size_t progress) { |
941 | auto jwasCancelled = env->CallObjectMethod(jcallback, jlambdaInvokeMethod, | 950 | auto jwasCancelled = env->CallObjectMethod(jcallback, jlambdaInvokeMethod, |
942 | ToJDouble(env, max), ToJDouble(env, progress)); | 951 | Common::Android::ToJDouble(env, max), |
943 | return GetJBoolean(env, jwasCancelled); | 952 | Common::Android::ToJDouble(env, progress)); |
953 | return Common::Android::GetJBoolean(env, jwasCancelled); | ||
944 | }; | 954 | }; |
945 | auto& session = EmulationSession::GetInstance(); | 955 | auto& session = EmulationSession::GetInstance(); |
946 | return static_cast<jint>( | 956 | return static_cast<jint>(ContentManager::VerifyGameContents( |
947 | ContentManager::VerifyGameContents(session.System(), GetJString(env, jpath), callback)); | 957 | session.System(), Common::Android::GetJString(env, jpath), callback)); |
948 | } | 958 | } |
949 | 959 | ||
950 | jstring Java_org_yuzu_yuzu_1emu_NativeLibrary_getSavePath(JNIEnv* env, jobject jobj, | 960 | jstring Java_org_yuzu_yuzu_1emu_NativeLibrary_getSavePath(JNIEnv* env, jobject jobj, |
951 | jstring jprogramId) { | 961 | jstring jprogramId) { |
952 | auto program_id = EmulationSession::GetProgramId(env, jprogramId); | 962 | auto program_id = EmulationSession::GetProgramId(env, jprogramId); |
953 | if (program_id == 0) { | 963 | if (program_id == 0) { |
954 | return ToJString(env, ""); | 964 | return Common::Android::ToJString(env, ""); |
955 | } | 965 | } |
956 | 966 | ||
957 | auto& system = EmulationSession::GetInstance().System(); | 967 | auto& system = EmulationSession::GetInstance().System(); |
@@ -968,7 +978,7 @@ jstring Java_org_yuzu_yuzu_1emu_NativeLibrary_getSavePath(JNIEnv* env, jobject j | |||
968 | const auto user_save_data_path = FileSys::SaveDataFactory::GetFullPath( | 978 | const auto user_save_data_path = FileSys::SaveDataFactory::GetFullPath( |
969 | {}, vfsNandDir, FileSys::SaveDataSpaceId::NandUser, FileSys::SaveDataType::SaveData, | 979 | {}, vfsNandDir, FileSys::SaveDataSpaceId::NandUser, FileSys::SaveDataType::SaveData, |
970 | program_id, user_id->AsU128(), 0); | 980 | program_id, user_id->AsU128(), 0); |
971 | return ToJString(env, user_save_data_path); | 981 | return Common::Android::ToJString(env, user_save_data_path); |
972 | } | 982 | } |
973 | 983 | ||
974 | jstring Java_org_yuzu_yuzu_1emu_NativeLibrary_getDefaultProfileSaveDataRoot(JNIEnv* env, | 984 | jstring Java_org_yuzu_yuzu_1emu_NativeLibrary_getDefaultProfileSaveDataRoot(JNIEnv* env, |
@@ -981,12 +991,13 @@ jstring Java_org_yuzu_yuzu_1emu_NativeLibrary_getDefaultProfileSaveDataRoot(JNIE | |||
981 | 991 | ||
982 | const auto user_save_data_root = | 992 | const auto user_save_data_root = |
983 | FileSys::SaveDataFactory::GetUserGameSaveDataRoot(user_id->AsU128(), jfuture); | 993 | FileSys::SaveDataFactory::GetUserGameSaveDataRoot(user_id->AsU128(), jfuture); |
984 | return ToJString(env, user_save_data_root); | 994 | return Common::Android::ToJString(env, user_save_data_root); |
985 | } | 995 | } |
986 | 996 | ||
987 | void Java_org_yuzu_yuzu_1emu_NativeLibrary_addFileToFilesystemProvider(JNIEnv* env, jobject jobj, | 997 | void Java_org_yuzu_yuzu_1emu_NativeLibrary_addFileToFilesystemProvider(JNIEnv* env, jobject jobj, |
988 | jstring jpath) { | 998 | jstring jpath) { |
989 | EmulationSession::GetInstance().ConfigureFilesystemProvider(GetJString(env, jpath)); | 999 | EmulationSession::GetInstance().ConfigureFilesystemProvider( |
1000 | Common::Android::GetJString(env, jpath)); | ||
990 | } | 1001 | } |
991 | 1002 | ||
992 | void Java_org_yuzu_yuzu_1emu_NativeLibrary_clearFilesystemProvider(JNIEnv* env, jobject jobj) { | 1003 | void Java_org_yuzu_yuzu_1emu_NativeLibrary_clearFilesystemProvider(JNIEnv* env, jobject jobj) { |
diff --git a/src/android/app/src/main/jni/native.h b/src/android/app/src/main/jni/native.h index e49d4e015..47936e305 100755 --- a/src/android/app/src/main/jni/native.h +++ b/src/android/app/src/main/jni/native.h | |||
@@ -2,13 +2,13 @@ | |||
2 | // SPDX-License-Identifier: GPL-2.0-or-later | 2 | // SPDX-License-Identifier: GPL-2.0-or-later |
3 | 3 | ||
4 | #include <android/native_window_jni.h> | 4 | #include <android/native_window_jni.h> |
5 | #include "common/android/applets/software_keyboard.h" | ||
5 | #include "common/detached_tasks.h" | 6 | #include "common/detached_tasks.h" |
6 | #include "core/core.h" | 7 | #include "core/core.h" |
7 | #include "core/file_sys/registered_cache.h" | 8 | #include "core/file_sys/registered_cache.h" |
8 | #include "core/hle/service/acc/profile_manager.h" | 9 | #include "core/hle/service/acc/profile_manager.h" |
9 | #include "core/perf_stats.h" | 10 | #include "core/perf_stats.h" |
10 | #include "frontend_common/content_manager.h" | 11 | #include "frontend_common/content_manager.h" |
11 | #include "jni/applets/software_keyboard.h" | ||
12 | #include "jni/emu_window/emu_window.h" | 12 | #include "jni/emu_window/emu_window.h" |
13 | #include "video_core/rasterizer_interface.h" | 13 | #include "video_core/rasterizer_interface.h" |
14 | 14 | ||
@@ -54,7 +54,7 @@ public: | |||
54 | void SetDeviceType([[maybe_unused]] int index, int type); | 54 | void SetDeviceType([[maybe_unused]] int index, int type); |
55 | void OnGamepadConnectEvent([[maybe_unused]] int index); | 55 | void OnGamepadConnectEvent([[maybe_unused]] int index); |
56 | void OnGamepadDisconnectEvent([[maybe_unused]] int index); | 56 | void OnGamepadDisconnectEvent([[maybe_unused]] int index); |
57 | SoftwareKeyboard::AndroidKeyboard* SoftwareKeyboard(); | 57 | Common::Android::SoftwareKeyboard::AndroidKeyboard* SoftwareKeyboard(); |
58 | 58 | ||
59 | static void OnEmulationStarted(); | 59 | static void OnEmulationStarted(); |
60 | 60 | ||
@@ -79,7 +79,7 @@ private: | |||
79 | Core::SystemResultStatus m_load_result{Core::SystemResultStatus::ErrorNotInitialized}; | 79 | Core::SystemResultStatus m_load_result{Core::SystemResultStatus::ErrorNotInitialized}; |
80 | std::atomic<bool> m_is_running = false; | 80 | std::atomic<bool> m_is_running = false; |
81 | std::atomic<bool> m_is_paused = false; | 81 | std::atomic<bool> m_is_paused = false; |
82 | SoftwareKeyboard::AndroidKeyboard* m_software_keyboard{}; | 82 | Common::Android::SoftwareKeyboard::AndroidKeyboard* m_software_keyboard{}; |
83 | std::unique_ptr<FileSys::ManualContentProvider> m_manual_provider; | 83 | std::unique_ptr<FileSys::ManualContentProvider> m_manual_provider; |
84 | int m_applet_id{1}; | 84 | int m_applet_id{1}; |
85 | 85 | ||
diff --git a/src/android/app/src/main/jni/native_config.cpp b/src/android/app/src/main/jni/native_config.cpp index c6c3343dc..8ae10fbc7 100755 --- a/src/android/app/src/main/jni/native_config.cpp +++ b/src/android/app/src/main/jni/native_config.cpp | |||
@@ -8,11 +8,11 @@ | |||
8 | 8 | ||
9 | #include "android_config.h" | 9 | #include "android_config.h" |
10 | #include "android_settings.h" | 10 | #include "android_settings.h" |
11 | #include "common/android/android_common.h" | ||
12 | #include "common/android/id_cache.h" | ||
11 | #include "common/logging/log.h" | 13 | #include "common/logging/log.h" |
12 | #include "common/settings.h" | 14 | #include "common/settings.h" |
13 | #include "frontend_common/config.h" | 15 | #include "frontend_common/config.h" |
14 | #include "jni/android_common/android_common.h" | ||
15 | #include "jni/id_cache.h" | ||
16 | #include "native.h" | 16 | #include "native.h" |
17 | 17 | ||
18 | std::unique_ptr<AndroidConfig> global_config; | 18 | std::unique_ptr<AndroidConfig> global_config; |
@@ -20,7 +20,7 @@ std::unique_ptr<AndroidConfig> per_game_config; | |||
20 | 20 | ||
21 | template <typename T> | 21 | template <typename T> |
22 | Settings::Setting<T>* getSetting(JNIEnv* env, jstring jkey) { | 22 | Settings::Setting<T>* getSetting(JNIEnv* env, jstring jkey) { |
23 | auto key = GetJString(env, jkey); | 23 | auto key = Common::Android::GetJString(env, jkey); |
24 | auto basic_setting = Settings::values.linkage.by_key[key]; | 24 | auto basic_setting = Settings::values.linkage.by_key[key]; |
25 | if (basic_setting != 0) { | 25 | if (basic_setting != 0) { |
26 | return static_cast<Settings::Setting<T>*>(basic_setting); | 26 | return static_cast<Settings::Setting<T>*>(basic_setting); |
@@ -55,7 +55,7 @@ void Java_org_yuzu_yuzu_1emu_utils_NativeConfig_initializePerGameConfig(JNIEnv* | |||
55 | jstring jprogramId, | 55 | jstring jprogramId, |
56 | jstring jfileName) { | 56 | jstring jfileName) { |
57 | auto program_id = EmulationSession::GetProgramId(env, jprogramId); | 57 | auto program_id = EmulationSession::GetProgramId(env, jprogramId); |
58 | auto file_name = GetJString(env, jfileName); | 58 | auto file_name = Common::Android::GetJString(env, jfileName); |
59 | const auto config_file_name = program_id == 0 ? file_name : fmt::format("{:016X}", program_id); | 59 | const auto config_file_name = program_id == 0 ? file_name : fmt::format("{:016X}", program_id); |
60 | per_game_config = | 60 | per_game_config = |
61 | std::make_unique<AndroidConfig>(config_file_name, Config::ConfigType::PerGameConfig); | 61 | std::make_unique<AndroidConfig>(config_file_name, Config::ConfigType::PerGameConfig); |
@@ -186,9 +186,9 @@ jstring Java_org_yuzu_yuzu_1emu_utils_NativeConfig_getString(JNIEnv* env, jobjec | |||
186 | jboolean needGlobal) { | 186 | jboolean needGlobal) { |
187 | auto setting = getSetting<std::string>(env, jkey); | 187 | auto setting = getSetting<std::string>(env, jkey); |
188 | if (setting == nullptr) { | 188 | if (setting == nullptr) { |
189 | return ToJString(env, ""); | 189 | return Common::Android::ToJString(env, ""); |
190 | } | 190 | } |
191 | return ToJString(env, setting->GetValue(static_cast<bool>(needGlobal))); | 191 | return Common::Android::ToJString(env, setting->GetValue(static_cast<bool>(needGlobal))); |
192 | } | 192 | } |
193 | 193 | ||
194 | void Java_org_yuzu_yuzu_1emu_utils_NativeConfig_setString(JNIEnv* env, jobject obj, jstring jkey, | 194 | void Java_org_yuzu_yuzu_1emu_utils_NativeConfig_setString(JNIEnv* env, jobject obj, jstring jkey, |
@@ -198,7 +198,7 @@ void Java_org_yuzu_yuzu_1emu_utils_NativeConfig_setString(JNIEnv* env, jobject o | |||
198 | return; | 198 | return; |
199 | } | 199 | } |
200 | 200 | ||
201 | setting->SetValue(GetJString(env, value)); | 201 | setting->SetValue(Common::Android::GetJString(env, value)); |
202 | } | 202 | } |
203 | 203 | ||
204 | jboolean Java_org_yuzu_yuzu_1emu_utils_NativeConfig_getIsRuntimeModifiable(JNIEnv* env, jobject obj, | 204 | jboolean Java_org_yuzu_yuzu_1emu_utils_NativeConfig_getIsRuntimeModifiable(JNIEnv* env, jobject obj, |
@@ -214,13 +214,13 @@ jstring Java_org_yuzu_yuzu_1emu_utils_NativeConfig_getPairedSettingKey(JNIEnv* e | |||
214 | jstring jkey) { | 214 | jstring jkey) { |
215 | auto setting = getSetting<std::string>(env, jkey); | 215 | auto setting = getSetting<std::string>(env, jkey); |
216 | if (setting == nullptr) { | 216 | if (setting == nullptr) { |
217 | return ToJString(env, ""); | 217 | return Common::Android::ToJString(env, ""); |
218 | } | 218 | } |
219 | if (setting->PairedSetting() == nullptr) { | 219 | if (setting->PairedSetting() == nullptr) { |
220 | return ToJString(env, ""); | 220 | return Common::Android::ToJString(env, ""); |
221 | } | 221 | } |
222 | 222 | ||
223 | return ToJString(env, setting->PairedSetting()->GetLabel()); | 223 | return Common::Android::ToJString(env, setting->PairedSetting()->GetLabel()); |
224 | } | 224 | } |
225 | 225 | ||
226 | jboolean Java_org_yuzu_yuzu_1emu_utils_NativeConfig_getIsSwitchable(JNIEnv* env, jobject obj, | 226 | jboolean Java_org_yuzu_yuzu_1emu_utils_NativeConfig_getIsSwitchable(JNIEnv* env, jobject obj, |
@@ -262,21 +262,21 @@ jstring Java_org_yuzu_yuzu_1emu_utils_NativeConfig_getDefaultToString(JNIEnv* en | |||
262 | jstring jkey) { | 262 | jstring jkey) { |
263 | auto setting = getSetting<std::string>(env, jkey); | 263 | auto setting = getSetting<std::string>(env, jkey); |
264 | if (setting != nullptr) { | 264 | if (setting != nullptr) { |
265 | return ToJString(env, setting->DefaultToString()); | 265 | return Common::Android::ToJString(env, setting->DefaultToString()); |
266 | } | 266 | } |
267 | return ToJString(env, ""); | 267 | return Common::Android::ToJString(env, ""); |
268 | } | 268 | } |
269 | 269 | ||
270 | jobjectArray Java_org_yuzu_yuzu_1emu_utils_NativeConfig_getGameDirs(JNIEnv* env, jobject obj) { | 270 | jobjectArray Java_org_yuzu_yuzu_1emu_utils_NativeConfig_getGameDirs(JNIEnv* env, jobject obj) { |
271 | jclass gameDirClass = IDCache::GetGameDirClass(); | 271 | jclass gameDirClass = Common::Android::GetGameDirClass(); |
272 | jmethodID gameDirConstructor = IDCache::GetGameDirConstructor(); | 272 | jmethodID gameDirConstructor = Common::Android::GetGameDirConstructor(); |
273 | jobjectArray jgameDirArray = | 273 | jobjectArray jgameDirArray = |
274 | env->NewObjectArray(AndroidSettings::values.game_dirs.size(), gameDirClass, nullptr); | 274 | env->NewObjectArray(AndroidSettings::values.game_dirs.size(), gameDirClass, nullptr); |
275 | for (size_t i = 0; i < AndroidSettings::values.game_dirs.size(); ++i) { | 275 | for (size_t i = 0; i < AndroidSettings::values.game_dirs.size(); ++i) { |
276 | jobject jgameDir = | 276 | jobject jgameDir = env->NewObject( |
277 | env->NewObject(gameDirClass, gameDirConstructor, | 277 | gameDirClass, gameDirConstructor, |
278 | ToJString(env, AndroidSettings::values.game_dirs[i].path), | 278 | Common::Android::ToJString(env, AndroidSettings::values.game_dirs[i].path), |
279 | static_cast<jboolean>(AndroidSettings::values.game_dirs[i].deep_scan)); | 279 | static_cast<jboolean>(AndroidSettings::values.game_dirs[i].deep_scan)); |
280 | env->SetObjectArrayElement(jgameDirArray, i, jgameDir); | 280 | env->SetObjectArrayElement(jgameDirArray, i, jgameDir); |
281 | } | 281 | } |
282 | return jgameDirArray; | 282 | return jgameDirArray; |
@@ -292,14 +292,14 @@ void Java_org_yuzu_yuzu_1emu_utils_NativeConfig_setGameDirs(JNIEnv* env, jobject | |||
292 | } | 292 | } |
293 | 293 | ||
294 | jobject dir = env->GetObjectArrayElement(gameDirs, 0); | 294 | jobject dir = env->GetObjectArrayElement(gameDirs, 0); |
295 | jclass gameDirClass = IDCache::GetGameDirClass(); | 295 | jclass gameDirClass = Common::Android::GetGameDirClass(); |
296 | jfieldID uriStringField = env->GetFieldID(gameDirClass, "uriString", "Ljava/lang/String;"); | 296 | jfieldID uriStringField = env->GetFieldID(gameDirClass, "uriString", "Ljava/lang/String;"); |
297 | jfieldID deepScanBooleanField = env->GetFieldID(gameDirClass, "deepScan", "Z"); | 297 | jfieldID deepScanBooleanField = env->GetFieldID(gameDirClass, "deepScan", "Z"); |
298 | for (int i = 0; i < size; ++i) { | 298 | for (int i = 0; i < size; ++i) { |
299 | dir = env->GetObjectArrayElement(gameDirs, i); | 299 | dir = env->GetObjectArrayElement(gameDirs, i); |
300 | jstring juriString = static_cast<jstring>(env->GetObjectField(dir, uriStringField)); | 300 | jstring juriString = static_cast<jstring>(env->GetObjectField(dir, uriStringField)); |
301 | jboolean jdeepScanBoolean = env->GetBooleanField(dir, deepScanBooleanField); | 301 | jboolean jdeepScanBoolean = env->GetBooleanField(dir, deepScanBooleanField); |
302 | std::string uriString = GetJString(env, juriString); | 302 | std::string uriString = Common::Android::GetJString(env, juriString); |
303 | AndroidSettings::values.game_dirs.push_back( | 303 | AndroidSettings::values.game_dirs.push_back( |
304 | AndroidSettings::GameDir{uriString, static_cast<bool>(jdeepScanBoolean)}); | 304 | AndroidSettings::GameDir{uriString, static_cast<bool>(jdeepScanBoolean)}); |
305 | } | 305 | } |
@@ -307,13 +307,13 @@ void Java_org_yuzu_yuzu_1emu_utils_NativeConfig_setGameDirs(JNIEnv* env, jobject | |||
307 | 307 | ||
308 | void Java_org_yuzu_yuzu_1emu_utils_NativeConfig_addGameDir(JNIEnv* env, jobject obj, | 308 | void Java_org_yuzu_yuzu_1emu_utils_NativeConfig_addGameDir(JNIEnv* env, jobject obj, |
309 | jobject gameDir) { | 309 | jobject gameDir) { |
310 | jclass gameDirClass = IDCache::GetGameDirClass(); | 310 | jclass gameDirClass = Common::Android::GetGameDirClass(); |
311 | jfieldID uriStringField = env->GetFieldID(gameDirClass, "uriString", "Ljava/lang/String;"); | 311 | jfieldID uriStringField = env->GetFieldID(gameDirClass, "uriString", "Ljava/lang/String;"); |
312 | jfieldID deepScanBooleanField = env->GetFieldID(gameDirClass, "deepScan", "Z"); | 312 | jfieldID deepScanBooleanField = env->GetFieldID(gameDirClass, "deepScan", "Z"); |
313 | 313 | ||
314 | jstring juriString = static_cast<jstring>(env->GetObjectField(gameDir, uriStringField)); | 314 | jstring juriString = static_cast<jstring>(env->GetObjectField(gameDir, uriStringField)); |
315 | jboolean jdeepScanBoolean = env->GetBooleanField(gameDir, deepScanBooleanField); | 315 | jboolean jdeepScanBoolean = env->GetBooleanField(gameDir, deepScanBooleanField); |
316 | std::string uriString = GetJString(env, juriString); | 316 | std::string uriString = Common::Android::GetJString(env, juriString); |
317 | AndroidSettings::values.game_dirs.push_back( | 317 | AndroidSettings::values.game_dirs.push_back( |
318 | AndroidSettings::GameDir{uriString, static_cast<bool>(jdeepScanBoolean)}); | 318 | AndroidSettings::GameDir{uriString, static_cast<bool>(jdeepScanBoolean)}); |
319 | } | 319 | } |
@@ -323,9 +323,11 @@ jobjectArray Java_org_yuzu_yuzu_1emu_utils_NativeConfig_getDisabledAddons(JNIEnv | |||
323 | auto program_id = EmulationSession::GetProgramId(env, jprogramId); | 323 | auto program_id = EmulationSession::GetProgramId(env, jprogramId); |
324 | auto& disabledAddons = Settings::values.disabled_addons[program_id]; | 324 | auto& disabledAddons = Settings::values.disabled_addons[program_id]; |
325 | jobjectArray jdisabledAddonsArray = | 325 | jobjectArray jdisabledAddonsArray = |
326 | env->NewObjectArray(disabledAddons.size(), IDCache::GetStringClass(), ToJString(env, "")); | 326 | env->NewObjectArray(disabledAddons.size(), Common::Android::GetStringClass(), |
327 | Common::Android::ToJString(env, "")); | ||
327 | for (size_t i = 0; i < disabledAddons.size(); ++i) { | 328 | for (size_t i = 0; i < disabledAddons.size(); ++i) { |
328 | env->SetObjectArrayElement(jdisabledAddonsArray, i, ToJString(env, disabledAddons[i])); | 329 | env->SetObjectArrayElement(jdisabledAddonsArray, i, |
330 | Common::Android::ToJString(env, disabledAddons[i])); | ||
329 | } | 331 | } |
330 | return jdisabledAddonsArray; | 332 | return jdisabledAddonsArray; |
331 | } | 333 | } |
@@ -339,7 +341,7 @@ void Java_org_yuzu_yuzu_1emu_utils_NativeConfig_setDisabledAddons(JNIEnv* env, j | |||
339 | const int size = env->GetArrayLength(jdisabledAddons); | 341 | const int size = env->GetArrayLength(jdisabledAddons); |
340 | for (int i = 0; i < size; ++i) { | 342 | for (int i = 0; i < size; ++i) { |
341 | auto jaddon = static_cast<jstring>(env->GetObjectArrayElement(jdisabledAddons, i)); | 343 | auto jaddon = static_cast<jstring>(env->GetObjectArrayElement(jdisabledAddons, i)); |
342 | disabled_addons.push_back(GetJString(env, jaddon)); | 344 | disabled_addons.push_back(Common::Android::GetJString(env, jaddon)); |
343 | } | 345 | } |
344 | Settings::values.disabled_addons[program_id] = disabled_addons; | 346 | Settings::values.disabled_addons[program_id] = disabled_addons; |
345 | } | 347 | } |
@@ -348,26 +350,27 @@ jobjectArray Java_org_yuzu_yuzu_1emu_utils_NativeConfig_getOverlayControlData(JN | |||
348 | jobject obj) { | 350 | jobject obj) { |
349 | jobjectArray joverlayControlDataArray = | 351 | jobjectArray joverlayControlDataArray = |
350 | env->NewObjectArray(AndroidSettings::values.overlay_control_data.size(), | 352 | env->NewObjectArray(AndroidSettings::values.overlay_control_data.size(), |
351 | IDCache::GetOverlayControlDataClass(), nullptr); | 353 | Common::Android::GetOverlayControlDataClass(), nullptr); |
352 | for (size_t i = 0; i < AndroidSettings::values.overlay_control_data.size(); ++i) { | 354 | for (size_t i = 0; i < AndroidSettings::values.overlay_control_data.size(); ++i) { |
353 | const auto& control_data = AndroidSettings::values.overlay_control_data[i]; | 355 | const auto& control_data = AndroidSettings::values.overlay_control_data[i]; |
354 | jobject jlandscapePosition = | 356 | jobject jlandscapePosition = |
355 | env->NewObject(IDCache::GetPairClass(), IDCache::GetPairConstructor(), | 357 | env->NewObject(Common::Android::GetPairClass(), Common::Android::GetPairConstructor(), |
356 | ToJDouble(env, control_data.landscape_position.first), | 358 | Common::Android::ToJDouble(env, control_data.landscape_position.first), |
357 | ToJDouble(env, control_data.landscape_position.second)); | 359 | Common::Android::ToJDouble(env, control_data.landscape_position.second)); |
358 | jobject jportraitPosition = | 360 | jobject jportraitPosition = |
359 | env->NewObject(IDCache::GetPairClass(), IDCache::GetPairConstructor(), | 361 | env->NewObject(Common::Android::GetPairClass(), Common::Android::GetPairConstructor(), |
360 | ToJDouble(env, control_data.portrait_position.first), | 362 | Common::Android::ToJDouble(env, control_data.portrait_position.first), |
361 | ToJDouble(env, control_data.portrait_position.second)); | 363 | Common::Android::ToJDouble(env, control_data.portrait_position.second)); |
362 | jobject jfoldablePosition = | 364 | jobject jfoldablePosition = |
363 | env->NewObject(IDCache::GetPairClass(), IDCache::GetPairConstructor(), | 365 | env->NewObject(Common::Android::GetPairClass(), Common::Android::GetPairConstructor(), |
364 | ToJDouble(env, control_data.foldable_position.first), | 366 | Common::Android::ToJDouble(env, control_data.foldable_position.first), |
365 | ToJDouble(env, control_data.foldable_position.second)); | 367 | Common::Android::ToJDouble(env, control_data.foldable_position.second)); |
366 | 368 | ||
367 | jobject jcontrolData = env->NewObject( | 369 | jobject jcontrolData = |
368 | IDCache::GetOverlayControlDataClass(), IDCache::GetOverlayControlDataConstructor(), | 370 | env->NewObject(Common::Android::GetOverlayControlDataClass(), |
369 | ToJString(env, control_data.id), control_data.enabled, jlandscapePosition, | 371 | Common::Android::GetOverlayControlDataConstructor(), |
370 | jportraitPosition, jfoldablePosition); | 372 | Common::Android::ToJString(env, control_data.id), control_data.enabled, |
373 | jlandscapePosition, jportraitPosition, jfoldablePosition); | ||
371 | env->SetObjectArrayElement(joverlayControlDataArray, i, jcontrolData); | 374 | env->SetObjectArrayElement(joverlayControlDataArray, i, jcontrolData); |
372 | } | 375 | } |
373 | return joverlayControlDataArray; | 376 | return joverlayControlDataArray; |
@@ -384,33 +387,41 @@ void Java_org_yuzu_yuzu_1emu_utils_NativeConfig_setOverlayControlData( | |||
384 | 387 | ||
385 | for (int i = 0; i < size; ++i) { | 388 | for (int i = 0; i < size; ++i) { |
386 | jobject joverlayControlData = env->GetObjectArrayElement(joverlayControlDataArray, i); | 389 | jobject joverlayControlData = env->GetObjectArrayElement(joverlayControlDataArray, i); |
387 | jstring jidString = static_cast<jstring>( | 390 | jstring jidString = static_cast<jstring>(env->GetObjectField( |
388 | env->GetObjectField(joverlayControlData, IDCache::GetOverlayControlDataIdField())); | 391 | joverlayControlData, Common::Android::GetOverlayControlDataIdField())); |
389 | bool enabled = static_cast<bool>(env->GetBooleanField( | 392 | bool enabled = static_cast<bool>(env->GetBooleanField( |
390 | joverlayControlData, IDCache::GetOverlayControlDataEnabledField())); | 393 | joverlayControlData, Common::Android::GetOverlayControlDataEnabledField())); |
391 | 394 | ||
392 | jobject jlandscapePosition = env->GetObjectField( | 395 | jobject jlandscapePosition = env->GetObjectField( |
393 | joverlayControlData, IDCache::GetOverlayControlDataLandscapePositionField()); | 396 | joverlayControlData, Common::Android::GetOverlayControlDataLandscapePositionField()); |
394 | std::pair<double, double> landscape_position = std::make_pair( | 397 | std::pair<double, double> landscape_position = std::make_pair( |
395 | GetJDouble(env, env->GetObjectField(jlandscapePosition, IDCache::GetPairFirstField())), | 398 | Common::Android::GetJDouble( |
396 | GetJDouble(env, | 399 | env, env->GetObjectField(jlandscapePosition, Common::Android::GetPairFirstField())), |
397 | env->GetObjectField(jlandscapePosition, IDCache::GetPairSecondField()))); | 400 | Common::Android::GetJDouble( |
401 | env, | ||
402 | env->GetObjectField(jlandscapePosition, Common::Android::GetPairSecondField()))); | ||
398 | 403 | ||
399 | jobject jportraitPosition = env->GetObjectField( | 404 | jobject jportraitPosition = env->GetObjectField( |
400 | joverlayControlData, IDCache::GetOverlayControlDataPortraitPositionField()); | 405 | joverlayControlData, Common::Android::GetOverlayControlDataPortraitPositionField()); |
401 | std::pair<double, double> portrait_position = std::make_pair( | 406 | std::pair<double, double> portrait_position = std::make_pair( |
402 | GetJDouble(env, env->GetObjectField(jportraitPosition, IDCache::GetPairFirstField())), | 407 | Common::Android::GetJDouble( |
403 | GetJDouble(env, env->GetObjectField(jportraitPosition, IDCache::GetPairSecondField()))); | 408 | env, env->GetObjectField(jportraitPosition, Common::Android::GetPairFirstField())), |
409 | Common::Android::GetJDouble( | ||
410 | env, | ||
411 | env->GetObjectField(jportraitPosition, Common::Android::GetPairSecondField()))); | ||
404 | 412 | ||
405 | jobject jfoldablePosition = env->GetObjectField( | 413 | jobject jfoldablePosition = env->GetObjectField( |
406 | joverlayControlData, IDCache::GetOverlayControlDataFoldablePositionField()); | 414 | joverlayControlData, Common::Android::GetOverlayControlDataFoldablePositionField()); |
407 | std::pair<double, double> foldable_position = std::make_pair( | 415 | std::pair<double, double> foldable_position = std::make_pair( |
408 | GetJDouble(env, env->GetObjectField(jfoldablePosition, IDCache::GetPairFirstField())), | 416 | Common::Android::GetJDouble( |
409 | GetJDouble(env, env->GetObjectField(jfoldablePosition, IDCache::GetPairSecondField()))); | 417 | env, env->GetObjectField(jfoldablePosition, Common::Android::GetPairFirstField())), |
418 | Common::Android::GetJDouble( | ||
419 | env, | ||
420 | env->GetObjectField(jfoldablePosition, Common::Android::GetPairSecondField()))); | ||
410 | 421 | ||
411 | AndroidSettings::values.overlay_control_data.push_back(AndroidSettings::OverlayControlData{ | 422 | AndroidSettings::values.overlay_control_data.push_back(AndroidSettings::OverlayControlData{ |
412 | GetJString(env, jidString), enabled, landscape_position, portrait_position, | 423 | Common::Android::GetJString(env, jidString), enabled, landscape_position, |
413 | foldable_position}); | 424 | portrait_position, foldable_position}); |
414 | } | 425 | } |
415 | } | 426 | } |
416 | 427 | ||
diff --git a/src/android/app/src/main/jni/native_log.cpp b/src/android/app/src/main/jni/native_log.cpp index 33d691dc8..95dd1f057 100755 --- a/src/android/app/src/main/jni/native_log.cpp +++ b/src/android/app/src/main/jni/native_log.cpp | |||
@@ -1,31 +1,30 @@ | |||
1 | // SPDX-FileCopyrightText: 2023 yuzu Emulator Project | 1 | // SPDX-FileCopyrightText: 2023 yuzu Emulator Project |
2 | // SPDX-License-Identifier: GPL-2.0-or-later | 2 | // SPDX-License-Identifier: GPL-2.0-or-later |
3 | 3 | ||
4 | #include <common/android/android_common.h> | ||
4 | #include <common/logging/log.h> | 5 | #include <common/logging/log.h> |
5 | #include <jni.h> | 6 | #include <jni.h> |
6 | 7 | ||
7 | #include "android_common/android_common.h" | ||
8 | |||
9 | extern "C" { | 8 | extern "C" { |
10 | 9 | ||
11 | void Java_org_yuzu_yuzu_1emu_utils_Log_debug(JNIEnv* env, jobject obj, jstring jmessage) { | 10 | void Java_org_yuzu_yuzu_1emu_utils_Log_debug(JNIEnv* env, jobject obj, jstring jmessage) { |
12 | LOG_DEBUG(Frontend, "{}", GetJString(env, jmessage)); | 11 | LOG_DEBUG(Frontend, "{}", Common::Android::GetJString(env, jmessage)); |
13 | } | 12 | } |
14 | 13 | ||
15 | void Java_org_yuzu_yuzu_1emu_utils_Log_warning(JNIEnv* env, jobject obj, jstring jmessage) { | 14 | void Java_org_yuzu_yuzu_1emu_utils_Log_warning(JNIEnv* env, jobject obj, jstring jmessage) { |
16 | LOG_WARNING(Frontend, "{}", GetJString(env, jmessage)); | 15 | LOG_WARNING(Frontend, "{}", Common::Android::GetJString(env, jmessage)); |
17 | } | 16 | } |
18 | 17 | ||
19 | void Java_org_yuzu_yuzu_1emu_utils_Log_info(JNIEnv* env, jobject obj, jstring jmessage) { | 18 | void Java_org_yuzu_yuzu_1emu_utils_Log_info(JNIEnv* env, jobject obj, jstring jmessage) { |
20 | LOG_INFO(Frontend, "{}", GetJString(env, jmessage)); | 19 | LOG_INFO(Frontend, "{}", Common::Android::GetJString(env, jmessage)); |
21 | } | 20 | } |
22 | 21 | ||
23 | void Java_org_yuzu_yuzu_1emu_utils_Log_error(JNIEnv* env, jobject obj, jstring jmessage) { | 22 | void Java_org_yuzu_yuzu_1emu_utils_Log_error(JNIEnv* env, jobject obj, jstring jmessage) { |
24 | LOG_ERROR(Frontend, "{}", GetJString(env, jmessage)); | 23 | LOG_ERROR(Frontend, "{}", Common::Android::GetJString(env, jmessage)); |
25 | } | 24 | } |
26 | 25 | ||
27 | void Java_org_yuzu_yuzu_1emu_utils_Log_critical(JNIEnv* env, jobject obj, jstring jmessage) { | 26 | void Java_org_yuzu_yuzu_1emu_utils_Log_critical(JNIEnv* env, jobject obj, jstring jmessage) { |
28 | LOG_CRITICAL(Frontend, "{}", GetJString(env, jmessage)); | 27 | LOG_CRITICAL(Frontend, "{}", Common::Android::GetJString(env, jmessage)); |
29 | } | 28 | } |
30 | 29 | ||
31 | } // extern "C" | 30 | } // extern "C" |
diff --git a/src/android/app/src/main/res/layout/fragment_emulation.xml b/src/android/app/src/main/res/layout/fragment_emulation.xml index 0d2bfe8d6..e99a15783 100755 --- a/src/android/app/src/main/res/layout/fragment_emulation.xml +++ b/src/android/app/src/main/res/layout/fragment_emulation.xml | |||
@@ -140,6 +140,7 @@ | |||
140 | android:id="@+id/overlay_container" | 140 | android:id="@+id/overlay_container" |
141 | android:layout_width="match_parent" | 141 | android:layout_width="match_parent" |
142 | android:layout_height="match_parent" | 142 | android:layout_height="match_parent" |
143 | android:layout_marginHorizontal="20dp" | ||
143 | android:fitsSystemWindows="true"> | 144 | android:fitsSystemWindows="true"> |
144 | 145 | ||
145 | <com.google.android.material.textview.MaterialTextView | 146 | <com.google.android.material.textview.MaterialTextView |
@@ -150,7 +151,19 @@ | |||
150 | android:layout_gravity="left" | 151 | android:layout_gravity="left" |
151 | android:clickable="false" | 152 | android:clickable="false" |
152 | android:focusable="false" | 153 | android:focusable="false" |
153 | android:paddingHorizontal="20dp" | 154 | android:textColor="@android:color/white" |
155 | android:shadowColor="@android:color/black" | ||
156 | android:shadowRadius="3" | ||
157 | tools:ignore="RtlHardcoded" /> | ||
158 | |||
159 | <com.google.android.material.textview.MaterialTextView | ||
160 | android:id="@+id/show_thermals_text" | ||
161 | style="@style/TextAppearance.Material3.BodySmall" | ||
162 | android:layout_width="wrap_content" | ||
163 | android:layout_height="wrap_content" | ||
164 | android:layout_gravity="right" | ||
165 | android:clickable="false" | ||
166 | android:focusable="false" | ||
154 | android:textColor="@android:color/white" | 167 | android:textColor="@android:color/white" |
155 | android:shadowColor="@android:color/black" | 168 | android:shadowColor="@android:color/black" |
156 | android:shadowRadius="3" | 169 | android:shadowRadius="3" |
diff --git a/src/android/app/src/main/res/menu/menu_overlay_options.xml b/src/android/app/src/main/res/menu/menu_overlay_options.xml index 363781652..a9e807427 100755 --- a/src/android/app/src/main/res/menu/menu_overlay_options.xml +++ b/src/android/app/src/main/res/menu/menu_overlay_options.xml | |||
@@ -7,6 +7,11 @@ | |||
7 | android:checkable="true" /> | 7 | android:checkable="true" /> |
8 | 8 | ||
9 | <item | 9 | <item |
10 | android:id="@+id/thermal_indicator" | ||
11 | android:title="@string/emulation_thermal_indicator" | ||
12 | android:checkable="true" /> | ||
13 | |||
14 | <item | ||
10 | android:id="@+id/menu_edit_overlay" | 15 | android:id="@+id/menu_edit_overlay" |
11 | android:title="@string/emulation_touch_overlay_edit" /> | 16 | android:title="@string/emulation_touch_overlay_edit" /> |
12 | 17 | ||
diff --git a/src/android/app/src/main/res/values-ar/strings.xml b/src/android/app/src/main/res/values-ar/strings.xml index 53678f465..41d741847 100755 --- a/src/android/app/src/main/res/values-ar/strings.xml +++ b/src/android/app/src/main/res/values-ar/strings.xml | |||
@@ -1,9 +1,6 @@ | |||
1 | <?xml version="1.0" encoding="utf-8"?> | 1 | <?xml version="1.0" encoding="utf-8"?> |
2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> | 2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> |
3 | 3 | ||
4 | <string name="emulation_notification_channel_name">المحاكي نشط</string> | ||
5 | <string name="emulation_notification_channel_description">اظهار اشعار دائم عندما يكون المحاكي نشطاً</string> | ||
6 | <string name="emulation_notification_running">يوزو قيد التشغيل</string> | ||
7 | <string name="notice_notification_channel_name">الإشعارات والأخطاء</string> | 4 | <string name="notice_notification_channel_name">الإشعارات والأخطاء</string> |
8 | <string name="notice_notification_channel_description">اظهار اشعار عند حصول اي مشكلة.</string> | 5 | <string name="notice_notification_channel_description">اظهار اشعار عند حصول اي مشكلة.</string> |
9 | <string name="notification_permission_not_granted">لم يتم منح إذن الإشعار</string> | 6 | <string name="notification_permission_not_granted">لم يتم منح إذن الإشعار</string> |
diff --git a/src/android/app/src/main/res/values-ckb/strings.xml b/src/android/app/src/main/res/values-ckb/strings.xml index 7e1eb2b8d..827339505 100755 --- a/src/android/app/src/main/res/values-ckb/strings.xml +++ b/src/android/app/src/main/res/values-ckb/strings.xml | |||
@@ -2,9 +2,6 @@ | |||
2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> | 2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> |
3 | 3 | ||
4 | <string name="app_disclaimer">ئەم نەرمەکاڵایە یارییەکانی کۆنسۆلی نینتێندۆ سویچ کارپێدەکات. هیچ ناونیشانێکی یاری و کلیلی تێدا نییە..<br /><br />پێش ئەوەی دەست پێ بکەیت، تکایە شوێنی فایلی <![CDATA[<b> prod.keys </b>]]> دیاریبکە لە نێو کۆگای ئامێرەکەت.<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">زیاتر فێربە</a>]]></string> | 4 | <string name="app_disclaimer">ئەم نەرمەکاڵایە یارییەکانی کۆنسۆلی نینتێندۆ سویچ کارپێدەکات. هیچ ناونیشانێکی یاری و کلیلی تێدا نییە..<br /><br />پێش ئەوەی دەست پێ بکەیت، تکایە شوێنی فایلی <![CDATA[<b> prod.keys </b>]]> دیاریبکە لە نێو کۆگای ئامێرەکەت.<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">زیاتر فێربە</a>]]></string> |
5 | <string name="emulation_notification_channel_name">ئیمولەیشن کارایە</string> | ||
6 | <string name="emulation_notification_channel_description">ئاگادارکردنەوەیەکی بەردەوام نیشان دەدات کاتێک ئیمولەیشن کاردەکات.</string> | ||
7 | <string name="emulation_notification_running">یوزو کاردەکات</string> | ||
8 | <string name="notice_notification_channel_name">ئاگاداری و هەڵەکان</string> | 5 | <string name="notice_notification_channel_name">ئاگاداری و هەڵەکان</string> |
9 | <string name="notice_notification_channel_description">ئاگادارکردنەوەکان پیشان دەدات کاتێک شتێک بە هەڵەدا دەچێت.</string> | 6 | <string name="notice_notification_channel_description">ئاگادارکردنەوەکان پیشان دەدات کاتێک شتێک بە هەڵەدا دەچێت.</string> |
10 | <string name="notification_permission_not_granted">مۆڵەتی ئاگادارکردنەوە نەدراوە!</string> | 7 | <string name="notification_permission_not_granted">مۆڵەتی ئاگادارکردنەوە نەدراوە!</string> |
diff --git a/src/android/app/src/main/res/values-cs/strings.xml b/src/android/app/src/main/res/values-cs/strings.xml index b9a4a11e4..8f8e2848d 100755 --- a/src/android/app/src/main/res/values-cs/strings.xml +++ b/src/android/app/src/main/res/values-cs/strings.xml | |||
@@ -1,7 +1,6 @@ | |||
1 | <?xml version="1.0" encoding="utf-8"?> | 1 | <?xml version="1.0" encoding="utf-8"?> |
2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> | 2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> |
3 | 3 | ||
4 | <string name="emulation_notification_channel_name">Emulace je aktivní</string> | ||
5 | <string name="notice_notification_channel_name">Upozornění a chyby</string> | 4 | <string name="notice_notification_channel_name">Upozornění a chyby</string> |
6 | <string name="notice_notification_channel_description">Ukáže oznámení v případě chyby.</string> | 5 | <string name="notice_notification_channel_description">Ukáže oznámení v případě chyby.</string> |
7 | <string name="notification_permission_not_granted">Oznámení nejsou oprávněna!</string> | 6 | <string name="notification_permission_not_granted">Oznámení nejsou oprávněna!</string> |
diff --git a/src/android/app/src/main/res/values-de/strings.xml b/src/android/app/src/main/res/values-de/strings.xml index 483ea8c88..fb25b3c93 100755 --- a/src/android/app/src/main/res/values-de/strings.xml +++ b/src/android/app/src/main/res/values-de/strings.xml | |||
@@ -2,9 +2,6 @@ | |||
2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> | 2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> |
3 | 3 | ||
4 | <string name="app_disclaimer">Diese Software kann Spiele für die Nintendo Switch abspielen. Keine Spiele oder Spielekeys sind enthalten.<br /><br />Bevor du beginnst, bitte halte deine <![CDATA[<b> prod.keys </b>]]> auf deinem Gerät bereit. .<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Mehr Infos</a>]]></string> | 4 | <string name="app_disclaimer">Diese Software kann Spiele für die Nintendo Switch abspielen. Keine Spiele oder Spielekeys sind enthalten.<br /><br />Bevor du beginnst, bitte halte deine <![CDATA[<b> prod.keys </b>]]> auf deinem Gerät bereit. .<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Mehr Infos</a>]]></string> |
5 | <string name="emulation_notification_channel_name">Emulation ist aktiv</string> | ||
6 | <string name="emulation_notification_channel_description">Zeigt eine dauerhafte Benachrichtigung an, wenn die Emulation läuft.</string> | ||
7 | <string name="emulation_notification_running">yuzu läuft</string> | ||
8 | <string name="notice_notification_channel_name">Hinweise und Fehler</string> | 5 | <string name="notice_notification_channel_name">Hinweise und Fehler</string> |
9 | <string name="notice_notification_channel_description">Zeigt Benachrichtigungen an, wenn etwas schief läuft.</string> | 6 | <string name="notice_notification_channel_description">Zeigt Benachrichtigungen an, wenn etwas schief läuft.</string> |
10 | <string name="notification_permission_not_granted">Berechtigung für Benachrichtigungen nicht erlaubt!</string> | 7 | <string name="notification_permission_not_granted">Berechtigung für Benachrichtigungen nicht erlaubt!</string> |
diff --git a/src/android/app/src/main/res/values-es/strings.xml b/src/android/app/src/main/res/values-es/strings.xml index c3825710b..7ecbeaba4 100755 --- a/src/android/app/src/main/res/values-es/strings.xml +++ b/src/android/app/src/main/res/values-es/strings.xml | |||
@@ -2,9 +2,6 @@ | |||
2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> | 2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> |
3 | 3 | ||
4 | <string name="app_disclaimer">Este software ejecuta juegos para la videoconsola Nintendo Switch. Los videojuegos o claves no vienen incluidos.<br /><br />Antes de empezar, por favor, localice el archivo <![CDATA[<b> prod.keys </b>]]>en el almacenamiento de su dispositivo..<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Saber más</a>]]></string> | 4 | <string name="app_disclaimer">Este software ejecuta juegos para la videoconsola Nintendo Switch. Los videojuegos o claves no vienen incluidos.<br /><br />Antes de empezar, por favor, localice el archivo <![CDATA[<b> prod.keys </b>]]>en el almacenamiento de su dispositivo..<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Saber más</a>]]></string> |
5 | <string name="emulation_notification_channel_name">Emulación activa</string> | ||
6 | <string name="emulation_notification_channel_description">Muestra una notificación persistente cuando la emulación está activa.</string> | ||
7 | <string name="emulation_notification_running">yuzu está ejecutándose</string> | ||
8 | <string name="notice_notification_channel_name">Avisos y errores</string> | 5 | <string name="notice_notification_channel_name">Avisos y errores</string> |
9 | <string name="notice_notification_channel_description">Mostrar notificaciones cuándo algo vaya mal.</string> | 6 | <string name="notice_notification_channel_description">Mostrar notificaciones cuándo algo vaya mal.</string> |
10 | <string name="notification_permission_not_granted">¡Permisos de notificación no concedidos!</string> | 7 | <string name="notification_permission_not_granted">¡Permisos de notificación no concedidos!</string> |
diff --git a/src/android/app/src/main/res/values-fr/strings.xml b/src/android/app/src/main/res/values-fr/strings.xml index 667fe33cb..a848b9163 100755 --- a/src/android/app/src/main/res/values-fr/strings.xml +++ b/src/android/app/src/main/res/values-fr/strings.xml | |||
@@ -2,9 +2,6 @@ | |||
2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> | 2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> |
3 | 3 | ||
4 | <string name="app_disclaimer">Ce logiciel exécutera des jeux pour la console de jeu Nintendo Switch. Aucun jeux ou clés n\'est inclus.<br /><br />Avant de commencer, veuillez localiser votre fichier <![CDATA[<b> prod.keys </b>]]> sur le stockage de votre appareil.<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">En savoir plus</a>]]></string> | 4 | <string name="app_disclaimer">Ce logiciel exécutera des jeux pour la console de jeu Nintendo Switch. Aucun jeux ou clés n\'est inclus.<br /><br />Avant de commencer, veuillez localiser votre fichier <![CDATA[<b> prod.keys </b>]]> sur le stockage de votre appareil.<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">En savoir plus</a>]]></string> |
5 | <string name="emulation_notification_channel_name">L\'émulation est active</string> | ||
6 | <string name="emulation_notification_channel_description">Affiche une notification persistante lorsque l\'émulation est en cours d\'exécution.</string> | ||
7 | <string name="emulation_notification_running">yuzu est en cours d\'exécution</string> | ||
8 | <string name="notice_notification_channel_name">Avis et erreurs</string> | 5 | <string name="notice_notification_channel_name">Avis et erreurs</string> |
9 | <string name="notice_notification_channel_description">Affiche des notifications en cas de problème.</string> | 6 | <string name="notice_notification_channel_description">Affiche des notifications en cas de problème.</string> |
10 | <string name="notification_permission_not_granted">Permission de notification non accordée !</string> | 7 | <string name="notification_permission_not_granted">Permission de notification non accordée !</string> |
diff --git a/src/android/app/src/main/res/values-he/strings.xml b/src/android/app/src/main/res/values-he/strings.xml index 41e4450c6..6096605a9 100755 --- a/src/android/app/src/main/res/values-he/strings.xml +++ b/src/android/app/src/main/res/values-he/strings.xml | |||
@@ -2,9 +2,6 @@ | |||
2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> | 2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> |
3 | 3 | ||
4 | <string name="app_disclaimer">התוכנה תריץ משחקים לקונסולת ה Nintendo Switch. אף משחק או קבצים בעלי זכויות יוצרים נכללים.<br /><br /> לפני שאת/ה מתחיל בבקשה מצא את קובץ <![CDATA[<b>prod.keys</b>]]> על המכשיר.<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">קרא עוד</a>]]></string> | 4 | <string name="app_disclaimer">התוכנה תריץ משחקים לקונסולת ה Nintendo Switch. אף משחק או קבצים בעלי זכויות יוצרים נכללים.<br /><br /> לפני שאת/ה מתחיל בבקשה מצא את קובץ <![CDATA[<b>prod.keys</b>]]> על המכשיר.<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">קרא עוד</a>]]></string> |
5 | <string name="emulation_notification_channel_name">אמולציה פעילה</string> | ||
6 | <string name="emulation_notification_channel_description">מציג התראה מתמשכת כאשר האמולציה פועלת.</string> | ||
7 | <string name="emulation_notification_running">yuzu רץ</string> | ||
8 | <string name="notice_notification_channel_name">התראות ותקלות</string> | 5 | <string name="notice_notification_channel_name">התראות ותקלות</string> |
9 | <string name="notice_notification_channel_description">מציג התראות כאשר משהו הולך לא כשורה.</string> | 6 | <string name="notice_notification_channel_description">מציג התראות כאשר משהו הולך לא כשורה.</string> |
10 | <string name="notification_permission_not_granted">הרשאות התראות לא ניתנה!</string> | 7 | <string name="notification_permission_not_granted">הרשאות התראות לא ניתנה!</string> |
diff --git a/src/android/app/src/main/res/values-hu/strings.xml b/src/android/app/src/main/res/values-hu/strings.xml index 554da0816..f3a29e0c3 100755 --- a/src/android/app/src/main/res/values-hu/strings.xml +++ b/src/android/app/src/main/res/values-hu/strings.xml | |||
@@ -2,9 +2,6 @@ | |||
2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> | 2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> |
3 | 3 | ||
4 | <string name="app_disclaimer">Ez a szoftver Nintendo Switch játékkonzolhoz készült játékokat futtat. Nem tartalmaz játékokat vagy kulcsokat. .<br /><br />Mielőtt hozzákezdenél, kérjük, válaszd ki a <![CDATA[<b>prod.keys</b>]]> fájl helyét a készülék tárhelyén<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Tudj meg többet</a>]]></string> | 4 | <string name="app_disclaimer">Ez a szoftver Nintendo Switch játékkonzolhoz készült játékokat futtat. Nem tartalmaz játékokat vagy kulcsokat. .<br /><br />Mielőtt hozzákezdenél, kérjük, válaszd ki a <![CDATA[<b>prod.keys</b>]]> fájl helyét a készülék tárhelyén<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Tudj meg többet</a>]]></string> |
5 | <string name="emulation_notification_channel_name">Emuláció aktív</string> | ||
6 | <string name="emulation_notification_channel_description">Állandó értesítést jelenít meg, amíg az emuláció fut.</string> | ||
7 | <string name="emulation_notification_running">A yuzu fut</string> | ||
8 | <string name="notice_notification_channel_name">Megjegyzések és hibák</string> | 5 | <string name="notice_notification_channel_name">Megjegyzések és hibák</string> |
9 | <string name="notice_notification_channel_description">Értesítések megjelenítése, ha valami rosszul sül el.</string> | 6 | <string name="notice_notification_channel_description">Értesítések megjelenítése, ha valami rosszul sül el.</string> |
10 | <string name="notification_permission_not_granted">Nincs engedély az értesítés megjelenítéséhez!</string> | 7 | <string name="notification_permission_not_granted">Nincs engedély az értesítés megjelenítéséhez!</string> |
diff --git a/src/android/app/src/main/res/values-it/strings.xml b/src/android/app/src/main/res/values-it/strings.xml index 61b39f57f..433d84f5c 100755 --- a/src/android/app/src/main/res/values-it/strings.xml +++ b/src/android/app/src/main/res/values-it/strings.xml | |||
@@ -2,9 +2,6 @@ | |||
2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> | 2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> |
3 | 3 | ||
4 | <string name="app_disclaimer">Questo software permette di giocare ai giochi della console Nintendo Switch. Nessun gioco o chiave è inclusa.<br /><br />Prima di iniziare, perfavore individua il file <![CDATA[<b>prod.keys </b>]]> nella memoria del tuo dispositivo.<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Scopri di più</a>]]></string> | 4 | <string name="app_disclaimer">Questo software permette di giocare ai giochi della console Nintendo Switch. Nessun gioco o chiave è inclusa.<br /><br />Prima di iniziare, perfavore individua il file <![CDATA[<b>prod.keys </b>]]> nella memoria del tuo dispositivo.<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Scopri di più</a>]]></string> |
5 | <string name="emulation_notification_channel_name">L\'emulatore è attivo</string> | ||
6 | <string name="emulation_notification_channel_description">Mostra una notifica persistente quando l\'emulatore è in esecuzione.</string> | ||
7 | <string name="emulation_notification_running">yuzu è in esecuzione</string> | ||
8 | <string name="notice_notification_channel_name">Avvisi ed errori</string> | 5 | <string name="notice_notification_channel_name">Avvisi ed errori</string> |
9 | <string name="notice_notification_channel_description">Mostra le notifiche quando qualcosa va storto.</string> | 6 | <string name="notice_notification_channel_description">Mostra le notifiche quando qualcosa va storto.</string> |
10 | <string name="notification_permission_not_granted">Autorizzazione di notifica non concessa!</string> | 7 | <string name="notification_permission_not_granted">Autorizzazione di notifica non concessa!</string> |
diff --git a/src/android/app/src/main/res/values-ja/strings.xml b/src/android/app/src/main/res/values-ja/strings.xml index 0cff40bb6..da73ad651 100755 --- a/src/android/app/src/main/res/values-ja/strings.xml +++ b/src/android/app/src/main/res/values-ja/strings.xml | |||
@@ -2,9 +2,6 @@ | |||
2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> | 2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> |
3 | 3 | ||
4 | <string name="app_disclaimer">このソフトウェアでは、Nintendo Switchのゲームを実行できます。 ゲームソフトやキーは含まれません。<br /><br />事前に、 <![CDATA[<b> prod.keys </b>]]> ファイルをストレージに配置しておいてください。<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">詳細</a>]]></string> | 4 | <string name="app_disclaimer">このソフトウェアでは、Nintendo Switchのゲームを実行できます。 ゲームソフトやキーは含まれません。<br /><br />事前に、 <![CDATA[<b> prod.keys </b>]]> ファイルをストレージに配置しておいてください。<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">詳細</a>]]></string> |
5 | <string name="emulation_notification_channel_name">エミュレーションが有効です</string> | ||
6 | <string name="emulation_notification_channel_description">エミュレーションの実行中に常設通知を表示します。</string> | ||
7 | <string name="emulation_notification_running">yuzu は実行中です</string> | ||
8 | <string name="notice_notification_channel_name">通知とエラー</string> | 5 | <string name="notice_notification_channel_name">通知とエラー</string> |
9 | <string name="notice_notification_channel_description">問題の発生時に通知を表示します。</string> | 6 | <string name="notice_notification_channel_description">問題の発生時に通知を表示します。</string> |
10 | <string name="notification_permission_not_granted">通知が許可されていません!</string> | 7 | <string name="notification_permission_not_granted">通知が許可されていません!</string> |
diff --git a/src/android/app/src/main/res/values-ko/strings.xml b/src/android/app/src/main/res/values-ko/strings.xml index eaa6c23ce..904353d34 100755 --- a/src/android/app/src/main/res/values-ko/strings.xml +++ b/src/android/app/src/main/res/values-ko/strings.xml | |||
@@ -2,9 +2,6 @@ | |||
2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> | 2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> |
3 | 3 | ||
4 | <string name="app_disclaimer">이 소프트웨어는 Nintendo Switch 게임을 실행합니다. 게임 타이틀이나 키는 포함되어 있지 않습니다.<br /><br />시작하기 전에 장치 저장소에서 <![CDATA[<b> prod.keys </b>]]> 파일을 찾아주세요.<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">자세히 알아보기</a>]]></string> | 4 | <string name="app_disclaimer">이 소프트웨어는 Nintendo Switch 게임을 실행합니다. 게임 타이틀이나 키는 포함되어 있지 않습니다.<br /><br />시작하기 전에 장치 저장소에서 <![CDATA[<b> prod.keys </b>]]> 파일을 찾아주세요.<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">자세히 알아보기</a>]]></string> |
5 | <string name="emulation_notification_channel_name">에뮬레이션이 활성화됨</string> | ||
6 | <string name="emulation_notification_channel_description">에뮬레이션이 실행 중일 때 지속적으로 알림을 표시합니다.</string> | ||
7 | <string name="emulation_notification_running">yuzu가 실행 중입니다.</string> | ||
8 | <string name="notice_notification_channel_name">알림 및 오류</string> | 5 | <string name="notice_notification_channel_name">알림 및 오류</string> |
9 | <string name="notice_notification_channel_description">문제가 발생하면 알림을 표시합니다.</string> | 6 | <string name="notice_notification_channel_description">문제가 발생하면 알림을 표시합니다.</string> |
10 | <string name="notification_permission_not_granted">알림 권한이 부여되지 않았습니다!</string> | 7 | <string name="notification_permission_not_granted">알림 권한이 부여되지 않았습니다!</string> |
diff --git a/src/android/app/src/main/res/values-nb/strings.xml b/src/android/app/src/main/res/values-nb/strings.xml index e92dc62d9..fe3af5920 100755 --- a/src/android/app/src/main/res/values-nb/strings.xml +++ b/src/android/app/src/main/res/values-nb/strings.xml | |||
@@ -2,9 +2,6 @@ | |||
2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> | 2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> |
3 | 3 | ||
4 | <string name="app_disclaimer">Denne programvaren vil kjøre spill for Nintendo Switch-spillkonsollen. Ingen spilltitler eller nøkler er inkludert.<br /><br />Før du begynner, må du finne <![CDATA[<b> prod.keys </b>]]> filen din på enhetslagringen.<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Lær mer</a>]]></string> | 4 | <string name="app_disclaimer">Denne programvaren vil kjøre spill for Nintendo Switch-spillkonsollen. Ingen spilltitler eller nøkler er inkludert.<br /><br />Før du begynner, må du finne <![CDATA[<b> prod.keys </b>]]> filen din på enhetslagringen.<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Lær mer</a>]]></string> |
5 | <string name="emulation_notification_channel_name">Emulering er aktiv</string> | ||
6 | <string name="emulation_notification_channel_description">Viser et vedvarende varsel når emuleringen kjører.</string> | ||
7 | <string name="emulation_notification_running">Yuzu kjører</string> | ||
8 | <string name="notice_notification_channel_name">Merknader og feil</string> | 5 | <string name="notice_notification_channel_name">Merknader og feil</string> |
9 | <string name="notice_notification_channel_description">Viser varsler når noe går galt.</string> | 6 | <string name="notice_notification_channel_description">Viser varsler når noe går galt.</string> |
10 | <string name="notification_permission_not_granted">Varslingstillatelse ikke gitt!</string> | 7 | <string name="notification_permission_not_granted">Varslingstillatelse ikke gitt!</string> |
diff --git a/src/android/app/src/main/res/values-pl/strings.xml b/src/android/app/src/main/res/values-pl/strings.xml index fbd0ad7e9..2af7fd7b4 100755 --- a/src/android/app/src/main/res/values-pl/strings.xml +++ b/src/android/app/src/main/res/values-pl/strings.xml | |||
@@ -2,9 +2,6 @@ | |||
2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> | 2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> |
3 | 3 | ||
4 | <string name="app_disclaimer">To oprogramowanie umożliwia uruchomienie gier z konsoli Nintendo Switch. Nie zawiera gier ani wymaganych kluczy.<br /><br />Zanim zaczniesz, wybierz plik kluczy <![CDATA[<b> prod.keys </b>]]> z katalogu w pamięci masowej.<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Dowiedz się więcej</a>]]></string> | 4 | <string name="app_disclaimer">To oprogramowanie umożliwia uruchomienie gier z konsoli Nintendo Switch. Nie zawiera gier ani wymaganych kluczy.<br /><br />Zanim zaczniesz, wybierz plik kluczy <![CDATA[<b> prod.keys </b>]]> z katalogu w pamięci masowej.<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Dowiedz się więcej</a>]]></string> |
5 | <string name="emulation_notification_channel_name">Emulacja jest uruchomiona</string> | ||
6 | <string name="emulation_notification_channel_description">Pokaż trwałe powiadomienie gdy emulacja jest uruchomiona.</string> | ||
7 | <string name="emulation_notification_running">yuzu jest uruchomiony</string> | ||
8 | <string name="notice_notification_channel_name">Powiadomienia błędy</string> | 5 | <string name="notice_notification_channel_name">Powiadomienia błędy</string> |
9 | <string name="notice_notification_channel_description">Pokaż powiadomienie gdy coś pójdzie źle</string> | 6 | <string name="notice_notification_channel_description">Pokaż powiadomienie gdy coś pójdzie źle</string> |
10 | <string name="notification_permission_not_granted">Nie zezwolono na powiadomienia!</string> | 7 | <string name="notification_permission_not_granted">Nie zezwolono na powiadomienia!</string> |
diff --git a/src/android/app/src/main/res/values-pt-rBR/strings.xml b/src/android/app/src/main/res/values-pt-rBR/strings.xml index a87eb11e4..130252590 100755 --- a/src/android/app/src/main/res/values-pt-rBR/strings.xml +++ b/src/android/app/src/main/res/values-pt-rBR/strings.xml | |||
@@ -2,9 +2,6 @@ | |||
2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> | 2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> |
3 | 3 | ||
4 | <string name="app_disclaimer">Este software executa jogos do console Nintendo Switch. Não estão inclusos nem jogos ou chaves.<br /><br />Antes de começar, por favor localize o arquivo <![CDATA[<b> prod.keys </b>]]> no armazenamento de seu dispositivo.<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Saiba mais</a>]]></string> | 4 | <string name="app_disclaimer">Este software executa jogos do console Nintendo Switch. Não estão inclusos nem jogos ou chaves.<br /><br />Antes de começar, por favor localize o arquivo <![CDATA[<b> prod.keys </b>]]> no armazenamento de seu dispositivo.<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Saiba mais</a>]]></string> |
5 | <string name="emulation_notification_channel_name">A emulação está Ativa</string> | ||
6 | <string name="emulation_notification_channel_description">Mostra uma notificação permanente enquanto a emulação estiver em andamento.</string> | ||
7 | <string name="emulation_notification_running">O Yuzu está em execução </string> | ||
8 | <string name="notice_notification_channel_name">Notificações e erros</string> | 5 | <string name="notice_notification_channel_name">Notificações e erros</string> |
9 | <string name="notice_notification_channel_description">Mostra notificações quando algo dá errado.</string> | 6 | <string name="notice_notification_channel_description">Mostra notificações quando algo dá errado.</string> |
10 | <string name="notification_permission_not_granted">Acesso às notificações não concedido!</string> | 7 | <string name="notification_permission_not_granted">Acesso às notificações não concedido!</string> |
diff --git a/src/android/app/src/main/res/values-pt-rPT/strings.xml b/src/android/app/src/main/res/values-pt-rPT/strings.xml index 684a71616..0fdbae4f8 100755 --- a/src/android/app/src/main/res/values-pt-rPT/strings.xml +++ b/src/android/app/src/main/res/values-pt-rPT/strings.xml | |||
@@ -2,9 +2,6 @@ | |||
2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> | 2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> |
3 | 3 | ||
4 | <string name="app_disclaimer">Este software corre jogos para a consola Nintendo Switch. Não estão incluídas nem jogos ou chaves. <br /><br />Antes de começares, por favor localiza o ficheiro <![CDATA[1 prod.keys 1]]> no armazenamento do teu dispositivo.<br /><br /><![CDATA[2Learn more2]]></string> | 4 | <string name="app_disclaimer">Este software corre jogos para a consola Nintendo Switch. Não estão incluídas nem jogos ou chaves. <br /><br />Antes de começares, por favor localiza o ficheiro <![CDATA[1 prod.keys 1]]> no armazenamento do teu dispositivo.<br /><br /><![CDATA[2Learn more2]]></string> |
5 | <string name="emulation_notification_channel_name">Emulação está Ativa</string> | ||
6 | <string name="emulation_notification_channel_description">Mostra uma notificação permanente enquanto a emulação está a correr.</string> | ||
7 | <string name="emulation_notification_running">Yuzu está em execução </string> | ||
8 | <string name="notice_notification_channel_name">Notificações e erros</string> | 5 | <string name="notice_notification_channel_name">Notificações e erros</string> |
9 | <string name="notice_notification_channel_description">Mostra notificações quendo algo corre mal.</string> | 6 | <string name="notice_notification_channel_description">Mostra notificações quendo algo corre mal.</string> |
10 | <string name="notification_permission_not_granted">Permissões de notificação não permitidas </string> | 7 | <string name="notification_permission_not_granted">Permissões de notificação não permitidas </string> |
diff --git a/src/android/app/src/main/res/values-ru/strings.xml b/src/android/app/src/main/res/values-ru/strings.xml index 099b2c9eb..2dfd4a824 100755 --- a/src/android/app/src/main/res/values-ru/strings.xml +++ b/src/android/app/src/main/res/values-ru/strings.xml | |||
@@ -2,9 +2,6 @@ | |||
2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> | 2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> |
3 | 3 | ||
4 | <string name="app_disclaimer">Это программное обеспечение позволяет запускать игры для игровой консоли Nintendo Switch. Мы не предоставляем сами игры или ключи.<br /><br />Перед началом работы найдите файл <![CDATA[<b> prod.keys </b>]]> в хранилище устройства..<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Узнать больше</a>]]></string> | 4 | <string name="app_disclaimer">Это программное обеспечение позволяет запускать игры для игровой консоли Nintendo Switch. Мы не предоставляем сами игры или ключи.<br /><br />Перед началом работы найдите файл <![CDATA[<b> prod.keys </b>]]> в хранилище устройства..<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Узнать больше</a>]]></string> |
5 | <string name="emulation_notification_channel_name">Эмуляция активна</string> | ||
6 | <string name="emulation_notification_channel_description">Показывает постоянное уведомление, когда запущена эмуляция.</string> | ||
7 | <string name="emulation_notification_running">yuzu запущен</string> | ||
8 | <string name="notice_notification_channel_name">Уведомления и ошибки</string> | 5 | <string name="notice_notification_channel_name">Уведомления и ошибки</string> |
9 | <string name="notice_notification_channel_description">Показывать уведомления, когда что-то пошло не так</string> | 6 | <string name="notice_notification_channel_description">Показывать уведомления, когда что-то пошло не так</string> |
10 | <string name="notification_permission_not_granted">Вы не предоставили разрешение на уведомления!</string> | 7 | <string name="notification_permission_not_granted">Вы не предоставили разрешение на уведомления!</string> |
diff --git a/src/android/app/src/main/res/values-uk/strings.xml b/src/android/app/src/main/res/values-uk/strings.xml index 361f0b726..9a2804a93 100755 --- a/src/android/app/src/main/res/values-uk/strings.xml +++ b/src/android/app/src/main/res/values-uk/strings.xml | |||
@@ -2,9 +2,6 @@ | |||
2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> | 2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> |
3 | 3 | ||
4 | <string name="app_disclaimer">Це програмне забезпечення дозволяє запускати ігри для ігрової консолі Nintendo Switch. Ми не надаємо самі ігри або ключі.<br /><br />Перед початком роботи знайдіть ваш файл <![CDATA[<b> prod.keys </b>]]> у сховищі пристрою.<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Дізнатися більше</a>]]></string> | 4 | <string name="app_disclaimer">Це програмне забезпечення дозволяє запускати ігри для ігрової консолі Nintendo Switch. Ми не надаємо самі ігри або ключі.<br /><br />Перед початком роботи знайдіть ваш файл <![CDATA[<b> prod.keys </b>]]> у сховищі пристрою.<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Дізнатися більше</a>]]></string> |
5 | <string name="emulation_notification_channel_name">Емуляція активна</string> | ||
6 | <string name="emulation_notification_channel_description">Показує постійне сповіщення, коли запущено емуляцію.</string> | ||
7 | <string name="emulation_notification_running">yuzu запущено</string> | ||
8 | <string name="notice_notification_channel_name">Сповіщення та помилки</string> | 5 | <string name="notice_notification_channel_name">Сповіщення та помилки</string> |
9 | <string name="notice_notification_channel_description">Показувати сповіщення, коли щось пішло не так</string> | 6 | <string name="notice_notification_channel_description">Показувати сповіщення, коли щось пішло не так</string> |
10 | <string name="notification_permission_not_granted">Ви не надали дозвіл сповіщень!</string> | 7 | <string name="notification_permission_not_granted">Ви не надали дозвіл сповіщень!</string> |
diff --git a/src/android/app/src/main/res/values-vi/strings.xml b/src/android/app/src/main/res/values-vi/strings.xml index 0a722f329..dc06610c7 100755 --- a/src/android/app/src/main/res/values-vi/strings.xml +++ b/src/android/app/src/main/res/values-vi/strings.xml | |||
@@ -2,9 +2,6 @@ | |||
2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> | 2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> |
3 | 3 | ||
4 | <string name="app_disclaimer">Phần mềm này sẽ chạy các game cho máy chơi game Nintendo Switch. Không có title games hoặc keys được bao gồm.<br /><br />Trước khi bạn bắt đầu, hãy tìm tập tin <![CDATA[<b> prod.keys </b>]]> trên bộ nhớ thiết bị của bạn.<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Tìm hiểu thêm</a>]]></string> | 4 | <string name="app_disclaimer">Phần mềm này sẽ chạy các game cho máy chơi game Nintendo Switch. Không có title games hoặc keys được bao gồm.<br /><br />Trước khi bạn bắt đầu, hãy tìm tập tin <![CDATA[<b> prod.keys </b>]]> trên bộ nhớ thiết bị của bạn.<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">Tìm hiểu thêm</a>]]></string> |
5 | <string name="emulation_notification_channel_name">Giả lập đang chạy</string> | ||
6 | <string name="emulation_notification_channel_description">Hiển thị thông báo liên tục khi giả lập đang chạy.</string> | ||
7 | <string name="emulation_notification_running">yuzu đang chạy</string> | ||
8 | <string name="notice_notification_channel_name">Thông báo và lỗi</string> | 5 | <string name="notice_notification_channel_name">Thông báo và lỗi</string> |
9 | <string name="notice_notification_channel_description">Hiển thị thông báo khi có sự cố xảy ra.</string> | 6 | <string name="notice_notification_channel_description">Hiển thị thông báo khi có sự cố xảy ra.</string> |
10 | <string name="notification_permission_not_granted">Ứng dụng không được cấp quyền thông báo!</string> | 7 | <string name="notification_permission_not_granted">Ứng dụng không được cấp quyền thông báo!</string> |
diff --git a/src/android/app/src/main/res/values-zh-rCN/strings.xml b/src/android/app/src/main/res/values-zh-rCN/strings.xml index b840591a4..6acf6f391 100755 --- a/src/android/app/src/main/res/values-zh-rCN/strings.xml +++ b/src/android/app/src/main/res/values-zh-rCN/strings.xml | |||
@@ -2,9 +2,6 @@ | |||
2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> | 2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> |
3 | 3 | ||
4 | <string name="app_disclaimer">此软件可以运行 Nintendo Switch 游戏,但不包含任何游戏和密钥文件。<br /><br />在开始前,请找到放置于设备存储中的 <![CDATA[<b> prod.keys </b>]]> 文件。<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">了解更多</a>]]></string> | 4 | <string name="app_disclaimer">此软件可以运行 Nintendo Switch 游戏,但不包含任何游戏和密钥文件。<br /><br />在开始前,请找到放置于设备存储中的 <![CDATA[<b> prod.keys </b>]]> 文件。<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">了解更多</a>]]></string> |
5 | <string name="emulation_notification_channel_name">正在进行模拟</string> | ||
6 | <string name="emulation_notification_channel_description">在模拟运行时显示持久通知。</string> | ||
7 | <string name="emulation_notification_running">yuzu 正在运行</string> | ||
8 | <string name="notice_notification_channel_name">通知及错误提醒</string> | 5 | <string name="notice_notification_channel_name">通知及错误提醒</string> |
9 | <string name="notice_notification_channel_description">当发生错误时显示通知。</string> | 6 | <string name="notice_notification_channel_description">当发生错误时显示通知。</string> |
10 | <string name="notification_permission_not_granted">未授予通知权限!</string> | 7 | <string name="notification_permission_not_granted">未授予通知权限!</string> |
diff --git a/src/android/app/src/main/res/values-zh-rTW/strings.xml b/src/android/app/src/main/res/values-zh-rTW/strings.xml index d39255714..411fc5947 100755 --- a/src/android/app/src/main/res/values-zh-rTW/strings.xml +++ b/src/android/app/src/main/res/values-zh-rTW/strings.xml | |||
@@ -2,9 +2,6 @@ | |||
2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> | 2 | <resources xmlns:tools="http://schemas.android.com/tools" tools:ignore="MissingTranslation"> |
3 | 3 | ||
4 | <string name="app_disclaimer">此軟體可以執行 Nintendo Switch 主機遊戲,但不包含任何遊戲和金鑰。<br /><br />在您開始前,請找到放置於您的裝置儲存空間的 <![CDATA[<b> prod.keys </b>]]> 檔案。<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">深入瞭解</a>]]></string> | 4 | <string name="app_disclaimer">此軟體可以執行 Nintendo Switch 主機遊戲,但不包含任何遊戲和金鑰。<br /><br />在您開始前,請找到放置於您的裝置儲存空間的 <![CDATA[<b> prod.keys </b>]]> 檔案。<br /><br /><![CDATA[<a href=\"https://yuzu-emu.org/help/quickstart\">深入瞭解</a>]]></string> |
5 | <string name="emulation_notification_channel_name">模擬進行中</string> | ||
6 | <string name="emulation_notification_channel_description">在模擬執行時顯示持續通知。</string> | ||
7 | <string name="emulation_notification_running">yuzu 正在執行</string> | ||
8 | <string name="notice_notification_channel_name">通知和錯誤</string> | 5 | <string name="notice_notification_channel_name">通知和錯誤</string> |
9 | <string name="notice_notification_channel_description">發生錯誤時顯示通知。</string> | 6 | <string name="notice_notification_channel_description">發生錯誤時顯示通知。</string> |
10 | <string name="notification_permission_not_granted">未授予通知權限!</string> | 7 | <string name="notification_permission_not_granted">未授予通知權限!</string> |
diff --git a/src/android/app/src/main/res/values/strings.xml b/src/android/app/src/main/res/values/strings.xml index 3cd1586fd..489e00107 100755 --- a/src/android/app/src/main/res/values/strings.xml +++ b/src/android/app/src/main/res/values/strings.xml | |||
@@ -4,10 +4,6 @@ | |||
4 | <!-- General application strings --> | 4 | <!-- General application strings --> |
5 | <string name="app_name" translatable="false">yuzu</string> | 5 | <string name="app_name" translatable="false">yuzu</string> |
6 | <string name="app_disclaimer">This software will run games for the Nintendo Switch game console. No game titles or keys are included.<br /><br />Before you begin, please locate your <![CDATA[<b> prod.keys </b>]]> file on your device storage.<br /><br /><![CDATA[<a href="https://yuzu-emu.org/help/quickstart">Learn more</a>]]></string> | 6 | <string name="app_disclaimer">This software will run games for the Nintendo Switch game console. No game titles or keys are included.<br /><br />Before you begin, please locate your <![CDATA[<b> prod.keys </b>]]> file on your device storage.<br /><br /><![CDATA[<a href="https://yuzu-emu.org/help/quickstart">Learn more</a>]]></string> |
7 | <string name="emulation_notification_channel_name">Emulation is Active</string> | ||
8 | <string name="emulation_notification_channel_id" translatable="false">emulationIsActive</string> | ||
9 | <string name="emulation_notification_channel_description">Shows a persistent notification when emulation is running.</string> | ||
10 | <string name="emulation_notification_running">yuzu is running</string> | ||
11 | <string name="notice_notification_channel_name">Notices and errors</string> | 7 | <string name="notice_notification_channel_name">Notices and errors</string> |
12 | <string name="notice_notification_channel_id" translatable="false">noticesAndErrors</string> | 8 | <string name="notice_notification_channel_id" translatable="false">noticesAndErrors</string> |
13 | <string name="notice_notification_channel_description">Shows notifications when something goes wrong.</string> | 9 | <string name="notice_notification_channel_description">Shows notifications when something goes wrong.</string> |
@@ -380,6 +376,7 @@ | |||
380 | <string name="emulation_exit">Exit emulation</string> | 376 | <string name="emulation_exit">Exit emulation</string> |
381 | <string name="emulation_done">Done</string> | 377 | <string name="emulation_done">Done</string> |
382 | <string name="emulation_fps_counter">FPS counter</string> | 378 | <string name="emulation_fps_counter">FPS counter</string> |
379 | <string name="emulation_thermal_indicator">Thermal indicator</string> | ||
383 | <string name="emulation_toggle_controls">Toggle controls</string> | 380 | <string name="emulation_toggle_controls">Toggle controls</string> |
384 | <string name="emulation_rel_stick_center">Relative stick center</string> | 381 | <string name="emulation_rel_stick_center">Relative stick center</string> |
385 | <string name="emulation_dpad_slide">D-pad slide</string> | 382 | <string name="emulation_dpad_slide">D-pad slide</string> |
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 429dd3e26..1cede53b6 100755 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt | |||
@@ -182,9 +182,15 @@ endif() | |||
182 | 182 | ||
183 | if(ANDROID) | 183 | if(ANDROID) |
184 | target_sources(common | 184 | target_sources(common |
185 | PRIVATE | 185 | PUBLIC |
186 | fs/fs_android.cpp | 186 | fs/fs_android.cpp |
187 | fs/fs_android.h | 187 | fs/fs_android.h |
188 | android/android_common.cpp | ||
189 | android/android_common.h | ||
190 | android/id_cache.cpp | ||
191 | android/id_cache.h | ||
192 | android/applets/software_keyboard.cpp | ||
193 | android/applets/software_keyboard.h | ||
188 | ) | 194 | ) |
189 | endif() | 195 | endif() |
190 | 196 | ||
diff --git a/src/common/android/android_common.cpp b/src/common/android/android_common.cpp new file mode 100755 index 000000000..e79005658 --- /dev/null +++ b/src/common/android/android_common.cpp | |||
@@ -0,0 +1,65 @@ | |||
1 | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project | ||
2 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
3 | |||
4 | #include "android_common.h" | ||
5 | |||
6 | #include <string> | ||
7 | #include <string_view> | ||
8 | |||
9 | #include <jni.h> | ||
10 | |||
11 | #include "common/android/id_cache.h" | ||
12 | #include "common/string_util.h" | ||
13 | |||
14 | namespace Common::Android { | ||
15 | |||
16 | std::string GetJString(JNIEnv* env, jstring jstr) { | ||
17 | if (!jstr) { | ||
18 | return {}; | ||
19 | } | ||
20 | |||
21 | const jchar* jchars = env->GetStringChars(jstr, nullptr); | ||
22 | const jsize length = env->GetStringLength(jstr); | ||
23 | const std::u16string_view string_view(reinterpret_cast<const char16_t*>(jchars), | ||
24 | static_cast<u32>(length)); | ||
25 | const std::string converted_string = Common::UTF16ToUTF8(string_view); | ||
26 | env->ReleaseStringChars(jstr, jchars); | ||
27 | |||
28 | return converted_string; | ||
29 | } | ||
30 | |||
31 | jstring ToJString(JNIEnv* env, std::string_view str) { | ||
32 | const std::u16string converted_string = Common::UTF8ToUTF16(str); | ||
33 | return env->NewString(reinterpret_cast<const jchar*>(converted_string.data()), | ||
34 | static_cast<jint>(converted_string.size())); | ||
35 | } | ||
36 | |||
37 | jstring ToJString(JNIEnv* env, std::u16string_view str) { | ||
38 | return ToJString(env, Common::UTF16ToUTF8(str)); | ||
39 | } | ||
40 | |||
41 | double GetJDouble(JNIEnv* env, jobject jdouble) { | ||
42 | return env->GetDoubleField(jdouble, GetDoubleValueField()); | ||
43 | } | ||
44 | |||
45 | jobject ToJDouble(JNIEnv* env, double value) { | ||
46 | return env->NewObject(GetDoubleClass(), GetDoubleConstructor(), value); | ||
47 | } | ||
48 | |||
49 | s32 GetJInteger(JNIEnv* env, jobject jinteger) { | ||
50 | return env->GetIntField(jinteger, GetIntegerValueField()); | ||
51 | } | ||
52 | |||
53 | jobject ToJInteger(JNIEnv* env, s32 value) { | ||
54 | return env->NewObject(GetIntegerClass(), GetIntegerConstructor(), value); | ||
55 | } | ||
56 | |||
57 | bool GetJBoolean(JNIEnv* env, jobject jboolean) { | ||
58 | return env->GetBooleanField(jboolean, GetBooleanValueField()); | ||
59 | } | ||
60 | |||
61 | jobject ToJBoolean(JNIEnv* env, bool value) { | ||
62 | return env->NewObject(GetBooleanClass(), GetBooleanConstructor(), value); | ||
63 | } | ||
64 | |||
65 | } // namespace Common::Android | ||
diff --git a/src/common/android/android_common.h b/src/common/android/android_common.h new file mode 100755 index 000000000..d0ccb4ec2 --- /dev/null +++ b/src/common/android/android_common.h | |||
@@ -0,0 +1,26 @@ | |||
1 | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project | ||
2 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
3 | |||
4 | #pragma once | ||
5 | |||
6 | #include <string> | ||
7 | |||
8 | #include <jni.h> | ||
9 | #include "common/common_types.h" | ||
10 | |||
11 | namespace Common::Android { | ||
12 | |||
13 | std::string GetJString(JNIEnv* env, jstring jstr); | ||
14 | jstring ToJString(JNIEnv* env, std::string_view str); | ||
15 | jstring ToJString(JNIEnv* env, std::u16string_view str); | ||
16 | |||
17 | double GetJDouble(JNIEnv* env, jobject jdouble); | ||
18 | jobject ToJDouble(JNIEnv* env, double value); | ||
19 | |||
20 | s32 GetJInteger(JNIEnv* env, jobject jinteger); | ||
21 | jobject ToJInteger(JNIEnv* env, s32 value); | ||
22 | |||
23 | bool GetJBoolean(JNIEnv* env, jobject jboolean); | ||
24 | jobject ToJBoolean(JNIEnv* env, bool value); | ||
25 | |||
26 | } // namespace Common::Android | ||
diff --git a/src/common/android/applets/software_keyboard.cpp b/src/common/android/applets/software_keyboard.cpp new file mode 100755 index 000000000..477e62b16 --- /dev/null +++ b/src/common/android/applets/software_keyboard.cpp | |||
@@ -0,0 +1,277 @@ | |||
1 | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project | ||
2 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
3 | |||
4 | #include <map> | ||
5 | #include <thread> | ||
6 | |||
7 | #include <jni.h> | ||
8 | |||
9 | #include "common/android/android_common.h" | ||
10 | #include "common/android/applets/software_keyboard.h" | ||
11 | #include "common/android/id_cache.h" | ||
12 | #include "common/logging/log.h" | ||
13 | #include "common/string_util.h" | ||
14 | #include "core/core.h" | ||
15 | |||
// Cached global references and method IDs for the Java-side software keyboard.
// Initialized by InitJNI() (called from JNI_OnLoad) and released by CleanupJNI().
static jclass s_software_keyboard_class;
static jclass s_keyboard_config_class;
static jclass s_keyboard_data_class;
static jmethodID s_swkbd_execute_normal;
static jmethodID s_swkbd_execute_inline;
21 | |||
22 | namespace Common::Android::SoftwareKeyboard { | ||
23 | |||
24 | static jobject ToJKeyboardParams(const Core::Frontend::KeyboardInitializeParameters& config) { | ||
25 | JNIEnv* env = GetEnvForThread(); | ||
26 | jobject object = env->AllocObject(s_keyboard_config_class); | ||
27 | |||
28 | env->SetObjectField(object, | ||
29 | env->GetFieldID(s_keyboard_config_class, "ok_text", "Ljava/lang/String;"), | ||
30 | ToJString(env, config.ok_text)); | ||
31 | env->SetObjectField( | ||
32 | object, env->GetFieldID(s_keyboard_config_class, "header_text", "Ljava/lang/String;"), | ||
33 | ToJString(env, config.header_text)); | ||
34 | env->SetObjectField(object, | ||
35 | env->GetFieldID(s_keyboard_config_class, "sub_text", "Ljava/lang/String;"), | ||
36 | ToJString(env, config.sub_text)); | ||
37 | env->SetObjectField( | ||
38 | object, env->GetFieldID(s_keyboard_config_class, "guide_text", "Ljava/lang/String;"), | ||
39 | ToJString(env, config.guide_text)); | ||
40 | env->SetObjectField( | ||
41 | object, env->GetFieldID(s_keyboard_config_class, "initial_text", "Ljava/lang/String;"), | ||
42 | ToJString(env, config.initial_text)); | ||
43 | env->SetShortField(object, | ||
44 | env->GetFieldID(s_keyboard_config_class, "left_optional_symbol_key", "S"), | ||
45 | static_cast<jshort>(config.left_optional_symbol_key)); | ||
46 | env->SetShortField(object, | ||
47 | env->GetFieldID(s_keyboard_config_class, "right_optional_symbol_key", "S"), | ||
48 | static_cast<jshort>(config.right_optional_symbol_key)); | ||
49 | env->SetIntField(object, env->GetFieldID(s_keyboard_config_class, "max_text_length", "I"), | ||
50 | static_cast<jint>(config.max_text_length)); | ||
51 | env->SetIntField(object, env->GetFieldID(s_keyboard_config_class, "min_text_length", "I"), | ||
52 | static_cast<jint>(config.min_text_length)); | ||
53 | env->SetIntField(object, | ||
54 | env->GetFieldID(s_keyboard_config_class, "initial_cursor_position", "I"), | ||
55 | static_cast<jint>(config.initial_cursor_position)); | ||
56 | env->SetIntField(object, env->GetFieldID(s_keyboard_config_class, "type", "I"), | ||
57 | static_cast<jint>(config.type)); | ||
58 | env->SetIntField(object, env->GetFieldID(s_keyboard_config_class, "password_mode", "I"), | ||
59 | static_cast<jint>(config.password_mode)); | ||
60 | env->SetIntField(object, env->GetFieldID(s_keyboard_config_class, "text_draw_type", "I"), | ||
61 | static_cast<jint>(config.text_draw_type)); | ||
62 | env->SetIntField(object, env->GetFieldID(s_keyboard_config_class, "key_disable_flags", "I"), | ||
63 | static_cast<jint>(config.key_disable_flags.raw)); | ||
64 | env->SetBooleanField(object, | ||
65 | env->GetFieldID(s_keyboard_config_class, "use_blur_background", "Z"), | ||
66 | static_cast<jboolean>(config.use_blur_background)); | ||
67 | env->SetBooleanField(object, | ||
68 | env->GetFieldID(s_keyboard_config_class, "enable_backspace_button", "Z"), | ||
69 | static_cast<jboolean>(config.enable_backspace_button)); | ||
70 | env->SetBooleanField(object, | ||
71 | env->GetFieldID(s_keyboard_config_class, "enable_return_button", "Z"), | ||
72 | static_cast<jboolean>(config.enable_return_button)); | ||
73 | env->SetBooleanField(object, | ||
74 | env->GetFieldID(s_keyboard_config_class, "disable_cancel_button", "Z"), | ||
75 | static_cast<jboolean>(config.disable_cancel_button)); | ||
76 | |||
77 | return object; | ||
78 | } | ||
79 | |||
80 | AndroidKeyboard::ResultData AndroidKeyboard::ResultData::CreateFromFrontend(jobject object) { | ||
81 | JNIEnv* env = GetEnvForThread(); | ||
82 | const jstring string = reinterpret_cast<jstring>(env->GetObjectField( | ||
83 | object, env->GetFieldID(s_keyboard_data_class, "text", "Ljava/lang/String;"))); | ||
84 | return ResultData{GetJString(env, string), | ||
85 | static_cast<Service::AM::Frontend::SwkbdResult>(env->GetIntField( | ||
86 | object, env->GetFieldID(s_keyboard_data_class, "result", "I")))}; | ||
87 | } | ||
88 | |||
89 | AndroidKeyboard::~AndroidKeyboard() = default; | ||
90 | |||
91 | void AndroidKeyboard::InitializeKeyboard( | ||
92 | bool is_inline, Core::Frontend::KeyboardInitializeParameters initialize_parameters, | ||
93 | SubmitNormalCallback submit_normal_callback_, SubmitInlineCallback submit_inline_callback_) { | ||
94 | if (is_inline) { | ||
95 | LOG_WARNING( | ||
96 | Frontend, | ||
97 | "(STUBBED) called, backend requested to initialize the inline software keyboard."); | ||
98 | |||
99 | submit_inline_callback = std::move(submit_inline_callback_); | ||
100 | } else { | ||
101 | LOG_WARNING( | ||
102 | Frontend, | ||
103 | "(STUBBED) called, backend requested to initialize the normal software keyboard."); | ||
104 | |||
105 | submit_normal_callback = std::move(submit_normal_callback_); | ||
106 | } | ||
107 | |||
108 | parameters = std::move(initialize_parameters); | ||
109 | |||
110 | LOG_INFO(Frontend, | ||
111 | "\nKeyboardInitializeParameters:" | ||
112 | "\nok_text={}" | ||
113 | "\nheader_text={}" | ||
114 | "\nsub_text={}" | ||
115 | "\nguide_text={}" | ||
116 | "\ninitial_text={}" | ||
117 | "\nmax_text_length={}" | ||
118 | "\nmin_text_length={}" | ||
119 | "\ninitial_cursor_position={}" | ||
120 | "\ntype={}" | ||
121 | "\npassword_mode={}" | ||
122 | "\ntext_draw_type={}" | ||
123 | "\nkey_disable_flags={}" | ||
124 | "\nuse_blur_background={}" | ||
125 | "\nenable_backspace_button={}" | ||
126 | "\nenable_return_button={}" | ||
127 | "\ndisable_cancel_button={}", | ||
128 | Common::UTF16ToUTF8(parameters.ok_text), Common::UTF16ToUTF8(parameters.header_text), | ||
129 | Common::UTF16ToUTF8(parameters.sub_text), Common::UTF16ToUTF8(parameters.guide_text), | ||
130 | Common::UTF16ToUTF8(parameters.initial_text), parameters.max_text_length, | ||
131 | parameters.min_text_length, parameters.initial_cursor_position, parameters.type, | ||
132 | parameters.password_mode, parameters.text_draw_type, parameters.key_disable_flags.raw, | ||
133 | parameters.use_blur_background, parameters.enable_backspace_button, | ||
134 | parameters.enable_return_button, parameters.disable_cancel_button); | ||
135 | } | ||
136 | |||
137 | void AndroidKeyboard::ShowNormalKeyboard() const { | ||
138 | LOG_DEBUG(Frontend, "called, backend requested to show the normal software keyboard."); | ||
139 | |||
140 | ResultData data{}; | ||
141 | |||
142 | // Pivot to a new thread, as we cannot call GetEnvForThread() from a Fiber. | ||
143 | std::thread([&] { | ||
144 | data = ResultData::CreateFromFrontend(GetEnvForThread()->CallStaticObjectMethod( | ||
145 | s_software_keyboard_class, s_swkbd_execute_normal, ToJKeyboardParams(parameters))); | ||
146 | }).join(); | ||
147 | |||
148 | SubmitNormalText(data); | ||
149 | } | ||
150 | |||
// Stub: the Android frontend does not yet display the text-check dialog; the
// parameters are ignored.
void AndroidKeyboard::ShowTextCheckDialog(
    Service::AM::Frontend::SwkbdTextCheckResult text_check_result,
    std::u16string text_check_message) const {
    LOG_WARNING(Frontend, "(STUBBED) called, backend requested to show the text check dialog.");
}
156 | |||
157 | void AndroidKeyboard::ShowInlineKeyboard( | ||
158 | Core::Frontend::InlineAppearParameters appear_parameters) const { | ||
159 | LOG_WARNING(Frontend, | ||
160 | "(STUBBED) called, backend requested to show the inline software keyboard."); | ||
161 | |||
162 | LOG_INFO(Frontend, | ||
163 | "\nInlineAppearParameters:" | ||
164 | "\nmax_text_length={}" | ||
165 | "\nmin_text_length={}" | ||
166 | "\nkey_top_scale_x={}" | ||
167 | "\nkey_top_scale_y={}" | ||
168 | "\nkey_top_translate_x={}" | ||
169 | "\nkey_top_translate_y={}" | ||
170 | "\ntype={}" | ||
171 | "\nkey_disable_flags={}" | ||
172 | "\nkey_top_as_floating={}" | ||
173 | "\nenable_backspace_button={}" | ||
174 | "\nenable_return_button={}" | ||
175 | "\ndisable_cancel_button={}", | ||
176 | appear_parameters.max_text_length, appear_parameters.min_text_length, | ||
177 | appear_parameters.key_top_scale_x, appear_parameters.key_top_scale_y, | ||
178 | appear_parameters.key_top_translate_x, appear_parameters.key_top_translate_y, | ||
179 | appear_parameters.type, appear_parameters.key_disable_flags.raw, | ||
180 | appear_parameters.key_top_as_floating, appear_parameters.enable_backspace_button, | ||
181 | appear_parameters.enable_return_button, appear_parameters.disable_cancel_button); | ||
182 | |||
183 | // Pivot to a new thread, as we cannot call GetEnvForThread() from a Fiber. | ||
184 | m_is_inline_active = true; | ||
185 | std::thread([&] { | ||
186 | GetEnvForThread()->CallStaticVoidMethod(s_software_keyboard_class, s_swkbd_execute_inline, | ||
187 | ToJKeyboardParams(parameters)); | ||
188 | }).join(); | ||
189 | } | ||
190 | |||
// Stub: hiding the inline keyboard is not yet forwarded to the Java side.
void AndroidKeyboard::HideInlineKeyboard() const {
    LOG_WARNING(Frontend,
                "(STUBBED) called, backend requested to hide the inline software keyboard.");
}
195 | |||
196 | void AndroidKeyboard::InlineTextChanged( | ||
197 | Core::Frontend::InlineTextParameters text_parameters) const { | ||
198 | LOG_WARNING(Frontend, | ||
199 | "(STUBBED) called, backend requested to change the inline keyboard text."); | ||
200 | |||
201 | LOG_INFO(Frontend, | ||
202 | "\nInlineTextParameters:" | ||
203 | "\ninput_text={}" | ||
204 | "\ncursor_position={}", | ||
205 | Common::UTF16ToUTF8(text_parameters.input_text), text_parameters.cursor_position); | ||
206 | |||
207 | submit_inline_callback(Service::AM::Frontend::SwkbdReplyType::ChangedString, | ||
208 | text_parameters.input_text, text_parameters.cursor_position); | ||
209 | } | ||
210 | |||
// Stub: keyboard teardown is not yet forwarded to the Java side; called via Close().
void AndroidKeyboard::ExitKeyboard() const {
    LOG_WARNING(Frontend, "(STUBBED) called, backend requested to exit the software keyboard.");
}
214 | |||
215 | void AndroidKeyboard::SubmitInlineKeyboardText(std::u16string submitted_text) { | ||
216 | if (!m_is_inline_active) { | ||
217 | return; | ||
218 | } | ||
219 | |||
220 | m_current_text += submitted_text; | ||
221 | |||
222 | submit_inline_callback(Service::AM::Frontend::SwkbdReplyType::ChangedString, m_current_text, | ||
223 | static_cast<int>(m_current_text.size())); | ||
224 | } | ||
225 | |||
226 | void AndroidKeyboard::SubmitInlineKeyboardInput(int key_code) { | ||
227 | static constexpr int KEYCODE_BACK = 4; | ||
228 | static constexpr int KEYCODE_ENTER = 66; | ||
229 | static constexpr int KEYCODE_DEL = 67; | ||
230 | |||
231 | if (!m_is_inline_active) { | ||
232 | return; | ||
233 | } | ||
234 | |||
235 | switch (key_code) { | ||
236 | case KEYCODE_BACK: | ||
237 | case KEYCODE_ENTER: | ||
238 | m_is_inline_active = false; | ||
239 | submit_inline_callback(Service::AM::Frontend::SwkbdReplyType::DecidedEnter, m_current_text, | ||
240 | static_cast<s32>(m_current_text.size())); | ||
241 | break; | ||
242 | case KEYCODE_DEL: | ||
243 | m_current_text.pop_back(); | ||
244 | submit_inline_callback(Service::AM::Frontend::SwkbdReplyType::ChangedString, m_current_text, | ||
245 | static_cast<int>(m_current_text.size())); | ||
246 | break; | ||
247 | } | ||
248 | } | ||
249 | |||
// Forwards a completed normal-keyboard result to the stored callback,
// converting the dialog's UTF-8 text to UTF-16 for the backend.
void AndroidKeyboard::SubmitNormalText(const ResultData& data) const {
    submit_normal_callback(data.result, Common::UTF8ToUTF16(data.text), true);
}
253 | |||
// Resolves and caches global references to the Java-side SoftwareKeyboard
// classes and their static entry points. Must be called once from JNI_OnLoad.
void InitJNI(JNIEnv* env) {
    s_software_keyboard_class = reinterpret_cast<jclass>(
        env->NewGlobalRef(env->FindClass("org/yuzu/yuzu_emu/applets/keyboard/SoftwareKeyboard")));
    s_keyboard_config_class = reinterpret_cast<jclass>(env->NewGlobalRef(
        env->FindClass("org/yuzu/yuzu_emu/applets/keyboard/SoftwareKeyboard$KeyboardConfig")));
    s_keyboard_data_class = reinterpret_cast<jclass>(env->NewGlobalRef(
        env->FindClass("org/yuzu/yuzu_emu/applets/keyboard/SoftwareKeyboard$KeyboardData")));

    // Static methods that actually display the dialogs.
    s_swkbd_execute_normal = env->GetStaticMethodID(
        s_software_keyboard_class, "executeNormal",
        "(Lorg/yuzu/yuzu_emu/applets/keyboard/SoftwareKeyboard$KeyboardConfig;)Lorg/yuzu/yuzu_emu/"
        "applets/keyboard/SoftwareKeyboard$KeyboardData;");
    s_swkbd_execute_inline = env->GetStaticMethodID(
        s_software_keyboard_class, "executeInline",
        "(Lorg/yuzu/yuzu_emu/applets/keyboard/SoftwareKeyboard$KeyboardConfig;)V");
}
270 | |||
// Releases the global class references created by InitJNI(). Must be called
// from JNI_OnUnload.
void CleanupJNI(JNIEnv* env) {
    env->DeleteGlobalRef(s_software_keyboard_class);
    env->DeleteGlobalRef(s_keyboard_config_class);
    env->DeleteGlobalRef(s_keyboard_data_class);
}
276 | |||
277 | } // namespace Common::Android::SoftwareKeyboard | ||
diff --git a/src/common/android/applets/software_keyboard.h b/src/common/android/applets/software_keyboard.h new file mode 100755 index 000000000..9fd09d27c --- /dev/null +++ b/src/common/android/applets/software_keyboard.h | |||
@@ -0,0 +1,78 @@ | |||
1 | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project | ||
2 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
3 | |||
4 | #pragma once | ||
5 | |||
6 | #include <jni.h> | ||
7 | |||
8 | #include "core/frontend/applets/software_keyboard.h" | ||
9 | |||
10 | namespace Common::Android::SoftwareKeyboard { | ||
11 | |||
12 | class AndroidKeyboard final : public Core::Frontend::SoftwareKeyboardApplet { | ||
13 | public: | ||
14 | ~AndroidKeyboard() override; | ||
15 | |||
16 | void Close() const override { | ||
17 | ExitKeyboard(); | ||
18 | } | ||
19 | |||
20 | void InitializeKeyboard(bool is_inline, | ||
21 | Core::Frontend::KeyboardInitializeParameters initialize_parameters, | ||
22 | SubmitNormalCallback submit_normal_callback_, | ||
23 | SubmitInlineCallback submit_inline_callback_) override; | ||
24 | |||
25 | void ShowNormalKeyboard() const override; | ||
26 | |||
27 | void ShowTextCheckDialog(Service::AM::Frontend::SwkbdTextCheckResult text_check_result, | ||
28 | std::u16string text_check_message) const override; | ||
29 | |||
30 | void ShowInlineKeyboard( | ||
31 | Core::Frontend::InlineAppearParameters appear_parameters) const override; | ||
32 | |||
33 | void HideInlineKeyboard() const override; | ||
34 | |||
35 | void InlineTextChanged(Core::Frontend::InlineTextParameters text_parameters) const override; | ||
36 | |||
37 | void ExitKeyboard() const override; | ||
38 | |||
39 | void SubmitInlineKeyboardText(std::u16string submitted_text); | ||
40 | |||
41 | void SubmitInlineKeyboardInput(int key_code); | ||
42 | |||
43 | private: | ||
44 | struct ResultData { | ||
45 | static ResultData CreateFromFrontend(jobject object); | ||
46 | |||
47 | std::string text; | ||
48 | Service::AM::Frontend::SwkbdResult result{}; | ||
49 | }; | ||
50 | |||
51 | void SubmitNormalText(const ResultData& result) const; | ||
52 | |||
53 | Core::Frontend::KeyboardInitializeParameters parameters{}; | ||
54 | |||
55 | mutable SubmitNormalCallback submit_normal_callback; | ||
56 | mutable SubmitInlineCallback submit_inline_callback; | ||
57 | |||
58 | private: | ||
59 | mutable bool m_is_inline_active{}; | ||
60 | std::u16string m_current_text; | ||
61 | }; | ||
62 | |||
63 | // Should be called in JNI_Load | ||
64 | void InitJNI(JNIEnv* env); | ||
65 | |||
66 | // Should be called in JNI_Unload | ||
67 | void CleanupJNI(JNIEnv* env); | ||
68 | |||
69 | } // namespace Common::Android::SoftwareKeyboard | ||
70 | |||
71 | // Native function calls | ||
72 | extern "C" { | ||
73 | JNIEXPORT jobject JNICALL Java_org_citra_citra_1emu_applets_SoftwareKeyboard_ValidateFilters( | ||
74 | JNIEnv* env, jclass clazz, jstring text); | ||
75 | |||
76 | JNIEXPORT jobject JNICALL Java_org_citra_citra_1emu_applets_SoftwareKeyboard_ValidateInput( | ||
77 | JNIEnv* env, jclass clazz, jstring text); | ||
78 | } | ||
diff --git a/src/common/android/id_cache.cpp b/src/common/android/id_cache.cpp new file mode 100755 index 000000000..f39262db9 --- /dev/null +++ b/src/common/android/id_cache.cpp | |||
@@ -0,0 +1,428 @@ | |||
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#include <jni.h>

#include "applets/software_keyboard.h"
#include "common/android/id_cache.h"
#include "common/assert.h"
#include "common/fs/fs_android.h"
#include "video_core/rasterizer_interface.h"

// Global references, method IDs and field IDs cached at JNI_OnLoad time so
// native code never has to re-resolve them per call. Released in JNI_OnUnload.

// NativeLibrary / emulation lifecycle callbacks.
static JavaVM* s_java_vm;
static jclass s_native_library_class;
static jclass s_disk_cache_progress_class;
static jclass s_load_callback_stage_class;
static jclass s_game_dir_class;
static jmethodID s_game_dir_constructor;
static jmethodID s_exit_emulation_activity;
static jmethodID s_disk_cache_load_progress;
static jmethodID s_on_emulation_started;
static jmethodID s_on_emulation_stopped;
static jmethodID s_on_program_changed;

// org.yuzu.yuzu_emu.model.Game
static jclass s_game_class;
static jmethodID s_game_constructor;
static jfieldID s_game_title_field;
static jfieldID s_game_path_field;
static jfieldID s_game_program_id_field;
static jfieldID s_game_developer_field;
static jfieldID s_game_version_field;
static jfieldID s_game_is_homebrew_field;

// java.lang.String and kotlin.Pair
static jclass s_string_class;
static jclass s_pair_class;
static jmethodID s_pair_constructor;
static jfieldID s_pair_first_field;
static jfieldID s_pair_second_field;

// org.yuzu.yuzu_emu.overlay.model.OverlayControlData
static jclass s_overlay_control_data_class;
static jmethodID s_overlay_control_data_constructor;
static jfieldID s_overlay_control_data_id_field;
static jfieldID s_overlay_control_data_enabled_field;
static jfieldID s_overlay_control_data_landscape_position_field;
static jfieldID s_overlay_control_data_portrait_position_field;
static jfieldID s_overlay_control_data_foldable_position_field;

// org.yuzu.yuzu_emu.model.Patch
static jclass s_patch_class;
static jmethodID s_patch_constructor;
static jfieldID s_patch_enabled_field;
static jfieldID s_patch_name_field;
static jfieldID s_patch_version_field;
static jfieldID s_patch_type_field;
static jfieldID s_patch_program_id_field;
static jfieldID s_patch_title_id_field;

// Boxed java.lang.{Double,Integer,Boolean}
static jclass s_double_class;
static jmethodID s_double_constructor;
static jfieldID s_double_value_field;

static jclass s_integer_class;
static jmethodID s_integer_constructor;
static jfieldID s_integer_value_field;

static jclass s_boolean_class;
static jmethodID s_boolean_constructor;
static jfieldID s_boolean_value_field;

// JNI version requested from the VM in JNI_OnLoad/JNI_OnUnload.
static constexpr jint JNI_VERSION = JNI_VERSION_1_6;
69 | |||
70 | namespace Common::Android { | ||
71 | |||
// Returns a JNIEnv usable on the calling thread, attaching the thread to the
// JVM on first use. The thread_local destructor detaches again, but only if
// this function performed the attach (status == JNI_EDETACHED); threads that
// were already attached (e.g. Java-created threads) are left alone.
JNIEnv* GetEnvForThread() {
    thread_local static struct OwnedEnv {
        OwnedEnv() {
            status = s_java_vm->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION_1_6);
            if (status == JNI_EDETACHED)
                s_java_vm->AttachCurrentThread(&env, nullptr);
        }

        ~OwnedEnv() {
            if (status == JNI_EDETACHED)
                s_java_vm->DetachCurrentThread();
        }

        // Result of GetEnv; JNI_EDETACHED means we attached ourselves.
        int status;
        JNIEnv* env = nullptr;
    } owned;
    return owned.env;
}
90 | |||
// Trivial accessors for the cached JNI IDs above. All values are resolved in
// JNI_OnLoad and remain valid until JNI_OnUnload.

// NativeLibrary / emulation lifecycle.
jclass GetNativeLibraryClass() {
    return s_native_library_class;
}

jclass GetDiskCacheProgressClass() {
    return s_disk_cache_progress_class;
}

jclass GetDiskCacheLoadCallbackStageClass() {
    return s_load_callback_stage_class;
}

jclass GetGameDirClass() {
    return s_game_dir_class;
}

jmethodID GetGameDirConstructor() {
    return s_game_dir_constructor;
}

jmethodID GetExitEmulationActivity() {
    return s_exit_emulation_activity;
}

jmethodID GetDiskCacheLoadProgress() {
    return s_disk_cache_load_progress;
}

jmethodID GetOnEmulationStarted() {
    return s_on_emulation_started;
}

jmethodID GetOnEmulationStopped() {
    return s_on_emulation_stopped;
}

jmethodID GetOnProgramChanged() {
    return s_on_program_changed;
}

// Game model.
jclass GetGameClass() {
    return s_game_class;
}

jmethodID GetGameConstructor() {
    return s_game_constructor;
}

jfieldID GetGameTitleField() {
    return s_game_title_field;
}

jfieldID GetGamePathField() {
    return s_game_path_field;
}

jfieldID GetGameProgramIdField() {
    return s_game_program_id_field;
}

jfieldID GetGameDeveloperField() {
    return s_game_developer_field;
}

jfieldID GetGameVersionField() {
    return s_game_version_field;
}

jfieldID GetGameIsHomebrewField() {
    return s_game_is_homebrew_field;
}

// java.lang.String / kotlin.Pair.
jclass GetStringClass() {
    return s_string_class;
}

jclass GetPairClass() {
    return s_pair_class;
}

jmethodID GetPairConstructor() {
    return s_pair_constructor;
}

jfieldID GetPairFirstField() {
    return s_pair_first_field;
}

jfieldID GetPairSecondField() {
    return s_pair_second_field;
}

// OverlayControlData model.
jclass GetOverlayControlDataClass() {
    return s_overlay_control_data_class;
}

jmethodID GetOverlayControlDataConstructor() {
    return s_overlay_control_data_constructor;
}

jfieldID GetOverlayControlDataIdField() {
    return s_overlay_control_data_id_field;
}

jfieldID GetOverlayControlDataEnabledField() {
    return s_overlay_control_data_enabled_field;
}

jfieldID GetOverlayControlDataLandscapePositionField() {
    return s_overlay_control_data_landscape_position_field;
}

jfieldID GetOverlayControlDataPortraitPositionField() {
    return s_overlay_control_data_portrait_position_field;
}

jfieldID GetOverlayControlDataFoldablePositionField() {
    return s_overlay_control_data_foldable_position_field;
}

// Patch model.
jclass GetPatchClass() {
    return s_patch_class;
}

jmethodID GetPatchConstructor() {
    return s_patch_constructor;
}

jfieldID GetPatchEnabledField() {
    return s_patch_enabled_field;
}

jfieldID GetPatchNameField() {
    return s_patch_name_field;
}

jfieldID GetPatchVersionField() {
    return s_patch_version_field;
}

jfieldID GetPatchTypeField() {
    return s_patch_type_field;
}

jfieldID GetPatchProgramIdField() {
    return s_patch_program_id_field;
}

jfieldID GetPatchTitleIdField() {
    return s_patch_title_id_field;
}

// Boxed java.lang.{Double,Integer,Boolean}.
jclass GetDoubleClass() {
    return s_double_class;
}

jmethodID GetDoubleConstructor() {
    return s_double_constructor;
}

jfieldID GetDoubleValueField() {
    return s_double_value_field;
}

jclass GetIntegerClass() {
    return s_integer_class;
}

jmethodID GetIntegerConstructor() {
    return s_integer_constructor;
}

jfieldID GetIntegerValueField() {
    return s_integer_value_field;
}

jclass GetBooleanClass() {
    return s_boolean_class;
}

jmethodID GetBooleanConstructor() {
    return s_boolean_constructor;
}

jfieldID GetBooleanValueField() {
    return s_boolean_value_field;
}
278 | |||
279 | #ifdef __cplusplus | ||
280 | extern "C" { | ||
281 | #endif | ||
282 | |||
// Entry point called by the Android runtime when the native library is loaded.
// Caches global references, method IDs and field IDs for every Java type the
// native side talks to, then registers storage callbacks and applet JNI.
// Returns the JNI version on success or JNI_ERR if the env is unavailable.
jint JNI_OnLoad(JavaVM* vm, void* reserved) {
    s_java_vm = vm;

    JNIEnv* env;
    if (vm->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION) != JNI_OK)
        return JNI_ERR;

    // Initialize Java classes
    const jclass native_library_class = env->FindClass("org/yuzu/yuzu_emu/NativeLibrary");
    s_native_library_class = reinterpret_cast<jclass>(env->NewGlobalRef(native_library_class));
    s_disk_cache_progress_class = reinterpret_cast<jclass>(env->NewGlobalRef(
        env->FindClass("org/yuzu/yuzu_emu/disk_shader_cache/DiskShaderCacheProgress")));
    s_load_callback_stage_class = reinterpret_cast<jclass>(env->NewGlobalRef(env->FindClass(
        "org/yuzu/yuzu_emu/disk_shader_cache/DiskShaderCacheProgress$LoadCallbackStage")));

    const jclass game_dir_class = env->FindClass("org/yuzu/yuzu_emu/model/GameDir");
    s_game_dir_class = reinterpret_cast<jclass>(env->NewGlobalRef(game_dir_class));
    s_game_dir_constructor = env->GetMethodID(game_dir_class, "<init>", "(Ljava/lang/String;Z)V");
    env->DeleteLocalRef(game_dir_class);

    // Initialize methods
    s_exit_emulation_activity =
        env->GetStaticMethodID(s_native_library_class, "exitEmulationActivity", "(I)V");
    s_disk_cache_load_progress =
        env->GetStaticMethodID(s_disk_cache_progress_class, "loadProgress", "(III)V");
    s_on_emulation_started =
        env->GetStaticMethodID(s_native_library_class, "onEmulationStarted", "()V");
    s_on_emulation_stopped =
        env->GetStaticMethodID(s_native_library_class, "onEmulationStopped", "(I)V");
    s_on_program_changed =
        env->GetStaticMethodID(s_native_library_class, "onProgramChanged", "(I)V");

    // Game model: constructor takes (title, path, programId, developer, version, isHomebrew).
    const jclass game_class = env->FindClass("org/yuzu/yuzu_emu/model/Game");
    s_game_class = reinterpret_cast<jclass>(env->NewGlobalRef(game_class));
    s_game_constructor = env->GetMethodID(game_class, "<init>",
                                          "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/"
                                          "String;Ljava/lang/String;Ljava/lang/String;Z)V");
    s_game_title_field = env->GetFieldID(game_class, "title", "Ljava/lang/String;");
    s_game_path_field = env->GetFieldID(game_class, "path", "Ljava/lang/String;");
    s_game_program_id_field = env->GetFieldID(game_class, "programId", "Ljava/lang/String;");
    s_game_developer_field = env->GetFieldID(game_class, "developer", "Ljava/lang/String;");
    s_game_version_field = env->GetFieldID(game_class, "version", "Ljava/lang/String;");
    s_game_is_homebrew_field = env->GetFieldID(game_class, "isHomebrew", "Z");
    env->DeleteLocalRef(game_class);

    const jclass string_class = env->FindClass("java/lang/String");
    s_string_class = reinterpret_cast<jclass>(env->NewGlobalRef(string_class));
    env->DeleteLocalRef(string_class);

    const jclass pair_class = env->FindClass("kotlin/Pair");
    s_pair_class = reinterpret_cast<jclass>(env->NewGlobalRef(pair_class));
    s_pair_constructor =
        env->GetMethodID(pair_class, "<init>", "(Ljava/lang/Object;Ljava/lang/Object;)V");
    s_pair_first_field = env->GetFieldID(pair_class, "first", "Ljava/lang/Object;");
    s_pair_second_field = env->GetFieldID(pair_class, "second", "Ljava/lang/Object;");
    env->DeleteLocalRef(pair_class);

    const jclass overlay_control_data_class =
        env->FindClass("org/yuzu/yuzu_emu/overlay/model/OverlayControlData");
    s_overlay_control_data_class =
        reinterpret_cast<jclass>(env->NewGlobalRef(overlay_control_data_class));
    s_overlay_control_data_constructor =
        env->GetMethodID(overlay_control_data_class, "<init>",
                         "(Ljava/lang/String;ZLkotlin/Pair;Lkotlin/Pair;Lkotlin/Pair;)V");
    s_overlay_control_data_id_field =
        env->GetFieldID(overlay_control_data_class, "id", "Ljava/lang/String;");
    s_overlay_control_data_enabled_field =
        env->GetFieldID(overlay_control_data_class, "enabled", "Z");
    s_overlay_control_data_landscape_position_field =
        env->GetFieldID(overlay_control_data_class, "landscapePosition", "Lkotlin/Pair;");
    s_overlay_control_data_portrait_position_field =
        env->GetFieldID(overlay_control_data_class, "portraitPosition", "Lkotlin/Pair;");
    s_overlay_control_data_foldable_position_field =
        env->GetFieldID(overlay_control_data_class, "foldablePosition", "Lkotlin/Pair;");
    env->DeleteLocalRef(overlay_control_data_class);

    const jclass patch_class = env->FindClass("org/yuzu/yuzu_emu/model/Patch");
    s_patch_class = reinterpret_cast<jclass>(env->NewGlobalRef(patch_class));
    s_patch_constructor = env->GetMethodID(
        patch_class, "<init>",
        "(ZLjava/lang/String;Ljava/lang/String;ILjava/lang/String;Ljava/lang/String;)V");
    s_patch_enabled_field = env->GetFieldID(patch_class, "enabled", "Z");
    s_patch_name_field = env->GetFieldID(patch_class, "name", "Ljava/lang/String;");
    s_patch_version_field = env->GetFieldID(patch_class, "version", "Ljava/lang/String;");
    s_patch_type_field = env->GetFieldID(patch_class, "type", "I");
    s_patch_program_id_field = env->GetFieldID(patch_class, "programId", "Ljava/lang/String;");
    s_patch_title_id_field = env->GetFieldID(patch_class, "titleId", "Ljava/lang/String;");
    env->DeleteLocalRef(patch_class);

    // Boxed primitive wrappers (Double/Integer/Boolean).
    const jclass double_class = env->FindClass("java/lang/Double");
    s_double_class = reinterpret_cast<jclass>(env->NewGlobalRef(double_class));
    s_double_constructor = env->GetMethodID(double_class, "<init>", "(D)V");
    s_double_value_field = env->GetFieldID(double_class, "value", "D");
    env->DeleteLocalRef(double_class);

    const jclass int_class = env->FindClass("java/lang/Integer");
    s_integer_class = reinterpret_cast<jclass>(env->NewGlobalRef(int_class));
    s_integer_constructor = env->GetMethodID(int_class, "<init>", "(I)V");
    s_integer_value_field = env->GetFieldID(int_class, "value", "I");
    env->DeleteLocalRef(int_class);

    const jclass boolean_class = env->FindClass("java/lang/Boolean");
    s_boolean_class = reinterpret_cast<jclass>(env->NewGlobalRef(boolean_class));
    s_boolean_constructor = env->GetMethodID(boolean_class, "<init>", "(Z)V");
    s_boolean_value_field = env->GetFieldID(boolean_class, "value", "Z");
    env->DeleteLocalRef(boolean_class);

    // Initialize Android Storage
    Common::FS::Android::RegisterCallbacks(env, s_native_library_class);

    // Initialize applets
    Common::Android::SoftwareKeyboard::InitJNI(env);

    return JNI_VERSION;
}
398 | |||
// Entry point called by the Android runtime when the native library is
// unloaded: unregisters storage callbacks and releases every global reference
// created in JNI_OnLoad.
void JNI_OnUnload(JavaVM* vm, void* reserved) {
    JNIEnv* env;
    if (vm->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION) != JNI_OK) {
        return;
    }

    // UnInitialize Android Storage
    Common::FS::Android::UnRegisterCallbacks();
    env->DeleteGlobalRef(s_native_library_class);
    env->DeleteGlobalRef(s_disk_cache_progress_class);
    env->DeleteGlobalRef(s_load_callback_stage_class);
    env->DeleteGlobalRef(s_game_dir_class);
    env->DeleteGlobalRef(s_game_class);
    env->DeleteGlobalRef(s_string_class);
    env->DeleteGlobalRef(s_pair_class);
    env->DeleteGlobalRef(s_overlay_control_data_class);
    env->DeleteGlobalRef(s_patch_class);
    env->DeleteGlobalRef(s_double_class);
    env->DeleteGlobalRef(s_integer_class);
    env->DeleteGlobalRef(s_boolean_class);

    // UnInitialize applets
    SoftwareKeyboard::CleanupJNI(env);
}
423 | |||
424 | #ifdef __cplusplus | ||
425 | } | ||
426 | #endif | ||
427 | |||
428 | } // namespace Common::Android | ||
diff --git a/src/common/android/id_cache.h b/src/common/android/id_cache.h new file mode 100755 index 000000000..47802f96c --- /dev/null +++ b/src/common/android/id_cache.h | |||
@@ -0,0 +1,88 @@ | |||
1 | // SPDX-FileCopyrightText: 2023 yuzu Emulator Project | ||
2 | // SPDX-License-Identifier: GPL-3.0-or-later | ||
3 | |||
4 | #pragma once | ||
5 | |||
6 | #include <future> | ||
7 | #include <jni.h> | ||
8 | |||
9 | #include "video_core/rasterizer_interface.h" | ||
10 | |||
11 | namespace Common::Android { | ||
12 | |||
13 | JNIEnv* GetEnvForThread(); | ||
14 | |||
15 | /** | ||
16 | * Starts a new thread to run JNI. Intended to be used when you must run JNI from a fiber. | ||
17 | * @tparam T Typename of the return value for the work param | ||
18 | * @param work Lambda that runs JNI code. This function will take care of attaching this thread to | ||
19 | * the JVM | ||
20 | * @return The result from the work lambda param | ||
21 | */ | ||
22 | template <typename T = void> | ||
23 | T RunJNIOnFiber(const std::function<T(JNIEnv*)>& work) { | ||
24 | std::future<T> j_result = std::async(std::launch::async, [&] { | ||
25 | auto env = GetEnvForThread(); | ||
26 | return work(env); | ||
27 | }); | ||
28 | return j_result.get(); | ||
29 | } | ||
30 | |||
31 | jclass GetNativeLibraryClass(); | ||
32 | |||
33 | jclass GetDiskCacheProgressClass(); | ||
34 | jclass GetDiskCacheLoadCallbackStageClass(); | ||
35 | jclass GetGameDirClass(); | ||
36 | jmethodID GetGameDirConstructor(); | ||
37 | jmethodID GetDiskCacheLoadProgress(); | ||
38 | |||
39 | jmethodID GetExitEmulationActivity(); | ||
40 | jmethodID GetOnEmulationStarted(); | ||
41 | jmethodID GetOnEmulationStopped(); | ||
42 | jmethodID GetOnProgramChanged(); | ||
43 | |||
44 | jclass GetGameClass(); | ||
45 | jmethodID GetGameConstructor(); | ||
46 | jfieldID GetGameTitleField(); | ||
47 | jfieldID GetGamePathField(); | ||
48 | jfieldID GetGameProgramIdField(); | ||
49 | jfieldID GetGameDeveloperField(); | ||
50 | jfieldID GetGameVersionField(); | ||
51 | jfieldID GetGameIsHomebrewField(); | ||
52 | |||
53 | jclass GetStringClass(); | ||
54 | jclass GetPairClass(); | ||
55 | jmethodID GetPairConstructor(); | ||
56 | jfieldID GetPairFirstField(); | ||
57 | jfieldID GetPairSecondField(); | ||
58 | |||
59 | jclass GetOverlayControlDataClass(); | ||
60 | jmethodID GetOverlayControlDataConstructor(); | ||
61 | jfieldID GetOverlayControlDataIdField(); | ||
62 | jfieldID GetOverlayControlDataEnabledField(); | ||
63 | jfieldID GetOverlayControlDataLandscapePositionField(); | ||
64 | jfieldID GetOverlayControlDataPortraitPositionField(); | ||
65 | jfieldID GetOverlayControlDataFoldablePositionField(); | ||
66 | |||
67 | jclass GetPatchClass(); | ||
68 | jmethodID GetPatchConstructor(); | ||
69 | jfieldID GetPatchEnabledField(); | ||
70 | jfieldID GetPatchNameField(); | ||
71 | jfieldID GetPatchVersionField(); | ||
72 | jfieldID GetPatchTypeField(); | ||
73 | jfieldID GetPatchProgramIdField(); | ||
74 | jfieldID GetPatchTitleIdField(); | ||
75 | |||
76 | jclass GetDoubleClass(); | ||
77 | jmethodID GetDoubleConstructor(); | ||
78 | jfieldID GetDoubleValueField(); | ||
79 | |||
80 | jclass GetIntegerClass(); | ||
81 | jmethodID GetIntegerConstructor(); | ||
82 | jfieldID GetIntegerValueField(); | ||
83 | |||
84 | jclass GetBooleanClass(); | ||
85 | jmethodID GetBooleanConstructor(); | ||
86 | jfieldID GetBooleanValueField(); | ||
87 | |||
88 | } // namespace Common::Android | ||
diff --git a/src/common/fs/fs_android.cpp b/src/common/fs/fs_android.cpp index 1dd826a4a..9a8053222 100755 --- a/src/common/fs/fs_android.cpp +++ b/src/common/fs/fs_android.cpp | |||
@@ -1,63 +1,38 @@ | |||
1 | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project | 1 | // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project |
2 | // SPDX-License-Identifier: GPL-2.0-or-later | 2 | // SPDX-License-Identifier: GPL-2.0-or-later |
3 | 3 | ||
4 | #include "common/android/android_common.h" | ||
5 | #include "common/android/id_cache.h" | ||
6 | #include "common/assert.h" | ||
4 | #include "common/fs/fs_android.h" | 7 | #include "common/fs/fs_android.h" |
5 | #include "common/string_util.h" | 8 | #include "common/string_util.h" |
6 | 9 | ||
7 | namespace Common::FS::Android { | 10 | namespace Common::FS::Android { |
8 | 11 | ||
9 | JNIEnv* GetEnvForThread() { | ||
10 | thread_local static struct OwnedEnv { | ||
11 | OwnedEnv() { | ||
12 | status = g_jvm->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION_1_6); | ||
13 | if (status == JNI_EDETACHED) | ||
14 | g_jvm->AttachCurrentThread(&env, nullptr); | ||
15 | } | ||
16 | |||
17 | ~OwnedEnv() { | ||
18 | if (status == JNI_EDETACHED) | ||
19 | g_jvm->DetachCurrentThread(); | ||
20 | } | ||
21 | |||
22 | int status; | ||
23 | JNIEnv* env = nullptr; | ||
24 | } owned; | ||
25 | return owned.env; | ||
26 | } | ||
27 | |||
28 | void RegisterCallbacks(JNIEnv* env, jclass clazz) { | 12 | void RegisterCallbacks(JNIEnv* env, jclass clazz) { |
29 | env->GetJavaVM(&g_jvm); | 13 | env->GetJavaVM(&g_jvm); |
30 | native_library = clazz; | 14 | native_library = clazz; |
31 | 15 | ||
32 | #define FH(FunctionName, JMethodID, Caller, JMethodName, Signature) \ | 16 | s_get_parent_directory = env->GetStaticMethodID(native_library, "getParentDirectory", |
33 | F(JMethodID, JMethodName, Signature) | 17 | "(Ljava/lang/String;)Ljava/lang/String;"); |
34 | #define FR(FunctionName, ReturnValue, JMethodID, Caller, JMethodName, Signature) \ | 18 | s_get_filename = env->GetStaticMethodID(native_library, "getFilename", |
35 | F(JMethodID, JMethodName, Signature) | 19 | "(Ljava/lang/String;)Ljava/lang/String;"); |
36 | #define FS(FunctionName, ReturnValue, Parameters, JMethodID, JMethodName, Signature) \ | 20 | s_get_size = env->GetStaticMethodID(native_library, "getSize", "(Ljava/lang/String;)J"); |
37 | F(JMethodID, JMethodName, Signature) | 21 | s_is_directory = env->GetStaticMethodID(native_library, "isDirectory", "(Ljava/lang/String;)Z"); |
38 | #define F(JMethodID, JMethodName, Signature) \ | 22 | s_file_exists = env->GetStaticMethodID(native_library, "exists", "(Ljava/lang/String;)Z"); |
39 | JMethodID = env->GetStaticMethodID(native_library, JMethodName, Signature); | 23 | s_open_content_uri = env->GetStaticMethodID(native_library, "openContentUri", |
40 | ANDROID_SINGLE_PATH_HELPER_FUNCTIONS(FH) | 24 | "(Ljava/lang/String;Ljava/lang/String;)I"); |
41 | ANDROID_SINGLE_PATH_DETERMINE_FUNCTIONS(FR) | ||
42 | ANDROID_STORAGE_FUNCTIONS(FS) | ||
43 | #undef F | ||
44 | #undef FS | ||
45 | #undef FR | ||
46 | #undef FH | ||
47 | } | 25 | } |
48 | 26 | ||
49 | void UnRegisterCallbacks() { | 27 | void UnRegisterCallbacks() { |
50 | #define FH(FunctionName, JMethodID, Caller, JMethodName, Signature) F(JMethodID) | 28 | s_get_parent_directory = nullptr; |
51 | #define FR(FunctionName, ReturnValue, JMethodID, Caller, JMethodName, Signature) F(JMethodID) | 29 | s_get_filename = nullptr; |
52 | #define FS(FunctionName, ReturnValue, Parameters, JMethodID, JMethodName, Signature) F(JMethodID) | 30 | |
53 | #define F(JMethodID) JMethodID = nullptr; | 31 | s_get_size = nullptr; |
54 | ANDROID_SINGLE_PATH_HELPER_FUNCTIONS(FH) | 32 | s_is_directory = nullptr; |
55 | ANDROID_SINGLE_PATH_DETERMINE_FUNCTIONS(FR) | 33 | s_file_exists = nullptr; |
56 | ANDROID_STORAGE_FUNCTIONS(FS) | 34 | |
57 | #undef F | 35 | s_open_content_uri = nullptr; |
58 | #undef FS | ||
59 | #undef FR | ||
60 | #undef FH | ||
61 | } | 36 | } |
62 | 37 | ||
63 | bool IsContentUri(const std::string& path) { | 38 | bool IsContentUri(const std::string& path) { |
@@ -69,8 +44,8 @@ bool IsContentUri(const std::string& path) { | |||
69 | return path.find(prefix) == 0; | 44 | return path.find(prefix) == 0; |
70 | } | 45 | } |
71 | 46 | ||
72 | int OpenContentUri(const std::string& filepath, OpenMode openmode) { | 47 | s32 OpenContentUri(const std::string& filepath, OpenMode openmode) { |
73 | if (open_content_uri == nullptr) | 48 | if (s_open_content_uri == nullptr) |
74 | return -1; | 49 | return -1; |
75 | 50 | ||
76 | const char* mode = ""; | 51 | const char* mode = ""; |
@@ -82,50 +57,66 @@ int OpenContentUri(const std::string& filepath, OpenMode openmode) { | |||
82 | UNIMPLEMENTED(); | 57 | UNIMPLEMENTED(); |
83 | return -1; | 58 | return -1; |
84 | } | 59 | } |
85 | auto env = GetEnvForThread(); | 60 | auto env = Common::Android::GetEnvForThread(); |
86 | jstring j_filepath = env->NewStringUTF(filepath.c_str()); | 61 | jstring j_filepath = Common::Android::ToJString(env, filepath); |
87 | jstring j_mode = env->NewStringUTF(mode); | 62 | jstring j_mode = Common::Android::ToJString(env, mode); |
88 | return env->CallStaticIntMethod(native_library, open_content_uri, j_filepath, j_mode); | 63 | return env->CallStaticIntMethod(native_library, s_open_content_uri, j_filepath, j_mode); |
64 | } | ||
65 | |||
66 | u64 GetSize(const std::string& filepath) { | ||
67 | if (s_get_size == nullptr) { | ||
68 | return 0; | ||
69 | } | ||
70 | auto env = Common::Android::GetEnvForThread(); | ||
71 | return static_cast<u64>(env->CallStaticLongMethod( | ||
72 | native_library, s_get_size, | ||
73 | Common::Android::ToJString(Common::Android::GetEnvForThread(), filepath))); | ||
74 | } | ||
75 | |||
76 | bool IsDirectory(const std::string& filepath) { | ||
77 | if (s_is_directory == nullptr) { | ||
78 | return 0; | ||
79 | } | ||
80 | auto env = Common::Android::GetEnvForThread(); | ||
81 | return env->CallStaticBooleanMethod( | ||
82 | native_library, s_is_directory, | ||
83 | Common::Android::ToJString(Common::Android::GetEnvForThread(), filepath)); | ||
89 | } | 84 | } |
90 | 85 | ||
91 | #define FR(FunctionName, ReturnValue, JMethodID, Caller, JMethodName, Signature) \ | 86 | bool Exists(const std::string& filepath) { |
92 | F(FunctionName, ReturnValue, JMethodID, Caller) | 87 | if (s_file_exists == nullptr) { |
93 | #define F(FunctionName, ReturnValue, JMethodID, Caller) \ | 88 | return 0; |
94 | ReturnValue FunctionName(const std::string& filepath) { \ | ||
95 | if (JMethodID == nullptr) { \ | ||
96 | return 0; \ | ||
97 | } \ | ||
98 | auto env = GetEnvForThread(); \ | ||
99 | jstring j_filepath = env->NewStringUTF(filepath.c_str()); \ | ||
100 | return env->Caller(native_library, JMethodID, j_filepath); \ | ||
101 | } | 89 | } |
102 | ANDROID_SINGLE_PATH_DETERMINE_FUNCTIONS(FR) | 90 | auto env = Common::Android::GetEnvForThread(); |
103 | #undef F | 91 | return env->CallStaticBooleanMethod( |
104 | #undef FR | 92 | native_library, s_file_exists, |
105 | 93 | Common::Android::ToJString(Common::Android::GetEnvForThread(), filepath)); | |
106 | #define FH(FunctionName, JMethodID, Caller, JMethodName, Signature) \ | 94 | } |
107 | F(FunctionName, JMethodID, Caller) | 95 | |
108 | #define F(FunctionName, JMethodID, Caller) \ | 96 | std::string GetParentDirectory(const std::string& filepath) { |
109 | std::string FunctionName(const std::string& filepath) { \ | 97 | if (s_get_parent_directory == nullptr) { |
110 | if (JMethodID == nullptr) { \ | 98 | return 0; |
111 | return 0; \ | ||
112 | } \ | ||
113 | auto env = GetEnvForThread(); \ | ||
114 | jstring j_filepath = env->NewStringUTF(filepath.c_str()); \ | ||
115 | jstring j_return = \ | ||
116 | static_cast<jstring>(env->Caller(native_library, JMethodID, j_filepath)); \ | ||
117 | if (!j_return) { \ | ||
118 | return {}; \ | ||
119 | } \ | ||
120 | const jchar* jchars = env->GetStringChars(j_return, nullptr); \ | ||
121 | const jsize length = env->GetStringLength(j_return); \ | ||
122 | const std::u16string_view string_view(reinterpret_cast<const char16_t*>(jchars), length); \ | ||
123 | const std::string converted_string = Common::UTF16ToUTF8(string_view); \ | ||
124 | env->ReleaseStringChars(j_return, jchars); \ | ||
125 | return converted_string; \ | ||
126 | } | 99 | } |
127 | ANDROID_SINGLE_PATH_HELPER_FUNCTIONS(FH) | 100 | auto env = Common::Android::GetEnvForThread(); |
128 | #undef F | 101 | jstring j_return = static_cast<jstring>(env->CallStaticObjectMethod( |
129 | #undef FH | 102 | native_library, s_get_parent_directory, Common::Android::ToJString(env, filepath))); |
103 | if (!j_return) { | ||
104 | return {}; | ||
105 | } | ||
106 | return Common::Android::GetJString(env, j_return); | ||
107 | } | ||
108 | |||
109 | std::string GetFilename(const std::string& filepath) { | ||
110 | if (s_get_filename == nullptr) { | ||
111 | return 0; | ||
112 | } | ||
113 | auto env = Common::Android::GetEnvForThread(); | ||
114 | jstring j_return = static_cast<jstring>(env->CallStaticObjectMethod( | ||
115 | native_library, s_get_filename, Common::Android::ToJString(env, filepath))); | ||
116 | if (!j_return) { | ||
117 | return {}; | ||
118 | } | ||
119 | return Common::Android::GetJString(env, j_return); | ||
120 | } | ||
130 | 121 | ||
131 | } // namespace Common::FS::Android | 122 | } // namespace Common::FS::Android |
diff --git a/src/common/fs/fs_android.h b/src/common/fs/fs_android.h index 2c9234313..b33f4beb8 100755 --- a/src/common/fs/fs_android.h +++ b/src/common/fs/fs_android.h | |||
@@ -7,38 +7,17 @@ | |||
7 | #include <vector> | 7 | #include <vector> |
8 | #include <jni.h> | 8 | #include <jni.h> |
9 | 9 | ||
10 | #define ANDROID_STORAGE_FUNCTIONS(V) \ | ||
11 | V(OpenContentUri, int, (const std::string& filepath, OpenMode openmode), open_content_uri, \ | ||
12 | "openContentUri", "(Ljava/lang/String;Ljava/lang/String;)I") | ||
13 | |||
14 | #define ANDROID_SINGLE_PATH_DETERMINE_FUNCTIONS(V) \ | ||
15 | V(GetSize, std::uint64_t, get_size, CallStaticLongMethod, "getSize", "(Ljava/lang/String;)J") \ | ||
16 | V(IsDirectory, bool, is_directory, CallStaticBooleanMethod, "isDirectory", \ | ||
17 | "(Ljava/lang/String;)Z") \ | ||
18 | V(Exists, bool, file_exists, CallStaticBooleanMethod, "exists", "(Ljava/lang/String;)Z") | ||
19 | |||
20 | #define ANDROID_SINGLE_PATH_HELPER_FUNCTIONS(V) \ | ||
21 | V(GetParentDirectory, get_parent_directory, CallStaticObjectMethod, "getParentDirectory", \ | ||
22 | "(Ljava/lang/String;)Ljava/lang/String;") \ | ||
23 | V(GetFilename, get_filename, CallStaticObjectMethod, "getFilename", \ | ||
24 | "(Ljava/lang/String;)Ljava/lang/String;") | ||
25 | |||
26 | namespace Common::FS::Android { | 10 | namespace Common::FS::Android { |
27 | 11 | ||
28 | static JavaVM* g_jvm = nullptr; | 12 | static JavaVM* g_jvm = nullptr; |
29 | static jclass native_library = nullptr; | 13 | static jclass native_library = nullptr; |
30 | 14 | ||
31 | #define FH(FunctionName, JMethodID, Caller, JMethodName, Signature) F(JMethodID) | 15 | static jmethodID s_get_parent_directory; |
32 | #define FR(FunctionName, ReturnValue, JMethodID, Caller, JMethodName, Signature) F(JMethodID) | 16 | static jmethodID s_get_filename; |
33 | #define FS(FunctionName, ReturnValue, Parameters, JMethodID, JMethodName, Signature) F(JMethodID) | 17 | static jmethodID s_get_size; |
34 | #define F(JMethodID) static jmethodID JMethodID = nullptr; | 18 | static jmethodID s_is_directory; |
35 | ANDROID_SINGLE_PATH_HELPER_FUNCTIONS(FH) | 19 | static jmethodID s_file_exists; |
36 | ANDROID_SINGLE_PATH_DETERMINE_FUNCTIONS(FR) | 20 | static jmethodID s_open_content_uri; |
37 | ANDROID_STORAGE_FUNCTIONS(FS) | ||
38 | #undef F | ||
39 | #undef FS | ||
40 | #undef FR | ||
41 | #undef FH | ||
42 | 21 | ||
43 | enum class OpenMode { | 22 | enum class OpenMode { |
44 | Read, | 23 | Read, |
@@ -57,24 +36,11 @@ void UnRegisterCallbacks(); | |||
57 | 36 | ||
58 | bool IsContentUri(const std::string& path); | 37 | bool IsContentUri(const std::string& path); |
59 | 38 | ||
60 | #define FS(FunctionName, ReturnValue, Parameters, JMethodID, JMethodName, Signature) \ | 39 | int OpenContentUri(const std::string& filepath, OpenMode openmode); |
61 | F(FunctionName, Parameters, ReturnValue) | 40 | std::uint64_t GetSize(const std::string& filepath); |
62 | #define F(FunctionName, Parameters, ReturnValue) ReturnValue FunctionName Parameters; | 41 | bool IsDirectory(const std::string& filepath); |
63 | ANDROID_STORAGE_FUNCTIONS(FS) | 42 | bool Exists(const std::string& filepath); |
64 | #undef F | 43 | std::string GetParentDirectory(const std::string& filepath); |
65 | #undef FS | 44 | std::string GetFilename(const std::string& filepath); |
66 | |||
67 | #define FR(FunctionName, ReturnValue, JMethodID, Caller, JMethodName, Signature) \ | ||
68 | F(FunctionName, ReturnValue) | ||
69 | #define F(FunctionName, ReturnValue) ReturnValue FunctionName(const std::string& filepath); | ||
70 | ANDROID_SINGLE_PATH_DETERMINE_FUNCTIONS(FR) | ||
71 | #undef F | ||
72 | #undef FR | ||
73 | |||
74 | #define FH(FunctionName, JMethodID, Caller, JMethodName, Signature) F(FunctionName) | ||
75 | #define F(FunctionName) std::string FunctionName(const std::string& filepath); | ||
76 | ANDROID_SINGLE_PATH_HELPER_FUNCTIONS(FH) | ||
77 | #undef F | ||
78 | #undef FH | ||
79 | 45 | ||
80 | } // namespace Common::FS::Android | 46 | } // namespace Common::FS::Android |
diff --git a/src/core/file_sys/content_archive.cpp b/src/core/file_sys/content_archive.cpp index 7e543576e..33040d9c3 100755 --- a/src/core/file_sys/content_archive.cpp +++ b/src/core/file_sys/content_archive.cpp | |||
@@ -172,6 +172,10 @@ u32 NCA::GetSDKVersion() const { | |||
172 | return reader->GetSdkAddonVersion(); | 172 | return reader->GetSdkAddonVersion(); |
173 | } | 173 | } |
174 | 174 | ||
175 | u8 NCA::GetKeyGeneration() const { | ||
176 | return reader->GetKeyGeneration(); | ||
177 | } | ||
178 | |||
175 | bool NCA::IsUpdate() const { | 179 | bool NCA::IsUpdate() const { |
176 | return is_update; | 180 | return is_update; |
177 | } | 181 | } |
diff --git a/src/core/file_sys/content_archive.h b/src/core/file_sys/content_archive.h index 8cc82ccb8..1d02d1193 100755 --- a/src/core/file_sys/content_archive.h +++ b/src/core/file_sys/content_archive.h | |||
@@ -77,6 +77,7 @@ public: | |||
77 | u64 GetTitleId() const; | 77 | u64 GetTitleId() const; |
78 | RightsId GetRightsId() const; | 78 | RightsId GetRightsId() const; |
79 | u32 GetSDKVersion() const; | 79 | u32 GetSDKVersion() const; |
80 | u8 GetKeyGeneration() const; | ||
80 | bool IsUpdate() const; | 81 | bool IsUpdate() const; |
81 | 82 | ||
82 | VirtualFile GetRomFS() const; | 83 | VirtualFile GetRomFS() const; |
diff --git a/src/core/hle/service/am/library_applet_creator.cpp b/src/core/hle/service/am/library_applet_creator.cpp index c48ed29bc..3e2a1d9c1 100755 --- a/src/core/hle/service/am/library_applet_creator.cpp +++ b/src/core/hle/service/am/library_applet_creator.cpp | |||
@@ -102,8 +102,14 @@ std::shared_ptr<ILibraryAppletAccessor> CreateGuestApplet(Core::System& system, | |||
102 | return {}; | 102 | return {}; |
103 | } | 103 | } |
104 | 104 | ||
105 | // TODO: enable other versions of applets | ||
106 | enum : u8 { | ||
107 | Firmware1600 = 15, | ||
108 | Firmware1700 = 16, | ||
109 | }; | ||
110 | |||
105 | auto process = std::make_unique<Process>(system); | 111 | auto process = std::make_unique<Process>(system); |
106 | if (!process->Initialize(program_id)) { | 112 | if (!process->Initialize(program_id, Firmware1600, Firmware1700)) { |
107 | // Couldn't initialize the guest process | 113 | // Couldn't initialize the guest process |
108 | return {}; | 114 | return {}; |
109 | } | 115 | } |
diff --git a/src/core/hle/service/am/process.cpp b/src/core/hle/service/am/process.cpp index 16b685f86..992c50713 100755 --- a/src/core/hle/service/am/process.cpp +++ b/src/core/hle/service/am/process.cpp | |||
@@ -3,6 +3,7 @@ | |||
3 | 3 | ||
4 | #include "common/scope_exit.h" | 4 | #include "common/scope_exit.h" |
5 | 5 | ||
6 | #include "core/file_sys/content_archive.h" | ||
6 | #include "core/file_sys/nca_metadata.h" | 7 | #include "core/file_sys/nca_metadata.h" |
7 | #include "core/file_sys/registered_cache.h" | 8 | #include "core/file_sys/registered_cache.h" |
8 | #include "core/hle/kernel/k_process.h" | 9 | #include "core/hle/kernel/k_process.h" |
@@ -20,7 +21,7 @@ Process::~Process() { | |||
20 | this->Finalize(); | 21 | this->Finalize(); |
21 | } | 22 | } |
22 | 23 | ||
23 | bool Process::Initialize(u64 program_id) { | 24 | bool Process::Initialize(u64 program_id, u8 minimum_key_generation, u8 maximum_key_generation) { |
24 | // First, ensure we are not holding another process. | 25 | // First, ensure we are not holding another process. |
25 | this->Finalize(); | 26 | this->Finalize(); |
26 | 27 | ||
@@ -29,21 +30,33 @@ bool Process::Initialize(u64 program_id) { | |||
29 | 30 | ||
30 | // Attempt to load program NCA. | 31 | // Attempt to load program NCA. |
31 | const FileSys::RegisteredCache* bis_system{}; | 32 | const FileSys::RegisteredCache* bis_system{}; |
32 | FileSys::VirtualFile nca{}; | 33 | FileSys::VirtualFile nca_raw{}; |
33 | 34 | ||
34 | // Get the program NCA from built-in storage. | 35 | // Get the program NCA from built-in storage. |
35 | bis_system = fsc.GetSystemNANDContents(); | 36 | bis_system = fsc.GetSystemNANDContents(); |
36 | if (bis_system) { | 37 | if (bis_system) { |
37 | nca = bis_system->GetEntryRaw(program_id, FileSys::ContentRecordType::Program); | 38 | nca_raw = bis_system->GetEntryRaw(program_id, FileSys::ContentRecordType::Program); |
38 | } | 39 | } |
39 | 40 | ||
40 | // Ensure we retrieved a program NCA. | 41 | // Ensure we retrieved a program NCA. |
41 | if (!nca) { | 42 | if (!nca_raw) { |
42 | return false; | 43 | return false; |
43 | } | 44 | } |
44 | 45 | ||
46 | // Ensure we have a suitable version. | ||
47 | if (minimum_key_generation > 0) { | ||
48 | FileSys::NCA nca(nca_raw); | ||
49 | if (nca.GetStatus() == Loader::ResultStatus::Success && | ||
50 | (nca.GetKeyGeneration() < minimum_key_generation || | ||
51 | nca.GetKeyGeneration() > maximum_key_generation)) { | ||
52 | LOG_WARNING(Service_LDR, "Skipping program {:016X} with generation {}", program_id, | ||
53 | nca.GetKeyGeneration()); | ||
54 | return false; | ||
55 | } | ||
56 | } | ||
57 | |||
45 | // Get the appropriate loader to parse this NCA. | 58 | // Get the appropriate loader to parse this NCA. |
46 | auto app_loader = Loader::GetLoader(m_system, nca, program_id, 0); | 59 | auto app_loader = Loader::GetLoader(m_system, nca_raw, program_id, 0); |
47 | 60 | ||
48 | // Ensure we have a loader which can parse the NCA. | 61 | // Ensure we have a loader which can parse the NCA. |
49 | if (!app_loader) { | 62 | if (!app_loader) { |
diff --git a/src/core/hle/service/am/process.h b/src/core/hle/service/am/process.h index 4b908ade4..4b8102fb6 100755 --- a/src/core/hle/service/am/process.h +++ b/src/core/hle/service/am/process.h | |||
@@ -21,7 +21,7 @@ public: | |||
21 | explicit Process(Core::System& system); | 21 | explicit Process(Core::System& system); |
22 | ~Process(); | 22 | ~Process(); |
23 | 23 | ||
24 | bool Initialize(u64 program_id); | 24 | bool Initialize(u64 program_id, u8 minimum_key_generation, u8 maximum_key_generation); |
25 | void Finalize(); | 25 | void Finalize(); |
26 | 26 | ||
27 | bool Run(); | 27 | bool Run(); |
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index dd769b75f..000001b6d 100755 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt | |||
@@ -390,4 +390,8 @@ if (ANDROID AND ARCHITECTURE_arm64) | |||
390 | target_link_libraries(video_core PRIVATE adrenotools) | 390 | target_link_libraries(video_core PRIVATE adrenotools) |
391 | endif() | 391 | endif() |
392 | 392 | ||
393 | if (ARCHITECTURE_arm64) | ||
394 | target_link_libraries(video_core PRIVATE sse2neon) | ||
395 | endif() | ||
396 | |||
393 | create_target_directory_groups(video_core) | 397 | create_target_directory_groups(video_core) |
diff --git a/src/video_core/host1x/vic.cpp b/src/video_core/host1x/vic.cpp index 705285fd0..14d87a9de 100755 --- a/src/video_core/host1x/vic.cpp +++ b/src/video_core/host1x/vic.cpp | |||
@@ -12,7 +12,10 @@ | |||
12 | #include <immintrin.h> | 12 | #include <immintrin.h> |
13 | #endif | 13 | #endif |
14 | #elif defined(ARCHITECTURE_arm64) | 14 | #elif defined(ARCHITECTURE_arm64) |
15 | #include <arm_neon.h> | 15 | #pragma GCC diagnostic push |
16 | #pragma GCC diagnostic ignored "-Wimplicit-int-conversion" | ||
17 | #include <sse2neon.h> | ||
18 | #pragma GCC diagnostic pop | ||
16 | #endif | 19 | #endif |
17 | 20 | ||
18 | extern "C" { | 21 | extern "C" { |
@@ -43,8 +46,6 @@ extern "C" { | |||
43 | 46 | ||
44 | #if defined(ARCHITECTURE_x86_64) | 47 | #if defined(ARCHITECTURE_x86_64) |
45 | #include "common/x64/cpu_detect.h" | 48 | #include "common/x64/cpu_detect.h" |
46 | #elif defined(ARCHITECTURE_arm64) | ||
47 | // Some ARM64 detect | ||
48 | #endif | 49 | #endif |
49 | 50 | ||
50 | namespace Tegra::Host1x { | 51 | namespace Tegra::Host1x { |
@@ -244,7 +245,9 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, | |||
244 | DecodeLinear(); | 245 | DecodeLinear(); |
245 | return; | 246 | return; |
246 | } | 247 | } |
248 | #endif | ||
247 | 249 | ||
250 | #if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64) | ||
248 | const auto alpha = | 251 | const auto alpha = |
249 | _mm_slli_epi64(_mm_set1_epi64x(static_cast<s64>(slot.config.planar_alpha.Value())), 48); | 252 | _mm_slli_epi64(_mm_set1_epi64x(static_cast<s64>(slot.config.planar_alpha.Value())), 48); |
250 | 253 | ||
@@ -379,8 +382,6 @@ void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, | |||
379 | // clang-format on | 382 | // clang-format on |
380 | } | 383 | } |
381 | } | 384 | } |
382 | #elif defined(ARCHITECTURE_arm64) | ||
383 | DecodeLinear(); | ||
384 | #else | 385 | #else |
385 | DecodeLinear(); | 386 | DecodeLinear(); |
386 | #endif | 387 | #endif |
@@ -624,7 +625,9 @@ void Vic::Blend(const ConfigStruct& config, const SlotStruct& slot) { | |||
624 | DecodeLinear(); | 625 | DecodeLinear(); |
625 | return; | 626 | return; |
626 | } | 627 | } |
628 | #endif | ||
627 | 629 | ||
630 | #if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64) | ||
628 | // Fill the columns, e.g | 631 | // Fill the columns, e.g |
629 | // c0 = [00 00 00 00] [r2c0 r2c0 r2c0 r2c0] [r1c0 r1c0 r1c0 r1c0] [r0c0 r0c0 r0c0 r0c0] | 632 | // c0 = [00 00 00 00] [r2c0 r2c0 r2c0 r2c0] [r1c0 r1c0 r1c0 r1c0] [r0c0 r0c0 r0c0 r0c0] |
630 | 633 | ||
@@ -767,8 +770,6 @@ void Vic::Blend(const ConfigStruct& config, const SlotStruct& slot) { | |||
767 | } | 770 | } |
768 | } | 771 | } |
769 | // clang-format on | 772 | // clang-format on |
770 | #elif defined(ARCHITECTURE_arm64) | ||
771 | DecodeLinear(); | ||
772 | #else | 773 | #else |
773 | DecodeLinear(); | 774 | DecodeLinear(); |
774 | #endif | 775 | #endif |
@@ -820,7 +821,9 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { | |||
820 | DecodeLinear(out_luma, out_chroma); | 821 | DecodeLinear(out_luma, out_chroma); |
821 | return; | 822 | return; |
822 | } | 823 | } |
824 | #endif | ||
823 | 825 | ||
826 | #if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64) | ||
824 | // luma_mask = [00 00] [00 00] [00 00] [FF FF] [00 00] [00 00] [00 00] [FF FF] | 827 | // luma_mask = [00 00] [00 00] [00 00] [FF FF] [00 00] [00 00] [00 00] [FF FF] |
825 | const auto luma_mask = _mm_set_epi16(0, 0, 0, -1, 0, 0, 0, -1); | 828 | const auto luma_mask = _mm_set_epi16(0, 0, 0, -1, 0, 0, 0, -1); |
826 | 829 | ||
@@ -947,8 +950,6 @@ void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { | |||
947 | // clang-format on | 950 | // clang-format on |
948 | } | 951 | } |
949 | } | 952 | } |
950 | #elif defined(ARCHITECTURE_arm64) | ||
951 | DecodeLinear(out_luma, out_chroma); | ||
952 | #else | 953 | #else |
953 | DecodeLinear(out_luma, out_chroma); | 954 | DecodeLinear(out_luma, out_chroma); |
954 | #endif | 955 | #endif |
@@ -1079,7 +1080,9 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) { | |||
1079 | DecodeLinear(out_buffer); | 1080 | DecodeLinear(out_buffer); |
1080 | return; | 1081 | return; |
1081 | } | 1082 | } |
1083 | #endif | ||
1082 | 1084 | ||
1085 | #if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64) | ||
1083 | for (u32 y = 0; y < surface_height; y++) { | 1086 | for (u32 y = 0; y < surface_height; y++) { |
1084 | const auto src = y * surface_stride; | 1087 | const auto src = y * surface_stride; |
1085 | const auto dst = y * out_luma_stride; | 1088 | const auto dst = y * out_luma_stride; |
@@ -1144,8 +1147,6 @@ void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) { | |||
1144 | // clang-format on | 1147 | // clang-format on |
1145 | } | 1148 | } |
1146 | } | 1149 | } |
1147 | #elif defined(ARCHITECTURE_arm64) | ||
1148 | DecodeLinear(out_buffer); | ||
1149 | #else | 1150 | #else |
1150 | DecodeLinear(out_buffer); | 1151 | DecodeLinear(out_buffer); |
1151 | #endif | 1152 | #endif |